git.roojs.org Git - gnome.introspection-doc-generator/blob - JSDOC/TokenReader.vala

   1 //<script type="text/javascript">
   2
   3
   4 // test code
   5
   6 void main() {
   7          var tr = new  JSDOC.TokenReader();
   8          tr.keepDocs =true;
   9             tr.keepWhite = true;
  10             tr.keepComments = true;
  11             tr.sepIdents = true;
  12             tr.collapseWhite = false;
  13             tr.filename = "test";
  14         });
  15         this.timerPrint("START" + fn);
  16
  17         // we can load translation map here...
  18
  19         var toks = tr.tokenize(new TextStream(str)); // dont merge xxx + . + yyyy etc.
  20
  21 }
  22
  23 //const Token   = imports.Token.Token;
  24 //const Lang    = imports.Lang.Lang;
  25
  26 /**
  27         @class Search a {@link JSDOC.TextStream} for language tokens.
  28 */
  29
  30
  31
  32
  33 namespace JSDOC {
  34
  35     public class TokenArray: Object {
  36
  37         public Gee.ArrayList<Token> tokens;
  38         public int length {
  39             get { return this.tokens.size; }
  40         }
  41
  42         public TokenArray()
  43         {
  44             this.tokens = new Gee.ArrayList<Token>();
  45         }
  46
  47         public Token? last() {
  48             if (this.tokens.size > 0) {
  49                 return this.tokens.get(this.tokens.size-1);
  50             }
  51             return null;
  52         }
  53         public Token? lastSym () {
  54             for (var i = this.tokens.size-1; i >= 0; i--) {
  55                 if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM")))  {
  56                     return this.tokens.get(i);
  57                 }
  58             }
  59             return null;
  60         }
  61         public void push (Token t) {
  62             this.tokens.add(t);
  63         }
  64         public Token? pop ()
  65         {
  66             if (this.tokens.size > 0) {
  67                 return this.tokens.remove_at(this.tokens.size-1);
  68             }
  69             return null;
  70         }
  71
  72             public new Token get(int i) {
  73             return this.tokens.get(i);
  74         }
  75     }
  76
  77     public errordomain TokenReader_Error {
  78             ArgumentError
  79     }
  80
  81
  82     public class TokenReader : Object
  83     {
  84
  85
  86
  87         /*
  88          *
  89          * I wonder if this will accept the "prop: value, prop2: value" constructor syntax if we do not define a constructor...
  90          */
  91
  92         /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
  93         public bool collapseWhite = false; // only reduces white space...
  94         /** @cfg {Boolean} keepDocs keep JSDOC comments **/
  95         public bool keepDocs = true;
  96         /** @cfg {Boolean} keepWhite keep White space **/
  97         public bool keepWhite = false;
  98         /** @cfg {Boolean} keepComments  keep all comments **/
  99         public bool keepComments = false;
 100         /** @cfg {Boolean} sepIdents separate identifiers (e.g. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
 101         public bool sepIdents = false;
 102         /** @cfg {String} filename name of file being parsed. **/
 103         public string filename = "";
 104         /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
 105         public bool ignoreBadGrammer = false;
 106
 107
 108         int line = 0;
 109
 110         /**
 111          * tokenize a stream
 112          * @return {Array} of tokens
 113          *
 114          * ts = new TextStream(File.read(str));
 115          * tr = TokenReader({ keepComments : true, keepWhite : true });
 116          * tr.tokenize(ts)
 117          *
 118          */
 119         public TokenArray tokenize(TextStream stream)
 120         {
 121             this.line =1;
 122             var tokens = new TokenArray();
 123
 124
 125             while (!stream.lookEOF()) {
 126
 127
 128                 if (this.read_mlcomment(stream, tokens)) continue;
 129                 if (this.read_slcomment(stream, tokens)) continue;
 130                 if (this.read_dbquote(stream, tokens))   continue;
 131                 if (this.read_snquote(stream, tokens))   continue;
 132                 if (this.read_regx(stream, tokens))      continue;
 133                 if (this.read_numb(stream, tokens))      continue;
 134                 if (this.read_punc(stream, tokens))      continue;
 135                 if (this.read_newline(stream, tokens))   continue;
 136                 if (this.read_space(stream, tokens))     continue;
 137                 if (this.read_word(stream, tokens))      continue;
 138
 139                 // if execution reaches here then an error has happened
 140                 tokens.push(
 141                         new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
 142                 );
 143             }
 144
 145
 146
 147             return tokens;
 148         }
 149
 150         /**
 151          * findPuncToken - find the id of a token (previous to current)
 152          * need to back check syntax..
 153          *
 154          * @arg {Array} tokens the array of tokens.
 155          * @arg {String} token data (eg. '(')
 156          * @arg {Number} offset where to start reading from
 157          * @return {Number} position of token
 158          */
 159         public int findPuncToken(TokenArray tokens, string data, int n)
 160         {
 161             n = n > 0 ? n :  tokens.length -1;
 162             var stack = 0;
 163             while (n > -1) {
 164
 165                 if (stack < 1 && tokens.get(n).data == data) {
 166                     return n;
 167                 }
 168
 169                 if (tokens.get(n).data  == ")" || tokens.get(n).data  == "}") {
 170                     stack++;
 171                     n--;
 172                     continue;
 173                 }
 174                 if (stack > 0 && (tokens.get(n).data  == "{" || tokens.get(n).data  == "(")) {
 175                     stack--;
 176                     n--;
 177                     continue;
 178                 }
 179
 180
 181                 n--;
 182             }
 183             return -1;
 184         }
 185         /**
 186          * lastSym - find the last token symbol
 187          * need to back check syntax..
 188          *
 189          * @arg {Array} tokens the array of tokens.
 190          * @arg {Number} offset where to start..
 191          * @return {Token} the token
 192          */
 193         public Token? lastSym(TokenArray tokens, int n)
 194         {
 195             for (var i = n-1; i >= 0; i--) {
 196                 if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
 197                     return tokens.get(i);
 198                 }
 199             }
 200             return null;
 201         }
 202
 203
 204
 205         /**
 206             @returns {Boolean} Was the token found?
 207          */
 208         public bool read_word (TextStream stream, TokenArray tokens)
 209         {
 210             string found = "";
 211             while (!stream.lookEOF() && Lang.isWordChar((string)stream.look())) {
 212                 found += stream.next();
 213             }
 214
 215             if (found == "") {
 216                 return false;
 217             }
 218
 219             var name = Lang.keyword(found);
 220             if (name != null) {
 221
 222                 // look for "()return" ?? why ???
 223                 var ls = tokens.lastSym();
 224                 if (found == "return" && ls != null && ls.data == ")") {
 225                     //Seed.print('@' + tokens.length);
 226                     var n = this.findPuncToken(tokens, ")", 0);
 227                     //Seed.print(')@' + n);
 228                     n = this.findPuncToken(tokens, "(", n-1);
 229                     //Seed.print('(@' + n);
 230
 231                     //var lt = this.lastSym(tokens, n);
 232                     /*
 233                     //print(JSON.stringify(lt));
 234                     if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
 235                         if (!this.ignoreBadGrammer) {
 236                             throw new TokenReader_Error.ArgumentError(
 237                                 this.filename + ":" + this.line + " Error - return found after )"
 238                             );
 239                         }
 240                     }
 241
 242                     */
 243
 244                 }
 245
 246                 tokens.push(new Token(found, "KEYW", name, this.line));
 247                 return true;
 248             }
 249
 250             if (!this.sepIdents || found.index_of(".") < 0 ) {
 251                 tokens.push(new Token(found, "NAME", "NAME", this.line));
 252                 return true;
 253             }
 254             var n = found.split(".");
 255             var p = false;
 256             foreach (unowned string nm in n) {
 257                 if (p) {
 258                     tokens.push(new Token(".", "PUNC", "DOT", this.line));
 259                 }
 260                 p=true;
 261                 tokens.push(new Token(nm, "NAME", "NAME", this.line));
 262             }
 263             return true;
 264
 265
 266         }
 267
 268         /**
 269             @returns {Boolean} Was the token found?
 270          */
 271         public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
 272         {
 273             string found = "";
 274
 275             while (!stream.lookEOF() && Lang.punc(found + (string)stream.look()).length > 0) {
 276                 found += stream.next();
 277             }
 278
 279
 280             if (found == "") {
 281                 return false;
 282             }
 283
 284             var ls = tokens.lastSym();
 285
 286             if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
 287                 //print("Error - comma found before " + found);
 288                 //print(JSON.stringify(tokens.lastSym(), null,4));
 289                 if (this.ignoreBadGrammer) {
 290                     print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
 291                 } else {
 292                     throw new TokenReader_Error.ArgumentError(
 293                                 this.filename + ":" + this.line.to_string() + "  comma found before " + found
 294
 295                     );
 296
 297                 }
 298             }
 299
 300             tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
 301             return true;
 302
 303         }
 304
 305         /**
 306             @returns {Boolean} Was the token found?
 307          */
 308         public bool read_space  (TextStream stream, TokenArray tokens)
 309         {
 310             var found = "";
 311
 312             while (!stream.lookEOF() && Lang.isSpaceC(  stream.look()) && !Lang.isNewlineC(stream.look())) {
 313                 found += stream.next();
 314             }
 315
 316             if (found == "") {
 317                 return false;
 318             }
 319             //print("WHITE = " + JSON.stringify(found));
 320
 321
 322             if (this.collapseWhite) {
 323                 found = " "; // this might work better if it was a '\n' ???
 324             }
 325             if (this.keepWhite) {
 326                 tokens.push(new Token(found, "WHIT", "SPACE", this.line));
 327             }
 328             return true;
 329
 330         }
 331
 332         /**
 333             @returns {Boolean} Was the token found?
 334          */
 335         public bool read_newline  (TextStream stream, TokenArray tokens)
 336         {
 337             var found = "";
 338             var line = this.line;
 339             while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
 340                 this.line++;
 341                 found += stream.next();
 342             }
 343
 344             if (found == "") {
 345                 return false;
 346             }
 347
 348             // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
 349             // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
 350
 351
 352             //this.line++;
 353             if (this.collapseWhite) {
 354                 found = "\n"; // reduces multiple line breaks into a single one...
 355             }
 356
 357             if (this.keepWhite) {
 358                 var last = tokens.pop();
 359                 if (last != null && last.name != "WHIT") {
 360                     tokens.push(last);
 361                 }
 362                 // replaces last new line...
 363                 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
 364             }
 365             return true;
 366         }
 367
 368         /**
 369             @returns {Boolean} Was the token found?
 370          */
 371         public bool read_mlcomment  (TextStream stream, TokenArray tokens)
 372         {
 373             if (stream.look() != '/') {
 374                 return false;
 375             }
 376             if (stream.look(1) != '*') {
 377                 return false;
 378             }
 379             var found = stream.next(2);
 380             string  c = "";
 381             var line = this.line;
 382             while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
 383                 c = stream.next();
 384                 if (c == "\n") {
 385                     this.line++;
 386                 }
 387                 found += c;
 388             }
 389
 390             // to start doclet we allow /** or /*** but not /**/ or /****
 391             //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
 392             if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
 393                 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
 394             } else if (this.keepComments) {
 395                 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
 396             }
 397             return true;
 398
 399         }
 400
 401         /**
 402             @returns {Boolean} Was the token found?
 403          */
 404          public bool read_slcomment  (TextStream stream, TokenArray tokens)
 405          {
 406             var found = "";
 407             if (
 408                 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
 409                 ||
 410                 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
 411             ) {
 412                 var line = this.line;
 413                 while (!stream.lookEOF() && !Lang.isNewline((string)stream.look())) {
 414                     found += stream.next();
 415                 }
 416                 //if (!stream.lookEOF()) { // what? << eat the EOL?
 417                     found += stream.next();
 418                 //}
 419                 if (this.keepComments) {
 420                     tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
 421                 }
 422                 this.line++;
 423                 return true;
 424             }
 425             return false;
 426         }
 427
 428         /**
 429             @returns {Boolean} Was the token found?
 430          */
 431         public bool read_dbquote  (TextStream stream, TokenArray tokens)
 432         {
 433             if (stream.look() != '"') {
 434                 return false;
 435             }
 436                 // find terminator
 437             var str = stream.next();
 438
 439             while (!stream.lookEOF()) {
 440                 if (stream.look() == '\\') {
 441                     if (Lang.isNewline((string)stream.look(1))) {
 442                         do {
 443                             stream.next();
 444                         } while (!stream.lookEOF() && Lang.isNewline((string)stream.look()));
 445                         str += "\\\n";
 446                     }
 447                     else {
 448                         str += stream.next(2);
 449                     }
 450                     continue;
 451                 }
 452                 if (stream.look() == '"') {
 453                     str += stream.next();
 454                     tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
 455                     return true;
 456                 }
 457
 458                 str += stream.next();
 459
 460             }
 461             return false;
 462         }
 463
 464         /**
 465             @returns {Boolean} Was the token found?
 466          */
 467         public bool read_snquote  (TextStream stream, TokenArray tokens)
 468         {
 469             if (stream.look() != '\'') {
 470                 return false;
 471             }
 472             // find terminator
 473             var str = stream.next();
 474
 475             while (!stream.lookEOF()) {
 476                 if (stream.look() == '\\') { // escape sequence
 477                     str += stream.next(2);
 478                     continue;
 479                 }
 480                 if (stream.look() == '\'') {
 481                     str += stream.next();
 482                     tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
 483                     return true;
 484                 }
 485                 str += stream.next();
 486
 487             }
 488             return false;
 489         }
 490
 491
 492         /**
 493             @returns {Boolean} Was the token found?
 494          */
 495         public bool read_numb  (TextStream stream, TokenArray tokens)
 496         {
 497             if (stream.look() == '0' && stream.look(1) == 'x') {
 498                 return this.read_hex(stream, tokens);
 499             }
 500
 501             var found = "";
 502
 503             while (!stream.lookEOF() && Lang.isNumber(found+(string)stream.look())){
 504                 found += stream.next();
 505             }
 506
 507             if (found == "") {
 508                 return false;
 509             }
 510             if (GLib.Regex.match_simple("^0[0-7]", found)) {
 511                 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
 512                 return true;
 513             }
 514             tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
 515             return true;
 516
 517         }
 518
 519         /**
 520             @returns {Boolean} Was the token found?
 521          */
 522         public bool read_hex  (TextStream stream, TokenArray tokens)
 523         {
 524             var found = stream.next(2);
 525
 526             while (!stream.lookEOF()) {
 527                 if (Lang.isHexDec(found) && !Lang.isHexDec(found+(string)stream.look())) { // done
 528                     tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
 529                     return true;
 530                 }
 531
 532                 found += stream.next();
 533
 534             }
 535             return false;
 536         }
 537
 538         /**
 539             @returns {Boolean} Was the token found?
 540          */
 541         public bool read_regx (TextStream stream, TokenArray tokens)
 542         {
 543
 544             if (stream.look() != '/') {
 545                 return false;
 546             }
 547             var  last = tokens.lastSym();
 548             if (
 549                 (last == null)
 550                 ||
 551                 (
 552                        !last.is("NUMB")   // stuff that can not appear before a regex..
 553                     && !last.is("NAME")
 554                     && !last.is("RIGHT_PAREN")
 555                     && !last.is("RIGHT_BRACKET")
 556                 )
 557             )  {
 558                 var regex = stream.next();
 559
 560                 while (!stream.lookEOF()) {
 561                     if (stream.look() == '\\') { // escape sequence
 562                         regex += stream.next(2);
 563                         continue;
 564                     }
 565                     if (stream.look() == '/') {
 566                         regex += stream.next();
 567
 568                         while (GLib.Regex.match_simple("[gmi]", (string)stream.look())) {
 569                             regex += stream.next();
 570                         }
 571
 572                         tokens.push(new Token(regex, "REGX", "REGX", this.line));
 573                         return true;
 574                     }
 575
 576                     regex += stream.next();
 577
 578                 }
 579                 // error: unterminated regex
 580             }
 581             return false;
 582         }
 583     }
 584 }