git.roojs.org Git - gnome.introspection-doc-generator/blob - JSDOC/TokenReader.vala

   1 //<script type="text/javascript">
   2
   3
   4 // test code
   5
   /**
    * Manual smoke test: tokenizes one JavaScript file with every "keep"
    * option enabled and identifiers split on dots.
    *
    * The input path is a developer-local fixture.  A failure to read the
    * file, or an error raised while tokenizing, is reported on stderr
    * instead of being left as an unhandled GLib error.
    */
   void main() {
       var tr = new JSDOC.TokenReader();
       tr.keepDocs = true;
       tr.keepWhite = true;
       tr.keepComments = true;
       tr.sepIdents = true;
       tr.collapseWhite = false;
       tr.filename = "test";

       try {
           string str;
           // get_contents() returns the file text through an out parameter,
           // so the argument has to be passed with `out`.
           FileUtils.get_contents(
               "/home/alan/gitlive/gnome.introspection-doc-generator/JSDOC/Walker2.js",
               out str
           );

           // dont merge xxx + . + yyyy etc.
           tr.tokenize(new JSDOC.TextStream(str));
       } catch (Error e) {
           printerr("%s\n", e.message);
       }
   }
  20
  21 //const Token   = imports.Token.Token;
  22 //const Lang    = imports.Lang.Lang;
  23
  24 /**
  25         @class Search a {@link JSDOC.TextStream} for language tokens.
  26 */
  27
  28
  29
  30
  31 namespace JSDOC {
  32
  /**
   * Ordered list of Token objects with stack-style helpers (push, pop,
   * last) on top of a Gee.ArrayList.
   */
  public class TokenArray: Object {

      /** Backing list holding the tokens in insertion order. */
      public Gee.ArrayList<Token> tokens;

      /** Number of tokens currently stored. */
      public int length {
          get { return this.tokens.size; }
      }

      /** Creates an empty array. */
      public TokenArray()
      {
          this.tokens = new Gee.ArrayList<Token>();
      }

      /**
       * @return the token at the end of the list, or null when it is empty
       */
      public Token? last()
      {
          var count = this.tokens.size;
          if (count < 1) {
              return null;
          }
          return this.tokens.get(count - 1);
      }

      /**
       * Walks backwards from the end of the list, skipping every token for
       * which is("WHIT") or is("COMM") holds.
       *
       * @return the first token that was not skipped, or null if there is none
       */
      public Token? lastSym ()
      {
          var idx = this.tokens.size;
          while (--idx >= 0) {
              var candidate = this.tokens.get(idx);
              if (candidate.is("WHIT") || candidate.is("COMM")) {
                  continue;
              }
              return candidate;
          }
          return null;
      }

      /** Appends t to the end of the list. */
      public void push (Token t)
      {
          this.tokens.add(t);
      }

      /**
       * Removes the token at the end of the list.
       *
       * @return the removed token, or null when the list is empty
       */
      public Token? pop ()
      {
          var count = this.tokens.size;
          if (count < 1) {
              return null;
          }
          return this.tokens.remove_at(count - 1);
      }

      /**
       * Index accessor; declared `new` because it hides a get() inherited
       * from Object.
       *
       * @return the token stored at position i
       */
      public new Token get(int i)
      {
          return this.tokens.get(i);
      }
  }
  74
  /**
   * Error domain used by TokenReader.
   */
  75     public errordomain TokenReader_Error {
             // Thrown by TokenReader.read_punc() when a comma directly precedes
             // a closing '}' or ']' and ignoreBadGrammer is false.
  76             ArgumentError
  77     }
  78
  79
  80     public class TokenReader : Object
  81     {
  82
  83
  84
  85         /*
  86          *
  87          * I wonder if this will accept the prop: value, prop2 :value constructor if we do not define one...
  88          */
  89
  90         /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
  91         public bool collapseWhite = false; // only reduces white space...
  92         /** @cfg {Boolean} keepDocs keep JSDOC comments **/
  93         public bool keepDocs = true;
  94         /** @cfg {Boolean} keepWhite keep White space **/
  95         public bool keepWhite = false;
  96         /** @cfg {Boolean} keepComments  keep all comments **/
  97         public bool keepComments = false;
  98         /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
  99         public bool sepIdents = false;
 100         /** @cfg {String} filename name of file being parsed. **/
 101         public string filename = "";
 102         /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
 103         public bool ignoreBadGrammer = false;
 104
 105
 106         int line = 0;
 107
 /**
  * Tokenize a stream.
  *
  * Resets the line counter to 1, then offers the stream to each read_*()
  * method in a fixed order until the stream reports EOF.  The first reader
  * that returns true wins and the cycle restarts.  The readers that start
  * on '/' (comments and regular expressions) are tried before read_punc().
  * When no reader accepts the input, a single item is taken from the
  * stream and stored as an UNKNOWN_TOKEN.
  *
  * Example:
  *
  *     var tr = new TokenReader();
  *     tr.keepComments = true;
  *     tr.keepWhite = true;
  *     var toks = tr.tokenize(new TextStream(str));
  *
  * @param stream the text to split into tokens
  * @return the tokens that were produced, in source order
  * @throws TokenReader_Error raised by read_punc(); previously this method
  *         had no throws clause, so that error could not reach the caller
  */
 public TokenArray tokenize(TextStream stream) throws TokenReader_Error
 {
     this.line = 1;
     var tokens = new TokenArray();

     while (!stream.lookEOF()) {

         if (this.read_mlcomment(stream, tokens)) {
             continue;
         }
         if (this.read_slcomment(stream, tokens)) {
             continue;
         }
         if (this.read_dbquote(stream, tokens)) {
             continue;
         }
         if (this.read_snquote(stream, tokens)) {
             continue;
         }
         if (this.read_regx(stream, tokens)) {
             continue;
         }
         if (this.read_numb(stream, tokens)) {
             continue;
         }
         if (this.read_punc(stream, tokens)) {
             continue;
         }
         if (this.read_newline(stream, tokens)) {
             continue;
         }
         if (this.read_space(stream, tokens)) {
             continue;
         }
         if (this.read_word(stream, tokens)) {
             continue;
         }

         // Nothing recognised the input: consume one item so the loop
         // always makes progress, and record it as an unknown token.
         tokens.push(
             new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
         );
     }

     return tokens;
 }
 147
 /**
  * findPuncToken - search backwards for a token whose data equals the
  * given text, skipping over bracketed groups.
  *
  * While walking backwards, every ")" or "}" raises a nesting counter and
  * every "(" or "{" seen while the counter is positive lowers it again.  A
  * match is only accepted while the counter is below 1.  Round and curly
  * brackets share the same counter.
  *
  * @param tokens the array of tokens
  * @param data token text to look for (eg. "(")
  * @param n index to start from; any value that is not greater than 0
  *          means "start at the last token"
  * @return index of the matching token, or -1 when none is found
  */
 public int findPuncToken(TokenArray tokens, string data, int n)
 {
     var pos = (n > 0) ? n : tokens.length - 1;
     var depth = 0;

     for (; pos > -1; pos--) {
         var text = tokens.get(pos).data;

         if (depth < 1 && text == data) {
             return pos;
         }

         if (text == ")" || text == "}") {
             depth++;
         } else if (depth > 0 && (text == "{" || text == "(")) {
             depth--;
         }
     }
     return -1;
 }
 /**
  * lastSym - find the nearest token before a position, skipping every
  * token for which is("WHIT") or is("COMM") holds.
  *
  * @param tokens the array of tokens
  * @param n the search starts at index n - 1 and moves towards index 0
  * @return the first token that was not skipped, or null if there is none
  */
 public Token? lastSym(TokenArray tokens, int n)
 {
     var idx = n;
     while (--idx >= 0) {
         var candidate = tokens.get(idx);
         if (candidate.is("WHIT") || candidate.is("COMM")) {
             continue;
         }
         return candidate;
     }
     return null;
 }
 200
 201
 202
 /**
  * Read a word: a keyword or an identifier.
  *
  * Characters are consumed for as long as Lang.isWordChar() accepts them.
  * If Lang.keyword() returns a non-null name for the collected text, one
  * KEYW token carrying that name is pushed.  Otherwise the text becomes a
  * NAME token; when sepIdents is set and the text contains a ".", it is
  * split on the dots and pushed as NAME tokens separated by DOT
  * punctuation tokens.
  *
  * @param stream the text being read, positioned at the candidate word
  * @param tokens array the new token(s) are appended to
  * @return true when at least one word character was consumed
  */
 public bool read_word (TextStream stream, TokenArray tokens)
 {
     string found = "";

     // look() yields a character (it is compared against char literals
     // throughout this class).  A (string) cast would reinterpret that value
     // as a pointer, so build a real one-character string instead.
     while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
         found += stream.next();
     }

     if (found == "") {
         return false;
     }

     var name = Lang.keyword(found);
     if (name != null) {
         // NOTE: an earlier revision located the "(" ... ")" pair in front of
         // a `return` that directly follows ")" in order to run a grammar
         // check.  The check itself was commented out and the positions were
         // never used, so that dead search has been dropped.
         tokens.push(new Token(found, "KEYW", name, this.line));
         return true;
     }

     if (!this.sepIdents || found.index_of(".") < 0) {
         tokens.push(new Token(found, "NAME", "NAME", this.line));
         return true;
     }

     // a.b.c => NAME(a) DOT NAME(b) DOT NAME(c)
     var first = true;
     foreach (unowned string part in found.split(".")) {
         if (!first) {
             tokens.push(new Token(".", "PUNC", "DOT", this.line));
         }
         first = false;
         tokens.push(new Token(part, "NAME", "NAME", this.line));
     }
     return true;
 }
 265
 /**
  * Read a punctuation token.
  *
  * Characters are consumed for as long as Lang.punc() returns a non-empty
  * name for the text collected so far plus the next character, so the
  * longest run Lang.punc() recognises is taken.
  *
  * A "}" or "]" whose previous non-whitespace, non-comment token is ","
  * is treated as bad grammar: it is printed when ignoreBadGrammer is set
  * and raised as an error otherwise.
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when punctuation was consumed
  * @throws TokenReader_Error.ArgumentError for a comma directly before
  *         "}" or "]" while ignoreBadGrammer is false
  */
 public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
 {
     string found = "";

     // look() yields a character (it is compared against char literals
     // throughout this class).  A (string) cast would reinterpret that value
     // as a pointer, so build a real one-character string instead.
     while (!stream.lookEOF() && Lang.punc(found + stream.look().to_string()).length > 0) {
         found += stream.next();
     }

     if (found == "") {
         return false;
     }

     var ls = tokens.lastSym();

     if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
         if (this.ignoreBadGrammer) {
             // print() is printf-style: pass the message as an argument so a
             // '%' in the filename is never read as a format directive.
             print(
                 "%s",
                 "\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found
             );
         } else {
             throw new TokenReader_Error.ArgumentError(
                 this.filename + ":" + this.line.to_string() + "  comma found before " + found
             );
         }
     }

     tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
     return true;
 }
 302
 /**
  * Read a run of whitespace that contains no newline characters.
  *
  * The run is always consumed.  A WHIT/SPACE token is pushed only when
  * keepWhite is set; with collapseWhite the token text is a single " "
  * regardless of what was consumed.
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when at least one whitespace character was consumed
  */
 public bool read_space  (TextStream stream, TokenArray tokens)
 {
     var run = "";

     for (;;) {
         if (stream.lookEOF()) {
             break;
         }
         var ch = stream.look();
         if (!Lang.isSpaceC(ch) || Lang.isNewlineC(ch)) {
             break;
         }
         run += stream.next();
     }

     if (run == "") {
         return false;
     }

     if (!this.keepWhite) {
         return true;
     }

     // this might work better if the collapsed form was a '\n' ???
     var text = this.collapseWhite ? " " : run;
     tokens.push(new Token(text, "WHIT", "SPACE", this.line));
     return true;
 }
 329
 /**
  * Read a run of newline characters.
  *
  * Every newline character consumed advances the line counter by one.  A
  * WHIT/NEWLINE token is pushed only when keepWhite is set; it carries the
  * line number the run started on, and with collapseWhite its text is a
  * single "\n".
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when at least one newline character was consumed
  */
 public bool read_newline  (TextStream stream, TokenArray tokens)
 {
     var startLine = this.line;
     var run = "";

     for (;;) {
         if (stream.lookEOF() || !Lang.isNewlineC(stream.look())) {
             break;
         }
         this.line++;
         run += stream.next();
     }

     if (run == "") {
         return false;
     }

     // Idea: if the previous character was a ';' the newline could be
     // dropped; otherwise keeping it avoids problems with stripped newlines.

     if (!this.keepWhite) {
         return true;
     }

     // Take the previous token off and put it back unless its name is "WHIT".
     // NOTE(review): whitespace tokens in this class are created with type
     // "WHIT" and names "SPACE"/"NEWLINE", so this test looks like it always
     // restores the token - confirm whether `type` was intended.
     var prev = tokens.pop();
     if (prev != null && prev.name != "WHIT") {
         tokens.push(prev);
     }

     // collapseWhite reduces multiple line breaks into a single one
     var text = this.collapseWhite ? "\n" : run;
     tokens.push(new Token(text, "WHIT", "NEWLINE", startLine));
     return true;
 }
 365
 /**
  * Read a multi-line comment introduced by "/" followed by "*".
  *
  * Text is consumed until the two items just read are "*" then "/", or
  * until EOF.  Each "\n" consumed advances the line counter.
  *
  * With keepDocs set, a comment longer than four characters that starts
  * with "/**" and whose fourth character is not "/" is pushed as a JSDOC
  * token, stamped with the line the comment ends on.  Any other comment is
  * pushed as MULTI_LINE_COMM, stamped with the line it starts on, and only
  * when keepComments is set.
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when the stream was positioned on a comment opener
  */
 public bool read_mlcomment  (TextStream stream, TokenArray tokens)
 {
     if (stream.look() != '/' || stream.look(1) != '*') {
         return false;
     }

     var startLine = this.line;
     var text = stream.next(2);

     while (!stream.lookEOF()) {
         // stop once the comment terminator has just been consumed
         if (stream.look(-1) == '/' && stream.look(-2) == '*') {
             break;
         }
         var piece = stream.next();
         if (piece == "\n") {
             this.line++;
         }
         text += piece;
     }

     var isDoclet = this.keepDocs
         && text.length > 4
         && text.has_prefix("/**")
         && text[3] != '/';

     if (isDoclet) {
         tokens.push(new Token(text, "COMM", "JSDOC", this.line));
     } else if (this.keepComments) {
         tokens.push(new Token(text, "COMM", "MULTI_LINE_COMM", startLine));
     }
     return true;
 }
 398
 /**
  * Read a single-line comment, introduced either by two slashes or by the
  * four-character HTML comment opener "<" "!" "-" "-".
  *
  * The comment runs up to the next newline character; that one newline
  * character is consumed as part of the comment text.  The line counter
  * is advanced by one.  A SINGLE_LINE_COMM token, stamped with the line
  * the comment starts on, is pushed only when keepComments is set.
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when the stream was positioned on a comment opener
  */
 public bool read_slcomment  (TextStream stream, TokenArray tokens)
 {
     string found;

     if (stream.look() == '/' && stream.look(1) == '/') {
         found = stream.next(2);
     } else if (stream.look() == '<' && stream.look(1) == '!'
             && stream.look(2) == '-' && stream.look(3) == '-') {
         found = stream.next(4);
     } else {
         return false;
     }

     var startLine = this.line;

     // isNewlineC() takes the character returned by look() directly, as
     // read_newline() and read_space() already do; the previous (string)
     // cast reinterpreted that character value as a pointer.
     while (!stream.lookEOF() && !Lang.isNewlineC(stream.look())) {
         found += stream.next();
     }

     // eat the EOL, unless the comment ran to the end of the input
     if (!stream.lookEOF()) {
         found += stream.next();
     }

     if (this.keepComments) {
         tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", startLine));
     }
     this.line++;
     return true;
 }
 425
 /**
  * Read a double-quoted string literal.
  *
  * A backslash followed by a newline character is a line continuation:
  * the backslash and every newline character after it are consumed and
  * represented in the token text as a backslash plus a single "\n".  Any
  * other backslash is copied together with the item that follows it.
  *
  * The STRN token is stamped with the line the string starts on.  Line
  * breaks swallowed by a continuation advance the line counter, so tokens
  * after the string keep correct line numbers.
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when a terminated string was read; false when the stream
  *         is not on a double quote, or when EOF is reached before the
  *         closing quote (the text consumed so far is then discarded)
  */
 public bool read_dbquote  (TextStream stream, TokenArray tokens)
 {
     if (stream.look() != '"') {
         return false;
     }

     var startLine = this.line;
     var str = stream.next();

     // find terminator
     while (!stream.lookEOF()) {
         if (stream.look() == '\\') {
             // isNewlineC() takes the character returned by look() directly;
             // the previous (string) cast reinterpreted it as a pointer.
             if (Lang.isNewlineC(stream.look(1))) {
                 // drop the backslash, then swallow the line break(s)
                 stream.next();
                 while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
                     // count "\n" only, matching read_mlcomment()
                     if (stream.next() == "\n") {
                         this.line++;
                     }
                 }
                 str += "\\\n";
             } else {
                 str += stream.next(2);
             }
             continue;
         }

         if (stream.look() == '"') {
             str += stream.next();
             tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", startLine));
             return true;
         }

         str += stream.next();
     }
     return false;
 }
 461
 /**
  * Read a single-quoted string literal.
  *
  * A backslash is copied together with the item that follows it, so an
  * escaped quote does not end the string.
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when a terminated string was read; false when the stream
  *         is not on a single quote, or when EOF is reached before the
  *         closing quote
  */
 public bool read_snquote  (TextStream stream, TokenArray tokens)
 {
     if (stream.look() != '\'') {
         return false;
     }

     // opening quote
     var text = stream.next();

     for (;;) {
         if (stream.lookEOF()) {
             break;
         }

         var ch = stream.look();

         if (ch == '\\') {
             // escape sequence
             text += stream.next(2);
             continue;
         }

         text += stream.next();

         if (ch == '\'') {
             // that was the terminator
             tokens.push(new Token(text, "STRN", "SINGLE_QUOTE", this.line));
             return true;
         }
     }
     return false;
 }
 488
 489
 /**
  * Read a numeric literal.
  *
  * Input starting with "0x" is handed to read_hex().  Otherwise characters
  * are consumed for as long as Lang.isNumber() accepts the text collected
  * so far plus the next character.  A result that starts with "0" followed
  * by a digit from 0 to 7 is named OCTAL; everything else is DECIMAL.
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when a number was consumed (for "0x" input: whatever
  *         read_hex() returns)
  */
 public bool read_numb  (TextStream stream, TokenArray tokens)
 {
     if (stream.look() == '0' && stream.look(1) == 'x') {
         return this.read_hex(stream, tokens);
     }

     var found = "";

     // look() yields a character (it is compared against char literals
     // throughout this class).  A (string) cast would reinterpret that value
     // as a pointer, so build a real one-character string instead.
     while (!stream.lookEOF() && Lang.isNumber(found + stream.look().to_string())) {
         found += stream.next();
     }

     if (found == "") {
         return false;
     }

     // Same test as the pattern ^0[0-7], without compiling a regex per number.
     var isOctal = found.length > 1
         && found[0] == '0'
         && found[1] >= '0'
         && found[1] <= '7';

     tokens.push(new Token(found, "NUMB", isOctal ? "OCTAL" : "DECIMAL", this.line));
     return true;
 }
 516
 /**
  * Read a hexadecimal literal.  read_numb() calls this once it has seen
  * "0x" at the current position.
  *
  * The two-character prefix is consumed first.  More characters are then
  * added until the collected text satisfies Lang.isHexDec() and adding the
  * next character would no longer satisfy it.  A literal that is still
  * valid when the input ends is also accepted; previously it was consumed
  * and then dropped.
  *
  * @param stream the text being read, positioned on the "0x" prefix
  * @param tokens array the new token is appended to
  * @return true when a HEX_DEC token was pushed; false when EOF is reached
  *         without the collected text ever satisfying Lang.isHexDec()
  */
 public bool read_hex  (TextStream stream, TokenArray tokens)
 {
     var found = stream.next(2);

     while (!stream.lookEOF()) {
         // look() yields a character; a (string) cast would reinterpret that
         // value as a pointer, so build a real one-character string instead.
         if (Lang.isHexDec(found) && !Lang.isHexDec(found + stream.look().to_string())) { // done
             tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
             return true;
         }

         found += stream.next();
     }

     // The input ended right after the literal: keep it.
     if (Lang.isHexDec(found)) {
         tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
         return true;
     }
     return false;
 }
 535
 /**
  * Read a regular-expression literal.
  *
  * A "/" is only treated as the start of a regex when there is no previous
  * non-whitespace, non-comment token, or when is() on that token rejects
  * all of NUMB, NAME, RIGHT_PAREN and RIGHT_BRACKET.
  *
  * The body runs up to the next "/" that is not preceded by a backslash.
  * Any "g", "m" or "i" characters directly after the closing slash are
  * included in the token text.
  *
  * @param stream the text being read
  * @param tokens array the new token is appended to
  * @return true when a terminated regex was read; false when the stream
  *         is not on a "/", when the previous token rules a regex out, or
  *         when EOF is reached before the closing slash (the text consumed
  *         so far is then discarded)
  */
 public bool read_regx (TextStream stream, TokenArray tokens)
 {
     if (stream.look() != '/') {
         return false;
     }

     var last = tokens.lastSym();

     // stuff that can not appear before a regex..
     var ruledOut = last != null
         && (last.is("NUMB")
             || last.is("NAME")
             || last.is("RIGHT_PAREN")
             || last.is("RIGHT_BRACKET"));
     if (ruledOut) {
         return false;
     }

     var regex = stream.next();

     while (!stream.lookEOF()) {
         if (stream.look() == '\\') { // escape sequence
             regex += stream.next(2);
             continue;
         }

         if (stream.look() == '/') {
             regex += stream.next();

             // Trailing flags.  The character from look() is compared
             // directly; the previous code cast it to (string), which
             // reinterprets the character value as a pointer.
             while (!stream.lookEOF()) {
                 var flag = stream.look();
                 if (flag != 'g' && flag != 'm' && flag != 'i') {
                     break;
                 }
                 regex += stream.next();
             }

             tokens.push(new Token(regex, "REGX", "REGX", this.line));
             return true;
         }

         regex += stream.next();
     }

     // error: unterminated regex
     return false;
 }
 581     }
 582 }