git.roojs.org Git - gnome.introspection-doc-generator/blob - JSDOC/TokenReader.vala

   1 //<script type="text/javascript">
   2
   3
   4 // test code
   5
   6 void main() {
   7          var tr = new  JSDOC.TokenReader();
   8          tr.keepDocs =true;
   9         tr.keepWhite = true;
  10         tr.keepComments = true;
  11         tr.sepIdents = true;
  12         tr.collapseWhite = false;
  13         tr.filename = "test";
  14         string str;
  15         FileUtils.get_contents("/home/alan/gitlive/gnome.introspection-doc-generator/JSDOC/Walker2.js", out  str);
  16
  17         var toks = tr.tokenize(new JSDOC.TextStream(str)); // dont merge xxx + . + yyyy etc.
  18     toks.dump();
  19 }
  20
  21 //const Token   = imports.Token.Token;
  22 //const Lang    = imports.Lang.Lang;
  23
  24 /**
  25         @class Search a {@link JSDOC.TextStream} for language tokens.
  26 */
  27
  28
  29
  30
  31 namespace JSDOC {
  32
  33     public class TokenArray: Object {
  34
  35         public Gee.ArrayList<Token> tokens;
  36         public int length {
  37             get { return this.tokens.size; }
  38         }
  39
  40         public TokenArray()
  41         {
  42             this.tokens = new Gee.ArrayList<Token>();
  43         }
  44
  45         public Token? last() {
  46             if (this.tokens.size > 0) {
  47                 return this.tokens.get(this.tokens.size-1);
  48             }
  49             return null;
  50         }
  51         public Token? lastSym () {
  52             for (var i = this.tokens.size-1; i >= 0; i--) {
  53                 if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM")))  {
  54                     return this.tokens.get(i);
  55                 }
  56             }
  57             return null;
  58         }
  59         public void push (Token t) {
  60             this.tokens.add(t);
  61         }
  62         public Token? pop ()
  63         {
  64             if (this.tokens.size > 0) {
  65                 return this.tokens.remove_at(this.tokens.size-1);
  66             }
  67             return null;
  68         }
  69
  70             public new Token get(int i) {
  71             return this.tokens.get(i);
  72         }
  73         public void dump()
  74         {
  75                 foreach(var token in this.tokens) {
  76                         print(token.asString());
  77                 }
  78         }
  79
  80     }
  81
    /**
     * Errors reported while tokenizing.
     *
     * ArgumentError is thrown by TokenReader.read_punc when a ',' is the last
     * symbol before a closing '}' or ']' and ignoreBadGrammer is false.
     */
    public errordomain TokenReader_Error {
            ArgumentError
    }
  85
  86
  87     public class TokenReader : Object
  88     {
  89
  90
  91
        /*
         *
         * I wonder if this will accept the prop: value, prop2: value constructor if we do not define one...
         */
  96
        /** @cfg {Boolean} collapseWhite replace each run of spaces with a single " " and each run of newlines with a single "\n" **/
        public bool collapseWhite = false; // only reduces white space...
        /** @cfg {Boolean} keepDocs keep JSDOC comments **/
        public bool keepDocs = true;
        /** @cfg {Boolean} keepWhite keep white space tokens **/
        public bool keepWhite = false;
        /** @cfg {Boolean} keepComments keep ordinary (non-JSDOC) comments **/
        public bool keepComments = false;
        /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
        public bool sepIdents = false;
        /** @cfg {String} filename name of file being parsed; only used in diagnostic messages. **/
        public string filename = "";
        /** @cfg {Boolean} ignoreBadGrammer print a message instead of throwing when we find stuff that might break compression **/
        public bool ignoreBadGrammer = false;


        // Current line number; tokenize() resets it to 1 and the readers
        // advance it as they consume line breaks.
        int line = 0;
 114
 115         /**
 116          * tokenize a stream
 117          * @return {Array} of tokens
 118          *
 119          * ts = new TextStream(File.read(str));
 120          * tr = TokenReader({ keepComments : true, keepWhite : true });
 121          * tr.tokenize(ts)
 122          *
 123          */
 124         public TokenArray tokenize(TextStream stream)
 125         {
 126             this.line =1;
 127             var tokens = new TokenArray();
 128
 129
 130             while (!stream.lookEOF()) {
 131
 132
 133                 if (this.read_mlcomment(stream, tokens)) continue;
 134                 if (this.read_slcomment(stream, tokens)) continue;
 135                 if (this.read_dbquote(stream, tokens))   continue;
 136                 if (this.read_snquote(stream, tokens))   continue;
 137                 if (this.read_regx(stream, tokens))      continue;
 138                 if (this.read_numb(stream, tokens))      continue;
 139                 if (this.read_punc(stream, tokens))      continue;
 140                 if (this.read_newline(stream, tokens))   continue;
 141                 if (this.read_space(stream, tokens))     continue;
 142                 if (this.read_word(stream, tokens))      continue;
 143
 144                 // if execution reaches here then an error has happened
 145                 tokens.push(
 146                         new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
 147                 );
 148             }
 149
 150
 151
 152             return tokens;
 153         }
 154
 155         /**
 156          * findPuncToken - find the id of a token (previous to current)
 157          * need to back check syntax..
 158          *
 159          * @arg {Array} tokens the array of tokens.
 160          * @arg {String} token data (eg. '(')
 161          * @arg {Number} offset where to start reading from
 162          * @return {Number} position of token
 163          */
 164         public int findPuncToken(TokenArray tokens, string data, int n)
 165         {
 166             n = n > 0 ? n :  tokens.length -1;
 167             var stack = 0;
 168             while (n > -1) {
 169
 170                 if (stack < 1 && tokens.get(n).data == data) {
 171                     return n;
 172                 }
 173
 174                 if (tokens.get(n).data  == ")" || tokens.get(n).data  == "}") {
 175                     stack++;
 176                     n--;
 177                     continue;
 178                 }
 179                 if (stack > 0 && (tokens.get(n).data  == "{" || tokens.get(n).data  == "(")) {
 180                     stack--;
 181                     n--;
 182                     continue;
 183                 }
 184
 185
 186                 n--;
 187             }
 188             return -1;
 189         }
 190         /**
 191          * lastSym - find the last token symbol
 192          * need to back check syntax..
 193          *
 194          * @arg {Array} tokens the array of tokens.
 195          * @arg {Number} offset where to start..
 196          * @return {Token} the token
 197          */
 198         public Token? lastSym(TokenArray tokens, int n)
 199         {
 200             for (var i = n-1; i >= 0; i--) {
 201                 if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
 202                     return tokens.get(i);
 203                 }
 204             }
 205             return null;
 206         }
 207
 208
 209
 210         /**
 211             @returns {Boolean} Was the token found?
 212          */
 213         public bool read_word (TextStream stream, TokenArray tokens)
 214         {
 215             string found = "";
 216             while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
 217                 found += stream.next();
 218             }
 219
 220             if (found == "") {
 221                 return false;
 222             }
 223
 224             var name = Lang.keyword(found);
 225             if (name != null) {
 226
 227                 // look for "()return" ?? why ???
 228                 var ls = tokens.lastSym();
 229                 if (found == "return" && ls != null && ls.data == ")") {
 230                     //Seed.print('@' + tokens.length);
 231                     var n = this.findPuncToken(tokens, ")", 0);
 232                     //Seed.print(')@' + n);
 233                     n = this.findPuncToken(tokens, "(", n-1);
 234                     //Seed.print('(@' + n);
 235
 236                     //var lt = this.lastSym(tokens, n);
 237                     /*
 238                     //print(JSON.stringify(lt));
 239                     if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
 240                         if (!this.ignoreBadGrammer) {
 241                             throw new TokenReader_Error.ArgumentError(
 242                                 this.filename + ":" + this.line + " Error - return found after )"
 243                             );
 244                         }
 245                     }
 246
 247                     */
 248
 249                 }
 250
 251                 tokens.push(new Token(found, "KEYW", name, this.line));
 252                 return true;
 253             }
 254
 255             if (!this.sepIdents || found.index_of(".") < 0 ) {
 256                 tokens.push(new Token(found, "NAME", "NAME", this.line));
 257                 return true;
 258             }
 259             var n = found.split(".");
 260             var p = false;
 261             foreach (unowned string nm in n) {
 262                 if (p) {
 263                     tokens.push(new Token(".", "PUNC", "DOT", this.line));
 264                 }
 265                 p=true;
 266                 tokens.push(new Token(nm, "NAME", "NAME", this.line));
 267             }
 268             return true;
 269
 270
 271         }
 272
 273         /**
 274             @returns {Boolean} Was the token found?
 275          */
 276         public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
 277         {
 278             string found = "";
 279
 280             while (!stream.lookEOF() && Lang.punc(found + stream.look().to_string()).length > 0) {
 281                 found += stream.next();
 282             }
 283
 284
 285             if (found == "") {
 286                 return false;
 287             }
 288
 289             var ls = tokens.lastSym();
 290
 291             if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
 292                 //print("Error - comma found before " + found);
 293                 //print(JSON.stringify(tokens.lastSym(), null,4));
 294                 if (this.ignoreBadGrammer) {
 295                     print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
 296                 } else {
 297                     throw new TokenReader_Error.ArgumentError(
 298                                 this.filename + ":" + this.line.to_string() + "  comma found before " + found
 299
 300                     );
 301
 302                 }
 303             }
 304
 305             tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
 306             return true;
 307
 308         }
 309
 310         /**
 311             @returns {Boolean} Was the token found?
 312          */
 313         public bool read_space  (TextStream stream, TokenArray tokens)
 314         {
 315             var found = "";
 316
 317             while (!stream.lookEOF() && Lang.isSpaceC(  stream.look()) && !Lang.isNewlineC(stream.look())) {
 318                 found += stream.next();
 319             }
 320
 321             if (found == "") {
 322                 return false;
 323             }
 324             //print("WHITE = " + JSON.stringify(found));
 325
 326
 327             if (this.collapseWhite) {
 328                 found = " "; // this might work better if it was a '\n' ???
 329             }
 330             if (this.keepWhite) {
 331                 tokens.push(new Token(found, "WHIT", "SPACE", this.line));
 332             }
 333             return true;
 334
 335         }
 336
 337         /**
 338             @returns {Boolean} Was the token found?
 339          */
 340         public bool read_newline  (TextStream stream, TokenArray tokens)
 341         {
 342             var found = "";
 343             var line = this.line;
 344             while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
 345                 this.line++;
 346                 found += stream.next();
 347             }
 348
 349             if (found == "") {
 350                 return false;
 351             }
 352
 353             // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
 354             // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
 355
 356
 357             //this.line++;
 358             if (this.collapseWhite) {
 359                 found = "\n"; // reduces multiple line breaks into a single one...
 360             }
 361
 362             if (this.keepWhite) {
 363                 var last = tokens.pop();
 364                 if (last != null && last.name != "WHIT") {
 365                     tokens.push(last);
 366                 }
 367                 // replaces last new line...
 368                 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
 369             }
 370             return true;
 371         }
 372
 373         /**
 374             @returns {Boolean} Was the token found?
 375          */
 376         public bool read_mlcomment  (TextStream stream, TokenArray tokens)
 377         {
 378             if (stream.look() != '/') {
 379                 return false;
 380             }
 381             if (stream.look(1) != '*') {
 382                 return false;
 383             }
 384             var found = stream.next(2);
 385             string  c = "";
 386             var line = this.line;
 387             while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
 388                 c = stream.next();
 389                 if (c == "\n") {
 390                     this.line++;
 391                 }
 392                 found += c;
 393             }
 394
 395             // to start doclet we allow /** or /*** but not /**/ or /****
 396             //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
 397             if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
 398                 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
 399             } else if (this.keepComments) {
 400                 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
 401             }
 402             return true;
 403
 404         }
 405
 406         /**
 407             @returns {Boolean} Was the token found?
 408          */
 409          public bool read_slcomment  (TextStream stream, TokenArray tokens)
 410          {
 411             var found = "";
 412             if (
 413                 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
 414                 ||
 415                 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
 416             ) {
 417                 var line = this.line;
 418                 while (!stream.lookEOF()) {
 419                                         print(stream.look().to_string());
 420                         if ( Lang.isNewline(stream.look().to_string())) {
 421                                 break;
 422                         }
 423                     found += stream.next();
 424                 }
 425                 if (!stream.lookEOF()) { // lookinng for end  of line... if we got it, then do not eat the character..
 426                     found += stream.next();
 427                 }
 428                 if (this.keepComments) {
 429                     tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
 430                 }
 431                 this.line++;
 432                 return true;
 433             }
 434             return false;
 435         }
 436
 437         /**
 438             @returns {Boolean} Was the token found?
 439          */
 440         public bool read_dbquote  (TextStream stream, TokenArray tokens)
 441         {
 442             if (stream.look() != '"') {
 443                 return false;
 444             }
 445                 // find terminator
 446             var str = stream.next();
 447
 448             while (!stream.lookEOF()) {
 449                 if (stream.look() == '\\') {
 450                     if (Lang.isNewline(stream.look(1).to_string())) {
 451                         do {
 452                             stream.next();
 453                         } while (!stream.lookEOF() && Lang.isNewline(stream.look().to_string()));
 454                         str += "\\\n";
 455                     }
 456                     else {
 457                         str += stream.next(2);
 458                     }
 459                     continue;
 460                 }
 461                 if (stream.look() == '"') {
 462                     str += stream.next();
 463                     tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
 464                     return true;
 465                 }
 466
 467                 str += stream.next();
 468
 469             }
 470             return false;
 471         }
 472
 473         /**
 474             @returns {Boolean} Was the token found?
 475          */
 476         public bool read_snquote  (TextStream stream, TokenArray tokens)
 477         {
 478             if (stream.look() != '\'') {
 479                 return false;
 480             }
 481             // find terminator
 482             var str = stream.next();
 483
 484             while (!stream.lookEOF()) {
 485                 if (stream.look() == '\\') { // escape sequence
 486                     str += stream.next(2);
 487                     continue;
 488                 }
 489                 if (stream.look() == '\'') {
 490                     str += stream.next();
 491                     tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
 492                     return true;
 493                 }
 494                 str += stream.next();
 495
 496             }
 497             return false;
 498         }
 499
 500
 501         /**
 502             @returns {Boolean} Was the token found?
 503          */
 504         public bool read_numb  (TextStream stream, TokenArray tokens)
 505         {
 506             if (stream.look() == '0' && stream.look(1) == 'x') {
 507                 return this.read_hex(stream, tokens);
 508             }
 509
 510             var found = "";
 511
 512             while (!stream.lookEOF() && Lang.isNumber(found+stream.look().to_string())){
 513                 found += stream.next();
 514             }
 515
 516             if (found == "") {
 517                 return false;
 518             }
 519             if (GLib.Regex.match_simple("^0[0-7]", found)) {
 520                 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
 521                 return true;
 522             }
 523             tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
 524             return true;
 525
 526         }
 527
 528         /**
 529             @returns {Boolean} Was the token found?
 530          */
 531         public bool read_hex  (TextStream stream, TokenArray tokens)
 532         {
 533             var found = stream.next(2);
 534
 535             while (!stream.lookEOF()) {
 536                 if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.look().to_string())) { // done
 537                     tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
 538                     return true;
 539                 }
 540
 541                 found += stream.next();
 542
 543             }
 544             return false;
 545         }
 546
 547         /**
 548             @returns {Boolean} Was the token found?
 549          */
 550         public bool read_regx (TextStream stream, TokenArray tokens)
 551         {
 552
 553             if (stream.look() != '/') {
 554                 return false;
 555             }
 556             var  last = tokens.lastSym();
 557             if (
 558                 (last == null)
 559                 ||
 560                 (
 561                        !last.is("NUMB")   // stuff that can not appear before a regex..
 562                     && !last.is("NAME")
 563                     && !last.is("RIGHT_PAREN")
 564                     && !last.is("RIGHT_BRACKET")
 565                 )
 566             )  {
 567                 var regex = stream.next();
 568
 569                 while (!stream.lookEOF()) {
 570                     if (stream.look() == '\\') { // escape sequence
 571                         regex += stream.next(2);
 572                         continue;
 573                     }
 574                     if (stream.look() == '/') {
 575                         regex += stream.next();
 576
 577                         while (GLib.Regex.match_simple("[gmi]", stream.look().to_string())) {
 578                             regex += stream.next();
 579                         }
 580
 581                         tokens.push(new Token(regex, "REGX", "REGX", this.line));
 582                         return true;
 583                     }
 584
 585                     regex += stream.next();
 586
 587                 }
 588                 // error: unterminated regex
 589             }
 590             return false;
 591         }
 592     }
 593 }