JSDOC/TokenReader.vala
[gnome.introspection-doc-generator] / JSDOC / TokenReader.vala
1 //<script type="text/javascript">
2
3  
4 // test code
5
/**
 * Smoke test: tokenize a small inline JavaScript snippet with every
 * "keep" option switched on and report how many tokens were produced.
 */
void main() {
    var tr = new JSDOC.TokenReader();
    tr.keepDocs = true;
    tr.keepWhite = true;
    tr.keepComments = true;
    tr.sepIdents = true;
    tr.collapseWhite = false;
    tr.filename = "test";

    // we can load translation map here...

    // Inline sample input; the previous version referenced an undefined `str`.
    var str = "/** doc */\nvar a = b.c + 1; // done\n";

    // sepIdents is on, so xxx.yyyy is not merged into a single NAME token.
    var toks = tr.tokenize(new JSDOC.TextStream(str));
    print("%d tokens\n", toks.length);
}
22
23 //const Token   = imports.Token.Token;
24 //const Lang    = imports.Lang.Lang;
25
26 /**
27         @class Search a {@link JSDOC.TextStream} for language tokens.
28 */
29
30
31
32
33 namespace JSDOC {
34
35     public class TokenArray: Object {
36         
37         public Gee.ArrayList<Token> tokens;
38         public int length {
39             get { return this.tokens.size; }
40         }
41         
42         public TokenArray()
43         {
44             this.tokens = new Gee.ArrayList<Token>();
45         }
46         
47         public Token? last() {
48             if (this.tokens.size > 0) {
49                 return this.tokens.get(this.tokens.size-1);
50             }
51             return null;
52         }
53         public Token? lastSym () {
54             for (var i = this.tokens.size-1; i >= 0; i--) {
55                 if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM")))  {
56                     return this.tokens.get(i);
57                 }
58             }
59             return null;
60         }
61         public void push (Token t) {
62             this.tokens.add(t);
63         }
64         public Token? pop ()
65         {
66             if (this.tokens.size > 0) {
67                 return this.tokens.remove_at(this.tokens.size-1);
68             }
69             return null;
70         }
71         
72             public new Token get(int i) {
73             return this.tokens.get(i);
74         }
75     }
76
77     public errordomain TokenReader_Error {
78             ArgumentError
79     }
80     
81
82     public class TokenReader : Object
83     {
84         
85         
86         
        /*
         * I wonder if this will accept the "prop: value, prop2: value" constructor
         * syntax if we do not define one...
         */
91         
92         /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
93         public bool collapseWhite = false; // only reduces white space...
94         /** @cfg {Boolean} keepDocs keep JSDOC comments **/
95         public bool keepDocs = true;
96         /** @cfg {Boolean} keepWhite keep White space **/
97         public bool keepWhite = false;
98         /** @cfg {Boolean} keepComments  keep all comments **/
99         public bool keepComments = false;
100         /** @cfg {Boolean} sepIdents seperate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
101         public bool sepIdents = false;
102         /** @cfg {String} filename name of file being parsed. **/
103         public string filename = "";
104         /** @config {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
105         public bool ignoreBadGrammer = false;
106         
107         
108         int line = 0;
109         
110         /**
111          * tokenize a stream
112          * @return {Array} of tokens
113          * 
114          * ts = new TextStream(File.read(str));
115          * tr = TokenReader({ keepComments : true, keepWhite : true });
116          * tr.tokenize(ts)
117          * 
118          */
119         public TokenArray tokenize(TextStream stream)
120         {
121             this.line =1;
122             var tokens = new TokenArray();
123            
124          
125             while (!stream.lookEOF()) {
126                 
127
128                 if (this.read_mlcomment(stream, tokens)) continue;
129                 if (this.read_slcomment(stream, tokens)) continue;
130                 if (this.read_dbquote(stream, tokens))   continue;
131                 if (this.read_snquote(stream, tokens))   continue;
132                 if (this.read_regx(stream, tokens))      continue;
133                 if (this.read_numb(stream, tokens))      continue;
134                 if (this.read_punc(stream, tokens))      continue;
135                 if (this.read_newline(stream, tokens))   continue;
136                 if (this.read_space(stream, tokens))     continue;
137                 if (this.read_word(stream, tokens))      continue;
138                 
139                 // if execution reaches here then an error has happened
140                 tokens.push(
141                         new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
142                 );
143             }
144             
145             
146             
147             return tokens;
148         }
149
150         /**
151          * findPuncToken - find the id of a token (previous to current)
152          * need to back check syntax..
153          * 
154          * @arg {Array} tokens the array of tokens.
155          * @arg {String} token data (eg. '(')
156          * @arg {Number} offset where to start reading from
157          * @return {Number} position of token
158          */
159         public int findPuncToken(TokenArray tokens, string data, int n)
160         {
161             n = n > 0 ? n :  tokens.length -1;
162             var stack = 0;
163             while (n > -1) {
164                 
165                 if (stack < 1 && tokens.get(n).data == data) {
166                     return n;
167                 }
168                 
169                 if (tokens.get(n).data  == ")" || tokens.get(n).data  == "}") {
170                     stack++;
171                     n--;
172                     continue;
173                 }
174                 if (stack > 0 && (tokens.get(n).data  == "{" || tokens.get(n).data  == "(")) {
175                     stack--;
176                     n--;
177                     continue;
178                 }
179                 
180                 
181                 n--;
182             }
183             return -1;
184         }
185         /**
186          * lastSym - find the last token symbol
187          * need to back check syntax..
188          * 
189          * @arg {Array} tokens the array of tokens.
190          * @arg {Number} offset where to start..
191          * @return {Token} the token
192          */
193         public Token? lastSym(TokenArray tokens, int n)
194         {
195             for (var i = n-1; i >= 0; i--) {
196                 if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
197                     return tokens.get(i);
198                 }
199             }
200             return null;
201         }
202         
203          
204         
205         /**
206             @returns {Boolean} Was the token found?
207          */
208         public bool read_word (TextStream stream, TokenArray tokens)
209         {
210             string found = "";
211             while (!stream.lookEOF() && Lang.isWordChar((string)stream.look())) {
212                 found += stream.next();
213             }
214             
215             if (found == "") {
216                 return false;
217             }
218             
219             var name = Lang.keyword(found);
220             if (name != null) {
221                 
222                 // look for "()return" ?? why ???
223                 var ls = tokens.lastSym();
224                 if (found == "return" && ls != null && ls.data == ")") {
225                     //Seed.print('@' + tokens.length);
226                     var n = this.findPuncToken(tokens, ")", 0);
227                     //Seed.print(')@' + n);
228                     n = this.findPuncToken(tokens, "(", n-1);
229                     //Seed.print('(@' + n);
230                     
231                     //var lt = this.lastSym(tokens, n);
232                     /*
233                     //print(JSON.stringify(lt));
234                     if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
235                         if (!this.ignoreBadGrammer) {
236                             throw new TokenReader_Error.ArgumentError(
237                                 this.filename + ":" + this.line + " Error - return found after )"
238                             );
239                         }
240                     }
241                     
242                     */
243                     
244                 }
245                 
246                 tokens.push(new Token(found, "KEYW", name, this.line));
247                 return true;
248             }
249             
250             if (!this.sepIdents || found.index_of(".") < 0 ) {
251                 tokens.push(new Token(found, "NAME", "NAME", this.line));
252                 return true;
253             }
254             var n = found.split(".");
255             var p = false;
256             foreach (unowned string nm in n) {
257                 if (p) {
258                     tokens.push(new Token(".", "PUNC", "DOT", this.line));
259                 }
260                 p=true;
261                 tokens.push(new Token(nm, "NAME", "NAME", this.line));
262             }
263             return true;
264                 
265
266         }
267
268         /**
269             @returns {Boolean} Was the token found?
270          */
271         public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
272         {
273             string found = "";
274             
275             while (!stream.lookEOF() && Lang.punc(found + (string)stream.look()).length > 0) {
276                 found += stream.next();
277             }
278             
279             
280             if (found == "") {
281                 return false;
282             }
283             
284             var ls = tokens.lastSym();
285             
286             if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
287                 //print("Error - comma found before " + found);
288                 //print(JSON.stringify(tokens.lastSym(), null,4));
289                 if (this.ignoreBadGrammer) {
290                     print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
291                 } else {
292                     throw new TokenReader_Error.ArgumentError(
293                                 this.filename + ":" + this.line.to_string() + "  comma found before " + found
294                   
295                     );
296                      
297                 }
298             }
299             
300             tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
301             return true;
302             
303         } 
304
305         /**
306             @returns {Boolean} Was the token found?
307          */
308         public bool read_space  (TextStream stream, TokenArray tokens)
309         {
310             var found = "";
311             
312             while (!stream.lookEOF() && Lang.isSpaceC(  stream.look()) && !Lang.isNewlineC(stream.look())) {
313                 found += stream.next();
314             }
315             
316             if (found == "") {
317                 return false;
318             }
319             //print("WHITE = " + JSON.stringify(found));
320             
321              
322             if (this.collapseWhite) {
323                 found = " "; // this might work better if it was a '\n' ???
324             }
325             if (this.keepWhite) {
326                 tokens.push(new Token(found, "WHIT", "SPACE", this.line));
327             }
328             return true;
329         
330         }
331
332         /**
333             @returns {Boolean} Was the token found?
334          */
335         public bool read_newline  (TextStream stream, TokenArray tokens)
336         {
337             var found = "";
338             var line = this.line;
339             while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
340                 this.line++;
341                 found += stream.next();
342             }
343             
344             if (found == "") {
345                 return false;
346             }
347             
348             // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
349             // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
350            
351             
352             //this.line++;
353             if (this.collapseWhite) {
354                 found = "\n"; // reduces multiple line breaks into a single one...
355             }
356             
357             if (this.keepWhite) {
358                 var last = tokens.pop();
359                 if (last != null && last.name != "WHIT") {
360                     tokens.push(last);
361                 }
362                 // replaces last new line... 
363                 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
364             }
365             return true;
366         }
367
368         /**
369             @returns {Boolean} Was the token found?
370          */
371         public bool read_mlcomment  (TextStream stream, TokenArray tokens)
372         {
373             if (stream.look() != '/') {
374                 return false;
375             }
376             if (stream.look(1) != '*') {
377                 return false;
378             }
379             var found = stream.next(2);
380             string  c = "";
381             var line = this.line;
382             while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
383                 c = stream.next();
384                 if (c == "\n") {
385                     this.line++;
386                 }
387                 found += c;
388             }
389             
390             // to start doclet we allow /** or /*** but not /**/ or /****
391             //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
392             if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
393                 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
394             } else if (this.keepComments) {
395                 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
396             }
397             return true;
398         
399         } 
400
401         /**
402             @returns {Boolean} Was the token found?
403          */
404          public bool read_slcomment  (TextStream stream, TokenArray tokens)
405          {
406             var found = "";
407             if (
408                 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
409                 || 
410                 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
411             ) {
412                 var line = this.line;
413                 while (!stream.lookEOF() && !Lang.isNewline((string)stream.look())) {
414                     found += stream.next();
415                 }
416                 //if (!stream.lookEOF()) { // what? << eat the EOL?
417                     found += stream.next();
418                 //}
419                 if (this.keepComments) {
420                     tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
421                 }
422                 this.line++;
423                 return true;
424             }
425             return false;
426         }
427
428         /**
429             @returns {Boolean} Was the token found?
430          */
431         public bool read_dbquote  (TextStream stream, TokenArray tokens)
432         {
433             if (stream.look() != '"') {
434                 return false;
435             }
436                 // find terminator
437             var str = stream.next();
438             
439             while (!stream.lookEOF()) {
440                 if (stream.look() == '\\') {
441                     if (Lang.isNewline((string)stream.look(1))) {
442                         do {
443                             stream.next();
444                         } while (!stream.lookEOF() && Lang.isNewline((string)stream.look()));
445                         str += "\\\n";
446                     }
447                     else {
448                         str += stream.next(2);
449                     }
450                     continue;
451                 }
452                 if (stream.look() == '"') {
453                     str += stream.next();
454                     tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
455                     return true;
456                 }
457             
458                 str += stream.next();
459                 
460             }
461             return false;
462         }
463
464         /**
465             @returns {Boolean} Was the token found?
466          */
467         public bool read_snquote  (TextStream stream, TokenArray tokens)
468         {
469             if (stream.look() != '\'') {
470                 return false;
471             }
472             // find terminator
473             var str = stream.next();
474             
475             while (!stream.lookEOF()) {
476                 if (stream.look() == '\\') { // escape sequence
477                     str += stream.next(2);
478                     continue;
479                 }
480                 if (stream.look() == '\'') {
481                     str += stream.next();
482                     tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
483                     return true;
484                 }
485                 str += stream.next();
486                 
487             }
488             return false;
489         }
490         
491
492         /**
493             @returns {Boolean} Was the token found?
494          */
495         public bool read_numb  (TextStream stream, TokenArray tokens)
496         {
497             if (stream.look() == '0' && stream.look(1) == 'x') {
498                 return this.read_hex(stream, tokens);
499             }
500             
501             var found = "";
502             
503             while (!stream.lookEOF() && Lang.isNumber(found+(string)stream.look())){
504                 found += stream.next();
505             }
506             
507             if (found == "") {
508                 return false;
509             }
510             if (GLib.Regex.match_simple("^0[0-7]", found)) {
511                 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
512                 return true;
513             }
514             tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
515             return true;
516         
517         }
518        
519         /**
520             @returns {Boolean} Was the token found?
521          */
522         public bool read_hex  (TextStream stream, TokenArray tokens)
523         {
524             var found = stream.next(2);
525             
526             while (!stream.lookEOF()) {
527                 if (Lang.isHexDec(found) && !Lang.isHexDec(found+(string)stream.look())) { // done
528                     tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
529                     return true;
530                 }
531                 
532                 found += stream.next();
533                
534             }
535             return false;
536         }
537
538         /**
539             @returns {Boolean} Was the token found?
540          */
541         public bool read_regx (TextStream stream, TokenArray tokens)
542         {
543               
544             if (stream.look() != '/') {
545                 return false;
546             }
547             var  last = tokens.lastSym();
548             if (
549                 (last == null)
550                 || 
551                 (
552                        !last.is("NUMB")   // stuff that can not appear before a regex..
553                     && !last.is("NAME")
554                     && !last.is("RIGHT_PAREN")
555                     && !last.is("RIGHT_BRACKET")
556                 )
557             )  {
558                 var regex = stream.next();
559                 
560                 while (!stream.lookEOF()) {
561                     if (stream.look() == '\\') { // escape sequence
562                         regex += stream.next(2);
563                         continue;
564                     }
565                     if (stream.look() == '/') {
566                         regex += stream.next();
567                         
568                         while (GLib.Regex.match_simple("[gmi]", (string)stream.look())) {
569                             regex += stream.next();
570                         }
571                         
572                         tokens.push(new Token(regex, "REGX", "REGX", this.line));
573                         return true;
574                     }
575                      
576                     regex += stream.next();
577                      
578                 }
579                 // error: unterminated regex
580             }
581             return false;
582         }
583     }
584 }