JSDOC/TokenReader.vala
[gnome.introspection-doc-generator] / JSDOC / TokenReader.vala
1 //<script type="text/javascript">
2
3  
4 // test code
5
/**
 * Ad-hoc test driver.
 *
 * Tokenizes one JavaScript file with every "keep" option switched on and
 * prints how many tokens were produced. The input path is the
 * developer-local sample file used by the original test code.
 */
void main() {
    var tr = new JSDOC.TokenReader();
    tr.keepDocs = true;
    tr.keepWhite = true;
    tr.keepComments = true;
    tr.sepIdents = true; // split dotted identifiers: xxx + . + yyyy
    tr.collapseWhite = false;
    tr.filename = "test";

    var path = "/home/alan/gitlive/gnome.introspection-doc-generator/JSDOC/Walker2.js";

    try {
        string str;
        // get_contents() fills an out-parameter and throws FileError on failure.
        FileUtils.get_contents(path, out str);

        var toks = tr.tokenize(new JSDOC.TextStream(str));
        print("%s: %d tokens\n", path, toks.length);
    } catch (Error e) {
        // Boundary of the program: report and fall through to exit.
        printerr("%s\n", e.message);
    }
}
20
21 //const Token   = imports.Token.Token;
22 //const Lang    = imports.Lang.Lang;
23
24 /**
25         @class Search a {@link JSDOC.TextStream} for language tokens.
26 */
27
28
29
30
namespace JSDOC {

    /**
     * Ordered list of {@link Token}s with a few stack-style helpers.
     */
    public class TokenArray : Object {

        /** Backing list of tokens, in the order they were pushed. */
        public Gee.ArrayList<Token> tokens;

        /** Number of tokens currently held. */
        public int length {
            get { return this.tokens.size; }
        }

        public TokenArray()
        {
            this.tokens = new Gee.ArrayList<Token>();
        }

        /**
         * @return the most recently pushed token, or null when the list is empty
         */
        public Token? last() {
            if (this.tokens.size < 1) {
                return null;
            }
            return this.tokens.get(this.tokens.size - 1);
        }

        /**
         * @return the last token that is neither "WHIT" nor "COMM",
         *         or null when there is none
         */
        public Token? lastSym() {
            for (var i = this.tokens.size - 1; i >= 0; i--) {
                var tok = this.tokens.get(i);
                if (tok.is("WHIT") || tok.is("COMM")) {
                    continue;
                }
                return tok;
            }
            return null;
        }

        /** Appends a token to the end of the list. */
        public void push(Token t) {
            this.tokens.add(t);
        }

        /**
         * Removes the last token.
         *
         * @return the removed token, or null when the list is empty
         */
        public Token? pop()
        {
            if (this.tokens.size < 1) {
                return null;
            }
            return this.tokens.remove_at(this.tokens.size - 1);
        }

        /**
         * @param i zero-based index
         * @return the token stored at that index
         */
        public new Token get(int i) {
            return this.tokens.get(i);
        }
    }

    /**
     * Errors raised by {@link TokenReader}.
     */
    public errordomain TokenReader_Error {
        ArgumentError
    }

    /**
     * Searches a {@link TextStream} for language tokens.
     *
     * Configure the public fields, then call {@link tokenize}.
     */
    public class TokenReader : Object
    {
        /** merge multiple whitespace into a single token (only reduces white space) */
        public bool collapseWhite = false;
        /** keep JSDOC comments */
        public bool keepDocs = true;
        /** keep white space */
        public bool keepWhite = false;
        /** keep all comments */
        public bool keepComments = false;
        /** separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) */
        public bool sepIdents = false;
        /** name of file being parsed (used in error messages) */
        public string filename = "";
        /** do not throw errors if we find stuff that might break compression */
        public bool ignoreBadGrammer = false;

        // Current line number; reset to 1 by tokenize().
        int line = 0;

        /**
         * Tokenizes a stream.
         *
         * Each reader is tried in a fixed order; the first one that consumes
         * input wins. When no reader matches, a single unit from the stream
         * is emitted as an UNKNOWN_TOKEN so the loop always makes progress.
         *
         * Example:
         *   var tr = new TokenReader();
         *   tr.keepComments = true;
         *   tr.keepWhite = true;
         *   var tokens = tr.tokenize(new TextStream(source));
         *
         * @param stream the text to tokenize
         * @return the tokens found
         * @throws TokenReader_Error propagated from read_punc() (comma before
         *         a closing brace/bracket while ignoreBadGrammer is false)
         */
        public TokenArray tokenize(TextStream stream) throws TokenReader_Error
        {
            this.line = 1;
            var tokens = new TokenArray();

            while (!stream.lookEOF()) {
                // Order matters: comments and strings before regex, regex
                // before numbers and punctuation, words last.
                if (this.read_mlcomment(stream, tokens)
                    || this.read_slcomment(stream, tokens)
                    || this.read_dbquote(stream, tokens)
                    || this.read_snquote(stream, tokens)
                    || this.read_regx(stream, tokens)
                    || this.read_numb(stream, tokens)
                    || this.read_punc(stream, tokens)
                    || this.read_newline(stream, tokens)
                    || this.read_space(stream, tokens)
                    || this.read_word(stream, tokens)
                ) {
                    continue;
                }

                // if execution reaches here then nothing recognised the input
                tokens.push(
                    new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
                );
            }

            return tokens;
        }

        /**
         * Searches backwards for a token whose data equals the given text,
         * skipping over balanced (...) and {...} groups met on the way.
         *
         * @param tokens the array of tokens
         * @param data token data to look for (eg. "(")
         * @param n index to start from; zero, negative or out-of-range values
         *          mean "start from the last token"
         * @return position of the token, or -1 when not found
         */
        public int findPuncToken(TokenArray tokens, string data, int n)
        {
            if (n <= 0 || n >= tokens.length) {
                n = tokens.length - 1;
            }

            var depth = 0;
            for (; n > -1; n--) {
                var cur = tokens.get(n).data;

                if (depth < 1 && cur == data) {
                    return n;
                }
                if (cur == ")" || cur == "}") {
                    depth++;
                    continue;
                }
                if (depth > 0 && (cur == "{" || cur == "(")) {
                    depth--;
                }
            }
            return -1;
        }

        /**
         * Finds the last symbol token before a position.
         *
         * @param tokens the array of tokens
         * @param n index to search backwards from (exclusive)
         * @return the nearest earlier token that is neither "WHIT" nor
         *         "COMM", or null when there is none
         */
        public Token? lastSym(TokenArray tokens, int n)
        {
            for (var i = n - 1; i >= 0; i--) {
                var tok = tokens.get(i);
                if (tok.is("WHIT") || tok.is("COMM")) {
                    continue;
                }
                return tok;
            }
            return null;
        }

        /**
         * Reads a keyword or identifier.
         *
         * With sepIdents set, a dotted identifier is emitted as alternating
         * NAME and DOT tokens; empty segments (eg. after a trailing dot)
         * produce no NAME token.
         *
         * @return true when a token was consumed
         */
        public bool read_word(TextStream stream, TokenArray tokens)
        {
            string found = "";
            // look() yields a single character; convert it to a string
            // rather than casting, which would reinterpret it as a pointer.
            while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
                found += stream.next();
            }

            if (found == "") {
                return false;
            }

            var name = Lang.keyword(found);
            if (name != null) {
                // NOTE(review): the original looked up the parentheses
                // before a "return" here, but the grammar check that used
                // the result was commented out, so the lookups were dropped.
                tokens.push(new Token(found, "KEYW", name, this.line));
                return true;
            }

            if (!this.sepIdents || !found.contains(".")) {
                tokens.push(new Token(found, "NAME", "NAME", this.line));
                return true;
            }

            var parts = found.split(".");
            var first = true;
            foreach (unowned string part in parts) {
                if (!first) {
                    tokens.push(new Token(".", "PUNC", "DOT", this.line));
                }
                first = false;
                if (part != "") {
                    tokens.push(new Token(part, "NAME", "NAME", this.line));
                }
            }
            return true;
        }

        /**
         * Reads the longest run of characters that Lang.punc() recognises.
         *
         * @return true when a token was consumed
         * @throws TokenReader_Error when a comma directly precedes "}" or "]"
         *         and ignoreBadGrammer is false
         */
        public bool read_punc(TextStream stream, TokenArray tokens) throws TokenReader_Error
        {
            string found = "";

            while (!stream.lookEOF() && Lang.punc(found + stream.look().to_string()).length > 0) {
                found += stream.next();
            }

            if (found == "") {
                return false;
            }

            var ls = tokens.lastSym();
            var closes = (found == "}" || found == "]");

            if (closes && ls != null && ls.data == ",") {
                if (!this.ignoreBadGrammer) {
                    throw new TokenReader_Error.ArgumentError(
                        this.filename + ":" + this.line.to_string() + "  comma found before " + found
                    );
                }
                // print() is printf-style: pass the message as an argument so
                // a '%' in the filename is not read as a format directive.
                print("%s", "\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
            }

            tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
            return true;
        }

        /**
         * Reads a run of non-newline white space.
         *
         * The input is always consumed; a token is only emitted when
         * keepWhite is set.
         *
         * @return true when white space was consumed
         */
        public bool read_space(TextStream stream, TokenArray tokens)
        {
            var found = "";

            while (!stream.lookEOF() && Lang.isSpaceC(stream.look()) && !Lang.isNewlineC(stream.look())) {
                found += stream.next();
            }

            if (found == "") {
                return false;
            }

            if (this.collapseWhite) {
                found = " "; // this might work better if it was a '\n' ???
            }
            if (this.keepWhite) {
                tokens.push(new Token(found, "WHIT", "SPACE", this.line));
            }
            return true;
        }

        /**
         * Reads a run of newline characters, advancing the line counter once
         * per character.
         *
         * @return true when at least one newline was consumed
         */
        public bool read_newline(TextStream stream, TokenArray tokens)
        {
            var found = "";
            var start_line = this.line;

            while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
                this.line++;
                found += stream.next();
            }

            if (found == "") {
                return false;
            }

            // if we found a new line, then we could check if previous
            // character was a ';' - if so we can drop it. Otherwise generally
            // keep it, which should reduce issues with stripping new lines.

            if (this.collapseWhite) {
                found = "\n"; // reduces multiple line breaks into a single one...
            }

            if (this.keepWhite) {
                // NOTE(review): this compares the token *name* with "WHIT",
                // while the tokens built in this file use "WHIT" as their
                // type (names are "SPACE"/"NEWLINE"). As written the popped
                // token is always pushed back; confirm whether a preceding
                // white-space token was meant to be replaced.
                var last = tokens.pop();
                if (last != null && last.name != "WHIT") {
                    tokens.push(last);
                }
                tokens.push(new Token(found, "WHIT", "NEWLINE", start_line));
            }
            return true;
        }

        /**
         * Reads a multi-line comment.
         *
         * A comment starting with "/**" (longer than four characters and not
         * followed by "/") is emitted as JSDOC when keepDocs is set; any
         * other comment is emitted as MULTI_LINE_COMM when keepComments is
         * set. Note the JSDOC token carries the line on which the comment
         * ends, the plain comment the line on which it starts.
         *
         * @return true when a comment was consumed
         */
        public bool read_mlcomment(TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '/' || stream.look(1) != '*') {
                return false;
            }

            var found = stream.next(2);
            var start_line = this.line;

            // Stop once the two characters just consumed are "*/".
            while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
                var c = stream.next();
                if (c == "\n") {
                    this.line++;
                }
                found += c;
            }

            // to start doclet we allow /** or /*** but not /**/ or /****
            var is_doc = found.length > 4 && found.index_of("/**") == 0 && found[3] != '/';

            if (this.keepDocs && is_doc) {
                tokens.push(new Token(found, "COMM", "JSDOC", this.line));
            } else if (this.keepComments) {
                tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", start_line));
            }
            return true;
        }

        /**
         * Reads a single-line comment introduced by "//" or "<!--",
         * including the character that ends the line.
         *
         * @return true when a comment was consumed
         */
        public bool read_slcomment(TextStream stream, TokenArray tokens)
        {
            string found;

            if (stream.look() == '/' && stream.look(1) == '/') {
                found = stream.next(2);
            } else if (stream.look() == '<' && stream.look(1) == '!'
                    && stream.look(2) == '-' && stream.look(3) == '-') {
                found = stream.next(4);
            } else {
                return false;
            }

            var start_line = this.line;

            while (!stream.lookEOF() && !Lang.isNewline(stream.look().to_string())) {
                found += stream.next();
            }

            // Eat the end-of-line character, unless the comment ran to EOF.
            if (!stream.lookEOF()) {
                found += stream.next();
                this.line++;
            }

            if (this.keepComments) {
                tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", start_line));
            }
            return true;
        }

        /**
         * Reads a double-quoted string.
         *
         * A backslash followed by one or more newlines is stored as a single
         * backslash-newline pair. The token carries the line on which the
         * string starts. An unterminated string consumes the rest of the
         * stream and produces no token.
         *
         * @return true when a complete string was consumed
         */
        public bool read_dbquote(TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '"') {
                return false;
            }

            var start_line = this.line;
            var str = stream.next();

            while (!stream.lookEOF()) {
                if (stream.look() == '\\') {
                    if (Lang.isNewline(stream.look(1).to_string())) {
                        stream.next(); // the backslash
                        while (!stream.lookEOF() && Lang.isNewline(stream.look().to_string())) {
                            // keep the line counter in step with the input
                            if (stream.next() == "\n") {
                                this.line++;
                            }
                        }
                        str += "\\\n";
                    } else {
                        str += stream.next(2);
                    }
                    continue;
                }

                if (stream.look() == '"') {
                    str += stream.next();
                    tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", start_line));
                    return true;
                }

                str += stream.next();
            }
            return false;
        }

        /**
         * Reads a single-quoted string.
         *
         * The token carries the line on which the string starts. An
         * unterminated string consumes the rest of the stream and produces
         * no token.
         *
         * @return true when a complete string was consumed
         */
        public bool read_snquote(TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '\'') {
                return false;
            }

            var start_line = this.line;
            var str = stream.next();

            while (!stream.lookEOF()) {
                if (stream.look() == '\\') { // escape sequence
                    var esc = stream.next(2);
                    if (esc == "\\\n") {
                        // escaped line break: the source moved to a new line
                        this.line++;
                    }
                    str += esc;
                    continue;
                }

                if (stream.look() == '\'') {
                    str += stream.next();
                    tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", start_line));
                    return true;
                }

                str += stream.next();
            }
            return false;
        }

        /**
         * Reads a numeric literal: hexadecimal (delegated to read_hex),
         * octal (leading 0 followed by an octal digit) or decimal.
         *
         * @return true when a number was consumed
         */
        public bool read_numb(TextStream stream, TokenArray tokens)
        {
            if (stream.look() == '0' && stream.look(1) == 'x') {
                return this.read_hex(stream, tokens);
            }

            var found = "";

            while (!stream.lookEOF() && Lang.isNumber(found + stream.look().to_string())) {
                found += stream.next();
            }

            if (found == "") {
                return false;
            }

            var kind = GLib.Regex.match_simple("^0[0-7]", found) ? "OCTAL" : "DECIMAL";
            tokens.push(new Token(found, "NUMB", kind, this.line));
            return true;
        }

        /**
         * Reads a hexadecimal literal; the caller has already seen "0x".
         *
         * The literal is emitted even when it ends exactly at the end of the
         * stream. If what follows "0x" never forms a valid literal, the
         * consumed text is emitted as an UNKNOWN_TOKEN instead of swallowing
         * the remainder of the stream.
         *
         * @return true (input is always consumed)
         */
        public bool read_hex(TextStream stream, TokenArray tokens)
        {
            var found = stream.next(2);

            // Extend for as long as the next character keeps the text valid.
            while (!stream.lookEOF() && Lang.isHexDec(found + stream.look().to_string())) {
                found += stream.next();
            }

            if (Lang.isHexDec(found)) {
                tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
            } else {
                tokens.push(new Token(found, "TOKN", "UNKNOWN_TOKEN", this.line));
            }
            return true;
        }

        /**
         * Reads a regular-expression literal with optional g/m/i flags.
         *
         * A "/" is only treated as the start of a regex when the previous
         * symbol is not a number, a name, a closing parenthesis or a closing
         * bracket. An unterminated regex consumes the rest of the stream and
         * produces no token.
         *
         * @return true when a complete regex was consumed
         */
        public bool read_regx(TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '/') {
                return false;
            }

            var last = tokens.lastSym();

            // stuff that can not appear before a regex..
            if (last != null && (
                   last.is("NUMB")
                || last.is("NAME")
                || last.is("RIGHT_PAREN")
                || last.is("RIGHT_BRACKET")
            )) {
                return false;
            }

            var regex = stream.next();

            while (!stream.lookEOF()) {
                if (stream.look() == '\\') { // escape sequence
                    regex += stream.next(2);
                    continue;
                }

                if (stream.look() == '/') {
                    regex += stream.next();

                    // trailing flags; stop at the end of the stream
                    while (!stream.lookEOF()) {
                        var flag = stream.look();
                        if (flag != 'g' && flag != 'm' && flag != 'i') {
                            break;
                        }
                        regex += stream.next();
                    }

                    tokens.push(new Token(regex, "REGX", "REGX", this.line));
                    return true;
                }

                regex += stream.next();
            }

            // error: unterminated regex
            return false;
        }
    }
}