JSDOC/TokenReader.vala
[gnome.introspection-doc-generator] / JSDOC / TokenReader.vala
1 //<script type="text/javascript">
2
3  
4 // test code
5
// Ad-hoc test harness: tokenize a sample JavaScript file and dump the tokens.
void main() {
    var reader = new JSDOC.TokenReader();
    reader.keepDocs = true;
    reader.keepWhite = true;
    reader.keepComments = true;
    reader.sepIdents = true;
    reader.collapseWhite = false;
    reader.filename = "test";

    string contents;
    FileUtils.get_contents("/home/alan/gitlive/gnome.introspection-doc-generator/JSDOC/Walker2.js", out  contents);

    // dont merge xxx + . + yyyy etc.
    var result = reader.tokenize(new JSDOC.TextStream(contents));
    result.dump();
}
20
21 //const Token   = imports.Token.Token;
22 //const Lang    = imports.Lang.Lang;
23
24 /**
25         @class Search a {@link JSDOC.TextStream} for language tokens.
26 */
27
28
29
30
31 namespace JSDOC {
32
33     public class TokenArray: Object {
34         
35         public Gee.ArrayList<Token> tokens;
36         public int length {
37             get { return this.tokens.size; }
38         }
39         
40         public TokenArray()
41         {
42             this.tokens = new Gee.ArrayList<Token>();
43         }
44         
45         public Token? last() {
46             if (this.tokens.size > 0) {
47                 return this.tokens.get(this.tokens.size-1);
48             }
49             return null;
50         }
51         public Token? lastSym () {
52             for (var i = this.tokens.size-1; i >= 0; i--) {
53                 if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM")))  {
54                     return this.tokens.get(i);
55                 }
56             }
57             return null;
58         }
59         public void push (Token t) {
60             this.tokens.add(t);
61         }
62         public Token? pop ()
63         {
64             if (this.tokens.size > 0) {
65                 return this.tokens.remove_at(this.tokens.size-1);
66             }
67             return null;
68         }
69         
70             public new Token get(int i) {
71             return this.tokens.get(i);
72         }
73         public void dump()
74         {
75                 foreach(var token in this.tokens) {
76                         print(token.asString());
77                 }
78         }
79         
80     }
81
82     public errordomain TokenReader_Error {
83             ArgumentError
84     }
85     
86
87     public class TokenReader : Object
88     {
89         
90         
91         
92         /*
93          *
94          * I wonder if this will accept the prop: value, prop2 :value construxtor if we do not define one...
95          */
96         
97         /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
98         public bool collapseWhite = false; // only reduces white space...
99         /** @cfg {Boolean} keepDocs keep JSDOC comments **/
100         public bool keepDocs = true;
101         /** @cfg {Boolean} keepWhite keep White space **/
102         public bool keepWhite = false;
103         /** @cfg {Boolean} keepComments  keep all comments **/
104         public bool keepComments = false;
105         /** @cfg {Boolean} sepIdents seperate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
106         public bool sepIdents = false;
107         /** @cfg {String} filename name of file being parsed. **/
108         public string filename = "";
109         /** @config {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
110         public bool ignoreBadGrammer = false;
111         
112         
113         int line = 0;
114         
115         /**
116          * tokenize a stream
117          * @return {Array} of tokens
118          * 
119          * ts = new TextStream(File.read(str));
120          * tr = TokenReader({ keepComments : true, keepWhite : true });
121          * tr.tokenize(ts)
122          * 
123          */
124         public TokenArray tokenize(TextStream stream)
125         {
126             this.line =1;
127             var tokens = new TokenArray();
128            
129          
130             while (!stream.lookEOF()) {
131                 
132
133                 if (this.read_mlcomment(stream, tokens)) continue;
134                 if (this.read_slcomment(stream, tokens)) continue;
135                 if (this.read_dbquote(stream, tokens))   continue;
136                 if (this.read_snquote(stream, tokens))   continue;
137                 if (this.read_regx(stream, tokens))      continue;
138                 if (this.read_numb(stream, tokens))      continue;
139                 if (this.read_punc(stream, tokens))      continue;
140                 if (this.read_newline(stream, tokens))   continue;
141                 if (this.read_space(stream, tokens))     continue;
142                 if (this.read_word(stream, tokens))      continue;
143                 
144                 // if execution reaches here then an error has happened
145                 tokens.push(
146                         new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
147                 );
148             }
149             
150             
151             
152             return tokens;
153         }
154
155         /**
156          * findPuncToken - find the id of a token (previous to current)
157          * need to back check syntax..
158          * 
159          * @arg {Array} tokens the array of tokens.
160          * @arg {String} token data (eg. '(')
161          * @arg {Number} offset where to start reading from
162          * @return {Number} position of token
163          */
164         public int findPuncToken(TokenArray tokens, string data, int n)
165         {
166             n = n > 0 ? n :  tokens.length -1;
167             var stack = 0;
168             while (n > -1) {
169                 
170                 if (stack < 1 && tokens.get(n).data == data) {
171                     return n;
172                 }
173                 
174                 if (tokens.get(n).data  == ")" || tokens.get(n).data  == "}") {
175                     stack++;
176                     n--;
177                     continue;
178                 }
179                 if (stack > 0 && (tokens.get(n).data  == "{" || tokens.get(n).data  == "(")) {
180                     stack--;
181                     n--;
182                     continue;
183                 }
184                 
185                 
186                 n--;
187             }
188             return -1;
189         }
190         /**
191          * lastSym - find the last token symbol
192          * need to back check syntax..
193          * 
194          * @arg {Array} tokens the array of tokens.
195          * @arg {Number} offset where to start..
196          * @return {Token} the token
197          */
198         public Token? lastSym(TokenArray tokens, int n)
199         {
200             for (var i = n-1; i >= 0; i--) {
201                 if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
202                     return tokens.get(i);
203                 }
204             }
205             return null;
206         }
207         
208          
209         
210         /**
211             @returns {Boolean} Was the token found?
212          */
213         public bool read_word (TextStream stream, TokenArray tokens)
214         {
215             string found = "";
216             while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
217                 found += stream.next();
218             }
219             
220             if (found == "") {
221                 return false;
222             }
223             
224             var name = Lang.keyword(found);
225             if (name != null) {
226                 
227                 // look for "()return" ?? why ???
228                 var ls = tokens.lastSym();
229                 if (found == "return" && ls != null && ls.data == ")") {
230                     //Seed.print('@' + tokens.length);
231                     var n = this.findPuncToken(tokens, ")", 0);
232                     //Seed.print(')@' + n);
233                     n = this.findPuncToken(tokens, "(", n-1);
234                     //Seed.print('(@' + n);
235                     
236                     //var lt = this.lastSym(tokens, n);
237                     /*
238                     //print(JSON.stringify(lt));
239                     if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
240                         if (!this.ignoreBadGrammer) {
241                             throw new TokenReader_Error.ArgumentError(
242                                 this.filename + ":" + this.line + " Error - return found after )"
243                             );
244                         }
245                     }
246                     
247                     */
248                     
249                 }
250                 
251                 tokens.push(new Token(found, "KEYW", name, this.line));
252                 return true;
253             }
254             
255             if (!this.sepIdents || found.index_of(".") < 0 ) {
256                 tokens.push(new Token(found, "NAME", "NAME", this.line));
257                 return true;
258             }
259             var n = found.split(".");
260             var p = false;
261             foreach (unowned string nm in n) {
262                 if (p) {
263                     tokens.push(new Token(".", "PUNC", "DOT", this.line));
264                 }
265                 p=true;
266                 tokens.push(new Token(nm, "NAME", "NAME", this.line));
267             }
268             return true;
269                 
270
271         }
272
273         /**
274             @returns {Boolean} Was the token found?
275          */
276         public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
277         {
278             string found = "";
279             
280             while (!stream.lookEOF() && Lang.punc(found + stream.look().to_string()).length > 0) {
281                 found += stream.next();
282             }
283             
284             
285             if (found == "") {
286                 return false;
287             }
288             
289             var ls = tokens.lastSym();
290             
291             if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
292                 //print("Error - comma found before " + found);
293                 //print(JSON.stringify(tokens.lastSym(), null,4));
294                 if (this.ignoreBadGrammer) {
295                     print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
296                 } else {
297                     throw new TokenReader_Error.ArgumentError(
298                                 this.filename + ":" + this.line.to_string() + "  comma found before " + found
299                   
300                     );
301                      
302                 }
303             }
304             
305             tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
306             return true;
307             
308         } 
309
310         /**
311             @returns {Boolean} Was the token found?
312          */
313         public bool read_space  (TextStream stream, TokenArray tokens)
314         {
315             var found = "";
316             
317             while (!stream.lookEOF() && Lang.isSpaceC(  stream.look()) && !Lang.isNewlineC(stream.look())) {
318                 found += stream.next();
319             }
320             
321             if (found == "") {
322                 return false;
323             }
324             //print("WHITE = " + JSON.stringify(found));
325             
326              
327             if (this.collapseWhite) {
328                 found = " "; // this might work better if it was a '\n' ???
329             }
330             if (this.keepWhite) {
331                 tokens.push(new Token(found, "WHIT", "SPACE", this.line));
332             }
333             return true;
334         
335         }
336
337         /**
338             @returns {Boolean} Was the token found?
339          */
340         public bool read_newline  (TextStream stream, TokenArray tokens)
341         {
342             var found = "";
343             var line = this.line;
344             while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
345                 this.line++;
346                 found += stream.next();
347             }
348             
349             if (found == "") {
350                 return false;
351             }
352             
353             // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
354             // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
355            
356             
357             //this.line++;
358             if (this.collapseWhite) {
359                 found = "\n"; // reduces multiple line breaks into a single one...
360             }
361             
362             if (this.keepWhite) {
363                 var last = tokens.pop();
364                 if (last != null && last.name != "WHIT") {
365                     tokens.push(last);
366                 }
367                 // replaces last new line... 
368                 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
369             }
370             return true;
371         }
372
373         /**
374             @returns {Boolean} Was the token found?
375          */
376         public bool read_mlcomment  (TextStream stream, TokenArray tokens)
377         {
378             if (stream.look() != '/') {
379                 return false;
380             }
381             if (stream.look(1) != '*') {
382                 return false;
383             }
384             var found = stream.next(2);
385             string  c = "";
386             var line = this.line;
387             while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
388                 c = stream.next();
389                 if (c == "\n") {
390                     this.line++;
391                 }
392                 found += c;
393             }
394             
395             // to start doclet we allow /** or /*** but not /**/ or /****
396             //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
397             if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
398                 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
399             } else if (this.keepComments) {
400                 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
401             }
402             return true;
403         
404         } 
405
406         /**
407             @returns {Boolean} Was the token found?
408          */
409          public bool read_slcomment  (TextStream stream, TokenArray tokens)
410          {
411             var found = "";
412             if (
413                 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
414                 || 
415                 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
416             ) {
417                 var line = this.line;
418                 while (!stream.lookEOF()) {
419                                         print(Lang.isNewline(stream.look().to_string());
420                         if ( !Lang.isNewline(stream.look().to_string())) {
421                                 
422                         }
423                     found += stream.next();
424                 }
425                 //if (!stream.lookEOF()) { // what? << eat the EOL?
426                     found += stream.next();
427                 //}
428                 if (this.keepComments) {
429                     tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
430                 }
431                 this.line++;
432                 return true;
433             }
434             return false;
435         }
436
437         /**
438             @returns {Boolean} Was the token found?
439          */
440         public bool read_dbquote  (TextStream stream, TokenArray tokens)
441         {
442             if (stream.look() != '"') {
443                 return false;
444             }
445                 // find terminator
446             var str = stream.next();
447             
448             while (!stream.lookEOF()) {
449                 if (stream.look() == '\\') {
450                     if (Lang.isNewline(stream.look(1).to_string())) {
451                         do {
452                             stream.next();
453                         } while (!stream.lookEOF() && Lang.isNewline(stream.look().to_string()));
454                         str += "\\\n";
455                     }
456                     else {
457                         str += stream.next(2);
458                     }
459                     continue;
460                 }
461                 if (stream.look() == '"') {
462                     str += stream.next();
463                     tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
464                     return true;
465                 }
466             
467                 str += stream.next();
468                 
469             }
470             return false;
471         }
472
473         /**
474             @returns {Boolean} Was the token found?
475          */
476         public bool read_snquote  (TextStream stream, TokenArray tokens)
477         {
478             if (stream.look() != '\'') {
479                 return false;
480             }
481             // find terminator
482             var str = stream.next();
483             
484             while (!stream.lookEOF()) {
485                 if (stream.look() == '\\') { // escape sequence
486                     str += stream.next(2);
487                     continue;
488                 }
489                 if (stream.look() == '\'') {
490                     str += stream.next();
491                     tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
492                     return true;
493                 }
494                 str += stream.next();
495                 
496             }
497             return false;
498         }
499         
500
501         /**
502             @returns {Boolean} Was the token found?
503          */
504         public bool read_numb  (TextStream stream, TokenArray tokens)
505         {
506             if (stream.look() == '0' && stream.look(1) == 'x') {
507                 return this.read_hex(stream, tokens);
508             }
509             
510             var found = "";
511             
512             while (!stream.lookEOF() && Lang.isNumber(found+stream.look().to_string())){
513                 found += stream.next();
514             }
515             
516             if (found == "") {
517                 return false;
518             }
519             if (GLib.Regex.match_simple("^0[0-7]", found)) {
520                 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
521                 return true;
522             }
523             tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
524             return true;
525         
526         }
527        
528         /**
529             @returns {Boolean} Was the token found?
530          */
531         public bool read_hex  (TextStream stream, TokenArray tokens)
532         {
533             var found = stream.next(2);
534             
535             while (!stream.lookEOF()) {
536                 if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.look().to_string())) { // done
537                     tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
538                     return true;
539                 }
540                 
541                 found += stream.next();
542                
543             }
544             return false;
545         }
546
547         /**
548             @returns {Boolean} Was the token found?
549          */
550         public bool read_regx (TextStream stream, TokenArray tokens)
551         {
552               
553             if (stream.look() != '/') {
554                 return false;
555             }
556             var  last = tokens.lastSym();
557             if (
558                 (last == null)
559                 || 
560                 (
561                        !last.is("NUMB")   // stuff that can not appear before a regex..
562                     && !last.is("NAME")
563                     && !last.is("RIGHT_PAREN")
564                     && !last.is("RIGHT_BRACKET")
565                 )
566             )  {
567                 var regex = stream.next();
568                 
569                 while (!stream.lookEOF()) {
570                     if (stream.look() == '\\') { // escape sequence
571                         regex += stream.next(2);
572                         continue;
573                     }
574                     if (stream.look() == '/') {
575                         regex += stream.next();
576                         
577                         while (GLib.Regex.match_simple("[gmi]", stream.look().to_string())) {
578                             regex += stream.next();
579                         }
580                         
581                         tokens.push(new Token(regex, "REGX", "REGX", this.line));
582                         return true;
583                     }
584                      
585                     regex += stream.next();
586                      
587                 }
588                 // error: unterminated regex
589             }
590             return false;
591         }
592     }
593 }