JSDOC/TokenReader.vala
//<script type="text/javascript">


// test code

void main() {
        var lc = new JSDOC.Lang_Class();
        var tr = new JSDOC.TokenReader();
        tr.keepDocs = true;
        tr.keepWhite = true;
        tr.keepComments = true;
        tr.sepIdents = true;
        tr.collapseWhite = false;
        tr.filename = "test";
        string str;
        FileUtils.get_contents("/home/alan/gitlive/gnome.introspection-doc-generator/JSDOC/Walker2.js", out str);

        var toks = tr.tokenize(new JSDOC.TextStream(str)); // don't merge xxx + . + yyyy etc.
        toks.dump();
}

//const Token   = imports.Token.Token;
//const Lang    = imports.Lang.Lang;

/**
        @class Search a {@link JSDOC.TextStream} for language tokens.
*/

namespace JSDOC {

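    /**
     * TokenArray - a small wrapper around Gee.ArrayList<Token> providing the
     * stack-style helpers (push, pop, last, lastSym) that the tokenizer uses
     * while scanning a stream.
     */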
    public class TokenArray: Object {

        public Gee.ArrayList<Token> tokens;
        public int length {
            get { return this.tokens.size; }
        }

        public TokenArray()
        {
            this.tokens = new Gee.ArrayList<Token>();
        }

        public Token? last() {
            if (this.tokens.size > 0) {
                return this.tokens.get(this.tokens.size - 1);
            }
            return null;
        }
        public Token? lastSym () {
            for (var i = this.tokens.size - 1; i >= 0; i--) {
                if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM"))) {
                    return this.tokens.get(i);
                }
            }
            return null;
        }
        public void push (Token t) {
            this.tokens.add(t);
        }
        public Token? pop ()
        {
            if (this.tokens.size > 0) {
                return this.tokens.remove_at(this.tokens.size - 1);
            }
            return null;
        }

        public new Token get(int i) {
            return this.tokens.get(i);
        }
        public void dump()
        {
            foreach (var token in this.tokens) {
                print(token.asString() + "\n");
            }
        }

    }

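    /**
     * Error thrown by TokenReader when it finds grammar that might break
     * compression (see read_punc), unless ignoreBadGrammer is set.
     */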
    public errordomain TokenReader_Error {
        ArgumentError
    }

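    /**
     * TokenReader - walks a TextStream and produces a TokenArray of Token
     * objects; see tokenize(). The public flags below control how much
     * whitespace and comment information is kept in the output and whether
     * dotted identifiers are split up.
     */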
    public class TokenReader : Object
    {

        /*
         * I wonder if this will accept the prop: value, prop2: value constructor
         * if we do not define one...
         */

        /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
        public bool collapseWhite = false; // only reduces white space...
        /** @cfg {Boolean} keepDocs keep JSDOC comments **/
        public bool keepDocs = true;
        /** @cfg {Boolean} keepWhite keep white space **/
        public bool keepWhite = false;
        /** @cfg {Boolean} keepComments keep all comments **/
        public bool keepComments = false;
        /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
        public bool sepIdents = false;
        /** @cfg {String} filename name of the file being parsed. **/
        public string filename = "";
        /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
        public bool ignoreBadGrammer = false;

        int line = 0;

        /**
         * tokenize a stream
         * @return {TokenArray} of tokens
         *
         * var tr = new TokenReader();
         * tr.keepComments = true;
         * tr.keepWhite = true;
         * var toks = tr.tokenize(new TextStream(str));
         *
         */
        public TokenArray tokenize(TextStream stream)
        {
            this.line = 1;
            var tokens = new TokenArray();

            while (!stream.lookEOF()) {

                if (this.read_mlcomment(stream, tokens)) continue;
                if (this.read_slcomment(stream, tokens)) continue;
                if (this.read_dbquote(stream, tokens))   continue;
                if (this.read_snquote(stream, tokens))   continue;
                if (this.read_regx(stream, tokens))      continue;
                if (this.read_numb(stream, tokens))      continue;
                if (this.read_punc(stream, tokens))      continue;
                if (this.read_newline(stream, tokens))   continue;
                if (this.read_space(stream, tokens))     continue;
                if (this.read_word(stream, tokens))      continue;

                // if execution reaches here then an error has happened
                tokens.push(
                    new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
                );
            }

            return tokens;
        }

        /**
         * findPuncToken - find the index of a token (previous to the current one)
         * used to back-check syntax..
         *
         * @arg {TokenArray} tokens the array of tokens.
         * @arg {String} data the token data (eg. '(')
         * @arg {Number} n offset where to start reading from
         * @return {Number} position of the token, or -1 if not found
         */
        public int findPuncToken(TokenArray tokens, string data, int n)
        {
            n = n > 0 ? n : tokens.length - 1;
            var stack = 0;
            while (n > -1) {

                if (stack < 1 && tokens.get(n).data == data) {
                    return n;
                }

                if (tokens.get(n).data == ")" || tokens.get(n).data == "}") {
                    stack++;
                    n--;
                    continue;
                }
                if (stack > 0 && (tokens.get(n).data == "{" || tokens.get(n).data == "(")) {
                    stack--;
                    n--;
                    continue;
                }

                n--;
            }
            return -1;
        }
        /**
         * lastSym - find the last token symbol (skipping whitespace and comments)
         * used to back-check syntax..
         *
         * @arg {TokenArray} tokens the array of tokens.
         * @arg {Number} n offset where to start..
         * @return {Token} the token, or null if none was found
         */
        public Token? lastSym(TokenArray tokens, int n)
        {
            for (var i = n - 1; i >= 0; i--) {
                if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
                    return tokens.get(i);
                }
            }
            return null;
        }


        /**
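            read_word - read a run of word characters and push either a KEYW
            token (for keywords) or NAME token(s); dotted identifiers are split
            into NAME / DOT / NAME tokens when sepIdents is set.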
            @returns {Boolean} Was the token found?
         */
        public bool read_word (TextStream stream, TokenArray tokens)
        {
            string found = "";
            while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
                found += stream.next();
            }

            if (found == "") {
                return false;
            }

            var name = Lang.keyword(found);
            if (name != null) {

                // look for "()return" ?? why ???
                var ls = tokens.lastSym();
                if (found == "return" && ls != null && ls.data == ")") {
                    //Seed.print('@' + tokens.length);
                    var n = this.findPuncToken(tokens, ")", 0);
                    //Seed.print(')@' + n);
                    n = this.findPuncToken(tokens, "(", n - 1);
                    //Seed.print('(@' + n);

                    //var lt = this.lastSym(tokens, n);
                    /*
                    //print(JSON.stringify(lt));
                    if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
                        if (!this.ignoreBadGrammer) {
                            throw new TokenReader_Error.ArgumentError(
                                this.filename + ":" + this.line + " Error - return found after )"
                            );
                        }
                    }

                    */

                }

                tokens.push(new Token(found, "KEYW", name, this.line));
                return true;
            }

            if (!this.sepIdents || found.index_of(".") < 0) {
                tokens.push(new Token(found, "NAME", "NAME", this.line));
                return true;
            }
            var n = found.split(".");
            var p = false;
            foreach (unowned string nm in n) {
                if (p) {
                    tokens.push(new Token(".", "PUNC", "DOT", this.line));
                }
                p = true;
                tokens.push(new Token(nm, "NAME", "NAME", this.line));
            }
            return true;
        }

        /**
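            read_punc - greedily read the longest punctuation sequence known to
            Lang.punc() and push it as a PUNC token; a comma found immediately
            before '}' or ']' is printed as a warning when ignoreBadGrammer is
            set, otherwise a TokenReader_Error is thrown.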
            @returns {Boolean} Was the token found?
         */
        public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
        {
            string found = "";

            while (!stream.lookEOF()) {
                var ns = stream.look().to_string();

                if (null == Lang.punc(found + ns)) {
                    break;
                }
                found += stream.next();
            }

            if (found == "") {
                return false;
            }

            var ls = tokens.lastSym();

            if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
                //print("Error - comma found before " + found);
                //print(JSON.stringify(tokens.lastSym(), null,4));
                if (this.ignoreBadGrammer) {
                    print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
                } else {
                    throw new TokenReader_Error.ArgumentError(
                        this.filename + ":" + this.line.to_string() + " comma found before " + found
                    );
                }
            }

            tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
            return true;
        }

        /**
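            read_space - read a run of non-newline whitespace; collapsed to a
            single space when collapseWhite is set, and only pushed as a
            WHIT/SPACE token when keepWhite is set.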
            @returns {Boolean} Was the token found?
         */
        public bool read_space (TextStream stream, TokenArray tokens)
        {
            var found = "";

            while (!stream.lookEOF() && Lang.isSpaceC(stream.look()) && !Lang.isNewlineC(stream.look())) {
                found += stream.next();
            }

            if (found == "") {
                return false;
            }
            //print("WHITE = " + JSON.stringify(found));

            if (this.collapseWhite) {
                found = " "; // this might work better if it was a '\n' ???
            }
            if (this.keepWhite) {
                tokens.push(new Token(found, "WHIT", "SPACE", this.line));
            }
            return true;
        }

        /**
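            read_newline - read a run of newline characters, keeping the line
            counter up to date; collapsed to a single '\n' when collapseWhite
            is set, and pushed as a WHIT/NEWLINE token when keepWhite is set.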
            @returns {Boolean} Was the token found?
         */
        public bool read_newline (TextStream stream, TokenArray tokens)
        {
            var found = "";
            var line = this.line;
            while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
                this.line++;
                found += stream.next();
            }

            if (found == "") {
                return false;
            }

            // if we found a new line, we could check whether the previous character was a ';'
            // - if so we can drop it. Otherwise we generally keep it, which should reduce
            // our issues with stripping new lines.

            //this.line++;
            if (this.collapseWhite) {
                found = "\n"; // reduces multiple line breaks into a single one...
            }

            if (this.keepWhite) {
                var last = tokens.pop();
                if (last != null && last.name != "WHIT") {
                    tokens.push(last);
                }
                // replaces the last new line...
                tokens.push(new Token(found, "WHIT", "NEWLINE", line));
            }
            return true;
        }

        /**
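            read_mlcomment - read a multi-line comment; '/**' doclets are pushed
            as COMM/JSDOC tokens when keepDocs is set, other block comments are
            pushed as COMM/MULTI_LINE_COMM only when keepComments is set.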
            @returns {Boolean} Was the token found?
         */
        public bool read_mlcomment (TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '/') {
                return false;
            }
            if (stream.look(1) != '*') {
                return false;
            }
            var found = stream.next(2);
            string c = "";
            var line = this.line;
            while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
                c = stream.next();
                if (c == "\n") {
                    this.line++;
                }
                found += c;
            }

            // to start doclet we allow /** or /*** but not /**/ or /****
            //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
            if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
                tokens.push(new Token(found, "COMM", "JSDOC", this.line));
            } else if (this.keepComments) {
                tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
            }
            return true;
        }

        /**
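            read_slcomment - read a single-line comment ('//' or the HTML-style
            '<!--') up to the end of the line and push it as a
            COMM/SINGLE_LINE_COMM token when keepComments is set.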
            @returns {Boolean} Was the token found?
         */
        public bool read_slcomment (TextStream stream, TokenArray tokens)
        {
            var found = "";
            if (
                (stream.look() == '/' && stream.look(1) == '/' && ("" != (found = stream.next(2))))
                ||
                (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && ("" != (found = stream.next(4))))
            ) {
                var line = this.line;
                while (!stream.lookEOF()) {
                    //print(stream.look().to_string());
                    if (Lang.isNewline(stream.look().to_string())) {
                        break;
                    }
                    found += stream.next();
                }
                if (!stream.lookEOF()) { // found the end of line - consume the newline as part of the comment
                    found += stream.next();
                }
                if (this.keepComments) {
                    tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
                }
                this.line++;
                return true;
            }
            return false;
        }

        /**
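            read_dbquote - read a double-quoted string literal (honouring
            backslash escapes and escaped line breaks) and push it as a
            STRN/DOUBLE_QUOTE token.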
            @returns {Boolean} Was the token found?
         */
        public bool read_dbquote (TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '"') {
                return false;
            }
            // find terminator
            var str = stream.next();

            while (!stream.lookEOF()) {
                if (stream.look() == '\\') {
                    if (Lang.isNewline(stream.look(1).to_string())) {
                        do {
                            stream.next();
                        } while (!stream.lookEOF() && Lang.isNewline(stream.look().to_string()));
                        str += "\\\n";
                    }
                    else {
                        str += stream.next(2);
                    }
                    continue;
                }
                if (stream.look() == '"') {
                    str += stream.next();
                    tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
                    return true;
                }

                str += stream.next();

            }
            return false;
        }

        /**
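            read_snquote - read a single-quoted string literal (honouring
            backslash escapes) and push it as a STRN/SINGLE_QUOTE token.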
            @returns {Boolean} Was the token found?
         */
        public bool read_snquote (TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '\'') {
                return false;
            }
            // find terminator
            var str = stream.next();

            while (!stream.lookEOF()) {
                if (stream.look() == '\\') { // escape sequence
                    str += stream.next(2);
                    continue;
                }
                if (stream.look() == '\'') {
                    str += stream.next();
                    tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
                    return true;
                }
                str += stream.next();

            }
            return false;
        }


        /**
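            read_numb - read a numeric literal and push it as a NUMB token
            (OCTAL if it matches ^0[0-7], otherwise DECIMAL); '0x...' literals
            are handed off to read_hex.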
            @returns {Boolean} Was the token found?
         */
        public bool read_numb (TextStream stream, TokenArray tokens)
        {
            if (stream.look() == '0' && stream.look(1) == 'x') {
                return this.read_hex(stream, tokens);
            }

            var found = "";

            while (!stream.lookEOF() && Lang.isNumber(found + stream.look().to_string())) {
                found += stream.next();
            }

            if (found == "") {
                return false;
            }
            if (GLib.Regex.match_simple("^0[0-7]", found)) {
                tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
                return true;
            }
            tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
            return true;
        }

        /**
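            read_hex - read the rest of a '0x...' hexadecimal literal and push
            it as a NUMB/HEX_DEC token.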
            @returns {Boolean} Was the token found?
         */
        public bool read_hex (TextStream stream, TokenArray tokens)
        {
            var found = stream.next(2);

            while (!stream.lookEOF()) {
                if (Lang.isHexDec(found) && !Lang.isHexDec(found + stream.look().to_string())) { // done
                    tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
                    return true;
                }

                found += stream.next();

            }
            return false;
        }

        /**
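            read_regx - read a regular expression literal (plus any trailing
            g/m/i flags) and push it as a REGX token; only attempted when the
            previous significant token could not end an expression (i.e. it is
            not a NUMB, NAME, RIGHT_PAREN or RIGHT_BRACKET token).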
            @returns {Boolean} Was the token found?
         */
        public bool read_regx (TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '/') {
                return false;
            }
            var last = tokens.lastSym();
            if (
                (last == null)
                ||
                (
                       !last.is("NUMB")   // stuff that can not appear before a regex..
                    && !last.is("NAME")
                    && !last.is("RIGHT_PAREN")
                    && !last.is("RIGHT_BRACKET")
                )
            ) {
                var regex = stream.next();

                while (!stream.lookEOF()) {
                    if (stream.look() == '\\') { // escape sequence
                        regex += stream.next(2);
                        continue;
                    }
                    if (stream.look() == '/') {
                        regex += stream.next();

                        while (GLib.Regex.match_simple("[gmi]", stream.look().to_string())) {
                            regex += stream.next();
                        }

                        tokens.push(new Token(regex, "REGX", "REGX", this.line));
                        return true;
                    }

                    regex += stream.next();

                }
                // error: unterminated regex
            }
            return false;
        }
    }
}