// JSDOC/TokenReader.vala
// [gnome.introspection-doc-generator] / JSDOC / TokenReader.vala
//<script type="text/javascript">

// test code

void main (string[] args)
{
    // Allow the file to tokenize to be given on the command line; fall back
    // to the original hard-coded test file.
    var path = args.length > 1
        ? args[1]
        : "/home/alan/gitlive/gnome.introspection-doc-generator/JSDOC/Walker2.js";

    // NOTE(review): presumably constructing Lang_Class initialises shared
    // language tables used by the tokenizer - kept although unused here; confirm.
    var lc = new JSDOC.Lang_Class ();

    var tr = new JSDOC.TokenReader ();
    tr.keepDocs = true;
    tr.keepWhite = true;
    tr.keepComments = true;
    tr.sepIdents = true;
    tr.collapseWhite = false;
    tr.filename = "test";

    string str;
    try {
        // get_contents throws FileError - previously uncaught.
        FileUtils.get_contents (path, out str);
    } catch (FileError e) {
        error ("unable to read %s: %s", path, e.message);
    }

    var toks = tr.tokenize (new JSDOC.TextStream (str)); // dont merge xxx + . + yyyy etc.
    toks.dump ();
}

//const Token   = imports.Token.Token;
//const Lang    = imports.Lang.Lang;

/**
    @class Search a {@link JSDOC.TextStream} for language tokens.
*/

32 namespace JSDOC {
33
34     public class TokenArray: Object {
35         
36         public Gee.ArrayList<Token> tokens;
37         public int length {
38             get { return this.tokens.size; }
39         }
40         
41         public TokenArray()
42         {
43             this.tokens = new Gee.ArrayList<Token>();
44         }
45         
46         public Token? last() {
47             if (this.tokens.size > 0) {
48                 return this.tokens.get(this.tokens.size-1);
49             }
50             return null;
51         }
52         public Token? lastSym () {
53             for (var i = this.tokens.size-1; i >= 0; i--) {
54                 if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM")))  {
55                     return this.tokens.get(i);
56                 }
57             }
58             return null;
59         }
60         public void push (Token t) {
61             this.tokens.add(t);
62         }
63         public Token? pop ()
64         {
65             if (this.tokens.size > 0) {
66                 return this.tokens.remove_at(this.tokens.size-1);
67             }
68             return null;
69         }
70         
71             public new Token get(int i) {
72             return this.tokens.get(i);
73         }
74         public void dump()
75         {
76                 foreach(var token in this.tokens) {
77                         print(token.asString());
78                 }
79         }
80         
81     }
82
    /**
     * Errors raised while tokenizing - see read_punc, which reports grammar
     * such as a comma directly before a closing brace/bracket.
     */
    public errordomain TokenReader_Error {
            ArgumentError
    }
86     
87
88     public class TokenReader : Object
89     {
90         
91         
92         
        /*
         * Configuration flags - all public fields so callers can set them
         * directly after construction (see the test main() at the top of
         * this file).
         */
        
        /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
        public bool collapseWhite = false; // only reduces white space...
        /** @cfg {Boolean} keepDocs keep JSDOC comments **/
        public bool keepDocs = true;
        /** @cfg {Boolean} keepWhite keep White space **/
        public bool keepWhite = false;
        /** @cfg {Boolean} keepComments  keep all comments **/
        public bool keepComments = false;
        /** @cfg {Boolean} sepIdents seperate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
        public bool sepIdents = false;
        /** @cfg {String} filename name of file being parsed. **/
        public string filename = "";
        /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
        public bool ignoreBadGrammer = false;
        
        
        // current 1-based line number; advanced by the read_* methods as
        // they consume newlines.
        int line = 0;
115         
        /**
         * tokenize a stream
         * @return {TokenArray} of tokens
         * 
         * ts = new TextStream(File.read(str));
         * tr = TokenReader({ keepComments : true, keepWhite : true });
         * tr.tokenize(ts)
         * 
         */
        public TokenArray tokenize(TextStream stream)
        {
            this.line =1;
            var tokens = new TokenArray();
           
         
            // Each read_* either consumes one token's worth of input and
            // returns true, or consumes nothing and returns false.  The
            // order matters: comments and regex literals must be tried
            // before punctuation (they all start with '/'), and numbers
            // before punctuation ('.' could begin a decimal).
            while (!stream.lookEOF()) {
                

                if (this.read_mlcomment(stream, tokens)) continue;
                if (this.read_slcomment(stream, tokens)) continue;
                if (this.read_dbquote(stream, tokens))   continue;
                if (this.read_snquote(stream, tokens))   continue;
                if (this.read_regx(stream, tokens))      continue;
                if (this.read_numb(stream, tokens))      continue;
                // NOTE(review): read_punc throws TokenReader_Error which is
                // neither caught nor declared here - Vala will warn; confirm
                // whether tokenize should declare "throws".
                if (this.read_punc(stream, tokens))      continue;
                if (this.read_newline(stream, tokens))   continue;
                if (this.read_space(stream, tokens))     continue;
                if (this.read_word(stream, tokens))      continue;
                
                // if execution reaches here then an error has happened
                tokens.push(
                        new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
                );
            }
            
            
            
            return tokens;
        }
155
156         /**
157          * findPuncToken - find the id of a token (previous to current)
158          * need to back check syntax..
159          * 
160          * @arg {Array} tokens the array of tokens.
161          * @arg {String} token data (eg. '(')
162          * @arg {Number} offset where to start reading from
163          * @return {Number} position of token
164          */
165         public int findPuncToken(TokenArray tokens, string data, int n)
166         {
167             n = n > 0 ? n :  tokens.length -1;
168             var stack = 0;
169             while (n > -1) {
170                 
171                 if (stack < 1 && tokens.get(n).data == data) {
172                     return n;
173                 }
174                 
175                 if (tokens.get(n).data  == ")" || tokens.get(n).data  == "}") {
176                     stack++;
177                     n--;
178                     continue;
179                 }
180                 if (stack > 0 && (tokens.get(n).data  == "{" || tokens.get(n).data  == "(")) {
181                     stack--;
182                     n--;
183                     continue;
184                 }
185                 
186                 
187                 n--;
188             }
189             return -1;
190         }
191         /**
192          * lastSym - find the last token symbol
193          * need to back check syntax..
194          * 
195          * @arg {Array} tokens the array of tokens.
196          * @arg {Number} offset where to start..
197          * @return {Token} the token
198          */
199         public Token? lastSym(TokenArray tokens, int n)
200         {
201             for (var i = n-1; i >= 0; i--) {
202                 if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
203                     return tokens.get(i);
204                 }
205             }
206             return null;
207         }
208         
209          
210         
211         /**
212             @returns {Boolean} Was the token found?
213          */
214         public bool read_word (TextStream stream, TokenArray tokens)
215         {
216             string found = "";
217             while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
218                 found += stream.next();
219             }
220             
221             if (found == "") {
222                 return false;
223             }
224             
225             var name = Lang.keyword(found);
226             if (name != null) {
227                 
228                 // look for "()return" ?? why ???
229                 var ls = tokens.lastSym();
230                 if (found == "return" && ls != null && ls.data == ")") {
231                     //Seed.print('@' + tokens.length);
232                     var n = this.findPuncToken(tokens, ")", 0);
233                     //Seed.print(')@' + n);
234                     n = this.findPuncToken(tokens, "(", n-1);
235                     //Seed.print('(@' + n);
236                     
237                     //var lt = this.lastSym(tokens, n);
238                     /*
239                     //print(JSON.stringify(lt));
240                     if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
241                         if (!this.ignoreBadGrammer) {
242                             throw new TokenReader_Error.ArgumentError(
243                                 this.filename + ":" + this.line + " Error - return found after )"
244                             );
245                         }
246                     }
247                     
248                     */
249                     
250                 }
251                 
252                 tokens.push(new Token(found, "KEYW", name, this.line));
253                 return true;
254             }
255             
256             if (!this.sepIdents || found.index_of(".") < 0 ) {
257                 tokens.push(new Token(found, "NAME", "NAME", this.line));
258                 return true;
259             }
260             var n = found.split(".");
261             var p = false;
262             foreach (unowned string nm in n) {
263                 if (p) {
264                     tokens.push(new Token(".", "PUNC", "DOT", this.line));
265                 }
266                 p=true;
267                 tokens.push(new Token(nm, "NAME", "NAME", this.line));
268             }
269             return true;
270                 
271
272         }
273
274         /**
275             @returns {Boolean} Was the token found?
276          */
277         public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
278         {
279             string found = "";
280             
281             while (!stream.lookEOF() && Lang.punc(found + stream.look().to_string()).length > 0) {
282                 found += stream.next();
283             }
284             
285             
286             if (found == "") {
287                 return false;
288             }
289             
290             var ls = tokens.lastSym();
291             
292             if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
293                 //print("Error - comma found before " + found);
294                 //print(JSON.stringify(tokens.lastSym(), null,4));
295                 if (this.ignoreBadGrammer) {
296                     print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
297                 } else {
298                     throw new TokenReader_Error.ArgumentError(
299                                 this.filename + ":" + this.line.to_string() + "  comma found before " + found
300                   
301                     );
302                      
303                 }
304             }
305             
306             tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
307             return true;
308             
309         } 
310
311         /**
312             @returns {Boolean} Was the token found?
313          */
314         public bool read_space  (TextStream stream, TokenArray tokens)
315         {
316             var found = "";
317             
318             while (!stream.lookEOF() && Lang.isSpaceC(  stream.look()) && !Lang.isNewlineC(stream.look())) {
319                 found += stream.next();
320             }
321             
322             if (found == "") {
323                 return false;
324             }
325             //print("WHITE = " + JSON.stringify(found));
326             
327              
328             if (this.collapseWhite) {
329                 found = " "; // this might work better if it was a '\n' ???
330             }
331             if (this.keepWhite) {
332                 tokens.push(new Token(found, "WHIT", "SPACE", this.line));
333             }
334             return true;
335         
336         }
337
338         /**
339             @returns {Boolean} Was the token found?
340          */
341         public bool read_newline  (TextStream stream, TokenArray tokens)
342         {
343             var found = "";
344             var line = this.line;
345             while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
346                 this.line++;
347                 found += stream.next();
348             }
349             
350             if (found == "") {
351                 return false;
352             }
353             
354             // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
355             // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
356            
357             
358             //this.line++;
359             if (this.collapseWhite) {
360                 found = "\n"; // reduces multiple line breaks into a single one...
361             }
362             
363             if (this.keepWhite) {
364                 var last = tokens.pop();
365                 if (last != null && last.name != "WHIT") {
366                     tokens.push(last);
367                 }
368                 // replaces last new line... 
369                 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
370             }
371             return true;
372         }
373
374         /**
375             @returns {Boolean} Was the token found?
376          */
377         public bool read_mlcomment  (TextStream stream, TokenArray tokens)
378         {
379             if (stream.look() != '/') {
380                 return false;
381             }
382             if (stream.look(1) != '*') {
383                 return false;
384             }
385             var found = stream.next(2);
386             string  c = "";
387             var line = this.line;
388             while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
389                 c = stream.next();
390                 if (c == "\n") {
391                     this.line++;
392                 }
393                 found += c;
394             }
395             
396             // to start doclet we allow /** or /*** but not /**/ or /****
397             //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
398             if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
399                 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
400             } else if (this.keepComments) {
401                 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
402             }
403             return true;
404         
405         } 
406
407         /**
408             @returns {Boolean} Was the token found?
409          */
410          public bool read_slcomment  (TextStream stream, TokenArray tokens)
411          {
412             var found = "";
413             if (
414                 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
415                 || 
416                 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
417             ) {
418                 var line = this.line;
419                 while (!stream.lookEOF()) {
420                                         print(stream.look().to_string());
421                         if ( Lang.isNewline(stream.look().to_string())) {
422                                 break;
423                         }
424                     found += stream.next();
425                 }
426                 if (!stream.lookEOF()) { // lookinng for end  of line... if we got it, then do not eat the character..
427                     found += stream.next();
428                 }
429                 if (this.keepComments) {
430                     tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
431                 }
432                 this.line++;
433                 return true;
434             }
435             return false;
436         }
437
438         /**
439             @returns {Boolean} Was the token found?
440          */
441         public bool read_dbquote  (TextStream stream, TokenArray tokens)
442         {
443             if (stream.look() != '"') {
444                 return false;
445             }
446                 // find terminator
447             var str = stream.next();
448             
449             while (!stream.lookEOF()) {
450                 if (stream.look() == '\\') {
451                     if (Lang.isNewline(stream.look(1).to_string())) {
452                         do {
453                             stream.next();
454                         } while (!stream.lookEOF() && Lang.isNewline(stream.look().to_string()));
455                         str += "\\\n";
456                     }
457                     else {
458                         str += stream.next(2);
459                     }
460                     continue;
461                 }
462                 if (stream.look() == '"') {
463                     str += stream.next();
464                     tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
465                     return true;
466                 }
467             
468                 str += stream.next();
469                 
470             }
471             return false;
472         }
473
474         /**
475             @returns {Boolean} Was the token found?
476          */
477         public bool read_snquote  (TextStream stream, TokenArray tokens)
478         {
479             if (stream.look() != '\'') {
480                 return false;
481             }
482             // find terminator
483             var str = stream.next();
484             
485             while (!stream.lookEOF()) {
486                 if (stream.look() == '\\') { // escape sequence
487                     str += stream.next(2);
488                     continue;
489                 }
490                 if (stream.look() == '\'') {
491                     str += stream.next();
492                     tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
493                     return true;
494                 }
495                 str += stream.next();
496                 
497             }
498             return false;
499         }
500         
501
502         /**
503             @returns {Boolean} Was the token found?
504          */
505         public bool read_numb  (TextStream stream, TokenArray tokens)
506         {
507             if (stream.look() == '0' && stream.look(1) == 'x') {
508                 return this.read_hex(stream, tokens);
509             }
510             
511             var found = "";
512             
513             while (!stream.lookEOF() && Lang.isNumber(found+stream.look().to_string())){
514                 found += stream.next();
515             }
516             
517             if (found == "") {
518                 return false;
519             }
520             if (GLib.Regex.match_simple("^0[0-7]", found)) {
521                 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
522                 return true;
523             }
524             tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
525             return true;
526         
527         }
528        
529         /**
530             @returns {Boolean} Was the token found?
531          */
532         public bool read_hex  (TextStream stream, TokenArray tokens)
533         {
534             var found = stream.next(2);
535             
536             while (!stream.lookEOF()) {
537                 if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.look().to_string())) { // done
538                     tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
539                     return true;
540                 }
541                 
542                 found += stream.next();
543                
544             }
545             return false;
546         }
547
        /**
            Read a regular expression literal.

            A '/' only begins a regex when the previous significant token
            could not end an expression (otherwise the '/' is a divide
            operator) - hence the lastSym() check below.

            @returns {Boolean} Was the token found?
         */
        public bool read_regx (TextStream stream, TokenArray tokens)
        {
              
            if (stream.look() != '/') {
                return false;
            }
            var  last = tokens.lastSym();
            if (
                (last == null)
                || 
                (
                       !last.is("NUMB")   // stuff that can not appear before a regex..
                    && !last.is("NAME")
                    && !last.is("RIGHT_PAREN")
                    && !last.is("RIGHT_BRACKET")
                )
            )  {
                var regex = stream.next();
                
                while (!stream.lookEOF()) {
                    if (stream.look() == '\\') { // escape sequence
                        regex += stream.next(2);
                        continue;
                    }
                    if (stream.look() == '/') {
                        // end of pattern - also swallow any trailing flags
                        regex += stream.next();
                        
                        // NOTE(review): this look() can run past the end of
                        // the stream - relies on TextStream.look() returning
                        // a safe sentinel at EOF; confirm.
                        while (GLib.Regex.match_simple("[gmi]", stream.look().to_string())) {
                            regex += stream.next();
                        }
                        
                        tokens.push(new Token(regex, "REGX", "REGX", this.line));
                        return true;
                    }
                     
                    regex += stream.next();
                     
                }
                // error: unterminated regex
            }
            return false;
        }
593     }
594 }