JSDOC/TokenReader.vala
[gnome.introspection-doc-generator] / JSDOC / TokenReader.vala
1 //<script type="text/javascript">
2
3  
4 // test code
5
// Smoke test: tokenize a known source file and dump the resulting tokens.
void main() {
        var lang = new JSDOC.Lang_Class ();       // initialises the Lang tables
        var reader = new JSDOC.TokenReader();
        reader.keepDocs = true;
        reader.keepWhite = true;
        reader.keepComments = true;
        reader.sepIdents = true;
        reader.collapseWhite = false;
        reader.filename = "test";

        string contents;
        FileUtils.get_contents("/home/alan/gitlive/gnome.introspection-doc-generator/JSDOC/Walker2.js", out  contents);

        // sepIdents above keeps xxx + . + yyyy as separate tokens (no merging)
        var result = reader.tokenize(new JSDOC.TextStream(contents));
        result.dump();
}
21
22 //const Token   = imports.Token.Token;
23 //const Lang    = imports.Lang.Lang;
24
25 /**
26         @class Search a {@link JSDOC.TextStream} for language tokens.
27 */
28
29
30
31
32 namespace JSDOC {
33
34     public class TokenArray: Object {
35         
36         public Gee.ArrayList<Token> tokens;
37         public int length {
38             get { return this.tokens.size; }
39         }
40         
41         public TokenArray()
42         {
43             this.tokens = new Gee.ArrayList<Token>();
44         }
45         
46         public Token? last() {
47             if (this.tokens.size > 0) {
48                 return this.tokens.get(this.tokens.size-1);
49             }
50             return null;
51         }
52         public Token? lastSym () {
53             for (var i = this.tokens.size-1; i >= 0; i--) {
54                 if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM")))  {
55                     return this.tokens.get(i);
56                 }
57             }
58             return null;
59         }
60         public void push (Token t) {
61             this.tokens.add(t);
62         }
63         public Token? pop ()
64         {
65             if (this.tokens.size > 0) {
66                 return this.tokens.remove_at(this.tokens.size-1);
67             }
68             return null;
69         }
70         
71             public new Token get(int i) {
72             return this.tokens.get(i);
73         }
74         public void dump()
75         {
76                 foreach(var token in this.tokens) {
77                         print(token.asString());
78                 }
79         }
80         
81     }
82
    /**
     * Errors raised while tokenizing - e.g. suspicious grammar such as a
     * trailing comma before a closing brace/bracket (see read_punc).
     */
    public errordomain TokenReader_Error {
            ArgumentError
    }
86     
87
    /**
     * Reads a TextStream and splits it into language tokens
     * (keywords, names, strings, numbers, punctuation, comments,
     * whitespace).  Configure the public flags below before calling
     * tokenize().
     */
    public class TokenReader : Object
    {
        
        
        
        /*
         *
         * I wonder if this will accept the prop: value, prop2 :value constructor if we do not define one...
         */
        
        /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
        public bool collapseWhite = false; // only reduces white space...
        /** @cfg {Boolean} keepDocs keep JSDOC comments **/
        public bool keepDocs = true;
        /** @cfg {Boolean} keepWhite keep White space **/
        public bool keepWhite = false;
        /** @cfg {Boolean} keepComments  keep all comments **/
        public bool keepComments = false;
        /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
        public bool sepIdents = false;
        /** @cfg {String} filename name of file being parsed (used in error messages). **/
        public string filename = "";
        /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
        public bool ignoreBadGrammer = false;
        
        
        // current 1-based line number while tokenize() runs
        int line = 0;
        
        /**
         * tokenize a stream
         * @return {Array} of tokens
         * 
         * ts = new TextStream(File.read(str));
         * tr = TokenReader({ keepComments : true, keepWhite : true });
         * tr.tokenize(ts)
         * 
         */
        public TokenArray tokenize(TextStream stream)
        {
            this.line =1;
            var tokens = new TokenArray();
           
            // try each reader in turn until one consumes input; the order
            // matters: comments before punctuation (both start with '/'),
            // regex before punctuation, numbers before words, etc.
            while (!stream.lookEOF()) {
                

                if (this.read_mlcomment(stream, tokens)) continue;
                if (this.read_slcomment(stream, tokens)) continue;
                if (this.read_dbquote(stream, tokens))   continue;
                if (this.read_snquote(stream, tokens))   continue;
                if (this.read_regx(stream, tokens))      continue;
                if (this.read_numb(stream, tokens))      continue;
                if (this.read_punc(stream, tokens))      continue;
                if (this.read_newline(stream, tokens))   continue;
                if (this.read_space(stream, tokens))     continue;
                if (this.read_word(stream, tokens))      continue;
                
                // if execution reaches here then an error has happened
                tokens.push(
                        new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
                );
            }
            
            
            
            return tokens;
        }
155
156         /**
157          * findPuncToken - find the id of a token (previous to current)
158          * need to back check syntax..
159          * 
160          * @arg {Array} tokens the array of tokens.
161          * @arg {String} token data (eg. '(')
162          * @arg {Number} offset where to start reading from
163          * @return {Number} position of token
164          */
165         public int findPuncToken(TokenArray tokens, string data, int n)
166         {
167             n = n > 0 ? n :  tokens.length -1;
168             var stack = 0;
169             while (n > -1) {
170                 
171                 if (stack < 1 && tokens.get(n).data == data) {
172                     return n;
173                 }
174                 
175                 if (tokens.get(n).data  == ")" || tokens.get(n).data  == "}") {
176                     stack++;
177                     n--;
178                     continue;
179                 }
180                 if (stack > 0 && (tokens.get(n).data  == "{" || tokens.get(n).data  == "(")) {
181                     stack--;
182                     n--;
183                     continue;
184                 }
185                 
186                 
187                 n--;
188             }
189             return -1;
190         }
191         /**
192          * lastSym - find the last token symbol
193          * need to back check syntax..
194          * 
195          * @arg {Array} tokens the array of tokens.
196          * @arg {Number} offset where to start..
197          * @return {Token} the token
198          */
199         public Token? lastSym(TokenArray tokens, int n)
200         {
201             for (var i = n-1; i >= 0; i--) {
202                 if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
203                     return tokens.get(i);
204                 }
205             }
206             return null;
207         }
208         
209          
210         
211         /**
212             @returns {Boolean} Was the token found?
213          */
214         public bool read_word (TextStream stream, TokenArray tokens)
215         {
216             string found = "";
217             while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
218                 found += stream.next();
219             }
220             
221             if (found == "") {
222                 return false;
223             }
224             
225             var name = Lang.keyword(found);
226             if (name != null) {
227                 
228                 // look for "()return" ?? why ???
229                 var ls = tokens.lastSym();
230                 if (found == "return" && ls != null && ls.data == ")") {
231                     //Seed.print('@' + tokens.length);
232                     var n = this.findPuncToken(tokens, ")", 0);
233                     //Seed.print(')@' + n);
234                     n = this.findPuncToken(tokens, "(", n-1);
235                     //Seed.print('(@' + n);
236                     
237                     //var lt = this.lastSym(tokens, n);
238                     /*
239                     //print(JSON.stringify(lt));
240                     if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
241                         if (!this.ignoreBadGrammer) {
242                             throw new TokenReader_Error.ArgumentError(
243                                 this.filename + ":" + this.line + " Error - return found after )"
244                             );
245                         }
246                     }
247                     
248                     */
249                     
250                 }
251                 
252                 tokens.push(new Token(found, "KEYW", name, this.line));
253                 return true;
254             }
255             
256             if (!this.sepIdents || found.index_of(".") < 0 ) {
257                 tokens.push(new Token(found, "NAME", "NAME", this.line));
258                 return true;
259             }
260             var n = found.split(".");
261             var p = false;
262             foreach (unowned string nm in n) {
263                 if (p) {
264                     tokens.push(new Token(".", "PUNC", "DOT", this.line));
265                 }
266                 p=true;
267                 tokens.push(new Token(nm, "NAME", "NAME", this.line));
268             }
269             return true;
270                 
271
272         }
273
274         /**
275             @returns {Boolean} Was the token found?
276          */
277         public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
278         {
279             string found = "";
280             
281             while (!stream.lookEOF()) {
282                         var ns = stream.look().to_string();
283                         found += ns;
284                     if (Lang.punc(found + stream.look().to_string()) ) {
285                 found += stream.next();
286             }
287             
288             
289             if (found == "") {
290                 return false;
291             }
292             
293             var ls = tokens.lastSym();
294             
295             if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
296                 //print("Error - comma found before " + found);
297                 //print(JSON.stringify(tokens.lastSym(), null,4));
298                 if (this.ignoreBadGrammer) {
299                     print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
300                 } else {
301                     throw new TokenReader_Error.ArgumentError(
302                                 this.filename + ":" + this.line.to_string() + "  comma found before " + found
303                   
304                     );
305                      
306                 }
307             }
308             
309             tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
310             return true;
311             
312         } 
313
314         /**
315             @returns {Boolean} Was the token found?
316          */
317         public bool read_space  (TextStream stream, TokenArray tokens)
318         {
319             var found = "";
320             
321             while (!stream.lookEOF() && Lang.isSpaceC(  stream.look()) && !Lang.isNewlineC(stream.look())) {
322                 found += stream.next();
323             }
324             
325             if (found == "") {
326                 return false;
327             }
328             //print("WHITE = " + JSON.stringify(found));
329             
330              
331             if (this.collapseWhite) {
332                 found = " "; // this might work better if it was a '\n' ???
333             }
334             if (this.keepWhite) {
335                 tokens.push(new Token(found, "WHIT", "SPACE", this.line));
336             }
337             return true;
338         
339         }
340
341         /**
342             @returns {Boolean} Was the token found?
343          */
344         public bool read_newline  (TextStream stream, TokenArray tokens)
345         {
346             var found = "";
347             var line = this.line;
348             while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
349                 this.line++;
350                 found += stream.next();
351             }
352             
353             if (found == "") {
354                 return false;
355             }
356             
357             // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
358             // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
359            
360             
361             //this.line++;
362             if (this.collapseWhite) {
363                 found = "\n"; // reduces multiple line breaks into a single one...
364             }
365             
366             if (this.keepWhite) {
367                 var last = tokens.pop();
368                 if (last != null && last.name != "WHIT") {
369                     tokens.push(last);
370                 }
371                 // replaces last new line... 
372                 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
373             }
374             return true;
375         }
376
377         /**
378             @returns {Boolean} Was the token found?
379          */
380         public bool read_mlcomment  (TextStream stream, TokenArray tokens)
381         {
382             if (stream.look() != '/') {
383                 return false;
384             }
385             if (stream.look(1) != '*') {
386                 return false;
387             }
388             var found = stream.next(2);
389             string  c = "";
390             var line = this.line;
391             while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
392                 c = stream.next();
393                 if (c == "\n") {
394                     this.line++;
395                 }
396                 found += c;
397             }
398             
399             // to start doclet we allow /** or /*** but not /**/ or /****
400             //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
401             if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
402                 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
403             } else if (this.keepComments) {
404                 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
405             }
406             return true;
407         
408         } 
409
410         /**
411             @returns {Boolean} Was the token found?
412          */
413          public bool read_slcomment  (TextStream stream, TokenArray tokens)
414          {
415             var found = "";
416             if (
417                 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
418                 || 
419                 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
420             ) {
421                 var line = this.line;
422                 while (!stream.lookEOF()) {
423                                         print(stream.look().to_string());
424                         if ( Lang.isNewline(stream.look().to_string())) {
425                                 break;
426                         }
427                     found += stream.next();
428                 }
429                 if (!stream.lookEOF()) { // lookinng for end  of line... if we got it, then do not eat the character..
430                     found += stream.next();
431                 }
432                 if (this.keepComments) {
433                     tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
434                 }
435                 this.line++;
436                 return true;
437             }
438             return false;
439         }
440
441         /**
442             @returns {Boolean} Was the token found?
443          */
444         public bool read_dbquote  (TextStream stream, TokenArray tokens)
445         {
446             if (stream.look() != '"') {
447                 return false;
448             }
449                 // find terminator
450             var str = stream.next();
451             
452             while (!stream.lookEOF()) {
453                 if (stream.look() == '\\') {
454                     if (Lang.isNewline(stream.look(1).to_string())) {
455                         do {
456                             stream.next();
457                         } while (!stream.lookEOF() && Lang.isNewline(stream.look().to_string()));
458                         str += "\\\n";
459                     }
460                     else {
461                         str += stream.next(2);
462                     }
463                     continue;
464                 }
465                 if (stream.look() == '"') {
466                     str += stream.next();
467                     tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
468                     return true;
469                 }
470             
471                 str += stream.next();
472                 
473             }
474             return false;
475         }
476
477         /**
478             @returns {Boolean} Was the token found?
479          */
480         public bool read_snquote  (TextStream stream, TokenArray tokens)
481         {
482             if (stream.look() != '\'') {
483                 return false;
484             }
485             // find terminator
486             var str = stream.next();
487             
488             while (!stream.lookEOF()) {
489                 if (stream.look() == '\\') { // escape sequence
490                     str += stream.next(2);
491                     continue;
492                 }
493                 if (stream.look() == '\'') {
494                     str += stream.next();
495                     tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
496                     return true;
497                 }
498                 str += stream.next();
499                 
500             }
501             return false;
502         }
503         
504
505         /**
506             @returns {Boolean} Was the token found?
507          */
508         public bool read_numb  (TextStream stream, TokenArray tokens)
509         {
510             if (stream.look() == '0' && stream.look(1) == 'x') {
511                 return this.read_hex(stream, tokens);
512             }
513             
514             var found = "";
515             
516             while (!stream.lookEOF() && Lang.isNumber(found+stream.look().to_string())){
517                 found += stream.next();
518             }
519             
520             if (found == "") {
521                 return false;
522             }
523             if (GLib.Regex.match_simple("^0[0-7]", found)) {
524                 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
525                 return true;
526             }
527             tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
528             return true;
529         
530         }
531        
532         /**
533             @returns {Boolean} Was the token found?
534          */
535         public bool read_hex  (TextStream stream, TokenArray tokens)
536         {
537             var found = stream.next(2);
538             
539             while (!stream.lookEOF()) {
540                 if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.look().to_string())) { // done
541                     tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
542                     return true;
543                 }
544                 
545                 found += stream.next();
546                
547             }
548             return false;
549         }
550
        /**
         * Read a regular expression literal.
         *
         * A leading '/' is only a regex when the previous symbol token
         * could not end an expression (otherwise it is division) - this is
         * the classic JS regex/division ambiguity, resolved by looking at
         * the last non-whitespace/non-comment token.
         *
         * @returns {Boolean} Was the token found?
         */
        public bool read_regx (TextStream stream, TokenArray tokens)
        {
              
            if (stream.look() != '/') {
                return false;
            }
            var  last = tokens.lastSym();
            if (
                (last == null)
                || 
                (
                       !last.is("NUMB")   // stuff that can not appear before a regex..
                    && !last.is("NAME")
                    && !last.is("RIGHT_PAREN")
                    && !last.is("RIGHT_BRACKET")
                )
            )  {
                var regex = stream.next();
                
                while (!stream.lookEOF()) {
                    if (stream.look() == '\\') { // escape sequence
                        regex += stream.next(2);
                        continue;
                    }
                    if (stream.look() == '/') {
                        regex += stream.next();
                        
                        // trailing flags: g, m, i (in any combination)
                        while (GLib.Regex.match_simple("[gmi]", stream.look().to_string())) {
                            regex += stream.next();
                        }
                        
                        tokens.push(new Token(regex, "REGX", "REGX", this.line));
                        return true;
                    }
                     
                    regex += stream.next();
                     
                }
                // error: unterminated regex - fall through and return false
            }
            return false;
        }
596     }
597 }