JSDOC/TokenReader.vala
//<script type="text/javascript">


// test code

void main() {
    var tr = new JSDOC.TokenReader();
    tr.keepDocs = true;
    tr.keepWhite = true;
    tr.keepComments = true;
    tr.sepIdents = true;
    tr.collapseWhite = false;
    tr.filename = "test";

    // sample source text to tokenize
    var str = "var a = 1;";
    var toks = tr.tokenize(new JSDOC.TextStream(str)); // don't merge xxx + . + yyyy etc.
    print("%d tokens\n", toks.length);
}

//const Token   = imports.Token.Token;
//const Lang    = imports.Lang.Lang;

/**
    @class Search a {@link JSDOC.TextStream} for language tokens.
*/
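
// Usage sketch (assumes Token exposes the `data` field and the is() check
// used later in this file):
//
//     var tr = new JSDOC.TokenReader();
//     tr.keepComments = true;
//     var toks = tr.tokenize(new JSDOC.TextStream("var a = 1;"));
//     for (var i = 0; i < toks.length; i++) {
//         if (!toks.get(i).is("WHIT")) {
//             print("%s\n", toks.get(i).data);
//         }
//     }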


namespace JSDOC {

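    /**
     * Array-like wrapper around Gee.ArrayList<Token>: push/pop/last/get,
     * plus lastSym() to fetch the last token that is not whitespace or a comment.
     */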
    public class TokenArray: Object {
        
        public Gee.ArrayList<Token> tokens;
        public int length {
            get { return this.tokens.size; }
        }
        
        public TokenArray()
        {
            this.tokens = new Gee.ArrayList<Token>();
        }
        
        public Token? last() {
            if (this.tokens.size > 0) {
                return this.tokens.get(this.tokens.size-1);
            }
            return null;
        }
        
        public Token? lastSym () {
            for (var i = this.tokens.size-1; i >= 0; i--) {
                if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM"))) {
                    return this.tokens.get(i);
                }
            }
            return null;
        }
        
        public void push (Token t) {
            this.tokens.add(t);
        }
        
        public Token? pop ()
        {
            if (this.tokens.size > 0) {
                return this.tokens.remove_at(this.tokens.size-1);
            }
            return null;
        }
        
        public new Token get(int i) {
            return this.tokens.get(i);
        }
    }

    public errordomain TokenReader_Error {
        ArgumentError
    }
    

    public class TokenReader : Object
    {
        
        /*
         * I wonder if this will accept the prop: value, prop2: value constructor
         * if we do not define one...
         */
        
        /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
        public bool collapseWhite = false; // only reduces white space...
        /** @cfg {Boolean} keepDocs keep JSDOC comments **/
        public bool keepDocs = true;
        /** @cfg {Boolean} keepWhite keep white space **/
        public bool keepWhite = false;
        /** @cfg {Boolean} keepComments keep all comments **/
        public bool keepComments = false;
        /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
        public bool sepIdents = false;
        /** @cfg {String} filename name of file being parsed. **/
        public string filename = "";
        /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
        public bool ignoreBadGrammer = false;
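        
        // Example (sketch): with sepIdents = true the input "a.b.c" is emitted as
        // NAME("a"), PUNC DOT, NAME("b"), PUNC DOT, NAME("c"); with it left false
        // the whole "a.b.c" becomes a single NAME token (see read_word() below).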
        
        
        int line = 0;
        
        /**
         * tokenize a stream
         * @return {TokenArray} of tokens
         * 
         * Usage (Vala sketch; file_contents holds the source text):
         *   var ts = new TextStream(file_contents);
         *   var tr = new TokenReader();
         *   tr.keepComments = true;
         *   tr.keepWhite = true;
         *   tr.tokenize(ts);
         * 
         */
        public TokenArray tokenize(TextStream stream)
        {
            this.line = 1;
            var tokens = new TokenArray();
            
            while (!stream.lookEOF()) {
                
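                // note: the two comment readers must be tried before read_regx()
                // and read_punc(), since all of them can start with a '/'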
                if (this.read_mlcomment(stream, tokens)) continue;
                if (this.read_slcomment(stream, tokens)) continue;
                if (this.read_dbquote(stream, tokens))   continue;
                if (this.read_snquote(stream, tokens))   continue;
                if (this.read_regx(stream, tokens))      continue;
                if (this.read_numb(stream, tokens))      continue;
                if (this.read_punc(stream, tokens))      continue;
                if (this.read_newline(stream, tokens))   continue;
                if (this.read_space(stream, tokens))     continue;
                if (this.read_word(stream, tokens))      continue;
                
                // if execution reaches here then an error has happened
                tokens.push(
                    new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
                );
            }
            
            return tokens;
        }

        /**
         * findPuncToken - find the index of a token (previous to current)
         * needed to back-check syntax..
         * 
         * @arg {TokenArray} tokens the array of tokens.
         * @arg {String} data the token data to look for (eg. '(')
         * @arg {Number} n offset to start reading back from (0 or less means the end of the array)
         * @return {Number} position of the token, or -1 if it is not found
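         *
         * Example (sketch), mirroring how read_word() uses it below:
         *   var n = this.findPuncToken(tokens, ")", 0);    // index of the last ")"
         *   n = this.findPuncToken(tokens, "(", n - 1);    // its matching "(", skipping nested ()/{} pairs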
         */
        public int findPuncToken(TokenArray tokens, string data, int n)
        {
            n = n > 0 ? n : tokens.length - 1;
            var stack = 0;
            while (n > -1) {
                
                if (stack < 1 && tokens.get(n).data == data) {
                    return n;
                }
                
                if (tokens.get(n).data == ")" || tokens.get(n).data == "}") {
                    stack++;
                    n--;
                    continue;
                }
                if (stack > 0 && (tokens.get(n).data == "{" || tokens.get(n).data == "(")) {
                    stack--;
                    n--;
                    continue;
                }
                
                n--;
            }
            return -1;
        }
        /**
         * lastSym - find the last "symbol" token (skipping whitespace and comments)
         * before a given offset; needed to back-check syntax..
         * 
         * @arg {TokenArray} tokens the array of tokens.
         * @arg {Number} n offset to start looking back from
         * @return {Token} the token, or null if none is found
         */
        public Token? lastSym(TokenArray tokens, int n)
        {
            for (var i = n-1; i >= 0; i--) {
                if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
                    return tokens.get(i);
                }
            }
            return null;
        }
        
        
        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_word (TextStream stream, TokenArray tokens)
        {
            string found = "";
            while (!stream.lookEOF() && Lang.isWordChar((string)stream.look())) {
                found += stream.next();
            }
            
            if (found == "") {
                return false;
            }
            
            var name = Lang.keyword(found);
            if (name != null) {
                
                // special case: "return" straight after ")" - the disabled check below
                // treated this as suspect grammar unless the parentheses belonged to an
                // if/while condition
                var ls = tokens.lastSym();
                if (found == "return" && ls != null && ls.data == ")") {
                    //Seed.print('@' + tokens.length);
                    var n = this.findPuncToken(tokens, ")", 0);
                    //Seed.print(')@' + n);
                    n = this.findPuncToken(tokens, "(", n-1);
                    //Seed.print('(@' + n);
                    
                    //var lt = this.lastSym(tokens, n);
                    /*
                    //print(JSON.stringify(lt));
                    if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
                        if (!this.ignoreBadGrammer) {
                            throw new TokenReader_Error.ArgumentError(
                                this.filename + ":" + this.line + " Error - return found after )"
                            );
                        }
                    }
                    */
                    
                }
                
                tokens.push(new Token(found, "KEYW", name, this.line));
                return true;
            }
            
            if (!this.sepIdents || found.index_of(".") < 0) {
                tokens.push(new Token(found, "NAME", "NAME", this.line));
                return true;
            }
            
            // sepIdents: split "a.b.c" into NAME, DOT, NAME, DOT, NAME
            var n = found.split(".");
            var p = false;
            foreach (unowned string nm in n) {
                if (p) {
                    tokens.push(new Token(".", "PUNC", "DOT", this.line));
                }
                p = true;
                tokens.push(new Token(nm, "NAME", "NAME", this.line));
            }
            return true;
        }

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
        {
            string found = "";
            
            while (!stream.lookEOF() && Lang.punc(found + (string)stream.look()).length > 0) {
                found += stream.next();
            }
            
            if (found == "") {
                return false;
            }
            
            var ls = tokens.lastSym();
            
            if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
                //print("Error - comma found before " + found);
                //print(JSON.stringify(tokens.lastSym(), null,4));
                if (this.ignoreBadGrammer) {
                    print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
                } else {
                    throw new TokenReader_Error.ArgumentError(
                        this.filename + ":" + this.line.to_string() + " comma found before " + found
                    );
                }
            }
            
            tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
            return true;
        }

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_space (TextStream stream, TokenArray tokens)
        {
            var found = "";
            
            while (!stream.lookEOF() && Lang.isSpaceC(stream.look()) && !Lang.isNewlineC(stream.look())) {
                found += stream.next();
            }
            
            if (found == "") {
                return false;
            }
            //print("WHITE = " + JSON.stringify(found));
            
            if (this.collapseWhite) {
                found = " "; // this might work better if it was a '\n' ???
            }
            if (this.keepWhite) {
                tokens.push(new Token(found, "WHIT", "SPACE", this.line));
            }
            return true;
        }

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_newline (TextStream stream, TokenArray tokens)
        {
            var found = "";
            var line = this.line;
            while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
                this.line++;
                found += stream.next();
            }
            
            if (found == "") {
                return false;
            }
            
            // if we found a new line, then we could check if the previous character was a ';' - if so we can drop it.
            // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
            
            //this.line++;
            if (this.collapseWhite) {
                found = "\n"; // reduces multiple line breaks into a single one...
            }
            
            if (this.keepWhite) {
                var last = tokens.pop();
                if (last != null && last.name != "WHIT") {
                    tokens.push(last);
                }
                // replaces last new line... 
                tokens.push(new Token(found, "WHIT", "NEWLINE", line));
            }
            return true;
        }

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_mlcomment (TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '/') {
                return false;
            }
            if (stream.look(1) != '*') {
                return false;
            }
            var found = stream.next(2);
            string c = "";
            var line = this.line;
            while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
                c = stream.next();
                if (c == "\n") {
                    this.line++;
                }
                found += c;
            }
            
            // to start a doclet we allow /** or /*** but not /**/ or /****
            //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
            if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
                tokens.push(new Token(found, "COMM", "JSDOC", this.line));
            } else if (this.keepComments) {
                tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
            }
            return true;
        }

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_slcomment (TextStream stream, TokenArray tokens)
        {
            var found = "";
            if (
                (stream.look() == '/' && stream.look(1) == '/' && ("" != (found = stream.next(2))))
                || 
                (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && ("" != (found = stream.next(4))))
            ) {
                var line = this.line;
                while (!stream.lookEOF() && !Lang.isNewline((string)stream.look())) {
                    found += stream.next();
                }
                //if (!stream.lookEOF()) { // what? << eat the EOL?
                    found += stream.next();
                //}
                if (this.keepComments) {
                    tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
                }
                this.line++;
                return true;
            }
            return false;
        }

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_dbquote (TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '"') {
                return false;
            }
            // find terminator
            var str = stream.next();
            
            while (!stream.lookEOF()) {
                if (stream.look() == '\\') {
                    if (Lang.isNewline((string)stream.look(1))) {
                        // line continuation: collapse "\" + newline(s) into "\\\n"
                        do {
                            stream.next();
                        } while (!stream.lookEOF() && Lang.isNewline((string)stream.look()));
                        str += "\\\n";
                    } else {
                        str += stream.next(2);
                    }
                    continue;
                }
                if (stream.look() == '"') {
                    str += stream.next();
                    tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
                    return true;
                }
                str += stream.next();
            }
            // error: unterminated string
            return false;
        }

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_snquote (TextStream stream, TokenArray tokens)
        {
            if (stream.look() != '\'') {
                return false;
            }
            // find terminator
            var str = stream.next();
            
            while (!stream.lookEOF()) {
                if (stream.look() == '\\') { // escape sequence
                    str += stream.next(2);
                    continue;
                }
                if (stream.look() == '\'') {
                    str += stream.next();
                    tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
                    return true;
                }
                str += stream.next();
            }
            // error: unterminated string
            return false;
        }
        

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_numb (TextStream stream, TokenArray tokens)
        {
            if (stream.look() == '0' && stream.look(1) == 'x') {
                return this.read_hex(stream, tokens);
            }
            
            var found = "";
            
            while (!stream.lookEOF() && Lang.isNumber(found + (string)stream.look())) {
                found += stream.next();
            }
            
            if (found == "") {
                return false;
            }
            if (GLib.Regex.match_simple("^0[0-7]", found)) {
                tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
                return true;
            }
            tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
            return true;
        }
        
        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_hex (TextStream stream, TokenArray tokens)
        {
            var found = stream.next(2);
            
            while (!stream.lookEOF()) {
                if (Lang.isHexDec(found) && !Lang.isHexDec(found + (string)stream.look())) { // done
                    tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
                    return true;
                }
                
                found += stream.next();
            }
            return false;
        }

        /**
            @returns {Boolean} Was the token found?
         */
        public bool read_regx (TextStream stream, TokenArray tokens)
        {
            
            if (stream.look() != '/') {
                return false;
            }
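            // a leading '/' only starts a regex literal when the previous significant
            // token could not end an expression; after a NAME, a number, ')' or ']'
            // the '/' is division, so no REGX token is produced here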
            var last = tokens.lastSym();
            if (
                (last == null)
                || 
                (
                       !last.is("NUMB")   // stuff that can not appear before a regex..
                    && !last.is("NAME")
                    && !last.is("RIGHT_PAREN")
                    && !last.is("RIGHT_BRACKET")
                )
            ) {
                var regex = stream.next();
                
                while (!stream.lookEOF()) {
                    if (stream.look() == '\\') { // escape sequence
                        regex += stream.next(2);
                        continue;
                    }
                    if (stream.look() == '/') {
                        regex += stream.next();
                        
                        while (GLib.Regex.match_simple("[gmi]", (string)stream.look())) {
                            regex += stream.next();
                        }
                        
                        tokens.push(new Token(regex, "REGX", "REGX", this.line));
                        return true;
                    }
                    
                    regex += stream.next();
                }
                // error: unterminated regex
            }
            return false;
        }
    }
}