JSDOC/CompressWhite.vala
[gnome.introspection-doc-generator] / JSDOC / TokenReader.vala
1 //<script type="text/javascript">
2
3  
4 // test code
5  
6 //const Token   = imports.Token.Token;
7 //const Lang    = imports.Lang.Lang;
8
9 /**
10         @class Search a {@link JSDOC.TextStream} for language tokens.
11 */
12  
13 namespace JSDOC {
14
15     public class TokenArray: Object {
16         
17         public Gee.ArrayList<Token> tokens;
18         public int length {
19             get { return this.tokens.size; }
20         }
21         
22         public TokenArray()
23         {
24             this.tokens = new Gee.ArrayList<Token>();
25         }
26         
27         public Token? last() {
28             if (this.tokens.size > 0) {
29                 return this.tokens.get(this.tokens.size-1);
30             }
31             return null;
32         }
33         public Token? lastSym () {
34             for (var i = this.tokens.size-1; i >= 0; i--) {
35                 if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM")))  {
36                     return this.tokens.get(i);
37                 }
38             }
39             return null;
40         }
41         public void push (Token t) {
42             this.tokens.add(t);
43         }
44         public Token? pop ()
45         {
46             if (this.tokens.size > 0) {
47                 return this.tokens.remove_at(this.tokens.size-1);
48             }
49             return null;
50         }
51         
52             public new Token get(int i) {
53             return this.tokens.get(i);
54         }
55         public void dump()
56         {
57                 foreach(var token in this.tokens) {
58                         print(token.asString() +"\n");
59                 }
60         }
61         
62     }
63
64     public errordomain TokenReader_Error {
65             ArgumentError
66     }
67     
68
69     public class TokenReader : Object
70     {
71         
72         
73         
74         /*
75          *
         * I wonder if this will accept the "prop: value, prop2: value" constructor if we do not define one...
77          */
78         
79         /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
80         public bool collapseWhite = false; // only reduces white space...
81         /** @cfg {Boolean} keepDocs keep JSDOC comments **/
82         public bool keepDocs = true;
83         /** @cfg {Boolean} keepWhite keep White space **/
84         public bool keepWhite = false;
85         /** @cfg {Boolean} keepComments  keep all comments **/
86         public bool keepComments = false;
        /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
88         public bool sepIdents = false;
89         /** @cfg {String} filename name of file being parsed. **/
90         public string filename = "";
        /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
92         public bool ignoreBadGrammer = false;
93         
94         
95         int line = 0;
96         
97         /**
98          * tokenize a stream
99          * @return {Array} of tokens
100          * 
101          * ts = new TextStream(File.read(str));
102          * tr = TokenReader({ keepComments : true, keepWhite : true });
103          * tr.tokenize(ts)
104          * 
105          */
106         public TokenArray tokenize(TextStream stream)
107         {
108             this.line =1;
109             var tokens = new TokenArray();
110            
111          
112             while (!stream.lookEOF()) {
113                 
114
115                 if (this.read_mlcomment(stream, tokens)) continue;
116                 if (this.read_slcomment(stream, tokens)) continue;
117                 if (this.read_dbquote(stream, tokens))   continue;
118                 if (this.read_snquote(stream, tokens))   continue;
119                 if (this.read_regx(stream, tokens))      continue;
120                 if (this.read_numb(stream, tokens))      continue;
121                 if (this.read_punc(stream, tokens))      continue;
122                 if (this.read_newline(stream, tokens))   continue;
123                 if (this.read_space(stream, tokens))     continue;
124                 if (this.read_word(stream, tokens))      continue;
125                 
126                 // if execution reaches here then an error has happened
127                 tokens.push(
128                         new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
129                 );
130             }
131             
132             
133             
134             return tokens;
135         }
136
137         /**
138          * findPuncToken - find the id of a token (previous to current)
139          * need to back check syntax..
140          * 
141          * @arg {Array} tokens the array of tokens.
142          * @arg {String} token data (eg. '(')
143          * @arg {Number} offset where to start reading from
144          * @return {Number} position of token
145          */
146         public int findPuncToken(TokenArray tokens, string data, int n)
147         {
148             n = n > 0 ? n :  tokens.length -1;
149             var stack = 0;
150             while (n > -1) {
151                 
152                 if (stack < 1 && tokens.get(n).data == data) {
153                     return n;
154                 }
155                 
156                 if (tokens.get(n).data  == ")" || tokens.get(n).data  == "}") {
157                     stack++;
158                     n--;
159                     continue;
160                 }
161                 if (stack > 0 && (tokens.get(n).data  == "{" || tokens.get(n).data  == "(")) {
162                     stack--;
163                     n--;
164                     continue;
165                 }
166                 
167                 
168                 n--;
169             }
170             return -1;
171         }
172         /**
173          * lastSym - find the last token symbol
174          * need to back check syntax..
175          * 
176          * @arg {Array} tokens the array of tokens.
177          * @arg {Number} offset where to start..
178          * @return {Token} the token
179          */
180         public Token? lastSym(TokenArray tokens, int n)
181         {
182             for (var i = n-1; i >= 0; i--) {
183                 if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
184                     return tokens.get(i);
185                 }
186             }
187             return null;
188         }
189         
190          
191         
192         /**
193             @returns {Boolean} Was the token found?
194          */
195         public bool read_word (TextStream stream, TokenArray tokens)
196         {
197             string found = "";
198             while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
199                 found += stream.next();
200             }
201             
202             if (found == "") {
203                 return false;
204             }
205             
206             var name = Lang.keyword(found);
207             if (name != null) {
208                 
209                 // look for "()return" ?? why ???
210                 var ls = tokens.lastSym();
211                 if (found == "return" && ls != null && ls.data == ")") {
212                     //Seed.print('@' + tokens.length);
213                     var n = this.findPuncToken(tokens, ")", 0);
214                     //Seed.print(')@' + n);
215                     n = this.findPuncToken(tokens, "(", n-1);
216                     //Seed.print('(@' + n);
217                     
218                     //var lt = this.lastSym(tokens, n);
219                     /*
220                     //print(JSON.stringify(lt));
221                     if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
222                         if (!this.ignoreBadGrammer) {
223                             throw new TokenReader_Error.ArgumentError(
224                                 this.filename + ":" + this.line + " Error - return found after )"
225                             );
226                         }
227                     }
228                     
229                     */
230                     
231                 }
232                 
233                 tokens.push(new Token(found, "KEYW", name, this.line));
234                 return true;
235             }
236             
237             if (!this.sepIdents || found.index_of(".") < 0 ) {
238                 tokens.push(new Token(found, "NAME", "NAME", this.line));
239                 return true;
240             }
241             var n = found.split(".");
242             var p = false;
243             foreach (unowned string nm in n) {
244                 if (p) {
245                     tokens.push(new Token(".", "PUNC", "DOT", this.line));
246                 }
247                 p=true;
248                 tokens.push(new Token(nm, "NAME", "NAME", this.line));
249             }
250             return true;
251                 
252
253         }
254
255         /**
256             @returns {Boolean} Was the token found?
257          */
258         public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
259         {
260             string found = "";
261             
262             while (!stream.lookEOF()) {
263                         var ns = stream.look().to_string();
264
265                     if (null == Lang.punc(found + ns )) {
266                                 break;
267                         }
268                 found += stream.next();
269             }
270             
271             
272             if (found == "") {
273                 return false;
274             }
275             
276             var ls = tokens.lastSym();
277             
278             if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
279                 //print("Error - comma found before " + found);
280                 //print(JSON.stringify(tokens.lastSym(), null,4));
281                 if (this.ignoreBadGrammer) {
282                     print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
283                 } else {
284                     throw new TokenReader_Error.ArgumentError(
285                                 this.filename + ":" + this.line.to_string() + "  comma found before " + found
286                   
287                     );
288                      
289                 }
290             }
291             
292             tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
293             return true;
294             
295         } 
296
297         /**
298             @returns {Boolean} Was the token found?
299          */
300         public bool read_space  (TextStream stream, TokenArray tokens)
301         {
302             var found = "";
303             
304             while (!stream.lookEOF() && Lang.isSpaceC(  stream.look()) && !Lang.isNewlineC(stream.look())) {
305                 found += stream.next();
306             }
307             
308             if (found == "") {
309                 return false;
310             }
311             //print("WHITE = " + JSON.stringify(found));
312             
313              
314             if (this.collapseWhite) {
315                 found = " "; // this might work better if it was a '\n' ???
316             }
317             if (this.keepWhite) {
318                 tokens.push(new Token(found, "WHIT", "SPACE", this.line));
319             }
320             return true;
321         
322         }
323
324         /**
325             @returns {Boolean} Was the token found?
326          */
327         public bool read_newline  (TextStream stream, TokenArray tokens)
328         {
329             var found = "";
330             var line = this.line;
331             while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
332                 this.line++;
333                 found += stream.next();
334             }
335             
336             if (found == "") {
337                 return false;
338             }
339             
340             // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
341             // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
342            
343             
344             //this.line++;
345             if (this.collapseWhite) {
346                 found = "\n"; // reduces multiple line breaks into a single one...
347             }
348             
349             if (this.keepWhite) {
350                 var last = tokens.pop();
351                 if (last != null && last.name != "WHIT") {
352                     tokens.push(last);
353                 }
354                 // replaces last new line... 
355                 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
356             }
357             return true;
358         }
359
360         /**
361             @returns {Boolean} Was the token found?
362          */
363         public bool read_mlcomment  (TextStream stream, TokenArray tokens)
364         {
365             if (stream.look() != '/') {
366                 return false;
367             }
368             if (stream.look(1) != '*') {
369                 return false;
370             }
371             var found = stream.next(2);
372             string  c = "";
373             var line = this.line;
374             while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
375                 c = stream.next();
376                 if (c == "\n") {
377                     this.line++;
378                 }
379                 found += c;
380             }
381             
382             // to start doclet we allow /** or /*** but not /**/ or /****
383             //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
384             if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
385                 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
386             } else if (this.keepComments) {
387                 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
388             }
389             return true;
390         
391         } 
392
393         /**
394             @returns {Boolean} Was the token found?
395          */
396          public bool read_slcomment  (TextStream stream, TokenArray tokens)
397          {
398             var found = "";
399             if (
400                 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
401                 || 
402                 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
403             ) {
404                 var line = this.line;
405                 while (!stream.lookEOF()) {
406                                         //print(stream.look().to_string());
407                         if ( Lang.isNewline(stream.look().to_string())) {
408                                 break;
409                         }
410                     found += stream.next();
411                 }
412                 if (!stream.lookEOF()) { // lookinng for end  of line... if we got it, then do not eat the character..
413                     found += stream.next();
414                 }
415                 if (this.keepComments) {
416                     tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
417                 }
418                 this.line++;
419                 return true;
420             }
421             return false;
422         }
423
424         /**
425             @returns {Boolean} Was the token found?
426          */
427         public bool read_dbquote  (TextStream stream, TokenArray tokens)
428         {
429             if (stream.look() != '"') {
430                 return false;
431             }
432                 // find terminator
433             var str = stream.next();
434             
435             while (!stream.lookEOF()) {
436                 if (stream.look() == '\\') {
437                     if (Lang.isNewline(stream.look(1).to_string())) {
438                         do {
439                             stream.next();
440                         } while (!stream.lookEOF() && Lang.isNewline(stream.look().to_string()));
441                         str += "\\\n";
442                     }
443                     else {
444                         str += stream.next(2);
445                     }
446                     continue;
447                 }
448                 if (stream.look() == '"') {
449                     str += stream.next();
450                     tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
451                     return true;
452                 }
453             
454                 str += stream.next();
455                 
456             }
457             return false;
458         }
459
460         /**
461             @returns {Boolean} Was the token found?
462          */
463         public bool read_snquote  (TextStream stream, TokenArray tokens)
464         {
465             if (stream.look() != '\'') {
466                 return false;
467             }
468             // find terminator
469             var str = stream.next();
470             
471             while (!stream.lookEOF()) {
472                 if (stream.look() == '\\') { // escape sequence
473                     str += stream.next(2);
474                     continue;
475                 }
476                 if (stream.look() == '\'') {
477                     str += stream.next();
478                     tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
479                     return true;
480                 }
481                 str += stream.next();
482                 
483             }
484             return false;
485         }
486         
487
488         /**
489             @returns {Boolean} Was the token found?
490          */
491         public bool read_numb  (TextStream stream, TokenArray tokens)
492         {
493             if (stream.look() == '0' && stream.look(1) == 'x') {
494                 return this.read_hex(stream, tokens);
495             }
496             
497             var found = "";
498             
499             while (!stream.lookEOF() && Lang.isNumber(found+stream.look().to_string())){
500                 found += stream.next();
501             }
502             
503             if (found == "") {
504                 return false;
505             }
506             if (GLib.Regex.match_simple("^0[0-7]", found)) {
507                 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
508                 return true;
509             }
510             tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
511             return true;
512         
513         }
514        
515         /**
516             @returns {Boolean} Was the token found?
517          */
518         public bool read_hex  (TextStream stream, TokenArray tokens)
519         {
520             var found = stream.next(2);
521             
522             while (!stream.lookEOF()) {
523                 if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.look().to_string())) { // done
524                     tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
525                     return true;
526                 }
527                 
528                 found += stream.next();
529                
530             }
531             return false;
532         }
533
534         /**
535             @returns {Boolean} Was the token found?
536          */
537         public bool read_regx (TextStream stream, TokenArray tokens)
538         {
539               
540             if (stream.look() != '/') {
541                 return false;
542             }
543             var  last = tokens.lastSym();
544             if (
545                 (last == null)
546                 || 
547                 (
548                        !last.is("NUMB")   // stuff that can not appear before a regex..
549                     && !last.is("NAME")
550                     && !last.is("RIGHT_PAREN")
551                     && !last.is("RIGHT_BRACKET")
552                 )
553             )  {
554                 var regex = stream.next();
555                 
556                 while (!stream.lookEOF()) {
557                     if (stream.look() == '\\') { // escape sequence
558                         regex += stream.next(2);
559                         continue;
560                     }
561                     if (stream.look() == '/') {
562                         regex += stream.next();
563                         
564                         while (GLib.Regex.match_simple("[gmi]", stream.look().to_string())) {
565                             regex += stream.next();
566                         }
567                         
568                         tokens.push(new Token(regex, "REGX", "REGX", this.line));
569                         return true;
570                     }
571                      
572                     regex += stream.next();
573                      
574                 }
575                 // error: unterminated regex
576             }
577             return false;
578         }
579     }
580 }