Fix #5682 - fix path in title of source file
[roojspacker] / src / jsdoc / TokenReader.vala
1 //<script type="text/javascript">
2
3  
4 // test code
5  
6 //const Token   = imports.Token.Token;
7 //const Lang    = imports.Lang.Lang;
8
9 /**
10         @class Search a {@link JSDOC.TextStream} for language tokens.
11 */
12  
13 namespace JSDOC {
14
15          
16
17     public class TokenArray: Object {
18         
19         private Packer? packer;
20         private TokenReader reader;
21         
22         public Gee.ArrayList<Token> tokens;
23         Token lastAdded = null;
24         
25         public int length {
26             get { return this.tokens.size; }
27         }
28         
29         public TokenArray(Packer? packer, TokenReader reader)
30         {
31             this.packer = packer;
32             this.reader  = reader;
33             this.tokens = new Gee.ArrayList<Token>();
34         }
35         
36         public Token? last() {
37             if (this.tokens.size > 0) {
38                 return this.tokens.get(this.tokens.size-1);
39             }
40             return null;
41         }
42         public Token? lastSym () {
43             for (var i = this.tokens.size-1; i >= 0; i--) {
44                 if (!(this.tokens.get(i).isType(TokenType.WHIT) || this.tokens.get(i).isType(TokenType.COMM)))  {
45                     return this.tokens.get(i);
46                 }
47             }
48             return null;
49         }
50
51         
52         public void push (Token t)   
53         {
54                 if (this.lastAdded != null) {
55                 
56                         if (
57                                          (
58                                                 this.lastAdded.isType(TokenType.NAME) ||
59                                                 this.lastAdded.isType(TokenType.STRN) ||
60                                                 this.lastAdded.isType(TokenType.NUMB) ||
61                                                 (
62                                                         this.lastAdded.isType(TokenType.KEYW) && 
63                                                         (
64                                                                 this.lastAdded.isName(TokenName.TRUE) || this.lastAdded.isName(TokenName.FALSE)
65                                                         )
66                                                 )
67                                         )
68                                          &&
69                                          (
70                                                 t.isType(TokenType.NAME) ||     // NAME -> ???
71                                                 t.isType(TokenType.STRN) ||
72                                                 t.isType(TokenType.NUMB) ||
73                                                 (t.isType(TokenType.KEYW)  && 
74                                                         !(t.isName(TokenName.IN) || t.isName(TokenName.INSTANCEOF) || t.isName(TokenName.INSTANCEOF))
75                                                 )
76                                         )
77                                 ) {
78                                         //print("%s\n%s\n", this.lastAdded.asString(), t.asString());
79                                         if (this.packer != null) {
80                                                 this.packer.logError(
81                                                 Packer.ResultType.err,
82                                                 this.reader.filename,
83                                                 t.line,
84                                                 "'" + this.lastAdded.data+ "' token followed by " + t.name.to_string() + ":" + t.data
85                                                 );
86                                         }
87                                         
88                                          
89                                 }
90                         // other pattern that are not valid
91                         //  ] or )   followed by KEYW "STRING" or number ?
92                         if (
93                                 (this.lastAdded.isName( TokenName.RIGHT_BRACE) || this.lastAdded.isName( TokenName.RIGHT_PAREN))
94                                 &&
95                                 (
96                                         (t.isType(TokenType.KEYW) &&
97                                                 !(t.isName(TokenName.IN) || t.isName(TokenName.INSTANCEOF) || t.isName(TokenName.INSTANCEOF))
98                                         ) || 
99                                         t.isType(TokenType.NAME) ||     // NAME -> ???
100                                                 t.isType(TokenType.STRN) ||
101                                                 t.isType(TokenType.NUMB) 
102                                         )
103                                 ) {
104                                         if (this.packer != null) {                              
105                                         //print("%s\n%s\n", this.lastAdded.asString(), t.asString());
106                                                 this.packer.logError(
107                                                 Packer.ResultType.err,
108                                                 this.reader.filename,
109                                                 t.line,
110                                                 "'" + this.lastAdded.data+ "' token followed by " + t.name.to_string() + ":" + t.data
111                                                 );
112                                         }
113                         }
114                 }
115                 
116                 
117                 
118             this.tokens.add(t);
119             
120             if (t.isType(TokenType.WHIT) || t.isType(TokenType.COMM)){
121                // do not set last...
122             } else {
123                         this.lastAdded = t;
124             }
125             
126         }
127         public Token? pop ()
128         {
129             if (this.tokens.size > 0) {
130                 return this.tokens.remove_at(this.tokens.size-1);
131             }
132             return null;
133         }
134         
135             public new Token get(int i) {
136             return this.tokens.get(i);
137         }
138         public void dump()
139         {
140                         var line = 0;
141                 foreach(var token in this.tokens) {
142                         if (token.line != line) {
143                                 print("%d: ", token.line);
144                                 line = token.line;
145                                 }
146                                 print("%s",token.data);
147                         }
148                         print("\n----\n");
149                 foreach(var token in this.tokens) {
150                         stdout.printf ("%s\n", token.asString());
151                 }
152         }
153         
154     }
155
    /**
     * Error codes reserved for the token reader.
     *
     * NOTE(review): none of the live code in this file section throws these;
     * the only reference is inside commented-out code - confirm whether other
     * files still use them.
     */
    public errordomain TokenReader_Error {
            ArgumentError,
            SyntaxError
    }
160     
161
162     public class TokenReader : Object
163     {
164         
165         
166         
167         /*
168          *
169          * I wonder if this will accept the prop: value, prop2 :value construxtor if we do not define one...
170          */
171         
172         /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
173         public bool collapseWhite = false; // only reduces white space...
174         /** @cfg {Boolean} keepDocs keep JSDOC comments **/
175         public bool keepDocs = true;
176         /** @cfg {Boolean} keepWhite keep White space **/
177         public bool keepWhite = false;
178         /** @cfg {Boolean} keepComments  keep all comments **/
179         public bool keepComments = false;
180         /** @cfg {Boolean} sepIdents seperate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
181         public bool sepIdents = false;
182         /** @cfg {String} filename name of file being parsed. **/
183         public string filename = "";
184         /** @config {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
185         public bool ignoreBadGrammer = false;
186         
187         
188         int line = 0;
189         
190         private Packer? packer;
191         
192         public TokenReader(Packer? packer)
193         {
194                 this.packer = packer;
195                 }
196         
197         /**
198          * tokenize a stream
199          * @return {Array} of tokens
200          * 
201          * ts = new TextStream(File.read(str));
202          * tr = TokenReader({ keepComments : true, keepWhite : true });
203          * tr.tokenize(ts)
204          * 
205          */
206         public TokenArray tokenize(TextStream stream)  
207         {
208             this.line =1;
209             var tokens = new TokenArray(this.packer, this);
210            
211          
212             while (!stream.lookEOF()) {
213                 
214
215                 if (this.read_mlcomment(stream, tokens)) continue;
216                 if (this.read_slcomment(stream, tokens)) continue;
217                 if (this.read_dbquote(stream, tokens))   continue;
218                 if (this.read_snquote(stream, tokens))   continue;
219                 if (this.read_regx(stream, tokens))      continue;
220                 if (this.read_numb(stream, tokens))      continue;
221                 if (this.read_punc(stream, tokens))      continue;
222                 if (this.read_newline(stream, tokens))   continue;
223                 if (this.read_space(stream, tokens))     continue;
224                 if (this.read_word(stream, tokens))      continue;
225                 
226                 // if execution reaches here then an error has happened
227                 tokens.push(
228                         new Token(stream.nextS(), TokenType.TOKN, TokenName.UNKNOWN_TOKEN, this.line)
229                 );
230             }
231             
232             
233             
234             return tokens;
235         }
236
237         /**
238          * findPuncToken - find the id of a token (previous to current)
239          * need to back check syntax..
240          * 
241          * @arg {Array} tokens the array of tokens.
242          * @arg {String} token data (eg. '(')
243          * @arg {Number} offset where to start reading from
244          * @return {Number} position of token
245          */
246         public int findPuncToken(TokenArray tokens, string data, int n)
247         {
248             n = n > 0 ? n :  tokens.length -1;
249             var stack = 0;
250             while (n > -1) {
251                 
252                 if (stack < 1 && tokens.get(n).data == data) {
253                     return n;
254                 }
255                 
256                 if (tokens.get(n).data  == ")" || tokens.get(n).data  == "}") {
257                     stack++;
258                     n--;
259                     continue;
260                 }
261                 if (stack > 0 && (tokens.get(n).data  == "{" || tokens.get(n).data  == "(")) {
262                     stack--;
263                     n--;
264                     continue;
265                 }
266                 
267                 
268                 n--;
269             }
270             return -1;
271         }
272         /**
273          * lastSym - find the last token symbol
274          * need to back check syntax..
275          * 
276          * @arg {Array} tokens the array of tokens.
277          * @arg {Number} offset where to start..
278          * @return {Token} the token
279          */
280         public Token? lastSym(TokenArray tokens, int n)
281         {
282             for (var i = n-1; i >= 0; i--) {
283                 if (!(tokens.get(i).isType(TokenType.WHIT) || tokens.get(i).isType(TokenType.COMM))) {
284                     return tokens.get(i);
285                 }
286             }
287             return null;
288         }
289         
290          
291         
292         /**
293             @returns {Boolean} Was the token found?
294          */
295         public bool read_word (TextStream stream, TokenArray tokens)  
296         {
297             string found = "";
298             while (!stream.lookEOF() && Lang.isWordChar(stream.lookC() )) {
299                 found += stream.nextC().to_string();
300             }
301              
302             if (found == "") {
303                 return false;
304             }
305             TokenName name;
306             try {
307                         name = Lang.keyword(found);
308                         tokens.push(new Token(found, TokenType.KEYW, name, this.line));
309                         return true;
310                 }  catch (LangError e) {        
311                         // noop -- then it's a word / not a keyword...
312                 }
313                  /*
314                         What did all this do...
315                         
316                 //
317                 
318                 // look for "()return" ?? why ???
319                 var ls = tokens.lastSym();
320                 if (found == "return" && ls != null && ls.data == ")") {
321                     //Seed.print('@' + tokens.length);
322                     var n = this.findPuncToken(tokens, ")", 0);
323                     //Seed.print(')@' + n);
324                     n = this.findPuncToken(tokens, "(", n-1);
325                     //Seed.print('(@' + n);
326                     
327                     //var lt = this.lastSym(tokens, n);
328                     /*
329                     //print(JSON.stringify(lt));
330                     if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
331                         if (!this.ignoreBadGrammer) {
332                             throw new TokenReader_Error.ArgumentError(
333                                 this.filename + ":" + this.line + " Error - return found after )"
334                             );
335                         }
336                     }
337                     
338                     */
339                     /*
340                 }
341                 
342                 
343                 tokens.push(new Token(found, TokenType.KEYW, name, this.line));
344                 return true;
345             }
346             */
347             if (!this.sepIdents || found.index_of(".") < 0 ) {
348                 tokens.push(new Token(found, TokenType.NAME, TokenName.NAME, this.line));
349                 return true;
350             }
351             var n = found.split(".");
352             var p = false;
353             foreach (unowned string nm in n) {
354                 
355                 if (p) {
356                     tokens.push(new Token(".", TokenType.PUNC, TokenName.DOT, this.line));
357                 }
358                 p=true;
359                 if (nm.length < 1 ) {
360                                 continue;
361                         }
362                 tokens.push(new Token(nm, TokenType.NAME, TokenName.NAME, this.line));
363             }
364             return true;
365                 
366
367         }
368
369         /**
370             @returns {Boolean} Was the token found?
371          */
372         public bool read_punc (TextStream stream, TokenArray tokens)  
373         {
374             string found = "";
375             int pos = 0;
376             TokenName tokname = TokenName.UNKNOWN;
377             while (!stream.lookEOF()) {
378                         var ns = stream.lookC();
379                                 if (pos == 0 ){
380                                         tokname = Lang.puncFirstString(ns);
381                                         if (TokenName.UNKNOWN == tokname) {
382                                                 break;
383                                         } 
384                                         pos++;
385                         found += stream.nextS();
386                                         continue;
387                                 }
388                         var nx = Lang.puncString(found + ns.to_string() );
389                                 if (TokenName.UNKNOWN == nx) {
390                                         break;
391                                 }
392                                 
393                                 tokname = nx;
394                 found += stream.nextS();
395             }
396             
397             
398             if (tokname == TokenName.UNKNOWN) {
399                 return false;
400             }
401             
402             var ls = tokens.lastSym();
403             
404             if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
405                 //print("Error - comma found before " + found);
406                 //print(JSON.stringify(tokens.lastSym(), null,4));
407                 if (this.packer != null) {
408                             this.packer.logError(
409                                         this.ignoreBadGrammer ? Packer.ResultType.warn : Packer.ResultType.err,
410                                         this.filename,
411                                         this.line,
412                                         "comma found before " + found
413                                 );
414                 }
415                  
416             }
417             
418             tokens.push(new Token(found, TokenType.PUNC, tokname, this.line));
419             return true;
420             
421         } 
422
423         /**
424             @returns {Boolean} Was the token found?
425          */
426         public bool read_space  (TextStream stream, TokenArray tokens)  
427         {
428             // not supported yet.. newlines can be unicode...
429             var found = "";
430             
431             while (!stream.lookEOF() && Lang.isSpace(  stream.lookS()) && !Lang.isNewline(stream.lookS())) {
432                 found += stream.nextS();
433             }
434             
435             if (found == "") {
436                 return false;
437             }
438             //print("WHITE = " + JSON.stringify(found));
439             
440              
441             if (this.collapseWhite) {
442                 found = " "; // this might work better if it was a '\n' ???
443             }
444             if (this.keepWhite) {
445                 tokens.push(new Token(found, TokenType.WHIT, TokenName.SPACE, this.line));
446             }
447             return true;
448         
449         }
450
451         /**
452             @returns {Boolean} Was the token found?
453          */
454         public bool read_newline  (TextStream stream, TokenArray tokens)  
455         {
456             // we do not support it yet, but newlines can be UNICODE..
457             var found = "";
458
459             var line = this.line;
460             // \r  or \r\n 
461             var lastc = "";
462             while (!stream.lookEOF() && Lang.isNewline(stream.lookS())) {
463                         var cur = stream.lookS();;
464                         if (lastc == "\r" && cur == "\n") {
465                                 // dont add new line..
466                         } else {
467                         this.line++;
468                 }
469                 lastc = cur;
470                          
471
472                 found += stream.nextS();
473             }
474
475             if (found == "") {
476                 return false;
477             }
478             //print("NEWLINE @%d  changing this.line to %d\n", line, this.line);
479             // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
480             // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
481            
482             
483             //this.line++;
484             if (this.collapseWhite) {
485                 found = "\n"; // reduces multiple line breaks into a single one...
486             }
487             
488             if (this.keepWhite) {
489                 var last = tokens.last();
490                 if (last != null && last.type != TokenType.WHIT) {
491                     //tokens.push(last);
492                 } else {
493                         tokens.pop(); // remove the last token..
494                 }
495                 // replaces last new line... 
496                 tokens.push(new Token(found, TokenType.WHIT, TokenName.NEWLINE, line));
497             }
498             return true;
499         }
500
501         /**
502             @returns {Boolean} Was the token found?
503          */
504         public bool read_mlcomment  (TextStream stream, TokenArray tokens)  
505         {
506             if (stream.lookC() != '/') {
507                 return false;
508             }
509             if (stream.lookC(1) != '*') {
510                 return false;
511             }
512             
513             var found = new StringBuilder();
514             found.append(stream.nextS(2));
515            
516
517             string  c = "";
518             var line = this.line;
519             while (!stream.lookEOF() && !(stream.lookC(-1) == '/' && stream.lookC(-2) == '*')) {
520                 c = stream.nextS();
521                 if (c == "\n") {
522                     this.line++;
523                 }
524                 found.append(c);
525             }
526             
527             // to start doclet we allow /** or /*** but not /**/ or /****
528             //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
529             if (this.keepDocs && found.len > 4 && found.str.index_of("/**") == 0 && found.str[3] != '/') {
530                 tokens.push(new Token(found.str, TokenType.COMM, TokenName.JSDOC, this.line));
531             } else if (this.keepComments) {
532                 tokens.push(new Token(found.str, TokenType.COMM, TokenName.MULTI_LINE_COMM, line));
533             }
534             return true;
535         
536         } 
537  
538         /**
539             @returns {Boolean} Was the token found?
540          */
541          public bool read_slcomment  (TextStream stream, TokenArray tokens)  
542          {
543             var found = "";
544             if (
545                 (stream.lookC() == '/' && stream.lookC(1) == '/' && (""!=(found=stream.nextS(2))))
546                 || 
547                 (stream.lookC() == '<' && stream.lookC(1) == '!' && stream.lookC(2) == '-' && stream.lookC(3) == '-' && (""!=(found=stream.nextS(4))))
548             ) {
549                 var line = this.line;
550                 while (!stream.lookEOF()) {
551                                         //print(stream.look().to_string());
552                         if ( Lang.isNewline(stream.lookS().to_string())) {
553                                 break;
554                         }
555                     found += stream.nextS();
556                 }
557                 if (!stream.lookEOF()) { // lookinng for end  of line... if we got it, then do not eat the character..
558                     found += stream.nextS();
559                 }
560                 if (this.keepComments) {
561                     tokens.push(new Token(found, TokenType.COMM, TokenName.SINGLE_LINE_COMM, line));
562                 }
563                 this.line++;
564                 return true;
565             }
566             return false;
567         }
568
569         /**
570             @returns {Boolean} Was the token found?
571          */
572         public bool read_dbquote  (TextStream stream, TokenArray tokens)  
573         {
574             if (stream.lookC() != '"') {
575                 return false;
576             }
577                 // find terminator
578             var str = new StringBuilder();
579             str.append_unichar(stream.nextC());
580             
581             while (!stream.lookEOF()) {
582                 if (stream.lookC() == '\\') {
583                     if (Lang.isNewline(stream.lookS(1).to_string())) {
584                         do {
585                             stream.nextC();
586                         } while (!stream.lookEOF() && Lang.isNewline(stream.lookS().to_string()));
587                         str.append( "\\\n");
588                     }
589                     else {
590                         str.append(stream.nextS(2));
591                     }
592                     continue;
593                 }
594                 if (stream.lookC() == '"') {
595                     str.append_unichar(stream.nextC());
596                     tokens.push(new Token(str.str, TokenType.STRN, TokenName.DOUBLE_QUOTE, this.line));
597                     return true;
598                 }
599             
600                 str.append(stream.nextS());
601                 
602             }
603             return false;
604         }
605
606         /**
607             @returns {Boolean} Was the token found?
608          */
609         public bool read_snquote  (TextStream stream, TokenArray tokens)  
610         {
611             if (stream.lookC() != '\'') {
612                 return false;
613             }
614             // find terminator
615             var str = new StringBuilder();
616                 str.append_unichar(stream.nextC());
617             
618             while (!stream.lookEOF()) {
619                 if (stream.lookC() == '\\') { // escape sequence
620                     str.append( stream.nextS(2));
621                     continue;
622                 }
623                 if (stream.lookC() == '\'') {
624                     str.append_unichar(stream.nextC());
625                     tokens.push(new Token(str.str, TokenType.STRN, TokenName.SINGLE_QUOTE, this.line));
626                     return true;
627                 }
628                 str.append(stream.nextS());
629                 
630             }
631             return false;
632         }
633         
634
635         /**
636             @returns {Boolean} Was the token found?
637          */
638         public bool read_numb  (TextStream stream, TokenArray tokens)  
639         {
640             if (stream.lookC() == '0' && stream.lookC(1) == 'x') {
641                 return this.read_hex(stream, tokens);
642             }
643             
644             var found = "";
645             
646             while (!stream.lookEOF() && !Lang.isNewline(stream.lookS()) && Lang.isNumber(found+stream.lookC().to_string())){
647                 found += stream.nextS();
648             }
649             
650             if (found == "") {
651                 return false;
652             }
653             // if we hit an 'e'.... then we need to carry on parsing..
654             if (stream.lookC() == 'e' || stream.lookC() == 'E') {
655                         found += stream.nextS();
656                         var nc = stream.lookC();
657                         if (nc == '+' || nc == '-' || (nc >= '0' && nc <= '9')) {
658                                 found += stream.nextS();
659                             while (!stream.lookEOF() && !Lang.isNewline(stream.lookS()) && Lang.isNumber(found+stream.lookC().to_string())){
660                                         found += stream.nextS();
661                                     }
662                                     if (!Lang.isNumber(found)) {
663                                         if (this.packer != null) {
664                                                   this.packer.logError(
665                                                                 Packer.ResultType.err,
666                                                                 this.filename,
667                                                                 this.line,
668                                                                 "Invalid Number " + found
669                                                         );
670                                                 }
671                                                 return true; // eat the characters and continue...
672                         }
673                                                 
674                         } else {
675                         if (this.packer != null) {
676                                           this.packer.logError(
677                                                         Packer.ResultType.err,
678                                                         this.filename,
679                                                         this.line,
680                                                         "could not find +/- or 0-9 after Number '" + found
681                                                 );
682                                         return true;
683                                 }
684                         }
685                         
686             }
687              
688             if (GLib.Regex.match_simple("^0[0-7]", found)) {
689                 tokens.push(new Token(found, TokenType.NUMB, TokenName.OCTAL, this.line));
690                 return true;
691             }
692             //print("got number '%s'\n", found);
693             
694             tokens.push(new Token(found, TokenType.NUMB, TokenName.DECIMAL, this.line));
695             return true;
696         
697         }
698        
699         /**
700             @returns {Boolean} Was the token found?
701          */
702         public bool read_hex  (TextStream stream, TokenArray tokens)  
703         {
704             var found = stream.nextS(2);
705             
706             while (!stream.lookEOF()) {
707                 if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.lookC().to_string())) { // done
708                     tokens.push(new Token(found, TokenType.NUMB, TokenName.HEX_DEC, this.line));
709                     return true;
710                 }
711                 
712                 found += stream.nextS();
713                
714             }
715             return false;
716         }
717
718         /**
719             @returns {Boolean} Was the token found?
720          */
721         public bool read_regx (TextStream stream, TokenArray tokens)  
722         {
723               
724             if (stream.lookC() != '/') {
725                 return false;
726             }
727             var  last = tokens.lastSym();
728             if (
729                 (last == null)
730                 || 
731                 (
732                        !last.isType(TokenType.NUMB)   // stuff that can not appear before a regex..
733                     && !last.isType(TokenType.NAME)
734                     && !last.isName(TokenName.RIGHT_PAREN)
735                     && !last.isName(TokenName.RIGHT_BRACE)
736                 )
737             )  {
738                 var regex = stream.nextS();
739                 var in_brace = false; // this is really hacky... we ignore [ .../ ]  so aforward slash in a regex.. 
740                 while (!stream.lookEOF()) {
741                         if (stream.lookC() == '[') {
742                                 in_brace = true;
743                         }
744                         if (in_brace && stream.lookC() == ']') {
745                                 in_brace = false;
746                         }
747                         
748                     if (stream.lookC() == '\\') { // escape sequence
749                         regex += stream.nextS(2);
750                         continue;
751                     }
752                     if (!in_brace && stream.lookC() == '/') {
753                         regex += stream.nextS();
754                         
755                         while (GLib.Regex.match_simple("[gmi]", stream.lookS().to_string())) {
756                             regex += stream.nextS();
757                         }
758                         
759                         tokens.push(new Token(regex, TokenType.REGX, TokenName.REGX, this.line));
760                         return true;
761                     }
762                      
763                     regex += stream.nextS();
764                      
765                 }
766                 // error: unterminated regex
767             }
768             return false;
769         }
770     }
771 }