JSDOC/TokenReader.js
//<script type="text/javascript">

const XObject = imports.XObject.XObject;
const console = imports.console.console;

const Token   = imports.Token.Token;
const Lang    = imports.Lang.Lang;

/**
        @class Search a {@link JSDOC.TextStream} for language tokens.
*/
const TokenReader = XObject.define(
    function(o) {
        
        XObject.extend(this, o || {});
        
    },
    Object,
    {
        /** @cfg {Boolean} collapseWhite merge runs of whitespace/comments into a single token **/
        collapseWhite : false, // only reduces white space...
        /** @cfg {Boolean} keepDocs keep JSDOC comments **/
        keepDocs : true,
        /** @cfg {Boolean} keepWhite keep white space **/
        keepWhite : false,
        /** @cfg {Boolean} keepComments keep all comments **/
        keepComments : false,
        /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
        sepIdents : false,
        /** @cfg {String} filename name of the file being parsed. **/
        filename : '',
        /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find constructs that might break compression **/
        ignoreBadGrammer : false,
        /**
         * tokenize a stream
         * @return {Array} of tokens
         * 
         * var ts = new TextStream(File.read(str));
         * var tr = new TokenReader({ keepComments : true, keepWhite : true });
         * var tokens = tr.tokenize(ts);
         * 
         */
        tokenize : function(/**JSDOC.TextStream*/stream) {
            this.line = 1;
            var tokens = [];
            /**@ignore*/
            tokens.last    = function() { return tokens[tokens.length-1]; };
            /**@ignore*/
            tokens.lastSym = function() {
                for (var i = tokens.length-1; i >= 0; i--) {
                    if (!(tokens[i].is("WHIT") || tokens[i].is("COMM"))) return tokens[i];
                }
                return null; // no symbol found (only whitespace/comments so far)
            };

            while (!stream.look().eof) {
                if (this.read_mlcomment(stream, tokens)) continue;
                if (this.read_slcomment(stream, tokens)) continue;
                if (this.read_dbquote(stream, tokens))   continue;
                if (this.read_snquote(stream, tokens))   continue;
                if (this.read_regx(stream, tokens))      continue;
                if (this.read_numb(stream, tokens))      continue;
                if (this.read_punc(stream, tokens))      continue;
                if (this.read_newline(stream, tokens))   continue;
                if (this.read_space(stream, tokens))     continue;
                if (this.read_word(stream, tokens))      continue;
                
                // if execution reaches here then an error has happened
                tokens.push(new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line));
            }
            
            return tokens;
        },
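        /* Illustrative sketch (added comment, not part of the original file): roughly
         * what tokenize() produces for a small input, assuming TextStream is imported
         * like the other JSDOC modules above and assuming the usual Lang keyword/punc
         * tables.  Token fields follow new Token(data, type, name, line); the name
         * values shown are approximate.
         *
         *   var tr = new TokenReader({ keepWhite : true });
         *   var toks = tr.tokenize(new TextStream("var x = 1;"));
         *   // expected shape (approximate):
         *   //   { data: "var", type: "KEYW", name: "VAR",       line: 1 }
         *   //   { data: " ",   type: "WHIT", name: "SPACE",     line: 1 }
         *   //   { data: "x",   type: "NAME", name: "NAME",      line: 1 }
         *   //   { data: "=",   type: "PUNC", name: "ASSIGN",    line: 1 }  // name from Lang.punc()
         *   //   { data: "1",   type: "NUMB", name: "DECIMAL",   line: 1 }
         *   //   { data: ";",   type: "PUNC", name: "SEMICOLON", line: 1 }  // name from Lang.punc()
         */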

        /**
         * findPuncToken - find the index of a punctuation token before the current position,
         * scanning backwards and skipping over balanced (..) / {..} pairs.
         * 
         * @arg {Array} tokens the array of tokens.
         * @arg {String} data the token data to look for (eg. '(')
         * @arg {Number} n offset to start scanning back from (defaults to the last token)
         * @return {Number} position of the token, or -1 if it was not found
         */
        findPuncToken : function(tokens, data, n) {
            n = n || tokens.length -1;
            var stack = 0; // depth of nested ')'/'}' pairs we are currently inside
            while (n > -1) {
                
                if (!stack && tokens[n].data == data) {
                    return n;
                }
                
                if (tokens[n].data  == ')' || tokens[n].data  == '}') {
                    stack++;
                    n--;
                    continue;
                }
                if (stack && (tokens[n].data  == '{' || tokens[n].data  == '(')) {
                    stack--;
                    n--;
                    continue;
                }
                
                n--;
            }
            return -1;
        },
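        /* Hedged example (added for illustration): this mirrors how read_word() below
         * uses findPuncToken().  First locate the last ')', then search from just
         * before it for the matching '('; the stack counter skips any nested (..) or
         * {..} pairs in between.  A return value of -1 means no match was found.
         *
         *   var close = this.findPuncToken(tokens, ')');            // index of the last ')'
         *   var open  = this.findPuncToken(tokens, '(', close - 1); // its matching '('
         */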
        /**
         * lastSym - find the last "real" symbol before position n,
         * skipping whitespace and comment tokens.
         * 
         * @arg {Array} tokens the array of tokens.
         * @arg {Number} n offset to start scanning back from
         * @return {Token} the token, or null if none was found
         */
        lastSym : function(tokens, n) {
            for (var i = n-1; i >= 0; i--) {
                if (!(tokens[i].is("WHIT") || tokens[i].is("COMM"))) return tokens[i];
            }
            return null;
        },
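        /* Hedged example (illustration only): lastSym() is what read_word() uses to ask
         * "what token came before this (..) group?" while ignoring whitespace and
         * comments.  For the tokens of "if (x) return", lastSym(tokens, n) with n at
         * the index of the '(' gives the 'if' keyword token; null means nothing but
         * whitespace/comments precede position n.
         */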
        
        /**
            @returns {Boolean} Was the token found?
         */
        read_word : function(/**JSDOC.TokenStream*/stream, tokens) {
            var found = "";
            while (!stream.look().eof && Lang.isWordChar(stream.look())) {
                found += stream.next();
            }
            
            if (found === "") {
                return false;
            }
            
            var name;
            if ((name = Lang.keyword(found))) {
                var lastSym = tokens.lastSym();
                if (found == 'return' && lastSym && lastSym.data == ')') {
                    //Seed.print('@' + tokens.length);
                    var n = this.findPuncToken(tokens, ')');
                    //Seed.print(')@' + n);
                    n = this.findPuncToken(tokens, '(', n-1);
                    //Seed.print('(@' + n);
                    
                    var lt = this.lastSym(tokens, n);
                    //print(JSON.stringify(lt));
                    // 'return' directly after ')' is only accepted when the (..) belongs
                    // to an if/while; anything else is flagged as bad grammar (unless
                    // ignoreBadGrammer is set).
                    if (lt && (lt.type != 'KEYW' || ['IF', 'WHILE'].indexOf(lt.name) < 0)) {
                        if (!this.ignoreBadGrammer) {
                            throw {
                                name : "ArgumentError", 
                                message: "\n" + this.filename + ':' + this.line + " Error - return found after )"
                            }
                        }
                    }
                    
                }
                
                tokens.push(new Token(found, "KEYW", name, this.line));
                return true;
            }
            if (!this.sepIdents || found.indexOf('.') < 0 ) {
                tokens.push(new Token(found, "NAME", "NAME", this.line));
                return true;
            }
            // split dotted identifiers (a.b.c) into NAME / DOT / NAME ... tokens
            var parts = found.split('.');
            var p = false;
            var _this = this;
            parts.forEach(function(nm) {
                if (p) {
                    tokens.push(new Token('.', "PUNC", "DOT", _this.line));
                }
                p = true;
                tokens.push(new Token(nm, "NAME", "NAME", _this.line));
            });
            return true;
        },
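        /* Hedged illustration of the sepIdents switch handled above: with
         * sepIdents === false, "a.b.c" is pushed as a single NAME token; with
         * sepIdents === true it becomes five tokens:
         *
         *   NAME("a"), PUNC/DOT("."), NAME("b"), PUNC/DOT("."), NAME("c")
         */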

        /**
            @returns {Boolean} Was the token found?
         */
        read_punc : function(/**JSDOC.TokenStream*/stream, tokens) {
            var found = "";
            while (!stream.look().eof && Lang.punc(found+stream.look())) {
                found += stream.next();
            }
            
            if (found === "") {
                return false;
            }
            
            var lastSym = tokens.lastSym();
            if ((found == '}' || found == ']') && lastSym && lastSym.data == ',') {
                //print("Error - comma found before " + found);
                //print(JSON.stringify(tokens.lastSym(), null,4));
                if (this.ignoreBadGrammer) {
                    print("\n" + this.filename + ':' + this.line + " Error - comma found before " + found);
                } else {
                    throw {
                        name : "ArgumentError", 
                        message: "\n" + this.filename + ':' + this.line + " Error - comma found before " + found
                    }
                }
            }
            
            tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
            return true;
        },
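        /* Hedged illustration of the check above: a trailing comma such as
         *
         *   var o = { a : 1, };
         *
         * makes the '}' arrive while the last symbol is ',', so either a warning is
         * printed (ignoreBadGrammer) or an ArgumentError is thrown, since trailing
         * commas can break older JS engines and compressors.
         */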

        /**
            @returns {Boolean} Was the token found?
         */
        read_space : function(/**JSDOC.TokenStream*/stream, tokens) {
            var found = "";
            
            while (!stream.look().eof && Lang.isSpace(stream.look()) && !Lang.isNewline(stream.look())) {
                found += stream.next();
            }
            
            if (found === "") {
                return false;
            }
            //print("WHITE = " + JSON.stringify(found)); 
            if (this.collapseWhite) found = " ";
            if (this.keepWhite) tokens.push(new Token(found, "WHIT", "SPACE", this.line));
            return true;
        },

        /**
            @returns {Boolean} Was the token found?
         */
        read_newline : function(/**JSDOC.TokenStream*/stream, tokens) {
            var found = "";
            var line = this.line;
            while (!stream.look().eof && Lang.isNewline(stream.look())) {
                this.line++;
                found += stream.next();
            }
            
            if (found === "") {
                return false;
            }
            
            if (this.collapseWhite) {
                found = "\n";
            }
            if (this.keepWhite) {
                // drop a whitespace token sitting immediately before the newline so we
                // do not emit redundant SPACE + NEWLINE pairs
                var last = tokens.length ? tokens.pop() : false;
                if (last && !last.is("WHIT")) {
                    tokens.push(last);
                }
                
                tokens.push(new Token(found, "WHIT", "NEWLINE", line));
            }
            return true;
        },
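        /* Hedged illustration of the whitespace switches used above: with
         * keepWhite:false no WHIT tokens are emitted at all; with keepWhite:true and
         * collapseWhite:true a run of spaces becomes a single " " SPACE token and a
         * run of newlines a single "\n" NEWLINE token (line counting still advances
         * once per real newline); with collapseWhite:false the original whitespace
         * text is preserved in the token data.
         */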
271
272         /**
273             @returns {Boolean} Was the token found?
274          */
275         read_mlcomment : function(/**JSDOC.TokenStream*/stream, tokens) {
276             if (stream.look() == "/" && stream.look(1) == "*") {
277                 var found = stream.next(2);
278                 var c = '';
279                 var line = this.line;
280                 while (!stream.look().eof && !(stream.look(-1) == "/" && stream.look(-2) == "*")) {
281                     c = stream.next();
282                     if (c == "\n") this.line++;
283                     found += c;
284                 }
285                 
286                 // to start doclet we allow /** or /*** but not /**/ or /****
287                 if (/^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) tokens.push(new Token(found, "COMM", "JSDOC", this.line));
288                 else if (this.keepComments) tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
289                 return true;
290             }
291             return false;
292         },
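        // Hedged illustration of the doclet test above (added comment, not original):
        //   "/** a doclet */"     -> pushed as COMM/JSDOC           when keepDocs is true
        //   "/* plain comment */" -> pushed as COMM/MULTI_LINE_COMM when keepComments is true
        // Comments not kept by either flag are consumed and simply dropped.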
293
294         /**
295             @returns {Boolean} Was the token found?
296          */
297         read_slcomment : function(/**JSDOC.TokenStream*/stream, tokens) {
298             var found;
299             if (
300                 (stream.look() == "/" && stream.look(1) == "/" && (found=stream.next(2)))
301                 || 
302                 (stream.look() == "<" && stream.look(1) == "!" && stream.look(2) == "-" && stream.look(3) == "-" && (found=stream.next(4)))
303             ) {
304                 var line = this.line;
305                 while (!stream.look().eof && !Lang.isNewline(stream.look())) {
306                     found += stream.next();
307                 }
308                 if (!stream.look().eof) {
309                     found += stream.next();
310                 }
311                 if (this.keepComments) {
312                     tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
313                 }
314                 this.line++;
315                 return true;
316             }
317             return false;
318         },
319

        /**
            @returns {Boolean} Was the token found?
         */
        read_dbquote : function(/**JSDOC.TokenStream*/stream, tokens) {
            if (stream.look() == "\"") {
                // find terminator
                var string = stream.next();
                
                while (!stream.look().eof) {
                    if (stream.look() == "\\") {
                        if (Lang.isNewline(stream.look(1))) {
                            do {
                                stream.next();
                            } while (!stream.look().eof && Lang.isNewline(stream.look()));
                            string += "\\\n";
                        }
                        else {
                            string += stream.next(2);
                        }
                    }
                    else if (stream.look() == "\"") {
                        string += stream.next();
                        tokens.push(new Token(string, "STRN", "DOUBLE_QUOTE", this.line));
                        return true;
                    }
                    else {
                        string += stream.next();
                    }
                }
            }
            return false; // error! unterminated string
        },
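        /* Hedged illustration of the escape handling above: inside a double-quoted
         * string, a backslash plus any character is copied through as a two-character
         * escape, and a backslash followed by one or more newlines (a line
         * continuation) is collapsed to a single backslash-newline before the whole
         * literal is pushed as STRN/DOUBLE_QUOTE.  read_snquote below does the same,
         * minus the line-continuation handling.
         */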

        /**
            @returns {Boolean} Was the token found?
         */
        read_snquote : function(/**JSDOC.TokenStream*/stream, tokens) {
            if (stream.look() == "'") {
                // find terminator
                var string = stream.next();
                
                while (!stream.look().eof) {
                    if (stream.look() == "\\") { // escape sequence
                        string += stream.next(2);
                    }
                    else if (stream.look() == "'") {
                        string += stream.next();
                        tokens.push(new Token(string, "STRN", "SINGLE_QUOTE", this.line));
                        return true;
                    }
                    else {
                        string += stream.next();
                    }
                }
            }
            return false; // error! unterminated string
        },

        /**
            @returns {Boolean} Was the token found?
         */
        read_numb : function(/**JSDOC.TokenStream*/stream, tokens) {
            if (stream.look() === "0" && stream.look(1) == "x") {
                return this.read_hex(stream, tokens);
            }
            
            var found = "";
            
            while (!stream.look().eof && Lang.isNumber(found+stream.look())){
                found += stream.next();
            }
            
            if (found === "") {
                return false;
            }
            else {
                if (/^0[0-7]/.test(found)) tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
                else tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
                return true;
            }
        },
        /*t:
            requires("../lib/JSDOC/TextStream.js");
            requires("../lib/JSDOC/Token.js");
            requires("../lib/JSDOC/Lang.js");
            
            plan(3, "testing read_numb");
            
            //// setup
            var src = "function foo(num){while (num+8.0 >= 0x20 && num < 0777){}}";
            var tr = new TokenReader();
            var tokens = tr.tokenize(new TextStream(src));
            
            var hexToken, octToken, decToken;
            for (var i = 0; i < tokens.length; i++) {
                if (tokens[i].name == "HEX_DEC") hexToken = tokens[i];
                if (tokens[i].name == "OCTAL") octToken = tokens[i];
                if (tokens[i].name == "DECIMAL") decToken = tokens[i];
            }
            ////
            
            is(decToken.data, "8.0", "decimal number is found in source.");
            is(hexToken.data, "0x20", "hexdec number is found in source (issue #99).");
            is(octToken.data, "0777", "octal number is found in source.");
        */

        /**
            @returns {Boolean} Was the token found?
         */
        read_hex : function(/**JSDOC.TokenStream*/stream, tokens) {
            var found = stream.next(2);
            
            while (!stream.look().eof) {
                if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.look())) { // done
                    tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
                    return true;
                }
                else {
                    found += stream.next();
                }
            }
            return false;
        },
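        /* Hedged note on read_hex above (added comment): it is entered from
         * read_numb() when the stream starts with "0x"; digits are accumulated while
         * they still form a valid hex literal and the result is pushed as
         * NUMB/HEX_DEC, e.g. the "0x20" exercised by the test block above.
         */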

        /**
            @returns {Boolean} Was the token found?
         */
        read_regx : function(/**JSDOC.TokenStream*/stream, tokens) {
            var last;
            if (
                stream.look() == "/"
                && 
                (
                    
                    (
                        !(last = tokens.lastSym()) // there is no last, the regex is the first symbol
                        || 
                        (
                               !last.is("NUMB")
                            && !last.is("NAME")
                            && !last.is("RIGHT_PAREN")
                            && !last.is("RIGHT_BRACKET")
                        )
                    )
                )
            ) {
                var regex = stream.next();
                var in_brace = false; // inside a [...] character class?
                while (!stream.look().eof) {
                    
                    if (stream.look() == "[") { // character class start
                        in_brace = true;
                        regex += stream.next();
                        continue;
                    }
                    
                    if (stream.look() == "]") { // character class end
                        in_brace = false;
                        regex += stream.next();
                        continue;
                    }
                    
                    if (stream.look() == "\\") { // escape sequence
                        regex += stream.next(2);
                        continue;
                    }
                    
                    if (!in_brace && stream.look() == "/") {
                        regex += stream.next();
                        
                        while (/[gmi]/.test(stream.look())) {
                            regex += stream.next();
                        }
                        
                        tokens.push(new Token(regex, "REGX", "REGX", this.line));
                        return true;
                    }
                    
                    regex += stream.next();
                    
                }
                // error: unterminated regex
            }
            return false;
        }
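        /* Hedged illustration of the disambiguation above (added comment): a '/' only
         * starts a regex literal when the previous symbol could not end an expression,
         * e.g.
         *
         *   x = a / b;               // lastSym is NAME 'a' -> division, not a regex
         *   x = str.match(/a[/]b/g); // lastSym is '('      -> REGX token "/a[/]b/g"
         *
         * and the in_brace flag keeps a '/' inside a [...] character class from
         * terminating the literal early.
         */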
});