1 //<script type="text/javascript">
7 var tr = new JSDOC.TokenReader();
10 tr.keepComments = true;
12 tr.collapseWhite = false;
15 this.timerPrint("START" + fn);
17 // we can load translation map here...
19 var toks = tr.tokenize(new TextStream(str)); // don't merge xxx + . + yyyy etc.
23 //const Token = imports.Token.Token;
24 //const Lang = imports.Lang.Lang;
27 @class Searches a {@link JSDOC.TextStream} for language tokens.
35 public class TokenArray: Object {
37 public Gee.ArrayList<Token> tokens;
39 get { return this.tokens.size; }
44 this.tokens = new Gee.ArrayList<Token>();
47 public Token? last() {
48 if (this.tokens.size > 0) {
49 return this.tokens.get(this.tokens.size-1);
53 public Token? lastSym () {
54 for (var i = this.tokens.size-1; i >= 0; i--) {
55 if (!(this.tokens.get(i).is("WHIT") || this.tokens.get(i).is("COMM"))) {
56 return this.tokens.get(i);
61 public void push (Token t) {
66 if (this.tokens.size > 0) {
67 return this.tokens.remove_at(this.tokens.size-1);
72 public new Token get(int i) {
73 return this.tokens.get(i);
77 public errordomain TokenReader_Error {
82 public class TokenReader : Object
89 * I wonder if this will accept the prop: value, prop2 :value construxtor if we do not define one...
92 /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
93 public bool collapseWhite = false; // only reduces white space...
94 /** @cfg {Boolean} keepDocs keep JSDOC comments **/
95 public bool keepDocs = true;
96 /** @cfg {Boolean} keepWhite keep White space **/
97 public bool keepWhite = false;
98 /** @cfg {Boolean} keepComments keep all comments **/
99 public bool keepComments = false;
100 /** @cfg {Boolean} sepIdents seperate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
101 public bool sepIdents = false;
102 /** @cfg {String} filename name of file being parsed. **/
103 public string filename = "";
104 /** @config {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
105 public bool ignoreBadGrammer = false;
112 * @return {Array} of tokens
114 * ts = new TextStream(File.read(str));
115 * tr = TokenReader({ keepComments : true, keepWhite : true });
119 public TokenArray tokenize(TextStream stream)
122 var tokens = new TokenArray();
125 while (!stream.lookEOF()) {
128 if (this.read_mlcomment(stream, tokens)) continue;
129 if (this.read_slcomment(stream, tokens)) continue;
130 if (this.read_dbquote(stream, tokens)) continue;
131 if (this.read_snquote(stream, tokens)) continue;
132 if (this.read_regx(stream, tokens)) continue;
133 if (this.read_numb(stream, tokens)) continue;
134 if (this.read_punc(stream, tokens)) continue;
135 if (this.read_newline(stream, tokens)) continue;
136 if (this.read_space(stream, tokens)) continue;
137 if (this.read_word(stream, tokens)) continue;
139 // if execution reaches here then an error has happened
141 new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
151 * findPuncToken - find the id of a token (previous to current)
152 * need to back check syntax..
154 * @arg {Array} tokens the array of tokens.
155 * @arg {String} token data (eg. '(')
156 * @arg {Number} offset where to start reading from
157 * @return {Number} position of token
159 public int findPuncToken(TokenArray tokens, string data, int n)
161 n = n > 0 ? n : tokens.length -1;
165 if (stack < 1 && tokens.get(n).data == data) {
169 if (tokens.get(n).data == ")" || tokens.get(n).data == "}") {
174 if (stack > 0 && (tokens.get(n).data == "{" || tokens.get(n).data == "(")) {
186 * lastSym - find the last token symbol
187 * need to back check syntax..
189 * @arg {Array} tokens the array of tokens.
190 * @arg {Number} offset where to start..
191 * @return {Token} the token
193 public Token? lastSym(TokenArray tokens, int n)
195 for (var i = n-1; i >= 0; i--) {
196 if (!(tokens.get(i).is("WHIT") || tokens.get(i).is("COMM"))) {
197 return tokens.get(i);
206 @returns {Boolean} Was the token found?
208 public bool read_word (TextStream stream, TokenArray tokens)
211 while (!stream.lookEOF() && Lang.isWordChar((string)stream.look())) {
212 found += stream.next();
219 var name = Lang.keyword(found);
222 // look for "()return" ?? why ???
223 var ls = tokens.lastSym();
224 if (found == "return" && ls != null && ls.data == ")") {
225 //Seed.print('@' + tokens.length);
226 var n = this.findPuncToken(tokens, ")", 0);
227 //Seed.print(')@' + n);
228 n = this.findPuncToken(tokens, "(", n-1);
229 //Seed.print('(@' + n);
231 //var lt = this.lastSym(tokens, n);
233 //print(JSON.stringify(lt));
234 if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
235 if (!this.ignoreBadGrammer) {
236 throw new TokenReader_Error.ArgumentError(
237 this.filename + ":" + this.line + " Error - return found after )"
246 tokens.push(new Token(found, "KEYW", name, this.line));
250 if (!this.sepIdents || found.index_of(".") < 0 ) {
251 tokens.push(new Token(found, "NAME", "NAME", this.line));
254 var n = found.split(".");
256 foreach (unowned string nm in n) {
258 tokens.push(new Token(".", "PUNC", "DOT", this.line));
261 tokens.push(new Token(nm, "NAME", "NAME", this.line));
269 @returns {Boolean} Was the token found?
271 public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
275 while (!stream.lookEOF() && Lang.punc(found + (string)stream.look()).length > 0) {
276 found += stream.next();
284 var ls = tokens.lastSym();
286 if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
287 //print("Error - comma found before " + found);
288 //print(JSON.stringify(tokens.lastSym(), null,4));
289 if (this.ignoreBadGrammer) {
290 print("\n" + this.filename + ":" + this.line.to_string() + " Error - comma found before " + found);
292 throw new TokenReader_Error.ArgumentError(
293 this.filename + ":" + this.line.to_string() + " comma found before " + found
300 tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
306 @returns {Boolean} Was the token found?
308 public bool read_space (TextStream stream, TokenArray tokens)
312 while (!stream.lookEOF() && Lang.isSpaceC( stream.look()) && !Lang.isNewlineC(stream.look())) {
313 found += stream.next();
319 //print("WHITE = " + JSON.stringify(found));
322 if (this.collapseWhite) {
323 found = " "; // this might work better if it was a '\n' ???
325 if (this.keepWhite) {
326 tokens.push(new Token(found, "WHIT", "SPACE", this.line));
333 @returns {Boolean} Was the token found?
335 public bool read_newline (TextStream stream, TokenArray tokens)
338 var line = this.line;
339 while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
341 found += stream.next();
348 // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
349 // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
353 if (this.collapseWhite) {
354 found = "\n"; // reduces multiple line breaks into a single one...
357 if (this.keepWhite) {
358 var last = tokens.pop();
359 if (last != null && last.name != "WHIT") {
362 // replaces last new line...
363 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
369 @returns {Boolean} Was the token found?
371 public bool read_mlcomment (TextStream stream, TokenArray tokens)
373 if (stream.look() != '/') {
376 if (stream.look(1) != '*') {
379 var found = stream.next(2);
381 var line = this.line;
382 while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
390 // to start doclet we allow /** or /*** but not /**/ or /****
391 //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
392 if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
393 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
394 } else if (this.keepComments) {
395 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
402 @returns {Boolean} Was the token found?
404 public bool read_slcomment (TextStream stream, TokenArray tokens)
408 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
410 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
412 var line = this.line;
413 while (!stream.lookEOF() && !Lang.isNewline((string)stream.look())) {
414 found += stream.next();
416 //if (!stream.lookEOF()) { // what? << eat the EOL?
417 found += stream.next();
419 if (this.keepComments) {
420 tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
429 @returns {Boolean} Was the token found?
431 public bool read_dbquote (TextStream stream, TokenArray tokens)
433 if (stream.look() != '"') {
437 var str = stream.next();
439 while (!stream.lookEOF()) {
440 if (stream.look() == '\\') {
441 if (Lang.isNewline((string)stream.look(1))) {
444 } while (!stream.lookEOF() && Lang.isNewline((string)stream.look()));
448 str += stream.next(2);
452 if (stream.look() == '"') {
453 str += stream.next();
454 tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
458 str += stream.next();
465 @returns {Boolean} Was the token found?
467 public bool read_snquote (TextStream stream, TokenArray tokens)
469 if (stream.look() != '\'') {
473 var str = stream.next();
475 while (!stream.lookEOF()) {
476 if (stream.look() == '\\') { // escape sequence
477 str += stream.next(2);
480 if (stream.look() == '\'') {
481 str += stream.next();
482 tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
485 str += stream.next();
493 @returns {Boolean} Was the token found?
495 public bool read_numb (TextStream stream, TokenArray tokens)
497 if (stream.look() == '0' && stream.look(1) == 'x') {
498 return this.read_hex(stream, tokens);
503 while (!stream.lookEOF() && Lang.isNumber(found+(string)stream.look())){
504 found += stream.next();
510 if (GLib.Regex.match_simple("^0[0-7]", found)) {
511 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
514 tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
520 @returns {Boolean} Was the token found?
522 public bool read_hex (TextStream stream, TokenArray tokens)
524 var found = stream.next(2);
526 while (!stream.lookEOF()) {
527 if (Lang.isHexDec(found) && !Lang.isHexDec(found+(string)stream.look())) { // done
528 tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
532 found += stream.next();
539 @returns {Boolean} Was the token found?
541 public bool read_regx (TextStream stream, TokenArray tokens)
544 if (stream.look() != '/') {
547 var last = tokens.lastSym();
552 !last.is("NUMB") // stuff that can not appear before a regex..
554 && !last.is("RIGHT_PAREN")
555 && !last.is("RIGHT_BRACKET")
558 var regex = stream.next();
560 while (!stream.lookEOF()) {
561 if (stream.look() == '\\') { // escape sequence
562 regex += stream.next(2);
565 if (stream.look() == '/') {
566 regex += stream.next();
568 while (GLib.Regex.match_simple("[gmi]", (string)stream.look())) {
569 regex += stream.next();
572 tokens.push(new Token(regex, "REGX", "REGX", this.line));
576 regex += stream.next();
579 // error: unterminated regex