1 //<script type="text/javascript">
6 //const Token = imports.Token.Token;
7 //const Lang = imports.Lang.Lang;
10 @class Search a {@link JSDOC.TextStream} for language tokens.
// Accumulates tokens produced by TokenReader and performs simple
// adjacent-token grammar checks on push().
// NOTE(review): this file is a sparse excerpt -- interior lines (closing
// braces, etc.) are missing between the numbered lines, so code is left
// byte-identical and only comments are added.
17 public class TokenArray: Object {
// Optional error sink; when null, grammar problems are not reported.
19 private Packer? packer;
// The reader that produced these tokens (its filename is used in error reports).
20 private TokenReader reader;
// Backing store for the token stream, in source order.
22 public Gee.ArrayList<Token> tokens;
// Last significant token pushed (whitespace/comments never update this -- see push()).
23 Token lastAdded = null;
// length property getter: number of tokens collected so far.
26 get { return this.tokens.size; }
// Construct an empty token array bound to a packer (may be null) and its reader.
29 public TokenArray(Packer? packer, TokenReader reader)
33 this.tokens = new Gee.ArrayList<Token>();
// Returns the most recently pushed token, or null when the array is empty
// (the null-return tail is not visible in this excerpt).
36 public Token? last() {
37 if (this.tokens.size > 0) {
38 return this.tokens.get(this.tokens.size-1);
// Walks backwards to find the last token that is neither whitespace (WHIT)
// nor a comment (COMM); returns null if no such token exists.
42 public Token? lastSym () {
43 for (var i = this.tokens.size-1; i >= 0; i--) {
44 if (!(this.tokens.get(i).isType(TokenType.WHIT) || this.tokens.get(i).isType(TokenType.COMM))) {
45 return this.tokens.get(i);
// Appends a token, first checking it against the previously added
// significant token for adjacent pairs that indicate broken grammar
// (e.g. two value-like tokens in a row, or ')' followed by a string).
// Problems are reported through the packer (if any) rather than thrown.
52 public void push (Token t)
54 if (this.lastAdded != null) {
// Previous token is value-like: NAME / STRN / NUMB, or the keywords
// true/false...
58 this.lastAdded.isType(TokenType.NAME) ||
59 this.lastAdded.isType(TokenType.STRN) ||
60 this.lastAdded.isType(TokenType.NUMB) ||
62 this.lastAdded.isType(TokenType.KEYW) &&
64 this.lastAdded.isName(TokenName.TRUE) || this.lastAdded.isName(TokenName.FALSE)
// ...followed directly by another value-like token, which is not valid
// unless the keyword is an operator like 'in' / 'instanceof'.
70 t.isType(TokenType.NAME) || // NAME -> ???
71 t.isType(TokenType.STRN) ||
72 t.isType(TokenType.NUMB) ||
73 (t.isType(TokenType.KEYW) &&
// NOTE(review): INSTANCEOF is tested twice; the second test is redundant
// and was probably meant to be TYPEOF -- confirm against TokenName.
74 !(t.isName(TokenName.IN) || t.isName(TokenName.INSTANCEOF) || t.isName(TokenName.INSTANCEOF))
78 //print("%s\n%s\n", this.lastAdded.asString(), t.asString());
79 if (this.packer != null) {
81 Packer.ResultType.err,
84 "'" + this.lastAdded.data+ "' token followed by " + t.name.to_string() + ":" + t.data
90 // other pattern that are not valid
91 // ] or ) followed by KEYW "STRING" or number ?
93 (this.lastAdded.isName( TokenName.RIGHT_BRACE) || this.lastAdded.isName( TokenName.RIGHT_PAREN))
96 (t.isType(TokenType.KEYW) &&
// NOTE(review): same duplicated INSTANCEOF test as above -- confirm
// whether TYPEOF was intended.
97 !(t.isName(TokenName.IN) || t.isName(TokenName.INSTANCEOF) || t.isName(TokenName.INSTANCEOF))
99 t.isType(TokenType.NAME) || // NAME -> ???
100 t.isType(TokenType.STRN) ||
101 t.isType(TokenType.NUMB)
104 if (this.packer != null) {
105 //print("%s\n%s\n", this.lastAdded.asString(), t.asString());
106 this.packer.logError(
107 Packer.ResultType.err,
108 this.reader.filename,
110 "'" + this.lastAdded.data+ "' token followed by " + t.name.to_string() + ":" + t.data
// Whitespace and comments never become lastAdded, so the checks above
// always compare against the last significant token.
120 if (t.isType(TokenType.WHIT) || t.isType(TokenType.COMM)){
121 // do not set last...
// Fragment of a pop-style method (its signature is not visible here):
// removes and returns the last token; presumably returns null when empty --
// confirm against the full source. Callers use tokens.pop() in read_newline.
129 if (this.tokens.size > 0) {
130 return this.tokens.remove_at(this.tokens.size-1);
// Index-based accessor; `new` hides GLib.Object.get. No bounds checking
// beyond what Gee.ArrayList.get performs.
135 public new Token get(int i) {
136 return this.tokens.get(i);
// Debug-print fragments (method signatures not visible in this excerpt):
// first loop prints token data grouped with a line-number prefix whenever
// the source line changes; second loop prints one Token.asString() per line.
141 foreach(var token in this.tokens) {
142 if (token.line != line) {
143 print("%d: ", token.line);
146 print("%s",token.data);
149 foreach(var token in this.tokens) {
150 stdout.printf ("%s\n", token.asString());
// Error domain for tokenizer failures -- e.g. ArgumentError is thrown by
// read_word for "return found after )" when ignoreBadGrammer is false.
156 public errordomain TokenReader_Error {
// Tokenizer: walks a TextStream and produces a TokenArray, honouring the
// cfg flags below for what to keep (whitespace, comments, doc comments).
162 public class TokenReader : Object
169 * I wonder if this will accept the prop: value, prop2 :value constructor if we do not define one...
172 /** @cfg {Boolean} collapseWhite merge multiple whitespace/comments into a single token **/
173 public bool collapseWhite = false; // only reduces white space...
174 /** @cfg {Boolean} keepDocs keep JSDOC comments **/
175 public bool keepDocs = true;
176 /** @cfg {Boolean} keepWhite keep White space **/
177 public bool keepWhite = false;
178 /** @cfg {Boolean} keepComments keep all comments **/
179 public bool keepComments = false;
180 /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
181 public bool sepIdents = false;
182 /** @cfg {String} filename name of file being parsed. **/
183 public string filename = "";
184 /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression **/
185 public bool ignoreBadGrammer = false;
// Optional error sink, shared with the TokenArray this reader fills.
190 private Packer? packer;
// Constructor only stores the packer; all other settings use field defaults.
192 public TokenReader(Packer? packer)
194 this.packer = packer;
199 * @return {Array} of tokens
201 * ts = new TextStream(File.read(str));
202 * tr = TokenReader({ keepComments : true, keepWhite : true });
// Main loop: try each sub-lexer in priority order; each read_* consumes
// input and returns true when it matched. Order matters -- comments and
// strings before regex, regex before punctuation (both start with '/').
206 public TokenArray tokenize(TextStream stream)
209 var tokens = new TokenArray(this.packer, this);
212 while (!stream.lookEOF()) {
215 if (this.read_mlcomment(stream, tokens)) continue;
216 if (this.read_slcomment(stream, tokens)) continue;
217 if (this.read_dbquote(stream, tokens)) continue;
218 if (this.read_snquote(stream, tokens)) continue;
219 if (this.read_regx(stream, tokens)) continue;
220 if (this.read_numb(stream, tokens)) continue;
221 if (this.read_punc(stream, tokens)) continue;
222 if (this.read_newline(stream, tokens)) continue;
223 if (this.read_space(stream, tokens)) continue;
224 if (this.read_word(stream, tokens)) continue;
226 // if execution reaches here then an error has happened
// Unrecognised character: emit it as an UNKNOWN_TOKEN so tokenizing can continue.
228 new Token(stream.nextS(), TokenType.TOKN, TokenName.UNKNOWN_TOKEN, this.line)
238 * findPuncToken - find the id of a token (previous to current)
239 * need to back check syntax..
241 * @arg {Array} tokens the array of tokens.
242 * @arg {String} token data (eg. '(')
243 * @arg {Number} offset where to start reading from
244 * @return {Number} position of token
246 public int findPuncToken(TokenArray tokens, string data, int n)
// n <= 0 means "start scanning from the end of the array".
248 n = n > 0 ? n : tokens.length -1;
// `stack` appears to track nesting depth while scanning backwards, so the
// match is only accepted at the caller's nesting level -- the lines that
// adjust `stack` and decrement `n` are not visible in this excerpt.
252 if (stack < 1 && tokens.get(n).data == data) {
256 if (tokens.get(n).data == ")" || tokens.get(n).data == "}") {
261 if (stack > 0 && (tokens.get(n).data == "{" || tokens.get(n).data == "(")) {
273 * lastSym - find the last token symbol
274 * need to back check syntax..
276 * @arg {Array} tokens the array of tokens.
277 * @arg {Number} offset where to start..
278 * @return {Token} the token
280 public Token? lastSym(TokenArray tokens, int n)
// Scan backwards from n-1 for the first non-whitespace, non-comment token;
// returns null when none is found (tail not visible in this excerpt).
282 for (var i = n-1; i >= 0; i--) {
283 if (!(tokens.get(i).isType(TokenType.WHIT) || tokens.get(i).isType(TokenType.COMM))) {
284 return tokens.get(i);
293 @returns {Boolean} Was the token found?
// Reads a run of word characters and classifies it as a keyword (KEYW) or
// plain identifier (NAME); optionally splits dotted identifiers when
// sepIdents is set.
295 public bool read_word (TextStream stream, TokenArray tokens)
298 while (!stream.lookEOF() && Lang.isWordChar(stream.lookC() )) {
299 found += stream.nextC().to_string();
// Lang.keyword() throws LangError when the word is not a keyword (see catch below).
307 name = Lang.keyword(found);
308 tokens.push(new Token(found, TokenType.KEYW, name, this.line));
310 } catch (LangError e) {
311 // noop -- then it's a word / not a keyword...
314 What did all this do...
318 // look for "()return" ?? why ???
319 var ls = tokens.lastSym();
320 if (found == "return" && ls != null && ls.data == ")") {
321 //Seed.print('@' + tokens.length);
322 var n = this.findPuncToken(tokens, ")", 0);
323 //Seed.print(')@' + n);
324 n = this.findPuncToken(tokens, "(", n-1);
325 //Seed.print('(@' + n);
327 //var lt = this.lastSym(tokens, n);
329 //print(JSON.stringify(lt));
// NOTE(review): the line below looks like untranslated JavaScript (array
// literal + indexOf), and `indexOf(...) < -1` is always false because
// indexOf never returns less than -1 -- `< 0` was probably intended.
// Also `lt` relies on the commented-out assignment above. Confirm intent
// against the original JSDOC source.
330 if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
331 if (!this.ignoreBadGrammer) {
332 throw new TokenReader_Error.ArgumentError(
333 this.filename + ":" + this.line + " Error - return found after )"
343 tokens.push(new Token(found, TokenType.KEYW, name, this.line));
// Plain identifier: push whole, or split "a.b.c" into NAME / DOT / NAME ...
// when sepIdents is set and the word contains a dot.
347 if (!this.sepIdents || found.index_of(".") < 0 ) {
348 tokens.push(new Token(found, TokenType.NAME, TokenName.NAME, this.line));
351 var n = found.split(".");
353 foreach (unowned string nm in n) {
356 tokens.push(new Token(".", TokenType.PUNC, TokenName.DOT, this.line));
// Skip empty segments (e.g. from a leading/trailing dot).
359 if (nm.length < 1 ) {
362 tokens.push(new Token(nm, TokenType.NAME, TokenName.NAME, this.line));
370 @returns {Boolean} Was the token found?
// Greedily matches the longest punctuation sequence the Lang tables
// recognise: first char via puncFirstString, extensions via puncString.
372 public bool read_punc (TextStream stream, TokenArray tokens)
376 TokenName tokname = TokenName.UNKNOWN;
377 while (!stream.lookEOF()) {
378 var ns = stream.lookC();
380 tokname = Lang.puncFirstString(ns);
381 if (TokenName.UNKNOWN == tokname) {
385 found += stream.nextS();
// Try to extend the punctuation found so far with the next character.
388 var nx = Lang.puncString(found + ns.to_string() );
389 if (TokenName.UNKNOWN == nx) {
394 found += stream.nextS();
// Nothing matched at all: not a punctuation token.
398 if (tokname == TokenName.UNKNOWN) {
// Report a trailing comma before '}' or ']' -- warning when bad grammar
// is being ignored, error otherwise.
402 var ls = tokens.lastSym();
404 if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
405 //print("Error - comma found before " + found);
406 //print(JSON.stringify(tokens.lastSym(), null,4));
407 if (this.packer != null) {
408 this.packer.logError(
409 this.ignoreBadGrammer ? Packer.ResultType.warn : Packer.ResultType.err,
412 "comma found before " + found
418 tokens.push(new Token(found, TokenType.PUNC, tokname, this.line));
424 @returns {Boolean} Was the token found?
// Consumes a run of non-newline whitespace; collapses it to a single
// space when collapseWhite, and only emits a WHIT token when keepWhite.
426 public bool read_space (TextStream stream, TokenArray tokens)
428 // not supported yet.. newlines can be unicode...
431 while (!stream.lookEOF() && Lang.isSpace( stream.lookS()) && !Lang.isNewline(stream.lookS())) {
432 found += stream.nextS();
438 //print("WHITE = " + JSON.stringify(found));
441 if (this.collapseWhite) {
442 found = " "; // this might work better if it was a '\n' ???
444 if (this.keepWhite) {
445 tokens.push(new Token(found, TokenType.WHIT, TokenName.SPACE, this.line));
452 @returns {Boolean} Was the token found?
// Consumes a run of newline characters (treating \r\n as a single line),
// optionally collapsing them; the emitted NEWLINE token carries the line
// number where the run started.
454 public bool read_newline (TextStream stream, TokenArray tokens)
456 // we do not support it yet, but newlines can be UNICODE..
459 var line = this.line;
462 while (!stream.lookEOF() && Lang.isNewline(stream.lookS())) {
// NOTE(review): stray double semicolon below -- harmless empty statement.
463 var cur = stream.lookS();;
464 if (lastc == "\r" && cur == "\n") {
465 // dont add new line..
472 found += stream.nextS();
478 //print("NEWLINE @%d changing this.line to %d\n", line, this.line);
479 // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
480 // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
484 if (this.collapseWhite) {
485 found = "\n"; // reduces multiple line breaks into a single one...
488 if (this.keepWhite) {
// NOTE(review): the visible condition pops when the last token is NOT
// whitespace, which contradicts the "replaces last new line" comment --
// but the lines between the condition and the pop are missing from this
// excerpt; confirm the branch structure against the full source.
489 var last = tokens.last();
490 if (last != null && last.type != TokenType.WHIT) {
493 tokens.pop(); // remove the last token..
495 // replaces last new line...
496 tokens.push(new Token(found, TokenType.WHIT, TokenName.NEWLINE, line));
502 @returns {Boolean} Was the token found?
// Consumes a C-style /* ... */ comment. /**-style doc comments become
// JSDOC tokens when keepDocs; other comments are kept only when keepComments.
504 public bool read_mlcomment (TextStream stream, TokenArray tokens)
506 if (stream.lookC() != '/') {
509 if (stream.lookC(1) != '*') {
513 var found = new StringBuilder();
514 found.append(stream.nextS(2));
// Remember the line the comment started on (multi-line comments span lines).
518 var line = this.line;
// Loop until the last two consumed characters are "*/".
519 while (!stream.lookEOF() && !(stream.lookC(-1) == '/' && stream.lookC(-2) == '*')) {
527 // to start doclet we allow /** or /*** but not /**/ or /****
528 //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
// NOTE(review): the condition only excludes '/' at index 3, so "/****"
// would still be treated as a doclet, contradicting the comment above.
// Also the JSDOC token uses this.line (end of comment) while
// MULTI_LINE_COMM uses the start line -- confirm which is intended.
529 if (this.keepDocs && found.len > 4 && found.str.index_of("/**") == 0 && found.str[3] != '/') {
530 tokens.push(new Token(found.str, TokenType.COMM, TokenName.JSDOC, this.line));
531 } else if (this.keepComments) {
532 tokens.push(new Token(found.str, TokenType.COMM, TokenName.MULTI_LINE_COMM, line));
539 @returns {Boolean} Was the token found?
// Consumes a single-line comment: either "//" or the HTML-style "<!--"
// opener, reading up to the end of the line.
541 public bool read_slcomment (TextStream stream, TokenArray tokens)
545 (stream.lookC() == '/' && stream.lookC(1) == '/' && (""!=(found=stream.nextS(2))))
547 (stream.lookC() == '<' && stream.lookC(1) == '!' && stream.lookC(2) == '-' && stream.lookC(3) == '-' && (""!=(found=stream.nextS(4))))
// The token carries the line the comment started on.
549 var line = this.line;
550 while (!stream.lookEOF()) {
551 //print(stream.look().to_string());
552 if ( Lang.isNewline(stream.lookS().to_string())) {
555 found += stream.nextS();
557 if (!stream.lookEOF()) { // looking for end of line... if we got it, then do not eat the character..
558 found += stream.nextS();
560 if (this.keepComments) {
561 tokens.push(new Token(found, TokenType.COMM, TokenName.SINGLE_LINE_COMM, line));
570 @returns {Boolean} Was the token found?
// Consumes a double-quoted string literal, preserving escape sequences
// and handling backslash-newline line continuations.
572 public bool read_dbquote (TextStream stream, TokenArray tokens)
574 if (stream.lookC() != '"') {
578 var str = new StringBuilder();
// Consume the opening quote.
579 str.append_unichar(stream.nextC());
581 while (!stream.lookEOF()) {
582 if (stream.lookC() == '\\') {
// Backslash before a newline: skip the line continuation.
583 if (Lang.isNewline(stream.lookS(1).to_string())) {
586 } while (!stream.lookEOF() && Lang.isNewline(stream.lookS().to_string()));
// Ordinary escape: copy the backslash and the escaped character.
590 str.append(stream.nextS(2));
// Closing quote: emit the whole literal (quotes included) as a STRN token.
594 if (stream.lookC() == '"') {
595 str.append_unichar(stream.nextC());
596 tokens.push(new Token(str.str, TokenType.STRN, TokenName.DOUBLE_QUOTE, this.line));
600 str.append(stream.nextS());
607 @returns {Boolean} Was the token found?
// Consumes a single-quoted string literal, preserving escape sequences.
// Unlike read_dbquote, no backslash-newline continuation handling here.
609 public bool read_snquote (TextStream stream, TokenArray tokens)
611 if (stream.lookC() != '\'') {
615 var str = new StringBuilder();
// Consume the opening quote.
616 str.append_unichar(stream.nextC());
618 while (!stream.lookEOF()) {
619 if (stream.lookC() == '\\') { // escape sequence
620 str.append( stream.nextS(2));
// Closing quote: emit the whole literal (quotes included) as a STRN token.
623 if (stream.lookC() == '\'') {
624 str.append_unichar(stream.nextC());
625 tokens.push(new Token(str.str, TokenType.STRN, TokenName.SINGLE_QUOTE, this.line));
628 str.append(stream.nextS());
636 @returns {Boolean} Was the token found?
// Consumes a numeric literal: delegates "0x..." to read_hex, otherwise
// reads decimal/octal digits and an optional exponent (e/E, sign, digits).
638 public bool read_numb (TextStream stream, TokenArray tokens)
640 if (stream.lookC() == '0' && stream.lookC(1) == 'x') {
641 return this.read_hex(stream, tokens);
// Extend the literal while appending the next char keeps it a valid number.
646 while (!stream.lookEOF() && !Lang.isNewline(stream.lookS()) && Lang.isNumber(found+stream.lookC().to_string())){
647 found += stream.nextS();
653 // if we hit an 'e'.... then we need to carry on parsing..
654 if (stream.lookC() == 'e' || stream.lookC() == 'E') {
655 found += stream.nextS();
656 var nc = stream.lookC();
657 if (nc == '+' || nc == '-' || (nc >= '0' && nc <= '9')) {
658 found += stream.nextS();
659 while (!stream.lookEOF() && !Lang.isNewline(stream.lookS()) && Lang.isNumber(found+stream.lookC().to_string())){
660 found += stream.nextS();
// Still not a valid number after the exponent: log and swallow the
// characters so tokenizing can continue.
662 if (!Lang.isNumber(found)) {
663 if (this.packer != null) {
664 this.packer.logError(
665 Packer.ResultType.err,
668 "Invalid Number " + found
671 return true; // eat the characters and continue...
// 'e'/'E' not followed by a sign or digit: malformed exponent.
675 if (this.packer != null) {
676 this.packer.logError(
677 Packer.ResultType.err,
680 "could not find +/- or 0-9 after Number '" + found
// Leading 0 followed by an octal digit => OCTAL token, else DECIMAL.
688 if (GLib.Regex.match_simple("^0[0-7]", found)) {
689 tokens.push(new Token(found, TokenType.NUMB, TokenName.OCTAL, this.line));
692 //print("got number '%s'\n", found);
694 tokens.push(new Token(found, TokenType.NUMB, TokenName.DECIMAL, this.line));
700 @returns {Boolean} Was the token found?
// Consumes a hexadecimal literal; the "0x" prefix was already detected
// by read_numb, so the first two characters are taken unconditionally.
702 public bool read_hex (TextStream stream, TokenArray tokens)
704 var found = stream.nextS(2);
706 while (!stream.lookEOF()) {
// Stop when appending the next character would no longer be valid hex.
707 if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.lookC().to_string())) { // done
708 tokens.push(new Token(found, TokenType.NUMB, TokenName.HEX_DEC, this.line));
712 found += stream.nextS();
719 @returns {Boolean} Was the token found?
// Consumes a /regex/flags literal. Only attempted when the previous
// significant token is not one that could legally precede a division
// operator (NUMB, NAME, ')' or the RIGHT_BRACE token).
721 public bool read_regx (TextStream stream, TokenArray tokens)
724 if (stream.lookC() != '/') {
727 var last = tokens.lastSym();
732 !last.isType(TokenType.NUMB) // stuff that can not appear before a regex..
733 && !last.isType(TokenType.NAME)
734 && !last.isName(TokenName.RIGHT_PAREN)
735 && !last.isName(TokenName.RIGHT_BRACE)
738 var regex = stream.nextS();
739 var in_brace = false; // this is really hacky... we ignore [ .../ ] so a forward slash in a regex..
740 while (!stream.lookEOF()) {
741 if (stream.lookC() == '[') {
744 if (in_brace && stream.lookC() == ']') {
748 if (stream.lookC() == '\\') { // escape sequence
749 regex += stream.nextS(2);
// Unescaped '/' outside a character class closes the regex; then any
// trailing g/m/i flags are consumed before the token is pushed.
752 if (!in_brace && stream.lookC() == '/') {
753 regex += stream.nextS();
755 while (GLib.Regex.match_simple("[gmi]", stream.lookS().to_string())) {
756 regex += stream.nextS();
759 tokens.push(new Token(regex, TokenType.REGX, TokenName.REGX, this.line));
763 regex += stream.nextS();
766 // error: unterminated regex