1 //<script type="text/javascript">
6 //const Token = imports.Token.Token;
7 //const Lang = imports.Lang.Lang;
10 @class Search a {@link JSDOC.TextStream} for language tokens.
/**
 * Ordered collection of Token objects with stack-style helpers.
 *
 * The tokenizer appends to it with push() and inspects the tail with
 * last(), lastSym() and pop().
 */
public class TokenArray: Object {

    /** Backing list; index 0 is the first token that was pushed. */
    public Gee.ArrayList<Token> tokens;

    /** Number of tokens currently stored. */
    public int length {
        get { return this.tokens.size; }
    }

    /** Creates an empty token array. */
    public TokenArray()
    {
        this.tokens = new Gee.ArrayList<Token>();
    }

    /**
     * @return the most recently pushed token, or null when the array is empty
     */
    public Token? last() {
        if (this.tokens.size > 0) {
            return this.tokens.get(this.tokens.size-1);
        }
        return null;
    }

    /**
     * @return the most recently pushed token that is neither "WHIT" nor
     *         "COMM" (as reported by Token.is), or null when there is none
     */
    public Token? lastSym () {
        for (var i = this.tokens.size-1; i >= 0; i--) {
            var candidate = this.tokens.get(i);
            if (!(candidate.is("WHIT") || candidate.is("COMM"))) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Appends a token to the end of the array.
     *
     * @param t token to append
     */
    public void push (Token t) {
        this.tokens.add(t);
    }

    /**
     * Removes the last token.
     *
     * @return the removed token, or null when the array is empty
     */
    public Token? pop ()
    {
        if (this.tokens.size > 0) {
            return this.tokens.remove_at(this.tokens.size-1);
        }
        return null;
    }

    /**
     * @param i zero-based index into the array
     * @return the token stored at that index
     */
    public new Token get(int i) {
        return this.tokens.get(i);
    }

    /**
     * Prints every token's asString() form, one per line.
     */
    public void dump()
    {
        foreach(var token in this.tokens) {
            // print() is printf-style: the token text must be passed as an
            // argument, never as the format, or a '%' in the tokenized
            // source would be interpreted as a conversion specifier.
            print("%s\n", token.asString());
        }
    }
}
// Error domain for tokenizer failures. The reader code in this file raises
// TokenReader_Error.ArgumentError when it meets grammar it refuses to accept.
67 public errordomain TokenReader_Error {
72 public class TokenReader : Object
79 * I wonder if this will accept the prop: value, prop2 :value constructor if we do not define one...
82 /** @cfg {Boolean} collapseWhite collapse each run of spaces to a single " " and each run of newlines to a single "\n" **/
83 public bool collapseWhite = false; // only reduces white space...
84 /** @cfg {Boolean} keepDocs keep JSDOC comments (emitted as COMM tokens named JSDOC) **/
85 public bool keepDocs = true;
86 /** @cfg {Boolean} keepWhite keep white space (emit WHIT tokens for space and newline runs) **/
87 public bool keepWhite = false;
88 /** @cfg {Boolean} keepComments keep comments that are not already kept as JSDOC (multi-line and single-line) **/
89 public bool keepComments = false;
90 /** @cfg {Boolean} sepIdents separate identifiers (eg. a.b.c into ['a', '.', 'b', '.', 'c'] ) **/
91 public bool sepIdents = false;
92 /** @cfg {String} filename name of file being parsed (used as the prefix of error messages). **/
93 public string filename = "";
94 /** @cfg {Boolean} ignoreBadGrammer do not throw errors if we find stuff that might break compression (a message is printed instead) **/
95 public bool ignoreBadGrammer = false;
102 * @return {Array} of tokens
104 * ts = new TextStream(File.read(str));
105 * tr = TokenReader({ keepComments : true, keepWhite : true });
// Converts the stream into a TokenArray.
// Each pass of the loop offers the current stream position to the read_*
// methods in a fixed order; the first one that returns true restarts the loop.
// NOTE(review): read_punc is declared `throws TokenReader_Error`, but this
// signature carries no throws clause - confirm how that error is meant to
// reach the caller.
109 public TokenArray tokenize(TextStream stream)
112 var tokens = new TokenArray();
// Keep going until the stream reports end of input.
115 while (!stream.lookEOF()) {
// The comment readers are tried before read_regx, which also starts on '/'.
118 if (this.read_mlcomment(stream, tokens)) continue;
119 if (this.read_slcomment(stream, tokens)) continue;
120 if (this.read_dbquote(stream, tokens)) continue;
121 if (this.read_snquote(stream, tokens)) continue;
122 if (this.read_regx(stream, tokens)) continue;
123 if (this.read_numb(stream, tokens)) continue;
124 if (this.read_punc(stream, tokens)) continue;
125 if (this.read_newline(stream, tokens)) continue;
126 if (this.read_space(stream, tokens)) continue;
127 if (this.read_word(stream, tokens)) continue;
129 // if execution reaches here then an error has happened
// No reader accepted the input: the result of one stream.next() call is
// wrapped in a token typed "TOKN" and named "UNKNOWN_TOKEN"
// (presumably pushed onto `tokens` - TODO confirm).
131 new Token(stream.next(), "TOKN", "UNKNOWN_TOKEN", this.line)
141 * findPuncToken - find the id of a token (previous to current)
142 * need to back check syntax..
144 * @arg {Array} tokens the array of tokens.
145 * @arg {String} token data (eg. '(')
146 * @arg {Number} offset where to start reading from
147 * @return {Number} position of token
// Looks for a token whose `data` equals the given string, inspecting
// tokens.get(n) for the current position n.
// NOTE(review): `stack` looks like a bracket-nesting counter driven by the
// ")" / "}" and "{" / "(" checks below - confirm how it is updated and in
// which direction n moves.
149 public int findPuncToken(TokenArray tokens, string data, int n)
// Any n <= 0 (including an explicit 0) selects the last token as the
// starting position, so index 0 cannot be requested as a start point.
151 n = n > 0 ? n : tokens.length -1;
// A match only counts while stack is below 1.
155 if (stack < 1 && tokens.get(n).data == data) {
// Closing bracket tokens.
159 if (tokens.get(n).data == ")" || tokens.get(n).data == "}") {
// Opening bracket tokens, only considered while stack is positive.
164 if (stack > 0 && (tokens.get(n).data == "{" || tokens.get(n).data == "(")) {
176 * lastSym - find the last token symbol
177 * need to back check syntax..
179 * @arg {Array} tokens the array of tokens.
180 * @arg {Number} offset where to start..
181 * @return {Token} the token
/**
 * Walks backwards from the token just before position n and returns the
 * first one that is neither whitespace nor a comment.
 *
 * @param tokens the token array to search
 * @param n      position to search before (the token at n itself is not examined)
 * @return the token found, or null when every earlier token is "WHIT" or "COMM"
 */
public Token? lastSym(TokenArray tokens, int n)
{
    var idx = n;
    while (--idx >= 0) {
        var candidate = tokens.get(idx);
        if (candidate.is("WHIT") || candidate.is("COMM")) {
            continue;
        }
        return candidate;
    }
    return null;
}
196 @returns {Boolean} Was the token found?
// Reads an identifier or keyword and pushes the matching token(s).
// Characters are consumed for as long as Lang.isWordChar() accepts them.
198 public bool read_word (TextStream stream, TokenArray tokens)
201 while (!stream.lookEOF() && Lang.isWordChar(stream.look().to_string())) {
202 found += stream.next();
// Result of the keyword lookup; it becomes the name of the KEYW token below
// (presumably null when the word is not a keyword - TODO confirm).
209 var name = Lang.keyword(found);
212 // look for "()return" ?? why ???
// Last non-whitespace, non-comment token seen so far.
213 var ls = tokens.lastSym();
214 if (found == "return" && ls != null && ls.data == ")") {
215 //Seed.print('@' + tokens.length);
// Locate the ")" starting from the end of the array (0 means "from the end").
216 var n = this.findPuncToken(tokens, ")", 0);
217 //Seed.print(')@' + n);
// Then search for "(" starting just before that position.
218 n = this.findPuncToken(tokens, "(", n-1);
219 //Seed.print('(@' + n);
221 //var lt = this.lastSym(tokens, n);
223 //print(JSON.stringify(lt));
// NOTE(review): the condition below is JavaScript syntax (array literal,
// single-quoted string, indexOf) and `lt` is only declared in a commented-out
// line, so this region is presumably disabled - confirm. As written,
// `indexOf(...) < -1` could never be true.
224 if (lt.type != "KEYW" || ["IF", 'WHILE'].indexOf(lt.name) < -1) {
225 if (!this.ignoreBadGrammer) {
226 throw new TokenReader_Error.ArgumentError(
227 this.filename + ":" + this.line + " Error - return found after )"
// Keyword: type "KEYW", named after the Lang.keyword() result.
236 tokens.push(new Token(found, "KEYW", name, this.line));
// Plain name: kept whole unless sepIdents is set and the word contains a ".".
240 if (!this.sepIdents || found.index_of(".") < 0 ) {
241 tokens.push(new Token(found, "NAME", "NAME", this.line));
// sepIdents: split on "." and emit NAME tokens with DOT tokens in between
// (the guard that decides when a DOT is emitted is presumably "not before the
// first part" - TODO confirm).
244 var n = found.split(".");
246 foreach (unowned string nm in n) {
248 tokens.push(new Token(".", "PUNC", "DOT", this.line));
251 tokens.push(new Token(nm, "NAME", "NAME", this.line));
259 @returns {Boolean} Was the token found?
/**
 * Reads a punctuation token.
 *
 * Characters are consumed for as long as the text read so far plus the next
 * character is still known to Lang.punc(), so the longest recognised
 * punctuator is taken. The result is pushed as a "PUNC" token whose name is
 * the value returned by Lang.punc().
 *
 * A "}" or "]" that directly follows a "," (ignoring whitespace and comments)
 * is treated as bad grammar: it raises an error, or only prints a message
 * when ignoreBadGrammer is set.
 *
 * @param stream the text being tokenized
 * @param tokens array that receives the token
 * @return true when a punctuation token was read, false when nothing matched
 * @throws TokenReader_Error when a trailing comma is found and
 *         ignoreBadGrammer is not set
 */
public bool read_punc (TextStream stream, TokenArray tokens) throws TokenReader_Error
{
    var found = "";

    while (!stream.lookEOF()) {
        var ns = stream.look().to_string();

        if (null == Lang.punc(found + ns )) {
            break;
        }
        found += stream.next();
    }

    if (found == "") {
        return false;
    }

    var ls = tokens.lastSym();

    if ((found == "}" || found == "]") && ls != null && ls.data == ",") {
        var where = this.filename + ":" + this.line.to_string();
        // Both print() and error constructors are printf-style, and the
        // message contains the file name, so it is always passed as an
        // argument to "%s" rather than used as the format itself.
        if (this.ignoreBadGrammer) {
            print("%s", "\n" + where + " Error - comma found before " + found);
        } else {
            throw new TokenReader_Error.ArgumentError(
                "%s", where + " comma found before " + found
            );
        }
    }

    tokens.push(new Token(found, "PUNC", Lang.punc(found), this.line));
    return true;
}
301 @returns {Boolean} Was the token found?
/**
 * Reads a run of non-newline whitespace.
 *
 * The run is always consumed from the stream; a "WHIT" token named "SPACE"
 * is only pushed when keepWhite is set, and its text is reduced to a single
 * " " when collapseWhite is set.
 *
 * @param stream the text being tokenized
 * @param tokens array that receives the token
 * @return true when at least one whitespace character was read
 */
public bool read_space (TextStream stream, TokenArray tokens)
{
    var run = "";

    while (true) {
        if (stream.lookEOF()) {
            break;
        }
        var ch = stream.look();
        // Newlines are whitespace too, but they are left for read_newline.
        if (!Lang.isSpaceC(ch) || Lang.isNewlineC(ch)) {
            break;
        }
        run += stream.next();
    }

    if (run == "") {
        return false;
    }

    if (this.keepWhite) {
        // this might work better if the collapsed form was a '\n' ???
        var text = this.collapseWhite ? " " : run;
        tokens.push(new Token(text, "WHIT", "SPACE", this.line));
    }
    return true;
}
328 @returns {Boolean} Was the token found?
// Reads a run of newline characters (as judged by Lang.isNewlineC) and, when
// keepWhite is set, pushes it as a "WHIT" token named "NEWLINE".
330 public bool read_newline (TextStream stream, TokenArray tokens)
// Line number as it was before the run; the token pushed below is tagged
// with this value rather than with this.line.
333 var line = this.line;
334 while (!stream.lookEOF() && Lang.isNewlineC(stream.look())) {
336 found += stream.next();
343 // if we found a new line, then we could check if previous character was a ';' - if so we can drop it.
344 // otherwise generally keep it.. in which case it should reduce our issue with stripping new lines..
348 if (this.collapseWhite) {
349 found = "\n"; // reduces multiple line breaks into a single one...
352 if (this.keepWhite) {
// Take the previous token off the array so it can be inspected.
353 var last = tokens.pop();
// NOTE(review): tokens in this file appear to be built as
// Token(data, type, name, line) with "WHIT" passed as the type and
// "SPACE"/"NEWLINE" as the name, so comparing `name` with "WHIT" looks like
// it is always true and the popped token is always kept.
// `last.is("WHIT")` may have been intended - confirm before changing,
// since it alters the emitted token sequence.
354 if (last != null && last.name != "WHIT") {
357 // replaces last new line...
358 tokens.push(new Token(found, "WHIT", "NEWLINE", line));
364 @returns {Boolean} Was the token found?
// Reads a multi-line comment that starts with "/" followed by "*" and pushes
// it as a "COMM" token, depending on keepDocs / keepComments.
366 public bool read_mlcomment (TextStream stream, TokenArray tokens)
// Not a comment unless the next two characters are '/' and '*'.
368 if (stream.look() != '/') {
371 if (stream.look(1) != '*') {
// Consume the two-character opener.
374 var found = stream.next(2);
// Line number at the start of the comment.
376 var line = this.line;
// Loop until look(-1) is '/' and look(-2) is '*'
// (presumably negative offsets inspect already-consumed characters - TODO confirm).
// NOTE(review): if look(-2) can see the '*' of the opener, the input "/*/"
// would satisfy this test immediately after one more character - verify.
377 while (!stream.lookEOF() && !(stream.look(-1) == '/' && stream.look(-2) == '*')) {
385 // to start doclet we allow /** or /*** but not /**/ or /****
386 //if (found.length /^\/\*\*([^\/]|\*[^*])/.test(found) && this.keepDocs) {
// Doc comment: longer than 4 characters, begins with "/**", and the fourth
// character is not '/'.
// NOTE(review): the "/****" case mentioned above is not excluded by this
// condition - confirm whether that is intended.
387 if (this.keepDocs && found.length > 4 && found.index_of("/**") == 0 && found[3] != '/') {
// JSDOC tokens are tagged with the current line (this.line) ...
388 tokens.push(new Token(found, "COMM", "JSDOC", this.line));
389 } else if (this.keepComments) {
// ... while ordinary comments are tagged with the line captured before the loop.
390 tokens.push(new Token(found, "COMM", "MULTI_LINE_COMM", line));
397 @returns {Boolean} Was the token found?
// Reads a single-line comment and, when keepComments is set, pushes it as a
// "COMM" token named "SINGLE_LINE_COMM".
399 public bool read_slcomment (TextStream stream, TokenArray tokens)
// Opener 1: two slashes. The assignment inside the condition consumes the
// opener into `found`; the comparison with "" just turns it into a boolean.
403 (stream.look() == '/' && stream.look(1) == '/' && (""!=(found=stream.next(2))))
// Opener 2: the four characters "<!--", consumed the same way.
405 (stream.look() == '<' && stream.look(1) == '!' && stream.look(2) == '-' && stream.look(3) == '-' && (""!=(found=stream.next(4))))
// Line number at the start of the comment; used for the token below.
407 var line = this.line;
408 while (!stream.lookEOF()) {
409 //print(stream.look().to_string());
// A newline ends the comment text (presumably by leaving the loop - TODO confirm).
410 if ( Lang.isNewline(stream.look().to_string())) {
413 found += stream.next();
// NOTE(review): the remark below says the character should not be eaten,
// yet the next statement does consume one more character into `found` - confirm
// which behaviour is intended.
415 if (!stream.lookEOF()) { // looking for end of line... if we got it, then do not eat the character..
416 found += stream.next();
418 if (this.keepComments) {
419 tokens.push(new Token(found, "COMM", "SINGLE_LINE_COMM", line));
428 @returns {Boolean} Was the token found?
// Reads a double-quoted string literal and pushes it as a "STRN" token named
// "DOUBLE_QUOTE". The token text includes both quote characters.
430 public bool read_dbquote (TextStream stream, TokenArray tokens)
// Only applies when the next character is a double quote.
432 if (stream.look() != '"') {
// Consume the opening quote.
436 var str = stream.next();
438 while (!stream.lookEOF()) {
// Backslash: either a line continuation or an ordinary escape.
439 if (stream.look() == '\\') {
// Backslash followed by a newline: handled by a loop that runs while the
// stream is positioned on newline characters (what is kept or dropped is
// decided in that loop's body - TODO confirm).
440 if (Lang.isNewline(stream.look(1).to_string())) {
443 } while (!stream.lookEOF() && Lang.isNewline(stream.look().to_string()));
// Ordinary escape: the backslash and the following character are copied together.
447 str += stream.next(2);
// Unescaped closing quote ends the literal.
451 if (stream.look() == '"') {
452 str += stream.next();
453 tokens.push(new Token(str, "STRN", "DOUBLE_QUOTE", this.line));
// Any other character is copied as-is.
457 str += stream.next();
464 @returns {Boolean} Was the token found?
// Reads a single-quoted string literal and pushes it as a "STRN" token named
// "SINGLE_QUOTE". The token text includes both quote characters.
466 public bool read_snquote (TextStream stream, TokenArray tokens)
// Only applies when the next character is a single quote.
468 if (stream.look() != '\'') {
// Consume the opening quote.
472 var str = stream.next();
474 while (!stream.lookEOF()) {
// The backslash and the character after it are copied together, so an
// escaped quote does not end the literal.
475 if (stream.look() == '\\') { // escape sequence
476 str += stream.next(2);
// Unescaped closing quote ends the literal.
479 if (stream.look() == '\'') {
480 str += stream.next();
481 tokens.push(new Token(str, "STRN", "SINGLE_QUOTE", this.line));
// Any other character is copied as-is.
484 str += stream.next();
492 @returns {Boolean} Was the token found?
// Reads a numeric literal and pushes it as a "NUMB" token named "OCTAL" or
// "DECIMAL"; input starting with "0x" is handed to read_hex instead.
494 public bool read_numb (TextStream stream, TokenArray tokens)
// Only a lower-case 'x' after the leading '0' selects the hex reader.
496 if (stream.look() == '0' && stream.look(1) == 'x') {
497 return this.read_hex(stream, tokens);
// Extend the literal while the text so far plus the next character is still
// accepted by Lang.isNumber().
502 while (!stream.lookEOF() && Lang.isNumber(found+stream.look().to_string())){
503 found += stream.next();
// A leading 0 followed by a digit in 0-7 is reported as octal.
509 if (GLib.Regex.match_simple("^0[0-7]", found)) {
510 tokens.push(new Token(found, "NUMB", "OCTAL", this.line));
// Everything else is reported as decimal.
513 tokens.push(new Token(found, "NUMB", "DECIMAL", this.line));
519 @returns {Boolean} Was the token found?
/**
 * Reads a hexadecimal literal and pushes it as a "NUMB" token named
 * "HEX_DEC".
 *
 * The caller has already checked that the stream is positioned on "0x";
 * those two characters are consumed unconditionally. Further characters are
 * taken until adding the next one would stop the text from being accepted by
 * Lang.isHexDec().
 *
 * A literal that runs up to the end of the input is also emitted, instead of
 * being consumed and then discarded.
 *
 * @param stream the text being tokenized, positioned on the "0x" prefix
 * @param tokens array that receives the token
 * @return true when a token was pushed, false when the consumed text never
 *         became a valid hexadecimal literal
 */
public bool read_hex (TextStream stream, TokenArray tokens)
{
    var found = stream.next(2);

    while (!stream.lookEOF()) {
        if (Lang.isHexDec(found) && !Lang.isHexDec(found+stream.look().to_string())) { // done
            break;
        }
        found += stream.next();
    }

    // Reached either because the next character cannot extend the literal or
    // because the input ended; in both cases a valid literal is emitted.
    if (Lang.isHexDec(found)) {
        tokens.push(new Token(found, "NUMB", "HEX_DEC", this.line));
        return true;
    }
    return false;
}
538 @returns {Boolean} Was the token found?
// Reads a regular-expression literal and pushes it as a token with type and
// name "REGX". The token text includes the delimiting slashes and any flags.
540 public bool read_regx (TextStream stream, TokenArray tokens)
// Only applies when the next character is '/'.
543 if (stream.look() != '/') {
// The previous significant token decides whether '/' can start a regex here
// or must be something else (presumably division - TODO confirm).
546 var last = tokens.lastSym();
551 !last.is("NUMB") // stuff that can not appear before a regex..
553 && !last.is("RIGHT_PAREN")
554 && !last.is("RIGHT_BRACKET")
// Consume the opening slash.
557 var regex = stream.next();
559 while (!stream.lookEOF()) {
// The backslash and the character after it are copied together, so an
// escaped slash does not end the literal.
560 if (stream.look() == '\\') { // escape sequence
561 regex += stream.next(2);
// Unescaped slash closes the pattern.
564 if (stream.look() == '/') {
565 regex += stream.next();
// Trailing flags: only the characters g, m and i are accepted.
// NOTE(review): this inner loop has no lookEOF() guard of its own - confirm
// what stream.look() yields at end of input.
567 while (GLib.Regex.match_simple("[gmi]", stream.look().to_string())) {
568 regex += stream.next();
571 tokens.push(new Token(regex, "REGX", "REGX", this.line));
// Any other character is copied as-is.
575 regex += stream.next();
578 // error: unterminated regex