mildew.lexer source code

1 /**
2  * This module implements the Token and Lexer structs
3  */
4 module mildew.lexer;
5 
6 import mildew.exceptions: ScriptCompileException;
7 
8 import std.ascii; // temp until unicode support
9 import std.container.rbtree;
10 import std.conv: to;
11 import std.format: format;
12 
13 /**
14  * This struct represents the line and column number of a token, starting at 1.
15  */
16 struct Position
17 {
18     /// Line and column number.
19     int line, column;
20 
21     /// Returns a string representing the line and column number
22     string toString() const 
23     {
24         return format("line %s, column %s", line, column);
25     }
26 
27     /// Determines line and column number based on char that is read
28     void advance(char ch)
29     {
30         if(ch == '\0')
31         {
32             return;
33         }
34         else if(ch == '\n')
35         {
36             ++line;
37             column = 1;
38         }
39         else
40         {
41             ++column;
42         }
43     }
44 }
45 
46 /**
47  * This struct represents a token, a fundamental building block of all scripts. The code of a script
48  * is first separated by token so that the parser can analyze each token.
49  */
50 struct Token 
51 {
52     /**
53      * The type of a token.
54      */
55     enum Type 
56     {
57         EOF, KEYWORD, INTEGER, DOUBLE, STRING, IDENTIFIER, 
58         NOT, AND, OR, GT, GE, LT, LE,
59         EQUALS, NEQUALS, STRICT_EQUALS, STRICT_NEQUALS,
60         ASSIGN, PLUS_ASSIGN, DASH_ASSIGN,
61         PLUS, DASH, STAR, FSLASH, PERCENT, POW, DOT,
62         INC, DEC, // ++ and --
63         BIT_AND, BIT_XOR, BIT_OR, BIT_NOT, BIT_LSHIFT, BIT_RSHIFT, BIT_URSHIFT,
64         LPAREN, RPAREN, LBRACE, RBRACE, LBRACKET, RBRACKET, 
65         SEMICOLON, COMMA, LABEL, QUESTION, COLON, INVALID
66     }
67 
68     /**
69      * This enum is for literal value tokens that require special handling by the parser
70      */
71     enum LiteralFlag
72     {
73         NONE, BINARY, OCTAL, HEXADECIMAL, TEMPLATE_STRING
74     }
75 
76     /// Type of token
77     Type type;
78     /// Position where token occurs
79     Position position;
80     /// Optional text for keywords and identifiers
81     string text;
82     /// Optional flag for integer literals.
83     LiteralFlag literalFlag = LiteralFlag.NONE;
84 
85     /**
86      * Returns a string representing the type of the token and the optional text if present.
87      */
88     string toString() const
89     {
90         string str = format("[%s", type.to!string);
91         if(text != null)
92             str ~= "|" ~ text;
93         str ~= "]";
94         return str;
95     }
96 
97     /**
98      * Returns a textual representation of the token as it was found in the original script source code.
99      */
100     string symbol() const
101     {
102         final switch(type)
103         {
104         case Type.EOF:
105             return "\0";
106         case Type.KEYWORD: case Type.INTEGER: case Type.DOUBLE: case Type.STRING: case Type.IDENTIFIER:
107             return text;
108         case Type.NOT: return "!";
109         case Type.AND: return "&&";
110         case Type.OR: return "||";
111         case Type.GT: return ">";
112         case Type.GE: return ">=";
113         case Type.LT: return "<";
114         case Type.LE: return "<=";
115         case Type.EQUALS: return "==";
116         case Type.NEQUALS: return "!=";
117         case Type.STRICT_EQUALS: return "===";
118         case Type.STRICT_NEQUALS: return "!==";
119         case Type.ASSIGN: return "=";
120         case Type.PLUS_ASSIGN: return "+=";
121         case Type.DASH_ASSIGN: return "-=";
122         case Type.PLUS: return "+";
123         case Type.DASH: return "-";
124         case Type.STAR: return "*";
125         case Type.FSLASH: return "/";
126         case Type.PERCENT: return "%";
127         case Type.POW: return "**";
128         case Type.DOT: return ".";
129         case Type.INC: return "++";
130         case Type.DEC: return "--"; 
131         case Type.BIT_AND: return "&";
132         case Type.BIT_XOR: return "^";
133         case Type.BIT_OR: return "|";
134         case Type.BIT_NOT: return "~";
135         case Type.BIT_LSHIFT: return "<<";
136         case Type.BIT_RSHIFT: return ">>";
137         case Type.BIT_URSHIFT: return ">>>";
138         case Type.LPAREN: return "(";
139         case Type.RPAREN: return ")";
140         case Type.LBRACE: return "{";
141         case Type.RBRACE: return "}";
142         case Type.LBRACKET: return "[";
143         case Type.RBRACKET: return "]";
144         case Type.SEMICOLON: return ";";
145         case Type.COMMA: return ",";
146         case Type.LABEL: return text ~ ":";
147         case Type.QUESTION: return "?";
148         case Type.COLON: return ":";
149         case Type.INVALID: return "#";
150         }
151     }
152 
153     /**
154      * Returns true if a token is both a keyword and a specific keyword.
155      */
156     bool isKeyword(in string keyword) const
157     {
158         return (type == Type.KEYWORD && text == keyword);
159     }
160 
161     /**
162      * Checks for a specific identifier
163      */
164     bool isIdentifier(in string id) const 
165     {
166         return (type == Type.IDENTIFIER && text == id);
167     }
168 
169     /**
170      * Returns true if the token is an assignment operator such as =, +=, or -=, etc.
171      */
172     bool isAssignmentOperator()
173     {
174         return (type == Type.ASSIGN || type == Type.PLUS_ASSIGN || type == Type.DASH_ASSIGN);
175     }
176 
177     /**
178      * Generates an invalid token at the given position. This is used by the Lexer to throw
179      * an exception that requires a token.
180      */
181     static Token createInvalidToken(in Position pos, in string text="")
182     {
183         auto token = Token(Token.Type.INVALID, pos, text);
184         return token;
185     }
186 
187     /**
188      * Used by the parser
189      */
190     static Token createFakeToken(in Type t, in string txt)
191     {
192         Token tok;
193         tok.type = t;
194         tok.position = Position(0,0);
195         tok.text = txt;
196         return tok;
197     }
198 }
199 
200 private bool startsKeywordOrIdentifier(in char ch)
201 {
202     // TODO support unicode by converting string to dchar
203     return ch.isAlpha || ch == '_' || ch == '$';
204 }
205 
206 private bool continuesKeywordOrIdentifier(in char ch)
207 {
208     // TODO support unicode by converting string to dchar
209     return ch.isAlphaNum || ch == '_' || ch == '$';
210 }
211 
212 private bool charIsValidDigit(in char ch, in Token.LiteralFlag lflag)
213 {
214     if(lflag == Token.LiteralFlag.NONE)
215         return ch.isDigit || ch == '.' || ch == 'e';
216     else if(lflag == Token.LiteralFlag.HEXADECIMAL)
217         return ch.isDigit || (ch.toLower >= 'a' && ch.toLower <= 'f');
218     else if(lflag == Token.LiteralFlag.OCTAL)
219         return (ch >= '0' && ch <= '7');
220     else if(lflag == Token.LiteralFlag.BINARY)
221         return ch == '0' || ch == '1';
222     return false;
223 }
224 
225 /// Lexes code and returns the individual tokens
226 struct Lexer 
227 {
228 public:
229     /// Constructor takes code as text to tokenize
230     this(string code)
231     {
232         _text = code;
233     }
234 
235     /// Returns tokens from lexing a string of code
236     Token[] tokenize()
237     {
238         Token[] tokens = [];
239         if (_text == "")
240             return tokens;
241         while(_index < _text.length)
242         {
243             // ignore white space
244             while(currentChar.isWhite())
245                 advanceChar();
246             if(currentChar.startsKeywordOrIdentifier)
247                 tokens ~= makeIdKwOrLabel();
248             else if(currentChar.isDigit)
249                 tokens ~= makeIntOrDoubleToken();
250             else if(currentChar == '\'' || currentChar == '"' || currentChar == '`')
251                 tokens ~= makeStringToken();
252             else if(currentChar == '>')
253                 tokens ~= makeRAngleBracketToken();
254             else if(currentChar == '<')
255                 tokens ~= makeLAngleBracketToken();
256             else if(currentChar == '=')
257                 tokens ~= makeEqualToken();
258             else if(currentChar == '!')
259                 tokens ~= makeNotToken();
260             else if(currentChar == '&')
261                 tokens ~= makeAndToken();
262             else if(currentChar == '|')
263                 tokens ~= makeOrToken();
264             else if(currentChar == '+')
265                 tokens ~= makePlusToken();
266             else if(currentChar == '-')
267                 tokens ~= makeDashToken();
268             else if(currentChar == '*')
269                 tokens ~= makeStarToken();
270             else if(currentChar == '/')
271                 tokens = handleFSlash(tokens);
272             else if(currentChar == '%')
273                 tokens ~= Token(Token.Type.PERCENT, _position);
274             else if(currentChar == '^')
275                 tokens ~= Token(Token.Type.BIT_XOR, _position);
276             else if(currentChar == '~')
277                 tokens ~= Token(Token.Type.BIT_NOT, _position);
278             else if(currentChar == '(')
279                 tokens ~= Token(Token.Type.LPAREN, _position);
280             else if(currentChar == ')')
281                 tokens ~= Token(Token.Type.RPAREN, _position);
282             else if(currentChar == '{')
283                 tokens ~= Token(Token.Type.LBRACE, _position);
284             else if(currentChar == '}')
285                 tokens ~= Token(Token.Type.RBRACE, _position);
286             else if(currentChar == '[')
287                 tokens ~= Token(Token.Type.LBRACKET, _position);
288             else if(currentChar == ']')
289                 tokens ~= Token(Token.Type.RBRACKET, _position);
290             else if(currentChar == ';')
291                 tokens ~= Token(Token.Type.SEMICOLON, _position);
292             else if(currentChar == ',')
293                 tokens ~= Token(Token.Type.COMMA, _position);
294             else if(currentChar == '.')
295                 tokens ~= Token(Token.Type.DOT, _position);
296             else if(currentChar == ':')
297                 tokens ~= Token(Token.Type.COLON, _position);
298             else if(currentChar == '?')
299                 tokens ~= Token(Token.Type.QUESTION, _position);
300             else if(currentChar == '\0')
301                 tokens ~= Token(Token.Type.EOF, _position);
302             else
303                 throw new ScriptCompileException("Invalid character " ~ currentChar, 
304                     Token.createInvalidToken(_position, [currentChar]));
305             advanceChar();
306         }
307         return tokens;
308     }
309 
310     /// Hash table of keywords
311     static immutable KEYWORDS = redBlackTree(
312         "true", "false", "undefined", "null",
313         "var", "let", "const", 
314         "if", "else", "while", "do", "for", "of", "in",
315         "switch", "case", "default",
316         "break", "continue", "return", 
317         "function", "class", "super", "extends",
318         "new", "delete", "typeof", "instanceof",
319         "throw", "try", "catch"
320     );
321 
322     /// AA of look up for escape chars based on character after \
323     static immutable char[char] ESCAPE_CHARS;
324 
325     /// Initializes the associative array of escape chars
326     shared static this()
327     {
328         ESCAPE_CHARS = [
329             'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', 
330             '0': '\0', '\'': '\'', '"': '"', '\\': '\\'
331         ];
332     }
333 
334 private:
335 
336     void advanceChar()
337     {
338         ++_index;
339         _position.advance(currentChar());
340     }
341 
342     char currentChar()
343     {
344         if(_index < _text.length)
345             return _text[_index];
346         else
347             return '\0';
348     }
349 
350     char peekChar()
351     {
352         if(_index + 1 < _text.length)
353             return _text[_index + 1];
354         else
355             return '\0';
356     }
357 
358     Token makeIdKwOrLabel()
359     {
360         immutable start = _index;
361         immutable startpos = _position;
362         advanceChar();
363         while(currentChar.continuesKeywordOrIdentifier)
364             advanceChar();
365         auto text = _text[start.._index];
366         --_index; // UGLY but IDK what else to do
367         // first check for keyword, that can't be a label
368         if(text in KEYWORDS)
369         {
370             return Token(Token.Type.KEYWORD, startpos, text);
371         }
372         else if(peekChar == ':')
373         {
374             advanceChar();
375             return Token(Token.Type.LABEL, startpos, text);
376         }
377         else
378         {
379             return Token(Token.Type.IDENTIFIER, startpos, text);
380         }
381     }
382 
383     Token makeIntOrDoubleToken()
384     {
385         immutable start = _index;
386         immutable startpos = _position;
387         auto dotCounter = 0;
388         auto eCounter = 0;
389         Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
390         if(peekChar.toLower == 'x')
391         {
392             lflag = Token.LiteralFlag.HEXADECIMAL;
393             advanceChar();
394         }
395         else if(peekChar.toLower == 'o')
396         {
397             lflag = Token.LiteralFlag.OCTAL;
398             advanceChar();
399         }
400         else if(peekChar.toLower == 'b')
401         {
402             lflag = Token.LiteralFlag.BINARY;
403             advanceChar();
404         }
405         // if the lflag was set, the first char has to be 0
406         if(lflag != Token.LiteralFlag.NONE && _text[start] != '0')
407             throw new ScriptCompileException("Malformed integer literal", Token.createInvalidToken(startpos));
408         
409         // while(peekChar.isDigit || peekChar == '.' || peekChar.toLower == 'e')
410         while(peekChar.charIsValidDigit(lflag))
411         {
412             advanceChar();
413             if(lflag == Token.LiteralFlag.NONE)
414             {
415                 if(currentChar == '.')
416                 {
417                     ++dotCounter;
418                     if(dotCounter > 1)
419                         throw new ScriptCompileException("Too many decimals in number literal", 
420                             Token.createInvalidToken(_position));
421                 }
422                 else if(currentChar.toLower == 'e')
423                 {
424                     ++eCounter;
425                     if(eCounter > 1)
426                         throw new ScriptCompileException("Numbers can only have one exponent specifier", 
427                             Token.createInvalidToken(_position));
428                     if(peekChar == '+' || peekChar == '-')
429                         advanceChar();
430                     if(!peekChar.isDigit)
431                         throw new ScriptCompileException("Exponent specifier must be followed by number", 
432                             Token.createInvalidToken(_position));
433                 }
434             }
435         }
436         auto text = _text[start.._index+1];
437         if(lflag != Token.LiteralFlag.NONE && text.length <= 2)
438             throw new ScriptCompileException("Malformed hex/octal/binary integer", Token.createInvalidToken(startpos));
439         Token resultToken;
440         if(dotCounter == 0 && eCounter == 0)
441             resultToken = Token(Token.Type.INTEGER, startpos, text);
442         else
443             resultToken = Token(Token.Type.DOUBLE, startpos, text);
444         resultToken.literalFlag = lflag;
445         return resultToken;
446     }
447 
448     Token makeStringToken()
449     {
450         immutable closeQuote = currentChar;
451         auto startpos = _position;
452         advanceChar();
453         string text = "";
454         Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
455         if(closeQuote == '`')
456             lflag = Token.LiteralFlag.TEMPLATE_STRING;
457         while(currentChar != closeQuote)
458         {
459             if(currentChar == '\0')
460                 throw new ScriptCompileException("Missing close quote for string literal", 
461                     Token.createInvalidToken(_position, text));
462             else if(currentChar == '\n' && lflag != Token.LiteralFlag.TEMPLATE_STRING)
463                 throw new ScriptCompileException("Line breaks inside string literal are not allowed", 
464                     Token.createInvalidToken(_position, text));
465             else if(currentChar == '\\') // TODO handle \u0000 and \u00 sequences
466             {
467                 advanceChar();
468                 if(currentChar in ESCAPE_CHARS)
469                     text ~= ESCAPE_CHARS[currentChar];
470                 else
471                     throw new ScriptCompileException("Unknown escape character " ~ currentChar, 
472                         Token.createInvalidToken(_position));
473             }
474             else
475                 text ~= currentChar;
476             advanceChar();
477         }
478         auto tok = Token(Token.Type.STRING, startpos, text);
479         tok.literalFlag = lflag;
480         return tok;
481     }
482 
483     Token makeRAngleBracketToken()
484     {
485         auto startpos = _position;
486         if(peekChar == '=')
487         {
488             advanceChar();
489             return Token(Token.Type.GE, startpos);
490         }
491         else if(peekChar == '>')
492         {
493             advanceChar();
494             if(peekChar == '>')
495             {
496                 advanceChar();
497                 return Token(Token.Type.BIT_URSHIFT, startpos);
498             }
499             else
500             {
501                 return Token(Token.Type.BIT_RSHIFT, startpos);
502             }
503         }
504         else
505         {
506             return Token(Token.Type.GT, startpos);
507         }
508     }
509 
510     Token makeLAngleBracketToken()
511     {
512         auto startpos = _position;
513         if(peekChar == '=')
514         {
515             advanceChar();
516             return Token(Token.Type.LE, startpos);
517         }
518         else if(peekChar == '<')
519         {
520             advanceChar();
521             return Token(Token.Type.BIT_LSHIFT, startpos);
522         }
523         else
524         {
525             return Token(Token.Type.LT, startpos);
526         }
527     }
528 
529     Token makeEqualToken()
530     {
531         auto startpos = _position;
532         if(peekChar == '=')
533         {
534             advanceChar();
535             if(peekChar == '=')
536             {
537                 advanceChar();
538                 return Token(Token.Type.STRICT_EQUALS);
539             }
540             else
541             {
542                 return Token(Token.Type.EQUALS, startpos);
543             }
544         }
545         else
546         {
547             return Token(Token.Type.ASSIGN, startpos);
548         }
549     }
550 
551     Token makeNotToken()
552     {
553         auto startpos = _position;
554         if(peekChar == '=')
555         {
556             advanceChar();
557             if(peekChar == '=')
558             {
559                 advanceChar();
560                 return Token(Token.Type.STRICT_NEQUALS, startpos);
561             }
562             else
563             {
564                 return Token(Token.Type.NEQUALS, startpos);
565             }
566         }
567         else
568         {
569             return Token(Token.Type.NOT, startpos);
570         }
571     }
572 
573     Token makeAndToken()
574     {
575         auto startpos = _position;
576         if(peekChar == '&')
577         {
578             advanceChar();
579             return Token(Token.Type.AND, startpos);
580         }
581         else
582         {
583             return Token(Token.Type.BIT_AND, startpos);
584         }
585     }
586 
587     Token makeOrToken()
588     {
589         auto startpos = _position;
590         if(peekChar == '|')
591         {
592             advanceChar();
593             return Token(Token.Type.OR, startpos);
594         }
595         else
596         {
597             return Token(Token.Type.BIT_OR, startpos);
598         }
599     }
600 
601     Token makePlusToken()
602     {
603         auto startpos = _position;
604         if(peekChar == '+')
605         {
606             advanceChar();
607             return Token(Token.Type.INC, startpos);
608         }
609         else if(peekChar == '=')
610         {
611             advanceChar();
612             return Token(Token.Type.PLUS_ASSIGN, startpos);
613         }
614         else
615         {
616             return Token(Token.Type.PLUS, startpos);
617         }
618     }
619 
620     Token makeDashToken()
621     {
622         auto startpos = _position;
623         if(peekChar == '-')
624         {
625             advanceChar();
626             return Token(Token.Type.DEC, startpos);
627         }
628         else if(peekChar == '=')
629         {
630             advanceChar();
631             return Token(Token.Type.DASH_ASSIGN, startpos);
632         }
633         else
634         {
635             return Token(Token.Type.DASH, startpos);
636         }
637     }
638 
639     Token makeStarToken()
640     {
641         auto startpos = _position;
642         if(peekChar == '*')
643         {
644             advanceChar();
645             return Token(Token.Type.POW, startpos);
646         }
647         else
648         {
649             return Token(Token.Type.STAR, startpos);
650         }
651     }
652 
653     Token[] handleFSlash(Token[] tokens)
654     {
655         if(peekChar == '*')
656         {
657             advanceChar();
658             while(peekChar != '\0')
659             {
660                 if(peekChar == '*')
661                 {
662                     advanceChar();
663                     if(peekChar == '/')
664                         break;
665                 }
666                 advanceChar();
667             }
668             advanceChar();
669         }
670         else if(peekChar == '/')
671         {
672             advanceChar();
673             while(peekChar != '\n' && peekChar != '\0')
674             {
675                 advanceChar();
676             }
677         }
678         else
679         {
680             tokens ~= Token(Token.Type.FSLASH, _position);
681         }
682 
683         return tokens;
684     }
685 
686     Position _position = {1, 1};
687     string _text;
688     size_t _index = 0;
689 }
690 
691 unittest
692 {
693     auto lexer = Lexer("1.2 34 5.e-99 'foo' ");
694     auto tokens = lexer.tokenize();
695     assert(tokens[0].type == Token.Type.DOUBLE);
696     assert(tokens[1].type == Token.Type.INTEGER);
697     assert(tokens[2].type == Token.Type.DOUBLE);
698     assert(tokens[3].type == Token.Type.STRING && tokens[3].text == "foo");
699     // TODO complete unit tests of every token type
700 }