mildew.lexer source code

1 /**
2 This module implements the Token and Lexer structs
3 
4 ────────────────────────────────────────────────────────────────────────────────
5 
6 Copyright (C) 2021 pillager86.rf.gd
7 
8 This program is free software: you can redistribute it and/or modify it under 
9 the terms of the GNU General Public License as published by the Free Software 
10 Foundation, either version 3 of the License, or (at your option) any later 
11 version.
12 
13 This program is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License along with 
18 this program.  If not, see <https://www.gnu.org/licenses/>.
19 */
20 module mildew.lexer;
21 
22 import mildew.exceptions: ScriptCompileException;
23 import mildew.util.regex;
24 
25 import std.ascii; // temp until unicode support
26 import std.container.rbtree;
27 import std.conv: to;
28 import std.format: format;
29 import std.utf: encode;
30 
/**
 * A line/column location inside script source text. The Lexer in this module
 * starts counting at line 1, column 1.
 */
struct Position
{
    /// Line and column number.
    int line, column;

    /// Renders the location as "line L, column C".
    string toString() const 
    {
        return format("line %s, column %s", line, column);
    }

    /**
     * Updates the location for a character that has just been read.
     * Params:
     *   ch = the character read. '\0' leaves the location untouched, '\n'
     *        moves to column 1 of the next line, and anything else moves one
     *        column to the right.
     */
    void advance(char ch)
    {
        switch(ch)
        {
        case '\0':
            // the NUL sentinel never moves the position
            break;
        case '\n':
            ++line;
            column = 1;
            break;
        default:
            ++column;
            break;
        }
    }
}
63 
/**
 * A single lexical token, the basic unit the parser works with. Script source is first
 * split into tokens by the Lexer so that the parser can analyze each one.
 */
struct Token 
{
    /**
     * The type of a token.
     */
    enum Type 
    {
        EOF, KEYWORD, INTEGER, DOUBLE, STRING, IDENTIFIER, REGEX,
        NOT, AND, OR, GT, GE, LT, LE,
        EQUALS, NEQUALS, STRICT_EQUALS, STRICT_NEQUALS,

        ASSIGN, 
        POW_ASSIGN, STAR_ASSIGN, FSLASH_ASSIGN, PERCENT_ASSIGN,
        PLUS_ASSIGN, DASH_ASSIGN,
        BAND_ASSIGN, BXOR_ASSIGN, BOR_ASSIGN, BLS_ASSIGN, BRS_ASSIGN, BURS_ASSIGN,

        PLUS, DASH, STAR, FSLASH, PERCENT, POW, DOT, TDOT,
        INC, DEC, // ++ and --
        BIT_AND, BIT_XOR, BIT_OR, BIT_NOT, BIT_LSHIFT, BIT_RSHIFT, BIT_URSHIFT,
        LPAREN, RPAREN, LBRACE, RBRACE, LBRACKET, RBRACKET, 
        SEMICOLON, COMMA, LABEL, QUESTION, COLON, ARROW, 
        NULLC, // null coalesce
        
        INVALID
    }

    /**
     * This enum is for literal value tokens that require special handling by the parser
     */
    enum LiteralFlag
    {
        NONE, BINARY, OCTAL, HEXADECIMAL, TEMPLATE_STRING
    }

    /// Type of token
    Type type;
    /// Position where token occurs
    Position position;
    /// Optional text: the source text or value of keywords, identifiers, labels and literals
    string text;
    /// Optional flag for literals: the radix of an integer literal, or TEMPLATE_STRING for a string
    LiteralFlag literalFlag = LiteralFlag.NONE;

    /**
     * Returns a string of the form "[TYPE]" or, when the token has text, "[TYPE|text]".
     */
    string toString() const
    {
        string str = "[" ~ type.to!string;
        if(text.length != 0)
            str ~= "|" ~ text;
        str ~= "]";
        return str;
    }

    /**
     * Returns a textual representation of the token as it was found in the original script source code.
     * Tokens that carry text return that text (a LABEL gets ':' appended); operators and
     * punctuation return their fixed spelling; EOF returns "\0" and INVALID returns "#".
     */
    string symbol() const
    {
        final switch(type)
        {
        case Type.EOF:
            return "\0";
        case Type.KEYWORD: case Type.INTEGER: case Type.DOUBLE: case Type.STRING: case Type.IDENTIFIER: case Type.REGEX:
            return text;
        case Type.NOT: return "!";
        case Type.AND: return "&&";
        case Type.OR: return "||";
        case Type.GT: return ">";
        case Type.GE: return ">=";
        case Type.LT: return "<";
        case Type.LE: return "<=";
        case Type.EQUALS: return "==";
        case Type.NEQUALS: return "!=";
        case Type.STRICT_EQUALS: return "===";
        case Type.STRICT_NEQUALS: return "!==";
        case Type.ASSIGN: return "=";
        case Type.POW_ASSIGN: return "**=";
        case Type.STAR_ASSIGN: return "*=";
        case Type.FSLASH_ASSIGN: return "/=";
        case Type.PERCENT_ASSIGN: return "%=";
        case Type.PLUS_ASSIGN: return "+=";
        case Type.DASH_ASSIGN: return "-=";
        case Type.BAND_ASSIGN: return "&=";
        case Type.BXOR_ASSIGN: return "^=";
        case Type.BOR_ASSIGN: return "|=";
        case Type.BLS_ASSIGN: return "<<=";
        case Type.BRS_ASSIGN: return ">>=";
        case Type.BURS_ASSIGN: return ">>>=";
        case Type.PLUS: return "+";
        case Type.DASH: return "-";
        case Type.STAR: return "*";
        case Type.FSLASH: return "/";
        case Type.PERCENT: return "%";
        case Type.POW: return "**";
        case Type.DOT: return ".";
        case Type.TDOT: return "...";
        case Type.INC: return "++";
        case Type.DEC: return "--"; 
        case Type.BIT_AND: return "&";
        case Type.BIT_XOR: return "^";
        case Type.BIT_OR: return "|";
        case Type.BIT_NOT: return "~";
        case Type.BIT_LSHIFT: return "<<";
        case Type.BIT_RSHIFT: return ">>";
        case Type.BIT_URSHIFT: return ">>>";
        case Type.LPAREN: return "(";
        case Type.RPAREN: return ")";
        case Type.LBRACE: return "{";
        case Type.RBRACE: return "}";
        case Type.LBRACKET: return "[";
        case Type.RBRACKET: return "]";
        case Type.SEMICOLON: return ";";
        case Type.COMMA: return ",";
        case Type.LABEL: return text ~ ":";
        case Type.QUESTION: return "?";
        case Type.COLON: return ":";
        case Type.ARROW: return "=>";
        case Type.NULLC: return "??";
        case Type.INVALID: return "#";
        }
    }

    /**
     * Returns true if a token is both a keyword and a specific keyword.
     */
    bool isKeyword(in string keyword) const
    {
        return type == Type.KEYWORD && text == keyword;
    }

    /**
     * Returns true if the token is an identifier whose text equals id.
     */
    bool isIdentifier(in string id) const 
    {
        return type == Type.IDENTIFIER && text == id;
    }

    /**
     * Returns true if the token is an assignment operator such as =, +=, or -=, etc.
     * Marked const, like the other query methods, so it can be called on const Tokens.
     */
    bool isAssignmentOperator() const
    {
        switch(type)
        {
        case Type.ASSIGN:
        case Type.POW_ASSIGN:
        case Type.STAR_ASSIGN:
        case Type.FSLASH_ASSIGN:
        case Type.PERCENT_ASSIGN:
        case Type.PLUS_ASSIGN:
        case Type.DASH_ASSIGN:
        case Type.BAND_ASSIGN:
        case Type.BXOR_ASSIGN:
        case Type.BOR_ASSIGN:
        case Type.BLS_ASSIGN:
        case Type.BRS_ASSIGN:
        case Type.BURS_ASSIGN:
            return true;
        default:
            return false;
        }
    }

    /**
     * Generates an invalid token at the given position. This is used by the Lexer to throw
     * an exception that requires a token.
     * Params:
     *   pos = where the problem was found
     *   text = optional offending text to attach to the token
     */
    static Token createInvalidToken(in Position pos, in string text="")
    {
        return Token(Token.Type.INVALID, pos, text);
    }

    /**
     * Used by the parser and compiler. Builds a token of type t with text txt that does not
     * come from source code; its position is line 0, column 0.
     */
    static Token createFakeToken(in Type t, in string txt)
    {
        Token tok;
        tok.type = t;
        tok.position = Position(0,0);
        tok.text = txt;
        return tok;
    }
}
251 
/// True when ch may begin a keyword or identifier: an ASCII letter, '_' or '$'.
private bool startsKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    if(ch == '_' || ch == '$')
        return true;
    return isAlpha(ch);
}
257 
/// True when ch may appear after the first character of a keyword or identifier:
/// an ASCII letter or digit, '_' or '$'.
private bool continuesKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    if(ch == '_' || ch == '$')
        return true;
    return isAlphaNum(ch);
}
263 
/**
 * Tells whether ch may appear inside a number literal of the given kind.
 * Params:
 *   ch = the character to test
 *   lflag = the kind of literal being read. NONE means a decimal literal, where digits,
 *           '.' and an exponent marker ('e' or 'E') are accepted; HEXADECIMAL, OCTAL and
 *           BINARY accept only the digits of that radix.
 * Returns: true if ch is acceptable; always false for any other flag value.
 */
private bool charIsValidDigit(in char ch, in Token.LiteralFlag lflag)
{
    switch(lflag)
    {
    case Token.LiteralFlag.NONE:
        // accept the exponent marker in either case; the number lexer already
        // compares it case-insensitively
        return ch.isDigit || ch == '.' || ch.toLower == 'e';
    case Token.LiteralFlag.HEXADECIMAL:
        return ch.isHexDigit;
    case Token.LiteralFlag.OCTAL:
        return ch.isOctalDigit;
    case Token.LiteralFlag.BINARY:
        return ch == '0' || ch == '1';
    default:
        return false;
    }
}
276 
277 /// Lexes code and returns the individual tokens
278 struct Lexer 
279 {
280 public:
281     /// Constructor takes code as text to tokenize
282     this(string code)
283     {
284         _text = code;
285     }
286 
287     /// Returns tokens from lexing a string of code
288     Token[] tokenize()
289     {
290         Token[] tokens = [];
291         if (_text == "")
292             return tokens;
293         while(_index < _text.length)
294         {
295             // ignore white space
296             while(currentChar.isWhite())
297                 advanceChar();
298             if(currentChar.startsKeywordOrIdentifier)
299                 tokens ~= makeIdKwOrLabel(tokens);
300             else if(currentChar.isDigit)
301                 tokens ~= makeIntOrDoubleToken();
302             else if(currentChar == '\'' || currentChar == '"' || currentChar == '`')
303                 tokens ~= makeStringToken(tokens);
304             else if(currentChar == '>')
305                 tokens ~= makeRAngleBracketToken();
306             else if(currentChar == '<')
307                 tokens ~= makeLAngleBracketToken();
308             else if(currentChar == '=')
309                 tokens ~= makeEqualToken();
310             else if(currentChar == '!')
311                 tokens ~= makeNotToken();
312             else if(currentChar == '&')
313                 tokens ~= makeAndToken();
314             else if(currentChar == '|')
315                 tokens ~= makeOrToken();
316             else if(currentChar == '+')
317                 tokens ~= makePlusToken();
318             else if(currentChar == '-')
319                 tokens ~= makeDashToken();
320             else if(currentChar == '*')
321                 tokens ~= makeStarToken();
322             else if(currentChar == '/')
323                 tokens = handleFSlash(tokens);
324             else if(currentChar == '%')
325                 tokens ~= makePercentToken();
326             else if(currentChar == '^')
327                 tokens ~= makeXorToken();
328             else if(currentChar == '~')
329                 tokens ~= Token(Token.Type.BIT_NOT, _position);
330             else if(currentChar == '(')
331                 tokens ~= Token(Token.Type.LPAREN, _position);
332             else if(currentChar == ')')
333                 tokens ~= Token(Token.Type.RPAREN, _position);
334             else if(currentChar == '{')
335                 tokens ~= Token(Token.Type.LBRACE, _position);
336             else if(currentChar == '}')
337                 tokens ~= Token(Token.Type.RBRACE, _position);
338             else if(currentChar == '[')
339                 tokens ~= Token(Token.Type.LBRACKET, _position);
340             else if(currentChar == ']')
341                 tokens ~= Token(Token.Type.RBRACKET, _position);
342             else if(currentChar == ';')
343                 tokens ~= Token(Token.Type.SEMICOLON, _position);
344             else if(currentChar == ',')
345                 tokens ~= Token(Token.Type.COMMA, _position);
346             else if(currentChar == '.')
347                 tokens ~= makeDotTokens();
348             else if(currentChar == ':')
349                 tokens ~= Token(Token.Type.COLON, _position);
350             else if(currentChar == '?')
351                 tokens ~= makeQuestionToken();
352             else if(currentChar == '\0')
353                 tokens ~= Token(Token.Type.EOF, _position);
354             else
355                 throw new ScriptCompileException("Invalid character " ~ currentChar, 
356                     Token.createInvalidToken(_position, [currentChar]));
357             advanceChar();
358         }
359         return tokens;
360     }
361 
362     /// Hash table of keywords
363     static immutable KEYWORDS = redBlackTree(
364         "true", "false", "undefined", "null",
365         "var", "let", "const", 
366         "if", "else", "while", "do", "for", "in",
367         "switch", "case", "default",
368         "break", "continue", "return", 
369         "function", "class", "super", "extends",
370         "new", "delete", "typeof", "instanceof",
371         "throw", "try", "catch", "finally", 
372         "yield"
373     );
374 
375     /// AA of look up for escape chars based on character after \
376     static immutable char[char] ESCAPE_CHARS;
377 
378     /// Initializes the associative array of escape chars
379     shared static this()
380     {
381         ESCAPE_CHARS = [
382             'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', 
383             '0': '\0', '\'': '\'', '"': '"', '\\': '\\'
384         ];
385     }
386 
387 private:
388 
389     void advanceChar()
390     {
391         ++_index;
392         _position.advance(currentChar());
393     }
394 
395     char currentChar()
396     {
397         if(_index < _text.length)
398             return _text[_index];
399         else
400             return '\0';
401     }
402 
403     char peekChar()
404     {
405         if(_index + 1 < _text.length)
406             return _text[_index + 1];
407         else
408             return '\0';
409     }
410 
411     bool canMakeRegex(Token[] tokens)
412     {
413         if(tokens.length == 0)
414             return true;
415         switch(tokens[$-1].type)
416         {
417         case Token.Type.IDENTIFIER:
418         case Token.Type.INTEGER:
419         case Token.Type.DOUBLE:
420         case Token.Type.STRING:
421         case Token.Type.RBRACKET:
422         case Token.Type.RPAREN:
423         case Token.Type.INC:
424         case Token.Type.DEC:
425             return false;
426         case Token.Type.KEYWORD:
427             switch(tokens[$-1].text)
428             {
429             case "null":
430             case "true":
431             case "false":
432                 return false;
433             default:
434                 return true;
435             }
436         default:
437             return true;
438         }
439     }
440 
441     Token makeIdKwOrLabel(Token[] tokens)
442     {
443         immutable start = _index;
444         immutable startpos = _position;
445         advanceChar();
446         while(currentChar.continuesKeywordOrIdentifier)
447             advanceChar();
448         auto text = _text[start.._index];
449         --_index; // UGLY but IDK what else to do
450         // first check for keyword, that can't be a label
451 
452         // these words can be used as object members but not as labels
453         if(text == "return" || text == "throw" || text == "delete"
454          || text == "catch" || text == "finally")
455         {
456             if(tokens.length > 0 && tokens[$-1].type == Token.Type.DOT)
457                 return Token(Token.Type.IDENTIFIER, startpos, text);
458         }
459 
460         if(text in KEYWORDS)
461         {
462             return Token(Token.Type.KEYWORD, startpos, text);
463         }
464         else if(peekChar == ':')
465         {
466             advanceChar();
467             return Token(Token.Type.LABEL, startpos, text);
468         }
469         else
470         {
471             return Token(Token.Type.IDENTIFIER, startpos, text);
472         }
473     }
474 
475     Token makeIntOrDoubleToken()
476     {
477         immutable start = _index;
478         immutable startpos = _position;
479         auto dotCounter = 0;
480         auto eCounter = 0;
481         Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
482         if(peekChar.toLower == 'x')
483         {
484             lflag = Token.LiteralFlag.HEXADECIMAL;
485             advanceChar();
486         }
487         else if(peekChar.toLower == 'o')
488         {
489             lflag = Token.LiteralFlag.OCTAL;
490             advanceChar();
491         }
492         else if(peekChar.toLower == 'b')
493         {
494             lflag = Token.LiteralFlag.BINARY;
495             advanceChar();
496         }
497         // if the lflag was set, the first char has to be 0
498         if(lflag != Token.LiteralFlag.NONE && _text[start] != '0')
499             throw new ScriptCompileException("Malformed integer literal", Token.createInvalidToken(startpos));
500         
501         while(peekChar.charIsValidDigit(lflag))
502         {
503             advanceChar();
504             if(lflag == Token.LiteralFlag.NONE)
505             {
506                 if(currentChar == '.')
507                 {
508                     ++dotCounter;
509                     if(dotCounter > 1)
510                         throw new ScriptCompileException("Too many decimals in number literal", 
511                             Token.createInvalidToken(_position));
512                 }
513                 else if(currentChar.toLower == 'e')
514                 {
515                     ++eCounter;
516                     if(eCounter > 1)
517                         throw new ScriptCompileException("Numbers may only have one exponent specifier", 
518                             Token.createInvalidToken(_position));
519                     if(peekChar == '+' || peekChar == '-')
520                         advanceChar();
521                     if(!peekChar.isDigit)
522                         throw new ScriptCompileException("Exponent specifier must be followed by number", 
523                             Token.createInvalidToken(_position));
524                 }
525             }
526         }
527         auto text = _text[start.._index+1];
528         if(lflag != Token.LiteralFlag.NONE && text.length <= 2)
529             throw new ScriptCompileException("Malformed hex/octal/binary integer", Token.createInvalidToken(startpos));
530         Token resultToken;
531         if(dotCounter == 0 && eCounter == 0)
532             resultToken = Token(Token.Type.INTEGER, startpos, text);
533         else
534             resultToken = Token(Token.Type.DOUBLE, startpos, text);
535         resultToken.literalFlag = lflag;
536         return resultToken;
537     }
538 
539     Token makeStringToken(ref Token[] previous)
540     {
541         immutable closeQuote = currentChar;
542         auto startpos = _position;
543         advanceChar();
544         string text = "";
545         bool escapeChars = true;
546         if(previous.length >= 3)
547         {
548             if(previous[$-1].isIdentifier("raw") &&
549                previous[$-2].type == Token.Type.DOT &&
550                previous[$-3].isIdentifier("String"))
551             {
552                 escapeChars = false;
553                 previous = previous[0.. $-3];
554             }
555         }
556         Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
557         if(closeQuote == '`')
558             lflag = Token.LiteralFlag.TEMPLATE_STRING;
559         while(currentChar != closeQuote)
560         {
561             if(currentChar == '\0')
562                 throw new ScriptCompileException("Missing close quote for string literal", 
563                     Token.createInvalidToken(_position, text));
564             else if(currentChar == '\n' && lflag != Token.LiteralFlag.TEMPLATE_STRING)
565                 throw new ScriptCompileException("Line breaks inside regular string literals are not allowed", 
566                     Token.createInvalidToken(_position, text));
567             else if(currentChar == '\\' && escapeChars)
568             {
569                 advanceChar();
570                 if(currentChar in ESCAPE_CHARS)
571                     text ~= ESCAPE_CHARS[currentChar];
572                 else if(currentChar == 'u')
573                 {
574                     advanceChar();
575                     string accum = "";
576                     bool usingBraces = false;
577                     int limitCounter;
578                     immutable LIMIT = 4; // without the braces
579                     if(currentChar == '{')
580                     {
581                         advanceChar();
582                         usingBraces = true;
583                     }
584                     while(currentChar.charIsValidDigit(Token.LiteralFlag.HEXADECIMAL))
585                     {
586                         if(limitCounter >= LIMIT && !usingBraces)
587                             break;
588                         accum ~= currentChar;
589                         advanceChar();
590                         if(!usingBraces)
591                             ++limitCounter;
592                     }
593                     if(currentChar == '}' && usingBraces)
594                         advanceChar();
595                     --_index;
596                     try 
597                     {
598                         dchar result = cast(dchar)to!uint(accum, 16);
599                         char[] buf;
600                         encode(buf, result);
601                         text ~= buf;
602                     }
603                     catch(Exception ex)
604                     {
605                         throw new ScriptCompileException("Invalid UTF sequence in \\u char", 
606                             Token.createInvalidToken(_position, accum));
607                     }
608                 }
609                 else if(currentChar == 'x')
610                 {
611                     advanceChar();
612                     string accum = "";
613                     accum ~= currentChar;
614                     advanceChar();
615                     accum ~= currentChar;
616                     try 
617                     {
618                         char result = cast(char)to!ubyte(accum, 16);
619                         text ~= result;
620                     }
621                     catch(Exception ex)
622                     {
623                         throw new ScriptCompileException("Invalid hexadecimal number in \\x char",
624                             Token.createInvalidToken(_position, accum));
625                     }
626                 }
627                 else
628                     throw new ScriptCompileException("Unknown escape character " ~ currentChar, 
629                         Token.createInvalidToken(_position));
630             }
631             else
632                 text ~= currentChar;
633             advanceChar();
634         }
635         auto tok = Token(Token.Type.STRING, startpos, text);
636         tok.literalFlag = lflag;
637         return tok;
638     }
639 
640     Token makeRAngleBracketToken()
641     {
642         auto startpos = _position;
643         if(peekChar == '=')
644         {
645             advanceChar();
646             return Token(Token.Type.GE, startpos);
647         }
648         else if(peekChar == '>')
649         {
650             advanceChar();
651             if(peekChar == '>')
652             {
653                 advanceChar();
654                 if(peekChar == '=')
655                 {
656                     advanceChar();
657                     return Token(Token.Type.BURS_ASSIGN, startpos);
658                 }
659                 else
660                 {
661                     return Token(Token.Type.BIT_URSHIFT, startpos);
662                 }
663             }
664             else if(peekChar == '=')
665             {
666                 advanceChar();
667                 return Token(Token.Type.BRS_ASSIGN, startpos);
668             }
669             else
670             {
671                 return Token(Token.Type.BIT_RSHIFT, startpos);
672             }
673         }
674         else
675         {
676             return Token(Token.Type.GT, startpos);
677         }
678     }
679 
680     Token makeLAngleBracketToken()
681     {
682         auto startpos = _position;
683         if(peekChar == '=')
684         {
685             advanceChar();
686             return Token(Token.Type.LE, startpos);
687         }
688         else if(peekChar == '<')
689         {
690             advanceChar();
691             if(peekChar == '=')
692             {
693                 advanceChar();
694                 return Token(Token.Type.BLS_ASSIGN);
695             }
696             else
697             {
698                 return Token(Token.Type.BIT_LSHIFT, startpos);
699             }
700         }
701         else
702         {
703             return Token(Token.Type.LT, startpos);
704         }
705     }
706 
707     Token makeEqualToken()
708     {
709         auto startpos = _position;
710         if(peekChar == '=')
711         {
712             advanceChar();
713             if(peekChar == '=')
714             {
715                 advanceChar();
716                 return Token(Token.Type.STRICT_EQUALS);
717             }
718             else
719             {
720                 return Token(Token.Type.EQUALS, startpos);
721             }
722         }
723         else if(peekChar == '>')
724         {
725             advanceChar();
726             return Token(Token.Type.ARROW, startpos);
727         }
728         else
729         {
730             return Token(Token.Type.ASSIGN, startpos);
731         }
732     }
733 
734     Token makeNotToken()
735     {
736         auto startpos = _position;
737         if(peekChar == '=')
738         {
739             advanceChar();
740             if(peekChar == '=')
741             {
742                 advanceChar();
743                 return Token(Token.Type.STRICT_NEQUALS, startpos);
744             }
745             else
746             {
747                 return Token(Token.Type.NEQUALS, startpos);
748             }
749         }
750         else
751         {
752             return Token(Token.Type.NOT, startpos);
753         }
754     }
755 
756     Token makeAndToken()
757     {
758         auto startpos = _position;
759         if(peekChar == '&')
760         {
761             advanceChar();
762             return Token(Token.Type.AND, startpos);
763         }
764         else if(peekChar == '=')
765         {            
766             advanceChar();
767             return Token(Token.Type.BAND_ASSIGN, startpos);   
768         }
769         else 
770         {
771             return Token(Token.Type.BIT_AND, startpos);
772         }
773     }
774 
775     Token makeOrToken()
776     {
777         auto startpos = _position;
778         if(peekChar == '|')
779         {
780             advanceChar();
781             return Token(Token.Type.OR, startpos);
782         }
783         else if(peekChar == '=')
784         {
785             advanceChar();
786             return Token(Token.Type.BOR_ASSIGN, startpos);
787         }
788         else
789         {
790             return Token(Token.Type.BIT_OR, startpos);
791         }
792     }
793 
794     Token makePlusToken()
795     {
796         auto startpos = _position;
797         if(peekChar == '+')
798         {
799             advanceChar();
800             return Token(Token.Type.INC, startpos);
801         }
802         else if(peekChar == '=')
803         {
804             advanceChar();
805             return Token(Token.Type.PLUS_ASSIGN, startpos);
806         }
807         else
808         {
809             return Token(Token.Type.PLUS, startpos);
810         }
811     }
812 
813     Token makeDashToken()
814     {
815         auto startpos = _position;
816         if(peekChar == '-')
817         {
818             advanceChar();
819             return Token(Token.Type.DEC, startpos);
820         }
821         else if(peekChar == '=')
822         {
823             advanceChar();
824             return Token(Token.Type.DASH_ASSIGN, startpos);
825         }
826         else
827         {
828             return Token(Token.Type.DASH, startpos);
829         }
830     }
831 
832     Token makeStarToken()
833     {
834         auto startpos = _position;
835         if(peekChar == '*')
836         {
837             advanceChar();
838             if(peekChar == '=')
839             {
840                 advanceChar();
841                 return Token(Token.Type.POW_ASSIGN, startpos);
842             }
843             else
844             {
845                 return Token(Token.Type.POW, startpos);
846             }
847         }
848         else if(peekChar == '=')
849         {
850             advanceChar();
851             return Token(Token.Type.STAR_ASSIGN, startpos);
852         }
853         else
854         {
855             return Token(Token.Type.STAR, startpos);
856         }
857     }
858 
859     Token makePercentToken()
860     {
861         if(peekChar == '=')
862         {
863             immutable startpos = _position;
864             advanceChar();
865             return Token(Token.Type.PERCENT_ASSIGN, startpos);
866         }
867         else
868         {
869             return Token(Token.Type.PERCENT, _position);
870         }
871     }
872 
873     Token makeXorToken()
874     {
875         if(peekChar == '=')
876         {
877             immutable startpos = _position;
878             advanceChar();
879             return Token(Token.Type.BXOR_ASSIGN, startpos);
880         }
881         else
882         {
883             return Token(Token.Type.BIT_XOR, _position);
884         }
885     }
886 
887     Token[] makeDotTokens()
888     {
889         immutable startPos = _position;
890         if(peekChar == '.')
891         {
892             advanceChar();
893             if(peekChar == '.')
894             {
895                 advanceChar();
896                 return [Token(Token.Type.TDOT, startPos)];
897             }
898             else
899             {
900                 return [Token(Token.Type.DOT, startPos), Token(Token.Type.DOT, _position)];
901             }
902         }
903         else
904         {
905             return [Token(Token.Type.DOT, _position)];
906         }
907     }
908 
909     Token makeQuestionToken()
910     {
911         immutable startpos = _position;
912         if(peekChar == '?')
913         {
914             advanceChar();
915             return Token(Token.Type.NULLC, _position);
916         }
917         else
918         {
919             return Token(Token.Type.QUESTION, _position);
920         }
921     }
922 
923     Token[] handleFSlash(Token[] tokens)
924     {
925         if(peekChar == '*') // block comment
926         {
927             advanceChar();
928             while(peekChar != '\0')
929             {
930                 if(peekChar == '*')
931                 {
932                     advanceChar();
933                     if(peekChar == '/')
934                         break;
935                 }
936                 advanceChar();
937             }
938             advanceChar();
939         }
940         else if(peekChar == '/') // comment
941         {
942             advanceChar();
943             while(peekChar != '\n' && peekChar != '\0')
944             {
945                 advanceChar();
946             }
947         }
948         else if(canMakeRegex(tokens))
949         {
950             string accum = "";
951             auto startPos = _position;
952             accum ~= currentChar;
953             bool gettingFlags = false;
954             advanceChar();
955             while(currentChar)
956             {
957                 if(!gettingFlags)
958                 {
959                     if(currentChar == '\\')
960                     {
961                         accum ~= currentChar;
962                         advanceChar();
963                         if(currentChar)
964                         {
965                             accum ~= currentChar;
966                             advanceChar();
967                         }
968                     }
969                     else if(currentChar == '/')
970                     {
971                         accum ~= currentChar;
972                         advanceChar();
973                         gettingFlags = true;
974                     }
975                     else
976                     {
977                         accum ~= currentChar;
978                         advanceChar();
979                     }
980                 }
981                 else
982                 {
983                     if(!isAlpha(currentChar))
984                         break;
985                     accum ~= currentChar;
986                     advanceChar();
987                 }
988             }
989             --_index;
990             bool valid;
991             try 
992             {
993                 auto extracted = extract(accum);
994                 valid = isValid(extracted[0], extracted[1]);
995             }
996             catch(Exception ex)
997             {
998                 throw new ScriptCompileException("Malformed regex literal", Token.createInvalidToken(startPos, accum));
999             }
1000             if(!valid)
1001                 throw new ScriptCompileException("Invalid regex literal", Token.createInvalidToken(startPos, accum));
1002             tokens ~= Token(Token.Type.REGEX, startPos, accum);
1003         }
1004         else if(peekChar == '=')
1005         {
1006             immutable startpos = _position;
1007             advanceChar();
1008             tokens ~= Token(Token.Type.FSLASH_ASSIGN, startpos);
1009         }
1010         else
1011         {
1012             tokens ~= Token(Token.Type.FSLASH, _position);
1013         }
1014 
1015         return tokens;
1016     }
1017 
    /// Line and column tracked alongside _index; starts at line 1, column 1
    Position _position = {1, 1};
    /// The complete source text being tokenized
    string _text;
    /// Index into _text of the character currently being examined
    size_t _index = 0;
1021 }
1022 
unittest
{
    // two doubles, an integer and a string; the trailing space is part of the input
    immutable source = "1.2 34 5.e-99 'foo' ";
    auto tokens = Lexer(source).tokenize();

    immutable expectedTypes = [
        Token.Type.DOUBLE, Token.Type.INTEGER, Token.Type.DOUBLE, Token.Type.STRING
    ];
    foreach(i, expected; expectedTypes)
        assert(tokens[i].type == expected);
    assert(tokens[3].text == "foo");
    // TODO complete unit tests of every token type
}