1 /**
2 This module implements the Token and Lexer structs
3 
4 ────────────────────────────────────────────────────────────────────────────────
5 
6 Copyright (C) 2021 pillager86.rf.gd
7 
8 This program is free software: you can redistribute it and/or modify it under 
9 the terms of the GNU General Public License as published by the Free Software 
10 Foundation, either version 3 of the License, or (at your option) any later 
11 version.
12 
13 This program is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License along with 
18 this program.  If not, see <https://www.gnu.org/licenses/>.
19 */
20 module mildew.lexer;
21 
22 import mildew.exceptions: ScriptCompileException;
23 import mildew.util.regex;
24 
25 import std.ascii; // temp until unicode support
26 import std.container.rbtree;
27 import std.conv: to;
28 import std.format: format;
29 import std.utf: encode;
30 
/**
 * A 1-based line/column pair locating a character (and therefore a token) in the script text.
 */
struct Position
{
    /// Line and column number.
    int line, column;

    /// Formats the position as "line L, column C" for use in diagnostics.
    string toString() const 
    {
        return format("line %s, column %s", line, column);
    }

    /**
     * Updates the line and column according to the character that was read.
     * Params:
     *  ch = The character read. A NUL leaves the position untouched, a line feed moves to
     *       column 1 of the next line, and any other character moves one column to the right.
     */
    void advance(char ch)
    {
        switch(ch)
        {
        case '\0':
            // NUL is what the lexer reports past the end of input; nothing to count
            break;
        case '\n':
            ++line;
            column = 1;
            break;
        default:
            ++column;
            break;
        }
    }
}
63 
/**
 * This struct represents a token, a fundamental building block of all scripts. The code of a script
 * is first separated by token so that the parser can analyze each token.
 */
struct Token 
{
    /**
     * The type of a token.
     */
    enum Type 
    {
        EOF, KEYWORD, INTEGER, DOUBLE, STRING, IDENTIFIER, REGEX,
        NOT, AND, OR, GT, GE, LT, LE,
        EQUALS, NEQUALS, STRICT_EQUALS, STRICT_NEQUALS,
        ASSIGN, PLUS_ASSIGN, DASH_ASSIGN,
        PLUS, DASH, STAR, FSLASH, PERCENT, POW, DOT,
        INC, DEC, // ++ and --
        BIT_AND, BIT_XOR, BIT_OR, BIT_NOT, BIT_LSHIFT, BIT_RSHIFT, BIT_URSHIFT,
        LPAREN, RPAREN, LBRACE, RBRACE, LBRACKET, RBRACKET, 
        SEMICOLON, COMMA, LABEL, QUESTION, COLON, ARROW,
        
        INVALID
    }

    /**
     * This enum is for literal value tokens that require special handling by the parser
     */
    enum LiteralFlag
    {
        NONE, BINARY, OCTAL, HEXADECIMAL, TEMPLATE_STRING
    }

    /// Type of token
    Type type;
    /// Position where token occurs
    Position position;
    /// Optional text for keywords, identifiers, labels and literal values
    string text;
    /// Optional flag: the radix of an integer literal, or TEMPLATE_STRING for a back-quoted string.
    LiteralFlag literalFlag = LiteralFlag.NONE;

    /**
     * Returns a string representing the type of the token and the optional text if present.
     * The form is "[TYPE]" when the text is empty and "[TYPE|text]" otherwise.
     */
    string toString() const
    {
        string str = format("[%s", type.to!string);
        // comparing an array against null is true for any empty string, not only a null one
        if(text != null)
            str ~= "|" ~ text;
        str ~= "]";
        return str;
    }

    /**
     * Returns a textual representation of the token as it was found in the original script source code.
     * Tokens that carry text return that text (a LABEL gets its trailing colon back); operators and
     * punctuation return their fixed spelling; EOF returns "\0" and INVALID returns "#".
     */
    string symbol() const
    {
        final switch(type)
        {
        case Type.EOF:
            return "\0";
        case Type.KEYWORD: case Type.INTEGER: case Type.DOUBLE: case Type.STRING: case Type.IDENTIFIER: case Type.REGEX:
            return text;
        case Type.NOT: return "!";
        case Type.AND: return "&&";
        case Type.OR: return "||";
        case Type.GT: return ">";
        case Type.GE: return ">=";
        case Type.LT: return "<";
        case Type.LE: return "<=";
        case Type.EQUALS: return "==";
        case Type.NEQUALS: return "!=";
        case Type.STRICT_EQUALS: return "===";
        case Type.STRICT_NEQUALS: return "!==";
        case Type.ASSIGN: return "=";
        case Type.PLUS_ASSIGN: return "+=";
        case Type.DASH_ASSIGN: return "-=";
        case Type.PLUS: return "+";
        case Type.DASH: return "-";
        case Type.STAR: return "*";
        case Type.FSLASH: return "/";
        case Type.PERCENT: return "%";
        case Type.POW: return "**";
        case Type.DOT: return ".";
        case Type.INC: return "++";
        case Type.DEC: return "--"; 
        case Type.BIT_AND: return "&";
        case Type.BIT_XOR: return "^";
        case Type.BIT_OR: return "|";
        case Type.BIT_NOT: return "~";
        case Type.BIT_LSHIFT: return "<<";
        case Type.BIT_RSHIFT: return ">>";
        case Type.BIT_URSHIFT: return ">>>";
        case Type.LPAREN: return "(";
        case Type.RPAREN: return ")";
        case Type.LBRACE: return "{";
        case Type.RBRACE: return "}";
        case Type.LBRACKET: return "[";
        case Type.RBRACKET: return "]";
        case Type.SEMICOLON: return ";";
        case Type.COMMA: return ",";
        case Type.LABEL: return text ~ ":";
        case Type.QUESTION: return "?";
        case Type.COLON: return ":";
        case Type.ARROW: return "=>";
        case Type.INVALID: return "#";
        }
    }

    /**
     * Returns true if a token is both a keyword and a specific keyword.
     */
    bool isKeyword(in string keyword) const
    {
        return (type == Type.KEYWORD && text == keyword);
    }

    /**
     * Returns true if the token is an identifier whose text equals id.
     */
    bool isIdentifier(in string id) const 
    {
        return (type == Type.IDENTIFIER && text == id);
    }

    /**
     * Returns true if the token is one of the assignment operators =, += or -=.
     * The method only reads the token, so it is const like the other queries and can be
     * called on const or immutable tokens.
     */
    bool isAssignmentOperator() const
    {
        return (type == Type.ASSIGN || type == Type.PLUS_ASSIGN || type == Type.DASH_ASSIGN);
    }

    /**
     * Generates an invalid token at the given position. This is used by the Lexer to throw
     * an exception that requires a token.
     * Params:
     *  pos = Where the offending input was found.
     *  text = Optional text to attach to the token, such as the offending characters.
     * Returns: A token of type INVALID.
     */
    static Token createInvalidToken(in Position pos, in string text="")
    {
        return Token(Token.Type.INVALID, pos, text);
    }

    /**
     * Used by the parser. Builds a token of the given type and text that does not come from
     * source code; its position is line 0, column 0.
     */
    static Token createFakeToken(in Type t, in string txt)
    {
        Token tok;
        tok.type = t;
        tok.position = Position(0,0);
        tok.text = txt;
        return tok;
    }
}
220 
/// Tells whether ch may begin a keyword or identifier: an ASCII letter, '_' or '$'.
private bool startsKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    switch(ch)
    {
    case '_':
    case '$':
        return true;
    default:
        return isAlpha(ch);
    }
}
226 
/// Tells whether ch may appear after the first character of a keyword or identifier:
/// an ASCII letter or digit, '_' or '$'.
private bool continuesKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    if(ch == '_' || ch == '$')
        return true;
    return isAlphaNum(ch);
}
232 
/**
 * Tells whether ch may continue a number literal of the kind selected by lflag.
 * Params:
 *  ch = The character to classify.
 *  lflag = NONE for decimal literals, where digits, '.' and the exponent marker 'e'/'E' are
 *          accepted; HEXADECIMAL, OCTAL or BINARY for literals of that radix.
 * Returns: true if the character belongs to such a literal; always false for any other flag.
 */
private bool charIsValidDigit(in char ch, in Token.LiteralFlag lflag)
{
    switch(lflag)
    {
    case Token.LiteralFlag.NONE:
        // the exponent marker is case-insensitive, matching how the number lexer tests it
        return ch.isDigit || ch == '.' || ch.toLower == 'e';
    case Token.LiteralFlag.HEXADECIMAL:
        return ch.isHexDigit;
    case Token.LiteralFlag.OCTAL:
        return ch.isOctalDigit;
    case Token.LiteralFlag.BINARY:
        return ch == '0' || ch == '1';
    default:
        return false;
    }
}
245 
/**
 * Lexes code and returns the individual tokens.
 *
 * The lexer keeps an index into the text and a Position. The position always describes the
 * character at the index: every move goes through advanceChar, and no method moves the index
 * backwards.
 */
struct Lexer 
{
public:
    /// Constructor takes code as text to tokenize
    this(string code)
    {
        _text = code;
    }

    /**
     * Returns tokens from lexing a string of code.
     *
     * Each make* helper is entered with the index on the first character of its token and
     * leaves it on the last one; the advanceChar at the bottom of the loop then steps past it.
     * An EOF token is appended only when a NUL is read inside the loop, which happens when
     * trailing white space runs to the end of the text (or the text contains a NUL).
     * Returns: The tokens in source order; an empty array for empty text.
     * Throws: ScriptCompileException on a character that cannot start any token, or on a
     *         malformed literal.
     */
    Token[] tokenize()
    {
        Token[] tokens = [];
        if (_text == "")
            return tokens;
        while(_index < _text.length)
        {
            // ignore white space
            while(currentChar.isWhite())
                advanceChar();
            if(currentChar.startsKeywordOrIdentifier)
                tokens ~= makeIdKwOrLabel(tokens);
            else if(currentChar.isDigit)
                tokens ~= makeIntOrDoubleToken();
            else if(currentChar == '\'' || currentChar == '"' || currentChar == '`')
                tokens ~= makeStringToken(tokens);
            else if(currentChar == '>')
                tokens ~= makeRAngleBracketToken();
            else if(currentChar == '<')
                tokens ~= makeLAngleBracketToken();
            else if(currentChar == '=')
                tokens ~= makeEqualToken();
            else if(currentChar == '!')
                tokens ~= makeNotToken();
            else if(currentChar == '&')
                tokens ~= makeAndToken();
            else if(currentChar == '|')
                tokens ~= makeOrToken();
            else if(currentChar == '+')
                tokens ~= makePlusToken();
            else if(currentChar == '-')
                tokens ~= makeDashToken();
            else if(currentChar == '*')
                tokens ~= makeStarToken();
            else if(currentChar == '/')
                tokens = handleFSlash(tokens);
            else if(currentChar == '%')
                tokens ~= Token(Token.Type.PERCENT, _position);
            else if(currentChar == '^')
                tokens ~= Token(Token.Type.BIT_XOR, _position);
            else if(currentChar == '~')
                tokens ~= Token(Token.Type.BIT_NOT, _position);
            else if(currentChar == '(')
                tokens ~= Token(Token.Type.LPAREN, _position);
            else if(currentChar == ')')
                tokens ~= Token(Token.Type.RPAREN, _position);
            else if(currentChar == '{')
                tokens ~= Token(Token.Type.LBRACE, _position);
            else if(currentChar == '}')
                tokens ~= Token(Token.Type.RBRACE, _position);
            else if(currentChar == '[')
                tokens ~= Token(Token.Type.LBRACKET, _position);
            else if(currentChar == ']')
                tokens ~= Token(Token.Type.RBRACKET, _position);
            else if(currentChar == ';')
                tokens ~= Token(Token.Type.SEMICOLON, _position);
            else if(currentChar == ',')
                tokens ~= Token(Token.Type.COMMA, _position);
            else if(currentChar == '.')
                tokens ~= Token(Token.Type.DOT, _position);
            else if(currentChar == ':')
                tokens ~= Token(Token.Type.COLON, _position);
            else if(currentChar == '?')
                tokens ~= Token(Token.Type.QUESTION, _position);
            else if(currentChar == '\0')
                tokens ~= Token(Token.Type.EOF, _position);
            else
                throw new ScriptCompileException("Invalid character " ~ currentChar, 
                    Token.createInvalidToken(_position, [currentChar]));
            advanceChar();
        }
        return tokens;
    }

    /// Sorted set (red-black tree) of the reserved words that are lexed as KEYWORD tokens
    static immutable KEYWORDS = redBlackTree(
        "true", "false", "undefined", "null",
        "var", "let", "const", 
        "if", "else", "while", "do", "for", "in",
        "switch", "case", "default",
        "break", "continue", "return", 
        "function", "class", "super", "extends",
        "new", "delete", "typeof", "instanceof",
        "throw", "try", "catch", "finally", 
        "yield"
    );

    /// AA of look up for escape chars based on character after \
    static immutable char[char] ESCAPE_CHARS;

    /// Initializes the associative array of escape chars
    shared static this()
    {
        ESCAPE_CHARS = [
            'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', 
            '0': '\0', '\'': '\'', '"': '"', '\\': '\\'
        ];
    }

private:

    /// Steps to the next character. The position is updated from the character being left,
    /// so a line feed moves the following character to column 1 of the next line.
    void advanceChar()
    {
        _position.advance(currentChar());
        ++_index;
    }

    /// The character at the index, or NUL once the index is past the end of the text.
    char currentChar()
    {
        if(_index < _text.length)
            return _text[_index];
        else
            return '\0';
    }

    /// The character after the current one, or NUL if there is none.
    char peekChar()
    {
        if(_index + 1 < _text.length)
            return _text[_index + 1];
        else
            return '\0';
    }

    /// Decides from the previously lexed token whether a '/' starts a regex literal (true)
    /// or is a division operator (false). A '/' after a value-like token (identifier, literal,
    /// ')', ']', ++, --, or the keywords null/true/false) is treated as division.
    bool canMakeRegex(Token[] tokens)
    {
        if(tokens.length == 0)
            return true;
        switch(tokens[$-1].type)
        {
        case Token.Type.IDENTIFIER:
        case Token.Type.INTEGER:
        case Token.Type.DOUBLE:
        case Token.Type.STRING:
        case Token.Type.RBRACKET:
        case Token.Type.RPAREN:
        case Token.Type.INC:
        case Token.Type.DEC:
            return false;
        case Token.Type.KEYWORD:
            switch(tokens[$-1].text)
            {
            case "null":
            case "true":
            case "false":
                return false;
            default:
                return true;
            }
        default:
            return true;
        }
    }

    /// Lexes a word starting at the current character into a KEYWORD, LABEL or IDENTIFIER.
    /// tokens is the list lexed so far; it is only inspected, to treat "return" after '.'
    /// as an identifier.
    Token makeIdKwOrLabel(Token[] tokens)
    {
        immutable start = _index;
        immutable startpos = _position;
        // scan by peeking so the index stops on the last character of the word
        while(peekChar.continuesKeywordOrIdentifier)
            advanceChar();
        auto text = _text[start.._index+1];
        // first check for keyword, that can't be a label

        // return is a special case after "."
        if(text == "return")
        {
            if(tokens.length > 0 && tokens[$-1].type == Token.Type.DOT)
                return Token(Token.Type.IDENTIFIER, startpos, text);
        }

        if(text in KEYWORDS)
        {
            return Token(Token.Type.KEYWORD, startpos, text);
        }
        else if(peekChar == ':')
        {
            // the colon belongs to the label token
            advanceChar();
            return Token(Token.Type.LABEL, startpos, text);
        }
        else
        {
            return Token(Token.Type.IDENTIFIER, startpos, text);
        }
    }

    /// Lexes a number literal starting at the current digit. Produces INTEGER unless a '.'
    /// or exponent was seen, in which case DOUBLE; a 0x/0o/0b prefix is recorded in literalFlag.
    /// Throws ScriptCompileException on malformed literals.
    Token makeIntOrDoubleToken()
    {
        immutable start = _index;
        immutable startpos = _position;
        auto dotCounter = 0;
        auto eCounter = 0;
        Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
        if(peekChar.toLower == 'x')
        {
            lflag = Token.LiteralFlag.HEXADECIMAL;
            advanceChar();
        }
        else if(peekChar.toLower == 'o')
        {
            lflag = Token.LiteralFlag.OCTAL;
            advanceChar();
        }
        else if(peekChar.toLower == 'b')
        {
            lflag = Token.LiteralFlag.BINARY;
            advanceChar();
        }
        // if the lflag was set, the first char has to be 0
        if(lflag != Token.LiteralFlag.NONE && _text[start] != '0')
            throw new ScriptCompileException("Malformed integer literal", Token.createInvalidToken(startpos));
        
        while(peekChar.charIsValidDigit(lflag))
        {
            advanceChar();
            if(lflag == Token.LiteralFlag.NONE)
            {
                if(currentChar == '.')
                {
                    ++dotCounter;
                    if(dotCounter > 1)
                        throw new ScriptCompileException("Too many decimals in number literal", 
                            Token.createInvalidToken(_position));
                }
                else if(currentChar.toLower == 'e')
                {
                    ++eCounter;
                    if(eCounter > 1)
                        throw new ScriptCompileException("Numbers can only have one exponent specifier", 
                            Token.createInvalidToken(_position));
                    // an optional sign may follow the exponent marker
                    if(peekChar == '+' || peekChar == '-')
                        advanceChar();
                    if(!peekChar.isDigit)
                        throw new ScriptCompileException("Exponent specifier must be followed by number", 
                            Token.createInvalidToken(_position));
                }
            }
        }
        auto text = _text[start.._index+1];
        // a radix prefix on its own ("0x") is not a number
        if(lflag != Token.LiteralFlag.NONE && text.length <= 2)
            throw new ScriptCompileException("Malformed hex/octal/binary integer", Token.createInvalidToken(startpos));
        Token resultToken;
        if(dotCounter == 0 && eCounter == 0)
            resultToken = Token(Token.Type.INTEGER, startpos, text);
        else
            resultToken = Token(Token.Type.DOUBLE, startpos, text);
        resultToken.literalFlag = lflag;
        return resultToken;
    }

    /// Lexes a string literal delimited by ', " or `. Escape sequences are decoded unless the
    /// literal is preceded by the tokens String . raw, which are then removed from previous.
    /// Back-quoted strings are flagged TEMPLATE_STRING and may span lines.
    /// Throws ScriptCompileException on an unterminated literal or a bad escape sequence.
    Token makeStringToken(ref Token[] previous)
    {
        immutable closeQuote = currentChar;
        auto startpos = _position;
        advanceChar();
        string text = "";
        bool escapeChars = true;
        if(previous.length >= 3)
        {
            if(previous[$-1].isIdentifier("raw") &&
               previous[$-2].type == Token.Type.DOT &&
               previous[$-3].isIdentifier("String"))
            {
                escapeChars = false;
                previous = previous[0.. $-3];
            }
        }
        Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
        if(closeQuote == '`')
            lflag = Token.LiteralFlag.TEMPLATE_STRING;
        while(currentChar != closeQuote)
        {
            if(currentChar == '\0')
                throw new ScriptCompileException("Missing close quote for string literal", 
                    Token.createInvalidToken(_position, text));
            else if(currentChar == '\n' && lflag != Token.LiteralFlag.TEMPLATE_STRING)
                throw new ScriptCompileException("Line breaks inside regular string literals are not allowed", 
                    Token.createInvalidToken(_position, text));
            else if(currentChar == '\\' && escapeChars)
            {
                // every branch below leaves the index on the last character of the escape;
                // the advanceChar at the bottom of the loop steps past it
                advanceChar();
                if(currentChar in ESCAPE_CHARS)
                    text ~= ESCAPE_CHARS[currentChar];
                else if(currentChar == 'u')
                {
                    // \uXXXX (up to four hex digits) or \u{X...} (any number of hex digits)
                    string accum = "";
                    bool usingBraces = false;
                    int limitCounter;
                    immutable LIMIT = 4; // without the braces
                    if(peekChar == '{')
                    {
                        advanceChar();
                        usingBraces = true;
                    }
                    while(peekChar.charIsValidDigit(Token.LiteralFlag.HEXADECIMAL))
                    {
                        if(limitCounter >= LIMIT && !usingBraces)
                            break;
                        advanceChar();
                        accum ~= currentChar;
                        if(!usingBraces)
                            ++limitCounter;
                    }
                    if(peekChar == '}' && usingBraces)
                        advanceChar();
                    try 
                    {
                        dchar result = cast(dchar)to!uint(accum, 16);
                        char[] buf;
                        encode(buf, result);
                        text ~= buf;
                    }
                    catch(Exception ex)
                    {
                        throw new ScriptCompileException("Invalid UTF sequence in \\u char", 
                            Token.createInvalidToken(_position, accum));
                    }
                }
                else if(currentChar == 'x')
                {
                    // \xHH: exactly two characters are taken and must form a hex byte
                    advanceChar();
                    string accum = "";
                    accum ~= currentChar;
                    advanceChar();
                    accum ~= currentChar;
                    try 
                    {
                        char result = cast(char)to!ubyte(accum, 16);
                        text ~= result;
                    }
                    catch(Exception ex)
                    {
                        throw new ScriptCompileException("Invalid hexadecimal number in \\x char",
                            Token.createInvalidToken(_position, accum));
                    }
                }
                else
                    throw new ScriptCompileException("Unknown escape character " ~ currentChar, 
                        Token.createInvalidToken(_position));
            }
            else
                text ~= currentChar;
            advanceChar();
        }
        auto tok = Token(Token.Type.STRING, startpos, text);
        tok.literalFlag = lflag;
        return tok;
    }

    /// Lexes >, >=, >> or >>>.
    Token makeRAngleBracketToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.GE, startpos);
        }
        else if(peekChar == '>')
        {
            advanceChar();
            if(peekChar == '>')
            {
                advanceChar();
                return Token(Token.Type.BIT_URSHIFT, startpos);
            }
            else
            {
                return Token(Token.Type.BIT_RSHIFT, startpos);
            }
        }
        else
        {
            return Token(Token.Type.GT, startpos);
        }
    }

    /// Lexes <, <= or <<.
    Token makeLAngleBracketToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.LE, startpos);
        }
        else if(peekChar == '<')
        {
            advanceChar();
            return Token(Token.Type.BIT_LSHIFT, startpos);
        }
        else
        {
            return Token(Token.Type.LT, startpos);
        }
    }

    /// Lexes =, ==, === or =>. Every result carries the position of the first '='.
    Token makeEqualToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.STRICT_EQUALS, startpos);
            }
            else
            {
                return Token(Token.Type.EQUALS, startpos);
            }
        }
        else if(peekChar == '>')
        {
            advanceChar();
            return Token(Token.Type.ARROW, startpos);
        }
        else
        {
            return Token(Token.Type.ASSIGN, startpos);
        }
    }

    /// Lexes !, != or !==.
    Token makeNotToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.STRICT_NEQUALS, startpos);
            }
            else
            {
                return Token(Token.Type.NEQUALS, startpos);
            }
        }
        else
        {
            return Token(Token.Type.NOT, startpos);
        }
    }

    /// Lexes & or &&.
    Token makeAndToken()
    {
        auto startpos = _position;
        if(peekChar == '&')
        {
            advanceChar();
            return Token(Token.Type.AND, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_AND, startpos);
        }
    }

    /// Lexes | or ||.
    Token makeOrToken()
    {
        auto startpos = _position;
        if(peekChar == '|')
        {
            advanceChar();
            return Token(Token.Type.OR, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_OR, startpos);
        }
    }

    /// Lexes +, ++ or +=.
    Token makePlusToken()
    {
        auto startpos = _position;
        if(peekChar == '+')
        {
            advanceChar();
            return Token(Token.Type.INC, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.PLUS_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.PLUS, startpos);
        }
    }

    /// Lexes -, -- or -=.
    Token makeDashToken()
    {
        auto startpos = _position;
        if(peekChar == '-')
        {
            advanceChar();
            return Token(Token.Type.DEC, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.DASH_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.DASH, startpos);
        }
    }

    /// Lexes * or **.
    Token makeStarToken()
    {
        auto startpos = _position;
        if(peekChar == '*')
        {
            advanceChar();
            return Token(Token.Type.POW, startpos);
        }
        else
        {
            return Token(Token.Type.STAR, startpos);
        }
    }

    /// Handles a '/' at the current character: skips a block or line comment, or appends a
    /// REGEX or FSLASH token to tokens. Returns the (possibly extended) token list.
    /// Throws ScriptCompileException if a regex literal is malformed or invalid.
    Token[] handleFSlash(Token[] tokens)
    {
        if(peekChar == '*')
        {
            // block comment: step onto the opening '*', then consume up to the closing "*/"
            advanceChar();
            while(peekChar != '\0')
            {
                advanceChar();
                // test every '*' that was consumed, so that "**/" closes the comment too;
                // the opening '*' is never tested because the loop advances first
                if(currentChar == '*' && peekChar == '/')
                    break;
            }
            // step onto the closing '/' (or past the end for an unterminated comment)
            advanceChar();
        }
        else if(peekChar == '/')
        {
            // line comment: stop on the last character before the line feed
            advanceChar();
            while(peekChar != '\n' && peekChar != '\0')
            {
                advanceChar();
            }
        }
        else if(canMakeRegex(tokens))
        {
            string accum = "";
            auto startPos = _position;
            accum ~= currentChar;
            bool gettingFlags = false;
            // scan by peeking so the index stops on the last character of the literal
            while(peekChar)
            {
                if(!gettingFlags)
                {
                    advanceChar();
                    accum ~= currentChar;
                    if(currentChar == '\\')
                    {
                        // an escaped character is copied verbatim and can not close the regex
                        if(peekChar)
                        {
                            advanceChar();
                            accum ~= currentChar;
                        }
                    }
                    else if(currentChar == '/')
                    {
                        gettingFlags = true;
                    }
                }
                else
                {
                    // flags are the run of letters right after the closing '/'
                    if(!isAlpha(peekChar))
                        break;
                    advanceChar();
                    accum ~= currentChar;
                }
            }
            bool valid;
            try 
            {
                auto extracted = extract(accum);
                valid = isValid(extracted[0], extracted[1]);
            }
            catch(Exception ex)
            {
                throw new ScriptCompileException("Malformed regex literal", Token.createInvalidToken(startPos, accum));
            }
            if(!valid)
                throw new ScriptCompileException("Invalid regex literal", Token.createInvalidToken(startPos, accum));
            tokens ~= Token(Token.Type.REGEX, startPos, accum);
        }
        else
        {
            tokens ~= Token(Token.Type.FSLASH, _position);
        }

        return tokens;
    }

    /// Line and column of the character at _index
    Position _position = {1, 1};
    /// The script source being lexed
    string _text;
    /// Index into _text of the current character
    size_t _index = 0;
}
877 
unittest
{
    // A short sample holding two doubles, an integer and a quoted string
    auto sampleLexer = Lexer("1.2 34 5.e-99 'foo' ");
    auto lexed = sampleLexer.tokenize();
    immutable Token.Type[] expectedTypes = [
        Token.Type.DOUBLE, Token.Type.INTEGER, Token.Type.DOUBLE, Token.Type.STRING
    ];
    foreach(i, expected; expectedTypes)
        assert(lexed[i].type == expected);
    assert(lexed[3].text == "foo");
    // TODO complete unit tests of every token type
}