1 /**
2 This module implements the Token and Lexer structs
3 
4 ────────────────────────────────────────────────────────────────────────────────
5 
6 Copyright (C) 2021 pillager86.rf.gd
7 
8 This program is free software: you can redistribute it and/or modify it under 
9 the terms of the GNU General Public License as published by the Free Software 
10 Foundation, either version 3 of the License, or (at your option) any later 
11 version.
12 
13 This program is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License along with 
18 this program.  If not, see <https://www.gnu.org/licenses/>.
19 */
20 module mildew.lexer;
21 
22 import mildew.exceptions: ScriptCompileException;
23 import mildew.util.regex;
24 
25 import std.ascii; // temp until unicode support
26 import std.container.rbtree;
27 import std.conv: to;
28 import std.format: format;
29 import std.utf: encode;
30 
/**
 * A line/column pair locating a character (and therefore a token) inside the
 * script source. Both numbers are 1-based.
 */
struct Position
{
    /// Line and column number.
    int line, column;

    /// Renders the position as "line L, column C" for use in diagnostics.
    string toString() const 
    {
        return "line %s, column %s".format(line, column);
    }

    /**
     * Updates the position according to a single character.
     * Params:
     *  ch = The character to account for. A NUL leaves the position untouched,
     *       a line feed starts a new line at column 1 and any other character
     *       moves one column to the right.
     */
    void advance(char ch)
    {
        switch(ch)
        {
        case '\0':
            // NUL is the lexer's "no character" sentinel and occupies no space
            break;
        case '\n':
            line += 1;
            column = 1;
            break;
        default:
            column += 1;
            break;
        }
    }
}
63 
/**
 * This struct represents a token, a fundamental building block of all scripts. The code of a script
 * is first separated into tokens so that the parser can analyze each token.
 */
struct Token 
{
    /**
     * The type of a token.
     */
    enum Type 
    {
        EOF, KEYWORD, INTEGER, DOUBLE, STRING, IDENTIFIER, REGEX,
        NOT, AND, OR, GT, GE, LT, LE,
        EQUALS, NEQUALS, STRICT_EQUALS, STRICT_NEQUALS,

        ASSIGN, 
        POW_ASSIGN, STAR_ASSIGN, FSLASH_ASSIGN, PERCENT_ASSIGN,
        PLUS_ASSIGN, DASH_ASSIGN,
        BAND_ASSIGN, BXOR_ASSIGN, BOR_ASSIGN, BLS_ASSIGN, BRS_ASSIGN, BURS_ASSIGN,

        PLUS, DASH, STAR, FSLASH, PERCENT, POW, DOT, TDOT,
        INC, DEC, // ++ and --
        BIT_AND, BIT_XOR, BIT_OR, BIT_NOT, BIT_LSHIFT, BIT_RSHIFT, BIT_URSHIFT,
        LPAREN, RPAREN, LBRACE, RBRACE, LBRACKET, RBRACKET, 
        SEMICOLON, COMMA, LABEL, QUESTION, COLON, ARROW,
        
        INVALID
    }

    /**
     * This enum is for literal value tokens that require special handling by the parser
     */
    enum LiteralFlag
    {
        NONE, BINARY, OCTAL, HEXADECIMAL, TEMPLATE_STRING
    }

    /// Type of token
    Type type;
    /// Position where token occurs
    Position position;
    /// Optional text: the lexeme of keywords, identifiers, labels and literals
    string text;
    /// Optional flag for literals: the radix of an integer literal or the template string marker.
    LiteralFlag literalFlag = LiteralFlag.NONE;

    /**
     * Returns a string representing the type of the token and the optional text if present,
     * in the form "[TYPE]" or "[TYPE|text]".
     */
    string toString() const
    {
        string str = format("[%s", type.to!string);
        // comparing against null is a content comparison, so this is true for any non-empty text
        if(text != null)
            str ~= "|" ~ text;
        str ~= "]";
        return str;
    }

    /**
     * Returns a textual representation of the token as it was found in the original script source code.
     * Tokens that carry text return that text (labels get their trailing colon back); all other
     * tokens return their fixed operator or punctuation spelling.
     */
    string symbol() const
    {
        final switch(type)
        {
        case Type.EOF:
            return "\0";
        case Type.KEYWORD: case Type.INTEGER: case Type.DOUBLE: case Type.STRING: case Type.IDENTIFIER: case Type.REGEX:
            return text;
        case Type.NOT: return "!";
        case Type.AND: return "&&";
        case Type.OR: return "||";
        case Type.GT: return ">";
        case Type.GE: return ">=";
        case Type.LT: return "<";
        case Type.LE: return "<=";
        case Type.EQUALS: return "==";
        case Type.NEQUALS: return "!=";
        case Type.STRICT_EQUALS: return "===";
        case Type.STRICT_NEQUALS: return "!==";
        case Type.ASSIGN: return "=";
        case Type.POW_ASSIGN: return "**=";
        case Type.STAR_ASSIGN: return "*=";
        case Type.FSLASH_ASSIGN: return "/=";
        case Type.PERCENT_ASSIGN: return "%=";
        case Type.PLUS_ASSIGN: return "+=";
        case Type.DASH_ASSIGN: return "-=";
        case Type.BAND_ASSIGN: return "&=";
        case Type.BXOR_ASSIGN: return "^=";
        case Type.BOR_ASSIGN: return "|=";
        case Type.BLS_ASSIGN: return "<<=";
        case Type.BRS_ASSIGN: return ">>=";
        case Type.BURS_ASSIGN: return ">>>=";
        case Type.PLUS: return "+";
        case Type.DASH: return "-";
        case Type.STAR: return "*";
        case Type.FSLASH: return "/";
        case Type.PERCENT: return "%";
        case Type.POW: return "**";
        case Type.DOT: return ".";
        case Type.TDOT: return "...";
        case Type.INC: return "++";
        case Type.DEC: return "--"; 
        case Type.BIT_AND: return "&";
        case Type.BIT_XOR: return "^";
        case Type.BIT_OR: return "|";
        case Type.BIT_NOT: return "~";
        case Type.BIT_LSHIFT: return "<<";
        case Type.BIT_RSHIFT: return ">>";
        case Type.BIT_URSHIFT: return ">>>";
        case Type.LPAREN: return "(";
        case Type.RPAREN: return ")";
        case Type.LBRACE: return "{";
        case Type.RBRACE: return "}";
        case Type.LBRACKET: return "[";
        case Type.RBRACKET: return "]";
        case Type.SEMICOLON: return ";";
        case Type.COMMA: return ",";
        case Type.LABEL: return text ~ ":";
        case Type.QUESTION: return "?";
        case Type.COLON: return ":";
        case Type.ARROW: return "=>";
        case Type.INVALID: return "#";
        }
    }

    /**
     * Returns true if a token is both a keyword and a specific keyword.
     * Params:
     *  keyword = The exact keyword text to compare against.
     */
    bool isKeyword(in string keyword) const
    {
        return (type == Type.KEYWORD && text == keyword);
    }

    /**
     * Returns true if the token is an identifier with exactly the given name.
     * Params:
     *  id = The identifier text to compare against.
     */
    bool isIdentifier(in string id) const 
    {
        return (type == Type.IDENTIFIER && text == id);
    }

    /**
     * Returns true if the token is an assignment operator such as =, +=, or -=, etc.
     * The method is const so that it can be used on const tokens like the other queries.
     */
    bool isAssignmentOperator() const
    {
        switch(type)
        {
        case Type.ASSIGN:
        case Type.POW_ASSIGN:
        case Type.STAR_ASSIGN:
        case Type.FSLASH_ASSIGN:
        case Type.PERCENT_ASSIGN:
        case Type.PLUS_ASSIGN:
        case Type.DASH_ASSIGN:
        case Type.BAND_ASSIGN:
        case Type.BXOR_ASSIGN:
        case Type.BOR_ASSIGN:
        case Type.BLS_ASSIGN:
        case Type.BRS_ASSIGN:
        case Type.BURS_ASSIGN:
            return true;
        default:
            return false;
        }
    }

    /**
     * Generates an invalid token at the given position. This is used by the Lexer to throw
     * an exception that requires a token.
     * Params:
     *  pos = Where the offending input was found.
     *  text = Optional offending text to attach to the token.
     * Returns: A token of type INVALID.
     */
    static Token createInvalidToken(in Position pos, in string text="")
    {
        return Token(Token.Type.INVALID, pos, text);
    }

    /**
     * Used by the parser to build a token that does not come from source text.
     * Params:
     *  t = The type of the token.
     *  txt = The text of the token.
     * Returns: A token of the requested type and text located at line 0, column 0.
     */
    static Token createFakeToken(in Type t, in string txt)
    {
        return Token(t, Position(0, 0), txt);
    }
}
249 
/// Returns true if ch may begin a keyword or identifier: an ASCII letter, '_' or '$'.
private bool startsKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    if(ch == '_' || ch == '$')
        return true;
    return isAlpha(ch);
}
255 
/// Returns true if ch may appear after the first character of a keyword or
/// identifier: an ASCII letter or digit, '_' or '$'.
private bool continuesKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    if(ch == '_' || ch == '$')
        return true;
    return isAlphaNum(ch);
}
261 
/**
 * Tells whether a character may be part of a number literal of the given kind.
 * Params:
 *  ch = The character to test.
 *  lflag = The kind of literal being read. NONE means a plain decimal number,
 *          which may also contain a decimal point and an exponent marker.
 * Returns: true if ch is acceptable inside such a literal. TEMPLATE_STRING is
 *          not a numeric kind and always yields false.
 */
private bool charIsValidDigit(in char ch, in Token.LiteralFlag lflag)
{
    final switch(lflag)
    {
    case Token.LiteralFlag.NONE:
        // both 'e' and 'E' introduce an exponent; the number scanner compares
        // with toLower, so rejecting 'E' here split "1E5" into two tokens
        return ch.isDigit || ch == '.' || ch == 'e' || ch == 'E';
    case Token.LiteralFlag.HEXADECIMAL:
        return ch.isHexDigit;
    case Token.LiteralFlag.OCTAL:
        return ch.isOctalDigit;
    case Token.LiteralFlag.BINARY:
        return ch == '0' || ch == '1';
    case Token.LiteralFlag.TEMPLATE_STRING:
        return false;
    }
}
274 
/**
 * Lexes code and returns the individual tokens.
 *
 * The lexer walks the source text one char at a time. Every helper that builds a token is
 * entered with the index on the first char of the token and leaves the index on the LAST char
 * of the token; the main loop in tokenize then steps past it. The tracked position always
 * describes the char at the current index.
 */
struct Lexer 
{
public:
    /// Constructor takes code as text to tokenize
    this(string code)
    {
        _text = code;
    }

    /**
     * Returns tokens from lexing the string of code given to the constructor.
     *
     * An EOF token is only appended when a NUL is reached after skipping white space (for
     * instance when the text ends in white space); otherwise the array simply ends with the
     * last real token. An empty text yields an empty array.
     *
     * Throws: ScriptCompileException on a character that cannot start any token or on a
     *         malformed literal.
     */
    Token[] tokenize()
    {
        Token[] tokens = [];
        if (_text == "")
            return tokens;
        while(_index < _text.length)
        {
            // ignore white space
            while(currentChar.isWhite())
                advanceChar();
            if(currentChar.startsKeywordOrIdentifier)
                tokens ~= makeIdKwOrLabel(tokens);
            else if(currentChar.isDigit)
                tokens ~= makeIntOrDoubleToken();
            else if(currentChar == '\'' || currentChar == '"' || currentChar == '`')
                tokens ~= makeStringToken(tokens);
            else if(currentChar == '>')
                tokens ~= makeRAngleBracketToken();
            else if(currentChar == '<')
                tokens ~= makeLAngleBracketToken();
            else if(currentChar == '=')
                tokens ~= makeEqualToken();
            else if(currentChar == '!')
                tokens ~= makeNotToken();
            else if(currentChar == '&')
                tokens ~= makeAndToken();
            else if(currentChar == '|')
                tokens ~= makeOrToken();
            else if(currentChar == '+')
                tokens ~= makePlusToken();
            else if(currentChar == '-')
                tokens ~= makeDashToken();
            else if(currentChar == '*')
                tokens ~= makeStarToken();
            else if(currentChar == '/')
                tokens = handleFSlash(tokens);
            else if(currentChar == '%')
                tokens ~= makePercentToken();
            else if(currentChar == '^')
                tokens ~= makeXorToken();
            else if(currentChar == '~')
                tokens ~= Token(Token.Type.BIT_NOT, _position);
            else if(currentChar == '(')
                tokens ~= Token(Token.Type.LPAREN, _position);
            else if(currentChar == ')')
                tokens ~= Token(Token.Type.RPAREN, _position);
            else if(currentChar == '{')
                tokens ~= Token(Token.Type.LBRACE, _position);
            else if(currentChar == '}')
                tokens ~= Token(Token.Type.RBRACE, _position);
            else if(currentChar == '[')
                tokens ~= Token(Token.Type.LBRACKET, _position);
            else if(currentChar == ']')
                tokens ~= Token(Token.Type.RBRACKET, _position);
            else if(currentChar == ';')
                tokens ~= Token(Token.Type.SEMICOLON, _position);
            else if(currentChar == ',')
                tokens ~= Token(Token.Type.COMMA, _position);
            else if(currentChar == '.')
                tokens ~= makeDotTokens();
            else if(currentChar == ':')
                tokens ~= Token(Token.Type.COLON, _position);
            else if(currentChar == '?')
                tokens ~= Token(Token.Type.QUESTION, _position);
            else if(currentChar == '\0')
                tokens ~= Token(Token.Type.EOF, _position);
            else
                throw new ScriptCompileException("Invalid character " ~ currentChar, 
                    Token.createInvalidToken(_position, [currentChar]));
            // every branch leaves the index on the last char of its token
            advanceChar();
        }
        return tokens;
    }

    /// Set of reserved keywords (stored in a red-black tree)
    static immutable KEYWORDS = redBlackTree(
        "true", "false", "undefined", "null",
        "var", "let", "const", 
        "if", "else", "while", "do", "for", "in",
        "switch", "case", "default",
        "break", "continue", "return", 
        "function", "class", "super", "extends",
        "new", "delete", "typeof", "instanceof",
        "throw", "try", "catch", "finally", 
        "yield"
    );

    /// AA of look up for escape chars based on character after \
    static immutable char[char] ESCAPE_CHARS;

    /// Initializes the associative array of escape chars
    shared static this()
    {
        ESCAPE_CHARS = [
            'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', 
            '0': '\0', '\'': '\'', '"': '"', '\\': '\\'
        ];
    }

private:

    /// Moves to the next char. The position is advanced by the char being LEFT, so that
    /// _position always describes the char at _index: a line feed is reported on its own
    /// line and the first char of every line is column 1.
    void advanceChar()
    {
        _position.advance(currentChar());
        ++_index;
    }

    /// Returns the char at the current index, or NUL when past the end of the text.
    char currentChar()
    {
        if(_index < _text.length)
            return _text[_index];
        else
            return '\0';
    }

    /// Returns the char after the current one without consuming it, or NUL when there is none.
    char peekChar()
    {
        if(_index + 1 < _text.length)
            return _text[_index + 1];
        else
            return '\0';
    }

    /// Decides from the previous token whether a '/' may start a regex literal. After a value
    /// (identifier, literal, closing bracket/paren, ++/--, or null/true/false) it cannot.
    bool canMakeRegex(Token[] tokens)
    {
        if(tokens.length == 0)
            return true;
        switch(tokens[$-1].type)
        {
        case Token.Type.IDENTIFIER:
        case Token.Type.INTEGER:
        case Token.Type.DOUBLE:
        case Token.Type.STRING:
        case Token.Type.RBRACKET:
        case Token.Type.RPAREN:
        case Token.Type.INC:
        case Token.Type.DEC:
            return false;
        case Token.Type.KEYWORD:
            switch(tokens[$-1].text)
            {
            case "null":
            case "true":
            case "false":
                return false;
            default:
                return true;
            }
        default:
            return true;
        }
    }

    /// Reads a word and classifies it as KEYWORD, LABEL (word directly followed by ':') or
    /// IDENTIFIER. A few keywords are treated as identifiers when they follow a dot.
    Token makeIdKwOrLabel(Token[] tokens)
    {
        immutable start = _index;
        immutable startpos = _position;
        // look ahead instead of overshooting and rewinding the index: a rewind made the
        // following advanceChar count the same char twice in the position
        while(peekChar.continuesKeywordOrIdentifier)
            advanceChar();
        auto text = _text[start.._index+1];
        // first check for keyword, that can't be a label

        // these words can be used as object members but not as labels
        if(text == "return" || text == "throw" || text == "delete"
         || text == "catch" || text == "finally")
        {
            if(tokens.length > 0 && tokens[$-1].type == Token.Type.DOT)
                return Token(Token.Type.IDENTIFIER, startpos, text);
        }

        if(text in KEYWORDS)
        {
            return Token(Token.Type.KEYWORD, startpos, text);
        }
        else if(peekChar == ':')
        {
            advanceChar(); // consume the ':' as part of the label
            return Token(Token.Type.LABEL, startpos, text);
        }
        else
        {
            return Token(Token.Type.IDENTIFIER, startpos, text);
        }
    }

    /// Reads a number literal: decimal integer or double (one '.' and one exponent at most),
    /// or a 0x / 0o / 0b prefixed integer whose radix is recorded in the literal flag.
    Token makeIntOrDoubleToken()
    {
        immutable start = _index;
        immutable startpos = _position;
        auto dotCounter = 0;
        auto eCounter = 0;
        Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
        if(peekChar.toLower == 'x')
        {
            lflag = Token.LiteralFlag.HEXADECIMAL;
            advanceChar();
        }
        else if(peekChar.toLower == 'o')
        {
            lflag = Token.LiteralFlag.OCTAL;
            advanceChar();
        }
        else if(peekChar.toLower == 'b')
        {
            lflag = Token.LiteralFlag.BINARY;
            advanceChar();
        }
        // if the lflag was set, the first char has to be 0
        if(lflag != Token.LiteralFlag.NONE && _text[start] != '0')
            throw new ScriptCompileException("Malformed integer literal", Token.createInvalidToken(startpos));
        
        while(peekChar.charIsValidDigit(lflag))
        {
            advanceChar();
            if(lflag == Token.LiteralFlag.NONE)
            {
                if(currentChar == '.')
                {
                    ++dotCounter;
                    if(dotCounter > 1)
                        throw new ScriptCompileException("Too many decimals in number literal", 
                            Token.createInvalidToken(_position));
                }
                else if(currentChar.toLower == 'e')
                {
                    ++eCounter;
                    if(eCounter > 1)
                        throw new ScriptCompileException("Numbers can only have one exponent specifier", 
                            Token.createInvalidToken(_position));
                    // optional sign of the exponent
                    if(peekChar == '+' || peekChar == '-')
                        advanceChar();
                    if(!peekChar.isDigit)
                        throw new ScriptCompileException("Exponent specifier must be followed by number", 
                            Token.createInvalidToken(_position));
                }
            }
        }
        auto text = _text[start.._index+1];
        // a radix prefix with no digits after it ("0x") is not a number
        if(lflag != Token.LiteralFlag.NONE && text.length <= 2)
            throw new ScriptCompileException("Malformed hex/octal/binary integer", Token.createInvalidToken(startpos));
        Token resultToken;
        if(dotCounter == 0 && eCounter == 0)
            resultToken = Token(Token.Type.INTEGER, startpos, text);
        else
            resultToken = Token(Token.Type.DOUBLE, startpos, text);
        resultToken.literalFlag = lflag;
        return resultToken;
    }

    /// Reads a string literal delimited by ', " or ` (template string). Escape sequences are
    /// decoded unless the literal directly follows `String.raw`, in which case those three
    /// tokens are removed from previous and the text is taken verbatim.
    Token makeStringToken(ref Token[] previous)
    {
        immutable closeQuote = currentChar;
        auto startpos = _position;
        advanceChar();
        string text = "";
        bool escapeChars = true;
        if(previous.length >= 3)
        {
            if(previous[$-1].isIdentifier("raw") &&
               previous[$-2].type == Token.Type.DOT &&
               previous[$-3].isIdentifier("String"))
            {
                escapeChars = false;
                previous = previous[0.. $-3];
            }
        }
        Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
        if(closeQuote == '`')
            lflag = Token.LiteralFlag.TEMPLATE_STRING;
        while(currentChar != closeQuote)
        {
            if(currentChar == '\0')
                throw new ScriptCompileException("Missing close quote for string literal", 
                    Token.createInvalidToken(_position, text));
            else if(currentChar == '\n' && lflag != Token.LiteralFlag.TEMPLATE_STRING)
                throw new ScriptCompileException("Line breaks inside regular string literals are not allowed", 
                    Token.createInvalidToken(_position, text));
            else if(currentChar == '\\' && escapeChars)
            {
                advanceChar();
                if(currentChar in ESCAPE_CHARS)
                    text ~= ESCAPE_CHARS[currentChar];
                else if(currentChar == 'u')
                {
                    // \uXXXX (at most four hex digits) or \u{X...} (any number of hex digits).
                    // The digits are read by look-ahead so the index ends on the last char of
                    // the escape without having to be rewound.
                    string accum = "";
                    bool usingBraces = false;
                    int limitCounter;
                    immutable LIMIT = 4; // without the braces
                    if(peekChar == '{')
                    {
                        advanceChar();
                        usingBraces = true;
                    }
                    while(peekChar.charIsValidDigit(Token.LiteralFlag.HEXADECIMAL))
                    {
                        if(limitCounter >= LIMIT && !usingBraces)
                            break;
                        advanceChar();
                        accum ~= currentChar;
                        if(!usingBraces)
                            ++limitCounter;
                    }
                    if(peekChar == '}' && usingBraces)
                        advanceChar();
                    try 
                    {
                        dchar result = cast(dchar)to!uint(accum, 16);
                        char[] buf;
                        encode(buf, result);
                        text ~= buf;
                    }
                    catch(Exception ex)
                    {
                        throw new ScriptCompileException("Invalid UTF sequence in \\u char", 
                            Token.createInvalidToken(_position, accum));
                    }
                }
                else if(currentChar == 'x')
                {
                    // \xHH: exactly two chars are taken and must form a hex byte
                    advanceChar();
                    string accum = "";
                    accum ~= currentChar;
                    advanceChar();
                    accum ~= currentChar;
                    try 
                    {
                        char result = cast(char)to!ubyte(accum, 16);
                        text ~= result;
                    }
                    catch(Exception ex)
                    {
                        throw new ScriptCompileException("Invalid hexadecimal number in \\x char",
                            Token.createInvalidToken(_position, accum));
                    }
                }
                else
                    throw new ScriptCompileException("Unknown escape character " ~ currentChar, 
                        Token.createInvalidToken(_position));
            }
            else
                text ~= currentChar;
            advanceChar();
        }
        auto tok = Token(Token.Type.STRING, startpos, text);
        tok.literalFlag = lflag;
        return tok;
    }

    /// Reads one of: > >= >> >>= >>> >>>=
    Token makeRAngleBracketToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.GE, startpos);
        }
        else if(peekChar == '>')
        {
            advanceChar();
            if(peekChar == '>')
            {
                advanceChar();
                if(peekChar == '=')
                {
                    advanceChar();
                    return Token(Token.Type.BURS_ASSIGN, startpos);
                }
                else
                {
                    return Token(Token.Type.BIT_URSHIFT, startpos);
                }
            }
            else if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.BRS_ASSIGN, startpos);
            }
            else
            {
                return Token(Token.Type.BIT_RSHIFT, startpos);
            }
        }
        else
        {
            return Token(Token.Type.GT, startpos);
        }
    }

    /// Reads one of: < <= << <<=
    Token makeLAngleBracketToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.LE, startpos);
        }
        else if(peekChar == '<')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                // the position was previously omitted here, leaving the token at line 0, column 0
                return Token(Token.Type.BLS_ASSIGN, startpos);
            }
            else
            {
                return Token(Token.Type.BIT_LSHIFT, startpos);
            }
        }
        else
        {
            return Token(Token.Type.LT, startpos);
        }
    }

    /// Reads one of: = == === =>
    Token makeEqualToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                // the position was previously omitted here, leaving the token at line 0, column 0
                return Token(Token.Type.STRICT_EQUALS, startpos);
            }
            else
            {
                return Token(Token.Type.EQUALS, startpos);
            }
        }
        else if(peekChar == '>')
        {
            advanceChar();
            return Token(Token.Type.ARROW, startpos);
        }
        else
        {
            return Token(Token.Type.ASSIGN, startpos);
        }
    }

    /// Reads one of: ! != !==
    Token makeNotToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.STRICT_NEQUALS, startpos);
            }
            else
            {
                return Token(Token.Type.NEQUALS, startpos);
            }
        }
        else
        {
            return Token(Token.Type.NOT, startpos);
        }
    }

    /// Reads one of: & && &=
    Token makeAndToken()
    {
        auto startpos = _position;
        if(peekChar == '&')
        {
            advanceChar();
            return Token(Token.Type.AND, startpos);
        }
        else if(peekChar == '=')
        {            
            advanceChar();
            return Token(Token.Type.BAND_ASSIGN, startpos);   
        }
        else 
        {
            return Token(Token.Type.BIT_AND, startpos);
        }
    }

    /// Reads one of: | || |=
    Token makeOrToken()
    {
        auto startpos = _position;
        if(peekChar == '|')
        {
            advanceChar();
            return Token(Token.Type.OR, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.BOR_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_OR, startpos);
        }
    }

    /// Reads one of: + ++ +=
    Token makePlusToken()
    {
        auto startpos = _position;
        if(peekChar == '+')
        {
            advanceChar();
            return Token(Token.Type.INC, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.PLUS_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.PLUS, startpos);
        }
    }

    /// Reads one of: - -- -=
    Token makeDashToken()
    {
        auto startpos = _position;
        if(peekChar == '-')
        {
            advanceChar();
            return Token(Token.Type.DEC, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.DASH_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.DASH, startpos);
        }
    }

    /// Reads one of: * *= ** **=
    Token makeStarToken()
    {
        auto startpos = _position;
        if(peekChar == '*')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.POW_ASSIGN, startpos);
            }
            else
            {
                return Token(Token.Type.POW, startpos);
            }
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.STAR_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.STAR, startpos);
        }
    }

    /// Reads one of: % %=
    Token makePercentToken()
    {
        if(peekChar == '=')
        {
            immutable startpos = _position;
            advanceChar();
            return Token(Token.Type.PERCENT_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.PERCENT, _position);
        }
    }

    /// Reads one of: ^ ^=
    Token makeXorToken()
    {
        if(peekChar == '=')
        {
            immutable startpos = _position;
            advanceChar();
            return Token(Token.Type.BXOR_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_XOR, _position);
        }
    }

    /// Reads "." (one DOT), ".." (two DOT tokens) or "..." (one TDOT).
    Token[] makeDotTokens()
    {
        immutable startPos = _position;
        if(peekChar == '.')
        {
            advanceChar();
            if(peekChar == '.')
            {
                advanceChar();
                return [Token(Token.Type.TDOT, startPos)];
            }
            else
            {
                return [Token(Token.Type.DOT, startPos), Token(Token.Type.DOT, _position)];
            }
        }
        else
        {
            return [Token(Token.Type.DOT, _position)];
        }
    }

    /// Handles everything that starts with '/': block comments, line comments, regex
    /// literals, "/=" and plain division. Comments add no token.
    /// Returns: the token array, with at most one token appended.
    Token[] handleFSlash(Token[] tokens)
    {
        if(peekChar == '*') // block comment
        {
            advanceChar(); // now on the opening '*', which must not double as the closing one
            bool closed = false;
            while(peekChar != '\0')
            {
                advanceChar();
                // test EVERY star: skipping the char after a star that was not followed by
                // '/' made a comment ending in "**/" swallow the code after it
                if(currentChar == '*' && peekChar == '/')
                {
                    advanceChar(); // land on the closing '/'
                    closed = true;
                    break;
                }
            }
            // an unterminated comment silently runs to the end of the text, as before
            if(!closed)
                advanceChar();
        }
        else if(peekChar == '/') // comment
        {
            advanceChar();
            while(peekChar != '\n' && peekChar != '\0')
            {
                advanceChar();
            }
        }
        else if(canMakeRegex(tokens))
        {
            string accum = "";
            auto startPos = _position;
            accum ~= currentChar;
            bool gettingFlags = false;
            // look-ahead scan so that the index ends on the last char of the literal
            while(peekChar != '\0')
            {
                if(!gettingFlags)
                {
                    advanceChar();
                    accum ~= currentChar;
                    if(currentChar == '\\')
                    {
                        // an escaped char is taken verbatim, so "\/" does not end the pattern
                        if(peekChar != '\0')
                        {
                            advanceChar();
                            accum ~= currentChar;
                        }
                    }
                    else if(currentChar == '/')
                    {
                        gettingFlags = true;
                    }
                }
                else
                {
                    // flags are the run of letters directly after the closing '/'
                    if(!isAlpha(peekChar))
                        break;
                    advanceChar();
                    accum ~= currentChar;
                }
            }
            bool valid;
            try 
            {
                auto extracted = extract(accum);
                valid = isValid(extracted[0], extracted[1]);
            }
            catch(Exception ex)
            {
                throw new ScriptCompileException("Malformed regex literal", Token.createInvalidToken(startPos, accum));
            }
            if(!valid)
                throw new ScriptCompileException("Invalid regex literal", Token.createInvalidToken(startPos, accum));
            tokens ~= Token(Token.Type.REGEX, startPos, accum);
        }
        else if(peekChar == '=')
        {
            immutable startpos = _position;
            advanceChar();
            tokens ~= Token(Token.Type.FSLASH_ASSIGN, startpos);
        }
        else
        {
            tokens ~= Token(Token.Type.FSLASH, _position);
        }

        return tokens;
    }

    /// Position of the char at _index
    Position _position = {1, 1};
    /// The complete source text being lexed
    string _text;
    /// Index into _text of the char currently being looked at
    size_t _index = 0;
}
1007 
unittest
{
    // Lex a short sample holding two doubles, an integer and a quoted string,
    // then compare the leading token types against the expected sequence.
    auto lexer = Lexer("1.2 34 5.e-99 'foo' ");
    auto tokens = lexer.tokenize();
    immutable Token.Type[] expectedTypes = [
        Token.Type.DOUBLE, Token.Type.INTEGER, Token.Type.DOUBLE, Token.Type.STRING
    ];
    foreach(i, expected; expectedTypes)
        assert(tokens[i].type == expected);
    // the quotes are not part of the string token's text
    assert(tokens[3].text == "foo");
    // TODO complete unit tests of every token type
}