/**
This module implements the Token and Lexer structs

────────────────────────────────────────────────────────────────────────────────

Copyright (C) 2021 pillager86.rf.gd

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <https://www.gnu.org/licenses/>.
*/
module mildew.lexer;

import mildew.exceptions: ScriptCompileException;
import mildew.util.regex;

import std.ascii; // temp until unicode support
import std.container.rbtree;
import std.conv: to;
import std.format: format;
import std.utf: encode;

/**
 * This struct represents the line and column number of a token, starting at 1.
 */
struct Position
{
    /// Line and column number.
    int line, column;

    /// Returns a string representing the line and column number
    string toString() const
    {
        return format("line %s, column %s", line, column);
    }

    /**
     * Updates the line and column based on a character that has been consumed.
     * A NUL character leaves the position untouched, a line feed moves to column 1
     * of the next line, and any other character moves one column to the right.
     * Params:
     *  ch = The character being consumed, i.e. the one the reader is moving past.
     */
    void advance(char ch)
    {
        if(ch == '\0')
        {
            return;
        }
        else if(ch == '\n')
        {
            ++line;
            column = 1;
        }
        else
        {
            ++column;
        }
    }
}

/**
 * This struct represents a token, a fundamental building block of all scripts. The code of a script
 * is first separated by token so that the parser can analyze each token.
 */
struct Token
{
    /**
     * The type of a token.
     */
    enum Type
    {
        EOF, KEYWORD, INTEGER, DOUBLE, STRING, IDENTIFIER, REGEX,
        NOT, AND, OR, GT, GE, LT, LE,
        EQUALS, NEQUALS, STRICT_EQUALS, STRICT_NEQUALS,
        ASSIGN, PLUS_ASSIGN, DASH_ASSIGN,
        PLUS, DASH, STAR, FSLASH, PERCENT, POW, DOT,
        INC, DEC, // ++ and --
        BIT_AND, BIT_XOR, BIT_OR, BIT_NOT, BIT_LSHIFT, BIT_RSHIFT, BIT_URSHIFT,
        LPAREN, RPAREN, LBRACE, RBRACE, LBRACKET, RBRACKET,
        SEMICOLON, COMMA, LABEL, QUESTION, COLON, ARROW,

        INVALID
    }

    /**
     * This enum is for literal value tokens that require special handling by the parser
     */
    enum LiteralFlag
    {
        NONE, BINARY, OCTAL, HEXADECIMAL, TEMPLATE_STRING
    }

    /// Type of token
    Type type;
    /// Position where token occurs
    Position position;
    /// Optional text for keywords and identifiers
    string text;
    /// Optional flag for integer literals (radix) and string literals (template string).
    LiteralFlag literalFlag = LiteralFlag.NONE;

    /**
     * Returns a string representing the type of the token and the optional text if present.
     */
    string toString() const
    {
        string str = format("[%s", type.to!string);
        if(text != null)
            str ~= "|" ~ text;
        str ~= "]";
        return str;
    }

    /**
     * Returns a textual representation of the token as it was found in the original script source code.
     * Tokens that carry text (keywords, literals, identifiers, regexes) return that text; a label returns
     * its text followed by a colon; every operator returns its fixed spelling.
     */
    string symbol() const
    {
        final switch(type)
        {
        case Type.EOF:
            return "\0";
        case Type.KEYWORD: case Type.INTEGER: case Type.DOUBLE: case Type.STRING: case Type.IDENTIFIER: case Type.REGEX:
            return text;
        case Type.NOT: return "!";
        case Type.AND: return "&&";
        case Type.OR: return "||";
        case Type.GT: return ">";
        case Type.GE: return ">=";
        case Type.LT: return "<";
        case Type.LE: return "<=";
        case Type.EQUALS: return "==";
        case Type.NEQUALS: return "!=";
        case Type.STRICT_EQUALS: return "===";
        case Type.STRICT_NEQUALS: return "!==";
        case Type.ASSIGN: return "=";
        case Type.PLUS_ASSIGN: return "+=";
        case Type.DASH_ASSIGN: return "-=";
        case Type.PLUS: return "+";
        case Type.DASH: return "-";
        case Type.STAR: return "*";
        case Type.FSLASH: return "/";
        case Type.PERCENT: return "%";
        case Type.POW: return "**";
        case Type.DOT: return ".";
        case Type.INC: return "++";
        case Type.DEC: return "--";
        case Type.BIT_AND: return "&";
        case Type.BIT_XOR: return "^";
        case Type.BIT_OR: return "|";
        case Type.BIT_NOT: return "~";
        case Type.BIT_LSHIFT: return "<<";
        case Type.BIT_RSHIFT: return ">>";
        case Type.BIT_URSHIFT: return ">>>";
        case Type.LPAREN: return "(";
        case Type.RPAREN: return ")";
        case Type.LBRACE: return "{";
        case Type.RBRACE: return "}";
        case Type.LBRACKET: return "[";
        case Type.RBRACKET: return "]";
        case Type.SEMICOLON: return ";";
        case Type.COMMA: return ",";
        case Type.LABEL: return text ~ ":";
        case Type.QUESTION: return "?";
        case Type.COLON: return ":";
        case Type.ARROW: return "=>";
        case Type.INVALID: return "#";
        }
    }

    /**
     * Returns true if a token is both a keyword and a specific keyword.
     */
    bool isKeyword(in string keyword) const
    {
        return (type == Type.KEYWORD && text == keyword);
    }

    /**
     * Checks for a specific identifier
     */
    bool isIdentifier(in string id) const
    {
        return (type == Type.IDENTIFIER && text == id);
    }

    /**
     * Returns true if the token is one of the assignment operators =, +=, or -=.
     */
    bool isAssignmentOperator() const
    {
        return (type == Type.ASSIGN || type == Type.PLUS_ASSIGN || type == Type.DASH_ASSIGN);
    }

    /**
     * Generates an invalid token at the given position. This is used by the Lexer to throw
     * an exception that requires a token.
     * Params:
     *  pos = Position to store in the token.
     *  text = Optional text to store in the token.
     */
    static Token createInvalidToken(in Position pos, in string text="")
    {
        auto token = Token(Token.Type.INVALID, pos, text);
        return token;
    }

    /**
     * Used by the parser. Builds a token of the given type and text whose position is (0, 0).
     */
    static Token createFakeToken(in Type t, in string txt)
    {
        Token tok;
        tok.type = t;
        tok.position = Position(0,0);
        tok.text = txt;
        return tok;
    }
}

/// Returns true if ch may begin a keyword or identifier: an ASCII letter, '_' or '$'.
private bool startsKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    return ch.isAlpha || ch == '_' || ch == '$';
}

/// Returns true if ch may appear after the first character of a keyword or identifier:
/// an ASCII letter or digit, '_' or '$'.
private bool continuesKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    return ch.isAlphaNum || ch == '_' || ch == '$';
}

/**
 * Returns true if ch may appear inside a number literal of the radix described by lflag.
 * For LiteralFlag.NONE (decimal) this also admits '.' and the exponent marker in either case,
 * matching the case-insensitive exponent handling in Lexer.makeIntOrDoubleToken.
 * Any other flag value yields false.
 */
private bool charIsValidDigit(in char ch, in Token.LiteralFlag lflag)
{
    if(lflag == Token.LiteralFlag.NONE)
        return ch.isDigit || ch == '.' || ch.toLower == 'e';
    else if(lflag == Token.LiteralFlag.HEXADECIMAL)
        return ch.isDigit || (ch.toLower >= 'a' && ch.toLower <= 'f');
    else if(lflag == Token.LiteralFlag.OCTAL)
        return (ch >= '0' && ch <= '7');
    else if(lflag == Token.LiteralFlag.BINARY)
        return ch == '0' || ch == '1';
    return false;
}

/// Lexes code and returns the individual tokens
struct Lexer
{
public:
    /// Constructor takes code as text to tokenize
    this(string code)
    {
        _text = code;
    }

    /**
     * Returns tokens from lexing a string of code.
     *
     * Each pass of the loop skips white space, dispatches on the current character to build one
     * token (or to skip a comment), and then steps past the last character of that token. The
     * make* helpers therefore all finish with the current character being the LAST character of
     * the token they produced.
     *
     * An EOF token is appended only when a NUL character is the current character after white
     * space has been skipped, which includes running off the end of text that ends in white space.
     * Text that ends directly after a token produces no EOF token.
     *
     * Throws: ScriptCompileException on a character that cannot start any token, or on any
     *  malformed literal reported by the helpers.
     */
    Token[] tokenize()
    {
        Token[] tokens = [];
        if (_text == "")
            return tokens;
        while(_index < _text.length)
        {
            // ignore white space
            while(currentChar.isWhite())
                advanceChar();
            if(currentChar.startsKeywordOrIdentifier)
                tokens ~= makeIdKwOrLabel(tokens);
            else if(currentChar.isDigit)
                tokens ~= makeIntOrDoubleToken();
            else if(currentChar == '\'' || currentChar == '"' || currentChar == '`')
                tokens ~= makeStringToken(tokens);
            else if(currentChar == '>')
                tokens ~= makeRAngleBracketToken();
            else if(currentChar == '<')
                tokens ~= makeLAngleBracketToken();
            else if(currentChar == '=')
                tokens ~= makeEqualToken();
            else if(currentChar == '!')
                tokens ~= makeNotToken();
            else if(currentChar == '&')
                tokens ~= makeAndToken();
            else if(currentChar == '|')
                tokens ~= makeOrToken();
            else if(currentChar == '+')
                tokens ~= makePlusToken();
            else if(currentChar == '-')
                tokens ~= makeDashToken();
            else if(currentChar == '*')
                tokens ~= makeStarToken();
            else if(currentChar == '/')
                tokens = handleFSlash(tokens); // may append nothing (comment), a REGEX or an FSLASH
            else if(currentChar == '%')
                tokens ~= Token(Token.Type.PERCENT, _position);
            else if(currentChar == '^')
                tokens ~= Token(Token.Type.BIT_XOR, _position);
            else if(currentChar == '~')
                tokens ~= Token(Token.Type.BIT_NOT, _position);
            else if(currentChar == '(')
                tokens ~= Token(Token.Type.LPAREN, _position);
            else if(currentChar == ')')
                tokens ~= Token(Token.Type.RPAREN, _position);
            else if(currentChar == '{')
                tokens ~= Token(Token.Type.LBRACE, _position);
            else if(currentChar == '}')
                tokens ~= Token(Token.Type.RBRACE, _position);
            else if(currentChar == '[')
                tokens ~= Token(Token.Type.LBRACKET, _position);
            else if(currentChar == ']')
                tokens ~= Token(Token.Type.RBRACKET, _position);
            else if(currentChar == ';')
                tokens ~= Token(Token.Type.SEMICOLON, _position);
            else if(currentChar == ',')
                tokens ~= Token(Token.Type.COMMA, _position);
            else if(currentChar == '.')
                tokens ~= Token(Token.Type.DOT, _position);
            else if(currentChar == ':')
                tokens ~= Token(Token.Type.COLON, _position);
            else if(currentChar == '?')
                tokens ~= Token(Token.Type.QUESTION, _position);
            else if(currentChar == '\0')
                tokens ~= Token(Token.Type.EOF, _position);
            else
                throw new ScriptCompileException("Invalid character " ~ currentChar,
                    Token.createInvalidToken(_position, [currentChar]));
            // step past the last character of whatever was just lexed
            advanceChar();
        }
        return tokens;
    }

    /// Red-black tree (ordered set) of the reserved keywords
    static immutable KEYWORDS = redBlackTree(
        "true", "false", "undefined", "null",
        "var", "let", "const",
        "if", "else", "while", "do", "for", "in",
        "switch", "case", "default",
        "break", "continue", "return",
        "function", "class", "super", "extends",
        "new", "delete", "typeof", "instanceof",
        "throw", "try", "catch", "finally",
        "yield"
    );

    /// AA of look up for escape chars based on character after \
    static immutable char[char] ESCAPE_CHARS;

    /// Initializes the associative array of escape chars
    shared static this()
    {
        ESCAPE_CHARS = [
            'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v',
            '0': '\0', '\'': '\'', '"': '"', '`': '`', '\\': '\\'
        ];
    }

private:

    /// Moves to the next character, updating the position for the character being left behind.
    void advanceChar()
    {
        // The position must be fed the character that is consumed, not the one that is about to
        // become current: Position.advance moves to column 1 of the next line on '\n', which is
        // only correct for the character that FOLLOWS the line feed.
        _position.advance(currentChar());
        ++_index;
    }

    /// Returns the character at the read index, or NUL when the index is past the end of the text.
    char currentChar()
    {
        if(_index < _text.length)
            return _text[_index];
        else
            return '\0';
    }

    /// Returns the character after the current one, or NUL when there is none.
    char peekChar()
    {
        if(_index + 1 < _text.length)
            return _text[_index + 1];
        else
            return '\0';
    }

    /**
     * Decides whether a '/' at the current location may start a regex literal, based on the token
     * before it. A '/' that follows something that can end an expression (identifier, number,
     * string, ']', ')', '++', '--', or the keywords null/true/false) is treated as division.
     */
    bool canMakeRegex(Token[] tokens)
    {
        if(tokens.length == 0)
            return true;
        switch(tokens[$-1].type)
        {
        case Token.Type.IDENTIFIER:
        case Token.Type.INTEGER:
        case Token.Type.DOUBLE:
        case Token.Type.STRING:
        case Token.Type.RBRACKET:
        case Token.Type.RPAREN:
        case Token.Type.INC:
        case Token.Type.DEC:
            return false;
        case Token.Type.KEYWORD:
            switch(tokens[$-1].text)
            {
            case "null":
            case "true":
            case "false":
                return false;
            default:
                return true;
            }
        default:
            return true;
        }
    }

    /**
     * Lexes an identifier, keyword or label starting at the current character.
     * Params:
     *  tokens = The tokens produced so far; only the last one is inspected.
     * Returns: A KEYWORD token if the text is in KEYWORDS, a LABEL token if the text is immediately
     *  followed by ':' (the colon is consumed), otherwise an IDENTIFIER token. The word "return"
     *  directly after a DOT token is always an IDENTIFIER.
     */
    Token makeIdKwOrLabel(Token[] tokens)
    {
        immutable start = _index;
        immutable startpos = _position;
        // Look ahead instead of overshooting and rewinding the index: rewinding cannot undo the
        // position update, which made the following character count twice.
        while(peekChar.continuesKeywordOrIdentifier)
            advanceChar();
        auto text = _text[start.._index+1];

        // return is a special case after "."
        if(text == "return")
        {
            if(tokens.length > 0 && tokens[$-1].type == Token.Type.DOT)
                return Token(Token.Type.IDENTIFIER, startpos, text);
        }

        // first check for keyword, that can't be a label
        if(text in KEYWORDS)
        {
            return Token(Token.Type.KEYWORD, startpos, text);
        }
        else if(peekChar == ':')
        {
            advanceChar();
            return Token(Token.Type.LABEL, startpos, text);
        }
        else
        {
            return Token(Token.Type.IDENTIFIER, startpos, text);
        }
    }

    /**
     * Lexes a number literal starting at the current digit.
     * Returns: An INTEGER token, or a DOUBLE token if the literal contains a decimal point or an
     *  exponent. The token text is the literal exactly as written (including any 0x/0o/0b prefix)
     *  and literalFlag records the radix prefix that was seen.
     * Throws: ScriptCompileException if a radix prefix does not follow '0' or has no digits, if
     *  there is more than one decimal point or exponent, or if an exponent has no digits.
     */
    Token makeIntOrDoubleToken()
    {
        immutable start = _index;
        immutable startpos = _position;
        auto dotCounter = 0;
        auto eCounter = 0;
        Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
        if(peekChar.toLower == 'x')
        {
            lflag = Token.LiteralFlag.HEXADECIMAL;
            advanceChar();
        }
        else if(peekChar.toLower == 'o')
        {
            lflag = Token.LiteralFlag.OCTAL;
            advanceChar();
        }
        else if(peekChar.toLower == 'b')
        {
            lflag = Token.LiteralFlag.BINARY;
            advanceChar();
        }
        // if the lflag was set, the first char has to be 0
        if(lflag != Token.LiteralFlag.NONE && _text[start] != '0')
            throw new ScriptCompileException("Malformed integer literal", Token.createInvalidToken(startpos));

        while(peekChar.charIsValidDigit(lflag))
        {
            advanceChar();
            if(lflag == Token.LiteralFlag.NONE)
            {
                if(currentChar == '.')
                {
                    ++dotCounter;
                    if(dotCounter > 1)
                        throw new ScriptCompileException("Too many decimals in number literal",
                            Token.createInvalidToken(_position));
                }
                else if(currentChar.toLower == 'e')
                {
                    ++eCounter;
                    if(eCounter > 1)
                        throw new ScriptCompileException("Numbers can only have one exponent specifier",
                            Token.createInvalidToken(_position));
                    // optional sign of the exponent
                    if(peekChar == '+' || peekChar == '-')
                        advanceChar();
                    if(!peekChar.isDigit)
                        throw new ScriptCompileException("Exponent specifier must be followed by number",
                            Token.createInvalidToken(_position));
                }
            }
        }
        auto text = _text[start.._index+1];
        // a prefix alone ("0x") is two characters and has no digits
        if(lflag != Token.LiteralFlag.NONE && text.length <= 2)
            throw new ScriptCompileException("Malformed hex/octal/binary integer", Token.createInvalidToken(startpos));
        Token resultToken;
        if(dotCounter == 0 && eCounter == 0)
            resultToken = Token(Token.Type.INTEGER, startpos, text);
        else
            resultToken = Token(Token.Type.DOUBLE, startpos, text);
        resultToken.literalFlag = lflag;
        return resultToken;
    }

    /**
     * Lexes a string literal whose opening quote (', " or `) is the current character.
     *
     * If the three preceding tokens are `String . raw`, they are removed from previous and
     * backslashes in the literal are kept verbatim instead of being treated as escapes.
     * Params:
     *  previous = The tokens produced so far; shortened by three when the String.raw prefix is found.
     * Returns: A STRING token holding the decoded contents without the quotes. Backtick strings
     *  carry LiteralFlag.TEMPLATE_STRING and may span lines.
     * Throws: ScriptCompileException on a missing close quote, a line break in a non-template
     *  string, or an unknown or malformed escape sequence.
     */
    Token makeStringToken(ref Token[] previous)
    {
        immutable closeQuote = currentChar;
        auto startpos = _position;
        advanceChar();
        string text = "";
        bool escapeChars = true;
        if(previous.length >= 3)
        {
            if(previous[$-1].isIdentifier("raw") &&
               previous[$-2].type == Token.Type.DOT &&
               previous[$-3].isIdentifier("String"))
            {
                escapeChars = false;
                previous = previous[0.. $-3];
            }
        }
        Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
        if(closeQuote == '`')
            lflag = Token.LiteralFlag.TEMPLATE_STRING;
        while(currentChar != closeQuote)
        {
            if(currentChar == '\0')
                throw new ScriptCompileException("Missing close quote for string literal",
                    Token.createInvalidToken(_position, text));
            else if(currentChar == '\n' && lflag != Token.LiteralFlag.TEMPLATE_STRING)
                throw new ScriptCompileException("Line breaks inside regular string literals are not allowed",
                    Token.createInvalidToken(_position, text));
            else if(currentChar == '\\' && escapeChars)
            {
                advanceChar();
                if(currentChar in ESCAPE_CHARS)
                    text ~= ESCAPE_CHARS[currentChar];
                else if(currentChar == 'u')
                {
                    // \uXXXX (at most four hex digits) or \u{X...} (any number of hex digits).
                    // Everything is consumed by looking ahead so that the current character ends
                    // up on the last character of the escape without rewinding the index.
                    string accum = "";
                    bool usingBraces = false;
                    int limitCounter;
                    immutable LIMIT = 4; // without the braces
                    if(peekChar == '{')
                    {
                        advanceChar();
                        usingBraces = true;
                    }
                    while(peekChar.charIsValidDigit(Token.LiteralFlag.HEXADECIMAL))
                    {
                        if(limitCounter >= LIMIT && !usingBraces)
                            break;
                        advanceChar();
                        accum ~= currentChar;
                        if(!usingBraces)
                            ++limitCounter;
                    }
                    if(usingBraces && peekChar == '}')
                        advanceChar();
                    try
                    {
                        dchar result = cast(dchar)to!uint(accum, 16);
                        char[] buf;
                        encode(buf, result); // appends the UTF-8 encoding of result to buf
                        text ~= buf;
                    }
                    catch(Exception ex)
                    {
                        throw new ScriptCompileException("Invalid UTF sequence in \\u char",
                            Token.createInvalidToken(_position, accum));
                    }
                }
                else if(currentChar == 'x')
                {
                    // \xHH: exactly the next two characters are read as a hexadecimal byte
                    advanceChar();
                    string accum = "";
                    accum ~= currentChar;
                    advanceChar();
                    accum ~= currentChar;
                    try
                    {
                        char result = cast(char)to!ubyte(accum, 16);
                        text ~= result;
                    }
                    catch(Exception ex)
                    {
                        throw new ScriptCompileException("Invalid hexadecimal number in \\x char",
                            Token.createInvalidToken(_position, accum));
                    }
                }
                else
                    throw new ScriptCompileException("Unknown escape character " ~ currentChar,
                        Token.createInvalidToken(_position));
            }
            else
                text ~= currentChar;
            advanceChar();
        }
        auto tok = Token(Token.Type.STRING, startpos, text);
        tok.literalFlag = lflag;
        return tok;
    }

    /// Lexes one of >, >=, >> or >>> starting at the current '>'.
    Token makeRAngleBracketToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.GE, startpos);
        }
        else if(peekChar == '>')
        {
            advanceChar();
            if(peekChar == '>')
            {
                advanceChar();
                return Token(Token.Type.BIT_URSHIFT, startpos);
            }
            else
            {
                return Token(Token.Type.BIT_RSHIFT, startpos);
            }
        }
        else
        {
            return Token(Token.Type.GT, startpos);
        }
    }

    /// Lexes one of <, <= or << starting at the current '<'.
    Token makeLAngleBracketToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.LE, startpos);
        }
        else if(peekChar == '<')
        {
            advanceChar();
            return Token(Token.Type.BIT_LSHIFT, startpos);
        }
        else
        {
            return Token(Token.Type.LT, startpos);
        }
    }

    /// Lexes one of =, ==, === or => starting at the current '='.
    Token makeEqualToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                // like every other operator, === records where it starts
                return Token(Token.Type.STRICT_EQUALS, startpos);
            }
            else
            {
                return Token(Token.Type.EQUALS, startpos);
            }
        }
        else if(peekChar == '>')
        {
            advanceChar();
            return Token(Token.Type.ARROW, startpos);
        }
        else
        {
            return Token(Token.Type.ASSIGN, startpos);
        }
    }

    /// Lexes one of !, != or !== starting at the current '!'.
    Token makeNotToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.STRICT_NEQUALS, startpos);
            }
            else
            {
                return Token(Token.Type.NEQUALS, startpos);
            }
        }
        else
        {
            return Token(Token.Type.NOT, startpos);
        }
    }

    /// Lexes & or && starting at the current '&'.
    Token makeAndToken()
    {
        auto startpos = _position;
        if(peekChar == '&')
        {
            advanceChar();
            return Token(Token.Type.AND, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_AND, startpos);
        }
    }

    /// Lexes | or || starting at the current '|'.
    Token makeOrToken()
    {
        auto startpos = _position;
        if(peekChar == '|')
        {
            advanceChar();
            return Token(Token.Type.OR, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_OR, startpos);
        }
    }

    /// Lexes one of +, ++ or += starting at the current '+'.
    Token makePlusToken()
    {
        auto startpos = _position;
        if(peekChar == '+')
        {
            advanceChar();
            return Token(Token.Type.INC, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.PLUS_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.PLUS, startpos);
        }
    }

    /// Lexes one of -, -- or -= starting at the current '-'.
    Token makeDashToken()
    {
        auto startpos = _position;
        if(peekChar == '-')
        {
            advanceChar();
            return Token(Token.Type.DEC, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.DASH_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.DASH, startpos);
        }
    }

    /// Lexes * or ** starting at the current '*'.
    Token makeStarToken()
    {
        auto startpos = _position;
        if(peekChar == '*')
        {
            advanceChar();
            return Token(Token.Type.POW, startpos);
        }
        else
        {
            return Token(Token.Type.STAR, startpos);
        }
    }

    /**
     * Handles a '/' at the current character, which starts a block comment, a line comment, a
     * regex literal or a division operator.
     * Params:
     *  tokens = The tokens produced so far.
     * Returns: tokens unchanged for a comment, or with one REGEX or FSLASH token appended.
     * Throws: ScriptCompileException if a regex literal cannot be extracted or is not valid.
     */
    Token[] handleFSlash(Token[] tokens)
    {
        if(peekChar == '*')
        {
            advanceChar(); // now on the '*' of the opening "/*"
            // Test every character after the opener for "*/". A run of stars before the slash,
            // as in "**/", is checked one star at a time so the terminator cannot be stepped over.
            // An unterminated comment silently runs to the end of the text.
            while(peekChar != '\0')
            {
                advanceChar();
                if(currentChar == '*' && peekChar == '/')
                {
                    advanceChar(); // finish on the closing '/'
                    break;
                }
            }
        }
        else if(peekChar == '/')
        {
            advanceChar();
            // stop on the last character of the line; the line feed is left as white space
            while(peekChar != '\n' && peekChar != '\0')
            {
                advanceChar();
            }
        }
        else if(canMakeRegex(tokens))
        {
            auto startPos = _position;
            // Measure the literal with a local index first, then consume it through advanceChar,
            // so that the index and the position always stay in step.
            size_t end = _index + 1; // one past the opening '/'
            bool gettingFlags = false;
            while(end < _text.length && _text[end] != '\0')
            {
                immutable ch = _text[end];
                if(gettingFlags)
                {
                    // flags are the run of letters directly after the closing '/'
                    if(!isAlpha(ch))
                        break;
                    ++end;
                }
                else if(ch == '\\')
                {
                    // a backslash takes the following character with it, whatever it is
                    ++end;
                    if(end < _text.length && _text[end] != '\0')
                        ++end;
                }
                else
                {
                    if(ch == '/')
                        gettingFlags = true;
                    ++end;
                }
            }
            string accum = _text[_index .. end];
            // finish on the last character of the literal
            while(_index + 1 < end)
                advanceChar();
            bool valid;
            try
            {
                auto extracted = extract(accum);
                valid = isValid(extracted[0], extracted[1]);
            }
            catch(Exception ex)
            {
                throw new ScriptCompileException("Malformed regex literal", Token.createInvalidToken(startPos, accum));
            }
            if(!valid)
                throw new ScriptCompileException("Invalid regex literal", Token.createInvalidToken(startPos, accum));
            tokens ~= Token(Token.Type.REGEX, startPos, accum);
        }
        else
        {
            tokens ~= Token(Token.Type.FSLASH, _position);
        }

        return tokens;
    }

    /// Line and column of the current character
    Position _position = {1, 1};
    /// The full text being lexed
    string _text;
    /// Index of the current character in _text
    size_t _index = 0;
}

unittest
{
    auto lexer = Lexer("1.2 34 5.e-99 'foo' ");
    auto tokens = lexer.tokenize();
    assert(tokens[0].type == Token.Type.DOUBLE);
    assert(tokens[1].type == Token.Type.INTEGER);
    assert(tokens[2].type == Token.Type.DOUBLE);
    assert(tokens[3].type == Token.Type.STRING && tokens[3].text == "foo");
    // TODO complete unit tests of every token type
}

unittest
{
    // === carries its position, "**/" closes a block comment, and a token that starts a new
    // line is reported at column 1 of that line
    auto tokens = Lexer("a === b /** c **/ d\ne").tokenize();
    assert(tokens.length == 5);
    assert(tokens[1].type == Token.Type.STRICT_EQUALS);
    assert(tokens[1].position.line == 1 && tokens[1].position.column == 3);
    assert(tokens[2].position.line == 1 && tokens[2].position.column == 7);
    assert(tokens[3].isIdentifier("d"));
    assert(tokens[4].isIdentifier("e"));
    assert(tokens[4].position.line == 2 && tokens[4].position.column == 1);

    // an upper case exponent marker is part of the number
    auto number = Lexer("1E5 ").tokenize();
    assert(number[0].type == Token.Type.DOUBLE && number[0].text == "1E5");

    // a backtick can be escaped inside a template string
    auto templ = Lexer("`a\\`b` ").tokenize();
    assert(templ[0].type == Token.Type.STRING && templ[0].text == "a`b");
    assert(templ[0].literalFlag == Token.LiteralFlag.TEMPLATE_STRING);

    // the query methods are callable on a const token
    const Token assign = Token.createFakeToken(Token.Type.PLUS_ASSIGN, "");
    assert(assign.isAssignmentOperator());
}