/**
This module implements the Token and Lexer structs

────────────────────────────────────────────────────────────────────────────────

Copyright (C) 2021 pillager86.rf.gd

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <https://www.gnu.org/licenses/>.
*/
module mildew.lexer;

import mildew.exceptions: ScriptCompileException;
import mildew.util.regex;

import std.ascii; // temp until unicode support
import std.container.rbtree;
import std.conv: to;
import std.format: format;
import std.utf: encode;

/**
 * This struct represents the line and column number of a token, starting at 1.
 */
struct Position
{
    /// Line and column number.
    int line, column;

    /// Returns a string representing the line and column number
    string toString() const
    {
        return format("line %s, column %s", line, column);
    }

    /**
     * Updates the line and column for a character that has just been read.
     * A NUL character leaves the position untouched, a line feed moves to column 1
     * of the next line, and any other character moves one column to the right.
     */
    void advance(char ch)
    {
        if(ch == '\0')
        {
            return;
        }
        else if(ch == '\n')
        {
            ++line;
            column = 1;
        }
        else
        {
            ++column;
        }
    }
}

/**
 * This struct represents a token, a fundamental building block of all scripts. The code of a script
 * is first separated by token so that the parser can analyze each token.
 */
struct Token
{
    /**
     * The type of a token.
     */
    enum Type
    {
        EOF, KEYWORD, INTEGER, DOUBLE, STRING, IDENTIFIER, REGEX,
        NOT, AND, OR, GT, GE, LT, LE,
        EQUALS, NEQUALS, STRICT_EQUALS, STRICT_NEQUALS,

        ASSIGN,
        POW_ASSIGN, STAR_ASSIGN, FSLASH_ASSIGN, PERCENT_ASSIGN,
        PLUS_ASSIGN, DASH_ASSIGN,
        BAND_ASSIGN, BXOR_ASSIGN, BOR_ASSIGN, BLS_ASSIGN, BRS_ASSIGN, BURS_ASSIGN,

        PLUS, DASH, STAR, FSLASH, PERCENT, POW, DOT, TDOT,
        INC, DEC, // ++ and --
        BIT_AND, BIT_XOR, BIT_OR, BIT_NOT, BIT_LSHIFT, BIT_RSHIFT, BIT_URSHIFT,
        LPAREN, RPAREN, LBRACE, RBRACE, LBRACKET, RBRACKET,
        SEMICOLON, COMMA, LABEL, QUESTION, COLON, ARROW,

        INVALID
    }

    /**
     * This enum is for literal value tokens that require special handling by the parser
     */
    enum LiteralFlag
    {
        NONE, BINARY, OCTAL, HEXADECIMAL, TEMPLATE_STRING
    }

    /// Type of token
    Type type;
    /// Position where token occurs
    Position position;
    /// Optional text for keywords and identifiers
    string text;
    /// Optional flag for integer literals.
    LiteralFlag literalFlag = LiteralFlag.NONE;

    /**
     * Returns a string representing the type of the token and the optional text if present.
     */
    string toString() const
    {
        string str = format("[%s", type.to!string);
        if(text != null)
            str ~= "|" ~ text;
        str ~= "]";
        return str;
    }

    /**
     * Returns a textual representation of the token as it was found in the original script source code.
     */
    string symbol() const
    {
        final switch(type)
        {
        case Type.EOF:
            return "\0";
        case Type.KEYWORD: case Type.INTEGER: case Type.DOUBLE: case Type.STRING: case Type.IDENTIFIER: case Type.REGEX:
            return text;
        case Type.NOT: return "!";
        case Type.AND: return "&&";
        case Type.OR: return "||";
        case Type.GT: return ">";
        case Type.GE: return ">=";
        case Type.LT: return "<";
        case Type.LE: return "<=";
        case Type.EQUALS: return "==";
        case Type.NEQUALS: return "!=";
        case Type.STRICT_EQUALS: return "===";
        case Type.STRICT_NEQUALS: return "!==";
        case Type.ASSIGN: return "=";
        case Type.POW_ASSIGN: return "**=";
        case Type.STAR_ASSIGN: return "*=";
        case Type.FSLASH_ASSIGN: return "/=";
        case Type.PERCENT_ASSIGN: return "%=";
        case Type.PLUS_ASSIGN: return "+=";
        case Type.DASH_ASSIGN: return "-=";
        case Type.BAND_ASSIGN: return "&=";
        case Type.BXOR_ASSIGN: return "^=";
        case Type.BOR_ASSIGN: return "|=";
        case Type.BLS_ASSIGN: return "<<=";
        case Type.BRS_ASSIGN: return ">>=";
        case Type.BURS_ASSIGN: return ">>>=";
        case Type.PLUS: return "+";
        case Type.DASH: return "-";
        case Type.STAR: return "*";
        case Type.FSLASH: return "/";
        case Type.PERCENT: return "%";
        case Type.POW: return "**";
        case Type.DOT: return ".";
        case Type.TDOT: return "...";
        case Type.INC: return "++";
        case Type.DEC: return "--";
        case Type.BIT_AND: return "&";
        case Type.BIT_XOR: return "^";
        case Type.BIT_OR: return "|";
        case Type.BIT_NOT: return "~";
        case Type.BIT_LSHIFT: return "<<";
        case Type.BIT_RSHIFT: return ">>";
        case Type.BIT_URSHIFT: return ">>>";
        case Type.LPAREN: return "(";
        case Type.RPAREN: return ")";
        case Type.LBRACE: return "{";
        case Type.RBRACE: return "}";
        case Type.LBRACKET: return "[";
        case Type.RBRACKET: return "]";
        case Type.SEMICOLON: return ";";
        case Type.COMMA: return ",";
        case Type.LABEL: return text ~ ":";
        case Type.QUESTION: return "?";
        case Type.COLON: return ":";
        case Type.ARROW: return "=>";
        case Type.INVALID: return "#";
        }
    }

    /**
     * Returns true if a token is both a keyword and a specific keyword.
     */
    bool isKeyword(in string keyword) const
    {
        return (type == Type.KEYWORD && text == keyword);
    }

    /**
     * Checks for a specific identifier
     */
    bool isIdentifier(in string id) const
    {
        return (type == Type.IDENTIFIER && text == id);
    }

    /**
     * Returns true if the token is an assignment operator such as =, +=, or -=, etc.
     * The method is const so that it can be called on const tokens like the other queries.
     */
    bool isAssignmentOperator() const
    {
        return (type == Type.ASSIGN ||
                type == Type.POW_ASSIGN ||
                type == Type.STAR_ASSIGN ||
                type == Type.FSLASH_ASSIGN ||
                type == Type.PERCENT_ASSIGN ||
                type == Type.PLUS_ASSIGN ||
                type == Type.DASH_ASSIGN ||
                type == Type.BAND_ASSIGN ||
                type == Type.BXOR_ASSIGN ||
                type == Type.BOR_ASSIGN ||
                type == Type.BLS_ASSIGN ||
                type == Type.BRS_ASSIGN ||
                type == Type.BURS_ASSIGN
        );
    }

    /**
     * Generates an invalid token at the given position. This is used by the Lexer to throw
     * an exception that requires a token.
     */
    static Token createInvalidToken(in Position pos, in string text="")
    {
        auto token = Token(Token.Type.INVALID, pos, text);
        return token;
    }

    /**
     * Used by the parser. Builds a token of the given type and text at position (0, 0).
     */
    static Token createFakeToken(in Type t, in string txt)
    {
        Token tok;
        tok.type = t;
        tok.position = Position(0,0);
        tok.text = txt;
        return tok;
    }
}

/// True if ch may begin a keyword or identifier: an ASCII letter, '_' or '$'.
private bool startsKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    return ch.isAlpha || ch == '_' || ch == '$';
}

/// True if ch may appear after the first character of a keyword or identifier.
private bool continuesKeywordOrIdentifier(in char ch)
{
    // TODO support unicode by converting string to dchar
    return ch.isAlphaNum || ch == '_' || ch == '$';
}

/**
 * True if ch may be part of a number literal of the kind described by lflag.
 * For plain decimal literals this includes the decimal point and the exponent
 * marker in either case ('e' or 'E').
 */
private bool charIsValidDigit(in char ch, in Token.LiteralFlag lflag)
{
    if(lflag == Token.LiteralFlag.NONE)
        return ch.isDigit || ch == '.' || ch.toLower == 'e';
    else if(lflag == Token.LiteralFlag.HEXADECIMAL)
        return ch.isDigit || (ch.toLower >= 'a' && ch.toLower <= 'f');
    else if(lflag == Token.LiteralFlag.OCTAL)
        return (ch >= '0' && ch <= '7');
    else if(lflag == Token.LiteralFlag.BINARY)
        return ch == '0' || ch == '1';
    return false;
}

/// Lexes code and returns the individual tokens
struct Lexer
{
public:
    /// Constructor takes code as text to tokenize
    this(string code)
    {
        _text = code;
    }

    /**
     * Returns tokens from lexing a string of code.
     *
     * Every make* helper leaves the cursor on the last character of the token it
     * produced; the single advanceChar() at the bottom of the loop then steps past it.
     * An EOF token is appended only when a NUL is seen as the current character, which
     * happens for an embedded NUL or when skipping whitespace runs off the end of the text.
     *
     * Throws: ScriptCompileException on an unrecognized character or a malformed literal.
     */
    Token[] tokenize()
    {
        Token[] tokens = [];
        if (_text == "")
            return tokens;
        while(_index < _text.length)
        {
            // ignore white space
            while(currentChar.isWhite())
                advanceChar();
            if(currentChar.startsKeywordOrIdentifier)
                tokens ~= makeIdKwOrLabel(tokens);
            else if(currentChar.isDigit)
                tokens ~= makeIntOrDoubleToken();
            else if(currentChar == '\'' || currentChar == '"' || currentChar == '`')
                tokens ~= makeStringToken(tokens);
            else if(currentChar == '>')
                tokens ~= makeRAngleBracketToken();
            else if(currentChar == '<')
                tokens ~= makeLAngleBracketToken();
            else if(currentChar == '=')
                tokens ~= makeEqualToken();
            else if(currentChar == '!')
                tokens ~= makeNotToken();
            else if(currentChar == '&')
                tokens ~= makeAndToken();
            else if(currentChar == '|')
                tokens ~= makeOrToken();
            else if(currentChar == '+')
                tokens ~= makePlusToken();
            else if(currentChar == '-')
                tokens ~= makeDashToken();
            else if(currentChar == '*')
                tokens ~= makeStarToken();
            else if(currentChar == '/')
                tokens = handleFSlash(tokens);
            else if(currentChar == '%')
                tokens ~= makePercentToken();
            else if(currentChar == '^')
                tokens ~= makeXorToken();
            else if(currentChar == '~')
                tokens ~= Token(Token.Type.BIT_NOT, _position);
            else if(currentChar == '(')
                tokens ~= Token(Token.Type.LPAREN, _position);
            else if(currentChar == ')')
                tokens ~= Token(Token.Type.RPAREN, _position);
            else if(currentChar == '{')
                tokens ~= Token(Token.Type.LBRACE, _position);
            else if(currentChar == '}')
                tokens ~= Token(Token.Type.RBRACE, _position);
            else if(currentChar == '[')
                tokens ~= Token(Token.Type.LBRACKET, _position);
            else if(currentChar == ']')
                tokens ~= Token(Token.Type.RBRACKET, _position);
            else if(currentChar == ';')
                tokens ~= Token(Token.Type.SEMICOLON, _position);
            else if(currentChar == ',')
                tokens ~= Token(Token.Type.COMMA, _position);
            else if(currentChar == '.')
                tokens ~= makeDotTokens();
            else if(currentChar == ':')
                tokens ~= Token(Token.Type.COLON, _position);
            else if(currentChar == '?')
                tokens ~= Token(Token.Type.QUESTION, _position);
            else if(currentChar == '\0')
                tokens ~= Token(Token.Type.EOF, _position);
            else
                throw new ScriptCompileException("Invalid character " ~ currentChar,
                    Token.createInvalidToken(_position, [currentChar]));
            advanceChar();
        }
        return tokens;
    }

    /// Set of reserved words, stored in a red-black tree
    static immutable KEYWORDS = redBlackTree(
        "true", "false", "undefined", "null",
        "var", "let", "const",
        "if", "else", "while", "do", "for", "in",
        "switch", "case", "default",
        "break", "continue", "return",
        "function", "class", "super", "extends",
        "new", "delete", "typeof", "instanceof",
        "throw", "try", "catch", "finally",
        "yield"
    );

    /// AA of look up for escape chars based on character after \
    static immutable char[char] ESCAPE_CHARS;

    /// Initializes the associative array of escape chars
    shared static this()
    {
        ESCAPE_CHARS = [
            'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v',
            '0': '\0', '\'': '\'', '"': '"', '\\': '\\'
        ];
    }

private:

    /**
     * Moves the cursor one character forward. The position is updated with the
     * character being left behind, so that _position always describes the character
     * under the cursor and the first character of every line is column 1.
     */
    void advanceChar()
    {
        _position.advance(currentChar());
        ++_index;
    }

    /// Character under the cursor, or NUL when the cursor is past the end of the text.
    char currentChar()
    {
        if(_index < _text.length)
            return _text[_index];
        else
            return '\0';
    }

    /// Character after the cursor, or NUL when there is none.
    char peekChar()
    {
        if(_index + 1 < _text.length)
            return _text[_index + 1];
        else
            return '\0';
    }

    /**
     * Decides from the previously produced token whether a '/' may start a regex literal.
     * After a value-like token (identifier, literal, closing bracket or paren, ++/--, or the
     * keywords null/true/false) the answer is false; otherwise it is true.
     */
    bool canMakeRegex(Token[] tokens)
    {
        if(tokens.length == 0)
            return true;
        switch(tokens[$-1].type)
        {
        case Token.Type.IDENTIFIER:
        case Token.Type.INTEGER:
        case Token.Type.DOUBLE:
        case Token.Type.STRING:
        case Token.Type.RBRACKET:
        case Token.Type.RPAREN:
        case Token.Type.INC:
        case Token.Type.DEC:
            return false;
        case Token.Type.KEYWORD:
            switch(tokens[$-1].text)
            {
            case "null":
            case "true":
            case "false":
                return false;
            default:
                return true;
            }
        default:
            return true;
        }
    }

    /**
     * Lexes an identifier, keyword or label starting at the cursor.
     * The scan looks ahead with peekChar so the cursor stops on the last character of
     * the word and the position never has to be rewound.
     */
    Token makeIdKwOrLabel(Token[] tokens)
    {
        immutable start = _index;
        immutable startpos = _position;
        while(peekChar.continuesKeywordOrIdentifier)
            advanceChar();
        auto text = _text[start.._index+1];
        // first check for keyword, that can't be a label

        // these words can be used as object members but not as labels
        if(text == "return" || text == "throw" || text == "delete"
            || text == "catch" || text == "finally")
        {
            if(tokens.length > 0 && tokens[$-1].type == Token.Type.DOT)
                return Token(Token.Type.IDENTIFIER, startpos, text);
        }

        if(text in KEYWORDS)
        {
            return Token(Token.Type.KEYWORD, startpos, text);
        }
        else if(peekChar == ':')
        {
            // consume the ':' so that it becomes part of the label token
            advanceChar();
            return Token(Token.Type.LABEL, startpos, text);
        }
        else
        {
            return Token(Token.Type.IDENTIFIER, startpos, text);
        }
    }

    /**
     * Lexes an integer or floating point literal, including 0x / 0o / 0b prefixed integers.
     * The token text is the literal exactly as written; literalFlag records the radix prefix.
     *
     * Throws: ScriptCompileException on malformed literals.
     */
    Token makeIntOrDoubleToken()
    {
        immutable start = _index;
        immutable startpos = _position;
        auto dotCounter = 0;
        auto eCounter = 0;
        Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
        if(peekChar.toLower == 'x')
        {
            lflag = Token.LiteralFlag.HEXADECIMAL;
            advanceChar();
        }
        else if(peekChar.toLower == 'o')
        {
            lflag = Token.LiteralFlag.OCTAL;
            advanceChar();
        }
        else if(peekChar.toLower == 'b')
        {
            lflag = Token.LiteralFlag.BINARY;
            advanceChar();
        }
        // if the lflag was set, the first char has to be 0
        if(lflag != Token.LiteralFlag.NONE && _text[start] != '0')
            throw new ScriptCompileException("Malformed integer literal", Token.createInvalidToken(startpos));

        while(peekChar.charIsValidDigit(lflag))
        {
            advanceChar();
            if(lflag == Token.LiteralFlag.NONE)
            {
                if(currentChar == '.')
                {
                    ++dotCounter;
                    if(dotCounter > 1)
                        throw new ScriptCompileException("Too many decimals in number literal",
                            Token.createInvalidToken(_position));
                }
                else if(currentChar.toLower == 'e')
                {
                    ++eCounter;
                    if(eCounter > 1)
                        throw new ScriptCompileException("Numbers can only have one exponent specifier",
                            Token.createInvalidToken(_position));
                    // optional sign of the exponent
                    if(peekChar == '+' || peekChar == '-')
                        advanceChar();
                    if(!peekChar.isDigit)
                        throw new ScriptCompileException("Exponent specifier must be followed by number",
                            Token.createInvalidToken(_position));
                }
            }
        }
        auto text = _text[start.._index+1];
        // a radix prefix alone ("0x") is not a number
        if(lflag != Token.LiteralFlag.NONE && text.length <= 2)
            throw new ScriptCompileException("Malformed hex/octal/binary integer", Token.createInvalidToken(startpos));
        Token resultToken;
        if(dotCounter == 0 && eCounter == 0)
            resultToken = Token(Token.Type.INTEGER, startpos, text);
        else
            resultToken = Token(Token.Type.DOUBLE, startpos, text);
        resultToken.literalFlag = lflag;
        return resultToken;
    }

    /**
     * Lexes a string literal delimited by ', " or `. Backtick strings are flagged as
     * TEMPLATE_STRING and may contain line breaks. When the three preceding tokens are
     * `String . raw`, those tokens are removed from previous and escape sequences in the
     * literal are left unprocessed.
     *
     * Throws: ScriptCompileException on a missing close quote, a line break in a regular
     *         string, or an invalid escape sequence.
     */
    Token makeStringToken(ref Token[] previous)
    {
        immutable closeQuote = currentChar;
        auto startpos = _position;
        advanceChar();
        string text = "";
        bool escapeChars = true;
        if(previous.length >= 3)
        {
            if(previous[$-1].isIdentifier("raw") &&
               previous[$-2].type == Token.Type.DOT &&
               previous[$-3].isIdentifier("String"))
            {
                escapeChars = false;
                previous = previous[0..$-3];
            }
        }
        Token.LiteralFlag lflag = Token.LiteralFlag.NONE;
        if(closeQuote == '`')
            lflag = Token.LiteralFlag.TEMPLATE_STRING;
        while(currentChar != closeQuote)
        {
            if(currentChar == '\0')
                throw new ScriptCompileException("Missing close quote for string literal",
                    Token.createInvalidToken(_position, text));
            else if(currentChar == '\n' && lflag != Token.LiteralFlag.TEMPLATE_STRING)
                throw new ScriptCompileException("Line breaks inside regular string literals are not allowed",
                    Token.createInvalidToken(_position, text));
            else if(currentChar == '\\' && escapeChars)
            {
                advanceChar();
                if(currentChar in ESCAPE_CHARS)
                    text ~= ESCAPE_CHARS[currentChar];
                else if(currentChar == 'u')
                {
                    // \uXXXX (at most four hex digits) or \u{X...} (any number of hex digits).
                    // The digits are read by looking ahead so the cursor ends on the last
                    // character of the escape and the position stays accurate.
                    string accum = "";
                    bool usingBraces = false;
                    int limitCounter;
                    immutable LIMIT = 4; // without the braces
                    if(peekChar == '{')
                    {
                        advanceChar();
                        usingBraces = true;
                    }
                    while(peekChar.charIsValidDigit(Token.LiteralFlag.HEXADECIMAL))
                    {
                        if(limitCounter >= LIMIT && !usingBraces)
                            break;
                        advanceChar();
                        accum ~= currentChar;
                        if(!usingBraces)
                            ++limitCounter;
                    }
                    if(peekChar == '}' && usingBraces)
                        advanceChar();
                    try
                    {
                        dchar result = cast(dchar)to!uint(accum, 16);
                        char[] buf;
                        encode(buf, result);
                        text ~= buf;
                    }
                    catch(Exception ex)
                    {
                        throw new ScriptCompileException("Invalid UTF sequence in \\u char",
                            Token.createInvalidToken(_position, accum));
                    }
                }
                else if(currentChar == 'x')
                {
                    // \xHH: exactly two characters are taken as a hexadecimal byte value
                    // NOTE(review): values above 0x7F are appended as a single raw byte,
                    // which is not valid UTF-8 on its own - confirm this is intended.
                    advanceChar();
                    string accum = "";
                    accum ~= currentChar;
                    advanceChar();
                    accum ~= currentChar;
                    try
                    {
                        char result = cast(char)to!ubyte(accum, 16);
                        text ~= result;
                    }
                    catch(Exception ex)
                    {
                        throw new ScriptCompileException("Invalid hexadecimal number in \\x char",
                            Token.createInvalidToken(_position, accum));
                    }
                }
                else
                    throw new ScriptCompileException("Unknown escape character " ~ currentChar,
                        Token.createInvalidToken(_position));
            }
            else
                text ~= currentChar;
            advanceChar();
        }
        auto tok = Token(Token.Type.STRING, startpos, text);
        tok.literalFlag = lflag;
        return tok;
    }

    /// Lexes one of: > >= >> >>= >>> >>>=
    Token makeRAngleBracketToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.GE, startpos);
        }
        else if(peekChar == '>')
        {
            advanceChar();
            if(peekChar == '>')
            {
                advanceChar();
                if(peekChar == '=')
                {
                    advanceChar();
                    return Token(Token.Type.BURS_ASSIGN, startpos);
                }
                else
                {
                    return Token(Token.Type.BIT_URSHIFT, startpos);
                }
            }
            else if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.BRS_ASSIGN, startpos);
            }
            else
            {
                return Token(Token.Type.BIT_RSHIFT, startpos);
            }
        }
        else
        {
            return Token(Token.Type.GT, startpos);
        }
    }

    /// Lexes one of: < <= << <<=
    Token makeLAngleBracketToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.LE, startpos);
        }
        else if(peekChar == '<')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.BLS_ASSIGN, startpos);
            }
            else
            {
                return Token(Token.Type.BIT_LSHIFT, startpos);
            }
        }
        else
        {
            return Token(Token.Type.LT, startpos);
        }
    }

    /// Lexes one of: = == === =>
    Token makeEqualToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.STRICT_EQUALS, startpos);
            }
            else
            {
                return Token(Token.Type.EQUALS, startpos);
            }
        }
        else if(peekChar == '>')
        {
            advanceChar();
            return Token(Token.Type.ARROW, startpos);
        }
        else
        {
            return Token(Token.Type.ASSIGN, startpos);
        }
    }

    /// Lexes one of: ! != !==
    Token makeNotToken()
    {
        auto startpos = _position;
        if(peekChar == '=')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.STRICT_NEQUALS, startpos);
            }
            else
            {
                return Token(Token.Type.NEQUALS, startpos);
            }
        }
        else
        {
            return Token(Token.Type.NOT, startpos);
        }
    }

    /// Lexes one of: & && &=
    Token makeAndToken()
    {
        auto startpos = _position;
        if(peekChar == '&')
        {
            advanceChar();
            return Token(Token.Type.AND, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.BAND_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_AND, startpos);
        }
    }

    /// Lexes one of: | || |=
    Token makeOrToken()
    {
        auto startpos = _position;
        if(peekChar == '|')
        {
            advanceChar();
            return Token(Token.Type.OR, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.BOR_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_OR, startpos);
        }
    }

    /// Lexes one of: + ++ +=
    Token makePlusToken()
    {
        auto startpos = _position;
        if(peekChar == '+')
        {
            advanceChar();
            return Token(Token.Type.INC, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.PLUS_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.PLUS, startpos);
        }
    }

    /// Lexes one of: - -- -=
    Token makeDashToken()
    {
        auto startpos = _position;
        if(peekChar == '-')
        {
            advanceChar();
            return Token(Token.Type.DEC, startpos);
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.DASH_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.DASH, startpos);
        }
    }

    /// Lexes one of: * *= ** **=
    Token makeStarToken()
    {
        auto startpos = _position;
        if(peekChar == '*')
        {
            advanceChar();
            if(peekChar == '=')
            {
                advanceChar();
                return Token(Token.Type.POW_ASSIGN, startpos);
            }
            else
            {
                return Token(Token.Type.POW, startpos);
            }
        }
        else if(peekChar == '=')
        {
            advanceChar();
            return Token(Token.Type.STAR_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.STAR, startpos);
        }
    }

    /// Lexes one of: % %=
    Token makePercentToken()
    {
        if(peekChar == '=')
        {
            immutable startpos = _position;
            advanceChar();
            return Token(Token.Type.PERCENT_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.PERCENT, _position);
        }
    }

    /// Lexes one of: ^ ^=
    Token makeXorToken()
    {
        if(peekChar == '=')
        {
            immutable startpos = _position;
            advanceChar();
            return Token(Token.Type.BXOR_ASSIGN, startpos);
        }
        else
        {
            return Token(Token.Type.BIT_XOR, _position);
        }
    }

    /**
     * Lexes '.', '..' (returned as two DOT tokens) or '...' (a single TDOT token).
     */
    Token[] makeDotTokens()
    {
        immutable startPos = _position;
        if(peekChar == '.')
        {
            advanceChar();
            if(peekChar == '.')
            {
                advanceChar();
                return [Token(Token.Type.TDOT, startPos)];
            }
            else
            {
                return [Token(Token.Type.DOT, startPos), Token(Token.Type.DOT, _position)];
            }
        }
        else
        {
            return [Token(Token.Type.DOT, _position)];
        }
    }

    /**
     * Handles everything that can start with '/': block comments, line comments,
     * regex literals, '/=' and '/'. Comments produce no token. Returns the token
     * array with any new token appended.
     *
     * Throws: ScriptCompileException if a regex literal is malformed or invalid.
     */
    Token[] handleFSlash(Token[] tokens)
    {
        if(peekChar == '*') // block comment
        {
            advanceChar(); // now on the opening '*'
            // Step one character at a time and stop on the '*' of a closing "*/".
            // Checking every '*' individually makes comments ending in "**/" close properly.
            while(peekChar != '\0')
            {
                advanceChar();
                if(currentChar == '*' && peekChar == '/')
                    break;
            }
            // move onto the closing '/' (or past the end for an unterminated comment)
            advanceChar();
        }
        else if(peekChar == '/') // comment
        {
            advanceChar();
            while(peekChar != '\n' && peekChar != '\0')
            {
                advanceChar();
            }
        }
        else if(canMakeRegex(tokens))
        {
            string accum = "";
            auto startPos = _position;
            accum ~= currentChar;
            bool gettingFlags = false;
            // Scan by looking ahead so the cursor ends on the last character of the
            // literal and the position never has to be rewound.
            while(peekChar != '\0')
            {
                if(!gettingFlags)
                {
                    advanceChar();
                    accum ~= currentChar;
                    if(currentChar == '\\')
                    {
                        // an escaped character is copied verbatim, even if it is '/'
                        if(peekChar != '\0')
                        {
                            advanceChar();
                            accum ~= currentChar;
                        }
                    }
                    else if(currentChar == '/')
                    {
                        // closing slash: everything alphabetic after it is a flag
                        gettingFlags = true;
                    }
                }
                else
                {
                    if(!isAlpha(peekChar))
                        break;
                    advanceChar();
                    accum ~= currentChar;
                }
            }
            bool valid;
            try
            {
                auto extracted = extract(accum);
                valid = isValid(extracted[0], extracted[1]);
            }
            catch(Exception ex)
            {
                throw new ScriptCompileException("Malformed regex literal", Token.createInvalidToken(startPos, accum));
            }
            if(!valid)
                throw new ScriptCompileException("Invalid regex literal", Token.createInvalidToken(startPos, accum));
            tokens ~= Token(Token.Type.REGEX, startPos, accum);
        }
        else if(peekChar == '=')
        {
            immutable startpos = _position;
            advanceChar();
            tokens ~= Token(Token.Type.FSLASH_ASSIGN, startpos);
        }
        else
        {
            tokens ~= Token(Token.Type.FSLASH, _position);
        }

        return tokens;
    }

    /// Position of the character under the cursor
    Position _position = {1, 1};
    /// The source text being lexed
    string _text;
    /// Index of the character under the cursor
    size_t _index = 0;
}

unittest
{
    auto lexer = Lexer("1.2 34 5.e-99 'foo' ");
    auto tokens = lexer.tokenize();
    assert(tokens[0].type == Token.Type.DOUBLE);
    assert(tokens[1].type == Token.Type.INTEGER);
    assert(tokens[2].type == Token.Type.DOUBLE);
    assert(tokens[3].type == Token.Type.STRING && tokens[3].text == "foo");
    // TODO complete unit tests of every token type
}

// Every operator token carries the position of its first character
unittest
{
    auto tokens = Lexer("a <<= b === c").tokenize();
    assert(tokens.length == 5);
    assert(tokens[1].type == Token.Type.BLS_ASSIGN);
    assert(tokens[1].position == Position(1, 3));
    assert(tokens[3].type == Token.Type.STRICT_EQUALS);
    assert(tokens[3].position == Position(1, 9));
}

// Line and column tracking across a line break that directly follows an identifier
unittest
{
    auto tokens = Lexer("foo\nbar").tokenize();
    assert(tokens.length == 2);
    assert(tokens[0].position == Position(1, 1));
    assert(tokens[1].isIdentifier("bar"));
    assert(tokens[1].position == Position(2, 1));
}

// Block comments that end in "**/" are closed correctly
unittest
{
    auto tokens = Lexer("/* a **/ x").tokenize();
    assert(tokens.length == 1);
    assert(tokens[0].isIdentifier("x"));
}

// Upper case exponent markers and \u escapes
unittest
{
    auto numbers = Lexer("1E5").tokenize();
    assert(numbers.length == 1);
    assert(numbers[0].type == Token.Type.DOUBLE && numbers[0].text == "1E5");

    auto strings = Lexer(`'\u0041b'`).tokenize();
    assert(strings.length == 1);
    assert(strings[0].type == Token.Type.STRING && strings[0].text == "Ab");
}

// isAssignmentOperator is callable on const tokens
unittest
{
    const Token assign = Token.createFakeToken(Token.Type.PLUS_ASSIGN, "");
    const Token plus = Token.createFakeToken(Token.Type.PLUS, "");
    assert(assign.isAssignmentOperator());
    assert(!plus.isAssignmentOperator());
}