1 /** 2 This module implements the Token and Lexer structs 3 4 ──────────────────────────────────────────────────────────────────────────────── 5 6 Copyright (C) 2021 pillager86.rf.gd 7 8 This program is free software: you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation, either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along with 18 this program. If not, see <https://www.gnu.org/licenses/>. 19 */ 20 module mildew.lexer; 21 22 import mildew.exceptions: ScriptCompileException; 23 import mildew.util.regex; 24 25 import std.ascii; // temp until unicode support 26 import std.container.rbtree; 27 import std.conv: to; 28 import std.format: format; 29 import std.utf: encode; 30 31 /** 32 * This struct represents the line and column number of a token, starting at 1. 33 */ 34 struct Position 35 { 36 /// Line and column number. 37 int line, column; 38 39 /// Returns a string representing the line and column number 40 string toString() const 41 { 42 return format("line %s, column %s", line, column); 43 } 44 45 /// Determines line and column number based on char that is read 46 void advance(char ch) 47 { 48 if(ch == '\0') 49 { 50 return; 51 } 52 else if(ch == '\n') 53 { 54 ++line; 55 column = 1; 56 } 57 else 58 { 59 ++column; 60 } 61 } 62 } 63 64 /** 65 * This struct represents a token, a fundamental building block of all scripts. The code of a script 66 * is first separated by token so that the parser can analyze each token. 67 */ 68 struct Token 69 { 70 /** 71 * The type of a token. 72 */ 73 enum Type 74 { 75 EOF, KEYWORD, INTEGER, DOUBLE, STRING, IDENTIFIER, REGEX, 76 NOT, AND, OR, GT, GE, LT, LE, 77 EQUALS, NEQUALS, STRICT_EQUALS, STRICT_NEQUALS, 78 79 ASSIGN, 80 POW_ASSIGN, STAR_ASSIGN, FSLASH_ASSIGN, PERCENT_ASSIGN, 81 PLUS_ASSIGN, DASH_ASSIGN, 82 BAND_ASSIGN, BXOR_ASSIGN, BOR_ASSIGN, BLS_ASSIGN, BRS_ASSIGN, BURS_ASSIGN, 83 84 PLUS, DASH, STAR, FSLASH, PERCENT, POW, DOT, TDOT, 85 INC, DEC, // ++ and -- 86 BIT_AND, BIT_XOR, BIT_OR, BIT_NOT, BIT_LSHIFT, BIT_RSHIFT, BIT_URSHIFT, 87 LPAREN, RPAREN, LBRACE, RBRACE, LBRACKET, RBRACKET, 88 SEMICOLON, COMMA, LABEL, QUESTION, COLON, ARROW, 89 NULLC, // null coalesce 90 91 INVALID 92 } 93 94 /** 95 * This enum is for literal value tokens that require special handling by the parser 96 */ 97 enum LiteralFlag 98 { 99 NONE, BINARY, OCTAL, HEXADECIMAL, TEMPLATE_STRING 100 } 101 102 /// Type of token 103 Type type; 104 /// Position where token occurs 105 Position position; 106 /// Optional text for keywords and identifiers 107 string text; 108 /// Optional flag for integer literals. 109 LiteralFlag literalFlag = LiteralFlag.NONE; 110 111 /** 112 * Returns a string representing the type of the token and the optional text if present. 113 */ 114 string toString() const 115 { 116 string str = format("[%s", type.to!string); 117 if(text != null) 118 str ~= "|" ~ text; 119 str ~= "]"; 120 return str; 121 } 122 123 /** 124 * Returns a textual representation of the token as it was found in the original script source code. 125 */ 126 string symbol() const 127 { 128 final switch(type) 129 { 130 case Type.EOF: 131 return "\0"; 132 case Type.KEYWORD: case Type.INTEGER: case Type.DOUBLE: case Type.STRING: case Type.IDENTIFIER: case Type.REGEX: 133 return text; 134 case Type.NOT: return "!"; 135 case Type.AND: return "&&"; 136 case Type.OR: return "||"; 137 case Type.GT: return ">"; 138 case Type.GE: return ">="; 139 case Type.LT: return "<"; 140 case Type.LE: return "<="; 141 case Type.EQUALS: return "=="; 142 case Type.NEQUALS: return "!="; 143 case Type.STRICT_EQUALS: return "==="; 144 case Type.STRICT_NEQUALS: return "!=="; 145 case Type.ASSIGN: return "="; 146 case Type.POW_ASSIGN: return "**="; 147 case Type.STAR_ASSIGN: return "*="; 148 case Type.FSLASH_ASSIGN: return "/="; 149 case Type.PERCENT_ASSIGN: return "%="; 150 case Type.PLUS_ASSIGN: return "+="; 151 case Type.DASH_ASSIGN: return "-="; 152 case Type.BAND_ASSIGN: return "&="; 153 case Type.BXOR_ASSIGN: return "^="; 154 case Type.BOR_ASSIGN: return "|="; 155 case Type.BLS_ASSIGN: return "<<="; 156 case Type.BRS_ASSIGN: return ">>="; 157 case Type.BURS_ASSIGN: return ">>>="; 158 case Type.PLUS: return "+"; 159 case Type.DASH: return "-"; 160 case Type.STAR: return "*"; 161 case Type.FSLASH: return "/"; 162 case Type.PERCENT: return "%"; 163 case Type.POW: return "**"; 164 case Type.DOT: return "."; 165 case Type.TDOT: return "..."; 166 case Type.INC: return "++"; 167 case Type.DEC: return "--"; 168 case Type.BIT_AND: return "&"; 169 case Type.BIT_XOR: return "^"; 170 case Type.BIT_OR: return "|"; 171 case Type.BIT_NOT: return "~"; 172 case Type.BIT_LSHIFT: return "<<"; 173 case Type.BIT_RSHIFT: return ">>"; 174 case Type.BIT_URSHIFT: return ">>>"; 175 case Type.LPAREN: return "("; 176 case Type.RPAREN: return ")"; 177 case Type.LBRACE: return "{"; 178 case Type.RBRACE: return "}"; 179 case Type.LBRACKET: return "["; 180 case Type.RBRACKET: return "]"; 181 case Type.SEMICOLON: return ";"; 182 case Type.COMMA: return ","; 183 case Type.LABEL: return text ~ ":"; 184 case Type.QUESTION: return "?"; 185 case Type.COLON: return ":"; 186 case Type.ARROW: return "=>"; 187 case Type.NULLC: return "??"; 188 case Type.INVALID: return "#"; 189 } 190 } 191 192 /** 193 * Returns true if a token is both a keyword and a specific keyword. 194 */ 195 bool isKeyword(in string keyword) const 196 { 197 return (type == Type.KEYWORD && text == keyword); 198 } 199 200 /** 201 * Checks for a specific identifier 202 */ 203 bool isIdentifier(in string id) const 204 { 205 return (type == Type.IDENTIFIER && text == id); 206 } 207 208 /** 209 * Returns true if the token is an assignment operator such as =, +=, or -=, etc. 210 */ 211 bool isAssignmentOperator() 212 { 213 return (type == Type.ASSIGN || 214 type == Type.POW_ASSIGN || 215 type == Type.STAR_ASSIGN || 216 type == Type.FSLASH_ASSIGN || 217 type == Type.PERCENT_ASSIGN || 218 type == Type.PLUS_ASSIGN || 219 type == Type.DASH_ASSIGN || 220 type == Type.BAND_ASSIGN || 221 type == Type.BXOR_ASSIGN || 222 type == Type.BOR_ASSIGN || 223 type == Type.BLS_ASSIGN || 224 type == Type.BRS_ASSIGN || 225 type == Type.BURS_ASSIGN 226 ); 227 } 228 229 /** 230 * Generates an invalid token at the given position. This is used by the Lexer to throw 231 * an exception that requires a token. 232 */ 233 static Token createInvalidToken(in Position pos, in string text="") 234 { 235 auto token = Token(Token.Type.INVALID, pos, text); 236 return token; 237 } 238 239 /** 240 * Used by the parser and compiler 241 */ 242 static Token createFakeToken(in Type t, in string txt) 243 { 244 Token tok; 245 tok.type = t; 246 tok.position = Position(0,0); 247 tok.text = txt; 248 return tok; 249 } 250 } 251 252 private bool startsKeywordOrIdentifier(in char ch) 253 { 254 // TODO support unicode by converting string to dchar 255 return ch.isAlpha || ch == '_' || ch == '$'; 256 } 257 258 private bool continuesKeywordOrIdentifier(in char ch) 259 { 260 // TODO support unicode by converting string to dchar 261 return ch.isAlphaNum || ch == '_' || ch == '$'; 262 } 263 264 private bool charIsValidDigit(in char ch, in Token.LiteralFlag lflag) 265 { 266 if(lflag == Token.LiteralFlag.NONE) 267 return ch.isDigit || ch == '.' || ch == 'e'; 268 else if(lflag == Token.LiteralFlag.HEXADECIMAL) 269 return ch.isDigit || (ch.toLower >= 'a' && ch.toLower <= 'f'); 270 else if(lflag == Token.LiteralFlag.OCTAL) 271 return (ch >= '0' && ch <= '7'); 272 else if(lflag == Token.LiteralFlag.BINARY) 273 return ch == '0' || ch == '1'; 274 return false; 275 } 276 277 /// Lexes code and returns the individual tokens 278 struct Lexer 279 { 280 public: 281 /// Constructor takes code as text to tokenize 282 this(string code) 283 { 284 _text = code; 285 } 286 287 /// Returns tokens from lexing a string of code 288 Token[] tokenize() 289 { 290 Token[] tokens = []; 291 if (_text == "") 292 return tokens; 293 while(_index < _text.length) 294 { 295 // ignore white space 296 while(currentChar.isWhite()) 297 advanceChar(); 298 if(currentChar.startsKeywordOrIdentifier) 299 tokens ~= makeIdKwOrLabel(tokens); 300 else if(currentChar.isDigit) 301 tokens ~= makeIntOrDoubleToken(); 302 else if(currentChar == '\'' || currentChar == '"' || currentChar == '`') 303 tokens ~= makeStringToken(tokens); 304 else if(currentChar == '>') 305 tokens ~= makeRAngleBracketToken(); 306 else if(currentChar == '<') 307 tokens ~= makeLAngleBracketToken(); 308 else if(currentChar == '=') 309 tokens ~= makeEqualToken(); 310 else if(currentChar == '!') 311 tokens ~= makeNotToken(); 312 else if(currentChar == '&') 313 tokens ~= makeAndToken(); 314 else if(currentChar == '|') 315 tokens ~= makeOrToken(); 316 else if(currentChar == '+') 317 tokens ~= makePlusToken(); 318 else if(currentChar == '-') 319 tokens ~= makeDashToken(); 320 else if(currentChar == '*') 321 tokens ~= makeStarToken(); 322 else if(currentChar == '/') 323 tokens = handleFSlash(tokens); 324 else if(currentChar == '%') 325 tokens ~= makePercentToken(); 326 else if(currentChar == '^') 327 tokens ~= makeXorToken(); 328 else if(currentChar == '~') 329 tokens ~= Token(Token.Type.BIT_NOT, _position); 330 else if(currentChar == '(') 331 tokens ~= Token(Token.Type.LPAREN, _position); 332 else if(currentChar == ')') 333 tokens ~= Token(Token.Type.RPAREN, _position); 334 else if(currentChar == '{') 335 tokens ~= Token(Token.Type.LBRACE, _position); 336 else if(currentChar == '}') 337 tokens ~= Token(Token.Type.RBRACE, _position); 338 else if(currentChar == '[') 339 tokens ~= Token(Token.Type.LBRACKET, _position); 340 else if(currentChar == ']') 341 tokens ~= Token(Token.Type.RBRACKET, _position); 342 else if(currentChar == ';') 343 tokens ~= Token(Token.Type.SEMICOLON, _position); 344 else if(currentChar == ',') 345 tokens ~= Token(Token.Type.COMMA, _position); 346 else if(currentChar == '.') 347 tokens ~= makeDotTokens(); 348 else if(currentChar == ':') 349 tokens ~= Token(Token.Type.COLON, _position); 350 else if(currentChar == '?') 351 tokens ~= makeQuestionToken(); 352 else if(currentChar == '\0') 353 tokens ~= Token(Token.Type.EOF, _position); 354 else 355 throw new ScriptCompileException("Invalid character " ~ currentChar, 356 Token.createInvalidToken(_position, [currentChar])); 357 advanceChar(); 358 } 359 return tokens; 360 } 361 362 /// Hash table of keywords 363 static immutable KEYWORDS = redBlackTree( 364 "true", "false", "undefined", "null", 365 "var", "let", "const", 366 "if", "else", "while", "do", "for", "in", 367 "switch", "case", "default", 368 "break", "continue", "return", 369 "function", "class", "super", "extends", 370 "new", "delete", "typeof", "instanceof", 371 "throw", "try", "catch", "finally", 372 "yield" 373 ); 374 375 /// AA of look up for escape chars based on character after \ 376 static immutable char[char] ESCAPE_CHARS; 377 378 /// Initializes the associative array of escape chars 379 shared static this() 380 { 381 ESCAPE_CHARS = [ 382 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', 383 '0': '\0', '\'': '\'', '"': '"', '\\': '\\' 384 ]; 385 } 386 387 private: 388 389 void advanceChar() 390 { 391 ++_index; 392 _position.advance(currentChar()); 393 } 394 395 char currentChar() 396 { 397 if(_index < _text.length) 398 return _text[_index]; 399 else 400 return '\0'; 401 } 402 403 char peekChar() 404 { 405 if(_index + 1 < _text.length) 406 return _text[_index + 1]; 407 else 408 return '\0'; 409 } 410 411 bool canMakeRegex(Token[] tokens) 412 { 413 if(tokens.length == 0) 414 return true; 415 switch(tokens[$-1].type) 416 { 417 case Token.Type.IDENTIFIER: 418 case Token.Type.INTEGER: 419 case Token.Type.DOUBLE: 420 case Token.Type.STRING: 421 case Token.Type.RBRACKET: 422 case Token.Type.RPAREN: 423 case Token.Type.INC: 424 case Token.Type.DEC: 425 return false; 426 case Token.Type.KEYWORD: 427 switch(tokens[$-1].text) 428 { 429 case "null": 430 case "true": 431 case "false": 432 return false; 433 default: 434 return true; 435 } 436 default: 437 return true; 438 } 439 } 440 441 Token makeIdKwOrLabel(Token[] tokens) 442 { 443 immutable start = _index; 444 immutable startpos = _position; 445 advanceChar(); 446 while(currentChar.continuesKeywordOrIdentifier) 447 advanceChar(); 448 auto text = _text[start.._index]; 449 --_index; // UGLY but IDK what else to do 450 // first check for keyword, that can't be a label 451 452 // these words can be used as object members but not as labels 453 if(text == "return" || text == "throw" || text == "delete" 454 || text == "catch" || text == "finally") 455 { 456 if(tokens.length > 0 && tokens[$-1].type == Token.Type.DOT) 457 return Token(Token.Type.IDENTIFIER, startpos, text); 458 } 459 460 if(text in KEYWORDS) 461 { 462 return Token(Token.Type.KEYWORD, startpos, text); 463 } 464 else if(peekChar == ':') 465 { 466 advanceChar(); 467 return Token(Token.Type.LABEL, startpos, text); 468 } 469 else 470 { 471 return Token(Token.Type.IDENTIFIER, startpos, text); 472 } 473 } 474 475 Token makeIntOrDoubleToken() 476 { 477 immutable start = _index; 478 immutable startpos = _position; 479 auto dotCounter = 0; 480 auto eCounter = 0; 481 Token.LiteralFlag lflag = Token.LiteralFlag.NONE; 482 if(peekChar.toLower == 'x') 483 { 484 lflag = Token.LiteralFlag.HEXADECIMAL; 485 advanceChar(); 486 } 487 else if(peekChar.toLower == 'o') 488 { 489 lflag = Token.LiteralFlag.OCTAL; 490 advanceChar(); 491 } 492 else if(peekChar.toLower == 'b') 493 { 494 lflag = Token.LiteralFlag.BINARY; 495 advanceChar(); 496 } 497 // if the lflag was set, the first char has to be 0 498 if(lflag != Token.LiteralFlag.NONE && _text[start] != '0') 499 throw new ScriptCompileException("Malformed integer literal", Token.createInvalidToken(startpos)); 500 501 while(peekChar.charIsValidDigit(lflag)) 502 { 503 advanceChar(); 504 if(lflag == Token.LiteralFlag.NONE) 505 { 506 if(currentChar == '.') 507 { 508 ++dotCounter; 509 if(dotCounter > 1) 510 throw new ScriptCompileException("Too many decimals in number literal", 511 Token.createInvalidToken(_position)); 512 } 513 else if(currentChar.toLower == 'e') 514 { 515 ++eCounter; 516 if(eCounter > 1) 517 throw new ScriptCompileException("Numbers may only have one exponent specifier", 518 Token.createInvalidToken(_position)); 519 if(peekChar == '+' || peekChar == '-') 520 advanceChar(); 521 if(!peekChar.isDigit) 522 throw new ScriptCompileException("Exponent specifier must be followed by number", 523 Token.createInvalidToken(_position)); 524 } 525 } 526 } 527 auto text = _text[start.._index+1]; 528 if(lflag != Token.LiteralFlag.NONE && text.length <= 2) 529 throw new ScriptCompileException("Malformed hex/octal/binary integer", Token.createInvalidToken(startpos)); 530 Token resultToken; 531 if(dotCounter == 0 && eCounter == 0) 532 resultToken = Token(Token.Type.INTEGER, startpos, text); 533 else 534 resultToken = Token(Token.Type.DOUBLE, startpos, text); 535 resultToken.literalFlag = lflag; 536 return resultToken; 537 } 538 539 Token makeStringToken(ref Token[] previous) 540 { 541 immutable closeQuote = currentChar; 542 auto startpos = _position; 543 advanceChar(); 544 string text = ""; 545 bool escapeChars = true; 546 if(previous.length >= 3) 547 { 548 if(previous[$-1].isIdentifier("raw") && 549 previous[$-2].type == Token.Type.DOT && 550 previous[$-3].isIdentifier("String")) 551 { 552 escapeChars = false; 553 previous = previous[0.. $-3]; 554 } 555 } 556 Token.LiteralFlag lflag = Token.LiteralFlag.NONE; 557 if(closeQuote == '`') 558 lflag = Token.LiteralFlag.TEMPLATE_STRING; 559 while(currentChar != closeQuote) 560 { 561 if(currentChar == '\0') 562 throw new ScriptCompileException("Missing close quote for string literal", 563 Token.createInvalidToken(_position, text)); 564 else if(currentChar == '\n' && lflag != Token.LiteralFlag.TEMPLATE_STRING) 565 throw new ScriptCompileException("Line breaks inside regular string literals are not allowed", 566 Token.createInvalidToken(_position, text)); 567 else if(currentChar == '\\' && escapeChars) 568 { 569 advanceChar(); 570 if(currentChar in ESCAPE_CHARS) 571 text ~= ESCAPE_CHARS[currentChar]; 572 else if(currentChar == 'u') 573 { 574 advanceChar(); 575 string accum = ""; 576 bool usingBraces = false; 577 int limitCounter; 578 immutable LIMIT = 4; // without the braces 579 if(currentChar == '{') 580 { 581 advanceChar(); 582 usingBraces = true; 583 } 584 while(currentChar.charIsValidDigit(Token.LiteralFlag.HEXADECIMAL)) 585 { 586 if(limitCounter >= LIMIT && !usingBraces) 587 break; 588 accum ~= currentChar; 589 advanceChar(); 590 if(!usingBraces) 591 ++limitCounter; 592 } 593 if(currentChar == '}' && usingBraces) 594 advanceChar(); 595 --_index; 596 try 597 { 598 dchar result = cast(dchar)to!uint(accum, 16); 599 char[] buf; 600 encode(buf, result); 601 text ~= buf; 602 } 603 catch(Exception ex) 604 { 605 throw new ScriptCompileException("Invalid UTF sequence in \\u char", 606 Token.createInvalidToken(_position, accum)); 607 } 608 } 609 else if(currentChar == 'x') 610 { 611 advanceChar(); 612 string accum = ""; 613 accum ~= currentChar; 614 advanceChar(); 615 accum ~= currentChar; 616 try 617 { 618 char result = cast(char)to!ubyte(accum, 16); 619 text ~= result; 620 } 621 catch(Exception ex) 622 { 623 throw new ScriptCompileException("Invalid hexadecimal number in \\x char", 624 Token.createInvalidToken(_position, accum)); 625 } 626 } 627 else 628 throw new ScriptCompileException("Unknown escape character " ~ currentChar, 629 Token.createInvalidToken(_position)); 630 } 631 else 632 text ~= currentChar; 633 advanceChar(); 634 } 635 auto tok = Token(Token.Type.STRING, startpos, text); 636 tok.literalFlag = lflag; 637 return tok; 638 } 639 640 Token makeRAngleBracketToken() 641 { 642 auto startpos = _position; 643 if(peekChar == '=') 644 { 645 advanceChar(); 646 return Token(Token.Type.GE, startpos); 647 } 648 else if(peekChar == '>') 649 { 650 advanceChar(); 651 if(peekChar == '>') 652 { 653 advanceChar(); 654 if(peekChar == '=') 655 { 656 advanceChar(); 657 return Token(Token.Type.BURS_ASSIGN, startpos); 658 } 659 else 660 { 661 return Token(Token.Type.BIT_URSHIFT, startpos); 662 } 663 } 664 else if(peekChar == '=') 665 { 666 advanceChar(); 667 return Token(Token.Type.BRS_ASSIGN, startpos); 668 } 669 else 670 { 671 return Token(Token.Type.BIT_RSHIFT, startpos); 672 } 673 } 674 else 675 { 676 return Token(Token.Type.GT, startpos); 677 } 678 } 679 680 Token makeLAngleBracketToken() 681 { 682 auto startpos = _position; 683 if(peekChar == '=') 684 { 685 advanceChar(); 686 return Token(Token.Type.LE, startpos); 687 } 688 else if(peekChar == '<') 689 { 690 advanceChar(); 691 if(peekChar == '=') 692 { 693 advanceChar(); 694 return Token(Token.Type.BLS_ASSIGN); 695 } 696 else 697 { 698 return Token(Token.Type.BIT_LSHIFT, startpos); 699 } 700 } 701 else 702 { 703 return Token(Token.Type.LT, startpos); 704 } 705 } 706 707 Token makeEqualToken() 708 { 709 auto startpos = _position; 710 if(peekChar == '=') 711 { 712 advanceChar(); 713 if(peekChar == '=') 714 { 715 advanceChar(); 716 return Token(Token.Type.STRICT_EQUALS); 717 } 718 else 719 { 720 return Token(Token.Type.EQUALS, startpos); 721 } 722 } 723 else if(peekChar == '>') 724 { 725 advanceChar(); 726 return Token(Token.Type.ARROW, startpos); 727 } 728 else 729 { 730 return Token(Token.Type.ASSIGN, startpos); 731 } 732 } 733 734 Token makeNotToken() 735 { 736 auto startpos = _position; 737 if(peekChar == '=') 738 { 739 advanceChar(); 740 if(peekChar == '=') 741 { 742 advanceChar(); 743 return Token(Token.Type.STRICT_NEQUALS, startpos); 744 } 745 else 746 { 747 return Token(Token.Type.NEQUALS, startpos); 748 } 749 } 750 else 751 { 752 return Token(Token.Type.NOT, startpos); 753 } 754 } 755 756 Token makeAndToken() 757 { 758 auto startpos = _position; 759 if(peekChar == '&') 760 { 761 advanceChar(); 762 return Token(Token.Type.AND, startpos); 763 } 764 else if(peekChar == '=') 765 { 766 advanceChar(); 767 return Token(Token.Type.BAND_ASSIGN, startpos); 768 } 769 else 770 { 771 return Token(Token.Type.BIT_AND, startpos); 772 } 773 } 774 775 Token makeOrToken() 776 { 777 auto startpos = _position; 778 if(peekChar == '|') 779 { 780 advanceChar(); 781 return Token(Token.Type.OR, startpos); 782 } 783 else if(peekChar == '=') 784 { 785 advanceChar(); 786 return Token(Token.Type.BOR_ASSIGN, startpos); 787 } 788 else 789 { 790 return Token(Token.Type.BIT_OR, startpos); 791 } 792 } 793 794 Token makePlusToken() 795 { 796 auto startpos = _position; 797 if(peekChar == '+') 798 { 799 advanceChar(); 800 return Token(Token.Type.INC, startpos); 801 } 802 else if(peekChar == '=') 803 { 804 advanceChar(); 805 return Token(Token.Type.PLUS_ASSIGN, startpos); 806 } 807 else 808 { 809 return Token(Token.Type.PLUS, startpos); 810 } 811 } 812 813 Token makeDashToken() 814 { 815 auto startpos = _position; 816 if(peekChar == '-') 817 { 818 advanceChar(); 819 return Token(Token.Type.DEC, startpos); 820 } 821 else if(peekChar == '=') 822 { 823 advanceChar(); 824 return Token(Token.Type.DASH_ASSIGN, startpos); 825 } 826 else 827 { 828 return Token(Token.Type.DASH, startpos); 829 } 830 } 831 832 Token makeStarToken() 833 { 834 auto startpos = _position; 835 if(peekChar == '*') 836 { 837 advanceChar(); 838 if(peekChar == '=') 839 { 840 advanceChar(); 841 return Token(Token.Type.POW_ASSIGN, startpos); 842 } 843 else 844 { 845 return Token(Token.Type.POW, startpos); 846 } 847 } 848 else if(peekChar == '=') 849 { 850 advanceChar(); 851 return Token(Token.Type.STAR_ASSIGN, startpos); 852 } 853 else 854 { 855 return Token(Token.Type.STAR, startpos); 856 } 857 } 858 859 Token makePercentToken() 860 { 861 if(peekChar == '=') 862 { 863 immutable startpos = _position; 864 advanceChar(); 865 return Token(Token.Type.PERCENT_ASSIGN, startpos); 866 } 867 else 868 { 869 return Token(Token.Type.PERCENT, _position); 870 } 871 } 872 873 Token makeXorToken() 874 { 875 if(peekChar == '=') 876 { 877 immutable startpos = _position; 878 advanceChar(); 879 return Token(Token.Type.BXOR_ASSIGN, startpos); 880 } 881 else 882 { 883 return Token(Token.Type.BIT_XOR, _position); 884 } 885 } 886 887 Token[] makeDotTokens() 888 { 889 immutable startPos = _position; 890 if(peekChar == '.') 891 { 892 advanceChar(); 893 if(peekChar == '.') 894 { 895 advanceChar(); 896 return [Token(Token.Type.TDOT, startPos)]; 897 } 898 else 899 { 900 return [Token(Token.Type.DOT, startPos), Token(Token.Type.DOT, _position)]; 901 } 902 } 903 else 904 { 905 return [Token(Token.Type.DOT, _position)]; 906 } 907 } 908 909 Token makeQuestionToken() 910 { 911 immutable startpos = _position; 912 if(peekChar == '?') 913 { 914 advanceChar(); 915 return Token(Token.Type.NULLC, _position); 916 } 917 else 918 { 919 return Token(Token.Type.QUESTION, _position); 920 } 921 } 922 923 Token[] handleFSlash(Token[] tokens) 924 { 925 if(peekChar == '*') // block comment 926 { 927 advanceChar(); 928 while(peekChar != '\0') 929 { 930 if(peekChar == '*') 931 { 932 advanceChar(); 933 if(peekChar == '/') 934 break; 935 } 936 advanceChar(); 937 } 938 advanceChar(); 939 } 940 else if(peekChar == '/') // comment 941 { 942 advanceChar(); 943 while(peekChar != '\n' && peekChar != '\0') 944 { 945 advanceChar(); 946 } 947 } 948 else if(canMakeRegex(tokens)) 949 { 950 string accum = ""; 951 auto startPos = _position; 952 accum ~= currentChar; 953 bool gettingFlags = false; 954 advanceChar(); 955 while(currentChar) 956 { 957 if(!gettingFlags) 958 { 959 if(currentChar == '\\') 960 { 961 accum ~= currentChar; 962 advanceChar(); 963 if(currentChar) 964 { 965 accum ~= currentChar; 966 advanceChar(); 967 } 968 } 969 else if(currentChar == '/') 970 { 971 accum ~= currentChar; 972 advanceChar(); 973 gettingFlags = true; 974 } 975 else 976 { 977 accum ~= currentChar; 978 advanceChar(); 979 } 980 } 981 else 982 { 983 if(!isAlpha(currentChar)) 984 break; 985 accum ~= currentChar; 986 advanceChar(); 987 } 988 } 989 --_index; 990 bool valid; 991 try 992 { 993 auto extracted = extract(accum); 994 valid = isValid(extracted[0], extracted[1]); 995 } 996 catch(Exception ex) 997 { 998 throw new ScriptCompileException("Malformed regex literal", Token.createInvalidToken(startPos, accum)); 999 } 1000 if(!valid) 1001 throw new ScriptCompileException("Invalid regex literal", Token.createInvalidToken(startPos, accum)); 1002 tokens ~= Token(Token.Type.REGEX, startPos, accum); 1003 } 1004 else if(peekChar == '=') 1005 { 1006 immutable startpos = _position; 1007 advanceChar(); 1008 tokens ~= Token(Token.Type.FSLASH_ASSIGN, startpos); 1009 } 1010 else 1011 { 1012 tokens ~= Token(Token.Type.FSLASH, _position); 1013 } 1014 1015 return tokens; 1016 } 1017 1018 Position _position = {1, 1}; 1019 string _text; 1020 size_t _index = 0; 1021 } 1022 1023 unittest 1024 { 1025 auto lexer = Lexer("1.2 34 5.e-99 'foo' "); 1026 auto tokens = lexer.tokenize(); 1027 assert(tokens[0].type == Token.Type.DOUBLE); 1028 assert(tokens[1].type == Token.Type.INTEGER); 1029 assert(tokens[2].type == Token.Type.DOUBLE); 1030 assert(tokens[3].type == Token.Type.STRING && tokens[3].text == "foo"); 1031 // TODO complete unit tests of every token type 1032 }