1 /** 2 * This module implements the Token and Lexer structs 3 */ 4 module mildew.lexer; 5 6 import mildew.exceptions: ScriptCompileException; 7 8 import std.ascii; // temp until unicode support 9 import std.container.rbtree; 10 import std.conv: to; 11 import std.format: format; 12 13 /** 14 * This struct represents the line and column number of a token, starting at 1. 15 */ 16 struct Position 17 { 18 /// Line and column number. 19 int line, column; 20 21 /// Returns a string representing the line and column number 22 string toString() const 23 { 24 return format("line %s, column %s", line, column); 25 } 26 27 /// Determines line and column number based on char that is read 28 void advance(char ch) 29 { 30 if(ch == '\0') 31 { 32 return; 33 } 34 else if(ch == '\n') 35 { 36 ++line; 37 column = 1; 38 } 39 else 40 { 41 ++column; 42 } 43 } 44 } 45 46 /** 47 * This struct represents a token, a fundamental building block of all scripts. The code of a script 48 * is first separated by token so that the parser can analyze each token. 49 */ 50 struct Token 51 { 52 /** 53 * The type of a token. 54 */ 55 enum Type 56 { 57 EOF, KEYWORD, INTEGER, DOUBLE, STRING, IDENTIFIER, 58 NOT, AND, OR, GT, GE, LT, LE, 59 EQUALS, NEQUALS, STRICT_EQUALS, STRICT_NEQUALS, 60 ASSIGN, PLUS_ASSIGN, DASH_ASSIGN, 61 PLUS, DASH, STAR, FSLASH, PERCENT, POW, DOT, 62 INC, DEC, // ++ and -- 63 BIT_AND, BIT_XOR, BIT_OR, BIT_NOT, BIT_LSHIFT, BIT_RSHIFT, BIT_URSHIFT, 64 LPAREN, RPAREN, LBRACE, RBRACE, LBRACKET, RBRACKET, 65 SEMICOLON, COMMA, LABEL, QUESTION, COLON, INVALID 66 } 67 68 /** 69 * This enum is for literal value tokens that require special handling by the parser 70 */ 71 enum LiteralFlag 72 { 73 NONE, BINARY, OCTAL, HEXADECIMAL, TEMPLATE_STRING 74 } 75 76 /// Type of token 77 Type type; 78 /// Position where token occurs 79 Position position; 80 /// Optional text for keywords and identifiers 81 string text; 82 /// Optional flag for integer literals. 83 LiteralFlag literalFlag = LiteralFlag.NONE; 84 85 /** 86 * Returns a string representing the type of the token and the optional text if present. 87 */ 88 string toString() const 89 { 90 string str = format("[%s", type.to!string); 91 if(text != null) 92 str ~= "|" ~ text; 93 str ~= "]"; 94 return str; 95 } 96 97 /** 98 * Returns a textual representation of the token as it was found in the original script source code. 99 */ 100 string symbol() const 101 { 102 final switch(type) 103 { 104 case Type.EOF: 105 return "\0"; 106 case Type.KEYWORD: case Type.INTEGER: case Type.DOUBLE: case Type.STRING: case Type.IDENTIFIER: 107 return text; 108 case Type.NOT: return "!"; 109 case Type.AND: return "&&"; 110 case Type.OR: return "||"; 111 case Type.GT: return ">"; 112 case Type.GE: return ">="; 113 case Type.LT: return "<"; 114 case Type.LE: return "<="; 115 case Type.EQUALS: return "=="; 116 case Type.NEQUALS: return "!="; 117 case Type.STRICT_EQUALS: return "==="; 118 case Type.STRICT_NEQUALS: return "!=="; 119 case Type.ASSIGN: return "="; 120 case Type.PLUS_ASSIGN: return "+="; 121 case Type.DASH_ASSIGN: return "-="; 122 case Type.PLUS: return "+"; 123 case Type.DASH: return "-"; 124 case Type.STAR: return "*"; 125 case Type.FSLASH: return "/"; 126 case Type.PERCENT: return "%"; 127 case Type.POW: return "**"; 128 case Type.DOT: return "."; 129 case Type.INC: return "++"; 130 case Type.DEC: return "--"; 131 case Type.BIT_AND: return "&"; 132 case Type.BIT_XOR: return "^"; 133 case Type.BIT_OR: return "|"; 134 case Type.BIT_NOT: return "~"; 135 case Type.BIT_LSHIFT: return "<<"; 136 case Type.BIT_RSHIFT: return ">>"; 137 case Type.BIT_URSHIFT: return ">>>"; 138 case Type.LPAREN: return "("; 139 case Type.RPAREN: return ")"; 140 case Type.LBRACE: return "{"; 141 case Type.RBRACE: return "}"; 142 case Type.LBRACKET: return "["; 143 case Type.RBRACKET: return "]"; 144 case Type.SEMICOLON: return ";"; 145 case Type.COMMA: return ","; 146 case Type.LABEL: return text ~ ":"; 147 case Type.QUESTION: return "?"; 148 case Type.COLON: return ":"; 149 case Type.INVALID: return "#"; 150 } 151 } 152 153 /** 154 * Returns true if a token is both a keyword and a specific keyword. 155 */ 156 bool isKeyword(in string keyword) const 157 { 158 return (type == Type.KEYWORD && text == keyword); 159 } 160 161 /** 162 * Checks for a specific identifier 163 */ 164 bool isIdentifier(in string id) const 165 { 166 return (type == Type.IDENTIFIER && text == id); 167 } 168 169 /** 170 * Returns true if the token is an assignment operator such as =, +=, or -=, etc. 171 */ 172 bool isAssignmentOperator() 173 { 174 return (type == Type.ASSIGN || type == Type.PLUS_ASSIGN || type == Type.DASH_ASSIGN); 175 } 176 177 /** 178 * Generates an invalid token at the given position. This is used by the Lexer to throw 179 * an exception that requires a token. 180 */ 181 static Token createInvalidToken(in Position pos, in string text="") 182 { 183 auto token = Token(Token.Type.INVALID, pos, text); 184 return token; 185 } 186 187 /** 188 * Used by the parser 189 */ 190 static Token createFakeToken(in Type t, in string txt) 191 { 192 Token tok; 193 tok.type = t; 194 tok.position = Position(0,0); 195 tok.text = txt; 196 return tok; 197 } 198 } 199 200 private bool startsKeywordOrIdentifier(in char ch) 201 { 202 // TODO support unicode by converting string to dchar 203 return ch.isAlpha || ch == '_' || ch == '$'; 204 } 205 206 private bool continuesKeywordOrIdentifier(in char ch) 207 { 208 // TODO support unicode by converting string to dchar 209 return ch.isAlphaNum || ch == '_' || ch == '$'; 210 } 211 212 private bool charIsValidDigit(in char ch, in Token.LiteralFlag lflag) 213 { 214 if(lflag == Token.LiteralFlag.NONE) 215 return ch.isDigit || ch == '.' || ch == 'e'; 216 else if(lflag == Token.LiteralFlag.HEXADECIMAL) 217 return ch.isDigit || (ch.toLower >= 'a' && ch.toLower <= 'f'); 218 else if(lflag == Token.LiteralFlag.OCTAL) 219 return (ch >= '0' && ch <= '7'); 220 else if(lflag == Token.LiteralFlag.BINARY) 221 return ch == '0' || ch == '1'; 222 return false; 223 } 224 225 /// Lexes code and returns the individual tokens 226 struct Lexer 227 { 228 public: 229 /// Constructor takes code as text to tokenize 230 this(string code) 231 { 232 _text = code; 233 } 234 235 /// Returns tokens from lexing a string of code 236 Token[] tokenize() 237 { 238 Token[] tokens = []; 239 if (_text == "") 240 return tokens; 241 while(_index < _text.length) 242 { 243 // ignore white space 244 while(currentChar.isWhite()) 245 advanceChar(); 246 if(currentChar.startsKeywordOrIdentifier) 247 tokens ~= makeIdKwOrLabel(); 248 else if(currentChar.isDigit) 249 tokens ~= makeIntOrDoubleToken(); 250 else if(currentChar == '\'' || currentChar == '"' || currentChar == '`') 251 tokens ~= makeStringToken(); 252 else if(currentChar == '>') 253 tokens ~= makeRAngleBracketToken(); 254 else if(currentChar == '<') 255 tokens ~= makeLAngleBracketToken(); 256 else if(currentChar == '=') 257 tokens ~= makeEqualToken(); 258 else if(currentChar == '!') 259 tokens ~= makeNotToken(); 260 else if(currentChar == '&') 261 tokens ~= makeAndToken(); 262 else if(currentChar == '|') 263 tokens ~= makeOrToken(); 264 else if(currentChar == '+') 265 tokens ~= makePlusToken(); 266 else if(currentChar == '-') 267 tokens ~= makeDashToken(); 268 else if(currentChar == '*') 269 tokens ~= makeStarToken(); 270 else if(currentChar == '/') 271 tokens = handleFSlash(tokens); 272 else if(currentChar == '%') 273 tokens ~= Token(Token.Type.PERCENT, _position); 274 else if(currentChar == '^') 275 tokens ~= Token(Token.Type.BIT_XOR, _position); 276 else if(currentChar == '~') 277 tokens ~= Token(Token.Type.BIT_NOT, _position); 278 else if(currentChar == '(') 279 tokens ~= Token(Token.Type.LPAREN, _position); 280 else if(currentChar == ')') 281 tokens ~= Token(Token.Type.RPAREN, _position); 282 else if(currentChar == '{') 283 tokens ~= Token(Token.Type.LBRACE, _position); 284 else if(currentChar == '}') 285 tokens ~= Token(Token.Type.RBRACE, _position); 286 else if(currentChar == '[') 287 tokens ~= Token(Token.Type.LBRACKET, _position); 288 else if(currentChar == ']') 289 tokens ~= Token(Token.Type.RBRACKET, _position); 290 else if(currentChar == ';') 291 tokens ~= Token(Token.Type.SEMICOLON, _position); 292 else if(currentChar == ',') 293 tokens ~= Token(Token.Type.COMMA, _position); 294 else if(currentChar == '.') 295 tokens ~= Token(Token.Type.DOT, _position); 296 else if(currentChar == ':') 297 tokens ~= Token(Token.Type.COLON, _position); 298 else if(currentChar == '?') 299 tokens ~= Token(Token.Type.QUESTION, _position); 300 else if(currentChar == '\0') 301 tokens ~= Token(Token.Type.EOF, _position); 302 else 303 throw new ScriptCompileException("Invalid character " ~ currentChar, 304 Token.createInvalidToken(_position, [currentChar])); 305 advanceChar(); 306 } 307 return tokens; 308 } 309 310 /// Hash table of keywords 311 static immutable KEYWORDS = redBlackTree( 312 "true", "false", "undefined", "null", 313 "var", "let", "const", 314 "if", "else", "while", "do", "for", "of", "in", 315 "switch", "case", "default", 316 "break", "continue", "return", 317 "function", "class", "super", "extends", 318 "new", "delete", "typeof", "instanceof", 319 "throw", "try", "catch" 320 ); 321 322 /// AA of look up for escape chars based on character after \ 323 static immutable char[char] ESCAPE_CHARS; 324 325 /// Initializes the associative array of escape chars 326 shared static this() 327 { 328 ESCAPE_CHARS = [ 329 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', 330 '0': '\0', '\'': '\'', '"': '"', '\\': '\\' 331 ]; 332 } 333 334 private: 335 336 void advanceChar() 337 { 338 ++_index; 339 _position.advance(currentChar()); 340 } 341 342 char currentChar() 343 { 344 if(_index < _text.length) 345 return _text[_index]; 346 else 347 return '\0'; 348 } 349 350 char peekChar() 351 { 352 if(_index + 1 < _text.length) 353 return _text[_index + 1]; 354 else 355 return '\0'; 356 } 357 358 Token makeIdKwOrLabel() 359 { 360 immutable start = _index; 361 immutable startpos = _position; 362 advanceChar(); 363 while(currentChar.continuesKeywordOrIdentifier) 364 advanceChar(); 365 auto text = _text[start.._index]; 366 --_index; // UGLY but IDK what else to do 367 // first check for keyword, that can't be a label 368 if(text in KEYWORDS) 369 { 370 return Token(Token.Type.KEYWORD, startpos, text); 371 } 372 else if(peekChar == ':') 373 { 374 advanceChar(); 375 return Token(Token.Type.LABEL, startpos, text); 376 } 377 else 378 { 379 return Token(Token.Type.IDENTIFIER, startpos, text); 380 } 381 } 382 383 Token makeIntOrDoubleToken() 384 { 385 immutable start = _index; 386 immutable startpos = _position; 387 auto dotCounter = 0; 388 auto eCounter = 0; 389 Token.LiteralFlag lflag = Token.LiteralFlag.NONE; 390 if(peekChar.toLower == 'x') 391 { 392 lflag = Token.LiteralFlag.HEXADECIMAL; 393 advanceChar(); 394 } 395 else if(peekChar.toLower == 'o') 396 { 397 lflag = Token.LiteralFlag.OCTAL; 398 advanceChar(); 399 } 400 else if(peekChar.toLower == 'b') 401 { 402 lflag = Token.LiteralFlag.BINARY; 403 advanceChar(); 404 } 405 // if the lflag was set, the first char has to be 0 406 if(lflag != Token.LiteralFlag.NONE && _text[start] != '0') 407 throw new ScriptCompileException("Malformed integer literal", Token.createInvalidToken(startpos)); 408 409 // while(peekChar.isDigit || peekChar == '.' || peekChar.toLower == 'e') 410 while(peekChar.charIsValidDigit(lflag)) 411 { 412 advanceChar(); 413 if(lflag == Token.LiteralFlag.NONE) 414 { 415 if(currentChar == '.') 416 { 417 ++dotCounter; 418 if(dotCounter > 1) 419 throw new ScriptCompileException("Too many decimals in number literal", 420 Token.createInvalidToken(_position)); 421 } 422 else if(currentChar.toLower == 'e') 423 { 424 ++eCounter; 425 if(eCounter > 1) 426 throw new ScriptCompileException("Numbers can only have one exponent specifier", 427 Token.createInvalidToken(_position)); 428 if(peekChar == '+' || peekChar == '-') 429 advanceChar(); 430 if(!peekChar.isDigit) 431 throw new ScriptCompileException("Exponent specifier must be followed by number", 432 Token.createInvalidToken(_position)); 433 } 434 } 435 } 436 auto text = _text[start.._index+1]; 437 if(lflag != Token.LiteralFlag.NONE && text.length <= 2) 438 throw new ScriptCompileException("Malformed hex/octal/binary integer", Token.createInvalidToken(startpos)); 439 Token resultToken; 440 if(dotCounter == 0 && eCounter == 0) 441 resultToken = Token(Token.Type.INTEGER, startpos, text); 442 else 443 resultToken = Token(Token.Type.DOUBLE, startpos, text); 444 resultToken.literalFlag = lflag; 445 return resultToken; 446 } 447 448 Token makeStringToken() 449 { 450 immutable closeQuote = currentChar; 451 auto startpos = _position; 452 advanceChar(); 453 string text = ""; 454 Token.LiteralFlag lflag = Token.LiteralFlag.NONE; 455 if(closeQuote == '`') 456 lflag = Token.LiteralFlag.TEMPLATE_STRING; 457 while(currentChar != closeQuote) 458 { 459 if(currentChar == '\0') 460 throw new ScriptCompileException("Missing close quote for string literal", 461 Token.createInvalidToken(_position, text)); 462 else if(currentChar == '\n' && lflag != Token.LiteralFlag.TEMPLATE_STRING) 463 throw new ScriptCompileException("Line breaks inside string literal are not allowed", 464 Token.createInvalidToken(_position, text)); 465 else if(currentChar == '\\') // TODO handle \u0000 and \u00 sequences 466 { 467 advanceChar(); 468 if(currentChar in ESCAPE_CHARS) 469 text ~= ESCAPE_CHARS[currentChar]; 470 else 471 throw new ScriptCompileException("Unknown escape character " ~ currentChar, 472 Token.createInvalidToken(_position)); 473 } 474 else 475 text ~= currentChar; 476 advanceChar(); 477 } 478 auto tok = Token(Token.Type.STRING, startpos, text); 479 tok.literalFlag = lflag; 480 return tok; 481 } 482 483 Token makeRAngleBracketToken() 484 { 485 auto startpos = _position; 486 if(peekChar == '=') 487 { 488 advanceChar(); 489 return Token(Token.Type.GE, startpos); 490 } 491 else if(peekChar == '>') 492 { 493 advanceChar(); 494 if(peekChar == '>') 495 { 496 advanceChar(); 497 return Token(Token.Type.BIT_URSHIFT, startpos); 498 } 499 else 500 { 501 return Token(Token.Type.BIT_RSHIFT, startpos); 502 } 503 } 504 else 505 { 506 return Token(Token.Type.GT, startpos); 507 } 508 } 509 510 Token makeLAngleBracketToken() 511 { 512 auto startpos = _position; 513 if(peekChar == '=') 514 { 515 advanceChar(); 516 return Token(Token.Type.LE, startpos); 517 } 518 else if(peekChar == '<') 519 { 520 advanceChar(); 521 return Token(Token.Type.BIT_LSHIFT, startpos); 522 } 523 else 524 { 525 return Token(Token.Type.LT, startpos); 526 } 527 } 528 529 Token makeEqualToken() 530 { 531 auto startpos = _position; 532 if(peekChar == '=') 533 { 534 advanceChar(); 535 if(peekChar == '=') 536 { 537 advanceChar(); 538 return Token(Token.Type.STRICT_EQUALS); 539 } 540 else 541 { 542 return Token(Token.Type.EQUALS, startpos); 543 } 544 } 545 else 546 { 547 return Token(Token.Type.ASSIGN, startpos); 548 } 549 } 550 551 Token makeNotToken() 552 { 553 auto startpos = _position; 554 if(peekChar == '=') 555 { 556 advanceChar(); 557 if(peekChar == '=') 558 { 559 advanceChar(); 560 return Token(Token.Type.STRICT_NEQUALS, startpos); 561 } 562 else 563 { 564 return Token(Token.Type.NEQUALS, startpos); 565 } 566 } 567 else 568 { 569 return Token(Token.Type.NOT, startpos); 570 } 571 } 572 573 Token makeAndToken() 574 { 575 auto startpos = _position; 576 if(peekChar == '&') 577 { 578 advanceChar(); 579 return Token(Token.Type.AND, startpos); 580 } 581 else 582 { 583 return Token(Token.Type.BIT_AND, startpos); 584 } 585 } 586 587 Token makeOrToken() 588 { 589 auto startpos = _position; 590 if(peekChar == '|') 591 { 592 advanceChar(); 593 return Token(Token.Type.OR, startpos); 594 } 595 else 596 { 597 return Token(Token.Type.BIT_OR, startpos); 598 } 599 } 600 601 Token makePlusToken() 602 { 603 auto startpos = _position; 604 if(peekChar == '+') 605 { 606 advanceChar(); 607 return Token(Token.Type.INC, startpos); 608 } 609 else if(peekChar == '=') 610 { 611 advanceChar(); 612 return Token(Token.Type.PLUS_ASSIGN, startpos); 613 } 614 else 615 { 616 return Token(Token.Type.PLUS, startpos); 617 } 618 } 619 620 Token makeDashToken() 621 { 622 auto startpos = _position; 623 if(peekChar == '-') 624 { 625 advanceChar(); 626 return Token(Token.Type.DEC, startpos); 627 } 628 else if(peekChar == '=') 629 { 630 advanceChar(); 631 return Token(Token.Type.DASH_ASSIGN, startpos); 632 } 633 else 634 { 635 return Token(Token.Type.DASH, startpos); 636 } 637 } 638 639 Token makeStarToken() 640 { 641 auto startpos = _position; 642 if(peekChar == '*') 643 { 644 advanceChar(); 645 return Token(Token.Type.POW, startpos); 646 } 647 else 648 { 649 return Token(Token.Type.STAR, startpos); 650 } 651 } 652 653 Token[] handleFSlash(Token[] tokens) 654 { 655 if(peekChar == '*') 656 { 657 advanceChar(); 658 while(peekChar != '\0') 659 { 660 if(peekChar == '*') 661 { 662 advanceChar(); 663 if(peekChar == '/') 664 break; 665 } 666 advanceChar(); 667 } 668 advanceChar(); 669 } 670 else if(peekChar == '/') 671 { 672 advanceChar(); 673 while(peekChar != '\n' && peekChar != '\0') 674 { 675 advanceChar(); 676 } 677 } 678 else 679 { 680 tokens ~= Token(Token.Type.FSLASH, _position); 681 } 682 683 return tokens; 684 } 685 686 Position _position = {1, 1}; 687 string _text; 688 size_t _index = 0; 689 } 690 691 unittest 692 { 693 auto lexer = Lexer("1.2 34 5.e-99 'foo' "); 694 auto tokens = lexer.tokenize(); 695 assert(tokens[0].type == Token.Type.DOUBLE); 696 assert(tokens[1].type == Token.Type.INTEGER); 697 assert(tokens[2].type == Token.Type.DOUBLE); 698 assert(tokens[3].type == Token.Type.STRING && tokens[3].text == "foo"); 699 // TODO complete unit tests of every token type 700 }