/**
 * Compiler implementation of the
 * $(LINK2 http://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (c) 1999-2016 by Digital Mars, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(DMDSRC _lexer.d)
 */

module ddmd.lexer;

import core.stdc.ctype;
import core.stdc.errno;
import core.stdc.stdarg;
import core.stdc.stdio;
import core.stdc.string;
import core.stdc.time;

import ddmd.entity;
import ddmd.errors;
import ddmd.globals;
import ddmd.id;
import ddmd.identifier;
import ddmd.root.ctfloat;
import ddmd.root.outbuffer;
import ddmd.root.port;
import ddmd.root.rmem;
import ddmd.tokens;
import ddmd.utf;

enum LS = 0x2028;       // UTF line separator
enum PS = 0x2029;       // UTF paragraph separator

/********************************************
 * Do our own char maps
 *
 * cmtable[c] is a bit set of the CM* flags below describing byte c.
 * It is filled in once by the module constructor.
 */
immutable ubyte[256] cmtable;
enum CMoctal = 0x1;         // '0' .. '7'
enum CMhex = 0x2;           // isxdigit()
enum CMidchar = 0x4;        // isalnum() or '_'
enum CMzerosecond = 0x8;    // may follow a leading '0' in a numeric literal
enum CMdigitsecond = 0x10;  // may follow a leading '1'..'9' in a numeric literal
enum CMsinglechar = 0x20;   // 7 bit char usable as-is between single quotes

/// Returns: true if c is an octal digit
bool isoctal(char c)
{
    return (cmtable[c] & CMoctal) != 0;
}

/// Returns: true if c is a hexadecimal digit
bool ishex(char c)
{
    return (cmtable[c] & CMhex) != 0;
}

/// Returns: true if c is an ASCII letter, digit or '_'
bool isidchar(char c)
{
    return (cmtable[c] & CMidchar) != 0;
}

/// Returns: true if c continues a numeric literal that started with '0'
bool isZeroSecond(char c)
{
    return (cmtable[c] & CMzerosecond) != 0;
}

/// Returns: true if c continues a numeric literal that started with '1'..'9'
bool isDigitSecond(char c)
{
    return (cmtable[c] & CMdigitsecond) != 0;
}

/// Returns: true if c is a 7 bit char other than \ LF CR NUL 0x1A and '
bool issinglechar(char c)
{
    return (cmtable[c] & CMsinglechar) != 0;
}

/* cmtable is immutable, hence implicitly shared between threads, so it is
 * initialized once per process (shared static this) rather than being
 * written again by every thread that starts.
 */
shared static this()
{
    foreach (const c; 0 .. cmtable.length)
    {
        if ('0' <= c && c <= '7')
            cmtable[c] |= CMoctal;
        if (isxdigit(c))
            cmtable[c] |= CMhex;
        if (isalnum(c) || c == '_')
            cmtable[c] |= CMidchar;

        switch (c)
        {
        case 'x': case 'X':
        case 'b': case 'B':
            cmtable[c] |= CMzerosecond;
            break;

        case '0': .. case '9':
        case 'e': case 'E':
        case 'f': case 'F':
        case 'l': case 'L':
        case 'p': case 'P':
        case 'u': case 'U':
        case 'i':
        case '.':
        case '_':
            cmtable[c] |= CMzerosecond | CMdigitsecond;
            break;

        default:
            break;
        }

        switch (c)
        {
        case '\\':
        case '\n':
        case '\r':
        case 0:
        case 0x1A:
        case '\'':
            break;
        default:
            if (!(c & 0x80))
                cmtable[c] |= CMsinglechar;
            break;
        }
    }
}

unittest
{
    //printf("lexer.unittest\n");
    /* Not much here, just trying things out.
     */
    string text = "int";
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0);
    TOK tok;
    tok = lex1.nextToken();
    //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOKint32);
    assert(tok == TOKint32);
    tok = lex1.nextToken();
    assert(tok == TOKeof);
    tok = lex1.nextToken();
    assert(tok == TOKeof);
}

/***********************************************************
 */
class Lexer
{
    __gshared OutBuffer stringbuffer;

    Loc scanloc;                // for error messages

    const(char)* base;          // pointer to start of buffer
    const(char)* end;           // past end of buffer
    const(char)* p;             // current character
    const(char)* line;          // start of current line
    Token token;
    bool doDocComment;          // collect doc comment information
    bool anyToken;              // seen at least one token
    bool commentToken;          // comments are TOKcomment's
    bool errors;                // errors occurred during lexing or parsing

    /*********************
     * Creates a Lexer.
164 * Params: 165 * filename = used for error messages 166 * base = source code, ending in a 0 byte 167 * begoffset = starting offset into base[] 168 * endoffset = last offset into base[] 169 * doDocComment = handle documentation comments 170 * commentToken = comments become TOKcomment's 171 */ 172 this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset, bool doDocComment, bool commentToken) 173 { 174 scanloc = Loc(filename, 1, 1); 175 //printf("Lexer::Lexer(%p,%d)\n",base,length); 176 //printf("lexer.filename = %s\n", filename); 177 token = Token.init; 178 this.base = base; 179 this.end = base + endoffset; 180 p = base + begoffset; 181 line = p; 182 this.doDocComment = doDocComment; 183 this.commentToken = commentToken; 184 //initKeywords(); 185 /* If first line starts with '#!', ignore the line 186 */ 187 if (p[0] == '#' && p[1] == '!') 188 { 189 p += 2; 190 while (1) 191 { 192 char c = *p; 193 switch (c) 194 { 195 case '\n': 196 p++; 197 break; 198 case '\r': 199 p++; 200 if (*p == '\n') 201 p++; 202 break; 203 case 0: 204 case 0x1A: 205 break; 206 default: 207 if (c & 0x80) 208 { 209 uint u = decodeUTF(); 210 if (u == PS || u == LS) 211 break; 212 } 213 p++; 214 continue; 215 } 216 break; 217 } 218 endOfLine(); 219 } 220 } 221 222 final TOK nextToken() 223 { 224 if (token.next) 225 { 226 Token* t = token.next; 227 memcpy(&token, t, Token.sizeof); 228 t.free(); 229 } 230 else 231 { 232 scan(&token); 233 } 234 //token.print(); 235 return token.value; 236 } 237 238 /*********************** 239 * Look ahead at next token's value. 240 */ 241 final TOK peekNext() 242 { 243 return peek(&token).value; 244 } 245 246 /*********************** 247 * Look 2 tokens ahead at value. 248 */ 249 final TOK peekNext2() 250 { 251 Token* t = peek(&token); 252 return peek(t).value; 253 } 254 255 /**************************** 256 * Turn next token in buffer into a token. 
257 */ 258 final void scan(Token* t) 259 { 260 const lastLine = scanloc.linnum; 261 Loc startLoc; 262 t.blockComment = null; 263 t.lineComment = null; 264 while (1) 265 { 266 t.ptr = p; 267 //printf("p = %p, *p = '%c'\n",p,*p); 268 t.loc = loc(); 269 switch (*p) 270 { 271 case 0: 272 case 0x1A: 273 t.value = TOKeof; // end of file 274 return; 275 case ' ': 276 case '\t': 277 case '\v': 278 case '\f': 279 p++; 280 continue; // skip white space 281 case '\r': 282 p++; 283 if (*p != '\n') // if CR stands by itself 284 endOfLine(); 285 continue; // skip white space 286 case '\n': 287 p++; 288 endOfLine(); 289 continue; // skip white space 290 case '0': 291 if (!isZeroSecond(p[1])) // if numeric literal does not continue 292 { 293 ++p; 294 t.uns64value = 0; 295 t.value = TOKint32v; 296 return; 297 } 298 goto Lnumber; 299 300 case '1': .. case '9': 301 if (!isDigitSecond(p[1])) // if numeric literal does not continue 302 { 303 t.uns64value = *p - '0'; 304 ++p; 305 t.value = TOKint32v; 306 return; 307 } 308 Lnumber: 309 t.value = number(t); 310 return; 311 312 case '\'': 313 if (issinglechar(p[1]) && p[2] == '\'') 314 { 315 t.uns64value = p[1]; // simple one character literal 316 t.value = TOKcharv; 317 p += 3; 318 } 319 else 320 t.value = charConstant(t); 321 return; 322 case 'r': 323 if (p[1] != '"') 324 goto case_ident; 325 p++; 326 goto case '`'; 327 case '`': 328 t.value = wysiwygStringConstant(t, *p); 329 return; 330 case 'x': 331 if (p[1] != '"') 332 goto case_ident; 333 p++; 334 t.value = hexStringConstant(t); 335 return; 336 case 'q': 337 if (p[1] == '"') 338 { 339 p++; 340 t.value = delimitedStringConstant(t); 341 return; 342 } 343 else if (p[1] == '{') 344 { 345 p++; 346 t.value = tokenStringConstant(t); 347 return; 348 } 349 else 350 goto case_ident; 351 case '"': 352 t.value = escapeStringConstant(t, 0); 353 return; 354 case 'a': 355 case 'b': 356 case 'c': 357 case 'd': 358 case 'e': 359 case 'f': 360 case 'g': 361 case 'h': 362 case 'i': 363 case 'j': 364 
case 'k': 365 case 'l': 366 case 'm': 367 case 'n': 368 case 'o': 369 case 'p': 370 /*case 'q': case 'r':*/ 371 case 's': 372 case 't': 373 case 'u': 374 case 'v': 375 case 'w': 376 /*case 'x':*/ 377 case 'y': 378 case 'z': 379 case 'A': 380 case 'B': 381 case 'C': 382 case 'D': 383 case 'E': 384 case 'F': 385 case 'G': 386 case 'H': 387 case 'I': 388 case 'J': 389 case 'K': 390 case 'L': 391 case 'M': 392 case 'N': 393 case 'O': 394 case 'P': 395 case 'Q': 396 case 'R': 397 case 'S': 398 case 'T': 399 case 'U': 400 case 'V': 401 case 'W': 402 case 'X': 403 case 'Y': 404 case 'Z': 405 case '_': 406 case_ident: 407 { 408 while (1) 409 { 410 const c = *++p; 411 if (isidchar(c)) 412 continue; 413 else if (c & 0x80) 414 { 415 const s = p; 416 const u = decodeUTF(); 417 if (isUniAlpha(u)) 418 continue; 419 error("char 0x%04x not allowed in identifier", u); 420 p = s; 421 } 422 break; 423 } 424 Identifier id = Identifier.idPool(cast(char*)t.ptr, p - t.ptr); 425 t.ident = id; 426 t.value = cast(TOK)id.getValue(); 427 anyToken = 1; 428 if (*t.ptr == '_') // if special identifier token 429 { 430 __gshared bool initdone = false; 431 __gshared char[11 + 1] date; 432 __gshared char[8 + 1] time; 433 __gshared char[24 + 1] timestamp; 434 if (!initdone) // lazy evaluation 435 { 436 initdone = true; 437 time_t ct; 438 .time(&ct); 439 const p = ctime(&ct); 440 assert(p); 441 sprintf(&date[0], "%.6s %.4s", p + 4, p + 20); 442 sprintf(&time[0], "%.8s", p + 11); 443 sprintf(×tamp[0], "%.24s", p); 444 } 445 if (id == Id.DATE) 446 { 447 t.ustring = date.ptr; 448 goto Lstr; 449 } 450 else if (id == Id.TIME) 451 { 452 t.ustring = time.ptr; 453 goto Lstr; 454 } 455 else if (id == Id.VENDOR) 456 { 457 t.ustring = global.compiler.vendor; 458 goto Lstr; 459 } 460 else if (id == Id.TIMESTAMP) 461 { 462 t.ustring = timestamp.ptr; 463 Lstr: 464 t.value = TOKstring; 465 t.postfix = 0; 466 t.len = cast(uint)strlen(t.ustring); 467 } 468 else if (id == Id.VERSIONX) 469 { 470 uint major = 0; 471 uint 
minor = 0; 472 bool point = false; 473 for (const(char)* p = global._version + 1; 1; p++) 474 { 475 const c = *p; 476 if (isdigit(cast(char)c)) 477 minor = minor * 10 + c - '0'; 478 else if (c == '.') 479 { 480 if (point) 481 break; // ignore everything after second '.' 482 point = true; 483 major = minor; 484 minor = 0; 485 } 486 else 487 break; 488 } 489 t.value = TOKint64v; 490 t.uns64value = major * 1000 + minor; 491 } 492 else if (id == Id.EOFX) 493 { 494 t.value = TOKeof; 495 // Advance scanner to end of file 496 while (!(*p == 0 || *p == 0x1A)) 497 p++; 498 } 499 } 500 //printf("t->value = %d\n",t->value); 501 return; 502 } 503 case '/': 504 p++; 505 switch (*p) 506 { 507 case '=': 508 p++; 509 t.value = TOKdivass; 510 return; 511 case '*': 512 p++; 513 startLoc = loc(); 514 while (1) 515 { 516 while (1) 517 { 518 const c = *p; 519 switch (c) 520 { 521 case '/': 522 break; 523 case '\n': 524 endOfLine(); 525 p++; 526 continue; 527 case '\r': 528 p++; 529 if (*p != '\n') 530 endOfLine(); 531 continue; 532 case 0: 533 case 0x1A: 534 error("unterminated /* */ comment"); 535 p = end; 536 t.loc = loc(); 537 t.value = TOKeof; 538 return; 539 default: 540 if (c & 0x80) 541 { 542 const u = decodeUTF(); 543 if (u == PS || u == LS) 544 endOfLine(); 545 } 546 p++; 547 continue; 548 } 549 break; 550 } 551 p++; 552 if (p[-2] == '*' && p - 3 != t.ptr) 553 break; 554 } 555 if (commentToken) 556 { 557 t.loc = startLoc; 558 t.value = TOKcomment; 559 return; 560 } 561 else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr) 562 { 563 // if /** but not /**/ 564 getDocComment(t, lastLine == startLoc.linnum); 565 } 566 continue; 567 case '/': // do // style comments 568 startLoc = loc(); 569 while (1) 570 { 571 const c = *++p; 572 switch (c) 573 { 574 case '\n': 575 break; 576 case '\r': 577 if (p[1] == '\n') 578 p++; 579 break; 580 case 0: 581 case 0x1A: 582 if (commentToken) 583 { 584 p = end; 585 t.loc = startLoc; 586 t.value = TOKcomment; 587 return; 588 } 589 if 
(doDocComment && t.ptr[2] == '/') 590 getDocComment(t, lastLine == startLoc.linnum); 591 p = end; 592 t.loc = loc(); 593 t.value = TOKeof; 594 return; 595 default: 596 if (c & 0x80) 597 { 598 const u = decodeUTF(); 599 if (u == PS || u == LS) 600 break; 601 } 602 continue; 603 } 604 break; 605 } 606 if (commentToken) 607 { 608 p++; 609 endOfLine(); 610 t.loc = startLoc; 611 t.value = TOKcomment; 612 return; 613 } 614 if (doDocComment && t.ptr[2] == '/') 615 getDocComment(t, lastLine == startLoc.linnum); 616 p++; 617 endOfLine(); 618 continue; 619 case '+': 620 { 621 int nest; 622 startLoc = loc(); 623 p++; 624 nest = 1; 625 while (1) 626 { 627 char c = *p; 628 switch (c) 629 { 630 case '/': 631 p++; 632 if (*p == '+') 633 { 634 p++; 635 nest++; 636 } 637 continue; 638 case '+': 639 p++; 640 if (*p == '/') 641 { 642 p++; 643 if (--nest == 0) 644 break; 645 } 646 continue; 647 case '\r': 648 p++; 649 if (*p != '\n') 650 endOfLine(); 651 continue; 652 case '\n': 653 endOfLine(); 654 p++; 655 continue; 656 case 0: 657 case 0x1A: 658 error("unterminated /+ +/ comment"); 659 p = end; 660 t.loc = loc(); 661 t.value = TOKeof; 662 return; 663 default: 664 if (c & 0x80) 665 { 666 uint u = decodeUTF(); 667 if (u == PS || u == LS) 668 endOfLine(); 669 } 670 p++; 671 continue; 672 } 673 break; 674 } 675 if (commentToken) 676 { 677 t.loc = startLoc; 678 t.value = TOKcomment; 679 return; 680 } 681 if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr) 682 { 683 // if /++ but not /++/ 684 getDocComment(t, lastLine == startLoc.linnum); 685 } 686 continue; 687 } 688 default: 689 break; 690 } 691 t.value = TOKdiv; 692 return; 693 case '.': 694 p++; 695 if (isdigit(*p)) 696 { 697 /* Note that we don't allow ._1 and ._ as being 698 * valid floating point numbers. 
699 */ 700 p--; 701 t.value = inreal(t); 702 } 703 else if (p[0] == '.') 704 { 705 if (p[1] == '.') 706 { 707 p += 2; 708 t.value = TOKdotdotdot; 709 } 710 else 711 { 712 p++; 713 t.value = TOKslice; 714 } 715 } 716 else 717 t.value = TOKdot; 718 return; 719 case '&': 720 p++; 721 if (*p == '=') 722 { 723 p++; 724 t.value = TOKandass; 725 } 726 else if (*p == '&') 727 { 728 p++; 729 t.value = TOKandand; 730 } 731 else 732 t.value = TOKand; 733 return; 734 case '|': 735 p++; 736 if (*p == '=') 737 { 738 p++; 739 t.value = TOKorass; 740 } 741 else if (*p == '|') 742 { 743 p++; 744 t.value = TOKoror; 745 } 746 else 747 t.value = TOKor; 748 return; 749 case '-': 750 p++; 751 if (*p == '=') 752 { 753 p++; 754 t.value = TOKminass; 755 } 756 else if (*p == '-') 757 { 758 p++; 759 t.value = TOKminusminus; 760 } 761 else 762 t.value = TOKmin; 763 return; 764 case '+': 765 p++; 766 if (*p == '=') 767 { 768 p++; 769 t.value = TOKaddass; 770 } 771 else if (*p == '+') 772 { 773 p++; 774 t.value = TOKplusplus; 775 } 776 else 777 t.value = TOKadd; 778 return; 779 case '<': 780 p++; 781 if (*p == '=') 782 { 783 p++; 784 t.value = TOKle; // <= 785 } 786 else if (*p == '<') 787 { 788 p++; 789 if (*p == '=') 790 { 791 p++; 792 t.value = TOKshlass; // <<= 793 } 794 else 795 t.value = TOKshl; // << 796 } 797 else if (*p == '>') 798 { 799 p++; 800 if (*p == '=') 801 { 802 p++; 803 t.value = TOKleg; // <>= 804 } 805 else 806 t.value = TOKlg; // <> 807 } 808 else 809 t.value = TOKlt; // < 810 return; 811 case '>': 812 p++; 813 if (*p == '=') 814 { 815 p++; 816 t.value = TOKge; // >= 817 } 818 else if (*p == '>') 819 { 820 p++; 821 if (*p == '=') 822 { 823 p++; 824 t.value = TOKshrass; // >>= 825 } 826 else if (*p == '>') 827 { 828 p++; 829 if (*p == '=') 830 { 831 p++; 832 t.value = TOKushrass; // >>>= 833 } 834 else 835 t.value = TOKushr; // >>> 836 } 837 else 838 t.value = TOKshr; // >> 839 } 840 else 841 t.value = TOKgt; // > 842 return; 843 case '!': 844 p++; 845 if (*p == '=') 846 { 
847 p++; 848 t.value = TOKnotequal; // != 849 } 850 else if (*p == '<') 851 { 852 p++; 853 if (*p == '>') 854 { 855 p++; 856 if (*p == '=') 857 { 858 p++; 859 t.value = TOKunord; // !<>= 860 } 861 else 862 t.value = TOKue; // !<> 863 } 864 else if (*p == '=') 865 { 866 p++; 867 t.value = TOKug; // !<= 868 } 869 else 870 t.value = TOKuge; // !< 871 } 872 else if (*p == '>') 873 { 874 p++; 875 if (*p == '=') 876 { 877 p++; 878 t.value = TOKul; // !>= 879 } 880 else 881 t.value = TOKule; // !> 882 } 883 else 884 t.value = TOKnot; // ! 885 return; 886 case '=': 887 p++; 888 if (*p == '=') 889 { 890 p++; 891 t.value = TOKequal; // == 892 } 893 else if (*p == '>') 894 { 895 p++; 896 t.value = TOKgoesto; // => 897 } 898 else 899 t.value = TOKassign; // = 900 return; 901 case '~': 902 p++; 903 if (*p == '=') 904 { 905 p++; 906 t.value = TOKcatass; // ~= 907 } 908 else 909 t.value = TOKtilde; // ~ 910 return; 911 case '^': 912 p++; 913 if (*p == '^') 914 { 915 p++; 916 if (*p == '=') 917 { 918 p++; 919 t.value = TOKpowass; // ^^= 920 } 921 else 922 t.value = TOKpow; // ^^ 923 } 924 else if (*p == '=') 925 { 926 p++; 927 t.value = TOKxorass; // ^= 928 } 929 else 930 t.value = TOKxor; // ^ 931 return; 932 case '(': 933 p++; 934 t.value = TOKlparen; 935 return; 936 case ')': 937 p++; 938 t.value = TOKrparen; 939 return; 940 case '[': 941 p++; 942 t.value = TOKlbracket; 943 return; 944 case ']': 945 p++; 946 t.value = TOKrbracket; 947 return; 948 case '{': 949 p++; 950 t.value = TOKlcurly; 951 return; 952 case '}': 953 p++; 954 t.value = TOKrcurly; 955 return; 956 case '?': 957 p++; 958 t.value = TOKquestion; 959 return; 960 case ',': 961 p++; 962 t.value = TOKcomma; 963 return; 964 case ';': 965 p++; 966 t.value = TOKsemicolon; 967 return; 968 case ':': 969 p++; 970 t.value = TOKcolon; 971 return; 972 case '$': 973 p++; 974 t.value = TOKdollar; 975 return; 976 case '@': 977 p++; 978 t.value = TOKat; 979 return; 980 case '*': 981 p++; 982 if (*p == '=') 983 { 984 p++; 985 
t.value = TOKmulass; 986 } 987 else 988 t.value = TOKmul; 989 return; 990 case '%': 991 p++; 992 if (*p == '=') 993 { 994 p++; 995 t.value = TOKmodass; 996 } 997 else 998 t.value = TOKmod; 999 return; 1000 case '#': 1001 { 1002 p++; 1003 Token n; 1004 scan(&n); 1005 if (n.value == TOKidentifier && n.ident == Id.line) 1006 { 1007 poundLine(); 1008 continue; 1009 } 1010 else 1011 { 1012 t.value = TOKpound; 1013 return; 1014 } 1015 } 1016 default: 1017 { 1018 dchar c = *p; 1019 if (c & 0x80) 1020 { 1021 c = decodeUTF(); 1022 // Check for start of unicode identifier 1023 if (isUniAlpha(c)) 1024 goto case_ident; 1025 if (c == PS || c == LS) 1026 { 1027 endOfLine(); 1028 p++; 1029 continue; 1030 } 1031 } 1032 if (c < 0x80 && isprint(c)) 1033 error("character '%c' is not a valid token", c); 1034 else 1035 error("character 0x%02x is not a valid token", c); 1036 p++; 1037 continue; 1038 } 1039 } 1040 } 1041 } 1042 1043 final Token* peek(Token* ct) 1044 { 1045 Token* t; 1046 if (ct.next) 1047 t = ct.next; 1048 else 1049 { 1050 t = Token.alloc(); 1051 scan(t); 1052 ct.next = t; 1053 } 1054 return t; 1055 } 1056 1057 /********************************* 1058 * tk is on the opening (. 1059 * Look ahead and return token that is past the closing ). 
1060 */ 1061 final Token* peekPastParen(Token* tk) 1062 { 1063 //printf("peekPastParen()\n"); 1064 int parens = 1; 1065 int curlynest = 0; 1066 while (1) 1067 { 1068 tk = peek(tk); 1069 //tk->print(); 1070 switch (tk.value) 1071 { 1072 case TOKlparen: 1073 parens++; 1074 continue; 1075 case TOKrparen: 1076 --parens; 1077 if (parens) 1078 continue; 1079 tk = peek(tk); 1080 break; 1081 case TOKlcurly: 1082 curlynest++; 1083 continue; 1084 case TOKrcurly: 1085 if (--curlynest >= 0) 1086 continue; 1087 break; 1088 case TOKsemicolon: 1089 if (curlynest) 1090 continue; 1091 break; 1092 case TOKeof: 1093 break; 1094 default: 1095 continue; 1096 } 1097 return tk; 1098 } 1099 } 1100 1101 /******************************************* 1102 * Parse escape sequence. 1103 */ 1104 final uint escapeSequence() 1105 { 1106 uint c = *p; 1107 int ndigits; 1108 switch (c) 1109 { 1110 case '\'': 1111 case '"': 1112 case '?': 1113 case '\\': 1114 Lconsume: 1115 p++; 1116 break; 1117 case 'a': 1118 c = 7; 1119 goto Lconsume; 1120 case 'b': 1121 c = 8; 1122 goto Lconsume; 1123 case 'f': 1124 c = 12; 1125 goto Lconsume; 1126 case 'n': 1127 c = 10; 1128 goto Lconsume; 1129 case 'r': 1130 c = 13; 1131 goto Lconsume; 1132 case 't': 1133 c = 9; 1134 goto Lconsume; 1135 case 'v': 1136 c = 11; 1137 goto Lconsume; 1138 case 'u': 1139 ndigits = 4; 1140 goto Lhex; 1141 case 'U': 1142 ndigits = 8; 1143 goto Lhex; 1144 case 'x': 1145 ndigits = 2; 1146 Lhex: 1147 p++; 1148 c = *p; 1149 if (ishex(cast(char)c)) 1150 { 1151 uint v = 0; 1152 int n = 0; 1153 while (1) 1154 { 1155 if (isdigit(cast(char)c)) 1156 c -= '0'; 1157 else if (islower(c)) 1158 c -= 'a' - 10; 1159 else 1160 c -= 'A' - 10; 1161 v = v * 16 + c; 1162 c = *++p; 1163 if (++n == ndigits) 1164 break; 1165 if (!ishex(cast(char)c)) 1166 { 1167 error("escape hex sequence has %d hex digits instead of %d", n, ndigits); 1168 break; 1169 } 1170 } 1171 if (ndigits != 2 && !utf_isValidDchar(v)) 1172 { 1173 error("invalid UTF character \\U%08x", v); 
1174 v = '?'; // recover with valid UTF character 1175 } 1176 c = v; 1177 } 1178 else 1179 error("undefined escape hex sequence \\%c", c); 1180 break; 1181 case '&': 1182 // named character entity 1183 for (const idstart = ++p; 1; p++) 1184 { 1185 switch (*p) 1186 { 1187 case ';': 1188 c = HtmlNamedEntity(idstart, p - idstart); 1189 if (c == ~0) 1190 { 1191 error("unnamed character entity &%.*s;", cast(int)(p - idstart), idstart); 1192 c = ' '; 1193 } 1194 p++; 1195 break; 1196 default: 1197 if (isalpha(*p) || (p != idstart && isdigit(*p))) 1198 continue; 1199 error("unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart); 1200 break; 1201 } 1202 break; 1203 } 1204 break; 1205 case 0: 1206 case 0x1A: 1207 // end of file 1208 c = '\\'; 1209 break; 1210 default: 1211 if (isoctal(cast(char)c)) 1212 { 1213 uint v = 0; 1214 int n = 0; 1215 do 1216 { 1217 v = v * 8 + (c - '0'); 1218 c = *++p; 1219 } 1220 while (++n < 3 && isoctal(cast(char)c)); 1221 c = v; 1222 if (c > 0xFF) 1223 error("escape octal sequence \\%03o is larger than \\377", c); 1224 } 1225 else 1226 error("undefined escape sequence \\%c", c); 1227 break; 1228 } 1229 return c; 1230 } 1231 1232 /************************************** 1233 */ 1234 final TOK wysiwygStringConstant(Token* t, int tc) 1235 { 1236 Loc start = loc(); 1237 p++; 1238 stringbuffer.reset(); 1239 while (1) 1240 { 1241 dchar c = *p++; 1242 switch (c) 1243 { 1244 case '\n': 1245 endOfLine(); 1246 break; 1247 case '\r': 1248 if (*p == '\n') 1249 continue; // ignore 1250 c = '\n'; // treat EndOfLine as \n character 1251 endOfLine(); 1252 break; 1253 case 0: 1254 case 0x1A: 1255 error("unterminated string constant starting at %s", start.toChars()); 1256 t.setString(); 1257 return TOKstring; 1258 case '"': 1259 case '`': 1260 if (c == tc) 1261 { 1262 t.setString(stringbuffer); 1263 stringPostfix(t); 1264 return TOKstring; 1265 } 1266 break; 1267 default: 1268 if (c & 0x80) 1269 { 1270 p--; 1271 const u = decodeUTF(); 1272 p++; 
1273 if (u == PS || u == LS) 1274 endOfLine(); 1275 stringbuffer.writeUTF8(u); 1276 continue; 1277 } 1278 break; 1279 } 1280 stringbuffer.writeByte(c); 1281 } 1282 } 1283 1284 /************************************** 1285 * Lex hex strings: 1286 * x"0A ae 34FE BD" 1287 */ 1288 final TOK hexStringConstant(Token* t) 1289 { 1290 Loc start = loc(); 1291 uint n = 0; 1292 uint v = ~0; // dead assignment, needed to suppress warning 1293 p++; 1294 stringbuffer.reset(); 1295 while (1) 1296 { 1297 dchar c = *p++; 1298 switch (c) 1299 { 1300 case ' ': 1301 case '\t': 1302 case '\v': 1303 case '\f': 1304 continue; // skip white space 1305 case '\r': 1306 if (*p == '\n') 1307 continue; // ignore '\r' if followed by '\n' 1308 // Treat isolated '\r' as if it were a '\n' 1309 goto case '\n'; 1310 case '\n': 1311 endOfLine(); 1312 continue; 1313 case 0: 1314 case 0x1A: 1315 error("unterminated string constant starting at %s", start.toChars()); 1316 t.setString(); 1317 return TOKxstring; 1318 case '"': 1319 if (n & 1) 1320 { 1321 error("odd number (%d) of hex characters in hex string", n); 1322 stringbuffer.writeByte(v); 1323 } 1324 t.setString(stringbuffer); 1325 stringPostfix(t); 1326 return TOKxstring; 1327 default: 1328 if (c >= '0' && c <= '9') 1329 c -= '0'; 1330 else if (c >= 'a' && c <= 'f') 1331 c -= 'a' - 10; 1332 else if (c >= 'A' && c <= 'F') 1333 c -= 'A' - 10; 1334 else if (c & 0x80) 1335 { 1336 p--; 1337 const u = decodeUTF(); 1338 p++; 1339 if (u == PS || u == LS) 1340 endOfLine(); 1341 else 1342 error("non-hex character \\u%04x in hex string", u); 1343 } 1344 else 1345 error("non-hex character '%c' in hex string", c); 1346 if (n & 1) 1347 { 1348 v = (v << 4) | c; 1349 stringbuffer.writeByte(v); 1350 } 1351 else 1352 v = c; 1353 n++; 1354 break; 1355 } 1356 } 1357 assert(0); // see bug 15731 1358 } 1359 1360 /************************************** 1361 * Lex delimited strings: 1362 * q"(foo(xxx))" // "foo(xxx)" 1363 * q"[foo$(LPAREN)]" // "foo$(LPAREN)" 1364 * 
q"/foo]/" // "foo]" 1365 * q"HERE 1366 * foo 1367 * HERE" // "foo\n" 1368 * Input: 1369 * p is on the " 1370 */ 1371 final TOK delimitedStringConstant(Token* t) 1372 { 1373 Loc start = loc(); 1374 dchar delimleft = 0; 1375 dchar delimright = 0; 1376 uint nest = 1; 1377 uint nestcount = ~0; // dead assignment, needed to suppress warning 1378 Identifier hereid = null; 1379 uint blankrol = 0; 1380 uint startline = 0; 1381 p++; 1382 stringbuffer.reset(); 1383 while (1) 1384 { 1385 dchar c = *p++; 1386 //printf("c = '%c'\n", c); 1387 switch (c) 1388 { 1389 case '\n': 1390 Lnextline: 1391 endOfLine(); 1392 startline = 1; 1393 if (blankrol) 1394 { 1395 blankrol = 0; 1396 continue; 1397 } 1398 if (hereid) 1399 { 1400 stringbuffer.writeUTF8(c); 1401 continue; 1402 } 1403 break; 1404 case '\r': 1405 if (*p == '\n') 1406 continue; // ignore 1407 c = '\n'; // treat EndOfLine as \n character 1408 goto Lnextline; 1409 case 0: 1410 case 0x1A: 1411 error("unterminated delimited string constant starting at %s", start.toChars()); 1412 t.setString(); 1413 return TOKstring; 1414 default: 1415 if (c & 0x80) 1416 { 1417 p--; 1418 c = decodeUTF(); 1419 p++; 1420 if (c == PS || c == LS) 1421 goto Lnextline; 1422 } 1423 break; 1424 } 1425 if (delimleft == 0) 1426 { 1427 delimleft = c; 1428 nest = 1; 1429 nestcount = 1; 1430 if (c == '(') 1431 delimright = ')'; 1432 else if (c == '{') 1433 delimright = '}'; 1434 else if (c == '[') 1435 delimright = ']'; 1436 else if (c == '<') 1437 delimright = '>'; 1438 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) 1439 { 1440 // Start of identifier; must be a heredoc 1441 Token tok; 1442 p--; 1443 scan(&tok); // read in heredoc identifier 1444 if (tok.value != TOKidentifier) 1445 { 1446 error("identifier expected for heredoc, not %s", tok.toChars()); 1447 delimright = c; 1448 } 1449 else 1450 { 1451 hereid = tok.ident; 1452 //printf("hereid = '%s'\n", hereid->toChars()); 1453 blankrol = 1; 1454 } 1455 nest = 0; 1456 } 1457 else 1458 { 
1459 delimright = c; 1460 nest = 0; 1461 if (isspace(c)) 1462 error("delimiter cannot be whitespace"); 1463 } 1464 } 1465 else 1466 { 1467 if (blankrol) 1468 { 1469 error("heredoc rest of line should be blank"); 1470 blankrol = 0; 1471 continue; 1472 } 1473 if (nest == 1) 1474 { 1475 if (c == delimleft) 1476 nestcount++; 1477 else if (c == delimright) 1478 { 1479 nestcount--; 1480 if (nestcount == 0) 1481 goto Ldone; 1482 } 1483 } 1484 else if (c == delimright) 1485 goto Ldone; 1486 if (startline && isalpha(c) && hereid) 1487 { 1488 Token tok; 1489 auto psave = p; 1490 p--; 1491 scan(&tok); // read in possible heredoc identifier 1492 //printf("endid = '%s'\n", tok.ident->toChars()); 1493 if (tok.value == TOKidentifier && tok.ident.equals(hereid)) 1494 { 1495 /* should check that rest of line is blank 1496 */ 1497 goto Ldone; 1498 } 1499 p = psave; 1500 } 1501 stringbuffer.writeUTF8(c); 1502 startline = 0; 1503 } 1504 } 1505 Ldone: 1506 if (*p == '"') 1507 p++; 1508 else if (hereid) 1509 error("delimited string must end in %s\"", hereid.toChars()); 1510 else 1511 error("delimited string must end in %c\"", delimright); 1512 t.setString(stringbuffer); 1513 stringPostfix(t); 1514 return TOKstring; 1515 } 1516 1517 /************************************** 1518 * Lex delimited strings: 1519 * q{ foo(xxx) } // " foo(xxx) " 1520 * q{foo$(LPAREN)} // "foo$(LPAREN)" 1521 * q{{foo}"}"} // "{foo}"}"" 1522 * Input: 1523 * p is on the q 1524 */ 1525 final TOK tokenStringConstant(Token* t) 1526 { 1527 uint nest = 1; 1528 const start = loc(); 1529 const pstart = ++p; 1530 while (1) 1531 { 1532 Token tok; 1533 scan(&tok); 1534 switch (tok.value) 1535 { 1536 case TOKlcurly: 1537 nest++; 1538 continue; 1539 case TOKrcurly: 1540 if (--nest == 0) 1541 { 1542 t.setString(pstart, p - 1 - pstart); 1543 stringPostfix(t); 1544 return TOKstring; 1545 } 1546 continue; 1547 case TOKeof: 1548 error("unterminated token string constant starting at %s", start.toChars()); 1549 t.setString(); 1550 
return TOKstring;
            default:
                continue;
            }
        }
    }

    /**************************************
     * Lex a string literal that is terminated by '"' and may contain
     * backslash escape sequences.
     * One character is skipped on entry (presumably the opening quote; the
     * caller is not visible here), then the contents are accumulated in
     * stringbuffer. End-of-line sequences (\n, \r, \r\n, LS, PS) are stored
     * as a single '\n'.
     * Params:
     *  t = token that receives the string and its postfix
     *  wide = not referenced by this function
     * Returns:
     *  TOKstring, also after an "unterminated string constant" error
     */
    final TOK escapeStringConstant(Token* t, int wide)
    {
        const start = loc();
        p++;
        stringbuffer.reset();
        while (1)
        {
            dchar c = *p++;
            switch (c)
            {
            case '\\':
                switch (*p)
                {
                case 'u':
                case 'U':
                case '&':
                    // These escapes yield a code point, so it is stored UTF-8 encoded
                    c = escapeSequence();
                    stringbuffer.writeUTF8(c);
                    continue;
                default:
                    // Any other escape is stored as a single byte below
                    c = escapeSequence();
                    break;
                }
                break;
            case '\n':
                endOfLine();
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                break;
            case '"':
                t.setString(stringbuffer);
                stringPostfix(t);
                return TOKstring;
            case 0:
            case 0x1A:
                // Step back so p stays on the end-of-input sentinel
                p--;
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                return TOKstring;
            default:
                if (c & 0x80)
                {
                    // decodeUTF() expects p on the lead byte and leaves it on
                    // the last byte of the sequence
                    p--;
                    c = decodeUTF();
                    if (c == LS || c == PS)
                    {
                        c = '\n';
                        endOfLine();
                    }
                    p++;
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**************************************
     * Lex a character literal.
     * The value is stored in t.uns64value; after an
     * "unterminated character constant" error it is set to '?'.
     * Params:
     *  t = token that receives the character value
     * Returns:
     *  TOKcharv, TOKwcharv or TOKdcharv
     */
    final TOK charConstant(Token* t)
    {
        TOK tk = TOKcharv;
        //printf("Lexer::charConstant\n");
        p++;
        dchar c = *p++;
        switch (c)
        {
        case '\\':
            switch (*p)
            {
            case 'u':
                t.uns64value = escapeSequence();
                tk = TOKwcharv;
                break;
            case 'U':
            case '&':
                t.uns64value = escapeSequence();
                tk = TOKdcharv;
                break;
            default:
                t.uns64value = escapeSequence();
                break;
            }
            break;
        case '\n':
        L1:
            // A line break inside the literal: count the line, then report the error
            endOfLine();
            goto case;
        case '\r':
        case 0:
        case 0x1A:
        case '\'':
            error("unterminated character constant");
            t.uns64value = '?';
            return tk;
        default:
            if (c & 0x80)
            {
                // Re-read the multi-byte sequence starting at its lead byte
                p--;
                c = decodeUTF();
                p++;
                if (c == LS || c == PS)
                    goto L1;
                if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                    tk = TOKwcharv;
                else
                    tk = TOKdcharv;
            }
            t.uns64value = c;
            break;
        }
        if (*p != '\'')
        {
            error("unterminated character constant");
            t.uns64value = '?';
            return tk;
        }
        p++;
        return tk;
    }

    /***************************************
     * Get postfix of string literal.
     * If the current character is 'c', 'w' or 'd' it is stored in
     * t.postfix and consumed; otherwise t.postfix is set to 0.
     */
    final void stringPostfix(Token* t)
    {
        switch (*p)
        {
        case 'c':
        case 'w':
        case 'd':
            t.postfix = *p;
            p++;
            break;
        default:
            t.postfix = 0;
            break;
        }
    }

    /**************************************
     * Read in a number.
     * If it's an integer, store it in t.uns64value.
     * Integers can be decimal, binary (0b), octal or hex (0x);
     * '_' separators are skipped.
     * Handle the suffixes U, UL, LU, L, etc.
     * If it turns out to be a floating point literal, p is rewound to the
     * start of the literal and lexing is delegated to inreal().
     * Params:
     *  t = token that receives the value
     * Returns:
     *  TOKint32v, TOKuns32v, TOKint64v or TOKuns64v for integers,
     *  otherwise whatever inreal() returns
     */
    final TOK number(Token* t)
    {
        int base = 10;
        const start = p;
        uinteger_t n = 0; // unsigned >=64 bit integer type
        int d;
        bool err = false; // only the first error of a literal is reported
        bool overflow = false;
        dchar c = *p;
        if (c == '0')
        {
            ++p;
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                n = c - '0';
                ++p;
                base = 8;
                break;
            case 'x':
            case 'X':
                ++p;
                base = 16;
                break;
            case 'b':
            case 'B':
                ++p;
                base = 2;
                break;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // '.' is part of current token
            case 'i':
            case 'f':
            case 'F':
                goto Lreal;
            case '_':
                ++p;
                base = 8;
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;
            default:
                break;
            }
        }
        while (1)
        {
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
                ++p;
                d = c - '0';
                break;
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                if (base == 2 && !err)
                {
                    error("binary digit expected");
                    err = true;
                }
                ++p;
                d = c - '0';
                break;
            case '8':
            case '9':
                ++p;
                if (base < 10 && !err)
                {
                    error("radix %d digit expected, not '%c'", base, c);
                    err = true;
                }
                d = c - '0';
                break;
            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
                ++p;
                if (base != 16)
                {
                    // Outside hex, e/E/f/F mark an exponent or float suffix
                    if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                        goto Lreal;
                    if (!err)
                    {
                        error("radix %d digit expected, not '%c'", base, c);
                        err = true;
                    }
                }
                if (c >= 'a')
                    d = c + 10 - 'a';
                else
                    d = c + 10 - 'A';
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // otherwise as part of a floating point literal
            case 'p':
            case 'P':
            case 'i':
            Lreal:
                // Rewind and let inreal() lex the whole literal again
                p = start;
                return inreal(t);
            case '_':
                ++p;
                continue;
            default:
                goto Ldone;
            }
            // Avoid expensive overflow check if we aren't at risk of overflow
            if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
                n = n * base + d;
            else
            {
                import core.checkedint : mulu, addu;

                n = mulu(n, base, overflow);
                n = addu(n, d, overflow);
            }
        }
    Ldone:
        if (overflow && !err)
        {
            error("integer overflow");
            err = true;
        }
        enum FLAGS : int
        {
            FLAGS_none = 0,
            FLAGS_decimal = 1, // decimal
            FLAGS_unsigned = 2, // u or U suffix
            FLAGS_long = 4, // L suffix
        }

        alias FLAGS_none = FLAGS.FLAGS_none;
        alias FLAGS_decimal = FLAGS.FLAGS_decimal;
        alias FLAGS_unsigned = FLAGS.FLAGS_unsigned;
        alias FLAGS_long = FLAGS.FLAGS_long;

        FLAGS flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const psuffix = p;
        while (1)
        {
            FLAGS f;
            switch (*p)
            {
            case 'U':
            case 'u':
                f = FLAGS_unsigned;
                goto L1;
            case 'l':
                f = FLAGS_long;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                goto L1;
            case 'L':
                f = FLAGS_long;
            L1:
                p++;
                // The same kind of suffix given twice
                if ((flags & f) && !err)
                {
                    error("unrecognized token");
                    err = true;
                }
                flags = cast(FLAGS)(flags | f);
                continue;
            default:
                break;
            }
            break;
        }
        if (base == 8 && n >= 8)
        {
            // The precision argument of "%.*s" must be an int, not a ptrdiff_t
            const suffixLen = cast(int)(p - psuffix);
            error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", n, suffixLen, psuffix, n, suffixLen, psuffix);
        }
        TOK result;
        switch (flags)
        {
        case FLAGS_none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            if (n & 0x8000000000000000L)
                result = TOKuns64v;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOKint64v;
            else if (n & 0x80000000)
                result = TOKuns32v;
            else
                result = TOKint32v;
            break;
        case FLAGS_decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOKuns64v;
            }
            else if (n & 0xFFFFFFFF80000000L)
                result = TOKint64v;
            else
                result = TOKint32v;
            break;
        case FLAGS_unsigned:
        case FLAGS_decimal | FLAGS_unsigned:
            /* First that fits: uint, ulong
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOKuns64v;
            else
                result = TOKuns32v;
            break;
        case FLAGS_decimal | FLAGS_long:
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOKuns64v;
            }
            else
                result = TOKint64v;
            break;
        case FLAGS_long:
            if (n & 0x8000000000000000L)
                result = TOKuns64v;
            else
                result = TOKint64v;
            break;
        case FLAGS_unsigned | FLAGS_long:
        case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
            result = TOKuns64v;
            break;
        default:
            debug
            {
                printf("%x\n", flags);
            }
            assert(0);
        }
        t.uns64value = n;
        return result;
    }

    /**************************************
     * Read in characters, converting them to real.
     * The literal text, with '_' separators removed, is copied into
     * stringbuffer and parsed with CTFloat.parse; the value is stored in
     * t.floatvalue. Suffixes f/F, L and i select the token kind.
     * Params:
     *  t = token that receives the value
     * Returns:
     *  TOKfloat32v, TOKfloat64v, TOKfloat80v,
     *  TOKimaginary32v, TOKimaginary64v or TOKimaginary80v
     * Bugs:
     *  Exponent overflow not detected.
     *  Too much requested precision is not detected.
     */
    final TOK inreal(Token* t)
    {
        //printf("Lexer::inreal()\n");
        debug
        {
            assert(*p == '.' || isdigit(*p));
        }
        stringbuffer.reset();
        auto pstart = p;
        bool hex = false;
        // Throughout the scan c holds the character just read and p is one past it
        dchar c = *p++;
        // Leading '0x'
        if (c == '0')
        {
            c = *p++;
            if (c == 'x' || c == 'X')
            {
                hex = true;
                c = *p++;
            }
        }
        // Digits to left of '.'
        while (1)
        {
            if (c == '.')
            {
                c = *p++;
                break;
            }
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        // Digits to right of '.'
        while (1)
        {
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
        {
            c = *p++;
            if (c == '-' || c == '+')
            {
                c = *p++;
            }
            bool anyexp = false;
            while (1)
            {
                if (isdigit(c))
                {
                    anyexp = true;
                    c = *p++;
                    continue;
                }
                if (c == '_')
                {
                    c = *p++;
                    continue;
                }
                if (!anyexp)
                    error("missing exponent");
                break;
            }
        }
        else if (hex)
            error("exponent required for hex float");
        // Un-read the character that ended the scan
        --p;
        while (pstart < p)
        {
            if (*pstart != '_')
                stringbuffer.writeByte(*pstart);
            ++pstart;
        }
        stringbuffer.writeByte(0);
        auto sbufptr = cast(const(char)*)stringbuffer.data;
        TOK result;
        bool isOutOfRange = false;
        t.floatvalue = CTFloat.parse(sbufptr, &isOutOfRange);
        switch (*p)
        {
        case 'F':
        case 'f':
            isOutOfRange = (isOutOfRange || Port.isFloat32LiteralOutOfRange(sbufptr));
            result = TOKfloat32v;
            p++;
            break;
        default:
            isOutOfRange = (isOutOfRange || Port.isFloat64LiteralOutOfRange(sbufptr));
            result = TOKfloat64v;
            break;
        case 'l':
            error("use 'L' suffix instead of 'l'");
            goto case 'L';
        case 'L':
            result = TOKfloat80v;
            p++;
            break;
        }
        if (*p == 'i' || *p == 'I')
        {
            if (*p == 'I')
                error("use 'i' suffix instead of 'I'");
            p++;
            switch (result)
            {
            case TOKfloat32v:
                result = TOKimaginary32v;
                break;
            case TOKfloat64v:
                result = TOKimaginary64v;
                break;
            case TOKfloat80v:
                result = TOKimaginary80v;
                break;
            default:
                break;
            }
        }
        // Out-of-range is only reported for the 32 and 64 bit kinds
        const isLong = (result == TOKfloat80v || result == TOKimaginary80v);
        if (isOutOfRange && !isLong)
        {
            const char* suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
            error(scanloc, "number '%s%s' is not representable", sbufptr, suffix);
        }
        debug
        {
            switch (result)
            {
            case TOKfloat32v:
            case TOKfloat64v:
            case TOKfloat80v:
            case TOKimaginary32v:
            case TOKimaginary64v:
            case TOKimaginary80v:
                break;
            default:
                assert(0);
            }
        }
        return result;
    }

    /**************************************
     * Update scanloc.charnum to the 1-based offset of p from the start of
     * the current line and return scanloc.
     */
    final Loc loc()
    {
        scanloc.charnum = cast(uint)(1 + p - line);
        return scanloc;
    }

    /**************************************
     * Report a printf-style error at token.loc and set the errors flag.
     */
    final void error(const(char)* format, ...)
    {
        va_list ap;
        va_start(ap, format);
        .verror(token.loc, format, ap);
        va_end(ap);
        errors = true;
    }

    /**************************************
     * Report a printf-style error at the given location and set the
     * errors flag.
     */
    final void error(Loc loc, const(char)* format, ...)
    {
        va_list ap;
        va_start(ap, format);
        .verror(loc, format, ap);
        va_end(ap);
        errors = true;
    }

    /**************************************
     * Report a printf-style deprecation at token.loc.
     * The errors flag is set only when global.params.useDeprecated is 0.
     */
    final void deprecation(const(char)* format, ...)
    {
        va_list ap;
        va_start(ap, format);
        .vdeprecation(token.loc, format, ap);
        va_end(ap);
        if (global.params.useDeprecated == 0)
            errors = true;
    }

    /*********************************************
     * parse:
     *      #line linnum [filespec]
     * also allow __LINE__ for linnum, and __FILE__ for filespec
     *
     * On success scanloc.linnum (and scanloc.filename, when a filespec was
     * given) are updated. A malformed directive is reported as an error.
     */
    final void poundLine()
    {
        auto linnum = this.scanloc.linnum;
        const(char)* filespec = null;
        const loc = this.loc();
        Token tok;
        scan(&tok);
        if (tok.value == TOKint32v || tok.value == TOKint64v)
        {
            // Stored as the given number minus one; must survive the narrowing to int
            const lin = cast(int)(tok.uns64value - 1);
            if (lin != tok.uns64value - 1)
                error("line number %lld out of range", cast(ulong)tok.uns64value);
            else
                linnum = lin;
        }
        else if (tok.value == TOKline)
        {
            // __LINE__: keep the current line number
        }
        else
            goto Lerr;
        while (1)
        {
            switch (*p)
            {
            case 0:
            case 0x1A:
            case '\n':
            Lnewline:
                this.scanloc.linnum = linnum;
                if (filespec)
                    this.scanloc.filename = filespec;
                return;
            case '\r':
                p++;
                if (*p != '\n')
                {
                    p--;
                    goto Lnewline;
                }
                continue;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                p++;
                continue; // skip white space
            case '_':
                // strncmp stops at the terminating 0, so it never reads past
                // the end of the source buffer the way a fixed 8-byte memcmp can
                if (strncmp(p, "__FILE__".ptr, 8) == 0)
                {
                    p += 8;
                    filespec = mem.xstrdup(scanloc.filename);
                    continue;
                }
                goto Lerr;
            case '"':
                if (filespec)
                    goto Lerr;
                stringbuffer.reset();
                p++;
                while (1)
                {
                    uint c;
                    c = *p;
                    switch (c)
                    {
                    case '\n':
                    case '\r':
                    case 0:
                    case 0x1A:
                        goto Lerr;
                    case '"':
                        stringbuffer.writeByte(0);
                        filespec = mem.xstrdup(cast(const(char)*)stringbuffer.data);
                        p++;
                        break;
                    default:
                        if (c & 0x80)
                        {
                            const u = decodeUTF();
                            if (u == PS || u == LS)
                                goto Lerr;
                            // decodeUTF() left p on the last byte of the
                            // sequence, so store the whole code point and
                            // not just its lead byte
                            stringbuffer.writeUTF8(u);
                        }
                        else
                            stringbuffer.writeByte(c);
                        p++;
                        continue;
                    }
                    break;
                }
                continue;
            default:
                if (*p & 0x80)
                {
                    uint u = decodeUTF();
                    if (u == PS || u == LS)
                        goto Lnewline;
                }
                goto Lerr;
            }
        }
    Lerr:
        error(loc, "#line integer [\"filespec\"]\\n expected");
    }

    /********************************************
     * Decode UTF character.
     * Issue error messages for invalid sequences.
     * Return decoded character, advance p to last character in UTF sequence.
     */
    final uint decodeUTF()
    {
        const s = p;
        assert(*s & 0x80);
        // Measure the remaining input, up to 6 bytes or the terminating 0
        size_t len;
        for (len = 1; len < 6 && s[len]; len++)
        {
        }
        size_t idx = 0;
        dchar u;
        const msg = utf_decodeChar(s, len, idx, u);
        // idx is the number of bytes consumed; stay on the last one
        p += idx - 1;
        if (msg)
        {
            error("%s", msg);
        }
        return u;
    }

    /***************************************************
     * Parse doc comment embedded between t->ptr and p.
     * Remove trailing blanks and tabs from lines.
     * Replace all newlines with \n.
     * Remove leading comment character from each line.
     * Decide if it's a lineComment or a blockComment.
     * Append to previous one for this token.
*/
    final void getDocComment(Token* t, uint lineComment)
    {
        // The comment text runs from t.ptr up to the current position p.
        // Outline: strip the opening delimiter and any row of repeated comment
        // characters, strip the closing delimiter, copy the remainder into a
        // buffer line by line, then store or append the result on the token.

        /* ct tells us which kind of comment it is: '/', '*', or '+'
         */
        const ct = t.ptr[2];
        /* Start of comment text skips over / * *, / + +, or / / /
         */
        const(char)* q = t.ptr + 3; // start of comment text
        const(char)* qend = p;
        // Block comments end with a two character closing delimiter
        if (ct == '*' || ct == '+')
            qend -= 2;
        /* Scan over initial row of ****'s or ++++'s or ////'s
         */
        for (; q < qend; q++)
        {
            if (*q != ct)
                break;
        }
        /* Remove leading spaces until start of the comment
         */
        int linestart = 0; // nonzero while only leading characters of a line have been seen
        if (ct == '/')
        {
            while (q < qend && (*q == ' ' || *q == '\t'))
                ++q;
        }
        else if (q < qend)
        {
            // Block comment: drop a line break that directly follows the opening row
            if (*q == '\r')
            {
                ++q;
                if (q < qend && *q == '\n')
                    ++q;
                linestart = 1;
            }
            else if (*q == '\n')
            {
                ++q;
                linestart = 1;
            }
        }
        /* Remove trailing row of ****'s or ++++'s
         */
        if (ct != '/')
        {
            for (; q < qend; qend--)
            {
                if (qend[-1] != ct)
                    break;
            }
        }
        /* Comment is now [q .. qend].
         * Canonicalize it into buf[].
         */
        OutBuffer buf;

        // Shrink buf so that it no longer ends in blanks or tabs
        void trimTrailingWhitespace()
        {
            const s = buf.peekSlice();
            auto len = s.length;
            while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
                --len;
            buf.setsize(len);
        }

        for (; q < qend; q++)
        {
            char c = *q;
            switch (c)
            {
            case '*':
            case '+':
                // The first comment character on a line is decoration: drop it
                if (linestart && c == ct)
                {
                    linestart = 0;
                    /* Trim preceding whitespace up to preceding \n
                     */
                    trimTrailingWhitespace();
                    continue;
                }
                break;
            case ' ':
            case '\t':
                // Leading blanks do not end the "start of line" state
                break;
            case '\r':
                if (q[1] == '\n')
                    continue; // skip the \r
                goto Lnewline;
            default:
                // 226 128 168 and 226 128 169 (0xE2 0x80 0xA8 / 0xA9) are the
                // UTF-8 encodings of U+2028 (LS) and U+2029 (PS)
                if (c == 226)
                {
                    // If LS or PS
                    if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                    {
                        q += 2;
                        goto Lnewline;
                    }
                }
                linestart = 0;
                break;
            Lnewline:
                c = '\n'; // replace all newlines with \n
                goto case;
            case '\n':
                linestart = 1;
                /* Trim trailing whitespace
                 */
                trimTrailingWhitespace();
                break;
            }
            buf.writeByte(c);
        }
        /* Trim trailing whitespace (if the last line does not have newline)
         */
        trimTrailingWhitespace();

        // Always end with a newline
        const s = buf.peekSlice();
        if (s.length == 0 || s[$ - 1] != '\n')
            buf.writeByte('\n');

        // It's a line comment if the start of the doc comment comes
        // after other non-whitespace on the same line.
        auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
        // Combine with previous doc comment, if any
        if (*dc)
            *dc = combineComments(*dc, buf.peekString());
        else
            *dc = buf.extractString();
    }

    /********************************************
     * Combine two document comments into one,
     * separated by a newline.
*/
    static const(char)* combineComments(const(char)* c1, const(char)* c2)
    {
        //printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
        // With only one comment present there is nothing to join
        if (!c1)
            return c2;
        if (!c2)
            return c1;

        const firstLen = strlen(c1);
        const secondLen = strlen(c2);
        // A non-empty first comment that lacks a final newline gets one appended
        const needsNewline = firstLen && c1[firstLen - 1] != '\n';
        const headLen = firstLen + (needsNewline ? 1 : 0);

        // Layout: c1 [newline] separator-newline c2 terminating-0
        auto combined = cast(char*)mem.xmalloc(headLen + 1 + secondLen + 1);
        auto dst = combined;
        memcpy(dst, c1, firstLen);
        dst += firstLen;
        if (needsNewline)
            *dst++ = '\n';
        *dst++ = '\n';
        memcpy(dst, c2, secondLen);
        dst[secondLen] = 0;
        return combined;
    }

private:
    /********************************************
     * Note the start of a new line: p becomes the line start and the
     * line number of scanloc is advanced.
     */
    final void endOfLine()
    {
        line = p;
        ++scanloc.linnum;
    }
}