1 /**
2  * Compiler implementation of the
3  * $(LINK2 http://www.dlang.org, D programming language).
4  *
5  * Copyright:   Copyright (c) 1999-2016 by Digital Mars, All Rights Reserved
6  * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
7  * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8  * Source:      $(DMDSRC _lexer.d)
9  */
10 
11 module ddmd.lexer;
12 
import core.stdc.ctype;
import core.stdc.errno;
import core.stdc.stdarg;
import core.stdc.stdio;
import core.stdc.string;
import core.stdc.time;
19 
20 import ddmd.entity;
21 import ddmd.errors;
22 import ddmd.globals;
23 import ddmd.id;
24 import ddmd.identifier;
25 import ddmd.root.ctfloat;
26 import ddmd.root.outbuffer;
27 import ddmd.root.port;
28 import ddmd.root.rmem;
29 import ddmd.tokens;
30 import ddmd.utf;
31 
32 enum LS = 0x2028;       // UTF line separator
33 enum PS = 0x2029;       // UTF paragraph separator
34 
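/********************************************
 * Do our own char maps.
 *
 * cmtable[c] holds a set of CM* bit flags classifying the byte c. The table
 * is filled in by the module constructor further down in this file:
 *  CMoctal       = c is one of '0' .. '7'
 *  CMhex         = isxdigit(c)
 *  CMidchar      = isalnum(c) or c is '_'
 *  CMzerosecond  = c may follow a leading '0' and continue a numeric literal
 *  CMdigitsecond = c may follow a leading '1' .. '9' and continue a numeric literal
 *  CMsinglechar  = c is a 7-bit character other than backslash, '\n', '\r',
 *                  0, 0x1A and the single quote, so it can be the body of a
 *                  simple one character literal
 */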
38 immutable ubyte[256] cmtable;
39 enum CMoctal  = 0x1;
40 enum CMhex    = 0x2;
41 enum CMidchar = 0x4;
42 enum CMzerosecond = 0x8;
43 enum CMdigitsecond = 0x10;
44 enum CMsinglechar = 0x20;
45 
/// Returns: true when the CMoctal bit is set for `c` in `cmtable`.
bool isoctal(char c)
{
    const flags = cmtable[c];
    return (flags & CMoctal) != 0;
}
50 
/// Returns: true when the CMhex bit is set for `c` in `cmtable`.
bool ishex(char c)
{
    const flags = cmtable[c];
    return (flags & CMhex) != 0;
}
55 
/// Returns: true when the CMidchar bit is set for `c` in `cmtable`.
bool isidchar(char c)
{
    const flags = cmtable[c];
    return (flags & CMidchar) != 0;
}
60 
/// Returns: true when the CMzerosecond bit is set for `c` in `cmtable`.
bool isZeroSecond(char c)
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}
65 
/// Returns: true when the CMdigitsecond bit is set for `c` in `cmtable`.
bool isDigitSecond(char c)
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}
70 
/// Returns: true when the CMsinglechar bit is set for `c` in `cmtable`.
bool issinglechar(char c)
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}
75 
/**
 * Module constructor: fills in `cmtable`, the per-byte classification table
 * consulted by isoctal(), ishex(), isidchar(), isZeroSecond(),
 * isDigitSecond() and issinglechar().
 *
 * This is a `shared static this` because `cmtable` is immutable and hence
 * shared by all threads: it must be initialised exactly once at program
 * start. A thread-local `static this` would run again for every thread that
 * is started and write to the immutable table each time.
 */
shared static this()
{
    foreach (const c; 0 .. cmtable.length)
    {
        // Digit / identifier classes.
        if ('0' <= c && c <= '7')
            cmtable[c] |= CMoctal;
        if (isxdigit(c))
            cmtable[c] |= CMhex;
        if (isalnum(c) || c == '_')
            cmtable[c] |= CMidchar;

        // Characters that may come second in a numeric literal.
        switch (c)
        {
            // Radix prefixes are only meaningful directly after a leading '0'.
            case 'x': case 'X':
            case 'b': case 'B':
                cmtable[c] |= CMzerosecond;
                break;

            // These continue a literal after any leading digit.
            case '0': .. case '9':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
            case 'i':
            case '.':
            case '_':
                cmtable[c] |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // Characters that can be the body of a simple 'c' literal: any 7-bit
        // character except the ones listed here.
        switch (c)
        {
            case '\\':
            case '\n':
            case '\r':
            case 0:
            case 0x1A:
            case '\'':
                break;
            default:
                if (!(c & 0x80))
                    cmtable[c] |= CMsinglechar;
                break;
        }
    }
}
126 
unittest
{
    /* Smoke test only: lex the text "int" and check the token stream.
     */
    string source = "int";
    scope Lexer lexer = new Lexer(null, source.ptr, 0, source.length, false, false);

    // The only real token is the `int` keyword.
    assert(lexer.nextToken() == TOKint32);

    // After that the lexer reports end of file, and keeps doing so.
    assert(lexer.nextToken() == TOKeof);
    assert(lexer.nextToken() == TOKeof);
}
143 
144 /***********************************************************
145  */
146 class Lexer
147 {
148     __gshared OutBuffer stringbuffer;
149 
150     Loc scanloc;            // for error messages
151 
152     const(char)* base;      // pointer to start of buffer
153     const(char)* end;       // past end of buffer
154     const(char)* p;         // current character
155     const(char)* line;      // start of current line
156     Token token;
157     bool doDocComment;      // collect doc comment information
158     bool anyToken;          // seen at least one token
159     bool commentToken;      // comments are TOKcomment's
160     bool errors;            // errors occurred during lexing or parsing
161 
162     /*********************
163      * Creates a Lexer.
164      * Params:
165      *  filename = used for error messages
166      *  base = source code, ending in a 0 byte
167      *  begoffset = starting offset into base[]
     *  endoffset = offset into base[] just past the last character to lex
169      *  doDocComment = handle documentation comments
170      *  commentToken = comments become TOKcomment's
171      */
172     this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset, bool doDocComment, bool commentToken)
173     {
174         scanloc = Loc(filename, 1, 1);
175         //printf("Lexer::Lexer(%p,%d)\n",base,length);
176         //printf("lexer.filename = %s\n", filename);
177         token = Token.init;
178         this.base = base;
179         this.end = base + endoffset;
180         p = base + begoffset;
181         line = p;
182         this.doDocComment = doDocComment;
183         this.commentToken = commentToken;
184         //initKeywords();
185         /* If first line starts with '#!', ignore the line
186          */
187         if (p[0] == '#' && p[1] == '!')
188         {
189             p += 2;
190             while (1)
191             {
192                 char c = *p;
193                 switch (c)
194                 {
195                 case '\n':
196                     p++;
197                     break;
198                 case '\r':
199                     p++;
200                     if (*p == '\n')
201                         p++;
202                     break;
203                 case 0:
204                 case 0x1A:
205                     break;
206                 default:
207                     if (c & 0x80)
208                     {
209                         uint u = decodeUTF();
210                         if (u == PS || u == LS)
211                             break;
212                     }
213                     p++;
214                     continue;
215                 }
216                 break;
217             }
218             endOfLine();
219         }
220     }
221 
222     final TOK nextToken()
223     {
224         if (token.next)
225         {
226             Token* t = token.next;
227             memcpy(&token, t, Token.sizeof);
228             t.free();
229         }
230         else
231         {
232             scan(&token);
233         }
234         //token.print();
235         return token.value;
236     }
237 
238     /***********************
239      * Look ahead at next token's value.
240      */
241     final TOK peekNext()
242     {
243         return peek(&token).value;
244     }
245 
246     /***********************
247      * Look 2 tokens ahead at value.
248      */
249     final TOK peekNext2()
250     {
251         Token* t = peek(&token);
252         return peek(t).value;
253     }
254 
255     /****************************
256      * Turn next token in buffer into a token.
257      */
258     final void scan(Token* t)
259     {
260         const lastLine = scanloc.linnum;
261         Loc startLoc;
262         t.blockComment = null;
263         t.lineComment = null;
264         while (1)
265         {
266             t.ptr = p;
267             //printf("p = %p, *p = '%c'\n",p,*p);
268             t.loc = loc();
269             switch (*p)
270             {
271             case 0:
272             case 0x1A:
273                 t.value = TOKeof; // end of file
274                 return;
275             case ' ':
276             case '\t':
277             case '\v':
278             case '\f':
279                 p++;
280                 continue; // skip white space
281             case '\r':
282                 p++;
283                 if (*p != '\n') // if CR stands by itself
284                     endOfLine();
285                 continue; // skip white space
286             case '\n':
287                 p++;
288                 endOfLine();
289                 continue; // skip white space
290             case '0':
291                 if (!isZeroSecond(p[1]))        // if numeric literal does not continue
292                 {
293                     ++p;
294                     t.uns64value = 0;
295                     t.value = TOKint32v;
296                     return;
297                 }
298                 goto Lnumber;
299 
300             case '1': .. case '9':
301                 if (!isDigitSecond(p[1]))       // if numeric literal does not continue
302                 {
303                     t.uns64value = *p - '0';
304                     ++p;
305                     t.value = TOKint32v;
306                     return;
307                 }
308             Lnumber:
309                 t.value = number(t);
310                 return;
311 
312             case '\'':
313                 if (issinglechar(p[1]) && p[2] == '\'')
314                 {
315                     t.uns64value = p[1];        // simple one character literal
316                     t.value = TOKcharv;
317                     p += 3;
318                 }
319                 else
320                     t.value = charConstant(t);
321                 return;
322             case 'r':
323                 if (p[1] != '"')
324                     goto case_ident;
325                 p++;
326                 goto case '`';
327             case '`':
328                 t.value = wysiwygStringConstant(t, *p);
329                 return;
330             case 'x':
331                 if (p[1] != '"')
332                     goto case_ident;
333                 p++;
334                 t.value = hexStringConstant(t);
335                 return;
336             case 'q':
337                 if (p[1] == '"')
338                 {
339                     p++;
340                     t.value = delimitedStringConstant(t);
341                     return;
342                 }
343                 else if (p[1] == '{')
344                 {
345                     p++;
346                     t.value = tokenStringConstant(t);
347                     return;
348                 }
349                 else
350                     goto case_ident;
351             case '"':
352                 t.value = escapeStringConstant(t, 0);
353                 return;
354             case 'a':
355             case 'b':
356             case 'c':
357             case 'd':
358             case 'e':
359             case 'f':
360             case 'g':
361             case 'h':
362             case 'i':
363             case 'j':
364             case 'k':
365             case 'l':
366             case 'm':
367             case 'n':
368             case 'o':
369             case 'p':
370                 /*case 'q': case 'r':*/
371             case 's':
372             case 't':
373             case 'u':
374             case 'v':
375             case 'w':
376                 /*case 'x':*/
377             case 'y':
378             case 'z':
379             case 'A':
380             case 'B':
381             case 'C':
382             case 'D':
383             case 'E':
384             case 'F':
385             case 'G':
386             case 'H':
387             case 'I':
388             case 'J':
389             case 'K':
390             case 'L':
391             case 'M':
392             case 'N':
393             case 'O':
394             case 'P':
395             case 'Q':
396             case 'R':
397             case 'S':
398             case 'T':
399             case 'U':
400             case 'V':
401             case 'W':
402             case 'X':
403             case 'Y':
404             case 'Z':
405             case '_':
406             case_ident:
407                 {
408                     while (1)
409                     {
410                         const c = *++p;
411                         if (isidchar(c))
412                             continue;
413                         else if (c & 0x80)
414                         {
415                             const s = p;
416                             const u = decodeUTF();
417                             if (isUniAlpha(u))
418                                 continue;
419                             error("char 0x%04x not allowed in identifier", u);
420                             p = s;
421                         }
422                         break;
423                     }
424                     Identifier id = Identifier.idPool(cast(char*)t.ptr, p - t.ptr);
425                     t.ident = id;
426                     t.value = cast(TOK)id.getValue();
427                     anyToken = 1;
428                     if (*t.ptr == '_') // if special identifier token
429                     {
430                         __gshared bool initdone = false;
431                         __gshared char[11 + 1] date;
432                         __gshared char[8 + 1] time;
433                         __gshared char[24 + 1] timestamp;
434                         if (!initdone) // lazy evaluation
435                         {
436                             initdone = true;
437                             time_t ct;
438                             .time(&ct);
439                             const p = ctime(&ct);
440                             assert(p);
441                             sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
442                             sprintf(&time[0], "%.8s", p + 11);
443                             sprintf(&timestamp[0], "%.24s", p);
444                         }
445                         if (id == Id.DATE)
446                         {
447                             t.ustring = date.ptr;
448                             goto Lstr;
449                         }
450                         else if (id == Id.TIME)
451                         {
452                             t.ustring = time.ptr;
453                             goto Lstr;
454                         }
455                         else if (id == Id.VENDOR)
456                         {
457                             t.ustring = global.compiler.vendor;
458                             goto Lstr;
459                         }
460                         else if (id == Id.TIMESTAMP)
461                         {
462                             t.ustring = timestamp.ptr;
463                         Lstr:
464                             t.value = TOKstring;
465                             t.postfix = 0;
466                             t.len = cast(uint)strlen(t.ustring);
467                         }
468                         else if (id == Id.VERSIONX)
469                         {
470                             uint major = 0;
471                             uint minor = 0;
472                             bool point = false;
473                             for (const(char)* p = global._version + 1; 1; p++)
474                             {
475                                 const c = *p;
476                                 if (isdigit(cast(char)c))
477                                     minor = minor * 10 + c - '0';
478                                 else if (c == '.')
479                                 {
480                                     if (point)
481                                         break; // ignore everything after second '.'
482                                     point = true;
483                                     major = minor;
484                                     minor = 0;
485                                 }
486                                 else
487                                     break;
488                             }
489                             t.value = TOKint64v;
490                             t.uns64value = major * 1000 + minor;
491                         }
492                         else if (id == Id.EOFX)
493                         {
494                             t.value = TOKeof;
495                             // Advance scanner to end of file
496                             while (!(*p == 0 || *p == 0x1A))
497                                 p++;
498                         }
499                     }
500                     //printf("t->value = %d\n",t->value);
501                     return;
502                 }
503             case '/':
504                 p++;
505                 switch (*p)
506                 {
507                 case '=':
508                     p++;
509                     t.value = TOKdivass;
510                     return;
511                 case '*':
512                     p++;
513                     startLoc = loc();
514                     while (1)
515                     {
516                         while (1)
517                         {
518                             const c = *p;
519                             switch (c)
520                             {
521                             case '/':
522                                 break;
523                             case '\n':
524                                 endOfLine();
525                                 p++;
526                                 continue;
527                             case '\r':
528                                 p++;
529                                 if (*p != '\n')
530                                     endOfLine();
531                                 continue;
532                             case 0:
533                             case 0x1A:
534                                 error("unterminated /* */ comment");
535                                 p = end;
536                                 t.loc = loc();
537                                 t.value = TOKeof;
538                                 return;
539                             default:
540                                 if (c & 0x80)
541                                 {
542                                     const u = decodeUTF();
543                                     if (u == PS || u == LS)
544                                         endOfLine();
545                                 }
546                                 p++;
547                                 continue;
548                             }
549                             break;
550                         }
551                         p++;
552                         if (p[-2] == '*' && p - 3 != t.ptr)
553                             break;
554                     }
555                     if (commentToken)
556                     {
557                         t.loc = startLoc;
558                         t.value = TOKcomment;
559                         return;
560                     }
561                     else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
562                     {
563                         // if /** but not /**/
564                         getDocComment(t, lastLine == startLoc.linnum);
565                     }
566                     continue;
567                 case '/': // do // style comments
568                     startLoc = loc();
569                     while (1)
570                     {
571                         const c = *++p;
572                         switch (c)
573                         {
574                         case '\n':
575                             break;
576                         case '\r':
577                             if (p[1] == '\n')
578                                 p++;
579                             break;
580                         case 0:
581                         case 0x1A:
582                             if (commentToken)
583                             {
584                                 p = end;
585                                 t.loc = startLoc;
586                                 t.value = TOKcomment;
587                                 return;
588                             }
589                             if (doDocComment && t.ptr[2] == '/')
590                                 getDocComment(t, lastLine == startLoc.linnum);
591                             p = end;
592                             t.loc = loc();
593                             t.value = TOKeof;
594                             return;
595                         default:
596                             if (c & 0x80)
597                             {
598                                 const u = decodeUTF();
599                                 if (u == PS || u == LS)
600                                     break;
601                             }
602                             continue;
603                         }
604                         break;
605                     }
606                     if (commentToken)
607                     {
608                         p++;
609                         endOfLine();
610                         t.loc = startLoc;
611                         t.value = TOKcomment;
612                         return;
613                     }
614                     if (doDocComment && t.ptr[2] == '/')
615                         getDocComment(t, lastLine == startLoc.linnum);
616                     p++;
617                     endOfLine();
618                     continue;
619                 case '+':
620                     {
621                         int nest;
622                         startLoc = loc();
623                         p++;
624                         nest = 1;
625                         while (1)
626                         {
627                             char c = *p;
628                             switch (c)
629                             {
630                             case '/':
631                                 p++;
632                                 if (*p == '+')
633                                 {
634                                     p++;
635                                     nest++;
636                                 }
637                                 continue;
638                             case '+':
639                                 p++;
640                                 if (*p == '/')
641                                 {
642                                     p++;
643                                     if (--nest == 0)
644                                         break;
645                                 }
646                                 continue;
647                             case '\r':
648                                 p++;
649                                 if (*p != '\n')
650                                     endOfLine();
651                                 continue;
652                             case '\n':
653                                 endOfLine();
654                                 p++;
655                                 continue;
656                             case 0:
657                             case 0x1A:
658                                 error("unterminated /+ +/ comment");
659                                 p = end;
660                                 t.loc = loc();
661                                 t.value = TOKeof;
662                                 return;
663                             default:
664                                 if (c & 0x80)
665                                 {
666                                     uint u = decodeUTF();
667                                     if (u == PS || u == LS)
668                                         endOfLine();
669                                 }
670                                 p++;
671                                 continue;
672                             }
673                             break;
674                         }
675                         if (commentToken)
676                         {
677                             t.loc = startLoc;
678                             t.value = TOKcomment;
679                             return;
680                         }
681                         if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
682                         {
683                             // if /++ but not /++/
684                             getDocComment(t, lastLine == startLoc.linnum);
685                         }
686                         continue;
687                     }
688                 default:
689                     break;
690                 }
691                 t.value = TOKdiv;
692                 return;
693             case '.':
694                 p++;
695                 if (isdigit(*p))
696                 {
697                     /* Note that we don't allow ._1 and ._ as being
698                      * valid floating point numbers.
699                      */
700                     p--;
701                     t.value = inreal(t);
702                 }
703                 else if (p[0] == '.')
704                 {
705                     if (p[1] == '.')
706                     {
707                         p += 2;
708                         t.value = TOKdotdotdot;
709                     }
710                     else
711                     {
712                         p++;
713                         t.value = TOKslice;
714                     }
715                 }
716                 else
717                     t.value = TOKdot;
718                 return;
719             case '&':
720                 p++;
721                 if (*p == '=')
722                 {
723                     p++;
724                     t.value = TOKandass;
725                 }
726                 else if (*p == '&')
727                 {
728                     p++;
729                     t.value = TOKandand;
730                 }
731                 else
732                     t.value = TOKand;
733                 return;
734             case '|':
735                 p++;
736                 if (*p == '=')
737                 {
738                     p++;
739                     t.value = TOKorass;
740                 }
741                 else if (*p == '|')
742                 {
743                     p++;
744                     t.value = TOKoror;
745                 }
746                 else
747                     t.value = TOKor;
748                 return;
749             case '-':
750                 p++;
751                 if (*p == '=')
752                 {
753                     p++;
754                     t.value = TOKminass;
755                 }
756                 else if (*p == '-')
757                 {
758                     p++;
759                     t.value = TOKminusminus;
760                 }
761                 else
762                     t.value = TOKmin;
763                 return;
764             case '+':
765                 p++;
766                 if (*p == '=')
767                 {
768                     p++;
769                     t.value = TOKaddass;
770                 }
771                 else if (*p == '+')
772                 {
773                     p++;
774                     t.value = TOKplusplus;
775                 }
776                 else
777                     t.value = TOKadd;
778                 return;
779             case '<':
780                 p++;
781                 if (*p == '=')
782                 {
783                     p++;
784                     t.value = TOKle; // <=
785                 }
786                 else if (*p == '<')
787                 {
788                     p++;
789                     if (*p == '=')
790                     {
791                         p++;
792                         t.value = TOKshlass; // <<=
793                     }
794                     else
795                         t.value = TOKshl; // <<
796                 }
797                 else if (*p == '>')
798                 {
799                     p++;
800                     if (*p == '=')
801                     {
802                         p++;
803                         t.value = TOKleg; // <>=
804                     }
805                     else
806                         t.value = TOKlg; // <>
807                 }
808                 else
809                     t.value = TOKlt; // <
810                 return;
811             case '>':
812                 p++;
813                 if (*p == '=')
814                 {
815                     p++;
816                     t.value = TOKge; // >=
817                 }
818                 else if (*p == '>')
819                 {
820                     p++;
821                     if (*p == '=')
822                     {
823                         p++;
824                         t.value = TOKshrass; // >>=
825                     }
826                     else if (*p == '>')
827                     {
828                         p++;
829                         if (*p == '=')
830                         {
831                             p++;
832                             t.value = TOKushrass; // >>>=
833                         }
834                         else
835                             t.value = TOKushr; // >>>
836                     }
837                     else
838                         t.value = TOKshr; // >>
839                 }
840                 else
841                     t.value = TOKgt; // >
842                 return;
843             case '!':
844                 p++;
845                 if (*p == '=')
846                 {
847                     p++;
848                     t.value = TOKnotequal; // !=
849                 }
850                 else if (*p == '<')
851                 {
852                     p++;
853                     if (*p == '>')
854                     {
855                         p++;
856                         if (*p == '=')
857                         {
858                             p++;
859                             t.value = TOKunord; // !<>=
860                         }
861                         else
862                             t.value = TOKue; // !<>
863                     }
864                     else if (*p == '=')
865                     {
866                         p++;
867                         t.value = TOKug; // !<=
868                     }
869                     else
870                         t.value = TOKuge; // !<
871                 }
872                 else if (*p == '>')
873                 {
874                     p++;
875                     if (*p == '=')
876                     {
877                         p++;
878                         t.value = TOKul; // !>=
879                     }
880                     else
881                         t.value = TOKule; // !>
882                 }
883                 else
884                     t.value = TOKnot; // !
885                 return;
886             case '=':
887                 p++;
888                 if (*p == '=')
889                 {
890                     p++;
891                     t.value = TOKequal; // ==
892                 }
893                 else if (*p == '>')
894                 {
895                     p++;
896                     t.value = TOKgoesto; // =>
897                 }
898                 else
899                     t.value = TOKassign; // =
900                 return;
901             case '~':
902                 p++;
903                 if (*p == '=')
904                 {
905                     p++;
906                     t.value = TOKcatass; // ~=
907                 }
908                 else
909                     t.value = TOKtilde; // ~
910                 return;
911             case '^':
912                 p++;
913                 if (*p == '^')
914                 {
915                     p++;
916                     if (*p == '=')
917                     {
918                         p++;
919                         t.value = TOKpowass; // ^^=
920                     }
921                     else
922                         t.value = TOKpow; // ^^
923                 }
924                 else if (*p == '=')
925                 {
926                     p++;
927                     t.value = TOKxorass; // ^=
928                 }
929                 else
930                     t.value = TOKxor; // ^
931                 return;
932             case '(':
933                 p++;
934                 t.value = TOKlparen;
935                 return;
936             case ')':
937                 p++;
938                 t.value = TOKrparen;
939                 return;
940             case '[':
941                 p++;
942                 t.value = TOKlbracket;
943                 return;
944             case ']':
945                 p++;
946                 t.value = TOKrbracket;
947                 return;
948             case '{':
949                 p++;
950                 t.value = TOKlcurly;
951                 return;
952             case '}':
953                 p++;
954                 t.value = TOKrcurly;
955                 return;
956             case '?':
957                 p++;
958                 t.value = TOKquestion;
959                 return;
960             case ',':
961                 p++;
962                 t.value = TOKcomma;
963                 return;
964             case ';':
965                 p++;
966                 t.value = TOKsemicolon;
967                 return;
968             case ':':
969                 p++;
970                 t.value = TOKcolon;
971                 return;
972             case '$':
973                 p++;
974                 t.value = TOKdollar;
975                 return;
976             case '@':
977                 p++;
978                 t.value = TOKat;
979                 return;
980             case '*':
981                 p++;
982                 if (*p == '=')
983                 {
984                     p++;
985                     t.value = TOKmulass;
986                 }
987                 else
988                     t.value = TOKmul;
989                 return;
990             case '%':
991                 p++;
992                 if (*p == '=')
993                 {
994                     p++;
995                     t.value = TOKmodass;
996                 }
997                 else
998                     t.value = TOKmod;
999                 return;
1000             case '#':
1001                 {
1002                     p++;
1003                     Token n;
1004                     scan(&n);
1005                     if (n.value == TOKidentifier && n.ident == Id.line)
1006                     {
1007                         poundLine();
1008                         continue;
1009                     }
1010                     else
1011                     {
1012                         t.value = TOKpound;
1013                         return;
1014                     }
1015                 }
1016             default:
1017                 {
1018                     dchar c = *p;
1019                     if (c & 0x80)
1020                     {
1021                         c = decodeUTF();
1022                         // Check for start of unicode identifier
1023                         if (isUniAlpha(c))
1024                             goto case_ident;
1025                         if (c == PS || c == LS)
1026                         {
1027                             endOfLine();
1028                             p++;
1029                             continue;
1030                         }
1031                     }
1032                     if (c < 0x80 && isprint(c))
1033                         error("character '%c' is not a valid token", c);
1034                     else
1035                         error("character 0x%02x is not a valid token", c);
1036                     p++;
1037                     continue;
1038                 }
1039             }
1040         }
1041     }
1042 
1043     final Token* peek(Token* ct)
1044     {
1045         Token* t;
1046         if (ct.next)
1047             t = ct.next;
1048         else
1049         {
1050             t = Token.alloc();
1051             scan(t);
1052             ct.next = t;
1053         }
1054         return t;
1055     }
1056 
1057     /*********************************
1058      * tk is on the opening (.
1059      * Look ahead and return token that is past the closing ).
1060      */
1061     final Token* peekPastParen(Token* tk)
1062     {
1063         //printf("peekPastParen()\n");
1064         int parens = 1;
1065         int curlynest = 0;
1066         while (1)
1067         {
1068             tk = peek(tk);
1069             //tk->print();
1070             switch (tk.value)
1071             {
1072             case TOKlparen:
1073                 parens++;
1074                 continue;
1075             case TOKrparen:
1076                 --parens;
1077                 if (parens)
1078                     continue;
1079                 tk = peek(tk);
1080                 break;
1081             case TOKlcurly:
1082                 curlynest++;
1083                 continue;
1084             case TOKrcurly:
1085                 if (--curlynest >= 0)
1086                     continue;
1087                 break;
1088             case TOKsemicolon:
1089                 if (curlynest)
1090                     continue;
1091                 break;
1092             case TOKeof:
1093                 break;
1094             default:
1095                 continue;
1096             }
1097             return tk;
1098         }
1099     }
1100 
1101     /*******************************************
1102      * Parse escape sequence.
1103      */
1104     final uint escapeSequence()
1105     {
1106         uint c = *p;
1107         int ndigits;
1108         switch (c)
1109         {
1110         case '\'':
1111         case '"':
1112         case '?':
1113         case '\\':
1114         Lconsume:
1115             p++;
1116             break;
1117         case 'a':
1118             c = 7;
1119             goto Lconsume;
1120         case 'b':
1121             c = 8;
1122             goto Lconsume;
1123         case 'f':
1124             c = 12;
1125             goto Lconsume;
1126         case 'n':
1127             c = 10;
1128             goto Lconsume;
1129         case 'r':
1130             c = 13;
1131             goto Lconsume;
1132         case 't':
1133             c = 9;
1134             goto Lconsume;
1135         case 'v':
1136             c = 11;
1137             goto Lconsume;
1138         case 'u':
1139             ndigits = 4;
1140             goto Lhex;
1141         case 'U':
1142             ndigits = 8;
1143             goto Lhex;
1144         case 'x':
1145             ndigits = 2;
1146         Lhex:
1147             p++;
1148             c = *p;
1149             if (ishex(cast(char)c))
1150             {
1151                 uint v = 0;
1152                 int n = 0;
1153                 while (1)
1154                 {
1155                     if (isdigit(cast(char)c))
1156                         c -= '0';
1157                     else if (islower(c))
1158                         c -= 'a' - 10;
1159                     else
1160                         c -= 'A' - 10;
1161                     v = v * 16 + c;
1162                     c = *++p;
1163                     if (++n == ndigits)
1164                         break;
1165                     if (!ishex(cast(char)c))
1166                     {
1167                         error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1168                         break;
1169                     }
1170                 }
1171                 if (ndigits != 2 && !utf_isValidDchar(v))
1172                 {
1173                     error("invalid UTF character \\U%08x", v);
1174                     v = '?'; // recover with valid UTF character
1175                 }
1176                 c = v;
1177             }
1178             else
1179                 error("undefined escape hex sequence \\%c", c);
1180             break;
1181         case '&':
1182             // named character entity
1183             for (const idstart = ++p; 1; p++)
1184             {
1185                 switch (*p)
1186                 {
1187                 case ';':
1188                     c = HtmlNamedEntity(idstart, p - idstart);
1189                     if (c == ~0)
1190                     {
1191                         error("unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
1192                         c = ' ';
1193                     }
1194                     p++;
1195                     break;
1196                 default:
1197                     if (isalpha(*p) || (p != idstart && isdigit(*p)))
1198                         continue;
1199                     error("unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
1200                     break;
1201                 }
1202                 break;
1203             }
1204             break;
1205         case 0:
1206         case 0x1A:
1207             // end of file
1208             c = '\\';
1209             break;
1210         default:
1211             if (isoctal(cast(char)c))
1212             {
1213                 uint v = 0;
1214                 int n = 0;
1215                 do
1216                 {
1217                     v = v * 8 + (c - '0');
1218                     c = *++p;
1219                 }
1220                 while (++n < 3 && isoctal(cast(char)c));
1221                 c = v;
1222                 if (c > 0xFF)
1223                     error("escape octal sequence \\%03o is larger than \\377", c);
1224             }
1225             else
1226                 error("undefined escape sequence \\%c", c);
1227             break;
1228         }
1229         return c;
1230     }
1231 
1232     /**************************************
1233      */
1234     final TOK wysiwygStringConstant(Token* t, int tc)
1235     {
1236         Loc start = loc();
1237         p++;
1238         stringbuffer.reset();
1239         while (1)
1240         {
1241             dchar c = *p++;
1242             switch (c)
1243             {
1244             case '\n':
1245                 endOfLine();
1246                 break;
1247             case '\r':
1248                 if (*p == '\n')
1249                     continue; // ignore
1250                 c = '\n'; // treat EndOfLine as \n character
1251                 endOfLine();
1252                 break;
1253             case 0:
1254             case 0x1A:
1255                 error("unterminated string constant starting at %s", start.toChars());
1256                 t.setString();
1257                 return TOKstring;
1258             case '"':
1259             case '`':
1260                 if (c == tc)
1261                 {
1262                     t.setString(stringbuffer);
1263                     stringPostfix(t);
1264                     return TOKstring;
1265                 }
1266                 break;
1267             default:
1268                 if (c & 0x80)
1269                 {
1270                     p--;
1271                     const u = decodeUTF();
1272                     p++;
1273                     if (u == PS || u == LS)
1274                         endOfLine();
1275                     stringbuffer.writeUTF8(u);
1276                     continue;
1277                 }
1278                 break;
1279             }
1280             stringbuffer.writeByte(c);
1281         }
1282     }
1283 
1284     /**************************************
1285      * Lex hex strings:
1286      *      x"0A ae 34FE BD"
1287      */
1288     final TOK hexStringConstant(Token* t)
1289     {
1290         Loc start = loc();
1291         uint n = 0;
1292         uint v = ~0; // dead assignment, needed to suppress warning
1293         p++;
1294         stringbuffer.reset();
1295         while (1)
1296         {
1297             dchar c = *p++;
1298             switch (c)
1299             {
1300             case ' ':
1301             case '\t':
1302             case '\v':
1303             case '\f':
1304                 continue; // skip white space
1305             case '\r':
1306                 if (*p == '\n')
1307                     continue; // ignore '\r' if followed by '\n'
1308                 // Treat isolated '\r' as if it were a '\n'
1309                 goto case '\n';
1310             case '\n':
1311                 endOfLine();
1312                 continue;
1313             case 0:
1314             case 0x1A:
1315                 error("unterminated string constant starting at %s", start.toChars());
1316                 t.setString();
1317                 return TOKxstring;
1318             case '"':
1319                 if (n & 1)
1320                 {
1321                     error("odd number (%d) of hex characters in hex string", n);
1322                     stringbuffer.writeByte(v);
1323                 }
1324                 t.setString(stringbuffer);
1325                 stringPostfix(t);
1326                 return TOKxstring;
1327             default:
1328                 if (c >= '0' && c <= '9')
1329                     c -= '0';
1330                 else if (c >= 'a' && c <= 'f')
1331                     c -= 'a' - 10;
1332                 else if (c >= 'A' && c <= 'F')
1333                     c -= 'A' - 10;
1334                 else if (c & 0x80)
1335                 {
1336                     p--;
1337                     const u = decodeUTF();
1338                     p++;
1339                     if (u == PS || u == LS)
1340                         endOfLine();
1341                     else
1342                         error("non-hex character \\u%04x in hex string", u);
1343                 }
1344                 else
1345                     error("non-hex character '%c' in hex string", c);
1346                 if (n & 1)
1347                 {
1348                     v = (v << 4) | c;
1349                     stringbuffer.writeByte(v);
1350                 }
1351                 else
1352                     v = c;
1353                 n++;
1354                 break;
1355             }
1356         }
1357         assert(0); // see bug 15731
1358     }
1359 
1360     /**************************************
1361      * Lex delimited strings:
1362      *      q"(foo(xxx))"   // "foo(xxx)"
1363      *      q"[foo$(LPAREN)]"       // "foo$(LPAREN)"
1364      *      q"/foo]/"       // "foo]"
1365      *      q"HERE
1366      *      foo
1367      *      HERE"           // "foo\n"
1368      * Input:
1369      *      p is on the "
1370      */
1371     final TOK delimitedStringConstant(Token* t)
1372     {
1373         Loc start = loc();
1374         dchar delimleft = 0;
1375         dchar delimright = 0;
1376         uint nest = 1;
1377         uint nestcount = ~0; // dead assignment, needed to suppress warning
1378         Identifier hereid = null;
1379         uint blankrol = 0;
1380         uint startline = 0;
1381         p++;
1382         stringbuffer.reset();
1383         while (1)
1384         {
1385             dchar c = *p++;
1386             //printf("c = '%c'\n", c);
1387             switch (c)
1388             {
1389             case '\n':
1390             Lnextline:
1391                 endOfLine();
1392                 startline = 1;
1393                 if (blankrol)
1394                 {
1395                     blankrol = 0;
1396                     continue;
1397                 }
1398                 if (hereid)
1399                 {
1400                     stringbuffer.writeUTF8(c);
1401                     continue;
1402                 }
1403                 break;
1404             case '\r':
1405                 if (*p == '\n')
1406                     continue; // ignore
1407                 c = '\n'; // treat EndOfLine as \n character
1408                 goto Lnextline;
1409             case 0:
1410             case 0x1A:
1411                 error("unterminated delimited string constant starting at %s", start.toChars());
1412                 t.setString();
1413                 return TOKstring;
1414             default:
1415                 if (c & 0x80)
1416                 {
1417                     p--;
1418                     c = decodeUTF();
1419                     p++;
1420                     if (c == PS || c == LS)
1421                         goto Lnextline;
1422                 }
1423                 break;
1424             }
1425             if (delimleft == 0)
1426             {
1427                 delimleft = c;
1428                 nest = 1;
1429                 nestcount = 1;
1430                 if (c == '(')
1431                     delimright = ')';
1432                 else if (c == '{')
1433                     delimright = '}';
1434                 else if (c == '[')
1435                     delimright = ']';
1436                 else if (c == '<')
1437                     delimright = '>';
1438                 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1439                 {
1440                     // Start of identifier; must be a heredoc
1441                     Token tok;
1442                     p--;
1443                     scan(&tok); // read in heredoc identifier
1444                     if (tok.value != TOKidentifier)
1445                     {
1446                         error("identifier expected for heredoc, not %s", tok.toChars());
1447                         delimright = c;
1448                     }
1449                     else
1450                     {
1451                         hereid = tok.ident;
1452                         //printf("hereid = '%s'\n", hereid->toChars());
1453                         blankrol = 1;
1454                     }
1455                     nest = 0;
1456                 }
1457                 else
1458                 {
1459                     delimright = c;
1460                     nest = 0;
1461                     if (isspace(c))
1462                         error("delimiter cannot be whitespace");
1463                 }
1464             }
1465             else
1466             {
1467                 if (blankrol)
1468                 {
1469                     error("heredoc rest of line should be blank");
1470                     blankrol = 0;
1471                     continue;
1472                 }
1473                 if (nest == 1)
1474                 {
1475                     if (c == delimleft)
1476                         nestcount++;
1477                     else if (c == delimright)
1478                     {
1479                         nestcount--;
1480                         if (nestcount == 0)
1481                             goto Ldone;
1482                     }
1483                 }
1484                 else if (c == delimright)
1485                     goto Ldone;
1486                 if (startline && isalpha(c) && hereid)
1487                 {
1488                     Token tok;
1489                     auto psave = p;
1490                     p--;
1491                     scan(&tok); // read in possible heredoc identifier
1492                     //printf("endid = '%s'\n", tok.ident->toChars());
1493                     if (tok.value == TOKidentifier && tok.ident.equals(hereid))
1494                     {
1495                         /* should check that rest of line is blank
1496                          */
1497                         goto Ldone;
1498                     }
1499                     p = psave;
1500                 }
1501                 stringbuffer.writeUTF8(c);
1502                 startline = 0;
1503             }
1504         }
1505     Ldone:
1506         if (*p == '"')
1507             p++;
1508         else if (hereid)
1509             error("delimited string must end in %s\"", hereid.toChars());
1510         else
1511             error("delimited string must end in %c\"", delimright);
1512         t.setString(stringbuffer);
1513         stringPostfix(t);
1514         return TOKstring;
1515     }
1516 
1517     /**************************************
1518      * Lex delimited strings:
1519      *      q{ foo(xxx) } // " foo(xxx) "
1520      *      q{foo$(LPAREN)}       // "foo$(LPAREN)"
1521      *      q{{foo}"}"}   // "{foo}"}""
1522      * Input:
1523      *      p is on the q
1524      */
1525     final TOK tokenStringConstant(Token* t)
1526     {
1527         uint nest = 1;
1528         const start = loc();
1529         const pstart = ++p;
1530         while (1)
1531         {
1532             Token tok;
1533             scan(&tok);
1534             switch (tok.value)
1535             {
1536             case TOKlcurly:
1537                 nest++;
1538                 continue;
1539             case TOKrcurly:
1540                 if (--nest == 0)
1541                 {
1542                     t.setString(pstart, p - 1 - pstart);
1543                     stringPostfix(t);
1544                     return TOKstring;
1545                 }
1546                 continue;
1547             case TOKeof:
1548                 error("unterminated token string constant starting at %s", start.toChars());
1549                 t.setString();
1550                 return TOKstring;
1551             default:
1552                 continue;
1553             }
1554         }
1555     }
1556 
1557     /**************************************
1558      */
1559     final TOK escapeStringConstant(Token* t, int wide)
1560     {
1561         const start = loc();
1562         p++;
1563         stringbuffer.reset();
1564         while (1)
1565         {
1566             dchar c = *p++;
1567             switch (c)
1568             {
1569             case '\\':
1570                 switch (*p)
1571                 {
1572                 case 'u':
1573                 case 'U':
1574                 case '&':
1575                     c = escapeSequence();
1576                     stringbuffer.writeUTF8(c);
1577                     continue;
1578                 default:
1579                     c = escapeSequence();
1580                     break;
1581                 }
1582                 break;
1583             case '\n':
1584                 endOfLine();
1585                 break;
1586             case '\r':
1587                 if (*p == '\n')
1588                     continue; // ignore
1589                 c = '\n'; // treat EndOfLine as \n character
1590                 endOfLine();
1591                 break;
1592             case '"':
1593                 t.setString(stringbuffer);
1594                 stringPostfix(t);
1595                 return TOKstring;
1596             case 0:
1597             case 0x1A:
1598                 p--;
1599                 error("unterminated string constant starting at %s", start.toChars());
1600                 t.setString();
1601                 return TOKstring;
1602             default:
1603                 if (c & 0x80)
1604                 {
1605                     p--;
1606                     c = decodeUTF();
1607                     if (c == LS || c == PS)
1608                     {
1609                         c = '\n';
1610                         endOfLine();
1611                     }
1612                     p++;
1613                     stringbuffer.writeUTF8(c);
1614                     continue;
1615                 }
1616                 break;
1617             }
1618             stringbuffer.writeByte(c);
1619         }
1620     }
1621 
1622     /**************************************
1623      */
1624     final TOK charConstant(Token* t)
1625     {
1626         TOK tk = TOKcharv;
1627         //printf("Lexer::charConstant\n");
1628         p++;
1629         dchar c = *p++;
1630         switch (c)
1631         {
1632         case '\\':
1633             switch (*p)
1634             {
1635             case 'u':
1636                 t.uns64value = escapeSequence();
1637                 tk = TOKwcharv;
1638                 break;
1639             case 'U':
1640             case '&':
1641                 t.uns64value = escapeSequence();
1642                 tk = TOKdcharv;
1643                 break;
1644             default:
1645                 t.uns64value = escapeSequence();
1646                 break;
1647             }
1648             break;
1649         case '\n':
1650         L1:
1651             endOfLine();
1652             goto case;
1653         case '\r':
1654         case 0:
1655         case 0x1A:
1656         case '\'':
1657             error("unterminated character constant");
1658             t.uns64value = '?';
1659             return tk;
1660         default:
1661             if (c & 0x80)
1662             {
1663                 p--;
1664                 c = decodeUTF();
1665                 p++;
1666                 if (c == LS || c == PS)
1667                     goto L1;
1668                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1669                     tk = TOKwcharv;
1670                 else
1671                     tk = TOKdcharv;
1672             }
1673             t.uns64value = c;
1674             break;
1675         }
1676         if (*p != '\'')
1677         {
1678             error("unterminated character constant");
1679             t.uns64value = '?';
1680             return tk;
1681         }
1682         p++;
1683         return tk;
1684     }
1685 
1686     /***************************************
1687      * Get postfix of string literal.
1688      */
1689     final void stringPostfix(Token* t)
1690     {
1691         switch (*p)
1692         {
1693         case 'c':
1694         case 'w':
1695         case 'd':
1696             t.postfix = *p;
1697             p++;
1698             break;
1699         default:
1700             t.postfix = 0;
1701             break;
1702         }
1703     }
1704 
1705     /**************************************
1706      * Read in a number.
1707      * If it's an integer, store it in tok.TKutok.Vlong.
1708      *      integers can be decimal, octal or hex
1709      *      Handle the suffixes U, UL, LU, L, etc.
1710      * If it's double, store it in tok.TKutok.Vdouble.
1711      * Returns:
1712      *      TKnum
1713      *      TKdouble,...
1714      */
1715     final TOK number(Token* t)
1716     {
1717         int base = 10;
1718         const start = p;
1719         uinteger_t n = 0; // unsigned >=64 bit integer type
1720         int d;
1721         bool err = false;
1722         bool overflow = false;
1723         dchar c = *p;
1724         if (c == '0')
1725         {
1726             ++p;
1727             c = *p;
1728             switch (c)
1729             {
1730             case '0':
1731             case '1':
1732             case '2':
1733             case '3':
1734             case '4':
1735             case '5':
1736             case '6':
1737             case '7':
1738                 n = c - '0';
1739                 ++p;
1740                 base = 8;
1741                 break;
1742             case 'x':
1743             case 'X':
1744                 ++p;
1745                 base = 16;
1746                 break;
1747             case 'b':
1748             case 'B':
1749                 ++p;
1750                 base = 2;
1751                 break;
1752             case '.':
1753                 if (p[1] == '.')
1754                     goto Ldone; // if ".."
1755                 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
1756                     goto Ldone; // if ".identifier" or ".unicode"
1757                 goto Lreal; // '.' is part of current token
1758             case 'i':
1759             case 'f':
1760             case 'F':
1761                 goto Lreal;
1762             case '_':
1763                 ++p;
1764                 base = 8;
1765                 break;
1766             case 'L':
1767                 if (p[1] == 'i')
1768                     goto Lreal;
1769                 break;
1770             default:
1771                 break;
1772             }
1773         }
1774         while (1)
1775         {
1776             c = *p;
1777             switch (c)
1778             {
1779             case '0':
1780             case '1':
1781                 ++p;
1782                 d = c - '0';
1783                 break;
1784             case '2':
1785             case '3':
1786             case '4':
1787             case '5':
1788             case '6':
1789             case '7':
1790                 if (base == 2 && !err)
1791                 {
1792                     error("binary digit expected");
1793                     err = true;
1794                 }
1795                 ++p;
1796                 d = c - '0';
1797                 break;
1798             case '8':
1799             case '9':
1800                 ++p;
1801                 if (base < 10 && !err)
1802                 {
1803                     error("radix %d digit expected, not '%c'", base, c);
1804                     err = true;
1805                 }
1806                 d = c - '0';
1807                 break;
1808             case 'a':
1809             case 'b':
1810             case 'c':
1811             case 'd':
1812             case 'e':
1813             case 'f':
1814             case 'A':
1815             case 'B':
1816             case 'C':
1817             case 'D':
1818             case 'E':
1819             case 'F':
1820                 ++p;
1821                 if (base != 16)
1822                 {
1823                     if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
1824                         goto Lreal;
1825                     if (!err)
1826                     {
1827                         error("radix %d digit expected, not '%c'", base, c);
1828                         err = true;
1829                     }
1830                 }
1831                 if (c >= 'a')
1832                     d = c + 10 - 'a';
1833                 else
1834                     d = c + 10 - 'A';
1835                 break;
1836             case 'L':
1837                 if (p[1] == 'i')
1838                     goto Lreal;
1839                 goto Ldone;
1840             case '.':
1841                 if (p[1] == '.')
1842                     goto Ldone; // if ".."
1843                 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
1844                     goto Ldone; // if ".identifier" or ".unicode"
1845                 goto Lreal; // otherwise as part of a floating point literal
1846             case 'p':
1847             case 'P':
1848             case 'i':
1849             Lreal:
1850                 p = start;
1851                 return inreal(t);
1852             case '_':
1853                 ++p;
1854                 continue;
1855             default:
1856                 goto Ldone;
1857             }
1858             // Avoid expensive overflow check if we aren't at risk of overflow
1859             if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
1860                 n = n * base + d;
1861             else
1862             {
1863                 import core.checkedint : mulu, addu;
1864 
1865                 n = mulu(n, base, overflow);
1866                 n = addu(n, d, overflow);
1867             }
1868         }
1869     Ldone:
1870         if (overflow && !err)
1871         {
1872             error("integer overflow");
1873             err = true;
1874         }
1875         enum FLAGS : int
1876         {
1877             FLAGS_none = 0,
1878             FLAGS_decimal = 1, // decimal
1879             FLAGS_unsigned = 2, // u or U suffix
1880             FLAGS_long = 4, // L suffix
1881         }
1882 
1883         alias FLAGS_none = FLAGS.FLAGS_none;
1884         alias FLAGS_decimal = FLAGS.FLAGS_decimal;
1885         alias FLAGS_unsigned = FLAGS.FLAGS_unsigned;
1886         alias FLAGS_long = FLAGS.FLAGS_long;
1887 
1888         FLAGS flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
1889         // Parse trailing 'u', 'U', 'l' or 'L' in any combination
1890         const psuffix = p;
1891         while (1)
1892         {
1893             FLAGS f;
1894             switch (*p)
1895             {
1896             case 'U':
1897             case 'u':
1898                 f = FLAGS_unsigned;
1899                 goto L1;
1900             case 'l':
1901                 f = FLAGS_long;
1902                 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
1903                 goto L1;
1904             case 'L':
1905                 f = FLAGS_long;
1906             L1:
1907                 p++;
1908                 if ((flags & f) && !err)
1909                 {
1910                     error("unrecognized token");
1911                     err = true;
1912                 }
1913                 flags = cast(FLAGS)(flags | f);
1914                 continue;
1915             default:
1916                 break;
1917             }
1918             break;
1919         }
1920         if (base == 8 && n >= 8)
1921             error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", n, p - psuffix, psuffix, n, p - psuffix, psuffix);
1922         TOK result;
1923         switch (flags)
1924         {
1925         case FLAGS_none:
1926             /* Octal or Hexadecimal constant.
1927              * First that fits: int, uint, long, ulong
1928              */
1929             if (n & 0x8000000000000000L)
1930                 result = TOKuns64v;
1931             else if (n & 0xFFFFFFFF00000000L)
1932                 result = TOKint64v;
1933             else if (n & 0x80000000)
1934                 result = TOKuns32v;
1935             else
1936                 result = TOKint32v;
1937             break;
1938         case FLAGS_decimal:
1939             /* First that fits: int, long, long long
1940              */
1941             if (n & 0x8000000000000000L)
1942             {
1943                 if (!err)
1944                 {
1945                     error("signed integer overflow");
1946                     err = true;
1947                 }
1948                 result = TOKuns64v;
1949             }
1950             else if (n & 0xFFFFFFFF80000000L)
1951                 result = TOKint64v;
1952             else
1953                 result = TOKint32v;
1954             break;
1955         case FLAGS_unsigned:
1956         case FLAGS_decimal | FLAGS_unsigned:
1957             /* First that fits: uint, ulong
1958              */
1959             if (n & 0xFFFFFFFF00000000L)
1960                 result = TOKuns64v;
1961             else
1962                 result = TOKuns32v;
1963             break;
1964         case FLAGS_decimal | FLAGS_long:
1965             if (n & 0x8000000000000000L)
1966             {
1967                 if (!err)
1968                 {
1969                     error("signed integer overflow");
1970                     err = true;
1971                 }
1972                 result = TOKuns64v;
1973             }
1974             else
1975                 result = TOKint64v;
1976             break;
1977         case FLAGS_long:
1978             if (n & 0x8000000000000000L)
1979                 result = TOKuns64v;
1980             else
1981                 result = TOKint64v;
1982             break;
1983         case FLAGS_unsigned | FLAGS_long:
1984         case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
1985             result = TOKuns64v;
1986             break;
1987         default:
1988             debug
1989             {
1990                 printf("%x\n", flags);
1991             }
1992             assert(0);
1993         }
1994         t.uns64value = n;
1995         return result;
1996     }
1997 
1998     /**************************************
1999      * Read in characters, converting them to real.
2000      * Bugs:
2001      *      Exponent overflow not detected.
2002      *      Too much requested precision is not detected.
2003      */
2004     final TOK inreal(Token* t)
2005     {
2006         //printf("Lexer::inreal()\n");
2007         debug
2008         {
2009             assert(*p == '.' || isdigit(*p));
2010         }
2011         stringbuffer.reset();
2012         auto pstart = p;
2013         bool hex = false;
2014         dchar c = *p++;
2015         // Leading '0x'
2016         if (c == '0')
2017         {
2018             c = *p++;
2019             if (c == 'x' || c == 'X')
2020             {
2021                 hex = true;
2022                 c = *p++;
2023             }
2024         }
2025         // Digits to left of '.'
2026         while (1)
2027         {
2028             if (c == '.')
2029             {
2030                 c = *p++;
2031                 break;
2032             }
2033             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2034             {
2035                 c = *p++;
2036                 continue;
2037             }
2038             break;
2039         }
2040         // Digits to right of '.'
2041         while (1)
2042         {
2043             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2044             {
2045                 c = *p++;
2046                 continue;
2047             }
2048             break;
2049         }
2050         if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
2051         {
2052             c = *p++;
2053             if (c == '-' || c == '+')
2054             {
2055                 c = *p++;
2056             }
2057             bool anyexp = false;
2058             while (1)
2059             {
2060                 if (isdigit(c))
2061                 {
2062                     anyexp = true;
2063                     c = *p++;
2064                     continue;
2065                 }
2066                 if (c == '_')
2067                 {
2068                     c = *p++;
2069                     continue;
2070                 }
2071                 if (!anyexp)
2072                     error("missing exponent");
2073                 break;
2074             }
2075         }
2076         else if (hex)
2077             error("exponent required for hex float");
2078         --p;
2079         while (pstart < p)
2080         {
2081             if (*pstart != '_')
2082                 stringbuffer.writeByte(*pstart);
2083             ++pstart;
2084         }
2085         stringbuffer.writeByte(0);
2086         auto sbufptr = cast(const(char)*)stringbuffer.data;
2087         TOK result;
2088         bool isOutOfRange = false;
2089         t.floatvalue = CTFloat.parse(sbufptr, &isOutOfRange);
2090         switch (*p)
2091         {
2092         case 'F':
2093         case 'f':
2094             isOutOfRange = (isOutOfRange || Port.isFloat32LiteralOutOfRange(sbufptr));
2095             result = TOKfloat32v;
2096             p++;
2097             break;
2098         default:
2099             isOutOfRange = (isOutOfRange || Port.isFloat64LiteralOutOfRange(sbufptr));
2100             result = TOKfloat64v;
2101             break;
2102         case 'l':
2103             error("use 'L' suffix instead of 'l'");
2104             goto case 'L';
2105         case 'L':
2106             result = TOKfloat80v;
2107             p++;
2108             break;
2109         }
2110         if (*p == 'i' || *p == 'I')
2111         {
2112             if (*p == 'I')
2113                 error("use 'i' suffix instead of 'I'");
2114             p++;
2115             switch (result)
2116             {
2117             case TOKfloat32v:
2118                 result = TOKimaginary32v;
2119                 break;
2120             case TOKfloat64v:
2121                 result = TOKimaginary64v;
2122                 break;
2123             case TOKfloat80v:
2124                 result = TOKimaginary80v;
2125                 break;
2126             default:
2127                 break;
2128             }
2129         }
2130         const isLong = (result == TOKfloat80v || result == TOKimaginary80v);
2131         if (isOutOfRange && !isLong)
2132         {
2133             const char* suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
2134             error(scanloc, "number '%s%s' is not representable", sbufptr, suffix);
2135         }
2136         debug
2137         {
2138             switch (result)
2139             {
2140             case TOKfloat32v:
2141             case TOKfloat64v:
2142             case TOKfloat80v:
2143             case TOKimaginary32v:
2144             case TOKimaginary64v:
2145             case TOKimaginary80v:
2146                 break;
2147             default:
2148                 assert(0);
2149             }
2150         }
2151         return result;
2152     }
2153 
2154     final Loc loc()
2155     {
2156         scanloc.charnum = cast(uint)(1 + p - line);
2157         return scanloc;
2158     }
2159 
2160     final void error(const(char)* format, ...)
2161     {
2162         va_list ap;
2163         va_start(ap, format);
2164         .verror(token.loc, format, ap);
2165         va_end(ap);
2166         errors = true;
2167     }
2168 
2169     final void error(Loc loc, const(char)* format, ...)
2170     {
2171         va_list ap;
2172         va_start(ap, format);
2173         .verror(loc, format, ap);
2174         va_end(ap);
2175         errors = true;
2176     }
2177 
2178     final void deprecation(const(char)* format, ...)
2179     {
2180         va_list ap;
2181         va_start(ap, format);
2182         .vdeprecation(token.loc, format, ap);
2183         va_end(ap);
2184         if (global.params.useDeprecated == 0)
2185             errors = true;
2186     }
2187 
2188     /*********************************************
2189      * parse:
2190      *      #line linnum [filespec]
2191      * also allow __LINE__ for linnum, and __FILE__ for filespec
2192      */
2193     final void poundLine()
2194     {
2195         auto linnum = this.scanloc.linnum;
2196         const(char)* filespec = null;
2197         const loc = this.loc();
2198         Token tok;
2199         scan(&tok);
2200         if (tok.value == TOKint32v || tok.value == TOKint64v)
2201         {
2202             const lin = cast(int)(tok.uns64value - 1);
2203             if (lin != tok.uns64value - 1)
2204                 error("line number %lld out of range", cast(ulong)tok.uns64value);
2205             else
2206                 linnum = lin;
2207         }
2208         else if (tok.value == TOKline)
2209         {
2210         }
2211         else
2212             goto Lerr;
2213         while (1)
2214         {
2215             switch (*p)
2216             {
2217             case 0:
2218             case 0x1A:
2219             case '\n':
2220             Lnewline:
2221                 this.scanloc.linnum = linnum;
2222                 if (filespec)
2223                     this.scanloc.filename = filespec;
2224                 return;
2225             case '\r':
2226                 p++;
2227                 if (*p != '\n')
2228                 {
2229                     p--;
2230                     goto Lnewline;
2231                 }
2232                 continue;
2233             case ' ':
2234             case '\t':
2235             case '\v':
2236             case '\f':
2237                 p++;
2238                 continue; // skip white space
2239             case '_':
2240                 if (memcmp(p, "__FILE__".ptr, 8) == 0)
2241                 {
2242                     p += 8;
2243                     filespec = mem.xstrdup(scanloc.filename);
2244                     continue;
2245                 }
2246                 goto Lerr;
2247             case '"':
2248                 if (filespec)
2249                     goto Lerr;
2250                 stringbuffer.reset();
2251                 p++;
2252                 while (1)
2253                 {
2254                     uint c;
2255                     c = *p;
2256                     switch (c)
2257                     {
2258                     case '\n':
2259                     case '\r':
2260                     case 0:
2261                     case 0x1A:
2262                         goto Lerr;
2263                     case '"':
2264                         stringbuffer.writeByte(0);
2265                         filespec = mem.xstrdup(cast(const(char)*)stringbuffer.data);
2266                         p++;
2267                         break;
2268                     default:
2269                         if (c & 0x80)
2270                         {
2271                             uint u = decodeUTF();
2272                             if (u == PS || u == LS)
2273                                 goto Lerr;
2274                         }
2275                         stringbuffer.writeByte(c);
2276                         p++;
2277                         continue;
2278                     }
2279                     break;
2280                 }
2281                 continue;
2282             default:
2283                 if (*p & 0x80)
2284                 {
2285                     uint u = decodeUTF();
2286                     if (u == PS || u == LS)
2287                         goto Lnewline;
2288                 }
2289                 goto Lerr;
2290             }
2291         }
2292     Lerr:
2293         error(loc, "#line integer [\"filespec\"]\\n expected");
2294     }
2295 
2296     /********************************************
2297      * Decode UTF character.
2298      * Issue error messages for invalid sequences.
2299      * Return decoded character, advance p to last character in UTF sequence.
2300      */
2301     final uint decodeUTF()
2302     {
2303         const s = p;
2304         assert(*s & 0x80);
2305         // Check length of remaining string up to 6 UTF-8 characters
2306         size_t len;
2307         for (len = 1; len < 6 && s[len]; len++)
2308         {
2309         }
2310         size_t idx = 0;
2311         dchar u;
2312         const msg = utf_decodeChar(s, len, idx, u);
2313         p += idx - 1;
2314         if (msg)
2315         {
2316             error("%s", msg);
2317         }
2318         return u;
2319     }
2320 
2321     /***************************************************
2322      * Parse doc comment embedded between t->ptr and p.
2323      * Remove trailing blanks and tabs from lines.
2324      * Replace all newlines with \n.
2325      * Remove leading comment character from each line.
2326      * Decide if it's a lineComment or a blockComment.
2327      * Append to previous one for this token.
2328      */
2329     final void getDocComment(Token* t, uint lineComment)
2330     {
2331         /* ct tells us which kind of comment it is: '/', '*', or '+'
2332          */
2333         const ct = t.ptr[2];
2334         /* Start of comment text skips over / * *, / + +, or / / /
2335          */
2336         const(char)* q = t.ptr + 3; // start of comment text
2337         const(char)* qend = p;
2338         if (ct == '*' || ct == '+')
2339             qend -= 2;
2340         /* Scan over initial row of ****'s or ++++'s or ////'s
2341          */
2342         for (; q < qend; q++)
2343         {
2344             if (*q != ct)
2345                 break;
2346         }
2347         /* Remove leading spaces until start of the comment
2348          */
2349         int linestart = 0;
2350         if (ct == '/')
2351         {
2352             while (q < qend && (*q == ' ' || *q == '\t'))
2353                 ++q;
2354         }
2355         else if (q < qend)
2356         {
2357             if (*q == '\r')
2358             {
2359                 ++q;
2360                 if (q < qend && *q == '\n')
2361                     ++q;
2362                 linestart = 1;
2363             }
2364             else if (*q == '\n')
2365             {
2366                 ++q;
2367                 linestart = 1;
2368             }
2369         }
2370         /* Remove trailing row of ****'s or ++++'s
2371          */
2372         if (ct != '/')
2373         {
2374             for (; q < qend; qend--)
2375             {
2376                 if (qend[-1] != ct)
2377                     break;
2378             }
2379         }
2380         /* Comment is now [q .. qend].
2381          * Canonicalize it into buf[].
2382          */
2383         OutBuffer buf;
2384 
2385         void trimTrailingWhitespace()
2386         {
2387             const s = buf.peekSlice();
2388             auto len = s.length;
2389             while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
2390                 --len;
2391             buf.setsize(len);
2392         }
2393 
2394         for (; q < qend; q++)
2395         {
2396             char c = *q;
2397             switch (c)
2398             {
2399             case '*':
2400             case '+':
2401                 if (linestart && c == ct)
2402                 {
2403                     linestart = 0;
2404                     /* Trim preceding whitespace up to preceding \n
2405                      */
2406                     trimTrailingWhitespace();
2407                     continue;
2408                 }
2409                 break;
2410             case ' ':
2411             case '\t':
2412                 break;
2413             case '\r':
2414                 if (q[1] == '\n')
2415                     continue; // skip the \r
2416                 goto Lnewline;
2417             default:
2418                 if (c == 226)
2419                 {
2420                     // If LS or PS
2421                     if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
2422                     {
2423                         q += 2;
2424                         goto Lnewline;
2425                     }
2426                 }
2427                 linestart = 0;
2428                 break;
2429             Lnewline:
2430                 c = '\n'; // replace all newlines with \n
2431                 goto case;
2432             case '\n':
2433                 linestart = 1;
2434                 /* Trim trailing whitespace
2435                  */
2436                 trimTrailingWhitespace();
2437                 break;
2438             }
2439             buf.writeByte(c);
2440         }
2441         /* Trim trailing whitespace (if the last line does not have newline)
2442          */
2443         trimTrailingWhitespace();
2444 
2445         // Always end with a newline
2446         const s = buf.peekSlice();
2447         if (s.length == 0 || s[$ - 1] != '\n')
2448             buf.writeByte('\n');
2449 
2450         // It's a line comment if the start of the doc comment comes
2451         // after other non-whitespace on the same line.
2452         auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
2453         // Combine with previous doc comment, if any
2454         if (*dc)
2455             *dc = combineComments(*dc, buf.peekString());
2456         else
2457             *dc = buf.extractString();
2458     }
2459 
2460     /********************************************
2461      * Combine two document comments into one,
2462      * separated by a newline.
2463      */
2464     static const(char)* combineComments(const(char)* c1, const(char)* c2)
2465     {
2466         //printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
2467         auto c = c2;
2468         if (c1)
2469         {
2470             c = c1;
2471             if (c2)
2472             {
2473                 size_t len1 = strlen(c1);
2474                 size_t len2 = strlen(c2);
2475                 int insertNewLine = 0;
2476                 if (len1 && c1[len1 - 1] != '\n')
2477                 {
2478                     ++len1;
2479                     insertNewLine = 1;
2480                 }
2481                 auto p = cast(char*)mem.xmalloc(len1 + 1 + len2 + 1);
2482                 memcpy(p, c1, len1 - insertNewLine);
2483                 if (insertNewLine)
2484                     p[len1 - 1] = '\n';
2485                 p[len1] = '\n';
2486                 memcpy(p + len1 + 1, c2, len2);
2487                 p[len1 + 1 + len2] = 0;
2488                 c = p;
2489             }
2490         }
2491         return c;
2492     }
2493 
2494 private:
2495     final void endOfLine()
2496     {
2497         scanloc.linnum++;
2498         line = p;
2499     }
2500 }