fix(lexer): improved scanner performance

meriyah · Jul 27, 2019 · c637ee5 · c637ee5 · KFlash · Jul 27, 2019
1 parent 0899ad3
commit c637ee5
Show file tree

Hide file tree

Showing 13 changed files with 326 additions and 276 deletions.
diff --git a/src/common.ts b/src/common.ts
@@ -202,7 +202,7 @@ export interface ParserState {
   sourceFile: string | void;
   assignable: AssignmentKind | DestructuringKind;
   destructible: AssignmentKind | DestructuringKind;
-  nextCP: number;
+  currentChar: number;
   exportedNames: any;
   exportedBindings: any;
 }

diff --git a/src/errors.ts b/src/errors.ts
@@ -385,6 +385,7 @@ export function report(parser: ParserState, type: Errors, ...params: string[]):
 export function reportScopeError(scope: any): never {
   throw new ParseError(scope.index, scope.line, scope.column, scope.type);
 }
+
 /**
  * Throws an error at a given position
  *
@@ -395,8 +396,21 @@ export function reportScopeError(scope: any): never {
  * @param {number} column
  * @param {Errors} type
  * @param {...string[]} params
- * @returns {never}
  */
 export function reportMessageAt(index: number, line: number, column: number, type: Errors, ...params: string[]): never {
   throw new ParseError(index, line, column, type, ...params);
 }
+
+/**
+ * Throws a scanner error at a given position. Unlike `reportMessageAt`,
+ * it takes no additional message parameters.
+ *
+ * @export
+ * @param {number} index
+ * @param {number} line
+ * @param {number} column
+ * @param {Errors} type
+ */
+export function reportScannerError(index: number, line: number, column: number, type: Errors): never {
+  throw new ParseError(index, line, column, type);
+}
diff --git a/src/lexer/comments.ts b/src/lexer/comments.ts
@@ -1,4 +1,4 @@
-import { nextCP, CharTypes, CharFlags, LexerState, scanNewLine, consumeLineFeed } from './';
+import { advanceChar, CharTypes, CharFlags, LexerState, scanNewLine, consumeLineFeed } from './';
 import { Chars } from '../chars';
 import { Context, ParserState } from '../common';
 import { report, Errors } from '../errors';
@@ -21,7 +21,7 @@ export const CommentTypes = ['SingleLine', 'MultiLine', 'HTMLOpen', 'HTMLClose',
 export function skipHashBang(parser: ParserState): void {
   // HashbangComment ::
   //   #!  SingleLineCommentChars_opt
-  if (parser.nextCP === Chars.Hash && parser.source.charCodeAt(parser.index + 1) === Chars.Exclamation) {
+  if (parser.currentChar === Chars.Hash && parser.source.charCodeAt(parser.index + 1) === Chars.Exclamation) {
     skipSingleLineComment(parser, LexerState.None, CommentType.HashBang);
   }
 }
@@ -45,12 +45,12 @@ export function skipSingleHTMLComment(
 export function skipSingleLineComment(parser: ParserState, state: LexerState, type: CommentType): LexerState {
   const { index } = parser;
   while (parser.index < parser.end) {
-    if (CharTypes[parser.nextCP] & CharFlags.LineTerminator || (parser.nextCP ^ Chars.LineSeparator) <= 1) {
+    if (CharTypes[parser.currentChar] & CharFlags.LineTerminator || (parser.currentChar ^ Chars.LineSeparator) <= 1) {
       state = (state | LexerState.LastIsCR | LexerState.NewLine) ^ LexerState.LastIsCR;
       scanNewLine(parser);
       return state;
     }
-    nextCP(parser);
+    advanceChar(parser);
   }
   if (parser.onComment)
     parser.onComment(CommentTypes[type & 0xff], parser.source.slice(index, parser.index), parser, parser.index);
@@ -66,9 +66,9 @@ export function skipSingleLineComment(parser: ParserState, state: LexerState, ty
 export function skipMultiLineComment(parser: ParserState, state: LexerState): LexerState | void {
   const { index } = parser;
   while (parser.index < parser.end) {
-    while (parser.nextCP === Chars.Asterisk) {
-      if (nextCP(parser) === Chars.Slash) {
-        nextCP(parser);
+    while (parser.currentChar === Chars.Asterisk) {
+      if (advanceChar(parser) === Chars.Slash) {
+        advanceChar(parser);
         if (parser.onComment)
           parser.onComment(
             CommentTypes[CommentType.Multi & 0xff],
@@ -80,17 +80,17 @@ export function skipMultiLineComment(parser: ParserState, state: LexerState): Le
       }
     }
 
-    if (parser.nextCP === Chars.CarriageReturn) {
+    if (parser.currentChar === Chars.CarriageReturn) {
       state |= LexerState.NewLine | LexerState.LastIsCR;
       scanNewLine(parser);
-    } else if (parser.nextCP === Chars.LineFeed) {
+    } else if (parser.currentChar === Chars.LineFeed) {
       consumeLineFeed(parser, (state & LexerState.LastIsCR) !== 0);
       state = (state | LexerState.LastIsCR | LexerState.NewLine) ^ LexerState.LastIsCR;
-    } else if ((parser.nextCP ^ Chars.LineSeparator) <= 1) {
+    } else if ((parser.currentChar ^ Chars.LineSeparator) <= 1) {
       state = (state | LexerState.LastIsCR | LexerState.NewLine) ^ LexerState.LastIsCR;
       scanNewLine(parser);
     } else {
-      nextCP(parser);
+      advanceChar(parser);
     }
   }
 

diff --git a/src/lexer/common.ts b/src/lexer/common.ts
@@ -26,9 +26,9 @@ export const enum NumberKind {
  *
  * @param parser The parser instance
  */
-export function nextCP(parser: ParserState): number {
+export function advanceChar(parser: ParserState): number {
   parser.column++;
-  return (parser.nextCP = parser.source.charCodeAt(++parser.index));
+  return (parser.currentChar = parser.source.charCodeAt(++parser.index));
 }
 
 /**
@@ -42,7 +42,7 @@ export function consumeMultiUnitCodePoint(parser: ParserState, hi: number): 0 |
   if ((hi & 0xfc00) !== Chars.LeadSurrogateMin) return 0;
   const lo = parser.source.charCodeAt(parser.index + 1);
   if ((lo & 0xfc00) !== 0xdc00) return 0;
-  hi = parser.nextCP = Chars.NonBMPMin + ((hi & 0x3ff) << 10) + (lo & 0x3ff);
+  hi = parser.currentChar = Chars.NonBMPMin + ((hi & 0x3ff) << 10) + (lo & 0x3ff);
   if (((unicodeLookup[(hi >>> 5) + 0] >>> hi) & 31 & 1) === 0) {
     report(parser, Errors.IllegalCaracter, fromCodePoint(hi));
   }
@@ -55,7 +55,7 @@ export function consumeMultiUnitCodePoint(parser: ParserState, hi: number): 0 |
  * Use to consume a line feed instead of `scanNewLine`.
  */
 export function consumeLineFeed(parser: ParserState, lastIsCR: boolean): void {
-  parser.nextCP = parser.source.charCodeAt(++parser.index);
+  parser.currentChar = parser.source.charCodeAt(++parser.index);
   parser.flags |= Flags.NewLine;
   if (!lastIsCR) {
     parser.column = 0;
@@ -65,7 +65,7 @@ export function consumeLineFeed(parser: ParserState, lastIsCR: boolean): void {
 
 export function scanNewLine(parser: ParserState): void {
   parser.flags |= Flags.NewLine;
-  parser.nextCP = parser.source.charCodeAt(++parser.index);
+  parser.currentChar = parser.source.charCodeAt(++parser.index);
   parser.column = 0;
   parser.line++;
 }

diff --git a/src/lexer/identifier.ts b/src/lexer/identifier.ts
@@ -1,9 +1,9 @@
 import { ParserState, Context } from '../common';
 import { Token, descKeywordTable } from '../token';
 import { Chars } from '../chars';
-import { nextCP, consumeMultiUnitCodePoint, fromCodePoint, toHex } from './';
+import { advanceChar, consumeMultiUnitCodePoint, fromCodePoint, toHex } from './';
 import { CharTypes, CharFlags, isIdentifierPart, isIdentifierStart } from './charClassifier';
-import { report, reportMessageAt, Errors } from '../errors';
+import { report, reportScannerError, Errors } from '../errors';
 
 /**
  * Scans identifier
@@ -12,9 +12,9 @@ import { report, reportMessageAt, Errors } from '../errors';
  * @param context Context masks
  */
 export function scanIdentifier(parser: ParserState, context: Context, isValidAsKeyword: 0 | 1): Token {
-  while ((CharTypes[nextCP(parser)] & CharFlags.IdentifierPart) !== 0) {}
+  while ((CharTypes[advanceChar(parser)] & CharFlags.IdentifierPart) !== 0) {}
   parser.tokenValue = parser.source.slice(parser.tokenPos, parser.index);
-  return parser.nextCP !== Chars.Backslash && parser.nextCP < 0x7e
+  return parser.currentChar !== Chars.Backslash && parser.currentChar < 0x7e
     ? descKeywordTable[parser.tokenValue] || Token.Identifier
     : scanIdentifierSlowCase(parser, context, 0, isValidAsKeyword);
 }
@@ -49,16 +49,16 @@ export function scanIdentifierSlowCase(
   let start = parser.index;
 
   while (parser.index < parser.end) {
-    if (parser.nextCP === Chars.Backslash) {
+    if (parser.currentChar === Chars.Backslash) {
       parser.tokenValue += parser.source.slice(start, parser.index);
       hasEscape = 1;
       const code = scanIdentifierUnicodeEscape(parser);
       if (!isIdentifierPart(code)) report(parser, Errors.InvalidUnicodeEscapeSequence);
       isValidAsKeyword = isValidAsKeyword && CharTypes[code] & CharFlags.KeywordCandidate;
       parser.tokenValue += fromCodePoint(code);
       start = parser.index;
-    } else if (isIdentifierPart(parser.nextCP) || consumeMultiUnitCodePoint(parser, parser.nextCP)) {
-      nextCP(parser);
+    } else if (isIdentifierPart(parser.currentChar) || consumeMultiUnitCodePoint(parser, parser.currentChar)) {
+      advanceChar(parser);
     } else {
       break;
     }
@@ -94,7 +94,7 @@ export function scanIdentifierSlowCase(
  * @param parser  Parser object
  */
 export function scanPrivateName(parser: ParserState): Token {
-  if (!isIdentifierStart(nextCP(parser))) report(parser, Errors.MissingPrivateName);
+  if (!isIdentifierStart(advanceChar(parser))) report(parser, Errors.MissingPrivateName);
   return Token.PrivateField;
 }
 
@@ -109,31 +109,32 @@ export function scanIdentifierUnicodeEscape(parser: ParserState): number {
   if (parser.source.charCodeAt(parser.index + 1) !== Chars.LowerU) {
     report(parser, Errors.InvalidUnicodeEscapeSequence);
   }
-  parser.nextCP = parser.source.charCodeAt((parser.index += 2));
-  return scanUnicodeEscapeValue(parser);
+  parser.currentChar = parser.source.charCodeAt((parser.index += 2));
+  return scanUnicodeEscape(parser);
 }
 
 /**
  * Scans unicode escape value
  *
  * @param parser  Parser object
  */
-export function scanUnicodeEscapeValue(parser: ParserState): number {
+export function scanUnicodeEscape(parser: ParserState): number {
+  // Accept both \uxxxx and \u{xxxxxx}
   let codePoint = 0;
-  const char = parser.nextCP;
+  const char = parser.currentChar;
   // First handle a delimited Unicode escape, e.g. \u{1F4A9}
   if (char === Chars.LeftBrace) {
-    const startPos = parser.index;
-    while (CharTypes[nextCP(parser)] & CharFlags.Hex) {
-      codePoint = (codePoint << 4) | toHex(parser.nextCP);
-      if (codePoint > Chars.NonBMPMax) report(parser, Errors.UnicodeOverflow);
+    const begin = parser.index - 2;
+    while (CharTypes[advanceChar(parser)] & CharFlags.Hex) {
+      codePoint = (codePoint << 4) | toHex(parser.currentChar);
+      if (codePoint > Chars.NonBMPMax) reportScannerError(begin, parser.line, parser.index + 1, Errors.UnicodeOverflow);
     }
 
     // At least 4 characters have to be read
-    if (codePoint < 1 || (parser.nextCP as number) !== Chars.RightBrace) {
-      reportMessageAt(startPos, parser.line, startPos - 1, Errors.InvalidHexEscapeSequence);
+    if (codePoint < 1 || (parser.currentChar as number) !== Chars.RightBrace) {
+      reportScannerError(begin, parser.line, parser.index - 1, Errors.InvalidHexEscapeSequence);
     }
-    nextCP(parser); // consumes '}'
+    advanceChar(parser); // consumes '}'
     return codePoint;
   }
 
@@ -148,7 +149,7 @@ export function scanUnicodeEscapeValue(parser: ParserState): number {
 
   codePoint = (toHex(char) << 12) | (toHex(char2) << 8) | (toHex(char3) << 4) | toHex(char4);
 
-  parser.nextCP = parser.source.charCodeAt((parser.index += 4));
+  parser.currentChar = parser.source.charCodeAt((parser.index += 4));
 
   return codePoint;
 }
diff --git a/src/lexer/index.ts b/src/lexer/index.ts
@@ -7,7 +7,7 @@ export {
   CommentType
 } from './comments';
 export {
-  nextCP,
+  advanceChar,
   consumeMultiUnitCodePoint,
   isExoticECMAScriptWhitespace,
   fromCodePoint,
@@ -23,7 +23,7 @@ export {
   scanIdentifierSlowCase,
   scanUnicodeIdentifier,
   scanPrivateName,
-  scanUnicodeEscapeValue
+  scanUnicodeEscape
 } from './identifier';
 export { scanString } from './string';
 export { scanNumber } from './numeric';

diff --git a/src/lexer/jsx.ts b/src/lexer/jsx.ts
@@ -3,7 +3,7 @@ import { Chars } from '../chars';
 import { Token } from '../token';
 import { ParserState, Context } from '../common';
 import { report, Errors } from '../errors';
-import { nextCP, LexerState, TokenLookup } from './';
+import { advanceChar, LexerState, TokenLookup } from './';
 import { scanSingleToken } from './scan';
 
 /**
@@ -17,7 +17,7 @@ export function scanJSXAttributeValue(parser: ParserState, context: Context): To
   parser.startColumn = parser.column;
   parser.startLine = parser.line;
   parser.token =
-    CharTypes[parser.nextCP] & CharFlags.StringLiteral
+    CharTypes[parser.currentChar] & CharFlags.StringLiteral
       ? scanJSXString(parser)
       : scanSingleToken(parser, context, LexerState.None);
   return parser.token;
@@ -29,18 +29,18 @@ export function scanJSXAttributeValue(parser: ParserState, context: Context): To
  * @param parser The parser object
  */
 export function scanJSXString(parser: ParserState): Token {
-  const quote = parser.nextCP;
-  let char = nextCP(parser);
+  const quote = parser.currentChar;
+  let char = advanceChar(parser);
   const start = parser.index;
   while (char !== quote) {
     if (parser.index >= parser.end) report(parser, Errors.UnterminatedString);
-    char = nextCP(parser);
+    char = advanceChar(parser);
   }
 
   // check for unterminated string
   if (char !== quote) report(parser, Errors.UnterminatedString);
   parser.tokenValue = parser.source.slice(start, parser.index);
-  nextCP(parser); // skip the quote
+  advanceChar(parser); // skip the quote
   return Token.StringLiteral;
 }
 
@@ -60,22 +60,22 @@ export function scanJSXToken(parser: ParserState): Token {
 
   switch (token) {
     case Token.LessThan: {
-      nextCP(parser);
-      if (parser.nextCP === Chars.Slash) {
-        nextCP(parser);
+      advanceChar(parser);
+      if (parser.currentChar === Chars.Slash) {
+        advanceChar(parser);
         return (parser.token = Token.JSXClose);
       }
 
       return (parser.token = Token.LessThan);
     }
     case Token.LeftBrace: {
-      nextCP(parser);
+      advanceChar(parser);
       return (parser.token = Token.LeftBrace);
     }
     default: // ignore
   }
 
-  while (parser.index < parser.end && (CharTypes[nextCP(parser)] & CharFlags.JSXToken) === 0) {}
+  while (parser.index < parser.end && (CharTypes[advanceChar(parser)] & CharFlags.JSXToken) === 0) {}
 
   parser.tokenValue = parser.source.slice(parser.tokenPos, parser.index);
 
@@ -90,9 +90,9 @@ export function scanJSXToken(parser: ParserState): Token {
 export function scanJSXIdentifier(parser: ParserState): Token {
   if ((parser.token & Token.IsIdentifier) === Token.IsIdentifier) {
     const { index } = parser;
-    let char = parser.nextCP;
+    let char = parser.currentChar;
     while ((CharTypes[char] & (CharFlags.Hyphen | CharFlags.IdentifierPart)) !== 0) {
-      char = nextCP(parser);
+      char = advanceChar(parser);
     }
     parser.tokenValue += parser.source.slice(index, parser.index);
   }