You are here

class JSTokenizer in Advanced CSS/JS Aggregation 8.2

Same name and namespace in other branches
  1. 8.4 advagg_js_minify/jsminplus.inc \JSTokenizer
  2. 8.3 advagg_js_minify/jsminplus.inc \JSTokenizer
  3. 6 advagg_js_compress/jsminplus.inc \JSTokenizer
  4. 7.2 advagg_js_compress/jsminplus.inc \JSTokenizer
  5. 7 advagg_js_compress/jsminplus.inc \JSTokenizer

Hierarchy

Expanded class hierarchy of JSTokenizer

File

advagg_js_minify/jsminplus.inc, line 1856
JSMinPlus version 1.4

View source
/**
 * Splits JavaScript source text into a stream of tokens.
 *
 * Tokens are stored in a four-slot ring buffer ($tokens, indexed modulo 4 via
 * "& 3"), so up to three tokens can be pushed back with unget() and re-read.
 *
 * The TOKEN_* and OP_* constants and the JSToken class used below are not
 * declared in this class; they must be defined elsewhere before tokenizing.
 */
class JSTokenizer {

  /**
   * Byte offset into $source of the next unread character.
   */
  private $cursor = 0;

  /**
   * The script text being tokenized; set by init().
   */
  private $source;

  /**
   * Ring buffer of token objects, filled by get().
   */
  public $tokens = array();

  /**
   * Index in $tokens of the current token.
   */
  public $tokenIndex = 0;

  /**
   * Number of tokens pushed back with unget() that get() has not re-read yet.
   */
  public $lookahead = 0;

  /**
   * When true, "\n" is reported as a TOKEN_NEWLINE token instead of being
   * skipped together with other whitespace.
   */
  public $scanNewlines = false;

  /**
   * When true, a leading "/" may start a regular-expression literal and a
   * plain "+" / "-" is reported as a unary operator.
   */
  public $scanOperand = true;

  /**
   * File name used in error messages; '[inline]' when none was given.
   */
  public $filename;

  /**
   * Line number of the current position; copied into every token and used in
   * error messages.
   */
  public $lineno;

  /**
   * Reserved words; a matching identifier uses its own text as token type.
   */
  private $keywords = array(
    'break',
    'case',
    'catch',
    'const',
    'continue',
    'debugger',
    'default',
    'delete',
    'do',
    'else',
    'enum',
    'false',
    'finally',
    'for',
    'function',
    'if',
    'in',
    'instanceof',
    'new',
    'null',
    'return',
    'switch',
    'this',
    'throw',
    'true',
    'try',
    'typeof',
    'var',
    'void',
    'while',
    'with',
  );

  /**
   * Operator and punctuator strings used to build $opRegExp.
   *
   * The order matters: the regex alternation is built in this order and the
   * first alternative that matches wins, so every longer operator must come
   * before the operators that are a prefix of it ('===' before '==' before
   * '=').
   */
  private $opTypeNames = array(
    ';',
    ',',
    '?',
    ':',
    '||',
    '&&',
    '|',
    '^',
    '&',
    '===',
    '==',
    '=',
    '!==',
    '!=',
    '<<',
    '<=',
    '<',
    '>>>',
    '>>',
    '>=',
    '>',
    '++',
    '--',
    '+',
    '-',
    '*',
    '/',
    '%',
    '!',
    '~',
    '.',
    '[',
    ']',
    '{',
    '}',
    '(',
    ')',
    '@*/',
  );

  /**
   * Operators that form a compound assignment when followed by '='.
   */
  private $assignOps = array(
    '|',
    '^',
    '&',
    '<<',
    '>>',
    '>>>',
    '+',
    '-',
    '*',
    '/',
    '%',
  );

  /**
   * Anchored regular expression matching any entry of $opTypeNames.
   */
  private $opRegExp;

  /**
   * Builds the operator regular expression from $opTypeNames.
   */
  public function __construct() {
    $quoted = array_map('preg_quote', $this->opTypeNames);
    $this->opRegExp = '#^(' . implode('|', $quoted) . ')#';
  }

  /**
   * Resets the tokenizer and loads a new script.
   *
   * @param string $source
   *   The JavaScript source text.
   * @param string $filename
   *   Name reported in error messages; an empty value becomes '[inline]'.
   * @param int $lineno
   *   Line number assigned to the first line of $source.
   */
  public function init($source, $filename = '', $lineno = 1) {
    $this->source = $source;
    $this->filename = $filename ? $filename : '[inline]';
    $this->lineno = $lineno;
    $this->cursor = 0;
    $this->tokens = array();
    $this->tokenIndex = 0;
    $this->lookahead = 0;
    $this->scanNewlines = false;
    $this->scanOperand = true;
  }

  /**
   * Returns unread source text starting at the cursor.
   *
   * @param int|null $chunksize
   *   Maximum number of bytes to return; a falsy value returns everything that
   *   is left.
   *
   * @return string
   *   The requested slice of the source.
   */
  public function getInput($chunksize) {
    if ($chunksize) {
      return substr($this->source, $this->cursor, $chunksize);
    }
    return substr($this->source, $this->cursor);
  }

  /**
   * Tells whether the next token is the end-of-input token.
   *
   * @return bool
   *   TRUE when peek() reports TOKEN_END.
   */
  public function isDone() {
    return $this->peek() == TOKEN_END;
  }

  /**
   * Consumes the next token only if it has the given type.
   *
   * @param mixed $tt
   *   The token type to look for.
   *
   * @return bool
   *   TRUE when the token matched and was consumed; FALSE when it did not
   *   match, in which case the token has been pushed back.
   */
  public function match($tt) {
    if ($this->get() == $tt) {
      return true;
    }
    $this->unget();
    return false;
  }

  /**
   * Consumes the next token, which must have the given type.
   *
   * @param mixed $tt
   *   The required token type.
   *
   * @return object
   *   The token that was consumed.
   *
   * @throws Exception
   *   When the next token has a different type.
   */
  public function mustMatch($tt) {
    if (!$this->match($tt)) {
      throw $this->newSyntaxError('Unexpected token; token ' . $tt . ' expected');
    }
    return $this->currentToken();
  }

  /**
   * Returns the type of the next token without consuming it.
   *
   * @return mixed
   *   The type of the upcoming token. While $scanNewlines is set, a
   *   pushed-back token that sits on a different line than the current
   *   position is reported as TOKEN_NEWLINE.
   */
  public function peek() {
    if ($this->lookahead) {
      $next = $this->tokens[($this->tokenIndex + $this->lookahead) & 3];
      if ($this->scanNewlines && $next->lineno != $this->lineno) {
        $tt = TOKEN_NEWLINE;
      }
      else {
        $tt = $next->type;
      }
    }
    else {
      // Nothing buffered: read one token and immediately push it back.
      $tt = $this->get();
      $this->unget();
    }
    return $tt;
  }

  /**
   * Like peek(), but with newline scanning switched on for the call.
   *
   * @return mixed
   *   The type reported by peek() while $scanNewlines is TRUE.
   */
  public function peekOnSameLine() {
    $this->scanNewlines = true;
    $tt = $this->peek();
    $this->scanNewlines = false;
    return $tt;
  }

  /**
   * Returns the token at the current ring-buffer position.
   *
   * @return object|null
   *   The current token, or NULL when no token has been read yet.
   */
  public function currentToken() {
    if (!empty($this->tokens)) {
      return $this->tokens[$this->tokenIndex];
    }
    return null;
  }

  /**
   * Reads the next token and makes it the current one.
   *
   * Pushed-back tokens are replayed first. Otherwise whitespace and comments
   * are skipped and one token is scanned from the source.
   *
   * @param int|null $chunksize
   *   Number of source bytes examined at a time. When a construct may not fit
   *   into the chunk, scanning is repeated on all remaining input.
   *
   * @return mixed
   *   The type of the token that was read.
   *
   * @throws Exception
   *   On an unterminated string literal or an illegal token.
   */
  public function get($chunksize = 1000) {
    // Replay tokens that were pushed back with unget().
    while ($this->lookahead) {
      $this->lookahead--;
      $this->tokenIndex = ($this->tokenIndex + 1) & 3;
      $token = $this->tokens[$this->tokenIndex];
      if ($token->type != TOKEN_NEWLINE || $this->scanNewlines) {
        return $token->type;
      }
    }
    $conditional_comment = false;

    // Strip whitespace and comments.
    while (true) {
      $input = $this->getInput($chunksize);

      // Whitespace handling; gobble up \r as well (effectively we don't have
      // support for MAC newlines!)
      $re = $this->scanNewlines ? '/^[ \\r\\t]+/' : '/^\\s+/';
      if (preg_match($re, $input, $match)) {
        $spaces = $match[0];
        $spacelen = strlen($spaces);
        $this->cursor += $spacelen;
        if (!$this->scanNewlines) {
          $this->lineno += substr_count($spaces, "\n");
        }
        if ($spacelen == $chunksize) {
          // The complete chunk contained whitespace; look at the next one.
          continue;
        }
        $input = $this->getInput($chunksize);
        if ($input == '' || $input[0] != '/') {
          break;
        }
      }

      // Comments.
      if (!preg_match('/^\\/(?:\\*(@(?:cc_on|if|elif|else|end))?.*?\\*\\/|\\/[^\\n]*)/s', $input, $match)) {
        if (!$chunksize) {
          break;
        }

        // Retry with a full chunk fetch; this also prevents breakage of long
        // regular expressions (which will never match a comment).
        $chunksize = null;
        continue;
      }

      // A line comment that fills a size-limited chunk completely may
      // continue beyond it. Rescan on the full input before consuming it,
      // otherwise the remainder of the comment would be tokenized as code.
      if ($chunksize && $match[0][1] == '/' && strlen($match[0]) == strlen($input)) {
        $chunksize = null;
        continue;
      }

      // Check if this is a conditional (JScript) comment.
      if (!empty($match[1])) {
        $match[0] = '/*' . $match[1];
        $conditional_comment = true;
        break;
      }
      else {
        $this->cursor += strlen($match[0]);
        $this->lineno += substr_count($match[0], "\n");
      }
    }

    // Only set to an operator for compound assignments; defined up front so
    // the token store below never reads an undefined variable.
    $op = null;
    if ($input == '') {
      $tt = TOKEN_END;
      $match = array(
        '',
      );
    }
    elseif ($conditional_comment) {
      $tt = TOKEN_CONDCOMMENT_START;
    }
    else {
      switch ($input[0]) {
        case '0':

          // Hexadecimal. The isset() guards against a lone "0" at the very
          // end of the input.
          if (isset($input[1]) && ($input[1] == 'x' || $input[1] == 'X') && preg_match('/^0x[0-9a-f]+/i', $input, $match)) {
            $tt = TOKEN_NUMBER;
            break;
          }

        // FALL THROUGH
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':

          // Should always match.
          preg_match('/^\\d+(?:\\.\\d*)?(?:[eE][-+]?\\d+)?/', $input, $match);
          $tt = TOKEN_NUMBER;
          break;

        case "'":
          if (preg_match('/^\'(?:[^\\\\\'\\r\\n]++|\\\\(?:.|\\r?\\n))*\'/', $input, $match)) {
            $tt = TOKEN_STRING;
          }
          else {
            if ($chunksize) {
              // Retry with a full chunk fetch.
              return $this->get(null);
            }
            throw $this->newSyntaxError('Unterminated string literal');
          }
          break;

        case '"':
          if (preg_match('/^"(?:[^\\\\"\\r\\n]++|\\\\(?:.|\\r?\\n))*"/', $input, $match)) {
            $tt = TOKEN_STRING;
          }
          else {
            if ($chunksize) {
              // Retry with a full chunk fetch.
              return $this->get(null);
            }
            throw $this->newSyntaxError('Unterminated string literal');
          }
          break;

        case '/':
          if ($this->scanOperand && preg_match('/^\\/((?:\\\\.|\\[(?:\\\\.|[^\\]])*\\]|[^\\/])+)\\/([gimy]*)/', $input, $match)) {
            $tt = TOKEN_REGEXP;
            break;
          }

        // FALL THROUGH
        case '|':
        case '^':
        case '&':
        case '<':
        case '>':
        case '+':
        case '-':
        case '*':
        case '%':
        case '=':
        case '!':

          // Should always match.
          preg_match($this->opRegExp, $input, $match);
          $op = $match[0];
          $oplen = strlen($op);

          // The isset() guards against an operator at the very end of the
          // input.
          if (in_array($op, $this->assignOps, true) && isset($input[$oplen]) && $input[$oplen] == '=') {
            $tt = OP_ASSIGN;
            $match[0] .= '=';
          }
          else {
            $tt = $op;
            if ($this->scanOperand) {
              if ($op == OP_PLUS) {
                $tt = OP_UNARY_PLUS;
              }
              elseif ($op == OP_MINUS) {
                $tt = OP_UNARY_MINUS;
              }
            }
            $op = null;
          }
          break;

        case '.':
          if (preg_match('/^\\.\\d+(?:[eE][-+]?\\d+)?/', $input, $match)) {
            $tt = TOKEN_NUMBER;
            break;
          }

        // FALL THROUGH
        case ';':
        case ',':
        case '?':
        case ':':
        case '~':
        case '[':
        case ']':
        case '{':
        case '}':
        case '(':
        case ')':

          // These are all single characters.
          $match = array(
            $input[0],
          );
          $tt = $input[0];
          break;

        case '@':

          // Check for the end of a conditional comment.
          if (substr($input, 0, 3) == '@*/') {
            $match = array(
              '@*/',
            );
            $tt = TOKEN_CONDCOMMENT_END;
          }
          else {
            throw $this->newSyntaxError('Illegal token');
          }
          break;

        case "\n":
          if ($this->scanNewlines) {
            $match = array(
              "\n",
            );
            $tt = TOKEN_NEWLINE;
          }
          else {
            throw $this->newSyntaxError('Illegal token');
          }
          break;

        default:

          // Fast path for identifiers: word chars followed by whitespace or
          // various other tokens. Note we don't need to exclude digits in the
          // first char, as they've already been found above.
          if (!preg_match('/^[$\\w]+(?=[\\s\\/\\|\\^\\&<>\\+\\-\\*%=!.;,\\?:~\\[\\]\\{\\}\\(\\)@])/', $input, $match)) {

            // Character classes per ECMA-262 edition 5.1 section 7.6.
            // Per spec, must accept Unicode 3.0, *may* accept later versions.
            // We'll take whatever PCRE understands, which should be more
            // recent. \p{Pc} is UnicodeConnectorPunctuation.
            $identifierStartChars = "\\p{L}\\p{Nl}" . "\$" . "_";
            $identifierPartChars = $identifierStartChars . "\\p{Mn}\\p{Mc}" . "\\p{Nd}" . "\\p{Pc}";
            $unicodeEscape = "\\\\u[0-9A-Fa-f]{4}";
            $identifierRegex = "/^" . "(?:[{$identifierStartChars}]|{$unicodeEscape})" . "(?:[{$identifierPartChars}]|{$unicodeEscape})*" . "/uS";
            if (!preg_match($identifierRegex, $input, $match)) {
              if ($chunksize) {
                // With the /u modifier the match fails outright when a
                // size-limited chunk ends in the middle of a multi-byte
                // character; retry with a full chunk fetch before giving up.
                return $this->get(null);
              }
              throw $this->newSyntaxError('Illegal token');
            }
            if (strpos($match[0], '\\') !== false) {

              // Per ECMA-262 edition 5.1, section 7.6 escape sequences should
              // behave as if they were the original chars, but only within
              // the boundaries of the identifier.
              $decoded = preg_replace_callback('/\\\\u([0-9A-Fa-f]{4})/', array(
                __CLASS__,
                'unicodeEscapeCallback',
              ), $match[0]);

              // Since our original regex didn't de-escape the originals, we
              // need to check for validity again. No need to worry about
              // token boundaries, as anything outside the identifier is
              // illegal!
              if (!preg_match("/^[{$identifierStartChars}][{$identifierPartChars}]*\$/u", $decoded)) {
                throw $this->newSyntaxError('Illegal token');
              }

              // Per spec it _ought_ to work to use these escapes for keywords
              // as well... but IE rejects them as invalid, while Firefox and
              // Chrome treat them as identifiers that don't match the
              // keyword.
              if (in_array($decoded, $this->keywords, true)) {
                throw $this->newSyntaxError('Illegal token');
              }

              // TODO: save the decoded form for output?
            }
          }
          $tt = in_array($match[0], $this->keywords, true) ? $match[0] : TOKEN_IDENTIFIER;
      }

      // A token that ends at (or within a few bytes of) the end of a
      // size-limited chunk may have been cut short, e.g. a very long
      // identifier or a number whose exponent lies beyond the chunk. Scan it
      // again on the full input. Eight bytes covers the longest context any
      // pattern above needs after a shorter match (a \uXXXX escape).
      if ($chunksize && strlen($input) >= $chunksize && strlen($match[0]) > strlen($input) - 8) {
        return $this->get(null);
      }
    }

    // Store the token in the next ring-buffer slot, reusing the slot's object
    // when one exists already.
    $this->tokenIndex = ($this->tokenIndex + 1) & 3;
    if (!isset($this->tokens[$this->tokenIndex])) {
      $this->tokens[$this->tokenIndex] = new JSToken();
    }
    $token = $this->tokens[$this->tokenIndex];
    $token->type = $tt;
    if ($tt == OP_ASSIGN) {
      $token->assignOp = $op;
    }
    $token->start = $this->cursor;
    $token->value = $match[0];
    $this->cursor += strlen($match[0]);
    $token->end = $this->cursor;
    $token->lineno = $this->lineno;
    return $tt;
  }

  /**
   * Pushes the current token back so that the next get() returns it again.
   *
   * @throws Exception
   *   When this would be the fourth token pushed back; the ring buffer only
   *   holds four tokens.
   */
  public function unget() {
    if (++$this->lookahead == 4) {
      throw $this->newSyntaxError('PANIC: too much lookahead!');
    }
    $this->tokenIndex = ($this->tokenIndex - 1) & 3;
  }

  /**
   * Creates (but does not throw) an exception describing a parse error.
   *
   * @param string $m
   *   The error description.
   *
   * @return Exception
   *   Exception whose message contains $m, the file name and the current line
   *   number.
   */
  public function newSyntaxError($m) {
    return new Exception('Parse error: ' . $m . ' in file \'' . $this->filename . '\' on line ' . $this->lineno);
  }

  /**
   * preg_replace_callback() handler that decodes one \uXXXX escape.
   *
   * @param array $m
   *   Match data; $m[1] holds the four hexadecimal digits.
   *
   * @return string
   *   The result of decoding the numeric character reference "&#xXXXX;" as
   *   UTF-8.
   */
  public static function unicodeEscapeCallback($m) {
    return html_entity_decode('&#x' . $m[1] . ';', ENT_QUOTES, 'UTF-8');
  }

}

Members

Name (sort descending) Modifiers Type Description Overrides
JSTokenizer::$assignOps private property
JSTokenizer::$cursor private property
JSTokenizer::$filename public property
JSTokenizer::$keywords private property
JSTokenizer::$lineno public property
JSTokenizer::$lookahead public property
JSTokenizer::$opRegExp private property
JSTokenizer::$opTypeNames private property
JSTokenizer::$scanNewlines public property
JSTokenizer::$scanOperand public property
JSTokenizer::$source private property
JSTokenizer::$tokenIndex public property
JSTokenizer::$tokens public property
JSTokenizer::currentToken public function
JSTokenizer::get public function
JSTokenizer::getInput public function
JSTokenizer::init public function
JSTokenizer::isDone public function
JSTokenizer::match public function
JSTokenizer::mustMatch public function
JSTokenizer::newSyntaxError public function
JSTokenizer::peek public function
JSTokenizer::peekOnSameLine public function
JSTokenizer::unget public function
JSTokenizer::unicodeEscapeCallback public static function
JSTokenizer::__construct public function