class JSTokenizer in Advanced CSS/JS Aggregation 8.2
Same name and namespace in other branches
- 8.4 advagg_js_minify/jsminplus.inc \JSTokenizer
- 8.3 advagg_js_minify/jsminplus.inc \JSTokenizer
- 6 advagg_js_compress/jsminplus.inc \JSTokenizer
- 7.2 advagg_js_compress/jsminplus.inc \JSTokenizer
- 7 advagg_js_compress/jsminplus.inc \JSTokenizer
Hierarchy
- class \JSTokenizer
Expanded class hierarchy of JSTokenizer
File
- advagg_js_minify/
jsminplus.inc, line 1856 - JSMinPlus version 1.4
View source
/**
 * Lexical scanner that turns JavaScript source text into JSToken objects.
 *
 * Scanned tokens are stored in a four-slot ring buffer ($tokens, indexed
 * modulo 4). unget() pushes the current token back so that the next get()
 * replays it instead of scanning the source again; at most three tokens can
 * be pushed back at any time.
 */
class JSTokenizer {

  /**
   * Byte offset into $source of the next character to scan.
   */
  private $cursor = 0;

  /**
   * The complete JavaScript source being tokenized.
   */
  private $source;

  /**
   * Ring buffer of JSToken objects, indexed 0..3.
   */
  public $tokens = array();

  /**
   * Slot in $tokens that holds the current token.
   */
  public $tokenIndex = 0;

  /**
   * Number of tokens that were pushed back with unget() and not yet re-read.
   */
  public $lookahead = 0;

  /**
   * When TRUE, "\n" is returned as TOKEN_NEWLINE instead of being skipped.
   */
  public $scanNewlines = false;

  /**
   * When TRUE, '/' may start a regular expression literal and '+' / '-' are
   * reported as unary operators. Apart from init(), this class never changes
   * the flag itself.
   */
  public $scanOperand = true;

  /**
   * Name used in error messages; '[inline]' when none was given to init().
   */
  public $filename;

  /**
   * Line number of the scan position, used for tokens and error messages.
   */
  public $lineno;

  /**
   * Reserved words; an identifier equal to one of these is returned with the
   * word itself as token type.
   */
  private $keywords = array(
    'break',
    'case',
    'catch',
    'const',
    'continue',
    'debugger',
    'default',
    'delete',
    'do',
    'else',
    'enum',
    'false',
    'finally',
    'for',
    'function',
    'if',
    'in',
    'instanceof',
    'new',
    'null',
    'return',
    'switch',
    'this',
    'throw',
    'true',
    'try',
    'typeof',
    'var',
    'void',
    'while',
    'with',
  );

  /**
   * All operator / punctuator spellings.
   *
   * The order matters: the list is turned into a regex alternation, so longer
   * operators must precede their own prefixes ('===' before '==' before '=').
   */
  private $opTypeNames = array(
    ';',
    ',',
    '?',
    ':',
    '||',
    '&&',
    '|',
    '^',
    '&',
    '===',
    '==',
    '=',
    '!==',
    '!=',
    '<<',
    '<=',
    '<',
    '>>>',
    '>>',
    '>=',
    '>',
    '++',
    '--',
    '+',
    '-',
    '*',
    '/',
    '%',
    '!',
    '~',
    '.',
    '[',
    ']',
    '{',
    '}',
    '(',
    ')',
    '@*/',
  );

  /**
   * Operators that form a compound assignment when directly followed by '='.
   */
  private $assignOps = array(
    '|',
    '^',
    '&',
    '<<',
    '>>',
    '>>>',
    '+',
    '-',
    '*',
    '/',
    '%',
  );

  /**
   * Anchored regular expression matching any entry of $opTypeNames.
   */
  private $opRegExp;

  /**
   * Builds the operator regular expression from $opTypeNames.
   */
  public function __construct() {
    $this->opRegExp = '#^(' . implode('|', array_map('preg_quote', $this->opTypeNames)) . ')#';
  }

  /**
   * Resets the tokenizer and loads a new source text.
   *
   * @param string $source
   *   JavaScript source to tokenize.
   * @param string $filename
   *   Name reported in error messages; '[inline]' is used when empty.
   * @param int $lineno
   *   Line number assigned to the first line of $source.
   */
  public function init($source, $filename = '', $lineno = 1) {
    $this->source = $source;
    $this->filename = $filename ? $filename : '[inline]';
    $this->lineno = $lineno;
    $this->cursor = 0;
    $this->tokens = array();
    $this->tokenIndex = 0;
    $this->lookahead = 0;
    $this->scanNewlines = false;
    $this->scanOperand = true;
  }

  /**
   * Returns unscanned source text starting at the cursor.
   *
   * @param int|null $chunksize
   *   Maximum number of bytes to return; a falsy value returns everything
   *   that is left.
   *
   * @return string
   *   The requested slice of the source.
   */
  public function getInput($chunksize) {
    if ($chunksize) {
      return substr($this->source, $this->cursor, $chunksize);
    }
    return substr($this->source, $this->cursor);
  }

  /**
   * Tells whether the next token is TOKEN_END.
   *
   * @return bool
   *   TRUE when no further tokens are available.
   */
  public function isDone() {
    return $this->peek() == TOKEN_END;
  }

  /**
   * Consumes the next token if it has the given type.
   *
   * @param mixed $tt
   *   Token type to compare against.
   *
   * @return bool
   *   TRUE when the token matched and was consumed; FALSE when it did not
   *   match, in which case it is pushed back.
   */
  public function match($tt) {
    if ($this->get() == $tt) {
      return true;
    }
    $this->unget();
    return false;
  }

  /**
   * Consumes the next token, which must have the given type.
   *
   * @param mixed $tt
   *   Required token type.
   *
   * @return object|null
   *   The matched token, as returned by currentToken().
   *
   * @throws Exception
   *   When the next token has a different type.
   */
  public function mustMatch($tt) {
    if (!$this->match($tt)) {
      throw $this->newSyntaxError('Unexpected token; token ' . $tt . ' expected');
    }
    return $this->currentToken();
  }

  /**
   * Returns the type of the next token without consuming it.
   *
   * @return mixed
   *   The next token type. While $scanNewlines is set, a pushed-back token
   *   whose line differs from the current line is reported as TOKEN_NEWLINE.
   */
  public function peek() {
    if ($this->lookahead) {
      // Inspect the pushed-back token directly instead of re-reading it.
      $next = $this->tokens[$this->tokenIndex + $this->lookahead & 3];
      if ($this->scanNewlines && $next->lineno != $this->lineno) {
        $tt = TOKEN_NEWLINE;
      }
      else {
        $tt = $next->type;
      }
    }
    else {
      $tt = $this->get();
      $this->unget();
    }
    return $tt;
  }

  /**
   * Like peek(), but with $scanNewlines enabled for the duration of the call.
   *
   * @return mixed
   *   The next token type, or TOKEN_NEWLINE when a line break comes first.
   */
  public function peekOnSameLine() {
    $this->scanNewlines = true;
    $tt = $this->peek();
    $this->scanNewlines = false;
    return $tt;
  }

  /**
   * Returns the token stored in the current ring-buffer slot.
   *
   * @return object|null
   *   The current token, or NULL when the current slot has not been filled
   *   yet (nothing scanned so far, or pushed back onto an unused slot).
   */
  public function currentToken() {
    // Checking the slot itself (not just "buffer is non-empty") avoids an
    // undefined-index error when tokenIndex points at a slot never written.
    return isset($this->tokens[$this->tokenIndex]) ? $this->tokens[$this->tokenIndex] : null;
  }

  /**
   * Advances to the next token and returns its type.
   *
   * Pushed-back tokens are replayed first. Otherwise whitespace and comments
   * are skipped and one token is scanned from the source and stored in the
   * next ring-buffer slot.
   *
   * @param int|null $chunksize
   *   Number of bytes of source to look at initially. A falsy value makes the
   *   scanner work on all remaining source; the method switches to that mode
   *   itself when a chunk turns out to be too small.
   *
   * @return mixed
   *   A TOKEN_* constant, OP_ASSIGN / OP_UNARY_PLUS / OP_UNARY_MINUS, or the
   *   operator or keyword text itself.
   *
   * @throws Exception
   *   On an unterminated string literal or an illegal token.
   */
  public function get($chunksize = 1000) {
    // Replay tokens that were pushed back with unget().
    while ($this->lookahead) {
      $this->lookahead--;
      $this->tokenIndex = $this->tokenIndex + 1 & 3;
      $token = $this->tokens[$this->tokenIndex];
      if ($token->type != TOKEN_NEWLINE || $this->scanNewlines) {
        return $token->type;
      }
    }

    $conditional_comment = false;

    // Strip whitespace and comments.
    while (true) {
      $input = $this->getInput($chunksize);

      // Whitespace handling; gobble up \r as well (effectively we don't have
      // support for MAC newlines!). Newlines are left in place while
      // $scanNewlines is set so they can be returned as tokens.
      $re = $this->scanNewlines ? '/^[ \\r\\t]+/' : '/^\\s+/';
      if (preg_match($re, $input, $match)) {
        $spaces = $match[0];
        $spacelen = strlen($spaces);
        $this->cursor += $spacelen;
        if (!$this->scanNewlines) {
          $this->lineno += substr_count($spaces, "\n");
        }
        if ($spacelen == $chunksize) {
          // Complete chunk contained whitespace.
          continue;
        }
        $input = $this->getInput($chunksize);
        if ($input == '' || $input[0] != '/') {
          break;
        }
      }

      // Comments.
      if (!preg_match('/^\\/(?:\\*(@(?:cc_on|if|elif|else|end))?.*?\\*\\/|\\/[^\\n]*)/s', $input, $match)) {
        if (!$chunksize) {
          break;
        }
        // Retry with a full chunk fetch; this also prevents breakage of long
        // regular expressions (which will never match a comment).
        $chunksize = null;
        continue;
      }

      // Check if this is a conditional (JScript) comment.
      if (!empty($match[1])) {
        // Only the opening "/*@keyword" becomes the token; the rest of the
        // comment is tokenized as ordinary code.
        $match[0] = '/*' . $match[1];
        $conditional_comment = true;
        break;
      }
      else {
        $this->cursor += strlen($match[0]);
        $this->lineno += substr_count($match[0], "\n");
      }
    }

    if ($input == '') {
      $tt = TOKEN_END;
      $match = array(
        '',
      );
    }
    elseif ($conditional_comment) {
      $tt = TOKEN_CONDCOMMENT_START;
    }
    else {
      switch ($input[0]) {
        case '0':
          // Hexadecimal. The isset() guard covers a lone "0" at the very end
          // of the source, where offset 1 does not exist.
          if (isset($input[1]) && ($input[1] == 'x' || $input[1] == 'X') && preg_match('/^0x[0-9a-f]+/i', $input, $match)) {
            $tt = TOKEN_NUMBER;
            break;
          }
          // FALL THROUGH
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          // Should always match.
          preg_match('/^\\d+(?:\\.\\d*)?(?:[eE][-+]?\\d+)?/', $input, $match);
          $tt = TOKEN_NUMBER;
          break;

        case "'":
          if (preg_match('/^\'(?:[^\\\\\'\\r\\n]++|\\\\(?:.|\\r?\\n))*\'/', $input, $match)) {
            $tt = TOKEN_STRING;
          }
          else {
            if ($chunksize) {
              // Retry with a full chunk fetch.
              return $this->get(null);
            }
            throw $this->newSyntaxError('Unterminated string literal');
          }
          break;

        case '"':
          if (preg_match('/^"(?:[^\\\\"\\r\\n]++|\\\\(?:.|\\r?\\n))*"/', $input, $match)) {
            $tt = TOKEN_STRING;
          }
          else {
            if ($chunksize) {
              // Retry with a full chunk fetch.
              return $this->get(null);
            }
            throw $this->newSyntaxError('Unterminated string literal');
          }
          break;

        case '/':
          if ($this->scanOperand && preg_match('/^\\/((?:\\\\.|\\[(?:\\\\.|[^\\]])*\\]|[^\\/])+)\\/([gimy]*)/', $input, $match)) {
            $tt = TOKEN_REGEXP;
            break;
          }
          // FALL THROUGH
        case '|':
        case '^':
        case '&':
        case '<':
        case '>':
        case '+':
        case '-':
        case '*':
        case '%':
        case '=':
        case '!':
          // Should always match.
          preg_match($this->opRegExp, $input, $match);
          $op = $match[0];
          $oplen = strlen($op);
          // The isset() guard covers an operator that is the last byte of the
          // source, where there is no following character to inspect.
          if (in_array($op, $this->assignOps, true) && isset($input[$oplen]) && $input[$oplen] == '=') {
            $tt = OP_ASSIGN;
            $match[0] .= '=';
          }
          else {
            $tt = $op;
            if ($this->scanOperand) {
              if ($op == OP_PLUS) {
                $tt = OP_UNARY_PLUS;
              }
              elseif ($op == OP_MINUS) {
                $tt = OP_UNARY_MINUS;
              }
            }
            // A plain '=' has the same type as OP_ASSIGN below; clearing $op
            // makes its assignOp NULL rather than '='.
            $op = null;
          }
          break;

        case '.':
          if (preg_match('/^\\.\\d+(?:[eE][-+]?\\d+)?/', $input, $match)) {
            $tt = TOKEN_NUMBER;
            break;
          }
          // FALL THROUGH
        case ';':
        case ',':
        case '?':
        case ':':
        case '~':
        case '[':
        case ']':
        case '{':
        case '}':
        case '(':
        case ')':
          // These are all single characters.
          $match = array(
            $input[0],
          );
          $tt = $input[0];
          break;

        case '@':
          // Check end of conditional comment.
          if (substr($input, 0, 3) == '@*/') {
            $match = array(
              '@*/',
            );
            $tt = TOKEN_CONDCOMMENT_END;
          }
          else {
            throw $this->newSyntaxError('Illegal token');
          }
          break;

        case "\n":
          if ($this->scanNewlines) {
            $match = array(
              "\n",
            );
            $tt = TOKEN_NEWLINE;
          }
          else {
            throw $this->newSyntaxError('Illegal token');
          }
          break;

        default:
          // Fast path for identifiers: word chars followed by whitespace or
          // various other tokens. Note we don't need to exclude digits in the
          // first char, as they've already been found above.
          if (!preg_match('/^[$\\w]+(?=[\\s\\/\\|\\^\\&<>\\+\\-\\*%=!.;,\\?:~\\[\\]\\{\\}\\(\\)@])/', $input, $match)) {
            // Character classes per ECMA-262 edition 5.1 section 7.6.
            // Per spec, must accept Unicode 3.0, *may* accept later versions.
            // We'll take whatever PCRE understands, which should be more
            // recent.
            $identifierStartChars = "\\p{L}\\p{Nl}" . "\$" . "_";
            // \p{Pc} is UnicodeConnectorPunctuation.
            $identifierPartChars = $identifierStartChars . "\\p{Mn}\\p{Mc}" . "\\p{Nd}" . "\\p{Pc}";
            // Exactly four hex digits, the same class the decoder below uses.
            $unicodeEscape = "\\\\u[0-9A-Fa-f]{4}";
            $identifierRegex = "/^" . "(?:[{$identifierStartChars}]|{$unicodeEscape})" . "(?:[{$identifierPartChars}]|{$unicodeEscape})*" . "/uS";
            if (preg_match($identifierRegex, $input, $match)) {
              if (strpos($match[0], '\\') !== false) {
                // Per ECMA-262 edition 5.1, section 7.6 escape sequences
                // should behave as if they were the original chars, but only
                // within the boundaries of the identifier.
                $decoded = preg_replace_callback('/\\\\u([0-9A-Fa-f]{4})/', array(
                  __CLASS__,
                  'unicodeEscapeCallback',
                ), $match[0]);
                // Since our original regex didn't de-escape the originals, we
                // need to check for validity again. No need to worry about
                // token boundaries, as anything outside the identifier is
                // illegal!
                if (!preg_match("/^[{$identifierStartChars}][{$identifierPartChars}]*\$/u", $decoded)) {
                  throw $this->newSyntaxError('Illegal token');
                }
                // Per spec it _ought_ to work to use these escapes for
                // keywords as well... but IE rejects them as invalid, while
                // Firefox and Chrome treat them as identifiers that don't
                // match the keyword.
                if (in_array($decoded, $this->keywords, true)) {
                  throw $this->newSyntaxError('Illegal token');
                }
                // TODO: save the decoded form for output?
              }
            }
            else {
              throw $this->newSyntaxError('Illegal token');
            }
          }
          $tt = in_array($match[0], $this->keywords, true) ? $match[0] : TOKEN_IDENTIFIER;
      }
    }

    // Store the token in the next ring-buffer slot, reusing the JSToken
    // object that is already there if the slot was used before.
    $this->tokenIndex = $this->tokenIndex + 1 & 3;
    if (!isset($this->tokens[$this->tokenIndex])) {
      $this->tokens[$this->tokenIndex] = new JSToken();
    }
    $token = $this->tokens[$this->tokenIndex];
    $token->type = $tt;
    if ($tt == OP_ASSIGN) {
      $token->assignOp = $op;
    }
    $token->start = $this->cursor;
    $token->value = $match[0];
    $this->cursor += strlen($match[0]);
    $token->end = $this->cursor;
    $token->lineno = $this->lineno;
    return $tt;
  }

  /**
   * Pushes the current token back so the next get() returns it again.
   *
   * @throws Exception
   *   When this would be the fourth token pushed back in a row, which the
   *   four-slot ring buffer cannot hold.
   */
  public function unget() {
    if (++$this->lookahead == 4) {
      throw $this->newSyntaxError('PANIC: too much lookahead!');
    }
    $this->tokenIndex = $this->tokenIndex - 1 & 3;
  }

  /**
   * Creates (but does not throw) a parse error for the current position.
   *
   * @param string $m
   *   Description of the problem.
   *
   * @return Exception
   *   Exception whose message includes the file name and current line.
   */
  public function newSyntaxError($m) {
    return new Exception('Parse error: ' . $m . ' in file \'' . $this->filename . '\' on line ' . $this->lineno);
  }

  /**
   * preg_replace_callback() handler that decodes one \uXXXX escape.
   *
   * @param array $m
   *   Match array; $m[1] holds the four hexadecimal digits.
   *
   * @return string
   *   The character for that code point, encoded as UTF-8.
   */
  public static function unicodeEscapeCallback($m) {
    return html_entity_decode('&#x' . $m[1] . ';', ENT_QUOTES, 'UTF-8');
  }

}