You are here

public function JSTokenizer::get in Advanced CSS/JS Aggregation 8.4

Same name and namespace in other branches
  1. 8.2 advagg_js_minify/jsminplus.inc \JSTokenizer::get()
  2. 8.3 advagg_js_minify/jsminplus.inc \JSTokenizer::get()
  3. 6 advagg_js_compress/jsminplus.inc \JSTokenizer::get()
  4. 7.2 advagg_js_compress/jsminplus.inc \JSTokenizer::get()
  5. 7 advagg_js_compress/jsminplus.inc \JSTokenizer::get()
2 calls to JSTokenizer::get()
JSTokenizer::match in advagg_js_minify/jsminplus.inc
JSTokenizer::peek in advagg_js_minify/jsminplus.inc

File

advagg_js_minify/jsminplus.inc, line 2020
JSMinPlus version 1.4

Class

JSTokenizer

Code

/**
 * Advances the tokenizer by one token and returns that token's type.
 *
 * Tokens that were previously pushed back (counted by $this->lookahead) are
 * replayed first from the four-slot token ring buffer. Otherwise whitespace
 * and comments are skipped, the next token is recognised from the input at
 * $this->cursor, stored in the ring buffer and the cursor is moved past it.
 *
 * @param int|null $chunksize
 *   Size passed to $this->getInput(). A falsy value is used internally to
 *   retry with a "full chunk fetch" when a bounded chunk could not be
 *   matched (long comments, regular expressions or string literals).
 * @param bool $op_dot
 *   When TRUE, a word is always reported as TOKEN_IDENTIFIER, even if it is
 *   listed in $this->keywords (identifiers following a dot operator).
 *
 * @return mixed
 *   The token type: one of the TOKEN_* / OP_* constants, the operator or
 *   punctuation string itself, or the keyword string.
 *
 * @throws \Exception
 *   Whatever $this->newSyntaxError() creates (exact class is not visible
 *   here), for unterminated string literals and illegal tokens.
 */
public function get($chunksize = 1000, $op_dot = false) {
  // Replay tokens that are still buffered from an earlier push-back.
  while ($this->lookahead) {
    $this->lookahead--;
    // The ring buffer has four slots, hence the wrap-around with "& 3".
    $this->tokenIndex = ($this->tokenIndex + 1) & 3;
    $token = $this->tokens[$this->tokenIndex];
    if ($token->type != TOKEN_NEWLINE || $this->scanNewlines) {
      return $token->type;
    }
  }
  $conditional_comment = false;

  // Only meaningful for OP_ASSIGN tokens; initialised here so the final
  // bookkeeping never reads an undefined variable.
  $op = null;

  // Strip whitespace and comments.
  while (true) {
    $input = $this->getInput($chunksize);

    // Whitespace handling; gobble up \r as well (effectively we don't have
    // support for MAC newlines!). When newlines are significant they are
    // left in place so that they can be returned as TOKEN_NEWLINE below.
    $re = $this->scanNewlines ? '/^[ \\r\\t]+/' : '/^\\s+/';
    if (preg_match($re, $input, $match)) {
      $spaces = $match[0];
      $spacelen = strlen($spaces);
      $this->cursor += $spacelen;
      if (!$this->scanNewlines) {
        $this->lineno += substr_count($spaces, "\n");
      }
      if ($spacelen == $chunksize) {
        // The complete chunk contained whitespace; fetch the next one.
        continue;
      }
      $input = $this->getInput($chunksize);
      if ($input == '' || $input[0] != '/') {
        break;
      }
    }

    // Comments.
    if (!preg_match('/^\\/(?:\\*(@(?:cc_on|if|elif|else|end))?.*?\\*\\/|\\/[^\\n]*)/s', $input, $match)) {
      if (!$chunksize) {
        break;
      }

      // Retry with a full chunk fetch; this also prevents breakage of long
      // regular expressions (which will never match a comment).
      $chunksize = null;
      continue;
    }

    // A "//" comment that reaches the very end of a size-limited chunk may
    // have been cut off by the chunk boundary. Consuming only that part
    // would leave the rest of the comment to be tokenized as code, so match
    // again against a full fetch. Block comments are not affected because
    // they only match once their closing "*/" has been seen.
    if ($chunksize && $match[0][1] == '/' && strlen($match[0]) == strlen($input)) {
      $chunksize = null;
      continue;
    }

    // Check if this is a conditional (JScript) comment.
    if (!empty($match[1])) {
      $match[0] = '/*' . $match[1];
      $conditional_comment = true;
      break;
    }
    else {
      $this->cursor += strlen($match[0]);
      $this->lineno += substr_count($match[0], "\n");
    }
  }

  if ($input == '') {
    $tt = TOKEN_END;
    $match = array(
      '',
    );
  }
  elseif ($conditional_comment) {
    $tt = TOKEN_CONDCOMMENT_START;
  }
  else {
    switch ($input[0]) {
      case '0':

        // Hexadecimal. The offset is guarded because the input may consist
        // of the single character "0".
        if (isset($input[1]) && ($input[1] == 'x' || $input[1] == 'X') && preg_match('/^0x[0-9a-f]+/i', $input, $match)) {
          $tt = TOKEN_NUMBER;
          break;
        }

      // FALL THROUGH
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':

        // Should always match.
        preg_match('/^\\d+(?:\\.\\d*)?(?:[eE][-+]?\\d+)?/', $input, $match);
        $tt = TOKEN_NUMBER;
        break;

      case "'":
        if (preg_match('/^\'(?:[^\\\\\'\\r\\n]++|\\\\(?:.|\\r?\\n))*\'/', $input, $match)) {
          $tt = TOKEN_STRING;
        }
        else {
          if ($chunksize) {
            // Retry with a full chunk fetch.
            return $this->get(null);
          }
          throw $this->newSyntaxError('Unterminated string literal');
        }
        break;

      case '"':
        if (preg_match('/^"(?:[^\\\\"\\r\\n]++|\\\\(?:.|\\r?\\n))*"/', $input, $match)) {
          $tt = TOKEN_STRING;
        }
        else {
          if ($chunksize) {
            // Retry with a full chunk fetch.
            return $this->get(null);
          }
          throw $this->newSyntaxError('Unterminated string literal');
        }
        break;

      case '/':
        // A slash starts a regular expression literal only where an operand
        // is expected; otherwise it is handled as an operator below.
        if ($this->scanOperand && preg_match('/^\\/((?:\\\\.|\\[(?:\\\\.|[^\\]])*\\]|[^\\/])+)\\/([gimy]*)/', $input, $match)) {
          $tt = TOKEN_REGEXP;
          break;
        }

      // FALL THROUGH
      case '|':
      case '^':
      case '&':
      case '<':
      case '>':
      case '+':
      case '-':
      case '*':
      case '%':
      case '=':
      case '!':

        // Should always match.
        preg_match($this->opRegExp, $input, $match);
        $op = $match[0];
        $oplen = strlen($op);

        // An assignable operator directly followed by "=" is a compound
        // assignment. The offset is guarded because the operator may be the
        // last thing in the input.
        if (in_array($op, $this->assignOps) && isset($input[$oplen]) && $input[$oplen] == '=') {
          $tt = OP_ASSIGN;
          $match[0] .= '=';
        }
        else {
          $tt = $op;
          if ($this->scanOperand) {
            if ($op == OP_PLUS) {
              $tt = OP_UNARY_PLUS;
            }
            elseif ($op == OP_MINUS) {
              $tt = OP_UNARY_MINUS;
            }
          }
          // Not a compound assignment: a plain "=" token gets no assignOp.
          $op = null;
        }
        break;

      case '.':
        if (preg_match('/^\\.\\d+(?:[eE][-+]?\\d+)?/', $input, $match)) {
          $tt = TOKEN_NUMBER;
          break;
        }

      // FALL THROUGH
      case ';':
      case ',':
      case '?':
      case ':':
      case '~':
      case '[':
      case ']':
      case '{':
      case '}':
      case '(':
      case ')':

        // These are all single-character tokens.
        $match = array(
          $input[0],
        );
        $tt = $input[0];
        break;

      case '@':

        // Check end of conditional comment.
        if (substr($input, 0, 3) == '@*/') {
          $match = array(
            '@*/',
          );
          $tt = TOKEN_CONDCOMMENT_END;
        }
        else {
          throw $this->newSyntaxError('Illegal token');
        }
        break;

      case "\n":
        if ($this->scanNewlines) {
          $match = array(
            "\n",
          );
          $tt = TOKEN_NEWLINE;
        }
        else {
          throw $this->newSyntaxError('Illegal token');
        }
        break;

      default:

        // Fast path for identifiers: word chars followed by whitespace or
        // various other tokens. Note we don't need to exclude digits in the
        // first char, as they've already been found above.
        if (!preg_match('/^[$\\w]+(?=[\\s\\/\\|\\^\\&<>\\+\\-\\*%=!.;,\\?:~\\[\\]\\{\\}\\(\\)@])/', $input, $match)) {

          // Character classes per ECMA-262 edition 5.1 section 7.6.
          // Per spec, must accept Unicode 3.0, *may* accept later versions.
          // We'll take whatever PCRE understands, which should be more recent.
          $identifierStartChars = "\\p{L}\\p{Nl}" . "\$" . "_";
          // \p{Mn}\p{Mc}: combining marks, \p{Nd}: digits,
          // \p{Pc}: UnicodeConnectorPunctuation.
          $identifierPartChars = $identifierStartChars . "\\p{Mn}\\p{Mc}" . "\\p{Nd}" . "\\p{Pc}";

          // Exactly four hex digits; the character class must not contain a
          // literal "-".
          $unicodeEscape = "\\\\u[0-9A-Fa-f]{4}";
          $identifierRegex = "/^" . "(?:[{$identifierStartChars}]|{$unicodeEscape})" . "(?:[{$identifierPartChars}]|{$unicodeEscape})*" . "/uS";
          if (preg_match($identifierRegex, $input, $match)) {
            if (strpos($match[0], '\\') !== false) {

              // Per ECMA-262 edition 5.1, section 7.6 escape sequences should
              // behave as if they were the original chars, but only within
              // the boundaries of the identifier.
              $decoded = preg_replace_callback('/\\\\u([0-9A-Fa-f]{4})/', array(
                __CLASS__,
                'unicodeEscapeCallback',
              ), $match[0]);

              // Since our original regex didn't de-escape the originals, we
              // need to check for validity again. No need to worry about
              // token boundaries, as anything outside the identifier is
              // illegal!
              if (!preg_match("/^[{$identifierStartChars}][{$identifierPartChars}]*\$/u", $decoded)) {
                throw $this->newSyntaxError('Illegal token');
              }

              // Per spec it _ought_ to work to use these escapes for keywords
              // as well... but IE rejects them as invalid, while Firefox and
              // Chrome treat them as identifiers that don't match the keyword.
              if (in_array($decoded, $this->keywords)) {
                throw $this->newSyntaxError('Illegal token');
              }

              // TODO: save the decoded form for output?
            }
          }
          else {
            throw $this->newSyntaxError('Illegal token');
          }
        }

        // Identifiers after an OP_DOT can include otherwise reserved keywords.
        if ($op_dot) {
          $tt = TOKEN_IDENTIFIER;
        }
        else {
          $tt = in_array($match[0], $this->keywords) ? $match[0] : TOKEN_IDENTIFIER;
        }
    }
  }

  // Store the new token in the next ring buffer slot, creating the slot's
  // JSToken object on first use.
  $this->tokenIndex = ($this->tokenIndex + 1) & 3;
  if (!isset($this->tokens[$this->tokenIndex])) {
    $this->tokens[$this->tokenIndex] = new JSToken();
  }
  $token = $this->tokens[$this->tokenIndex];
  $token->type = $tt;
  if ($tt == OP_ASSIGN) {
    $token->assignOp = $op;
  }
  $token->start = $this->cursor;
  $token->value = $match[0];
  $this->cursor += strlen($match[0]);
  $token->end = $this->cursor;
  $token->lineno = $this->lineno;
  return $tt;
}