You are here

protected function Tokenizer::decodeCharacterReference in Zircon Profile 8

Same name and namespace in other branches
  1. 8.0 vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php \Masterminds\HTML5\Parser\Tokenizer::decodeCharacterReference()

Decode a character reference and return the string.

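Returns false only if the scanner is not positioned on an '&'. If the reference cannot be decoded (for example, the entity name is not found), a bare '&' is returned. If $inAttribute is set to true and the reference lacks its trailing ';', the consumed text is unconsumed and a bare '&' is returned as-is.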

Parameters

boolean $inAttribute: Set to true if the text is inside an attribute value, false otherwise.

4 calls to Tokenizer::decodeCharacterReference()
Tokenizer::characterReference in vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php
Handle character references (aka entities).
Tokenizer::quotedAttributeValue in vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php
Get an attribute value string.
Tokenizer::rcdata in vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php
Read text in RCDATA mode.
Tokenizer::unquotedAttributeValue in vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php

File

vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php, line 1004

Class

Tokenizer
The HTML5 tokenizer.

Namespace

Masterminds\HTML5\Parser

Code

/**
 * Decode a character reference at the scanner's current position.
 *
 * The scanner must be positioned on the '&' that starts the reference.
 * Numeric (decimal and hexadecimal) and named references are handled.
 *
 * @param bool $inAttribute
 *   Set to true if the text is inside of an attribute value. In that case a
 *   reference that is not terminated by ';' is unconsumed and a bare '&' is
 *   returned.
 *
 * @return string|false
 *   - false if the current character is not '&';
 *   - '&' if no reference could be decoded;
 *   - the value returned by the CharacterReference lookup if the reference
 *     is terminated by ';';
 *   - '&' followed by the looked-up value if the ';' is missing and
 *     $inAttribute is false (a parse error is reported).
 */
protected function decodeCharacterReference($inAttribute = false) {

  // If it fails this, it's definitely not an entity.
  if ($this->scanner->current() != '&') {
    return false;
  }

  // Next char after &.
  $tok = $this->scanner->next();

  // Remember where the reference body starts so that everything consumed
  // after the '&' can be given back on failure.
  $start = $this->scanner->position();

  // NOTE(review): this is a loose comparison, so a literal '0' following
  // the '&' also takes this branch. Left as is to preserve behaviour.
  if ($tok == false) {
    return '&';
  }

  // These indicate not an entity. We return just the &.
  if (strspn($tok, static::WHITE . "&<") == 1) {
    return '&';
  }

  // Numeric entity.
  if ($tok == '#') {
    $tok = $this->scanner->next();

    // Hexadecimal encoding.
    // X[0-9a-fA-F]+;
    // x[0-9a-fA-F]+;
    if ($tok == 'x' || $tok == 'X') {

      // Consume x.
      $tok = $this->scanner->next();

      // Convert from hex code to char.
      $hex = $this->scanner->getHex();

      // empty() alone is not enough here: PHP treats the string "0" as
      // empty, which would reject the well-formed reference "&#x0;" and
      // then unconsume the wrong number of characters. Only a genuinely
      // missing digit run is an error.
      if (empty($hex) && $hex !== '0') {
        $this->parseError("Expected &#xHEX;, got &#x%s", $tok);

        // We unconsume because we don't know what parser rules might
        // be in effect for the remaining chars. For example, '&#>'
        // might result in a specific parsing rule inside of tag
        // contexts, while not inside of pcdata context.
        $this->scanner->unconsume(2);
        return '&';
      }
      $entity = CharacterReference::lookupHex($hex);
    }
    else {

      // Convert from decimal to char.
      $numeric = $this->scanner->getNumeric();
      if ($numeric === false) {
        $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
        $this->scanner->unconsume(2);
        return '&';
      }
      $entity = CharacterReference::lookupDecimal($numeric);
    }
  }
  else {

    // Attempt to consume a string up to a ';'.
    // [a-zA-Z0-9]+;
    $cname = $this->scanner->getAsciiAlpha();
    $entity = CharacterReference::lookupName($cname);

    // When no entity is found provide the name of the unmatched string
    // and continue on as the & is not part of an entity. The & will
    // be converted to &amp; elsewhere.
    if ($entity == null) {
      $this->parseError("No match in entity table for '%s'", $cname);
      $this->scanner->unconsume($this->scanner->position() - $start);
      return '&';
    }
  }

  // The scanner has advanced the cursor for us.
  $tok = $this->scanner->current();

  // We have an entity. We're done here.
  if ($tok == ';') {
    $this->scanner->next();
    return $entity;
  }

  // If in an attribute, then failing to match ; means unconsume the
  // entire string. Otherwise, failure to match is an error.
  if ($inAttribute) {
    $this->scanner->unconsume($this->scanner->position() - $start);
    return '&';
  }
  $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
  return '&' . $entity;
}