You are here

protected function simple_html_dom::read_tag in simplehtmldom API 7

Same name and namespace in other branches
  1. 5.2 simplehtmldom/simple_html_dom.php \simple_html_dom::read_tag()
  2. 6 simplehtmldom/simple_html_dom.php \simple_html_dom::read_tag()
1 call to simple_html_dom::read_tag()
simple_html_dom::parse in simplehtmldom/simple_html_dom.php

File

simplehtmldom/simple_html_dom.php, line 626

Class

simple_html_dom

Code

/**
 * Reads one tag starting at the current parser position.
 *
 * Depending on what follows the '<', this either closes an element (end tag),
 * records a comment/doctype/unknown node, records a stray '<' as text, or
 * opens a new element and parses its attributes.
 *
 * @return bool
 *   FALSE when the current character is not '<' (the root node's end is
 *   recorded in that case). TRUE once a tag or text fragment has been
 *   consumed. For an end tag that matches no open element, the result of
 *   as_text_node() is returned instead.
 */
protected function read_tag() {
  if ($this->char !== '<') {
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    return false;
  }
  $begin_tag_pos = $this->pos;

  // Advance to the character after '<'.
  $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;

  // End tag.
  if ($this->char === '/') {
    // Advance past the '/'.
    $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;
    $this->skip($this->token_blank_t);
    $tag = $this->copy_until_char('>');

    // Skip attributes in end tag.
    if (($pos = strpos($tag, ' ')) !== false) {
      $tag = substr($tag, 0, $pos);
    }
    $parent_lower = strtolower($this->parent->tag);
    $tag_lower = strtolower($tag);

    // The end tag does not close the current parent: try to find the
    // element it belongs to further up the tree.
    if ($parent_lower !== $tag_lower) {
      if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
        $this->parent->_[HDOM_INFO_END] = 0;
        $org_parent = $this->parent;
        while ($this->parent->parent && strtolower($this->parent->tag) !== $tag_lower) {
          $this->parent = $this->parent->parent;
        }
        if (strtolower($this->parent->tag) !== $tag_lower) {
          // No matching ancestor: restore the original parent, step up one
          // level when possible, and keep the end tag as text.
          $this->parent = $org_parent;
          if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
          }
          $this->parent->_[HDOM_INFO_END] = $this->cursor;
          return $this->as_text_node($tag);
        }
      }
      elseif ($this->parent->parent && isset($this->block_tags[$tag_lower])) {
        $this->parent->_[HDOM_INFO_END] = 0;
        $org_parent = $this->parent;
        while ($this->parent->parent && strtolower($this->parent->tag) !== $tag_lower) {
          $this->parent = $this->parent->parent;
        }
        if (strtolower($this->parent->tag) !== $tag_lower) {
          // No matching ancestor: restore the original parent and keep the
          // end tag as text.
          $this->parent = $org_parent;
          $this->parent->_[HDOM_INFO_END] = $this->cursor;
          return $this->as_text_node($tag);
        }
      }
      elseif ($this->parent->parent && strtolower($this->parent->parent->tag) === $tag_lower) {
        // The end tag closes the grandparent: the current parent is left
        // with END = 0 and the grandparent becomes the element to close.
        $this->parent->_[HDOM_INFO_END] = 0;
        $this->parent = $this->parent->parent;
      }
      else {
        return $this->as_text_node($tag);
      }
    }

    // Close the matched element and move back up to its parent.
    $this->parent->_[HDOM_INFO_END] = $this->cursor;
    if ($this->parent->parent) {
      $this->parent = $this->parent->parent;
    }

    // Advance past the '>'.
    $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;
    return true;
  }

  $node = new simple_html_dom_node($this);
  $node->_[HDOM_INFO_BEGIN] = $this->cursor;
  ++$this->cursor;
  $tag = $this->copy_until($this->token_slash);

  // Doctype, cdata & comments ("<!...").
  if (isset($tag[0]) && $tag[0] === '!') {
    $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
    if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') {
      $node->nodetype = HDOM_TYPE_COMMENT;
      $node->tag = 'comment';
    }
    else {
      $node->nodetype = HDOM_TYPE_UNKNOWN;
      $node->tag = 'unknown';
    }
    if ($this->char === '>') {
      $node->_[HDOM_INFO_TEXT] .= '>';
    }
    $this->link_nodes($node, true);

    // Advance past the '>'.
    $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;
    return true;
  }

  // Text: another '<' was found inside the would-be tag name, so the first
  // '<' was not the start of a real tag.
  if (strpos($tag, '<') !== false) {
    $tag = '<' . substr($tag, 0, -1);
    $node->_[HDOM_INFO_TEXT] = $tag;
    $this->link_nodes($node, false);

    // Step back one character so the next call re-reads from there.
    $this->char = $this->doc[--$this->pos];
    return true;
  }

  // Not a valid tag name: keep the fragment as text. The hyphen is placed
  // last in the character class so it is always a literal; PCRE2 (PHP 7.3+)
  // rejects "\w-:" as an invalid range.
  if (!preg_match('/^[\w:-]+$/', $tag)) {
    $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
    if ($this->char === '<') {
      $this->link_nodes($node, false);
      return true;
    }
    if ($this->char === '>') {
      $node->_[HDOM_INFO_TEXT] .= '>';
    }
    $this->link_nodes($node, false);

    // Advance past the current character.
    $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;
    return true;
  }

  // Begin tag.
  $node->nodetype = HDOM_TYPE_ELEMENT;
  $tag_lower = strtolower($tag);
  $node->tag = $this->lowercase ? $tag_lower : $tag;

  // Handle optional closing tags: walk up past every open element that this
  // tag is listed as implicitly closing.
  if (isset($this->optional_closing_tags[$tag_lower])) {
    while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
      $this->parent->_[HDOM_INFO_END] = 0;
      $this->parent = $this->parent->parent;
    }
    $node->parent = $this->parent;
  }

  // Position seen after the previous attribute name; used to detect an
  // iteration that made no progress and so prevent an infinite loop.
  $guard = 0;
  $space = array(
    $this->copy_skip($this->token_blank),
    '',
    '',
  );

  // Attributes.
  do {
    if ($this->char !== null && $space[0] === '') {
      break;
    }
    $name = $this->copy_until($this->token_equal);
    if ($guard === $this->pos) {
      // No progress since the last iteration: force the position forward.
      $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;
      continue;
    }
    $guard = $this->pos;

    // Handle endless '<': the document ends before the tag is closed.
    if ($this->pos >= $this->size - 1 && $this->char !== '>') {
      $node->nodetype = HDOM_TYPE_TEXT;
      $node->_[HDOM_INFO_END] = 0;
      $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
      $node->tag = 'text';
      $this->link_nodes($node, false);
      return true;
    }

    // Handle mismatch '<': a new '<' appeared before this tag was closed, so
    // everything read so far becomes text.
    if ($this->doc[$this->pos - 1] === '<') {
      $node->nodetype = HDOM_TYPE_TEXT;
      $node->tag = 'text';
      $node->attr = array();
      $node->_[HDOM_INFO_END] = 0;
      $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1);

      // Rewind so that the current character is the '<' just seen.
      $this->pos -= 2;
      $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;
      $this->link_nodes($node, false);
      return true;
    }

    if ($name !== '/' && $name !== '') {
      $space[1] = $this->copy_skip($this->token_blank);
      $name = $this->restore_noise($name);
      if ($this->lowercase) {
        $name = strtolower($name);
      }
      if ($this->char === '=') {
        // Advance past the '=' before reading the value.
        $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;
        $this->parse_attr($node, $name, $space);
      }
      else {
        // No value attr: nowrap, checked, selected...
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $node->attr[$name] = true;
        if ($this->char !== '>') {
          // Step back one character.
          $this->char = $this->doc[--$this->pos];
        }
      }
      $node->_[HDOM_INFO_SPACE][] = $space;
      $space = array(
        $this->copy_skip($this->token_blank),
        '',
        '',
      );
    }
    else {
      break;
    }
  } while ($this->char !== '>' && $this->char !== '/');

  $this->link_nodes($node, true);
  $node->_[HDOM_INFO_ENDSPACE] = $space[0];

  // Check self closing ("<tag />").
  if ($this->copy_until_char_escape('>') === '/') {
    $node->_[HDOM_INFO_ENDSPACE] .= '/';
    $node->_[HDOM_INFO_END] = 0;
  }
  elseif (!isset($this->self_closing_tags[strtolower($node->tag)])) {
    // Reset parent: following nodes are read with this element as parent.
    $this->parent = $node;
  }

  // Advance past the '>'.
  $this->char = ++$this->pos < $this->size ? $this->doc[$this->pos] : null;
  return true;
}