You are here

class FeedsXPathParserHTML in Feeds XPath Parser 7

Same name and namespace in other branches
  1. 6 FeedsXPathParserHTML.inc \FeedsXPathParserHTML

XPath parsing for HTML.

Hierarchy

Expanded class hierarchy of FeedsXPathParserHTML

2 string references to 'FeedsXPathParserHTML'
FeedsXPathParseHTMLTestCase::test in tests/feeds_xpathparser_parser_html.test
Run tests.
feeds_xpathparser_feeds_plugins in ./feeds_xpathparser.module
Implements hook_feeds_plugins().

File

./FeedsXPathParserHTML.inc, line 11
Contains FeedsXPathParserHTML.

View source
class FeedsXPathParserHTML extends FeedsXPathParserBase {

  /**
   * Whether this version of PHP has a useable saveHTML() method.
   *
   * @var bool
   */
  protected $hasSaveHTML = FALSE;

  /**
   * Constructs the parser and detects DOMDocument::saveHTML($node) support.
   *
   * @param mixed $id
   *   Handed unchanged to the parent constructor.
   */
  public function __construct($id) {
    parent::__construct($id);

    // DOMDocument::saveHTML() cannot take $node as an argument prior to 5.3.6.
    $this->hasSaveHTML = version_compare(PHP_VERSION, '5.3.6', '>=');
  }

  /**
   * Builds a DOMDocument from the fetched HTML.
   *
   * The raw document is optionally repaired with tidy, stripped of null bytes
   * and then loaded with DOMDocument::loadHTML() while the parser's own error
   * handling is active.
   *
   * @param array $source_config
   *   The source configuration. The keys read here are:
   *   - $source_config['exp']['tidy']: when non-empty (and the tidy extension
   *     is loaded) the document is run through tidy_repair_string() first.
   *   - $source_config['exp']['tidy_encoding']: the encoding passed to tidy;
   *     'UTF8' is used when it is missing or empty.
   *   - $source_config['exp']['errors']: passed through to errorStop().
   * @param FeedsFetcherResult $fetcher_result
   *   The fetcher result whose getRaw() value is parsed.
   *
   * @return DOMDocument
   *   The loaded document.
   *
   * @throws Exception
   *   If the document is empty or cannot be parsed.
   */
  protected function setup($source_config, FeedsFetcherResult $fetcher_result) {
    $raw = $fetcher_result->getRaw();

    if (!empty($source_config['exp']['tidy']) && extension_loaded('tidy')) {
      $config = array(
        'merge-divs' => FALSE,
        'merge-spans' => FALSE,
        'join-styles' => FALSE,
        'drop-empty-paras' => FALSE,
        'wrap' => 0,
        'tidy-mark' => FALSE,
        'escape-cdata' => TRUE,
        'word-2000' => TRUE,
      );

      // Default tidy encoding is UTF8. Apply the default explicitly so that a
      // missing or empty setting does not hand NULL to tidy.
      $encoding = !empty($source_config['exp']['tidy_encoding']) ? $source_config['exp']['tidy_encoding'] : 'UTF8';
      $raw = tidy_repair_string($raw, $config, $encoding);
    }

    // Some versions of PHP do not handle null bytes. The cast also turns a
    // FALSE result from tidy_repair_string() into an empty string.
    $raw = str_replace("\0", '', (string) $raw);

    // DOMDocument::loadHTML() rejects an empty document: with a warning on
    // older PHP versions and with a ValueError on PHP 8. Fail here, before
    // custom error handling is started, so that errorStop() is never skipped
    // and callers always receive the same Exception.
    if ($raw === '') {
      throw new Exception(t('There was an error parsing the HTML document.'));
    }

    $document = new DOMDocument();
    $document->strictErrorChecking = FALSE;
    $document->recover = TRUE;

    // Use our own error handling.
    $use = $this->errorStart();

    // The $options argument of loadHTML() only exists since PHP 5.4.
    if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
      $options = LIBXML_NONET;
      $options |= defined('LIBXML_COMPACT') ? LIBXML_COMPACT : 0;
      $options |= defined('LIBXML_PARSEHUGE') ? LIBXML_PARSEHUGE : 0;
      $success = $document->loadHTML($raw, $options);
    }
    else {
      $success = $document->loadHTML($raw);
    }

    $this->errorStop($use, $source_config['exp']['errors']);

    if (!$success) {
      throw new Exception(t('There was an error parsing the HTML document.'));
    }

    return $document;
  }

  /**
   * Returns the markup of a node as a string.
   *
   * @param DOMNode $node
   *   The node to serialize.
   *
   * @return string
   *   The result of saveHTML($node) when that is supported by this PHP
   *   version, otherwise the result of saveXML() with LIBXML_NOEMPTYTAG.
   */
  protected function getRaw(DOMNode $node) {
    if ($this->hasSaveHTML) {
      return $this->doc->saveHTML($node);
    }

    // Fallback for PHP versions where saveHTML() does not accept a node.
    return $this->doc->saveXML($node, LIBXML_NOEMPTYTAG);
  }

}

Members

Namesort descending Modifiers Type Description Overrides
FeedsXPathParserBase::$doc protected property The DOMDocument used for parsing.
FeedsXPathParserBase::$loader protected property The return value of libxml_disable_entity_loader().
FeedsXPathParserBase::$rawXML protected property The elements that should be displayed in raw XML.
FeedsXPathParserBase::$xpath protected property The DOMXPath object used for parsing.
FeedsXPathParserBase::configDefaults public function Overrides parent::configDefaults().
FeedsXPathParserBase::configForm public function Overrides parent::configForm().
FeedsXPathParserBase::configFormValidate public function Overrides parent::configFormValidate().
FeedsXPathParserBase::errorStart protected function Starts custom error handling.
FeedsXPathParserBase::errorStop protected function Stops custom error handling.
FeedsXPathParserBase::filterMappings protected function Filters mappings, returning the ones that belong to us.
FeedsXPathParserBase::getMappingSources public function Overrides parent::getMappingSources().
FeedsXPathParserBase::getOwnMappings protected function Gets the mappings that are defined by this parser.
FeedsXPathParserBase::getUniques protected function Gets the unique mappings targets that are used by this parser.
FeedsXPathParserBase::hasSourceConfig public function Overrides parent::hasSourceConfig().
FeedsXPathParserBase::parse public function Implements FeedsParser::parse().
FeedsXPathParserBase::parseSourceElement protected function Parses one item from the context array.
FeedsXPathParserBase::sourceDefaults public function Overrides parent::sourceDefaults().
FeedsXPathParserBase::sourceForm public function Overrides parent::sourceForm().
FeedsXPathParserBase::sourceFormValidate public function Overrides parent::sourceFormValidate().
FeedsXPathParserHTML::$hasSaveHTML protected property Whether this version of PHP has a useable saveHTML() method.
FeedsXPathParserHTML::getRaw protected function Helper callback to return the raw value. Overrides FeedsXPathParserBase::getRaw
FeedsXPathParserHTML::setup protected function Classes that use FeedsXPathParserBase must implement this. Overrides FeedsXPathParserBase::setup
FeedsXPathParserHTML::__construct public function