You are here

protected function FeedsXPathParserHTML::setup in Feeds XPath Parser 7

Same name and namespace in other branches
  1. 6 FeedsXPathParserHTML.inc \FeedsXPathParserHTML::setup()

Classes that use FeedsXPathParserBase must implement this.

Parameters

array $source_config: The configuration for the source.

FeedsFetcherResult $fetcher_result: A FeedsFetcherResult object.

Return value

DOMDocument The DOMDocument to perform XPath queries on.

Overrides FeedsXPathParserBase::setup

File

./FeedsXPathParserHTML.inc, line 35
Contains FeedsXPathParserHTML.

Class

FeedsXPathParserHTML
XPath parsing for HTML.

Code

/**
 * Builds the DOMDocument that XPath queries are run against.
 *
 * The raw fetcher result is optionally repaired with the tidy extension,
 * stripped of null bytes and then parsed as HTML. libxml errors raised while
 * parsing are routed through this parser's own error handling
 * (errorStart()/errorStop()).
 *
 * @param array $source_config
 *   The configuration for the source. The keys read here all live under
 *   'exp': 'tidy', 'tidy_encoding' and 'errors'.
 * @param FeedsFetcherResult $fetcher_result
 *   A FeedsFetcherResult object.
 *
 * @return DOMDocument
 *   The DOMDocument to perform XPath queries on.
 *
 * @throws Exception
 *   Thrown when the document is empty or could not be parsed as HTML.
 */
protected function setup($source_config, FeedsFetcherResult $fetcher_result) {
  $raw = $fetcher_result->getRaw();

  if (!empty($source_config['exp']['tidy']) && extension_loaded('tidy')) {
    $config = array(
      'merge-divs' => FALSE,
      'merge-spans' => FALSE,
      'join-styles' => FALSE,
      'drop-empty-paras' => FALSE,
      'wrap' => 0,
      'tidy-mark' => FALSE,
      'escape-cdata' => TRUE,
      'word-2000' => TRUE,
    );

    // Default tidy encoding is UTF8. Fall back to it when the source
    // configuration does not carry an explicit value.
    $encoding = !empty($source_config['exp']['tidy_encoding']) ? $source_config['exp']['tidy_encoding'] : 'UTF8';

    // tidy_repair_string() returns FALSE on failure; the string cast below
    // turns that into an empty document, which is rejected explicitly.
    $raw = tidy_repair_string($raw, $config, $encoding);
  }

  // Some versions of PHP do not handle null bytes.
  $raw = str_replace("\0", '', (string) $raw);

  // DOMDocument::loadHTML() cannot parse an empty string: older PHP versions
  // emit a warning and return FALSE, PHP 8 throws a ValueError. Fail early
  // with the regular parse exception so callers see the same error on every
  // PHP version and error handling is never left half-enabled.
  if ($raw === '') {
    throw new Exception(t('There was an error parsing the HTML document.'));
  }

  $document = new DOMDocument();
  $document->strictErrorChecking = FALSE;
  $document->recover = TRUE;

  // Use our own error handling.
  $use = $this->errorStart();

  // The $options argument of loadHTML() only exists since PHP 5.4.
  if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
    // Never fetch anything over the network while parsing.
    $options = LIBXML_NONET;
    $options |= defined('LIBXML_COMPACT') ? LIBXML_COMPACT : 0;
    $options |= defined('LIBXML_PARSEHUGE') ? LIBXML_PARSEHUGE : 0;
    $success = $document->loadHTML($raw, $options);
  }
  else {
    $success = $document->loadHTML($raw);
  }

  $this->errorStop($use, $source_config['exp']['errors']);

  if (!$success) {
    throw new Exception(t('There was an error parsing the HTML document.'));
  }

  return $document;
}