FeedsCrawler.inc in Feeds Crawler 7

Same filename and directory in other branches
6.2 FeedsCrawler.inc
Home of the FeedsCrawler.
File

FeedsCrawler.inc
View source
<?php

/**
 * @file
 * Home of the FeedsCrawler.
 */

/**
 * Fetches data via HTTP, crawling through multiple pages of a source.
 */
class FeedsCrawler extends FeedsHTTPFetcher {

  /**
   * Implements FeedsFetcher::fetch().
   *
   * Fetches one page per call. The first call of a crawl uses the configured
   * source URL; later calls use the URL the previous call stored in the fetch
   * state. After building the result, the next URL is looked up using, in
   * order, feed auto-detection, the URL pattern and the XPath selector. The
   * first strategy that yields a URL wins.
   *
   * @param FeedsSource $source
   *   The source being imported.
   *
   * @return FeedsFetcherResult
   *   The result for the page handled by this call.
   */
  public function fetch(FeedsSource $source) {
    $source_config = $source->getConfigFor($this);
    $config = $source_config['crawler'];

    // Content received through PubSubHubbub short-circuits the crawl.
    if ($this->config['use_pubsubhubbub'] && ($raw = $this->subscriber($source->feed_nid)->receive())) {
      return new FeedsFetcherResult($raw);
    }

    $state = $source->state(FEEDS_FETCH);
    if ($state->total == 0) {
      // First call of this crawl. A page limit of 0 means "unlimited", which
      // is represented by a budget of 100000 pages.
      $state->total = $config['num_pages'] == 0 ? 100000 : $config['num_pages'];
      $state->count = $state->total;
      $url = $source_config['source'];
    }
    else {
      $url = $state->next_url;
    }
    $result = new FeedsHTTPFetcherResult($url);

    // When crawling is restricted to the first run and that run has already
    // completed, fetch just this one page. !empty() avoids a notice when the
    // flag has never been written.
    if ($config['first_run'] && !empty($source->config['FeedsCrawler']['crawled'])) {
      $state->progress($state->total, $state->total);
      return $result;
    }

    // Throttle. Negative or non-numeric delays are treated as no delay, and
    // fractional seconds are honoured instead of being truncated by sleep().
    $delay = (isset($config['delay']) && is_numeric($config['delay'])) ? (float) $config['delay'] : 0;
    if ($delay > 0) {
      $seconds = (int) floor($delay);
      if ($seconds > 0) {
        sleep($seconds);
      }
      $microseconds = (int) round(($delay - $seconds) * 1000000);
      if ($microseconds > 0) {
        usleep($microseconds);
      }
    }

    // Try each enabled strategy in turn until one produces a next URL.
    $href = FALSE;
    if ($config['auto']) {
      $href = $this->parseAuto($result, $source_config);
    }
    if (!$href && $config['url']['url_pattern']) {
      $href = $this->parseUrl($config, $state);
    }
    if (!$href && $config['xpath']) {
      $href = $this->parseXPath($result, $source_config);
    }

    if ($href) {
      $state->next_url = $href;
      $state->count--;
      $state->progress($state->total, $state->total - $state->count);
      if ($state->count <= 0) {
        // Page budget used up: the crawl is complete.
        $source->config['FeedsCrawler']['crawled'] = TRUE;
      }
    }
    else {
      // No further page was found, so the crawl is complete as well. Record
      // that too; otherwise a source with fewer pages than the budget would
      // never count as crawled and "first run only" would never take effect.
      $state->progress($state->total, $state->total);
      $source->config['FeedsCrawler']['crawled'] = TRUE;
    }
    return $result;
  }

  /**
   * Increments a pager using a URL pattern.
   *
   * The running index is kept in $state->inc. The first call sets it to the
   * configured initial value; every later call adds the configured increment.
   * Empty or non-numeric settings fall back to 0 (initial) and 1 (increment),
   * the same values configDefaults() declares, so that a blank form field
   * cannot produce a URL with an empty index or one that never advances.
   *
   * @param array $config
   *   The crawler configuration; the 'url' key holds 'url_pattern', 'initial'
   *   and 'increment'.
   * @param object $state
   *   The fetch state. Its 'inc' property is created/updated by this method.
   *
   * @return string
   *   The URL pattern with every '$index' replaced by the current index.
   */
  protected function parseUrl($config, $state) {
    $url = $config['url'];
    if (!isset($state->inc)) {
      $has_initial = isset($url['initial']) && is_numeric($url['initial']);
      $state->inc = $has_initial ? $url['initial'] : 0;
    }
    else {
      $has_increment = isset($url['increment']) && is_numeric($url['increment']);
      $state->inc += $has_increment ? $url['increment'] : 1;
    }
    return str_replace('$index', $state->inc, $url['url_pattern']);
  }

  /**
   * Paginates using Atom's rel=next link automatically.
   *
   * Parses the fetched document as XML and looks for a link element with
   * rel="next". When the common syndication parser recognises the feed
   * format, the Atom-namespaced link is searched both directly under the root
   * element (Atom) and under "channel" (where RSS 2.0 feeds carry it).
   * Otherwise a plain, un-namespaced "link" child of the root is used.
   *
   * @param FeedsFetcherResult $result
   *   The fetcher result whose raw content is inspected.
   * @param array $source_config
   *   The source configuration; 'source' is used to resolve relative links.
   *
   * @return string|false
   *   The URL of the next page, or FALSE if the document is not well-formed
   *   XML or contains no usable next link.
   */
  protected function parseAuto($result, $source_config) {
    // Fetch the raw content first so that only the XML parsing is guarded
    // below.
    $raw = $result->getRaw();

    $errors = $this->errorStart();
    try {
      $xml = new SimpleXMLElement($raw);
    }
    catch (Exception $e) {
      // SimpleXMLElement throws on malformed input. Restore the libxml error
      // state and report "no link" so the other strategies can still run.
      $this->errorStop($errors, FALSE);
      return FALSE;
    }

    feeds_include_library('common_syndication_parser.inc', 'common_syndication_parser');
    $format = _parser_common_syndication_feed_format_detect($xml);
    if ($format) {
      $xml->registerXpathNamespace('atom', 'http://www.w3.org/2005/Atom');
      $xpath = 'atom:link[@rel="next"]/@href | channel/atom:link[@rel="next"]/@href';
    }
    else {
      $xpath = 'link[@rel="next"]/@href';
    }
    $href = $xml->xpath($xpath);
    unset($xml);
    $this->errorStop($errors, FALSE);

    return $this->parseHref($href, $source_config['source']);
  }

  /**
   * Finds the "next" link on a page via XPath.
   *
   * Loads the fetched content as HTML and evaluates the configured XPath
   * selector against it.
   *
   * @param FeedsFetcherResult $result
   *   The fetcher result whose raw content is inspected.
   * @param array $source_config
   *   The source configuration; ['crawler']['xpath'] holds the selector and
   *   'source' is used to resolve relative links.
   *
   * @return string|false
   *   The URL of the next page, or FALSE if the content is empty, cannot be
   *   turned into a document, or the selector matches nothing usable.
   */
  protected function parseXPath($result, $source_config) {
    $raw = $result->getRaw();
    // DOMDocument::loadHTML() rejects empty input, so bail out early.
    if (trim((string) $raw) === '') {
      return FALSE;
    }

    $errors = $this->errorStart();
    $dom = new DOMDocument();
    $xml = $dom->loadHTML($raw) ? simplexml_import_dom($dom) : FALSE;
    unset($dom);

    // Use instanceof rather than a truthiness check: an element without
    // children or attributes evaluates to FALSE although it is a valid
    // object, while a failed import yields a non-object.
    $href = FALSE;
    if ($xml instanceof SimpleXMLElement) {
      $href = $xml->xpath($source_config['crawler']['xpath']);
    }
    unset($xml);
    $this->errorStop($errors, FALSE);

    return $this->parseHref($href, $source_config['source']);
  }

  /**
   * Builds a fully qualified URL from the source URL if necessary.
   *
   * Picks the first non-blank entry from the XPath matches and, unless it is
   * already an absolute http(s) URL, resolves it against the source URL:
   * - "//host/path" inherits the scheme of the source URL,
   * - "/path" is appended to the scheme and host of the source URL,
   * - anything else is appended to the directory of the source URL.
   *
   * @param array|string|false $href
   *   The XPath matches (an array of strings or SimpleXMLElement objects), a
   *   single link, or FALSE.
   * @param string $source_url
   *   The URL of the source, used as the base for relative links.
   *
   * @return string|false
   *   The absolute URL, or FALSE if no non-blank link was given.
   */
  protected function parseHref($href, $source_url) {
    if (empty($href)) {
      return FALSE;
    }

    // Find the first candidate that is not blank. If none is found, return
    // FALSE rather than passing the array of matches on to string functions.
    $candidates = is_array($href) ? $href : array($href);
    $link = '';
    foreach ($candidates as $candidate) {
      $candidate = trim((string) $candidate);
      if ($candidate !== '') {
        $link = $candidate;
        break;
      }
    }
    if ($link === '') {
      return FALSE;
    }

    // Already absolute. The scheme is matched case-insensitively, as it is
    // when the URL pattern is validated.
    if (stripos($link, 'http://') === 0 || stripos($link, 'https://') === 0) {
      return $link;
    }

    // Protocol-relative link: only the scheme has to be added.
    if (substr($link, 0, 2) == '//') {
      $scheme = parse_url($source_url, PHP_URL_SCHEME);
      return ($scheme ? $scheme : 'http') . ':' . $link;
    }

    // Host-relative link.
    if (substr($link, 0, 1) == '/') {
      return $this->baseUrl($source_url) . '/' . ltrim($link, '/');
    }

    // Link relative to the directory of the source URL.
    return $this->baseUrl($source_url, TRUE) . '/' . $link;
  }

  /**
   * Reduces a URL to its scheme, credentials, host and port.
   *
   * @param string $url
   *   The URL string to return the base for.
   * @param bool $relative
   *   (Optional) If TRUE, the directory part of the URL's path (everything
   *   before its last slash) is appended as well, for resolving relative
   *   URLs. Defaults to FALSE.
   *
   * @return string
   *   The base URL without a trailing slash, e.g. http://example.com. The
   *   scheme defaults to http when the URL has none.
   */
  protected function baseUrl($url, $relative = FALSE) {
    $parts = parse_url($url);

    $scheme = 'http';
    if (isset($parts['scheme'])) {
      $scheme = $parts['scheme'];
    }

    // Assemble "user:pass@"; the "@" is only added when a user is present.
    $credentials = '';
    if (isset($parts['user'])) {
      $credentials .= $parts['user'];
    }
    if (isset($parts['pass'])) {
      $credentials .= ':' . $parts['pass'];
    }
    if (isset($parts['user'])) {
      $credentials .= '@';
    }

    $base = $scheme . '://' . $credentials . $parts['host'];
    if (isset($parts['port'])) {
      $base .= ':' . $parts['port'];
    }

    if ($relative && isset($parts['path'])) {
      // Keep the path up to, but not including, its last slash.
      $base .= substr($parts['path'], 0, strrpos($parts['path'], '/'));
    }
    return $base;
  }

  /**
   * Overrides parent::sourceDefaults().
   *
   * Uses this fetcher's own configuration as the default configuration of a
   * source.
   *
   * @return array
   *   The fetcher's configuration array ($this->config).
   */
  public function sourceDefaults() {
    return $this->config;
  }

  /**
   * Overrides parent::sourceForm().
   *
   * Extends the parent's source form with a "crawler" fieldset (page limit,
   * delay, first-run flag, auto-detection, XPath selector and URL pattern
   * options) and a hidden "crawled" value.
   *
   * @param array $source_config
   *   The current source configuration, used for the default values.
   *
   * @return array
   *   The form array.
   */
  public function sourceForm($source_config) {
    $form = parent::sourceForm($source_config);

    // Previously stored settings; empty arrays when nothing is stored yet.
    $saved = isset($source_config['crawler']) ? $source_config['crawler'] : array();
    $saved_url = isset($saved['url']) ? $saved['url'] : array();

    $crawler = array(
      '#type' => 'fieldset',
      '#title' => t('Feeds Crawler settings'),
      '#collapsed' => TRUE,
      '#collapsible' => TRUE,
      '#tree' => TRUE,
    );
    $crawler['num_pages'] = array(
      '#type' => 'textfield',
      '#title' => t('Number of pages'),
      '#description' => t('The number of pages to fetch. 0 for unlimited'),
      '#default_value' => isset($saved['num_pages']) ? $saved['num_pages'] : 10,
      '#maxlength' => 10,
    );
    $crawler['delay'] = array(
      '#type' => 'textfield',
      '#title' => t('Delay'),
      '#description' => t('Number of seconds to delay in between fetches.'),
      '#default_value' => isset($saved['delay']) ? $saved['delay'] : 1,
    );
    $crawler['first_run'] = array(
      '#type' => 'checkbox',
      '#title' => t('Crawl on first run only'),
      '#description' => t('Only crawl on initial run. Use regular import afterword.'),
      '#default_value' => isset($saved['first_run']) ? $saved['first_run'] : FALSE,
    );
    $crawler['auto'] = array(
      '#type' => 'checkbox',
      '#title' => t('Auto detect next link'),
      '#description' => t('Attempt to autodetect the next link for RSS and ATOM feeds.'),
      '#default_value' => isset($saved['auto']) ? $saved['auto'] : FALSE,
    );
    $crawler['xpath'] = array(
      '#type' => 'textfield',
      '#title' => t('XPath selector for next link'),
      '#description' => t('The XPath selector for the next link.'),
      '#default_value' => isset($saved['xpath']) ? $saved['xpath'] : '',
      '#maxlength' => NULL,
    );

    // Options for paging by substituting a counter into a URL pattern.
    $url = array(
      '#type' => 'fieldset',
      '#title' => t('URL replacement options'),
    );
    $url['url_pattern'] = array(
      '#type' => 'textfield',
      '#title' => t('URL pattern'),
      '#description' => t('A URL with the variable $index replaced with an increnting number. For example: http://example.com?page=$index.'),
      '#default_value' => isset($saved_url['url_pattern']) ? $saved_url['url_pattern'] : '',
      '#maxlength' => NULL,
    );
    $url['initial'] = array(
      '#type' => 'textfield',
      '#title' => t('Initial value of $index'),
      '#description' => t('The initial value of the $index variable.'),
      '#default_value' => isset($saved_url['initial']) ? $saved_url['initial'] : '',
    );
    $url['increment'] = array(
      '#type' => 'textfield',
      '#title' => t('Increment $index by'),
      '#description' => t('The increment the value of $index variable.'),
      '#default_value' => isset($saved_url['increment']) ? $saved_url['increment'] : '',
    );
    $crawler['url'] = $url;

    $form['crawler'] = $crawler;
    // Carries the "already crawled" flag through the form unchanged.
    $form['crawled'] = array(
      '#type' => 'hidden',
      '#value' => isset($source_config['crawled']) ? $source_config['crawled'] : FALSE,
    );
    return $form;
  }

  /**
   * Trims a string by reference.
   *
   * This is a helper method for use in array_walk_recursive(). Only string
   * values are trimmed; other values (NULL, booleans, numbers) are left as
   * they are, since passing them to trim() would turn them into strings and,
   * for NULL, trigger a deprecation notice on recent PHP versions.
   *
   * @param mixed $value
   *   The value to trim in place.
   */
  public function trim(&$value) {
    if (is_string($value)) {
      $value = trim($value);
    }
  }

  /**
   * Overrides parent::sourceFormValidate().
   *
   * Trims all submitted crawler values, defaults an empty page limit and
   * delay to 0, and flags an invalid XPath selector, a non-absolute URL
   * pattern and non-numeric or negative numeric settings.
   *
   * @param array $values
   *   The submitted form values, passed by reference. The 'crawler' element
   *   is normalised in place.
   */
  public function sourceFormValidate(&$values) {
    // Make sure there is an array to work on, so that a missing 'crawler'
    // element does not get passed to array_walk_recursive() as NULL.
    if (!isset($values['crawler']) || !is_array($values['crawler'])) {
      $values['crawler'] = array();
    }
    $vs =& $values['crawler'];
    array_walk_recursive($vs, array($this, 'trim'));

    if (empty($vs['num_pages'])) {
      $vs['num_pages'] = 0;
    }
    if (empty($vs['delay'])) {
      $vs['delay'] = 0;
    }

    if (!empty($vs['xpath'])) {
      // Evaluate the selector against a dummy document just to see whether
      // libxml accepts it.
      $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?>' . "\n<items></items>");
      $use_errors = libxml_use_internal_errors(TRUE);
      // Drop any error left over from earlier libxml use in this request;
      // otherwise libxml_get_last_error() would report it as if it belonged
      // to this selector.
      libxml_clear_errors();
      $xml->xpath($vs['xpath']);
      $error = libxml_get_last_error();
      libxml_clear_errors();
      libxml_use_internal_errors($use_errors);
      if ($error) {
        form_set_error('crawler][xpath', t('There was an error with the XPath selector: ') . $error->message);
      }
    }

    if (!empty($vs['url']['url_pattern'])) {
      $pattern = $vs['url']['url_pattern'];
      if (stripos($pattern, 'http://') !== 0 && stripos($pattern, 'https://') !== 0) {
        form_set_error('crawler][url][url_pattern', t('The url pattern must be an absolute url. It must start with http:// or https://'));
      }
    }

    if (!empty($vs['num_pages']) && !is_int($vs['num_pages']) && !ctype_digit($vs['num_pages'])) {
      form_set_error('crawler][num_pages', t('Must be an integer.'));
    }

    if (!empty($vs['delay'])) {
      if (!is_numeric($vs['delay'])) {
        form_set_error('crawler][delay', t('Must be a number.'));
      }
      elseif ($vs['delay'] < 0) {
        // The delay is a sleep duration, so a negative value is meaningless.
        form_set_error('crawler][delay', t('Must not be negative.'));
      }
    }

    if (!empty($vs['url']['initial']) && !is_numeric($vs['url']['initial'])) {
      form_set_error('crawler][url][initial', t('Must be a number.'));
    }
    if (!empty($vs['url']['increment']) && !is_numeric($vs['url']['increment'])) {
      form_set_error('crawler][url][increment', t('Must be a number.'));
    }
  }

  /**
   * Overrides parent::configDefaults().
   *
   * Adds the crawler settings and the "crawled" flag to the parent's default
   * configuration.
   *
   * @return array
   *   The default configuration.
   */
  public function configDefaults() {
    // Defaults for paging by URL pattern.
    $url_defaults = array(
      'url_pattern' => '',
      'initial' => 0,
      'increment' => 1,
    );
    $crawler_defaults = array(
      'num_pages' => 10,
      'first_run' => TRUE,
      'delay' => 1,
      'auto' => FALSE,
      'xpath' => '',
      'url' => $url_defaults,
    );

    $config = parent::configDefaults();
    $config['crawler'] = $crawler_defaults;
    $config['crawled'] = FALSE;
    return $config;
  }

  /**
   * Overrides parent::configForm().
   *
   * Reuses the source form, built from the fetcher's own configuration, for
   * the crawler settings, minus its 'source' and 'crawled' elements, and
   * merges it into the parent's configuration form. Elements of the parent
   * form win when both forms define the same key.
   *
   * @param array $form_state
   *   The form state, passed on to the parent implementation.
   *
   * @return array
   *   The configuration form array.
   */
  public function configForm(&$form_state) {
    $crawler_form = $this->sourceForm($this->config);
    unset($crawler_form['source'], $crawler_form['crawled']);
    $crawler_form['crawler']['#title'] = t('Feeds Crawler default settings');

    $parent_form = parent::configForm($form_state);
    return $parent_form + $crawler_form;
  }

  /**
   * Overrides parent::configFormValidate().
   *
   * Delegates to sourceFormValidate(), so the configuration form is validated
   * by the same rules as the source form.
   *
   * @param array $values
   *   The submitted form values, passed by reference.
   */
  public function configFormValidate(&$values) {
    $this
      ->sourceFormValidate($values);
  }

  /**
   * Starts custom error handling.
   *
   * Switches libxml to internal error handling. Pass the returned value to
   * errorStop() to restore the previous setting.
   *
   * @return bool
   *   The previous value of use_errors.
   */
  protected function errorStart() {
    return libxml_use_internal_errors(TRUE);
  }

  /**
   * Stops custom error handling.
   *
   * Optionally reports the collected libxml errors as Drupal messages, then
   * clears them and restores the given error handling setting.
   *
   * @param bool $use
   *   The previous value of use_errors.
   * @param bool $print
   *   (Optional) Whether to print errors to the screen. Defaults to TRUE.
   */
  protected function errorStop($use, $print = TRUE) {
    if ($print) {
      foreach (libxml_get_errors() as $error) {
        switch ($error->level) {
          case LIBXML_ERR_FATAL:
            $type = 'error';
            break;

          case LIBXML_ERR_WARNING:
          case LIBXML_ERR_ERROR:
          default:
            // Any other level is reported as a warning too, so that $type is
            // always defined and never carried over from a previous error.
            $type = 'warning';
            break;
        }
        $args = array(
          '%error' => trim($error->message),
          '%num' => $error->line,
          '%code' => $error->code,
        );
        $message = t('%error on line %num. Error code: %code', $args);
        drupal_set_message($message, $type, FALSE);
      }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($use);
  }

}
Classes

Name	Description
FeedsCrawler	Fetches data via HTTP.
You are here

FeedsCrawler.inc in Feeds Crawler 7

File

Classes

API Navigation