FeedsCrawler.inc in Feeds Crawler 6.2

Same filename and directory in other branches
7 FeedsCrawler.inc
Home of the FeedsCrawler.
File

FeedsCrawler.inc
View source
<?php

/**
 * @file
 * Home of the FeedsCrawler.
 */

/**
 * Fetches data via HTTP and follows "next page" links across imports.
 *
 * Crawl progress (pages remaining, next URL, URL-pattern index) is kept in a
 * Drupal variable named 'feeds_crawler_<importer id>_<feed nid>'.
 */
class FeedsCrawler extends FeedsHTTPFetcher {

  /**
   * Implements FeedsFetcher::fetch().
   *
   * Returns a batch for the current page and records, in the persisted state,
   * which URL should be fetched on the next call.
   *
   * @param FeedsSource $source
   *   The source being imported.
   *
   * @return FeedsImportBatch
   *   A batch for the page to import on this run.
   */
  public function fetch(FeedsSource $source) {
    $source_config = $source->getConfigFor($this);
    $state_key = 'feeds_crawler_' . $this->id . '_' . $source->feed_nid;

    // Use a variable to store state. --hack.
    $state = variable_get($state_key, new stdClass());

    // Support Pubsubhubbub: pushed content short-circuits crawling.
    if (!empty($this->config['use_pubsubhubbub']) && ($raw = $this->subscriber($source->feed_nid)->receive())) {
      $state->count = 0;
      variable_set($state_key, $state);
      return new FeedsImportBatch($raw, $source->feed_nid);
    }

    $config = $source_config['crawler'];

    if (empty($state->total)) {
      // First run: 0 pages means "unlimited", approximated by a large number.
      $state->total = $config['num_pages'] == 0 ? 100000 : $config['num_pages'];
      $state->count = $state->total;
      $url = $source_config['source'];
    }
    elseif (!empty($state->next_url)) {
      $url = $state->next_url;
    }
    else {
      // No next link was ever stored (e.g. the first run found none); start
      // over from the source URL instead of fetching an undefined URL.
      $url = $source_config['source'];
    }
    $result = new FeedsHTTPBatch($url, $source->feed_nid);

    // Read through a local copy so a missing key does not raise a notice.
    $all_config = $source->config;
    $crawled = isset($all_config['FeedsCrawler']['crawled']) && $all_config['FeedsCrawler']['crawled'];
    if (!empty($config['first_run']) && $crawled) {
      $state->count = 0;
      variable_set($state_key, $state);
      return $result;
    }

    $this->crawlerDelay(isset($config['delay']) ? $config['delay'] : 0);

    $href = $this->crawlerNextUrl($result, $source_config, $state);
    if ($href) {
      $state->next_url = $href;
      $state->count--;
      if ($state->count <= 0) {
        // NOTE(review): if FeedsSource exposes config through a magic getter
        // this write may not persist -- confirm against FeedsSource.
        $source->config['FeedsCrawler']['crawled'] = TRUE;
      }
    }
    else {
      $state->count = 0;
    }
    variable_set($state_key, $state);
    return $result;
  }

  /**
   * Waits between fetches, honouring fractional seconds.
   *
   * Non-numeric, zero and negative values are ignored.
   *
   * @param mixed $seconds
   *   The configured delay in seconds.
   */
  private function crawlerDelay($seconds) {
    if (!is_numeric($seconds) || $seconds <= 0) {
      return;
    }
    $seconds = (float) $seconds;
    $whole = (int) floor($seconds);
    $micro = (int) round(($seconds - $whole) * 1000000);
    if ($whole > 0) {
      sleep($whole);
    }
    if ($micro > 0) {
      usleep($micro);
    }
  }

  /**
   * Determines the next URL to crawl.
   *
   * Strategies are tried in order: auto detection, URL pattern, XPath. Later
   * strategies only run when the earlier ones are disabled or find nothing.
   *
   * @param object $result
   *   The batch for the current page; must provide getRaw().
   * @param array $source_config
   *   The source configuration for this fetcher.
   * @param object $state
   *   The persisted crawl state; parseUrl() may update its 'inc' property.
   *
   * @return string|false
   *   The next URL, or FALSE if none was found.
   */
  private function crawlerNextUrl($result, $source_config, $state) {
    $config = $source_config['crawler'];

    if (!empty($config['auto'])) {
      $href = $this->parseAuto($result, $source_config);
      if ($href) {
        return $href;
      }
    }
    if (!empty($config['url']['url_pattern'])) {
      $href = $this->parseUrl($config, $state);
      if ($href) {
        return $href;
      }
    }
    if (!empty($config['xpath'])) {
      $href = $this->parseXPath($result, $source_config);
      if ($href) {
        return $href;
      }
    }
    return FALSE;
  }

  /**
   * Builds the next URL by substituting $index in the configured pattern.
   *
   * @param array $config
   *   The crawler configuration; reads the 'url' sub-array.
   * @param object $state
   *   The persisted crawl state; its 'inc' property is initialised or advanced.
   *
   * @return string
   *   The URL pattern with '$index' replaced by the current index.
   */
  private function parseUrl($config, $state) {
    $url = $config['url'];
    if (!isset($state->inc) || !is_numeric($state->inc)) {
      // An empty form field means "use the default" (0).
      $state->inc = (isset($url['initial']) && is_numeric($url['initial'])) ? $url['initial'] : 0;
    }
    else {
      // An empty increment would never advance the index; default to 1.
      $step = (isset($url['increment']) && is_numeric($url['increment'])) ? $url['increment'] + 0 : 1;
      $state->inc += $step;
    }
    return str_replace('$index', (string) $state->inc, $url['url_pattern']);
  }

  /**
   * Looks for a rel="next" link in the fetched feed.
   *
   * @param object $result
   *   The batch for the current page; must provide getRaw().
   * @param array $source_config
   *   The source configuration; 'source' is used to resolve relative links.
   *
   * @return string|false
   *   The absolute next URL, or FALSE if the document is not parseable XML or
   *   has no usable next link.
   */
  private function parseAuto($result, $source_config) {
    // Kept outside the try block so fetch errors are not swallowed.
    $raw = $result->getRaw();
    if (trim((string) $raw) === '') {
      return FALSE;
    }

    $use_errors = libxml_use_internal_errors(TRUE);
    try {
      $xml = new SimpleXMLElement($raw);
    }
    catch (Exception $e) {
      // Detection is best effort; malformed XML just means "no next link".
      $xml = NULL;
    }
    libxml_clear_errors();
    libxml_use_internal_errors($use_errors);
    if ($xml === NULL) {
      return FALSE;
    }

    feeds_include_library('common_syndication_parser.inc', 'common_syndication_parser');
    $format = _parser_common_syndication_feed_format_detect($xml);
    if ($format) {
      $xml->registerXpathNamespace('atom', 'http://www.w3.org/2005/Atom');
      $xpath = 'atom:link[@rel="next"]/@href';
    }
    else {
      $xpath = 'link[@rel="next"]/@href';
    }
    $href = $xml->xpath($xpath);
    unset($xml);
    return $this->parseHref($href, $source_config['source']);
  }

  /**
   * Looks for the next link using the configured XPath on the fetched HTML.
   *
   * @param object $result
   *   The batch for the current page; must provide getRaw().
   * @param array $source_config
   *   The source configuration; reads 'crawler' => 'xpath' and 'source'.
   *
   * @return string|false
   *   The absolute next URL, or FALSE if nothing matched.
   */
  private function parseXPath($result, $source_config) {
    $raw = $result->getRaw();
    // DOMDocument::loadHTML() rejects empty input.
    if (trim((string) $raw) === '') {
      return FALSE;
    }

    // Real-world HTML is rarely well formed; keep libxml warnings quiet.
    $use_errors = libxml_use_internal_errors(TRUE);
    $href = FALSE;
    $dom = new DOMDocument();
    if ($dom->loadHTML($raw)) {
      $xml = simplexml_import_dom($dom);
      // instanceof, not truthiness: an empty element casts to FALSE.
      if ($xml instanceof SimpleXMLElement) {
        $href = $xml->xpath($source_config['crawler']['xpath']);
      }
      unset($xml);
    }
    unset($dom);
    libxml_clear_errors();
    libxml_use_internal_errors($use_errors);

    return $this->parseHref($href, $source_config['source']);
  }

  /**
   * Turns an XPath result set into a single absolute URL.
   *
   * @param array|false $href
   *   The matches returned by SimpleXMLElement::xpath().
   * @param string $source_url
   *   The source URL, used to resolve relative links against its host.
   *
   * @return string|false
   *   The first non-empty match as an absolute URL, or FALSE if none exists.
   */
  private function parseHref($href, $source_url) {
    if (empty($href) || !is_array($href)) {
      return FALSE;
    }

    $candidate = '';
    foreach ($href as $h) {
      $h = trim((string) $h);
      if (!empty($h)) {
        $candidate = $h;
        break;
      }
    }
    // Every match was empty: there is no link to follow.
    if ($candidate === '') {
      return FALSE;
    }

    if (stripos($candidate, 'http://') === 0 || stripos($candidate, 'https://') === 0) {
      return $candidate;
    }

    // Protocol-relative link: reuse the scheme of the source URL.
    if (strpos($candidate, '//') === 0) {
      $parts = parse_url($source_url);
      $scheme = (is_array($parts) && isset($parts['scheme'])) ? $parts['scheme'] : 'http';
      return $scheme . ':' . $candidate;
    }

    return $this->baseUrl($source_url) . '/' . ltrim($candidate, '/');
  }

  /**
   * Breaks a url up removing everything but the http://example.com.
   *
   * @param string $url
   *   The URL to reduce.
   *
   * @return string
   *   Scheme, optional credentials, host and optional port.
   */
  private function baseUrl($url) {
    $p = parse_url($url);
    if (!is_array($p)) {
      $p = array();
    }
    $output = '';
    $output .= isset($p['scheme']) ? $p['scheme'] : 'http';
    $output .= '://';
    $output .= isset($p['user']) ? $p['user'] : '';
    $output .= isset($p['pass']) ? ':' . $p['pass'] : '';
    $output .= isset($p['user']) ? '@' : '';
    $output .= isset($p['host']) ? $p['host'] : '';
    $output .= isset($p['port']) ? ':' . $p['port'] : '';
    return $output;
  }

  /**
   * Define defaults.
   *
   * @return array
   *   The importer-level configuration, used as the per-source default.
   */
  public function sourceDefaults() {
    return $this->config;
  }

  /**
   * Expose source form.
   *
   * @param array $source_config
   *   The current source configuration, used for default values.
   *
   * @return array
   *   The parent source form extended with the crawler settings.
   */
  public function sourceForm($source_config) {
    $form = parent::sourceForm($source_config);

    $crawler = (isset($source_config['crawler']) && is_array($source_config['crawler'])) ? $source_config['crawler'] : array();
    $url = (isset($crawler['url']) && is_array($crawler['url'])) ? $crawler['url'] : array();

    $form['crawler'] = array(
      '#type' => 'fieldset',
      '#title' => t('Feeds Crawler settings'),
      '#collapsed' => TRUE,
      '#collapsible' => TRUE,
      '#tree' => TRUE,
    );
    $form['crawler']['num_pages'] = array(
      '#type' => 'textfield',
      '#title' => t('Number of pages'),
      '#description' => t('The number of pages to fetch. 0 for unlimited'),
      '#default_value' => isset($crawler['num_pages']) ? $crawler['num_pages'] : 10,
      '#maxlength' => 10,
    );
    $form['crawler']['delay'] = array(
      '#type' => 'textfield',
      '#title' => t('Delay'),
      '#description' => t('Number of seconds to delay in between fetches.'),
      '#default_value' => isset($crawler['delay']) ? $crawler['delay'] : 1,
    );
    $form['crawler']['first_run'] = array(
      '#type' => 'checkbox',
      '#title' => t('Crawl on first run only'),
      '#description' => t('Only crawl on initial run. Use regular import afterword.'),
      '#default_value' => isset($crawler['first_run']) ? $crawler['first_run'] : FALSE,
    );
    $form['crawler']['auto'] = array(
      '#type' => 'checkbox',
      '#title' => t('Auto detect next link'),
      '#description' => t('Attempt to autodetect the next link for RSS and ATOM feeds.'),
      '#default_value' => isset($crawler['auto']) ? $crawler['auto'] : FALSE,
    );
    $form['crawler']['xpath'] = array(
      '#type' => 'textfield',
      '#title' => t('XPath selector for next link'),
      '#description' => t('The XPath selector for the next link.'),
      '#default_value' => isset($crawler['xpath']) ? $crawler['xpath'] : '',
      '#maxlength' => NULL,
    );
    $form['crawler']['url'] = array(
      '#type' => 'fieldset',
      '#title' => t('URL replacement options'),
    );
    $form['crawler']['url']['url_pattern'] = array(
      '#type' => 'textfield',
      '#title' => t('URL pattern'),
      '#description' => t('A URL with the variable $index replaced with an increnting number. For example: http://example.com?page=$index.'),
      '#default_value' => isset($url['url_pattern']) ? $url['url_pattern'] : '',
      '#maxlength' => NULL,
    );
    $form['crawler']['url']['initial'] = array(
      '#type' => 'textfield',
      '#title' => t('Initial value of $index'),
      '#description' => t('The initial value of the $index variable.'),
      '#default_value' => isset($url['initial']) ? $url['initial'] : '',
    );
    $form['crawler']['url']['increment'] = array(
      '#type' => 'textfield',
      '#title' => t('Increment $index by'),
      '#description' => t('The increment the value of $index variable.'),
      '#default_value' => isset($url['increment']) ? $url['increment'] : '',
    );
    $form['crawled'] = array(
      '#type' => 'hidden',
      '#value' => isset($source_config['crawled']) ? $source_config['crawled'] : FALSE,
    );
    return $form;
  }

  /**
   * array_walk_recursive() callback: trims a submitted value in place.
   *
   * @param mixed $value
   *   The value to trim; it is replaced by its trimmed string form.
   */
  public function trim(&$value) {
    // Cast first so NULL does not trigger a deprecation on newer PHP.
    $value = trim((string) $value);
  }

  /**
   * Validates the crawler settings of the source (or importer) form.
   *
   * @param array $values
   *   The submitted values; 'crawler' is trimmed and normalised in place.
   */
  public function sourceFormValidate(&$values) {
    // Taking a reference to a missing key would create it as NULL.
    if (!isset($values['crawler']) || !is_array($values['crawler'])) {
      return;
    }
    $vs =& $values['crawler'];
    array_walk_recursive($vs, array(
      $this,
      'trim',
    ));
    if (empty($vs['num_pages'])) {
      $vs['num_pages'] = 0;
    }
    if (empty($vs['delay'])) {
      $vs['delay'] = 0;
    }
    if (!empty($vs['xpath'])) {
      // Evaluate the expression against a dummy document to catch syntax
      // errors.
      $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?>' . "\n<items></items>");
      $use_errors = libxml_use_internal_errors(TRUE);
      // Drop stale errors so only this evaluation is reported.
      libxml_clear_errors();
      $result = $xml->xpath($vs['xpath']);
      $error = libxml_get_last_error();
      libxml_clear_errors();
      libxml_use_internal_errors($use_errors);
      if ($error) {
        form_set_error('crawler][xpath', t('There was an error with the XPath selector: ') . $error->message);
      }
    }
    if (!empty($vs['url']['url_pattern']) && !(stripos($vs['url']['url_pattern'], 'http://') === 0 || stripos($vs['url']['url_pattern'], 'https://') === 0)) {
      form_set_error('crawler][url][url_pattern', t('The url pattern must be an absolute url. It must start with http:// or https://'));
    }
    if (!empty($vs['num_pages']) && !is_int($vs['num_pages']) && !ctype_digit($vs['num_pages'])) {
      form_set_error('crawler][num_pages', t('Must be an integer.'));
    }
    if (!empty($vs['delay']) && !is_numeric($vs['delay'])) {
      form_set_error('crawler][delay', t('Must be a number.'));
    }
    if (!empty($vs['url']['initial']) && !is_numeric($vs['url']['initial'])) {
      form_set_error('crawler][url][initial', t('Must be a number.'));
    }
    if (!empty($vs['url']['increment']) && !is_numeric($vs['url']['increment'])) {
      form_set_error('crawler][url][increment', t('Must be a number.'));
    }
  }

  /**
   * Override parent::configDefaults().
   *
   * @return array
   *   The parent defaults plus the crawler defaults.
   */
  public function configDefaults() {
    $defaults = parent::configDefaults();
    $defaults['crawler'] = array(
      'num_pages' => 10,
      'first_run' => TRUE,
      'delay' => 1,
      'auto' => FALSE,
      'xpath' => '',
      'url' => array(
        'url_pattern' => '',
        'initial' => 0,
        'increment' => 1,
      ),
    );
    $defaults['crawled'] = FALSE;
    return $defaults;
  }

  /**
   * Override parent::configForm().
   *
   * Reuses the source form for the importer-level defaults, without the
   * per-source 'source' and 'crawled' elements.
   */
  public function configForm(&$form_state) {
    $form = $this->sourceForm($this->config);
    unset($form['source']);
    unset($form['crawled']);
    $form['crawler']['#title'] = t('Feeds Crawler default settings');
    $form['crawler']['#collapsed'] = FALSE;
    return parent::configForm($form_state) + $form;
  }

  /**
   * Validates the importer-level form with the source form rules.
   *
   * @param array $values
   *   The submitted configuration values.
   */
  public function configFormValidate(&$values) {
    $this->sourceFormValidate($values);
  }

}
Classes

Name	Description
FeedsCrawler	Fetches data via HTTP.
You are here

FeedsCrawler.inc in Feeds Crawler 6.2

File

Classes

API Navigation