You are here

feeds_crawler.module in Feeds Crawler 6

File

feeds_crawler.module
View source
<?php

/**
 * Implements hook_menu().
 *
 * Registers one normal menu item at admin/build/feeds_crawler that renders
 * the 'feeds_crawler_admin_form' form through drupal_get_form(). The form
 * builder is loaded from feeds_crawler.admin.inc.
 *
 * @return
 *   An array of menu items keyed by path.
 */
function feeds_crawler_menu() {
  // NOTE(review): the page lives under admin/build but is gated only by the
  // 'access content' permission -- confirm that this is intentional.
  $crawler_page = array(
    'title' => 'Feeds Crawler',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('feeds_crawler_admin_form'),
    'access arguments' => array('access content'),
    'file' => 'feeds_crawler.admin.inc',
    'type' => MENU_NORMAL_ITEM,
  );

  return array(
    'admin/build/feeds_crawler' => $crawler_page,
  );
}

/**
 * Batch callback: imports one page of a feed and queues the following page.
 *
 * While it runs, the 'feeds_source_class' variable is switched to
 * 'FeedsSourceCrawler'. The previous value is restored before the function
 * returns, and also when an exception escapes from the import, so the
 * override can never be left behind in the variable store.
 *
 * @param $importer_id
 *   Importer id, passed to feeds_source().
 * @param $nid
 *   Feed node id, passed to feeds_source().
 * @param $xpath
 *   XPath expression locating the "next page" link; handed to
 *   feeds_crawler_find_next().
 * @param $offset_url
 *   URL of the first page to crawl. When empty, the URL configured for the
 *   source's FeedsHTTPFetcher is used.
 * @param $num_pages
 *   Number of pages used to compute batch progress. With 0 the progress is
 *   measured against 1000000 pages, i.e. the crawl effectively runs until no
 *   next link is found.
 * @param $autodetect
 *   Whether feeds_crawler_find_next() should pick the XPath itself.
 * @param $html
 *   Document type flag ('html' or 'xml') handed to feeds_crawler_find_next().
 * @param $context
 *   Batch context. $context['sandbox']['progress'] counts imported pages,
 *   $context['sandbox']['next_url'] holds the URL to fetch on the next call,
 *   and $context['finished'] receives the completion ratio.
 */
function feeds_crawler_batch($importer_id, $nid, $xpath, $offset_url, $num_pages, $autodetect, $html, &$context) {
  require_once 'FeedsSourceCrawler.inc';

  // Remember the configured source class so it can be put back afterwards.
  $feeds_source_class = variable_get('feeds_source_class', NULL);
  variable_set('feeds_source_class', 'FeedsSourceCrawler');

  try {
    $source = feeds_source($importer_id, $nid);
    $configured_url = $source->config['FeedsHTTPFetcher']['source'];

    if (!isset($context['sandbox']['progress'])) {
      $context['sandbox']['progress'] = 0;
    }
    if (!$offset_url) {
      $offset_url = $configured_url;
    }
    if (!isset($context['sandbox']['next_url'])) {
      $context['sandbox']['next_url'] = $offset_url;
    }

    // Links that are not absolute are resolved against the configured
    // source's scheme and host, not against the page currently being fetched.
    $base_url = feeds_crawler_base($configured_url);

    $source->setHTTPSource($context['sandbox']['next_url']);
    // Run the import of this page to completion within the current call.
    while (FEEDS_BATCH_COMPLETE != $source->import()) {
    }

    $next_url = feeds_crawler_find_next($autodetect, $xpath, $source->getRaw(), $base_url, $html);
  }
  catch (Exception $e) {
    // variable_set() is persistent: without this the crawler class would stay
    // configured after a failed fetch or parse.
    feeds_crawler_reset_source_class($feeds_source_class);
    throw $e;
  }

  feeds_crawler_reset_source_class($feeds_source_class);

  if ($next_url === FALSE) {
    $context['finished'] = 1;
    // Running out of links is only an error when a page count was requested.
    if ($num_pages != 0) {
      drupal_set_message(t('Unable to find the next link.'), 'error');
    }
    return;
  }

  $context['sandbox']['next_url'] = $next_url;
  $context['sandbox']['progress']++;
  if ($num_pages != 0) {
    $context['finished'] = $context['sandbox']['progress'] / $num_pages;
  }
  else {
    $context['finished'] = $context['sandbox']['progress'] / 1000000;
  }
}
/**
 * Finds the URL of the next page inside a fetched document.
 *
 * @param $autodetect
 *   When TRUE and $html is 'xml' or 'html', $xpath is ignored and a
 *   rel="next" link is looked up instead.
 * @param $xpath
 *   XPath expression evaluated against the document's root element. The
 *   first match, cast to a string, is taken as the link.
 * @param $raw
 *   The raw document text.
 * @param $base_url
 *   Scheme and host (as produced by feeds_crawler_base()) prepended to links
 *   that do not start with 'http'.
 * @param $html
 *   'html' parses $raw with DOMDocument::loadHTML(); any other value parses
 *   it as XML with SimpleXMLElement.
 *
 * @return
 *   The next URL as a string, or FALSE when the document cannot be parsed or
 *   no non-empty link is found.
 */
function feeds_crawler_find_next($autodetect, $xpath, $raw, $base_url, $html) {
  $raw = (string) $raw;
  $xml = NULL;

  if ($html == 'html') {
    $dom = new DOMDocument();
    // loadHTML() warns (or throws on newer PHP versions) for an empty string,
    // so an empty document is treated as a parse failure up front.
    $success = ($raw !== '') && @$dom->loadHTML($raw);
    // Only import when there is a root element; otherwise the import fails.
    if ($success && $dom->documentElement) {
      $xml = simplexml_import_dom($dom);
    }
    unset($dom);
    // instanceof rather than a truthiness test: an element without children
    // or attributes evaluates to FALSE even though it is a valid object.
    if (!($xml instanceof SimpleXMLElement)) {
      drupal_set_message(t('There was an error parsing the HTML document at %url.', array(
        '%url' => $base_url,
      )), 'error');
      return FALSE;
    }
  }
  else {
    try {
      // @ only silences libxml warnings; malformed XML still throws.
      $xml = @new SimpleXMLElement($raw);
    }
    catch (Exception $e) {
      drupal_set_message(t('There was an error parsing the XML document at %url.', array(
        '%url' => $base_url,
      )), 'error');
      return FALSE;
    }
  }

  if ($autodetect && $html == 'xml') {
    feeds_include_library('common_syndication_parser.inc', 'common_syndication_parser');
    $format = _parser_common_syndication_feed_format_detect($xml);
    if ($format) {
      // A feed format was detected: look for an Atom-namespaced paging link.
      $xml->registerXpathNamespace('atom', 'http://www.w3.org/2005/Atom');
      $xpath = 'atom:link[@rel="next"]/@href';
    }
    else {
      $xpath = 'link[@rel="next"]/@href';
    }
  }
  elseif ($autodetect && $html == 'html') {
    // <link> elements live inside <head>, not directly below the <html> root,
    // so search the whole tree, and select the href attribute rather than
    // the (text-less) element itself.
    $xpath = '//link[@rel="next"]/@href';
  }

  $matches = $xml->xpath($xpath);
  unset($xml);
  // xpath() yields FALSE on error and an empty array when nothing matches.
  if (empty($matches)) {
    return FALSE;
  }

  $href = trim((string) $matches[0]);
  if ($href === '') {
    // An empty link would otherwise send the crawler to the bare base URL.
    return FALSE;
  }

  if (strpos($href, 'http') !== 0) {
    if (strpos($href, '//') === 0) {
      // Protocol-relative link: reuse the scheme of the base URL.
      $scheme = parse_url($base_url, PHP_URL_SCHEME);
      $href = ($scheme ? $scheme : 'http') . ':' . $href;
    }
    else {
      // $base_url carries no trailing slash; make sure host and path stay
      // separated. Paths are resolved against the site root because the URL
      // of the current page is not available here.
      if ($href[0] !== '/') {
        $href = '/' . $href;
      }
      $href = $base_url . $href;
    }
  }
  return $href;
}
/**
 * Puts the 'feeds_source_class' variable back to a previously saved value.
 *
 * @param $feeds_source_class
 *   The value the variable had before it was overridden. When it is empty
 *   (loosely equal to NULL) or is itself 'FeedsSourceCrawler', the variable
 *   is deleted instead of being written back.
 */
function feeds_crawler_reset_source_class($feeds_source_class) {
  $nothing_saved = ($feeds_source_class == NULL);
  $is_crawler_override = ($feeds_source_class == 'FeedsSourceCrawler');

  if (!$nothing_saved && !$is_crawler_override) {
    variable_set('feeds_source_class', $feeds_source_class);
    return;
  }

  // Nothing worth restoring: drop the variable altogether.
  variable_del('feeds_source_class');
}
/**
 * Reduces a URL to its scheme://[user[:pass]@]host[:port] prefix.
 *
 * Path, query string and fragment are dropped and no trailing slash is
 * added. A missing scheme defaults to 'http'.
 *
 * @param $url
 *   The URL to reduce.
 *
 * @return
 *   The prefix as a string. When the URL has no host, or cannot be parsed at
 *   all, the host part is simply empty (e.g. 'http://').
 */
function feeds_crawler_base($url) {
  $p = parse_url($url);
  if (!is_array($p)) {
    // parse_url() returns FALSE for seriously malformed URLs.
    $p = array();
  }

  $output = '';
  $output .= isset($p['scheme']) ? $p['scheme'] : 'http';
  $output .= '://';
  $output .= isset($p['user']) ? $p['user'] : '';
  $output .= isset($p['pass']) ? ':' . $p['pass'] : '';
  $output .= isset($p['user']) ? '@' : '';
  // The host is absent for relative URLs; do not read an undefined index.
  $output .= isset($p['host']) ? $p['host'] : '';
  $output .= isset($p['port']) ? ':' . $p['port'] : '';
  return $output;
}