View source
<?php
/**
 * Declares the module's menu entries (shaped as a Drupal hook_menu() result).
 *
 * A single normal menu item is registered at admin/build/feeds_crawler. Its
 * page is the form 'feeds_crawler_admin_form', rendered via drupal_get_form()
 * and located in feeds_crawler.admin.inc.
 *
 * @return array
 *   Menu item definitions keyed by path.
 */
function feeds_crawler_menu() {
  $crawler_page = array();
  $crawler_page['title'] = 'Feeds Crawler';
  $crawler_page['page callback'] = 'drupal_get_form';
  $crawler_page['page arguments'] = array('feeds_crawler_admin_form');
  // NOTE(review): an admin/build page gated only by 'access content' looks
  // permissive -- confirm this is the intended permission.
  $crawler_page['access arguments'] = array('access content');
  $crawler_page['file'] = 'feeds_crawler.admin.inc';
  $crawler_page['type'] = MENU_NORMAL_ITEM;

  return array('admin/build/feeds_crawler' => $crawler_page);
}
/**
 * Batch operation: imports one page of a feed and queues up the next page.
 *
 * For the duration of the call the 'feeds_source_class' variable is switched
 * to 'FeedsSourceCrawler'. The previous value is restored before returning,
 * and also when the import or the link lookup throws an exception, so a
 * failed run does not leave the override behind.
 *
 * @param $importer_id
 *   Importer id, passed to feeds_source().
 * @param $nid
 *   Feed node id, passed to feeds_source().
 * @param $xpath
 *   XPath expression used to locate the link to the next page.
 * @param $offset_url
 *   URL of the first page to crawl. When empty, the URL configured for the
 *   source's FeedsHTTPFetcher is used.
 * @param $num_pages
 *   Number of pages to import; 0 means keep going until no next link is found.
 * @param $autodetect
 *   Passed through to feeds_crawler_find_next().
 * @param $html
 *   Document type flag passed through to feeds_crawler_find_next().
 * @param $context
 *   Batch context. 'sandbox' keeps 'progress' (pages imported so far) and
 *   'next_url' between calls; 'finished' is set to the completion ratio.
 *
 * @throws Exception
 *   Anything thrown while importing is re-thrown after the source class
 *   variable has been restored.
 */
function feeds_crawler_batch($importer_id, $nid, $xpath, $offset_url, $num_pages, $autodetect, $html, &$context) {
  require_once 'FeedsSourceCrawler.inc';
  $feeds_source_class = variable_get('feeds_source_class', NULL);
  variable_set('feeds_source_class', 'FeedsSourceCrawler');

  try {
    _feeds_crawler_batch_step($importer_id, $nid, $xpath, $offset_url, $num_pages, $autodetect, $html, $context);
  }
  catch (Exception $e) {
    // Do not leave the crawler class installed site-wide after a failure.
    feeds_crawler_reset_source_class($feeds_source_class);
    throw $e;
  }
  feeds_crawler_reset_source_class($feeds_source_class);
}

/**
 * Imports the page stored in the batch sandbox and records the next page.
 *
 * Private helper for feeds_crawler_batch(); takes the same arguments and
 * expects the 'feeds_source_class' override to be in place already.
 */
function _feeds_crawler_batch_step($importer_id, $nid, $xpath, $offset_url, $num_pages, $autodetect, $html, &$context) {
  $source = feeds_source($importer_id, $nid);
  // Read the configured URL before setHTTPSource() is called below.
  $configured_url = $source->config['FeedsHTTPFetcher']['source'];

  if (!isset($context['sandbox']['progress'])) {
    $context['sandbox']['progress'] = 0;
  }
  if (!isset($context['sandbox']['next_url'])) {
    $context['sandbox']['next_url'] = $offset_url ? $offset_url : $configured_url;
  }

  $source->setHTTPSource($context['sandbox']['next_url']);
  // Run this page's import to completion within the current batch call.
  while (FEEDS_BATCH_COMPLETE != $source->import()) {
  }
  $context['sandbox']['progress']++;

  // Once the requested number of pages has been imported there is nothing
  // left to look up; a missing "next" link on the last page is not an error.
  if ($num_pages > 0 && $context['sandbox']['progress'] >= $num_pages) {
    $context['finished'] = 1;
    return;
  }

  $base_url = feeds_crawler_base($configured_url);
  $next_url = feeds_crawler_find_next($autodetect, $xpath, $source->getRaw(), $base_url, $html);
  if ($next_url === FALSE) {
    $context['finished'] = 1;
    // With no page limit, running out of links is the normal way to stop.
    if ($num_pages != 0) {
      drupal_set_message(t('Unable to find the next link.'), 'error');
    }
    return;
  }

  $context['sandbox']['next_url'] = $next_url;
  // Without a page limit, progress is reported against a nominal 1000000
  // pages so the batch keeps running until no next link is found.
  $total_pages = ($num_pages != 0) ? $num_pages : 1000000;
  $context['finished'] = $context['sandbox']['progress'] / $total_pages;
}
/**
 * Finds the URL of the next page inside a fetched document.
 *
 * @param $autodetect
 *   When truthy and $html is 'html' or 'xml', $xpath is replaced by a lookup
 *   for a rel="next" link.
 * @param $xpath
 *   XPath expression selecting the next-page link (normally an @href
 *   attribute). Only the first match is used.
 * @param $raw
 *   Raw document body.
 * @param $base_url
 *   Scheme and host used to make relative links absolute; also shown in
 *   parse error messages.
 * @param $html
 *   'html' parses $raw with DOMDocument::loadHTML(); any other value parses
 *   it as XML.
 *
 * @return
 *   The absolute URL of the next page, or FALSE when the document cannot be
 *   parsed, the expression matches nothing, or the matched link is empty.
 */
function feeds_crawler_find_next($autodetect, $xpath, $raw, $base_url, $html) {
  $raw = (string) $raw;
  $is_html = ($html == 'html');
  $xml = NULL;

  // An empty body is handled as a parse failure up front: the parsers reject
  // it, and not always with a warning that "@" can silence.
  if ($raw !== '') {
    if ($is_html) {
      $dom = new DOMDocument();
      if (@$dom->loadHTML($raw) && $dom->documentElement) {
        $xml = simplexml_import_dom($dom);
      }
      unset($dom);
    }
    else {
      try {
        // "@" only hides libxml warnings; an unparseable document still
        // throws, so the exception has to be caught as well.
        $xml = @new SimpleXMLElement($raw);
      }
      catch (Exception $e) {
        $xml = NULL;
      }
    }
  }

  // Checked with instanceof because an element without children or
  // attributes evaluates to FALSE in a boolean context.
  if (!($xml instanceof SimpleXMLElement)) {
    if ($is_html) {
      drupal_set_message(t('There was an error parsing the HTML document at %url.', array(
        '%url' => $base_url,
      )), 'error');
    }
    else {
      drupal_set_message(t('There was an error parsing the XML document at %url.', array(
        '%url' => $base_url,
      )), 'error');
    }
    return FALSE;
  }

  if ($autodetect && $html == 'xml') {
    feeds_include_library('common_syndication_parser.inc', 'common_syndication_parser');
    $format = _parser_common_syndication_feed_format_detect($xml);
    if ($format) {
      $xml->registerXpathNamespace('atom', 'http://www.w3.org/2005/Atom');
      $xpath = 'atom:link[@rel="next"]/@href';
    }
    else {
      $xpath = 'link[@rel="next"]/@href';
    }
  }
  elseif ($autodetect && $is_html) {
    // The expression is evaluated from the <html> root, while <link> lives
    // in <head>, so search the whole document and select the href itself.
    $xpath = '//link[@rel="next"]/@href';
  }

  if ((string) $xpath === '') {
    return FALSE;
  }
  $matches = $xml->xpath($xpath);
  unset($xml);
  // xpath() yields FALSE on error and an empty array when nothing matches.
  if (empty($matches)) {
    return FALSE;
  }

  $href = trim((string) $matches[0]);
  if ($href === '') {
    // An empty link would otherwise resolve to the site root and be crawled.
    return FALSE;
  }

  if (preg_match('@^https?://@i', $href)) {
    return $href;
  }
  if (strpos($href, '//') === 0) {
    // Protocol-relative link: borrow the scheme of the base URL.
    $scheme = parse_url($base_url, PHP_URL_SCHEME);
    return ($scheme ? $scheme : 'http') . ':' . $href;
  }
  // Only scheme and host are known here, so relative links are resolved
  // against the site root; make sure exactly one slash joins the two parts.
  return rtrim($base_url, '/') . '/' . ltrim($href, '/');
}
/**
 * Puts the 'feeds_source_class' variable back to a previously saved value.
 *
 * @param $feeds_source_class
 *   The value to restore. When it loosely equals NULL or is the crawler's own
 *   class name ('FeedsSourceCrawler'), the variable is deleted instead of
 *   being written back.
 */
function feeds_crawler_reset_source_class($feeds_source_class) {
  $has_foreign_class = $feeds_source_class != NULL && $feeds_source_class != 'FeedsSourceCrawler';

  if ($has_foreign_class) {
    variable_set('feeds_source_class', $feeds_source_class);
    return;
  }

  // Nothing worth keeping: drop the variable altogether.
  variable_del('feeds_source_class');
}
/**
 * Reduces a URL to its origin: scheme, optional credentials, host and port.
 *
 * Path, query string and fragment are dropped and no trailing slash is
 * added, e.g. 'http://user:pw@example.com:8080/a?b=c' becomes
 * 'http://user:pw@example.com:8080'.
 *
 * @param $url
 *   The URL to reduce.
 *
 * @return
 *   The origin string. The scheme defaults to 'http' when the URL has none;
 *   a URL without a host (or one parse_url() rejects) yields just the scheme
 *   followed by '://'.
 */
function feeds_crawler_base($url) {
  $p = parse_url((string) $url);
  if (!is_array($p)) {
    // parse_url() returns FALSE for seriously malformed URLs.
    $p = array();
  }

  $output = isset($p['scheme']) ? $p['scheme'] : 'http';
  $output .= '://';
  if (isset($p['user'])) {
    $output .= $p['user'];
    if (isset($p['pass'])) {
      $output .= ':' . $p['pass'];
    }
    $output .= '@';
  }
  // Relative or host-less URLs carry no 'host' key; reading it unguarded
  // raises an undefined-index notice.
  $output .= isset($p['host']) ? $p['host'] : '';
  $output .= isset($p['port']) ? ':' . $p['port'] : '';
  return $output;
}