class FeedsCrawler in Feeds Crawler 6.2
Same name and namespace in other branches
- 7 FeedsCrawler.inc \FeedsCrawler
Fetches data via HTTP.
Hierarchy
- class \FeedsConfigurable
  - class \FeedsPlugin implements FeedsSourceInterface
    - class \FeedsFetcher
      - class \FeedsHTTPFetcher
        - class \FeedsCrawler
Expanded class hierarchy of FeedsCrawler
File
- ./
FeedsCrawler.inc, line 11 - Home of the FeedsCrawler.
View source
/**
 * HTTP fetcher that follows "next page" links across successive fetches.
 *
 * Each call to fetch() downloads one page. The URL of the following page is
 * discovered by one of three strategies, tried in this order and only when
 * enabled in the source configuration:
 *  - auto: a link with rel="next" in the fetched XML feed,
 *  - url pattern: a URL template in which "$index" is replaced by a counter,
 *  - xpath: a user supplied XPath expression evaluated against the page.
 *
 * Progress (pages left, next URL, current index) is kept between calls in a
 * Drupal variable named "feeds_crawler_<importer id>_<feed nid>".
 */
class FeedsCrawler extends FeedsHTTPFetcher {

  /**
   * Implements FeedsFetcher::fetch().
   *
   * @param FeedsSource $source
   *   The source being imported.
   *
   * @return FeedsImportBatch
   *   A batch for the page to import on this call.
   */
  public function fetch(FeedsSource $source) {
    $source_config = $source->getConfigFor($this);

    // Use a variable to store state. --hack.
    $state_key = 'feeds_crawler_' . $this->id . '_' . $source->feed_nid;
    $state = variable_get($state_key, new stdClass());

    // Support Pubsubhubbub: pushed content is imported as is, no crawling.
    if ($this->config['use_pubsubhubbub'] && ($raw = $this->subscriber($source->feed_nid)->receive())) {
      $state->count = 0;
      variable_set($state_key, $state);
      return new FeedsImportBatch($raw, $source->feed_nid);
    }

    $config = $source_config['crawler'];

    if (empty($state->total)) {
      // First run: initialise the page budget. 0 means "unlimited", which is
      // approximated by a very large number of pages.
      $state->total = $config['num_pages'] == 0 ? 100000 : $config['num_pages'];
      $state->count = $state->total;
      $url = $source_config['source'];
    }
    else {
      // A previous run may have ended without ever finding a next link; fall
      // back to the source URL instead of fetching an undefined URL.
      $url = isset($state->next_url) ? $state->next_url : $source_config['source'];
    }

    $result = new FeedsHTTPBatch($url, $source->feed_nid);

    // Read the flag through a local copy so a missing key does not raise a
    // notice.
    $all_source_config = $source->config;
    $crawled = !empty($all_source_config['FeedsCrawler']['crawled']);

    // "Crawl on first run only": once the crawl is flagged as done, behave
    // like a plain HTTP fetcher.
    if (!empty($config['first_run']) && $crawled) {
      $state->count = 0;
      variable_set($state_key, $state);
      return $result;
    }

    // sleep() only accepts a non-negative integer number of seconds.
    $delay = isset($config['delay']) ? (int) $config['delay'] : 0;
    if ($delay > 0) {
      sleep($delay);
    }

    $href = $this->detectNextUrl($result, $source_config, $state);
    if ($href != FALSE) {
      $state->next_url = $href;
      $state->count--;
      if ($state->count <= 0) {
        // NOTE(review): if FeedsSource exposes config through __get(), this
        // indirect write may not persist - confirm against FeedsSource.
        $source->config['FeedsCrawler']['crawled'] = TRUE;
      }
    }
    else {
      // No further page could be found: stop crawling.
      // NOTE(review): the crawl is not flagged as 'crawled' on this path;
      // confirm whether that is intended.
      $state->count = 0;
    }

    variable_set($state_key, $state);
    return $result;
  }

  /**
   * Runs the enabled link detection strategies until one yields a URL.
   *
   * Strategies are tried in the order auto, URL pattern, XPath. Evaluation
   * stops at the first hit, so the URL pattern counter in $state only
   * advances when the auto strategy is disabled or found nothing.
   *
   * @param FeedsHTTPBatch $result
   *   The batch for the current page.
   * @param array $source_config
   *   Source configuration of this fetcher.
   * @param object $state
   *   Crawler state; may be modified by the URL pattern strategy.
   *
   * @return string|false
   *   The next URL, or FALSE if none was found.
   */
  private function detectNextUrl($result, $source_config, $state) {
    $config = $source_config['crawler'];

    if (!empty($config['auto'])) {
      $href = $this->parseAuto($result, $source_config);
      if ($href != FALSE) {
        return $href;
      }
    }
    if (!empty($config['url']['url_pattern'])) {
      $href = $this->parseUrl($config, $state);
      if ($href != FALSE) {
        return $href;
      }
    }
    if (!empty($config['xpath'])) {
      $href = $this->parseXPath($result, $source_config);
      if ($href != FALSE) {
        return $href;
      }
    }
    return FALSE;
  }

  /**
   * Builds the next URL from the configured URL pattern.
   *
   * The counter is stored in $state->inc: it starts at the configured initial
   * value and grows by the configured increment on each later call. Empty or
   * non-numeric settings fall back to 0 (initial) and 1 (increment), the same
   * values configDefaults() uses.
   *
   * @param array $config
   *   The 'crawler' section of the source configuration.
   * @param object $state
   *   Crawler state; $state->inc is created or advanced.
   *
   * @return string
   *   The URL pattern with every '$index' replaced by the counter.
   */
  private function parseUrl($config, $state) {
    $url_config = $config['url'];

    if (!isset($state->inc)) {
      $has_initial = isset($url_config['initial']) && is_numeric($url_config['initial']);
      $state->inc = $has_initial ? $url_config['initial'] + 0 : 0;
    }
    else {
      $has_increment = isset($url_config['increment']) && is_numeric($url_config['increment']);
      $state->inc += $has_increment ? $url_config['increment'] + 0 : 1;
    }

    return str_replace('$index', (string) $state->inc, $url_config['url_pattern']);
  }

  /**
   * Looks for a rel="next" link in an XML feed.
   *
   * @param FeedsHTTPBatch $result
   *   The batch for the current page.
   * @param array $source_config
   *   Source configuration of this fetcher.
   *
   * @return string|false
   *   The next URL, or FALSE if the document is not well formed XML or holds
   *   no such link.
   */
  private function parseAuto($result, $source_config) {
    // SimpleXMLElement throws on malformed XML; a broken page must end the
    // crawl, not abort the import.
    $use_errors = libxml_use_internal_errors(TRUE);
    try {
      $xml = new SimpleXMLElement($result->getRaw());
    }
    catch (Exception $e) {
      $xml = NULL;
    }
    libxml_clear_errors();
    libxml_use_internal_errors($use_errors);

    // Compare against NULL: an empty SimpleXMLElement casts to FALSE.
    if ($xml === NULL) {
      return FALSE;
    }

    feeds_include_library('common_syndication_parser.inc', 'common_syndication_parser');
    $format = _parser_common_syndication_feed_format_detect($xml);

    if ($format) {
      $xml->registerXpathNamespace('atom', 'http://www.w3.org/2005/Atom');
      // Atom feeds carry the link directly below the root element, RSS feeds
      // carry it inside <channel>.
      $xpath = 'atom:link[@rel="next"]/@href | channel/atom:link[@rel="next"]/@href';
    }
    else {
      $xpath = 'link[@rel="next"]/@href';
    }

    $href = $xml->xpath($xpath);
    unset($xml);

    return $this->parseHref($href, $source_config['source']);
  }

  /**
   * Evaluates the configured XPath expression against the fetched HTML.
   *
   * @param FeedsHTTPBatch $result
   *   The batch for the current page.
   * @param array $source_config
   *   Source configuration of this fetcher.
   *
   * @return string|false
   *   The next URL, or FALSE if the page is empty, cannot be parsed or the
   *   expression matches nothing.
   */
  private function parseXPath($result, $source_config) {
    $raw = (string) $result->getRaw();
    if (trim($raw) === '') {
      // DOMDocument::loadHTML() cannot handle an empty document.
      return FALSE;
    }

    // Real world HTML is rarely valid; collect parser errors silently rather
    // than emitting a warning per markup error.
    $use_errors = libxml_use_internal_errors(TRUE);

    $dom = new DOMDocument();
    $xml = $dom->loadHTML($raw) ? simplexml_import_dom($dom) : FALSE;
    unset($dom);

    $href = FALSE;
    if ($xml instanceof SimpleXMLElement) {
      $href = $xml->xpath($source_config['crawler']['xpath']);
    }
    unset($xml);

    libxml_clear_errors();
    libxml_use_internal_errors($use_errors);

    return $this->parseHref($href, $source_config['source']);
  }

  /**
   * Turns an XPath result into an absolute URL.
   *
   * @param array|false $href
   *   Result of SimpleXMLElement::xpath().
   * @param string $source_url
   *   The source URL, used to complete links that are not absolute.
   *
   * @return string|false
   *   The first non-empty match as an absolute URL, or FALSE if there is none.
   */
  private function parseHref($href, $source_url) {
    if ($href === FALSE || empty($href)) {
      return FALSE;
    }

    // Pick the first match that is not empty.
    $url = '';
    foreach ($href as $h) {
      $h = trim((string) $h);
      if (!empty($h)) {
        $url = $h;
        break;
      }
    }
    if ($url === '') {
      // Every match was empty.
      return FALSE;
    }

    // Already absolute.
    if (stripos($url, 'http://') === 0 || stripos($url, 'https://') === 0) {
      return $url;
    }

    // Protocol-relative link: reuse the scheme of the source URL.
    if (strpos($url, '//') === 0) {
      $scheme = parse_url($source_url, PHP_URL_SCHEME);
      return ($scheme ? $scheme : 'http') . ':' . $url;
    }

    // NOTE(review): every other link is resolved against the site root, also
    // links that are relative to the current document ("page2.html",
    // "?page=2").
    return $this->baseUrl($source_url) . '/' . ltrim($url, '/');
  }

  /**
   * Breaks a url up removing everything but the http://example.com.
   *
   * @param string $url
   *   The URL to reduce.
   *
   * @return string
   *   Scheme (default http), optional credentials, host and optional port.
   */
  private function baseUrl($url) {
    $p = parse_url($url);
    if (!is_array($p)) {
      // parse_url() returns FALSE on seriously malformed URLs.
      $p = array();
    }

    $output = isset($p['scheme']) ? $p['scheme'] : 'http';
    $output .= '://';
    if (isset($p['user'])) {
      $output .= $p['user'];
    }
    if (isset($p['pass'])) {
      $output .= ':' . $p['pass'];
    }
    if (isset($p['user'])) {
      $output .= '@';
    }
    $output .= isset($p['host']) ? $p['host'] : '';
    if (isset($p['port'])) {
      $output .= ':' . $p['port'];
    }
    return $output;
  }

  /**
   * Define defaults.
   *
   * @return array
   *   The importer level configuration of this fetcher.
   */
  public function sourceDefaults() {
    return $this->config;
  }

  /**
   * Expose source form.
   *
   * @param array $source_config
   *   Current source configuration, used for the default values.
   *
   * @return array
   *   The parent's source form extended by the crawler settings.
   */
  public function sourceForm($source_config) {
    $crawler = isset($source_config['crawler']) ? $source_config['crawler'] : array();
    $url = isset($crawler['url']) ? $crawler['url'] : array();

    $form = parent::sourceForm($source_config);

    $form['crawler'] = array(
      '#type' => 'fieldset',
      '#title' => t('Feeds Crawler settings'),
      '#collapsed' => TRUE,
      '#collapsible' => TRUE,
      '#tree' => TRUE,
    );
    $form['crawler']['num_pages'] = array(
      '#type' => 'textfield',
      '#title' => t('Number of pages'),
      '#description' => t('The number of pages to fetch. 0 for unlimited'),
      '#default_value' => isset($crawler['num_pages']) ? $crawler['num_pages'] : 10,
      '#maxlength' => 10,
    );
    $form['crawler']['delay'] = array(
      '#type' => 'textfield',
      '#title' => t('Delay'),
      '#description' => t('Number of seconds to delay in between fetches.'),
      '#default_value' => isset($crawler['delay']) ? $crawler['delay'] : 1,
    );
    $form['crawler']['first_run'] = array(
      '#type' => 'checkbox',
      '#title' => t('Crawl on first run only'),
      '#description' => t('Only crawl on initial run. Use regular import afterward.'),
      '#default_value' => isset($crawler['first_run']) ? $crawler['first_run'] : FALSE,
    );
    $form['crawler']['auto'] = array(
      '#type' => 'checkbox',
      '#title' => t('Auto detect next link'),
      '#description' => t('Attempt to autodetect the next link for RSS and ATOM feeds.'),
      '#default_value' => isset($crawler['auto']) ? $crawler['auto'] : FALSE,
    );
    $form['crawler']['xpath'] = array(
      '#type' => 'textfield',
      '#title' => t('XPath selector for next link'),
      '#description' => t('The XPath selector for the next link.'),
      '#default_value' => isset($crawler['xpath']) ? $crawler['xpath'] : '',
      '#maxlength' => NULL,
    );
    $form['crawler']['url'] = array(
      '#type' => 'fieldset',
      '#title' => t('URL replacement options'),
    );
    $form['crawler']['url']['url_pattern'] = array(
      '#type' => 'textfield',
      '#title' => t('URL pattern'),
      '#description' => t('A URL with the variable $index replaced with an incrementing number. For example: http://example.com?page=$index.'),
      '#default_value' => isset($url['url_pattern']) ? $url['url_pattern'] : '',
      '#maxlength' => NULL,
    );
    $form['crawler']['url']['initial'] = array(
      '#type' => 'textfield',
      '#title' => t('Initial value of $index'),
      '#description' => t('The initial value of the $index variable.'),
      '#default_value' => isset($url['initial']) ? $url['initial'] : '',
    );
    $form['crawler']['url']['increment'] = array(
      '#type' => 'textfield',
      '#title' => t('Increment $index by'),
      '#description' => t('The amount to increment the $index variable by.'),
      '#default_value' => isset($url['increment']) ? $url['increment'] : '',
    );
    // Carries the "crawl finished" flag along with the source configuration.
    $form['crawled'] = array(
      '#type' => 'hidden',
      '#value' => isset($source_config['crawled']) ? $source_config['crawled'] : FALSE,
    );

    return $form;
  }

  /**
   * array_walk_recursive() callback: trims a value in place.
   *
   * @param mixed $value
   *   The value to trim; replaced by its trimmed string form.
   */
  public function trim(&$value) {
    $value = trim($value);
  }

  /**
   * Validates (and normalises) the crawler settings of the source form.
   *
   * All crawler values are trimmed in place; empty 'num_pages' and 'delay'
   * become 0.
   *
   * @param array $values
   *   Submitted form values; $values['crawler'] is modified.
   */
  public function sourceFormValidate(&$values) {
    if (!isset($values['crawler']) || !is_array($values['crawler'])) {
      // Nothing to validate.
      return;
    }

    $vs =& $values['crawler'];
    array_walk_recursive($vs, array($this, 'trim'));

    if (empty($vs['num_pages'])) {
      $vs['num_pages'] = 0;
    }
    if (empty($vs['delay'])) {
      $vs['delay'] = 0;
    }

    if (!empty($vs['xpath'])) {
      // Try the expression against a dummy document to catch syntax errors.
      $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?>' . "\n<items></items>");
      $use_errors = libxml_use_internal_errors(TRUE);
      // Drop errors left over from earlier parsing so they are not reported
      // as errors of this expression.
      libxml_clear_errors();
      $result = $xml->xpath($vs['xpath']);
      $error = libxml_get_last_error();
      libxml_clear_errors();
      libxml_use_internal_errors($use_errors);

      if ($error || $result === FALSE) {
        $message = $error ? $error->message : t('Invalid expression');
        form_set_error('crawler][xpath', t('There was an error with the XPath selector: ') . $message);
      }
    }

    if (!empty($vs['url']['url_pattern'])) {
      $pattern = $vs['url']['url_pattern'];
      if (stripos($pattern, 'http://') !== 0 && stripos($pattern, 'https://') !== 0) {
        form_set_error('crawler][url][url_pattern', t('The url pattern must be an absolute url. It must start with http:// or https://'));
      }
    }

    if (!empty($vs['num_pages']) && !is_int($vs['num_pages']) && !ctype_digit($vs['num_pages'])) {
      form_set_error('crawler][num_pages', t('Must be an integer.'));
    }

    if (!empty($vs['delay'])) {
      if (!is_numeric($vs['delay'])) {
        form_set_error('crawler][delay', t('Must be a number.'));
      }
      elseif ($vs['delay'] < 0) {
        // The delay is passed to sleep(), which rejects negative values.
        form_set_error('crawler][delay', t('Must not be negative.'));
      }
    }

    if (!empty($vs['url']['initial']) && !is_numeric($vs['url']['initial'])) {
      form_set_error('crawler][url][initial', t('Must be a number.'));
    }
    if (!empty($vs['url']['increment']) && !is_numeric($vs['url']['increment'])) {
      form_set_error('crawler][url][increment', t('Must be a number.'));
    }
  }

  /**
   * Override parent::configDefaults().
   *
   * @return array
   *   The parent's defaults plus the crawler defaults and the 'crawled' flag.
   */
  public function configDefaults() {
    $defaults = parent::configDefaults();
    $defaults['crawler'] = array(
      'num_pages' => 10,
      'first_run' => TRUE,
      'delay' => 1,
      'auto' => FALSE,
      'xpath' => '',
      'url' => array(
        'url_pattern' => '',
        'initial' => 0,
        'increment' => 1,
      ),
    );
    $defaults['crawled'] = FALSE;
    return $defaults;
  }

  /**
   * Override parent::configForm().
   *
   * Reuses the crawler part of the source form as importer level defaults.
   *
   * @param array $form_state
   *   The form state, passed on to the parent.
   *
   * @return array
   *   The parent's config form extended by the crawler default settings.
   */
  public function configForm(&$form_state) {
    $form = $this->sourceForm($this->config);
    // These only make sense for a concrete source.
    unset($form['source']);
    unset($form['crawled']);
    $form['crawler']['#title'] = t('Feeds Crawler default settings');
    $form['crawler']['#collapsed'] = FALSE;
    return parent::configForm($form_state) + $form;
  }

  /**
   * Validation handler for configForm().
   *
   * @param array $values
   *   Submitted form values; validated like the source form values.
   */
  public function configFormValidate(&$values) {
    $this->sourceFormValidate($values);
  }

}
Members
Name | Modifiers | Type | Description | Overrides |
---|---|---|---|---|
FeedsConfigurable:: |
protected | property | ||
FeedsConfigurable:: |
protected | property | CTools export enabled status of this object. | |
FeedsConfigurable:: |
protected | property | ||
FeedsConfigurable:: |
protected | property | ||
FeedsConfigurable:: |
public | function | Similar to setConfig but adds to existing configuration. | 1 |
FeedsConfigurable:: |
public | function | Submission handler for configForm(). | 3 |
FeedsConfigurable:: |
public | function | Copy a configuration. | 1 |
FeedsConfigurable:: |
public | function | Determine whether this object is persistent and enabled. I. e. it is defined either in code or in the database and it is enabled. | 1 |
FeedsConfigurable:: |
public | function | Implementation of getConfig(). | 1 |
FeedsConfigurable:: |
public static | function | Instantiate a FeedsConfigurable object. | 1 |
FeedsConfigurable:: |
public | function | Set configuration. | 1 |
FeedsConfigurable:: |
public | function | Override magic method __get(). Make sure that $this->config goes through getConfig() | |
FeedsConfigurable:: |
public | function | Override magic method __isset(). This is needed due to overriding __get(). | |
FeedsCrawler:: |
private | function | Breaks a url up removing everything but the http://example.com. | |
FeedsCrawler:: |
public | function |
Override parent::configDefaults(). Overrides FeedsHTTPFetcher:: |
|
FeedsCrawler:: |
public | function |
Override parent::configForm(). Overrides FeedsHTTPFetcher:: |
|
FeedsCrawler:: |
public | function |
Validation handler for configForm(). Overrides FeedsConfigurable:: |
|
FeedsCrawler:: |
public | function |
Implements FeedsFetcher::fetch(). Overrides FeedsHTTPFetcher:: |
|
FeedsCrawler:: |
private | function | ||
FeedsCrawler:: |
private | function | ||
FeedsCrawler:: |
private | function | ||
FeedsCrawler:: |
private | function | ||
FeedsCrawler:: |
public | function |
Define defaults. Overrides FeedsPlugin:: |
|
FeedsCrawler:: |
public | function |
Expose source form. Overrides FeedsHTTPFetcher:: |
|
FeedsCrawler:: |
public | function |
Override parent::sourceFormValidate(). Overrides FeedsHTTPFetcher:: |
|
FeedsCrawler:: |
public | function | ||
FeedsFetcher:: |
public | function | Menu item definition for fetchers of this class. Note how the path component in the item definition matches the return value of FeedsFetcher::path(); | |
FeedsFetcher:: |
public | function | Construct a path for a concrete fetcher/source combination. The result of this method matches up with the general path definition in FeedsFetcher::menuItem(). For example usage look at FeedsHTTPFetcher. | |
FeedsHTTPFetcher:: |
public | function |
Clear caches. Overrides FeedsFetcher:: |
|
FeedsHTTPFetcher:: |
public | function |
Implement FeedsFetcher::importPeriod(). Overrides FeedsFetcher:: |
|
FeedsHTTPFetcher:: |
public | function |
Implements FeedsFetcher::request(). Overrides FeedsFetcher:: |
|
FeedsHTTPFetcher:: |
public | function |
Override sourceDelete() - unsubscribe from hub. Overrides FeedsPlugin:: |
|
FeedsHTTPFetcher:: |
public | function |
Override sourceSave() - subscribe to hub. Overrides FeedsPlugin:: |
|
FeedsHTTPFetcher:: |
public | function |
Implement FeedsFetcher::subscribe() - subscribe to hub. Overrides FeedsFetcher:: |
|
FeedsHTTPFetcher:: |
protected | function | Convenience method for instantiating a subscriber object. | |
FeedsHTTPFetcher:: |
public | function |
Implement FeedsFetcher::unsubscribe() - unsubscribe from hub. Overrides FeedsFetcher:: |
|
FeedsPlugin:: |
public | function |
Returns TRUE if $this->sourceForm() returns a form. Overrides FeedsSourceInterface:: |
|
FeedsPlugin:: |
protected static | function | Loads on-behalf implementations from mappers/ directory. | |
FeedsPlugin:: |
public | function |
Save changes to the configuration of this object.
Delegate saving to parent (= Feed) which will collect
information from this object by way of getConfig() and store it. Overrides FeedsConfigurable:: |
|
FeedsPlugin:: |
protected | function |
Constructor. Overrides FeedsConfigurable:: |