FeedsCrawlerBase.php in Feeds Crawler 7.2
Contains FeedsCrawlerBase.
File
src/FeedsCrawlerBase.php (View source)
<?php
/**
* @file
* Contains FeedsCrawlerBase.
*/
/**
 * Base fetcher class that implements crawling across several pages.
 *
 * Each call to fetch() returns a fetcher result for one page and records the
 * URL of the following page in the fetch state. Subclasses decide how the
 * following page is located by implementing getNextUrl().
 */
abstract class FeedsCrawlerBase extends FeedsHTTPFetcher {

  /**
   * Page total used for the fetch state when the page count is unlimited (0).
   */
  const UNLIMITED_PAGES = 100000;

  /**
   * Subclasses must override this to return the next URL.
   *
   * @param FeedsSource $source
   *   The feed source.
   * @param string $current_url
   *   The current URL being fetched.
   *
   * @return string
   *   The next URL.
   *
   * @throws FeedsCrawlerLinkNotFoundException
   *   Thrown if the next link could not be found.
   */
  abstract protected function getNextUrl(FeedsSource $source, $current_url);

  /**
   * {@inheritdoc}
   *
   * Returns a fetcher result for the current page and stores the URL of the
   * next page in the FEEDS_FETCH state. When the 'first_run' option is enabled
   * and the source is already flagged as crawled, only the configured source
   * URL is returned and no crawling state is touched.
   */
  public function fetch(FeedsSource $source) {
    $source_config = $source->getConfigFor($this);

    // If this is only configured to execute on the first run. The 'crawled'
    // flag is read with !empty() so that a source configuration lacking the
    // key is treated as "not crawled yet" instead of raising a notice.
    if ($this->config['first_run'] && !empty($source_config['crawled'])) {
      return $this->getFetcherResult($source_config['source']);
    }

    $state = $source->state(FEEDS_FETCH);
    $this->beginFetch($source, $state);

    // Continue from the URL stored by the previous call, or start at the
    // configured source URL.
    $url = isset($state->next_url) ? $state->next_url : $source_config['source'];

    try {
      $state->next_url = $this->getNextUrl($source, $url);
      $this->endFetch($source, $state);
    }
    catch (FeedsCrawlerLinkNotFoundException $e) {
      // No further link: mark the fetch stage as finished.
      $state->progress(1, 1);
    }

    // Remember that this source has been crawled completely.
    if ($state->progress == FEEDS_BATCH_COMPLETE) {
      $source_config['crawled'] = TRUE;
      $source->setConfigFor($this, $source_config);
    }

    return $this->getFetcherResult($url);
  }

  /**
   * {@inheritdoc}
   *
   * Additionally resets the 'crawled' flag of the source configuration.
   */
  public function clear(FeedsSource $source) {
    parent::clear($source);

    $source_config = $source->getConfigFor($this);
    $source_config['crawled'] = FALSE;
    $source->setConfigFor($this, $source_config);
  }

  /**
   * {@inheritdoc}
   *
   * Adds the crawler options: 'num_pages' (0 means unlimited), 'first_run'
   * and 'delay' (seconds between page fetches).
   */
  public function configDefaults() {
    return array(
      'num_pages' => 10,
      'first_run' => TRUE,
      'delay' => 1,
    ) + parent::configDefaults();
  }

  /**
   * {@inheritdoc}
   */
  public function hasConfigForm() {
    return TRUE;
  }

  /**
   * {@inheritdoc}
   */
  public function configForm(&$form_state) {
    $form = array();

    $form['num_pages'] = array(
      '#type' => 'textfield',
      '#title' => t('Page count'),
      '#description' => t('The number of pages to crawl. Set to 0 for unlimited.'),
      '#default_value' => $this->config['num_pages'],
      // Zero is a valid value ("unlimited"), so the strictly positive
      // validator cannot be used here. Negative values are rejected in
      // configFormValidate().
      '#element_validate' => array(
        'element_validate_integer',
      ),
    );

    $form['delay'] = array(
      '#type' => 'textfield',
      '#title' => t('Delay'),
      '#description' => t('The delay (in seconds) to wait between fetching pages.'),
      '#default_value' => $this->config['delay'],
      '#element_validate' => array(
        'element_validate_integer',
      ),
    );

    $form['first_run'] = array(
      '#type' => 'checkbox',
      '#title' => t('First run'),
      '#description' => t('Only crawl pages on the first run of the import.'),
      '#default_value' => $this->config['first_run'],
    );

    return $form + parent::configForm($form_state);
  }

  /**
   * {@inheritdoc}
   *
   * Casts 'num_pages' and 'delay' to integers and rejects negative values for
   * both.
   */
  public function configFormValidate(&$values) {
    $values['num_pages'] = (int) $values['num_pages'];
    $values['delay'] = (int) $values['delay'];

    if ($values['num_pages'] < 0) {
      form_set_error('num_pages', t('%name must be greater than or equal to zero.', array('%name' => t('Page count'))));
    }

    if ($values['delay'] < 0) {
      form_set_error('delay', t('<em class="placeholder">Delay</em> must be greater than or equal to zero.'));
    }
  }

  /**
   * {@inheritdoc}
   *
   * Adds the 'crawled' flag, which is FALSE by default.
   */
  public function sourceDefaults() {
    return array(
      'crawled' => FALSE,
    ) + parent::sourceDefaults();
  }

  /**
   * {@inheritdoc}
   *
   * Carries the 'crawled' flag through the source form as a hidden element.
   */
  public function sourceForm($source_config) {
    $form = parent::sourceForm($source_config);
    $form['crawled'] = array(
      '#type' => 'hidden',
      '#value' => !empty($source_config['crawled']),
    );
    return $form;
  }

  /**
   * {@inheritdoc}
   *
   * Fills in any missing source values with the defaults.
   *
   * NOTE(review): the parent implementation is not invoked here; confirm that
   * skipping whatever FeedsHTTPFetcher::sourceFormValidate() does is intended.
   */
  public function sourceFormValidate(&$values) {
    $values += $this->sourceDefaults();
  }

  /**
   * Called before fetching the next link.
   *
   * On the first call of a crawl (state total is 0) this initializes the page
   * counters and returns immediately. On later calls it waits for the
   * configured delay.
   *
   * Subclasses can override this to manage state.
   *
   * @param FeedsSource $source
   *   The feed source.
   * @param FeedsState $state
   *   The state object.
   */
  protected function beginFetch(FeedsSource $source, FeedsState $state) {
    if ($state->total == 0) {
      // A page count of zero means "unlimited"; a negative stored value is
      // treated the same way rather than producing a negative total.
      $pages = (int) $this->config['num_pages'];
      $state->total = $pages > 0 ? $pages : self::UNLIMITED_PAGES;
      $state->count = $state->total;
      return;
    }

    // Only sleep for a positive delay: sleep() rejects negative values.
    $delay = (int) $this->config['delay'];
    if ($delay > 0) {
      sleep($delay);
    }
  }

  /**
   * Called after fetching the next link.
   *
   * Decrements the remaining page count and updates the state's progress.
   *
   * @param FeedsSource $source
   *   The feed source.
   * @param FeedsState $state
   *   The state object.
   */
  protected function endFetch(FeedsSource $source, FeedsState $state) {
    $state->count--;
    $state->progress($state->total, $state->total - $state->count);
  }

  /**
   * Returns a new fetcher result object.
   *
   * @param string $url
   *   The URL for the fetcher result.
   *
   * @return FeedsHTTPFetcherResult
   *   A fetcher result object.
   */
  protected function getFetcherResult($url) {
    $result = new FeedsHTTPFetcherResult($url);
    // When request_timeout is empty, the global value is used.
    $result->setTimeout($this->config['request_timeout']);
    $result->setAcceptInvalidCert($this->config['accept_invalid_cert']);
    return $result;
  }

  /**
   * Builds an absolute URL.
   *
   * @param string $url
   *   The URL to make absolute.
   * @param string $base_url
   *   The base url to reference.
   *
   * @return string
   *   The absolute URL.
   */
  protected function makeUrlAbsolute($url, $base_url) {
    feeds_include_library('http_request.inc', 'http_request');
    return http_request_create_absolute_url($url, $base_url);
  }

}
/**
 * Exception signalling that no next link could be located.
 */
class FeedsCrawlerLinkNotFoundException extends RuntimeException {}
Classes
Name | Description |
---|---|
FeedsCrawlerBase | The fetcher class that implements crawling. |
FeedsCrawlerLinkNotFoundException | Thrown when the next link was not found. |