You are here

abstract class FeedsCrawlerBase in Feeds Crawler 7.2

The fetcher class that implements crawling.

Hierarchy

Expanded class hierarchy of FeedsCrawlerBase

1 string reference to 'FeedsCrawlerBase'
feeds_crawler_feeds_plugins in ./feeds_crawler.feeds.inc
Implements hook_feeds_plugins().

File

src/FeedsCrawlerBase.php, line 11
Contains FeedsCrawlerBase.

View source
/**
 * Base fetcher that walks from one page to the next across batch runs.
 *
 * Each call to fetch() returns the result for the current URL and stores the
 * URL found by getNextUrl() in the fetch state so the following call picks it
 * up. Subclasses only have to implement getNextUrl().
 */
abstract class FeedsCrawlerBase extends FeedsHTTPFetcher {

  /**
   * Subclasses must override this to return the next URL.
   *
   * @param FeedsSource $source
   *   The feed source.
   * @param string $current_url
   *   The current URL being fetched.
   *
   * @return string
   *   The next URL.
   *
   * @throws FeedsCrawlerLinkNotFoundException
   *   Thrown if the next link could not be found.
   */
  abstract protected function getNextUrl(FeedsSource $source, $current_url);

  /**
   * {@inheritdoc}
   *
   * Returns the fetcher result for the current page and records the next page
   * in the fetch state. When no next link is found, or the page budget is used
   * up, the fetch stage is marked complete and the source is flagged as
   * crawled.
   */
  public function fetch(FeedsSource $source) {
    $source_config = $source
      ->getConfigFor($this);

    // If this is only configured to execute on the first run, and that run
    // already happened, fetch just the source URL without crawling. The
    // 'crawled' key may be missing from a stored source config, so test it
    // with !empty() exactly like sourceForm() does.
    if ($this->config['first_run'] && !empty($source_config['crawled'])) {
      return $this
        ->getFetcherResult($source_config['source']);
    }
    $state = $source
      ->state(FEEDS_FETCH);
    $this
      ->beginFetch($source, $state);

    // Continue from the URL stored by the previous call, or start at the
    // configured source URL.
    $url = isset($state->next_url) ? $state->next_url : $source_config['source'];
    try {
      $state->next_url = $this
        ->getNextUrl($source, $url);
      $this
        ->endFetch($source, $state);
    }
    catch (FeedsCrawlerLinkNotFoundException $e) {
      // No further link: the current page is the last one, so finish the
      // fetch stage regardless of how many pages were budgeted.
      $state
        ->progress(1, 1);
    }
    if ($state->progress == FEEDS_BATCH_COMPLETE) {
      // Remember that this source has been crawled so that "first run only"
      // importers skip crawling next time.
      $source_config['crawled'] = TRUE;
      $source
        ->setConfigFor($this, $source_config);
    }
    return $this
      ->getFetcherResult($url);
  }

  /**
   * {@inheritdoc}
   *
   * Also resets the 'crawled' flag so the next import crawls again.
   */
  public function clear(FeedsSource $source) {
    parent::clear($source);
    $source_config = $source
      ->getConfigFor($this);
    $source_config['crawled'] = FALSE;
    $source
      ->setConfigFor($this, $source_config);
  }

  /**
   * {@inheritdoc}
   */
  public function configDefaults() {
    // Array union: the crawler's keys win over any parent key with the same
    // name.
    return array(
      'num_pages' => 10,
      'first_run' => TRUE,
      'delay' => 1,
    ) + parent::configDefaults();
  }

  /**
   * {@inheritdoc}
   */
  public function hasConfigForm() {
    return TRUE;
  }

  /**
   * {@inheritdoc}
   */
  public function configForm(&$form_state) {
    $form = array();
    $form['num_pages'] = array(
      '#type' => 'textfield',
      '#title' => t('Page count'),
      '#description' => t('The number of pages to crawl. Set to 0 for unlimited.'),
      '#default_value' => $this->config['num_pages'],
      // Zero is a documented value (unlimited), so the strictly-positive
      // validator cannot be used here. Negative values are rejected in
      // configFormValidate().
      '#element_validate' => array(
        'element_validate_integer',
      ),
    );
    $form['delay'] = array(
      '#type' => 'textfield',
      '#title' => t('Delay'),
      '#description' => t('The delay (in seconds) to wait between fetching pages.'),
      '#default_value' => $this->config['delay'],
      '#element_validate' => array(
        'element_validate_integer',
      ),
    );
    $form['first_run'] = array(
      '#type' => 'checkbox',
      '#title' => t('First run'),
      '#description' => t('Only crawl pages on the first run of the import.'),
      '#default_value' => $this->config['first_run'],
    );
    return $form + parent::configForm($form_state);
  }

  /**
   * {@inheritdoc}
   *
   * Casts the numeric settings to integers and rejects negative values.
   */
  public function configFormValidate(&$values) {
    $values['num_pages'] = (int) $values['num_pages'];
    $values['delay'] = (int) $values['delay'];
    if ($values['num_pages'] < 0) {
      form_set_error('num_pages', t('%name must be greater than or equal to zero.', array('%name' => t('Page count'))));
    }
    if ($values['delay'] < 0) {
      form_set_error('delay', t('<em class="placeholder">Delay</em> must be greater than or equal to zero.'));
    }
  }

  /**
   * {@inheritdoc}
   */
  public function sourceDefaults() {
    return array(
      'crawled' => FALSE,
    ) + parent::sourceDefaults();
  }

  /**
   * {@inheritdoc}
   *
   * Adds a hidden 'crawled' element carrying the current crawled flag.
   */
  public function sourceForm($source_config) {
    $form = parent::sourceForm($source_config);
    $form['crawled'] = array(
      '#type' => 'hidden',
      '#value' => !empty($source_config['crawled']),
    );
    return $form;
  }

  /**
   * {@inheritdoc}
   *
   * Fills in any source value that was not submitted with its default.
   */
  public function sourceFormValidate(&$values) {
    $values += $this
      ->sourceDefaults();
  }

  /**
   * Called before fetching the next link.
   *
   * Subclasses can override this to manage state.
   *
   * On the first call of a crawl (total is still 0) this initialises the page
   * budget and returns without waiting. On every later call it waits for the
   * configured delay.
   *
   * @param FeedsSource $source
   *   The feed source.
   * @param FeedsState $state
   *   The state object.
   */
  protected function beginFetch(FeedsSource $source, FeedsState $state) {
    if ($state->total == 0) {
      // A page count of 0 means "unlimited"; use a large finite budget so
      // progress can still be computed.
      $state->total = $this->config['num_pages'] == 0 ? 100000 : $this->config['num_pages'];
      $state->count = $state->total;
      return;
    }

    // Only sleep for positive delays: a negative value (for example from
    // imported configuration that bypassed the form) would make sleep() fail.
    $delay = (int) $this->config['delay'];
    if ($delay > 0) {
      sleep($delay);
    }
  }

  /**
   * Called after fetching the next link.
   *
   * Consumes one page from the remaining budget and updates the progress of
   * the fetch stage accordingly.
   *
   * @param FeedsSource $source
   *   The feed source.
   * @param FeedsState $state
   *   The state object.
   */
  protected function endFetch(FeedsSource $source, FeedsState $state) {
    $state->count--;
    $state
      ->progress($state->total, $state->total - $state->count);
  }

  /**
   * Returns a new fetcher result object.
   *
   * @param string $url
   *   The URL for the fetcher result.
   *
   * @return FeedsHTTPFetcherResult
   *   A fetcher result object.
   */
  protected function getFetcherResult($url) {
    $result = new FeedsHTTPFetcherResult($url);

    // When request_timeout is empty, the global value is used.
    $result
      ->setTimeout($this->config['request_timeout']);
    $result
      ->setAcceptInvalidCert($this->config['accept_invalid_cert']);
    return $result;
  }

  /**
   * Builds an absolute URL.
   *
   * @param string $url
   *   The URL to make absolute.
   * @param string $base_url
   *   The base url to reference.
   *
   * @return string
   *   The absolute URL.
   */
  protected function makeUrlAbsolute($url, $base_url) {
    // http_request_create_absolute_url() lives in the Feeds http_request
    // library, which has to be loaded before the call.
    feeds_include_library('http_request.inc', 'http_request');
    return http_request_create_absolute_url($url, $base_url);
  }

}

Members

Namesort descending Modifiers Type Description Overrides
FeedsCrawlerBase::beginFetch protected function Called before fetching the next link. 1
FeedsCrawlerBase::clear public function
FeedsCrawlerBase::configDefaults public function
FeedsCrawlerBase::configForm public function
FeedsCrawlerBase::configFormValidate public function
FeedsCrawlerBase::endFetch protected function Called after fetching the next link.
FeedsCrawlerBase::fetch public function
FeedsCrawlerBase::getFetcherResult protected function Returns a new fetcher result object.
FeedsCrawlerBase::getNextUrl abstract protected function Subclasses must override this to return the next URL. 2
FeedsCrawlerBase::hasConfigForm public function
FeedsCrawlerBase::makeUrlAbsolute protected function Builds an absolute URL.
FeedsCrawlerBase::sourceDefaults public function 1
FeedsCrawlerBase::sourceForm public function 1
FeedsCrawlerBase::sourceFormValidate public function