You are here

function _linkchecker_check_links in Link checker 6.2

Same name and namespace in other branches
  1. 7 linkchecker.module \_linkchecker_check_links()

Run link checks.

1 call to _linkchecker_check_links()
linkchecker_cron in ./linkchecker.module
Implementation of hook_cron().
1 string reference to '_linkchecker_check_links'
linkchecker_cron in ./linkchecker.module
Implementation of hook_cron().

File

./linkchecker.module, line 375
This module periodically checks links in given node types, blocks, CCK fields, etc.

Code

/**
 * Run link checks.
 *
 * Loads a batch of links from {linkchecker_links} whose last check is older
 * than the configured interval and whose status is 1, requests each URL
 * (through HTTPRL when that library is selected and available, otherwise
 * through drupal_http_request()) and passes every response to
 * _linkchecker_status_handling().
 *
 * The run is guarded by a lock named after this function so that only one
 * process checks links at a time.
 *
 * @return bool
 *   FALSE if the lock could not be acquired because another run is still in
 *   progress, TRUE once this run has finished.
 */
function _linkchecker_check_links() {

  // Get max_execution_time from configuration, override 0 with 240 seconds.
  $max_execution_time = ini_get('max_execution_time');
  if ($max_execution_time == 0) {
    $max_execution_time = 240;
  }

  // Make sure we have enough time to validate all of the links.
  linkchecker_set_time_limit($max_execution_time);

  // Make sure this is the only process trying to run this function.
  if (!lock_acquire(__FUNCTION__, $max_execution_time)) {
    watchdog('linkchecker', 'Attempted to re-run link checks while they are already running.', array(), WATCHDOG_WARNING);
    return FALSE;
  }
  $has_httprl = module_exists('httprl') && variable_get('linkchecker_check_library', 'core') == 'httprl';

  // Do not confuse admins with a setting of maximum checkable links per cron
  // run and guess that 2 links can be checked per second with 1 thread, what is
  // nevertheless uncommon. The max_execution_time can be used to calculate
  // a useful value that is higher, but not totally out of scope and limits the
  // query result set to a reasonable size.
  $linkchecker_check_connections_max = variable_get('linkchecker_check_connections_max', 8);
  $check_links_max_per_cron_run = $has_httprl ? $linkchecker_check_connections_max * $max_execution_time : $max_execution_time;
  $linkchecker_check_links_interval = variable_get('linkchecker_check_links_interval', 2419200);
  $linkchecker_check_useragent = variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)');

  // Connection limit can be overridden via settings.php. Two connections is the
  // limit defined in RFC http://www.ietf.org/rfc/rfc2616.txt. Modern browsers
  // are typically using 6-8 connections and no more. Never use more and keep
  // in mind that you can overload other people servers.
  $linkchecker_check_domain_connections = variable_get('linkchecker_check_domain_connections', 2);

  // Get URLs for checking.
  $links = db_query_range("SELECT * FROM {linkchecker_links} WHERE last_checked < %d AND status = %d ORDER BY last_checked, lid ASC", time() - $linkchecker_check_links_interval, 1, 0, $check_links_max_per_cron_run);

  // Number of requests handed over to HTTPRL that still have to be sent.
  // Counting them here avoids a second COUNT query: a LIMIT on an aggregate
  // query does not cap the counted rows, so such a counter would never reach
  // zero whenever more links are due than fit into one run, and the queued
  // requests would never be sent.
  $queued_requests = 0;

  while ($link = db_fetch_object($links)) {
    $headers = array();
    $headers['User-Agent'] = $linkchecker_check_useragent;

    // parse_url() may emit a warning on malformed URLs; suppress it and let
    // the !empty() check below cope with a FALSE result.
    $uri = @parse_url($link->url);

    // URL contains a fragment.
    if (in_array($link->method, array(
      'HEAD',
      'GET',
    )) && !empty($uri['fragment'])) {

      // We need the full content and not only the HEAD.
      $link->method = 'GET';

      // Request text content only (like Firefox/Chrome).
      $headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
    }
    elseif ($link->method == 'GET') {

      // Range: Only request bytes 0-1024 from the remote server. This is
      // required to prevent timeouts on URLs that are large downloads.
      $headers['Range'] = 'bytes=0-1024';
    }

    // Add in the headers.
    $options = array(
      'headers' => $headers,
      'method' => $link->method,
      'max_redirects' => 0,
    );
    if ($has_httprl) {

      // Define the callback and add the $link object to it.
      // Notes:
      // - 'global_timeout' does not require a timer_read('page'), as this job
      //   runs in a new process, independent of cron.
      $options += array(
        'global_connections' => $linkchecker_check_connections_max,
        'global_timeout' => $max_execution_time - 30,
        'domain_connections' => $linkchecker_check_domain_connections,
        'callback' => array(
          array(
            'function' => '_linkchecker_status_handling',
          ),
          $link,
        ),
      );

      // Queue up the request; all queued requests are sent after the loop.
      httprl_request($link->url, $options);
      $queued_requests++;
    }
    else {

      // Drupal core
      $response = drupal_http_request($link->url, $options['headers'], $options['method'], NULL, $options['max_redirects']);

      // Add 'redirect_code' property to core response object for consistency
      // with HTTPRL object.
      if ($response->code == 301 && !isset($response->redirect_code)) {
        $response->redirect_code = $response->code;
      }

      // Add 'uri' property to core response object for 'fragment' check and
      // consistency with HTTPRL object.
      $response->uri = $uri;
      _linkchecker_status_handling($response, $link);

      // Stop once we have used over half of the maximum execution time.
      if (timer_read('page') / 1000 > $max_execution_time / 2) {
        break;
      }
    }
  }

  // After all links are queued, run the url checks.
  if ($queued_requests > 0) {
    httprl_send_request();
  }

  // Release the lock.
  lock_release(__FUNCTION__);
  watchdog('linkchecker', 'Link checks completed.', array(), WATCHDOG_INFO);

  // Peak memory usage is only available in PHP >= 5.2.
  if (version_compare(phpversion(), '5.2.0', '>=')) {
    watchdog('linkchecker', 'Memory usage: @memory_get_usage, Peak memory usage: @memory_get_peak_usage.', array(
      '@memory_get_peak_usage' => format_size(memory_get_peak_usage()),
      '@memory_get_usage' => format_size(memory_get_usage()),
    ), WATCHDOG_DEBUG);
  }
  else {
    watchdog('linkchecker', 'Memory usage: @memory_get_usage.', array(
      '@memory_get_usage' => format_size(memory_get_usage()),
    ), WATCHDOG_DEBUG);
  }
  return TRUE;
}