You are here

function _linkchecker_check_links in Link checker 7

Same name and namespace in other branches
  1. 6.2 linkchecker.module \_linkchecker_check_links()

Run link checks.

2 calls to _linkchecker_check_links()
drush_linkchecker_check in ./linkchecker.drush.inc
Callback for command linkchecker-check.
linkchecker_cron in ./linkchecker.module
Implements hook_cron().
1 string reference to '_linkchecker_check_links'
linkchecker_cron in ./linkchecker.module
Implements hook_cron().

File

./linkchecker.module, line 477
This module periodically checks links in given node types, blocks, etc.

Code

/**
 * Run link checks.
 *
 * Selects links from {linkchecker_link} that are enabled (status = 1) and
 * whose last check is older than the configured check interval, requests each
 * URL either through HTTPRL (when that module is enabled and selected as the
 * check library) or through drupal_http_request(), and passes every response
 * to _linkchecker_status_handling().
 *
 * The function guards itself with a lock named after the function, so only one
 * process runs the checks at a time.
 *
 * @return bool
 *   FALSE if the lock could not be acquired because another run is already in
 *   progress, TRUE once this run has finished.
 */
function _linkchecker_check_links() {

  // Get max_execution_time from configuration, override 0 with 240 seconds.
  // The setting is read once and cast to an integer so that an unset or empty
  // value is treated like 0 regardless of PHP's string comparison rules.
  $max_execution_time = (int) ini_get('max_execution_time');
  if ($max_execution_time == 0) {
    $max_execution_time = 240;
  }

  // Make sure we have enough time to validate all of the links.
  drupal_set_time_limit($max_execution_time);

  // Make sure this is the only process trying to run this function.
  if (!lock_acquire(__FUNCTION__, $max_execution_time)) {
    linkchecker_watchdog_log('linkchecker', 'Attempted to re-run link checks while they are already running.', array(), WATCHDOG_WARNING);
    return FALSE;
  }
  $has_httprl = module_exists('httprl') && variable_get('linkchecker_check_library', 'core') == 'httprl';

  // There is no admin setting for the maximum number of links checked per
  // run. Instead the limit is derived from max_execution_time: one link per
  // second with core, multiplied by the number of parallel connections when
  // HTTPRL is used. This keeps the query result set at a reasonable size.
  $linkchecker_check_connections_max = variable_get('linkchecker_check_connections_max', 8);
  $check_links_max_per_cron_run = $has_httprl ? $linkchecker_check_connections_max * $max_execution_time : $max_execution_time;
  // Default re-check interval: 2419200 seconds (28 days).
  $linkchecker_check_links_interval = variable_get('linkchecker_check_links_interval', 2419200);
  $linkchecker_check_useragent = variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)');

  // Connection limit can be overridden via settings.php. Two connections is the
  // limit defined in RFC https://www.ietf.org/rfc/rfc2616.txt. Modern browsers
  // are typically using 6-8 connections and no more. Never use more and keep
  // in mind that you can overload other people's servers.
  $linkchecker_check_domain_connections = variable_get('linkchecker_check_domain_connections', 2);

  // Get URLs for checking, least recently checked first.
  $links = db_query_range('SELECT * FROM {linkchecker_link} WHERE last_checked < :last_checked AND status = :status ORDER BY last_checked, lid ASC', 0, $check_links_max_per_cron_run, array(
    ':last_checked' => REQUEST_TIME - $linkchecker_check_links_interval,
    ':status' => 1,
  ));

  // Number of requests handed to HTTPRL that still have to be sent. Counting
  // what was actually queued avoids relying on rowCount() of a SELECT result,
  // which PDO does not guarantee for every database driver.
  $queued_requests = 0;

  foreach ($links as $link) {
    $headers = array();
    $headers['User-Agent'] = $linkchecker_check_useragent;
    $uri = @parse_url($link->url);

    // URL contains a fragment.
    if (in_array($link->method, array('HEAD', 'GET'), TRUE) && !empty($uri['fragment'])) {

      // We need the full content and not only the HEAD.
      $link->method = 'GET';

      // Request text content only (like Firefox/Chrome).
      $headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
    }
    elseif ($link->method == 'GET') {

      // Range: Only request the first kilobyte (bytes 0 through 1024
      // inclusive) from the remote server. This is required to prevent
      // timeouts on URLs that are large downloads.
      $headers['Range'] = 'bytes=0-1024';
    }

    // Add in the headers. Redirects are not followed ('max_redirects' is 0).
    $options = array(
      'headers' => $headers,
      'method' => $link->method,
      'max_redirects' => 0,
    );
    if ($has_httprl) {

      // Define the callback and add the $link object to it.
      // Notes:
      // - 'global_timeout' does not require a timer_read('page'), as this job
      //   runs in a new process, independent of cron.
      // - NOTE(review): 'global_timeout' becomes zero or negative when
      //   max_execution_time is 30 seconds or less; confirm how HTTPRL
      //   treats such a value.
      $options += array(
        'global_connections' => $linkchecker_check_connections_max,
        'global_timeout' => $max_execution_time - 30,
        'domain_connections' => $linkchecker_check_domain_connections,
        'callback' => array(
          array(
            'function' => '_linkchecker_status_handling',
          ),
          $link,
        ),
      );

      // Queue up the request; it is sent after the loop.
      httprl_request($link->url, $options);
      $queued_requests++;
    }
    else {

      // Drupal core.
      $response = drupal_http_request($link->url, $options);

      // Add 'redirect_code' property to core response object for consistency
      // with HTTPRL object.
      if ($response->code == 301 && !isset($response->redirect_code)) {
        $response->redirect_code = $response->code;
      }

      // Add 'uri' property to core response object for 'fragment' check and
      // consistency with HTTPRL object.
      $response->uri = $uri;
      _linkchecker_status_handling($response, $link);
      if (timer_read('page') / 1000 > $max_execution_time / 2) {

        // Stop once we have used over half of the maximum execution time.
        break;
      }
    }
  }

  // After all links are queued, run the url checks.
  if ($has_httprl && $queued_requests > 0) {
    httprl_send_request();
  }

  // Release the lock.
  lock_release(__FUNCTION__);
  linkchecker_watchdog_log('linkchecker', 'Link checks completed.', array(), WATCHDOG_INFO);
  linkchecker_watchdog_log('linkchecker', 'Memory usage: @memory_get_usage, Peak memory usage: @memory_get_peak_usage.', array(
    '@memory_get_peak_usage' => format_size(memory_get_peak_usage()),
    '@memory_get_usage' => format_size(memory_get_usage()),
  ), WATCHDOG_DEBUG);
  return TRUE;
}