You are here

linkchecker.module in Link checker 5.2

This module periodically checks links in given node types, blocks, CCK fields, etc.

Developed by Alexander Hass, http://www.yaml-for-drupal.com/.

File

linkchecker.module
View source
<?php

/**
 * @file
 * This module periodically checks links in given node types, blocks, CCK fields, etc.
 *
 * Developed by Alexander Hass, http://www.yaml-for-drupal.com/.
 */

/**
 * Chunk size used when content is scanned for links.
 *
 * The batch import functions in this file queue one job per chunk of this many
 * rows (nodes or comments) and pass the value on as the row limit of each job.
 * A value that is too high may overload the database server.
 *
 * The value is defined as a numeric string; it is used both in arithmetic and
 * as a query range limit.
 */
define('LINKCHECKER_SCAN_MAX_LINKS_PER_RUN', '100');

/**
 * A list of domain names reserved for use in documentation and not available
 * for registration. See RFC 2606, Section 3 for more information.
 *
 * The domains are separated by newlines. The list is the default value of the
 * 'linkchecker_disable_link_check_for_urls' setting, and the settings form
 * validation always merges these domains back into the submitted list.
 */
define('LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS', "example.com\nexample.net\nexample.org");

/**
 * A list of blacklisted filters that do not need to run for the link
 * extraction process. These filters only eat processing time or hold
 * references to other nodes.
 *
 * The value is a pipe-separated list of "module/delta" filter names. It is
 * split on "|" and used as the default value of the
 * 'linkchecker_filter_blacklist' setting.
 *
 * - Line break converter, http://drupal.org/project/drupal
 *     name: filter/2
 * - Insert block, http://drupal.org/project/insert_block
 *     name: insert_block/0
 *     tags: [block:name of module=delta of block]
 * - Insert node, http://drupal.org/project/InsertNode
 *     name: insert_node/0
 *     tags: [node:<name of node> <parameters>]
 * - Insert view filter, http://drupal.org/project/insert_view
 *     name: insert_view/0
 *     tags: [view:my_view]
 * - Smileys Filter, http://drupal.org/project/smileys
 *     name: smileys/0
 *     tags: Depends on icon set, e.g.: ":) :-) :smile:"
 * - Weblink filter, http://drupal.org/project/links
 *     name: links_weblink/0
 *     tags: [weblink:node_id|text], [weblink:node_id/link_id], [weblink:http://weblink.example.com/]
 * - Web Links Embed, http://drupal.org/project/weblinks
 *     name: weblinks_embed/0
 *     tags: [links-embed: id], [links-embed: name]
 * - Web Links Filter, http://drupal.org/project/weblinks
 *     name: weblinks_filter/0
 *     tags: [link: title]
 */
define('LINKCHECKER_DEFAULT_FILTER_BLACKLIST', 'filter/2|insert_block/0|insert_node/0|insert_view/0|smileys/0|links_weblink/0|weblinks_embed/0|weblinks_filter/0');

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names defined by this module.
 */
function linkchecker_perm() {
  $permissions = array();
  $permissions[] = 'access broken links report';
  $permissions[] = 'administer linkchecker';
  $permissions[] = 'edit link settings';
  return $permissions;
}

/**
 * Implementation of hook_help().
 *
 * Provides the module help text and, on the broken links report, shows a
 * warning message while there are still links that have never been checked.
 *
 * @param $section
 *   The help section that is requested.
 *
 * @return
 *   The help text for 'admin/help#linkchecker'; nothing for other sections.
 */
function linkchecker_help($section) {
  if ($section == 'admin/help#linkchecker') {
    $help_text = t('This module provides an aid to finding broken links on your site. It periodically checks contents of all public nodes, tries to find any html links and check for their validity. It reports broken links through the admin interface. For more information about status codes see <a href="@rfc">Status Code Definitions</a>.', array(
      '@rfc' => 'http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html',
    ));
    return '<p>' . $help_text . '</p>';
  }

  if ($section == 'admin/logs/linkchecker') {
    // Count the enabled links (status 1) that have never been checked.
    $unchecked_count = db_result(db_query('SELECT COUNT(1) FROM {linkchecker_links} WHERE last_checked = %d AND status = %d', 0, 1));
    if ($unchecked_count > 0) {
      $total_count = db_result(db_query('SELECT COUNT(1) FROM {linkchecker_links} WHERE status = %d', 1));

      // The @links_all placeholder is substituted with strtr() after
      // format_plural() has built the message.
      $message = format_plural($unchecked_count, 'There is 1 unchecked link of about @links_all links in the database. Please be patient until all links have been checked via cron.', 'There are @count unchecked links of about @links_all links in the database. Please be patient until all links have been checked via cron.');
      $replacements = array(
        '@links_all' => $total_count,
      );
      drupal_set_message(strtr($message, $replacements), 'warning');
    }
  }
}

/**
 * Implementation of hook_menu().
 *
 * Cacheable items: the settings page (admin/settings/linkchecker) and the
 * broken links report (admin/logs/linkchecker).
 *
 * Non-cacheable item: the link settings form at linkchecker/<lid>/edit. It is
 * only registered while such a path is being requested with a numeric link
 * ID. This avoids adding a meaningless item (for example "linkchecker//edit")
 * on every other page request and keeps non-numeric IDs away from the form.
 *
 * @param $may_cache
 *   TRUE if the cacheable items are requested, FALSE for the dynamic ones.
 *
 * @return
 *   An array of menu items.
 */
function linkchecker_menu($may_cache) {
  $items = array();
  if ($may_cache) {
    $items[] = array(
      'path' => 'admin/settings/linkchecker',
      'title' => t('Link checker'),
      'description' => t('Configure the link checker settings.'),
      'callback' => 'drupal_get_form',
      'callback arguments' => array(
        'linkchecker_admin_settings_form',
      ),
      'access' => user_access('administer linkchecker'),
    );
    $items[] = array(
      'path' => 'admin/logs/linkchecker',
      'title' => t('Broken links'),
      'description' => t('Shows a list of broken links in content.'),
      'callback' => 'linkchecker_admin_report',
      'access' => user_access('access broken links report'),
    );
  }
  elseif (arg(0) == 'linkchecker' && is_numeric(arg(1))) {
    // The second path argument is the ID of the link to edit.
    $lid = arg(1);
    $items[] = array(
      'path' => 'linkchecker/' . $lid . '/edit',
      'title' => t('Edit link settings'),
      'callback' => 'drupal_get_form',
      'callback arguments' => array(
        'linkchecker_link_edit_form',
        $lid,
      ),
      'access' => user_access('edit link settings'),
      'type' => MENU_CALLBACK,
    );
  }
  return $items;
}
/**
 * Form builder for the link checker settings page.
 *
 * The form consists of the fieldsets "General settings", "Link extraction",
 * "Check settings", "Error handling" and "Clear link data". The last one only
 * holds two maintenance buttons. The result is wrapped by
 * system_settings_form().
 *
 * @return
 *   The form array as returned by system_settings_form().
 */
function linkchecker_admin_settings_form() {
  $form = array();

  // ---------------------------------------------------------------------
  // General settings.
  // ---------------------------------------------------------------------
  $form['settings'] = array(
    '#type' => 'fieldset',
    '#title' => t('General settings'),
    '#collapsible' => FALSE,
  );
  $form['settings']['linkchecker_scan_nodetypes'] = array(
    '#type' => 'checkboxes',
    '#title' => t('Scan node types for links'),
    '#default_value' => variable_get('linkchecker_scan_nodetypes', array()),
    '#options' => array_map('check_plain', node_get_types('names')),
    '#description' => t('Enable link checking for the selected node type(s).'),
  );

  // Comment scanning depends on the comment module; show its state below the
  // checkbox description and disable the checkbox if the module is missing.
  $comment_module_enabled = module_exists('comment');
  if ($comment_module_enabled) {
    $dependency_state = t('@module (<span class="admin-enabled">enabled</span>)', array(
      '@module' => 'Comment',
    ));
  }
  else {
    $dependency_state = t('@module (<span class="admin-disabled">disabled</span>)', array(
      '@module' => 'Comment',
    ));
  }
  $comment_dependencies = '<div class="admin-dependencies">' . t('Depends on: !dependencies', array(
    '!dependencies' => $dependency_state,
  )) . '</div>';
  $form['settings']['linkchecker_scan_comments'] = array(
    '#default_value' => variable_get('linkchecker_scan_comments', 0),
    '#type' => 'checkbox',
    '#title' => t('Scan comments for links'),
    '#description' => t('Enable this checkbox if links in comments of the above selected node type(s) should be checked.') . $comment_dependencies,
    '#disabled' => !$comment_module_enabled,
  );
  $form['settings']['linkchecker_scan_blocks'] = array(
    '#default_value' => variable_get('linkchecker_scan_blocks', 0),
    '#type' => 'checkbox',
    '#title' => t('Scan blocks for links'),
    '#description' => t('Enable this checkbox if links in blocks should be checked.'),
  );
  $form['settings']['linkchecker_fqdn_only'] = array(
    '#default_value' => variable_get('linkchecker_fqdn_only', 1),
    '#type' => 'checkbox',
    '#title' => t('Check full qualified domain names only'),
    '#description' => t('Enable this checkbox if only full qualified URLs (http://example.com/foo/bar) should be checked. If unchecked, all internal (/node/123) and external (http://example.com/foo/bar) URLs will be checked.'),
  );

  // ---------------------------------------------------------------------
  // Link extraction.
  // ---------------------------------------------------------------------
  $form['tag'] = array(
    '#type' => 'fieldset',
    '#title' => t('Link extraction'),
    '#collapsible' => FALSE,
  );

  // One checkbox per group of HTML tags, keyed by the variable name of the
  // setting. The checkboxes are added to the form in this order.
  $extraction_settings = array(
    'linkchecker_extract_from_a' => array(
      'default' => 1,
      'title' => t('Extract links in <code>&lt;a&gt;</code> and <code>&lt;area&gt;</code> tags'),
      'description' => t('Enable this checkbox if normal hyperlinks should be extracted. The anchor element defines a hyperlink, the named target destination for a hyperlink, or both. The area element defines a hot-spot region on an image, and associates it with a hypertext link.'),
    ),
    'linkchecker_extract_from_audio' => array(
      'default' => 0,
      'title' => t('Extract links in <code>&lt;audio&gt;</code> tags'),
      'description' => t('Enable this checkbox if links in audio tags should be extracted. The audio element is used to embed sound content.'),
    ),
    'linkchecker_extract_from_embed' => array(
      'default' => 0,
      'title' => t('Extract links in <code>&lt;embed&gt;</code> tags'),
      'description' => t('Enable this checkbox if links in embed tags should be extracted. This is an obsolete and non-standard element that was used for embedding plugins in past and should no longer used in modern websites.'),
    ),
    'linkchecker_extract_from_iframe' => array(
      'default' => 0,
      'title' => t('Extract links in <code>&lt;iframe&gt;</code> tags'),
      'description' => t('Enable this checkbox if links in iframe tags should be extracted. The iframe element is used to embed another HTML page into a page.'),
    ),
    'linkchecker_extract_from_img' => array(
      'default' => 0,
      'title' => t('Extract links in <code>&lt;img&gt;</code> tags'),
      'description' => t('Enable this checkbox if links in image tags should be extracted. The img element is used to add images to the content.'),
    ),
    'linkchecker_extract_from_object' => array(
      'default' => 0,
      'title' => t('Extract links in <code>&lt;object&gt;</code> and <code>&lt;param&gt;</code> tags'),
      'description' => t('Enable this checkbox if multimedia and other links in object and their param tags should be extracted. The object tag is used for flash, java, quicktime and other applets.'),
    ),
    'linkchecker_extract_from_source' => array(
      'default' => 0,
      'title' => t('Extract links in <code>&lt;source&gt;</code> tags'),
      'description' => t('Enable this checkbox if links in source tags should be extracted. The source element is used to specify multiple media resources for audio and video elements.'),
    ),
    'linkchecker_extract_from_video' => array(
      'default' => 0,
      'title' => t('Extract links in <code>&lt;video&gt;</code> tags'),
      'description' => t('Enable this checkbox if links in video tags should be extracted. The video element is used in to embed video content.'),
    ),
  );
  foreach ($extraction_settings as $variable => $setting) {
    $form['tag'][$variable] = array(
      '#default_value' => variable_get($variable, $setting['default']),
      '#type' => 'checkbox',
      '#title' => $setting['title'],
      '#description' => $setting['description'],
    );
  }

  // Offer every filter available on the system, keyed by "module/delta".
  $filter_options = array();
  foreach (filter_list_all() as $filter) {
    $filter_key = $filter->module . '/' . $filter->delta;
    $filter_options[$filter_key] = $filter->name;
  }
  $default_blacklist = explode('|', LINKCHECKER_DEFAULT_FILTER_BLACKLIST);
  $form['tag']['linkchecker_filter_blacklist'] = array(
    '#type' => 'checkboxes',
    '#title' => t('Filters disabled for link extraction'),
    '#default_value' => variable_get('linkchecker_filter_blacklist', $default_blacklist),
    '#options' => $filter_options,
    '#description' => t('If a filter has been enabled for an input format it runs first and afterwards the link extraction. This helps the link checker module to find all links normally created by custom filters (e.g. Markdown filter, Bbcode). All filters used as an inline references (e.g. Weblink filter <code>[link: id]</code>) to other content and filters only wasting processing time (e.g. Line break converter) should be disabled. This setting does not have any effect on how content is shown on a page. This feature optimizes the internal link extraction process for link checker and prevents false alarms about broken links in content not having the real data of a link.'),
  );

  // ---------------------------------------------------------------------
  // Check settings.
  // ---------------------------------------------------------------------
  $form['check'] = array(
    '#type' => 'fieldset',
    '#title' => t('Check settings'),
    //'#description' => t('For simultaneous link checks it is highly recommended to install the <a href="@curl">cURL</a> library. This may be <strong>necessary</strong> on larger sites with very many links, but may still improve (speed up), link checking performance on smaller sites.', array('@curl' => 'http://www.php.net/manual/en/book.curl.php')),
    '#collapsible' => FALSE,
  );

  // User agent strings (keys) with their human readable labels (values).
  $user_agents = array(
    'Drupal (+http://drupal.org/)' => 'Drupal (+http://drupal.org/)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)' => 'Windows XP / Internet Explorer 7.0',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5' => 'Windows XP / Mozilla Firefox 3.0.5',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;)' => 'Windows Vista / Internet Explorer 7.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.0; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5' => 'Windows Vista / Mozilla Firefox 3.0.5',
  );
  $form['check']['linkchecker_check_useragent'] = array(
    '#type' => 'select',
    '#title' => t('User-Agent'),
    '#description' => t('Defines the user agent that will be used for checking links on remote sites. If someone blocks the standard Drupal user agent you can try with a more common browser.'),
    '#default_value' => variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)'),
    '#options' => $user_agents,
  );

  /* For now it's better to guess this value from max_execution_time.
     $form['check']['linkchecker_check_links_max'] = array(
      '#type' => 'select',
      '#title' => t('Check number of links per cron run'),
      '#description' => t('Defines the number of links that will be checked per cron run. The possible setting hardly depends on your PHP timeout value and remote servers speed and may increased if <a href="@curl">cURL</a> is installed and use simultaneous request checking. All cron jobs may fail, if this setting is too high!', array('@curl' => 'http://www.php.net/manual/en/book.curl.php')),
      '#default_value' => variable_get('linkchecker_check_links_max', 10),
      '#options' => drupal_map_assoc(array(5, 10, 25, 50, 100, 200, 250, 300, 350, 400, 450, 500, 750, 1000)),
    ); */

  // Re-check intervals in seconds; the labels come from format_interval().
  $check_intervals = array(
    86400,
    172800,
    259200,
    604800,
    1209600,
    2419200,
    4838400,
  );
  $form['check']['linkchecker_check_links_interval'] = array(
    '#type' => 'select',
    '#title' => t('Check interval for links'),
    '#description' => t('This interval setting defines how often cron will re-check the status of links.'),
    '#default_value' => variable_get('linkchecker_check_links_interval', 2419200),
    '#options' => drupal_map_assoc($check_intervals, 'format_interval'),
  );
  $form['check']['linkchecker_disable_link_check_for_urls'] = array(
    '#default_value' => variable_get('linkchecker_disable_link_check_for_urls', LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS),
    '#type' => 'textarea',
    '#title' => t('Do not check the link status of links containing these URLs'),
    '#description' => t('By default this list contains the domain names reserved for use in documentation and not available for registration. See <a href="@rfc-2606">RFC 2606</a>, Section 3 for more information. URLs on this list are still extracted, but the link setting <em>Check link status</em> becomes automatically disabled to prevent false alarms. If you change this list you need to clear all link data and re-analyze your content. Otherwise this setting will only affect new links added after the configuration change.', array(
      '@rfc-2606' => 'http://www.rfc-editor.org/rfc/rfc2606.txt',
    )),
    '#wysiwyg' => FALSE,
  );

  // ---------------------------------------------------------------------
  // Error handling.
  // ---------------------------------------------------------------------
  $form['error'] = array(
    '#type' => 'fieldset',
    '#title' => t('Error handling'),
    '#description' => t('Defines error handling and custom actions to be executed if specific HTTP requests are failing.'),
    '#collapsible' => FALSE,
  );
  $moved_permanently_options = array(
    0 => t('Disabled'),
    1 => t('After one failed check'),
    2 => t('After two failed checks'),
    3 => t('After three failed checks'),
    5 => t('After five failed checks'),
    10 => t('After ten failed checks'),
  );
  $form['error']['linkchecker_action_status_code_301'] = array(
    '#title' => t('Update permanently moved links'),
    '#description' => t('If enabled, outdated links in content providing a status <em>Moved Permanently</em> (status code 301) are automatically updated to the most recent URL. If used, it is recommended to use a value of <em>three</em> to make sure this is not only a temporarily change. This feature trust sites to provide a valid permanent redirect. A new node revision is automatically created on link updates if <em>create new revision</em> is enabled in the <a href="@node_types">node type</a> workflow settings. It is recommended to create new revisions for all link checker enabled node types. Link updates are nevertheless always logged in <a href="@dblog">recent log entries</a>.', array(
      '@dblog' => url('admin/logs/watchdog'),
      '@node_types' => url('admin/content/types'),
    )),
    '#type' => 'select',
    '#default_value' => variable_get('linkchecker_action_status_code_301', 0),
    '#options' => $moved_permanently_options,
  );
  $not_found_options = array(
    0 => t('Disabled'),
    1 => t('After one file not found error'),
    2 => t('After two file not found errors'),
    3 => t('After three file not found errors'),
    5 => t('After five file not found errors'),
    10 => t('After ten file not found errors'),
  );
  $form['error']['linkchecker_action_status_code_404'] = array(
    '#title' => t('Unpublish node on file not found error'),
    '#description' => t('If enabled, a node with one or more broken links (status code 404) will be unpublished and moved to moderation queue for review after the number of specified fails. If used, it is recommended to use a value of <em>three</em> to make sure this is not only a temporarily error.'),
    '#type' => 'select',
    '#default_value' => variable_get('linkchecker_action_status_code_404', 0),
    '#options' => $not_found_options,
  );
  $form['error']['linkchecker_ignore_response_codes'] = array(
    '#default_value' => variable_get('linkchecker_ignore_response_codes', "200\n302\n304\n401\n403"),
    '#type' => 'textarea',
    '#title' => t("Don't treat these response codes as errors"),
    '#description' => t('One HTTP status code per line, e.g. 403.'),
    '#wysiwyg' => FALSE,
  );

  // ---------------------------------------------------------------------
  // Maintenance buttons, only required for testing and debugging reasons.
  // ---------------------------------------------------------------------
  $clear_description = '<p>' . t('These actions will either clear all link checker tables in the database and/or analyze all selected node types, blocks and cck fields (see settings above) for new/updated/removed links. Normally there is no need to press one of these buttons. Use this only for immediate cleanup tasks and to force a full re-build of the links to be checked in the linkchecker tables. Keep in mind that all custom link settings will be lost!') . '</p>';
  $clear_description .= '<p>' . t('<strong>Note</strong>: These functions ONLY collect the links, they do not evaluate the HTTP response codes, this will be done during normal cron runs.') . '</p>';
  $form['clear'] = array(
    '#type' => 'fieldset',
    '#title' => t('Clear link data'),
    '#description' => $clear_description,
    '#collapsible' => TRUE,
    '#collapsed' => TRUE,
  );
  $form['clear']['linkchecker_analyze'] = array(
    '#type' => 'submit',
    '#value' => t('Analyze content for links'),
  );
  $form['clear']['linkchecker_clear_analyze'] = array(
    '#type' => 'submit',
    '#value' => t('Clear link data and analyze content for links'),
  );

  return system_settings_form($form);
}
/**
 * Validation handler for the link checker settings form.
 *
 * Trims both textarea values, reports every line of the response code list
 * that is not a valid response code and makes sure the reserved documentation
 * domains are always part of the list of URLs that are not checked.
 *
 * NOTE(review): the normalized values are written back into $form_values,
 * which is taken by reference. Confirm that the form API passes the values by
 * reference so that the changes reach the submit handler.
 *
 * @param $form_id
 *   The form identifier (not used).
 * @param $form_values
 *   The submitted form values; modified in place.
 */
function linkchecker_admin_settings_form_validate($form_id, &$form_values) {
  // Matches "\r\n", "\r" and "\n" line breaks.
  $line_break_pattern = '/(\\r\\n?|\\n)/';

  // Remove leading and trailing whitespace from both textarea values.
  foreach (array('linkchecker_disable_link_check_for_urls', 'linkchecker_ignore_response_codes') as $textarea) {
    $form_values[$textarea] = trim($form_values[$textarea]);
  }

  // Each line of the response code list must hold a valid response code.
  foreach (preg_split($line_break_pattern, $form_values['linkchecker_ignore_response_codes']) as $code) {
    if (_linkchecker_isvalid_response_code($code)) {
      continue;
    }
    form_set_error('linkchecker_ignore_response_codes', t('Invalid response code %code found.', array(
      '%code' => $code,
    )));
  }

  // Prevent the removal of RFC documentation domains. These are the official
  // and reserved documentation domains and not "example" hostnames! They are
  // put in front of the submitted entries; empty lines and duplicates are
  // dropped.
  $reserved_domains = explode("\n", LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS);
  $submitted_urls = array_filter(preg_split($line_break_pattern, $form_values['linkchecker_disable_link_check_for_urls']));
  $merged_urls = array_unique(array_merge($reserved_domains, $submitted_urls));
  $form_values['linkchecker_disable_link_check_for_urls'] = implode("\n", $merged_urls);
}
/**
 * Submit handler for the link checker settings form.
 *
 * The settings are always saved through system_settings_form_submit(). What
 * happens in addition depends on the pressed button:
 * - "Analyze content for links": a scan of all content is queued.
 * - "Clear link data and analyze content for links": all link checker tables
 *   are emptied and a scan of all content is queued.
 * - Any other button: scans are queued only if node types, comment scanning
 *   or block scanning have been newly enabled.
 *
 * @param $form_id
 *   The form identifier, passed on to system_settings_form_submit().
 * @param $form_values
 *   The submitted form values.
 */
function linkchecker_admin_settings_form_submit($form_id, $form_values) {

  // Exclude unnecessary elements: the two maintenance buttons are no settings.
  unset($form_values['linkchecker_analyze'], $form_values['linkchecker_clear_analyze']);

  // Submit handler per button workaround
  switch ($form_values['op']) {
    case t('Analyze content for links'):

      // Save form settings.
      system_settings_form_submit($form_id, $form_values);

      // Start batch and analyze all content.
      _linkchecker_batch_import();
      drupal_set_message(t('Content analysis has been queued for processing via cron.'));
      drupal_goto('admin/settings/linkchecker');
      break;
    case t('Clear link data and analyze content for links'):

      // Save form settings.
      system_settings_form_submit($form_id, $form_values);

      // Remove all collected link data and reset the link ID sequence.
      db_query("TRUNCATE TABLE {linkchecker_nodes}");
      db_query("TRUNCATE TABLE {linkchecker_comments}");
      db_query("TRUNCATE TABLE {linkchecker_boxes}");
      db_query("TRUNCATE TABLE {linkchecker_links}");
      db_query("DELETE FROM {sequences} WHERE name = '%s'", 'linkchecker_links_lid');

      // Start batch and analyze all content.
      _linkchecker_batch_import();
      drupal_set_message(t('Cleared link data and content analysis has been queued for processing via cron.'));
      drupal_goto('admin/settings/linkchecker');
      break;
    default:

      // Have node types or comment or block selection changed? This must be
      // determined before the settings are saved. Entries of node types that
      // are not selected carry an empty value; array_diff() reports such an
      // entry as a difference whenever the stored value holds no empty entry.
      // array_filter() removes them, so only node types that are really
      // selected in addition count as a change.
      $additional_nodetypes_selected = array_filter(array_diff($form_values['linkchecker_scan_nodetypes'], variable_get('linkchecker_scan_nodetypes', array())));
      $scan_comments_enabled = $form_values['linkchecker_scan_comments'] > variable_get('linkchecker_scan_comments', 0);
      $scan_blocks_enabled = $form_values['linkchecker_scan_blocks'] > variable_get('linkchecker_scan_blocks', 0);

      // Save form settings.
      system_settings_form_submit($form_id, $form_values);

      // Re-scan items, if node types or comment selection have been changed.
      if (!empty($additional_nodetypes_selected) || $scan_comments_enabled) {
        $node_types = array_keys(array_filter($form_values['linkchecker_scan_nodetypes']));

        // If one or more node types have been selected.
        if (!empty($node_types)) {
          _linkchecker_batch_import_nodes($node_types);
          drupal_set_message(t('Node types analysis have been queued for processing via cron.'));

          // If comment scanning of node types has been selected.
          if ($scan_comments_enabled) {
            _linkchecker_batch_import_comments($node_types);
            drupal_set_message(t('Comment analysis has been queued for processing via cron.'));
          }
        }
      }

      // If block scanning has been selected.
      if ($scan_blocks_enabled) {
        _linkchecker_batch_import_boxes();
        drupal_set_message(t('Block analysis has been queued for processing via cron.'));
      }
  }
}

/**
 * Menu callback for reporting.
 *
 * Builds a sortable, paged table of all enabled links that have been checked
 * at least once, are referenced by a node, comment or block, and whose
 * response code is not on the list of ignored response codes. Every row
 * offers links to the link settings (if permitted) and to the edit pages of
 * the content referencing the link.
 *
 * @return
 *   The HTML of the report table followed by the pager.
 */
function linkchecker_admin_report() {
  // All operation links lead back to this report.
  $destination = 'destination=admin/logs/linkchecker';

  $header = array(
    array('data' => t('URL'), 'field' => 'url', 'sort' => 'desc'),
    array('data' => t('Response'), 'field' => 'code', 'sort' => 'desc'),
    array('data' => t('Error'), 'field' => 'error'),
    array('data' => t('Operations')),
  );

  // One %d placeholder is needed per ignored response code.
  $ignored_codes = preg_split('/(\\r\\n?|\\n)/', variable_get('linkchecker_ignore_response_codes', "200\n302\n304\n401\n403"));
  $code_placeholders = implode(',', array_fill(0, count($ignored_codes), '%d'));

  $sql = "SELECT ll.*\n    FROM {linkchecker_links} ll\n    INNER JOIN (\n      SELECT lid FROM (\n        SELECT DISTINCT lid FROM {linkchecker_boxes}\n        UNION\n        SELECT DISTINCT lid FROM {linkchecker_comments}\n        UNION\n        SELECT DISTINCT lid FROM {linkchecker_nodes}\n      ) q1\n    ) q2 ON q2.lid = ll.lid\n    WHERE ll.last_checked <> %d AND ll.status = %d AND ll.code NOT IN (" . $code_placeholders . ")";
  $sql .= tablesort_sql($header);

  // Arguments: last_checked <> 0, status = 1, then the ignored codes.
  $query_args = array_merge(array(0, 1), $ignored_codes);
  $result = pager_query($sql, 50, 0, NULL, $query_args);

  $rows = array();
  while ($link = db_fetch_object($result)) {
    $operations = array();

    // Link to the settings of this link.
    if (user_access('edit link settings')) {
      $operations[] = l(t('Edit link settings'), 'linkchecker/' . $link->lid . '/edit', array(), $destination);
    }

    // Links to the nodes having this broken link.
    $node_result = db_query('SELECT nid FROM {linkchecker_nodes} WHERE lid = %d', $link->lid);
    while ($node_row = db_fetch_object($node_result)) {
      $label = t('Edit node @node', array(
        '@node' => $node_row->nid,
      ));
      $operations[] = l($label, 'node/' . $node_row->nid . '/edit', array(), $destination);
    }

    // Links to the comments having this broken link.
    $comment_result = db_query('SELECT cid FROM {linkchecker_comments} WHERE lid = %d', $link->lid);
    while ($comment_row = db_fetch_object($comment_result)) {
      $label = t('Edit comment @comment', array(
        '@comment' => $comment_row->cid,
      ));
      $operations[] = l($label, 'comment/edit/' . $comment_row->cid, array(), $destination);
    }

    // Links to the blocks having this broken link.
    $box_result = db_query('SELECT bid FROM {linkchecker_boxes} WHERE lid = %d', $link->lid);
    while ($box_row = db_fetch_object($box_result)) {
      $label = t('Edit block @block', array(
        '@block' => $box_row->bid,
      ));
      $operations[] = l($label, 'admin/build/block/configure/block/' . $box_row->bid, array(), $destination);
    }

    // Table row: shortened URL, response code, error text, operations.
    $row = array();
    $row[] = l(_filter_url_trim($link->url, 40), $link->url);
    $row[] = $link->code;
    $row[] = check_plain($link->error);
    $row[] = theme('item_list', $operations);
    $rows[] = $row;
  }

  // Show a single cell spanning all columns if there is nothing to report.
  if (count($rows) == 0) {
    $empty_cell = array(
      'data' => t('No broken links have been found.'),
      'colspan' => count($header),
    );
    $rows[] = array($empty_cell);
  }

  return theme('table', $header, $rows) . theme('pager', NULL, 3000, 0);
}

/**
 * Menu callback for link setting.
 *
 * Form builder for the settings of a single link: the request method, whether
 * the link status is checked at all, and an option to force a re-check on the
 * next cron run.
 *
 * @param $lid
 *   The ID of the link, passed to linkchecker_link_load().
 *
 * @return
 *   The form array.
 */
function linkchecker_link_edit_form($lid) {
  $link = linkchecker_link_load($lid);
  $form = array();

  $form['settings'] = array(
    '#type' => 'fieldset',
    '#title' => t('Settings'),
    '#collapsible' => FALSE,
    '#description' => t('The link <a href="@url">@url</a> has been checked lastly at @last_checked and failed @fail_count times.', array(
      '@url' => $link['url'],
      '@fail_count' => $link['fail_count'],
      '@last_checked' => format_date($link['last_checked']),
    )),
  );

  // Values needed by the submit handler, keyed by form element name. The
  // current method is kept as 'method_old' so that a change can be detected.
  $hidden_values = array(
    'lid' => $link['lid'],
    'url' => $link['url'],
    'method_old' => $link['method'],
  );
  foreach ($hidden_values as $element => $value) {
    $form['settings'][$element] = array(
      '#type' => 'hidden',
      '#value' => $value,
    );
  }

  $request_methods = array(
    'HEAD' => t('HEAD'),
    'GET' => t('GET'),
  );
  $form['settings']['method'] = array(
    '#type' => 'select',
    '#title' => t('Select request method'),
    '#default_value' => $link['method'],
    '#options' => $request_methods,
    '#description' => t('Select the request method used for link checks of this link. If you encounter issues like status code 500 errors with the HEAD request method you should try the GET request method before ignoring a link.'),
  );
  $form['settings']['status'] = array(
    '#default_value' => $link['status'],
    '#type' => 'checkbox',
    '#title' => t('Check link status'),
    '#description' => t("Disable this checkbox if you don't like to get informed any longer about this broken link. Use this setting only as the very last option if there is no other way to solve a failed link check."),
  );

  $form['maintenance'] = array(
    '#type' => 'fieldset',
    '#title' => t('Maintenance'),
    '#collapsible' => FALSE,
  );

  // The next scheduled check is one check interval after the last check.
  $next_check = $link['last_checked'] + variable_get('linkchecker_check_links_interval', 2419200);
  $form['maintenance']['recheck'] = array(
    '#default_value' => 0,
    '#type' => 'checkbox',
    '#title' => t('Re-check link status on next cron run'),
    '#description' => t('Enable this checkbox if you need an immediate re-check of the link and cannot wait until the next scheduled check at @date.', array(
      '@date' => format_date($next_check),
    )),
  );

  $form['buttons']['submit'] = array(
    '#type' => 'submit',
    '#value' => t('Save configuration'),
  );
  $form['buttons']['reset'] = array(
    '#type' => 'submit',
    '#value' => t('Reset to defaults'),
  );

  return $form;
}
/**
 * Submit handler for the link settings form.
 *
 * Optionally marks the link for a re-check on the next cron run, then saves
 * the request method and the status of the link. If the request method has
 * been changed, the fail counter and the time of the last check are reset as
 * well.
 *
 * @param $form_id
 *   The form identifier (not used).
 * @param $form_values
 *   The submitted form values.
 */
function linkchecker_link_edit_form_submit($form_id, $form_values) {
  $lid = $form_values['lid'];
  $method = $form_values['method'];
  $status = $form_values['status'];
  $message_args = array(
    '%url' => $form_values['url'],
  );

  // Force asap link re-check: a link with last_checked = 0 counts as never
  // checked.
  if ($form_values['recheck']) {
    db_query("UPDATE {linkchecker_links} SET last_checked = %d WHERE lid = %d", 0, $lid);
    drupal_set_message(t('The link %url will be checked again on the next cron run.', $message_args));
  }

  $method_unchanged = ($method == $form_values['method_old']);
  if ($method_unchanged) {
    // Update setting only.
    db_query("UPDATE {linkchecker_links} SET method = '%s', status = %d WHERE lid = %d", $method, $status, $lid);
    drupal_set_message(t('The link settings for %url have been saved.', $message_args));
  }
  else {
    // Update settings and reset statistics for a quick re-check.
    db_query("UPDATE {linkchecker_links} SET method = '%s', fail_count = %d, last_checked = %d, status = %d WHERE lid = %d", $method, 0, 0, $status, $lid);
    drupal_set_message(t('The link settings for %url have been saved and the fail counter has been reset.', $message_args));
  }
}

/**
 * Trigger batch import job.
 *
 * Queues the analysis of everything that is enabled in the settings: nodes of
 * the selected node types, their comments (only if node types are selected
 * and comment scanning is enabled) and blocks.
 */
function _linkchecker_batch_import() {
  // Only the node types that are actually selected have a non-empty value.
  $selected_types = array_keys(array_filter(variable_get('linkchecker_scan_nodetypes', array())));
  $has_node_types = !empty($selected_types);

  if ($has_node_types) {
    _linkchecker_batch_import_nodes($selected_types);
  }
  if ($has_node_types && variable_get('linkchecker_scan_comments', 0)) {
    _linkchecker_batch_import_comments($selected_types);
  }
  if (variable_get('linkchecker_scan_blocks', 0)) {
    _linkchecker_batch_import_boxes();
  }
}

/**
 * Batch: Queue the link scan of all nodes of the given types in chunks.
 *
 * Counts the nodes with status 1 of the given node types and adds one job
 * (_linkchecker_batch_node_import_op) to the job queue per chunk of
 * LINKCHECKER_SCAN_MAX_LINKS_PER_RUN nodes.
 *
 * @param $node_types
 *   An array of node type names. Nothing is queued if the array is empty.
 */
function _linkchecker_batch_import_nodes($node_types = array()) {
  // Without node types the query below would contain the invalid SQL fragment
  // "type IN ()", and there is nothing to scan anyway.
  if (empty($node_types)) {
    return;
  }

  // Node import count; one '%s' placeholder per node type.
  $placeholders = implode(',', array_fill(0, count($node_types), "'%s'"));
  $rows = db_result(db_query('SELECT COUNT(DISTINCT nid) FROM {node} WHERE status = %d AND type IN (' . $placeholders . ')', array_merge(array(
    1,
  ), $node_types)));

  // Each job gets the offset of its first row, the chunk size and the types.
  for ($row = 0; $row < $rows; $row = $row + LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) {
    job_queue_add('_linkchecker_batch_node_import_op', 'Scan nodes from row ' . $row . ' to ' . ($row + LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) . '.', array(
      $row,
      LINKCHECKER_SCAN_MAX_LINKS_PER_RUN,
      $node_types,
    ), '', TRUE);
  }
}

/**
 * Batch operation: Scan one chunk of nodes for links.
 *
 * Selects the nodes with status 1 of the given types ordered by node ID,
 * takes the requested range, loads every node and hands it over to
 * _linkchecker_add_node_links().
 *
 * @param $row
 *   The offset of the first row of the chunk.
 * @param $limit
 *   The maximum number of nodes in the chunk.
 * @param $node_types
 *   An array of node type names.
 */
function _linkchecker_batch_node_import_op($row, $limit, $node_types) {
  // One '%s' placeholder per node type.
  $type_placeholders = implode(',', array_fill(0, count($node_types), "'%s'"));
  $sql = 'SELECT nid FROM {node} WHERE status = %d AND type IN (' . $type_placeholders . ') ORDER BY nid ASC';
  $query_args = array_merge(array(1), $node_types);

  // Retrieve the next group of records.
  $node_ids = db_query_range($sql, $query_args, $row, $limit);
  while ($record = db_fetch_array($node_ids)) {
    // Load the node (third argument TRUE: reset) and scan it for links.
    $loaded_node = node_load($record['nid'], NULL, TRUE);
    _linkchecker_add_node_links($loaded_node);
  }
}

/**
 * Batch: Queue one scan job per chunk of published comments.
 *
 * Only comments that are not COMMENT_NOT_PUBLISHED and that belong to
 * published nodes of the given types are counted. A job_queue job is added
 * for every LINKCHECKER_SCAN_MAX_LINKS_PER_RUN rows.
 *
 * @param $node_types
 *   An array of node type names whose comments should be scanned.
 */
function _linkchecker_batch_import_comments($node_types = array()) {
  $chunk_size = LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;

  // One quoted string placeholder per node type for the IN () clause.
  $type_placeholders = implode(',', array_fill(0, count($node_types), "'%s'"));
  $query_args = array_merge(array(
    COMMENT_NOT_PUBLISHED,
    1,
  ), $node_types);
  $total = db_result(db_query('SELECT COUNT(DISTINCT cid) FROM {comments} c INNER JOIN {node} n ON c.nid = n.nid WHERE c.status <> %d AND n.status = %d AND n.type IN (' . $type_placeholders . ')', $query_args));

  // Walk the result set in chunks and queue a job for each chunk.
  $offset = 0;
  while ($offset < $total) {
    $description = 'Scan comments from row ' . $offset . ' to ' . ($offset + $chunk_size) . '.';
    job_queue_add('_linkchecker_batch_comments_import_op', $description, array(
      $offset,
      $chunk_size,
      $node_types,
    ), '', TRUE);
    $offset = $offset + $chunk_size;
  }
}

/**
 * Batch operation: Scan one chunk of published comments for links.
 *
 * @param $row
 *   Offset of the first row to process.
 * @param $limit
 *   Maximum number of comments to process in this run.
 * @param $node_types
 *   An array of node type names whose comments should be scanned.
 */
function _linkchecker_batch_comments_import_op($row, $limit, $node_types) {
  // One quoted string placeholder per node type for the IN () clause.
  $type_placeholders = implode(',', array_fill(0, count($node_types), "'%s'"));
  $query_args = array_merge(array(
    COMMENT_NOT_PUBLISHED,
    1,
  ), $node_types);

  // Fetch the comment ids of this chunk in a stable order.
  $cids = db_query_range('SELECT cid FROM {comments} c INNER JOIN {node} n ON c.nid = n.nid WHERE c.status <> %d AND n.status = %d AND n.type IN (' . $type_placeholders . ') ORDER BY cid ASC', $query_args, $row, $limit);

  $record = db_fetch_array($cids);
  while ($record) {
    // Load the comment and register the links it contains.
    _linkchecker_add_comment_links(_linkchecker_comment_load($record['cid']));
    $record = db_fetch_array($cids);
  }
}

/**
 * Batch operation: Scan one chunk of published comments for links.
 *
 * Backward-compatible wrapper around _linkchecker_batch_comments_import_op().
 *
 * The previous body selected "bid" from {boxes} but then read a non-existing
 * "cid" column and loaded comments with it, so it never scanned anything
 * useful. NOTE(review): the function name and body point to comments while
 * the old query pointed to boxes; this fix follows the name. Confirm that no
 * caller expects box scanning here (_linkchecker_batch_import_boxes_op()
 * covers that case).
 *
 * @param $row
 *   Offset of the first row to process.
 * @param $limit
 *   Maximum number of comments to process in this run.
 * @param $node_types
 *   (optional) An array of node type names whose comments should be scanned.
 *   Defaults to the node types enabled in 'linkchecker_scan_nodetypes'.
 */
function _linkchecker_batch_import_comments_op($row, $limit, $node_types = NULL) {
  if (!isset($node_types)) {
    $node_types = array_keys(array_filter(variable_get('linkchecker_scan_nodetypes', array())));
  }

  // Without node types the IN () clause of the comment query would be empty.
  if (empty($node_types)) {
    return;
  }

  _linkchecker_batch_comments_import_op($row, $limit, $node_types);
}

/**
 * Batch: Queue one scan job per chunk of custom blocks (boxes).
 *
 * Counts the rows in {boxes} and adds a job_queue job for every
 * LINKCHECKER_SCAN_MAX_LINKS_PER_RUN rows.
 */
function _linkchecker_batch_import_boxes() {
  $chunk_size = LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
  $total = db_result(db_query('SELECT COUNT(DISTINCT bid) FROM {boxes}'));

  // Walk the result set in chunks and queue a job for each chunk.
  $offset = 0;
  while ($offset < $total) {
    $description = 'Scan boxes from row ' . $offset . ' to ' . ($offset + $chunk_size) . '.';
    job_queue_add('_linkchecker_batch_import_boxes_op', $description, array(
      $offset,
      $chunk_size,
    ), '', TRUE);
    $offset = $offset + $chunk_size;
  }
}

/**
 * Batch operation: Scan one chunk of custom blocks (boxes) for links.
 *
 * @param $row
 *   Offset of the first row to process.
 * @param $limit
 *   Maximum number of boxes to process in this run.
 */
function _linkchecker_batch_import_boxes_op($row, $limit) {
  // Fetch the box ids of this chunk in a stable order.
  $bids = db_query_range("SELECT bid FROM {boxes} ORDER BY bid ASC", $row, $limit);

  $record = db_fetch_array($bids);
  while ($record) {
    // Load the box and register the links it contains.
    $box = block_box_get($record['bid']);
    _linkchecker_add_box_links($box, $box['bid']);
    $record = db_fetch_array($bids);
  }
}

/**
 * Implementation of hook_cron().
 *
 * Cleans up unused links once per day and then checks the links that are due
 * for a re-check, stopping after half of the maximum execution time is used.
 */
function linkchecker_cron() {
  // A max_execution_time of 0 is replaced by 240 seconds.
  $ini_limit = ini_get('max_execution_time');
  $max_execution_time = $ini_limit == 0 ? 240 : $ini_limit;

  // Once per day (86400 seconds), remove outdated links no longer in use.
  $last_cleanup = variable_get('linkchecker_cleanup_links_last', 0);
  if (time() - $last_cleanup >= 86400) {
    _linkchecker_cleanup_links();
    variable_set('linkchecker_cleanup_links_last', time());
  }

  // TODO: Implement cURL support.
  // TODO: The number of links fetched per cron run is derived from
  // max_execution_time to keep the query result set small. With cURL a
  // dedicated setting (or a per-server thread limit) needs to come back so
  // remote servers are not overloaded.
  $check_links_max_per_cron_run = $max_execution_time;
  $check_links_interval = variable_get('linkchecker_check_links_interval', 2419200);
  $useragent = variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)');

  // Values that stay constant for every link in this run.
  $request_headers = array(
    'User-Agent' => 'User-Agent: ' . $useragent,
  );
  $time_budget = $max_execution_time / 2;

  // Select the enabled links whose last check is older than the interval.
  $due_before = time() - $check_links_interval;
  $queue = db_query_range("SELECT * FROM {linkchecker_links} WHERE last_checked < %d AND status = %d ORDER BY last_checked, lid ASC", $due_before, 1, 0, $check_links_max_per_cron_run);

  while ($link = db_fetch_object($queue)) {
    // Request the URL with the stored method and record the outcome.
    $response = drupal_http_request($link->url, $request_headers, $link->method, NULL, 1);
    _linkchecker_status_handling($link, $response);

    // Stop once more than half of the maximum execution time has been used.
    $elapsed_seconds = timer_read('page') / 1000;
    if ($elapsed_seconds > $time_budget) {
      break;
    }
  }
}

/**
 * Status code handling.
 *
 * Stores the result of a link check in {linkchecker_links} and triggers the
 * configured follow-up actions:
 * - 200/304: the fail counter is reset.
 * - 301: the fail counter is increased; if auto repair is enabled and the
 *   threshold is reached, the old URL is replaced with the redirect target in
 *   all nodes, comments and boxes referencing the link.
 * - 404: the fail counter is increased; if the unpublish threshold is reached,
 *   _linkchecker_unpublish_nodes() is called for the link.
 * - 405: the link is re-requested with GET and the method is switched to GET.
 * - Anything else: the fail counter is reset for ignored response codes and
 *   increased otherwise.
 *
 * @param $link
 *   An object containing the url, lid and fail_count.
 *
 * @param $response
 *   An object containing the HTTP request headers, response code, headers,
 *   data and redirect status.
 */
function _linkchecker_status_handling($link, $response) {
  $useragent = variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)');
  $ignore_response_codes = preg_split('/(\\r\\n?|\\n)/', variable_get('linkchecker_ignore_response_codes', "200\n302\n304\n401\n403"));

  // FIXME: drupal_http_request() may not provide an UTF8 encoded error message
  // what results in a database UPDATE failure. See http://drupal.org/node/371495
  // for more information. ISO-8859-1 as source encoding may be wrong, but WFM.
  if (!empty($response->error) && !drupal_validate_utf8($response->error)) {
    $response->error = drupal_convert_to_utf8($response->error, 'ISO-8859-1');
  }

  // Prevent E_ALL warnings for non-existing $response->error.
  if (!isset($response->error)) {
    $response->error = '';
  }

  switch ($response->code) {
    case 200:
    case 304:
      // Successful check: store the result and reset the fail counter.
      db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = %d, last_checked = %d WHERE lid = %d", $response->code, $response->error, 0, time(), $link->lid);
      break;

    case 301:
      db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);

      // A HTTP status code of 301 tells us an existing link have changed to
      // a new link. The remote site owner was so kind to provide us the new
      // link and if we trust this change we are able to replace the old link
      // with the new one without any hand work.
      $auto_repair_301 = variable_get('linkchecker_action_status_code_301', 0);

      // The redirect properties are checked with isset() first to avoid
      // E_ALL notices; a missing value is treated as an unusable redirect.
      $redirect_is_usable = isset($response->redirect_code) && $response->redirect_code == 200 && isset($response->redirect_url) && valid_url($response->redirect_url, TRUE);

      // $link->fail_count holds the value from before the UPDATE above,
      // therefore the threshold is compared against fail_count + 1.
      if ($auto_repair_301 && $auto_repair_301 <= $link->fail_count + 1 && $redirect_is_usable) {

        // NODES: Autorepair all nodes having this outdated link.
        $res = db_query("SELECT * FROM {linkchecker_nodes} WHERE lid = %d", $link->lid);
        while ($row = db_fetch_object($res)) {
          $node = node_load(array(
            'nid' => $row->nid,
          ));

          // Skip stale references; saving a failed load must not happen.
          if (empty($node)) {
            continue;
          }

          // Create array of node fields to scan (for e.g. $node->title, $node->links_weblink_url).
          $text_items = array(
            'title',
            'body',
            'teaser',
          );

          // Update 'weblink' nodes from 'links' module package.
          if (module_exists('links_weblink') && $node->type == 'weblink' && isset($node->links_weblink_url)) {
            $text_items[] = 'links_weblink_url';
          }

          // Update 'weblinks' nodes from 'weblinks' module.
          if (module_exists('weblinks') && $node->type == 'weblinks' && isset($node->url)) {
            $text_items[] = 'url';
          }

          // Now replace the outdated link with the permanently moved one in all node fields.
          foreach ($text_items as $text_item) {
            _linkchecker_link_replace($node->{$text_item}, $link->url, $response->redirect_url);
          }

          // Search for CCK-fields of types 'link' and 'text'.
          if (module_exists('content')) {
            $fields = content_fields(NULL, $node->type);
            foreach ($fields as $field) {
              // The node property is named after the field, so the property
              // name has to be $field['field_name'] and not $field itself.
              $field_name = $field['field_name'];
              if (isset($node->{$field_name})) {
                if (module_exists('link') && $field['type'] == 'link') {
                  foreach ($node->{$field_name} as $delta => $item) {
                    _linkchecker_link_replace($node->{$field_name}[$delta]['url'], $link->url, $response->redirect_url);
                  }
                }
                elseif (module_exists('text') && $field['type'] == 'text') {
                  foreach ($node->{$field_name} as $delta => $item) {
                    _linkchecker_link_replace($node->{$field_name}[$delta]['value'], $link->url, $response->redirect_url);
                  }
                }
              }
            }
          }

          // Always use the default revision setting. See node_form().
          $node_options = variable_get('node_options_' . $node->type, array(
            'status',
            'promote',
          ));
          $node->revision = in_array('revision', $node_options);

          // Generate a log message for the node_revisions table, visible on the node's revisions tab.
          $log_message = t('Changed permanently moved link in %node from %src to %dst.', array(
            '%node' => url('node/' . $row->nid),
            '%src' => $link->url,
            '%dst' => $response->redirect_url,
          ));
          $node->log = $log_message;

          // Save changed node and update the node link list.
          node_save($node);
          watchdog('linkchecker', $log_message);
        }

        // COMMENTS: Autorepair all comments having this outdated link.
        if (module_exists('comment')) {
          $res = db_query("SELECT * FROM {linkchecker_comments} WHERE lid = %d", $link->lid);
          while ($row = db_fetch_object($res)) {
            $comment = _linkchecker_comment_load($row->cid);

            // Skip stale references to comments that could not be loaded.
            if (empty($comment)) {
              continue;
            }

            // Create array of comment fields to scan (for e.g. $comment->subject, $comment->comment).
            $text_items = array(
              'subject',
              'comment',
            );

            // Now replace the outdated link with the permanently moved one in all comment fields.
            foreach ($text_items as $text_item) {
              _linkchecker_link_replace($comment[$text_item], $link->url, $response->redirect_url);
            }

            // Save changed comment and update the comment link list.
            comment_save($comment);
            watchdog('linkchecker', t('Changed permanently moved link in comment %comment from %src to %dst.', array(
              '%comment' => $comment['cid'],
              '%src' => $link->url,
              '%dst' => $response->redirect_url,
            )));
          }
        }

        // BOXES: Autorepair all boxes having this outdated link.
        $res = db_query("SELECT * FROM {linkchecker_boxes} WHERE lid = %d", $link->lid);
        while ($row = db_fetch_object($res)) {
          $box = block_box_get($row->bid);

          // Skip stale references to boxes that could not be loaded.
          if (empty($box)) {
            continue;
          }

          // Create array of box fields to scan.
          $text_items = array(
            'info',
            'title',
            'body',
          );

          // Now replace the outdated link with the permanently moved one in all box fields.
          foreach ($text_items as $text_item) {
            _linkchecker_link_replace($box[$text_item], $link->url, $response->redirect_url);
          }

          // Save changed box and update the box link list.
          block_box_save($box, $row->bid);

          // There is no hook that fires on block_box_save(), therefore do it programmatically.
          _linkchecker_add_box_links($box, $row->bid);
          watchdog('linkchecker', t('Changed permanently moved link in box %bid from %src to %dst.', array(
            '%bid' => $row->bid,
            '%src' => $link->url,
            '%dst' => $response->redirect_url,
          )));
        }
      }
      else {
        watchdog('linkchecker', t('Link %link has changed and needs to be updated.', array(
          '%link' => $link->url,
        )), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/logs/linkchecker'));
      }
      break;

    case 404:
      db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);
      watchdog('linkchecker', t('Broken link %link has been found.', array(
        '%link' => $link->url,
      )), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/logs/linkchecker'));

      // If unpublishing limit is reached, unpublish all nodes having this link.
      $linkchecker_action_status_code_404 = variable_get('linkchecker_action_status_code_404', 0);
      if ($linkchecker_action_status_code_404 && $linkchecker_action_status_code_404 <= $link->fail_count + 1) {
        _linkchecker_unpublish_nodes($link->lid);
      }
      break;

    case 405:
      // Special error handling if method is not allowed. Switch link checking to GET method and try again.
      $response = drupal_http_request($link->url, array(
        'User-Agent' => 'User-Agent: ' . $useragent,
      ), 'GET', NULL, 0);
      if ($response->code == 200) {
        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = %d, last_checked = %d, method = '%s' WHERE lid = %d", $response->code, $response->error, 0, time(), 'GET', $link->lid);
      }
      else {
        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d, method = '%s' WHERE lid = %d", $response->code, $response->error, time(), 'GET', $link->lid);
      }
      watchdog('linkchecker', t('Method HEAD is not allowed for link %link. Method has been changed to GET.', array(
        '%link' => $link->url,
      )), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/logs/linkchecker'));
      break;

    default:
      // Don't treat ignored response codes as errors.
      if (in_array($response->code, $ignore_response_codes)) {
        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = %d, last_checked = %d WHERE lid = %d", $response->code, $response->error, 0, time(), $link->lid);
      }
      else {
        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);
      }
  }
}
/**
 * Implementation of hook_nodeapi().
 *
 * Keeps the link references of a node in sync when it is saved or deleted and
 * shows a warning for every failing link when the node edit tab is viewed.
 */
function linkchecker_nodeapi(&$node, $op, $a3 = NULL, $a4 = NULL) {
  if ($op == 'insert' || $op == 'update') {
    // Only published nodes of a scanned node type are processed.
    if ($node->status && _linkchecker_scan_nodetype($node->type)) {
      _linkchecker_add_node_links($node);
    }
    return;
  }

  if ($op == 'delete') {
    _linkchecker_delete_node_links($node->nid);
    return;
  }

  if ($op != 'prepare') {
    return;
  }

  // From here on: 'prepare'. Only act when the node edit tab is viewed.
  $on_edit_tab = arg(0) == 'node' && is_numeric(arg(1)) && arg(2) == 'edit';
  if (!$on_edit_tab) {
    return;
  }

  // Select enabled links of this node that failed at least once and whose
  // response code is not in the list of ignored codes.
  $ignored_codes = preg_split('/(\\r\\n?|\\n)/', variable_get('linkchecker_ignore_response_codes', "200\n302\n304\n401\n403"));
  $code_placeholders = implode(',', array_fill(0, count($ignored_codes), '%d'));
  $query_args = array_merge(array(
    $node->nid,
    0,
    1,
  ), $ignored_codes);
  $failed_links = db_query("SELECT url, code, fail_count FROM {linkchecker_nodes} ln INNER JOIN {linkchecker_links} ll ON ln.lid = ll.lid WHERE ln.nid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . $code_placeholders . ")", $query_args);

  while ($failed = db_fetch_object($failed_links)) {
    $template = format_plural($failed->fail_count, 'Link check of <a href="@url">@url</a> failed once (status code: @code).', 'Link check of <a href="@url">@url</a> failed @count times (status code: @code).');
    $replacements = array(
      '@url' => check_plain($failed->url),
      '@code' => $failed->code,
    );
    drupal_set_message(strtr($template, $replacements), 'warning');
  }
}
/**
 * Implementation of hook_comment().
 *
 * Adds the links of published comments to the link tables and removes the
 * references of unpublished or deleted comments.
 */
function linkchecker_comment($comment, $op) {
  // Convert $comment object (admin/content/comment) to array (comment/edit/[cid]).
  $comment = (array) $comment;

  if (in_array($op, array('unpublish', 'delete'))) {
    _linkchecker_delete_comment_links($comment['cid']);
    return;
  }

  if (!in_array($op, array('insert', 'update', 'publish'))) {
    return;
  }

  $node_type = db_result(db_query("SELECT type FROM {node} WHERE nid = %d", $comment['nid']));

  // Use $comment['status'] to hack around comment_save() not hooking on publish (D5 only).
  $status = $comment['status'];
  if ($status == COMMENT_PUBLISHED && _linkchecker_scan_nodetype($node_type)) {
    _linkchecker_add_comment_links($comment);
  }
  elseif ($status == COMMENT_NOT_PUBLISHED) {
    _linkchecker_delete_comment_links($comment['cid']);
  }
}
/**
 * Implementation of hook_form_alter().
 *
 * Registers the linkchecker submit handlers on the block add, configure and
 * delete forms, and shows a warning for every failing link on the block
 * configure and comment edit forms.
 */
function linkchecker_form_alter($form_id, &$form) {
  if ($form_id == 'block_box_form') {
    // Add custom submit handler to block add form.
    $form['#submit']['linkchecker_block_add_form_submit'] = array();
  }
  elseif ($form_id == 'block_admin_configure') {
    // Only warn while the form is displayed, not while it is submitted.
    $form_is_displayed = empty($form['#post']);
    if ($form_is_displayed && is_numeric(arg(5))) {
      // Select enabled links of this block that failed at least once and
      // whose response code is not in the list of ignored codes.
      $ignored_codes = preg_split('/(\\r\\n?|\\n)/', variable_get('linkchecker_ignore_response_codes', "200\n302\n304\n401\n403"));
      $code_placeholders = implode(',', array_fill(0, count($ignored_codes), '%d'));
      $query_args = array_merge(array(
        arg(5),
        0,
        1,
      ), $ignored_codes);
      $failed_links = db_query("SELECT url, code, fail_count FROM {linkchecker_boxes} lb INNER JOIN {linkchecker_links} ll ON lb.lid = ll.lid WHERE lb.bid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . $code_placeholders . ")", $query_args);
      while ($failed = db_fetch_object($failed_links)) {
        $template = format_plural($failed->fail_count, 'Link check of <a href="@url">@url</a> failed once (status code: @code).', 'Link check of <a href="@url">@url</a> failed @count times (status code: @code).');
        $replacements = array(
          '@url' => check_plain($failed->url),
          '@code' => $failed->code,
        );
        drupal_set_message(strtr($template, $replacements), 'warning');
      }
    }

    // Add custom submit handler to block configuration form.
    $form['#submit']['linkchecker_block_configure_form_submit'] = array();
  }
  elseif ($form_id == 'block_box_delete') {
    // Add custom submit handler to block delete form.
    $form['#submit']['linkchecker_block_box_delete_form_submit'] = array();
  }
  elseif ($form_id == 'comment_form') {
    // Warn when the form is viewed or previewed on comment/edit/[cid].
    $form_is_displayed = empty($form['#post']);
    $form_is_previewed = isset($form['#post']['op']) && $form['#post']['op'] == t('Preview comment');
    $on_comment_edit_page = arg(0) == 'comment' && arg(1) == 'edit' && is_numeric(arg(2));

    if (($form_is_displayed || $form_is_previewed) && $on_comment_edit_page) {
      // Select enabled links of this comment that failed at least once and
      // whose response code is not in the list of ignored codes.
      $ignored_codes = preg_split('/(\\r\\n?|\\n)/', variable_get('linkchecker_ignore_response_codes', "200\n302\n304\n401\n403"));
      $code_placeholders = implode(',', array_fill(0, count($ignored_codes), '%d'));
      $query_args = array_merge(array(
        arg(2),
        0,
        1,
      ), $ignored_codes);
      $failed_links = db_query("SELECT url, code, fail_count FROM {linkchecker_comments} lc INNER JOIN {linkchecker_links} ll ON lc.lid = ll.lid WHERE lc.cid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . $code_placeholders . ")", $query_args);
      while ($failed = db_fetch_object($failed_links)) {
        $template = format_plural($failed->fail_count, 'Link check of <a href="@url">@url</a> failed once (status code: @code).', 'Link check of <a href="@url">@url</a> failed @count times (status code: @code).');
        $replacements = array(
          '@url' => check_plain($failed->url),
          '@code' => $failed->code,
        );
        drupal_set_message(strtr($template, $replacements), 'warning');
      }
    }
  }
}

/**
 * Custom submit handler for block add page.
 *
 * The id of the new box is taken from the highest bid in {boxes}.
 * NOTE(review): this presumably relies on the box having been saved before
 * this handler runs and on no other box being added in between - confirm.
 */
function linkchecker_block_add_form_submit($form_id, $form_values) {
  $highest_bid = db_result(db_query("SELECT MAX(bid) FROM {boxes}"));

  // Register the links of the submitted values under that box id.
  _linkchecker_add_box_links($form_values, $highest_bid);
}

/**
 * Custom submit handler for block configure page.
 *
 * Re-scans the submitted block values; the submitted 'delta' is used as the
 * box id.
 */
function linkchecker_block_configure_form_submit($form_id, $form_values) {
  $bid = $form_values['delta'];
  _linkchecker_add_box_links($form_values, $bid);
}

/**
 * Custom submit handler for block delete page.
 *
 * Drops all link references of the deleted box.
 */
function linkchecker_block_box_delete_form_submit($form_id, $form_values) {
  $bid = $form_values['bid'];
  _linkchecker_delete_box_links($bid);
}

/**
 * Add node links to database.
 *
 * Collects the text of the node (title, body, teaser, weblink URLs and CCK
 * 'link'/'text' fields), extracts all links from it and stores the links and
 * node references that are not yet in the database. At most
 * LINKCHECKER_SCAN_MAX_LINKS_PER_RUN links are added per call; if more are
 * missing and job_queue is available, a follow-up job is queued. Finally,
 * references to links no longer found in the node are removed.
 *
 * @param $node
 *   The node object to scan.
 */
function _linkchecker_add_node_links($node) {

  // Create array of node fields to scan.
  $text_items = array();
  $text_items[] = _filter_url($node->title, $node->format);
  $text_items[] = _linkchecker_check_markup($node->body, $node->format, FALSE);
  $text_items[] = _linkchecker_check_markup($node->teaser, $node->format, FALSE);

  // Search for links in 'weblink' nodes from 'links' module package.
  if (module_exists('links_weblink') && $node->type == 'weblink' && isset($node->links_weblink_url)) {
    $text_items[] = _filter_url($node->links_weblink_url, $node->format);
  }

  // Search for links in 'weblinks' nodes from 'weblinks' module.
  if (module_exists('weblinks') && $node->type == 'weblinks' && isset($node->url)) {
    $text_items[] = _filter_url($node->url, $node->format);
  }

  // Search for CCK-fields of types 'link' and 'text'.
  if (module_exists('content')) {
    $fields = content_fields(NULL, $node->type);
    foreach ($fields as $field) {
      // The node property is named after the field, so the property name has
      // to be $field['field_name'] and not the $field array itself.
      $field_name = $field['field_name'];
      if (!empty($node->{$field_name})) {
        if (module_exists('link') && $field['type'] == 'link') {
          foreach ($node->{$field_name} as $delta => $item) {
            $text_items[] = _filter_url($item['url'], $node->format);
          }
        }
        elseif (module_exists('text') && $field['type'] == 'text') {
          foreach ($node->{$field_name} as $delta => $item) {
            $text_items[] = _filter_url($item['value'], $node->format);
          }
        }
      }
    }
  }

  // Get the absolute node path for extraction of relative links.
  $path = url('node/' . $node->nid, NULL, NULL, TRUE);

  // Extract all links in a node.
  $links = _linkchecker_extract_links(implode(' ', $text_items), $path);

  // Node have links.
  if (!empty($links)) {

    // Remove all links from the links array already in the database
    // and only add missing links to database.
    $missing_links = _linkchecker_node_links_missing($node->nid, $links);

    // Add a job for scanning the next LINKCHECKER_SCAN_MAX_LINKS_PER_RUN links via job_queue module.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (module_exists('job_queue') && $missing_links_count > 0) {
      job_queue_add('_linkchecker_scan_node_links', 'Scan node ' . $node->nid . ' having ' . $missing_links_count . ' links not yet added to linkchecker_links table.', array(
        $node->nid,
      ), '', FALSE);
    }

    // Only add links to database that do not exists.
    $i = 0;
    foreach ($missing_links as $link) {
      // Links are identified by the md5 hash of their URL.
      $token = md5($link);
      $lid = db_result(db_query("SELECT lid FROM {linkchecker_links} WHERE token = '%s'", $token));
      if (!$lid) {
        $lid = db_next_id('linkchecker_links_lid');
        db_query("INSERT INTO {linkchecker_links} (lid, token, url, status) VALUES (%d, '%s', '%s', %d)", $lid, $token, $link, _linkchecker_link_check_status_filter($link));
      }
      db_query("INSERT INTO {linkchecker_nodes} (nid, lid) VALUES (%d, %d)", $node->nid, $lid);

      // Break processing if max links limit per run has been reached.
      $i++;
      if ($i >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) {
        break;
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_node_references($node->nid, $links);
}

/**
 * Add comment links to database.
 *
 * Extracts the links from the comment subject and body and stores the links
 * and comment references that are not yet in the database. At most
 * LINKCHECKER_SCAN_MAX_LINKS_PER_RUN links are added per call; if more are
 * missing and job_queue is available, a follow-up job is queued. Finally,
 * references to links no longer found in the comment are removed.
 *
 * @param $comment
 *   The comment as an array (keys used: cid, nid, subject, comment, format).
 */
function _linkchecker_add_comment_links($comment) {
  $cid = $comment['cid'];
  $format = $comment['format'];

  // Collect the comment texts to scan.
  $text_items = array(
    _filter_url($comment['subject'], $format),
    _linkchecker_check_markup($comment['comment'], $format, FALSE),
  );

  // Relative links are resolved against the absolute path of the parent node.
  $path = url('node/' . $comment['nid'], NULL, NULL, TRUE);
  $links = _linkchecker_extract_links(implode(' ', $text_items), $path);

  if (!empty($links)) {
    // Keep only the links not yet referenced by this comment.
    $missing_links = _linkchecker_comment_links_missing($cid, $links);

    // Queue a follow-up job if more links are missing than fit in one run.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (module_exists('job_queue') && $missing_links_count > 0) {
      job_queue_add('_linkchecker_scan_comment_links', 'Scan comment ' . $cid . ' having ' . $missing_links_count . ' links not yet added to linkchecker_links table.', array(
        $cid,
      ), '', FALSE);
    }

    // Store the missing links, stopping at the per-run limit.
    $remaining = (int) LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    foreach ($missing_links as $url) {
      // Links are identified by the md5 hash of their URL.
      $token = md5($url);
      $lid = db_result(db_query("SELECT lid FROM {linkchecker_links} WHERE token = '%s'", $token));
      if (!$lid) {
        $lid = db_next_id('linkchecker_links_lid');
        db_query("INSERT INTO {linkchecker_links} (lid, token, url, status) VALUES (%d, '%s', '%s', %d)", $lid, $token, $url, _linkchecker_link_check_status_filter($url));
      }
      db_query("INSERT INTO {linkchecker_comments} (cid, lid) VALUES (%d, %d)", $cid, $lid);

      $remaining--;
      if ($remaining <= 0) {
        break;
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_comment_references($cid, $links);
}

/**
 * Add box links to database.
 *
 * Extracts the links from the box description, title and body and stores the
 * links and box references that are not yet in the database. At most
 * LINKCHECKER_SCAN_MAX_LINKS_PER_RUN links are added per call; if more are
 * missing and job_queue is available, a follow-up job is queued. Finally,
 * references to links no longer found in the box are removed.
 *
 * @param $box
 *   The box as an array (keys used: info, title, body, format).
 * @param $bid
 *   The id of the box the links are stored for.
 */
function _linkchecker_add_box_links($box, $bid) {
  $format = $box['format'];

  // Collect the box texts to scan.
  $text_items = array(
    _filter_url($box['info'], $format),
    _filter_url($box['title'], $format),
    _linkchecker_check_markup($box['body'], $format, FALSE),
  );

  // No base path is passed to the link extraction for boxes.
  $links = _linkchecker_extract_links(implode(' ', $text_items));

  if (!empty($links)) {
    // Keep only the links not yet referenced by this box.
    $missing_links = _linkchecker_box_links_missing($bid, $links);

    // Queue a follow-up job if more links are missing than fit in one run.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (module_exists('job_queue') && $missing_links_count > 0) {
      job_queue_add('_linkchecker_scan_box_links', 'Scan block ' . $bid . ' having ' . $missing_links_count . ' links not yet added to linkchecker_links table.', array(
        $bid,
      ), '', FALSE);
    }

    // Store the missing links, stopping at the per-run limit.
    $remaining = (int) LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    foreach ($missing_links as $url) {
      // Links are identified by the md5 hash of their URL.
      $token = md5($url);
      $lid = db_result(db_query("SELECT lid FROM {linkchecker_links} WHERE token = '%s'", $token));
      if (!$lid) {
        $lid = db_next_id('linkchecker_links_lid');
        db_query("INSERT INTO {linkchecker_links} (lid, token, url, status) VALUES (%d, '%s', '%s', %d)", $lid, $token, $url, _linkchecker_link_check_status_filter($url));
      }
      db_query("INSERT INTO {linkchecker_boxes} (bid, lid) VALUES (%d, %d)", $bid, $lid);

      $remaining--;
      if ($remaining <= 0) {
        break;
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_box_references($bid, $links);
}

/**
 * Delete every row of the linkchecker_nodes table that belongs to a node.
 *
 * @param $nid
 *   The id of the node whose link references are removed.
 *
 * @return
 *   The result of the DELETE query as returned by db_query().
 */
function _linkchecker_delete_node_links($nid) {
  $result = db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d", $nid);
  return $result;
}

/**
 * Delete every row of the linkchecker_comments table that belongs to a comment.
 *
 * @param $cid
 *   The id of the comment whose link references are removed.
 *
 * @return
 *   The result of the DELETE query as returned by db_query().
 */
function _linkchecker_delete_comment_links($cid) {
  $result = db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d", $cid);
  return $result;
}

/**
 * Delete every row of the linkchecker_boxes table that belongs to a box.
 *
 * @param $bid
 *   The id of the box whose link references are removed.
 *
 * @return
 *   The result of the DELETE query as returned by db_query().
 */
function _linkchecker_delete_box_links($bid) {
  $result = db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d", $bid);
  return $result;
}

/**
 * Remove node references to links that are no longer part of the node.
 *
 * @param $nid
 *   The id of the node to clean up.
 * @param $links
 *   An array of URLs currently found in the node.
 */
function _linkchecker_cleanup_node_references($nid = 0, $links = array()) {
  // Without any links left, every reference of the node can go.
  if (empty($links)) {
    db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d", $nid);
    return;
  }

  // The node still has at least one link: delete only the references whose
  // link token (md5 of the URL) is not among the current links.
  $tokens = array_map('md5', $links);
  $token_placeholders = implode(',', array_fill(0, count($links), "'%s'"));
  $query_args = array_merge(array(
    $nid,
  ), $tokens);
  db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE token IN (" . $token_placeholders . "))", $query_args);
}

/**
 * Remove comment references to links that are no longer part of the comment.
 *
 * @param $cid
 *   The id of the comment to clean up.
 * @param $links
 *   An array of URLs currently found in the comment.
 */
function _linkchecker_cleanup_comment_references($cid = 0, $links = array()) {
  // Without any links left, every reference of the comment can go.
  if (empty($links)) {
    db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d", $cid);
    return;
  }

  // The comment still has at least one link: delete only the references whose
  // link token (md5 of the URL) is not among the current links.
  $tokens = array_map('md5', $links);
  $token_placeholders = implode(',', array_fill(0, count($links), "'%s'"));
  $query_args = array_merge(array(
    $cid,
  ), $tokens);
  db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE token IN (" . $token_placeholders . "))", $query_args);
}

/**
 * Remove box references to links that are no longer part of the box.
 *
 * @param $bid
 *   The id of the box to clean up.
 * @param $links
 *   An array of URLs currently found in the box.
 */
function _linkchecker_cleanup_box_references($bid = 0, $links = array()) {
  // Without any links left, every reference of the box can go.
  if (empty($links)) {
    db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d", $bid);
    return;
  }

  // The box still has at least one link: delete only the references whose
  // link token (md5 of the URL) is not among the current links.
  $tokens = array_map('md5', $links);
  $token_placeholders = implode(',', array_fill(0, count($links), "'%s'"));
  $query_args = array_merge(array(
    $bid,
  ), $tokens);
  db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE token IN (" . $token_placeholders . "))", $query_args);
}

/**
 * Returns an array of node references missing in the linkchecker_nodes table.
 *
 * @param $nid
 *   The node id the references are looked up for.
 * @param $links
 *   Array of URLs found in the node.
 *
 * @return
 *   The entries of $links (keys preserved by array_diff()) for which no row
 *   joining linkchecker_links and linkchecker_nodes exists for this node.
 *   An empty array if $links is empty.
 */
function _linkchecker_node_links_missing($nid, $links) {
  // Nothing to look up. Without this guard the query below would contain the
  // invalid SQL fragment "token IN ()" and array_fill() would be called with
  // a count of 0, which raises a warning on PHP < 5.6.
  if (empty($links)) {
    return array();
  }

  // URLs are matched through their md5 token, one placeholder per URL.
  $placeholders = implode(',', array_fill(0, count($links), "'%s'"));
  $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_nodes} ln ON ll.lid = ln.lid WHERE ln.nid = %d AND token IN (" . $placeholders . ")", array_merge(array(
    $nid,
  ), array_map('md5', $links)));

  $links_in_database = array();
  while ($row = db_fetch_object($res)) {
    $links_in_database[] = $row->url;
  }

  return array_diff($links, $links_in_database);
}

/**
 * Returns an array of comment references missing in the linkchecker_comments table.
 *
 * @param $cid
 *   The comment id the references are looked up for.
 * @param $links
 *   Array of URLs found in the comment.
 *
 * @return
 *   The entries of $links (keys preserved by array_diff()) for which no row
 *   joining linkchecker_links and linkchecker_comments exists for this
 *   comment. An empty array if $links is empty.
 */
function _linkchecker_comment_links_missing($cid, $links) {
  // Nothing to look up. Without this guard the query below would contain the
  // invalid SQL fragment "token IN ()" and array_fill() would be called with
  // a count of 0, which raises a warning on PHP < 5.6.
  if (empty($links)) {
    return array();
  }

  // URLs are matched through their md5 token, one placeholder per URL.
  $placeholders = implode(',', array_fill(0, count($links), "'%s'"));
  $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_comments} lc ON ll.lid = lc.lid WHERE lc.cid = %d AND token IN (" . $placeholders . ")", array_merge(array(
    $cid,
  ), array_map('md5', $links)));

  $links_in_database = array();
  while ($row = db_fetch_object($res)) {
    $links_in_database[] = $row->url;
  }

  return array_diff($links, $links_in_database);
}

/**
 * Returns an array of box references missing in the linkchecker_boxes table.
 *
 * @param $bid
 *   The box id the references are looked up for.
 * @param $links
 *   Array of URLs found in the box.
 *
 * @return
 *   The entries of $links (keys preserved by array_diff()) for which no row
 *   joining linkchecker_links and linkchecker_boxes exists for this box.
 *   An empty array if $links is empty.
 */
function _linkchecker_box_links_missing($bid, $links) {
  // Nothing to look up. Without this guard the query below would contain the
  // invalid SQL fragment "token IN ()" and array_fill() would be called with
  // a count of 0, which raises a warning on PHP < 5.6.
  if (empty($links)) {
    return array();
  }

  // URLs are matched through their md5 token, one placeholder per URL.
  $placeholders = implode(',', array_fill(0, count($links), "'%s'"));
  $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_boxes} lb ON ll.lid = lb.lid WHERE lb.bid = %d AND token IN (" . $placeholders . ")", array_merge(array(
    $bid,
  ), array_map('md5', $links)));

  $links_in_database = array();
  while ($row = db_fetch_object($res)) {
    $links_in_database[] = $row->url;
  }

  return array_diff($links, $links_in_database);
}

/**
 * Load one node and hand it to the node link importer.
 *
 * Helper function for job_queue scans.
 *
 * @param $nid
 *   The node id to scan.
 */
function _linkchecker_scan_node_links($nid) {
  $conditions = array('nid' => $nid);
  _linkchecker_add_node_links(node_load($conditions));
}

/**
 * Load one comment and hand it to the comment link importer.
 *
 * Helper function for job_queue scans.
 *
 * @param $cid
 *   The comment id to scan.
 */
function _linkchecker_scan_comment_links($cid) {
  _linkchecker_add_comment_links(_linkchecker_comment_load($cid));
}

/**
 * Load one box and hand it to the box link importer.
 *
 * Helper function for job_queue scans.
 *
 * @param $bid
 *   The box id to scan.
 */
function _linkchecker_scan_box_links($bid) {
  _linkchecker_add_box_links(block_box_get($bid));
}

/**
 * Run periodically via cron and delete all links without references.
 *
 * For speed reasons and check results we keep the links for some time
 * as they may be reused by other new content.
 *
 * Steps, in order:
 * - Drop node references of nodes whose type is not enabled for scanning
 *   (all node references if no type is enabled).
 * - Drop all comment references if comment scanning is disabled.
 * - Drop all box references if block scanning is disabled.
 * - Drop every link that is no longer referenced by a box, comment or node.
 */
function _linkchecker_cleanup_links() {

  // Remove disabled node types no longer in use.
  $node_types = array_keys(array_filter(variable_get('linkchecker_scan_nodetypes', array())));
  if (!empty($node_types)) {
    // Build the placeholder list only when there is at least one node type:
    // array_fill() with a count of 0 raises a warning on PHP < 5.6.
    $placeholders = implode(',', array_fill(0, count($node_types), "'%s'"));
    db_query('DELETE FROM {linkchecker_nodes} WHERE nid IN (SELECT nid FROM {node} n WHERE n.type NOT IN (' . $placeholders . '))', $node_types);

    // FIXME: Remove comment references of unpublished nodes.
  }
  else {
    db_query('DELETE FROM {linkchecker_nodes}');

    // FIXME: Remove comment references of unpublished nodes.
  }

  // Remove comment links if comment scanning is disabled.
  // TODO: Remove comment references of unpublished nodes.
  if (variable_get('linkchecker_scan_comments', 0) == 0) {
    db_query('DELETE FROM {linkchecker_comments}');
  }

  // Remove block links if block scanning is disabled.
  if (variable_get('linkchecker_scan_blocks', 0) == 0) {
    db_query('DELETE FROM {linkchecker_boxes}');
  }

  // TODO: Requires MySQL 5.x for subselects. Untested with pgsql.
  db_query('DELETE FROM {linkchecker_links}
            WHERE lid NOT IN (
              SELECT DISTINCT lid FROM {linkchecker_boxes}
              UNION
              SELECT DISTINCT lid FROM {linkchecker_comments}
              UNION
              SELECT DISTINCT lid FROM {linkchecker_nodes}
            )');
}

/**
 * Extract links from content.
 *
 * Each supported tag type is scanned only if its settings variable is enabled.
 * The raw attribute values are entity-decoded and de-duplicated. Absolute
 * URLs accepted by valid_url() are returned as they are; URLs with any other
 * scheme prefix (mailto:, javascript:, ...) are skipped; relative URLs are
 * resolved against the site root or the content path, but only if the
 * 'linkchecker_fqdn_only' setting is 0.
 *
 * @param $text
 *    The text to be scanned for links.
 * @param $content_path
 *    Path to the content that is currently scanned for links. This value is
 *    required to build full qualified links from relative links. Relative links
 *    are not extracted from content, if path is not provided.
 * @return
 *    Array of full qualified and unique URLs found in content.
 */
function _linkchecker_extract_links($text = '', $content_path = NULL) {
  global $base_root;

  // Extraction rules in scan order. Every rule holds: the settings variable
  // that enables it, the default of that variable, the pattern and the index
  // of the capture group containing the URL. The defaults match the ones used
  // by _linkchecker_link_replace(), so that every link that is extracted can
  // also be replaced.
  $rules = array(
    // Hyperlinks.
    array('linkchecker_extract_from_a', 1, '/<(a|area)\\s[^>]*href=["\']([^"\']*)["\'][^>]*>/i', 2),
    // Audio tags.
    array('linkchecker_extract_from_audio', 0, '/<audio\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i', 1),
    // Embed tags.
    array('linkchecker_extract_from_embed', 0, '/<embed\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i', 1),
    array('linkchecker_extract_from_embed', 0, '/<embed\\s[^>]*pluginurl=["\']([^"\']*)["\'][^>]*>/i', 1),
    array('linkchecker_extract_from_embed', 0, '/<embed\\s[^>]*pluginspage=["\']([^"\']*)["\'][^>]*>/i', 1),
    // Iframe tags.
    array('linkchecker_extract_from_iframe', 0, '/<iframe\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i', 1),
    // Img tags.
    array('linkchecker_extract_from_img', 0, '/<img\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i', 1),
    // Object/param tags.
    // TODO's:
    //  * Allow flipped order of attributes in "param".
    //  * Try to extract links in unknown "flashvars" values (for e.g. file=http://, data=http://).
    array('linkchecker_extract_from_object', 0, '/<object\\s[^>]*data=["\']([^"\']*)["\'][^>]*>/i', 1),
    array('linkchecker_extract_from_object', 0, '/<object\\s[^>]*codebase=["\']([^"\']*)["\'][^>]*>/i', 1),
    array('linkchecker_extract_from_object', 0, '/<param\\s[^>]*((name|src)=["\'](archive|filename|href|movie|src|url)["\']\\s[^>]*)+value=["\']([^"\']*)["\'][^>]*>/i', 4),
    // Source tags.
    array('linkchecker_extract_from_source', 0, '/<source\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i', 1),
    // Video tags.
    array('linkchecker_extract_from_video', 0, '/<video\\s[^>]*poster=["\']([^"\']*)["\'][^>]*>/i', 1),
    array('linkchecker_extract_from_video', 0, '/<video\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i', 1),
  );

  // Collect the raw attribute values of all enabled rules. Disabled rules and
  // rules without a match contribute nothing, so no undefined match group is
  // ever read.
  $urls = array();
  foreach ($rules as $rule) {
    list($variable, $default, $pattern, $group) = $rule;
    if (variable_get($variable, $default) != 1) {
      continue;
    }
    $matches = array();
    if (preg_match_all($pattern, $text, $matches) && !empty($matches[$group])) {
      $urls = array_merge($urls, $matches[$group]);
    }
  }

  // Remove empty values.
  $urls = array_filter($urls);

  // Decode HTML links into plain text links.
  $urls = array_map('decode_entities', $urls);

  // Remove duplicate urls.
  $urls = array_unique($urls);

  // Both values are the same for every URL, so they are looked up only once.
  $resolve_relative = variable_get('linkchecker_fqdn_only', 1) == 0;
  $absolute_content_path = $resolve_relative ? _linkchecker_absolute_content_path($content_path) : NULL;

  $links = array();
  foreach ($urls as $url) {

    // Full qualified URLs.
    if (valid_url($url, TRUE)) {
      $links[] = $url;
      continue;
    }

    // Skip everything else that starts with a scheme (mailto:, javascript:,
    // ...); such URLs cannot be checked.
    if (preg_match('/^\\w[\\w.+]*:/', $url)) {
      continue;
    }

    // Relative URLs are only handled if the site is configured to do so.
    if (!$resolve_relative || !valid_url($url, FALSE)) {
      continue;
    }

    // Absolute local URLs need to start with [/].
    if (preg_match('!^/!', $url)) {
      $links[] = $base_root . $url;
    }
    // Query string or fragment of the scanned content itself.
    elseif (!empty($content_path) && preg_match('!^[?#]!', $url)) {
      $links[] = $content_path . $url;
    }
    // URLs starting with './' or '../'.
    elseif (!empty($absolute_content_path) && preg_match('!^\\.{1,2}/!', $url)) {
      $path = $absolute_content_path . $url;

      // Remove './' segments where possible.
      $path = str_replace('/./', '/', $path);

      // Remove '../' segments where possible. Loop until all segments are removed.
      // Taken over from _drupal_build_css_path() in common.inc.
      $last = '';
      while ($path != $last) {
        $last = $path;
        $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
      }

      $links[] = $path;
    }
    // Any other relative URL is resolved against the directory of the content.
    elseif (!empty($absolute_content_path) && preg_match('!^[^/]!', $url)) {
      $links[] = $absolute_content_path . $url;
    }
    else {

      // TODO: Are there more special cases the module need to handle?
    }
  }

  return array_unique($links);
}

/**
 * Replaces old link with new link in text.
 *
 * Two cases are handled:
 * - $text consists of nothing but the old link (CCK link and text fields,
 *   Links weblink fields): the text becomes the new link in the same
 *   notation (HTML encoded or plain, fully qualified or local).
 * - Otherwise $text is treated as HTML: for every tag type that is enabled
 *   for extraction, attribute values equal to the old link are replaced with
 *   the HTML encoded new link (local notation for links to this site).
 *
 * @param $text
 *   The text a link is inside. Passed in as a reference.
 * @param $old_link_fqdn
 *   The old link to search for in strings.
 * @param $new_link_fqdn
 *   The old link should be overwritten with this new link.
 */
function _linkchecker_link_replace(&$text, $old_link_fqdn = '', $new_link_fqdn = '') {

  // Don't do any string replacement if one of the values is empty.
  if (empty($text) || empty($old_link_fqdn) || empty($new_link_fqdn)) {
    return;
  }

  // Remove protocols and hostname from local URLs.
  $base_roots = array(
    strtolower('http://' . $_SERVER['HTTP_HOST']),
    strtolower('https://' . $_SERVER['HTTP_HOST']),
  );
  $old_link = str_replace($base_roots, '', $old_link_fqdn);
  $new_link = str_replace($base_roots, '', $new_link_fqdn);

  // Build variables with all URLs and run check_url() only once.
  $old_html_link_fqdn = check_url($old_link_fqdn);
  $new_html_link_fqdn = check_url($new_link_fqdn);
  $old_html_link = check_url($old_link);
  $new_html_link = check_url($new_link);

  // Replace links in CCK link and text and Links weblink fields. The text is
  // exactly one notation of the old link, so it is set to the new link in the
  // same notation. Only the first matching notation is applied: chaining
  // several str_replace() calls would replace again inside the already
  // replaced text whenever the new link contains the old one (for example
  // "http://example.com/foo" -> "http://example.com/foo/").
  $notations = array(
    array($old_html_link_fqdn, $new_html_link_fqdn),
    array($old_html_link, $new_html_link),
    array($old_link_fqdn, $new_link_fqdn),
    array($old_link, $new_link),
  );
  foreach ($notations as $notation) {
    if ($text == $notation[0]) {
      $text = $notation[1];
      return;
    }
  }

  // Create an array of preg quoted links with HTML decoded and encoded URLs.
  // Duplicates are removed (the notations only differ if the URL is local or
  // has characters that get HTML encoded, e.g. "&" and "&amp;"). Empty
  // notations are skipped: a link to the site root has an empty local
  // notation, and an empty alternative would match every empty attribute.
  $old_links_quoted = array();
  $old_anchor_texts = array();
  foreach (array($old_html_link_fqdn, $old_html_link, $old_link) as $old_notation) {
    if ((string) $old_notation === '') {
      continue;
    }
    $old_links_quoted[] = preg_quote($old_notation, '/');
    $old_anchor_texts[] = '>' . $old_notation . '</a>';
  }
  $old_links_quoted = array_unique($old_links_quoted);
  if (empty($old_links_quoted)) {
    return;
  }
  $regex_old_links = implode('|', $old_links_quoted);

  // The new link is inserted literally. Backslashes and dollar signs must be
  // escaped, otherwise preg_replace() would read sequences such as "$1" or
  // "\1" inside the URL as references to capture groups.
  $new_link_literal = str_replace(array('\\', '$'), array('\\\\', '\\$'), $new_html_link);

  // Create array to fill with replacement rules.
  $replacements = array();

  // Add replace rules for a/area tags.
  if (variable_get('linkchecker_extract_from_a', 1) == 1) {

    // Also replace the link text if it is exactly the old URL.
    $text = str_replace($old_anchor_texts, '>' . $new_html_link . '</a>', $text);
    $replacements['/(<(a|area)\\s[^>]*href=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${4}';
  }

  // Add replace rules for audio tags.
  if (variable_get('linkchecker_extract_from_audio', 0) == 1) {
    $replacements['/(<audio\\s[^>]*src=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
  }

  // Add replace rules for embed tags.
  if (variable_get('linkchecker_extract_from_embed', 0) == 1) {
    $replacements['/(<embed\\s[^>]*src=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
    $replacements['/(<embed\\s[^>]*pluginurl=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
    $replacements['/(<embed\\s[^>]*pluginspage=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
  }

  // Add replace rules for iframe tags.
  if (variable_get('linkchecker_extract_from_iframe', 0) == 1) {
    $replacements['/(<iframe\\s[^>]*src=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
  }

  // Add replace rules for img tags.
  if (variable_get('linkchecker_extract_from_img', 0) == 1) {
    $replacements['/(<img\\s[^>]*src=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
  }

  // Add replace rules for object/param tags.
  if (variable_get('linkchecker_extract_from_object', 0) == 1) {
    $replacements['/(<object\\s[^>]*data=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
    $replacements['/(<object\\s[^>]*codebase=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
    $replacements['/(<param\\s[^>]*((name|src)=["\'](archive|filename|href|movie|src|url)["\']\\s[^>]*)+value=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${6}';
  }

  // Add replace rules for source tags.
  if (variable_get('linkchecker_extract_from_source', 0) == 1) {
    $replacements['/(<source\\s[^>]*src=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
  }

  // Add replace rules for video tags.
  if (variable_get('linkchecker_extract_from_video', 0) == 1) {
    $replacements['/(<video\\s[^>]*poster=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
    $replacements['/(<video\\s[^>]*src=["\'])(' . $regex_old_links . ')(["\'][^>]*>)/i'] = '${1}' . $new_link_literal . '${3}';
  }

  // Replace link by applying all replacement rules on text. preg_replace()
  // returns NULL on error; the text is kept unchanged in that case instead of
  // being wiped.
  foreach ($replacements as $pattern => $replacement) {
    $replaced = preg_replace($pattern, $replacement, $text);
    if ($replaced !== NULL) {
      $text = $replaced;
    }
  }
}

/**
 * Customized clone of core check_markup() function with additional filter blacklist.
 *
 * See http://api.drupal.org/api/function/check_markup for API documentation.
 *
 * Filters listed in the 'linkchecker_filter_blacklist' setting are skipped in
 * both the prepare and the process step. The result is cached in the
 * cache_filter table under a 'linkchecker:' prefixed cache id, if caching is
 * allowed for the format.
 *
 * @param $text
 *   The text to be filtered.
 * @param $format
 *   The input format to use. Resolved with filter_resolve_format().
 * @param $check
 *   Whether to check access to $format with filter_access() first.
 *
 * @return
 *   The filtered text, or the translated string 'n/a' if $text is not set or
 *   access to the format is denied.
 */
function _linkchecker_check_markup($text, $format = FILTER_FORMAT_DEFAULT, $check = TRUE) {

  // When $check = TRUE, do an access check on $format.
  if (!isset($text) || ($check && !filter_access($format))) {
    return t('n/a');
  }

  $format = filter_resolve_format($format);

  // Check for a cached version of this piece of text.
  $cache_id = 'linkchecker:' . $format . ':' . md5($text);
  if ($cached = cache_get($cache_id, 'cache_filter')) {
    return $cached->data;
  }

  // See if caching is allowed for this format.
  $cache = filter_format_allowcache($format);

  // Convert all Windows and Mac newlines to a single newline,
  // so filters only need to deal with one possibility.
  $text = str_replace(array(
    "\r\n",
    "\r",
  ), "\n", $text);

  // Do not run placeholder or special tag filters used as references
  // to nodes like 'weblink' or 'weblinks' node types. If the original
  // link node is updated, all links are automatically up-to-date and
  // there is no need to notify about the broken link on all nodes having
  // a link reference in content. This would only confuse the authors as
  // they may also not be able to fix the source node of the reference.
  //
  // The setting comes in two shapes. The built-in default is a plain list of
  // filter names (numeric keys, name as value). NOTE(review): a saved setting
  // is presumably a checkboxes map keyed by filter name with an empty value
  // for unchecked entries - confirm against the settings form. Taking
  // array_keys() of the default list would yield 0..n and never the filter
  // names, so the name is picked from the value or the key depending on the
  // shape.
  $filters_blacklist = array();
  $blacklist_setting = variable_get('linkchecker_filter_blacklist', explode('|', LINKCHECKER_DEFAULT_FILTER_BLACKLIST));
  foreach ((array) $blacklist_setting as $key => $value) {
    if (empty($value)) {
      continue;
    }
    $filters_blacklist[] = is_int($key) ? (string) $value : (string) $key;
  }

  // Get a complete list of filters, ordered properly, without the
  // blacklisted ones. The comparison is strict to rule out accidental
  // matches between filter names and numbers.
  $filters = array();
  foreach (filter_list_format($format) as $filter) {
    if (!in_array($filter->module . '/' . $filter->delta, $filters_blacklist, TRUE)) {
      $filters[] = $filter;
    }
  }

  // Give filters the chance to escape HTML-like data such as code or formulas.
  foreach ($filters as $filter) {
    $text = module_invoke($filter->module, 'filter', 'prepare', $filter->delta, $format, $text, $cache_id);
  }

  // Perform filtering.
  foreach ($filters as $filter) {
    $text = module_invoke($filter->module, 'filter', 'process', $filter->delta, $format, $text, $cache_id);
  }

  // Store in cache with a minimum expiration time of 1 day.
  if ($cache) {
    cache_set($cache_id, 'cache_filter', $text, time() + 60 * 60 * 24);
  }

  return $text;
}

/**
 * Get the path of an URL.
 *
 * The result is the URL up to and including the last slash of its path;
 * query string and fragment are dropped. Example:
 * 'http://example.com/dir/page?x=1' becomes 'http://example.com/dir/'.
 *
 * @param $url
 *   The http/https URL to parse.
 *
 * @return
 *   Full qualified URL with absolute path of the URL, or NULL if the URL
 *   cannot be parsed, has no host or is not an http/https URL.
 */
function _linkchecker_absolute_content_path($url) {

  // Nothing to parse.
  if (!is_string($url) || $url === '') {
    return NULL;
  }

  // Parse the URL and make sure we can handle the schema.
  $uri = @parse_url($url);
  if ($uri == FALSE) {
    return NULL;
  }
  if (!isset($uri['scheme']) || !isset($uri['host'])) {
    return NULL;
  }

  // Break if the schema is not supported.
  if (!in_array($uri['scheme'], array('http', 'https'), TRUE)) {
    return NULL;
  }

  $scheme = $uri['scheme'] . '://';

  // Credentials; the password part is optional.
  $user = '';
  if (isset($uri['user'])) {
    $pass = (isset($uri['pass']) && $uri['pass'] !== '') ? ':' . $uri['pass'] : '';
    $user = $uri['user'] . $pass . '@';
  }

  // An explicit port is kept, except for port 80 on plain http where it is
  // the default. Port 80 on https is not the default and must stay.
  $host = $uri['host'];
  if (isset($uri['port']) && !($uri['scheme'] == 'http' && $uri['port'] == 80)) {
    $host .= ':' . $uri['port'];
  }

  $path = isset($uri['path']) ? $uri['path'] : '/';

  // Glue the URL variables.
  $absolute_url = $scheme . $user . $host . $path;

  // Find the last slash and remove all after the last slash to get the path.
  // strrpos() returns a byte offset, so the byte based substr() has to be
  // used here; a character based substring would cut at the wrong position
  // if the URL contains multibyte characters.
  $last_slash = strrpos($absolute_url, '/');
  return substr($absolute_url, 0, $last_slash + 1);
}

/**
 * Verifies against the url blacklist, if the link status should be checked or not.
 *
 * The blacklist is the 'linkchecker_disable_link_check_for_urls' setting: one
 * entry per line, each entry is matched as a literal substring of the URL.
 *
 * @param $url
 *   The URL to verify.
 *
 * @return
 *   FALSE if the URL contains one of the blacklisted entries, otherwise TRUE.
 */
function _linkchecker_link_check_status_filter($url) {
  $urls = variable_get('linkchecker_disable_link_check_for_urls', LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS);
  if (empty($urls)) {
    return TRUE;
  }

  // Quote every non-empty line for use in a regular expression. Blank lines
  // (e.g. a trailing newline in the setting) must be skipped: an empty
  // alternative would match every URL and disable all link checks.
  $quoted_entries = array();
  foreach (preg_split('/(\\r\\n?|\\n)/', $urls) as $entry) {
    $entry = trim($entry);
    if ($entry !== '') {
      $quoted_entries[] = preg_quote($entry, '/');
    }
  }
  if (empty($quoted_entries)) {
    return TRUE;
  }

  return !preg_match('/' . implode('|', $quoted_entries) . '/', $url);
}

/**
 * Tells whether a value is one of the response codes accepted in form input.
 *
 * @param $code
 *   An numeric response code.
 * @return
 *   TRUE if $code is a key of the list of known response codes below,
 *   otherwise FALSE.
 */
function _linkchecker_isvalid_response_code($code) {
  // 1xx.
  $informational = array(
    100 => 'Continue',
    101 => 'Switching Protocols',
  );

  // 2xx.
  $successful = array(
    200 => 'OK',
    201 => 'Created',
    202 => 'Accepted',
    203 => 'Non-Authoritative Information',
    204 => 'No Content',
    205 => 'Reset Content',
    206 => 'Partial Content',
  );

  // 3xx.
  $redirection = array(
    300 => 'Multiple Choices',
    301 => 'Moved Permanently',
    302 => 'Found',
    303 => 'See Other',
    304 => 'Not Modified',
    305 => 'Use Proxy',
    307 => 'Temporary Redirect',
  );

  // 4xx.
  $client_error = array(
    400 => 'Bad Request',
    401 => 'Unauthorized',
    402 => 'Payment Required',
    403 => 'Forbidden',
    404 => 'Not Found',
    405 => 'Method Not Allowed',
    406 => 'Not Acceptable',
    407 => 'Proxy Authentication Required',
    408 => 'Request Time-out',
    409 => 'Conflict',
    410 => 'Gone',
    411 => 'Length Required',
    412 => 'Precondition Failed',
    413 => 'Request Entity Too Large',
    414 => 'Request-URI Too Large',
    415 => 'Unsupported Media Type',
    416 => 'Requested range not satisfiable',
    417 => 'Expectation Failed',
  );

  // 5xx.
  $server_error = array(
    500 => 'Internal Server Error',
    501 => 'Not Implemented',
    502 => 'Bad Gateway',
    503 => 'Service Unavailable',
    504 => 'Gateway Time-out',
    505 => 'HTTP Version not supported',
  );

  // The keys of the groups do not overlap, so the union holds all of them.
  $known_codes = $informational + $successful + $redirection + $client_error + $server_error;

  return array_key_exists($code, $known_codes);
}

/**
 * Tells whether nodes of a given type are scanned for links.
 *
 * @param $node_type
 *   The node type to look up in the 'linkchecker_scan_nodetypes' setting.
 * @return
 *   TRUE if node type should be scanned, otherwise FALSE.
 */
function _linkchecker_scan_nodetype($node_type = NULL) {
  // The setting maps node types to a value; only types with a non-empty
  // value are enabled.
  $setting = variable_get('linkchecker_scan_nodetypes', array());
  $enabled_types = array_keys(array_filter($setting));

  return in_array($node_type, $enabled_types);
}

/**
 * Unpublishes all nodes having the specified link id.
 *
 * Every node referenced in linkchecker_nodes for the link is loaded, saved
 * with status 0 and a watchdog entry is written. References to nodes that can
 * no longer be loaded are skipped.
 *
 * @param $lid
 *   A link ID that have reached a defined failcount.
 */
function _linkchecker_unpublish_nodes($lid) {
  $res = db_query("SELECT * FROM {linkchecker_nodes} WHERE lid = %d", $lid);
  while ($row = db_fetch_object($res)) {
    $node = node_load(array(
      'nid' => $row->nid,
    ));

    // The reference may be stale (node deleted in the meantime). Saving the
    // failed load result would create an empty node, so skip it.
    if (!is_object($node) || empty($node->nid)) {
      continue;
    }

    $node->status = 0;
    node_save($node);
    watchdog('linkchecker', t('Set @type %title to unpublished.', array(
      '@type' => $node->type,
      '%title' => $node->title,
    )));

    // TODO: Add email notification for authors.
  }
}

/**
 * Load comment as array.
 *
 * @param $cid
 *   The comment id to select from the comments table.
 *
 * @return
 *   The value db_fetch_array() returns for the selected row.
 */
function _linkchecker_comment_load($cid) {
  $result = db_query('SELECT * FROM {comments} WHERE cid = %d', $cid);
  return db_fetch_array($result);
}

/**
 * Load link as array.
 *
 * @param $lid
 *   The link id to select from the linkchecker_links table.
 *
 * @return
 *   The value db_fetch_array() returns for the selected row.
 */
function linkchecker_link_load($lid) {
  $result = db_query("SELECT * FROM {linkchecker_links} WHERE lid = %d", $lid);
  return db_fetch_array($result);
}

Functions

Namesort descending Description
linkchecker_admin_report Menu callback for reporting.
linkchecker_admin_settings_form
linkchecker_admin_settings_form_submit
linkchecker_admin_settings_form_validate
linkchecker_block_add_form_submit Custom submit handler for block add page.
linkchecker_block_box_delete_form_submit Custom submit handler for block delete page.
linkchecker_block_configure_form_submit Custom submit handler for block configure page.
linkchecker_comment
linkchecker_cron Implementation of hook_cron().
linkchecker_form_alter
linkchecker_help Implementation of hook_help().
linkchecker_link_edit_form Menu callback for link setting.
linkchecker_link_edit_form_submit
linkchecker_link_load Load link as array.
linkchecker_menu Implementation of hook_menu().
linkchecker_nodeapi
linkchecker_perm Implementation of hook_perm().
_linkchecker_absolute_content_path Get the path of an URL.
_linkchecker_add_box_links Add box links to database.
_linkchecker_add_comment_links Add comment links to database.
_linkchecker_add_node_links Add node links to database.
_linkchecker_batch_comments_import_op Batch operation: Load all comments, 100 by hundred.
_linkchecker_batch_import Trigger batch import job.
_linkchecker_batch_import_boxes Batch: Load all boxes 100 by hundred.
_linkchecker_batch_import_boxes_op Batch operation: Load all boxes, 100 by 100.
_linkchecker_batch_import_comments Batch: Load all comments 100 by hundred.
_linkchecker_batch_import_comments_op Batch operation: Load all comments, 100 by 100.
_linkchecker_batch_import_nodes Batch: Load all nodes 100 by hundred.
_linkchecker_batch_node_import_op Batch operation: Load all nodes, 100 by hundred.
_linkchecker_box_links_missing Returns an array of box references missing in the linkchecker_boxes table.
_linkchecker_check_markup Customized clone of core check_markup() function with additional filter blacklist.
_linkchecker_cleanup_box_references Cleanup no longer used box references to links in the linkchecker_boxes table.
_linkchecker_cleanup_comment_references Cleanup no longer used comment references to links in the linkchecker_comments table.
_linkchecker_cleanup_links Run periodically via cron and delete all links without references.
_linkchecker_cleanup_node_references Cleanup no longer used node references to links in the linkchecker_nodes table.
_linkchecker_comment_links_missing Returns an array of comment references missing in the linkchecker_comments table.
_linkchecker_comment_load Load comment as array.
_linkchecker_delete_box_links Remove all box references to links in the linkchecker_boxes table.
_linkchecker_delete_comment_links Remove all comment references to links in the linkchecker_comments table.
_linkchecker_delete_node_links Remove all node references to links in the linkchecker_nodes table.
_linkchecker_extract_links Extract links from content.
_linkchecker_isvalid_response_code Defines the list of allowed response codes for form input validation.
_linkchecker_link_check_status_filter Verifies against the url blacklist, if the link status should be checked or not.
_linkchecker_link_replace Replaces old link with new link in text.
_linkchecker_node_links_missing Returns an array of node references missing in the linkchecker_nodes table.
_linkchecker_scan_box_links Scan specified box for links. Helper function for job_queue scans.
_linkchecker_scan_comment_links Scan specified comment for links. Helper function for job_queue scans.
_linkchecker_scan_nodetype Should the defined node type be scanned for links?
_linkchecker_scan_node_links Scan specified node for links. Helper function for job_queue scans.
_linkchecker_status_handling Status code handling.
_linkchecker_unpublish_nodes Unpublishes all nodes having the specified link id.

Constants

Namesort descending Description
LINKCHECKER_DEFAULT_FILTER_BLACKLIST A list of blacklisted filters the module does not need to run for the link extraction process. These filters only eat processing time or hold references to other nodes.
LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS A list of domain names reserved for use in documentation and not available for registration. See RFC 2606, Section 3 for more information.
LINKCHECKER_SCAN_MAX_LINKS_PER_RUN Defines the maximum limit of links collected in one chunk if content is scanned for links. A value that is too high may overload the database server.