You are here

linkchecker.module in Link checker 5

This module periodically checks HTML links referenced by Drupal nodes. Developed and maintained by Marek Tichy, marek@ecn.cz

File

linkchecker.module
View source
<?php

// $Id$

/**
 * @file
 * This module periodically checks HTML links referenced by Drupal nodes.
 * Developed and maintained by Marek Tichy, marek@ecn.cz
 */

/**
 * Implementation of hook_help().
 *
 * Supplies the text shown on the module's admin help page.
 *
 * @param $section
 *   The help section identifier being requested.
 *
 * @return
 *   The translated help paragraph for 'admin/help#linkchecker'; nothing
 *   (NULL) for every other section.
 */
function linkchecker_help($section) {
  // Loose comparison on purpose: it mirrors what a switch/case would do.
  if ($section == 'admin/help#linkchecker') {
    $help_text = t("This module provides an aid to finding broken links on your site. It periodically checks contents of all public nodes, tries to find any html links and check for their validity. It reports broken links through the admin interface.");
    return "<p>" . $help_text . "</p>";
  }
}

/**
 * Implementation of hook_menu().
 *
 * Registers three pages:
 *  - admin/settings/linkchecker: the settings form,
 *  - linkchecker/report: the broken links report,
 *  - admin/linkchecker/debug: runs the cron hook with debug output enabled.
 *
 * @param $may_cache
 *   TRUE when the cacheable part of the menu is being built. All items of
 *   this module are static, so they are only returned in that pass.
 *
 * @return
 *   An array of menu item definitions (empty when $may_cache is FALSE).
 */
function linkchecker_menu($may_cache) {
  $items = array();

  // None of the paths below depend on the current request, so there is
  // nothing to contribute to the non-cached pass.
  if (!$may_cache) {
    return $items;
  }

  $items[] = array(
    'path' => 'admin/settings/linkchecker',
    'title' => t('Link checker'),
    'description' => t('Link checker configuration'),
    'callback' => 'drupal_get_form',
    'callback arguments' => array(
      'linkchecker_admin_settings',
    ),
    'access' => user_access('administer linkchecker'),
    'type' => MENU_NORMAL_ITEM,
  );

  // The two callbacks below take no parameters, so no callback arguments are
  // declared. An undefined $path variable used to be appended to these paths
  // and passed as an argument; it always evaluated to an empty value.
  $items[] = array(
    'path' => 'linkchecker/report',
    'title' => t('Broken links report'),
    'callback' => 'linkchecker_report',
    'access' => user_access('access linkchecker'),
    'type' => MENU_NORMAL_ITEM,
  );
  $items[] = array(
    'path' => 'admin/linkchecker/debug',
    'title' => t('Linkchecker debug mode'),
    'callback' => 'linkchecker_debug_run',
    'access' => user_access('administer linkchecker'),
    'type' => MENU_NORMAL_ITEM,
  );

  return $items;
}
/**
 * Form builder for the Link checker settings page.
 *
 * Every element is named after the persistent variable it edits, and the
 * form is wrapped with system_settings_form() before being returned.
 *
 * @return
 *   The settings form array.
 */
function linkchecker_admin_settings() {
  $form = array();

  $form['instructions'] = array(
    '#type' => 'markup',
    '#value' => '<div>' . t('Configure Link checker core parameters.') . '</div>',
  );

  // Values are either a flag (0 = never, 1 = once on the next cron run) or a
  // maximum age in seconds; linkchecker_cron() compares the age of the last
  // full rebuild against this value.
  $form['linkchecker_rebuild'] = array(
    '#default_value' => variable_get('linkchecker_rebuild', 1),
    '#type' => 'select',
    '#title' => t('Delete all existing reports and scheduled tasks and start linkchecker from scratch'),
    '#description' => t("Choose how often should linkchecker inspect all existing nodes again. Meanwhile, the linkchecker operates in an incremental mode, when only newly added and updated nodes are being checked."),
    '#options' => array(
      "0" => t("Never"),
      "1" => t("Next cron run"),
      "604800" => t("Weekly"),
      "2419200" => t("Monthly"),
    ),
  );
  $form['linkchecker_fqdn_only'] = array(
    '#default_value' => variable_get('linkchecker_fqdn_only', 1),
    '#type' => 'checkbox',
    '#title' => t('Consider only fully qualified URLs ( not local links )'),
    '#description' => "",
  );
  $form['linkchecker_ignore_responses'] = array(
    '#default_value' => variable_get('linkchecker_ignore_responses', "401\n403"),
    '#type' => 'textarea',
    '#title' => t("Don't treat those response codes as errors"),
    '#description' => t("One per line, HTTP code only (e.g. 403) or full response string (e.g. 402 Payment Required)"),
  );
  $form['linkchecker_maxtime'] = array(
    '#default_value' => variable_get('linkchecker_maxtime', 30),
    '#type' => 'textfield',
    '#title' => t('Maximum runtime'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t('Maximum allowed time (in seconds) that can be spent on link checking every cron job run. The default value is 30 seconds. '),
  );
  $form['linkchecker_socket_timeout'] = array(
    '#default_value' => variable_get('linkchecker_socket_timeout', 3),
    '#type' => 'textfield',
    '#title' => t('Socket timeout (seconds)'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t('If the linkchecker does not get at least some response from a remote site within the socket timeout, the link is considered broken.'),
  );
  $form['linkchecker_max_links_per_node'] = array(
    '#default_value' => variable_get('linkchecker_max_links_per_node', 0),
    '#type' => 'textfield',
    '#title' => t('Max links per node'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t('Set this limit if you have nodes with many links for which the link checking often takes longer than maximum runtime (you will see messages in the log files). 0 means no limit (default).'),
  );
  $form['linkchecker_remove_after'] = array(
    '#default_value' => variable_get('linkchecker_remove_after', 30),
    '#type' => 'textfield',
    '#title' => t('Days to keep reports'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t('If the node is not fixed within this number of days, we remove it.'),
  );
  $form['linkchecker_give_up'] = array(
    '#default_value' => variable_get('linkchecker_give_up', 5),
    '#type' => 'textfield',
    '#title' => t('Max attempts'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t('If the linkchecker keeps timing out on a node, give up on it after this number of attempts.'),
  );

  return system_settings_form($form);
}

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The permission names defined by this module: one for viewing the report
 *   and one for administering the module.
 */
function linkchecker_perm() {
  $permissions = array();
  $permissions[] = 'access linkchecker';
  $permissions[] = 'administer linkchecker';
  return $permissions;
}

/**
 * Menu callback: prints the broken links report.
 *
 * Lists every stored result (node id, URL, response) joined with its task in
 * a sortable, paged table and prints the themed page without blocks.
 *
 * The page title is not set here, so the title of the menu item applies.
 */
function linkchecker_report() {
  $header = array(
    array(
      'data' => t('Node'),
      'field' => 'nodeid',
      'sort' => 'desc',
    ),
    array(
      'data' => t('URL'),
      'field' => 'url',
      'sort' => 'desc',
    ),
    array(
      'data' => t('Error'),
      'field' => 'response',
    ),
  );

  $limit = 3000;
  $join = '`linkchecker_tasks` INNER JOIN linkchecker_results ON linkchecker_tasks.taskid = linkchecker_results.taskid';

  // The count query must cover the same join as the listing query, otherwise
  // the pager computes a wrong number of pages.
  $result = pager_query('SELECT * FROM ' . $join . ' ' . tablesort_sql($header), $limit, 0, 'SELECT COUNT(*) FROM ' . $join);

  // Start from an empty list so the table renders when nothing is broken.
  $rows = array();
  while ($row = db_fetch_object($result)) {
    $rows[] = array(
      // l() builds the link relative to the site base path.
      l($row->nodeid, 'node/' . $row->nodeid),
      // URL and response originate from node content and remote servers, so
      // they must be escaped before being printed.
      check_plain($row->url),
      check_plain($row->response),
    );
  }

  $output = theme('table', $header, $rows);
  $output .= theme('pager', NULL, $limit, 0);
  print theme('page', $output, FALSE);
}
/**
 * Menu callback: runs the cron hook once with debug output switched on.
 *
 * Sets the global debug flag read by d_(), calls linkchecker_cron() and then
 * terminates the request with a closing message.
 */
function linkchecker_debug_run() {
  // d_() only echoes while this global flag is truthy.
  $GLOBALS['_LINKCHECKER_DEBUG'] = true;

  linkchecker_cron();

  $closing_message = "<br /> *** " . t("Debugging finished") . "  ***";
  exit($closing_message);
}

/**
 * Implementation of hook_cron().
 *
 * One run performs, in this order:
 *  1. Reads the special task record with taskid 0 to learn when the previous
 *     run happened.
 *  2. Asks lc_CheckRuntime() whether time is left; if not, nothing else is
 *     done in this run.
 *  3. Optionally rebuilds both linkchecker tables from scratch, depending on
 *     the 'linkchecker_rebuild' variable.
 *  4. Removes aged tasks and results that no longer have a task.
 *  5. Queues every node changed since the previous run.
 *  6. Processes queued tasks (status below 999) until they are all done or
 *     lc_DoLinkChecks() reports that the time is up.
 *  7. Marks the special record as "not running" and logs a summary.
 */
function linkchecker_cron() {
  // The `update` column of record 0 holds the time of the previous start.
  $res = db_query("SELECT * FROM `linkchecker_tasks` WHERE `taskid` = 0;");
  $foo = db_fetch_array($res);
  $finish = false;
  $debug_report = "Linkchecker run";
  // 0 when the record is missing, which makes every node look "changed".
  $lastrun = $foo["update"] ? strtotime($foo["update"]) : 0;
  d_("Lastrun: {$lastrun}");

  // Check that the link checking process has not exceeded its maximum run
  // time. Nothing below runs when lc_CheckRuntime() returns false.
  $maxtime = variable_get('linkchecker_maxtime', 30);
  if (lc_CheckRuntime($maxtime) == false) {
    d_("Finishing early");
    $finish = true;
  }
  else {

    // Check if the table needs rebuilding. $rebuild is 0 (never), 1 (once,
    // on this run) or a maximum age in seconds.
    $rebuildnow = false;
    $rebuild = variable_get('linkchecker_rebuild', 1);
    if ($rebuild) {
      $res = db_query("SELECT * FROM `linkchecker_tasks` WHERE taskid=0;");
      $foo = db_fetch_array($res);
      if (empty($foo)) {

        // the 0 record is missing, add it
        $sql = "INSERT INTO `linkchecker_tasks` VALUES (0," . lc_now_to_int() . ",0,NOW());";
        db_query($sql);
        d_("Cannot find time record, adding it: {$sql}.");
        $foo["status"] = 0;
      }
      else {
        // For record 0 the nodeid column stores the date of the last rebuild
        // as written by lc_now_to_int().
        $age = lc_int_to_age($foo["nodeid"]);
        d_("The entire site check has been initiated less than {$age} seconds ago.");
        d_("Maxage is set to {$rebuild}.");
        if ($age > $rebuild) {
          $rebuildnow = true;
          d_("Should rebuild now");
        }
      }
    }
    if ($rebuildnow) {
      d_("Completely rebuilding the table");
      watchdog("linkchecker", t("Rebuilding the entire linkchecker database from scratch"));
      db_query("TRUNCATE TABLE linkchecker_tasks;");

      //would be nice to find a slightly less destructive way
      db_query("TRUNCATE TABLE linkchecker_results;");
      // Recreate the special record with today's date as the rebuild date.
      db_query("INSERT INTO `linkchecker_tasks` VALUES (0," . lc_now_to_int() . ",0,NOW());");
      // Forces the "add new tasks" step below to queue every node.
      $lastrun = 0;
      if ($rebuild == 1) {

        // "Next cron run" is a one-shot setting: switch it to "never".
        variable_set('linkchecker_rebuild', 0);
      }
    }

    // Tasks table maintenance
    // - garbage collect: remove tasks whose `update` time is older than the
    //   configured number of days.
    $maxage = time() - 24 * 60 * 60 * variable_get('linkchecker_remove_after', 30);
    d_("SELECT * FROM `linkchecker_tasks` WHERE `update` < FROM_UNIXTIME({$maxage})");
    $res = db_query("SELECT * FROM `linkchecker_tasks` WHERE `update` < FROM_UNIXTIME({$maxage})");
    while ($task = db_fetch_array($res)) {
      d_("Remove aged task: {$task['taskid']}");
      lc_RemoveTask($task);
    }

    // Find and delete orphaned reports (results whose task no longer exists)
    $res = db_query("SELECT DISTINCT taskid FROM `linkchecker_results`;");
    while ($task = db_fetch_array($res)) {
      $res2 = db_query("SELECT * FROM `linkchecker_tasks` WHERE `taskid` = " . $task["taskid"] . ";");
      if (!db_result($res2)) {
        d_("Remove orphaned report for task : " . $task["taskid"]);
        lc_RemoveTask($task);
      }
    }

    //  - add new tasks: one per node changed since the previous run
    $res = db_query("SELECT * FROM `node` WHERE `changed` > '{$lastrun}'");
    $i = 0;
    while ($foo = db_fetch_array($res)) {
      $task = array(
        "nodeid" => $foo["nid"],
      );
      d_("Pushing node: {$foo['nid']}");
      lc_PushTask($task);
      $i++;
    }
    $debug_report .= ", loaded {$i} new or updated nodes";

    //  - load tasks one by one and process them. Record 0 (nodeid check) and
    //    already reported tasks (status 999) are skipped; tasks with the
    //    fewest failed attempts come first.
    $res = db_query("SELECT * FROM `linkchecker_tasks` WHERE `nodeid` > 0 AND `status` < 999 ORDER BY status ASC;");
    // $i counts processed tasks, $j counts tasks that produced a report.
    $i = 0;
    $j = 0;
    while ($task = db_fetch_array($res)) {
      $i++;
      d_("Processing task {$task['taskid']}");
      $report = array();
      if (lc_DoLinkChecks($task, $maxtime, $report)) {
        if (empty($report)) {
          // All links are fine: the task is no longer needed.
          lc_RemoveTask($task);
        }
        else {
          lc_AddReport($task, $report);
          $j++;
        }
      }
      else {
        // The check ran out of time. The status column doubles as the
        // number of failed attempts for this task.
        $newstatus = $task["status"] + 1;
        $max_attempts = variable_get('linkchecker_give_up', 5);
        if ($newstatus > $max_attempts) {
          lc_RemoveTask($task);
          watchdog("linkchecker", "Linkchecker was unable to check node {$task['nodeid']} for {$max_attempts} times, giving up on it for good.");
        }
        else {
          db_query("UPDATE `linkchecker_tasks` SET `status` = " . ($task["status"] + 1) . ", `update` = NOW() WHERE taskid = " . $task["taskid"] . ";");
          watchdog("linkchecker", "Linkchecker was unable to check node " . $task["nodeid"] . " within the given maximum runtime.");
        }
        d_("Finishing early");
        $debug_report .= ", processed {$i}";
        $debug_report .= ", problems found in {$j} nodes, finishing early (not enough time to process all)";
        $finish = true;
        break;
      }
    }
    // Mark the process as "not running" and remember when this run ended.
    db_query("UPDATE `linkchecker_tasks` SET `status` = 0, `update` = NOW() WHERE taskid=0;");
    if (!$finish) {
      d_("Finishing properly");
      $debug_report .= ", processed {$i}";
      $debug_report .= ", problems found in {$j} nodes";
    }
    watchdog("linkchecker", $debug_report);
  }
}
/**
 * Checks every link found in the node that belongs to a task.
 *
 * @param $task
 *   Task row as an array; 'taskid' and 'nodeid' are used.
 * @param $maxtime
 *   Maximum run time in seconds, handed to lc_CheckRuntime() before each
 *   link is tested.
 * @param $report
 *   Output parameter. One array('taskid', 'url', 'response') entry is
 *   appended for every link that answers with a status of 400 or above, or
 *   a negative pseudo status, unless lc_ignore_response() says otherwise.
 *
 * @return
 *   TRUE when all links were examined (also when the node could not be
 *   loaded, in which case nothing is reported), FALSE when the time ran out.
 */
function lc_DoLinkChecks($task, $maxtime, &$report) {
  $nid = lc_Task2Nid($task);
  d_("Loading node {$nid}");
  $node = node_load($nid);
  if (!$node) {
    // Nothing can be rendered for a node that does not load; report the task
    // as finished without findings.
    d_("Node {$nid} could not be loaded, nothing to check");
    return true;
  }

  $uurls = lc_RandReduce(lc_GetUniqueUrls(node_view($node)));
  if (!is_array($uurls)) {
    // The helpers may hand back FALSE for empty content.
    $uurls = array();
  }
  d_("Checking " . count($uurls) . " links.");

  foreach ($uurls as $url) {
    if (lc_CheckRuntime($maxtime) == false) {
      d_("Time is up, bailing out");
      return false;
    }
    d_("<br />Testing {$url} found in node {$nid}");
    $r = trim((string) lc_GetResponse($url));

    // The status code is the first space separated token of the response.
    // explode() replaces split(), which no longer exists in PHP 7.
    list($code) = explode(" ", $r);
    d_("<br />Got response code: {$code} ( {$r} )");

    // Compare numerically; an empty or non numeric code counts as 0 and is
    // therefore not treated as an error.
    $status = (int) $code;
    if ($status >= 400 || $status < 0) {
      if (!lc_ignore_response($r, $code)) {
        $report[] = array(
          'taskid' => $task["taskid"],
          'url' => $url,
          'response' => $r,
        );
      }
      else {
        d_("<br />Ignoring response: {$code} ( {$r} )");
      }
    }
  }
  return true;
}

/**
 * Converts a task to the id of the node it refers to.
 *
 * @param $task
 *   Task row as an array.
 *
 * @return
 *   The value stored under the 'nodeid' key.
 */
function lc_Task2Nid($task) {
  $nid = $task["nodeid"];
  return $nid;
}

/**
 * Removes a task and all results stored for it.
 *
 * @param $task
 *   Task row as an array; only 'taskid' is used. Nothing is deleted when
 *   the id is not numeric.
 */
function lc_RemoveTask($task) {
  $task_id = $task["taskid"];
  d_("Removing task {$task_id}");

  if (!is_numeric($task_id)) {
    // Error: invalid task id
    return;
  }

  db_query("DELETE FROM `linkchecker_tasks` WHERE `taskid` = {$task_id};");
  db_query("DELETE FROM `linkchecker_results` WHERE `taskid` = {$task_id};");
}

/**
 * Queues a node for (re)checking.
 *
 * If a task with status 999 (already reported, see lc_AddReport()) exists
 * for the same node, that task and its results are removed first.
 *
 * @param $task
 *   Array with a 'nodeid' key holding the id of the node to queue.
 */
function lc_PushTask($task) {
  // Values are passed to db_query() as typed placeholders instead of being
  // interpolated into the SQL text.
  $nodeid = (int) $task['nodeid'];

  // Task ids are allocated as "highest existing id + 1".
  $res = db_query("SELECT max( taskid ) AS maxid FROM `linkchecker_tasks`");
  $foo = db_fetch_array($res);
  $newid = ($foo && $foo["maxid"]) ? $foo["maxid"] + 1 : 1;

  // Have we seen and processed this node ? If yes, delete it first
  $res = db_query("SELECT * FROM `linkchecker_tasks` where `status` = 999 AND `nodeid` = %d", $nodeid);
  $foo = db_fetch_array($res);
  $taskid = $foo ? $foo["taskid"] : NULL;
  if ($taskid) {
    lc_RemoveTask(array(
      'taskid' => $taskid,
    ));
    d_("Removing task {$taskid} and it's reports since node {$nodeid} has been updated");
  }

  // Push task into a queue for (re)checking
  db_query("INSERT INTO `linkchecker_tasks` ( `taskid` , `nodeid` , `status` , `update` ) VALUES (%d, %d, 0, NOW())", $newid, $nodeid);
}
/**
 * Stores the broken links found for a task and marks the task as reported.
 *
 * Existing results of the task are replaced, then the task status is set to
 * 999 so that the cron run does not process it again.
 *
 * @param $task
 *   Task row as an array; only 'taskid' is used.
 * @param $report
 *   List of arrays with 'url' and 'response' keys, as produced by
 *   lc_DoLinkChecks().
 */
function lc_AddReport($task, $report) {
  $taskid = $task["taskid"];

  db_query("DELETE FROM `linkchecker_results` WHERE `taskid` = %d", $taskid);
  foreach ($report as $r) {
    // URL and response come from node content and from remote servers, so
    // they must go through placeholders to be escaped by the database layer.
    db_query("INSERT INTO `linkchecker_results` VALUES (%d, '%s', '%s')", $taskid, $r["url"], $r["response"]);
  }
  db_query("UPDATE `linkchecker_tasks` SET `status` = 999, `update` = NOW() WHERE taskid = %d", $taskid);
}

/**
 * Tracks how long the link checking process has been running.
 *
 * The state lives in the task record with taskid 0:
 *   taskid  - 0
 *   nodeid  - timestamp of the last database rebuild
 *   status  - 1 - running, 0 - not running
 *   update  - timestamp of the last start
 *
 * When the process is not running yet, it is marked as running and the
 * elapsed time counts as 0. When the elapsed time has reached $maxtime, the
 * process is marked as not running again.
 *
 * @param $maxtime
 *   Maximum allowed run time in seconds.
 *
 * @return
 *   TRUE while there is time left, FALSE once $maxtime has been reached.
 */
function lc_CheckRuntime($maxtime) {
  d_("Checking time, max time is {$maxtime} seconds");

  $timer = db_fetch_array(db_query("SELECT * FROM `linkchecker_tasks` WHERE taskid=0;"));
  if (empty($timer)) {
    // The 0 record is missing: create it and treat the process as idle.
    $sql = "INSERT INTO `linkchecker_tasks` VALUES (0," . lc_now_to_int() . ",0,NOW());";
    db_query($sql);
    d_("Cannot find time record, adding it: {$sql}.");
    $timer = array("status" => 0);
  }

  $elapsed = 0;
  if ($timer["status"] == 0) {
    // Not running yet: start the clock now.
    d_("Setting the process state to 1 (running)");
    db_query("UPDATE `linkchecker_tasks` SET `status` = 1, `update` = NOW() WHERE taskid=0;");
  }
  else {
    // Already running: measure the time since the recorded start.
    $elapsed = time() - strtotime($timer["update"]);
    d_("Time record found, it says we have been running since {$timer['update']}, which is {$elapsed} seconds ago.");
  }

  if ($elapsed < $maxtime) {
    d_("Time left:" . ($maxtime - $elapsed));
    d_("Memory usage:" . number_format(memory_get_usage(), 0, '.', ',') . " bytes");
    return true;
  }

  d_("No time left, switching to state 0 (not running)");
  db_query("UPDATE `linkchecker_tasks` SET `status` = 0, `update` = NOW() WHERE taskid=0;");
  return false;
}

/************************ Functions for URL checking *******************/

/**
 * Extracts the link targets of all <a> and <area> tags from HTML.
 *
 * Only two kinds of targets are returned:
 *  - absolute URLs starting with http://, returned unchanged;
 *  - local paths starting with "/", turned into http://SERVER_NAME/path, and
 *    only when the 'linkchecker_fqdn_only' setting is switched off.
 * Targets starting with a quote, "#", "[" or "%" never match the pattern.
 *
 * @param $html
 *   The HTML to scan.
 *
 * @return
 *   List of URLs in document order, possibly containing duplicates.
 */
function lc_GetUrls($html) {
  // Extended (x) and case-insensitive (i) pattern. The former "e" modifier
  // was dropped: it has no meaning for preg_match_all() and PHP 7+ rejects
  // the whole pattern because of it. Capture group 5 is the href value.
  $pattern = "@\n  <\n  (a|area)\n  \\s\n  (.(?!(href)))*?\n  \\s*\n   (href\\s*=\\s*['\"]?\n    ([^\\'#\\[%\">][^\\'\">]*[^\\'\"> ])\n    \\s*['\"]?)\n  @ix";

  $links = array();
  $matches = array();
  if (!preg_match_all($pattern, (string) $html, $matches) || empty($matches[5])) {
    // No link found, or the match failed.
    return $links;
  }

  foreach ($matches[5] as $href) {
    if (preg_match("|^http://(.*)|i", $href)) {
      $links[] = $href;
    }
    elseif (preg_match("|^/(.*)|i", $href)) {
      if (variable_get('linkchecker_fqdn_only', 1) == 0) {
        $links[] = "http://" . $_SERVER["SERVER_NAME"] . $href;
      }
    }
  }
  return $links;
}

/**
 * Returns the distinct URLs linked from a piece of HTML.
 *
 * @param $html
 *   The HTML to scan.
 *
 * @return
 *   List of URLs found by lc_GetUrls() with duplicates removed, keeping the
 *   first occurrence of each. An empty list when $html is empty, so callers
 *   can always count and iterate the result.
 */
function lc_GetUniqueUrls($html) {
  if (!$html) {
    return array();
  }

  // array_unique() keeps the first occurrence but preserves the original
  // keys; array_values() restores a gap-free 0..n-1 index.
  return array_values(array_unique(lc_GetUrls($html)));
}

/**
 * Sends a HEAD request for a URL and returns the raw response.
 *
 * The connection is made over plain HTTP to the host of the URL, on the
 * port given in the URL or 80 when there is none. At most 8192 bytes of the
 * answer are read, which is enough to obtain the status line.
 *
 * @param $url
 *   Absolute URL to request.
 *
 * @return
 *   The beginning of the raw response (status line and headers), or FALSE
 *   when the URL has no host or the connection cannot be opened.
 */
function lc_GetHeaders($url) {
  $info = @parse_url($url);
  if (empty($info["host"])) {
    // Malformed URL: there is nothing to connect to.
    return false;
  }

  $timeout = variable_get('linkchecker_socket_timeout', 3);
  $port = isset($info["port"]) ? (int) $info["port"] : 80;

  $fp = @fsockopen($info["host"], $port, $errno, $errstr, $timeout);
  if (!$fp) {
    return false;
  }

  // The timeout given to fsockopen() only covers connecting; apply it to
  // reading as well so a silent server cannot stall the cron run.
  stream_set_timeout($fp, max(1, (int) $timeout));

  // Build the request target: a non-empty path plus the optional query.
  $path = empty($info["path"]) ? "/" : $info["path"];
  $path = str_replace(" ", "%20", $path);
  $query = isset($info["query"]) ? "?" . $info["query"] : "";

  // A non-default port has to be part of the Host header.
  $host = $info["host"] . ($port != 80 ? ":" . $port : "");

  // Every header line must end with CRLF; an empty line ends the request.
  $out = "HEAD " . $path . $query . " HTTP/1.0\r\n";
  $out .= "Host: " . $host . "\r\n";
  $out .= "Connection: close\r\n";
  $out .= "Accept-language: en-us;q=0.7,en;q=0.3\r\n";
  $out .= "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\r\n";
  $out .= "Accept-charset: ISO-8859-2,utf-8;q=0.7,*;q=0.7\r\n";
  $out .= "User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.5) Gecko/20061201 Firefox/2.0.0.5 (Ubuntu-feisty)\r\n\r\n";
  d_("Headers sent: {$out}");

  fwrite($fp, $out);

  // A single read is sufficient, there is no need to fetch the whole answer.
  // fread() yields FALSE on failure, which becomes an empty string here.
  $html = (string) fread($fp, 8192);

  fclose($fp);
  return $html;
}

/**
 * Extracts the status from a raw HTTP response.
 *
 * @param $header
 *   Raw response text, starting with the status line.
 *
 * @return
 *   The first status line with its "HTTP/x.y" prefix removed, e.g.
 *   " 404 Not Found" (the leading space is kept). An empty string when the
 *   text contains no status line.
 */
function lc_GetStatusCode($header) {
  $status = '';

  // Accept CRLF as well as bare LF or CR line endings.
  $lines = preg_split("/\\r\\n|\\r|\\n/", (string) $header);
  foreach ($lines as $line) {
    // Anchored at the line start so that a header which merely contains
    // "HTTP/..." in its value is not mistaken for the status line.
    if (preg_match("/^HTTP\\/[0-9]+(\\.[0-9]+)?/i", $line)) {
      $status = preg_replace("/^HTTP\\/[0-9]+(\\.[0-9]+)?/i", "", $line);
      break;
    }
  }
  return $status;
}
/**
 * Returns the response status for a URL.
 *
 * @param $url
 *   Absolute URL to test.
 *
 * @return
 *   The status extracted by lc_GetStatusCode() from the answer fetched by
 *   lc_GetHeaders(), or "-1 Unable to connect" when no answer was obtained.
 */
function lc_GetResponse($url) {
  $raw_headers = lc_GetHeaders($url);
  return $raw_headers ? lc_GetStatusCode($raw_headers) : "-1 Unable to connect";
}

/**
 * Randomly reduces the number of links to check.
 *
 * @param $urls
 *   List of URLs. Anything that is not an array is treated as "no URLs".
 *
 * @return
 *   $urls unchanged when the 'linkchecker_max_links_per_node' setting is 0
 *   (no limit) or not exceeded; otherwise a random selection of that many
 *   URLs.
 */
function lc_RandReduce($urls) {
  if (!is_array($urls)) {
    return array();
  }

  $maxlinks = (int) variable_get('linkchecker_max_links_per_node', 0);
  if ($maxlinks <= 0 || count($urls) <= $maxlinks) {
    return $urls;
  }

  // array_rand() returns a single key instead of a list of keys when asked
  // for exactly one entry, hence the array cast.
  $newurls = array();
  foreach ((array) array_rand($urls, $maxlinks) as $key) {
    $newurls[] = $urls[$key];
  }
  d_("Too many links, reducing to {$maxlinks} only.");
  return $newurls;
}

/**
 * Debug helper: echoes a message when debug mode is on.
 *
 * @param $foo
 *   The message. It is printed after a "<br />" tag, and only while the
 *   global $_LINKCHECKER_DEBUG flag is truthy.
 */
function d_($foo) {
  if (empty($GLOBALS['_LINKCHECKER_DEBUG'])) {
    return;
  }
  print "<br />" . $foo;
}
/**
 * Returns today's date as a string of digits.
 *
 * @return
 *   The current date formatted as "Ymd", e.g. "20070131".
 */
function lc_now_to_int() {
  // date() uses the current time when no timestamp is given.
  return date("Ymd");
}
/**
 * Computes the age of a date written by lc_now_to_int().
 *
 * @param $when
 *   Date in "Ymd" form: four digits of year, two of month, two of day.
 *
 * @return
 *   Number of seconds between midnight of that date and now.
 */
function lc_int_to_age($when) {
  $year = substr($when, 0, 4);
  $month = substr($when, 4, 2);
  $day = substr($when, 6, 2);
  $midnight = mktime(0, 0, 0, $month, $day, $year);
  return time() - $midnight;
}
/**
 * Tells whether a response is configured to be ignored.
 *
 * The 'linkchecker_ignore_responses' setting holds one entry per line. An
 * entry matches when it equals either the bare status code or the complete
 * response string. Blank lines are skipped.
 *
 * @param $response
 *   Complete response string, e.g. "403 Forbidden".
 * @param $code
 *   Status code only, e.g. "403".
 *
 * @return
 *   TRUE when the response must not be reported as an error.
 */
function lc_ignore_response($response, $code) {
  // Same default as the settings form, so the codes shown there are honoured
  // even before the form has been saved for the first time.
  $setting = variable_get('linkchecker_ignore_responses', "401\n403");

  // explode() replaces split(), which no longer exists in PHP 7. A trailing
  // "\r" left over from CRLF input is removed by trim() below.
  foreach (explode("\n", (string) $setting) as $entry) {
    $entry = trim($entry);
    if ($entry === '') {
      // Do not let an empty line match an empty code or response.
      continue;
    }
    if ((string) $code === $entry || (string) $response === $entry) {
      return true;
    }
  }
  return false;
}