You are here

function _linkchecker_extract_links in Link checker 6.2

Same name and namespace in other branches
  1. 5.2 linkchecker.module \_linkchecker_extract_links()
  2. 7 linkchecker.module \_linkchecker_extract_links()

Extract links from content.

Parameters

string $text: The text to be scanned for links.

string $content_path: Path to the content that is currently being scanned for links. This value is required to build fully qualified links from relative links. Relative links are not extracted from content if the path is not provided.

Return value

array Array whose keys are fully qualified and unique URLs found in the content, and whose values are arrays of actual text (raw URLs or paths) corresponding to each fully qualified URL.

3 calls to _linkchecker_extract_links()
_linkchecker_add_box_links in ./linkchecker.module
Add block links to database.
_linkchecker_add_comment_links in ./linkchecker.module
Add comment links to database.
_linkchecker_extract_node_links in ./linkchecker.module
Extracts links from a node.

File

./linkchecker.module, line 1375
This module periodically checks links in given node types, blocks, CCK fields, etc.

Code

/**
 * Extract links from content.
 *
 * Scans the text for URLs in the attributes of the HTML tags that are enabled
 * through the "linkchecker_extract_from_*" variables, then resolves every
 * unique URL into a fully qualified URL.
 *
 * @param string $text
 *   The text to be scanned for links.
 * @param string $content_path
 *   Path to the content that is currently scanned for links. Links starting
 *   with "/" are resolved against $base_root; all other relative links need
 *   this value and are skipped when it is empty.
 *
 * @return array
 *   Array whose keys are fully qualified and unique URLs found in the content,
 *   and whose values are arrays of actual text (raw URLs or paths)
 *   corresponding to each fully qualified URL.
 */
function _linkchecker_extract_links($text = '', $content_path = NULL) {
  global $base_root;

  // Finds all hyperlinks in the content.
  $matches_a = array(
    1 => NULL,
  );
  if (variable_get('linkchecker_extract_from_a', 1) == 1) {

    // Extract all chars in the href value, except double and single quotes.
    $pattern_a = '/<(?:a|area)\\s[^>]*href=["\']([^"\']*)["\'][^>]*>/i';
    preg_match_all($pattern_a, $text, $matches_a);
  }

  // Finds all audio links in the content.
  $matches_audio = array(
    1 => NULL,
  );
  if (variable_get('linkchecker_extract_from_audio', 0) == 1) {
    $pattern_audio = '/<audio\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    preg_match_all($pattern_audio, $text, $matches_audio);
  }

  // Finds embed tags with links in the content.
  $matches_embed = array();
  if (variable_get('linkchecker_extract_from_embed', 0) == 1) {
    $pattern_embed_src = '/<embed\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_embed_pluginurl = '/<embed\\s[^>]*pluginurl=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_embed_pluginspage = '/<embed\\s[^>]*pluginspage=["\']([^"\']*)["\'][^>]*>/i';
    preg_match_all($pattern_embed_src, $text, $matches_embed_src);
    preg_match_all($pattern_embed_pluginurl, $text, $matches_embed_pluginurl);
    preg_match_all($pattern_embed_pluginspage, $text, $matches_embed_pluginspage);
    $matches_embed = array_merge((array) $matches_embed_src[1], (array) $matches_embed_pluginurl[1], (array) $matches_embed_pluginspage[1]);
  }

  // Finds iframe tags with links in the content.
  $matches_iframe = array(
    1 => NULL,
  );
  if (variable_get('linkchecker_extract_from_iframe', 0) == 1) {
    $pattern_iframe = '/<iframe\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    preg_match_all($pattern_iframe, $text, $matches_iframe);
  }

  // Finds img tags with links in the content.
  $matches_img = array(
    1 => NULL,
  );
  if (variable_get('linkchecker_extract_from_img', 0) == 1) {
    $pattern_img = '/<img\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    preg_match_all($pattern_img, $text, $matches_img);
  }

  // Finds object/param tags with links in the content.
  $matches_object = array();
  if (variable_get('linkchecker_extract_from_object', 0) == 1) {

    // TODO's:
    //  * Allow flipped order of attributes in "param".
    //  * Try to extract links in unknown "flashvars" values (for e.g. file=http://, data=http://).
    $pattern_object_data = '/<object\\s[^>]*data=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_object_codebase = '/<object\\s[^>]*codebase=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_param = '/<param\\s[^>]*((name|src)=["\'](archive|filename|href|movie|src|url)["\']\\s[^>]*)+value=["\']([^"\']*)["\'][^>]*>/i';
    preg_match_all($pattern_object_data, $text, $matches_object_data);
    preg_match_all($pattern_object_codebase, $text, $matches_object_codebase);
    preg_match_all($pattern_param, $text, $matches_param);
    // Capture group 4 of the param pattern holds the "value" attribute.
    $matches_object = array_merge((array) $matches_object_data[1], (array) $matches_object_codebase[1], (array) $matches_param[4]);
  }

  // Finds source tags with links in the content.
  $matches_source = array(
    1 => NULL,
  );
  if (variable_get('linkchecker_extract_from_source', 0) == 1) {
    $pattern_source = '/<source\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    preg_match_all($pattern_source, $text, $matches_source);
  }

  // Finds video tags with links in the content.
  $matches_video = array();
  if (variable_get('linkchecker_extract_from_video', 0) == 1) {
    $pattern_video_poster = '/<video\\s[^>]*poster=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_video_src = '/<video\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    preg_match_all($pattern_video_poster, $text, $matches_video_poster);
    preg_match_all($pattern_video_src, $text, $matches_video_src);
    $matches_video = array_merge((array) $matches_video_poster[1], (array) $matches_video_src[1]);
  }

  // Merge all extracted links into one array. The (array) casts turn the NULL
  // placeholders of disabled tag types into empty arrays.
  $urls = array_merge((array) $matches_a[1], (array) $matches_audio[1], (array) $matches_embed, (array) $matches_iframe[1], (array) $matches_img[1], (array) $matches_object, (array) $matches_source[1], (array) $matches_video);

  // Remove empty values. Filter on string length rather than truthiness, as a
  // plain array_filter() would also discard the valid relative link "0".
  $urls = array_filter($urls, 'strlen');

  // Remove duplicate urls.
  $urls = array_unique($urls);

  // What type of links should be checked?
  $linkchecker_check_links_types = variable_get('linkchecker_fqdn_only', 1);

  // Protocol used to complete protocol relative urls; it follows the protocol
  // of the current request and does not change between links.
  $http_protocol = isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http';

  // The absolute content path only depends on $content_path, therefore it is
  // resolved once, on the first relative link that needs it.
  // NOTE(review): assumes _linkchecker_absolute_content_path() returns the
  // same value for the same argument - confirm.
  $absolute_content_path = NULL;
  $absolute_content_path_resolved = FALSE;

  $links = array();
  foreach ($urls as $url) {

    // Decode HTML links into plain text links.
    $url_decoded = decode_entities($url);

    // Prefix protocol relative urls with a protocol to allow link checking.
    if (preg_match('!^//!', $url_decoded)) {
      $url_decoded = $http_protocol . ':' . $url_decoded;
    }

    // FIXME: #1149596 HACK - Encode spaces in URLs, so validation equals TRUE and link gets added.
    $url_encoded = str_replace(' ', '%20', $url_decoded);

    // Full qualified URLs.
    if ($linkchecker_check_links_types != 2 && valid_url($url_encoded, TRUE)) {

      // Add to Array and change HTML links into plain text links.
      $links[$url_decoded][] = $url;
    }
    elseif (preg_match('/^\\w[\\w.+]*:/', $url_decoded)) {

      // Anything else that starts with a scheme-like prefix ("word:") is not
      // a relative link and is skipped.
      continue;
    }
    elseif ($linkchecker_check_links_types != 1 && valid_url($url_encoded, FALSE)) {

      // Get full qualified url with base path of content.
      if (!$absolute_content_path_resolved) {
        $absolute_content_path = _linkchecker_absolute_content_path($content_path);
        $absolute_content_path_resolved = TRUE;
      }

      // Absolute local URLs need to start with [/].
      if (preg_match('!^/!', $url_decoded)) {

        // Add to Array and change HTML encoded links into plain text links.
        $links[$base_root . $url_decoded][] = $url;
      }
      elseif (!empty($content_path) && preg_match('!^[?#]!', $url_decoded)) {

        // Query-only and fragment-only links are appended to the content path.
        $links[$content_path . $url_decoded][] = $url;
      }
      elseif (!empty($absolute_content_path) && preg_match('!^\\.{1,2}/!', $url_decoded)) {

        // Build the URI without hostname before the URI is normalized and
        // dot-segments will be removed. The hostname is added back after the
        // normalization has completed to prevent hostname removal by the regex.
        // This logic intentionally does not implement all the rules defined in
        // RFC 3986, section 5.2.4 to show broken links and over-dot-segmented
        // URIs; e.g. http://example.com/../../foo/bar.
        // For more information, see http://drupal.org/node/832388.
        $path = substr_replace($absolute_content_path . $url_decoded, '', 0, strlen($base_root));

        // Remove './' segments where possible.
        $path = str_replace('/./', '/', $path);

        // Remove '../' segments where possible. Loop until all segments are
        // removed. Taken over from _drupal_build_css_path() in common.inc.
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
        }

        // Glue the hostname and path to full-qualified URI.
        $links[$base_root . $path][] = $url;
      }
      elseif (!empty($absolute_content_path) && preg_match('!^[^/]!', $url_decoded)) {

        // Plain relative paths are appended to the absolute content path.
        $links[$absolute_content_path . $url_decoded][] = $url;
      }
      else {

        // TODO: Are there more special cases the module need to handle?
      }
    }
  }
  return $links;
}