You are here

function _linkchecker_extract_links in Link checker 5.2

Same name and namespace in other branches
  1. 6.2 linkchecker.module \_linkchecker_extract_links()
  2. 7 linkchecker.module \_linkchecker_extract_links()

Extract links from content.

Parameters

$text: The text to be scanned for links.

$content_path: Path to the content that is currently being scanned for links. This value is required to build fully qualified links from relative links. Relative links are not extracted from the content if the path is not provided.

Return value

Array of fully qualified and unique URLs found in content.

3 calls to _linkchecker_extract_links()
_linkchecker_add_box_links in ./linkchecker.module
Add box links to database.
_linkchecker_add_comment_links in ./linkchecker.module
Add comment links to database.
_linkchecker_add_node_links in ./linkchecker.module
Add node links to database.

File

./linkchecker.module, line 1397
This module periodically checks links in given node types, blocks, CCK fields, etc.

Code

/**
 * Extract links from content.
 *
 * Which HTML elements are scanned is controlled by the
 * 'linkchecker_extract_from_*' variables. Relative links are only returned
 * when the 'linkchecker_fqdn_only' variable is 0.
 *
 * @param $text
 *   The text to be scanned for links.
 * @param $content_path
 *   Path to the content that is currently scanned for links. This value is
 *   required to build fully qualified links from relative links. Links
 *   starting with "/" are prefixed with $base_root; every other kind of
 *   relative link is skipped when no usable content path is available.
 *
 * @return
 *   Array of fully qualified and unique URLs found in content.
 */
function _linkchecker_extract_links($text = '', $content_path = NULL) {
  global $base_root;

  // URLs are collected extractor by extractor. Disabled extractors add
  // nothing, so no capture group of an unpopulated match array is ever read.
  $urls = array();

  // Finds all hyperlinks in the content. The URL is the second capture group
  // because the first one holds the tag name (a|area).
  if (variable_get('linkchecker_extract_from_a', 1) == 1) {
    $pattern_a = '/<(a|area)\\s[^>]*href=["\']([^"\']*)["\'][^>]*>/i';
    $urls = array_merge($urls, _linkchecker_extract_pattern_matches($pattern_a, $text, 2));
  }

  // Finds all audio links in the content.
  if (variable_get('linkchecker_extract_from_audio', 1) == 1) {
    $pattern_audio = '/<audio\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    $urls = array_merge($urls, _linkchecker_extract_pattern_matches($pattern_audio, $text));
  }

  // Finds embed tags with links in the content.
  if (variable_get('linkchecker_extract_from_embed', 0) == 1) {
    $pattern_embed_src = '/<embed\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_embed_pluginurl = '/<embed\\s[^>]*pluginurl=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_embed_pluginspage = '/<embed\\s[^>]*pluginspage=["\']([^"\']*)["\'][^>]*>/i';
    $urls = array_merge(
      $urls,
      _linkchecker_extract_pattern_matches($pattern_embed_src, $text),
      _linkchecker_extract_pattern_matches($pattern_embed_pluginurl, $text),
      _linkchecker_extract_pattern_matches($pattern_embed_pluginspage, $text)
    );
  }

  // Finds iframe tags with links in the content.
  if (variable_get('linkchecker_extract_from_iframe', 0) == 1) {
    $pattern_iframe = '/<iframe\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    $urls = array_merge($urls, _linkchecker_extract_pattern_matches($pattern_iframe, $text));
  }

  // Finds img tags with links in the content.
  if (variable_get('linkchecker_extract_from_img', 0) == 1) {
    $pattern_img = '/<img\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    $urls = array_merge($urls, _linkchecker_extract_pattern_matches($pattern_img, $text));
  }

  // Finds object/param tags with links in the content.
  if (variable_get('linkchecker_extract_from_object', 0) == 1) {

    // TODO's:
    //  * Allow flipped order of attributes in "param".
    //  * Try to extract links in unknown "flashvars" values (for e.g. file=http://, data=http://).
    $pattern_object_data = '/<object\\s[^>]*data=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_object_codebase = '/<object\\s[^>]*codebase=["\']([^"\']*)["\'][^>]*>/i';
    // Groups 1-3 of the param pattern describe the name attribute; the URL
    // held by the value attribute is capture group 4.
    $pattern_param = '/<param\\s[^>]*((name|src)=["\'](archive|filename|href|movie|src|url)["\']\\s[^>]*)+value=["\']([^"\']*)["\'][^>]*>/i';
    $urls = array_merge(
      $urls,
      _linkchecker_extract_pattern_matches($pattern_object_data, $text),
      _linkchecker_extract_pattern_matches($pattern_object_codebase, $text),
      _linkchecker_extract_pattern_matches($pattern_param, $text, 4)
    );
  }

  // Finds source tags with links in the content.
  if (variable_get('linkchecker_extract_from_source', 0) == 1) {
    $pattern_source = '/<source\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    $urls = array_merge($urls, _linkchecker_extract_pattern_matches($pattern_source, $text));
  }

  // Finds video tags with links in the content.
  if (variable_get('linkchecker_extract_from_video', 0) == 1) {
    $pattern_video_poster = '/<video\\s[^>]*poster=["\']([^"\']*)["\'][^>]*>/i';
    $pattern_video_src = '/<video\\s[^>]*src=["\']([^"\']*)["\'][^>]*>/i';
    $urls = array_merge(
      $urls,
      _linkchecker_extract_pattern_matches($pattern_video_poster, $text),
      _linkchecker_extract_pattern_matches($pattern_video_src, $text)
    );
  }

  // Remove empty values.
  $urls = array_filter($urls);

  // Decode HTML links into plain text links.
  $urls = array_map('decode_entities', $urls);

  // Remove duplicate urls.
  $urls = array_unique($urls);

  // The setting does not change while looping, so read it only once.
  $extract_relative = (variable_get('linkchecker_fqdn_only', 1) == 0);

  $links = array();
  foreach ($urls as $url) {

    // Fully qualified URLs.
    if (valid_url($url, TRUE)) {
      $links[] = $url;
    }
    elseif (preg_match('/^\\w[\\w.+]*:/', $url)) {
      // Anything else that starts with a "scheme:" prefix is not a link this
      // function can resolve; skip it.
      continue;
    }
    elseif (valid_url($url, FALSE) && $extract_relative) {

      // Get fully qualified url with base path of content.
      $absolute_content_path = _linkchecker_absolute_content_path($content_path);

      // Absolute local URLs need to start with [/].
      if (preg_match('!^/!', $url)) {
        $links[] = $base_root . $url;
      }
      elseif (!empty($content_path) && preg_match('!^[?#]!', $url)) {
        // Query-only or fragment-only links refer to the scanned content
        // itself, so they are appended to the content path as given.
        $links[] = $content_path . $url;
      }
      elseif (!empty($absolute_content_path) && preg_match('!^\\.{1,2}/!', $url)) {
        $path = $absolute_content_path . $url;

        // Remove './' segments where possible.
        $path = str_replace('/./', '/', $path);

        // Remove '../' segments where possible. Loop until all segments are removed.
        // Taken over from _drupal_build_css_path() in common.inc.
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
        }

        // Add URLs to array.
        $links[] = $path;
      }
      elseif (!empty($absolute_content_path) && preg_match('!^[^/]!', $url)) {
        $links[] = $absolute_content_path . $url;
      }
      else {

        // TODO: Are there more special cases the module need to handle?
      }
    }
  }
  return array_unique($links);
}

/**
 * Collect the values of one capture group for every match of a pattern.
 *
 * @param $pattern
 *   A PCRE pattern as accepted by preg_match_all().
 * @param $text
 *   The text to search in.
 * @param $group
 *   Index of the capture group whose values are returned. Defaults to 1.
 *
 * @return
 *   Array of captured strings, one per match. An empty array is returned when
 *   the requested group is not present in the match result.
 */
function _linkchecker_extract_pattern_matches($pattern, $text, $group = 1) {
  $matches = array();
  preg_match_all($pattern, $text, $matches);
  return (isset($matches[$group]) && is_array($matches[$group])) ? $matches[$group] : array();
}