You are here

function _linkchecker_extract_links in Link checker 7

Same name and namespace in other branches
  1. 5.2 linkchecker.module \_linkchecker_extract_links()
  2. 6.2 linkchecker.module \_linkchecker_extract_links()

Extract links from content.

Parameters

string $text: The text to be scanned for links.

string $content_path: Path to the content that is currently being scanned for links. This value is required to build fully qualified links from relative links. Relative links are not extracted from the content if the path is not provided.

Return value

array Array whose keys are fully qualified and unique URLs found in the content, and whose values are arrays of actual text (raw URLs or paths) corresponding to each fully qualified URL.

3 calls to _linkchecker_extract_links()
_linkchecker_add_block_custom_links in ./linkchecker.module
Add custom block links to database.
_linkchecker_add_comment_links in ./linkchecker.module
Add comment links to database.
_linkchecker_extract_node_links in ./linkchecker.module
Extracts links from a node.

File

./linkchecker.module, line 1917
This module periodically checks links in given node types, blocks, etc.

Code

/**
 * Extracts links from content.
 *
 * The HTML in $text is parsed with filter_dom_load() and the URL-bearing
 * attributes of every element type enabled through the
 * 'linkchecker_extract_from_*' variables are collected. Each collected value
 * is then turned into a fully qualified URL where possible.
 *
 * @param string $text
 *   The text to be scanned for links.
 * @param string $content_path
 *   Path to the content that is currently being scanned for links. It is used
 *   to resolve links starting with '?', '#', './', '../' or a plain path
 *   segment; such links are skipped when the path is not provided. Links
 *   starting with '/' are resolved against the global $base_root instead.
 *
 * @return array
 *   Array whose keys are fully qualified and unique URLs found in the content,
 *   and whose values are arrays of the actual text (raw URLs or paths)
 *   corresponding to each fully qualified URL.
 */
function _linkchecker_extract_links($text = '', $content_path = NULL) {
  global $base_root, $is_https;
  $html_dom = filter_dom_load($text);
  $urls = array();

  // Finds all hyperlinks in the content.
  if (variable_get('linkchecker_extract_from_a', 1) == 1) {
    $urls = array_merge(
      $urls,
      _linkchecker_extract_attribute_values($html_dom, 'a', array('href')),
      _linkchecker_extract_attribute_values($html_dom, 'area', array('href'))
    );
  }

  // Finds all audio links in the content.
  if (variable_get('linkchecker_extract_from_audio', 0) == 1) {
    foreach ($html_dom->getElementsByTagName('audio') as $audio) {
      $urls[] = $audio->getAttribute('src');

      // Finds source and track tags with links in the audio tag.
      $urls = array_merge(
        $urls,
        _linkchecker_extract_attribute_values($audio, 'source', array('src')),
        _linkchecker_extract_attribute_values($audio, 'track', array('src'))
      );
    }
  }

  // Finds embed tags with links in the content.
  if (variable_get('linkchecker_extract_from_embed', 0) == 1) {
    $urls = array_merge(
      $urls,
      _linkchecker_extract_attribute_values($html_dom, 'embed', array('src', 'pluginurl', 'pluginspage'))
    );
  }

  // Finds iframe tags with links in the content.
  if (variable_get('linkchecker_extract_from_iframe', 0) == 1) {
    $urls = array_merge(
      $urls,
      _linkchecker_extract_attribute_values($html_dom, 'iframe', array('src'))
    );
  }

  // Finds img tags with links in the content.
  if (variable_get('linkchecker_extract_from_img', 0) == 1) {
    $urls = array_merge(
      $urls,
      _linkchecker_extract_attribute_values($html_dom, 'img', array('src', 'longdesc'))
    );
  }

  // Finds object/param tags with links in the content.
  if (variable_get('linkchecker_extract_from_object', 0) == 1) {
    // Values of the param "name" attribute whose "value" holds a link.
    // @todo
    // - Try to extract links in unknown "flashvars" values
    //   (e.g., file=http://, data=http://).
    $names = array(
      'archive',
      'filename',
      'href',
      'movie',
      'src',
      'url',
    );
    // Values of the param "src" attribute whose "value" holds a link.
    $srcs = array(
      'movie',
    );

    foreach ($html_dom->getElementsByTagName('object') as $object) {
      $urls[] = $object->getAttribute('data');
      $urls[] = $object->getAttribute('codebase');

      // Finds param tags with links in the object tag.
      foreach ($object->getElementsByTagName('param') as $param) {
        if ($param->hasAttribute('name') && in_array($param->getAttribute('name'), $names, TRUE)) {
          $urls[] = $param->getAttribute('value');
        }
        if ($param->hasAttribute('src') && in_array($param->getAttribute('src'), $srcs, TRUE)) {
          $urls[] = $param->getAttribute('value');
        }
      }
    }
  }

  // Finds video tags with links in the content.
  if (variable_get('linkchecker_extract_from_video', 0) == 1) {
    foreach ($html_dom->getElementsByTagName('video') as $video) {
      $urls[] = $video->getAttribute('poster');
      $urls[] = $video->getAttribute('src');

      // Finds source and track tags with links in the video tag.
      $urls = array_merge(
        $urls,
        _linkchecker_extract_attribute_values($video, 'source', array('src')),
        _linkchecker_extract_attribute_values($video, 'track', array('src'))
      );
    }
  }

  // Remove empty values. A callback is required here: array_filter() without
  // one would also drop the string "0", which is a valid relative link.
  $urls = array_filter($urls, 'strlen');

  // Remove duplicate urls.
  $urls = array_unique($urls);

  // What type of links should be checked?
  $linkchecker_check_links_types = variable_get('linkchecker_check_links_types', 1);
  $links = array();
  foreach ($urls as $url) {

    // Decode HTML links into plain text links.
    // DOMDocument->loadHTML does not provide the RAW url from code. All html
    // entities are already decoded.
    // @todo: Try to find a way to get the raw value.
    $url_decoded = $url;

    // Prefix protocol relative urls with a protocol to allow link checking.
    if (preg_match('!^//!', $url_decoded)) {
      $http_protocol = $is_https ? 'https' : 'http';
      $url_decoded = $http_protocol . ':' . $url_decoded;
    }

    // FIXME: #1149596 HACK - Encode spaces in URLs, so validation equals TRUE and link gets added.
    $url_encoded = str_replace(' ', '%20', $url_decoded);

    // Full qualified URLs.
    if ($linkchecker_check_links_types != 2 && valid_url($url_encoded, TRUE)) {

      // Add to Array and change HTML links into plain text links.
      $links[$url_decoded][] = $url;
    }
    // Skip everything else that carries a URI scheme (mailto:, javascript:,
    // view-source:, ...). The hyphen is part of the scheme character set in
    // RFC 3986, section 3.1; without it such URIs would be mistaken for
    // relative paths below.
    elseif (preg_match('/^\\w[\\w.+-]*:/', $url_decoded)) {
      continue;
    }
    elseif ($linkchecker_check_links_types != 1 && valid_url($url_encoded, FALSE)) {

      // Get full qualified url with base path of content.
      $absolute_content_path = _linkchecker_absolute_content_path($content_path);

      // Absolute local URLs need to start with [/].
      if (preg_match('!^/!', $url_decoded)) {

        // Add to Array and change HTML encoded links into plain text links.
        $links[$base_root . $url_decoded][] = $url;
      }
      elseif (!empty($content_path) && preg_match('!^[?#]!', $url_decoded)) {

        // Add to Array and change HTML encoded links into plain text links.
        $links[$content_path . $url_decoded][] = $url;
      }
      elseif (!empty($absolute_content_path) && preg_match('!^\\.{1,2}/!', $url_decoded)) {

        // Build the URI without hostname before the URI is normalized and
        // dot-segments will be removed. The hostname is added back after the
        // normalization has completed to prevent hostname removal by the regex.
        // This logic intentionally does not implement all the rules defined in
        // RFC 3986, section 5.2.4 to show broken links and over-dot-segmented
        // URIs; e.g., http://example.com/../../foo/bar.
        // For more information, see https://drupal.org/node/832388.
        $path = substr_replace($absolute_content_path . $url_decoded, '', 0, strlen($base_root));

        // Remove './' segments where possible.
        $path = str_replace('/./', '/', $path);

        // Remove '../' segments where possible. Loop until all segments are
        // removed. Taken over from _drupal_build_css_path() in common.inc.
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
        }

        // Glue the hostname and path to full-qualified URI.
        $links[$base_root . $path][] = $url;
      }
      elseif (!empty($absolute_content_path) && preg_match('!^[^/]!', $url_decoded)) {
        $links[$absolute_content_path . $url_decoded][] = $url;
      }
      // @todo Are there more special cases the module need to handle?
    }
  }
  return $links;
}

/**
 * Collects attribute values from all descendant elements with a given tag.
 *
 * Elements are visited in the order returned by getElementsByTagName(); for
 * each element the attributes are read in the order given. Missing attributes
 * yield empty strings, which the caller is expected to filter out.
 *
 * @param DOMDocument|DOMElement $context
 *   The DOM node whose descendants are searched.
 * @param string $tag
 *   The tag name to look for.
 * @param array $attributes
 *   Names of the attributes to read from every matching element.
 *
 * @return array
 *   A list of the attribute values found.
 */
function _linkchecker_extract_attribute_values($context, $tag, array $attributes) {
  $values = array();
  foreach ($context->getElementsByTagName($tag) as $element) {
    foreach ($attributes as $attribute) {
      $values[] = $element->getAttribute($attribute);
    }
  }
  return $values;
}