You are here

function _site_disclaimer_extract_links in Site Disclaimer 7

Same name and namespace in other branches
  1. 6 site_disclaimer.admin.inc \_site_disclaimer_extract_links()

Extracts links from content (copied and modified from _linkchecker_extract_links() in linkchecker.module).

Parameters

string $text: The text to be scanned for links.

string $content_path: Path to the content that is currently scanned for links. This value is required to build fully qualified links from relative links. Relative links are not extracted from the content if the path is not provided.

Return value

array Array whose keys are fully qualified and unique URLs found in the content, and whose values are arrays of actual text (raw URLs or paths) corresponding to each fully qualified URL.

1 call to _site_disclaimer_extract_links()
_site_disclaimer_prep_allowed_paths in ./site_disclaimer.admin.inc
Create 'site_disclaimer_allow_nodes' variable from links found in:

File

./site_disclaimer.admin.inc, line 442
Administration settings for Site Disclaimer module.

Code

/**
 * Extracts links from HTML content.
 *
 * Copied and modified from _linkchecker_extract_links() in linkchecker.module.
 * Links are always collected from <a>, <area>, <iframe> and <img> tags.
 * <source> and <video> tags are only scanned when the
 * 'linkchecker_extract_from_source' / 'linkchecker_extract_from_video'
 * variables are set to 1. The <audio>, <embed> and <object>/<param>
 * extractors of the upstream function are intentionally not used here.
 *
 * @param string $text
 *   The text to be scanned for links.
 * @param string $content_path
 *   Path to the content that is currently scanned for links. This value is
 *   required to build fully qualified links from relative links. Relative
 *   links are not extracted from the content if the path is not provided.
 *
 * @return array
 *   Array whose keys are fully qualified and unique URLs found in the content,
 *   and whose values are arrays of actual text (raw URLs or paths)
 *   corresponding to each fully qualified URL.
 */
function _site_disclaimer_extract_links($text = '', $content_path = NULL) {
  global $base_root;

  // Tag/attribute pairs to scan, listed in the order their matches are merged.
  // The third element tells whether the pair is scanned at all.
  $extract_source = variable_get('linkchecker_extract_from_source', 0) == 1;
  $extract_video = variable_get('linkchecker_extract_from_video', 0) == 1;
  $scanners = array(
    array('a|area', 'href', TRUE),
    array('iframe', 'src', TRUE),
    array('img', 'src', TRUE),
    array('source', 'src', $extract_source),
    array('video', 'poster', $extract_video),
    array('video', 'src', $extract_video),
  );

  // Collect the raw attribute values of all enabled tag/attribute pairs.
  $urls = array();
  foreach ($scanners as $scanner) {
    list($tags, $attribute, $enabled) = $scanner;
    if (!$enabled) {
      continue;
    }

    // Extract all chars in the attribute value, except double and single
    // quotes.
    $pattern = '/<(?:' . $tags . ')\\s[^>]*' . $attribute . '=["\']([^"\']*)["\'][^>]*>/i';
    if (preg_match_all($pattern, $text, $matches)) {
      $urls = array_merge($urls, $matches[1]);
    }
  }

  // Remove empty values and duplicate urls.
  $urls = array_unique(array_filter($urls));

  // Scheme of the current site, used to complete protocol-relative URLs.
  $scheme = (stripos((string) $base_root, 'https:') === 0) ? 'https' : 'http';

  $links = array();
  foreach ($urls as $url) {

    // Decode HTML links into plain text links.
    $url_decoded = decode_entities($url);

    // Protocol-relative URLs ("//example.com/foo") reference another host.
    // Complete them with the site's scheme so they are handled as fully
    // qualified URLs instead of being appended to $base_root as a local path.
    if (strpos($url_decoded, '//') === 0) {
      $url_decoded = $scheme . ':' . $url_decoded;
    }

    // FIXME: #1149596 HACK - Encode spaces in URLs, so validation equals TRUE
    // and link gets added.
    $url_encoded = str_replace(' ', '%20', $url_decoded);

    // Fully qualified URLs.
    if (valid_url($url_encoded, TRUE)) {

      // Add to Array and change HTML links into plain text links.
      $links[$url_decoded][] = $url;
    }
    elseif (preg_match('/^\\w[\\w.+]*:/', $url_decoded)) {

      // Anything else that starts with a scheme is not a usable link; skip it.
      continue;
    }
    elseif (valid_url($url_encoded, FALSE)) {

      // Get fully qualified url with base path of content.
      $absolute_content_path = _site_disclaimer_absolute_content_path($content_path);

      // Absolute local URLs need to start with [/].
      if (preg_match('!^/!', $url_decoded)) {

        // Add to Array and change HTML encoded links into plain text links.
        $links[$base_root . $url_decoded][] = $url;
      }
      elseif (!empty($content_path) && preg_match('!^[?#]!', $url_decoded)) {

        // Anchors and query strings are appended to the content path as is.
        // NOTE(review): the key is only fully qualified if $content_path
        // itself is - confirm against the caller.
        $links[$content_path . $url_decoded][] = $url;
      }
      elseif (!empty($absolute_content_path) && preg_match('!^\\.{1,2}/!', $url_decoded)) {

        // Build the URI without hostname before the URI is normalized and
        // dot-segments will be removed. The hostname is added back after the
        // normalization has completed to prevent hostname removal by the regex.
        // This logic intentionally does not implement all the rules defined in
        // RFC 3986, section 5.2.4 to show broken links and over-dot-segmented
        // URIs; e.g. http://example.com/../../foo/bar.
        // For more information, see http://drupal.org/node/832388.
        // substr_replace() works on bytes, so the prefix length must be a byte
        // count (strlen()), not a character count.
        $path = substr_replace($absolute_content_path . $url_decoded, '', 0, strlen($base_root));

        // Remove './' segments where possible.
        $path = str_replace('/./', '/', $path);

        // Remove '../' segments where possible. Loop until all segments are
        // removed. Taken over from _drupal_build_css_path() in common.inc.
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
        }

        // Glue the hostname and path to a fully qualified URI.
        $links[$base_root . $path][] = $url;
      }
      elseif (!empty($absolute_content_path) && preg_match('!^[^/]!', $url_decoded)) {

        // Plain relative paths are resolved against the content's base path.
        $links[$absolute_content_path . $url_decoded][] = $url;
      }
      else {

        // TODO: Are there more special cases the module need to handle?
      }
    }
  }
  return $links;
}