You are here

public function LinkExtractorService::getLinks in Link checker 8

Filters out URLs that do not need to be checked.

Parameters

array $urls: Array of URLs.

string $baseContentUrl: Base URL used to resolve internal (non-absolute) URLs.

Return value

array List of links.

1 call to LinkExtractorService::getLinks()
LinkExtractorService::extractFromField in src/LinkExtractorService.php
Extracts links from field.

File

src/LinkExtractorService.php, line 181

Class

LinkExtractorService
Class LinkExtractorService.

Namespace

Drupal\linkchecker

Code

/**
 * Filters a list of URLs down to the ones that should be checked.
 *
 * Each URL is classified as fully qualified, protocol-relative, or local.
 * Local URLs are turned into absolute ones using the site base URL or the
 * base URL of the content they were found in. URLs whose type is excluded by
 * the 'check_links_types' setting, that use a non-checkable scheme, or that
 * are blacklisted are dropped.
 *
 * @param array $urls
 *   Array of URLs.
 * @param string $baseContentUrl
 *   (optional) Base URL used to resolve internal, non-absolute URLs. Falls
 *   back to the site base URL when empty.
 *
 * @return array
 *   List of links, keyed by the absolute URL. Entries for which
 *   isUrlBlacklisted() returns TRUE are removed.
 */
public function getLinks(array $urls, $baseContentUrl = NULL) {

  // What type of links should be checked?
  $checkLinksType = $this->linkcheckerSetting
    ->get('check_links_types');
  if (isset($this->request)) {
    $httpProtocol = $this->request
      ->getScheme() . '://';
    $baseUrl = $this->request
      ->getSchemeAndHttpHost();
  }
  else {
    $httpProtocol = $this->linkcheckerSetting
      ->get('default_url_scheme');
    $baseUrl = $httpProtocol . $this->linkcheckerSetting
      ->get('base_path');
  }
  if (empty($baseContentUrl)) {
    $baseContentUrl = $baseUrl;
  }

  // Bare scheme name (e.g. 'https') used to prefix protocol-relative URLs.
  // $httpProtocol may carry a trailing '://' (it always does when built from
  // the request), which must not be duplicated in front of the '//' that a
  // protocol-relative URL already starts with.
  $scheme = rtrim((string) $httpProtocol, ':/');

  $links = [];
  foreach ($urls as $url) {

    // Decode HTML links into plain text links.
    // DOMDocument->loadHTML does not provide the RAW url from code. All html
    // entities are already decoded.
    // @todo: Try to find a way to get the raw value.
    $urlDecoded = $url;

    // Prefix protocol relative urls with a protocol to allow link checking.
    if (preg_match('!^//!', $urlDecoded)) {
      $urlDecoded = $scheme . ':' . $urlDecoded;
    }

    // FIXME: #1149596 HACK - Encode spaces in URLs, so validation equals TRUE
    // and link gets added.
    $urlEncoded = str_replace(' ', '%20', $urlDecoded);

    // Full qualified URLs.
    if ($checkLinksType != LinkCheckerLinkInterface::TYPE_INTERNAL && UrlHelper::isValid($urlEncoded, TRUE)) {

      // Add to Array and change HTML links into plain text links.
      $links[$urlDecoded] = $url;
    }
    elseif (preg_match('/^\\w[\\w.+]*:/', $urlDecoded)) {
      // Anything else that starts with a scheme (e.g. "mailto:") is skipped.
      continue;
    }
    elseif ($checkLinksType != LinkCheckerLinkInterface::TYPE_EXTERNAL && UrlHelper::isValid($urlEncoded, FALSE)) {
      $absoluteContentPath = $this
        ->getAbsoluteContentPath($baseContentUrl);

      // Absolute local URLs need to start with [/].
      if (preg_match('!^/!', $urlDecoded)) {

        // Add to Array and change HTML encoded links into plain text links.
        $links[$baseUrl . $urlDecoded] = $baseUrl . $url;
      }
      elseif (!empty($baseContentUrl) && preg_match('!^[?#]!', $urlDecoded)) {

        // Query-only or fragment-only link: append it to the content URL.
        // The key must include the link itself, otherwise all such links
        // collapse into a single array entry.
        $links[$baseContentUrl . $urlDecoded] = $baseContentUrl . $url;
      }
      elseif (!empty($absoluteContentPath) && preg_match('!^\\.{1,2}/!', $urlDecoded)) {

        // Build the URI without hostname before the URI is normalized and
        // dot-segments will be removed. The hostname is added back after the
        // normalization has completed to prevent hostname removal by the
        // regex. This logic intentionally does not implement all the rules
        // defined in RFC 3986, section 5.2.4 to show broken links and
        // over-dot-segmented URIs; e.g., https://example.com/../../foo/bar.
        // For more information, see https://drupal.org/node/832388.
        // NOTE(review): this strips strlen($baseUrl) leading characters, so
        // it assumes $absoluteContentPath starts with $baseUrl - confirm for
        // callers passing a $baseContentUrl on a different host.
        $path = substr_replace($absoluteContentPath . $urlDecoded, '', 0, strlen($baseUrl));

        // Remove './' segments where possible.
        $path = str_replace('/./', '/', $path);

        // Remove '../' segments where possible. Loop until all segments are
        // removed. Taken over from _drupal_build_css_path() in common.inc.
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
        }

        // Glue the hostname and path to full-qualified URI.
        $links[$baseUrl . $path] = $baseUrl . $path;
      }
      elseif (!empty($absoluteContentPath) && preg_match('!^[^/]!', $urlDecoded)) {

        // Plain relative path: resolve it against the content path.
        $links[$absoluteContentPath . $url] = $absoluteContentPath . $url;
      }
      else {

        // @todo Are there more special cases the module need to handle?
      }
    }
  }

  // Drop every link that matches the blacklist.
  return array_filter($links, function ($url) {
    return !$this
      ->isUrlBlacklisted($url);
  });
}