public function LinkExtractorService::getLinks in Link checker 8
Filters out URLs that do not need to be checked.
Parameters
array $urls: Array of URLs.
string $baseContentUrl: Base URL used to resolve internal (non-absolute) URLs.
Return value
array List of links.
1 call to LinkExtractorService::getLinks()
- LinkExtractorService::extractFromField in src/LinkExtractorService.php - Extracts links from field.
File
- src/LinkExtractorService.php, line 181
Class
- LinkExtractorService
- Class LinkExtractor.
Namespace
Drupal\linkchecker
Code
/**
 * Filters out URLs that do not need to be checked.
 *
 * Each candidate URL is handled in one of three ways:
 * - Fully qualified URLs are kept unless only internal links are checked.
 * - Other URLs that start with a scheme (anything matching "scheme:") are
 *   skipped.
 * - Internal URLs are kept unless only external links are checked, and are
 *   resolved to an absolute URL against the site base URL or the content URL.
 * Blacklisted URLs are removed from the result before it is returned.
 *
 * @param array $urls
 *   Array of URLs.
 * @param string|null $baseContentUrl
 *   Base URL for internal, non-absolute URLs. When empty, the site base URL
 *   is used instead.
 *
 * @return array
 *   List of links keyed by the resolved URL. For fully qualified URLs the
 *   value is the URL as it was passed in; for internal URLs the value is the
 *   resolved absolute URL.
 */
public function getLinks(array $urls, $baseContentUrl = NULL) {
  // What type of links should be checked?
  $checkLinksType = $this->linkcheckerSetting
    ->get('check_links_types');
  if (isset($this->request)) {
    $httpProtocol = $this->request
      ->getScheme() . '://';
    $baseUrl = $this->request
      ->getSchemeAndHttpHost();
  }
  else {
    $httpProtocol = $this->linkcheckerSetting
      ->get('default_url_scheme');
    $baseUrl = $httpProtocol . $this->linkcheckerSetting
      ->get('base_path');
  }
  // $httpProtocol may carry a trailing '://' (it always does when built from
  // the request). Reduce it to the bare scheme so that prefixing a protocol
  // relative URL yields 'http://host' rather than 'http://://host'.
  $scheme = rtrim((string) $httpProtocol, ':/');
  if (empty($baseContentUrl)) {
    $baseContentUrl = $baseUrl;
  }
  $links = [];
  foreach ($urls as $url) {
    // Decode HTML links into plain text links.
    // DOMDocument->loadHTML does not provide the RAW url from code. All html
    // entities are already decoded.
    // @todo: Try to find a way to get the raw value.
    $urlDecoded = $url;
    // Prefix protocol relative urls with a protocol to allow link checking.
    if (preg_match('!^//!', $urlDecoded)) {
      $urlDecoded = $scheme . ':' . $urlDecoded;
    }
    // FIXME: #1149596 HACK - Encode spaces in URLs, so validation equals TRUE
    // and link gets added.
    $urlEncoded = str_replace(' ', '%20', $urlDecoded);
    // Full qualified URLs.
    if ($checkLinksType != LinkCheckerLinkInterface::TYPE_INTERNAL && UrlHelper::isValid($urlEncoded, TRUE)) {
      // Add to Array and change HTML links into plain text links.
      $links[$urlDecoded] = $url;
    }
    elseif (preg_match('/^\\w[\\w.+]*:/', $urlDecoded)) {
      // Any other URL carrying a scheme is not handled here; skip it.
      continue;
    }
    elseif ($checkLinksType != LinkCheckerLinkInterface::TYPE_EXTERNAL && UrlHelper::isValid($urlEncoded, FALSE)) {
      $absoluteContentPath = $this
        ->getAbsoluteContentPath($baseContentUrl);
      // Absolute local URLs need to start with [/].
      if (preg_match('!^/!', $urlDecoded)) {
        // Add to Array and change HTML encoded links into plain text links.
        $links[$baseUrl . $urlDecoded] = $baseUrl . $url;
      }
      elseif (!empty($baseContentUrl) && preg_match('!^[?#]!', $urlDecoded)) {
        // Query-only or fragment-only links are relative to the content URL.
        // Key by the resolved URL so that distinct links do not overwrite
        // each other.
        $links[$baseContentUrl . $urlDecoded] = $baseContentUrl . $url;
      }
      elseif (!empty($absoluteContentPath) && preg_match('!^\\.{1,2}/!', $urlDecoded)) {
        // Build the URI without hostname before the URI is normalized and
        // dot-segments will be removed. The hostname is added back after the
        // normalization has completed to prevent hostname removal by the
        // regex. This logic intentionally does not implement all the rules
        // defined in RFC 3986, section 5.2.4 to show broken links and
        // over-dot-segmented URIs; e.g., https://example.com/../../foo/bar.
        // For more information, see https://drupal.org/node/832388.
        $path = substr_replace($absoluteContentPath . $urlDecoded, '', 0, strlen($baseUrl));
        // Remove './' segments where possible.
        $path = str_replace('/./', '/', $path);
        // Remove '../' segments where possible. Loop until all segments are
        // removed. Taken over from _drupal_build_css_path() in common.inc.
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
        }
        // Glue the hostname and path to full-qualified URI.
        $links[$baseUrl . $path] = $baseUrl . $path;
      }
      elseif (!empty($absoluteContentPath) && preg_match('!^[^/]!', $urlDecoded)) {
        // Plain relative path: resolve it against the content path.
        $links[$absoluteContentPath . $url] = $absoluteContentPath . $url;
      }
      else {
        // @todo Are there more special cases the module need to handle?
      }
    }
  }
  // Drop every link whose value is reported as blacklisted; keys are kept.
  return array_filter($links, function ($url) {
    return !$this
      ->isUrlBlacklisted($url);
  });
}