View source
<?php
namespace Drupal\linkchecker;
use Drupal\Component\Datetime\TimeInterface;
use Drupal\Component\Utility\UrlHelper;
use Drupal\Core\Config\ConfigFactory;
use Drupal\Core\Database\Connection;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Entity\FieldableEntityInterface;
use Drupal\Core\Entity\TranslatableInterface;
use Drupal\Core\Field\FieldItemListInterface;
use Drupal\linkchecker\Entity\LinkCheckerLink;
use Drupal\linkchecker\Plugin\LinkExtractorManager;
use Symfony\Component\HttpFoundation\RequestStack;
/**
 * Extracts links from entity fields and persists them as link entities.
 *
 * Fields opt in through the 'linkchecker' third-party settings ('scan' and
 * 'extractor') on their field config. Extracted URLs are normalised to
 * absolute http(s) URLs, filtered against the configured exclusion list and
 * wrapped in LinkCheckerLink entities.
 */
class LinkExtractorService {
/**
 * The link extractor plugin manager, used to instantiate the extractor
 * plugin named in a field's third-party settings.
 *
 * @var \Drupal\linkchecker\Plugin\LinkExtractorManager
 */
protected $extractorManager;
/**
 * The entity type manager, used to obtain the link entity storage.
 *
 * @var \Drupal\Core\Entity\EntityTypeManagerInterface
 */
protected $entityTypeManager;
/**
 * The 'linkchecker.settings' configuration object.
 *
 * @var object
 */
protected $linkcheckerSetting;
/**
 * The current request as returned by the request stack when the service was
 * constructed. getLinks() treats an unset value as "no request available".
 *
 * @var \Symfony\Component\HttpFoundation\Request|null
 */
protected $request;
/**
 * The database connection, used for the 'linkchecker_index' table.
 *
 * @var \Drupal\Core\Database\Connection
 */
protected $database;
/**
 * The time service, used to stamp the last extraction time.
 *
 * @var \Drupal\Component\Datetime\TimeInterface
 */
protected $time;
/**
 * Running array key for extracted links.
 *
 * Reset to 0 by extractFromEntity() and incremented by extractFromField()
 * for every link it creates, so results of several fields can be combined
 * with the array union operator without key collisions.
 *
 * @var int
 */
protected $pos;
/**
 * Constructs a new LinkExtractorService.
 *
 * @param \Drupal\linkchecker\Plugin\LinkExtractorManager $extractorManager
 *   The link extractor plugin manager.
 * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
 *   The entity type manager.
 * @param \Drupal\Core\Config\ConfigFactory $configFactory
 *   The config factory; only 'linkchecker.settings' is read from it.
 * @param \Symfony\Component\HttpFoundation\RequestStack $requestStack
 *   The request stack; the current request is captured immediately.
 * @param \Drupal\Core\Database\Connection $dbConnection
 *   The database connection.
 * @param \Drupal\Component\Datetime\TimeInterface $time
 *   The time service.
 */
public function __construct(
  LinkExtractorManager $extractorManager,
  EntityTypeManagerInterface $entityTypeManager,
  ConfigFactory $configFactory,
  RequestStack $requestStack,
  Connection $dbConnection,
  TimeInterface $time
) {
  // Collaborators that are stored exactly as injected.
  $this->time = $time;
  $this->database = $dbConnection;
  $this->entityTypeManager = $entityTypeManager;
  $this->extractorManager = $extractorManager;

  // Values resolved once, at construction time.
  $this->request = $requestStack->getCurrentRequest();
  $this->linkcheckerSetting = $configFactory->get('linkchecker.settings');
}
/**
 * Extracts links from every field of an entity.
 *
 * Translatable fields of translatable entities are scanned once per
 * translation language; every other field is scanned once on the entity as
 * passed in.
 *
 * @param \Drupal\Core\Entity\FieldableEntityInterface $entity
 *   The entity to scan.
 *
 * @return array
 *   The link entities produced by extractFromField(), keyed by a running
 *   position that restarts at 0 on every call.
 */
public function extractFromEntity(FieldableEntityInterface $entity) {
  // Restart the key counter shared with extractFromField().
  $this->pos = 0;
  $collected = [];
  $entityIsTranslatable = $entity instanceof TranslatableInterface;

  foreach ($entity->getFieldDefinitions() as $definition) {
    $fieldName = $definition->getName();

    // Untranslatable case: scan the field on the given entity only.
    if (!($entityIsTranslatable && $definition->isTranslatable())) {
      $collected += $this->extractFromField($entity->get($fieldName));
      continue;
    }

    // Translatable case: scan the field in each available translation.
    foreach ($entity->getTranslationLanguages() as $language) {
      $translatedItems = $entity
        ->getTranslation($language->getId())
        ->get($fieldName);
      $collected += $this->extractFromField($translatedItems);
    }
  }

  return $collected;
}
/**
 * Extracts links from a single field item list.
 *
 * Nothing is extracted unless the field config has the 'linkchecker'
 * third-party setting 'scan' enabled. When it is, the extractor plugin named
 * by the 'extractor' setting is run on the field values, empty and duplicate
 * URLs are dropped, and the remainder is normalised by getLinks().
 *
 * @param \Drupal\Core\Field\FieldItemListInterface $fieldItemList
 *   The field items to scan.
 *
 * @return array
 *   Link entities created with LinkCheckerLink::create() (not saved here),
 *   keyed by the running position counter.
 */
public function extractFromField(FieldItemListInterface $fieldItemList) {
  // extractFromEntity() resets the counter, but this method is also called
  // on its own (see isLinkExists()). Without this guard the first link would
  // be stored under a NULL array key.
  if (!isset($this->pos)) {
    $this->pos = 0;
  }

  $entity = $fieldItemList->getEntity();
  $fieldDefinition = $fieldItemList->getFieldDefinition();
  $fieldConfig = $fieldDefinition->getConfig($entity->bundle());

  $urls = [];
  if ($fieldConfig->getThirdPartySetting('linkchecker', 'scan', FALSE)) {
    // Entities without a canonical URL cannot provide a base for relative
    // links; getLinks() then falls back to the site base URL.
    try {
      $baseContentUrl = $entity->toUrl()->setAbsolute()->toString();
    }
    catch (\Exception $e) {
      $baseContentUrl = NULL;
    }

    $extractorName = $fieldConfig->getThirdPartySetting('linkchecker', 'extractor', NULL);
    $extractor = $this->extractorManager->createInstance($extractorName);
    $urls = $extractor->extract($fieldItemList->getValue());

    // Drop empty values, then duplicates, before normalising.
    $urls = array_unique(array_filter($urls));
    $urls = $this->getLinks($urls, $baseContentUrl);
  }

  $fieldName = $fieldDefinition->getName();
  $langcode = $fieldItemList->getLangcode();

  $linkCheckerLinks = [];
  foreach ($urls as $link) {
    $linkCheckerLinks[$this->pos] = LinkCheckerLink::create([
      'url' => $link,
      'entity_id' => [
        'target_id' => $entity->id(),
        'target_type' => $entity->getEntityTypeId(),
      ],
      'entity_field' => $fieldName,
      'entity_langcode' => $langcode,
    ]);
    $this->pos++;
  }

  return $linkCheckerLinks;
}
/**
 * Normalises raw URLs into absolute URLs that should be checked.
 *
 * Depending on the 'check_links_types' setting, fully qualified and/or local
 * URLs are kept. Local URLs are made absolute against the site base URL or
 * against the URL of the content they were found in. URLs with any other
 * scheme (mailto:, javascript:, ...) are skipped, and the result is filtered
 * through isUrlBlacklisted().
 *
 * @param array $urls
 *   Raw URLs as found in the content.
 * @param string|null $baseContentUrl
 *   Absolute URL of the content the URLs were found in. Falls back to the
 *   site base URL when empty.
 *
 * @return array
 *   Absolute URLs keyed by their normalised form.
 */
public function getLinks(array $urls, $baseContentUrl = NULL) {
  $checkLinksType = $this->linkcheckerSetting->get('check_links_types');

  if (isset($this->request)) {
    $httpProtocol = $this->request->getScheme() . '://';
    $baseUrl = $this->request->getSchemeAndHttpHost();
  }
  else {
    $httpProtocol = $this->linkcheckerSetting->get('default_url_scheme');
    $baseUrl = $httpProtocol . $this->linkcheckerSetting->get('base_path');
  }

  // Bare scheme name ("https"), used to complete protocol-relative URLs.
  // $httpProtocol may already carry "://", which previously produced
  // "https://://host".
  $scheme = rtrim((string) $httpProtocol, ':/');
  if ($scheme === '') {
    $scheme = 'http';
  }

  if (empty($baseContentUrl)) {
    $baseContentUrl = $baseUrl;
  }

  $links = [];

  foreach ($urls as $url) {
    $urlDecoded = $url;

    // Prefix protocol-relative URLs ("//host/path") with a scheme so they
    // can be validated and checked.
    if (preg_match('!^//!', $urlDecoded)) {
      $urlDecoded = $scheme . ':' . $urlDecoded;
    }

    // Encode spaces so that validation accepts URLs containing them.
    $urlEncoded = str_replace(' ', '%20', $urlDecoded);

    // Fully qualified URLs.
    if ($checkLinksType != LinkCheckerLinkInterface::TYPE_INTERNAL && UrlHelper::isValid($urlEncoded, TRUE)) {
      // Store the scheme-completed form; for every URL that is not
      // protocol-relative this is identical to the raw URL.
      $links[$urlDecoded] = $urlDecoded;
    }
    // Skip mailto:, javascript: and any other non-http scheme.
    elseif (preg_match('/^\\w[\\w.+]*:/', $urlDecoded)) {
      continue;
    }
    // Local URLs.
    elseif ($checkLinksType != LinkCheckerLinkInterface::TYPE_EXTERNAL && UrlHelper::isValid($urlEncoded, FALSE)) {
      $absoluteContentPath = $this->getAbsoluteContentPath($baseContentUrl);

      // Root-relative URLs: "/foo".
      if (preg_match('!^/!', $urlDecoded)) {
        $links[$baseUrl . $urlDecoded] = $baseUrl . $url;
      }
      // Anchors and query strings: "#foo", "?foo=bar". The key must include
      // the URL itself, otherwise all such links of one page share a single
      // key and overwrite each other.
      elseif (!empty($baseContentUrl) && preg_match('!^[?#]!', $urlDecoded)) {
        $links[$baseContentUrl . $urlDecoded] = $baseContentUrl . $url;
      }
      // Dot-relative URLs: "./foo", "../foo".
      elseif (!empty($absoluteContentPath) && preg_match('!^\\.{1,2}/!', $urlDecoded)) {
        // Work on the path portion only, then collapse "/./" and "dir/../".
        $path = substr_replace($absoluteContentPath . $urlDecoded, '', 0, strlen($baseUrl));
        $path = str_replace('/./', '/', $path);
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
        }
        $links[$baseUrl . $path] = $baseUrl . $path;
      }
      // Document-relative URLs: "foo/bar".
      elseif (!empty($absoluteContentPath) && preg_match('!^[^/]!', $urlDecoded)) {
        $links[$absoluteContentPath . $url] = $absoluteContentPath . $url;
      }
    }
  }

  return array_filter($links, function ($url) {
    return !$this->isUrlBlacklisted($url);
  });
}
/**
 * Checks whether a link is still present in the content it was found in.
 *
 * The parent entity field is re-scanned (in the link's language when the
 * entity is translatable) and the URL hashes of the extracted links are
 * compared with the hash stored on the given link.
 *
 * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
 *   The link to look for.
 *
 * @return bool
 *   TRUE if the parent entity, translation and field exist and the field
 *   still yields a link with the same hash, FALSE otherwise.
 */
public function isLinkExists(LinkCheckerLinkInterface $link) {
  $entity = $link->getParentEntity();
  if (!isset($entity)) {
    return FALSE;
  }

  if ($entity instanceof TranslatableInterface) {
    $langcode = $link->getParentEntityLangcode();
    if (!$entity->hasTranslation($langcode)) {
      return FALSE;
    }
    $entity = $entity->getTranslation($langcode);
  }

  $fieldName = $link->getParentEntityFieldName();
  if (!$entity->hasField($fieldName)) {
    return FALSE;
  }

  $expectedHash = $link->getHash();
  foreach ($this->extractFromField($entity->get($fieldName)) as $extractedLink) {
    // Strict comparison: hashes are opaque strings and must never be
    // subject to PHP's numeric-string juggling.
    if (LinkCheckerLink::generateHash($extractedLink->getUrl()) === $expectedHash) {
      return TRUE;
    }
  }

  return FALSE;
}
/**
 * Saves several links, skipping those that already exist.
 *
 * @param array $links
 *   Link entities; each one is handed to saveLink() in array order.
 */
public function saveLinkMultiple(array $links) {
  foreach ($links as $candidate) {
    $this->saveLink($candidate);
  }
}
/**
 * Saves a link unless an identical one is already stored.
 *
 * "Identical" means same URL hash, parent entity (id and type), field name
 * and language code.
 *
 * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
 *   The link entity to save.
 */
public function saveLink(LinkCheckerLinkInterface $link) {
  $parentEntity = $link->getParentEntity();

  $existingIds = $this->entityTypeManager
    ->getStorage($link->getEntityTypeId())
    ->getQuery()
    // The duplicate lookup is bookkeeping and must not depend on what the
    // current user may view. Stating the access check explicitly is also
    // mandatory for entity queries on recent Drupal core versions.
    ->accessCheck(FALSE)
    ->condition('urlhash', LinkCheckerLink::generateHash($link->getUrl()))
    ->condition('entity_id.target_id', $parentEntity->id())
    ->condition('entity_id.target_type', $parentEntity->getEntityTypeId())
    ->condition('entity_field', $link->getParentEntityFieldName())
    ->condition('entity_langcode', $link->getParentEntityLangcode())
    ->execute();

  if (empty($existingIds)) {
    $link->save();
  }
}
/**
 * Records that links were just extracted from an entity.
 *
 * Inserts or updates the entity's row in the 'linkchecker_index' table,
 * setting 'last_extracted_time' to the current time.
 *
 * @param \Drupal\Core\Entity\FieldableEntityInterface $entity
 *   The entity that was scanned.
 */
public function updateEntityExtractIndex(FieldableEntityInterface $entity) {
  // A merge query is the Database API's upsert: it inserts the key and
  // value fields when no row matches the keys and updates the value fields
  // otherwise. Unlike a manual select followed by empty(), it does not
  // mistake an id of 0 for a missing row.
  $this->database
    ->merge('linkchecker_index')
    ->keys([
      'entity_id' => $entity->id(),
      'entity_type' => $entity->getEntityTypeId(),
    ])
    ->fields([
      'last_extracted_time' => $this->time->getCurrentTime(),
    ])
    ->execute();
}
/**
 * Checks whether a URL must be excluded from link checking.
 *
 * A URL is excluded when it contains one of the strings listed (one per
 * line) in the 'check.disable_link_check_for_urls' setting, or when it does
 * not start with "http://" or "https://".
 *
 * @param string $url
 *   The absolute URL to test.
 *
 * @return bool
 *   TRUE if the URL must not be checked, FALSE otherwise.
 */
protected function isUrlBlacklisted($url) {
  $urls = $this->linkcheckerSetting->get('check.disable_link_check_for_urls');

  if (!empty($urls)) {
    // One entry per line. Blank lines and trailing newlines must be
    // discarded: an empty alternative in the pattern matches every URL and
    // would blacklist everything.
    $entries = array_filter(
      array_map('trim', preg_split('/(\\r\\n?|\\n)/', $urls)),
      function ($entry) {
        return $entry !== '';
      }
    );

    if (!empty($entries)) {
      $quoted = array_map(function ($entry) {
        return preg_quote($entry, '/');
      }, $entries);

      if (preg_match('/' . implode('|', $quoted) . '/', $url)) {
        return TRUE;
      }
    }
  }

  // Only http and https URLs can be checked.
  return !preg_match('/^(https?):\\/\\//i', $url);
}
/**
 * Returns the absolute "directory" part of an http(s) URL.
 *
 * The result is the URL up to and including the last slash of its path,
 * without query string or fragment, e.g. "http://example.com/foo/bar?x=1"
 * becomes "http://example.com/foo/". A port is kept only when it differs
 * from the default port of the scheme.
 *
 * @param string $url
 *   The absolute URL of the content.
 *
 * @return string|null
 *   The absolute content path, or NULL if the URL cannot be parsed, has no
 *   host, or is not an http/https URL.
 */
protected function getAbsoluteContentPath($url) {
  $uri = parse_url((string) $url);
  if ($uri === FALSE || !isset($uri['scheme'], $uri['host'])) {
    return NULL;
  }

  // Schemes are case-insensitive; only http and https are supported.
  $defaultPorts = ['http' => 80, 'https' => 443];
  $schemeName = strtolower($uri['scheme']);
  if (!isset($defaultPorts[$schemeName])) {
    return NULL;
  }

  $userInfo = '';
  if (isset($uri['user'])) {
    $hasPassword = isset($uri['pass']) && $uri['pass'] !== '';
    $userInfo = $uri['user'] . ($hasPassword ? ':' . $uri['pass'] : '') . '@';
  }

  // Omit the port only when it is the default for this scheme.
  $port = '';
  if (isset($uri['port']) && (int) $uri['port'] !== $defaultPorts[$schemeName]) {
    $port = ':' . $uri['port'];
  }

  $path = isset($uri['path']) ? $uri['path'] : '/';
  $absoluteUrl = $uri['scheme'] . '://' . $userInfo . $uri['host'] . $port . $path;

  // strrpos() yields a byte offset, so the cut must be byte-based as well;
  // pairing it with mb_substr() truncates at the wrong place when the path
  // contains multibyte characters.
  return substr($absoluteUrl, 0, strrpos($absoluteUrl, '/') + 1);
}
}