<?php
namespace Drupal\linkchecker;
use Drupal\Component\Datetime\TimeInterface;
use Drupal\Component\Utility\UrlHelper;
use Drupal\Core\Config\ConfigFactory;
use Drupal\Core\Database\Connection;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Entity\FieldableEntityInterface;
use Drupal\Core\Entity\TranslatableInterface;
use Drupal\Core\Field\FieldItemListInterface;
use Drupal\linkchecker\Entity\LinkCheckerLink;
use Drupal\linkchecker\Plugin\LinkExtractorManager;
use Symfony\Component\HttpFoundation\RequestStack;
/**
 * Extracts links from fieldable entities and keeps track of extracted links.
 *
 * The service walks over the fields of an entity, delegates the raw URL
 * extraction to a link extractor plugin configured per field, normalizes the
 * found URLs to absolute form and wraps them in LinkCheckerLink entities.
 */
class LinkExtractorService {

  /**
   * Plugin manager used to instantiate link extractor plugins.
   *
   * @var \Drupal\linkchecker\Plugin\LinkExtractorManager
   */
  protected $extractorManager;

  /**
   * The entity type manager.
   *
   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
   */
  protected $entityTypeManager;

  /**
   * The 'linkchecker.settings' configuration object.
   *
   * @var \Drupal\Core\Config\ImmutableConfig
   */
  protected $linkcheckerSetting;

  /**
   * The current request, or NULL when there is none.
   *
   * @var \Symfony\Component\HttpFoundation\Request|null
   */
  protected $request;

  /**
   * The database connection.
   *
   * @var \Drupal\Core\Database\Connection
   */
  protected $database;

  /**
   * The time service.
   *
   * @var \Drupal\Component\Datetime\TimeInterface
   */
  protected $time;

  /**
   * Running array key for extracted links.
   *
   * Reset by extractFromEntity() and incremented by extractFromField() so
   * that links of different fields never share a key. Defaults to 0 so that
   * extractFromField() also works when it is called directly.
   *
   * @var int
   */
  protected $pos = 0;

  /**
   * Constructs a LinkExtractorService object.
   *
   * @param \Drupal\linkchecker\Plugin\LinkExtractorManager $extractorManager
   *   The link extractor plugin manager.
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
   *   The entity type manager.
   * @param \Drupal\Core\Config\ConfigFactory $configFactory
   *   The config factory; only 'linkchecker.settings' is read from it.
   * @param \Symfony\Component\HttpFoundation\RequestStack $requestStack
   *   The request stack; only the current request is kept.
   * @param \Drupal\Core\Database\Connection $dbConnection
   *   The database connection.
   * @param \Drupal\Component\Datetime\TimeInterface $time
   *   The time service.
   */
  public function __construct(LinkExtractorManager $extractorManager, EntityTypeManagerInterface $entityTypeManager, ConfigFactory $configFactory, RequestStack $requestStack, Connection $dbConnection, TimeInterface $time) {
    $this->extractorManager = $extractorManager;
    $this->entityTypeManager = $entityTypeManager;
    $this->linkcheckerSetting = $configFactory->get('linkchecker.settings');
    $this->request = $requestStack->getCurrentRequest();
    $this->database = $dbConnection;
    $this->time = $time;
  }

  /**
   * Extracts links from every field of an entity.
   *
   * Translatable fields of translatable entities are processed once per
   * translation language; all other fields are processed once.
   *
   * @param \Drupal\Core\Entity\FieldableEntityInterface $entity
   *   The entity to scan.
   *
   * @return \Drupal\linkchecker\Entity\LinkCheckerLink[]
   *   Unsaved link entities, keyed by a zero-based running position.
   */
  public function extractFromEntity(FieldableEntityInterface $entity) {
    $links = [];
    // Restart the numbering: the array union below only keeps every link
    // because extractFromField() hands out keys that are unique per run.
    $this->pos = 0;

    foreach ($entity->getFieldDefinitions() as $fieldDefinition) {
      $fieldName = $fieldDefinition->getName();

      if ($entity instanceof TranslatableInterface && $fieldDefinition->isTranslatable()) {
        foreach ($entity->getTranslationLanguages() as $language) {
          $translation = $entity->getTranslation($language->getId());
          $links += $this->extractFromField($translation->get($fieldName));
        }
      }
      else {
        $links += $this->extractFromField($entity->get($fieldName));
      }
    }

    return $links;
  }

  /**
   * Extracts links from a single field.
   *
   * Nothing is extracted unless the field's 'linkchecker' third party setting
   * 'scan' is enabled. The extractor plugin is chosen by the 'extractor'
   * third party setting of the same field config.
   *
   * @param \Drupal\Core\Field\FieldItemListInterface $fieldItemList
   *   The field items to scan.
   *
   * @return \Drupal\linkchecker\Entity\LinkCheckerLink[]
   *   Unsaved link entities, keyed by the running position counter.
   */
  public function extractFromField(FieldItemListInterface $fieldItemList) {
    $entity = $fieldItemList->getEntity();
    $fieldDefinition = $fieldItemList->getFieldDefinition();
    $fieldConfig = $fieldDefinition->getConfig($entity->bundle());

    if (!$fieldConfig->getThirdPartySetting('linkchecker', 'scan', FALSE)) {
      return [];
    }

    // The canonical URL of the entity is the base for relative links. Not
    // every entity can produce one, so fall back to NULL and let getLinks()
    // use the site base URL instead.
    try {
      $baseContentUrl = $entity->toUrl()->setAbsolute()->toString();
    }
    catch (\Exception $e) {
      $baseContentUrl = NULL;
    }

    $extractorName = $fieldConfig->getThirdPartySetting('linkchecker', 'extractor', NULL);
    $extractor = $this->extractorManager->createInstance($extractorName);

    // Drop empty results and duplicates before normalizing.
    $urls = array_unique(array_filter($extractor->extract($fieldItemList->getValue())));
    $urls = $this->getLinks($urls, $baseContentUrl);

    $linkCheckerLinks = [];
    foreach ($urls as $link) {
      $linkCheckerLinks[$this->pos] = LinkCheckerLink::create([
        'url' => $link,
        'entity_id' => [
          'target_id' => $entity->id(),
          'target_type' => $entity->getEntityTypeId(),
        ],
        'entity_field' => $fieldDefinition->getName(),
        'entity_langcode' => $fieldItemList->getLangcode(),
      ]);
      $this->pos++;
    }

    return $linkCheckerLinks;
  }

  /**
   * Normalizes raw URLs to absolute URLs and filters out unwanted ones.
   *
   * Fully qualified URLs are kept unless only internal links are checked.
   * URLs with any other scheme prefix (anything matching "word:") are
   * skipped. Relative URLs are resolved against the site base URL or the
   * content URL unless only external links are checked. Finally every URL
   * rejected by isUrlBlacklisted() is removed.
   *
   * @param array $urls
   *   Raw URLs as found in the content.
   * @param string|null $baseContentUrl
   *   (optional) Absolute URL of the content the links were found in. The
   *   site base URL is used when empty.
   *
   * @return array
   *   Absolute URLs, keyed by the absolute URL.
   */
  public function getLinks(array $urls, $baseContentUrl = NULL) {
    $checkLinksType = $this->linkcheckerSetting->get('check_links_types');

    if (isset($this->request)) {
      $httpProtocol = $this->request->getScheme() . '://';
      $baseUrl = $this->request->getSchemeAndHttpHost();
    }
    else {
      // No request available: build the base URL from configuration.
      $httpProtocol = $this->linkcheckerSetting->get('default_url_scheme');
      $baseUrl = $httpProtocol . $this->linkcheckerSetting->get('base_path');
    }
    // $httpProtocol carries the '://' separator; protocol relative URLs
    // bring their own '//', so only the bare scheme name may be prepended.
    $schemeName = rtrim((string) $httpProtocol, ':/');

    if (empty($baseContentUrl)) {
      $baseContentUrl = $baseUrl;
    }

    // Resolved on first use only; the value is the same for every URL.
    $absoluteContentPath = NULL;
    $contentPathResolved = FALSE;

    $links = [];
    foreach ($urls as $url) {
      $urlDecoded = $url;

      // Prefix protocol relative URLs with a scheme so they can be checked.
      if (preg_match('!^//!', $urlDecoded)) {
        $urlDecoded = $schemeName . ':' . $urlDecoded;
      }

      // Encode spaces so that URLs containing them still validate.
      $urlEncoded = str_replace(' ', '%20', $urlDecoded);

      // Fully qualified URLs.
      if ($checkLinksType != LinkCheckerLinkInterface::TYPE_INTERNAL && UrlHelper::isValid($urlEncoded, TRUE)) {
        // Store the scheme-prefixed form: the raw '//host' form would be
        // thrown away by the http(s) check in isUrlBlacklisted().
        $links[$urlDecoded] = $urlDecoded;
        continue;
      }

      // Anything else that starts with a scheme is not checkable here.
      if (preg_match('/^\\w[\\w.+]*:/', $urlDecoded)) {
        continue;
      }

      // Local URLs.
      if ($checkLinksType == LinkCheckerLinkInterface::TYPE_EXTERNAL || !UrlHelper::isValid($urlEncoded, FALSE)) {
        continue;
      }

      if (!$contentPathResolved) {
        $absoluteContentPath = $this->getAbsoluteContentPath($baseContentUrl);
        $contentPathResolved = TRUE;
      }

      if (preg_match('!^/!', $urlDecoded)) {
        // Absolute local path: prepend the site base URL.
        $links[$baseUrl . $urlDecoded] = $baseUrl . $url;
      }
      elseif (!empty($baseContentUrl) && preg_match('!^[?#]!', $urlDecoded)) {
        // Query string or fragment only: append to the content URL. The key
        // must include the link itself, otherwise all such links of one
        // field collapse into a single entry.
        $links[$baseContentUrl . $urlDecoded] = $baseContentUrl . $url;
      }
      elseif (!empty($absoluteContentPath) && preg_match('!^\\.{1,2}/!', $urlDecoded)) {
        // Path relative to the content directory with dot segments. Strip
        // the base URL (assumed to be the prefix of the content path) and
        // work on the path only.
        $path = substr_replace($absoluteContentPath . $urlDecoded, '', 0, strlen($baseUrl));

        // Remove './'.
        $path = str_replace('/./', '/', $path);

        // Collapse 'dir/../' pairs until nothing changes any more.
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\\.\\./)([^/]+)/\\.\\./`', '$1', $path);
        }

        $links[$baseUrl . $path] = $baseUrl . $path;
      }
      elseif (!empty($absoluteContentPath) && preg_match('!^[^/]!', $urlDecoded)) {
        // Plain relative path: append to the content directory.
        $links[$absoluteContentPath . $url] = $absoluteContentPath . $url;
      }
      // Everything else cannot be resolved and is ignored.
    }

    return array_filter($links, function ($url) {
      return !$this->isUrlBlacklisted($url);
    });
  }

  /**
   * Checks whether a link is still present in the content it came from.
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link to look for.
   *
   * @return bool
   *   TRUE if re-extracting the parent entity field yields a URL with the
   *   same hash, FALSE otherwise (including when the parent entity, its
   *   translation or the field no longer exists).
   */
  public function isLinkExists(LinkCheckerLinkInterface $link) {
    $entity = $link->getParentEntity();
    if (!isset($entity)) {
      return FALSE;
    }

    if ($entity instanceof TranslatableInterface) {
      $langcode = $link->getParentEntityLangcode();
      if (!$entity->hasTranslation($langcode)) {
        return FALSE;
      }
      $entity = $entity->getTranslation($langcode);
    }

    $fieldName = $link->getParentEntityFieldName();
    if (!$entity->hasField($fieldName)) {
      return FALSE;
    }

    $expectedHash = $link->getHash();
    foreach ($this->extractFromField($entity->get($fieldName)) as $extractedLink) {
      // Strict comparison: loosely compared numeric-looking hash strings
      // could be treated as equal numbers.
      if (LinkCheckerLink::generateHash($extractedLink->getUrl()) === $expectedHash) {
        return TRUE;
      }
    }

    return FALSE;
  }

  /**
   * Saves several links, skipping those that already exist.
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface[] $links
   *   The links to save.
   */
  public function saveLinkMultiple(array $links) {
    foreach ($links as $link) {
      $this->saveLink($link);
    }
  }

  /**
   * Saves a link unless an identical one is already stored.
   *
   * A link counts as identical when URL hash, parent entity id and type,
   * field name and langcode all match.
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link to save.
   */
  public function saveLink(LinkCheckerLinkInterface $link) {
    $parent = $link->getParentEntity();
    $storage = $this->entityTypeManager->getStorage($link->getEntityTypeId());

    $ids = $storage->getQuery()
      // State the access mode explicitly (TRUE was the implicit default);
      // newer Drupal core rejects entity queries that leave it undecided.
      ->accessCheck(TRUE)
      ->condition('urlhash', LinkCheckerLink::generateHash($link->getUrl()))
      ->condition('entity_id.target_id', $parent->id())
      ->condition('entity_id.target_type', $parent->getEntityTypeId())
      ->condition('entity_field', $link->getParentEntityFieldName())
      ->condition('entity_langcode', $link->getParentEntityLangcode())
      ->execute();

    if (empty($ids)) {
      $link->save();
    }
  }

  /**
   * Records the current time as last extraction time of an entity.
   *
   * Inserts a row into 'linkchecker_index' for the entity id / entity type
   * pair or updates 'last_extracted_time' of the existing row(s).
   *
   * @param \Drupal\Core\Entity\FieldableEntityInterface $entity
   *   The entity that has just been scanned.
   */
  public function updateEntityExtractIndex(FieldableEntityInterface $entity) {
    // A merge query decides between insert and update itself. The former
    // hand-written check applied empty() to the first selected column, so an
    // existing row whose value was 0/'0' looked like "no row".
    $this->database->merge('linkchecker_index')
      ->keys([
        'entity_id' => $entity->id(),
        'entity_type' => $entity->getEntityTypeId(),
      ])
      ->fields([
        'last_extracted_time' => $this->time->getCurrentTime(),
      ])
      ->execute();
  }

  /**
   * Checks whether a URL must not be checked.
   *
   * @param string $url
   *   The absolute URL.
   *
   * @return bool
   *   TRUE if the URL contains one of the lines of the
   *   'check.disable_link_check_for_urls' setting or is not an http(s) URL,
   *   FALSE otherwise.
   */
  protected function isUrlBlacklisted($url) {
    $urls = $this->linkcheckerSetting->get('check.disable_link_check_for_urls');
    if (!empty($urls)) {
      // Skip empty lines: an empty alternative in the pattern matches every
      // URL, so a trailing newline would blacklist everything.
      $lines = preg_split('/(\\r\\n?|\\n)/', $urls, -1, PREG_SPLIT_NO_EMPTY);
      $patterns = array_map(function ($line) {
        return preg_quote($line, '/');
      }, $lines);

      if (!empty($patterns) && preg_match('/' . implode('|', $patterns) . '/', $url)) {
        return TRUE;
      }
    }

    // Only http and https URLs can be checked.
    if (!preg_match('/^(https?):\\/\\//i', $url)) {
      return TRUE;
    }

    return FALSE;
  }

  /**
   * Returns the "directory" part of an absolute http(s) URL.
   *
   * Query string and fragment are dropped and everything after the last
   * slash of the path is cut off, e.g. 'http://example.com/a/b' becomes
   * 'http://example.com/a/'.
   *
   * @param string $url
   *   The absolute URL.
   *
   * @return string|null
   *   The URL up to and including the last slash, or NULL if the URL cannot
   *   be parsed, has no host or is not an http/https URL.
   */
  protected function getAbsoluteContentPath($url) {
    $uri = parse_url((string) $url);
    if (empty($uri)) {
      return NULL;
    }

    if (!isset($uri['scheme']) || !isset($uri['host'])) {
      return NULL;
    }

    // Only the default ports of supported schemes are known.
    $defaultPorts = [
      'http' => 80,
      'https' => 443,
    ];
    if (!isset($defaultPorts[$uri['scheme']])) {
      return NULL;
    }

    $user = '';
    if (isset($uri['user'])) {
      $hasPassword = isset($uri['pass']) && $uri['pass'] !== '';
      $user = $uri['user'] . ($hasPassword ? ':' . $uri['pass'] : '') . '@';
    }

    // Keep an explicit port unless it is the default of this very scheme;
    // 'https://host:80' must not silently turn into 'https://host'.
    $host = $uri['host'];
    if (isset($uri['port']) && (int) $uri['port'] !== $defaultPorts[$uri['scheme']]) {
      $host .= ':' . $uri['port'];
    }

    $path = isset($uri['path']) ? $uri['path'] : '/';

    $absoluteUrl = $uri['scheme'] . '://' . $user . $host . $path;

    // strrpos() yields a byte offset, so cut with the byte based substr();
    // mixing it with mb_substr() breaks paths with multibyte characters.
    $lastSlash = strrpos($absoluteUrl, '/');

    return substr($absoluteUrl, 0, $lastSlash + 1);
  }

}