You are here

class LinkCheckerService in Link checker 8

Class LinkCheckerService.

Hierarchy

Expanded class hierarchy of LinkCheckerService

2 files declare their use of LinkCheckerService
LinkCheck.php in src/Plugin/QueueWorker/LinkCheck.php
LinkCheckerAdminSettingsForm.php in src/Form/LinkCheckerAdminSettingsForm.php
1 string reference to 'LinkCheckerService'
linkchecker.services.yml in ./linkchecker.services.yml
linkchecker.services.yml
1 service uses LinkCheckerService
linkchecker.checker in ./linkchecker.services.yml
Drupal\linkchecker\LinkCheckerService

File

src/LinkCheckerService.php, line 21

Namespace

Drupal\linkchecker
View source
class LinkCheckerService {
  use StringTranslationTrait;

  /**
   * The entity type manager.
   *
   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
   */
  protected $entityTypeManager;

  /**
   * The Linkchecker settings.
   *
   * @var \Drupal\Core\Config\ImmutableConfig
   */
  protected $linkcheckerSetting;

  /**
   * The http client.
   *
   * @var \GuzzleHttp\Client
   */
  protected $httpClient;

  /**
   * The report link.
   *
   * @var \Drupal\Core\Link
   */
  protected $reportLink;

  /**
   * The time.
   *
   * @var \Drupal\Component\Datetime\TimeInterface
   */
  protected $time;

  /**
   * The queue.
   *
   * @var \Drupal\Core\Queue\QueueInterface
   */
  protected $queue;

  /**
   * The status handler manager.
   *
   * @var \Drupal\linkchecker\Plugin\LinkStatusHandlerManager
   */
  protected $statusHandlerManager;

  /**
   * Constructs a new LinkCheckerService object.
   *
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
   *   The entity type manager.
   * @param \Drupal\Core\Config\ConfigFactory $config
   *   The config factory; used to read 'linkchecker.settings'.
   * @param \GuzzleHttp\Client $httpClient
   *   The HTTP client used to send the link check requests.
   * @param \Drupal\Component\Datetime\TimeInterface $time
   *   The time service.
   * @param \Drupal\Core\Queue\QueueFactory $queueFactory
   *   The queue factory; used to get the 'linkchecker_check' queue.
   * @param \Drupal\linkchecker\Plugin\LinkStatusHandlerManager $statusHandlerManager
   *   The status handler plugin manager.
   */
  public function __construct(EntityTypeManagerInterface $entityTypeManager, ConfigFactory $config, Client $httpClient, TimeInterface $time, QueueFactory $queueFactory, LinkStatusHandlerManager $statusHandlerManager) {
    $this->entityTypeManager = $entityTypeManager;
    $this->linkcheckerSetting = $config->get('linkchecker.settings');
    $this->httpClient = $httpClient;
    $this->time = $time;
    $this->queue = $queueFactory->get('linkchecker_check');
    $this->statusHandlerManager = $statusHandlerManager;
  }

  /**
   * Queue all links for checking.
   *
   * Nothing new is queued while the queue still holds items. Otherwise one
   * link per URL hash whose last check is older than the configured interval
   * is selected, and the IDs are queued in chunks of at most
   * 'check.connections_max' IDs.
   *
   * @param bool $rebuild
   *   Defines whether rebuild queue or not.
   *
   * @return int
   *   Number of queued items.
   */
  public function queueLinks($rebuild = FALSE) {
    if ($rebuild) {
      $this->queue->deleteQueue();
    }

    // Do not enqueue anything while earlier items are still pending.
    $pending = $this->queue->numberOfItems();
    if (!empty($pending)) {
      return $pending;
    }

    $checkInterval = $this->linkcheckerSetting->get('check.interval');
    $linkIds = $this->entityTypeManager
      ->getStorage('linkcheckerlink')
      ->getAggregateQuery()
      ->groupBy('urlhash')
      ->aggregate('lid', 'MIN')
      ->condition('last_check', $this->time->getRequestTime() - $checkInterval, '<=')
      ->execute();

    $this->queue->createQueue();

    if (!empty($linkIds)) {
      // array_chunk() rejects a chunk size below 1, so a missing or zero
      // setting falls back to one link per queue item.
      $maxConnections = max(1, (int) $this->linkcheckerSetting->get('check.connections_max'));

      // Split ids by max connection amount to make possible send concurrent
      // requests.
      $chunks = array_chunk(array_column($linkIds, 'lid_min'), $maxConnections);
      foreach ($chunks as $ids) {
        $this->queue->createItem($ids);
      }
    }

    return $this->queue->numberOfItems();
  }

  /**
   * Check the link.
   *
   * Sends an asynchronous request for the link. The response (or the request
   * failure) is recorded on the link entity once the promise settles.
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link to check.
   *
   * @return \GuzzleHttp\Promise\PromiseInterface
   *   Promise of link checking request.
   */
  public function check(LinkCheckerLinkInterface $link) {
    $headers = [
      'User-Agent' => $this->linkcheckerSetting->get('check.useragent'),
    ];
    $fragment = parse_url($link->getUrl(), PHP_URL_FRAGMENT);

    // URL contains a fragment.
    if (in_array($link->getRequestMethod(), ['HEAD', 'GET']) && !empty($fragment)) {
      // We need the full content and not only the HEAD.
      $link->setRequestMethod('GET');

      // Request text content only (like Firefox/Chrome).
      $headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
    }
    elseif ($link->getRequestMethod() == 'GET') {
      // Range: Only request the first 1024 bytes from remote server. This is
      // required to prevent timeouts on URLs that are large downloads.
      $headers['Range'] = 'bytes=0-1024';
    }

    // Add in the headers.
    $options = [
      'headers' => $headers,
      'max_redirects' => 0,
      'http_errors' => FALSE,
      'allow_redirects' => FALSE,
      'synchronous' => FALSE,
    ];

    return $this->httpClient
      ->requestAsync($link->getRequestMethod(), $link->getUrl(), $options)
      ->then(function (ResponseInterface $response) use ($link, $fragment) {
        // Carry the requested fragment along with the response so the status
        // handling can verify that the anchor exists in the content.
        if (!empty($fragment)) {
          $response = $response->withHeader('Fragment', $fragment);
        }
        $this->statusHandling($response, $link);
      }, function ($reason) use ($link) {
        // The reason is deliberately not type-hinted: connection failures are
        // not RequestException instances in every Guzzle version, and a
        // mismatching type hint would raise a TypeError instead of recording
        // the failure.
        if ($reason instanceof RequestException) {
          $this->exceptionHandling($reason, $link);
        }
        elseif ($reason instanceof \Exception) {
          $this->recordRequestFailure($reason, $link);
        }
      });
  }

  /**
   * Status code handling.
   *
   * Stores the outcome of the check on the link, logs it, propagates it to
   * links with the same URL hash and hands the link over to every status
   * handler plugin that declares the resulting status code.
   *
   * @param \Psr\Http\Message\ResponseInterface $response
   *   An object containing the HTTP request headers, response code, headers,
   *   data and redirect status.
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link.
   */
  protected function statusHandling(ResponseInterface $response, LinkCheckerLinkInterface $link) {
    $ignoreResponseCodes = preg_split('/(\\r\\n?|\\n)/', (string) $this->linkcheckerSetting->get('error.ignore_response_codes'));

    $error = $response->getReasonPhrase();
    if (!isset($error)) {
      $error = '';
    }

    $statusCode = $response->getStatusCode();
    if ($statusCode == 200 && $this->isFragmentMissing($response)) {
      // Override status code 200 with status code 404 so it can be handled with
      // default status code 404 logic and custom error text.
      $statusCode = 404;
      $error = 'URL fragment identifier not found in content';
    }

    $failCount = $link->getFailCount() + 1;
    $logContext = ['%link' => $link->getUrl()];

    switch ($statusCode) {
      case 301:
        $this->saveCheckResult($link, $statusCode, $error, $failCount);
        linkchecker_watchdog_log('linkchecker', 'Link %link has changed and needs to be updated.', $logContext, RfcLogLevel::NOTICE, $this->getReportLink());
        break;

      case 404:
        $this->saveCheckResult($link, $statusCode, $error, $failCount);
        linkchecker_watchdog_log('linkchecker', 'Broken link %link has been found.', $logContext, RfcLogLevel::NOTICE, $this->getReportLink());
        break;

      case 405:
        // - 405: Special error handling if method is not allowed. Switch link
        //   checking to GET method and try again.
        $link->setRequestMethod('GET');
        $this->saveCheckResult($link, $statusCode, $error, $failCount);
        linkchecker_watchdog_log('linkchecker', 'Method HEAD is not allowed for link %link. Method has been changed to GET.', $logContext, RfcLogLevel::NOTICE, $this->getReportLink());
        break;

      case 500:
        // - 500: Like WGET, try with GET on "500 Internal server error".
        // - If GET also fails with status code 500, than the link is broken.
        if ($link->getRequestMethod() == 'GET') {
          $this->saveCheckResult($link, $statusCode, $error, $failCount);
          linkchecker_watchdog_log('linkchecker', 'Broken link %link has been found.', $logContext, RfcLogLevel::NOTICE, $this->getReportLink());
        }
        else {
          $link->setRequestMethod('GET');
          $this->saveCheckResult($link, $statusCode, $error, $failCount);
          linkchecker_watchdog_log('linkchecker', 'Internal server error for link %link. Method has been changed to GET.', $logContext, RfcLogLevel::NOTICE, $this->getReportLink());
        }
        break;

      default:
        // Don't treat ignored response codes as errors.
        if (in_array($statusCode, $ignoreResponseCodes)) {
          $this->saveCheckResult($link, $statusCode, $error, 0);
        }
        else {
          $this->saveCheckResult($link, $statusCode, $error, $failCount);
          linkchecker_watchdog_log('linkchecker', 'Unhandled link error %link has been found.', $logContext, RfcLogLevel::ERROR, $this->getReportLink());
        }
    }

    $this->updateSameLinks($link);

    foreach ($this->statusHandlerManager->getDefinitions() as $definition) {
      if (in_array($statusCode, $definition['status_codes'])) {
        /** @var \Drupal\linkchecker\Plugin\LinkStatusHandlerInterface $handler */
        $handler = $this->statusHandlerManager->createInstance($definition['id']);
        $handler->queueItems($link, $response);
      }
    }
  }

  /**
   * Checks whether the requested fragment is absent from the response body.
   *
   * Destination anchors in HTML documents may be specified either by:
   * - the A element (naming it with the name attribute)
   * - or by any other element (naming with the id attribute)
   * - and must not contain a key/value pair as these type of hash fragments
   *   are typically used by AJAX applications to prevent additionally HTTP
   *   requests e.g. https://www.example.com/ajax.html#key1=value1&key2=value2
   * - and must not contain '/' or ',' as this are not normal anchors.
   * - and '#top' is a reserved fragment that must not exist in a page.
   * See https://www.w3.org/TR/html401/struct/links.html
   *
   * @param \Psr\Http\Message\ResponseInterface $response
   *   The response; the requested fragment (without the leading '#') is
   *   expected in its 'Fragment' header.
   *
   * @return bool
   *   TRUE if the response is an HTML/XML document with a non-empty body that
   *   contains no element whose name or id attribute equals the fragment.
   *   FALSE if the anchor was found or the check does not apply.
   */
  private function isFragmentMissing(ResponseInterface $response) {
    if (!$response->hasHeader('Fragment') || !$response->hasHeader('Content-Type')) {
      return FALSE;
    }

    $fragment = $response->getHeaderLine('Fragment');

    // Key/value, path-like and list-like fragments are no anchors, and the
    // reserved "top" fragment does not need a matching element.
    if (preg_match('/=|\\/|,/', $fragment) || strcasecmp($fragment, 'top') === 0) {
      return FALSE;
    }

    // Compare the bare media type only; the header usually carries parameters
    // such as "; charset=utf-8".
    $contentType = explode(';', $response->getHeaderLine('Content-Type'), 2);
    $mediaType = strtolower(trim($contentType[0]));
    if (!in_array($mediaType, ['text/html', 'application/xhtml+xml', 'application/xml'], TRUE)) {
      return FALSE;
    }

    // The body is a stream object; cast it so that an empty body is detected
    // and the pattern is matched against the actual content.
    $body = (string) $response->getBody();
    if ($body === '') {
      return FALSE;
    }

    $pattern = '/(\\s[^>]*(name|id)(\\s+)?=(\\s+)?["\'])(' . preg_quote(urldecode($fragment), '/') . ')(["\'][^>]*>)/i';
    return !preg_match($pattern, $body);
  }

  /**
   * Stores the result of a check on the link and saves the link.
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link to update.
   * @param int|string $statusCode
   *   The status code to store.
   * @param string $errorMessage
   *   The error message to store.
   * @param int $failCount
   *   The new fail count.
   */
  private function saveCheckResult(LinkCheckerLinkInterface $link, $statusCode, $errorMessage, $failCount) {
    $link->setStatusCode($statusCode);
    $link->setErrorMessage($errorMessage);
    $link->setFailCount($failCount);
    $link->setLastCheckTime($this->time->getCurrentTime());
    $link->save();
  }

  /**
   * Exception handling.
   *
   * @param \GuzzleHttp\Exception\RequestException $e
   *   An object containing the Exception.
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link.
   */
  protected function exceptionHandling(RequestException $e, LinkCheckerLinkInterface $link) {
    $this->recordRequestFailure($e, $link);
  }

  /**
   * Records a failed request on the link.
   *
   * Stores status code '502' and the exception message on the link, raises
   * its fail count, logs the error and updates links with the same URL hash.
   *
   * @param \Exception $e
   *   The exception the request was rejected with.
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link.
   */
  private function recordRequestFailure(\Exception $e, LinkCheckerLinkInterface $link) {
    $this->saveCheckResult($link, '502', $e->getMessage(), $link->getFailCount() + 1);
    linkchecker_watchdog_log('linkchecker', 'Unhandled link error %link has been found: : %message.', [
      '%link' => $link->getUrl(),
      '%message' => $e->getMessage(),
    ], RfcLogLevel::ERROR, $this->getReportLink());
    $this->updateSameLinks($link);
  }

  /**
   * Helper function to create report link.
   *
   * @return \Drupal\Core\Link
   *   The link to the broken links report; created on first use and reused
   *   afterwards.
   */
  protected function getReportLink() {
    if (!isset($this->reportLink)) {
      $this->reportLink = Link::fromTextAndUrl($this->t('Broken links'), Url::fromUserInput('/admin/reports/linkchecker'));
    }
    return $this->reportLink;
  }

  /**
   * Helper function to update same links that were found in other entities.
   *
   * Copies request method, status code, error message, fail count and last
   * check time to every other link entity with the same URL hash.
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link that has just been checked.
   */
  protected function updateSameLinks(LinkCheckerLinkInterface $link) {
    $storage = $this->entityTypeManager->getStorage($link->getEntityTypeId());

    $ids = $storage->getQuery()
      ->condition('urlhash', $link->getHash())
      ->condition('lid', $link->id(), '!=')
      ->execute();
    if (empty($ids)) {
      return;
    }

    // loadMultiple() only returns entities that still exist, so an ID that
    // vanished between the query and the load is skipped.
    /** @var \Drupal\linkchecker\LinkCheckerLinkInterface $linkToUpdate */
    foreach ($storage->loadMultiple($ids) as $linkToUpdate) {
      $linkToUpdate->setRequestMethod($link->getRequestMethod());
      $linkToUpdate->setStatusCode($link->getStatusCode());
      $linkToUpdate->setErrorMessage($link->getErrorMessage());
      $linkToUpdate->setFailCount($link->getFailCount());
      $linkToUpdate->setLastCheckTime($link->getLastCheckTime());
      $linkToUpdate->save();
    }
  }

}

Members

Namesort descending Modifiers Type Description Overrides
LinkCheckerService::$entityTypeManager protected property The entity type manager.
LinkCheckerService::$httpClient protected property The http client.
LinkCheckerService::$linkcheckerSetting protected property The Linkchecker settings.
LinkCheckerService::$queue protected property The queue.
LinkCheckerService::$reportLink protected property The report link.
LinkCheckerService::$statusHandlerManager protected property The status handler manager.
LinkCheckerService::$time protected property The time.
LinkCheckerService::check public function Check the link.
LinkCheckerService::exceptionHandling protected function Exception handling.
LinkCheckerService::getReportLink protected function Helper function to create report link.
LinkCheckerService::queueLinks public function Queue all links for checking.
LinkCheckerService::statusHandling protected function Status code handling.
LinkCheckerService::updateSameLinks protected function Helper function to update same links that were found in other entities.
LinkCheckerService::__construct public function Constructs a new LinkCheckerService object.
StringTranslationTrait::$stringTranslation protected property The string translation service. 1
StringTranslationTrait::formatPlural protected function Formats a string containing a count of items.
StringTranslationTrait::getNumberOfPlurals protected function Returns the number of plurals supported by a given language.
StringTranslationTrait::getStringTranslation protected function Gets the string translation service.
StringTranslationTrait::setStringTranslation public function Sets the string translation service to use. 2
StringTranslationTrait::t protected function Translates a string to the current language or to a given language.