You are here

public static function ConvertUrlToEmbedFilter::convertUrls in URL Embed 8

Replaces appearances of supported URLs with <drupal-url> embed elements.

The logic of this function is copied from _filter_url() and slightly adapted for our use case. Unfortunately, _filter_url() is not general enough to be reused directly.

Parameters

string $text: Text to be processed.

string $url_prefix: (Optional) Prefix that should be used to manually choose which URLs should be converted.

Return value

string Processed text.

1 call to ConvertUrlToEmbedFilter::convertUrls()
ConvertUrlToEmbedFilter::process in src/Plugin/Filter/ConvertUrlToEmbedFilter.php
Performs the filter processing.

File

src/Plugin/Filter/ConvertUrlToEmbedFilter.php, line 62

Class

ConvertUrlToEmbedFilter
Provides a filter to display embedded entities based on data attributes.

Namespace

Drupal\url_embed\Plugin\Filter

Code

/**
 * Replaces appearances of supported URLs with <drupal-url> embed elements.
 *
 * The logic is copied from _filter_url() and slightly adapted: instead of
 * turning URLs into links, every absolute URL found in plain text (outside of
 * tags, HTML comments and the elements listed in $ignore_tags) is passed to
 * the 'url_embed' service, and replaced by a <drupal-url> element when the
 * service returns a truthy value for it.
 *
 * @param string $text
 *   Text to be processed.
 * @param string $url_prefix
 *   (Optional) Prefix that should be used to manually choose which URLs
 *   should be converted. It is inserted verbatim into the regular expression
 *   in front of the URL pattern; the text it matches is consumed and is not
 *   part of the output.
 *
 * @return string
 *   Processed text. If any of the regular expression operations fails (for
 *   example because the text is not valid UTF-8), the text is returned
 *   unchanged rather than being truncated or replaced by NULL.
 */
public static function convertUrls($text, $url_prefix = '') {
  // Kept so the input can be handed back untouched if PCRE reports an error.
  $original_text = $text;

  // Tags to skip and not recurse into.
  $ignore_tags = 'a|script|style|code|pre';

  // Prepare protocols pattern for absolute URLs.
  // \Drupal\Component\Utility\UrlHelper::stripDangerousProtocols() will replace
  // any bad protocols with HTTP, so we need to support the identical list.
  // While '//' is technically optional for MAILTO only, we cannot cleanly
  // differ between protocols here without hard-coding MAILTO, so '//' is
  // optional for all protocols.
  // @see \Drupal\Component\Utility\UrlHelper::stripDangerousProtocols()
  $protocols = \Drupal::getContainer()
    ->getParameter('filter_protocols');
  $protocols = implode(':(?://)?|', $protocols) . ':(?://)?';
  $valid_url_path_characters = "[\\p{L}\\p{M}\\p{N}!\\*\\';:=\\+,\\.\$\\/%#\\[\\]\\-_~@&]";

  // Allow URL paths to contain balanced parens
  // 1. Used in Wikipedia URLs like /Primer_(film)
  // 2. Used in IIS sessions like /S(dfd346)/
  $valid_url_balanced_parens = '\\(' . $valid_url_path_characters . '+\\)';

  // Valid end-of-path characters (so /foo. does not gobble the period).
  // 1. Allow =&# for empty URL parameters and other URL-join artifacts
  $valid_url_ending_characters = '[\\p{L}\\p{M}\\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')';
  $valid_url_query_chars = '[a-zA-Z0-9!?\\*\'@\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~|]';
  $valid_url_query_ending_chars = '[a-zA-Z0-9_&=#\\/]';

  // Full path. Also allow @ in a URL, but only in the middle, to catch things
  // like http://example.com/@user/
  $valid_url_path = '(?:(?:' . $valid_url_path_characters . '*(?:' . $valid_url_balanced_parens . $valid_url_path_characters . '*)*' . $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\\/))';

  // Prepare domain name pattern.
  // The ICANN seems to be on track towards accepting more diverse top level
  // domains, so this pattern has been "future-proofed" to allow for TLDs
  // of length 2-64.
  $domain = '(?:[\\p{L}\\p{M}\\p{N}._+-]+\\.)?[\\p{L}\\p{M}]{2,64}\\b';
  $ip = '(?:[0-9]{1,3}\\.){3}[0-9]{1,3}';
  $auth = '[\\p{L}\\p{M}\\p{N}:%_+*~#?&=.,/;-]+@';
  $trail = '(' . $valid_url_path . '*)?(\\?' . $valid_url_query_chars . '*' . $valid_url_query_ending_chars . ')?';

  // Match absolute URLs. Capture group 1 is the URL itself, without the
  // optional prefix.
  $url_pattern = "(?:{$auth})?(?:{$domain}|{$ip})/?(?:{$trail})?";
  $pattern = "`{$url_prefix}((?:{$protocols})(?:{$url_pattern}))`u";

  // HTML comments need to be handled separately, as they may contain HTML
  // markup, especially a '>'. Therefore, remove all comment contents and add
  // them back later.
  _filter_url_escape_comments('', TRUE);
  $escaped_text = preg_replace_callback('`<!--(.*?)-->`s', '_filter_url_escape_comments', $text);
  if ($escaped_text === NULL) {
    // PCRE error: nothing has been converted yet, so leave the text alone.
    return $original_text;
  }

  // Split at all tags; ensures that no tags or attributes are processed.
  $chunks = preg_split('/(<.+?>)/is', $escaped_text, -1, PREG_SPLIT_DELIM_CAPTURE);
  if ($chunks === FALSE) {
    // Without a reliable text/tag split we cannot tell attribute values from
    // plain text, so do not attempt any conversion.
    return $original_text;
  }

  // With PREG_SPLIT_DELIM_CAPTURE the array alternates between literals and
  // delimiters and begins and ends with a literal, so even indexes are text
  // and odd indexes are tags.
  //
  // If a tag of $ignore_tags is found, it is stored in $open_tag and only
  // removed when the closing tag is found. Until the closing tag is found,
  // no replacements are made.
  $open_tag = '';
  foreach ($chunks as $i => $chunk) {
    if ($i % 2 === 0) {
      // Text chunk: only process it if there are no unclosed $ignore_tags.
      if ($open_tag !== '') {
        continue;
      }

      // Swap every embeddable URL for a <drupal-url> element; URLs that the
      // service does not recognise are left as plain text.
      $converted = preg_replace_callback($pattern, function ($match) {
        if (\Drupal::service('url_embed')
          ->getEmbed(Html::decodeEntities($match[1]))) {
          return '<drupal-url data-embed-url="' . $match[1] . '"></drupal-url>';
        }
        return $match[1];
      }, $chunk);

      // preg_replace_callback() returns NULL on a PCRE error (e.g. malformed
      // UTF-8 with the 'u' modifier). Keep the original chunk in that case
      // instead of silently dropping its text.
      if ($converted !== NULL) {
        $chunks[$i] = $converted;
      }
      continue;
    }

    // Tag chunk.
    if ($open_tag === '') {
      // Check whether this tag is contained in $ignore_tags.
      if (preg_match("`<({$ignore_tags})(?:\\s|>)`i", $chunk, $matches)) {
        $open_tag = $matches[1];
      }
    }
    elseif (preg_match("`<\\/{$open_tag}>`i", $chunk)) {
      // The ignored element has been closed; resume processing text.
      $open_tag = '';
    }
  }
  $text = implode('', $chunks);

  // Revert to the original comment contents.
  _filter_url_escape_comments('', FALSE);
  $restored_text = preg_replace_callback('`<!--(.*?)-->`', '_filter_url_escape_comments', $text);

  // Never hand NULL back to the caller; fall back to the untouched input.
  return $restored_text === NULL ? $original_text : $restored_text;
}