You are here

public function AliasCleaner::cleanString in Pathauto 8

Clean up a string segment to be used in a URL alias.

Performs the following possible alterations:

  • Remove all HTML tags.
  • Process the string through the transliteration module.
  • Replace or remove punctuation with the separator character.
  • Remove back-slashes.
  • Replace characters other than ASCII letters, digits, and forward slashes with the separator.
  • Remove common words.
  • Replace whitespace with the separator character.
  • Trim duplicate, leading, and trailing separators.
  • Convert to lower-case.
  • Shorten to a desired length and logical position based on word boundaries.

This function should *not* be called on URL alias or path strings because it is assumed that they are already clean.

Parameters

string $string: A string to clean.

array $options: (optional) A keyed array of settings and flags to control the Pathauto clean string replacement process. Supported options are:

  • langcode: A language code to be used when transliterating strings.

Return value

string The cleaned string.

Overrides AliasCleanerInterface::cleanString

1 call to AliasCleaner::cleanString()
AliasCleaner::cleanTokenValues in src/AliasCleaner.php
Clean tokens so they are URL friendly.

File

src/AliasCleaner.php, line 161

Class

AliasCleaner
Provides an alias cleaner.

Namespace

Drupal\pathauto

Code

/**
 * Cleans up a string segment so it can be used in a URL alias.
 *
 * On the first call the Pathauto settings needed for cleaning (separator,
 * punctuation handling, ignored words, maximum length, ...) are read from
 * configuration and stored in $this->cleanStringCache. Cleaned results are
 * cached in the same property, keyed by language code and input string.
 *
 * @param string $string
 *   A string to clean.
 * @param array $options
 *   (optional) A keyed array of settings and flags. Supported options are:
 *   - language: An object providing getId(); when not empty, its ID is used
 *     as the language code and takes precedence over 'langcode'.
 *   - langcode: A language code passed on to the transliteration service.
 *
 * @return string
 *   The cleaned string, or an empty string if $string is '' or NULL.
 */
public function cleanString($string, array $options = []) {
  if (empty($this->cleanStringCache)) {

    // Generate and cache variables used in this method.
    $config = $this->configFactory
      ->get('pathauto.settings');
    $this->cleanStringCache = [
      'separator' => $config
        ->get('separator'),
      'strings' => [],
      'transliterate' => $config
        ->get('transliterate'),
      'punctuation' => [],
      'reduce_ascii' => (bool) $config
        ->get('reduce_ascii'),
      'ignore_words_regex' => FALSE,
      'lowercase' => (bool) $config
        ->get('case'),
      'maxlength' => min($config
        ->get('max_component_length'), $this->aliasStorageHelper
        ->getAliasSchemaMaxLength()),
    ];

    // Generate and cache the punctuation replacements for strtr().
    $punctuation = $this
      ->getPunctuationCharacters();
    foreach ($punctuation as $name => $details) {
      $action = $config
        ->get('punctuation.' . $name);
      switch ($action) {
        case PathautoGeneratorInterface::PUNCTUATION_REMOVE:
          $this->cleanStringCache['punctuation'][$details['value']] = '';
          break;
        case PathautoGeneratorInterface::PUNCTUATION_REPLACE:
          $this->cleanStringCache['punctuation'][$details['value']] = $this->cleanStringCache['separator'];
          break;
        case PathautoGeneratorInterface::PUNCTUATION_DO_NOTHING:

          // Literally do nothing.
          break;
      }
    }

    // Generate and cache the ignored words regular expression.
    $ignore_words = $config
      ->get('ignore_words');

    // An unset setting comes back as NULL; hand preg_replace() an empty
    // string instead so no NULL subject is passed to it. The outcome is the
    // same: an empty (falsy) regex, i.e. no ignored words.
    if ($ignore_words === NULL) {
      $ignore_words = '';
    }

    // Strip leading/trailing separators from the list, then turn each run of
    // commas/whitespace between words into a word-boundary alternation.
    $ignore_words_regex = preg_replace([
      '/^[,\\s]+|[,\\s]+$/',
      '/[,\\s]+/',
    ], [
      '',
      '\\b|\\b',
    ], $ignore_words);
    if ($ignore_words_regex) {
      $this->cleanStringCache['ignore_words_regex'] = '\\b' . $ignore_words_regex . '\\b';
      if (function_exists('mb_eregi_replace')) {
        mb_regex_encoding('UTF-8');
        $this->cleanStringCache['ignore_words_callback'] = 'mb_eregi_replace';
      }
      else {
        // preg_replace() needs delimiters and an explicit case-insensitive
        // flag, which mb_eregi_replace() does not.
        $this->cleanStringCache['ignore_words_callback'] = 'preg_replace';
        $this->cleanStringCache['ignore_words_regex'] = '/' . $this->cleanStringCache['ignore_words_regex'] . '/i';
      }
    }
  }

  // Empty strings do not need any processing.
  if ($string === '' || $string === NULL) {
    return '';
  }
  $langcode = NULL;
  if (!empty($options['language'])) {
    $langcode = $options['language']
      ->getId();
  }
  elseif (!empty($options['langcode'])) {
    $langcode = $options['langcode'];
  }

  // Build the cache keys once. A NULL language code is mapped to '' up front
  // rather than relying on PHP's implicit NULL-to-'' array key coercion; the
  // resulting cache slot is the same.
  $langcode_key = $langcode === NULL ? '' : $langcode;
  $string_key = (string) $string;

  // Check if the string has already been processed, and if so return the
  // cached result.
  if (isset($this->cleanStringCache['strings'][$langcode_key][$string_key])) {
    return $this->cleanStringCache['strings'][$langcode_key][$string_key];
  }

  // Remove all HTML tags from the string.
  $output = Html::decodeEntities($string);
  $output = PlainTextOutput::renderFromHtml($output);

  // Replace or drop punctuation based on user settings.
  $output = strtr($output, $this->cleanStringCache['punctuation']);

  // Optionally transliterate.
  if ($this->cleanStringCache['transliterate']) {

    // If the reduce strings to letters and numbers is enabled, don't bother
    // replacing unknown characters with a question mark. Use an empty string
    // instead.
    $unknown_character = $this->cleanStringCache['reduce_ascii'] ? '' : '?';
    $output = $this->transliteration
      ->transliterate($output, $langcode, $unknown_character);

    // Replace or drop punctuation again as the transliteration process can
    // convert special characters to punctuation.
    $output = strtr($output, $this->cleanStringCache['punctuation']);
  }

  // Reduce strings to letters and numbers (forward slashes are kept).
  if ($this->cleanStringCache['reduce_ascii']) {
    $output = preg_replace('/[^a-zA-Z0-9\\/]+/', $this->cleanStringCache['separator'], $output);
  }

  // Get rid of words that are on the ignore list.
  if ($this->cleanStringCache['ignore_words_regex']) {
    $words_removed = $this->cleanStringCache['ignore_words_callback']($this->cleanStringCache['ignore_words_regex'], '', $output);

    // Only accept the result if something is left over. The is_string() check
    // keeps a failed replacement (non-string return) away from trim(); in
    // that case the unfiltered string is kept, exactly as before.
    if (is_string($words_removed) && mb_strlen(trim($words_removed)) > 0) {
      $output = $words_removed;
    }
  }

  // Always replace whitespace with the separator.
  $output = preg_replace('/\\s+/', $this->cleanStringCache['separator'], $output);

  // Trim duplicates and remove trailing and leading separators. The inner
  // call passes the cached separator explicitly; the outer call runs again
  // without a separator argument.
  $output = $this
    ->getCleanSeparators($this
    ->getCleanSeparators($output, $this->cleanStringCache['separator']));

  // Optionally convert to lower case.
  if ($this->cleanStringCache['lowercase']) {
    $output = mb_strtolower($output);
  }

  // Shorten to a logical place based on word boundaries.
  $output = Unicode::truncate($output, $this->cleanStringCache['maxlength'], TRUE);

  // Cache this result in the static array.
  $this->cleanStringCache['strings'][$langcode_key][$string_key] = $output;
  return $output;
}