You are here

function pathauto_cleanstring in Pathauto 6

Same name and namespace in other branches
  1. 5.2 pathauto.inc \pathauto_cleanstring()
  2. 5 pathauto.module \pathauto_cleanstring()
  3. 6.2 pathauto.inc \pathauto_cleanstring()
  4. 7 pathauto.inc \pathauto_cleanstring()

Clean up a string segment to be used in a URL alias.

Performs the following possible alterations:

  • Remove all HTML tags.
  • Process the string through the transliteration module.
  • Replace punctuation with the separator character, or remove it, depending on the per-character settings.
  • Remove back-slashes.
  • Optionally replace runs of characters other than ASCII letters, digits, and slashes with the separator.
  • Remove common words.
  • Replace whitespace with the separator character.
  • Trim duplicate, leading, and trailing separators.
  • Convert to lower-case.
  • Shorten to the configured maximum component length (never longer than the alias column allows).

This function should *not* be called on URL alias or path strings because it is assumed that they are already clean.

Parameters

$string: A string to clean.

Return value

The cleaned string.

2 calls to pathauto_cleanstring()
PathautoUnitTestCase::testCleanString in ./pathauto.test
Test pathauto_cleanstring().
pathauto_clean_token_values in ./pathauto.inc
Clean tokens so they are URL friendly.
1 string reference to 'pathauto_cleanstring'
pathauto_clean_token_values in ./pathauto.inc
Clean tokens so they are URL friendly.

File

./pathauto.inc, line 81
Miscellaneous functions for Pathauto.

Code

/**
 * Clean up a string segment to be used in a URL alias.
 *
 * The steps, in order: strip HTML tags, optionally transliterate, replace or
 * drop punctuation, optionally reduce to ASCII letters/digits/slashes, remove
 * ignored words, replace whitespace with the separator, tidy separators,
 * optionally lower-case, and truncate to the maximum component length.
 *
 * Results are cached per input string for the duration of the request.
 *
 * @param $string
 *   A string to clean.
 *
 * @return
 *   The cleaned string, or an empty string when $string is '' or NULL.
 */
function pathauto_cleanstring($string) {
  static $strings = array();

  // Empty strings do not need any processing.
  if ($string === '' || $string === NULL) {
    return '';
  }

  // Check if the string has already been processed, and if so return the
  // cached result.
  if (isset($strings[$string])) {
    return $strings[$string];
  }

  // Remove all HTML tags from the string.
  $output = strip_tags(decode_entities($string));

  // Optionally remove accents and transliterate
  if (variable_get('pathauto_transliterate', FALSE)) {
    static $translations;
    if (!isset($translations)) {
      // FALSE marks "looked for the file but found nothing" so the lookup is
      // only attempted once per request.
      $translations = FALSE;
      if ($file = _pathauto_get_i18n_file()) {
        $translations = parse_ini_file($file);
      }
    }
    if (!empty($translations)) {
      $output = strtr($output, $translations);
    }
  }

  // Replace or drop punctuation based on user settings
  $separator = variable_get('pathauto_separator', '-');
  $punctuation = pathauto_punctuation_chars();
  foreach ($punctuation as $name => $details) {
    $action = variable_get('pathauto_punctuation_' . $name, 0);

    // 2 is the action for "do nothing" with the punctuation
    if ($action != 2) {

      // Slightly tricky inline if which either replaces with the separator or nothing
      $output = str_replace($details['value'], $action ? $separator : '', $output);
    }
  }

  // Reduce strings to letters and numbers
  if (variable_get('pathauto_reduce_ascii', FALSE)) {
    $pattern = '/[^a-zA-Z0-9\\/]+/';
    $output = preg_replace($pattern, $separator, $output);
  }

  // Calculate and statically cache the ignored words regex expression.
  static $ignore_words_regex;
  if (!isset($ignore_words_regex)) {
    $ignore_words = array(
      'a',
      'an',
      'as',
      'at',
      'before',
      'but',
      'by',
      'for',
      'from',
      'is',
      'in',
      'into',
      'like',
      'of',
      'off',
      'on',
      'onto',
      'per',
      'since',
      'than',
      'the',
      'this',
      'that',
      'to',
      'up',
      'via',
      'with',
    );
    $ignore_words = variable_get('pathauto_ignore_words', $ignore_words);

    // The regex construction below works on a comma/whitespace separated
    // string. When the array default above is returned, preg_replace() would
    // hand back an array and the concatenation further down would produce the
    // literal pattern "\bArray\b". Flatten arrays to the string form first.
    if (is_array($ignore_words)) {
      $ignore_words = implode(',', $ignore_words);
    }

    // Trim leading/trailing delimiters, then turn every inner delimiter run
    // into a word-boundary alternation. The cast keeps a NULL (regex error)
    // from defeating the isset() cache check above.
    $ignore_words_regex = (string) preg_replace(array(
      '/^[,\\s]+|[,\\s]+$/',
      '/[,\\s]+/',
    ), array(
      '',
      '\\b|\\b',
    ), $ignore_words);
    if ($ignore_words_regex) {
      $ignore_words_regex = '\\b' . $ignore_words_regex . '\\b';
    }
  }

  // Get rid of words that are on the ignore list
  if ($ignore_words_regex) {
    if (function_exists('mb_eregi_replace')) {
      $words_removed = mb_eregi_replace($ignore_words_regex, '', $output);
    }
    else {
      $words_removed = preg_replace("/{$ignore_words_regex}/i", '', $output);
    }
    // Keep the original text if removing the ignored words would leave
    // nothing behind.
    if (drupal_strlen(trim($words_removed)) > 0) {
      $output = $words_removed;
    }
  }

  // Always replace whitespace with the separator.
  $output = preg_replace('/\\s+/', $separator, $output);

  // Trim duplicates and remove trailing and leading separators.
  $output = _pathauto_clean_separators($output);

  // Optionally convert to lower case.
  if (variable_get('pathauto_case', 1)) {
    $output = drupal_strtolower($output);
  }

  // Enforce the maximum component length.
  $maxlength = min(variable_get('pathauto_max_component_length', 100), _pathauto_get_schema_alias_maxlength());
  $output = drupal_substr($output, 0, $maxlength);

  // Cache this result in the static array.
  $strings[$string] = $output;
  return $output;
}