You are here

Tokenizer.php in Search API 8

File

src/Plugin/search_api/processor/Tokenizer.php
View source
<?php

namespace Drupal\search_api\Plugin\search_api\processor;

use Drupal\Component\Utility\Unicode;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Url;
use Drupal\search_api\Item\FieldInterface;
use Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface;
use Drupal\search_api\Processor\FieldsProcessorPluginBase;
use Drupal\search_api\Utility\Utility;

/**
 * Splits text into individual words for searching.
 *
 * @SearchApiProcessor(
 *   id = "tokenizer",
 *   label = @Translation("Tokenizer"),
 *   description = @Translation("Splits text into individual words for searching."),
 *   stages = {
 *     "pre_index_save" = 0,
 *     "preprocess_index" = -6,
 *     "preprocess_query" = -6
 *   }
 * )
 */
class Tokenizer extends FieldsProcessorPluginBase {

  /**
   * PCRE character class contents identifying ignored characters.
   *
   * @var string
   */
  protected $ignored;

  /**
   * PCRE character class contents identifying spaces.
   *
   * @var string
   */
  protected $spaces;

  /**
   * {@inheritdoc}
   */
  public function defaultConfiguration() {
    $configuration = parent::defaultConfiguration();
    $configuration += [
      'ignored' => '._-',
      'spaces' => '',
      'overlap_cjk' => TRUE,
      'minimum_word_size' => 3,
    ];
    return $configuration;
  }

  /**
   * {@inheritdoc}
   */
  public function setConfiguration(array $configuration) {
    parent::setConfiguration($configuration);
    unset($this->spaces);
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    $form = parent::buildConfigurationForm($form, $form_state);
    $args = [
      ':pcre-url' => Url::fromUri('https://php.net/manual/regexp.reference.character-classes.php')
        ->toString(),
      ':doc-url' => Url::fromUri('https://api.drupal.org/api/drupal/core!lib!Drupal!Component!Utility!Unicode.php/constant/Unicode%3A%3APREG_CLASS_WORD_BOUNDARY/8')
        ->toString(),
    ];
    $form['ignored'] = [
      '#type' => 'textfield',
      '#title' => $this
        ->t('Ignored characters'),
      '#description' => $this
        ->t('Specify the characters that should be removed prior to processing. Dots, dashes, and underscores are ignored by default to allow meaningful search behavior with acronyms and URLs. Specify the characters as the inside of a <a href=":pcre-url">PCRE character class</a>.', $args),
      '#default_value' => $this->configuration['ignored'],
    ];
    $form['spaces'] = [
      '#type' => 'textfield',
      '#title' => $this
        ->t('Whitespace characters'),
      '#description' => $this
        ->t('Specify the characters that should be regarded as whitespace and therefore used as word-delimiters. Specify the characters as the inside of a <a href=":pcre-url">PCRE character class</a>. Leave empty to use a <a href=":doc-url">default</a> which should be suitable for most languages with a Latin alphabet.', $args),
      '#default_value' => $this->configuration['spaces'],
    ];
    $form['overlap_cjk'] = [
      '#type' => 'checkbox',
      '#title' => $this
        ->t('Simple CJK handling'),
      '#default_value' => $this->configuration['overlap_cjk'],
      '#description' => $this
        ->t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Does not affect other languages.'),
    ];
    $form['minimum_word_size'] = [
      '#type' => 'number',
      '#title' => $this
        ->t('Minimum word length to index'),
      '#default_value' => $this->configuration['minimum_word_size'],
      '#min' => 1,
      '#max' => 1000,
      '#description' => $this
        ->t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'),
    ];
    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    parent::validateConfigurationForm($form, $form_state);
    foreach ([
      'spaces',
      'ignored',
    ] as $field) {
      $field_value = $form_state
        ->getValue($field, '');
      $field_value = str_replace('/', '\\/', trim($field_value));
      if ($field_value !== '' && @preg_match('/[' . $field_value . ']+/u', '') === FALSE) {
        $form_state
          ->setError($form[$field], $form[$field]['#title'] . ': ' . $this
          ->t('The entered text is no valid PCRE character class.'));
      }
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function testType($type) {
    return $this
      ->getDataTypeHelper()
      ->isTextType($type);
  }

  /**
   * {@inheritdoc}
   */
  protected function processField(FieldInterface $field) {
    parent::processField($field);
    foreach ($field
      ->getValues() as $value) {
      if ($value instanceof TextValueInterface) {
        $value
          ->setProperty('tokenized');
      }
    }
  }

  /**
   * Matches all 'N' Unicode character classes (numbers).
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   */
  protected function getPregClassNumbers() {
    return '\\x{30}-\\x{39}\\x{b2}\\x{b3}\\x{b9}\\x{bc}-\\x{be}\\x{660}-\\x{669}\\x{6f0}-\\x{6f9}' . '\\x{966}-\\x{96f}\\x{9e6}-\\x{9ef}\\x{9f4}-\\x{9f9}\\x{a66}-\\x{a6f}\\x{ae6}-\\x{aef}' . '\\x{b66}-\\x{b6f}\\x{be7}-\\x{bf2}\\x{c66}-\\x{c6f}\\x{ce6}-\\x{cef}\\x{d66}-\\x{d6f}' . '\\x{e50}-\\x{e59}\\x{ed0}-\\x{ed9}\\x{f20}-\\x{f33}\\x{1040}-\\x{1049}\\x{1369}-' . '\\x{137c}\\x{16ee}-\\x{16f0}\\x{17e0}-\\x{17e9}\\x{17f0}-\\x{17f9}\\x{1810}-\\x{1819}' . '\\x{1946}-\\x{194f}\\x{2070}\\x{2074}-\\x{2079}\\x{2080}-\\x{2089}\\x{2153}-\\x{2183}' . '\\x{2460}-\\x{249b}\\x{24ea}-\\x{24ff}\\x{2776}-\\x{2793}\\x{3007}\\x{3021}-\\x{3029}' . '\\x{3038}-\\x{303a}\\x{3192}-\\x{3195}\\x{3220}-\\x{3229}\\x{3251}-\\x{325f}\\x{3280}-' . '\\x{3289}\\x{32b1}-\\x{32bf}\\x{ff10}-\\x{ff19}';
  }

  /**
   * Matches all 'P' Unicode character classes (punctuation).
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   */
  protected function getPregClassPunctuation() {
    return '\\x{21}-\\x{23}\\x{25}-\\x{2a}\\x{2c}-\\x{2f}\\x{3a}\\x{3b}\\x{3f}\\x{40}\\x{5b}-\\x{5d}' . '\\x{5f}\\x{7b}\\x{7d}\\x{a1}\\x{ab}\\x{b7}\\x{bb}\\x{bf}\\x{37e}\\x{387}\\x{55a}-\\x{55f}' . '\\x{589}\\x{58a}\\x{5be}\\x{5c0}\\x{5c3}\\x{5f3}\\x{5f4}\\x{60c}\\x{60d}\\x{61b}\\x{61f}' . '\\x{66a}-\\x{66d}\\x{6d4}\\x{700}-\\x{70d}\\x{964}\\x{965}\\x{970}\\x{df4}\\x{e4f}' . '\\x{e5a}\\x{e5b}\\x{f04}-\\x{f12}\\x{f3a}-\\x{f3d}\\x{f85}\\x{104a}-\\x{104f}\\x{10fb}' . '\\x{1361}-\\x{1368}\\x{166d}\\x{166e}\\x{169b}\\x{169c}\\x{16eb}-\\x{16ed}\\x{1735}' . '\\x{1736}\\x{17d4}-\\x{17d6}\\x{17d8}-\\x{17da}\\x{1800}-\\x{180a}\\x{1944}\\x{1945}' . '\\x{2010}-\\x{2027}\\x{2030}-\\x{2043}\\x{2045}-\\x{2051}\\x{2053}\\x{2054}\\x{2057}' . '\\x{207d}\\x{207e}\\x{208d}\\x{208e}\\x{2329}\\x{232a}\\x{23b4}-\\x{23b6}\\x{2768}-' . '\\x{2775}\\x{27e6}-\\x{27eb}\\x{2983}-\\x{2998}\\x{29d8}-\\x{29db}\\x{29fc}\\x{29fd}' . '\\x{3001}-\\x{3003}\\x{3008}-\\x{3011}\\x{3014}-\\x{301f}\\x{3030}\\x{303d}\\x{30a0}' . '\\x{30fb}\\x{fd3e}\\x{fd3f}\\x{fe30}-\\x{fe52}\\x{fe54}-\\x{fe61}\\x{fe63}\\x{fe68}' . '\\x{fe6a}\\x{fe6b}\\x{ff01}-\\x{ff03}\\x{ff05}-\\x{ff0a}\\x{ff0c}-\\x{ff0f}\\x{ff1a}' . '\\x{ff1b}\\x{ff1f}\\x{ff20}\\x{ff3b}-\\x{ff3d}\\x{ff3f}\\x{ff5b}\\x{ff5d}\\x{ff5f}-' . '\\x{ff65}';
  }

  /**
   * Matches CJK (Chinese, Japanese, Korean) letter-like characters.
   *
   * This list is derived from the "East Asian Scripts" section of
   * http://www.unicode.org/charts/index.html, as well as a comment on
   * http://unicode.org/reports/tr11/tr11-11.html listing some character
   * ranges that are reserved for additional CJK ideographs.
   *
   * The character ranges do not include numbers, punctuation, or symbols, since
   * these are handled separately in search. Note that radicals and strokes are
   * considered symbols. (See
   * http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt)
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   *
   * @see search_expand_cjk()
   */
  protected function getPregClassCjk() {
    return '\\x{1100}-\\x{11FF}\\x{3040}-\\x{309F}\\x{30A1}-\\x{318E}' . '\\x{31A0}-\\x{31B7}\\x{31F0}-\\x{31FF}\\x{3400}-\\x{4DBF}\\x{4E00}-\\x{9FCF}' . '\\x{A000}-\\x{A48F}\\x{A4D0}-\\x{A4FD}\\x{A960}-\\x{A97F}\\x{AC00}-\\x{D7FF}' . '\\x{F900}-\\x{FAFF}\\x{FF21}-\\x{FF3A}\\x{FF41}-\\x{FF5A}\\x{FF66}-\\x{FFDC}' . '\\x{20000}-\\x{2FFFD}\\x{30000}-\\x{3FFFD}';
  }

  /**
   * {@inheritdoc}
   */
  protected function processFieldValue(&$value, $type) {
    $this
      ->prepare();
    $text = $this
      ->simplifyText($value);

    // Split on spaces. The configured (or default) delimiters have been
    // replaced by those already in simplifyText().
    $arr = explode(' ', $text);
    $value = [];
    foreach ($arr as $token) {
      if (is_numeric($token) || mb_strlen($token) >= $this->configuration['minimum_word_size']) {
        $value[] = Utility::createTextToken($token);
      }
    }
  }

  /**
   * Simplifies a string according to indexing rules.
   *
   * @param string $text
   *   The text to simplify.
   *
   * @return string
   *   The text with tokens split by single spaces.
   *
   * @see search_simplify()
   */
  protected function simplifyText($text) {

    // Optionally apply simple CJK handling to the text.
    if ($this->configuration['overlap_cjk']) {
      $text = preg_replace_callback('/[' . $this
        ->getPregClassCjk() . ']+/u', [
        $this,
        'expandCjk',
      ], $text);
    }

    // To improve searching for numerical data such as dates, IP addresses or
    // version numbers, we consider a group of numerical characters separated
    // only by punctuation characters to be one piece. This also means, for
    // example, that searching for "20/03/1984" also returns results with
    // "20-03-1984" in them.
    // Readable regular expression: "([number]+)[punctuation]+(?=[number])".
    $text = preg_replace('/([' . $this
      ->getPregClassNumbers() . ']+)[' . $this
      ->getPregClassPunctuation() . ']+(?=[' . $this
      ->getPregClassNumbers() . '])/u', '\\1', $text);

    // A group of multiple ignored characters is still treated as whitespace.
    $text = preg_replace('/[' . $this->ignored . ']{2,}/u', ' ', $text);

    // Remove all other instances of ignored characters.
    $text = preg_replace('/[' . $this->ignored . ']+/u', '', $text);

    // Finally, convert all characters we want to treat as word boundaries to
    // plain spaces.
    $text = preg_replace('/[' . $this->spaces . ']+/u', ' ', $text);
    return trim($text);
  }

  /**
   * Splits CJK (Chinese, Japanese, Korean) text into tokens.
   *
   * Callback for preg_replace_callback() in simplifyText().
   *
   * Normally, searches should match exact words, where a word is defined to be
   * a sequence of characters delimited by spaces or punctuation. CJK languages
   * are written in long strings of characters, though, not split up into words.
   * So in order to allow search matching, we split up CJK text into tokens
   * consisting of consecutive, overlapping sequences of characters whose length
   * is equal to the "minimum_word_size" setting. This tokenizing is only done
   * if the "overlap_cjk" setting is enabled.
   *
   * @param array $matches
   *   A PCRE match array, containing the complete match as the only element.
   *
   * @return string
   *   Tokenized text, with tokens separated with space characters and starting
   *   and ending with a space.
   *
   * @see search_expand_cjk()
   */
  protected function expandCjk(array $matches) {
    $min = $this->configuration['minimum_word_size'];
    $str = $matches[0];
    $length = mb_strlen($str);

    // If the text is shorter than the minimum word size, don't tokenize it.
    if ($length <= $min) {
      return ' ' . $str . ' ';
    }
    $tokens = ' ';

    // Build a FIFO queue of characters.
    $chars = [];
    for ($i = 0; $i < $length; $i++) {

      // Add the next character off the beginning of the string to the queue.
      $current = mb_substr($str, 0, 1);
      $str = substr($str, strlen($current));
      $chars[] = $current;
      if ($i >= $min - 1) {

        // Make a token of $min characters, and add it to the token string.
        $tokens .= implode('', $chars) . ' ';

        // Shift out the first character in the queue.
        array_shift($chars);
      }
    }
    return $tokens;
  }

  /**
   * {@inheritdoc}
   */
  protected function process(&$value) {
    $this
      ->prepare();
    $value = trim($this
      ->simplifyText($value));
    $min = $this->configuration['minimum_word_size'];
    if ($min > 1) {
      $words = explode(' ', $value);
      foreach ($words as $i => $word) {
        if (mb_strlen($word) < $min) {
          unset($words[$i]);
        }
      }
      $value = implode(' ', $words);
    }
  }

  /**
   * Prepares the processor by setting the $spaces property.
   */
  protected function prepare() {
    if (!isset($this->spaces)) {
      if ($this->configuration['spaces'] !== '') {
        $this->spaces = str_replace('/', '\\/', $this->configuration['spaces']);
      }
      else {
        $this->spaces = Unicode::PREG_CLASS_WORD_BOUNDARY;
      }
    }
    if (!isset($this->ignored)) {
      if ($this->configuration['ignored'] !== '') {
        $this->ignored = str_replace('/', '\\/', $this->configuration['ignored']);
      }
      else {
        $this->ignored = '._-';
      }
    }
  }

}

Classes

Namesort descending Description
Tokenizer Splits text into individual words for searching.