You are here

class Tokenizer in Search API 8

Splits text into individual words for searching.

Plugin annotation


@SearchApiProcessor(
  id = "tokenizer",
  label = @Translation("Tokenizer"),
  description = @Translation("Splits text into individual words for searching."),
  stages = {
    "pre_index_save" = 0,
    "preprocess_index" = -6,
    "preprocess_query" = -6
  }
)

Hierarchy

Expanded class hierarchy of Tokenizer

1 file declares its use of Tokenizer
TokenizerTest.php in tests/src/Unit/Processor/TokenizerTest.php
1 string reference to 'Tokenizer'
ProcessorIntegrationTest::testLimitProcessors in tests/src/Functional/ProcessorIntegrationTest.php
Tests that processors discouraged by the backend are correctly hidden.

File

src/Plugin/search_api/processor/Tokenizer.php, line 27

Namespace

Drupal\search_api\Plugin\search_api\processor
View source
class Tokenizer extends FieldsProcessorPluginBase {

  /**
   * PCRE character class contents identifying ignored characters.
   *
   * Populated lazily by prepare() from the "ignored" configuration value, with
   * forward slashes escaped. When that value is an empty string, prepare()
   * falls back to "._-".
   *
   * @var string
   */
  protected $ignored;

  /**
   * PCRE character class contents identifying spaces.
   *
   * Populated lazily by prepare() from the "spaces" configuration value, with
   * forward slashes escaped. When that value is an empty string, prepare()
   * falls back to Unicode::PREG_CLASS_WORD_BOUNDARY.
   *
   * @var string
   */
  protected $spaces;

  /**
   * {@inheritdoc}
   */
  public function defaultConfiguration() {
    // Tokenizer-specific defaults. Array union keeps any key the parent
    // already defines, so these only fill in missing entries.
    $tokenizer_defaults = [
      'ignored' => '._-',
      'spaces' => '',
      'overlap_cjk' => TRUE,
      'minimum_word_size' => 3,
    ];
    return parent::defaultConfiguration() + $tokenizer_defaults;
  }

  /**
   * {@inheritdoc}
   */
  public function setConfiguration(array $configuration) {
    parent::setConfiguration($configuration);
    // prepare() derives both character classes from the configuration and
    // only rebuilds a property that is not set. Both caches therefore have to
    // be dropped here; otherwise a changed "ignored" (or "spaces") setting
    // would be silently disregarded on an instance that was already prepared.
    unset($this->spaces, $this->ignored);
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    $form = parent::buildConfigurationForm($form, $form_state);

    // Link targets substituted into the field descriptions below.
    $pcre_url = Url::fromUri('https://php.net/manual/regexp.reference.character-classes.php');
    $doc_url = Url::fromUri('https://api.drupal.org/api/drupal/core!lib!Drupal!Component!Utility!Unicode.php/constant/Unicode%3A%3APREG_CLASS_WORD_BOUNDARY/8');
    $args = [
      ':pcre-url' => $pcre_url->toString(),
      ':doc-url' => $doc_url->toString(),
    ];

    // Characters stripped from the text before tokenizing.
    $form['ignored'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Ignored characters'),
      '#description' => $this->t('Specify the characters that should be removed prior to processing. Dots, dashes, and underscores are ignored by default to allow meaningful search behavior with acronyms and URLs. Specify the characters as the inside of a <a href=":pcre-url">PCRE character class</a>.', $args),
      '#default_value' => $this->configuration['ignored'],
    ];

    // Characters acting as word delimiters.
    $form['spaces'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Whitespace characters'),
      '#description' => $this->t('Specify the characters that should be regarded as whitespace and therefore used as word-delimiters. Specify the characters as the inside of a <a href=":pcre-url">PCRE character class</a>. Leave empty to use a <a href=":doc-url">default</a> which should be suitable for most languages with a Latin alphabet.', $args),
      '#default_value' => $this->configuration['spaces'],
    ];

    // Toggle for the overlapping-sequence CJK tokenizer.
    $form['overlap_cjk'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Simple CJK handling'),
      '#default_value' => $this->configuration['overlap_cjk'],
      '#description' => $this->t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Does not affect other languages.'),
    ];

    // Shortest word length that is kept.
    $form['minimum_word_size'] = [
      '#type' => 'number',
      '#title' => $this->t('Minimum word length to index'),
      '#default_value' => $this->configuration['minimum_word_size'],
      '#min' => 1,
      '#max' => 1000,
      '#description' => $this->t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'),
    ];

    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    parent::validateConfigurationForm($form, $form_state);

    $class_fields = ['spaces', 'ignored'];
    foreach ($class_fields as $name) {
      // Escape the pattern delimiter the same way prepare() does.
      $class = str_replace('/', '\\/', trim($form_state->getValue($name, '')));

      // An empty value selects the built-in default, so there is nothing to
      // check.
      if ($class === '') {
        continue;
      }

      // Compile a throw-away pattern around the class. preg_match() returns
      // FALSE when the pattern is invalid; the warning it would raise is
      // suppressed because the failure is reported as a form error instead.
      $pattern = '/[' . $class . ']+/u';
      if (@preg_match($pattern, '') === FALSE) {
        $message = $form[$name]['#title'] . ': ' . $this->t('The entered text is no valid PCRE character class.');
        $form_state->setError($form[$name], $message);
      }
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function testType($type) {
    // Delegate to the data type helper's text-type check.
    $data_type_helper = $this->getDataTypeHelper();
    return $data_type_helper->isTextType($type);
  }

  /**
   * {@inheritdoc}
   */
  protected function processField(FieldInterface $field) {
    parent::processField($field);

    // After the parent has processed the field, flag every text value as
    // "tokenized". Values of any other kind are left alone.
    $field_values = $field->getValues();
    foreach ($field_values as $field_value) {
      if (!$field_value instanceof TextValueInterface) {
        continue;
      }
      $field_value->setProperty('tokenized');
    }
  }

  /**
   * Returns the character ranges this tokenizer treats as numbers.
   *
   * The ranges are written as "\x{...}" escapes and are meant to be placed
   * inside a PCRE character class ("[...]") compiled with the "u" modifier.
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   */
  protected function getPregClassNumbers() {
    // A fragment ending in "-" is the first half of a range that is completed
    // by the start of the next fragment.
    $fragments = [
      '\\x{30}-\\x{39}\\x{b2}\\x{b3}\\x{b9}\\x{bc}-\\x{be}\\x{660}-\\x{669}\\x{6f0}-\\x{6f9}',
      '\\x{966}-\\x{96f}\\x{9e6}-\\x{9ef}\\x{9f4}-\\x{9f9}\\x{a66}-\\x{a6f}\\x{ae6}-\\x{aef}',
      '\\x{b66}-\\x{b6f}\\x{be7}-\\x{bf2}\\x{c66}-\\x{c6f}\\x{ce6}-\\x{cef}\\x{d66}-\\x{d6f}',
      '\\x{e50}-\\x{e59}\\x{ed0}-\\x{ed9}\\x{f20}-\\x{f33}\\x{1040}-\\x{1049}\\x{1369}-',
      '\\x{137c}\\x{16ee}-\\x{16f0}\\x{17e0}-\\x{17e9}\\x{17f0}-\\x{17f9}\\x{1810}-\\x{1819}',
      '\\x{1946}-\\x{194f}\\x{2070}\\x{2074}-\\x{2079}\\x{2080}-\\x{2089}\\x{2153}-\\x{2183}',
      '\\x{2460}-\\x{249b}\\x{24ea}-\\x{24ff}\\x{2776}-\\x{2793}\\x{3007}\\x{3021}-\\x{3029}',
      '\\x{3038}-\\x{303a}\\x{3192}-\\x{3195}\\x{3220}-\\x{3229}\\x{3251}-\\x{325f}\\x{3280}-',
      '\\x{3289}\\x{32b1}-\\x{32bf}\\x{ff10}-\\x{ff19}',
    ];
    return implode('', $fragments);
  }

  /**
   * Returns the character ranges this tokenizer treats as punctuation.
   *
   * The ranges are written as "\x{...}" escapes and are meant to be placed
   * inside a PCRE character class ("[...]") compiled with the "u" modifier.
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   */
  protected function getPregClassPunctuation() {
    // A fragment ending in "-" is the first half of a range that is completed
    // by the start of the next fragment.
    $fragments = [
      '\\x{21}-\\x{23}\\x{25}-\\x{2a}\\x{2c}-\\x{2f}\\x{3a}\\x{3b}\\x{3f}\\x{40}\\x{5b}-\\x{5d}',
      '\\x{5f}\\x{7b}\\x{7d}\\x{a1}\\x{ab}\\x{b7}\\x{bb}\\x{bf}\\x{37e}\\x{387}\\x{55a}-\\x{55f}',
      '\\x{589}\\x{58a}\\x{5be}\\x{5c0}\\x{5c3}\\x{5f3}\\x{5f4}\\x{60c}\\x{60d}\\x{61b}\\x{61f}',
      '\\x{66a}-\\x{66d}\\x{6d4}\\x{700}-\\x{70d}\\x{964}\\x{965}\\x{970}\\x{df4}\\x{e4f}',
      '\\x{e5a}\\x{e5b}\\x{f04}-\\x{f12}\\x{f3a}-\\x{f3d}\\x{f85}\\x{104a}-\\x{104f}\\x{10fb}',
      '\\x{1361}-\\x{1368}\\x{166d}\\x{166e}\\x{169b}\\x{169c}\\x{16eb}-\\x{16ed}\\x{1735}',
      '\\x{1736}\\x{17d4}-\\x{17d6}\\x{17d8}-\\x{17da}\\x{1800}-\\x{180a}\\x{1944}\\x{1945}',
      '\\x{2010}-\\x{2027}\\x{2030}-\\x{2043}\\x{2045}-\\x{2051}\\x{2053}\\x{2054}\\x{2057}',
      '\\x{207d}\\x{207e}\\x{208d}\\x{208e}\\x{2329}\\x{232a}\\x{23b4}-\\x{23b6}\\x{2768}-',
      '\\x{2775}\\x{27e6}-\\x{27eb}\\x{2983}-\\x{2998}\\x{29d8}-\\x{29db}\\x{29fc}\\x{29fd}',
      '\\x{3001}-\\x{3003}\\x{3008}-\\x{3011}\\x{3014}-\\x{301f}\\x{3030}\\x{303d}\\x{30a0}',
      '\\x{30fb}\\x{fd3e}\\x{fd3f}\\x{fe30}-\\x{fe52}\\x{fe54}-\\x{fe61}\\x{fe63}\\x{fe68}',
      '\\x{fe6a}\\x{fe6b}\\x{ff01}-\\x{ff03}\\x{ff05}-\\x{ff0a}\\x{ff0c}-\\x{ff0f}\\x{ff1a}',
      '\\x{ff1b}\\x{ff1f}\\x{ff20}\\x{ff3b}-\\x{ff3d}\\x{ff3f}\\x{ff5b}\\x{ff5d}\\x{ff5f}-',
      '\\x{ff65}',
    ];
    return implode('', $fragments);
  }

  /**
   * Returns the character ranges treated as CJK letter-like characters.
   *
   * CJK stands for Chinese, Japanese and Korean. The list is derived from the
   * "East Asian Scripts" section of http://www.unicode.org/charts/index.html,
   * as well as a comment on http://unicode.org/reports/tr11/tr11-11.html
   * listing some character ranges that are reserved for additional CJK
   * ideographs.
   *
   * Numbers, punctuation and symbols are not part of these ranges, since they
   * are handled separately in search. Note that radicals and strokes are
   * considered symbols. (See
   * http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt)
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   *
   * @see search_expand_cjk()
   */
  protected function getPregClassCjk() {
    $fragments = [
      '\\x{1100}-\\x{11FF}\\x{3040}-\\x{309F}\\x{30A1}-\\x{318E}',
      '\\x{31A0}-\\x{31B7}\\x{31F0}-\\x{31FF}\\x{3400}-\\x{4DBF}\\x{4E00}-\\x{9FCF}',
      '\\x{A000}-\\x{A48F}\\x{A4D0}-\\x{A4FD}\\x{A960}-\\x{A97F}\\x{AC00}-\\x{D7FF}',
      '\\x{F900}-\\x{FAFF}\\x{FF21}-\\x{FF3A}\\x{FF41}-\\x{FF5A}\\x{FF66}-\\x{FFDC}',
      '\\x{20000}-\\x{2FFFD}\\x{30000}-\\x{3FFFD}',
    ];
    return implode('', $fragments);
  }

  /**
   * {@inheritdoc}
   */
  protected function processFieldValue(&$value, $type) {
    $this->prepare();

    // simplifyText() has already turned every configured (or default)
    // delimiter into a single space, so a plain split on spaces is enough.
    $words = explode(' ', $this->simplifyText($value));

    $value = [];
    foreach ($words as $word) {
      // Drop words below the minimum size, except numeric ones, which are
      // kept regardless of their length.
      if (!is_numeric($word) && mb_strlen($word) < $this->configuration['minimum_word_size']) {
        continue;
      }
      $value[] = Utility::createTextToken($word);
    }
  }

  /**
   * Simplifies a string according to indexing rules.
   *
   * Reads the $ignored and $spaces properties, so prepare() has to be called
   * before this method.
   *
   * @param string $text
   *   The text to simplify.
   *
   * @return string
   *   The text with tokens split by single spaces.
   *
   * @see search_simplify()
   */
  protected function simplifyText($text) {
    // Step 1 (optional): break runs of CJK characters into overlapping
    // tokens.
    if ($this->configuration['overlap_cjk']) {
      $cjk_pattern = '/[' . $this->getPregClassCjk() . ']+/u';
      $text = preg_replace_callback($cjk_pattern, [$this, 'expandCjk'], $text);
    }

    // Step 2: glue together groups of numerical characters that are separated
    // only by punctuation, so dates, IP addresses or version numbers become
    // one piece. As a result, a search for "20/03/1984" also finds
    // "20-03-1984".
    // Readable regular expression: "([number]+)[punctuation]+(?=[number])".
    $numbers = $this->getPregClassNumbers();
    $punctuation = $this->getPregClassPunctuation();
    $numeric_pattern = '/([' . $numbers . ']+)[' . $punctuation . ']+(?=[' . $numbers . '])/u';
    $text = preg_replace($numeric_pattern, '\\1', $text);

    // Step 3: two or more ignored characters in a row still count as
    // whitespace.
    $ignored_run_pattern = '/[' . $this->ignored . ']{2,}/u';
    $text = preg_replace($ignored_run_pattern, ' ', $text);

    // Step 4: every remaining ignored character is removed outright.
    $ignored_pattern = '/[' . $this->ignored . ']+/u';
    $text = preg_replace($ignored_pattern, '', $text);

    // Step 5: collapse each run of word-boundary characters into one plain
    // space.
    $spaces_pattern = '/[' . $this->spaces . ']+/u';
    $text = preg_replace($spaces_pattern, ' ', $text);

    return trim($text);
  }

  /**
   * Splits CJK (Chinese, Japanese, Korean) text into tokens.
   *
   * Callback for preg_replace_callback() in simplifyText().
   *
   * Searches normally match exact words, a word being a sequence of characters
   * delimited by spaces or punctuation. CJK languages are written as long
   * strings of characters that are not split up into words. To allow search
   * matching anyway, a CJK run is turned into consecutive, overlapping tokens
   * whose length equals the "minimum_word_size" setting. simplifyText() only
   * invokes this callback when the "overlap_cjk" setting is enabled.
   *
   * @param array $matches
   *   A PCRE match array, containing the complete match as the only element.
   *
   * @return string
   *   Tokenized text, with tokens separated with space characters and starting
   *   and ending with a space.
   *
   * @see search_expand_cjk()
   */
  protected function expandCjk(array $matches) {
    $token_size = $this->configuration['minimum_word_size'];
    $remaining = $matches[0];
    $char_count = mb_strlen($remaining);

    // A run that is no longer than one token is passed through as a single
    // token.
    if ($char_count <= $token_size) {
      return ' ' . $remaining . ' ';
    }

    $output = ' ';

    // Sliding window over the characters, kept as a FIFO queue.
    $window = [];
    $position = 0;
    while ($position < $char_count) {
      // Take the first character and cut its bytes off the front of the
      // remaining string.
      $char = mb_substr($remaining, 0, 1);
      $remaining = substr($remaining, strlen($char));
      $window[] = $char;

      if ($position >= $token_size - 1) {
        // The window is full: emit it as a token, then drop its oldest
        // character to make room for the next one.
        $output .= implode('', $window) . ' ';
        array_shift($window);
      }
      $position++;
    }

    return $output;
  }

  /**
   * {@inheritdoc}
   */
  protected function process(&$value) {
    $this->prepare();
    $value = trim($this->simplifyText($value));

    // With a minimum size of 1 or less there is nothing to filter out.
    $min = $this->configuration['minimum_word_size'];
    if ($min <= 1) {
      return;
    }

    // Keep only the words reaching the minimum size. Unlike
    // processFieldValue(), numeric words get no exemption here.
    $kept = [];
    foreach (explode(' ', $value) as $word) {
      if (mb_strlen($word) >= $min) {
        $kept[] = $word;
      }
    }
    $value = implode(' ', $kept);
  }

  /**
   * Prepares the processor by setting the $spaces and $ignored properties.
   *
   * A property that is already set is left untouched. Forward slashes in
   * configured values are escaped because "/" is the delimiter of the patterns
   * these values get embedded in.
   */
  protected function prepare() {
    if (!isset($this->spaces)) {
      $configured_spaces = $this->configuration['spaces'];
      // An empty setting selects the default word boundary class.
      $this->spaces = $configured_spaces !== ''
        ? str_replace('/', '\\/', $configured_spaces)
        : Unicode::PREG_CLASS_WORD_BOUNDARY;
    }

    if (!isset($this->ignored)) {
      $configured_ignored = $this->configuration['ignored'];
      // An empty setting falls back to dots, underscores and dashes.
      $this->ignored = $configured_ignored !== ''
        ? str_replace('/', '\\/', $configured_ignored)
        : '._-';
    }
  }

}

Members

Namesort descending Modifiers Type Description Overrides
ConfigurablePluginBase::calculateDependencies public function Calculates dependencies for the configured plugin. Overrides DependentPluginInterface::calculateDependencies 6
ConfigurablePluginBase::calculatePluginDependencies Deprecated protected function Calculates and adds dependencies of a specific plugin instance.
ConfigurablePluginBase::getConfiguration public function Gets this plugin's configuration. Overrides ConfigurableInterface::getConfiguration
ConfigurablePluginBase::getDescription public function Returns the plugin's description. Overrides ConfigurablePluginInterface::getDescription
ConfigurablePluginBase::getPluginDependencies Deprecated protected function Calculates and returns dependencies of a specific plugin instance.
ConfigurablePluginBase::label public function Returns the label for use on the administration pages. Overrides ConfigurablePluginInterface::label
ConfigurablePluginBase::moduleHandler Deprecated protected function Wraps the module handler.
ConfigurablePluginBase::onDependencyRemoval public function Informs the plugin that some of its dependencies are being removed. Overrides ConfigurablePluginInterface::onDependencyRemoval 5
ConfigurablePluginBase::themeHandler Deprecated protected function Wraps the theme handler.
DependencySerializationTrait::$_entityStorages protected property An array of entity type IDs keyed by the property name of their storages.
DependencySerializationTrait::$_serviceIds protected property An array of service IDs keyed by property name used for serialization.
DependencySerializationTrait::__sleep public function 1
DependencySerializationTrait::__wakeup public function 2
DependencyTrait::$dependencies protected property The object's dependencies.
DependencyTrait::addDependencies protected function Adds multiple dependencies.
DependencyTrait::addDependency protected function Adds a dependency.
FieldsProcessorPluginBase::$dataTypeHelper protected property The data type helper. 1
FieldsProcessorPluginBase::$elementInfoManager protected property The element info manager.
FieldsProcessorPluginBase::create public static function Creates an instance of the plugin. Overrides ProcessorPluginBase::create 1
FieldsProcessorPluginBase::getDataTypeHelper public function Retrieves the data type helper. 1
FieldsProcessorPluginBase::getElementInfoManager public function Retrieves the element info manager.
FieldsProcessorPluginBase::preIndexSave public function Preprocesses the search index entity before it is saved. Overrides ProcessorPluginBase::preIndexSave
FieldsProcessorPluginBase::preprocessIndexItems public function Preprocesses search items for indexing. Overrides ProcessorPluginBase::preprocessIndexItems 1
FieldsProcessorPluginBase::preprocessSearchQuery public function Preprocesses a search query. Overrides ProcessorPluginBase::preprocessSearchQuery 2
FieldsProcessorPluginBase::preRenderFieldsCheckboxes public static function Preprocesses the "fields" checkboxes before rendering.
FieldsProcessorPluginBase::processConditions protected function Preprocesses the query conditions.
FieldsProcessorPluginBase::processConditionValue protected function Processes a single condition value. 1
FieldsProcessorPluginBase::processKey protected function Processes a single search keyword. 1
FieldsProcessorPluginBase::processKeys protected function Preprocesses the search keywords.
FieldsProcessorPluginBase::setDataTypeHelper public function Sets the data type helper. 1
FieldsProcessorPluginBase::setElementInfoManager public function Sets the element info manager.
FieldsProcessorPluginBase::shouldProcess protected function Determines whether a single value (not an array) should be processed. 1
FieldsProcessorPluginBase::testField protected function Tests whether a certain field should be processed. 1
FieldsProcessorPluginBase::trustedCallbacks public static function Lists the trusted callbacks provided by the implementing class. Overrides TrustedCallbackInterface::trustedCallbacks
IndexPluginBase::$index protected property The index this processor is configured for.
IndexPluginBase::getIndex public function Retrieves the index this plugin is configured for. Overrides IndexPluginInterface::getIndex
IndexPluginBase::setIndex public function Sets the index this plugin is configured for. Overrides IndexPluginInterface::setIndex
IndexPluginBase::__construct public function Constructs a \Drupal\Component\Plugin\PluginBase object. Overrides ConfigurablePluginBase::__construct 2
MessengerTrait::$messenger protected property The messenger. 29
MessengerTrait::messenger public function Gets the messenger. 29
MessengerTrait::setMessenger public function Sets the messenger.
PluginBase::$configuration protected property Configuration information passed into the plugin. 1
PluginBase::$pluginDefinition protected property The plugin implementation definition. 1
PluginBase::$pluginId protected property The plugin_id.
PluginBase::DERIVATIVE_SEPARATOR constant A string which is used to separate base plugin IDs from the derivative ID.
PluginBase::getBaseId public function Gets the base_plugin_id of the plugin instance. Overrides DerivativeInspectionInterface::getBaseId
PluginBase::getDerivativeId public function Gets the derivative_id of the plugin instance. Overrides DerivativeInspectionInterface::getDerivativeId
PluginBase::getPluginDefinition public function Gets the definition of the plugin implementation. Overrides PluginInspectionInterface::getPluginDefinition 3
PluginBase::getPluginId public function Gets the plugin_id of the plugin instance. Overrides PluginInspectionInterface::getPluginId
PluginBase::isConfigurable public function Determines if the plugin is configurable.
PluginDependencyTrait::calculatePluginDependencies protected function Calculates and adds dependencies of a specific plugin instance. Aliased as: traitCalculatePluginDependencies 1
PluginDependencyTrait::getPluginDependencies protected function Calculates and returns dependencies of a specific plugin instance. Aliased as: traitGetPluginDependencies
PluginDependencyTrait::moduleHandler protected function Wraps the module handler. Aliased as: traitModuleHandler 1
PluginDependencyTrait::themeHandler protected function Wraps the theme handler. Aliased as: traitThemeHandler 1
PluginFormTrait::submitConfigurationForm public function Form submission handler. 7
ProcessorInterface::STAGE_ADD_PROPERTIES constant Processing stage: add properties.
ProcessorInterface::STAGE_ALTER_ITEMS constant Processing stage: alter indexed items.
ProcessorInterface::STAGE_POSTPROCESS_QUERY constant Processing stage: postprocess query.
ProcessorInterface::STAGE_PREPROCESS_INDEX constant Processing stage: preprocess index.
ProcessorInterface::STAGE_PREPROCESS_QUERY constant Processing stage: preprocess query.
ProcessorInterface::STAGE_PRE_INDEX_SAVE constant Processing stage: pre-index save.
ProcessorPluginBase::$fieldsHelper protected property The fields helper. 1
ProcessorPluginBase::addFieldValues public function Adds the values of properties defined by this processor to the item. Overrides ProcessorInterface::addFieldValues 8
ProcessorPluginBase::alterIndexedItems public function Alter the items to be indexed. Overrides ProcessorInterface::alterIndexedItems 3
ProcessorPluginBase::ensureField protected function Ensures that a field with certain properties is indexed on the index.
ProcessorPluginBase::findField protected function Finds a certain field in the index.
ProcessorPluginBase::getFieldsHelper public function Retrieves the fields helper. 1
ProcessorPluginBase::getPropertyDefinitions public function Retrieves the properties this processor defines for the given datasource. Overrides ProcessorInterface::getPropertyDefinitions 8
ProcessorPluginBase::getWeight public function Returns the weight for a specific processing stage. Overrides ProcessorInterface::getWeight
ProcessorPluginBase::isHidden public function Determines whether this plugin should be hidden in the UI. Overrides HideablePluginBase::isHidden
ProcessorPluginBase::isLocked public function Determines whether this processor should always be enabled. Overrides ProcessorInterface::isLocked
ProcessorPluginBase::postprocessSearchResults public function Postprocess search results before they are returned by the query. Overrides ProcessorInterface::postprocessSearchResults 2
ProcessorPluginBase::requiresReindexing public function Determines whether re-indexing is required after a settings change. Overrides ProcessorInterface::requiresReindexing
ProcessorPluginBase::setFieldsHelper public function Sets the fields helper. 1
ProcessorPluginBase::setWeight public function Sets the weight for a specific processing stage. Overrides ProcessorInterface::setWeight
ProcessorPluginBase::supportsIndex public static function Checks whether this processor is applicable for a certain index. Overrides ProcessorInterface::supportsIndex 8
ProcessorPluginBase::supportsStage public function Checks whether this processor implements a particular stage. Overrides ProcessorInterface::supportsStage 2
StringTranslationTrait::$stringTranslation protected property The string translation service. 1
StringTranslationTrait::formatPlural protected function Formats a string containing a count of items.
StringTranslationTrait::getNumberOfPlurals protected function Returns the number of plurals supported by a given language.
StringTranslationTrait::getStringTranslation protected function Gets the string translation service.
StringTranslationTrait::setStringTranslation public function Sets the string translation service to use. 2
StringTranslationTrait::t protected function Translates a string to the current language or to a given language.
Tokenizer::$ignored protected property PCRE character class contents identifying ignored characters.
Tokenizer::$spaces protected property PCRE character class contents identifying spaces.
Tokenizer::buildConfigurationForm public function Form constructor. Overrides FieldsProcessorPluginBase::buildConfigurationForm
Tokenizer::defaultConfiguration public function Gets default configuration for this plugin. Overrides FieldsProcessorPluginBase::defaultConfiguration
Tokenizer::expandCjk protected function Splits CJK (Chinese, Japanese, Korean) text into tokens.
Tokenizer::getPregClassCjk protected function Matches CJK (Chinese, Japanese, Korean) letter-like characters.
Tokenizer::getPregClassNumbers protected function Matches all 'N' Unicode character classes (numbers).
Tokenizer::getPregClassPunctuation protected function Matches all 'P' Unicode character classes (punctuation).
Tokenizer::prepare protected function Prepares the processor by setting the $spaces and $ignored properties.
Tokenizer::process protected function Processes a single string value. Overrides FieldsProcessorPluginBase::process
Tokenizer::processField protected function Processes a single field's value. Overrides FieldsProcessorPluginBase::processField
Tokenizer::processFieldValue protected function Processes a single text element in a field. Overrides FieldsProcessorPluginBase::processFieldValue
Tokenizer::setConfiguration public function Sets the configuration for this plugin instance. Overrides ConfigurablePluginBase::setConfiguration
Tokenizer::simplifyText protected function Simplifies a string according to indexing rules.
Tokenizer::testType protected function Determines whether a field of a certain type should be preprocessed. Overrides FieldsProcessorPluginBase::testType
Tokenizer::validateConfigurationForm public function Form validation handler. Overrides FieldsProcessorPluginBase::validateConfigurationForm
TrustedCallbackInterface::THROW_EXCEPTION constant Untrusted callbacks throw exceptions.
TrustedCallbackInterface::TRIGGER_SILENCED_DEPRECATION constant Untrusted callbacks trigger silenced E_USER_DEPRECATION errors.
TrustedCallbackInterface::TRIGGER_WARNING constant Untrusted callbacks trigger E_USER_WARNING errors.