You are here

class SearchTextProcessor in Drupal 9

Same name and namespace in other branches
  1. 10 core/modules/search/src/SearchTextProcessor.php \Drupal\search\SearchTextProcessor

Processes search text for indexing.

Hierarchy

Expanded class hierarchy of SearchTextProcessor

1 string reference to 'SearchTextProcessor'
search.services.yml in core/modules/search/search.services.yml
core/modules/search/search.services.yml
1 service uses SearchTextProcessor
search.text_processor in core/modules/search/search.services.yml
Drupal\search\SearchTextProcessor

File

core/modules/search/src/SearchTextProcessor.php, line 14

Namespace

Drupal\search
View source
class SearchTextProcessor implements SearchTextProcessorInterface {

  /**
   * The transliteration service.
   *
   * Used by ::analyze() to strip diacritics from the text.
   *
   * @var \Drupal\Component\Transliteration\TransliterationInterface
   */
  protected $transliteration;

  /**
   * The config factory.
   *
   * Used to read the 'index.overlap_cjk' and 'index.minimum_word_size' values
   * from the 'search.settings' configuration object.
   *
   * @var \Drupal\Core\Config\ConfigFactoryInterface
   */
  protected $configFactory;

  /**
   * The module handler.
   *
   * Used by ::invokePreprocess() to call hook_search_preprocess().
   *
   * @var \Drupal\Core\Extension\ModuleHandlerInterface
   */
  protected $moduleHandler;

  /**
   * SearchTextProcessor constructor.
   *
   * @param \Drupal\Component\Transliteration\TransliterationInterface $transliteration
   *   The transliteration service.
   * @param \Drupal\Core\Config\ConfigFactoryInterface $config_factory
   *   The config factory.
   * @param \Drupal\Core\Extension\ModuleHandlerInterface $module_handler
   *   The module handler.
   */
  public function __construct(TransliterationInterface $transliteration, ConfigFactoryInterface $config_factory, ModuleHandlerInterface $module_handler) {
    $this->transliteration = $transliteration;
    $this->configFactory = $config_factory;
    $this->moduleHandler = $module_handler;
  }

  /**
   * {@inheritdoc}
   */
  public function process(string $text, ?string $langcode = NULL) : array {
    // The analyzed text is split on single spaces only, so leading, trailing
    // or consecutive spaces in it yield empty-string entries in the result.
    $text = $this->analyze($text, $langcode);
    return explode(' ', $text);
  }

  /**
   * {@inheritdoc}
   */
  public function analyze(string $text, ?string $langcode = NULL) : string {

    // Decode entities to UTF-8.
    $text = Html::decodeEntities($text);

    // Lowercase.
    $text = mb_strtolower($text);

    // Remove diacritics.
    $text = $this->transliteration->removeDiacritics($text);

    // Call an external processor for word handling. $text is passed by
    // reference and replaced with whatever the hook implementations return.
    $this->invokePreprocess($text, $langcode);

    // Simple CJK handling: when enabled, every run of CJK characters is
    // replaced by overlapping, space-separated tokens (see ::expandCjk()).
    if ($this->configFactory->get('search.settings')->get('index.overlap_cjk')) {
      $text = preg_replace_callback('/[' . self::PREG_CLASS_CJK . ']+/u', [
        $this,
        'expandCjk',
      ], $text);
    }

    // To improve searching for numerical data such as dates, IP addresses
    // or version numbers, we consider a group of numerical characters
    // separated only by punctuation characters to be one piece.
    // This also means that searching for e.g. '20/03/1984' also returns
    // results with '20-03-1984' in them.
    // Readable regexp: ([number]+)[punctuation]+(?=[number])
    // The replacement keeps the captured number and drops the punctuation.
    $text = preg_replace('/([' . self::PREG_CLASS_NUMBERS . ']+)[' . self::PREG_CLASS_PUNCTUATION . ']+(?=[' . self::PREG_CLASS_NUMBERS . '])/u', '\\1', $text);

    // Multiple dot and dash groups are word boundaries and replaced with space.
    // No need to use the unicode modifier here because 0-127 ASCII characters
    // can't match higher UTF-8 characters as the leftmost bit of those are 1.
    $text = preg_replace('/[.-]{2,}/', ' ', $text);

    // The dot, underscore and dash are simply removed. This allows meaningful
    // search behavior with acronyms and URLs. See unicode note directly above.
    $text = preg_replace('/[._-]+/', '', $text);

    // With the exception of the rules above, we consider all punctuation,
    // marks, spacers, etc, to be a word boundary.
    $text = preg_replace('/[' . Unicode::PREG_CLASS_WORD_BOUNDARY . ']+/u', ' ', $text);

    // Truncate every word to 50 characters. ::truncate() also strips leading
    // zeros from numeric words.
    $words = explode(' ', $text);
    array_walk($words, [
      $this,
      'truncate',
    ]);
    return implode(' ', $words);
  }

  /**
   * Invokes hook_search_preprocess() to simplify text.
   *
   * Each implementing module is invoked in turn with the output of the
   * previous one, so the implementations form a processing chain. The return
   * value of every implementation replaces $text.
   *
   * @param string $text
   *   Text to preprocess, passed by reference and altered in place.
   * @param string|null $langcode
   *   Language code for the language of $text, if known.
   */
  protected function invokePreprocess(string &$text, ?string $langcode = NULL) : void {
    foreach ($this->moduleHandler->getImplementations('search_preprocess') as $module) {
      $text = $this->moduleHandler->invoke($module, 'search_preprocess', [
        $text,
        $langcode,
      ]);
    }
  }

  /**
   * Splits CJK (Chinese, Japanese, Korean) text into tokens.
   *
   * The Search module matches exact words, where a word is defined to be a
   * sequence of characters delimited by spaces or punctuation. CJK languages
   * are written in long strings of characters, though, not split up into words.
   * So in order to allow search matching, we split up CJK text into tokens
   * consisting of consecutive, overlapping sequences of characters whose length
   * is equal to the 'minimum_word_size' variable. This tokenizing is only done
   * if the 'overlap_cjk' variable is TRUE.
   *
   * @param array $matches
   *   This function is a callback for preg_replace_callback(), which is called
   *   from self::analyze(). So, $matches is an array of regular expression
   *   matches, which means that $matches[0] contains the matched text -- a
   *   string of CJK characters to tokenize.
   *
   * @return string
   *   Tokenized text, starting and ending with a space character.
   */
  protected function expandCjk(array $matches) : string {
    $min = $this->configFactory
      ->get('search.settings')
      ->get('index.minimum_word_size');
    $str = $matches[0];
    $length = mb_strlen($str);

    // If the text is no longer than the minimum word size, it is already a
    // single token; don't split it.
    if ($length <= $min) {
      return ' ' . $str . ' ';
    }

    // Split the run into individual characters once, instead of repeatedly
    // peeling the first character off and re-copying the rest of the string.
    $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === FALSE) {
      // The input comes from a successful /u match, so this is not expected;
      // fall back to treating the run as one token.
      return ' ' . $str . ' ';
    }

    // A minimum word size below 1 still produces one-character tokens.
    $window = max(1, (int) $min);
    $last_start = count($chars) - $window;

    // Emit every window of $window consecutive characters, each followed by a
    // space, so that adjacent tokens overlap by $window - 1 characters.
    $tokens = ' ';
    for ($start = 0; $start <= $last_start; $start++) {
      $tokens .= implode('', array_slice($chars, $start, $window)) . ' ';
    }
    return $tokens;
  }

  /**
   * Helper function for array_walk in ::analyze().
   *
   * Numeric words first lose their leading zeros; any word longer than 50
   * characters is then cut down to its first 50 characters. Note that a
   * numeric word consisting only of zeros (e.g. '0') becomes an empty string.
   *
   * @param string $text
   *   The text to be truncated, passed by reference and altered in place.
   */
  protected function truncate(string &$text) : void {
    if (is_numeric($text)) {
      $text = ltrim($text, '0');
    }
    if (mb_strlen($text) <= 50) {
      return;
    }
    $text = mb_substr($text, 0, 50);
  }

}

Members

Namesort descending Modifiers Type Description Overrides
SearchTextProcessor::$configFactory protected property The config factory.
SearchTextProcessor::$moduleHandler protected property The module handler.
SearchTextProcessor::$transliteration protected property The transliteration service.
SearchTextProcessor::analyze public function Runs the text through character analyzers in preparation for indexing. Overrides SearchTextProcessorInterface::analyze
SearchTextProcessor::expandCjk protected function Splits CJK (Chinese, Japanese, Korean) text into tokens.
SearchTextProcessor::invokePreprocess protected function Invokes hook_search_preprocess() to simplify text.
SearchTextProcessor::process public function Processes text into words for indexing. Overrides SearchTextProcessorInterface::process
SearchTextProcessor::truncate protected function Helper function for array_walk in ::analyze().
SearchTextProcessor::__construct public function SearchTextProcessor constructor.
SearchTextProcessorInterface::PREG_CLASS_CJK constant Matches CJK (Chinese, Japanese, Korean) letter-like characters.
SearchTextProcessorInterface::PREG_CLASS_NUMBERS constant Matches all 'N' Unicode character classes (numbers).
SearchTextProcessorInterface::PREG_CLASS_PUNCTUATION constant Matches all 'P' Unicode character classes (punctuation).