class SearchTextProcessor in Drupal 9
Same name and namespace in other branches
- 10 core/modules/search/src/SearchTextProcessor.php \Drupal\search\SearchTextProcessor
Processes search text for indexing.
Hierarchy
- class \Drupal\search\SearchTextProcessor implements SearchTextProcessorInterface
Expanded class hierarchy of SearchTextProcessor
1 string reference to 'SearchTextProcessor'
- search.services.yml in core/modules/search/search.services.yml - core/modules/search/search.services.yml
1 service uses SearchTextProcessor
- search.text_processor in core/modules/search/search.services.yml - Drupal\search\SearchTextProcessor
File
- core/modules/search/src/SearchTextProcessor.php, line 14
Namespace
Drupal\search
View source
class SearchTextProcessor implements SearchTextProcessorInterface {

  /**
   * The transliteration service.
   *
   * @var \Drupal\Component\Transliteration\TransliterationInterface
   */
  protected $transliteration;

  /**
   * The config factory.
   *
   * @var \Drupal\Core\Config\ConfigFactoryInterface
   */
  protected $configFactory;

  /**
   * The module handler.
   *
   * @var \Drupal\Core\Extension\ModuleHandlerInterface
   */
  protected $moduleHandler;

  /**
   * SearchTextProcessor constructor.
   *
   * @param \Drupal\Component\Transliteration\TransliterationInterface $transliteration
   *   The transliteration service, used to strip diacritics.
   * @param \Drupal\Core\Config\ConfigFactoryInterface $config_factory
   *   The config factory, used to read 'search.settings'.
   * @param \Drupal\Core\Extension\ModuleHandlerInterface $module_handler
   *   The module handler, used to invoke hook_search_preprocess().
   */
  public function __construct(TransliterationInterface $transliteration, ConfigFactoryInterface $config_factory, ModuleHandlerInterface $module_handler) {
    $this->transliteration = $transliteration;
    $this->configFactory = $config_factory;
    $this->moduleHandler = $module_handler;
  }

  /**
   * {@inheritdoc}
   */
  public function process(string $text, ?string $langcode = NULL) : array {
    // Analyze first, then split the normalized text on single spaces. Empty
    // strings can appear in the result when the analyzed text has leading,
    // trailing or consecutive spaces; they are returned as-is.
    $text = $this->analyze($text, $langcode);
    return explode(' ', $text);
  }

  /**
   * {@inheritdoc}
   */
  public function analyze(string $text, ?string $langcode = NULL) : string {
    // Decode entities to UTF-8.
    $text = Html::decodeEntities($text);

    // Lowercase.
    $text = mb_strtolower($text);

    // Remove diacritics.
    $text = $this->transliteration->removeDiacritics($text);

    // Call an external processor for word handling. $text is passed by
    // reference and replaced with the result of each hook implementation.
    $this->invokePreprocess($text, $langcode);

    // Simple CJK handling.
    if ($this->configFactory->get('search.settings')->get('index.overlap_cjk')) {
      $text = preg_replace_callback('/[' . self::PREG_CLASS_CJK . ']+/u', [
        $this,
        'expandCjk',
      ], $text);
    }

    // To improve searching for numerical data such as dates, IP addresses
    // or version numbers, we consider a group of numerical characters
    // separated only by punctuation characters to be one piece.
    // This also means that searching for e.g. '20/03/1984' also returns
    // results with '20-03-1984' in them.
    // Readable regexp: ([number]+)[punctuation]+(?=[number])
    $text = preg_replace('/([' . self::PREG_CLASS_NUMBERS . ']+)[' . self::PREG_CLASS_PUNCTUATION . ']+(?=[' . self::PREG_CLASS_NUMBERS . '])/u', '\\1', $text);

    // Multiple dot and dash groups are word boundaries and replaced with space.
    // No need to use the unicode modifier here because 0-127 ASCII characters
    // can't match higher UTF-8 characters as the leftmost bit of those are 1.
    $text = preg_replace('/[.-]{2,}/', ' ', $text);

    // The dot, underscore and dash are simply removed. This allows meaningful
    // search behavior with acronyms and URLs. See unicode note directly above.
    $text = preg_replace('/[._-]+/', '', $text);

    // With the exception of the rules above, we consider all punctuation,
    // marks, spacers, etc, to be a word boundary.
    $text = preg_replace('/[' . Unicode::PREG_CLASS_WORD_BOUNDARY . ']+/u', ' ', $text);

    // Normalize numeric words and truncate every word to 50 characters.
    $words = explode(' ', $text);
    array_walk($words, [
      $this,
      'truncate',
    ]);

    return implode(' ', $words);
  }

  /**
   * Invokes hook_search_preprocess() to simplify text.
   *
   * Each implementing module receives the text as returned by the previous
   * implementation, so the hooks form a processing chain.
   *
   * @param string $text
   *   Text to preprocess, passed by reference and altered in place.
   * @param string|null $langcode
   *   Language code for the language of $text, if known.
   */
  protected function invokePreprocess(string &$text, ?string $langcode = NULL) : void {
    foreach ($this->moduleHandler->getImplementations('search_preprocess') as $module) {
      $text = $this->moduleHandler->invoke($module, 'search_preprocess', [
        $text,
        $langcode,
      ]);
    }
  }

  /**
   * Splits CJK (Chinese, Japanese, Korean) text into tokens.
   *
   * The Search module matches exact words, where a word is defined to be a
   * sequence of characters delimited by spaces or punctuation. CJK languages
   * are written in long strings of characters, though, not split up into words.
   * So in order to allow search matching, we split up CJK text into tokens
   * consisting of consecutive, overlapping sequences of characters whose length
   * is equal to the 'minimum_word_size' variable. This tokenizing is only done
   * if the 'overlap_cjk' variable is TRUE.
   *
   * @param array $matches
   *   This function is a callback for preg_replace_callback(), which is called
   *   from self::analyze(). So, $matches is an array of regular expression
   *   matches, which means that $matches[0] contains the matched text -- a
   *   string of CJK characters to tokenize.
   *
   * @return string
   *   Tokenized text, starting and ending with a space character.
   */
  protected function expandCjk(array $matches) : string {
    // A missing or non-positive setting yields single-character tokens, so
    // clamp the window size to at least 1.
    $min = max(1, (int) $this->configFactory->get('search.settings')->get('index.minimum_word_size'));
    $str = $matches[0];

    // Split the string into UTF-8 characters once, instead of repeatedly
    // peeling one character off the front of the string.
    $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === FALSE) {
      // The input is not valid UTF-8. This is not expected when called from
      // analyze(), which matches with the /u modifier; leave the text as is.
      return ' ' . $str . ' ';
    }

    $length = count($chars);
    // If the text is not longer than the minimum word size, don't tokenize it.
    if ($length <= $min) {
      return ' ' . $str . ' ';
    }

    // Emit every window of $min consecutive characters, each followed by a
    // space, so the result both starts and ends with a space.
    $tokens = ' ';
    for ($start = 0; $start <= $length - $min; $start++) {
      $tokens .= implode('', array_slice($chars, $start, $min)) . ' ';
    }
    return $tokens;
  }

  /**
   * Helper function for array_walk in ::analyze().
   *
   * Strips leading zeros from numeric words, so that e.g. '007' and '7' index
   * identically, and limits every word to 50 characters.
   *
   * @param string $text
   *   The text to be truncated, passed by reference and altered in place.
   */
  protected function truncate(string &$text) : void {
    if (is_numeric($text)) {
      $text = ltrim($text, '0');
      // A word made only of zeros would otherwise become an empty string and
      // disappear from the index; keep a single '0' to represent it.
      if ($text === '') {
        $text = '0';
      }
    }

    if (mb_strlen($text) > 50) {
      $text = mb_substr($text, 0, 50);
    }
  }

}
Members
Name | Modifiers | Type | Description | Overrides |
---|---|---|---|---|
SearchTextProcessor::$configFactory | protected | property | The config factory. | |
SearchTextProcessor::$moduleHandler | protected | property | The module handler. | |
SearchTextProcessor::$transliteration | protected | property | The transliteration service. | |
SearchTextProcessor::analyze | public | function | Runs the text through character analyzers in preparation for indexing. | Overrides SearchTextProcessorInterface::analyze |
SearchTextProcessor::expandCjk | protected | function | Splits CJK (Chinese, Japanese, Korean) text into tokens. | |
SearchTextProcessor::invokePreprocess | protected | function | Invokes hook_search_preprocess() to simplify text. | |
SearchTextProcessor::process | public | function | Processes text into words for indexing. | Overrides SearchTextProcessorInterface::process |
SearchTextProcessor::truncate | protected | function | Helper function for array_walk in ::analyze(). | |
SearchTextProcessor::__construct | public | function | SearchTextProcessor constructor. | |
SearchTextProcessorInterface::PREG_CLASS_CJK | | constant | Matches CJK (Chinese, Japanese, Korean) letter-like characters. | |
SearchTextProcessorInterface::PREG_CLASS_NUMBERS | | constant | Matches all 'N' Unicode character classes (numbers) | |
SearchTextProcessorInterface::PREG_CLASS_PUNCTUATION | | constant | Matches all 'P' Unicode character classes (punctuation) | |