function _search_find_match_with_simplify in Drupal 9
Same name and namespace in other branches
- 8 core/modules/search/search.module \_search_find_match_with_simplify()
Finds an appropriate keyword in text.
Parameters
string $key: The keyword to find.
string $text: The text to search for the keyword.
string $boundary: Regular expression for the boundary character class (characters that indicate spaces between words).
string|null $langcode: Language code for the language of $text, if known.
Return value
string|null A segment of $text that is between word boundary characters that either matches $key directly, or matches $key when both this text segment and $key are processed by \Drupal\search\SearchTextProcessorInterface::analyze(). If a matching text segment is not located, NULL is returned.
1 call to _search_find_match_with_simplify()
- search_excerpt() in core/modules/search/search.module - Returns snippets from a piece of text, with search keywords highlighted.
File
- core/modules/search/search.module, line 472 - Enables site-wide keyword searching.
Code
/**
 * Finds an appropriate keyword in text.
 *
 * Three strategies are tried in order, and the first one that succeeds wins:
 * - $key appears verbatim (case-insensitively) between word boundaries.
 * - $key appears between word boundaries after both $key and $text are
 *   lower-cased and stripped of diacritics.
 * - $key matches after both $key and a run of whole words from $text are
 *   passed through \Drupal\search\SearchTextProcessorInterface::analyze().
 *
 * @param string $key
 *   The keyword to find.
 * @param string $text
 *   The text to search for the keyword.
 * @param string $boundary
 *   Regular expression for the boundary character class (characters that
 *   indicate spaces between words).
 * @param string|null $langcode
 *   Language code for the language of $text, if known.
 *
 * @return string|null
 *   A segment of $text that is between word boundary characters that either
 *   matches $key directly, or matches $key when both this text segment and
 *   $key are processed by
 *   \Drupal\search\SearchTextProcessorInterface::analyze(). If a matching text
 *   segment is not located, NULL is returned.
 */
function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
  $preceded_by_boundary = '(?<=' . $boundary . ')';
  $followed_by_boundary = '(?=' . $boundary . ')';

  // See if $key appears as-is. When testing, make sure $text starts/ends with
  // a space, because we require $key to be surrounded by word boundary
  // characters.
  $temp = trim($key);
  if ($temp == '') {
    return NULL;
  }
  if (preg_match('/' . $preceded_by_boundary . preg_quote($temp, '/') . $followed_by_boundary . '/iu', ' ' . $text . ' ')) {
    return $temp;
  }

  // See if there is a match after lower-casing and removing diacritics in
  // both, which should preserve the string length.
  $transliteration = \Drupal::service('transliteration');
  $new_text = $transliteration->removeDiacritics(mb_strtolower($text));
  $new_key = $transliteration->removeDiacritics(mb_strtolower($temp));
  if (preg_match('/' . $preceded_by_boundary . preg_quote($new_key, '/') . $followed_by_boundary . '/u', ' ' . $new_text . ' ', $matches, PREG_OFFSET_CAPTURE)) {
    // Use the offset of the boundary-delimited match itself. Searching again
    // with mb_strpos() would find the first occurrence of the key anywhere,
    // including inside a longer word that precedes the real match.
    // The reported offset is a byte offset into the padded subject, so drop
    // the one-byte leading space and convert bytes to characters.
    $byte_offset = $matches[0][1] - 1;
    $position = mb_strlen(substr($new_text, 0, $byte_offset));
    // This relies on the simplified text having the same character length as
    // $text, so that positions line up between the two strings.
    return mb_substr($text, $position, mb_strlen($new_key));
  }

  // Run both text and key through text processor.
  /** @var \Drupal\search\SearchTextProcessorInterface $text_processor */
  $text_processor = \Drupal::service('search.text_processor');
  $simplified_key = trim($text_processor->analyze($key, $langcode));
  $simplified_text = trim($text_processor->analyze($text, $langcode));
  if ($simplified_key == '' || $simplified_text == '' || strpos($simplified_text, $simplified_key) === FALSE) {
    // The simplified keyword and text do not match at all, or are empty.
    return NULL;
  }

  // Split $text into words, keeping track of where the word boundaries are.
  // A limit of -1 means "no limit"; passing NULL is deprecated as of PHP 8.1.
  $words = preg_split('/' . $boundary . '+/u', $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
  if ($words === FALSE) {
    // The split failed (for example on malformed UTF-8), so no segment of
    // $text can be located.
    return NULL;
  }
  // Add an entry pointing to the end of the string, for the loop below.
  $words[] = [
    '',
    strlen($text),
  ];

  // Using a binary search, find the earliest possible ending position in
  // $text where it will still match the keyword after applying
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $start_index = 0;
  $start_pos = $words[$start_index][1];
  $min_end_index = 1;
  $max_end_index = count($words) - 1;
  while ($max_end_index > $min_end_index) {
    // Check the index half way between min and max (rounded down). See if we
    // ended there, if we would still have a match.
    $proposed_end_index = intdiv($max_end_index + $min_end_index, 2);
    $proposed_end_pos = $words[$proposed_end_index][1];
    // Since the split was done with preg_split(), the positions are byte counts
    // not character counts, so use substr() not mb_substr() here.
    $trial_text = trim($text_processor->analyze(substr($text, $start_pos, $proposed_end_pos - $start_pos), $langcode));
    if (strpos($trial_text, $simplified_key) !== FALSE) {
      // The proposed endpoint is fine, text still matches.
      $max_end_index = $proposed_end_index;
    }
    else {
      // The proposed endpoint index is too early, so the earliest possible
      // OK ending point would be the next index.
      $min_end_index = $proposed_end_index + 1;
    }
  }

  // Now do the same for the starting position: using a binary search, find the
  // latest possible starting position in $text where it will still match the
  // keyword after applying
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $end_index = $min_end_index;
  $end_pos = $words[$end_index][1];
  $min_start_index = 0;
  $max_start_index = $end_index - 1;
  while ($max_start_index > $min_start_index) {
    // Check the index half way between min and max (rounded up, so that the
    // search always makes progress). See if we started there, if we would
    // still have a match.
    $proposed_start_index = intdiv($max_start_index + $min_start_index + 1, 2);
    $proposed_start_pos = $words[$proposed_start_index][1];
    // Since the split was done with preg_split(), the positions are byte counts
    // not character counts, so use substr() not mb_substr() here.
    $trial_text = trim($text_processor->analyze(substr($text, $proposed_start_pos, $end_pos - $proposed_start_pos), $langcode));
    if (strpos($trial_text, $simplified_key) !== FALSE) {
      // The proposed start point is fine, text still matches.
      $min_start_index = $proposed_start_index;
    }
    else {
      // The proposed start point index is too late, so the latest possible
      // OK starting point would be the previous index.
      $max_start_index = $proposed_start_index - 1;
    }
  }
  $start_index = $max_start_index;

  // Return the matching text. We need to use substr() here and not the
  // mb_substr() function, because the indices in $words came from preg_split(),
  // so they are Unicode-safe byte positions, not character positions.
  return trim(substr($text, $words[$start_index][1], $words[$end_index][1] - $words[$start_index][1]));
}