public function SearchIndex::index in Drupal 9
Same name and namespace in other branches
- 8 core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex::index()
Updates the full-text search index for a particular item.
Parameters
string $type: The plugin ID or other machine-readable type of this item, which should be less than 64 bytes.
int $sid: An ID number identifying this particular item (e.g., node ID).
string $langcode: Language code for the language of the text being indexed.
string $text: The content of this item. Must be a piece of HTML or plain text.
bool $update_weights: (optional) TRUE if word weights should be updated. FALSE otherwise; defaults to TRUE. If you pass in FALSE, then you need to have your calls to this method in a try/finally block, and at the end of your index run in the finally clause, you will need to call self::updateWordWeights(), passing in all of the returned words, to update the word weights.
Return value
string[] The words to be updated.
Throws
\Drupal\search\Exception\SearchIndexException If there is an error indexing the text.
Overrides SearchIndexInterface::index
File
- core/modules/search/src/SearchIndex.php, line 79
Class
- SearchIndex
- Provides search index management functions.
Namespace
Drupal\search
Code
/**
 * Updates the full-text search index for a particular item.
 *
 * The text is reduced to the tags configured in 'index.tag_weights', split
 * into alternating text and tag segments, and every word returned by the text
 * processor is scored according to the tags it is nested in and how far into
 * the text it appears. Existing index data for the item is cleared before the
 * new dataset and word scores are written.
 *
 * @param string $type
 *   The plugin ID or other machine-readable type of this item, which should
 *   be less than 64 bytes.
 * @param int $sid
 *   An ID number identifying this particular item (e.g., node ID).
 * @param string $langcode
 *   Language code for the language of the text being indexed.
 * @param string $text
 *   The content of this item. Must be a piece of HTML or plain text.
 * @param bool $update_weights
 *   (optional) TRUE if word weights should be updated. FALSE otherwise;
 *   defaults to TRUE. If you pass in FALSE, then you need to have your calls
 *   to this method in a try/finally block, and at the end of your index run
 *   in the finally clause, you will need to call self::updateWordWeights(),
 *   passing in all of the returned words, to update the word weights.
 *
 * @return string[]
 *   The words to be updated, as an array keyed by word.
 *
 * @throws \Drupal\search\Exception\SearchIndexException
 *   If there is an error indexing the text.
 */
public function index($type, $sid, $langcode, $text, $update_weights = TRUE) {
  $settings = $this->configFactory->get('search.settings');
  $minimum_word_size = $settings->get('index.minimum_word_size');

  // Keep track of the words that need to have their weights updated.
  $current_words = [];

  // Multipliers for scores of words inside certain HTML tags. The weights are
  // stored in config so that modules can overwrite the default weights.
  // Note: 'a' must be included for link ranking to work.
  $tags = $settings->get('index.tag_weights');

  // Strip off all ignored tags to speed up processing, but insert space
  // before and after them to keep word boundaries.
  $text = str_replace(['<', '>'], [' <', '> '], $text);
  $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');

  // Split HTML tags from plain text.
  $split = preg_split('/\\s*<([^>]+?)>\\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and
  // literals and begins and ends with a literal (inserting empty strings as
  // required).

  // Odd/even counter. Tag or no tag.
  $tag = FALSE;
  // Starting score per word.
  $score = 1;
  // Accumulator for cleaned up data.
  $accum = ' ';
  // Stack with open tags.
  $tagstack = [];
  // Counter for consecutive words.
  $tagwords = 0;
  // Focus state.
  $focus = 1;
  // Accumulator for words for index.
  $scored_words = [];

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag. Only the tag name
      // is relevant; attributes may be separated from it by any HTML
      // whitespace (space, tab, newline, ...), not just a single space.
      $tagname = mb_strtolower(substr($value, 0, strcspn($value, " \t\r\n\f")));

      // Closing or opening tag? The name can be empty when the captured
      // segment starts with whitespace, so guard the offset access.
      if ($tagname !== '' && $tagname[0] === '/') {
        $tagname = (string) substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect
        // boosting.
        if (!count($tagstack) || $tagstack[0] !== $tagname) {
          $tagstack = [];
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score. Names without a
          // configured weight contributed nothing when they were opened.
          $score = max(1, $score - ($tags[array_shift($tagstack)] ?? 0));
        }
      }
      else {
        if (isset($tagstack[0]) && $tagstack[0] === $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = [];
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score. A name without a
          // configured weight is still tracked but does not boost the score.
          array_unshift($tagstack, $tagname);
          $score += ($tags[$tagname] ?? 0);
        }
      }
      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty
      // values.
      if ($value != '') {
        $words = $this->textProcessor->process($value, $langcode);
        foreach ($words as $word) {
          // Add word to accumulator.
          $accum .= $word . ' ';
          // Check word length.
          if (is_numeric($word) || mb_strlen($word) >= $minimum_word_size) {
            if (!isset($scored_words[$word])) {
              $scored_words[$word] = 0;
            }
            $scored_words[$word] += $score * $focus;
            // Focus is a decaying value in terms of the amount of unique
            // words up to this point. It stays at 1 for roughly the first
            // 100 unique words and then decays, to about 0.38 at 500 words
            // and about 0.22 at 1000 words.
            $focus = min(1, 0.01 + 3.5 / (2 + count($scored_words) * 0.015));
          }
          $tagwords++;
          // Too many words inside a single tag probably mean a tag was
          // accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = [];
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }

  // Remove the item $sid from the search index, and invalidate the relevant
  // cache tags.
  $this->clear($type, $sid, $langcode);

  try {
    // Insert cleaned up data into dataset.
    $this->connection
      ->insert('search_dataset')
      ->fields([
        'sid' => $sid,
        'langcode' => $langcode,
        'type' => $type,
        'data' => $accum,
        'reindex' => 0,
      ])
      ->execute();

    // Insert results into search index.
    foreach ($scored_words as $word => $word_score) {
      // If a word already exists in the database, its score gets increased
      // appropriately. If not, we create a new record with the appropriate
      // starting score.
      $this->connection
        ->merge('search_index')
        ->keys([
          'word' => $word,
          'sid' => $sid,
          'langcode' => $langcode,
          'type' => $type,
        ])
        ->fields([
          'score' => $word_score,
        ])
        ->expression('score', '[score] + :score', [
          ':score' => $word_score,
        ])
        ->execute();
      $current_words[$word] = TRUE;
    }
  }
  catch (\Exception $e) {
    throw new SearchIndexException("Failed to insert dataset in index for type '{$type}', sid '{$sid}' and langcode '{$langcode}'", 0, $e);
  }
  finally {
    // Runs on both success and failure, so the weights of every word that was
    // written before an error are still updated.
    if ($update_weights) {
      $this->updateWordWeights($current_words);
    }
  }

  return $current_words;
}