You are here

function search_excerpt in Drupal 10

Same name and namespace in other branches
  1. 8 core/modules/search/search.module \search_excerpt()
  2. 4 modules/search.module \search_excerpt()
  3. 5 modules/search/search.module \search_excerpt()
  4. 6 modules/search/search.module \search_excerpt()
  5. 7 modules/search/search.module \search_excerpt()
  6. 9 core/modules/search/search.module \search_excerpt()

Returns snippets from a piece of text, with search keywords highlighted.

Used for formatting search results. All HTML tags will be stripped from $text.

Parameters

string $keys: A string containing a search query.

string $text: The text to extract fragments from.

string|null $langcode: Language code for the language of $text, if known.

Return value

array A render array containing HTML for the excerpt.

Related topics

3 calls to search_excerpt()
HelpSearch::prepareResults in core/modules/help_topics/src/Plugin/Search/HelpSearch.php
Prepares search results for display.
NodeSearch::prepareResults in core/modules/node/src/Plugin/Search/NodeSearch.php
Prepares search results for rendering.
SearchExcerptTest::doSearchExcerpt in core/modules/search/tests/src/Kernel/SearchExcerptTest.php
Calls search_excerpt() and renders output.

File

core/modules/search/search.module, line 134
Enables site-wide keyword searching.

Code

/**
 * Returns snippets from a piece of text, with search keywords highlighted.
 *
 * Used for formatting search results. All HTML tags will be stripped from
 * $text.
 *
 * All position arithmetic in this function is done in bytes (strlen(),
 * strpos(), substr() and PREG_OFFSET_CAPTURE offsets), which is consistent as
 * long as every offset handed back to PCRE lies on a UTF-8 character boundary.
 *
 * @param string $keys
 *   A string containing a search query.
 * @param string $text
 *   The text to extract fragments from.
 * @param string|null $langcode
 *   Language code for the language of $text, if known.
 *
 * @return array
 *   A render array containing HTML for the excerpt. When no keyword is found,
 *   this is a '#plain_text' element with the truncated start of the text;
 *   otherwise it is a '#markup' element in which only <strong> is allowed.
 */
function search_excerpt($keys, $text, $langcode = NULL) {

  // We highlight around non-indexable or CJK characters.
  $boundary_character = '[' . Unicode::PREG_CLASS_WORD_BOUNDARY . SearchTextProcessorInterface::PREG_CLASS_CJK . ']';
  $preceded_by_boundary = '(?<=' . $boundary_character . ')';
  $followed_by_boundary = '(?=' . $boundary_character . ')';

  // Extract positive keywords and phrases. Only a standalone "OR" token is
  // skipped; a keyword that merely starts with "OR" (for example "ORANGE")
  // must still be extracted so that it can be highlighted.
  preg_match_all('/ ("([^"]+)"|(?!OR(?: |$))([^" ]+))/', ' ' . $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text by stripping HTML tags and decoding HTML entities. Spaces are
  // inserted around angle brackets first so that words separated only by tags
  // do not run together once the tags are removed.
  $text = strip_tags(str_replace([
    '<',
    '>',
  ], [
    ' <',
    '> ',
  ], $text));
  $text = Html::decodeEntities($text);
  $text_length = strlen($text);

  // Since we are requiring a match on a word boundary, the text that is
  // searched starts and ends with a space. Offsets returned by preg_match()
  // below are therefore one byte larger than the matching offsets in $text.
  $padded_text = ' ' . $text . ' ';
  $padded_length = $text_length + 2;

  // Make a list of unique keywords that are actually found in the text,
  // which could be items in $keys or replacements that are equivalent through
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $temp_keys = [];
  foreach ($keys as $key) {
    $key = _search_find_match_with_simplify($key, $text, $boundary_character, $langcode);
    if (isset($key)) {

      // Quote slashes so they can be used in regular expressions.
      $temp_keys[] = preg_quote($key, '/');
    }
  }

  // Several keywords could have simplified down to the same thing, so pick
  // out the unique ones.
  $keys = array_unique($temp_keys);

  // Extract fragments of about 60 characters around keywords, bounded by word
  // boundary characters. Try to reach 256 characters, using second occurrences
  // if necessary.
  $ranges = [];
  $length = 0;
  $look_start = [];
  $remaining_keys = $keys;
  while ($length < 256 && !empty($remaining_keys)) {
    $found_keys = [];
    foreach ($remaining_keys as $key) {
      if ($length >= 256) {
        break;
      }

      // Remember where we last found $key, in case we are coming through a
      // second time.
      if (!isset($look_start[$key])) {
        $look_start[$key] = 0;
      }

      // See if we can find $key after where we found it the last time.
      $matches = [];
      if (preg_match('/' . $preceded_by_boundary . $key . $followed_by_boundary . '/iu', $padded_text, $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) {
        $found_position = $matches[0][1];

        // Resume the next search one character after the start of this match.
        // The offset is in bytes, and in UTF-8 mode preg_match() fails when
        // the offset points into the middle of a multibyte character, so skip
        // any UTF-8 continuation bytes (10xxxxxx) to reach the next character
        // boundary.
        $next_start = $found_position + 1;
        while ($next_start < $padded_length && (ord($padded_text[$next_start]) & 0xC0) === 0x80) {
          $next_start++;
        }
        $look_start[$key] = $next_start;

        // Keep track of which keys we found this time, in case we need to
        // pass through again to find more text.
        $found_keys[] = $key;

        // Locate a space before and after this match, leaving about 60
        // characters of context on each end.
        $before = strpos(' ' . $text, ' ', max(0, $found_position - 61));
        if ($before !== FALSE && $before <= $found_position) {
          if ($text_length > $found_position + 60) {
            $after = strrpos(substr($text, 0, $found_position + 60), ' ', $found_position);
          }
          else {
            $after = $text_length;
          }
          if ($after !== FALSE && $after > $found_position) {

            // Account for the spaces we added.
            $before = max($before - 1, 0);
            if ($before < $after) {

              // Save this range.
              $ranges[$before] = $after;
              $length += $after - $before;
            }
          }
        }
      }
    }

    // Next time through this loop, only look for keys we found this time,
    // if any.
    $remaining_keys = $found_keys;
  }
  if (empty($ranges)) {

    // We didn't find any keyword matches, so just return the first part of the
    // text. Returning it as '#plain_text' leaves re-encoding of the HTML
    // special characters that we entity-decoded above to the render system.
    return [
      '#plain_text' => Unicode::truncate($text, 256, TRUE, TRUE),
    ];
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Collapse overlapping text ranges into one. The sorting makes it O(n).
  $new_ranges = [];
  $max_end = 0;
  foreach ($ranges as $this_from => $this_to) {
    $max_end = max($max_end, $this_to);
    if (!isset($working_from)) {

      // This is the first time through this loop: initialize.
      $working_from = $this_from;
      $working_to = $this_to;
      continue;
    }
    if ($this_from <= $working_to) {

      // The ranges overlap: combine them.
      $working_to = max($working_to, $this_to);
    }
    else {

      // The ranges do not overlap: save the working range and start a new one.
      $new_ranges[$working_from] = $working_to;
      $working_from = $this_from;
      $working_to = $this_to;
    }
  }

  // Save the remaining working range.
  $new_ranges[$working_from] = $working_to;

  // Fetch text within the combined ranges we found.
  $out = [];
  foreach ($new_ranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }

  // Combine the text chunks with "…" separators. The "…" needs to be
  // translated. Let translators have the … separator text as one chunk.
  $ellipses = explode('@excerpt', t('… @excerpt … @excerpt …'));
  $text = (isset($new_ranges[0]) ? '' : $ellipses[0]) . implode($ellipses[1], $out) . ($max_end < strlen($text) - 1 ? $ellipses[2] : '');

  // Highlight keywords. All keywords are matched in a single pass to prevent
  // conflicts ('strong' and '<strong>'). The matching is done on the raw text
  // and each piece is escaped separately afterwards: matching against already
  // escaped text would highlight keywords such as "amp" or "quot" inside the
  // entities produced by escaping, and would miss keywords that themselves
  // contain characters that get escaped.
  $highlight_pattern = '/' . $preceded_by_boundary . '(' . implode('|', $keys) . ')' . $followed_by_boundary . '/iu';
  $pieces = preg_split($highlight_pattern, ' ' . $text . ' ', -1, PREG_SPLIT_DELIM_CAPTURE);
  if ($pieces === FALSE) {

    // The pattern could not be applied; fall back to the escaped excerpt
    // without any highlighting rather than returning nothing.
    $markup = Html::escape($text);
  }
  else {

    // With a single capture group and PREG_SPLIT_DELIM_CAPTURE, the pieces
    // alternate between surrounding text (even indexes) and matched keywords
    // (odd indexes).
    $markup = '';
    foreach ($pieces as $index => $piece) {
      $escaped = Html::escape($piece);
      $markup .= ($index % 2 === 1) ? '<strong>' . $escaped . '</strong>' : $escaped;
    }
  }
  return [
    '#markup' => trim($markup),
    '#allowed_tags' => [
      'strong',
    ],
  ];
}