You are here

protected function Highlight::createExcerpt in Search API 8

Returns snippets from a piece of text, with certain keywords highlighted.

Largely copied from search_excerpt().

Parameters

string $text: The text to extract fragments from.

array $keys: The search keywords entered by the user.

Return value

string|null A string containing HTML for the excerpt, or NULL if no excerpt could be created.

1 call to Highlight::createExcerpt()
Highlight::addExcerpts in src/Plugin/search_api/processor/Highlight.php
Adds excerpts to all results, if possible.

File

src/Plugin/search_api/processor/Highlight.php, line 456

Class

Highlight
Adds a highlighted excerpt to results and highlights returned fields.

Namespace

Drupal\search_api\Plugin\search_api\processor

Code

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 *
 * Largely copied from search_excerpt().
 *
 * The text is stripped of markup, then scanned for the keywords. Around each
 * match a fragment of context is collected until the configured
 * "excerpt_length" is reached or no more matches are found. Overlapping
 * fragments are merged, escaped, joined with the configured ellipses and
 * finally passed to highlightField().
 *
 * Reads the "excerpt_length" and "highlight_partial" settings from
 * $this->configuration.
 *
 * @param string $text
 *   The text to extract fragments from.
 * @param array $keys
 *   The search keywords entered by the user.
 *
 * @return string|null
 *   A string containing HTML for the excerpt, or NULL if no excerpt could be
 *   created.
 */
protected function createExcerpt($text, array $keys) {

  // Remove HTML tags <script> and <style> with all of their contents.
  $text = preg_replace('#<(style|script).*?>.*?</\\1>#is', ' ', $text);

  // Prepare text by stripping HTML tags and decoding HTML entities. Spaces
  // are inserted around the angle brackets first so that words separated only
  // by tags do not get glued together once the tags are removed.
  $text = strip_tags(str_replace([
    '<',
    '>',
  ], [
    ' <',
    '> ',
  ], $text));
  $text = Html::decodeEntities($text);
  $text = preg_replace('/\\s+/', ' ', $text);
  $text = trim($text, ' ');
  $text_length = mb_strlen($text);

  // Try to reach the requested excerpt length with about two fragments (each
  // with a keyword and some context).
  $ranges = [];
  $length = 0;
  $look_start = [];
  $remaining_keys = $keys;

  // Get the set excerpt length from the configuration. If the length is too
  // small, only use one fragment. round() returns a float, so cast to int to
  // keep all derived string offsets and array keys integers.
  $excerpt_length = $this->configuration['excerpt_length'];
  $context_length = (int) round($excerpt_length / 4) - 3;
  if ($context_length < 32) {
    $context_length = (int) round($excerpt_length / 2) - 1;
  }
  while ($length < $excerpt_length && !empty($remaining_keys)) {
    $found_keys = [];
    foreach ($remaining_keys as $key) {
      if ($length >= $excerpt_length) {
        break;
      }

      // Remember where we last found $key, in case we are coming through a
      // second time.
      if (!isset($look_start[$key])) {
        $look_start[$key] = 0;
      }

      // See if we can find $key after where we found it the last time. Since
      // we are requiring a match on a word boundary, make sure $text starts
      // and ends with a space.
      $matches = [];
      if (!$this->configuration['highlight_partial']) {
        $found_position = FALSE;
        $regex = '/' . static::$boundary . preg_quote($key, '/') . static::$boundary . '/iu';

        // $look_start contains the position as character offset, while
        // preg_match() takes a byte offset.
        $offset = $look_start[$key];
        if ($offset > 0) {
          $offset = strlen(mb_substr(' ' . $text, 0, $offset));
        }
        if (preg_match($regex, ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $offset)) {
          $found_position = $matches[0][1];

          // Convert the byte position into a multi-byte character position.
          $found_position = mb_strlen(substr(" {$text}", 0, $found_position));
        }
      }
      else {
        $found_position = mb_stripos($text, $key, $look_start[$key], 'UTF-8');
      }
      if ($found_position !== FALSE) {
        $look_start[$key] = $found_position + 1;

        // Keep track of which keys we found this time, in case we need to
        // pass through again to find more text.
        $found_keys[] = $key;

        // Locate a space before and after this match, leaving some context on
        // each end.
        if ($found_position > $context_length) {
          $before = mb_strpos($text, ' ', $found_position - $context_length);
          if ($before !== FALSE) {
            ++$before;
          }

          // If we can't find a space anywhere within the context length, just
          // settle for a non-space.
          if ($before === FALSE || $before > $found_position) {
            $before = $found_position - $context_length;
          }
        }
        else {
          $before = 0;
        }
        if ($before !== FALSE && $before <= $found_position) {
          if ($text_length > $found_position + $context_length) {
            $after = mb_strrpos(mb_substr($text, 0, $found_position + $context_length), ' ', $found_position);
          }
          else {
            $after = $text_length;
          }
          if ($after !== FALSE && $after > $found_position) {
            if ($before < $after) {

              // Save this range. A previous match may already have produced a
              // range starting at the same position (typically 0 for matches
              // near the start of the text). Only ever extend such a range,
              // never shrink it, and only count the newly covered characters
              // so that $length is not inflated by duplicates.
              $covered_to = isset($ranges[$before]) ? $ranges[$before] : $before;
              if ($after > $covered_to) {
                $ranges[$before] = $after;
                $length += $after - $covered_to;
              }
            }
          }
        }
      }
    }

    // Next time through this loop, only look for keys we found this time,
    // if any.
    $remaining_keys = $found_keys;
  }
  if (!$ranges) {

    // We didn't find any keyword matches, return NULL.
    return NULL;
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Collapse overlapping text ranges into one. The sorting makes it O(n).
  $new_ranges = [];
  $working_from = $working_to = NULL;
  foreach ($ranges as $this_from => $this_to) {
    if ($working_from === NULL) {

      // This is the first time through this loop: initialize.
      $working_from = $this_from;
      $working_to = $this_to;
      continue;
    }
    if ($this_from <= $working_to) {

      // The ranges overlap: combine them.
      $working_to = max($working_to, $this_to);
    }
    else {

      // The ranges do not overlap: save the working range and start a new
      // one.
      $new_ranges[$working_from] = $working_to;
      $working_from = $this_from;
      $working_to = $this_to;
    }
  }

  // Save the remaining working range.
  $new_ranges[$working_from] = $working_to;

  // Fetch text within the combined ranges we found. The text was decoded
  // above, so it has to be escaped again before being used as HTML.
  $out = [];
  foreach ($new_ranges as $from => $to) {
    $out[] = Html::escape(mb_substr($text, $from, $to - $from));
  }
  if (!$out) {
    return NULL;
  }

  // $ellipses holds the prefix, the separator between fragments and the
  // suffix, in that order.
  $ellipses = $this
    ->getEllipses();
  $excerpt = $ellipses[0] . implode($ellipses[1], $out) . $ellipses[2];

  // Since we stripped the tags at the beginning, highlighting doesn't need to
  // handle HTML anymore.
  return $this
    ->highlightField($excerpt, $keys, FALSE);
}