You are here

function SearchByPageService::excerpt in Search by Page 8

Returns a search excerpt, with matched keywords highlighted. This method was formerly the function search_by_page_excerpt($keys, $text) in the .module file and has been ported to Drupal 8.

This is a drop-in replacement for the core search_excerpt() function. The difference is that it allows stemming modules (or other modules that preprocess search text and terms) to highlight words other than exact keyword matches in the text, by implementing hook_search_by_page_excerpt_match().

Parameters

$keys: A string containing a search query.

$text: The text to extract fragments from.

Return value

A string containing HTML for the excerpt.

File

src/Services/SearchByPageService.php, line 165
Search By Page Service and helper methods.

Class

SearchByPageService

Namespace

Drupal\search_by_page\Services

Code

/**
 * Returns a search excerpt, with matched keywords highlighted.
 *
 * Besides exact keyword matches, modules implementing
 * hook_search_by_page_excerpt_match() may report additional match positions
 * and extra keywords to highlight.
 *
 * @param string $keys
 *   A string containing a search query.
 * @param string $text
 *   The text to extract fragments from.
 *
 * @return string
 *   A string containing HTML for the excerpt: text fragments joined by
 *   ' ... ', with matched keywords wrapped in <strong> tags.
 */
function excerpt($keys, $text) {

  // We highlight around non-indexable or CJK characters.
  $boundary = '(?:(?<=[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . PREG_CLASS_CJK . '])|(?=[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . PREG_CLASS_CJK . ']))';

  // Extract positive keywords and phrases. Bare keywords stop at a space or a
  // quote, so 'foo bar' yields two keywords and a quoted phrase that follows a
  // bare keyword is still recognized.
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text: pad tags with spaces so stripping them does not glue words
  // together, and surround the text with spaces so every word has a space on
  // both sides.
  $text = ' ' . strip_tags(str_replace([
    '<',
    '>',
  ], [
    ' <',
    '> ',
  ], $text)) . ' ';
  array_walk($keys, '_search_excerpt_replace');
  $workkeys = $keys;

  // Extract fragments around keywords.
  // First we collect ranges of text around each keyword, starting/ending
  // at spaces, trying to get to 256 characters.
  // If the sum of all fragments is too short, we look for second occurrences.
  $ranges = [];
  $included = [];
  $foundkeys = [];
  $length = 0;
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
      // Unmatched capture groups leave empty placeholder entries; drop them.
      if (!strlen($key)) {
        unset($workkeys[$k]);
        unset($keys[$k]);
        continue;
      }
      if ($length >= 256) {
        break;
      }

      // Remember occurrence of key so we can skip over it if more occurrences
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }

      // Locate a keyword (position $p, always >0 because $text starts with
      // a space). Try a bare keyword and let stemming modules try to find a
      // derived form. Make sure to keep the leftmost match found.
      $p = 0;
      if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
        $p = $match[0][1];
      }
      foreach (\Drupal::moduleHandler()
        ->getImplementations('search_by_page_excerpt_match') as $module) {
        $info = \Drupal::moduleHandler()
          ->invoke($module, 'search_by_page_excerpt_match', [
          $key,
          $text,
          $included[$key],
          $boundary,
        ]);

        // An implementation may return nothing, or omit keys, when it has no
        // match to report.
        if (!is_array($info) || empty($info['where'])) {
          continue;
        }
        if (!$p || $info['where'] < $p) {
          $p = $info['where'];
        }
        if (!empty($info['keyword'])) {
          $foundkeys[] = $info['keyword'];
        }
      }

      // Now locate a space in front (position $q) and behind it (position $s),
      // leaving about 60 characters extra before and after for context.
      // Note that a space was added to the front and end of $text above.
      // These are byte offsets, consistent with PREG_OFFSET_CAPTURE.
      if ($p) {
        if (($q = strpos($text, ' ', max(0, $p - 60))) !== FALSE) {
          $end = substr($text, $p, 80);

          // CODER-IGNORE-THIS
          if (($s = strrpos($end, ' ')) !== FALSE) {
            $ranges[$q] = $p + $s;
            $length += $p + $s - $q;
            $included[$key] = $p + 1;
          }
          else {
            unset($workkeys[$k]);
          }
        }
        else {
          unset($workkeys[$k]);
        }
      }
      else {
        unset($workkeys[$k]);
      }
    }
  }

  // If we didn't find anything, return the beginning.
  if (count($ranges) == 0) {
    return mb_substr($text, 0, 256) . ' ...';
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
  $newranges = [];
  foreach ($ranges as $from2 => $to2) {
    if (!isset($from1)) {
      $from1 = $from2;
      $to1 = $to2;
      continue;
    }
    if ($from2 <= $to1) {
      $to1 = max($to1, $to2);
    }
    else {
      $newranges[$from1] = $to1;
      $from1 = $from2;
      $to1 = $to2;
    }
  }
  $newranges[$from1] = $to1;

  // Fetch text
  $out = [];
  foreach ($newranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);

    // CODER-IGNORE-THIS
  }
  $text = (isset($newranges[0]) ? '' : '... ') . implode(' ... ', $out) . ' ...';

  // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
  // array_merge() is used instead of the + operator because both arrays are
  // numerically indexed and + would drop hook keywords whose index is already
  // taken. Empty entries are removed because the loop above can stop before
  // reaching them, and an empty alternative would match at every boundary.
  $highlight = array_unique(array_filter(array_merge($keys, $foundkeys), function ($key) {
    return is_string($key) && $key !== '';
  }));
  if (!$highlight) {
    return $text;
  }
  $text = preg_replace('/' . $boundary . '(' . implode('|', $highlight) . ')' . $boundary . '/iu', '<strong>\\0</strong>', $text);
  return $text;
}