You are here

function search_by_page_excerpt in Search by Page 7

Same name and namespace in other branches
  1. 6 search_by_page.module \search_by_page_excerpt()

Returns a search excerpt, with matched keywords highlighted.

This is a drop-in replacement for the core search_excerpt() function. The difference is that it allows stemming modules (or other modules that preprocess search text and terms) to highlight words other than exact keyword matches in the text, by implementing hook_sbp_excerpt_match().

Parameters

$keys: A string containing a search query.

$text: The text to extract fragments from.

Return value

A string containing HTML for the excerpt.

6 calls to search_by_page_excerpt()
hook_sbp_details in ./search_by_page.api.php
Return details about an indexed path (required sub-module hook).
sbp_attach_sbp_details in ./sbp_attach.module
Implements Search by Page hook_sbp_details().
sbp_nodes_sbp_details in ./sbp_nodes.module
Implements Search by Page hook_sbp_details().
sbp_paths_sbp_details in ./sbp_paths.module
Implements Search by Page hook_sbp_details().
sbp_users_sbp_details in ./sbp_users.module
Implements Search by Page hook_sbp_details().

... See full list

File

./search_by_page.module, line 228
Main module file for Drupal module Search by Page.

Code

/**
 * Returns a search excerpt, with matched keywords highlighted.
 *
 * Works like the core search excerpt function, but additionally asks every
 * module implementing hook_sbp_excerpt_match() to locate derived forms of
 * each keyword (for example stemmed words), so those can be excerpted and
 * highlighted too.
 *
 * @param $keys
 *   A string containing a search query. Quoted phrases are kept as one
 *   keyword; unquoted words are treated as separate keywords.
 * @param $text
 *   The text to extract fragments from. HTML tags are stripped.
 *
 * @return
 *   A string containing HTML for the excerpt: fragments of the text joined
 *   by ' ... ', with matches wrapped in <strong> tags. If no keyword is
 *   found, the beginning of the text (truncated to 256 characters) followed
 *   by ' ...' is returned.
 */
function search_by_page_excerpt($keys, $text) {

  // We highlight around non-indexable or CJK characters.
  $boundary = '(?:(?<=[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . PREG_CLASS_CJK . '])|(?=[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . PREG_CLASS_CJK . ']))';

  // Extract positive keywords and phrases. Unquoted terms end at the next
  // space, so that each word of a multi-word query becomes its own keyword;
  // only quoted phrases may contain spaces.
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text: pad tags with spaces so that words separated only by markup
  // do not run together once the tags are stripped, and surround the text
  // with spaces so every word has a space before and after it.
  $text = ' ' . strip_tags(str_replace(array(
    '<',
    '>',
  ), array(
    ' <',
    '> ',
  ), $text)) . ' ';
  array_walk($keys, '_search_excerpt_replace');

  // Each regex match fills only one of the two capture groups, so the merged
  // list contains empty strings. Drop them now: an empty alternative in the
  // highlighting pattern below would match at every word boundary.
  $keys = array_values(array_filter($keys, 'strlen'));
  $workkeys = $keys;

  // Extract fragments around keywords.
  // First we collect ranges of text around each keyword, starting/ending
  // at spaces, trying to get to 256 characters.
  // If the sum of all fragments is too short, we look for second occurrences.
  $ranges = array();
  $included = array();
  $foundkeys = array();
  $length = 0;
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
      if ($length >= 256) {
        break;
      }

      // Remember occurrence of key so we can skip over it if more occurrences
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }

      // Locate a keyword (position $p, always >0 because $text starts with
      // a space). Try a bare keyword and let stemming modules try to find a
      // derived form. Make sure to keep the leftmost match found.
      $p = 0;
      if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
        $p = $match[0][1];
      }
      foreach (module_implements('sbp_excerpt_match') as $module) {
        $info = module_invoke($module, 'sbp_excerpt_match', $key, $text, $included[$key], $boundary);

        // An implementation may return nothing, or omit either element, when
        // it has no match to report.
        if (!is_array($info) || empty($info['where'])) {
          continue;
        }
        if (!$p || $info['where'] < $p) {
          $p = $info['where'];
        }
        if (!empty($info['keyword'])) {
          $foundkeys[] = $info['keyword'];
        }
      }

      // Nothing (more) found for this keyword: stop looking for it.
      if (!$p) {
        unset($workkeys[$k]);
        continue;
      }

      // Now locate a space in front (position $q) and behind it (position $s),
      // leaving about 60 characters extra before and after for context.
      // Note that a space was added to the front and end of $text above.
      $q = strpos($text, ' ', max(0, $p - 60));
      if ($q === FALSE) {
        unset($workkeys[$k]);
        continue;
      }
      $end = substr($text, $p, 80);
      $s = strrpos($end, ' ');
      if ($s === FALSE) {
        unset($workkeys[$k]);
        continue;
      }
      $ranges[$q] = $p + $s;
      $length += $p + $s - $q;

      // Resume the next search for this keyword just past this match.
      $included[$key] = $p + 1;
    }
  }

  // If we didn't find anything, return the beginning.
  if (count($ranges) == 0) {
    return truncate_utf8($text, 256) . ' ...';
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
  $newranges = array();
  foreach ($ranges as $from2 => $to2) {
    if (!isset($from1)) {
      $from1 = $from2;
      $to1 = $to2;
      continue;
    }
    if ($from2 <= $to1) {
      $to1 = max($to1, $to2);
    }
    else {
      $newranges[$from1] = $to1;
      $from1 = $from2;
      $to1 = $to2;
    }
  }
  $newranges[$from1] = $to1;

  // Fetch text
  $out = array();
  foreach ($newranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }

  // A leading ellipsis is only needed when the excerpt does not start at the
  // very beginning of the text.
  $text = (isset($newranges[0]) ? '' : '... ') . implode(' ... ', $out) . ' ...';

  // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
  // The lists are merged rather than united with '+': both are numerically
  // indexed, so a union would silently drop hook-supplied keywords whose
  // index is already used by a query keyword.
  // NOTE(review): hook-supplied keywords are inserted into the pattern as
  // returned; presumably implementations return regex-safe text — confirm.
  $keys = array_unique(array_merge($keys, $foundkeys));
  $text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\\0</strong>', $text);
  return $text;
}