You are here

function fuzzysearch_process_excerpt in Fuzzy Search 7

Fuzzysearch process excerpt.

1 call to fuzzysearch_process_excerpt()
fuzzysearch_build_excerpt in ./fuzzysearch.module
Fuzzysearch build excerpt.

File

./fuzzysearch.module, line 276
Fuzzysearch module.

Code

/**
 * Builds a highlighted excerpt of $text around the searched-for words.
 *
 * Search terms that do not occur verbatim in $text are first rebuilt into a
 * tolerant regular expression from the ngrams that did match, so that the
 * (presumably misspelled) term can still be located and highlighted.
 *
 * @param array $ngrams
 *   Matched ngram data; it is passed to fuzzysearch_unique(), whose result is
 *   used here as an array keyed by ngram, each value being a list of arrays
 *   with a 'word_id' element.
 * @param string $text
 *   The text to build the excerpt from.
 * @param array $keys
 *   The search terms. NOTE(review): terms are interpolated into regular
 *   expressions unquoted, so they are assumed to be already cleansed of regex
 *   metacharacters -- confirm against the caller.
 * @param array $fuzzy
 *   Settings. The elements read here are 'nlength', 'spelling',
 *   'debug_score', 'excerpt_length' and 'max_result_length'.
 *
 * @return string|null
 *   The excerpt with found words wrapped in <strong> tags (followed by debug
 *   markup when 'debug_score' is set), or NULL when no term could be located
 *   in $text.
 */
function fuzzysearch_process_excerpt($ngrams, $text, $keys, $fuzzy) {
  global $multibyte;
  $boundary = '(?:(?<=[' . FUZZYSEARCH_PREG_CLASS_SEARCH_EXCLUDE . FUZZYSEARCH_PREG_CLASS_CJK . '])|(?=[' . FUZZYSEARCH_PREG_CLASS_SEARCH_EXCLUDE . FUZZYSEARCH_PREG_CLASS_CJK . ']))';

  // Ngrams can occur multiple times, so filter.
  $clean_grams = fuzzysearch_unique($ngrams);

  // This will hold our search terms.
  $clean_words = $keys;
  $debug = '';

  // Now we rebuild the words stripping out misspelled ngrams.
  foreach ($clean_words as $key => $clean_word) {

    // If we have an exact match, let's skip the work to check for misspellings.
    if (!preg_match('/\\b' . $clean_word . '\\b/iu', $text)) {
      $pos = $id_count = array();
      $len = drupal_strlen($clean_word);

      // Reset the values reported by the debug output for every term, so a
      // term that is too short never reports a previous term's values (or
      // undefined variables).
      $newword = '';
      $spell_percent = 0;

      // Ignore search terms less than the ngram length.
      if ($len >= $fuzzy['nlength']) {

        // Get the position of each good hit.
        foreach ($clean_grams as $n => $gram) {
          // Numeric ngrams become integer array keys; cast back to a string so
          // the needle is always searched for as text.
          $needle = (string) $n;
          $hit = $multibyte == UNICODE_MULTIBYTE ? mb_stripos($clean_word, $needle) : stripos($clean_word, $needle);
          if ($hit === FALSE) {
            continue;
          }
          $pos[$hit] = $needle;

          // Keep count of our word ids so we can try to guess which word
          // we are trying to match.
          foreach ($gram as $ngram_data) {
            $word_id = $ngram_data['word_id'];
            $id_count[$word_id] = isset($id_count[$word_id]) ? $id_count[$word_id] + 1 : 1;
          }
        }
        ksort($pos);

        // This gives us an array with the most common word_id as the first
        // element.
        arsort($id_count);
        $id_count = array_keys($id_count);

        // Remove any position matches that are not in our likely word (the
        // word with the highest word_id count).
        foreach ($pos as $position => $pgram) {
          $pmatch = FALSE;
          foreach ($clean_grams[$pgram] as $pid) {
            if ($pid['word_id'] == $id_count[0]) {
              $pmatch = TRUE;
            }
          }
          if (!$pmatch) {
            unset($pos[$position]);
          }
        }

        // Start with a dummy word at the right length, but only if there are
        // some matching ngram hits.
        if (count($pos)) {
          $newword = str_pad('', $len, '.');
        }
        $hits = $misses = $i = $pos_plus = 0;

        // Check character by character for ngram matches. We don't need to
        // check beyond the first character of the ngram.
        for ($i = 0; $i <= $len - $fuzzy['nlength']; $i++) {

          // This is a match, so insert it into our dummy word.
          if (isset($pos[$i])) {
            $newword = drupal_substr($newword, 0, $i + $pos_plus) . $pos[$i] . drupal_substr($newword, $i + $pos_plus + $fuzzy['nlength'], $len);
            ++$hits;
          }
          else {

            // But don't overwrite a letter, only a '.' .
            if (drupal_substr($newword, $i + $pos_plus + $fuzzy['nlength'] - 1, 1) == '.') {
              if ($i != 0 && $i + $pos_plus <= $len - $fuzzy['nlength']) {
                $newword = drupal_substr($newword, 0, $i + $pos_plus + $fuzzy['nlength'] - 1) . '.*' . drupal_substr($newword, $i + $pos_plus + 1 + $fuzzy['nlength'] - 1);
              }

              // If we insert here, we need to adjust the positions in the
              // $pos array.
              $pos_plus++;
              $len++;
            }
            ++$misses;
          }
        }

        // Only keep our rebuilt word, if it meets our minimum spelling match
        // score.
        // Subtract $pos_plus from $len to get the original search term length.
        // Then subtract $fuzzy['nlength'] - 1 to get the number of ngrams in
        // the term.
        $spell_percent = $hits / ($len - $pos_plus - $fuzzy['nlength'] + 1) * 100;
        if ($spell_percent >= $fuzzy['spelling']) {

          // Remove consecutive wildcards and add word boundaries.
          $newword = preg_replace("/\\.\\./", ".*", $newword);
          $newword = preg_replace("/\\.\\*\\.\\*/", ".*", $newword);
          $newword = '\\b\\w*' . trim($newword, '.*') . '.*?\\b';
          $clean_words[$key] = $newword;
        }
        else {
          unset($clean_words[$key]);
        }
      }
      else {
        unset($clean_words[$key]);
      }
      if ($fuzzy['debug_score']) {
        $debug .= '<p><strong>' . t('Highlighting regex @newword -- Ngram spelling match is @percent%', array(
          '@newword' => $newword,
          '@percent' => number_format($spell_percent, 2),
        )) . '</strong></p>';
      }
    }
  }

  // Build a replacement node body containing sections of text with the found
  // words, with leading and trailing text.
  $section = array();
  $section_length = array();
  $text_bytes = strlen($text);
  foreach ($clean_words as $k => $word) {
    // An empty pattern would match everywhere; there is nothing to locate.
    if ($word == '') {
      continue;
    }
    $location = 0;

    // If the word is found, add its position to $section.
    while (preg_match('/' . $word . '/iu', $text, $matches, PREG_OFFSET_CAPTURE, $location)) {

      // Make sure we didn't traverse any word breaks by checking for spaces.
      // A byte-wise search is enough here because we only care whether a
      // space is present, and a space at offset 0 must count as present.
      if (strpos($matches[0][0], ' ') === FALSE) {
        $section[] = _fuzzysearch_char_count($text, $matches[0][1]);
        // NOTE(review): this stores the length of the pattern, not of the
        // matched text; kept as is, but it looks unintended -- confirm.
        $section_length[$matches[0][1]] = drupal_strlen($word);
        $clean_words[$k] = $matches[0][0];
      }

      // Move past the start of this match so we don't find it again. The
      // offset is in bytes and, with the /u modifier, must not point into the
      // middle of a multibyte character, so skip UTF-8 continuation bytes.
      $location = $matches[0][1] + 1;
      while ($location < $text_bytes && (ord($text[$location]) & 0xC0) === 0x80) {
        $location++;
      }
    }
  }

  // If there are no matches, then our word has a lot of common ngrams, but
  // not in the right places. Return.
  if (empty($section)) {
    return;
  }

  // Because we found words one by one, the locations are out of order. Sort
  // so that the locations are in natural order.
  asort($section);
  ksort($section_length);
  // De-duplicate before re-indexing: $section must stay a gap-free list that
  // lines up with $section_length (which is already unique per byte offset).
  // Presumably _fuzzysearch_char_count() maps distinct offsets to distinct
  // positions -- verify.
  $section = array_values(array_unique($section));
  $section_length = array_values($section_length);
  $p = 0;
  $found = $newbody = '';
  $trail = $lead = $fuzzy['excerpt_length'] / 2;
  $start = $section[0];
  while (isset($section[$p])) {

    // If the current section is within the previous, let's not create a
    // new one, so we don't have any duplicate text.
    if (isset($section[$p + 1]) && $section[$p] + $lead + $section_length[$p] + $trail > $section[$p + 1]) {
      $trail = $section[$p + 1] + $section_length[$p + 1] + $lead - $start;
      $p++;
      continue;
    }

    // Put an excerpt into our replacement node body, with the found word in
    // the center.
    $found = $start - $lead < 0 ? drupal_substr($text, 0, $fuzzy['excerpt_length']) : drupal_substr($text, $start - $lead, $trail + $lead);
    if ($fuzzy['max_result_length'] && drupal_strlen($newbody . $found) > $fuzzy['max_result_length']) {
      break;
    }
    $newbody .= '...' . $found . '... ';
    $p++;
    $start = isset($section[$p]) ? $section[$p] : 0;
    $trail = $lead;
  }

  // Wrap the found words in a <strong> tag to highlight them.
  $newbody = preg_replace('/' . $boundary . '[^' . FUZZYSEARCH_PREG_CLASS_SEARCH_EXCLUDE . FUZZYSEARCH_PREG_CLASS_CJK . ']*' . '(' . implode('|', $clean_words) . ')' . '[^' . FUZZYSEARCH_PREG_CLASS_SEARCH_EXCLUDE . FUZZYSEARCH_PREG_CLASS_CJK . ']*' . $boundary . '/iu', '<strong>\\0</strong>', $newbody);
  return $newbody . $debug;
}