You are here

function fuzzysearch_process in Fuzzy Search 6

Process the search query and return the matching nodes.

1 call to fuzzysearch_process()
theme_fuzzysearch_show_results in ./fuzzysearch.module
Theme hook for rendering search results.

File

./fuzzysearch.module, line 586
Module file for fuzzysearch module.

Code

/**
 * Process the search query and build the list of matching nodes.
 *
 * The query is cleansed word by word, logged, passed through
 * hook_fuzzysearch_filter() and the stopword filter, and then split into
 * ngrams that are matched against {fuzzysearch_index}. Each matching node is
 * loaded and gets ->score and ->completeness set from the query result. When
 * $theme is 1, the node body is additionally replaced by highlighted excerpts
 * around the (possibly misspelled) search terms.
 *
 * @param $query
 *   The raw search keys. They may come from the form or from the URL, so they
 *   are sanitized again here.
 * @param $theme
 *   Result style. BLOCK_THEME caps the result set at the
 *   'fuzzysearch_block_limit' variable instead of using a pager. The value 1
 *   (presumably NODE_THEME -- confirm against the constant definition) turns
 *   on excerpt building and highlighting.
 * @param $limit
 *   Number of results per page handed to pager_query().
 *
 * @return
 *   An array of node objects, or NULL when the query is empty, contains no
 *   word long enough to produce an ngram, or matches nothing.
 */
function fuzzysearch_process($query, $theme = NODE_THEME, $limit = 10) {
  global $multibyte;

  // If no keys were entered do not display anything below the search form.
  if (!$query) {
    return;
  }

  // Sanitize query again because it can be submitted from url as well as form.
  // Do this word-by-word to keep words whole even after removing excluded
  // characters. Keep the original for highlighting.
  $orig_query = $query = trim($query);
  $query_array = array();
  foreach (explode(' ', $query) as $part) {
    $query_array[] = str_replace(' ', '', fuzzysearch_cleanse($part));
  }
  $query = implode(' ', $query_array);

  // Log the search keys:
  watchdog('fuzzysearch', '%query', array(
    '%query' => $query,
  ), WATCHDOG_NOTICE, l(t('results'), variable_get('fuzzysearch_path_name', 'fuzzysearch/results') . '/' . $query));

  // Hook_fuzzysearch_filter lets modules filter text. This should be used for
  // more complex filtering. Stop words should not use this. Create a stopword
  // file instead. See fuzzysearch/stopwords/README.txt.
  foreach (module_implements('fuzzysearch_filter') as $name) {
    $function = $name . '_fuzzysearch_filter';
    $query = $function('search', $query);
  }

  // Remove stopwords.
  $query = fuzzysearch_stopwords($query);

  // Make sure we still have a query.
  if (!$query) {
    return;
  }
  $nlength = variable_get('fuzzysearch_ngram_length', 3);
  $min_spelling = variable_get('fuzzysearch_spelling', 30);
  $excerpt = variable_get('fuzzysearch_excerpt', 200);
  $missing_letters = variable_get('fuzzysearch_missing_letters', 1);
  $extra_letters = variable_get('fuzzysearch_extra_letters', 1);
  $debug = variable_get('fuzzysearch_debug_search', FALSE) && user_access('access fuzzysearch debugging');
  $boundary = '(?:(?<=[' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . '])|(?=[' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']))';
  $words = explode(' ', $query);

  // Build the WHERE clause for the ngrams. Values are passed as query
  // arguments rather than concatenated into the SQL, so search terms can
  // neither break out of the string literal nor be mistaken for db_query()
  // placeholders.
  // @todo Change type of query based on boolean operators
  $clause_parts = array();
  $clause_args = array();
  foreach ($words as $word) {
    $length = drupal_strlen($word);

    // Words shorter than one ngram (including empty strings left behind by
    // repeated spaces) contribute nothing, and would make the completeness
    // divisor below zero or negative.
    if ($length < $nlength) {
      continue;
    }

    // $comp_min is the minimum completeness an ngram can have to return words.
    // If > 0 $missing_letters assumes the searcher has missing letters in the
    // search term. It's configurable by the admin. Increasing the number of
    // missing letters lowers the completeness and returns more words, making
    // things fuzzier. The divisor is clamped to 1 to rule out division by zero.
    $comp_min = 100 / max(1, $length - $nlength + 1 + $missing_letters);

    // $comp_max is the maximum completeness an ngram can have and still return words.
    // In this case we assume the search term has extra letters, and this lets us include
    // shorter words in the results. Increasing this raises the completeness of an ngram
    // and returns more grams with higher completeness. Completeness can never be
    // higher than 100%.
    if ($length - $nlength + 1 - $extra_letters <= 0) {
      $comp_max = 100;
    }
    else {
      $comp_max = 100 / ($length - $nlength + 1 - $extra_letters);
    }

    // Widen the range slightly so values rounded to three decimals still
    // fall inside the BETWEEN comparison.
    $comp_min = number_format($comp_min, 3) - 0.001;
    $comp_max = number_format($comp_max, 3) + 0.001;

    // Tuning info
    if ($debug) {
      $message = t('Adding ngrams to the query for the word "@word" to the query with ngram length of @nlength, minimum ngram completeness of @comp_min and maximum ngram completeness of @comp_max', array(
        '@word' => $word,
        '@nlength' => $nlength,
        '@comp_min' => $comp_min,
        '@comp_max' => $comp_max,
      ));
      drupal_set_message($message, 'status', FALSE);
    }

    // One condition per ngram of the configured length.
    for ($i = 0; $i <= $length - $nlength; $i++) {
      $clause_parts[] = "(ngram = '%s' AND completeness BETWEEN %f AND %f)";
      $clause_args[] = drupal_substr($word, $i, $nlength);
      $clause_args[] = $comp_min;
      $clause_args[] = $comp_max;
    }
  }

  // Without a single ngram the WHERE clause would be empty and the SQL
  // invalid; there is nothing to search for.
  if (!$clause_parts) {
    return;
  }
  $clause = implode(' OR ', $clause_parts);

  // @todo: Fix the minimum completeness so that a single qgram match doesn't necessarily return a match
  $min_completeness = (int) variable_get('fuzzysearch_completeness', 40);

  // Get content types to exclude from results. They are still indexed.
  $types = array_values(array_filter(variable_get('fuzzysearch_nodetypes', array(
    '',
  ))));

  // Build the query args and placeholders. The argument order follows the
  // placeholder order in the queries: ngram conditions, minimum completeness,
  // excluded types.
  $args = array_merge($clause_args, array($min_completeness), $types);
  $placeholders = count($types) ? db_placeholders($types, 'text') : "''";
  $order_by = variable_get('fuzzysearch_sort_score', FALSE) ? 'score DESC, percent DESC' : 'percent DESC, score DESC';

  // Main query
  $sql = "SELECT n.nid, MAX(n.moderate) AS moderate, MAX(n.uid) AS uid, MAX(n.type) AS type, MAX(n.status) AS status, SUM(subpercent) AS percent, SUM(subscore) AS score
          FROM (SELECT DISTINCT word_id, nn.nid, SUM(completeness) AS subpercent, SUM(score) AS subscore
            FROM {fuzzysearch_index} s
            LEFT JOIN {node} nn ON (nn.nid = s.nid)
            WHERE (({$clause}))
            GROUP BY word_id, nn.nid HAVING SUM(completeness) >= %d) AS q
          LEFT JOIN {node} n on n.nid = q.nid
          WHERE n.status = 1
          AND n.type NOT IN ({$placeholders})
          GROUP BY n.nid ORDER BY {$order_by}";

  // Count query
  $sql_count = "SELECT COUNT(DISTINCT(n.nid))
               FROM (SELECT nn.type, nn.uid, nn.moderate, nn.nid, CEILING(SUM(completeness)) AS completeness, SUM(score) AS score
                 FROM {fuzzysearch_index} AS s
                 LEFT JOIN {node} nn on s.nid = nn.nid
                 WHERE {$clause}
                 GROUP BY word_id, s.nid, nn.type, nn.uid, nn.moderate, nn.nid
                 HAVING SUM(completeness) >= %d) AS q
               LEFT JOIN {node} n on n.nid = q.nid
               WHERE n.status = 1
               AND n.type NOT IN ({$placeholders})";
  $sql = db_rewrite_sql($sql);
  $sql_count = db_rewrite_sql($sql_count);
  if ($debug) {
    drupal_set_message(t('Fuzzysearch main query after db_rewriting: @sql', array(
      '@sql' => $sql,
    )), 'status', FALSE);
    $debug_nodes = db_result(db_query($sql_count, $args));
    drupal_set_message(t('Fuzzysearch found @nodes matching node(s).', array(
      '@nodes' => $debug_nodes,
    )), 'status', FALSE);
  }
  $block_limit = $theme == BLOCK_THEME ? variable_get('fuzzysearch_block_limit', 5) : 0;
  if ($block_limit) {
    $pager_results = db_query($sql . ' LIMIT ' . $block_limit, $args);
  }
  else {
    $pager_results = pager_query($sql, $limit, 0, $sql_count, $args);
  }

  // Stays NULL when nothing matches, which is what callers have always
  // received in that case.
  $results = NULL;

  // Load the matched nodes.
  while ($row = db_fetch_object($pager_results)) {
    $node = node_load($row->nid);
    $node->score = $row->score;
    $node->completeness = $row->percent;

    // If this is just a title search, we can skip all the processing below.
    if ($theme == 1) {

      // Build the node body. This grabs cck field labels and values. Remove
      // double spaces added for html legibility by cck.
      $node->build_mode = NODE_BUILD_SEARCH_RESULT;
      $node = node_build_content($node, FALSE, FALSE);
      $node->body = preg_replace("/ +/", " ", drupal_render($node->content));

      // Add the comments to the node for highlighting.
      if (function_exists('comment_render') && $node->comment && user_access('access comments')) {
        $comments = db_query('SELECT subject, comment FROM {comments} WHERE nid = %d AND status = %d', $node->nid, COMMENT_PUBLISHED);
        while ($comment = db_fetch_object($comments)) {
          $node->body .= ' ' . strip_tags($comment->subject) . ' ' . strip_tags($comment->comment);
        }
      }

      // Query each matched node for the search ngrams. We use this for fuzzy
      // highlighting of misspelled words. We do this per node to narrow
      // the possible false ngrams when a misspelled ngram matches a real one.
      // This could still return some false ngrams, but that's why it's fuzzy.
      // This query has no completeness threshold, so it gets its own argument
      // list: ngram conditions, node id, excluded types.
      $sql_ngrams = "
      SELECT s.ngram, s.word_id, s.completeness
      FROM {fuzzysearch_index} s
      LEFT JOIN {node} n ON (n.nid = s.nid)
      WHERE (({$clause}) AND n.nid = %d AND n.status = 1
      AND n.type NOT IN ({$placeholders}))";
      $ngram_args = array_merge($clause_args, array($row->nid), $types);
      $ngrams = db_query($sql_ngrams, $ngram_args);
      $clean_grams = array();
      while ($ngram = db_fetch_array($ngrams)) {
        $clean_grams[$ngram['ngram']][] = $ngram;
      }

      // Ngrams can occur multiple times, so filter.
      $clean_grams = fuzzysearch_unique($clean_grams);

      // This will hold our search terms. Empty terms (from repeated spaces)
      // are dropped: an empty alternative in the highlight regex would match
      // every token of the excerpt.
      $clean_words = array_filter(explode(' ', $orig_query), 'strlen');

      // Now we rebuild the words stripping out misspelled ngrams.
      foreach ($clean_words as $key => $clean_word) {
        $quoted_word = preg_quote($clean_word, '/');

        // If we have an exact match, let's skip the work to check for misspellings.
        if (preg_match('/\\b' . $quoted_word . '\\b/iu', $node->body)) {
          // From here on the term is used as a regex, so keep it escaped.
          $clean_words[$key] = $quoted_word;
          continue;
        }

        $len = drupal_strlen($clean_word);

        // Ignore search terms less than the ngram length.
        if ($len < $nlength) {
          unset($clean_words[$key]);
          continue;
        }

        // Get the position of each good hit.
        $pos = $id_count = array();
        foreach ($clean_grams as $n => $gram) {
          // Array keys that look like integers come back as ints; the string
          // functions need a string needle.
          $needle = (string) $n;
          $offset = ($multibyte == UNICODE_MULTIBYTE) ? mb_stripos($clean_word, $needle) : stripos($clean_word, $needle);
          if ($offset === FALSE) {
            continue;
          }
          $pos[$offset] = $needle;

          // Keep count of our word ids so we can try to guess which word
          // we are trying to match.
          foreach ($gram as $ngram_data) {
            $word_id = $ngram_data['word_id'];
            $id_count[$word_id] = isset($id_count[$word_id]) ? $id_count[$word_id] + 1 : 1;
          }
        }
        ksort($pos);

        // This gives us an array with the most common word_id as the first
        // element.
        arsort($id_count);
        $id_count = array_keys($id_count);

        // Remove any position matches that are not in our likely word (the
        // word with the highest word_id count).
        foreach ($pos as $position => $pgram) {
          $pmatch = FALSE;
          foreach ($clean_grams[$pgram] as $pid) {
            if ($pid['word_id'] == $id_count[0]) {
              $pmatch = TRUE;
              break;
            }
          }
          if (!$pmatch) {
            unset($pos[$position]);
          }
        }

        // Start with a dummy word at the right length, but only if there are
        // some matching ngram hits.
        $newword = '';
        if (count($pos)) {
          $newword = str_pad('', $len, '.');
        }
        $hits = $pos_plus = 0;

        // Check character by character for ngram matches. We don't need to check
        // beyond the first character of the ngram. Note that $len grows inside
        // the loop whenever a wildcard is inserted.
        for ($i = 0; $i <= $len - $nlength; $i++) {

          // This is a match, so insert it into our dummy word.
          if (isset($pos[$i])) {
            $newword = drupal_substr($newword, 0, $i + $pos_plus) . $pos[$i] . drupal_substr($newword, $i + $pos_plus + $nlength, $len);
            ++$hits;
          }
          else {

            // But don't overwrite a letter, only a '.' .
            if (drupal_substr($newword, $i + $pos_plus + $nlength - 1, 1) == '.') {
              $newword = $i == 0 || $i + $pos_plus > $len - $nlength ? $newword : drupal_substr($newword, 0, $i + $pos_plus + $nlength - 1) . '.*' . drupal_substr($newword, $i + $pos_plus + 1 + $nlength - 1);

              // If we insert here, we need to adjust the positions in the $pos array.
              $pos_plus++;
              $len++;
            }
          }
        }

        // Only keep our rebuilt word if it meets our minimum spelling match score.
        // Subtract $pos_plus from $len to get the original search term length.
        // Then subtract $nlength - 1 to get the number of ngrams in the term.
        $spell_percent = $hits / ($len - $pos_plus - $nlength + 1) * 100;
        if ($spell_percent < $min_spelling) {
          unset($clean_words[$key]);
          continue;
        }

        // Remove consecutive wildcards and add word boundaries.
        $newword = preg_replace("/\\.\\./", ".*", $newword);
        $newword = preg_replace("/\\.\\*\\.\\*/", ".*", $newword);
        $newword = '\\b\\w*' . trim($newword, '.*') . '.*?\\b';
        $clean_words[$key] = $newword;
        if ($debug) {
          if (!isset($node->spelling_debug)) {
            $node->spelling_debug = '';
          }
          $node->spelling_debug .= t('Highlighting regex @newword -- Ngram spelling match is @percent%', array(
            '@newword' => $newword,
            '@percent' => number_format($spell_percent, 2),
          )) . '</br>';
        }
      }

      // Build a replacement node body containing sections of text with the found
      // words, with leading and trailing text.
      $node->body = strip_tags($node->body);
      $section = array();
      $section_length = array();
      foreach ($clean_words as $k => $word) {
        $location = 0;

        // If the word is found, add its position to $section. At this point
        // $word is a regex: an escaped exact term or a rebuilt fuzzy pattern.
        while ($word != '' && preg_match('/' . $word . '/iu', $node->body, $matches, PREG_OFFSET_CAPTURE, $location)) {

          // Make sure we didn't traverse any word breaks by checking for spaces.
          // A byte-wise search is enough because only presence matters, but it
          // must be compared strictly: a space at offset 0 is still a space.
          if (strpos($matches[0][0], ' ') === FALSE) {
            $section[] = _fuzzysearch_char_count($node->body, $matches[0][1]);
            // NOTE(review): lengths are keyed by byte offset, so two terms
            // matching at the same offset leave fewer lengths than sections.
            $section_length[$matches[0][1]] = drupal_strlen($matches[0][0]);

            // The matched text is reused in the highlight regex below.
            $clean_words[$k] = preg_quote($matches[0][0], '/');
          }

          // Increase $location by one so we don't find the previous location.
          $location = $matches[0][1] + 1;
        }
      }

      // Because we found words one by one, the locations are out of order. Sort
      // so that the locations are in natural order.
      asort($section);
      ksort($section_length);
      $section = array_values($section);
      $section_length = array_values($section_length);
      $p = 0;
      $found = $newbody = '';
      $trail = $lead = $excerpt / 2;
      $start = isset($section[0]) ? $section[0] : 0;
      while (isset($section[$p])) {

        // If the current section is within the previous, let's not create a new one
        // so we don't have any duplicate text.
        if (isset($section[$p + 1]) && $section[$p] + $lead + $section_length[$p] + $trail > $section[$p + 1]) {
          $trail = $section[$p + 1] + $section_length[$p + 1] + $lead - $start;
          $p++;
          continue;
        }

        // Put an excerpt into our replacement node body, with the
        // found word in the center.
        $found = $start - $lead < 0 ? drupal_substr($node->body, 0, $excerpt) : drupal_substr($node->body, $start - $lead, $trail + $lead);
        if (variable_get('fuzzysearch_max_result', 0) && drupal_strlen($newbody . $found) > variable_get('fuzzysearch_max_result', 0)) {
          break;
        }
        $newbody .= '...' . $found . '... ';
        $p++;
        $start = isset($section[$p]) ? $section[$p] : 0;
        $trail = $lead;
      }

      // Wrap the found words in a <strong> tag to highlight them. Skip this
      // when no term survived; an empty group would match everything.
      if ($clean_words) {
        $newbody = preg_replace('/' . $boundary . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . '(' . implode('|', $clean_words) . ')' . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $boundary . '/iu', '<strong>\\0</strong>', $newbody);
      }

      // If there are no result excerpts in the body, at least show the teaser.
      $node->body = $newbody == '' ? truncate_utf8(strip_tags($node->teaser), variable_get('fuzzysearch_max_result', 0), TRUE, TRUE) : $newbody;
    }
    $results[] = $node;
  }
  return $results;
}