You are here

fuzzysearch.module in Fuzzy Search 6

Same filename and directory in other branches
  1. 7 fuzzysearch.module

Module file for fuzzysearch module.

Implementation of fuzzy search indexing. Originally by Blake Lucchesi (www.boldsource.com). Algorithm: n-gram indexing and retrieval. Code written for Google's Summer of Code 2007.

Drupal 6 version maintained by awolfey

File

fuzzysearch.module
View source
<?php

/**
 * @file
 * Module file for fuzzysearch module.
 *
 * Implementation of fuzzy search indexing
 * Originally by Blake Lucchesi (www.boldsource.com)
 * Algorithm: n-gram indexing and retrieval
 * Code written for Google's Summer of Code 2007
 *
 * Drupal 6 version maintained by awolfey
 */

/**
 * Matches Unicode character classes to exclude from the search index.
 *
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The index only contains the following character classes:
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lo     Letter, Other
 * Nd     Number, Decimal Digit
 * No     Number, Other
 */
define('PREG_CLASS_SEARCH_EXCLUDE', '\\x{0}-\\x{2f}\\x{3a}-\\x{40}\\x{5b}-\\x{60}\\x{7b}-\\x{bf}\\x{d7}\\x{f7}\\x{2b0}-' . '\\x{385}\\x{387}\\x{3f6}\\x{482}-\\x{489}\\x{559}-\\x{55f}\\x{589}-\\x{5c7}\\x{5f3}-' . '\\x{61f}\\x{640}\\x{64b}-\\x{65e}\\x{66a}-\\x{66d}\\x{670}\\x{6d4}\\x{6d6}-\\x{6ed}' . '\\x{6fd}\\x{6fe}\\x{700}-\\x{70f}\\x{711}\\x{730}-\\x{74a}\\x{7a6}-\\x{7b0}\\x{901}-' . '\\x{903}\\x{93c}\\x{93e}-\\x{94d}\\x{951}-\\x{954}\\x{962}-\\x{965}\\x{970}\\x{981}-' . '\\x{983}\\x{9bc}\\x{9be}-\\x{9cd}\\x{9d7}\\x{9e2}\\x{9e3}\\x{9f2}-\\x{a03}\\x{a3c}-' . '\\x{a4d}\\x{a70}\\x{a71}\\x{a81}-\\x{a83}\\x{abc}\\x{abe}-\\x{acd}\\x{ae2}\\x{ae3}' . '\\x{af1}-\\x{b03}\\x{b3c}\\x{b3e}-\\x{b57}\\x{b70}\\x{b82}\\x{bbe}-\\x{bd7}\\x{bf0}-' . '\\x{c03}\\x{c3e}-\\x{c56}\\x{c82}\\x{c83}\\x{cbc}\\x{cbe}-\\x{cd6}\\x{d02}\\x{d03}' . '\\x{d3e}-\\x{d57}\\x{d82}\\x{d83}\\x{dca}-\\x{df4}\\x{e31}\\x{e34}-\\x{e3f}\\x{e46}-' . '\\x{e4f}\\x{e5a}\\x{e5b}\\x{eb1}\\x{eb4}-\\x{ebc}\\x{ec6}-\\x{ecd}\\x{f01}-\\x{f1f}' . '\\x{f2a}-\\x{f3f}\\x{f71}-\\x{f87}\\x{f90}-\\x{fd1}\\x{102c}-\\x{1039}\\x{104a}-' . '\\x{104f}\\x{1056}-\\x{1059}\\x{10fb}\\x{10fc}\\x{135f}-\\x{137c}\\x{1390}-\\x{1399}' . '\\x{166d}\\x{166e}\\x{1680}\\x{169b}\\x{169c}\\x{16eb}-\\x{16f0}\\x{1712}-\\x{1714}' . '\\x{1732}-\\x{1736}\\x{1752}\\x{1753}\\x{1772}\\x{1773}\\x{17b4}-\\x{17db}\\x{17dd}' . '\\x{17f0}-\\x{180e}\\x{1843}\\x{18a9}\\x{1920}-\\x{1945}\\x{19b0}-\\x{19c0}\\x{19c8}' . '\\x{19c9}\\x{19de}-\\x{19ff}\\x{1a17}-\\x{1a1f}\\x{1d2c}-\\x{1d61}\\x{1d78}\\x{1d9b}-' . '\\x{1dc3}\\x{1fbd}\\x{1fbf}-\\x{1fc1}\\x{1fcd}-\\x{1fcf}\\x{1fdd}-\\x{1fdf}\\x{1fed}-' . '\\x{1fef}\\x{1ffd}-\\x{2070}\\x{2074}-\\x{207e}\\x{2080}-\\x{2101}\\x{2103}-\\x{2106}' . '\\x{2108}\\x{2109}\\x{2114}\\x{2116}-\\x{2118}\\x{211e}-\\x{2123}\\x{2125}\\x{2127}' . '\\x{2129}\\x{212e}\\x{2132}\\x{213a}\\x{213b}\\x{2140}-\\x{2144}\\x{214a}-\\x{2b13}' . '\\x{2ce5}-\\x{2cff}\\x{2d6f}\\x{2e00}-\\x{3005}\\x{3007}-\\x{303b}\\x{303d}-\\x{303f}' . 
'\\x{3099}-\\x{309e}\\x{30a0}\\x{30fb}\\x{30fd}\\x{30fe}\\x{3190}-\\x{319f}\\x{31c0}-' . '\\x{31cf}\\x{3200}-\\x{33ff}\\x{4dc0}-\\x{4dff}\\x{a015}\\x{a490}-\\x{a716}\\x{a802}' . '\\x{a806}\\x{a80b}\\x{a823}-\\x{a82b}\\x{e000}-\\x{f8ff}\\x{fb1e}\\x{fb29}\\x{fd3e}' . '\\x{fd3f}\\x{fdfc}-\\x{fe6b}\\x{feff}-\\x{ff0f}\\x{ff1a}-\\x{ff20}\\x{ff3b}-\\x{ff40}' . '\\x{ff5b}-\\x{ff65}\\x{ff70}\\x{ff9e}\\x{ff9f}\\x{ffe0}-\\x{fffd}');
/**
 * PCRE character-class fragment of code point ranges labelled CJK.
 *
 * Presumably covers Chinese/Japanese/Korean scripts; the individual ranges
 * have not been verified here. It is concatenated with
 * PREG_CLASS_SEARCH_EXCLUDE in fuzzysearch_process() to build the
 * word-boundary pattern.
 */
define('PREG_CLASS_CJK', '\\x{1100}-\\x{11FF}\\x{3040}-\\x{309F}\\x{30A1}-\\x{318E}' . '\\x{31A0}-\\x{31B7}\\x{31F0}-\\x{31FF}\\x{3400}-\\x{4DBF}\\x{4E00}-\\x{9FCF}' . '\\x{A000}-\\x{A48F}\\x{A4D0}-\\x{A4FD}\\x{A960}-\\x{A97F}\\x{AC00}-\\x{D7FF}' . '\\x{F900}-\\x{FAFF}\\x{FF21}-\\x{FF3A}\\x{FF41}-\\x{FF5A}\\x{FF66}-\\x{FFDC}' . '\\x{20000}-\\x{2FFFD}\\x{30000}-\\x{3FFFD}');

/**
 * Result display modes.
 *
 * Offered in the block configuration form as 'Titles' (BLOCK_THEME) and
 * 'Nodes' (NODE_THEME), and used as the default $theme of
 * fuzzysearch_process() (NODE_THEME).
 */
define('BLOCK_THEME', 0);
define('NODE_THEME', 1);

/**
 * Implementation of hook_menu().
 *
 * Registers three router items: the module settings page, the search results
 * page (whose path is read from the 'fuzzysearch_path_name' variable) and a
 * report of the most popular search phrases served by dblog's dblog_top().
 */
function fuzzysearch_menu() {
  // The results path is configurable; fall back to the stock location.
  $results_path = variable_get('fuzzysearch_path_name', 'fuzzysearch/results');

  $settings_item = array(
    'title' => 'Fuzzysearch settings',
    'description' => 'Fuzzysearch settings allow you to index certain node data',
    'page callback' => 'fuzzysearch_admin',
    'access arguments' => array('administer fuzzysearch'),
    'type' => MENU_NORMAL_ITEM,
    'file' => 'fuzzysearch.admin.inc',
  );

  $results_item = array(
    'title' => 'Search',
    'page callback' => 'fuzzysearch_show_results',
    'access arguments' => array('fuzzysearch content'),
    'type' => MENU_NORMAL_ITEM,
  );

  // The report callback lives in the dblog module, so point at its file.
  $report_item = array(
    'title' => 'Top fuzzysearch phrases',
    'description' => 'View most popular fuzzysearch phrases.',
    'page callback' => 'dblog_top',
    'page arguments' => array('fuzzysearch'),
    'access arguments' => array('access site reports'),
    'file' => 'dblog.admin.inc',
    'file path' => drupal_get_path('module', 'dblog'),
  );

  return array(
    'admin/settings/fuzzysearch' => $settings_item,
    $results_path => $results_item,
    'admin/reports/fuzzysearch' => $report_item,
  );
}

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names this module defines.
 */
function fuzzysearch_perm() {
  $permissions = array();

  // Configuration and searching.
  $permissions[] = 'administer fuzzysearch';
  $permissions[] = 'fuzzysearch content';

  // Diagnostic output.
  $permissions[] = 'access fuzzysearch scoring';
  $permissions[] = 'access fuzzysearch debugging';

  return $permissions;
}

/**
 * Implementation of hook_theme().
 *
 * Every hook declares its parameters under the 'arguments' key, which is the
 * key Drupal 6's theme registry reads. A differently named key is not part of
 * the hook_theme() contract.
 *
 * @return
 *   Theme registry entries keyed by theme hook name. 'fuzzysearch_result' is
 *   rendered through the fuzzysearch-result template; the other hooks have no
 *   template and are expected to be implemented as theme functions.
 */
function fuzzysearch_theme() {
  return array(
    'fuzzysearch_box_form' => array(
      'arguments' => array(
        'form' => NULL,
      ),
    ),
    'fuzzysearch_form' => array(
      'arguments' => array(
        'form' => NULL,
      ),
    ),
    'fuzzysearch_show_results' => array(
      'arguments' => array(
        'keys' => NULL,
      ),
    ),
    'fuzzysearch_results_title' => array(
      'arguments' => array(
        'results' => NULL,
      ),
    ),
    'fuzzysearch_results' => array(
      'arguments' => array(
        'results' => NULL,
      ),
    ),
    'fuzzysearch_result' => array(
      'template' => 'fuzzysearch-result',
      'arguments' => array(
        'node' => NULL,
        'teaser' => FALSE,
        'page' => FALSE,
      ),
    ),
  );
}

/**
 * Implementation of hook_content_build_modes().
 *
 * Declares a "Fuzzy search" group containing the search-index and
 * search-result node build modes, neither of which is offered as a views
 * style.
 */
function fuzzysearch_content_build_modes() {
  $group_title = t('Fuzzy search');

  $modes = array();
  $modes[NODE_BUILD_SEARCH_INDEX] = array(
    'title' => t('Search index'),
    'views style' => FALSE,
  );
  $modes[NODE_BUILD_SEARCH_RESULT] = array(
    'title' => t('Search result'),
    'views style' => FALSE,
  );

  $groups = array();
  $groups['fuzzysearch'] = array(
    'title' => $group_title,
    'build modes' => $modes,
  );
  return $groups;
}
/**
 * Preprocess variables for the fuzzysearch_result theme hook.
 *
 * Adds taxonomy links, content (teaser or body), formatted date, node links,
 * author name, node URL, term links and the escaped title, then merges the
 * node's own properties in underneath the variables already set.
 *
 * @param $variables
 *   Template variables, modified in place. Must contain 'node' and 'teaser'.
 */
function fuzzysearch_preprocess_fuzzysearch_result(&$variables) {
  $node = $variables['node'];
  $inline_attributes = array(
    'class' => 'links inline',
  );

  $variables['taxonomy'] = module_exists('taxonomy') ? taxonomy_link('taxonomy terms', $node) : array();

  // Prefer the teaser when one was requested and exists, otherwise the body.
  $content = '';
  if ($variables['teaser'] && $node->teaser) {
    $content = $node->teaser;
  }
  elseif (isset($node->body)) {
    $content = $node->body;
  }
  $variables['content'] = $content;

  $variables['date'] = format_date($node->created);

  $rendered_links = '';
  if (!empty($node->links)) {
    $rendered_links = theme('links', $node->links, $inline_attributes);
  }
  $variables['links'] = $rendered_links;

  $variables['name'] = theme('username', $node);
  $variables['node_url'] = url('node/' . $node->nid);
  $variables['terms'] = theme('links', $variables['taxonomy'], $inline_attributes);
  $variables['title'] = check_plain($node->title);

  // Expose every node property as a variable; values set above take
  // precedence over node properties of the same name.
  $variables = array_merge((array) $node, $variables);

  // Author information is shown only for node types with it enabled in the
  // theme settings.
  $submitted = '';
  $picture = '';
  if (theme_get_setting('toggle_node_info_' . $node->type)) {
    $submitted = theme('node_submitted', $node);
    if (theme_get_setting('toggle_node_user_picture')) {
      $picture = theme('user_picture', $node);
    }
  }
  $variables['submitted'] = $submitted;
  $variables['picture'] = $picture;
}

/**
 * Implementation of hook_nodeapi().
 *
 * Queues a node for (re)indexing when it is inserted or updated, and removes
 * its rows from the index when it is deleted.
 */
function fuzzysearch_nodeapi(&$node, $op, $a3 = NULL, $a4 = NULL) {
  if ($op == 'update' || $op == 'insert') {
    // Indexing itself happens later, on cron.
    fuzzysearch_reindex($node->nid, 'fuzzysearch');
  }
  elseif ($op == 'delete') {
    db_query("DELETE FROM {fuzzysearch_index} WHERE nid = %d", $node->nid);
  }
}

/**
 * External API function that lets modules flag a node for reindexing.
 *
 * A node that already has a row in the index queue is not queued again.
 *
 * @param $nid
 *   Nid of the node to be reindexed.
 * @param $module
 *   Name of the module flagging the node.
 */
function fuzzysearch_reindex($nid, $module) {
  $already_queued = db_result(db_query("SELECT * FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid));
  if ($already_queued) {
    return;
  }
  db_query("INSERT INTO {fuzzysearch_index_queue} (nid, module, timestamp) VALUES (%d, '%s', %d)", $nid, $module, time());
}

/**
 * Implementation of hook_cron().
 *
 * Indexes a batch of queued nodes. The batch size comes from the
 * 'fuzzysearch_index_cron' variable and defaults to 150.
 */
function fuzzysearch_cron() {
  $batch_size = variable_get('fuzzysearch_index_cron', 150);
  $queued = db_query_range("SELECT nid FROM {fuzzysearch_index_queue}", 0, $batch_size);
  while ($queue_row = db_fetch_object($queued)) {
    fuzzysearch_index($queue_row->nid);
  }
}

/**
 * Index the node data in the fuzzy index table.
 *
 * Removes the node's existing index rows, renders its title and body in the
 * search-index build mode, appends text returned by hook_nodeapi('update
 * index') implementations, weights each word by the HTML tag it appears in
 * and hands every distinct word to fuzzysearch_index_insert(). The node is
 * removed from the index queue once it has been handled, including when it
 * no longer exists or a hook_fuzzysearch_index() implementation vetoes it.
 *
 * @param $nid
 *   The node id of the node being indexed.
 *
 * @return
 *   Nothing. The function returns no value on any path, so callers cannot
 *   use it to detect success or failure.
 */
function fuzzysearch_index($nid) {

  // First step is removing the past index.
  db_query("DELETE FROM {fuzzysearch_index} WHERE nid = %d", $nid);

  // No node, nothing to do.
  if (!($node = node_load($nid))) {
    db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
    return;
  }

  // Let modules alter a node before indexing or prevent it from being indexed.
  // See readme.txt.
  foreach (module_implements('fuzzysearch_index') as $name) {
    $function = $name . '_fuzzysearch_index';
    $node = $function($node);
    if (!$node) {

      // A module prevented indexing; the node still leaves the queue.
      db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
      return;
    }
  }

  // Index the node title. This is the first use of $text, so assign rather
  // than append to an undefined variable.
  $text = '<h1> ' . $node->title . ' </h1>';

  // Build and index the node body.
  $node->build_mode = NODE_BUILD_SEARCH_INDEX;
  $node = node_build_content($node, FALSE, FALSE);
  $node->body = drupal_render($node->content);
  $text .= $node->body;

  // Implementation of nodeapi's update_index op.
  foreach (module_implements('nodeapi') as $module) {
    $function = $module . '_nodeapi';
    $new_text = $function($node, 'update index', NULL, NULL);
    if (isset($new_text) && is_string($new_text)) {
      $text .= ' ' . $new_text;
    }
  }

  // Multipliers for scores of words inside certain HTML tags.
  $tags = fuzzysearch_get_index_tags();

  // Strip off all ignored tags to speed up processing.
  $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');

  // Hook_fuzzysearch_filter lets modules filter text. This should be used for
  // more complex filtering. Stop words should not use this. Create a stopword
  // file instead. See fuzzysearch/stopwords/README.txt.
  foreach (module_implements('fuzzysearch_filter') as $name) {
    $function = $name . '_fuzzysearch_filter';
    $text = $function('index', $text);
  }

  // Allow other modules to modify the score of the node based on their own
  // calculations. The sum of all the scores added to each node is then
  // multiplied by the score of the word. This allows for faster result
  // queries because all scoring is done at the time of indexing.
  $hook_scores = module_invoke_all('fuzzysearch_score', 'index', $node);

  // Build the final score multiplier for the node from the multipliers
  // returned by other modules.
  $node_score = 0;
  foreach ($hook_scores as $score) {
    $multiplier = variable_get('fuzzysearch_scoring_' . $score['id'], 5);
    $node_score += $score['score'] * $multiplier;
  }

  // Begin indexing content.
  // Remove stopwords.
  $text = fuzzysearch_stopwords($text);

  // $index_words and $index_scores are parallel lists: the score of
  // $index_words[$i] is $index_scores[$i].
  $index_words = array();
  $index_scores = array();

  // Find all words not located within tags (score = 1).
  $content = preg_replace('/<([A-Z][A-Z0-9]*)[^>]*>(.*?)<\\/\\1>/i', '', $text);
  $words = preg_split('/\\s/', $content, -1, PREG_SPLIT_NO_EMPTY);

  // Build the index array with scores. The search is strict so that distinct
  // numeric-looking tokens (e.g. "1e3" and "1000") are not merged.
  foreach ($words as $word) {
    $position = array_search($word, $index_words, TRUE);
    if ($position === FALSE) {
      $index_words[] = $word;
      $index_scores[] = 1;
    }
    else {
      $index_scores[$position] += 1;
    }
  }

  // Find all words located within tags (score > 1).
  preg_match_all('/<([A-Z][A-Z0-9]*)([^>]*)>(.*?)<\\/\\1>/i', $text, $tagged);

  // Filter through each set of content in between tags.
  foreach ($tagged[3] as $match => $content) {
    $words = preg_split('/\\s/', $content, -1, PREG_SPLIT_NO_EMPTY);

    // The pattern above is case-insensitive, so the captured tag name may be
    // upper case. Try the exact name first, then its lower-case form, and
    // fall back to the plain-text weight rather than a NULL score.
    $tag = $tagged[1][$match];
    if (isset($tags[$tag])) {
      $tag_score = $tags[$tag];
    }
    elseif (isset($tags[strtolower($tag)])) {
      $tag_score = $tags[strtolower($tag)];
    }
    else {
      $tag_score = 1;
    }

    foreach ($words as $word) {
      $position = array_search($word, $index_words, TRUE);
      if ($position === FALSE) {
        $index_words[] = $word;
        $index_scores[] = $tag_score;
      }
      else {
        $index_scores[$position] += $tag_score;
      }
    }
  }

  $word_id = NULL;
  foreach ($index_words as $position => $word) {

    // Each word gets a word_id, which comes from the last value in the id
    // column, which is serial. Before the first word there may be no last
    // insert id yet, so insert and delete a dummy row first. We have to do
    // this to avoid a PostgreSQL error.
    if (!$word_id) {
      db_query("INSERT INTO {fuzzysearch_index} (nid, word_id, ngram, completeness, score) VALUES (0, 0, 'xxx', 0, 0)");
      db_query("DELETE FROM {fuzzysearch_index} WHERE (nid = 0 AND word_id = 0 AND ngram = 'xxx' AND completeness = 0 AND score = 0)");
    }
    $word_id = db_last_insert_id('fuzzysearch_index', 'id');
    fuzzysearch_index_insert($word, $word_id, $nid, $index_scores[$position], $node_score);
  }

  // The node is indexed; remove it from the queue.
  db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
}

/**
 * Insert a word into the index as a series of ngrams.
 *
 * The word is cleansed and stripped of spaces first. Words shorter than the
 * configured ngram length ('fuzzysearch_ngram_length', default 3) are not
 * indexed at all.
 *
 * @param $word
 *   Word to insert into the index.
 * @param $word_id
 *   Identifier stored with every ngram of this word.
 * @param $nid
 *   The node id that is to be associated with this word.
 * @param $word_score
 *   Score given to the word based on the tag it is in.
 * @param $node_score
 *   Score modifier given to the node; ignored unless greater than zero.
 */
function fuzzysearch_index_insert($word, $word_id, $nid, $word_score, $node_score) {
  $word = str_replace(' ', '', fuzzysearch_cleanse($word));
  $length = drupal_strlen($word);
  $nlength = variable_get('fuzzysearch_ngram_length', 3);

  // A node score of zero or less must not wipe out the natural word score.
  $score = $node_score > 0 ? $word_score * $node_score : $word_score;

  $ngrams = array();
  if ($length > $nlength) {
    // Each ngram represents an equal share of the word.
    $ngram_count = $length - $nlength + 1;
    $completeness = number_format(100 / $ngram_count, 3);
    for ($offset = 0; $offset < $ngram_count; $offset++) {
      $ngrams[] = drupal_substr($word, $offset, $nlength);
    }
  }
  elseif ($length == $nlength) {
    // The word is exactly one ngram, so that ngram is complete.
    $completeness = 100;
    $ngrams[] = $word;
  }

  foreach ($ngrams as $ngram) {
    db_query("INSERT INTO {fuzzysearch_index} (nid, word_id, ngram, completeness, score) VALUES (%d, %d, '%s', %f, %f)", $nid, $word_id, $ngram, $completeness, $score);
  }
}

/**
 * Implementation of hook_comment().
 *
 * Queues the commented node for reindexing whenever one of its comments is
 * inserted, updated, deleted, published or unpublished.
 */
function fuzzysearch_comment($a1, $op) {
  $reindex_ops = array('insert', 'update', 'delete', 'publish', 'unpublish');
  if (!in_array($op, $reindex_ops)) {
    return;
  }

  // The comment arrives as an array for some operations and an object for
  // others.
  $nid = is_array($a1) ? $a1['nid'] : $a1->nid;
  fuzzysearch_reindex($nid, 'fuzzysearch');
}

/**
 * Normalise text for indexing and searching.
 *
 * Strips HTML tags, lower-cases the text and replaces every run of
 * characters matched by PREG_CLASS_SEARCH_EXCLUDE with a single space.
 *
 * @param $text
 *   The text to clean.
 *
 * @return
 *   The cleaned text.
 */
function fuzzysearch_cleanse($text) {
  $excluded_run = '/[' . PREG_CLASS_SEARCH_EXCLUDE . ']+/u';
  $lowered = drupal_strtolower(strip_tags($text));
  return preg_replace($excluded_run, ' ', $lowered);
}

/**
 * Form builder for the compact search box.
 *
 * Submission is handled by fuzzysearch_form_submit().
 */
function fuzzysearch_box_form() {
  return array(
    'keys' => array(
      '#type' => 'textfield',
      '#size' => 15,
      '#default_value' => '',
    ),
    'submit' => array(
      '#type' => 'submit',
      '#value' => t('Search'),
    ),
    '#submit' => array(
      'fuzzysearch_form_submit',
    ),
  );
}

/**
 * Theme the output of the search block form.
 *
 * Renders the keys field and the submit button inside an inline container,
 * followed by whatever remains of the form.
 *
 * @param $form
 *   The form array built by fuzzysearch_box_form().
 *
 * @return
 *   The rendered HTML.
 */
function theme_fuzzysearch_box_form($form) {
  // Assign rather than append: $output does not exist before this line.
  $output = '<div class="container-inline">' . drupal_render($form['keys']) . drupal_render($form['submit']) . '</div>';
  $output .= drupal_render($form);
  return $output;
}

/**
 * Implementation of hook_block().
 *
 * Provides two blocks: delta 0 is the search form, delta 1 shows results for
 * the query passed in the 'fuzzysearch' GET parameter.
 *
 * @param $op
 *   The block operation: 'list', 'view', 'configure' or 'save'.
 * @param $delta
 *   Which block the operation applies to.
 * @param $edit
 *   Submitted configuration values; only read for the 'save' operation.
 *
 * @return
 *   - 'list': the block definitions.
 *   - 'view': the block array, or NULL when the user lacks the
 *     'fuzzysearch content' permission or (delta 1) no query was supplied.
 *   - 'configure': the configuration form for delta 1.
 *   - 'save': nothing.
 */
function fuzzysearch_block($op = 'list', $delta = 0, $edit = array()) {
  switch ($op) {
    case 'list':
      $blocks = array();
      $blocks[0]['info'] = t('Fuzzy search form');
      $blocks[1]['info'] = t('Fuzzy search title query');
      return $blocks;

    case 'view':
      if (!user_access('fuzzysearch content')) {
        break;
      }
      switch ($delta) {
        case 0:
          $block = array();
          $block['content'] = drupal_get_form('fuzzysearch_box_form');
          $block['subject'] = t('Search');
          return $block;

        case 1:
          // Without a query there is nothing to show. Checking with empty()
          // avoids reading a missing GET key and returning an undefined
          // variable.
          if (empty($_GET['fuzzysearch'])) {
            return NULL;
          }
          $block = array();
          $block['content'] = theme('fuzzysearch_show_results', check_plain($_GET['fuzzysearch']), variable_get('fuzzysearch_block_theme', BLOCK_THEME), variable_get('fuzzysearch_block_limit', 5));
          return $block;
      }
      break;

    case 'configure':
      switch ($delta) {
        case 1:
          $form = array();
          $form['fuzzysearch_block_limit'] = array(
            '#type' => 'select',
            '#title' => t('Number of results to display'),
            '#default_value' => variable_get('fuzzysearch_block_limit', 5),
            '#options' => drupal_map_assoc(range(1, 15)),
          );
          $form['fuzzysearch_block_theme'] = array(
            '#type' => 'radios',
            '#title' => t('Display method'),
            '#default_value' => variable_get('fuzzysearch_block_theme', BLOCK_THEME),
            '#options' => array(
              BLOCK_THEME => t('Titles'),
              NODE_THEME => t('Nodes'),
            ),
            '#description' => t('Show titles only or node theme.'),
          );
          return $form;
      }
      break;

    case 'save':
      switch ($delta) {
        case 1:
          variable_set('fuzzysearch_block_limit', $edit['fuzzysearch_block_limit']);
          variable_set('fuzzysearch_block_theme', $edit['fuzzysearch_block_theme']);
          break;
      }
      break;
  }
}

/**
 * Form builder for the full search form.
 *
 * @param $form_state
 *   The form state; not used by this builder.
 * @param $keys
 *   Search phrase used to pre-fill the text field.
 */
function fuzzysearch_form($form_state, $keys = '') {
  $keys_field = array(
    '#title' => t('Enter search phrase'),
    '#type' => 'textfield',
    '#size' => 35,
    '#default_value' => $keys,
  );
  $submit_button = array(
    '#type' => 'submit',
    '#value' => t('Search'),
  );

  return array(
    'keys' => $keys_field,
    'submit' => $submit_button,
  );
}

/**
 * Submit handler: redirect to the results path with the keys appended.
 *
 * Putting the keys in the path means a search can be linked to.
 */
function fuzzysearch_form_submit($form, &$form_state) {
  $results_path = variable_get('fuzzysearch_path_name', 'fuzzysearch/results');

  // The search form relies on control of the redirect destination for its
  // functionality, so any static destination set in the request is dropped,
  // for example one set by drupal_access_denied() or drupal_not_found()
  // (see http://drupal.org/node/292565).
  unset($_REQUEST['destination']);
  if (isset($_REQUEST['edit']['destination'])) {
    unset($_REQUEST['edit']['destination']);
  }

  $keys = $form_state['values']['keys'];
  $form_state['redirect'] = $results_path . '/' . $keys;
}

/**
 * Theme the full search form.
 *
 * Wraps the keys field and the submit button in a "search-form" container
 * and appends the rest of the rendered form after it.
 */
function theme_fuzzysearch_form($form) {
  $keys_field = drupal_render($form['keys']);
  $submit_button = drupal_render($form['submit']);
  $remainder = drupal_render($form);

  return '<div class="search-form">' . $keys_field . $submit_button . '</div>' . $remainder;
}

/**
 * Process the search query
 */
function fuzzysearch_process($query, $theme = NODE_THEME, $limit = 10) {
  global $user;
  global $multibyte;

  // if no keys were entered do not display anything below the search form
  if (!$query) {
    return;
  }

  // Sanitize query again because it can be submitted from url as well as form.
  // Do this word-by-word to keep words whole even after removing excluded
  // characters. Keep the original for highlighting.
  $orig_query = $query = trim($query);
  $parts = explode(' ', $query);
  foreach ($parts as $part) {
    $query_array[] = str_replace(' ', '', fuzzysearch_cleanse($part));
  }
  $query = implode(' ', $query_array);

  // Log the search keys:
  watchdog('fuzzysearch', '%query', array(
    '%query' => $query,
  ), WATCHDOG_NOTICE, l(t('results'), variable_get('fuzzysearch_path_name', 'fuzzysearch/results') . '/' . $query));

  // Hook_fuzzysearch_filter lets modules filter text. This should be used for
  // more complex filtering. Stop words should not use this. Create a stopword
  // file instead. See fuzzysearch/stopwords/README.txt.
  foreach (module_implements('fuzzysearch_filter') as $name) {
    $function = $name . '_fuzzysearch_filter';
    $query = $function('search', $query);
  }

  // Remove stopwords.
  $query = fuzzysearch_stopwords($query);

  // Make sure we still have a query.
  if (!$query) {
    return;
  }
  $nlength = variable_get('fuzzysearch_ngram_length', 3);
  $min_spelling = variable_get('fuzzysearch_spelling', 30);
  $excerpt = variable_get('fuzzysearch_excerpt', 200);
  $missing_letters = variable_get('fuzzysearch_missing_letters', 1);
  $extra_letters = variable_get('fuzzysearch_extra_letters', 1);
  $boundary = '(?:(?<=[' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . '])|(?=[' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']))';
  $words = explode(' ', $query);

  // Build the WHERE clause for the ngrams.
  // @todo Change type of query based on boolean operators
  $clause = '';
  foreach ($words as $k => $word) {
    $length = drupal_strlen($word);

    // $comp_min is the minumum completeness an ngram can have to return words.
    // If > 0 $extra_letters assumes the searcher has missing letters in the search term.
    // It's configurable by the admin. Increasing the number of extra letters
    // lowers the completeness and returns more words, making things fuzzier.
    $comp_min = 100 / ($length - $nlength + 1 + $missing_letters);

    // $comp_max is the maximum completeness an ngram can have and still return words.
    // In this case we assume the search term has extra letters, and this lets us include
    // shorter words in the results. Increasing this raises the completeness of an ngram
    // and returns more grams with higher completeness. Completeness can never be
    // higher than 100%.
    if ($length - $nlength + 1 - $extra_letters <= 0) {
      $comp_max = 100;
    }
    else {
      $comp_max = 100 / ($length - $nlength + 1 - $extra_letters);
    }
    $comp_min = number_format($comp_min, 3) - 0.001;
    $comp_max = number_format($comp_max, 3) + 0.001;

    // Tuning info
    if (variable_get('fuzzysearch_debug_search', FALSE) && user_access('access fuzzysearch debugging')) {
      $message = t('Adding ngrams to the query for the word "@word" to the query with ngram length of @nlength, minimum ngram completeness of @comp_min and maximum ngram completeness of @comp_max', array(
        '@word' => $word,
        '@nlength' => $nlength,
        '@comp_min' => $comp_min,
        '@comp_max' => $comp_max,
      ));
      drupal_set_message($message, 'status', FALSE);
    }
    for ($i = 0; $i < $length - 2; $i++) {
      $clause .= " (ngram = '" . drupal_substr($word, $i, $nlength) . "' AND completeness BETWEEN " . $comp_min . " AND " . $comp_max . ") OR";
    }
  }
  $clause = preg_replace("/ OR\$/", '', $clause);

  // @todo: Fix the minimum completeness so that a single qgram match doesn't necessarily return a match
  $min_completeness = check_plain(variable_get('fuzzysearch_completeness', 40));

  // Get content types to exclude from results. They are still indexed.
  $types = array_filter(variable_get('fuzzysearch_nodetypes', array(
    '',
  )));

  // Build the query args and placeholders.
  $args[] = $min_completeness;
  $args += $types;
  $placeholders = count($types) ? db_placeholders($types, 'text') : '\'\'';
  $order_by = variable_get('fuzzysearch_sort_score', FALSE) ? 'score DESC, percent DESC' : 'percent DESC, score DESC';

  // Main query
  $sql = "SELECT n.nid, MAX(n.moderate) AS moderate, MAX(n.uid) AS uid, MAX(n.type) AS type, MAX(n.status) AS status, SUM(subpercent) AS percent, SUM(subscore) AS score\n          FROM (SELECT DISTINCT word_id, nn.nid, SUM(completeness) AS subpercent, SUM(score) AS subscore\n            FROM {fuzzysearch_index} s\n            LEFT JOIN {node} nn ON (nn.nid = s.nid)\n            WHERE (({$clause}))\n            GROUP BY word_id, nn.nid HAVING SUM(completeness) >= %d) AS q\n          LEFT JOIN {node} n on n.nid = q.nid\n          WHERE n.status = 1\n          AND n.type NOT IN ({$placeholders})\n          GROUP BY n.nid ORDER BY {$order_by}";

  // Count query
  $sql_count = "SELECT COUNT(DISTINCT(n.nid))\n               FROM (SELECT nn.type, nn.uid, nn.moderate, nn.nid, CEILING(SUM(completeness)) AS completeness, SUM(score) AS score\n                 FROM {fuzzysearch_index} AS s\n                 LEFT JOIN {node} nn on s.nid = nn.nid\n                 WHERE {$clause}\n                 GROUP BY word_id, s.nid, nn.type, nn.uid, nn.moderate, nn.nid\n                 HAVING SUM(completeness) >= %d) AS q\n               LEFT JOIN {node} n on n.nid = q.nid\n               WHERE n.status = 1\n               AND n.type NOT IN ({$placeholders})";
  $sql = db_rewrite_sql($sql);
  $sql_count = db_rewrite_sql($sql_count);
  if (variable_get('fuzzysearch_debug_search', FALSE) && user_access('access fuzzysearch debugging')) {
    drupal_set_message(t('Fuzzysearch main query after db_rewriting: @sql', array(
      '@sql' => $sql,
    )), 'status', FALSE);
    $debug_nodes = db_result(db_query($sql_count, $args));
    drupal_set_message(t('Fuzzysearch found @nodes matching node(s).', array(
      '@nodes' => $debug_nodes,
    )), 'status', FALSE);
  }
  $block_limit = $theme == BLOCK_THEME ? variable_get('fuzzysearch_block_limit', 5) : 0;
  if ($block_limit) {
    $pager_results = db_query($sql . ' LIMIT ' . $block_limit, $args);
  }
  else {
    $pager_results = pager_query($sql, $limit, 0, $sql_count, $args);
  }

  // Load the matched nodes.
  while ($row = db_fetch_object($pager_results)) {
    $node = node_load($row->nid);
    $node->score = $row->score;
    $node->completeness = $row->percent;

    // If this is just a title search, we can skip all the processing below.
    if ($theme == 1) {

      // Build the node body. This grabs cck field labels and values. Remove
      // double spaces added for html legibility by cck.
      $node->build_mode = NODE_BUILD_SEARCH_RESULT;
      $node = node_build_content($node, FALSE, FALSE);
      $node->body = preg_replace("/ +/", " ", drupal_render($node->content));

      // Add the comments to the node for highlighting.
      if (function_exists('comment_render') && $node->comment && user_access('access comments')) {
        $comments = db_query('SELECT subject, comment FROM {comments} WHERE nid = %d AND status = %d', $node->nid, COMMENT_PUBLISHED);
        while ($comment = db_fetch_object($comments)) {
          $node->body .= ' ' . strip_tags($comment->subject) . ' ' . strip_tags($comment->comment);
        }
      }

      // Query each matched node for the search ngrams. We use this for fuzzy
      // highlighting of misspelled words. We do this per node to narrow
      // the possible false ngrams when a misspelled ngram matches a real one.
      // This could still return some false ngrams, but that's why it's fuzzy.
      $sql_ngrams = "\n      SELECT s.ngram, s.word_id, s.completeness\n      FROM {fuzzysearch_index} s\n      LEFT JOIN {node} n ON (n.nid = s.nid)\n      WHERE (({$clause}) AND n.nid = {$row->nid} AND n.status = 1\n      AND n.type NOT IN ({$placeholders}))";
      $ngrams = db_query($sql_ngrams, $args);
      $clean_grams = $short_words = array();
      $i = 0;
      while ($ngram = db_fetch_array($ngrams)) {
        $clean_grams[$ngram['ngram']][] = $ngram;
        $i++;
      }

      // Ngrams can occur multiple times, so filter.
      $clean_grams = fuzzysearch_unique($clean_grams);

      // This will hold our search terms.
      $clean_words = explode(' ', $orig_query);

      // Now we rebuild the words stripping out misspelled ngrams.
      foreach ($clean_words as $key => $clean_word) {

        // If we have an exact match, let's skip the work to check for misspellings.
        if (!preg_match('/\\b' . $clean_word . '\\b/iu', $node->body)) {
          $pos = $id_count = array();
          $len = drupal_strlen($clean_word);

          // Ignore search terms less than the ngram length.
          if ($len >= $nlength) {

            // Get the position of each good hit.
            foreach ($clean_grams as $n => $gram) {
              if ($multibyte == UNICODE_MULTIBYTE) {
                if (mb_stripos($clean_word, $n) !== FALSE) {
                  $pos[mb_stripos($clean_word, $n)] = $n;

                  // Keep count of our word ids so we can try to guess which word
                  // we are trying to match.
                  foreach ($clean_grams[$n] as $ngram_data) {
                    $id_count[$ngram_data['word_id']] = '';
                    $id_count[$ngram_data['word_id']] = $id_count[$ngram_data['word_id']] + 1;
                  }
                }
              }
              else {
                if (stripos($clean_word, $n) !== FALSE) {
                  $pos[stripos($clean_word, $n)] = $n;

                  // Keep count of our word ids so we can try to guess which word
                  // we are trying to match.
                  foreach ($clean_grams[$n] as $ngram_data) {
                    $id_count[$ngram_data['word_id']] = '';
                    $id_count[$ngram_data['word_id']] = $id_count[$ngram_data['word_id']] + 1;
                  }
                }
              }
            }
            ksort($pos);

            // This gives us an array with the most common word_id as the first
            // element.
            arsort($id_count);
            $id_count = array_keys($id_count);

            // Remove any position matches that are not in our likely word (the
            // word with the highest word_id count).
            foreach ($pos as $position => $pgram) {
              $pmatch = FALSE;
              foreach ($clean_grams[$pgram] as $pid) {
                if ($pid['word_id'] == $id_count[0]) {
                  $pmatch = TRUE;
                }
              }
              if (!$pmatch) {
                unset($pos[$position]);
              }
            }

            // Start with a dummy word at the right length, but only if there are
            // some matching ngram hits.
            $newword = '';
            if (count($pos)) {
              $newword = str_pad('', $len, '.');
            }
            $hits = $misses = $i = $pos_plus = 0;

            // Check character by character for ngram matches. We don't need to check
            // beyond the first character of the ngram.
            for ($i = 0; $i <= $len - $nlength; $i++) {

              // This is a match, so insert it into our dummy word.
              if (isset($pos[$i])) {
                $newword = drupal_substr($newword, 0, $i + $pos_plus) . $pos[$i] . drupal_substr($newword, $i + $pos_plus + $nlength, $len);
                ++$hits;
              }
              else {

                // But don't overwrite a letter, only a '.' .
                if (drupal_substr($newword, $i + $pos_plus + $nlength - 1, 1) == '.') {
                  $newword = $i == 0 || $i + $pos_plus > $len - $nlength ? $newword : drupal_substr($newword, 0, $i + $pos_plus + $nlength - 1) . '.*' . drupal_substr($newword, $i + $pos_plus + 1 + $nlength - 1);

                  // If we insert here, we need to adjust the positions in the $pos array.
                  $pos_plus++;
                  $len++;
                }
                ++$misses;
              }
            }

            // Only keep our rebuilt word if it meets our minimum spelling match score.
            // Subtract $pos_plus from $len to get the original search term length.
            // Then subtract $nlength - 1 to get the number of ngrams in the term.
            $spell_percent = $hits / ($len - $pos_plus - $nlength + 1) * 100;
            if ($spell_percent >= $min_spelling) {

              // Remove consecutive wildcards and add word boundaries.
              $newword = preg_replace("/\\.\\./", ".*", $newword);
              $newword = preg_replace("/\\.\\*\\.\\*/", ".*", $newword);
              $newword = '\\b\\w*' . trim($newword, '.*') . '.*?\\b';
              $clean_words[$key] = $newword;
              if (variable_get('fuzzysearch_debug_search', FALSE) && user_access('access fuzzysearch debugging')) {
                $node->spelling_debug .= t('Highlighting regex @newword -- Ngram spelling match is @percent%', array(
                  '@newword' => $newword,
                  '@percent' => number_format($spell_percent, 2),
                )) . '</br>';
              }
            }
            else {
              unset($clean_words[$key]);
            }
          }
          else {
            unset($clean_words[$key]);
          }
        }
      }

      // Build a replacement node body containing sections of text with the found
      // words, with leading and trailing text.
      $node->body = strip_tags($node->body);
      $section = array();
      $section_length = array();
      foreach ($clean_words as $k => $word) {
        $location = 0;

        // If the word is found, add its position to $section.
        while (preg_match('/' . $word . '/iu', $node->body, $matches, PREG_OFFSET_CAPTURE, $location) && $word != '') {

          // Make sure we didn't traverse any word breaks by checking for spaces.
          // Pretty sure we don't need mb_stripos() here because we don't actually
          // care about the position
          if (!stripos($matches[0][0], ' ')) {
            $section[] = _fuzzysearch_char_count($node->body, $matches[0][1]);
            $section_length[$matches[0][1]] = drupal_strlen($word);
            $clean_words[$k] = $matches[0][0];
          }

          // Increase $location by one so we don't find the previous location.
          $location = $matches[0][1] + 1;
        }
      }

      // Because we found words one by one, the locations are out of order. Sort
      // so that the locations are in natural order.
      asort($section);
      ksort($section_length);
      $section = array_values($section);
      $section_length = array_values($section_length);
      $p = 0;
      $found = $newbody = '';
      $trail = $lead = $excerpt / 2;
      $start = $section[0];
      while (isset($section[$p])) {

        // If the current section is within the previous, let's not create a new one
        // so we don't have any duplicate text.
        if (isset($section[$p + 1]) && $section[$p] + $lead + $section_length[$p] + $trail > $section[$p + 1]) {
          $trail = $section[$p + 1] + $section_length[$p + 1] + $lead - $start;
          $p++;
          continue;
        }

        // Put an excerpt into our replacement node body, with the
        // found word in the center.
        $found = $start - $lead < 0 ? drupal_substr($node->body, 0, $excerpt) : drupal_substr($node->body, $start - $lead, $trail + $lead);
        if (variable_get('fuzzysearch_max_result', 0) && drupal_strlen($newbody . $found) > variable_get('fuzzysearch_max_result', 0)) {
          break;
        }
        $newbody .= '...' . $found . '... ';
        $p++;
        $start = isset($section[$p]) ? $section[$p] : 0;
        $trail = $lead;
      }

      // Wrap the found words in a <strong> tag to highlight them.
      $newbody = preg_replace('/' . $boundary . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . '(' . implode('|', $clean_words) . ')' . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $boundary . '/iu', '<strong>\\0</strong>', $newbody);

      // If there are no result excerpts in the body, at least show the teaser.
      $node->body = $newbody == '' ? truncate_utf8(strip_tags($node->teaser), variable_get('fuzzysearch_max_result', 0), TRUE, TRUE) : $newbody;
      $results[] = $node;
    }
    else {
      $results[] = $node;
    }
  }
  return $results;
}

/**
 * Gather results from the index and build the result page.
 *
 * All of the work is delegated to the 'fuzzysearch_show_results' theme hook.
 *
 * @param $keys
 *   The search keys. Defaults to an empty string.
 *
 * @return
 *   Whatever the 'fuzzysearch_show_results' theme hook returns.
 */
function fuzzysearch_show_results($keys = '') {
  $rendered = theme('fuzzysearch_show_results', $keys);
  return $rendered;
}

/**
 * Theme hook for rendering search results.
 *
 * @param $keys
 *   The search keys.
 * @param $theme
 *   Display mode. With NODE_THEME a full results page is built (page title,
 *   heading, search form, themed results and pager). Any other value renders
 *   only the output of the 'fuzzysearch_results_title' theme hook.
 * @param $limit
 *   Passed on to fuzzysearch_process() and to the pager.
 *
 * @return
 *   The rendered HTML. When there are no results this is the search form,
 *   preceded by a "no matches" message if $keys is not empty.
 */
function theme_fuzzysearch_show_results($keys = '', $theme = NODE_THEME, $limit = 10) {
  drupal_add_css(drupal_get_path('module', 'fuzzysearch') . '/fuzzysearch.css', 'module');

  // Every branch below appends to $output, so it must exist up front.
  // Otherwise the non-NODE_THEME branch and the empty-keys branch would
  // append to an undefined variable.
  $output = '';

  $results = fuzzysearch_process($keys, $theme, $limit);
  if ($results) {
    if ($theme == NODE_THEME) {
      drupal_set_title(check_plain($keys));
      $output .= '<h2>' . t('Results for !keys', array(
        '!keys' => check_plain($keys),
      )) . '</h2>';
      $output .= '<div class="clear-block">';
      $output .= drupal_get_form('fuzzysearch_form', $keys);
      $output .= '</div>';
      $output .= theme('fuzzysearch_results', $results);
      $output .= theme('pager', NULL, $limit);
    }
    else {
      $output .= theme('fuzzysearch_results_title', $results);
    }
  }
  else {
    // Only report "no matches" when the user actually searched for something.
    if ($keys != '') {
      $output .= '<p>' . t('No matches were found.') . '</p>';
    }
    $output .= drupal_get_form('fuzzysearch_form', $keys);
  }
  return $output;
}

/**
 * Theme the search results.
 *
 * Sets the page title, then wraps every result (rendered through the
 * 'fuzzysearch_result' theme hook) in a div carrying an alternating odd/even
 * class. Spelling and score debug output is appended to a result only when
 * the corresponding debug variable is enabled and the current user holds the
 * corresponding permission.
 *
 * @param $results
 *   The result objects to render.
 *
 * @return
 *   The rendered HTML string.
 */
function theme_fuzzysearch_results($results) {
  drupal_set_title(t('Search results'));

  $html = '<div class="box">' . '<div class="search-results">';
  $row = 0;
  foreach ($results as $item) {
    // Rows are numbered from 1, so the first row is "odd".
    $zebra = (++$row % 2) ? 'odd' : 'even';
    $html .= '<div class="fuzzysearch-result ' . $zebra . '">' . theme('fuzzysearch_result', $item);

    // Spelling debug output.
    if (variable_get('fuzzysearch_debug_search', FALSE)) {
      if (user_access('access fuzzysearch debugging')) {
        $html .= $item->spelling_debug;
      }
    }

    // Score debug output; the value the results are sorted by comes first.
    if (variable_get('fuzzysearch_debug_score', FALSE) && user_access('access fuzzysearch scoring')) {
      $score_first = variable_get('fuzzysearch_sort_score', FALSE);
      $score = t('Score:') . ' ' . number_format($item->score);
      $completeness = t('Completeness:') . ' ' . number_format($item->completeness);
      $pair = $score_first ? array($score, $completeness) : array($completeness, $score);
      $html .= '<p>' . implode(' ', $pair) . '</p>';
    }

    $html .= '</div>';
  }

  return $html . '</div></div>';
}
/**
 * Theme the search results as a plain list of linked titles.
 *
 * @param $results
 *   The result objects to list. Each one is read for its ->title and ->nid.
 *
 * @return
 *   An HTML string: an unordered list of title links wrapped in a "box" div.
 */
function theme_fuzzysearch_results_title($results) {
  // Start with a plain assignment so nothing is appended to an undefined
  // variable.
  $output = '<div class="box">';
  $output .= '<ul class="search-results">';
  foreach ($results as $result) {
    $output .= '<li>' . l($result->title, drupal_get_path_alias('node/' . $result->nid)) . '</li>';
  }
  $output .= '</ul></div>';
  return $output;
}

/**
 * Convert a byte offset within a UTF-8 string into a character offset.
 *
 * Uses the byte-counting technique from drupal_substr(): every byte that is
 * not a UTF-8 continuation byte (0x80-0xBF) starts a new character.
 *
 * @param $text
 *   The UTF-8 encoded string.
 * @param $position
 *   A byte offset into $text, such as one reported by PREG_OFFSET_CAPTURE.
 *
 * @return
 *   The zero-based index of the character containing byte $position. An
 *   offset at or beyond the end of $text yields the number of characters in
 *   $text. A negative offset yields -1.
 */
function _fuzzysearch_char_count($text, $position) {
  if ($position < 0) {
    return -1;
  }

  // Never read past the last byte: out-of-range string offsets raise
  // "uninitialized string offset" notices and would count phantom characters.
  $length = strlen($text);
  $last = min($position, $length - 1);

  // Start at -1 so that the first character found gets index 0.
  $chars = -1;
  for ($byte = 0; $byte <= $last; $byte++) {
    $c = ord($text[$byte]);
    // ASCII bytes and UTF-8 lead bytes start a character; 0x80-0xBF are
    // continuation bytes and belong to the character already counted.
    if ($c < 0x80 || $c >= 0xc0) {
      $chars++;
    }
  }

  // An offset at or past the end points just behind the last character.
  if ($position >= $length) {
    $chars++;
  }
  return $chars;
}

/**
 * Remove stop words from search query and text to be indexed.
 *
 * The stop word list is built once per request from every file matching
 * fuzzysearch_stopwords_*.txt under
 * sites/all/libraries/fuzzysearch/stopwords, and is kept in a static
 * variable. Both the files and $text are split on single spaces.
 *
 * @param $text
 *   The text to be stripped of stop words.
 *
 * @return
 *   The remaining words of $text, joined with single spaces.
 */
function fuzzysearch_stopwords($text) {
  static $stop_words;

  // Load the word lists lazily, on the first call only.
  if (!is_array($stop_words)) {
    $loaded = array();
    $found = file_scan_directory('sites/all/libraries/fuzzysearch/stopwords', 'fuzzysearch_stopwords_.+\\.txt', array(), 0, TRUE, 'name');
    foreach ($found as $stopword_file) {
      $words = explode(' ', file_get_contents($stopword_file->filename));
      $loaded = array_merge($loaded, $words);
    }
    $stop_words = $loaded;
  }

  $kept = array_diff(explode(' ', $text), $stop_words);
  return implode(' ', $kept);
}

/**
 * Recursive array_unique().
 *
 * Elements are compared by their serialized form, so nested arrays can be
 * de-duplicated as well. The key of the first occurrence is kept. Every
 * surviving element that is itself an array is then de-duplicated the same
 * way.
 *
 * @param $array
 *   The array to de-duplicate.
 *
 * @return
 *   The array without duplicate elements, at every nesting level.
 */
function fuzzysearch_unique($array) {
  // Serialize, drop duplicates, and restore the surviving values.
  $serialized = array_map("serialize", $array);
  $distinct = array_unique($serialized);
  $result = array_map("unserialize", $distinct);

  // Descend into nested arrays.
  foreach (array_keys($result) as $key) {
    if (is_array($result[$key])) {
      $result[$key] = fuzzysearch_unique($result[$key]);
    }
  }
  return $result;
}

/**
 * Return tags and values as an array.
 *
 * Each value is read from the 'fuzzysearch_tag_<tag>' variable; the second
 * argument to variable_get() is the fallback used when that variable has not
 * been set.
 *
 * @return
 *   An array of tags and values, keyed by HTML tag name.
 */
function fuzzysearch_get_index_tags() {
  $tags = array();

  // Headings.
  $tags['h1'] = variable_get('fuzzysearch_tag_h1', 10);
  $tags['h2'] = variable_get('fuzzysearch_tag_h2', 9);
  $tags['h3'] = variable_get('fuzzysearch_tag_h3', 8);
  $tags['h4'] = variable_get('fuzzysearch_tag_h4', 7);
  $tags['h5'] = variable_get('fuzzysearch_tag_h5', 6);
  $tags['h6'] = variable_get('fuzzysearch_tag_h6', 5);

  // Inline emphasis.
  $tags['u'] = variable_get('fuzzysearch_tag_u', 2);
  $tags['b'] = variable_get('fuzzysearch_tag_b', 2);
  $tags['i'] = variable_get('fuzzysearch_tag_i', 2);
  $tags['strong'] = variable_get('fuzzysearch_tag_strong', 2);
  $tags['em'] = variable_get('fuzzysearch_tag_em', 2);

  // Links.
  $tags['a'] = variable_get('fuzzysearch_tag_a', 5);

  return $tags;
}

Functions

Namesort descending Description
fuzzysearch_block Implementation of hook_block().
fuzzysearch_box_form Form to search the index
fuzzysearch_cleanse Strip all non alphanumeric characters from a string
fuzzysearch_comment Implementation of hook_comment().
fuzzysearch_content_build_modes Implementation of hook_content_build_modes().
fuzzysearch_cron Implementation of hook_cron().
fuzzysearch_form Form to search the index
fuzzysearch_form_submit Redirect to callback with keys so that the search can be linked to.
fuzzysearch_get_index_tags Return tags and values as an array.
fuzzysearch_index Index the node data in the fuzzy index table.
fuzzysearch_index_insert Insert the words into the database as they are indexed.
fuzzysearch_menu Implementation of hook_menu().
fuzzysearch_nodeapi Implementation of hook_nodeapi().
fuzzysearch_perm Implementation of hook_perm().
fuzzysearch_preprocess_fuzzysearch_result
fuzzysearch_process Process the search query
fuzzysearch_reindex External API function that allows modules to flag a node for reindexing.
fuzzysearch_show_results Gather results from the index and build result page.
fuzzysearch_stopwords Remove stop words from search query and text to be indexed.
fuzzysearch_theme Implementation of hook_theme().
fuzzysearch_unique Recursive array_unique().
theme_fuzzysearch_box_form Theme the output of the search block
theme_fuzzysearch_form Output formatting for the search form
theme_fuzzysearch_results Theme the search results
theme_fuzzysearch_results_title
theme_fuzzysearch_show_results Theme hook for rendering search results.
_fuzzysearch_char_count

Constants

Namesort descending Description
BLOCK_THEME
NODE_THEME
PREG_CLASS_CJK
PREG_CLASS_SEARCH_EXCLUDE Matches Unicode character classes to exclude from the search index.