You are here

function search_index in Drupal 6

Same name and namespace in other branches
  1. 8 core/modules/search/search.module \search_index()
  2. 4 modules/search.module \search_index()
  3. 5 modules/search/search.module \search_index()
  4. 7 modules/search/search.module \search_index()

Update the full-text search index for a particular item.

Parameters

$sid: A number identifying this particular item (e.g. node id).

$type: A string defining this type of item (e.g. 'node').

$text: The content of this item. Must be a piece of HTML text.

Related topics

1 call to search_index()
_node_index_node in modules/node/node.module
Index a single node.
2 string references to 'search_index'
system_update_6019 in modules/system/system.install
Reconcile small differences in the previous, manually created mysql and pgsql schemas so they are the same and can be represented by a single schema structure.
system_update_6036 in modules/system/system.install
Change the search schema and indexing.

File

modules/search/search.module, line 419
Enables site-wide keyword searching.

Code

/**
 * Update the full-text search index for a particular item.
 *
 * The HTML in $text is reduced to the tags listed in $tags, split into
 * alternating text/tag pieces, and every word is scored according to the
 * tags that enclose it. The previous index entries for the item are wiped
 * and replaced by rows in {search_dataset} and {search_index}. Captions of
 * links that point to nodes on this site are kept in {search_node_links},
 * and every node whose link caption was added, changed or removed is passed
 * to search_touch_node().
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 * @param $type
 *   A string defining this type of item (e.g. 'node').
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 */
function search_index($sid, $type, $text) {
  $minimum_word_size = variable_get('minimum_word_size', 3);

  // Link matching: captures the site-internal path from an href attribute,
  // whether it is written with the full base URL or only the base path.
  global $base_url;
  $node_regexp = '@href=[\'"]?(?:' . preg_quote($base_url, '@') . '/|' . preg_quote(base_path(), '@') . ')(?:\\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array(
    'h1' => 25,
    'h2' => 18,
    'h3' => 15,
    'h4' => 12,
    'h5' => 9,
    'h6' => 6,
    'u' => 3,
    'b' => 3,
    'i' => 3,
    'strong' => 3,
    'em' => 3,
    'a' => 10,
  );

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array(
    '<',
    '>',
  ), array(
    ' <',
    '> ',
  ), $text);
  $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');

  // Split HTML tags from plain text.
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting empty strings as required).
  $split = preg_split('/\\s*<([^>]+?)>\\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

  // Odd/even counter. Tag or no tag.
  $tag = FALSE;
  // State variable for link analyser.
  $link = FALSE;
  // Starting score per word.
  $score = 1;
  // Accumulator for cleaned up data.
  $accum = ' ';
  // Stack with open tags.
  $tagstack = array();
  // Counter for consecutive words.
  $tagwords = 0;
  // Focus state.
  $focus = 1;
  // Accumulator for words for index. Key 0 maps word => score for this item;
  // every other key is a linked node id holding the list of caption words.
  $results = array(
    0 => array(),
  );

  foreach ($split as $value) {
    if ($tag) {

      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = drupal_strtolower($tagname);

      // Closing or opening tag?
      if ($tagname[0] == '/') {
        $tagname = substr($tagname, 1);

        // If we encounter unexpected tags, reset score to avoid incorrect boosting.
        if (!count($tagstack) || $tagstack[0] != $tagname) {
          $tagstack = array();
          $score = 1;
        }
        else {

          // Remove from tag stack and decrement score
          $score = max(1, $score - $tags[array_shift($tagstack)]);
        }
        if ($tagname == 'a') {
          $link = FALSE;
        }
      }
      else {
        if (isset($tagstack[0]) && $tagstack[0] == $tagname) {

          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = array();
          $score = 1;
        }
        else {

          // Add to open tag stack and increment score
          array_unshift($tagstack, $tagname);
          $score += $tags[$tagname];
        }
        if ($tagname == 'a') {

          // A newly opened anchor starts from a clean link state. Without
          // this, an unclosed earlier anchor would leave $link set while
          // $linknid may already have been replaced by an unvalidated id.
          $link = FALSE;

          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $linknid = $match[1];
              if ($linknid > 0) {

                // Note: ignore links to uncachable nodes to avoid redirect bugs.
                // Links to nodes that do not exist (no row fetched) are
                // ignored as well, instead of dereferencing a non-object.
                $node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
                if ($node && filter_format_allowcache($node->format)) {
                  $link = TRUE;
                  $linktitle = $node->title;
                }
              }
            }
          }
        }
      }

      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {

      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        if ($link) {

          // Check to see if the node link text is its URL. If so, we use the target node title instead.
          if (preg_match('!^https?://!i', $value)) {
            $value = $linktitle;
          }
        }
        $words = search_index_split($value);
        foreach ($words as $word) {

          // Add word to accumulator
          $accum .= $word . ' ';
          $num = is_numeric($word);

          // Check wordlength
          if ($num || drupal_strlen($word) >= $minimum_word_size) {

            // Normalize numbers
            if ($num) {
              $word = (int) ltrim($word, '-0');
            }

            // Links score mainly for the target.
            if ($link) {
              if (!isset($results[$linknid])) {
                $results[$linknid] = array();
              }
              $results[$linknid][] = $word;

              // Reduce score of the link caption in the source.
              $focus *= 0.2;
            }

            // Fall-through
            if (!isset($results[0][$word])) {
              $results[0][$word] = 0;
            }
            $results[0][$word] += $score * $focus;

            // Focus is a decaying value in terms of the amount of unique words up to this point.
            // It stays at 1 up to 100 unique words and decays after that, to
            // roughly 0.38 at 500 words and 0.22 at 1000 words.
            $focus = min(1, 0.01 + 3.5 / (2 + count($results[0]) * 0.015));
          }
          $tagwords++;

          // Too many words inside a single tag probably mean a tag was accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = array();
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }
  search_wipe($sid, $type, TRUE);

  // Insert cleaned up data into dataset
  db_query("INSERT INTO {search_dataset} (sid, type, data, reindex) VALUES (%d, '%s', '%s', %d)", $sid, $type, $accum, 0);

  // Insert results into search index
  foreach ($results[0] as $word => $score) {

    // Try inserting first because this will succeed most times, but because
    // the database collates similar words (accented and non-accented), the
    // insert can fail, in which case we need to add the word scores together.
    @db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %f)", $word, $sid, $type, $score);
    if (!db_affected_rows()) {
      db_query("UPDATE {search_index} SET score = score + %f WHERE word = '%s' AND sid = %d AND type = '%s'", $score, $word, $sid, $type);
    }
    search_dirty($word);
  }
  unset($results[0]);

  // Get all previous links from this item.
  $result = db_query("SELECT nid, caption FROM {search_node_links} WHERE sid = %d AND type = '%s'", $sid, $type);
  $links = array();
  while ($row = db_fetch_object($result)) {
    $links[$row->nid] = $row->caption;
  }

  // Now store links to nodes.
  foreach ($results as $nid => $words) {
    $caption = implode(' ', $words);
    if (isset($links[$nid])) {
      if ($links[$nid] != $caption) {

        // Update the existing link and mark the node for reindexing.
        db_query("UPDATE {search_node_links} SET caption = '%s' WHERE sid = %d AND type = '%s' AND nid = %d", $caption, $sid, $type, $nid);
        search_touch_node($nid);
      }

      // Unset the link to mark it as processed.
      unset($links[$nid]);
    }
    else {

      // Insert the new link and mark the node for reindexing.
      db_query("INSERT INTO {search_node_links} (caption, sid, type, nid) VALUES ('%s', %d, '%s', %d)", $caption, $sid, $type, $nid);
      search_touch_node($nid);
    }
  }

  // Any left-over links in $links no longer exist. Delete them and mark the nodes for reindexing.
  foreach ($links as $nid => $caption) {
    db_query("DELETE FROM {search_node_links} WHERE sid = %d AND type = '%s' AND nid = %d", $sid, $type, $nid);
    search_touch_node($nid);
  }
}