You are here

facetapi_luceneapi.cache.inc in Facet API 6

Term frequency cache functions.

File

contrib/facetapi_luceneapi/facetapi_luceneapi.cache.inc
View source
<?php

/**
 * @file
 * Term frequency cache functions.
 */

/**
 * Runs a query against the index and returns the documents it matched.
 *
 * The query is rewritten and optimized for the index, executed, and the
 * result of its matchedDocs() method is returned as is. Because sorting is
 * skipped, this is more efficient than calling luceneapi_find().
 *
 * Any exception raised along the way is handed to luceneapi_throw_error() and
 * an empty array is returned instead.
 *
 * @param $index
 *   A Zend_Search_Lucene_Interface object.
 * @param $query
 *   A Zend_Search_Lucene_Search_Query object.
 * @return
 *   An array keyed by document IDs.
 */
function facetapi_luceneapi_match_query(Zend_Search_Lucene_Interface $index, Zend_Search_Lucene_Search_Query $query) {
  try {
    // The rewritten/optimized query is the one that must be executed and asked
    // for its matches; the query object passed in is only the starting point.
    $prepared = $query->rewrite($index)->optimize($index);
    $prepared->execute($index);
    return $prepared->matchedDocs();
  }
  catch (Exception $e) {
    luceneapi_throw_error($e, WATCHDOG_ERROR, 'facetapi_luceneapi');
  }

  // Only reached when the query failed.
  return array();
}

/**
 * Extracts the literal prefix that precedes the first wildcard character.
 *
 * The wildcard characters are "?" and "*". Text that contains neither is
 * returned untouched.
 *
 * @param $text
 *   A string containing the text the prefix is being extracted from.
 * @return
 *   A string containing the prefix, which is empty when the text starts with a
 *   wildcard character.
 */
function facetapi_luceneapi_prefix_get($text) {
  // Length of the leading run of characters that are not wildcards.
  $prefix_len = strcspn($text, '?*');

  // No wildcard at all: hand the text back exactly as it was passed in.
  if ($prefix_len === strlen($text)) {
    return $text;
  }
  return substr($text, 0, $prefix_len);
}

/**
 * Reads term frequencies from the lookup table to avoid having to calculate
 * them on the fly, which is very performance intensive.
 *
 * On the first request for a searcher/field pair within a page load, all rows
 * for that field are read from the "{searcher}_termfreqs" table into a static
 * cache. If the "facetapi:termfreqs_cached:..." variable is not yet set for the
 * pair, the table is populated first and the variable is set. Rows are kept
 * serialized in the static cache and are only unserialized when the matching
 * term is actually requested.
 *
 * @param $searcher
 *   A string containing the machine readable name of the searcher module.
 * @param $term
 *   A Zend_Search_Lucene_Index_Term object.
 * @return
 *   An array keyed by Lucene document ID to term frequency. An empty array is
 *   returned when the term is not in the lookup table or when its stored value
 *   does not unserialize to an array.
 */
function facetapi_luceneapi_termfreqs_get($searcher, Zend_Search_Lucene_Index_Term $term) {
  static $cache = array();
  $field = $term->field;

  // If cache is not populated for the field, reads the field's cached data.
  if (!isset($cache[$searcher][$field])) {
    $variable = 'facetapi:termfreqs_cached:' . $searcher . ':::' . $field;
    if (!variable_get($variable, FALSE)) {
      facetapi_luceneapi_termfreqs_populate($searcher, $field);
      variable_set($variable, TRUE);
    }

    // Reads term frequencies from the database. $table already carries the
    // curly braces used for table prefixing, so "{$table}" expands to
    // "{<searcher>_termfreqs}".
    $cache[$searcher][$field] = array();
    $table = '{' . $searcher . '_termfreqs}';
    $sql = "SELECT term, termfreqs FROM {$table} WHERE field = '%s'";
    if ($result = db_query($sql, array(
      $field,
    ))) {
      while ($record = db_fetch_object($result)) {
        $cache[$searcher][$field][$record->term] = array(
          'termfreqs' => $record->termfreqs,
          'unserialized' => FALSE,
        );
      }
    }
  }

  // Unknown term, nothing to return.
  if (!isset($cache[$searcher][$field][$term->text])) {
    return array();
  }

  // Unserializes lazily, storing the result back into the static cache so the
  // work is done at most once per term.
  $data =& $cache[$searcher][$field][$term->text];
  if (empty($data['unserialized'])) {
    $termfreqs = unserialize($data['termfreqs']);

    // unserialize() returns FALSE for a corrupt or truncated value. Callers
    // combine the return value with the array union operator, which requires
    // an array, so anything else is treated as "no documents".
    $data['termfreqs'] = is_array($termfreqs) ? $termfreqs : array();
    $data['unserialized'] = TRUE;
  }
  return $data['termfreqs'];
}

/**
 * Gets the filter count for an array of terms.
 *
 * The cached term frequencies of all passed terms are merged into one set of
 * document IDs, and the number of passed documents that appear in that set is
 * returned. A document matching several of the terms is counted once.
 *
 * @param $index
 *   A Zend_Search_Lucene_Proxy object, such as the one returned by the
 *   luceneapi_index_open() function. It is required by the signature but is
 *   not read by this function.
 * @param $docs
 *   An array of documents keyed by document ID to score.
 * @param $terms
 *   An array of Zend_Search_Lucene_Index_Term objects.
 * @param $searcher
 *   (optional) A string containing the machine readable name of the searcher
 *   module whose term frequency cache is read. Defaults to 'luceneapi_node'.
 * @return
 *   An integer containing the number of documents in $docs that contain at
 *   least one of the terms.
 */
function facetapi_luceneapi_terms_count(Zend_Search_Lucene_Proxy $index, array $docs, array $terms, $searcher = 'luceneapi_node') {
  $freqs = array();

  // Reading from a termFreqs cache table is MUCH more efficient than
  // calculating them on the fly for every search. The union operator keeps the
  // first entry per document ID, which is fine since only the keys matter.
  foreach ($terms as $term) {
    $freqs += facetapi_luceneapi_termfreqs_get($searcher, $term);
  }

  // NOTE: This is surprisingly efficient.
  return count(array_intersect_key($docs, $freqs));
}

/**
 * Rewrites a wildcard query into primitive terms.
 *
 * For every field, the index's terms stream is positioned at the literal
 * prefix of the wildcard text and walked for as long as terms of that field
 * start with the prefix. Every term whose text matches the wildcard pattern
 * ("?" is one character, "*" is any run of characters) is collected.
 *
 * @param $index
 *   A Zend_Search_Lucene_Interface object, such as the one returned by the
 *   luceneapi_index_open() function.
 * @param $text
 *   A string containing the text being parsed.
 * @param $fields
 *   An array of fields the terms are matched against.
 * @return
 *   An array of Zend_Search_Lucene_Index_Term objects.
 * @throws Zend_Search_Lucene_Exception
 *   When a non-zero terms per query limit is configured and the number of
 *   collected terms exceeds it.
 */
function facetapi_luceneapi_wildcard_matches_get(Zend_Search_Lucene_Interface $index, $text, array $fields = array(
  'contents',
)) {

  // Prevents too many terms from being extracted.
  $max_terms = Zend_Search_Lucene::getTermsPerQueryLimit();

  // Gets the word prefix.
  $prefix = facetapi_luceneapi_prefix_get($text);
  $prefix_len = strlen($prefix);

  // preg_quote() escapes the wildcard characters, so the escaped forms are
  // what gets swapped for their regex equivalents.
  $pattern = '/^' . str_replace(array(
    '\\?',
    '\\*',
  ), array(
    '.',
    '.*',
  ), preg_quote($text, '/')) . '$/';

  // Without the "u" modifier "." matches a single byte, so "?" would never
  // match a multibyte UTF-8 character. The probe keeps the pattern usable on
  // a PCRE build lacking Unicode support; the warning it emits there is
  // suppressed on purpose.
  if (@preg_match('/\pL/u', 'a') == 1) {
    $pattern .= 'u';
  }

  // Calculates matches.
  $matches = array();
  foreach ($fields as $field) {
    $index->resetTermsStream();
    $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

    while (($current = $index->currentTerm()) !== NULL && $current->field == $field) {

      // Stops at the first term that no longer starts with the prefix. The
      // comparison is byte-wise so that numeric looking strings such as "1e1"
      // and "010" are not considered equal.
      if ($prefix_len > 0 && strncmp($current->text, $prefix, $prefix_len) !== 0) {
        break;
      }

      if (preg_match($pattern, $current->text) === 1) {
        $matches[] = $current;

        // Terms per query limit reached.
        if ($max_terms != 0 && count($matches) > $max_terms) {
          // Releases the stream before bailing out, as the normal exit does.
          $index->closeTermsStream();
          throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
        }
      }
      $index->nextTerm();
    }
    $index->closeTermsStream();
  }
  return $matches;
}

/**
 * Rewrites a range query into primitive terms.
 *
 * For every field, the index's terms stream is positioned at the lower
 * boundary (or at the start of the field when there is none) and walked until
 * the upper boundary or the end of the field's terms is reached.
 *
 * @param $index
 *   A Zend_Search_Lucene_Interface object, such as the one returned by the
 *   luceneapi_index_open() function.
 * @param $lower
 *   A string containing the lower boundary, NULL for an open lower end.
 * @param $upper
 *   A string containing the upper boundary, NULL for an open upper end.
 * @param $inclusive
 *   A boolean flagging whether the boundary terms themselves are part of the
 *   result set. When FALSE, a term equal to the lower boundary is skipped and
 *   a term equal to the upper boundary is not added.
 * @param $fields
 *   An array of fields the terms are matched against.
 * @return
 *   An array of Zend_Search_Lucene_Index_Term objects.
 * @throws Zend_Search_Lucene_Exception
 *   When a non-zero terms per query limit is configured and the number of
 *   collected terms exceeds it.
 */
function facetapi_luceneapi_range_matches_get(Zend_Search_Lucene_Interface $index, $lower, $upper, $inclusive = FALSE, array $fields = array(
  'contents',
)) {

  // Prevents too many terms from being extracted.
  $max_terms = Zend_Search_Lucene::getTermsPerQueryLimit();
  $matches = array();
  foreach ($fields as $field) {
    $index->resetTermsStream();

    // Positions the stream on the first candidate term.
    if ($lower !== NULL) {
      $lower_term = new Zend_Search_Lucene_Index_Term($lower, $field);
      $index->skipTo($lower_term);
      if (!$inclusive && $index->currentTerm() == $lower_term) {
        $index->nextTerm();
      }
    }
    else {
      $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));
    }

    if ($upper !== NULL) {

      // Walk up to the upper term. strcmp() is used instead of "<" because "<"
      // compares numeric strings as numbers ("10" < "5" is FALSE), which would
      // end the walk early on such terms; the boundary check is meant to be a
      // byte-wise string comparison.
      $upper_term = new Zend_Search_Lucene_Index_Term($upper, $field);
      while (($current = $index->currentTerm()) !== NULL && $current->field == $field && strcmp($current->text, $upper_term->text) < 0) {
        $matches[] = $current;
        if ($max_terms != 0 && count($matches) > $max_terms) {
          // Releases the stream before bailing out, as the normal exit does.
          $index->closeTermsStream();
          throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
        }
        $index->nextTerm();
      }
      if ($inclusive && $index->currentTerm() == $upper_term) {

        // Include upper term into result
        $matches[] = $upper_term;
      }
    }
    else {

      // Walk up to the end of field data
      while (($current = $index->currentTerm()) !== NULL && $current->field == $field) {
        $matches[] = $current;
        if ($max_terms != 0 && count($matches) > $max_terms) {
          // Releases the stream before bailing out, as the normal exit does.
          $index->closeTermsStream();
          throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
        }
        $index->nextTerm();
      }
    }
    $index->closeTermsStream();
  }
  return $matches;
}

/**
 * Populates the term frequency lookup table for the specified fields.
 *
 * All terms of the index are scanned once. For each requested field, the
 * existing rows in the "{searcher}_termfreqs" table are deleted and replaced
 * by one row per term holding the serialized result of termFreqs(). Fields
 * that have no terms in the index end up with no rows.
 *
 * Nothing is done when the searcher's index cannot be opened.
 *
 * @param $searcher
 *   A string containing the machine readable name of the searcher module.
 * @param $fields
 *   A field name or an array of field names the termFreqs cache is being
 *   populated for. Defaults to the fields of all facets enabled for the
 *   searcher.
 * @param &$context
 *   An optional array containing the batch context when using this function as
 *   a batch operation.
 * @return
 *   NULL
 */
function facetapi_luceneapi_termfreqs_populate($searcher, $fields = NULL, &$context = NULL) {
  if (!($index = luceneapi_index_open($searcher))) {
    return;
  }

  // Gets fields, defaults to all facets enabled for the searcher.
  if (NULL === $fields) {
    $fields = array();
    foreach (facetapi_enabled_facets_get($searcher) as $facet) {
      $fields[$facet['field']] = $facet['field'];
    }
  }
  else {
    $fields = drupal_map_assoc((array) $fields);
  }

  // Sets message if this is a batch process.
  if (NULL !== $context) {
    $context['message'] = format_plural(count($fields), 'Populating termFreqs cache for the %fields field', 'Populating termFreqs cache for the %fields fields', array(
      '%fields' => join(', ', $fields),
    ));
  }

  // Seeds an entry for every requested field, so that a field which no longer
  // has any term in the index still gets its stale rows removed below.
  $terms = array();
  foreach ($fields as $field) {
    $terms[$field] = array();
  }

  // Gets terms from index.
  foreach ($index->terms() as $term) {
    if (isset($terms[$term->field])) {
      $terms[$term->field][$term->text] = $index->termFreqs($term);
    }
  }

  // Populates termfreqs cache for all passed fields. $table already carries
  // the curly braces used for table prefixing.
  $table = '{' . $searcher . '_termfreqs}';
  $batch_size = 100;
  foreach ($terms as $field => $field_terms) {

    // Clears whatever was cached for the field.
    $sql = "DELETE FROM {$table} WHERE field = '%s'";
    db_query($sql, array(
      $field,
    ));

    // Inserts the rows in groups, so that the size of a statement is bounded
    // by the group size rather than by the number of terms in the field.
    $placeholders = $values = array();
    foreach ($field_terms as $text => $termfreqs) {
      $placeholders[] = "('%s', '%s', '%s')";
      $values[] = $text;
      $values[] = $field;
      $values[] = serialize($termfreqs);

      if (count($placeholders) >= $batch_size) {
        $sql = "INSERT INTO {$table} (term, field, termfreqs) VALUES " . join(',', $placeholders);
        db_query($sql, $values);
        $placeholders = $values = array();
      }
    }

    // Inserts the remaining rows, if any.
    if (!empty($placeholders)) {
      $sql = "INSERT INTO {$table} (term, field, termfreqs) VALUES " . join(',', $placeholders);
      db_query($sql, $values);
    }
  }
}

Functions

Namesort descending Description
facetapi_luceneapi_match_query Executes a match query, returns array keyed by document IDs. This method skips sorting, so it is more efficient than calling luceneapi_find.
facetapi_luceneapi_prefix_get Helper function to get the wildcard prefix.
facetapi_luceneapi_range_matches_get Rewrites a range query into primitive terms.
facetapi_luceneapi_termfreqs_get Reads term frequencies from the lookup table to avoid having to calculate them on the fly, which is very performance intensive.
facetapi_luceneapi_termfreqs_populate Populates the termfreqs() database for the specified fields.
facetapi_luceneapi_terms_count Gets the filter counts for an array of terms.
facetapi_luceneapi_wildcard_matches_get Rewrites a wildcard query into primitive terms.