apachesolr.index.inc in Apache Solr Search 5.2
Same filename and directory in other branches
Functions used when indexing content to Apache Solr.
File
apachesolr.index.inc (View source)
<?php
/**
* @file
* Functions used when indexing content to Apache Solr.
*/
/**
 * Append the Solr document built for a node to a list of documents.
 *
 * Nothing is appended when apachesolr_node_to_document() returns an empty
 * value for the node.
 *
 * @param $documents
 *   Array of documents, passed by reference; the new document is appended.
 * @param $nid
 *   ID of the node to convert.
 * @param $namespace
 *   Passed through unchanged to apachesolr_node_to_document().
 */
function apachesolr_add_node_document(&$documents, $nid, $namespace) {
  $built = apachesolr_node_to_document($nid, $namespace);
  if (!$built) {
    // No document was produced for this node - leave the list untouched.
    return;
  }
  $documents[] = $built;
}
/**
 * Reduce a piece of markup to plain, entity-safe text for indexing.
 *
 * All tags are removed via filter_xss() with an empty allowed-tags list, and
 * the remaining text is normalised so that only &, < and > are encoded as
 * entities (quotes are left untouched).
 *
 * NOTE(review): the removal of control characters that trip up Jetty/Solr is
 * not visible in this function; presumably filter_xss() takes care of it -
 * confirm.
 *
 * @param $text
 *   The text (possibly containing HTML) to clean.
 *
 * @return
 *   The cleaned text.
 */
function apachesolr_clean_text($text) {
  // Pad every angle bracket with a space so that words separated only by a
  // tag do not run together once the tags are gone.
  $padded = str_replace(array('<', '>'), array(' <', '> '), $text);
  $without_tags = filter_xss($padded, array());
  // Decode entities first, then re-encode so any literal < or > is made safe.
  $decoded = html_entity_decode($without_tags, ENT_NOQUOTES, 'UTF-8');
  return htmlspecialchars($decoded, ENT_NOQUOTES, 'UTF-8');
}
/**
 * Given a node ID, return a document representing that node.
 *
 * Loads the node, lets modules veto indexing through
 * hook_apachesolr_node_exclude(), and otherwise builds an Apache_Solr_Document
 * from the rendered node, the output of hook_nodeapi('update index'), CCK
 * fields, book data, HTML tag contents and taxonomy. Finally every
 * hook_apachesolr_update_index() implementation may alter the document.
 *
 * @param $nid
 *   ID of the node to load and convert.
 * @param $namespace
 *   Passed to the hook_apachesolr_node_exclude() and
 *   hook_apachesolr_update_index() implementations; not interpreted here.
 *
 * @return
 *   The populated Apache_Solr_Document, or FALSE when the node could not be
 *   loaded or at least one module asked for it to be excluded.
 */
function apachesolr_node_to_document($nid, $namespace) {
// Set reset = TRUE to avoid static caching of all nodes that get indexed.
$node = node_load($nid, NULL, TRUE);
if (empty($node)) {
return FALSE;
}
// Stays FALSE (and is returned as such) unless the document gets built below.
$document = FALSE;
// Let any module exclude this node from the index.
// Every implementation is invoked; a single non-empty return value is enough
// to exclude the node.
$build_document = TRUE;
foreach (module_implements('apachesolr_node_exclude') as $module) {
$exclude = module_invoke($module, 'apachesolr_node_exclude', $node, $namespace);
if (!empty($exclude)) {
$build_document = FALSE;
}
}
if ($build_document) {
// Build the node body.
$node->build_mode = NODE_BUILD_SEARCH_INDEX;
$node = node_build_content($node, FALSE, FALSE);
$node->body = drupal_render($node->content);
$node->title = apachesolr_clean_text($node->title);
// $text keeps the rendered markup; only the copy stored in $document->body
// further down is passed through apachesolr_clean_text().
$text = $node->body;
// Fetch extra data normally not visible, including comments.
$extra = array();
// Strict in_array(): the configured values must be identical to $node->type.
$exclude_comments = in_array($node->type, variable_get('apachesolr_exclude_comments_types', array()), TRUE);
foreach (module_implements('nodeapi') as $module) {
if ($exclude_comments && $module == 'comment') {
// Don't add comments.
continue;
}
// Call the module's hook_nodeapi() directly with the 'update index' op and
// keep whatever non-empty output it returns.
$function = $module . '_nodeapi';
if ($output = $function($node, 'update index', FALSE, FALSE)) {
$extra[$module] = $output;
}
}
$text .= "\n\n" . implode(' ', $extra);
$document = new Apache_Solr_Document();
$document->id = apachesolr_document_id($node->nid);
// NOTE(review): presumably the absolute URL of the site front page (the
// fourth url() argument is TRUE) - confirm against this Drupal version.
$document->site = url(NULL, NULL, NULL, TRUE);
$document->hash = apachesolr_site_hash();
$document->entity = 'node';
$document->nid = $node->nid;
$document->uid = $node->uid;
$document->title = $node->title;
$document->status = $node->status;
$document->sticky = $node->sticky;
$document->promote = $node->promote;
$document->moderate = $node->moderate;
$document->tnid = $node->tnid;
$document->translate = $node->translate;
if (empty($node->language)) {
// 'und' is the language-neutral code in Drupal 7.
$document->language = 'und';
}
else {
$document->language = $node->language;
}
$document->body = apachesolr_clean_text($text);
$document->type = $node->type;
$document->type_name = node_get_types('name', $node);
$document->created = apachesolr_date_iso($node->created);
$document->changed = apachesolr_date_iso($node->changed);
// Use whichever is later: the last comment timestamp (when present) or the
// node's own changed time.
$last_change = isset($node->last_comment_timestamp) && $node->last_comment_timestamp > $node->changed ? $node->last_comment_timestamp : $node->changed;
$document->last_comment_or_change = apachesolr_date_iso($last_change);
$document->comment_count = isset($node->comment_count) ? $node->comment_count : 0;
$document->name = $node->name;
$path = 'node/' . $node->nid;
$document->url = url($path, NULL, NULL, TRUE);
$document->path = $path;
// Path aliases can have important information about the content.
// Add them to the index as well.
if (function_exists('drupal_get_path_alias')) {
// Add any path alias to the index, looking first for language specific
// aliases but using language neutral aliases otherwise.
$language = empty($node->language) ? '' : $node->language;
$output = drupal_get_path_alias($path, $language);
// Only store the alias when one exists, i.e. it differs from the system path.
if ($output && $output != $path) {
$document->path_alias = $output;
}
}
// Get CCK fields list
$cck_fields = apachesolr_cck_fields();
foreach ($cck_fields as $key => $cck_info) {
if (isset($node->{$key})) {
// Got a CCK field. See if it is to be indexed.
// A field-specific indexing callback, when set and defined, supplies the
// values; otherwise the raw field items on the node are used.
$function = $cck_info['indexing_callback'];
if ($cck_info['indexing_callback'] && function_exists($function)) {
$field = $function($node, $key);
}
else {
$field = $node->{$key};
}
$index_key = apachesolr_index_key($cck_info);
foreach ($field as $value) {
// Don't index NULLs or empty strings
// We can use 'value' rather than 'safe' since we strip tags and later check_plain().
// Furthermore, what is being indexed is the KEY for the CCK value. It will need
// a trip through content_format() later to display the value.
if (isset($value['value']) && strlen($value['value'])) {
if ($cck_info['multiple']) {
$document
->setMultiValue($index_key, apachesolr_clean_text($value['value']));
}
else {
// Single-valued field: each non-empty item overwrites the previous one.
$document->{$index_key} = apachesolr_clean_text($value['value']);
}
}
}
}
}
// Index book module data.
if (!empty($node->book['bid'])) {
// Hard-coded - must change if apachesolr_index_key() changes.
$document->is_book_bid = (int) $node->book['bid'];
}
// Both helpers receive $document by reference and add fields to it. The tag
// helper is given the rendered markup in $text, not the cleaned body.
apachesolr_add_tags_to_document($document, $text);
apachesolr_add_taxonomy_to_document($document, $node);
// Let modules add to the document.
foreach (module_implements('apachesolr_update_index') as $module) {
$function = $module . '_apachesolr_update_index';
$function($document, $node, $namespace);
}
}
return $document;
}
/**
 * Extract taxonomy from $node and add to dynamic fields.
 *
 * For every term on the node, the term and all of its ancestors (as returned
 * by taxonomy_get_parents_all()) are added to the document:
 * - 'tid' and 'im_vid_<vid>': term IDs (multi-valued).
 * - 'vid': vocabulary IDs (multi-valued).
 * - 'ts_vid_<vid>_names': space-separated, cleaned term names.
 * - 'sm_vid_<vocabulary name>': cleaned term names (multi-valued).
 *
 * @param $document
 *   The document to add fields to. It must support setMultiValue() and
 *   isset()/assignment on its fields.
 * @param $node
 *   The node whose ->taxonomy array (if any) is indexed.
 */
function apachesolr_add_taxonomy_to_document(&$document, $node) {
  if (!isset($node->taxonomy) || !is_array($node->taxonomy)) {
    return;
  }
  foreach ($node->taxonomy as $term) {
    // Double indexing of tids lets us do efficient searches (on tid)
    // and do accurate per-vocabulary faceting.
    // By including the ancestors to a term in the index we make
    // sure that searches for general categories match specific
    // categories, e.g. Fruit -> apple, a search for fruit will find
    // content categorized with apple.
    foreach (taxonomy_get_parents_all($term->tid) as $ancestor) {
      $document->setMultiValue('tid', $ancestor->tid);
      $document->setMultiValue('im_vid_' . $ancestor->vid, $ancestor->tid);
      $name = apachesolr_clean_text($ancestor->name);
      $document->setMultiValue('vid', $ancestor->vid);
      // Initialise the names field before appending: '.=' on a field that was
      // never set reads an undefined value and raises a notice/warning.
      $names_field = 'ts_vid_' . $ancestor->vid . '_names';
      if (!isset($document->{$names_field})) {
        $document->{$names_field} = '';
      }
      $document->{$names_field} .= ' ' . $name;
      // We index each name as a string for cross-site faceting
      // using the vocab name rather than vid in field construction.
      $document->setMultiValue('sm_vid_' . apachesolr_vocab_name($ancestor->vid), $name);
    }
  }
}
/**
 * Helper function - return a safe (PHP identifier) vocabulary name.
 *
 * The vocabulary name is read from {vocabulary} and every character outside
 * a-z, A-Z, 0-9, underscore and bytes 0x7f-0xff is replaced by an underscore.
 * Results are cached per request in a static variable.
 *
 * @param $vid
 *   The vocabulary ID.
 *
 * @return
 *   The sanitised name, or '_<vid>_' when the name is empty or consists only
 *   of underscores after sanitising.
 */
function apachesolr_vocab_name($vid) {
  static $names = array();
  if (!isset($names[$vid])) {
    $vocab_name = db_result(db_query('SELECT v.name FROM {vocabulary} v WHERE v.vid = %d', $vid));
    $safe_name = (string) preg_replace('/[^a-zA-Z0-9_\\x7f-\\xff]/', '_', (string) $vocab_name);
    // Fallback for names ending up all as '_'. Compare against the empty
    // string explicitly: a plain truthiness test would also reject the
    // perfectly usable name "0".
    if (rtrim($safe_name, '_') === '') {
      $safe_name = '_' . $vid . '_';
    }
    $names[$vid] = $safe_name;
  }
  return $names[$vid];
}
/**
 * Extract HTML tag contents from $text and add to boost fields.
 *
 * $text must be stripped of control characters before hand.
 *
 * The tag-to-field mapping is read from the 'apachesolr_tags_to_index'
 * variable. The contents of every matching tag are appended, preceded by a
 * space, to the mapped document field. Links whose text looks like a URL are
 * skipped.
 *
 * @param $document
 *   The document to add the boost fields to.
 * @param $text
 *   The markup to scan.
 */
function apachesolr_add_tags_to_document(&$document, $text) {
  $tags_to_index = variable_get('apachesolr_tags_to_index', array(
    'h1' => 'tags_h1',
    'h2' => 'tags_h2_h3',
    'h3' => 'tags_h2_h3',
    'h4' => 'tags_h4_h5_h6',
    'h5' => 'tags_h4_h5_h6',
    'h6' => 'tags_h4_h5_h6',
    'u' => 'tags_inline',
    'b' => 'tags_inline',
    'i' => 'tags_inline',
    'strong' => 'tags_inline',
    'em' => 'tags_inline',
    'a' => 'tags_a',
  ));
  $tag_names = array_keys($tags_to_index);
  // Strip off all ignored tags.
  $text = strip_tags($text, '<' . implode('><', $tag_names) . '>');
  preg_match_all('@<(' . implode('|', $tag_names) . ')[^>]*>(.*)</\\1>@Ui', $text, $matches);
  foreach ($matches[1] as $key => $tag) {
    $tag = strtolower($tag);
    $content = $matches[2][$key];
    // We don't want to index links auto-generated by the url filter.
    if ($tag == 'a' && preg_match('@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\\.)[a-zA-Z0-9]+@', $content)) {
      continue;
    }
    // Initialise the boost field before appending: '.=' on a field that was
    // never set reads an undefined value and raises a notice/warning.
    $field = $tags_to_index[$tag];
    if (!isset($document->{$field})) {
      $document->{$field} = '';
    }
    $document->{$field} .= ' ' . $content;
  }
}
/**
* Additional index utility functions
*/
/**
 * hook_cron() helper to try to make {apachesolr_search_node} consistent with {node}.
 *
 * Two passes are made:
 * - Rows whose status differs from {node} are handed to
 *   apachesolr_nodeapi_mass_update().
 * - Rows with no matching {node} row are handed to
 *   apachesolr_nodeapi_mass_delete().
 * In both passes the nodes are processed in batches of at most
 * 'apachesolr_cron_mass_limit' (default 500), and a pass stops at the first
 * batch for which the callback reports failure.
 */
function apachesolr_cron_check_node_table() {
  // Update or delete at most this many in each Solr query.
  $limit = variable_get('apachesolr_cron_mass_limit', 500);

  // Check for unpublished content that wasn't deleted from the index.
  $result = db_query("SELECT n.nid, n.status FROM {apachesolr_search_node} asn INNER JOIN {node} n ON n.nid = asn.nid WHERE asn.status <> n.status");
  foreach (_apachesolr_cron_node_batches($result, $limit) as $nodes) {
    watchdog('Apache Solr', t('On cron running apachesolr_nodeapi_mass_update() on nids @nids', array(
      '@nids' => implode(',', array_keys($nodes)),
    )), WATCHDOG_WARNING);
    if (!apachesolr_nodeapi_mass_update($nodes)) {
      // Solr query failed - so stop trying.
      break;
    }
  }

  // Check for deleted content that wasn't deleted from the index.
  $result = db_query("SELECT asn.nid FROM {apachesolr_search_node} asn LEFT JOIN {node} n ON n.nid = asn.nid WHERE n.nid IS NULL");
  foreach (_apachesolr_cron_node_batches($result, $limit) as $nodes) {
    watchdog('Apache Solr', t('On cron running apachesolr_nodeapi_mass_delete() on nids @nids', array(
      '@nids' => implode(',', array_keys($nodes)),
    )), WATCHDOG_WARNING);
    if (!apachesolr_nodeapi_mass_delete($nodes)) {
      // Solr query failed - so stop trying.
      break;
    }
  }
}

/**
 * Helper for apachesolr_cron_check_node_table(): split a result set into batches.
 *
 * @param $result
 *   A database result whose rows have a nid column.
 * @param $limit
 *   Number of rows after which a batch is closed.
 *
 * @return
 *   A list of arrays of row objects, each keyed by nid.
 */
function _apachesolr_cron_node_batches($result, $limit) {
  $batches = array();
  $batch = array();
  while ($row = db_fetch_object($result)) {
    $batch[$row->nid] = $row;
    if (count($batch) == $limit) {
      $batches[] = $batch;
      $batch = array();
    }
  }
  // Any remaining rows that did not fill a complete batch.
  if (count($batch)) {
    $batches[] = $batch;
  }
  return $batches;
}
/**
 * Sync the Solr index and {apachesolr_search_node} for a set of nodes.
 *
 * The documents of all unpublished nodes are deleted from Solr. When that
 * succeeds, the changed time and status of every given node are updated in
 * {apachesolr_search_node}.
 *
 * @param $nodes
 *   Array of objects, each with nid and status properties.
 *
 * @return
 *   TRUE on success (or when $nodes is empty), FALSE when an exception was
 *   thrown; the exception message is logged to watchdog.
 */
function apachesolr_nodeapi_mass_update($nodes) {
  if (empty($nodes)) {
    return TRUE;
  }
  // Map nid => Solr document ID, split by publication status.
  $published_ids = array();
  $unpublished_ids = array();
  foreach ($nodes as $node) {
    $document_id = apachesolr_document_id($node->nid);
    if (!$node->status) {
      $unpublished_ids[$node->nid] = $document_id;
      continue;
    }
    $published_ids[$node->nid] = $document_id;
  }
  $time = time();
  $success = FALSE;
  try {
    $solr = apachesolr_get_solr();
    $solr->deleteByMultipleIds($unpublished_ids);
    apachesolr_index_updated($time);
    // There was no exception, so update the table.
    if ($published_ids) {
      $args = array_merge(array($time), array_keys($published_ids));
      db_query('UPDATE {apachesolr_search_node} SET changed = %d, status = 1 WHERE nid IN (' . db_placeholders($published_ids) . ')', $args);
    }
    if ($unpublished_ids) {
      $args = array_merge(array($time), array_keys($unpublished_ids));
      db_query('UPDATE {apachesolr_search_node} SET changed = %d, status = 0 WHERE nid IN (' . db_placeholders($unpublished_ids) . ')', $args);
    }
    $success = TRUE;
  }
  catch (Exception $e) {
    $message = nl2br(check_plain($e->getMessage()));
    watchdog('Apache Solr', $message, WATCHDOG_ERROR);
  }
  return $success;
}
/**
 * Delete a set of nodes from the Solr index and from {apachesolr_search_node}.
 *
 * @param $nodes
 *   Array of objects, each with a nid property.
 *
 * @return
 *   TRUE on success (or when $nodes is empty), FALSE when an exception was
 *   thrown; the exception message is logged to watchdog and the table is
 *   left untouched.
 */
function apachesolr_nodeapi_mass_delete($nodes) {
  if (empty($nodes)) {
    return TRUE;
  }
  $ids = array();
  $nids = array();
  foreach ($nodes as $node) {
    $ids[] = apachesolr_document_id($node->nid);
    $nids[] = $node->nid;
  }
  // Timestamp reported to apachesolr_index_updated(), taken before talking to
  // Solr just like apachesolr_nodeapi_mass_update() does. Previously $time
  // was never defined here, so an undefined variable (NULL) was passed.
  $time = time();
  try {
    $solr = apachesolr_get_solr();
    $solr->deleteByMultipleIds($ids);
    apachesolr_index_updated($time);
    // There was no exception, so update the table.
    db_query("DELETE FROM {apachesolr_search_node} WHERE nid IN (" . db_placeholders($nids) . ")", $nids);
    return TRUE;
  }
  catch (Exception $e) {
    watchdog('Apache Solr', nl2br(check_plain($e->getMessage())), WATCHDOG_ERROR);
    return FALSE;
  }
}
Functions
Name | Description |
---|---|
apachesolr_add_node_document | Add a document to the $documents array based on a node ID. |
apachesolr_add_tags_to_document | Extract HTML tag contents from $text and add to boost fields. |
apachesolr_add_taxonomy_to_document | Extract taxonomy from $node and add to dynamic fields. |
apachesolr_clean_text | Strip html tags and also control characters that cause Jetty/Solr to fail. |
apachesolr_cron_check_node_table | hook_cron() helper to try to make {apachesolr_search_node} consistent with {node}. |
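apachesolr_nodeapi_mass_delete | Delete the given nodes from the Solr index and from {apachesolr_search_node}. |
apachesolr_nodeapi_mass_update | Delete unpublished nodes from the Solr index and update changed/status in {apachesolr_search_node}. |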
apachesolr_node_to_document | Given a node ID, return a document representing that node. |
apachesolr_vocab_name | Helper function - return a safe (PHP identifier) vocabulary name. |