You are here

function apachesolr_add_tags_to_document in Apache Solr Search 6

Same name and namespace in other branches
  1. 5.2 apachesolr.index.inc \apachesolr_add_tags_to_document()
  2. 6.2 apachesolr.index.inc \apachesolr_add_tags_to_document()

Extract HTML tag contents from $text and add to boost fields.

$text must be stripped of control characters beforehand.

1 call to apachesolr_add_tags_to_document()
apachesolr_node_to_document in ./apachesolr.index.inc
Given a node ID, return a document representing that node.

File

./apachesolr.index.inc, line 219
Functions used when indexing content to Apache Solr.

Code

/**
 * Extract HTML tag contents from $text and add them to boost fields.
 *
 * The tag-to-field map is read from the 'apachesolr_tags_to_index' variable.
 * Every tag that is not a key of the map is stripped from $text; the content
 * of each remaining tag is passed through apachesolr_clean_text() and
 * appended, separated by a space, to the mapped property of $document.
 *
 * @param $document
 *   The document object whose boost field properties are appended to.
 * @param $text
 *   The HTML text to scan. It must be stripped of control characters
 *   beforehand.
 */
function apachesolr_add_tags_to_document($document, $text) {
  $tags_to_index = variable_get('apachesolr_tags_to_index', array(
    'h1' => 'tags_h1',
    'h2' => 'tags_h2_h3',
    'h3' => 'tags_h2_h3',
    'h4' => 'tags_h4_h5_h6',
    'h5' => 'tags_h4_h5_h6',
    'h6' => 'tags_h4_h5_h6',
    'u' => 'tags_inline',
    'b' => 'tags_inline',
    'i' => 'tags_inline',
    'strong' => 'tags_inline',
    'em' => 'tags_inline',
    'a' => 'tags_a',
  ));

  // With no tags configured there is nothing to extract. An empty map would
  // also produce a pattern with an empty tag group and, further down, an
  // empty property name on the document.
  if (!is_array($tags_to_index) || empty($tags_to_index)) {
    return;
  }

  // Matched tag names are lower-cased before the lookup, so the map keys have
  // to be lower-case as well.
  $tags_to_index = array_change_key_case($tags_to_index, CASE_LOWER);
  $tag_names = array_keys($tags_to_index);

  // Strip off all ignored tags.
  $text = strip_tags($text, '<' . implode('><', $tag_names) . '>');

  // Escape the tag names so a configured name can never alter the pattern.
  $quoted_names = array();
  foreach ($tag_names as $tag_name) {
    $quoted_names[] = preg_quote($tag_name, '@');
  }

  // - \b keeps a short tag name (e.g. "b") from matching the start of a longer
  //   configured one (e.g. "blockquote").
  // - The "s" modifier lets the content span several lines; without it any
  //   tag whose content contains a newline is silently skipped.
  $tag_pattern = '@<(' . implode('|', $quoted_names) . ')\\b[^>]*>(.*)</\\1>@Uis';
  if (!preg_match_all($tag_pattern, $text, $matches)) {
    // No match, or the regular expression engine reported an error.
    return;
  }

  $auto_link_pattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\\.)[a-zA-Z0-9]+@';

  foreach ($matches[1] as $key => $tag) {
    $tag = strtolower($tag);
    $content = $matches[2][$key];

    // We don't want to index links auto-generated by the url filter.
    if ($tag === 'a' && preg_match($auto_link_pattern, $content)) {
      continue;
    }

    // Skip tags that are not mapped to a usable field name.
    if (empty($tags_to_index[$tag])) {
      continue;
    }
    $field = $tags_to_index[$tag];

    if (!isset($document->{$field})) {
      $document->{$field} = '';
    }
    $document->{$field} .= ' ' . apachesolr_clean_text($content);
  }
}