You are here

processor_highlight.inc in Search API 7

Contains the SearchApiHighlight class.

File

includes/processor_highlight.inc
View source
<?php

/**
 * @file
 * Contains the SearchApiHighlight class.
 */

/**
 * Processor for highlighting search results.
 */
class SearchApiHighlight extends SearchApiAbstractProcessor {

  /**
   * PREG regular expression for a word boundary.
   *
   * We highlight around non-indexable or CJK characters.
   *
   * This is a pattern fragment without delimiters. It is assigned in the
   * constructor and embedded into the regular expressions which are used for
   * matching keywords as complete words.
   *
   * @var string
   */
  protected static $boundary;

  /**
   * PREG regular expression for splitting words.
   *
   * This is a complete pattern, including delimiters and modifiers. It is
   * assigned in the constructor and used for splitting a keywords string into
   * individual words.
   *
   * @var string
   */
  protected static $split;

  /**
   * {@inheritdoc}
   */
  public function __construct(SearchApiIndex $index, array $options = array()) {
    parent::__construct($index, $options);

    // Unicode ranges of the CJK characters which count as word boundaries in
    // addition to the characters in PREG_CLASS_UNICODE_WORD_BOUNDARY.
    $cjk_ranges = array(
      '\\x{1100}-\\x{11FF}',
      '\\x{3040}-\\x{309F}',
      '\\x{30A1}-\\x{318E}',
      '\\x{31A0}-\\x{31B7}',
      '\\x{31F0}-\\x{31FF}',
      '\\x{3400}-\\x{4DBF}',
      '\\x{4E00}-\\x{9FCF}',
      '\\x{A000}-\\x{A48F}',
      '\\x{A4D0}-\\x{A4FD}',
      '\\x{A960}-\\x{A97F}',
      '\\x{AC00}-\\x{D7FF}',
      '\\x{F900}-\\x{FAFF}',
      '\\x{FF21}-\\x{FF3A}',
      '\\x{FF41}-\\x{FF5A}',
      '\\x{FF66}-\\x{FFDC}',
      '\\x{20000}-\\x{2FFFD}',
      '\\x{30000}-\\x{3FFFD}',
    );
    $boundary_class = '[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . implode('', $cjk_ranges) . ']';

    // A boundary is any position which directly follows or directly precedes
    // one of the boundary characters.
    self::$boundary = '(?:(?<=' . $boundary_class . ')|(?=' . $boundary_class . '))';

    // Keyword strings are split only at PREG_CLASS_UNICODE_WORD_BOUNDARY
    // characters; the CJK ranges are not part of this pattern.
    self::$split = '/[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . ']+/iu';
  }

  /**
   * {@inheritdoc}
   */
  public function configurationForm() {
    // Settings which are not present in the options fall back to these
    // defaults. The merged result is kept on the processor.
    $this->options += array(
      'prefix' => '<strong>',
      'suffix' => '</strong>',
      'excerpt' => TRUE,
      'excerpt_length' => 256,
      'highlight' => 'always',
      'highlight_partial' => FALSE,
      'exclude_fields' => array(),
    );
    $settings = $this->options;

    // Build the list of fulltext fields which can be excluded from the
    // excerpt, labelled as "Name (machine_name)".
    $index_fields = $this->index->getFields();
    $excludable_fields = array();
    foreach ($this->index->getFulltextFields() as $field_name) {
      if (!isset($index_fields[$field_name])) {
        continue;
      }
      $label = $index_fields[$field_name]['name'] . ' (' . $field_name . ')';
      $excludable_fields[$field_name] = check_plain($label);
    }

    // The excerpt length is only shown while "Create excerpt" is checked.
    $excerpt_length_states = array(
      'visible' => array(
        '#edit-processors-search-api-highlighting-settings-excerpt' => array(
          'checked' => TRUE,
        ),
      ),
    );

    $form = array(
      'prefix' => array(
        '#type' => 'textfield',
        '#title' => t('Highlighting prefix'),
        '#description' => t('Text/HTML that will be prepended to all occurrences of search keywords in highlighted text.'),
        '#default_value' => $settings['prefix'],
      ),
      'suffix' => array(
        '#type' => 'textfield',
        '#title' => t('Highlighting suffix'),
        '#description' => t('Text/HTML that will be appended to all occurrences of search keywords in highlighted text.'),
        '#default_value' => $settings['suffix'],
      ),
      'excerpt' => array(
        '#type' => 'checkbox',
        '#title' => t('Create excerpt'),
        '#description' => t('When enabled, an excerpt will be created for searches with keywords, containing all occurrences of keywords in a fulltext field.'),
        '#default_value' => $settings['excerpt'],
      ),
      'excerpt_length' => array(
        '#type' => 'textfield',
        '#title' => t('Excerpt length'),
        '#description' => t('The requested length of the excerpt, in characters.'),
        '#default_value' => $settings['excerpt_length'],
        '#element_validate' => array(
          'element_validate_integer_positive',
        ),
        '#states' => $excerpt_length_states,
      ),
      'exclude_fields' => array(
        '#type' => 'checkboxes',
        '#title' => t('Exclude fields from excerpt'),
        '#description' => t('Exclude certain fulltext fields from being displayed in the excerpt.'),
        '#options' => $excludable_fields,
        '#default_value' => $settings['exclude_fields'],
        '#attributes' => array(
          'class' => array(
            'search-api-checkboxes-list',
          ),
        ),
      ),
      'highlight' => array(
        '#type' => 'select',
        '#title' => t('Highlight returned field data'),
        '#description' => t('Select whether returned fields should be highlighted.'),
        '#options' => array(
          'always' => t('Always'),
          'server' => t('If the server returns fields'),
          'never' => t('Never'),
        ),
        '#default_value' => $settings['highlight'],
      ),
      'highlight_partial' => array(
        '#type' => 'checkbox',
        '#title' => t('Highlight partial matches'),
        '#description' => t('When enabled, matches in parts of words will be highlighted as well.'),
        '#default_value' => $settings['highlight_partial'],
      ),
    );

    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function configurationFormValidate(array $form, array &$values, array &$form_state) {
    // Only keep the fields whose checkbox was actually ticked. If the value is
    // missing or not an array (e.g., for programmatic submissions), treat it as
    // "no fields excluded" instead of passing an invalid value to
    // array_filter().
    $excluded = array();
    if (isset($values['exclude_fields']) && is_array($values['exclude_fields'])) {
      $excluded = $values['exclude_fields'];
    }
    $values['exclude_fields'] = array_filter($excluded);
  }

  /**
   * {@inheritdoc}
   */
  public function postprocessSearchResults(array &$response, SearchApiQuery $query) {
    // Nothing to do without results.
    if (empty($response['results'])) {
      return;
    }
    // Nothing to do without positive keywords, either.
    $keys = $this->getKeywords($query);
    if (!$keys) {
      return;
    }

    // Determine the fulltext fields to use, minus the excluded ones.
    $fulltext_fields = $this->index->getFulltextFields();
    if (!empty($this->options['exclude_fields'])) {
      $fulltext_fields = drupal_map_assoc($fulltext_fields);
      foreach ($this->options['exclude_fields'] as $excluded) {
        unset($fulltext_fields[$excluded]);
      }
    }

    $create_excerpt = $this->options['excerpt'];
    $highlight_mode = $this->options['highlight'];

    foreach (array_keys($response['results']) as $id) {
      if ($create_excerpt) {
        // Gather the text of all fulltext fields into one flat list and build
        // the excerpt from it.
        $parts = array();
        foreach ($this->getFulltextFields($response['results'], $id, $fulltext_fields) as $data) {
          if (!is_array($data)) {
            $parts[] = $data;
          }
          else {
            $parts = array_merge($parts, $data);
          }
        }
        $response['results'][$id]['excerpt'] = $this->createExcerpt($this->flattenArrayValues($parts), $keys);
      }

      if ($highlight_mode != 'never') {
        // Items are only loaded for highlighting if the mode is "always".
        $fields = $this->getFulltextFields($response['results'], $id, $fulltext_fields, $highlight_mode == 'always');
        foreach ($fields as $field => $data) {
          // The highlighted value already contains markup, so it is flagged
          // with a disabled sanitize callback.
          $highlighted = array(
            '#sanitize_callback' => FALSE,
          );
          if (is_array($data)) {
            foreach ($data as $delta => $value) {
              $highlighted['#value'][$delta] = $this->highlightField($value, $keys);
            }
          }
          else {
            $highlighted['#value'] = $this->highlightField($data, $keys);
          }
          $response['results'][$id]['fields'][$field] = $highlighted;
        }
      }
    }
  }

  /**
   * Collects the fulltext field data of a single search result.
   *
   * Field values already contained in the result are used directly. If
   * loading is allowed, all other fields are extracted from the result's item,
   * which is loaded (together with the items of all other results) if it is
   * not available yet. Loaded items are stored in the "entity" key of the
   * respective results.
   *
   * @param array $results
   *   All results returned in the search, by reference.
   * @param int|string $i
   *   The index in the results array of the result whose data should be
   *   returned.
   * @param array $fulltext_fields
   *   The fulltext fields from which the excerpt should be created.
   * @param bool $load
   *   TRUE if the item should be loaded if necessary, FALSE if only fields
   *   already returned in the results should be used.
   *
   * @return array
   *   An array containing fulltext field names mapped to the text data
   *   contained in them for the given result.
   */
  protected function getFulltextFields(array &$results, $i, array $fulltext_fields, $load = TRUE) {
    global $language;

    $result =& $results[$i];

    // An item which is already loaded can be used regardless of $load.
    if (!empty($result['entity'])) {
      $load = TRUE;
    }
    $result += array(
      'fields' => array(),
    );

    // The index's field definitions are only required for extracting values.
    $index_fields = $load ? $this->index->getFields() : array();

    // Use the values returned by the server wherever possible and remember
    // which fields still have to be extracted from the item.
    $wanted = array_flip($fulltext_fields);
    $returned = search_api_get_sanitized_field_values(array_intersect_key($result['fields'], $wanted));
    $data = array();
    $missing = array();
    foreach ($fulltext_fields as $name) {
      if (array_key_exists($name, $returned)) {
        $data[$name] = $returned[$name];
        continue;
      }
      if ($load) {
        $missing[$name] = $index_fields[$name];
      }
    }
    if (!$missing) {
      return $data;
    }

    if (empty($result['entity'])) {
      // Load the items of all results at once and store them in the results.
      foreach ($this->index->loadItems(array_keys($results)) as $id => $item) {
        $results[$id]['entity'] = $item;
      }
      // Give up if the item for this result still isn't available.
      if (empty($result['entity'])) {
        return $data;
      }
    }

    // Extract the missing values from the item, in the current language.
    $wrapper = $this->index->entityWrapper($result['entity'], FALSE);
    $wrapper->language($language->language);
    $extracted = search_api_extract_fields($wrapper, $missing, array(
      'sanitize' => TRUE,
    ));
    foreach ($extracted as $name => $info) {
      if (isset($info['value'])) {
        $data[$name] = $info['value'];
      }
    }

    return $data;
  }

  /**
   * Extracts the positive keywords used in a search query.
   *
   * Keys in array form are flattened with flattenKeysArray(). A keys string is
   * split into words, surrounding quotes and spaces are removed from each word
   * and empty words are dropped. The keyword "0" is kept.
   *
   * @param SearchApiQuery $query
   *   The query from which to extract the keywords.
   *
   * @return array
   *   An array of all unique positive keywords used in the query. For a keys
   *   string, each keyword is used as both key and value.
   */
  protected function getKeywords(SearchApiQuery $query) {
    $keys = $query->getKeys();
    if (is_array($keys)) {
      return $this->flattenKeysArray($keys);
    }

    // Compare against the empty string explicitly: a plain truthiness check
    // would also discard a search for "0".
    $keys = (string) $keys;
    if ($keys === '') {
      return array();
    }

    $words = preg_split(self::$split, $keys, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === FALSE) {
      // Splitting failed (e.g., because of invalid UTF-8 in the keys).
      return array();
    }

    // Remove quotes from the keywords and key the array by keyword, which
    // also eliminates duplicates.
    $keywords = array();
    foreach ($words as $word) {
      $word = trim($word, "'\" ");
      if ($word !== '') {
        $keywords[$word] = $word;
      }
    }
    return $keywords;
  }

  /**
   * Collects the positive keywords of a (possibly nested) keys array.
   *
   * Arrays with a non-empty "#negation" setting don't contribute any
   * keywords, and entries for which element_child() returns FALSE are
   * skipped.
   *
   * @param array $keys
   *   A search keys array, as specified by SearchApiQueryInterface::getKeys().
   *
   * @return array
   *   An array of all unique positive keywords contained in the keys.
   */
  protected function flattenKeysArray(array $keys) {
    $keywords = array();

    if (empty($keys['#negation'])) {
      foreach ($keys as $name => $value) {
        if (element_child($name)) {
          if (is_array($value)) {
            // Nested keys: add their keywords, keeping existing entries.
            $keywords += $this->flattenKeysArray($value);
          }
          else {
            $keywords[$value] = trim($value);
          }
        }
      }
    }

    return $keywords;
  }

  /**
   * Returns snippets from a piece of text, with certain keywords highlighted.
   *
   * Largely copied from search_excerpt().
   *
   * The text is stripped of HTML tags, its entities are decoded and its
   * whitespace is normalized. Then, fragments around occurrences of the
   * keywords are collected until the configured excerpt length is reached or
   * no more occurrences are found. Overlapping fragments are merged, joined
   * with the (translatable) "..." separator, escaped and finally highlighted.
   *
   * All positions and lengths are computed with PHP's byte-based string
   * functions (strlen(), strpos(), substr()). Fragments always start and end
   * at a space or at the start/end of the text.
   *
   * @param string $text
   *   The text to extract fragments from.
   * @param array $keys
   *   Search keywords entered by the user.
   *
   * @return string|null
   *   A string containing HTML for the excerpt, or NULL if none could be
   *   created.
   */
  protected function createExcerpt($text, array $keys) {

    // Prepare text by stripping HTML tags and decoding HTML entities. Spaces
    // are inserted around the tags first, so that words which are only
    // separated by markup don't end up glued together.
    $text = strip_tags(str_replace(array(
      '<',
      '>',
    ), array(
      ' <',
      '> ',
    ), $text));
    $text = decode_entities($text);
    $text = preg_replace('/\\s+/', ' ', $text);
    $text = trim($text, ' ');
    $text_length = strlen($text);

    // Try to reach the requested excerpt length with about two fragments (each
    // with a keyword and some context).
    $ranges = array();
    $length = 0;
    $look_start = array();
    $remaining_keys = $keys;

    // Get the set excerpt length from the configuration. If the length is too
    // small, only use one fragment.
    $excerpt_length = $this->options['excerpt_length'];
    $context_length = (int) round($excerpt_length / 4) - 3;
    if ($context_length < 32) {
      $context_length = (int) round($excerpt_length / 2) - 1;
    }

    // Since whole-word matching requires a match on a word boundary, it is
    // done on a copy of $text which starts and ends with a space. Offsets in
    // this copy are larger by one than the corresponding offsets in $text.
    $whole_words = empty($this->options['highlight_partial']);
    $padded_text = ' ' . $text . ' ';

    while ($length < $excerpt_length && !empty($remaining_keys)) {
      $found_keys = array();
      foreach ($remaining_keys as $key) {
        if ($length >= $excerpt_length) {
          break;
        }

        // An empty keyword would "match" everywhere without marking any real
        // occurrence, so it cannot contribute a fragment.
        if ((string) $key === '') {
          continue;
        }

        // Remember where we last found $key, in case we are coming through a
        // second time. These positions are relative to $text.
        if (!isset($look_start[$key])) {
          $look_start[$key] = 0;
        }

        // See if we can find $key after where we found it the last time.
        if ($whole_words) {
          $found_position = FALSE;
          $matches = array();
          $regex = '/' . static::$boundary . preg_quote($key, '/') . static::$boundary . '/iu';

          // Convert between positions in $text and in the padded copy, so
          // that $found_position is relative to $text in both branches.
          if (preg_match($regex, $padded_text, $matches, PREG_OFFSET_CAPTURE, $look_start[$key] + 1)) {
            $found_position = $matches[0][1] - 1;
          }
        }
        else {
          $found_position = stripos($text, $key, $look_start[$key]);
        }
        if ($found_position !== FALSE) {
          $look_start[$key] = $found_position + 1;

          // Keep track of which keys we found this time, in case we need to
          // pass through again to find more text.
          $found_keys[] = $key;

          // Locate a space before and after this match, leaving some context on
          // each end.
          if ($found_position > $context_length) {
            $before = strpos($text, ' ', $found_position - $context_length);
            if ($before !== FALSE) {
              ++$before;
            }
          }
          else {
            $before = 0;
          }
          if ($before !== FALSE && $before <= $found_position) {
            if ($text_length > $found_position + $context_length) {
              $after = strrpos(substr($text, 0, $found_position + $context_length), ' ', $found_position);
            }
            else {
              $after = $text_length;
            }
            if ($after !== FALSE && $after > $found_position) {
              if ($before < $after) {

                // Save this range.
                $ranges[$before] = $after;
                $length += $after - $before;
              }
            }
          }
        }
      }

      // Next time through this loop, only look for keys we found this time,
      // if any.
      $remaining_keys = $found_keys;
    }
    if (!$ranges) {

      // We didn't find any keyword matches, return NULL.
      return NULL;
    }

    // Sort the text ranges by starting position.
    ksort($ranges);

    // Collapse overlapping text ranges into one. The sorting makes it O(n).
    $newranges = array();
    $from1 = $to1 = NULL;
    foreach ($ranges as $from2 => $to2) {
      if ($from1 === NULL) {

        // This is the first time through this loop: initialize.
        $from1 = $from2;
        $to1 = $to2;
        continue;
      }
      if ($from2 <= $to1) {

        // The ranges overlap: combine them.
        $to1 = max($to1, $to2);
      }
      else {

        // The ranges do not overlap: save the working range and start a new
        // one.
        $newranges[$from1] = $to1;
        $from1 = $from2;
        $to1 = $to2;
      }
    }

    // Save the remaining working range.
    $newranges[$from1] = $to1;

    // Fetch text within the combined ranges we found.
    $out = array();
    foreach ($newranges as $from => $to) {
      $out[] = substr($text, $from, $to - $from);
    }
    if (!$out) {
      return NULL;
    }

    // Let translators have the ... separator text as one chunk. The leading
    // separator is omitted if the excerpt starts at the beginning of the text.
    $dots = explode('!excerpt', t('... !excerpt ... !excerpt ...'));
    $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2];
    $text = check_plain($text);

    // Since we stripped the tags at the beginning, highlighting doesn't need to
    // handle HTML anymore.
    return $this
      ->highlightField($text, $keys, FALSE);
  }

  /**
   * Marks occurrences of the search keywords in a text field.
   *
   * Each occurrence is wrapped in the configured highlighting prefix and
   * suffix. Unless "Highlight partial matches" is enabled, only occurrences
   * which are complete words are marked. Matching is case-insensitive.
   *
   * @param string $text
   *   The text of the field. An array of texts is flattened into a single
   *   string first.
   * @param array $keys
   *   Search keywords entered by the user. Empty keywords are ignored.
   * @param bool $html
   *   Whether the text can contain HTML tags or not. In the former case, text
   *   inside tags (i.e., tag names and attributes) won't be highlighted.
   *
   * @return string
   *   The field's text with all occurrences of search keywords highlighted. If
   *   there are no (non-empty) keywords, or if the text cannot be processed
   *   by the regular expression functions, the text is returned unchanged.
   */
  protected function highlightField($text, array $keys, $html = TRUE) {
    if (is_array($text)) {
      $text = $this
        ->flattenArrayValues($text);
    }
    $text = (string) $text;

    if ($html) {
      // Split the text into alternating chunks of plain text (even indexes)
      // and runs of HTML tags (odd indexes), and only highlight the former.
      // Attribute values may be enclosed in double or single quotes and can
      // then contain ">" characters.
      $texts = preg_split('#((?:</?[[:alpha:]](?:[^>"\']*|"[^"]*"|\'[^\']*\')*>)+)#i', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
      if ($texts === FALSE) {
        // Without knowing where the tags are, we cannot highlight safely.
        return $text;
      }
      $count = count($texts);
      for ($i = 0; $i < $count; $i += 2) {
        $texts[$i] = $this
          ->highlightField($texts[$i], $keys, FALSE);
      }
      return implode('', $texts);
    }

    // Build the alternation of all keywords. Empty keywords have to be
    // skipped, as an empty alternative would match at every position.
    $patterns = array();
    foreach ($keys as $key) {
      $key = (string) $key;
      if ($key !== '') {
        $patterns[] = preg_quote($key, '/');
      }
    }
    if (!$patterns) {
      return $text;
    }

    // If "Highlight partial matches" is disabled, we only want to highlight
    // matches that are complete words. Otherwise, we want all of them.
    $boundary = empty($this->options['highlight_partial']) ? self::$boundary : '';
    $regex = '/' . $boundary . '(?:' . implode('|', $patterns) . ')' . $boundary . '/iu';
    $replace = $this->options['prefix'] . '\\0' . $this->options['suffix'];

    // The text is padded with spaces so keywords at the very start or end are
    // surrounded by word boundaries, too. The padding is removed afterwards.
    $highlighted = preg_replace($regex, $replace, ' ' . $text . ' ');
    if ($highlighted === NULL) {
      // The replacement failed (e.g., due to invalid UTF-8 in the text).
      return $text;
    }
    return substr($highlighted, 1, -1);
  }

  /**
   * Joins all values of a (possibly nested) array into a single string.
   *
   * Nested arrays are flattened recursively, using the same separator, before
   * being joined with their sibling values.
   *
   * @param array $array
   *   The array to flatten.
   * @param string $glue
   *   (optional) The separator to insert between individual array items.
   *
   * @return string
   *   The glued string.
   */
  protected function flattenArrayValues(array $array, $glue = " \n\n ") {
    $pieces = array();
    foreach ($array as $value) {
      $pieces[] = is_array($value) ? $this->flattenArrayValues($value, $glue) : $value;
    }
    return implode($glue, $pieces);
  }

}

Classes

Name (sort descending) | Description
SearchApiHighlight | Processor for highlighting search results.