Provides a file attachment search implementation for use with the Apache Solr module


 * @file
 *   Provides a file attachment search implementation for use with the Apache Solr module
define('SOLR_ATTACHMENT_NS', 'apachesolr_attachment');
define('SOLR_ATTACHMENT_WD', 'Solr Attachments');

 * Implementation of hook_menu().
function apachesolr_attachments_menu($may_cache) {
  $items = array();
  if ($may_cache) {
    $items[] = array(
      'path' => 'admin/settings/apachesolr/attachments',
      'title' => t('Apache Solr Attachments Settings'),
      'description' => t('Administer Apache Solr Attachments'),
      'callback' => 'drupal_get_form',
      'callback arguments' => 'apachesolr_attachments_settings',
      'access' => user_access('administer site configuration'),
  return $items;

 * Displays the Attachment Settings Form.
function apachesolr_attachments_settings() {
  $instruction_text = 'For each type of attachment, enter the path to the helper application installed on your server. "%file%" is a placeholder for the path of the attachment file and is required. If you don\'t want to search a type of attachment, leave the path setting blank (i.e., remove the content from the appropriate field below).';
  $form['instructions'] = array(
    '#type' => 'markup',
    '#value' => t($instruction_text),
  $form['apachesolr_attachment_pdf_path'] = array(
    '#type' => 'textfield',
    '#title' => t('PDF Helper'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t("The full path to the helper for application/pdf files, plus any other parameters needed by the helper."),
    '#default_value' => variable_get('apachesolr_attachment_pdf_path', ''),
  $form['apachesolr_attachment_txt_path'] = array(
    '#type' => 'textfield',
    '#title' => t('Text Helper'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t("The full path to the helper for text/plain files, plus any other parameters needed by the helper."),
    '#default_value' => variable_get('apachesolr_attachment_txt_path', ''),
  $form['apachesolr_attachment_doc_path'] = array(
    '#type' => 'textfield',
    '#title' => t('Word Doc Helper'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t("The full path to the helper for application/msword files, plus any other parameters needed by the helper."),
    '#default_value' => variable_get('apachesolr_attachment_doc_path', ''),
  return system_settings_form($form);

 * Implementation of hook_search().
function apachesolr_attachments_search($op = 'search', $keys = NULL) {
  switch ($op) {
    case 'name':

      // We dont want a tab
      return '';
    case 'reset':
    case 'status':

      // Figure out a way to know how many are left to update, or expose it as part of the apachesolr module
    case 'search':
      return apachesolr_search_search($op, $keys);

 * Hook is called by search.module to add things to the search index.
 * In our case we will search content types and add any CCK type that
 * is a file type that we know how to parse and any uploaded file
 * attachments.
function apachesolr_attachments_update_index() {
  $result = ApacheSolrUpdate::getNodesToIndex(SOLR_ATTACHMENT_NS);
  while ($row = db_fetch_object($result)) {

    // Variables to track the last item changed.
    $solr_last_change = $row->last_change;
    $solr_last_id = $row->nid;
    $node = node_load($row->nid);
    if ($node->nid) {

      // Since there is no notification for an attachment being unassociated with a
      // node (but that action will trigger it to be indexed again), lets remove
      // all indexed attachments then add all attached (if any)
      $files = _asa_get_indexable_files($node);
      if (!empty($files)) {

        // Update solr index.
        try {
          foreach ($files as $file) {

            // Some are arrays others are objects, treat them all as objects
            $file = (object) $file;
            $text = _asa_get_attachment_text($file);
            $text = trim($text);
            if (!empty($text)) {
              $document = new Apache_Solr_Document();
              $site = url(NULL, NULL, NULL, TRUE);
              $hash = md5($site);
              $document->site = $site;
              $document->hash = $hash;
              $document->url = file_create_url($file->filepath);
              $document->id = $file->fid;
              $document->nid = $node->nid;
              $document->title = $file->filename;
              $document->changed = $node->changed;
              $document->uid = $node->uid;
              $document->body = $text;
              $document->text = "{$file->description} {$file->filename} {$text}";
              $document->type = $node->type;
              $document->bsfield_isfile = TRUE;
              _as_configure_taxonomy($document, $node);

              // Let modules add to the document
              foreach (module_implements('apachesolr_attachments_update_index') as $module) {
                $function = $module . '_apachesolr_attachments_update_index';
                $function($document, $node, $file);
              $documents[] = $document;
        } catch (Exception $e) {
          watchdog(SOLR_ATTACHMENT_WD, $e
            ->getMessage(), WATCHDOG_ERROR);
      ApacheSolrUpdate::success(SOLR_ATTACHMENT_NS, $solr_last_change, $solr_last_id);

 * Implementation of hook_nodeapi().
 * For a search result: Parse the nid and fid for a search result for potential use later.
 * For a delete: Remove all associated attachments from the Solr store.
function apachesolr_attachments_nodeapi($node, $op) {
  switch ($op) {
    case 'delete':

 * Implementation of hook_apachesolr_process_results().
 * When using the core Apache Solr module, everythign is treated as a node and as such 
 * the link and type wont be configured correctly if it is a file attachement, so override 
 * those values here if needed. 
function apachesolr_attachments_apachesolr_process_results($results) {
  if (is_array($results)) {
    foreach ($results as &$item) {
      if (isset($item['node']->bsfield_isfile) && $item['node']->bsfield_isfile === TRUE) {
        $nid = $item['node']->nid;
        $node_title = db_result(db_query("SELECT title FROM {node} WHERE nid = %d", $nid));
        $item['snippet'] = l($node_title, "node/{$nid}") . ': ' . $item['snippet'];

 * Return all file attachments for a particular node
function _asa_get_indexable_files($node) {
  $files = array();
  if (!empty($node->files)) {
    $files = array_merge($files, $node->files);
  $fields = _asa_get_cck_file_fields();
  foreach ($fields as $field) {
    if (!empty($node->{$field})) {
      $files = array_merge($files, $node->{$field});
  return $files;

 * Return all CCK fields that are of type 'file'
function _asa_get_cck_file_fields() {
  $file_fields = array();
  if (module_exists('filefield')) {
    $fields = content_fields();
    foreach ($fields as $key => $values) {
      if ($values['type'] == 'file') {
        $file_fields[] = $key;
  return $file_fields;

 * Parse the Attachment getting just the raw text, stripping any garbage characters that
 * could screw up the XML Doc processing.
function _asa_get_attachment_text($file) {
  $helper_command = _asa_get_file_helper_command($file->filemime);

  // Empty entries in settings mean that helper is disabled.
  if ($helper_command == '') {
    return '';

  // %file% is a token that is placed in the helper's parameter list to represent
  // the file path to the attachment.
  $helper_command = preg_replace('/%file%/', "{$file->filepath}", $helper_command);
  $helper_command = escapeshellcmd($helper_command);
  $text = shell_exec($helper_command);

  // Strip anything that might make the Solr integration barf.
  // Wierd control characters make things behave wierd, especially in XML
  $cleaned_text = iconv("utf-8", "utf-8//IGNORE", $text);

  // As per robertDouglass -
  // Bad control character. Do we need to make a hook for text cleanup?
  $cleaned_text = preg_replace('/\\x0C/', '', $cleaned_text);
  return $cleaned_text;

 * For a particular node id, remove all file attachments from the solr index.
function _asa_remove_attachments_from_index($nid) {
  try {
    $solr = _get_solr_instance();
      ->deleteByQuery("nid:{$nid} AND bsfield_isfile:true");
  } catch (Exception $e) {
    watchdog(SOLR_ATTACHMENT_WD, $e
      ->getMessage(), WATCHDOG_ERROR);

 * For a provided fid, get the file path.
function _asa_get_file_url($fid) {
  if (!empty($fid) && is_numeric($fid)) {
    $result = db_query('SELECT * FROM {files} WHERE fid = %d', $fid);
    $file = db_fetch_array($result);
    return $file['filepath'];

 * Get the command to parse text out of a particular mime type.
function _asa_get_file_helper_command($type) {

  // Determine helper based on file extension
  switch ($type) {
    case 'application/pdf':
      $cmd = variable_get('apachesolr_attachment_pdf_path', '');
    case 'text/plain':
      $cmd = variable_get('apachesolr_attachment_txt_path', '');
    case 'application/msword':
      $cmd = variable_get('apachesolr_attachment_doc_path', '');
      $cmd = '';
  return $cmd;

 * Get a reference to the Solr service.
function _asa_get_solr_instance() {
  try {
    return _get_solr_instance();
  } catch (Exception $e) {
    watchdog(SOLR_ATTACHMENT_WD, $e
      ->getMessage(), WATCHDOG_ERROR);
  return FALSE;


/** The following functions should become part of the Apache Solr module API          **/


 * Get a reference to the Solr service. This consolidates cal to varaible_get, etc.
function _get_solr_instance() {
  $host = variable_get('apachesolr_host', 'localhost');
  $port = variable_get('apachesolr_port', 8983);
  $path = variable_get('apachesolr_path', '/solr');
  $solr =& apachesolr_get_solr($host, $port, $path);
  if (!$solr
    ->ping()) {
    throw new Exception(t('No Solr instance available'));
  return $solr;

 * Add taxonomy from the node to the solr document for the attachment.
function _as_configure_taxonomy($document, $node) {
  if (is_array($node->taxonomy)) {
    foreach ($node->taxonomy as $term) {
        ->setMultiValue('tid', $term->tid);

      // Double indexing of tids lets us do effecient searches (on tid)
      // and do accurate per-vocabulary faceting.
        ->setMultiValue('imfield_vid' . $term->vid, $term->tid);
        ->setMultiValue('vid', $term->vid);
        ->setMultiValue('taxonomy_name', $term->name);

 * Take the full list of Docs to submit to Solr and add them in batches.
function _as_index_documents($documents) {
  $solr = _asa_get_solr_instance();
  if (is_object($solr) && count($documents) > 0) {
    watchdog(SOLR_ATTACHMENT_WD, t("Adding @count documents to Solr", array(
      '@count' => count($documents),
    try {

      // Chunk the adds by 50s
      $docs_chunk = array_chunk($documents, 50);
      foreach ($docs_chunk as $docs) {
        ->optimize(FALSE, FALSE);
    } catch (Exception $e) {
      watchdog(SOLR_ATTACHMENT_WD, $e
        ->getMessage(), WATCHDOG_ERROR);


