You are here

function apachesolr_attachments_get_attachment_text in Apache Solr Attachments 6

Same name and namespace in other branches
  1. 6.3 apachesolr_attachments.index.inc \apachesolr_attachments_get_attachment_text()
  2. 6.2 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
  3. 7 apachesolr_attachments.index.inc \apachesolr_attachments_get_attachment_text()

Parses the attachment and returns just its raw text.

Throws

Exception

1 call to apachesolr_attachments_get_attachment_text()
apachesolr_attachments_add_documents in ./apachesolr_attachments.admin.inc
Callback for apachesolr_index_nodes().

File

./apachesolr_attachments.admin.inc, line 404
Provides a file attachment search implementation for use with the Apache Solr module.

Code

/**
 * Parse the attachment getting just the raw text.
 *
 * Plain-text files (text/plain, text/x-diff) are read straight from disk.
 * Every other file is sent to Tika or Solr, depending on the
 * 'apachesolr_attachment_extract_using' variable, unless the row for this fid
 * in {apachesolr_attachments_files} already holds a body whose stored SHA-1
 * matches the file currently on disk.
 *
 * @param $file
 *   File object. The filepath, filemime and fid properties are read.
 *
 * @return
 *   The cleaned, trimmed text of the file, or FALSE when the path cannot be
 *   resolved, is not a regular file, cannot be read or hashed, or when the
 *   Solr extraction throws.
 *
 * @throws Exception
 *   The Tika extraction call is not wrapped in a try/catch here, so anything
 *   it throws reaches the caller. NOTE(review): confirm against
 *   apachesolr_attachments_extract_using_tika().
 */
function apachesolr_attachments_get_attachment_text($file) {
  // Watchdog limits the type to 16 characters, so the shortened project name
  // is used for every log entry written by this function.
  $log_type = 'ApacheSolrAttach';

  // Any down-side to using realpath()?
  $filepath = realpath($file->filepath);

  // Check that we have a valid filepath.
  if (!$filepath) {
    return FALSE;
  }
  if (!is_file($filepath)) {
    watchdog($log_type, '%filepath is not a valid file path', array(
      '%filepath' => $filepath,
    ), WATCHDOG_WARNING);
    return FALSE;
  }

  // No need to use java for plain text files.
  if ($file->filemime == 'text/plain' || $file->filemime == 'text/x-diff') {
    $text = file_get_contents($filepath);
    if ($text === FALSE) {
      // Report an unreadable file as a failure instead of as empty text.
      watchdog($log_type, 'Failed to read %filepath', array(
        '%filepath' => $filepath,
      ), WATCHDOG_ERROR);
      return FALSE;
    }

    // TODO - try to detect encoding and convert to UTF-8.
    // Strip bad control characters.
    $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
    return trim(apachesolr_clean_text($text));
  }

  $sha1 = sha1_file($filepath);
  if ($sha1 === FALSE) {
    watchdog($log_type, 'Failed to calculate hash of %filepath', array(
      '%filepath' => $filepath,
    ), WATCHDOG_ERROR);
    return FALSE;
  }

  // db_fetch_array() yields FALSE when there is no row for this fid. The
  // hashes are compared with === because == treats digests that look like
  // numbers (e.g. "0e1234...") as equal even when the strings differ.
  $cached = db_fetch_array(db_query("SELECT * FROM {apachesolr_attachments_files} WHERE fid = %d", $file->fid));
  if ($cached && isset($cached['body']) && $cached['sha1'] === $sha1) {
    // No need to re-extract.
    return $cached['body'];
  }

  if (variable_get('apachesolr_attachment_extract_using', 'tika') == 'tika') {
    $text = apachesolr_attachments_extract_using_tika($filepath);
  }
  else {
    // Extract using Solr. Only the text (first element) is used here.
    try {
      list($text) = apachesolr_attachments_extract_using_solr($filepath);
    }
    catch (Exception $e) {
      // Exceptions from Solr may be transient, or indicate a problem with a
      // specific file.
      watchdog($log_type, "Exception occured sending %filepath to Solr\n!message", array(
        '%filepath' => $file->filepath,
        '!message' => nl2br(check_plain($e->getMessage())),
      ), WATCHDOG_ERROR);
      return FALSE;
    }
  }

  // Strip bad control characters.
  $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
  $text = trim(apachesolr_clean_text($text));

  // Save the extracted, cleaned text to the DB together with the hash it was
  // extracted from, so an unchanged file is not re-extracted next time.
  db_query("UPDATE {apachesolr_attachments_files} SET sha1 = '%s', body = '%s' WHERE fid = %d", $sha1, $text, $file->fid);
  return $text;
}