You are here

function apachesolr_attachments_get_attachment_text in Apache Solr Attachments 6.3

Same name and namespace in other branches
  1. 6 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
  2. 6.2 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
  3. 7 apachesolr_attachments.index.inc \apachesolr_attachments_get_attachment_text()

Parses the attachment and returns just its raw text.

Throws

Exception

2 calls to apachesolr_attachments_get_attachment_text()
apachesolr_attachments_solr_document in ./apachesolr_attachments.module
Builds the file-specific information for a Solr document.
apachesolr_attachments_test_tika_extraction in ./apachesolr_attachments.admin.inc
Tests whether text extraction with Tika succeeds.

File

./apachesolr_attachments.index.inc, line 50
Indexing-related functions.

Code

/**
 * Parses the attachment and returns just its raw text.
 *
 * Plain-text files (text/plain, text/x-diff) are read directly. Any other
 * file is hashed with sha256 and, if the indexer table already holds a body
 * stored under the same hash, that body is returned. Otherwise the text is
 * extracted with Tika or Solr (depending on the
 * 'apachesolr_attachments_extract_using' variable), cleaned, written back to
 * the indexer table together with the hash, and returned.
 *
 * @param object $file
 *   The file object. This function reads its uri, filemime and fid
 *   properties.
 *
 * @return string|false
 *   The cleaned text, or FALSE when the file is rejected by
 *   apachesolr_attachments_is_file(), cannot be read, cannot be hashed, or
 *   the Solr extraction throws.
 *
 * @throws Exception
 *   Exceptions raised on the Tika extraction path are not caught here.
 */
function apachesolr_attachments_get_attachment_text($file) {
  if (!apachesolr_attachments_is_file($file)) {
    return FALSE;
  }

  $filepath = drupal_realpath($file->uri);
  // Bail out early when the URI cannot be resolved to a readable local path,
  // instead of hashing or indexing the result of a failed read.
  if (empty($filepath) || !is_readable($filepath)) {
    watchdog('Apache Solr Attachments', 'Could not read %filepath', array(
      '%filepath' => $file->uri,
    ), WATCHDOG_ERROR);
    return FALSE;
  }

  // No need to use java for plain text files.
  if ($file->filemime == 'text/plain' || $file->filemime == 'text/x-diff') {
    $text = file_get_contents($filepath);
    if ($text === FALSE) {
      watchdog('Apache Solr Attachments', 'Could not read %filepath', array(
        '%filepath' => $file->uri,
      ), WATCHDOG_ERROR);
      return FALSE;
    }

    // TODO - try to detect encoding and convert to UTF-8.
    // Strip bad control characters.
    $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
    return trim(apachesolr_clean_text($text));
  }

  // Stream the file through the hash function rather than loading the whole
  // attachment into memory; the digest is identical to hashing its contents.
  $hash = hash_file('sha256', $filepath);
  if ($hash === FALSE) {
    watchdog('Apache Solr Attachments', 'sha256 hash algorithm is not supported', NULL, WATCHDOG_ERROR);
    return FALSE;
  }

  $indexer_table = apachesolr_get_indexer_table('file');
  $cached = db_query("SELECT * FROM {{$indexer_table}} WHERE entity_id = :entity_id", array(
    ':entity_id' => $file->fid,
  ))
    ->fetchAssoc();
  // fetchAssoc() yields FALSE when there is no row for this file, so make
  // sure a row exists before looking at its columns.
  if ($cached && isset($cached['body']) && $cached['hash'] === $hash) {
    // No need to re-extract.
    return $cached['body'];
  }

  if (variable_get('apachesolr_attachments_extract_using', 'tika') == 'tika') {
    $text = apachesolr_attachments_extract_using_tika($filepath);
  }
  else {
    // Extract using Solr.
    try {
      // Only the extracted text (first element) is used here.
      list($text) = apachesolr_attachments_extract_using_solr($filepath);
    }
    catch (Exception $e) {
      // Exceptions from Solr may be transient, or indicate a problem with a
      // specific file.
      watchdog('Apache Solr Attachments', "Exception occurred sending %filepath to Solr\n!message", array(
        '%filepath' => $file->uri,
        '!message' => nl2br(check_plain($e->getMessage())),
      ), WATCHDOG_ERROR);
      return FALSE;
    }
  }

  // Strip bad control characters.
  $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
  $text = trim(apachesolr_clean_text($text));

  // Save the extracted, cleaned text to the DB.
  db_update($indexer_table)
    ->fields(array(
      'hash' => $hash,
      'body' => $text,
    ))
    ->condition('entity_id', $file->fid)
    ->execute();

  return $text;
}