You are here

function apachesolr_attachments_get_attachment_text in Apache Solr Attachments 7

Same name and namespace in other branches
  1. 6.3 apachesolr_attachments.index.inc \apachesolr_attachments_get_attachment_text()
  2. 6 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
  3. 6.2 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()

Parse the attachment, getting just the raw text.

Throws

Exception

3 calls to apachesolr_attachments_get_attachment_text()
apachesolr_attachments_node_solr_document in ./apachesolr_attachments.module
Builds the file-specific information for a Solr document.
apachesolr_attachments_solr_document in ./apachesolr_attachments.module
Builds the file-specific information for a Solr document.
apachesolr_attachments_test_tika_extraction in ./apachesolr_attachments.admin.inc
Function to test if our extracting with tika succeeds

File

./apachesolr_attachments.index.inc, line 50
Indexing-related functions.

Code

/**
 * Parse the attachment, getting just the raw text.
 *
 * Plain-text files (text/plain, text/x-diff) are read directly from disk.
 * Every other file is sent to Tika or to Solr, depending on the
 * 'apachesolr_attachments_extract_using' variable. The extracted text is
 * stored in the 'cache_apachesolr_attachments_file_body' cache bin and the
 * file's sha256 hash is written to the file indexer table, so unchanged files
 * are not extracted again.
 *
 * @param object $file
 *   The file object. This function reads its fid, uri, filemime and filename
 *   properties, and filesize when it is set.
 *
 * @return string|false
 *   The cleaned, trimmed text of the file, or FALSE when the file is rejected
 *   by apachesolr_attachments_is_file(), exceeds the configured size limit,
 *   cannot be resolved to a local path, cannot be read or hashed, or when the
 *   Solr extraction raises an exception.
 *
 * @throws Exception
 *   NOTE(review): only the Solr extraction is wrapped in try/catch here, so
 *   this presumably originates from the Tika extraction helper - confirm.
 */
function apachesolr_attachments_get_attachment_text($file) {
  if (!apachesolr_attachments_is_file($file)) {
    return FALSE;
  }

  // Exclude files above the configured limit (a limit of 0 disables the
  // check).
  $filesize_limit = variable_get('apachesolr_attachments_filesize_limit', '41943040');
  if (isset($file->filesize) && $filesize_limit > 0 && $file->filesize > $filesize_limit) {
    watchdog('Apache Solr Attachments', 'Not processing file @filename with size @filesize bytes, which exceeds apachesolr_attachments_filesize_limit of @sizelimit bytes.', array(
      '@filesize' => $file->filesize,
      '@filename' => $file->filename,
      '@sizelimit' => $filesize_limit,
    ));
    return FALSE;
  }

  // Everything below needs a real filesystem path. Bail out early instead of
  // handing FALSE to file_get_contents() or to the extractors.
  $filepath = drupal_realpath($file->uri);
  if ($filepath === FALSE) {
    watchdog('Apache Solr Attachments', 'Could not resolve a local path for file @filename (%uri).', array(
      '@filename' => $file->filename,
      '%uri' => $file->uri,
    ), WATCHDOG_WARNING);
    return FALSE;
  }

  // No need to use java for plain text files.
  if (in_array($file->filemime, array('text/plain', 'text/x-diff'), TRUE)) {
    $text = file_get_contents($filepath);
    if ($text === FALSE) {
      // An unreadable file must not be indexed as if it were empty.
      watchdog('Apache Solr Attachments', 'Could not read file %filepath.', array(
        '%filepath' => $filepath,
      ), WATCHDOG_ERROR);
      return FALSE;
    }

    // TODO - try to detect encoding and convert to UTF-8.
    // Strip bad control characters.
    $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
    return trim(apachesolr_clean_text($text));
  }

  // hash_file() streams the file instead of loading it into memory, and
  // returns FALSE when the file cannot be read.
  $hash = hash_file('sha256', $filepath);
  if ($hash === FALSE) {
    watchdog('Apache Solr Attachments', 'Could not compute the sha256 hash of %filepath.', array(
      '%filepath' => $filepath,
    ), WATCHDOG_ERROR);
    return FALSE;
  }

  $indexer_table = apachesolr_get_indexer_table('file');
  $cache_id = 'entity_id:' . $file->fid;

  // Reuse the cached body only if the stored hash still matches the file. The
  // database is consulted only when there is a cached body to validate.
  $cached_body = cache_get($cache_id, 'cache_apachesolr_attachments_file_body');
  if ($cached_body !== FALSE) {
    // fetchField() yields FALSE when there is no row, which can never be
    // identical to the hash string.
    $stored_hash = db_query('SELECT hash FROM {' . $indexer_table . '} WHERE entity_id = :entity_id', array(
      ':entity_id' => $file->fid,
    ))->fetchField();

    // Strict comparison: loosely compared hex digests that look numeric
    // (e.g. "0e...") would be compared as numbers.
    if ($stored_hash === $hash) {
      // No need to re-extract.
      return $cached_body->data;
    }
  }

  if (variable_get('apachesolr_attachments_extract_using', 'tika') == 'tika') {
    $text = apachesolr_attachments_extract_using_tika($filepath);
  }
  else {
    // Extract using Solr. Only the text is needed; the second element
    // returned by the helper is ignored.
    try {
      list($text) = apachesolr_attachments_extract_using_solr($filepath);
    }
    catch (Exception $e) {
      // Exceptions from Solr may be transient, or indicate a problem with a
      // specific file.
      watchdog('Apache Solr Attachments', "Exception occurred sending %filepath to Solr\n!message", array(
        '%filepath' => $filepath,
        '!message' => nl2br(check_plain($e->getMessage())),
      ), WATCHDOG_ERROR);
      return FALSE;
    }
  }

  // Strip bad control characters.
  $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
  $text = trim(apachesolr_clean_text($text));

  // Save the hash to the DB and the extracted, cleaned text to the cache.
  db_update($indexer_table)
    ->fields(array('hash' => $hash))
    ->condition('entity_id', $file->fid)
    ->execute();
  cache_set($cache_id, $text, 'cache_apachesolr_attachments_file_body', CACHE_PERMANENT);

  return $text;
}