function apachesolr_attachments_get_attachment_text in Apache Solr Attachments 6.3
Same name and namespace in other branches
- 6 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
- 6.2 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
- 7 apachesolr_attachments.index.inc \apachesolr_attachments_get_attachment_text()
Parses the attachment and returns just its raw text.
Throws
Exception
2 calls to apachesolr_attachments_get_attachment_text()
- apachesolr_attachments_solr_document in ./
apachesolr_attachments.module - Builds the file-specific information for a Solr document.
- apachesolr_attachments_test_tika_extraction in ./
apachesolr_attachments.admin.inc - Tests whether text extraction with Tika succeeds.
File
- ./
apachesolr_attachments.index.inc, line 50 - Indexing-related functions.
Code
/**
 * Parse the attachment getting just the raw text.
 *
 * Plain-text files (text/plain, text/x-diff) are read directly. Any other
 * file is extracted with Tika or Solr, depending on the
 * 'apachesolr_attachments_extract_using' variable. The cleaned result is
 * written to the file indexer table together with a SHA-256 hash of the file
 * contents, so a file whose hash has not changed is not extracted again.
 *
 * @param object $file
 *   File object. The fid, uri and filemime properties are read.
 *
 * @return string|false
 *   The cleaned text, or FALSE when $file is not accepted by
 *   apachesolr_attachments_is_file(), the file cannot be read or hashed, or
 *   extraction through Solr throws.
 *
 * @throws Exception
 *   The Tika extraction call and the database calls are not wrapped in a
 *   try/catch here, so anything they throw propagates to the caller.
 */
function apachesolr_attachments_get_attachment_text($file) {
  $indexer_table = apachesolr_get_indexer_table('file');

  if (!apachesolr_attachments_is_file($file)) {
    return FALSE;
  }

  $filepath = drupal_realpath($file->uri);

  // No need to use java for plain text files.
  if ($file->filemime == 'text/plain' || $file->filemime == 'text/x-diff') {
    $text = file_get_contents($filepath);
    if ($text === FALSE) {
      // Report an unreadable file instead of silently indexing empty text.
      watchdog('Apache Solr Attachments', 'Could not read %filepath', array(
        '%filepath' => $file->uri,
      ), WATCHDOG_ERROR);
      return FALSE;
    }
    // TODO - try to detect encoding and convert to UTF-8.
    // Strip bad control characters.
    $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
    return trim(apachesolr_clean_text($text));
  }

  // Check for the algorithm up front: on PHP 8 an unknown algorithm raises a
  // ValueError rather than returning FALSE.
  if (!in_array('sha256', hash_algos(), TRUE)) {
    watchdog('Apache Solr Attachments', 'sha256 hash algorithm is not supported', NULL, WATCHDOG_ERROR);
    return FALSE;
  }

  // hash_file() streams the file instead of loading it fully into memory and
  // produces the same digest as hashing the file contents, so hashes already
  // stored in the indexer table remain comparable.
  $hash = hash_file('sha256', $filepath);
  if ($hash === FALSE) {
    watchdog('Apache Solr Attachments', 'Could not compute the sha256 hash of %filepath', array(
      '%filepath' => $file->uri,
    ), WATCHDOG_ERROR);
    return FALSE;
  }

  $cached = db_query("SELECT * FROM {{$indexer_table}} WHERE entity_id = :entity_id", array(
    ':entity_id' => $file->fid,
  ))
    ->fetchAssoc();

  // fetchAssoc() yields FALSE when there is no row; isset() copes with that
  // and with a NULL body. The strict comparison prevents two different
  // numeric-looking hex digests (e.g. "0e...") from being treated as equal.
  if (isset($cached['body'], $cached['hash']) && $cached['hash'] === $hash) {
    // No need to re-extract.
    return $cached['body'];
  }

  if (variable_get('apachesolr_attachments_extract_using', 'tika') == 'tika') {
    $text = apachesolr_attachments_extract_using_tika($filepath);
  }
  else {
    // Extract using Solr.
    try {
      // Only the first element (the text) of the returned pair is needed.
      list($text) = apachesolr_attachments_extract_using_solr($filepath);
    }
    catch (Exception $e) {
      // Exceptions from Solr may be transient, or indicate a problem with a
      // specific file.
      watchdog('Apache Solr Attachments', "Exception occurred sending %filepath to Solr\n!message", array(
        '%filepath' => $file->uri,
        '!message' => nl2br(check_plain($e->getMessage())),
      ), WATCHDOG_ERROR);
      return FALSE;
    }
  }

  // Strip bad control characters.
  $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
  $text = trim(apachesolr_clean_text($text));

  // Save the extracted, cleaned text to the DB. This is an update only: if no
  // row exists yet for this file, nothing is stored.
  db_update($indexer_table)
    ->fields(array(
      'hash' => $hash,
      'body' => $text,
    ))
    ->condition('entity_id', $file->fid)
    ->execute();

  return $text;
}