function apachesolr_attachments_get_attachment_text in Apache Solr Attachments 7
Same name and namespace in other branches
- 6.3 apachesolr_attachments.index.inc \apachesolr_attachments_get_attachment_text()
- 6 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
- 6.2 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
Parses the attachment and returns just its raw text.
Throws
Exception
3 calls to apachesolr_attachments_get_attachment_text()
- apachesolr_attachments_node_solr_document in ./
apachesolr_attachments.module - Builds the file-specific information for a Solr document.
- apachesolr_attachments_solr_document in ./
apachesolr_attachments.module - Builds the file-specific information for a Solr document.
- apachesolr_attachments_test_tika_extraction in ./
apachesolr_attachments.admin.inc - Function to test whether text extraction with Tika succeeds.
File
- ./
apachesolr_attachments.index.inc, line 50 - Indexing-related functions.
Code
/**
 * Parse the attachment getting just the raw text.
 *
 * Plain-text files (text/plain, text/x-diff) are read directly. Any other
 * file is sent to Tika or Solr, depending on the
 * 'apachesolr_attachments_extract_using' variable. The extracted text is
 * cached per file id and reused as long as the SHA-256 hash stored in the
 * indexer table still matches the file on disk.
 *
 * @param object $file
 *   File object. This function reads its fid, uri, filemime, filename and
 *   (when set) filesize properties.
 *
 * @return string|false
 *   The cleaned text, or FALSE when the file is rejected by
 *   apachesolr_attachments_is_file(), exceeds the configured size limit,
 *   cannot be read or hashed, or when the Solr extraction throws.
 *
 * @throws Exception
 *   Only exceptions from the Solr extraction are caught here; anything thrown
 *   by the Tika extraction helper propagates to the caller.
 */
function apachesolr_attachments_get_attachment_text($file) {
  if (!apachesolr_attachments_is_file($file)) {
    return FALSE;
  }

  // Exclude files above the configured limit. A limit of 0 (or less) disables
  // the check.
  $filesize_limit = variable_get('apachesolr_attachments_filesize_limit', '41943040');
  if (isset($file->filesize) && $filesize_limit > 0 && $file->filesize > $filesize_limit) {
    watchdog('Apache Solr Attachments', 'Not processing file @filename with size @filesize bytes, which exceeds apachesolr_attachments_filesize_limit of @sizelimit bytes.', array(
      '@filesize' => $file->filesize,
      '@filename' => $file->filename,
      '@sizelimit' => $filesize_limit,
    ));
    return FALSE;
  }

  // drupal_realpath() may return FALSE, and the file may have disappeared or
  // be unreadable. Bail out early instead of hashing/extracting nothing.
  $filepath = drupal_realpath($file->uri);
  if (!$filepath || !is_readable($filepath)) {
    watchdog('Apache Solr Attachments', 'Unable to read file @filename at @uri; skipping text extraction.', array(
      '@filename' => $file->filename,
      '@uri' => $file->uri,
    ), WATCHDOG_WARNING);
    return FALSE;
  }

  // No need to use java for plain text files.
  if ($file->filemime == 'text/plain' || $file->filemime == 'text/x-diff') {
    $text = file_get_contents($filepath);
    if ($text === FALSE) {
      // PHP has already raised a warning describing the read failure.
      return FALSE;
    }
    // TODO - try to detect encoding and convert to UTF-8.
    // Strip bad control characters.
    $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
    return trim(apachesolr_clean_text($text));
  }

  // Stream the file through the hash function rather than loading the whole
  // file into memory first. The digest is identical to hashing the contents.
  $hash = hash_file('sha256', $filepath);
  if ($hash === FALSE) {
    // Readability was verified above, so the remaining cause is most likely
    // an unavailable algorithm.
    watchdog('Apache Solr Attachments', 'sha256 hash algorithm is not supported', NULL, WATCHDOG_ERROR);
    return FALSE;
  }

  $cache_id = 'entity_id:' . $file->fid;
  $cache_bin = 'cache_apachesolr_attachments_file_body';
  $indexer_table = apachesolr_get_indexer_table('file');

  $cached_body = cache_get($cache_id, $cache_bin);
  // fetchAssoc() yields FALSE when the file has no row in the indexer table.
  $cached = db_query('SELECT * FROM {' . $indexer_table . '} WHERE entity_id = :entity_id', array(
    ':entity_id' => $file->fid,
  ))->fetchAssoc();

  // Strict comparison: both values are hex strings and must not be compared
  // as numbers.
  if ($cached_body !== FALSE && $cached !== FALSE && $cached['hash'] === $hash) {
    // No need to re-extract.
    return $cached_body->data;
  }

  if (variable_get('apachesolr_attachments_extract_using', 'tika') == 'tika') {
    $text = apachesolr_attachments_extract_using_tika($filepath);
  }
  else {
    // Extract using Solr.
    try {
      list($text, $metadata) = apachesolr_attachments_extract_using_solr($filepath);
    }
    catch (Exception $e) {
      // Exceptions from Solr may be transient, or indicate a problem with a
      // specific file.
      watchdog('Apache Solr Attachments', "Exception occurred sending %filepath to Solr\n!message", array(
        '%filepath' => $filepath,
        '!message' => nl2br(check_plain($e->getMessage())),
      ), WATCHDOG_ERROR);
      return FALSE;
    }
  }

  // Strip bad control characters.
  $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
  $text = trim(apachesolr_clean_text($text));

  // Save the hash to the DB and the extracted, cleaned text to the cache.
  db_update($indexer_table)
    ->fields(array(
      'hash' => $hash,
    ))
    ->condition('entity_id', $file->fid)
    ->execute();
  cache_set($cache_id, $text, $cache_bin, CACHE_PERMANENT);

  return $text;
}