function apachesolr_attachments_get_attachment_text in Apache Solr Attachments 6
Same name and namespace in other branches
- 6.3 apachesolr_attachments.index.inc \apachesolr_attachments_get_attachment_text()
- 6.2 apachesolr_attachments.admin.inc \apachesolr_attachments_get_attachment_text()
- 7 apachesolr_attachments.index.inc \apachesolr_attachments_get_attachment_text()
Parse the attachment, getting just the raw text.
Throws
Exception
1 call to apachesolr_attachments_get_attachment_text()
- apachesolr_attachments_add_documents in ./apachesolr_attachments.admin.inc - Callback for apachesolr_index_nodes().
File
- ./apachesolr_attachments.admin.inc, line 404 - Provides a file attachment search implementation for use with the Apache Solr module
Code
/**
 * Parse the attachment, getting just the raw text.
 *
 * Plain-text files (text/plain, text/x-diff) are read directly from disk.
 * Any other file is extracted with Tika or Solr, depending on the
 * 'apachesolr_attachment_extract_using' variable. The cleaned result is
 * written to {apachesolr_attachments_files} together with the file's SHA-1 so
 * that an unchanged file is not extracted again on the next call.
 *
 * @param $file
 *   File object. This function reads its filepath, filemime and fid
 *   properties.
 *
 * @return
 *   The cleaned text, or FALSE when the path does not resolve to a regular
 *   file, the file cannot be read or hashed, or Solr extraction throws.
 *
 * @throws Exception
 *   NOTE(review): exceptions from the Solr helper are caught below;
 *   presumably this refers to the Tika helper - confirm.
 */
function apachesolr_attachments_get_attachment_text($file) {
  // Any down-side to using realpath()?
  $filepath = realpath($file->filepath);
  // Check that we have a valid filepath.
  if (!$filepath) {
    return FALSE;
  }
  elseif (!is_file($filepath)) {
    // Shortened project name because the watchdog limits type to 16
    // characters.
    watchdog('ApacheSolrAttach', '%filepath is not a valid file path', array(
      '%filepath' => $filepath,
    ), WATCHDOG_WARNING);
    return FALSE;
  }

  // No need to use java for plain text files.
  if ($file->filemime == 'text/plain' || $file->filemime == 'text/x-diff') {
    $text = file_get_contents($filepath);
    if ($text === FALSE) {
      // A read failure must not be indexed as an empty document.
      watchdog('ApacheSolrAttach', 'Failed to read %filepath', array(
        '%filepath' => $filepath,
      ), WATCHDOG_ERROR);
      return FALSE;
    }
    // TODO - try to detect encoding and convert to UTF-8.
    // Strip bad control characters.
    $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
    $text = trim(apachesolr_clean_text($text));
    return $text;
  }

  $sha1 = sha1_file($filepath);
  if ($sha1 === FALSE) {
    watchdog('ApacheSolrAttach', 'Failed to calculate hash of %filepath', array(
      '%filepath' => $filepath,
    ), WATCHDOG_ERROR);
    return FALSE;
  }

  $cached = db_fetch_array(db_query("SELECT * FROM {apachesolr_attachments_files} WHERE fid = %d", $file->fid));
  // db_fetch_array() returns FALSE when there is no row. The hashes are
  // compared with === because == treats numeric-looking hex strings (such as
  // "0e" followed only by digits) as numbers, so different hashes could match.
  if ($cached && !is_null($cached['body']) && $cached['sha1'] === $sha1) {
    // No need to re-extract.
    return $cached['body'];
  }

  if (variable_get('apachesolr_attachment_extract_using', 'tika') == 'tika') {
    $text = apachesolr_attachments_extract_using_tika($filepath);
  }
  else {
    // Extract using Solr.
    try {
      // Only the first element of the returned list (the text) is used here.
      list($text) = apachesolr_attachments_extract_using_solr($filepath);
    }
    catch (Exception $e) {
      // Exceptions from Solr may be transient, or indicate a problem with a
      // specific file.
      watchdog('ApacheSolrAttach', "Exception occured sending %filepath to Solr\n!message", array(
        '%filepath' => $file->filepath,
        '!message' => nl2br(check_plain($e->getMessage())),
      ), WATCHDOG_ERROR);
      return FALSE;
    }
  }

  // Strip bad control characters.
  $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
  $text = trim(apachesolr_clean_text($text));
  // Save the extracted, cleaned text to the DB.
  db_query("UPDATE {apachesolr_attachments_files} SET sha1 = '%s', body = '%s' WHERE fid = %d", $sha1, $text, $file->fid);
  return $text;
}