function search_file_attachments_extract_tika in Search File Attachments 7
Extract file content with Apache Tika.
Parameters
object $file: The file object.
Return value
string The extracted content.
Throws
Exception
1 call to search_file_attachments_extract_tika()
- search_file_attachments_get_file_content in ./
search_file_attachments.inc - Extract the content of the given file.
File
- ./
search_file_attachments.inc, line 183 - Helper functions, to keep the .module file clean and smart.
Code
/**
 * Extract file content with Apache Tika.
 *
 * Builds a "java [-cp ...] -jar <tika jar> -t <file>" command line from the
 * module settings and runs it through the shell.
 *
 * @param object $file
 *   The file object. Its "uri" and "filemime" properties are read.
 *
 * @return string|null|false
 *   The command output as returned by shell_exec(). In debug mode, FALSE is
 *   returned instead when the output contains a Java exception trace.
 *
 * @throws Exception
 *   If the configured Tika jar does not resolve to an existing file.
 */
function search_file_attachments_extract_tika($file) {
  $filepath = file_create_url($file->uri);
  $tika_path = realpath(variable_get('search_file_attachments_tika_path', ''));
  $tika = realpath($tika_path . '/' . variable_get('search_file_attachments_tika_jar', ''));
  if (!$tika || !is_file($tika)) {
    throw new Exception(t('Invalid path or filename for tika application jar.'));
  }

  // UTF-8 multibyte characters will be stripped by escapeshellarg().
  // So temporarily set the locale to UTF-8 so that the file path remains
  // valid while the command line is assembled.
  $backup_locale = setlocale(LC_CTYPE, '0');
  setlocale(LC_CTYPE, 'en_US.UTF-8');

  $param = '';
  $java_path = search_file_attachments_java_path();
  // The encoding and classpath options are skipped for MP3 files.
  if ($file->filemime != 'audio/mpeg') {
    $param = ' -Dfile.encoding=UTF8 -cp ' . escapeshellarg($tika_path);
  }

  if (DIRECTORY_SEPARATOR == '\\') {
    // On Windows, quote the paths by hand instead of using escapeshellarg(),
    // to prevent problems with paths that contain spaces.
    $cmd = $java_path . $param . ' -jar "' . str_replace('"', '\\"', $tika) . '" -t "' . str_replace('"', '\\"', $filepath) . '"';
  }
  else {
    $cmd = $java_path . $param . ' -jar ' . escapeshellarg($tika) . ' -t ' . escapeshellarg($filepath);
    // Compare against FALSE explicitly: strpos() returns 0 (falsy) for a
    // match at the start of the string.
    // NOTE(review): presumably clears the variable so java does not pick up
    // MAMP's bundled libraries - confirm.
    if (strpos((string) ini_get('extension_dir'), 'MAMP/') !== FALSE) {
      $cmd = 'export DYLD_LIBRARY_PATH=""; ' . $cmd;
    }
    // Support utf-8 commands:
    // http://www.php.net/manual/pt_BR/function.shell-exec.php#85095
    // This prefix is POSIX shell syntax, so it is only added here; cmd.exe
    // would try to run "LANG" as a program and the whole command would fail.
    $cmd = "LANG=en_US.utf-8; {$cmd}";
  }

  // Restore the locale.
  setlocale(LC_CTYPE, $backup_locale);

  // Debug mode: capture stderr as well and log the command and its output.
  if (variable_get('search_file_attachments_debug', FALSE)) {
    $result = shell_exec($cmd . ' 2>&1');
    watchdog('search_file_attachments', '<p><strong>Tika Command:</strong> <code>%command</code></p><br /> <p><strong>Result:</strong> %result</p>', array(
      '%command' => $cmd,
      '%result' => $result,
    ));
    // Empty the result, if it contains an error message, so that the error
    // is not in the index. shell_exec() may return NULL, so only search
    // actual string output.
    if (is_string($result) && strpos($result, 'Exception in thread') !== FALSE) {
      $result = FALSE;
    }
    return $result;
  }

  return shell_exec($cmd);
}