You are here

protected function FileSearch::extractContentTika in Search File Attachments 8

Extract file content with Apache Tika.

Parameters

\Drupal\Core\Entity\EntityInterface $file: The file object.

string $file_path: The path to the file.

Return value

string The extracted text.

Throws

\Drupal\search_file_attachments\Plugin\Search\Exception

1 call to FileSearch::extractContentTika()
FileSearch::getFileContent in src/Plugin/Search/FileSearch.php
Extract the content of the given file.

File

src/Plugin/Search/FileSearch.php, line 426

Class

FileSearch
Executes a keyword search for files against {file_managed} database table.

Namespace

Drupal\search_file_attachments\Plugin\Search

Code

/**
 * Extract file content with Apache Tika.
 *
 * Builds a java command line that runs the configured Tika jar in text mode
 * (-t) against the given file, executes it through the shell and returns the
 * command output.
 *
 * @param \Drupal\Core\Entity\EntityInterface $file
 *   The file object.
 * @param string $file_path
 *   The path to the file.
 *
 * @return string|false|null
 *   The extracted text. In debug mode FALSE is returned when the output
 *   contains a Java exception trace. shell_exec() itself yields NULL when the
 *   command fails or prints nothing.
 *
 * @throws \Drupal\search_file_attachments\Plugin\Search\Exception
 *   If the configured Tika jar does not resolve to an existing file.
 */
protected function extractContentTika(EntityInterface $file, $file_path) {
  $settings = $this->moduleSettings;

  // Resolve the jar location up front; realpath() returns FALSE for paths
  // that do not exist, which the check below turns into an exception.
  $tika_path = realpath($settings->get('tika.path'));
  $tika = realpath($tika_path . '/' . $settings->get('tika.jar'));
  if (!$tika || !is_file($tika)) {
    throw new Exception($this->t('Invalid path or filename for tika application jar.'));
  }

  $is_windows = DIRECTORY_SEPARATOR === '\\';

  // UTF-8 multibyte characters will be stripped by escapeshellarg(), so
  // temporarily switch LC_CTYPE to a UTF-8 locale while the command line is
  // assembled. The finally block guarantees the previous locale is restored
  // even if the Java service lookup throws.
  $backup_locale = setlocale(LC_CTYPE, '0');
  setlocale(LC_CTYPE, 'en_US.UTF-8');
  try {
    $java_service = \Drupal::service('search_file_attachments.java');
    $configured_java = $settings->get('java_path');
    if ($configured_java) {
      $java_service->setJavaPath($configured_java);
    }
    $java_path = $java_service->getJavaPath();

    // NOTE(review): $file is only type-hinted as EntityInterface; confirm
    // that comparing the filemime property with a string behaves as intended
    // for file entities (an accessor such as getMimeType() may be safer).
    $param = '';
    if ($file->filemime != 'audio/mpeg') {
      $param = ' -Dfile.encoding=UTF8 -cp ' . escapeshellarg($tika_path);
    }

    if ($is_windows) {
      // On Windows the jar and file paths are wrapped in double quotes by
      // hand, to prevent problems with paths that contain spaces, because
      // escapeshellarg() does not handle these correctly there.
      $cmd = $java_path . $param . ' -jar "' . str_replace('"', '\\"', $tika) . '" -t "' . str_replace('"', '\\"', $file_path) . '"';
    }
    else {
      $cmd = $java_path . $param . ' -jar ' . escapeshellarg($tika) . ' -t ' . escapeshellarg($file_path);

      // Support utf-8 commands:
      // http://www.php.net/manual/pt_BR/function.shell-exec.php#85095
      // This prefix is POSIX shell syntax, so it is only added here; cmd.exe
      // would try to run "LANG=en_US.utf-8;" as a program and fail.
      $cmd = "LANG=en_US.utf-8; {$cmd}";
    }
  }
  finally {
    // Restore the locale.
    setlocale(LC_CTYPE, $backup_locale);
  }

  if (!$settings->get('debug')) {
    return shell_exec($cmd);
  }

  // Debug mode: capture stderr as well and log the command with its output.
  $result = shell_exec($cmd . ' 2>&1');
  \Drupal::logger('search_file_attachments')
    ->notice('<p><strong>Tika Command:</strong> <code>%command</code></p><br /> <p><strong>Result:</strong> %result</p>', [
      '%command' => $cmd,
      '%result' => $result,
    ]);

  // Empty the result, if it contains an error message, so that the error
  // is not in the index. shell_exec() may return NULL, hence the type guard.
  if (is_string($result) && strpos($result, 'Exception in thread') !== FALSE) {
    $result = FALSE;
  }
  return $result;
}