You are here

protected function SearchApiAttachmentsAlterSettings::getFileContent in Search API attachments 7

Extracts the file content.

Parameters

object $file: The file object.

Return value

string The extracted content, or FALSE if nothing could be extracted.

10 calls to SearchApiAttachmentsAlterSettings::getFileContent()
SearchApiAttachmentsAlterSettings::alterItems in includes/callback_attachments_settings.inc
Alter items before indexing.
SearchApiAttachmentsCommentAlterSettings::alterItems in contrib/search_api_attachments_comment/includes/callback_attachments_comment_settings.inc
Alter items before indexing.
SearchApiAttachmentsCommerceProductReferenceAlterSettings::alterItems in contrib/search_api_attachments_commerce_product_reference/includes/callback_attachments_commerce_product_reference_settings.inc
Alter items before indexing.
SearchApiAttachmentsEntityreferenceAlterSettings::getFilesContent in contrib/search_api_attachments_entityreference/includes/callback_attachments_entityreference_settings.inc
Extracts and returns contents of files.
SearchApiAttachmentsFieldCollectionsAlterSettings::alterItems in contrib/search_api_attachments_field_collections/includes/callback_attachments_field_collections_settings.inc
Alter items before indexing.

... See full list

File

includes/callback_attachments_settings.inc, line 268
Search API data alteration callback.

Class

SearchApiAttachmentsAlterSettings
Indexes files content.

Code

/**
 * Extracts the file content.
 *
 * Returns the cached extraction when one exists for the file ID; otherwise
 * picks an extractor based on the file's MIME type and the configured
 * extraction method, caches the result, and returns it.
 *
 * @param object|array $file
 *   The file object (or its array form). The keys read here are 'fid',
 *   'uri', 'filemime' and 'filename'; all of them are treated as optional.
 *
 * @return string|false
 *   The extracted content, or FALSE when nothing was extracted (file missing
 *   on disk, or MIME type unsupported by the configured extraction method).
 */
protected function getFileContent($file) {
  $extraction = FALSE;

  // Callers may hand over either an object or an array; work on an array so
  // every lookup below has the same shape.
  $file = (array) $file;

  // Before running the (performance-intensive) extraction process, check
  // if we already have a cached copy of the extracted data.
  if (isset($file['fid'])) {
    // The cache ID is derived from the file ID.
    $cid = 'cached_extraction_:' . $file['fid'];
    $cached_extraction = cache_get($cid, self::CACHE_TABLE);

    // A non-empty cached value short-circuits everything else. An empty
    // cached value is deliberately not treated as a hit, so extraction is
    // attempted again.
    if (!empty($cached_extraction->data)) {
      return $cached_extraction->data;
    }
  }

  // Read the optional keys once. An incomplete file record would otherwise
  // raise undefined-index notices and pass NULL to file_exists().
  $uri = isset($file['uri']) ? $file['uri'] : NULL;
  $mime = isset($file['filemime']) ? $file['filemime'] : NULL;
  $debug = variable_get('search_api_attachments_debug', FALSE);

  if ($uri !== NULL && file_exists($uri)) {
    if (in_array($mime, $this->textMimetypes())) {
      $extraction = $this->extractSimple($file);
    }
    elseif (in_array($mime, $this->imageMimetypes())) {
      $extraction = $this->extractExif($file);
    }
    else {
      // Send the extraction request to the right place depending on the
      // current setting.
      switch (variable_get('search_api_attachments_extract_using', 'tika')) {
        case 'tika':
          $extraction = $this->extractTika($file);
          break;

        case 'tika_server':
          $extraction = $this->extractTikaServer($file);
          break;

        case 'python_pdf2txt':
          // This method is only attempted for PDF MIME types.
          if (in_array($mime, $this->pdfMimetypes())) {
            $extraction = $this->extractPythonPdf2txt($file);
          }
          elseif ($debug) {
            watchdog('search_api_attachments', 'The python_pdf2txt extraction method does not support %mime_type', array(
              '%mime_type' => $mime,
            ), WATCHDOG_WARNING);
          }
          break;

        case 'pdftotext':
          // This method is only attempted for PDF MIME types.
          if (in_array($mime, $this->pdfMimetypes())) {
            $extraction = $this->extractPdftotext($file);
          }
          elseif ($debug) {
            watchdog('search_api_attachments', 'The pdftotext extraction method does not support %mime_type', array(
              '%mime_type' => $mime,
            ), WATCHDOG_WARNING);
          }
          break;

        default:
          // Any other configured value falls through to Solr extraction.
          $extraction = $this->extractSolr($file);
          break;
      }
    }
  }
  else {
    // Log the missing file information. Fall back to the URI when the record
    // carries no filename so the log entry still identifies the file.
    watchdog('search_api_attachments', "Couldn't index %filename content because this file was missing.", array(
      '%filename' => isset($file['filename']) ? $file['filename'] : $uri,
    ));
  }

  // Anything other than FALSE is written to the cache; $cid only exists when
  // the file had an ID.
  if ($extraction !== FALSE && isset($cid)) {
    cache_set($cid, $extraction, self::CACHE_TABLE);
  }

  if ($debug) {
    watchdog('search_api_attachments', "File: @filename\nExtraction: @extraction", array(
      '@filename' => $uri,
      '@extraction' => $extraction,
    ), WATCHDOG_DEBUG);
  }

  return $extraction;
}