You are here

public function SearchApiAttachmentsLinksAlterSettings::getLinkContent in Search API attachments 7

1 call to SearchApiAttachmentsLinksAlterSettings::getLinkContent()
SearchApiAttachmentsLinksAlterSettings::alterItems in contrib/search_api_attachments_links/includes/callback_attachments_links_settings.inc
Alter items before indexing.

File

contrib/search_api_attachments_links/includes/callback_attachments_links_settings.inc, line 117
Search API data alteration callback.

Class

SearchApiAttachmentsLinksAlterSettings
@file Search API data alteration callback.

Code

/**
 * Extracts the indexable content of the file a link points to.
 *
 * A previously cached extraction for the same URL is returned when one
 * exists. Otherwise the response headers of the URL are fetched and the
 * reported MIME type selects the extraction routine: plain text and diffs use
 * extract_simple(), JPEG/TIFF images use extract_exif(), and everything else
 * uses the method configured in 'search_api_attachments_extract_using'.
 *
 * @param array $link
 *   Link data. Only the 'url' key is read here; the whole array is handed on
 *   to the extract_*() methods.
 *
 * @return mixed
 *   The cached or freshly extracted content (whatever the chosen extract_*()
 *   method returned), or FALSE when the URL is missing, its headers could not
 *   be fetched, or the configured method does not support the MIME type.
 */
public function getLinkContent($link) {
  $extraction = FALSE;

  // Without a URL there is nothing to look up in the cache and nothing to
  // fetch, so log the problem and stop before touching $link['url'].
  if (empty($link['url'])) {
    watchdog('search_api_attachments', "Couldn't index %filename content because this link was missing.", array(
      '%filename' => '',
    ));
    return $extraction;
  }
  $url = $link['url'];
  $debug = variable_get('search_api_attachments_debug', FALSE);

  // Before running the (performance-intensive) extraction process, check
  // if we already have a cached copy of the extracted data for this URL.
  $cid = 'cached_extraction_:' . $url;
  $cached_extraction = cache_get($cid, self::CACHE_TABLE);

  // If we have a cache hit, there really is no need to continue.
  if (!empty($cached_extraction->data)) {
    return $cached_extraction->data;
  }

  $headers = get_headers($url, 1);
  if ($headers) {
    // Header names are case-insensitive, so normalise the keys before the
    // lookup instead of relying on the server sending "Content-Type".
    $headers = array_change_key_case($headers, CASE_LOWER);
    $mime_type = isset($headers['content-type']) ? $headers['content-type'] : '';

    // When redirects were followed, get_headers() collects one value per
    // response in an array; the last one describes the final document.
    if (is_array($mime_type)) {
      $mime_type = end($mime_type);
    }

    // Drop parameters such as "; charset=UTF-8" so only the bare media type
    // is compared below.
    $mime_parts = explode(';', (string) $mime_type);
    $mime_type = strtolower(trim($mime_parts[0]));

    if ($mime_type == 'text/plain' || $mime_type == 'text/x-diff') {
      $extraction = $this->extract_simple($link);
    }
    elseif (in_array($mime_type, array('image/jpeg', 'image/jpg', 'image/tiff'), TRUE)) {
      $extraction = $this->extract_exif($link);
    }
    else {
      $extraction_method = variable_get('search_api_attachments_extract_using', 'tika');

      // Send the extraction request to the right place depending on the
      // current setting.
      if ($extraction_method == 'tika') {
        $extraction = $this->extract_tika($link);
      }
      elseif ($extraction_method == 'python_pdf2txt') {
        if (in_array($mime_type, $this->pdf_mimetypes())) {
          $extraction = $this->extract_python_pdf2txt($link);
        }
        elseif ($debug) {
          watchdog('search_api_attachments', 'The python_pdf2txt extraction method does not support %mime_type', array(
            '%mime_type' => $mime_type,
          ), WATCHDOG_WARNING);
        }
      }
      elseif ($extraction_method == 'pdftotext') {
        if (in_array($mime_type, $this->pdf_mimetypes())) {
          $extraction = $this->extract_pdftotext($link);
        }
        elseif ($debug) {
          watchdog('search_api_attachments', 'The pdftotext extraction method does not support %mime_type', array(
            '%mime_type' => $mime_type,
          ), WATCHDOG_WARNING);
        }
      }
      else {
        // Any other configured value falls through to Solr extraction.
        $extraction = $this->extract_solr($link);
      }
    }
  }
  else {
    // The headers could not be fetched; log the unreachable link.
    watchdog('search_api_attachments', "Couldn't index %filename content because this link was missing.", array(
      '%filename' => $url,
    ));
  }

  // If we have actual extracted data, write it to the cache.
  if ($extraction !== FALSE) {
    cache_set($cid, $extraction, self::CACHE_TABLE);
  }
  if ($debug) {
    watchdog('search_api_attachments', "File: @filename\nExtraction: @extraction", array(
      '@filename' => $url,
      '@extraction' => $extraction,
    ), WATCHDOG_DEBUG);
  }
  return $extraction;
}