public function SearchApiAttachmentsLinksAlterSettings::getLinkContent in Search API attachments 7
1 call to SearchApiAttachmentsLinksAlterSettings::getLinkContent()
- SearchApiAttachmentsLinksAlterSettings::alterItems in contrib/search_api_attachments_links/includes/callback_attachments_links_settings.inc - Alter items before indexing.
File
- contrib/search_api_attachments_links/includes/callback_attachments_links_settings.inc, line 117 - Search API data alteration callback.
Class
- SearchApiAttachmentsLinksAlterSettings
- @file Search API data alteration callback.
Code
/**
 * Returns the extracted content of a linked file, using the cache if possible.
 *
 * The extractor is picked from the Content-Type header reported for the link
 * URL: plain text and diffs go to extract_simple(), JPEG/TIFF images go to
 * extract_exif(), and everything else is dispatched according to the
 * 'search_api_attachments_extract_using' variable (defaulting to 'tika').
 *
 * @param array $link
 *   Link definition. Only the 'url' key is read here; the whole array is
 *   handed to the extractor methods unchanged.
 *
 * @return mixed
 *   The extracted data (freshly extracted or taken from the cache), or FALSE
 *   when the link has no URL, its headers could not be fetched, or the
 *   configured extractor does not support the reported MIME type.
 */
public function getLinkContent($link) {
  $extraction = FALSE;
  $headers = FALSE;
  $debug = variable_get('search_api_attachments_debug', FALSE);

  // Read the URL defensively: the cache ID, the header lookup and the log
  // messages all depend on it, and a link without one must not raise notices.
  $url = isset($link['url']) ? $link['url'] : '';

  if ($url !== '') {
    // Before running the (performance-intensive) extraction process, check
    // if we already have a cached copy of the extracted data for this URL.
    $cid = 'cached_extraction_:' . $url;
    $cached_extraction = cache_get($cid, self::CACHE_TABLE);
    // If we have a cache hit, there really is no need to continue.
    if (!empty($cached_extraction->data)) {
      return $cached_extraction->data;
    }
    $headers = get_headers($url, 1);
  }

  if ($headers) {
    // Header names keep whatever casing the server used, so look the
    // Content-Type up case-insensitively.
    $headers = array_change_key_case($headers, CASE_LOWER);
    $mime_type = isset($headers['content-type']) ? $headers['content-type'] : '';
    // When redirects were followed, repeated headers arrive as an array; the
    // last entry belongs to the final response.
    if (is_array($mime_type)) {
      $mime_type = end($mime_type);
    }
    // Drop parameters such as "; charset=UTF-8" and normalize the casing so
    // the comparisons below match the bare MIME type.
    $mime_parts = explode(';', (string) $mime_type);
    $mime_type = strtolower(trim($mime_parts[0]));

    if ($mime_type === 'text/plain' || $mime_type === 'text/x-diff') {
      $extraction = $this->extract_simple($link);
    }
    elseif (in_array($mime_type, array('image/jpeg', 'image/jpg', 'image/tiff'), TRUE)) {
      $extraction = $this->extract_exif($link);
    }
    else {
      $extraction_method = variable_get('search_api_attachments_extract_using', 'tika');
      // Send the extraction request to the right place depending on the
      // current setting.
      if ($extraction_method == 'tika') {
        $extraction = $this->extract_tika($link);
      }
      elseif ($extraction_method == 'python_pdf2txt') {
        if (in_array($mime_type, $this->pdf_mimetypes())) {
          $extraction = $this->extract_python_pdf2txt($link);
        }
        elseif ($debug) {
          watchdog('search_api_attachments', 'The python_pdf2txt extraction method does not support %mime_type', array(
            '%mime_type' => $mime_type,
          ), WATCHDOG_WARNING);
        }
      }
      elseif ($extraction_method == 'pdftotext') {
        if (in_array($mime_type, $this->pdf_mimetypes())) {
          $extraction = $this->extract_pdftotext($link);
        }
        elseif ($debug) {
          watchdog('search_api_attachments', 'The pdftotext extraction method does not support %mime_type', array(
            '%mime_type' => $mime_type,
          ), WATCHDOG_WARNING);
        }
      }
      else {
        // Any other setting falls through to the Solr extractor.
        $extraction = $this->extract_solr($link);
      }
    }
  }
  else {
    // Log the missing link information (no URL, or its headers could not be
    // retrieved).
    watchdog('search_api_attachments', "Couldn't index %filename content because this link was missing.", array(
      '%filename' => $url,
    ));
  }

  // If we have actual extracted data, write it to the cache.
  if ($extraction !== FALSE && isset($cid)) {
    cache_set($cid, $extraction, self::CACHE_TABLE);
  }
  if ($debug) {
    watchdog('search_api_attachments', "File: @filename\nExtraction: @extraction", array(
      '@filename' => $url,
      '@extraction' => $extraction,
    ), WATCHDOG_DEBUG);
  }
  return $extraction;
}