<?php
namespace Drupal\search_api_attachments\Plugin\search_api\processor;
use Drupal\Component\Plugin\Exception\InvalidPluginDefinitionException;
use Drupal\Component\Plugin\Exception\PluginNotFoundException;
use Drupal\Component\Utility\Bytes;
use Drupal\Core\Config\ConfigFactoryInterface;
use Drupal\Core\Entity\EntityInterface;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Extension\ModuleHandlerInterface;
use Drupal\Core\Field\FieldDefinitionInterface;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\KeyValueStore\KeyValueFactoryInterface;
use Drupal\Core\Plugin\PluginFormInterface;
use Drupal\Core\Utility\Error;
use Drupal\file\Entity\File;
use Drupal\media\Entity\Media;
use Drupal\search_api\Datasource\DatasourceInterface;
use Drupal\search_api\Item\ItemInterface;
use Drupal\search_api\Processor\ProcessorPluginBase;
use Drupal\search_api\Processor\ProcessorProperty;
use Drupal\search_api\Utility\FieldsHelperInterface;
use Drupal\search_api_attachments\ExtractFileValidator;
use Drupal\search_api_attachments\TextExtractorPluginInterface;
use Drupal\search_api_attachments\TextExtractorPluginManager;
use Psr\Log\LoggerInterface;
use Psr\Log\LogLevel;
use Symfony\Component\DependencyInjection\ContainerInterface;
/**
 * Search API processor that adds text extracted from attached files to items.
 *
 * NOTE(review): no plugin annotation/attribute is visible in this listing;
 * confirm that the plugin discovery metadata exists in the real file.
 */
class FilesExtractor extends ProcessorPluginBase implements PluginFormInterface {
/**
 * Name of the configuration object the extraction settings are read from.
 */
const CONFIGNAME = 'search_api_attachments.admin_config';
/**
 * Name of the lock acquired while the fallback queue bookkeeping is updated.
 */
const FALLBACK_QUEUE_LOCK = 'search_api_attachments_fallback_queue';
/**
 * Key-value collection recording, per file ID, the entities it was queued for.
 */
const FALLBACK_QUEUE_KV = 'search_api_attachments:queued';
/**
 * Pseudo field name used for items that are file entities themselves.
 */
const SAA_FILE_ENTITY = 'saa_file_entity';
/**
 * Prefix put in front of a field name to build the processor property path.
 */
const SAA_PREFIX = 'saa_';
/**
 * The text extractor plugin manager.
 *
 * @var \Drupal\search_api_attachments\TextExtractorPluginManager
 */
protected $textExtractorPluginManager;
/**
 * The validator deciding whether a file may be extracted.
 *
 * @var \Drupal\search_api_attachments\ExtractFileValidator
 */
protected $extractFileValidator;
/**
 * The config factory.
 *
 * @var \Drupal\Core\Config\ConfigFactoryInterface
 */
protected $configFactory;
/**
 * The entity type manager.
 *
 * @var \Drupal\Core\Entity\EntityTypeManagerInterface
 */
protected $entityTypeManager;
/**
 * The key-value store factory (extraction cache and queue bookkeeping).
 *
 * @var \Drupal\Core\KeyValueStore\KeyValueFactoryInterface
 */
protected $keyValue;
/**
 * The module handler, used to invoke the module's hooks.
 *
 * @var \Drupal\Core\Extension\ModuleHandlerInterface
 */
protected $moduleHandler;
/**
 * The Search API fields helper.
 *
 * @var \Drupal\search_api\Utility\FieldsHelperInterface
 */
protected $fieldHelper;
/**
 * The logger used for extraction and queue messages.
 *
 * @var \Psr\Log\LoggerInterface
 */
protected $logger;
/**
 * Builds the processor and stores every injected collaborator.
 *
 * @param array $configuration
 *   The plugin configuration.
 * @param string $plugin_id
 *   The plugin ID.
 * @param array $plugin_definition
 *   The plugin definition.
 * @param \Drupal\search_api_attachments\TextExtractorPluginManager $text_extractor_plugin_manager
 *   The text extractor plugin manager.
 * @param \Drupal\Core\Config\ConfigFactoryInterface $config_factory
 *   The config factory.
 * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager
 *   The entity type manager.
 * @param \Drupal\Core\KeyValueStore\KeyValueFactoryInterface $key_value
 *   The key-value store factory.
 * @param \Drupal\Core\Extension\ModuleHandlerInterface $module_handler
 *   The module handler.
 * @param \Drupal\search_api\Utility\FieldsHelperInterface $field_helper
 *   The Search API fields helper.
 * @param \Drupal\search_api_attachments\ExtractFileValidator $extractFileValidator
 *   The file validator.
 * @param \Psr\Log\LoggerInterface $logger
 *   The logger.
 */
public function __construct(
  array $configuration,
  $plugin_id,
  array $plugin_definition,
  TextExtractorPluginManager $text_extractor_plugin_manager,
  ConfigFactoryInterface $config_factory,
  EntityTypeManagerInterface $entity_type_manager,
  KeyValueFactoryInterface $key_value,
  ModuleHandlerInterface $module_handler,
  FieldsHelperInterface $field_helper,
  ExtractFileValidator $extractFileValidator,
  LoggerInterface $logger
) {
  parent::__construct($configuration, $plugin_id, $plugin_definition);

  // Extraction-specific services.
  $this->textExtractorPluginManager = $text_extractor_plugin_manager;
  $this->extractFileValidator = $extractFileValidator;

  // Core services.
  $this->configFactory = $config_factory;
  $this->entityTypeManager = $entity_type_manager;
  $this->keyValue = $key_value;
  $this->moduleHandler = $module_handler;
  $this->logger = $logger;

  // Search API services.
  $this->fieldHelper = $field_helper;
}
/**
 * Instantiates the processor with its services pulled from the container.
 *
 * @param \Symfony\Component\DependencyInjection\ContainerInterface $container
 *   The service container.
 * @param array $configuration
 *   The plugin configuration.
 * @param string $plugin_id
 *   The plugin ID.
 * @param mixed $plugin_definition
 *   The plugin definition.
 *
 * @return static
 *   A new processor instance.
 */
public static function create(ContainerInterface $container, array $configuration, $plugin_id, $plugin_definition) {
  // Services are fetched in constructor-argument order.
  $text_extractor_plugin_manager = $container->get('plugin.manager.search_api_attachments.text_extractor');
  $config_factory = $container->get('config.factory');
  $entity_type_manager = $container->get('entity_type.manager');
  $key_value = $container->get('keyvalue');
  $module_handler = $container->get('module_handler');
  $fields_helper = $container->get('search_api.fields_helper');
  $file_validator = $container->get('search_api_attachments.extract_file_validator');
  $logger = $container->get('logger.channel.search_api_attachments');

  return new static(
    $configuration,
    $plugin_id,
    $plugin_definition,
    $text_extractor_plugin_manager,
    $config_factory,
    $entity_type_manager,
    $key_value,
    $module_handler,
    $fields_helper,
    $file_validator,
    $logger
  );
}
/**
 * Declares one string property per file source found for the index.
 *
 * Properties are only provided when no datasource is passed; each one is
 * keyed by the "saa_" prefix followed by the name returned from
 * getFileFieldsAndFileEntityItems().
 *
 * @param \Drupal\search_api\Datasource\DatasourceInterface|null $datasource
 *   The datasource the properties are requested for, or NULL.
 *
 * @return \Drupal\search_api\Processor\ProcessorProperty[]
 *   The property definitions, keyed by property path.
 */
public function getPropertyDefinitions(DatasourceInterface $datasource = NULL) {
  // Nothing is exposed for a concrete datasource.
  if ($datasource) {
    return [];
  }

  $plugin_properties = [];
  $file_sources = $this->getFileFieldsAndFileEntityItems();
  foreach ($file_sources as $source_name => $source_label) {
    $placeholders = ['@label' => $source_label];
    $property_path = static::SAA_PREFIX . $source_name;
    $plugin_properties[$property_path] = new ProcessorProperty([
      'label' => $this->t('Search api attachments: @label', $placeholders),
      'description' => $this->t('Search api attachments: @label', $placeholders),
      'type' => 'string',
      'processor_id' => $this->getPluginId(),
    ]);
  }

  return $plugin_properties;
}
/**
 * Adds the text extracted from an item's files to its "saa_*" index fields.
 *
 * For every file source of the index, the files referenced by the item's
 * entity (directly, through media, or the entity itself when it is a file)
 * are resolved, filtered through isFileIndexable() and extracted; the
 * concatenated text is added to every index field using that property path.
 *
 * @param \Drupal\search_api\Item\ItemInterface $item
 *   The item whose fields should receive the extracted text.
 */
public function addFieldValues(ItemInterface $item) {
  $config = $this->configFactory->get(static::CONFIGNAME);
  $extractor_plugin_id = $config->get('extraction_method');
  // extractOrGetFromCache() reads this flag from the processor configuration.
  $this->configuration['read_text_files_directly'] = $config->get('read_text_files_directly');
  if ($extractor_plugin_id == '') {
    // No extraction method configured: nothing can be extracted.
    return;
  }

  // An extractor without saved settings yields NULL here; plugin managers
  // expect an array, so fall back to an empty configuration.
  $extractor_configuration = $config->get($extractor_plugin_id . '_configuration') ?: [];
  $extractor_plugin = $this->textExtractorPluginManager->createInstance($extractor_plugin_id, $extractor_configuration);

  $entity = $item->getOriginalObject()->getValue();
  if (!$entity instanceof EntityInterface) {
    // Everything below relies on entity methods.
    return;
  }
  $is_entity_type_file = $entity->getEntityTypeId() == 'file';

  foreach ($this->getFileFieldsAndFileEntityItems() as $field_name => $label) {
    $is_file_entity_element = $field_name == static::SAA_FILE_ENTITY;
    // The file entity pseudo field only applies to file items. Skip just
    // this element: stopping the loop here would drop every field listed
    // after it.
    if ($is_file_entity_element && !$is_entity_type_file) {
      continue;
    }

    $fields = $this->fieldHelper->filterForPropertyPath($item->getFields(), NULL, static::SAA_PREFIX . $field_name);
    if (empty($fields)) {
      continue;
    }

    // Start from a clean list for each field so that files resolved for one
    // field never end up in the value of another one.
    $files = $is_file_entity_element ? [$entity] : [];
    if ($entity->hasField($field_name)) {
      $fids = $this->limitToAllowedNumber($this->collectFileIds($entity, $field_name));
      $files = $this->entityTypeManager->getStorage('file')->loadMultiple($fids);
    }
    if (empty($files)) {
      continue;
    }

    // The extraction does not depend on the index field, so it is done once
    // and shared by all fields using this property path.
    $extraction = '';
    foreach ($files as $file) {
      if ($this->isFileIndexable($file, $item, $field_name)) {
        $extraction .= $this->extractOrGetFromCache($entity, $file, $extractor_plugin);
      }
    }
    foreach ($fields as $field) {
      $field->addValue($extraction);
    }
  }
}

/**
 * Collects the file IDs referenced by one field of an entity.
 *
 * Handles "file" fields and "entity_reference" fields targeting media, in
 * which case the file items of each media's source field are used.
 *
 * @param \Drupal\Core\Entity\EntityInterface $entity
 *   The entity owning the field; it must have the given field.
 * @param string $field_name
 *   The name of the field to inspect.
 *
 * @return array
 *   The referenced file IDs, in field order; empty for other field types.
 */
private function collectFileIds(EntityInterface $entity, $field_name) {
  $fids = [];
  $items = $entity->get($field_name);
  $field_definition = $items->getFieldDefinition();
  $type = $field_definition->getType();

  if ($type == 'file') {
    foreach ($items->filterEmptyItems()->getValue() as $filefield_value) {
      $fids[] = $filefield_value['target_id'];
    }
    return $fids;
  }

  $targets_media = $type == 'entity_reference'
    && $field_definition->getItemDefinition()->getSetting('target_type') === 'media';
  if (!$targets_media) {
    return $fids;
  }

  $media_storage = $this->entityTypeManager->getStorage('media');
  foreach ($items->filterEmptyItems()->getValue() as $media_value) {
    $media = $media_storage->load($media_value['target_id']);
    if ($media === NULL) {
      // The reference points to a media item that no longer exists.
      continue;
    }
    $source_configuration = $media->getSource()->getConfiguration();
    if (!isset($source_configuration['source_field'])) {
      continue;
    }
    foreach ($media->get($source_configuration['source_field'])->filterEmptyItems() as $field_item) {
      if ($field_item->getFieldDefinition()->getType() === 'file') {
        $value = $field_item->getValue();
        $fids[] = $value['target_id'];
      }
    }
  }
  return $fids;
}
/**
 * Returns the text of a file, from the key-value cache when available.
 *
 * Text files are read directly when "read_text_files_directly" is enabled.
 * Otherwise the cached extraction is used, or the extractor plugin is run
 * and its result cached. Files already recorded in the fallback queue for
 * this entity, and files whose extraction throws, are (re-)queued and
 * yield an empty string.
 *
 * @param \Drupal\Core\Entity\EntityInterface $entity
 *   The entity the file is attached to.
 * @param \Drupal\file\Entity\File $file
 *   The file to extract text from.
 * @param \Drupal\search_api_attachments\TextExtractorPluginInterface $extractor_plugin
 *   The extractor to use on a cache miss.
 *
 * @return string
 *   The extracted text, limited by limitBytes(); may be empty.
 */
public function extractOrGetFromCache(EntityInterface $entity, File $file, TextExtractorPluginInterface $extractor_plugin) {
  // Plain text needs no extractor. The MIME type is cast because it may be
  // unset on the file.
  if (!empty($this->configuration['read_text_files_directly']) && strpos((string) $file->getMimeType(), 'text/') === 0) {
    $contents = file_get_contents($file->getFileUri());
    // Keep the return type a string on read failure and apply the same size
    // limit as for extracted content.
    return $contents === FALSE ? '' : $this->limitBytes($contents);
  }

  $collection = 'search_api_attachments';
  $fid = $file->id();
  $key = $collection . ':' . $fid;
  $cache_store = $this->keyValue->get($collection);

  if ($cache = $cache_store->get($key)) {
    return $this->limitBytes($cache);
  }

  $extracted_data = '';
  $entity_type_id = $entity->getEntityTypeId();
  $entity_id = $entity->id();
  try {
    $queued_files = $this->keyValue->get(static::FALLBACK_QUEUE_KV)->get($fid);
    if (empty($queued_files[$entity_type_id][$entity_id])) {
      $extracted_data = $extractor_plugin->extract($file);
      $extracted_data = $this->limitBytes($extracted_data);
      $cache_store->set($key, $extracted_data);
      $this->moduleHandler->invokeAll('search_api_attachments_content_extracted', [
        $file,
        $entity,
      ]);
    }
    else {
      // Already recorded as queued for this entity: hand it to the queue.
      $this->queueItem($entity, $file);
    }
  }
  catch (\Exception $e) {
    $error = Error::decodeException($e);
    $message_params = [
      '@file_id' => $fid,
      '@entity_id' => $entity_id,
      '@entity_type' => $entity_type_id,
      '@type' => $error['%type'],
      '@message' => $error['@message'],
      '@function' => $error['%function'],
      '@line' => $error['%line'],
      '@file' => $error['%file'],
    ];
    $this->logger->log(LogLevel::ERROR, 'Error extracting text from file @file_id for @entity_type @entity_id. @type: @message in @function (line @line of @file).', $message_params);
    // Retry later through the queue.
    $this->queueItem($entity, $file);
  }
  return $extracted_data;
}
/**
 * Queues a file of an entity for later text extraction.
 *
 * Records the (file, entity) pair in the fallback key-value collection
 * while holding the fallback lock, then creates an item in the
 * "search_api_attachments" queue and logs it.
 *
 * @param \Drupal\Core\Entity\EntityInterface $entity
 *   The entity the file belongs to.
 * @param \Drupal\file\Entity\File $file
 *   The file to queue.
 *
 * @return bool
 *   TRUE if the item was queued, FALSE if the lock could not be acquired.
 */
private function queueItem(EntityInterface $entity, File $file) {
  $lock = \Drupal::lock();
  if (!$lock->acquire(static::FALLBACK_QUEUE_LOCK)) {
    return FALSE;
  }

  $fid = $file->id();
  $entity_id = $entity->id();
  $entity_type_id = $entity->getEntityTypeId();

  try {
    $queued_file_collection = $this->keyValue->get(static::FALLBACK_QUEUE_KV);
    $queued_files = $queued_file_collection->get($fid);
    $queued_files[$entity_type_id][$entity_id] = TRUE;
    $queued_file_collection->set($fid, $queued_files);
  }
  finally {
    // Release even when the store throws, so the lock is not left held.
    $lock->release(static::FALLBACK_QUEUE_LOCK);
  }

  $item = new \stdClass();
  $item->fid = $fid;
  $item->entity_id = $entity_id;
  $item->entity_type = $entity_type_id;
  $item->extract_attempts = 1;
  \Drupal::queue('search_api_attachments')->createItem($item);

  $this->logger->log(LogLevel::INFO, 'File added to the queue for text extraction @file_id for @entity_type @entity_id.', [
    '@file_id' => $fid,
    '@entity_id' => $entity_id,
    '@entity_type' => $entity_type_id,
  ]);
  return TRUE;
}
/**
 * Truncates a list of file IDs to the configured "number_indexed" limit.
 *
 * @param array $all_fids
 *   The file IDs, in indexing order.
 *
 * @return array
 *   The first "number_indexed" IDs, or all of them when the limit is unset,
 *   empty, zero or negative.
 */
public function limitToAllowedNumber(array $all_fids) {
  // Cast explicitly: a cleared form field is stored as '', which PHP 8 no
  // longer treats as equal to 0 and which is not a valid slice length.
  $limit = isset($this->configuration['number_indexed']) ? (int) $this->configuration['number_indexed'] : 0;

  // A non-positive limit means "no restriction"; a negative length would
  // otherwise make array_slice() drop trailing elements.
  if ($limit <= 0 || count($all_fids) <= $limit) {
    return $all_fids;
  }
  return array_slice($all_fids, 0, $limit);
}
/**
 * Cuts text down to the configured "number_first_bytes" size.
 *
 * The size defaults to "1 MB" when not configured; a size of 0 disables
 * the limit. The cut is made with mb_strcut().
 *
 * @param string|null $extracted_text
 *   The text to limit.
 *
 * @return string
 *   The text, cut to at most the configured number of bytes.
 */
public function limitBytes($extracted_text) {
  // Cast because NULL input is deprecated for mb_strcut() on PHP 8.1+.
  $extracted_text = (string) $extracted_text;

  $configured_size = isset($this->configuration['number_first_bytes']) ? $this->configuration['number_first_bytes'] : '1 MB';
  // Cast: mb_strcut() expects an integer length.
  $bytes = (int) Bytes::toInt($configured_size);

  if ($bytes == 0) {
    return $extracted_text;
  }
  return mb_strcut($extracted_text, 0, $bytes);
}
/**
 * Decides whether a file should have its text extracted and indexed.
 *
 * A file is indexable when it exists on disk, its MIME type is not
 * excluded, it is permanent, it passes the size and private-file checks of
 * the file validator, and no implementation of
 * hook_search_api_attachments_indexable() returns FALSE.
 *
 * @param object $file
 *   The file entity to check.
 * @param \Drupal\search_api\Item\ItemInterface $item
 *   The item being indexed; passed to the hook.
 * @param string|null $field_name
 *   The name of the field the file comes from; passed to the hook.
 *
 * @return bool
 *   TRUE if the file may be indexed, FALSE otherwise.
 */
public function isFileIndexable($file, ItemInterface $item, $field_name = NULL) {
  if (!file_exists($file->getFileUri())) {
    return FALSE;
  }

  // The settings may be missing when the processor was never configured
  // through its form; use the form's defaults instead of triggering
  // undefined-index warnings.
  $excluded_mimes = isset($this->configuration['excluded_mimes']) ? $this->configuration['excluded_mimes'] : NULL;
  $max_filesize = isset($this->configuration['max_filesize']) ? $this->configuration['max_filesize'] : '0';
  $excluded_private = isset($this->configuration['excluded_private']) ? $this->configuration['excluded_private'] : TRUE;

  $all_excluded_mimes = $this->extractFileValidator->getExcludedMimes(NULL, $excluded_mimes);
  if (in_array($file->getMimeType(), $all_excluded_mimes, TRUE)) {
    return FALSE;
  }
  if (!$file->isPermanent()) {
    return FALSE;
  }
  if (!$this->extractFileValidator->isFileSizeAllowed($file, $max_filesize)) {
    return FALSE;
  }
  if (!$this->extractFileValidator->isPrivateFileAllowed($file, $excluded_private)) {
    return FALSE;
  }

  // Any module may veto by returning exactly FALSE.
  $result = $this->moduleHandler->invokeAll('search_api_attachments_indexable', [
    $file,
    $item,
    $field_name,
  ]);
  return !in_array(FALSE, $result, TRUE);
}
/**
 * Lists the file sources available in the index's datasources.
 *
 * Includes the file entity pseudo field when the "entity:file" datasource
 * is present, every "file" field, and every media reference field with at
 * least one target bundle whose source field is a file field.
 *
 * @return array
 *   Labels keyed by field name (or by static::SAA_FILE_ENTITY).
 */
protected function getFileFieldsAndFileEntityItems() {
  $file_elements = [];
  foreach ($this->getIndex()->getDatasources() as $datasource) {
    if ($datasource->getPluginId() == 'entity:file') {
      $file_elements[static::SAA_FILE_ENTITY] = $this->t('File entity');
    }
    foreach ($datasource->getPropertyDefinitions() as $property) {
      if (!$property instanceof FieldDefinitionInterface) {
        continue;
      }
      $type = $property->getType();
      $is_file_source = $type == 'file'
        || ($type == 'entity_reference' && $this->referencesFileBackedMedia($property));
      if ($is_file_source) {
        $file_elements[$property->getName()] = $property->getLabel();
      }
    }
  }
  return $file_elements;
}

/**
 * Checks whether a reference field targets a media bundle backed by a file.
 *
 * @param \Drupal\Core\Field\FieldDefinitionInterface $property
 *   The entity reference field definition.
 *
 * @return bool
 *   TRUE if the field targets media and one of its target bundles has a
 *   source field of type "file".
 */
private function referencesFileBackedMedia(FieldDefinitionInterface $property) {
  if ($property->getSetting('target_type') !== 'media') {
    return FALSE;
  }
  $settings = $property->getItemDefinition()->getSettings();
  if (!isset($settings['handler_settings']['target_bundles'])) {
    return FALSE;
  }
  foreach ($settings['handler_settings']['target_bundles'] as $bundle_name) {
    // One matching bundle is enough; no need to load the remaining ones.
    if ($this->mediaBundleHasFileSource($bundle_name)) {
      return TRUE;
    }
  }
  return FALSE;
}

/**
 * Checks whether the source field of a media bundle is a file field.
 *
 * Storage lookup failures are logged and treated as "no".
 *
 * @param string $bundle_name
 *   The media type ID.
 *
 * @return bool
 *   TRUE if the bundle exists and its source field storage has type "file".
 */
private function mediaBundleHasFileSource($bundle_name) {
  try {
    $media_type = $this->entityTypeManager->getStorage('media_type')->load($bundle_name);
    if (empty($media_type)) {
      return FALSE;
    }
    $bundle_configuration = $media_type->toArray();
    if (!isset($bundle_configuration['source_configuration']['source_field'])) {
      return FALSE;
    }
    $source_field = $bundle_configuration['source_configuration']['source_field'];
    $field_storage = $this->entityTypeManager->getStorage('field_storage_config')->load(sprintf('media.%s', $source_field));
    if (empty($field_storage)) {
      // The source field storage is missing; calling toArray() on NULL
      // would be a fatal error.
      return FALSE;
    }
    $field_config = $field_storage->toArray();
    return isset($field_config['type']) && $field_config['type'] === 'file';
  }
  catch (InvalidPluginDefinitionException $e) {
    $this->logger->log(LogLevel::ERROR, '%type: @message in %function (line %line of %file).', Error::decodeException($e));
  }
  catch (PluginNotFoundException $e) {
    $this->logger->log(LogLevel::ERROR, '%type: @message in %function (line %line of %file).', Error::decodeException($e));
  }
  return FALSE;
}
/**
 * Builds the processor settings form.
 *
 * Each element defaults to the stored configuration value, falling back to
 * a built-in default when the value is not set.
 *
 * @param array $form
 *   The form array to add the elements to.
 * @param \Drupal\Core\Form\FormStateInterface $form_state
 *   The current form state.
 *
 * @return array
 *   The form array with the processor settings added.
 */
public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
  // Built-in defaults, used for every setting that is not stored yet.
  $fallbacks = [
    'excluded_extensions' => ExtractFileValidator::DEFAULT_EXCLUDED_EXTENSIONS,
    'number_indexed' => '0',
    'number_first_bytes' => '1 MB',
    'max_filesize' => '0',
    'excluded_private' => TRUE,
  ];
  $current = [];
  foreach ($fallbacks as $setting => $fallback) {
    $current[$setting] = isset($this->configuration[$setting]) ? $this->configuration[$setting] : $fallback;
  }

  $form['excluded_extensions'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Excluded file extensions'),
    '#default_value' => $current['excluded_extensions'],
    '#size' => 80,
    '#maxlength' => 255,
    '#description' => $this->t('File extensions that are excluded from indexing. Separate extensions with a space and do not include the leading dot.<br />Example: "aif art avi bmp gif ico mov oga ogv png psd ra ram rgb flv"<br />Extensions are internally mapped to a MIME type, so it is not necessary to put variations that map to the same type (e.g. tif is sufficient for tif and tiff)'),
  ];

  $form['number_indexed'] = [
    '#type' => 'number',
    '#title' => $this->t('Number of files indexed per file field'),
    '#default_value' => $current['number_indexed'],
    '#size' => 5,
    '#min' => 0,
    '#max' => 999999,
    '#description' => $this->t('The number of files to index per file field.<br />The order of indexation is the weight in the widget.<br /> 0 for no restriction.'),
  ];

  $form['number_first_bytes'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Limit size of the extracted string before indexing.'),
    '#default_value' => $current['number_first_bytes'],
    '#size' => 5,
    '#min' => 0,
    '#max' => 99999,
    '#description' => $this->t('Enter a value like "1000", "10 KB", "10 MB" or "10 GB" in order to restrict the size of the content after extraction.<br /> "0" to index the full extracted content without bytes limitation.'),
  ];

  $form['max_filesize'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Maximum upload size'),
    '#default_value' => $current['max_filesize'],
    '#description' => $this->t('Enter a value like "10 KB", "10 MB" or "10 GB" in order to restrict the max file size of files that should be indexed.<br /> Enter "0" for no limit restriction.'),
    '#size' => 10,
  ];

  $form['excluded_private'] = [
    '#type' => 'checkbox',
    '#title' => $this->t('Exclude private files'),
    '#default_value' => $current['excluded_private'],
    '#description' => $this->t('Check this box if you want to exclude private files from being indexed.'),
  ];

  return $form;
}
/**
 * Validates the two size settings of the processor form.
 *
 * Both "number_first_bytes" and "max_filesize" are trimmed and checked with
 * validateSize(); a form error is set on each element that fails.
 *
 * @param array $form
 *   The processor settings form.
 * @param \Drupal\Core\Form\FormStateInterface $form_state
 *   The current form state.
 */
public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
  // Error message per size element, in the order they are checked.
  $size_elements = [
    'number_first_bytes' => $this->t('The size limit option must contain a valid value. You may either enter "0" (for no restriction) or a string like "10 KB", "10 MB" or "10 GB".'),
    'max_filesize' => $this->t('The max filesize option must contain a valid value. You may either enter "0" (for no restriction) or a string like "10 KB", "10 MB" or "10 GB".'),
  ];

  foreach ($size_elements as $element_name => $error_message) {
    $submitted = trim($form_state->getValue($element_name));
    if ($this->validateSize($submitted)) {
      $form_state->setError($form[$element_name], $error_message);
    }
  }
}
/**
 * Checks whether a size setting is malformed.
 *
 * Accepted values are a plain non-negative number of bytes ("0", "1000")
 * or a non-negative number followed by one space and a unit of "KB", "MB"
 * or "GB" ("10 MB").
 *
 * @param string $bytes
 *   The submitted size value.
 *
 * @return bool
 *   TRUE if the value is invalid, FALSE if it is acceptable.
 */
public function validateSize($bytes) {
  // Digits with an optional decimal part. The previous is_int((int) ...)
  // check was always TRUE and let values such as "abc MB" through.
  $number_pattern = '/^\d+(\.\d+)?$/D';

  $size_info = explode(' ', (string) $bytes);
  $parts = count($size_info);

  if ($parts == 1) {
    // A bare number is a byte count; "0" disables the limit.
    return !preg_match($number_pattern, $size_info[0]);
  }
  if ($parts != 2) {
    return TRUE;
  }

  $starts_number = (bool) preg_match($number_pattern, $size_info[0]);
  $unit_expected = in_array($size_info[1], ['KB', 'MB', 'GB'], TRUE);
  return !$starts_number || !$unit_expected;
}
/**
 * Stores the submitted settings together with the derived MIME type list.
 *
 * The space-separated extensions are passed to the file validator, and the
 * MIME types it returns are saved as a space-separated "excluded_mimes"
 * string next to the submitted form values.
 *
 * @param array $form
 *   The processor settings form.
 * @param \Drupal\Core\Form\FormStateInterface $form_state
 *   The current form state.
 */
public function submitConfigurationForm(array &$form, FormStateInterface $form_state) {
  $extensions = explode(' ', $form_state->getValue('excluded_extensions'));
  $mimes = $this->extractFileValidator->getExcludedMimes($extensions);

  $configuration = $form_state->getValues();
  // Submitted values take precedence; the derived list is only added when
  // the form did not provide a value under the same key.
  if (!array_key_exists('excluded_mimes', $configuration)) {
    $configuration['excluded_mimes'] = implode(' ', $mimes);
  }
  $this->setConfiguration($configuration);
}
}