class FilesExtractor in Search API attachments 9.0.x
Same name and namespace in other branches
- 8 src/Plugin/search_api/processor/FilesExtractor.php \Drupal\search_api_attachments\Plugin\search_api\processor\FilesExtractor
Provides file fields processor.
Plugin annotation
@SearchApiProcessor(
id = "file_attachments",
label = @Translation("File attachments"),
description = @Translation("Adds the file attachments content to the indexed data."),
stages = {
"add_properties" = 0,
}
)
Hierarchy
- class \Drupal\search_api_attachments\Plugin\search_api\processor\FilesExtractor extends \Drupal\search_api\Processor\ProcessorPluginBase implements PluginFormInterface
Expanded class hierarchy of FilesExtractor
2 files declare their use of FilesExtractor
- ExtractedText.php in src/Plugin/Field/FieldFormatter/ExtractedText.php
- ExtractorQueue.php in src/Plugin/QueueWorker/ExtractorQueue.php
File
- src/Plugin/search_api/processor/FilesExtractor.php, line 43
Namespace
Drupal\search_api_attachments\Plugin\search_api\processor
View source
class FilesExtractor extends ProcessorPluginBase implements PluginFormInterface {
/**
* Name of the config being edited.
*/
const CONFIGNAME = 'search_api_attachments.admin_config';
/**
* Name of the lock held while the fallback queue bookkeeping is updated.
*
* Acquired and released by queueItem() around its key-value update.
*/
const FALLBACK_QUEUE_LOCK = 'search_api_attachments_fallback_queue';
/**
* Key-value collection recording which files have been queued.
*
* Entries are keyed by file ID; each value is a nested array of the form
* [entity type ID][entity ID] => TRUE (see queueItem()).
*/
const FALLBACK_QUEUE_KV = 'search_api_attachments:queued';
/**
* Name of the "virtual" field that handles file entity type extractions.
*
* This is used, for example, in a File datasource index or a mixed
* datasources index.
*/
const SAA_FILE_ENTITY = 'saa_file_entity';
/**
* Prefix of the properties provided by this module.
*/
const SAA_PREFIX = 'saa_';
/**
* The plugin manager for our text extractor.
*
* @var \Drupal\search_api_attachments\TextExtractorPluginManager
*/
protected $textExtractorPluginManager;
/**
* The extract file validator service.
*
* @var \Drupal\search_api_attachments\ExtractFileValidator
*/
protected $extractFileValidator;
/**
* Config factory service.
*
* @var \Drupal\Core\Config\ConfigFactoryInterface
*/
protected $configFactory;
/**
* Entity type manager service.
*
* @var \Drupal\Core\Entity\EntityTypeManagerInterface
*/
protected $entityTypeManager;
/**
* Key value service.
*
* @var \Drupal\Core\KeyValueStore\KeyValueFactoryInterface
*/
protected $keyValue;
/**
* Module handler service.
*
* @var \Drupal\Core\Extension\ModuleHandlerInterface
*/
protected $moduleHandler;
/**
* Search API field helper.
*
* @var \Drupal\search_api\Utility\FieldsHelperInterface
*/
protected $fieldHelper;
/**
* The logger service.
*
* @var \Psr\Log\LoggerInterface
*/
protected $logger;
/**
 * Constructs a FilesExtractor processor.
 *
 * @param array $configuration
 *   The plugin configuration.
 * @param string $plugin_id
 *   The plugin ID.
 * @param array $plugin_definition
 *   The plugin definition.
 * @param \Drupal\search_api_attachments\TextExtractorPluginManager $text_extractor_plugin_manager
 *   The plugin manager for text extractors.
 * @param \Drupal\Core\Config\ConfigFactoryInterface $config_factory
 *   Config factory service.
 * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager
 *   Entity type manager service.
 * @param \Drupal\Core\KeyValueStore\KeyValueFactoryInterface $key_value
 *   Key value service.
 * @param \Drupal\Core\Extension\ModuleHandlerInterface $module_handler
 *   Module handler service.
 * @param \Drupal\search_api\Utility\FieldsHelperInterface $field_helper
 *   Search API field helper.
 * @param \Drupal\search_api_attachments\ExtractFileValidator $extractFileValidator
 *   The extract file validator service.
 * @param \Psr\Log\LoggerInterface $logger
 *   The logger service.
 */
public function __construct(
  array $configuration,
  $plugin_id,
  array $plugin_definition,
  TextExtractorPluginManager $text_extractor_plugin_manager,
  ConfigFactoryInterface $config_factory,
  EntityTypeManagerInterface $entity_type_manager,
  KeyValueFactoryInterface $key_value,
  ModuleHandlerInterface $module_handler,
  FieldsHelperInterface $field_helper,
  ExtractFileValidator $extractFileValidator,
  LoggerInterface $logger
) {
  parent::__construct($configuration, $plugin_id, $plugin_definition);

  // Extraction-related collaborators.
  $this->textExtractorPluginManager = $text_extractor_plugin_manager;
  $this->extractFileValidator = $extractFileValidator;
  $this->fieldHelper = $field_helper;

  // Core services.
  $this->configFactory = $config_factory;
  $this->entityTypeManager = $entity_type_manager;
  $this->keyValue = $key_value;
  $this->moduleHandler = $module_handler;
  $this->logger = $logger;
}
/**
 * {@inheritdoc}
 */
public static function create(ContainerInterface $container, array $configuration, $plugin_id, $plugin_definition) {
  // Resolve every dependency by its service ID, in constructor order.
  $text_extractor_plugin_manager = $container->get('plugin.manager.search_api_attachments.text_extractor');
  $config_factory = $container->get('config.factory');
  $entity_type_manager = $container->get('entity_type.manager');
  $key_value = $container->get('keyvalue');
  $module_handler = $container->get('module_handler');
  $fields_helper = $container->get('search_api.fields_helper');
  $extract_file_validator = $container->get('search_api_attachments.extract_file_validator');
  $logger = $container->get('logger.channel.search_api_attachments');

  return new static(
    $configuration,
    $plugin_id,
    $plugin_definition,
    $text_extractor_plugin_manager,
    $config_factory,
    $entity_type_manager,
    $key_value,
    $module_handler,
    $fields_helper,
    $extract_file_validator,
    $logger
  );
}
/**
 * {@inheritdoc}
 */
public function getPropertyDefinitions(DatasourceInterface $datasource = NULL) {
  // Properties are only exposed when no datasource is given; a
  // datasource-specific request yields nothing from this processor.
  if ($datasource) {
    return [];
  }

  $definitions = [];
  // One property per indexable file field, plus the generic file entity item.
  foreach ($this->getFileFieldsAndFileEntityItems() as $name => $item_label) {
    $property_label = $this->t('Search api attachments: @label', [
      '@label' => $item_label,
    ]);
    $property_description = $this->t('Search api attachments: @label', [
      '@label' => $item_label,
    ]);
    $property_key = static::SAA_PREFIX . $name;
    $definitions[$property_key] = new ProcessorProperty([
      'label' => $property_label,
      'description' => $property_description,
      'type' => 'string',
      'processor_id' => $this->getPluginId(),
    ]);
  }
  return $definitions;
}
/**
 * {@inheritdoc}
 */
public function addFieldValues(ItemInterface $item) {
  $config = $this->configFactory->get(static::CONFIGNAME);
  $extractor_plugin_id = $config->get('extraction_method');
  // Get the config option to read text files directly.
  $this->configuration['read_text_files_directly'] = $config->get('read_text_files_directly');

  // Nothing can be extracted until an extraction method is configured.
  if ($extractor_plugin_id == '') {
    return;
  }

  $configuration = $config->get($extractor_plugin_id . '_configuration');
  $extractor_plugin = $this->textExtractorPluginManager->createInstance($extractor_plugin_id, $configuration);

  // Get the entity. Items whose original object is unavailable or is not an
  // entity cannot carry file fields, so they are skipped.
  $original_object = $item->getOriginalObject();
  $entity = $original_object ? $original_object->getValue() : NULL;
  if (!$entity instanceof EntityInterface) {
    return;
  }
  $is_entity_type_file = $entity->getEntityTypeId() == 'file';

  foreach ($this->getFileFieldsAndFileEntityItems() as $field_name => $label) {
    $is_file_entity_item = $field_name === static::SAA_FILE_ENTITY;
    // If the parent entity is not a file, there is no need to parse the
    // virtual file entity item. Only this item is skipped: the remaining
    // file fields must still be processed regardless of the order in which
    // the index datasources were declared.
    if ($is_file_entity_item && !$is_entity_type_file) {
      continue;
    }

    $property_path = static::SAA_PREFIX . $field_name;
    // A way to load $field.
    foreach ($this->fieldHelper->filterForPropertyPath($item->getFields(), NULL, $property_path) as $field) {
      // Start from a clean list for every field so that files collected for
      // a previous field never end up in this one.
      $files = $is_file_entity_item ? [$entity] : [];

      if ($entity->hasField($field_name)) {
        $fids = $this->limitToAllowedNumber($this->collectFieldFileIds($entity, $field_name));
        // Retrieve the files.
        $files = $this->entityTypeManager
          ->getStorage('file')
          ->loadMultiple($fids);
      }

      if (empty($files)) {
        continue;
      }

      $extraction = '';
      foreach ($files as $file) {
        if ($this->isFileIndexable($file, $item, $field_name)) {
          $extraction .= $this->extractOrGetFromCache($entity, $file, $extractor_plugin);
        }
      }
      $field->addValue($extraction);
    }
  }
}

/**
 * Collects the file IDs referenced by one field of an entity.
 *
 * Supports plain file fields and entity reference fields targeting media
 * whose source field is a file field. Any other field yields no IDs.
 *
 * @param \Drupal\Core\Entity\EntityInterface $entity
 *   The entity holding the field; it must have the given field.
 * @param string $field_name
 *   The name of the field to inspect.
 *
 * @return array
 *   The referenced file IDs, in field delta order.
 */
private function collectFieldFileIds($entity, $field_name) {
  $file_ids = [];
  $field_items = $entity->get($field_name);
  $field_definition = $field_items->getFieldDefinition();
  // Get type to manage media entity reference case.
  $type = $field_definition->getType();

  if ($type == 'file') {
    foreach ($field_items->filterEmptyItems()->getValue() as $filefield_value) {
      $file_ids[] = $filefield_value['target_id'];
    }
    return $file_ids;
  }

  $is_media_reference = $type == 'entity_reference'
    && $field_definition->getItemDefinition()->getSetting('target_type') === 'media';
  if (!$is_media_reference) {
    return $file_ids;
  }

  // This is a media field: follow each media item to its source field.
  foreach ($field_items->filterEmptyItems()->getValue() as $media_value) {
    $media = Media::load($media_value['target_id']);
    if ($media === NULL) {
      continue;
    }
    $bundle_configuration = $media->getSource()->getConfiguration();
    if (!isset($bundle_configuration['source_field'])) {
      continue;
    }
    /** @var \Drupal\Core\Field\FieldItemInterface $field_item */
    foreach ($media->get($bundle_configuration['source_field'])->filterEmptyItems() as $field_item) {
      if ($field_item->getFieldDefinition()->getType() === 'file') {
        $value = $field_item->getValue();
        $file_ids[] = $value['target_id'];
      }
    }
  }
  return $file_ids;
}
/**
 * Extract non text file data or get it from cache if available and cache it.
 *
 * Plain text files are read straight from disk when the
 * 'read_text_files_directly' option is set. Otherwise the key-value cache is
 * consulted first; on a miss the extractor plugin is run, unless this file
 * has already been queued for this entity, in which case it is queued again
 * and an empty string is returned.
 *
 * @param \Drupal\Core\Entity\EntityInterface $entity
 *   The entity the file is attached to.
 * @param \Drupal\file\Entity\File $file
 *   A file object.
 * @param \Drupal\search_api_attachments\TextExtractorPluginInterface $extractor_plugin
 *   The plugin used to extract file content.
 *
 * @return string
 *   The extracted text, or an empty string when nothing could be extracted.
 */
public function extractOrGetFromCache(EntityInterface $entity, File $file, TextExtractorPluginInterface $extractor_plugin) {
  // Directly process plaintext files.
  if (!empty($this->configuration['read_text_files_directly'])) {
    if (substr((string) $file->getMimeType(), 0, 5) == 'text/') {
      $contents = file_get_contents($file->getFileUri());
      // file_get_contents() returns FALSE on failure; honour the documented
      // string return type instead of leaking a boolean to callers.
      return $contents === FALSE ? '' : $contents;
    }
  }

  $collection = 'search_api_attachments';
  $key = $collection . ':' . $file->id();
  $cache_store = $this->keyValue->get($collection);

  if ($cache = $cache_store->get($key)) {
    return $this->limitBytes($cache);
  }

  $extracted_data = '';
  try {
    // Only extract if this file has not previously failed and was queued.
    $queued_files = $this->keyValue
      ->get(static::FALLBACK_QUEUE_KV)
      ->get($file->id());
    if (empty($queued_files[$entity->getEntityTypeId()][$entity->id()])) {
      $extracted_data = $extractor_plugin->extract($file);
      $extracted_data = $this->limitBytes($extracted_data);
      $cache_store->set($key, $extracted_data);
      $this->moduleHandler->invokeAll('search_api_attachments_content_extracted', [
        $file,
        $entity,
      ]);
    }
    else {
      $this->queueItem($entity, $file);
    }
  }
  catch (\Exception $e) {
    // A failed extraction is logged and handed over to the fallback queue.
    $error = Error::decodeException($e);
    $message_params = [
      '@file_id' => $file->id(),
      '@entity_id' => $entity->id(),
      '@entity_type' => $entity->getEntityTypeId(),
      '@type' => $error['%type'],
      '@message' => $error['@message'],
      '@function' => $error['%function'],
      '@line' => $error['%line'],
      '@file' => $error['%file'],
    ];
    $this->logger->log(LogLevel::ERROR, 'Error extracting text from file @file_id for @entity_type @entity_id. @type: @message in @function (line @line of @file).', $message_params);
    $this->queueItem($entity, $file);
  }
  return $extracted_data;
}
/**
 * Queue a failed extraction for later processing.
 *
 * Records the (file, entity) pair in the fallback key-value collection while
 * holding the fallback queue lock, then adds an item to the
 * 'search_api_attachments' queue and logs the fact.
 *
 * @param \Drupal\Core\Entity\EntityInterface $entity
 *   The entity the file is attached to.
 * @param \Drupal\file\Entity\File $file
 *   A file object.
 *
 * @return bool
 *   TRUE if the item was queued, FALSE if the lock could not be acquired.
 */
private function queueItem(EntityInterface $entity, File $file) {
  $lock = \Drupal::lock();
  if (!$lock->acquire(static::FALLBACK_QUEUE_LOCK)) {
    return FALSE;
  }

  try {
    $queued_file_collection = $this->keyValue->get(static::FALLBACK_QUEUE_KV);
    $queued_files = $queued_file_collection->get($file->id());
    $queued_files[$entity->getEntityTypeId()][$entity->id()] = TRUE;
    $queued_file_collection->set($file->id(), $queued_files);
  }
  finally {
    // Always give the lock back, even when the key-value update throws;
    // otherwise other requests are blocked until the lock expires.
    $lock->release(static::FALLBACK_QUEUE_LOCK);
  }

  // Add file to queue.
  $queue = \Drupal::queue('search_api_attachments');
  $item = new \stdClass();
  $item->fid = $file->id();
  $item->entity_id = $entity->id();
  $item->entity_type = $entity->getEntityTypeId();
  $item->extract_attempts = 1;
  $queue->createItem($item);

  $this->logger->log(LogLevel::INFO, 'File added to the queue for text extraction @file_id for @entity_type @entity_id.', [
    '@file_id' => $file->id(),
    '@entity_id' => $entity->id(),
    '@entity_type' => $entity->getEntityTypeId(),
  ]);
  return TRUE;
}
/**
 * Trims a list of file IDs to the configured per-field maximum.
 *
 * @param array $all_fids
 *   Array of fids.
 *
 * @return array
 *   The first 'number_indexed' fids, or all of them when that option is
 *   unset, is 0, or is not exceeded.
 */
public function limitToAllowedNumber(array $all_fids) {
  $limit = isset($this->configuration['number_indexed']) ? $this->configuration['number_indexed'] : 0;

  // A limit of 0 means "no restriction".
  $exceeds_limit = $limit != 0 && count($all_fids) > $limit;

  return $exceeds_limit ? array_slice($all_fids, 0, $limit) : $all_fids;
}
/**
 * Limit the indexed text to first N bytes.
 *
 * @param string $extracted_text
 *   The whole extracted text.
 *
 * @return string
 *   The first N bytes of the extracted text that will be indexed and cached,
 *   or the text unchanged when the configured limit is 0.
 */
public function limitBytes($extracted_text) {
  // Without an explicit setting, fall back to a sensible amount of text to
  // extract and cache in the database: 1 MB should be enough for most cases.
  $size_setting = '1 MB';
  if (isset($this->configuration['number_first_bytes'])) {
    $size_setting = $this->configuration['number_first_bytes'];
  }
  $bytes = Bytes::toInt($size_setting);

  // A limit of 0 disables truncation; otherwise cut on a character boundary.
  return $bytes == 0 ? $extracted_text : mb_strcut($extracted_text, 0, $bytes);
}
/**
 * Check if the file is allowed to be indexed.
 *
 * The checks run in this order and stop at the first failure: the file
 * exists on disk, its MIME type is not excluded, it is permanent, it does
 * not exceed the configured size, and private files are allowed if it is
 * one. Finally hook_search_api_attachments_indexable() implementations may
 * veto the file by returning FALSE.
 *
 * Options missing from the processor configuration fall back to the
 * defaults offered by buildConfigurationForm(): no size limit and private
 * files excluded.
 *
 * @param object $file
 *   A file object.
 * @param \Drupal\search_api\Item\ItemInterface $item
 *   The item the file was referenced in.
 * @param string|null $field_name
 *   The name of the field the file was referenced in, if applicable.
 *
 * @return bool
 *   TRUE if the file may be indexed, FALSE otherwise.
 */
public function isFileIndexable($file, ItemInterface $item, $field_name = NULL) {
  // File should exist in disc.
  if (!file_exists($file->getFileUri())) {
    return FALSE;
  }

  // File should have a mime type that is allowed.
  $excluded_mimes = isset($this->configuration['excluded_mimes']) ? $this->configuration['excluded_mimes'] : NULL;
  $all_excluded_mimes = $this->extractFileValidator->getExcludedMimes(NULL, $excluded_mimes);
  if (in_array($file->getMimeType(), $all_excluded_mimes, TRUE)) {
    return FALSE;
  }

  // File permanent.
  if (!$file->isPermanent()) {
    return FALSE;
  }

  // File shouldn't exceed configured file size.
  $max_filesize = isset($this->configuration['max_filesize']) ? $this->configuration['max_filesize'] : '0';
  if (!$this->extractFileValidator->isFileSizeAllowed($file, $max_filesize)) {
    return FALSE;
  }

  // Whether a private file can be indexed or not.
  $excluded_private = isset($this->configuration['excluded_private']) ? $this->configuration['excluded_private'] : TRUE;
  if (!$this->extractFileValidator->isPrivateFileAllowed($file, $excluded_private)) {
    return FALSE;
  }

  // Any hook implementation returning exactly FALSE vetoes the file.
  $result = $this->moduleHandler->invokeAll('search_api_attachments_indexable', [
    $file,
    $item,
    $field_name,
  ]);
  return !in_array(FALSE, $result, TRUE);
}
/**
 * Get the file fields of indexed bundles and an entity file general item.
 *
 * A property is reported when it is a file field, or when it is an entity
 * reference to media and at least one allowed media bundle uses a file field
 * as its source field.
 *
 * @return array
 *   An array of file field with field name as key and label as value and
 *   an element for generic file entity item.
 */
protected function getFileFieldsAndFileEntityItems() {
  $file_elements = [];
  // Retrieve file fields of indexed bundles.
  foreach ($this->getIndex()->getDatasources() as $datasource) {
    if ($datasource->getPluginId() == 'entity:file') {
      $file_elements[static::SAA_FILE_ENTITY] = $this->t('File entity');
    }

    foreach ($datasource->getPropertyDefinitions() as $property) {
      if (!$property instanceof FieldDefinitionInterface) {
        continue;
      }

      $type = $property->getType();
      if ($type == 'file') {
        $file_elements[$property->getName()] = $property->getLabel();
        continue;
      }
      if ($type != 'entity_reference' || $property->getSetting('target_type') !== 'media') {
        continue;
      }

      $settings = $property->getItemDefinition()->getSettings();
      if (!isset($settings['handler_settings']['target_bundles'])) {
        continue;
      }

      // For each media bundle allowed, check if the source field is a
      // file field.
      foreach ($settings['handler_settings']['target_bundles'] as $bundle_name) {
        try {
          $media_type = $this->entityTypeManager
            ->getStorage('media_type')
            ->load($bundle_name);
          if (empty($media_type)) {
            continue;
          }
          $bundle_configuration = $media_type->toArray();
          if (!isset($bundle_configuration['source_configuration']['source_field'])) {
            continue;
          }
          $source_field = $bundle_configuration['source_configuration']['source_field'];
          $field_storage = $this->entityTypeManager
            ->getStorage('field_storage_config')
            ->load(sprintf('media.%s', $source_field));
          // The storage may be missing (e.g. a stale bundle configuration);
          // skip the bundle instead of calling a method on NULL.
          if ($field_storage === NULL) {
            continue;
          }
          $field_config = $field_storage->toArray();
          if (isset($field_config['type']) && $field_config['type'] === 'file') {
            $file_elements[$property->getName()] = $property->getLabel();
            // One file-backed bundle is enough to register the property.
            break;
          }
        }
        catch (InvalidPluginDefinitionException $e) {
          watchdog_exception('search_api_attachments', $e);
          continue;
        }
        catch (PluginNotFoundException $e) {
          watchdog_exception('search_api_attachments', $e);
          continue;
        }
      }
    }
  }
  return $file_elements;
}
/**
 * {@inheritdoc}
 */
public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
  // Resolve every default up front: the stored value when there is one,
  // otherwise the built-in fallback.
  $settings = $this->configuration;

  $excluded_extensions = ExtractFileValidator::DEFAULT_EXCLUDED_EXTENSIONS;
  if (isset($settings['excluded_extensions'])) {
    $excluded_extensions = $settings['excluded_extensions'];
  }
  $number_indexed = '0';
  if (isset($settings['number_indexed'])) {
    $number_indexed = $settings['number_indexed'];
  }
  $number_first_bytes = '1 MB';
  if (isset($settings['number_first_bytes'])) {
    $number_first_bytes = $settings['number_first_bytes'];
  }
  $max_filesize = '0';
  if (isset($settings['max_filesize'])) {
    $max_filesize = $settings['max_filesize'];
  }
  $excluded_private = TRUE;
  if (isset($settings['excluded_private'])) {
    $excluded_private = $settings['excluded_private'];
  }

  // Extensions whose files are never indexed.
  $form['excluded_extensions'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Excluded file extensions'),
    '#default_value' => $excluded_extensions,
    '#size' => 80,
    '#maxlength' => 255,
    '#description' => $this->t('File extensions that are excluded from indexing. Separate extensions with a space and do not include the leading dot.<br />Example: "aif art avi bmp gif ico mov oga ogv png psd ra ram rgb flv"<br />Extensions are internally mapped to a MIME type, so it is not necessary to put variations that map to the same type (e.g. tif is sufficient for tif and tiff)'),
  ];

  // Maximum number of files taken from each file field.
  $form['number_indexed'] = [
    '#type' => 'number',
    '#title' => $this->t('Number of files indexed per file field'),
    '#default_value' => $number_indexed,
    '#size' => 5,
    '#min' => 0,
    '#max' => 999999,
    '#description' => $this->t('The number of files to index per file field.<br />The order of indexation is the weight in the widget.<br /> 0 for no restriction.'),
  ];

  // Cap on the size of the extracted text.
  $form['number_first_bytes'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Limit size of the extracted string before indexing.'),
    '#default_value' => $number_first_bytes,
    '#size' => 5,
    '#min' => 0,
    '#max' => 99999,
    '#description' => $this->t('Enter a value like "1000", "10 KB", "10 MB" or "10 GB" in order to restrict the size of the content after extraction.<br /> "0" to index the full extracted content without bytes limitation.'),
  ];

  // Cap on the size of the files considered for indexing.
  $form['max_filesize'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Maximum upload size'),
    '#default_value' => $max_filesize,
    '#description' => $this->t('Enter a value like "10 KB", "10 MB" or "10 GB" in order to restrict the max file size of files that should be indexed.<br /> Enter "0" for no limit restriction.'),
    '#size' => 10,
  ];

  // Whether files stored in the private file system are skipped.
  $form['excluded_private'] = [
    '#type' => 'checkbox',
    '#title' => $this->t('Exclude private files'),
    '#default_value' => $excluded_private,
    '#description' => $this->t('Check this box if you want to exclude private files from being indexed.'),
  ];

  return $form;
}
/**
 * Form validation handler.
 *
 * Checks that both size options ('number_first_bytes' and 'max_filesize')
 * hold a value accepted by validateSize().
 *
 * @param array $form
 *   An associative array containing the structure of the plugin form as built
 *   by static::buildConfigurationForm().
 * @param \Drupal\Core\Form\FormStateInterface $form_state
 *   The current state of the complete form.
 *
 * @see \Drupal\Core\Plugin\PluginFormInterface::validateConfigurationForm()
 */
public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
  // Size elements to check, each with the error reported for a bad value.
  $size_elements = [
    'number_first_bytes' => $this->t('The size limit option must contain a valid value. You may either enter "0" (for no restriction) or a string like "10 KB", "10 MB" or "10 GB".'),
    'max_filesize' => $this->t('The max filesize option must contain a valid value. You may either enter "0" (for no restriction) or a string like "10 KB", "10 MB" or "10 GB".'),
  ];

  foreach ($size_elements as $element_name => $error_message) {
    $submitted = trim($form_state->getValue($element_name));
    if ($this->validateSize($submitted)) {
      $form_state->setError($form[$element_name], $error_message);
    }
  }
}
/**
 * Helper method to validate the format of a size setting.
 *
 * A value is valid when it is a non-negative integer, optionally followed by
 * exactly one space and one of the units "KB", "MB" or "GB": for example
 * "0", "1000", "10 KB", "10 MB" or "10 GB".
 *
 * @param string $bytes
 *   The size value as entered in the form.
 *
 * @return bool
 *   TRUE if $bytes is INVALID (i.e. an error must be reported), FALSE if it
 *   is well-formed.
 */
public function validateSize($bytes) {
  // The numeric part must really consist of digits only: casting to int and
  // testing is_int() would accept any string at all. "\z" with the D flag
  // rejects a trailing newline.
  $is_valid = preg_match('/^\d+(?: (?:KB|MB|GB))?\z/D', (string) $bytes) === 1;
  return !$is_valid;
}
/**
 * Form submission handler.
 *
 * Stores the submitted values as the plugin configuration, together with an
 * 'excluded_mimes' entry derived from the submitted excluded extensions.
 *
 * @param array $form
 *   An associative array containing the structure of the plugin form as built
 *   by static::buildConfigurationForm().
 * @param \Drupal\Core\Form\FormStateInterface $form_state
 *   The current state of the complete form.
 *
 * @see \Drupal\Core\Plugin\PluginFormInterface::submitConfigurationForm()
 */
public function submitConfigurationForm(array &$form, FormStateInterface $form_state) {
  // Turn the space-separated extension list into the matching MIME types.
  $extensions = explode(' ', $form_state->getValue('excluded_extensions'));
  $mimes = $this->extractFileValidator->getExcludedMimes($extensions);

  // The union keeps any submitted value and only adds the derived entry.
  $derived = [
    'excluded_mimes' => implode(' ', $mimes),
  ];
  $this->setConfiguration($form_state->getValues() + $derived);
}
}
Members
Name | Modifiers | Type | Description | Overrides |
---|---|---|---|---|
FilesExtractor:: |
protected | property | Config factory service. | |
FilesExtractor:: |
protected | property | Entity type manager service. | |
FilesExtractor:: |
protected | property | The extract file validator service. | |
FilesExtractor:: |
protected | property | Search API field helper. | |
FilesExtractor:: |
protected | property | Key value service. | |
FilesExtractor:: |
protected | property | The logger service. | |
FilesExtractor:: |
protected | property | Module handler service. | |
FilesExtractor:: |
protected | property | The plugin manager for our text extractor. | |
FilesExtractor:: |
public | function | ||
FilesExtractor:: |
public | function |
Form constructor. Overrides PluginFormInterface:: |
|
FilesExtractor:: |
constant | Name of the config being edited. | ||
FilesExtractor:: |
public static | function | ||
FilesExtractor:: |
public | function | Extract non text file data or get it from cache if available and cache it. | |
FilesExtractor:: |
constant | |||
FilesExtractor:: |
constant | |||
FilesExtractor:: |
protected | function | Get the file fields of indexed bundles and an entity file general item. | |
FilesExtractor:: |
public | function | ||
FilesExtractor:: |
public | function | Check if the file is allowed to be indexed. | |
FilesExtractor:: |
public | function | Limit the indexed text to first N bytes. | |
FilesExtractor:: |
public | function | Limit the number of items to index per field to the configured limit. | |
FilesExtractor:: |
private | function | Queue a failed extraction for later processing. | |
FilesExtractor:: |
constant | Name of the "virtual" field that handles file entity type extractions. | ||
FilesExtractor:: |
constant | Prefix of the properties provided by this module. | ||
FilesExtractor:: |
public | function |
Form submission handler. Overrides PluginFormInterface:: |
|
FilesExtractor:: |
public | function |
Form validation handler. Overrides PluginFormInterface:: |
|
FilesExtractor:: |
public | function | Helper method to validate the size of files' format. | |
FilesExtractor:: |
public | function |