class PythonPdf2txtExtractor in Search API attachments 8
Same name and namespace in other branches
- 9.0.x src/Plugin/search_api_attachments/PythonPdf2txtExtractor.php \Drupal\search_api_attachments\Plugin\search_api_attachments\PythonPdf2txtExtractor
Provides python pdf2text extractor.
Plugin annotation
@SearchApiAttachmentsTextExtractor(
id = "python_pdf2txt_extractor",
label = @Translation("Python Pdf2txt Extractor"),
description = @Translation("Adds python Pdf2txt extractor support."),
)
Hierarchy
- class \Drupal\Component\Plugin\PluginBase implements DerivativeInspectionInterface, PluginInspectionInterface
- class \Drupal\Core\Plugin\PluginBase uses DependencySerializationTrait, MessengerTrait, StringTranslationTrait
- class \Drupal\search_api_attachments\TextExtractorPluginBase implements ContainerFactoryPluginInterface, TextExtractorPluginInterface
- class \Drupal\search_api_attachments\Plugin\search_api_attachments\PythonPdf2txtExtractor
- class \Drupal\search_api_attachments\TextExtractorPluginBase implements ContainerFactoryPluginInterface, TextExtractorPluginInterface
- class \Drupal\Core\Plugin\PluginBase uses DependencySerializationTrait, MessengerTrait, StringTranslationTrait
Expanded class hierarchy of PythonPdf2txtExtractor
File
- src/Plugin/search_api_attachments/PythonPdf2txtExtractor.php, line 18
Namespace
Drupal\search_api_attachments\Plugin\search_api_attachments
View source
class PythonPdf2txtExtractor extends TextExtractorPluginBase {

  /**
   * Extract file with python Pdf2txt library.
   *
   * Only files whose MIME type is returned by getPdfMimeTypes() are
   * processed; the configured python executable is invoked with the
   * configured pdf2txt script and the real path of the file.
   *
   * @param \Drupal\file\Entity\File $file
   *   A file object.
   *
   * @return string|null
   *   The text extracted from the file, or NULL when the file's MIME type is
   *   not one of the PDF MIME types.
   *
   * @throws \Exception
   *   Thrown when shell_exec() returns NULL or FALSE for the pdf2txt command.
   */
  public function extract(File $file) {
    if (!in_array($file->getMimeType(), $this->getPdfMimeTypes())) {
      return NULL;
    }

    $filepath = $this->getRealpath($file->getFileUri());
    $python_path = $this->configuration['python_path'];
    $python_pdf2txt_script = realpath($this->configuration['python_pdf2txt_script']);

    // UTF-8 multibyte characters will be stripped by escapeshellarg() for
    // the default C-locale.
    // So temporarily set the locale to UTF-8 so that the filepath remains
    // valid. The command must be built while the UTF-8 locale is active.
    $backup_locale = setlocale(LC_CTYPE, '0');
    setlocale(LC_CTYPE, 'en_US.UTF-8');
    $cmd = escapeshellcmd($python_path) . ' ' . escapeshellarg($python_pdf2txt_script) . ' -C -t text ' . escapeshellarg($filepath);
    // Restore the locale.
    setlocale(LC_CTYPE, $backup_locale);

    // Support UTF-8 commands.
    // @see http://www.php.net/manual/en/function.shell-exec.php#85095
    // NOTE(review): this assignment runs in its own shell invocation, so it
    // presumably does not reach the command executed below - confirm before
    // relying on it.
    shell_exec("LANG=en_US.utf-8");

    $output = shell_exec($cmd);
    // shell_exec() yields NULL on error or empty output, and FALSE when the
    // pipe cannot be established; neither is usable extracted text.
    if ($output === NULL || $output === FALSE) {
      throw new \Exception('Python Pdf2txt Exctractor is not available.');
    }
    return $output;
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    // Executable used to run the pdf2txt script.
    $form['python_path'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Path to python executable'),
      '#description' => $this->t('Enter the path to python executable. Example: "python".'),
      '#default_value' => $this->configuration['python_path'],
      '#required' => TRUE,
    ];
    // Location of the pdf2txt script passed to the executable.
    $form['python_pdf2txt_script'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Full path to the python pdf2txt script'),
      '#description' => $this->t('Enter the full path to the python pdf2txt script. Example: "/usr/bin/pdf2txt.py".'),
      '#default_value' => $this->configuration['python_pdf2txt_script'],
      '#required' => TRUE,
    ];
    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    $values = $form_state->getValue(['text_extractor_config']);
    $python_path = $values['python_path'];
    $python_pdf2txt_script = $values['python_pdf2txt_script'];

    // Check that the file exists.
    if (!file_exists($python_pdf2txt_script)) {
      $form_state->setError(
        $form['text_extractor_config']['python_pdf2txt_script'],
        $this->t('The file %path does not exist.', ['%path' => $python_pdf2txt_script])
      );
    }
    else {
      // Run the script without arguments and inspect only its exit status.
      $cmd = escapeshellcmd($python_path) . ' ' . escapeshellarg($python_pdf2txt_script);
      exec($cmd, $output, $return_code);
      // An exit status of 100 is the only one accepted here; any other
      // status (e.g. 1) flags both fields as invalid.
      if ($return_code != 100) {
        // Empty message: marks the field as erroneous; the explanation is
        // attached to the script field below.
        $form_state->setError($form['text_extractor_config']['python_path'], '');
        $form_state->setError(
          $form['text_extractor_config']['python_pdf2txt_script'],
          $this->t('Python Pdf2txt script file is not executable.')
        );
      }
    }
  }

  /**
   * {@inheritdoc}
   */
  public function submitConfigurationForm(array &$form, FormStateInterface $form_state) {
    // Copy the submitted values into the plugin configuration before the
    // parent implementation runs.
    $this->configuration['python_path'] = $form_state->getValue([
      'text_extractor_config',
      'python_path',
    ]);
    $this->configuration['python_pdf2txt_script'] = $form_state->getValue([
      'text_extractor_config',
      'python_pdf2txt_script',
    ]);
    parent::submitConfigurationForm($form, $form_state);
  }

}
Members
Name | Modifiers | Type | Description | Overrides |
---|---|---|---|---|
DependencySerializationTrait:: |
protected | property | An array of entity type IDs keyed by the property name of their storages. | |
DependencySerializationTrait:: |
protected | property | An array of service IDs keyed by property name used for serialization. | |
DependencySerializationTrait:: |
public | function | 1 | |
DependencySerializationTrait:: |
public | function | 2 | |
MessengerTrait:: |
public | function | Gets the messenger. | 29 |
MessengerTrait:: |
public | function | Sets the messenger. | |
PluginBase:: |
protected | property | Configuration information passed into the plugin. | 1 |
PluginBase:: |
protected | property | The plugin implementation definition. | 1 |
PluginBase:: |
protected | property | The plugin_id. | |
PluginBase:: |
constant | A string which is used to separate base plugin IDs from the derivative ID. | ||
PluginBase:: |
public | function |
Gets the base_plugin_id of the plugin instance. Overrides DerivativeInspectionInterface:: |
|
PluginBase:: |
public | function |
Gets the derivative_id of the plugin instance. Overrides DerivativeInspectionInterface:: |
|
PluginBase:: |
public | function |
Gets the definition of the plugin implementation. Overrides PluginInspectionInterface:: |
3 |
PluginBase:: |
public | function |
Gets the plugin_id of the plugin instance. Overrides PluginInspectionInterface:: |
|
PluginBase:: |
public | function | Determines if the plugin is configurable. | |
PythonPdf2txtExtractor:: |
public | function |
Form constructor. Overrides PluginFormInterface:: |
|
PythonPdf2txtExtractor:: |
public | function |
Extract file with python Pdf2txt library. Overrides TextExtractorPluginBase:: |
|
PythonPdf2txtExtractor:: |
public | function |
Form submission handler. Overrides TextExtractorPluginBase:: |
|
PythonPdf2txtExtractor:: |
public | function |
Form validation handler. Overrides TextExtractorPluginBase:: |
|
StringTranslationTrait:: |
protected | property | The string translation service. | 1 |
StringTranslationTrait:: |
protected | function | Formats a string containing a count of items. | |
StringTranslationTrait:: |
protected | function | Returns the number of plurals supported by a given language. | |
StringTranslationTrait:: |
protected | function | Gets the string translation service. | |
StringTranslationTrait:: |
public | function | Sets the string translation service to use. | 2 |
StringTranslationTrait:: |
protected | function | Translates a string to the current language or to a given language. | |
TextExtractorPluginBase:: |
protected | property | Config factory service. | |
TextExtractorPluginBase:: |
protected | property |
The messenger. Overrides MessengerTrait:: |
|
TextExtractorPluginBase:: |
protected | property | Mime type guesser service. | |
TextExtractorPluginBase:: |
protected | property | Stream wrapper manager service. | |
TextExtractorPluginBase:: |
public | function | ||
TextExtractorPluginBase:: |
constant | Name of the config being edited. | ||
TextExtractorPluginBase:: |
public static | function |
Creates an instance of the plugin. Overrides ContainerFactoryPluginInterface:: |
2 |
TextExtractorPluginBase:: |
public | function |
Gets default configuration for this plugin. Overrides ConfigurableInterface:: |
|
TextExtractorPluginBase:: |
public | function |
Gets this plugin's configuration. Overrides ConfigurableInterface:: |
|
TextExtractorPluginBase:: |
public | function | ||
TextExtractorPluginBase:: |
public | function | Helper method to get the PDF MIME types. | |
TextExtractorPluginBase:: |
public | function | Helper method to get the real path from an uri. | |
TextExtractorPluginBase:: |
public | function |
Sets the configuration for this plugin instance. Overrides ConfigurableInterface:: |
|
TextExtractorPluginBase:: |
public | function |
Constructs a \Drupal\Component\Plugin\PluginBase object. Overrides PluginBase:: |
2 |