class HtmlFilter in Search API 8
Strips HTML tags from fulltext fields and decodes HTML entities.
Plugin annotation
@SearchApiProcessor(
id = "html_filter",
label = @Translation("HTML filter"),
description = @Translation("Strips HTML tags from fulltext fields and decodes HTML entities. Use this processor when indexing HTML data – for example, node bodies for certain text formats. The processor also allows to boost (or ignore) the contents of specific elements."),
stages = {
"pre_index_save" = 0,
"preprocess_index" = -15,
"preprocess_query" = -15,
}
)
Hierarchy
- class \Drupal\Component\Plugin\PluginBase implements DerivativeInspectionInterface, PluginInspectionInterface
- class \Drupal\Core\Plugin\PluginBase uses DependencySerializationTrait, MessengerTrait, StringTranslationTrait
- class \Drupal\search_api\Plugin\HideablePluginBase implements HideablePluginInterface
- class \Drupal\search_api\Plugin\ConfigurablePluginBase implements ConfigurablePluginInterface uses PluginDependencyTrait
- class \Drupal\search_api\Plugin\IndexPluginBase implements IndexPluginInterface
- class \Drupal\search_api\Processor\ProcessorPluginBase implements ProcessorInterface
- class \Drupal\search_api\Processor\FieldsProcessorPluginBase implements PluginFormInterface, TrustedCallbackInterface uses PluginFormTrait
- class \Drupal\search_api\Plugin\search_api\processor\HtmlFilter
- class \Drupal\search_api\Processor\FieldsProcessorPluginBase implements PluginFormInterface, TrustedCallbackInterface uses PluginFormTrait
- class \Drupal\search_api\Processor\ProcessorPluginBase implements ProcessorInterface
- class \Drupal\search_api\Plugin\IndexPluginBase implements IndexPluginInterface
- class \Drupal\search_api\Plugin\ConfigurablePluginBase implements ConfigurablePluginInterface uses PluginDependencyTrait
- class \Drupal\search_api\Plugin\HideablePluginBase implements HideablePluginInterface
- class \Drupal\Core\Plugin\PluginBase uses DependencySerializationTrait, MessengerTrait, StringTranslationTrait
Expanded class hierarchy of HtmlFilter
1 file declares its use of HtmlFilter
- HtmlFilterTest.php in tests/src/Unit/Processor/HtmlFilterTest.php
File
- src/Plugin/search_api/processor/HtmlFilter.php, line 32
Namespace
Drupal\search_api\Plugin\search_api\processor
View source
class HtmlFilter extends FieldsProcessorPluginBase {

  /**
   * The data type helper.
   *
   * Stays NULL until setDataTypeHelper() is called; getDataTypeHelper() then
   * falls back to the "search_api.data_type_helper" service.
   *
   * @var \Drupal\search_api\Utility\DataTypeHelperInterface|null
   */
  protected $dataTypeHelper;

  /**
   * Retrieves the data type helper.
   *
   * @return \Drupal\search_api\Utility\DataTypeHelperInterface
   *   The explicitly set data type helper, or the one from the service
   *   container if none was set.
   */
  public function getDataTypeHelper() {
    return $this->dataTypeHelper ?: \Drupal::service('search_api.data_type_helper');
  }

  /**
   * Sets the data type helper.
   *
   * @param \Drupal\search_api\Utility\DataTypeHelperInterface $data_type_helper
   *   The new data type helper.
   *
   * @return $this
   */
  public function setDataTypeHelper(DataTypeHelperInterface $data_type_helper) {
    $this->dataTypeHelper = $data_type_helper;
    return $this;
  }

  /**
   * {@inheritdoc}
   */
  public function defaultConfiguration() {
    $configuration = parent::defaultConfiguration();

    // The "+=" operator keeps any key already provided by the parent.
    $configuration += [
      'title' => TRUE,
      'alt' => TRUE,
      'tags' => [
        'h1' => 5,
        'h2' => 3,
        'h3' => 2,
        'strong' => 2,
        'b' => 2,
        'em' => 1.5,
        'u' => 1.5,
      ],
    ];

    return $configuration;
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    $form = parent::buildConfigurationForm($form, $form_state);

    $form['title'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Index title attribute'),
      '#description' => $this->t('If set, the contents of title attributes will be indexed.'),
      '#default_value' => $this->configuration['title'],
    ];
    $form['alt'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Index alt attribute'),
      '#description' => $this->t('If set, the alternative text of images will be indexed.'),
      '#default_value' => $this->configuration['alt'],
    ];

    // Present the configured tag boosts as YAML text for the textarea.
    $dumper = new Dumper();
    $tags = $dumper->dump($this->configuration['tags'], 2);
    $tags = str_replace('\\r\\n', "\n", $tags);
    $tags = str_replace('"', '', $tags);

    $t_args = [
      ':url' => Url::fromUri('https://en.wikipedia.org/wiki/YAML')->toString(),
    ];
    $form['tags'] = [
      '#type' => 'textarea',
      '#title' => $this->t('Tag boosts'),
      '#description' => $this->t('Specify special boost values for certain HTML elements, in <a href=":url">YAML file format</a>. The boost values of nested elements are multiplied, elements not mentioned will have the default boost value of 1. Assign a boost of 0 to ignore the text content of that HTML element.', $t_args),
      '#default_value' => $tags,
    ];

    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    parent::validateConfigurationForm($form, $form_state);

    // A missing value is treated like an empty textarea; passing NULL to
    // trim() is deprecated as of PHP 8.1.
    $tags = trim($form_state->getValue('tags') ?? '');
    if (!$tags) {
      $form_state->setValue('tags', []);
      return;
    }

    $errors = [];
    try {
      $parser = new Parser();
      $tags = $parser->parse($tags);
    }
    catch (ParseException $exception) {
      // Handled below together with "parsed, but not a map".
      $tags = NULL;
    }
    if (!is_array($tags)) {
      $errors[] = $this->t('Tags is not a valid YAML map. See @link for information on how to write correctly formed YAML.', [
        '@link' => 'http://yaml.org',
      ]);
      $tags = [];
    }

    foreach ($tags as $key => $value) {
      $tag = "<{$key}>";
      if (is_array($value)) {
        $errors[] = $this->t("Boost value for tag @tag can't be an array.", [
          '@tag' => $tag,
        ]);
      }
      elseif (!is_numeric($value)) {
        $errors[] = $this->t('Boost value for tag @tag must be numeric.', [
          '@tag' => $tag,
        ]);
      }
      elseif ($value < 0) {
        $errors[] = $this->t('Boost value for tag @tag must be non-negative.', [
          '@tag' => $tag,
        ]);
      }
      elseif ($value == 1) {
        // A boost of 1 is the default, so it does not need to be stored.
        unset($tags[$key]);
      }
      else {
        $tags[$key] = (float) $value;
      }
    }
    $form_state->setValue('tags', $tags);

    if ($errors) {
      // Fold all error messages into a single one, separated by line breaks.
      $message = array_shift($errors);
      foreach ($errors as $error) {
        $args = [
          '@message1' => $message,
          '@message2' => $error,
        ];
        $message = new FormattableMarkup('@message1<br />@message2', $args);
      }
      $form_state->setError($form['tags'], $message);
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function processField(FieldInterface $field) {
    parent::processField($field);

    // Set the "strip_html" property on every text value of the field.
    foreach ($field->getValues() as $value) {
      if ($value instanceof TextValueInterface) {
        $value->setProperty('strip_html');
      }
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function processFieldValue(&$value, $type) {
    // Remove invisible content. preg_replace() returns NULL on a PCRE error
    // (for instance, an exceeded backtrack limit); fall back to the unmodified
    // text in that case instead of silently discarding the whole value.
    $text = preg_replace('@<(applet|audio|canvas|command|embed|iframe|map|menu|noembed|noframes|noscript|script|style|svg|video)[^>]*>.*</\\1>@siU', ' ', $value) ?? $value;

    $is_text_type = $this->getDataTypeHelper()->isTextType($type);
    if ($is_text_type) {
      // Let removed tags still delimit words.
      $text = str_replace(['<', '>'], [' <', '> '], $text);

      if ($this->configuration['title']) {
        // Move the contents of "title" attributes behind their tag.
        $text = preg_replace('/(<[-a-z_]+[^>]*["\\s])title\\s*=\\s*("([^"]+)"|\'([^\']+)\')([^>]*>)/i', '$1 $5 $3$4 ', $text) ?? $text;
      }
      if ($this->configuration['alt']) {
        // Replace tags carrying an "alt" attribute with that alternative text.
        $text = preg_replace('/<[-a-z_]+[^>]*["\\s]alt\\s*=\\s*("([^"]+)"|\'([^\']+)\')[^>]*>/i', ' <img>$2$3</img> ', $text) ?? $text;
      }
    }

    if ($this->configuration['tags'] && $is_text_type) {
      // Keep only the tags that have a boost configured, then tokenize.
      $text = strip_tags($text, '<' . implode('><', array_keys($this->configuration['tags'])) . '>');
      $value = $this->parseHtml($text);
    }
    else {
      $text = strip_tags($text);
      $value = $this->normalizeText(trim($text));
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function process(&$value) {
    // Pad angle brackets so that removed tags still delimit words.
    $value = str_replace(['<', '>'], [' <', '> '], $value);
    $value = strip_tags($value);
    $value = $this->normalizeText($value);
  }

  /**
   * Tokenizes an HTML string according to the HTML elements.
   *
   * Assigns boost values to the elements' contents accordingly.
   *
   * @param string $text
   *   The HTML string to parse, passed by reference. After the method call, the
   *   variable will contain the portion of the string after the current
   *   element, or an empty string (if there is no current element).
   * @param string|null $active_tag
   *   (optional) The currently active tag, for which a closing tag has to be
   *   found. Internal use only.
   * @param float $boost
   *   (optional) The currently active boost value. Internal use only.
   *
   * @return \Drupal\search_api\Plugin\search_api\data_type\value\TextTokenInterface[]
   *   Tokenized text with appropriate scores.
   */
  protected function parseHtml(&$text, $active_tag = NULL, $boost = 1.0) {
    $ret = [];
    while (($pos = strpos($text, '<')) !== FALSE) {
      $text_before = substr($text, 0, $pos);
      $text_after = substr($text, $pos + 1);
      // Attempt some small error tolerance when literal "<" characters aren't
      // escaped properly (and are free-standing).
      if (!preg_match('#^(/?)([-:_a-zA-Z0-9]+)#', $text_after, $m)) {
        // The character has to be re-inserted in escaped form: putting a raw
        // "<" back would make strpos() find the very same position again and
        // loop forever. normalizeText() decodes the entity afterwards.
        $text = $text_before . '&lt;' . $text_after;
        continue;
      }
      if ($boost && $pos > 0) {
        $value = $this->normalizeText($text_before);
        if ($value !== '') {
          $ret[] = Utility::createTextToken($value, $boost);
        }
      }
      $text = $text_after;
      $pos = strpos($text, '>');
      if ($pos === FALSE) {
        // Unterminated tag: there is no ">" to skip to, so the remainder is
        // markup, not content. Using FALSE as a string offset would instead
        // tokenize the inside of the tag as text.
        $text = '';
        break;
      }
      $empty_tag = $text[$pos - 1] == '/';
      $text = substr($text, $pos + 1);
      if ($m[1]) {
        // Closing tag.
        if ($active_tag && $m[2] == $active_tag) {
          return $ret;
        }
      }
      elseif (!$empty_tag) {
        // Opening tag => recursive call.
        $inner_boost = $boost * ($this->configuration['tags'][$m[2]] ?? 1);
        $ret = array_merge($ret, $this->parseHtml($text, $m[2], $inner_boost));
      }
    }

    // Compare against the empty string explicitly: a plain truthiness check
    // would skip (and fail to consume) a remaining text of "0".
    if ((string) $text !== '') {
      $value = $this->normalizeText($text);
      if ($value !== '') {
        $ret[] = Utility::createTextToken($value, $boost);
      }
      $text = '';
    }

    return $ret;
  }

  /**
   * Removes superfluous whitespace and unescapes HTML entities.
   *
   * @param string $value
   *   The text to process.
   *
   * @return string
   *   The text without unnecessary whitespace and HTML entities transformed
   *   back to plain text.
   */
  protected function normalizeText($value) {
    $value = Html::decodeEntities($value);
    $value = trim($value);
    // Collapse every run of whitespace into a single space.
    $value = preg_replace('/\\s+/', ' ', $value);
    return $value;
  }

}
Members
Name | Modifiers | Type | Description | Overrides |
---|---|---|---|---|
ConfigurablePluginBase:: |
public | function |
Calculates dependencies for the configured plugin. Overrides DependentPluginInterface:: |
6 |
ConfigurablePluginBase:: |
protected | function | Calculates and adds dependencies of a specific plugin instance. | |
ConfigurablePluginBase:: |
public | function |
Gets this plugin's configuration. Overrides ConfigurableInterface:: |
|
ConfigurablePluginBase:: |
public | function |
Returns the plugin's description. Overrides ConfigurablePluginInterface:: |
|
ConfigurablePluginBase:: |
protected | function | Calculates and returns dependencies of a specific plugin instance. | |
ConfigurablePluginBase:: |
public | function |
Returns the label for use on the administration pages. Overrides ConfigurablePluginInterface:: |
|
ConfigurablePluginBase:: |
protected | function | Wraps the module handler. | |
ConfigurablePluginBase:: |
public | function |
Informs the plugin that some of its dependencies are being removed. Overrides ConfigurablePluginInterface:: |
5 |
ConfigurablePluginBase:: |
public | function |
Sets the configuration for this plugin instance. Overrides ConfigurableInterface:: |
3 |
ConfigurablePluginBase:: |
protected | function | Wraps the theme handler. | |
DependencySerializationTrait:: |
protected | property | An array of entity type IDs keyed by the property name of their storages. | |
DependencySerializationTrait:: |
protected | property | An array of service IDs keyed by property name used for serialization. | |
DependencySerializationTrait:: |
public | function | 1 | |
DependencySerializationTrait:: |
public | function | 2 | |
DependencyTrait:: |
protected | property | The object's dependencies. | |
DependencyTrait:: |
protected | function | Adds multiple dependencies. | |
DependencyTrait:: |
protected | function | Adds a dependency. | |
FieldsProcessorPluginBase:: |
protected | property | The element info manager. | |
FieldsProcessorPluginBase:: |
public static | function |
Creates an instance of the plugin. Overrides ProcessorPluginBase:: |
1 |
FieldsProcessorPluginBase:: |
public | function | Retrieves the element info manager. | |
FieldsProcessorPluginBase:: |
public | function |
Preprocesses the search index entity before it is saved. Overrides ProcessorPluginBase:: |
|
FieldsProcessorPluginBase:: |
public | function |
Preprocesses search items for indexing. Overrides ProcessorPluginBase:: |
1 |
FieldsProcessorPluginBase:: |
public | function |
Preprocesses a search query. Overrides ProcessorPluginBase:: |
2 |
FieldsProcessorPluginBase:: |
public static | function | Preprocesses the "fields" checkboxes before rendering. | |
FieldsProcessorPluginBase:: |
protected | function | Preprocesses the query conditions. | |
FieldsProcessorPluginBase:: |
protected | function | Processes a single condition value. | 1 |
FieldsProcessorPluginBase:: |
protected | function | Processes a single search keyword. | 1 |
FieldsProcessorPluginBase:: |
protected | function | Preprocesses the search keywords. | |
FieldsProcessorPluginBase:: |
public | function | Sets the element info manager. | |
FieldsProcessorPluginBase:: |
protected | function | Determines whether a single value (not an array) should be processed. | 1 |
FieldsProcessorPluginBase:: |
protected | function | Tests whether a certain field should be processed. | 1 |
FieldsProcessorPluginBase:: |
protected | function | Determines whether a field of a certain type should be preprocessed. | 4 |
FieldsProcessorPluginBase:: |
public static | function |
Lists the trusted callbacks provided by the implementing class. Overrides TrustedCallbackInterface:: |
|
HtmlFilter:: |
protected | property |
The data type helper. Overrides FieldsProcessorPluginBase:: |
|
HtmlFilter:: |
public | function |
Form constructor. Overrides FieldsProcessorPluginBase:: |
|
HtmlFilter:: |
public | function |
Gets default configuration for this plugin. Overrides FieldsProcessorPluginBase:: |
|
HtmlFilter:: |
public | function |
Retrieves the data type helper. Overrides FieldsProcessorPluginBase:: |
|
HtmlFilter:: |
protected | function | Removes superfluous whitespace and unescapes HTML entities. | |
HtmlFilter:: |
protected | function | Tokenizes an HTML string according to the HTML elements. | |
HtmlFilter:: |
protected | function |
Processes a single string value. Overrides FieldsProcessorPluginBase:: |
|
HtmlFilter:: |
protected | function |
Processes a single field's value. Overrides FieldsProcessorPluginBase:: |
|
HtmlFilter:: |
protected | function |
Processes a single text element in a field. Overrides FieldsProcessorPluginBase:: |
|
HtmlFilter:: |
public | function |
Sets the data type helper. Overrides FieldsProcessorPluginBase:: |
|
HtmlFilter:: |
public | function |
Form validation handler. Overrides FieldsProcessorPluginBase:: |
|
IndexPluginBase:: |
protected | property | The index this processor is configured for. | |
IndexPluginBase:: |
public | function |
Retrieves the index this plugin is configured for. Overrides IndexPluginInterface:: |
|
IndexPluginBase:: |
public | function |
Sets the index this plugin is configured for. Overrides IndexPluginInterface:: |
|
IndexPluginBase:: |
public | function |
Constructs a \Drupal\Component\Plugin\PluginBase object. Overrides ConfigurablePluginBase:: |
2 |
MessengerTrait:: |
protected | property | The messenger. | 29 |
MessengerTrait:: |
public | function | Gets the messenger. | 29 |
MessengerTrait:: |
public | function | Sets the messenger. | |
PluginBase:: |
protected | property | Configuration information passed into the plugin. | 1 |
PluginBase:: |
protected | property | The plugin implementation definition. | 1 |
PluginBase:: |
protected | property | The plugin_id. | |
PluginBase:: |
constant | A string which is used to separate base plugin IDs from the derivative ID. | ||
PluginBase:: |
public | function |
Gets the base_plugin_id of the plugin instance. Overrides DerivativeInspectionInterface:: |
|
PluginBase:: |
public | function |
Gets the derivative_id of the plugin instance. Overrides DerivativeInspectionInterface:: |
|
PluginBase:: |
public | function |
Gets the definition of the plugin implementation. Overrides PluginInspectionInterface:: |
3 |
PluginBase:: |
public | function |
Gets the plugin_id of the plugin instance. Overrides PluginInspectionInterface:: |
|
PluginBase:: |
public | function | Determines if the plugin is configurable. | |
PluginDependencyTrait:: |
protected | function | Calculates and adds dependencies of a specific plugin instance. Aliased as: traitCalculatePluginDependencies | 1 |
PluginDependencyTrait:: |
protected | function | Calculates and returns dependencies of a specific plugin instance. Aliased as: traitGetPluginDependencies | |
PluginDependencyTrait:: |
protected | function | Wraps the module handler. Aliased as: traitModuleHandler | 1 |
PluginDependencyTrait:: |
protected | function | Wraps the theme handler. Aliased as: traitThemeHandler | 1 |
PluginFormTrait:: |
public | function | Form submission handler. | 7 |
ProcessorInterface:: |
constant | Processing stage: add properties. | ||
ProcessorInterface:: |
constant | Processing stage: alter indexed items. | ||
ProcessorInterface:: |
constant | Processing stage: postprocess query. | ||
ProcessorInterface:: |
constant | Processing stage: preprocess index. | ||
ProcessorInterface:: |
constant | Processing stage: preprocess query. | ||
ProcessorInterface:: |
constant | Processing stage: preprocess index. | ||
ProcessorPluginBase:: |
protected | property | The fields helper. | 1 |
ProcessorPluginBase:: |
public | function |
Adds the values of properties defined by this processor to the item. Overrides ProcessorInterface:: |
8 |
ProcessorPluginBase:: |
public | function |
Alter the items to be indexed. Overrides ProcessorInterface:: |
3 |
ProcessorPluginBase:: |
protected | function | Ensures that a field with certain properties is indexed on the index. | |
ProcessorPluginBase:: |
protected | function | Finds a certain field in the index. | |
ProcessorPluginBase:: |
public | function | Retrieves the fields helper. | 1 |
ProcessorPluginBase:: |
public | function |
Retrieves the properties this processor defines for the given datasource. Overrides ProcessorInterface:: |
8 |
ProcessorPluginBase:: |
public | function |
Returns the weight for a specific processing stage. Overrides ProcessorInterface:: |
|
ProcessorPluginBase:: |
public | function |
Determines whether this plugin should be hidden in the UI. Overrides HideablePluginBase:: |
|
ProcessorPluginBase:: |
public | function |
Determines whether this processor should always be enabled. Overrides ProcessorInterface:: |
|
ProcessorPluginBase:: |
public | function |
Postprocess search results before they are returned by the query. Overrides ProcessorInterface:: |
2 |
ProcessorPluginBase:: |
public | function |
Determines whether re-indexing is required after a settings change. Overrides ProcessorInterface:: |
|
ProcessorPluginBase:: |
public | function | Sets the fields helper. | 1 |
ProcessorPluginBase:: |
public | function |
Sets the weight for a specific processing stage. Overrides ProcessorInterface:: |
|
ProcessorPluginBase:: |
public static | function |
Checks whether this processor is applicable for a certain index. Overrides ProcessorInterface:: |
8 |
ProcessorPluginBase:: |
public | function |
Checks whether this processor implements a particular stage. Overrides ProcessorInterface:: |
2 |
StringTranslationTrait:: |
protected | property | The string translation service. | 1 |
StringTranslationTrait:: |
protected | function | Formats a string containing a count of items. | |
StringTranslationTrait:: |
protected | function | Returns the number of plurals supported by a given language. | |
StringTranslationTrait:: |
protected | function | Gets the string translation service. | |
StringTranslationTrait:: |
public | function | Sets the string translation service to use. | 2 |
StringTranslationTrait:: |
protected | function | Translates a string to the current language or to a given language. | |
TrustedCallbackInterface:: |
constant | Untrusted callbacks throw exceptions. | ||
TrustedCallbackInterface:: |
constant | Untrusted callbacks trigger silenced E_USER_DEPRECATION errors. | ||
TrustedCallbackInterface:: |
constant | Untrusted callbacks trigger E_USER_WARNING errors. |