You are here

class HtmlFilter in Search API 8

Strips HTML tags from fulltext fields and decodes HTML entities.

Plugin annotation


@SearchApiProcessor(
  id = "html_filter",
  label = @Translation("HTML filter"),
  description = @Translation("Strips HTML tags from fulltext fields and decodes HTML entities. Use this processor when indexing HTML data – for example, node bodies for certain text formats. The processor also allows to boost (or ignore) the contents of specific elements."),
  stages = {
    "pre_index_save" = 0,
    "preprocess_index" = -15,
    "preprocess_query" = -15,
  }
)

Hierarchy

Expanded class hierarchy of HtmlFilter

1 file declares its use of HtmlFilter
HtmlFilterTest.php in tests/src/Unit/Processor/HtmlFilterTest.php

File

src/Plugin/search_api/processor/HtmlFilter.php, line 32

Namespace

Drupal\search_api\Plugin\search_api\processor
View source
class HtmlFilter extends FieldsProcessorPluginBase {

  /**
   * The data type helper.
   *
   * @var \Drupal\search_api\Utility\DataTypeHelperInterface|null
   */
  protected $dataTypeHelper;

  /**
   * Retrieves the data type helper.
   *
   * Falls back to the "search_api.data_type_helper" service when no helper
   * has been set explicitly via setDataTypeHelper().
   *
   * @return \Drupal\search_api\Utility\DataTypeHelperInterface
   *   The data type helper.
   */
  public function getDataTypeHelper() {
    return $this->dataTypeHelper ?: \Drupal::service('search_api.data_type_helper');
  }

  /**
   * Sets the data type helper.
   *
   * @param \Drupal\search_api\Utility\DataTypeHelperInterface $data_type_helper
   *   The new data type helper.
   *
   * @return $this
   */
  public function setDataTypeHelper(DataTypeHelperInterface $data_type_helper) {
    $this->dataTypeHelper = $data_type_helper;
    return $this;
  }

  /**
   * {@inheritdoc}
   */
  public function defaultConfiguration() {
    $configuration = parent::defaultConfiguration();

    // The "+=" only adds keys the parent did not define already.
    $configuration += [
      'title' => TRUE,
      'alt' => TRUE,
      'tags' => [
        'h1' => 5,
        'h2' => 3,
        'h3' => 2,
        'strong' => 2,
        'b' => 2,
        'em' => 1.5,
        'u' => 1.5,
      ],
    ];

    return $configuration;
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    $form = parent::buildConfigurationForm($form, $form_state);

    $form['title'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Index title attribute'),
      '#description' => $this->t('If set, the contents of title attributes will be indexed.'),
      '#default_value' => $this->configuration['title'],
    ];
    $form['alt'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Index alt attribute'),
      '#description' => $this->t('If set, the alternative text of images will be indexed.'),
      '#default_value' => $this->configuration['alt'],
    ];

    // Serialize the configured boosts to YAML so they can be edited in the
    // textarea. Escaped "\r\n" sequences are turned into real line breaks and
    // double quotes are dropped from the dumped text.
    $dumper = new Dumper();
    $tags = $dumper->dump($this->configuration['tags'], 2);
    $tags = str_replace('\\r\\n', "\n", $tags);
    $tags = str_replace('"', '', $tags);

    $t_args = [
      ':url' => Url::fromUri('https://en.wikipedia.org/wiki/YAML')->toString(),
    ];
    $form['tags'] = [
      '#type' => 'textarea',
      '#title' => $this->t('Tag boosts'),
      '#description' => $this->t('Specify special boost values for certain HTML elements, in <a href=":url">YAML file format</a>. The boost values of nested elements are multiplied, elements not mentioned will have the default boost value of 1. Assign a boost of 0 to ignore the text content of that HTML element.', $t_args),
      '#default_value' => $tags,
    ];

    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    parent::validateConfigurationForm($form, $form_state);

    // Cast before trimming so a missing value does not reach trim() as NULL.
    $tags = trim((string) $form_state->getValue('tags'));
    if (!$tags) {
      // Nothing entered: store an empty boost map and skip all checks.
      $form_state->setValue('tags', []);
      return;
    }

    $errors = [];

    // The input has to parse as YAML and the result has to be a map (array).
    // Both failure modes are reported with the same message.
    try {
      $tags = (new Parser())->parse($tags);
      $invalid_yaml = !is_array($tags);
    }
    catch (ParseException $exception) {
      $invalid_yaml = TRUE;
    }
    if ($invalid_yaml) {
      $errors[] = $this->t('Tags is not a valid YAML map. See @link for information on how to write correctly formed YAML.', [
        '@link' => 'http://yaml.org',
      ]);
      $tags = [];
    }

    // Validate and normalize each boost value.
    foreach ($tags as $key => $value) {
      $tag = "<{$key}>";
      if (is_array($value)) {
        $errors[] = $this->t("Boost value for tag @tag can't be an array.", [
          '@tag' => $tag,
        ]);
      }
      elseif (!is_numeric($value)) {
        $errors[] = $this->t('Boost value for tag @tag must be numeric.', [
          '@tag' => $tag,
        ]);
      }
      elseif ($value < 0) {
        $errors[] = $this->t('Boost value for tag @tag must be non-negative.', [
          '@tag' => $tag,
        ]);
      }
      elseif ($value == 1) {
        // A boost of 1 is the default, so there is no need to store it.
        unset($tags[$key]);
      }
      else {
        $tags[$key] = (double) $value;
      }
    }
    $form_state->setValue('tags', $tags);

    if ($errors) {
      // Only one error can be set on the element, so chain all messages into
      // a single piece of markup separated by line breaks.
      $message = array_shift($errors);
      foreach ($errors as $error) {
        $args = [
          '@message1' => $message,
          '@message2' => $error,
        ];
        $message = new FormattableMarkup('@message1<br />@message2', $args);
      }
      $form_state->setError($form['tags'], $message);
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function processField(FieldInterface $field) {
    parent::processField($field);

    // Flag every text value of the field with the "strip_html" property.
    foreach ($field->getValues() as $value) {
      if ($value instanceof TextValueInterface) {
        $value->setProperty('strip_html');
      }
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function processFieldValue(&$value, $type) {
    // Remove invisible content. preg_replace() returns NULL when the regex
    // engine fails (for instance when a limit is hit on very large input); in
    // that case keep the unfiltered text instead of losing the whole value.
    $text = preg_replace('@<(applet|audio|canvas|command|embed|iframe|map|menu|noembed|noframes|noscript|script|style|svg|video)[^>]*>.*</\\1>@siU', ' ', $value) ?? $value;

    $is_text_type = $this->getDataTypeHelper()->isTextType($type);
    if ($is_text_type) {
      // Let removed tags still delimit words.
      $text = str_replace(['<', '>'], [' <', '> '], $text);

      if ($this->configuration['title']) {
        // Move the contents of "title" attributes out of the tag so they
        // become part of the text.
        $text = preg_replace('/(<[-a-z_]+[^>]*["\\s])title\\s*=\\s*("([^"]+)"|\'([^\']+)\')([^>]*>)/i', '$1 $5 $3$4 ', $text) ?? $text;
      }
      if ($this->configuration['alt']) {
        // Replace any tag carrying an "alt" attribute with an <img> element
        // whose text content is the alternative text.
        $text = preg_replace('/<[-a-z_]+[^>]*["\\s]alt\\s*=\\s*("([^"]+)"|\'([^\']+)\')[^>]*>/i', ' <img>$2$3</img> ', $text) ?? $text;
      }
    }

    if ($this->configuration['tags'] && $is_text_type) {
      // Keep only the tags that have a boost configured, then tokenize the
      // remaining markup so the boosts can be applied.
      $allowed_tags = '<' . implode('><', array_keys($this->configuration['tags'])) . '>';
      $text = strip_tags($text, $allowed_tags);
      $value = $this->parseHtml($text);
    }
    else {
      $text = strip_tags($text);
      $value = $this->normalizeText(trim($text));
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function process(&$value) {
    // Pad the angle brackets with spaces so that removing the tags does not
    // glue neighbouring words together.
    $value = str_replace(['<', '>'], [' <', '> '], $value);
    $value = strip_tags($value);
    $value = $this->normalizeText($value);
  }

  /**
   * Tokenizes an HTML string according to the HTML elements.
   *
   * Assigns boost values to the elements' contents accordingly. Nested
   * elements are handled by recursion, multiplying the boosts of all enclosing
   * elements. Text inside an element with an effective boost of 0 is skipped.
   * An unterminated tag (a "<" followed by a tag name but no ">") causes the
   * remainder of the string to be discarded.
   *
   * @param string $text
   *   The HTML string to parse, passed by reference. After the method call, the
   *   variable will contain the portion of the string after the current
   *   element, or an empty string (if there is no current element).
   * @param string|null $active_tag
   *   (optional) The currently active tag, for which a closing tag has to be
   *   found. Internal use only.
   * @param float $boost
   *   (optional) The currently active boost value. Internal use only.
   *
   * @return \Drupal\search_api\Plugin\search_api\data_type\value\TextTokenInterface[]
   *   Tokenized text with appropriate scores.
   */
  protected function parseHtml(&$text, $active_tag = NULL, $boost = 1.0) {
    $ret = [];

    while (($pos = strpos($text, '<')) !== FALSE) {
      $text_before = substr($text, 0, $pos);
      $text_after = substr($text, $pos + 1);

      // Attempt some small error tolerance when literal "<" characters aren't
      // escaped properly (and are free-standing).
      if (!preg_match('#^(/?)([-:_a-zA-Z0-9]+)#', $text_after, $m)) {
        $text = $text_before . '&lt;' . $text_after;
        continue;
      }

      // Emit the text preceding the tag, unless it is being ignored.
      if ($boost && $pos > 0) {
        $value = $this->normalizeText($text_before);
        if ($value !== '') {
          $ret[] = Utility::createTextToken($value, $boost);
        }
      }

      $text = $text_after;
      $end = strpos($text, '>');
      if ($end === FALSE) {
        // Unterminated tag: there is no usable content left. Without this
        // check, FALSE would be used as a string offset below.
        $text = '';
        break;
      }
      // The regular expression above matched at least one character, so
      // $end is always greater than zero here.
      $empty_tag = $text[$end - 1] === '/';
      $text = substr($text, $end + 1);

      if ($m[1]) {
        // Closing tag.
        if ($active_tag && $m[2] == $active_tag) {
          return $ret;
        }
      }
      elseif (!$empty_tag) {
        // Opening tag => recursive call.
        $inner_boost = $boost * ($this->configuration['tags'][$m[2]] ?? 1);
        $ret = array_merge($ret, $this->parseHtml($text, $m[2], $inner_boost));
      }
    }

    // Compare against the empty string explicitly: a plain truthiness check
    // would silently drop a remaining text of "0".
    if ((string) $text !== '') {
      $value = $this->normalizeText($text);
      if ($value !== '') {
        $ret[] = Utility::createTextToken($value, $boost);
      }
      $text = '';
    }

    return $ret;
  }

  /**
   * Removes superfluous whitespace and unescapes HTML entities.
   *
   * @param string $value
   *   The text to process.
   *
   * @return string
   *   The text without unnecessary whitespace and HTML entities transformed
   *   back to plain text.
   */
  protected function normalizeText($value) {
    $value = Html::decodeEntities($value);
    $value = trim($value);
    // Collapse every run of whitespace into a single space.
    $value = preg_replace('/\\s+/', ' ', $value);
    return $value;
  }

}

Members

Namesort descending Modifiers Type Description Overrides
ConfigurablePluginBase::calculateDependencies public function Calculates dependencies for the configured plugin. Overrides DependentPluginInterface::calculateDependencies 6
ConfigurablePluginBase::calculatePluginDependencies Deprecated protected function Calculates and adds dependencies of a specific plugin instance.
ConfigurablePluginBase::getConfiguration public function Gets this plugin's configuration. Overrides ConfigurableInterface::getConfiguration
ConfigurablePluginBase::getDescription public function Returns the plugin's description. Overrides ConfigurablePluginInterface::getDescription
ConfigurablePluginBase::getPluginDependencies Deprecated protected function Calculates and returns dependencies of a specific plugin instance.
ConfigurablePluginBase::label public function Returns the label for use on the administration pages. Overrides ConfigurablePluginInterface::label
ConfigurablePluginBase::moduleHandler Deprecated protected function Wraps the module handler.
ConfigurablePluginBase::onDependencyRemoval public function Informs the plugin that some of its dependencies are being removed. Overrides ConfigurablePluginInterface::onDependencyRemoval 5
ConfigurablePluginBase::setConfiguration public function Sets the configuration for this plugin instance. Overrides ConfigurableInterface::setConfiguration 3
ConfigurablePluginBase::themeHandler Deprecated protected function Wraps the theme handler.
DependencySerializationTrait::$_entityStorages protected property An array of entity type IDs keyed by the property name of their storages.
DependencySerializationTrait::$_serviceIds protected property An array of service IDs keyed by property name used for serialization.
DependencySerializationTrait::__sleep public function 1
DependencySerializationTrait::__wakeup public function 2
DependencyTrait::$dependencies protected property The object's dependencies.
DependencyTrait::addDependencies protected function Adds multiple dependencies.
DependencyTrait::addDependency protected function Adds a dependency.
FieldsProcessorPluginBase::$elementInfoManager protected property The element info manager.
FieldsProcessorPluginBase::create public static function Creates an instance of the plugin. Overrides ProcessorPluginBase::create 1
FieldsProcessorPluginBase::getElementInfoManager public function Retrieves the element info manager.
FieldsProcessorPluginBase::preIndexSave public function Preprocesses the search index entity before it is saved. Overrides ProcessorPluginBase::preIndexSave
FieldsProcessorPluginBase::preprocessIndexItems public function Preprocesses search items for indexing. Overrides ProcessorPluginBase::preprocessIndexItems 1
FieldsProcessorPluginBase::preprocessSearchQuery public function Preprocesses a search query. Overrides ProcessorPluginBase::preprocessSearchQuery 2
FieldsProcessorPluginBase::preRenderFieldsCheckboxes public static function Preprocesses the "fields" checkboxes before rendering.
FieldsProcessorPluginBase::processConditions protected function Preprocesses the query conditions.
FieldsProcessorPluginBase::processConditionValue protected function Processes a single condition value. 1
FieldsProcessorPluginBase::processKey protected function Processes a single search keyword. 1
FieldsProcessorPluginBase::processKeys protected function Preprocesses the search keywords.
FieldsProcessorPluginBase::setElementInfoManager public function Sets the element info manager.
FieldsProcessorPluginBase::shouldProcess protected function Determines whether a single value (not an array) should be processed. 1
FieldsProcessorPluginBase::testField protected function Tests whether a certain field should be processed. 1
FieldsProcessorPluginBase::testType protected function Determines whether a field of a certain type should be preprocessed. 4
FieldsProcessorPluginBase::trustedCallbacks public static function Lists the trusted callbacks provided by the implementing class. Overrides TrustedCallbackInterface::trustedCallbacks
HtmlFilter::$dataTypeHelper protected property The data type helper. Overrides FieldsProcessorPluginBase::$dataTypeHelper
HtmlFilter::buildConfigurationForm public function Form constructor. Overrides FieldsProcessorPluginBase::buildConfigurationForm
HtmlFilter::defaultConfiguration public function Gets default configuration for this plugin. Overrides FieldsProcessorPluginBase::defaultConfiguration
HtmlFilter::getDataTypeHelper public function Retrieves the data type helper. Overrides FieldsProcessorPluginBase::getDataTypeHelper
HtmlFilter::normalizeText protected function Removes superfluous whitespace and unescapes HTML entities.
HtmlFilter::parseHtml protected function Tokenizes an HTML string according to the HTML elements.
HtmlFilter::process protected function Processes a single string value. Overrides FieldsProcessorPluginBase::process
HtmlFilter::processField protected function Processes a single field's value. Overrides FieldsProcessorPluginBase::processField
HtmlFilter::processFieldValue protected function Processes a single text element in a field. Overrides FieldsProcessorPluginBase::processFieldValue
HtmlFilter::setDataTypeHelper public function Sets the data type helper. Overrides FieldsProcessorPluginBase::setDataTypeHelper
HtmlFilter::validateConfigurationForm public function Form validation handler. Overrides FieldsProcessorPluginBase::validateConfigurationForm
IndexPluginBase::$index protected property The index this processor is configured for.
IndexPluginBase::getIndex public function Retrieves the index this plugin is configured for. Overrides IndexPluginInterface::getIndex
IndexPluginBase::setIndex public function Sets the index this plugin is configured for. Overrides IndexPluginInterface::setIndex
IndexPluginBase::__construct public function Constructs a \Drupal\Component\Plugin\PluginBase object. Overrides ConfigurablePluginBase::__construct 2
MessengerTrait::$messenger protected property The messenger. 29
MessengerTrait::messenger public function Gets the messenger. 29
MessengerTrait::setMessenger public function Sets the messenger.
PluginBase::$configuration protected property Configuration information passed into the plugin. 1
PluginBase::$pluginDefinition protected property The plugin implementation definition. 1
PluginBase::$pluginId protected property The plugin_id.
PluginBase::DERIVATIVE_SEPARATOR constant A string which is used to separate base plugin IDs from the derivative ID.
PluginBase::getBaseId public function Gets the base_plugin_id of the plugin instance. Overrides DerivativeInspectionInterface::getBaseId
PluginBase::getDerivativeId public function Gets the derivative_id of the plugin instance. Overrides DerivativeInspectionInterface::getDerivativeId
PluginBase::getPluginDefinition public function Gets the definition of the plugin implementation. Overrides PluginInspectionInterface::getPluginDefinition 3
PluginBase::getPluginId public function Gets the plugin_id of the plugin instance. Overrides PluginInspectionInterface::getPluginId
PluginBase::isConfigurable public function Determines if the plugin is configurable.
PluginDependencyTrait::calculatePluginDependencies protected function Calculates and adds dependencies of a specific plugin instance. Aliased as: traitCalculatePluginDependencies 1
PluginDependencyTrait::getPluginDependencies protected function Calculates and returns dependencies of a specific plugin instance. Aliased as: traitGetPluginDependencies
PluginDependencyTrait::moduleHandler protected function Wraps the module handler. Aliased as: traitModuleHandler 1
PluginDependencyTrait::themeHandler protected function Wraps the theme handler. Aliased as: traitThemeHandler 1
PluginFormTrait::submitConfigurationForm public function Form submission handler. 7
ProcessorInterface::STAGE_ADD_PROPERTIES constant Processing stage: add properties.
ProcessorInterface::STAGE_ALTER_ITEMS constant Processing stage: alter indexed items.
ProcessorInterface::STAGE_POSTPROCESS_QUERY constant Processing stage: postprocess query.
ProcessorInterface::STAGE_PREPROCESS_INDEX constant Processing stage: preprocess index.
ProcessorInterface::STAGE_PREPROCESS_QUERY constant Processing stage: preprocess query.
ProcessorInterface::STAGE_PRE_INDEX_SAVE constant Processing stage: pre-index save.
ProcessorPluginBase::$fieldsHelper protected property The fields helper. 1
ProcessorPluginBase::addFieldValues public function Adds the values of properties defined by this processor to the item. Overrides ProcessorInterface::addFieldValues 8
ProcessorPluginBase::alterIndexedItems public function Alter the items to be indexed. Overrides ProcessorInterface::alterIndexedItems 3
ProcessorPluginBase::ensureField protected function Ensures that a field with certain properties is indexed on the index.
ProcessorPluginBase::findField protected function Finds a certain field in the index.
ProcessorPluginBase::getFieldsHelper public function Retrieves the fields helper. 1
ProcessorPluginBase::getPropertyDefinitions public function Retrieves the properties this processor defines for the given datasource. Overrides ProcessorInterface::getPropertyDefinitions 8
ProcessorPluginBase::getWeight public function Returns the weight for a specific processing stage. Overrides ProcessorInterface::getWeight
ProcessorPluginBase::isHidden public function Determines whether this plugin should be hidden in the UI. Overrides HideablePluginBase::isHidden
ProcessorPluginBase::isLocked public function Determines whether this processor should always be enabled. Overrides ProcessorInterface::isLocked
ProcessorPluginBase::postprocessSearchResults public function Postprocess search results before they are returned by the query. Overrides ProcessorInterface::postprocessSearchResults 2
ProcessorPluginBase::requiresReindexing public function Determines whether re-indexing is required after a settings change. Overrides ProcessorInterface::requiresReindexing
ProcessorPluginBase::setFieldsHelper public function Sets the fields helper. 1
ProcessorPluginBase::setWeight public function Sets the weight for a specific processing stage. Overrides ProcessorInterface::setWeight
ProcessorPluginBase::supportsIndex public static function Checks whether this processor is applicable for a certain index. Overrides ProcessorInterface::supportsIndex 8
ProcessorPluginBase::supportsStage public function Checks whether this processor implements a particular stage. Overrides ProcessorInterface::supportsStage 2
StringTranslationTrait::$stringTranslation protected property The string translation service. 1
StringTranslationTrait::formatPlural protected function Formats a string containing a count of items.
StringTranslationTrait::getNumberOfPlurals protected function Returns the number of plurals supported by a given language.
StringTranslationTrait::getStringTranslation protected function Gets the string translation service.
StringTranslationTrait::setStringTranslation public function Sets the string translation service to use. 2
StringTranslationTrait::t protected function Translates a string to the current language or to a given language.
TrustedCallbackInterface::THROW_EXCEPTION constant Untrusted callbacks throw exceptions.
TrustedCallbackInterface::TRIGGER_SILENCED_DEPRECATION constant Untrusted callbacks trigger silenced E_USER_DEPRECATION errors.
TrustedCallbackInterface::TRIGGER_WARNING constant Untrusted callbacks trigger E_USER_WARNING errors.