View source
<?php
namespace Drupal\search_api\Plugin\search_api\processor;
use Drupal\Component\Utility\Unicode;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Url;
use Drupal\search_api\Item\FieldInterface;
use Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface;
use Drupal\search_api\Processor\FieldsProcessorPluginBase;
use Drupal\search_api\Utility\Utility;
/**
 * Splits text into individual words (tokens) for indexing and searching.
 *
 * The text is first normalized by simplifyText(): optional overlapping
 * expansion of CJK character runs, removal of "ignored" characters and
 * collapsing of "whitespace" characters into single spaces. The result is
 * then split on spaces and words shorter than the configured minimum length
 * are dropped.
 */
class Tokenizer extends FieldsProcessorPluginBase {

  /**
   * Cached PCRE character class content for characters that are removed.
   *
   * Computed lazily by prepare(); NULL while not computed yet.
   *
   * @var string|null
   */
  protected $ignored;

  /**
   * Cached PCRE character class content for characters treated as whitespace.
   *
   * Computed lazily by prepare(); NULL while not computed yet.
   *
   * @var string|null
   */
  protected $spaces;

  /**
   * Returns the default configuration, extending the parent's defaults.
   *
   * @return array
   *   The parent defaults plus "ignored", "spaces", "overlap_cjk" and
   *   "minimum_word_size". Keys already set by the parent are not overwritten.
   */
  public function defaultConfiguration() {
    $configuration = parent::defaultConfiguration();
    $configuration += [
      'ignored' => '._-',
      'spaces' => '',
      'overlap_cjk' => TRUE,
      'minimum_word_size' => 3,
    ];
    return $configuration;
  }

  /**
   * Sets the configuration and invalidates the cached character classes.
   *
   * @param array $configuration
   *   The new plugin configuration.
   */
  public function setConfiguration(array $configuration) {
    parent::setConfiguration($configuration);
    // Both properties are derived from the configuration in prepare() and are
    // only recomputed while they are not set. Reset both of them (not just
    // $spaces), otherwise a changed "ignored" setting would never take effect
    // on an instance that has already processed some text.
    $this->spaces = NULL;
    $this->ignored = NULL;
  }

  /**
   * Builds the configuration form for this processor.
   *
   * @param array $form
   *   The form array to extend.
   * @param \Drupal\Core\Form\FormStateInterface $form_state
   *   The current form state.
   *
   * @return array
   *   The parent's form with the elements "ignored", "spaces", "overlap_cjk"
   *   and "minimum_word_size" added.
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    $form = parent::buildConfigurationForm($form, $form_state);

    // Link targets used as placeholders in the element descriptions below.
    $args = [
      ':pcre-url' => Url::fromUri('https://php.net/manual/regexp.reference.character-classes.php')
        ->toString(),
      ':doc-url' => Url::fromUri('https://api.drupal.org/api/drupal/core!lib!Drupal!Component!Utility!Unicode.php/constant/Unicode%3A%3APREG_CLASS_WORD_BOUNDARY/8')
        ->toString(),
    ];

    $form['ignored'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Ignored characters'),
      '#description' => $this->t('Specify the characters that should be removed prior to processing. Dots, dashes, and underscores are ignored by default to allow meaningful search behavior with acronyms and URLs. Specify the characters as the inside of a <a href=":pcre-url">PCRE character class</a>.', $args),
      '#default_value' => $this->configuration['ignored'],
    ];
    $form['spaces'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Whitespace characters'),
      '#description' => $this->t('Specify the characters that should be regarded as whitespace and therefore used as word-delimiters. Specify the characters as the inside of a <a href=":pcre-url">PCRE character class</a>. Leave empty to use a <a href=":doc-url">default</a> which should be suitable for most languages with a Latin alphabet.', $args),
      '#default_value' => $this->configuration['spaces'],
    ];
    $form['overlap_cjk'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Simple CJK handling'),
      '#default_value' => $this->configuration['overlap_cjk'],
      '#description' => $this->t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Does not affect other languages.'),
    ];
    $form['minimum_word_size'] = [
      '#type' => 'number',
      '#title' => $this->t('Minimum word length to index'),
      '#default_value' => $this->configuration['minimum_word_size'],
      '#min' => 1,
      '#max' => 1000,
      '#description' => $this->t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'),
    ];

    return $form;
  }

  /**
   * Validates that "spaces" and "ignored" are usable PCRE character classes.
   *
   * @param array $form
   *   The configuration form.
   * @param \Drupal\Core\Form\FormStateInterface $form_state
   *   The current form state; receives an error for each invalid field.
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    parent::validateConfigurationForm($form, $form_state);

    foreach (['spaces', 'ignored'] as $field) {
      $field_value = $form_state->getValue($field, '');
      // "/" is the pattern delimiter used throughout this class, so it is
      // escaped here the same way prepare() escapes it.
      $field_value = str_replace('/', '\\/', trim($field_value));
      // Probe-compile the character class against an empty subject:
      // preg_match() returns FALSE if the pattern is invalid. The warning it
      // would emit in that case is suppressed on purpose.
      if ($field_value !== '' && @preg_match('/[' . $field_value . ']+/u', '') === FALSE) {
        $form_state->setError($form[$field], $form[$field]['#title'] . ': ' . $this->t('The entered text is no valid PCRE character class.'));
      }
    }
  }

  /**
   * Determines whether fields of the given type are handled.
   *
   * @param string $type
   *   The data type to check.
   *
   * @return bool
   *   The result of the data type helper's isTextType() check for $type.
   */
  protected function testType($type) {
    return $this->getDataTypeHelper()->isTextType($type);
  }

  /**
   * Processes a field and flags its text values as tokenized.
   *
   * @param \Drupal\search_api\Item\FieldInterface $field
   *   The field to process.
   */
  protected function processField(FieldInterface $field) {
    parent::processField($field);

    // Mark every text value with the "tokenized" property once the parent
    // implementation has run.
    foreach ($field->getValues() as $value) {
      if ($value instanceof TextValueInterface) {
        $value->setProperty('tokenized');
      }
    }
  }

  /**
   * Returns the character class content used to match numbers.
   *
   * @return string
   *   A list of code points and code point ranges (starting with the ASCII
   *   digits U+0030-U+0039) for use inside "[...]" in a pattern with the "u"
   *   modifier.
   */
  protected function getPregClassNumbers() {
    return '\\x{30}-\\x{39}\\x{b2}\\x{b3}\\x{b9}\\x{bc}-\\x{be}\\x{660}-\\x{669}\\x{6f0}-\\x{6f9}'
      . '\\x{966}-\\x{96f}\\x{9e6}-\\x{9ef}\\x{9f4}-\\x{9f9}\\x{a66}-\\x{a6f}\\x{ae6}-\\x{aef}'
      . '\\x{b66}-\\x{b6f}\\x{be7}-\\x{bf2}\\x{c66}-\\x{c6f}\\x{ce6}-\\x{cef}\\x{d66}-\\x{d6f}'
      . '\\x{e50}-\\x{e59}\\x{ed0}-\\x{ed9}\\x{f20}-\\x{f33}\\x{1040}-\\x{1049}\\x{1369}-'
      . '\\x{137c}\\x{16ee}-\\x{16f0}\\x{17e0}-\\x{17e9}\\x{17f0}-\\x{17f9}\\x{1810}-\\x{1819}'
      . '\\x{1946}-\\x{194f}\\x{2070}\\x{2074}-\\x{2079}\\x{2080}-\\x{2089}\\x{2153}-\\x{2183}'
      . '\\x{2460}-\\x{249b}\\x{24ea}-\\x{24ff}\\x{2776}-\\x{2793}\\x{3007}\\x{3021}-\\x{3029}'
      . '\\x{3038}-\\x{303a}\\x{3192}-\\x{3195}\\x{3220}-\\x{3229}\\x{3251}-\\x{325f}\\x{3280}-'
      . '\\x{3289}\\x{32b1}-\\x{32bf}\\x{ff10}-\\x{ff19}';
  }

  /**
   * Returns the character class content used to match punctuation.
   *
   * @return string
   *   A list of code points and code point ranges for use inside "[...]" in a
   *   pattern with the "u" modifier.
   */
  protected function getPregClassPunctuation() {
    return '\\x{21}-\\x{23}\\x{25}-\\x{2a}\\x{2c}-\\x{2f}\\x{3a}\\x{3b}\\x{3f}\\x{40}\\x{5b}-\\x{5d}'
      . '\\x{5f}\\x{7b}\\x{7d}\\x{a1}\\x{ab}\\x{b7}\\x{bb}\\x{bf}\\x{37e}\\x{387}\\x{55a}-\\x{55f}'
      . '\\x{589}\\x{58a}\\x{5be}\\x{5c0}\\x{5c3}\\x{5f3}\\x{5f4}\\x{60c}\\x{60d}\\x{61b}\\x{61f}'
      . '\\x{66a}-\\x{66d}\\x{6d4}\\x{700}-\\x{70d}\\x{964}\\x{965}\\x{970}\\x{df4}\\x{e4f}'
      . '\\x{e5a}\\x{e5b}\\x{f04}-\\x{f12}\\x{f3a}-\\x{f3d}\\x{f85}\\x{104a}-\\x{104f}\\x{10fb}'
      . '\\x{1361}-\\x{1368}\\x{166d}\\x{166e}\\x{169b}\\x{169c}\\x{16eb}-\\x{16ed}\\x{1735}'
      . '\\x{1736}\\x{17d4}-\\x{17d6}\\x{17d8}-\\x{17da}\\x{1800}-\\x{180a}\\x{1944}\\x{1945}'
      . '\\x{2010}-\\x{2027}\\x{2030}-\\x{2043}\\x{2045}-\\x{2051}\\x{2053}\\x{2054}\\x{2057}'
      . '\\x{207d}\\x{207e}\\x{208d}\\x{208e}\\x{2329}\\x{232a}\\x{23b4}-\\x{23b6}\\x{2768}-'
      . '\\x{2775}\\x{27e6}-\\x{27eb}\\x{2983}-\\x{2998}\\x{29d8}-\\x{29db}\\x{29fc}\\x{29fd}'
      . '\\x{3001}-\\x{3003}\\x{3008}-\\x{3011}\\x{3014}-\\x{301f}\\x{3030}\\x{303d}\\x{30a0}'
      . '\\x{30fb}\\x{fd3e}\\x{fd3f}\\x{fe30}-\\x{fe52}\\x{fe54}-\\x{fe61}\\x{fe63}\\x{fe68}'
      . '\\x{fe6a}\\x{fe6b}\\x{ff01}-\\x{ff03}\\x{ff05}-\\x{ff0a}\\x{ff0c}-\\x{ff0f}\\x{ff1a}'
      . '\\x{ff1b}\\x{ff1f}\\x{ff20}\\x{ff3b}-\\x{ff3d}\\x{ff3f}\\x{ff5b}\\x{ff5d}\\x{ff5f}-'
      . '\\x{ff65}';
  }

  /**
   * Returns the character class content used to match CJK characters.
   *
   * @return string
   *   A list of code point ranges for use inside "[...]" in a pattern with the
   *   "u" modifier. Runs of these characters are passed to expandCjk().
   */
  protected function getPregClassCjk() {
    return '\\x{1100}-\\x{11FF}\\x{3040}-\\x{309F}\\x{30A1}-\\x{318E}'
      . '\\x{31A0}-\\x{31B7}\\x{31F0}-\\x{31FF}\\x{3400}-\\x{4DBF}\\x{4E00}-\\x{9FCF}'
      . '\\x{A000}-\\x{A48F}\\x{A4D0}-\\x{A4FD}\\x{A960}-\\x{A97F}\\x{AC00}-\\x{D7FF}'
      . '\\x{F900}-\\x{FAFF}\\x{FF21}-\\x{FF3A}\\x{FF41}-\\x{FF5A}\\x{FF66}-\\x{FFDC}'
      . '\\x{20000}-\\x{2FFFD}\\x{30000}-\\x{3FFFD}';
  }

  /**
   * Replaces a text value with the list of tokens extracted from it.
   *
   * @param mixed $value
   *   The text to tokenize, passed by reference. It is replaced with an array
   *   of token objects created by Utility::createTextToken().
   * @param string $type
   *   The data type of the value (not used by this implementation).
   */
  protected function processFieldValue(&$value, $type) {
    $this->prepare();
    $text = $this->simplifyText($value);
    $arr = explode(' ', $text);

    $value = [];
    foreach ($arr as $token) {
      // Numeric tokens are always kept; everything else has to reach the
      // configured minimum length. This also discards the empty string that
      // explode() yields for empty text.
      if (is_numeric($token) || mb_strlen($token) >= $this->configuration['minimum_word_size']) {
        $value[] = Utility::createTextToken($token);
      }
    }
  }

  /**
   * Normalizes text into words separated by single spaces.
   *
   * Expects prepare() to have been called, since $this->ignored and
   * $this->spaces are used to build the patterns.
   *
   * @param string $text
   *   The text to simplify.
   *
   * @return string
   *   The simplified text, trimmed.
   */
  protected function simplifyText($text) {
    // Optionally expand each run of CJK characters into overlapping tokens.
    if ($this->configuration['overlap_cjk']) {
      $text = preg_replace_callback('/[' . $this->getPregClassCjk() . ']+/u', [$this, 'expandCjk'], $text);
    }

    // Drop punctuation that sits between two numbers, so that, e.g., "12.34"
    // becomes "1234". The lookahead keeps the following number unconsumed.
    $text = preg_replace('/([' . $this->getPregClassNumbers() . ']+)[' . $this->getPregClassPunctuation() . ']+(?=[' . $this->getPregClassNumbers() . '])/u', '\\1', $text);

    // A run of two or more ignored characters acts as a word boundary.
    $text = preg_replace('/[' . $this->ignored . ']{2,}/u', ' ', $text);

    // The remaining (single) ignored characters are removed without a trace.
    $text = preg_replace('/[' . $this->ignored . ']+/u', '', $text);

    // Every run of whitespace characters collapses to exactly one space.
    $text = preg_replace('/[' . $this->spaces . ']+/u', ' ', $text);

    return trim($text);
  }

  /**
   * Splits a run of CJK characters into overlapping tokens.
   *
   * Used as the callback for preg_replace_callback() in simplifyText(). With
   * a minimum word size of 3, "ABCDE" becomes " ABC BCD CDE ".
   *
   * @param array $matches
   *   The match array; only $matches[0] (the full match) is used.
   *
   * @return string
   *   The match itself surrounded by spaces if it is not longer than the
   *   minimum word size; otherwise all substrings of exactly that many
   *   characters, separated and surrounded by spaces.
   */
  protected function expandCjk(array $matches) {
    $min = $this->configuration['minimum_word_size'];
    $str = $matches[0];
    $length = mb_strlen($str);

    // Too short to be split: keep it as a single, space-delimited word.
    if ($length <= $min) {
      return ' ' . $str . ' ';
    }

    $tokens = ' ';
    // Sliding window holding the most recent characters.
    $chars = [];
    for ($i = 0; $i < $length; $i++) {
      // Take the first character off the string. mb_substr() extracts one
      // (possibly multi-byte) character; substr()/strlen() then cut off
      // exactly the bytes of that character.
      $current = mb_substr($str, 0, 1);
      $str = substr($str, strlen($current));
      $chars[] = $current;
      // As soon as the window holds $min characters, emit it as a token and
      // move the window forward by one character.
      if ($i >= $min - 1) {
        $tokens .= implode('', $chars) . ' ';
        array_shift($chars);
      }
    }
    return $tokens;
  }

  /**
   * Simplifies a string value and removes words that are too short.
   *
   * @param mixed $value
   *   The value to process, passed by reference. Strings are replaced with
   *   their simplified form; values of any other type are left untouched.
   */
  protected function process(&$value) {
    // Only strings can be tokenized. Without this check integers, booleans
    // and the like would be coerced to strings by the regular expression
    // functions, and NULL would be turned into an empty string.
    if (!is_string($value)) {
      return;
    }

    $this->prepare();
    $value = trim($this->simplifyText($value));

    $min = $this->configuration['minimum_word_size'];
    // With a minimum of 1 (or less) there is nothing to filter out.
    if ($min > 1) {
      $words = explode(' ', $value);
      foreach ($words as $i => $word) {
        if (mb_strlen($word) < $min) {
          unset($words[$i]);
        }
      }
      $value = implode(' ', $words);
    }
  }

  /**
   * Computes the cached character classes from the configuration, if needed.
   *
   * Each of $this->spaces and $this->ignored is only computed while it is not
   * set, so repeated calls are cheap.
   */
  protected function prepare() {
    if (!isset($this->spaces)) {
      if ($this->configuration['spaces'] !== '') {
        // Escape the pattern delimiter used by simplifyText().
        $this->spaces = str_replace('/', '\\/', $this->configuration['spaces']);
      }
      else {
        // No custom setting: fall back to the word boundary class provided by
        // the Unicode utility class.
        $this->spaces = Unicode::PREG_CLASS_WORD_BOUNDARY;
      }
    }

    if (!isset($this->ignored)) {
      if ($this->configuration['ignored'] !== '') {
        // Escape the pattern delimiter used by simplifyText().
        $this->ignored = str_replace('/', '\\/', $this->configuration['ignored']);
      }
      else {
        // An empty setting falls back to dots, underscores and dashes.
        $this->ignored = '._-';
      }
    }
  }

}