You are here

public function TokenizerTest::testCjkSupport in Search API 8

Tests that the simplifyText() method handles CJK characters properly.

The simplifyText() method does special things with numbers, symbols and punctuation. So we only test that CJK characters that are not in these character classes are tokenized properly. See PREG_CLASS_CJK for more information.

File

tests/src/Unit/Processor/TokenizerTest.php, line 157

Class

TokenizerTest
Tests the "Tokenizer" processor.

Namespace

Drupal\Tests\search_api\Unit\Processor

Code

/**
 * Tests that the simplifyText() method handles CJK characters properly.
 *
 * A sample string is built from the first, middle and last code point of a
 * number of CJK-related Unicode ranges. The test then checks two things:
 * - With the processor's initial configuration, simplifyText() returns the
 *   overlapping 3-grams of the sample string, separated by spaces.
 * - After setting "overlap_cjk" to FALSE, simplifyText() returns the sample
 *   string unchanged.
 */
public function testCjkSupport() {
  $this->invokeMethod('prepare');

  // Inclusive [first, last] code points of the Unicode ranges to sample.
  // Keeping both bounds in one table avoids two parallel arrays that have to
  // stay key-aligned by hand.
  $ranges = [
    'CJK unified' => [0x4e00, 0x9fcf],
    'CJK Ext A' => [0x3400, 0x4dbf],
    'CJK Compat' => [0xf900, 0xfaff],
    'Hangul Jamo' => [0x1100, 0x11ff],
    'Hangul Ext A' => [0xa960, 0xa97f],
    'Hangul Ext B' => [0xd7b0, 0xd7ff],
    'Hangul Compat' => [0x3131, 0x318e],
    'Half non-punct 1' => [0xff21, 0xff3a],
    'Half non-punct 2' => [0xff41, 0xff5a],
    'Half non-punct 3' => [0xff66, 0xffdc],
    'Hangul Syllables' => [0xac00, 0xd7af],
    'Hiragana' => [0x3040, 0x309f],
    'Katakana' => [0x30a1, 0x30ff],
    'Katakana Ext' => [0x31f0, 0x31ff],
    'CJK Reserve 1' => [0x20000, 0x2fffd],
    'CJK Reserve 2' => [0x30000, 0x3fffd],
    'Bopomofo' => [0x3100, 0x312f],
    'Bopomofo Ext' => [0x31a0, 0x31b7],
    'Lisu' => [0xa4d0, 0xa4fd],
    'Yi' => [0xa000, 0xa48f],
  ];

  // Collect three characters per range: its start, midpoint and end.
  $chars = [];
  foreach ($ranges as list($first, $last)) {
    // round() returns a float; cast it so the helper receives an integer
    // code point like it does for the range bounds.
    $mid = (int) round(0.5 * ($first + $last));
    $chars[] = static::codepointToUtf8($first);
    $chars[] = static::codepointToUtf8($mid);
    $chars[] = static::codepointToUtf8($last);
  }

  // Merge into a single string and tokenize.
  $text = implode('', $chars);
  $simplified_text = $this->invokeMethod('simplifyText', [$text]);

  // The expected return value consists of all the 3-grams in the original
  // string (one per character position, overlapping), separated by spaces.
  $trigrams = [];
  $char_count = count($chars);
  for ($i = 2; $i < $char_count; ++$i) {
    $trigrams[] = $chars[$i - 2] . $chars[$i - 1] . $chars[$i];
  }
  $expected = implode(' ', $trigrams);

  // Verify that the output matches what we expect.
  $this->assertEquals($expected, $simplified_text, 'CJK tokenizer worked on all supplied CJK characters');

  // Verify that disabling the "overlap_cjk" setting works as expected: after
  // re-running prepare(), the text should pass through untouched.
  $this->processor->setConfiguration([
    'overlap_cjk' => FALSE,
  ]);
  $this->invokeMethod('prepare');
  $simplified_text = $this->invokeMethod('simplifyText', [$text]);
  $this->assertEquals($text, $simplified_text, 'CJK tokenizing is successfully disabled');
}