View source
<?php
/**
 * Replaces each word of a text with its stem, leaving separators untouched.
 *
 * The text is lower-cased and typographic apostrophes are turned into "'".
 * It is then split on PORTERSTEMMER_BOUNDARY with the separators captured, so
 * that the pieces can be glued back together after stemming.
 *
 * @param string $text
 *   Text to process.
 *
 * @return string
 *   The processed text, with every word piece stemmed.
 */
function porterstemmer_search_preprocess($text) {
  $text = drupal_strtolower(str_replace('’', "'", $text));
  $pieces = preg_split('/(' . PORTERSTEMMER_BOUNDARY . '+)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  if (count($pieces) == 0) {
    return $text;
  }

  // Use the PECL stem extension when it is available, our own code otherwise.
  $use_pecl = _porterstemmer_pecl_loaded();

  // Pieces alternate between words and separators; work out which kind the
  // first piece is and flip the flag after every piece.
  $is_word = !preg_match('/' . PORTERSTEMMER_BOUNDARY . '/', $pieces[0]);
  foreach ($pieces as $index => $piece) {
    if ($is_word) {
      $pieces[$index] = $use_pecl ? stem_english($piece) : porterstemmer_stem($piece);
    }
    $is_word = !$is_word;
  }

  return implode('', $pieces);
}
/**
 * Returns the help text for the module's help page.
 *
 * @param string $path
 *   Path for which help is requested.
 * @param array $arg
 *   Unused here.
 *
 * @return string|null
 *   An HTML paragraph for 'admin/help#porterstemmer', nothing for any other
 *   path.
 */
function porterstemmer_help($path, $arg) {
  // Loose comparison on purpose: it mirrors what a switch statement does.
  if ($path != 'admin/help#porterstemmer') {
    return;
  }

  $replacements = array(
    '@search-help' => url('admin/help/search'),
    '@algorithm' => 'http://snowball.tartarus.org/algorithms/english/stemmer.html',
  );
  $paragraph = t('The Porter Stemmer module implements version 2 of the <a href="@algorithm">Porter Stemmer algorithm</a>, to improve American English-language searching with the core <a href="@search-help">Search module</a>. Stemming reduces a word to its basic root or stem (e.g. "blogging" to "blog") so that variations on a word ("blogs", "blogged", "blogging", "blog") are considered equivalent when searching. This generally results in more relevant results.', $replacements);

  return '<p>' . $paragraph . '</p>';
}
/**
 * Finds a word in a text whose stem matches the stem of a search key.
 *
 * The key is stemmed and a trailing 'i', 'e' or 'y' is dropped from the stem
 * so that the result can be looked for at the start of a word. A candidate
 * word is accepted only if its own stem, reduced the same way, equals the
 * reduced key; otherwise the search continues after that candidate.
 *
 * @param string $key
 *   Keyword to look for.
 * @param string $text
 *   Text to search in.
 * @param int $offset
 *   Byte offset in $text at which to start searching.
 * @param string $boundary
 *   Regular expression fragment that matches a word boundary.
 *
 * @return array|false
 *   FALSE if nothing matches. Otherwise an array with keys 'where' (byte
 *   position of the matched word in $text) and 'keyword' (the matched word).
 */
function porterstemmer_sbp_excerpt_match($key, $text, $offset, $boundary) {
  // Reduce the key to its stem, minus a trailing i/e/y.
  $key = porterstemmer_stem($key);
  $didit = FALSE;
  porterstemmer_suffix($key, 'i', '', $didit, NULL, 2) or porterstemmer_suffix($key, 'e', '', $didit, NULL, 2) or porterstemmer_suffix($key, 'y', '', $didit, NULL, 2);

  // The key is literal text, not a pattern: escape it so that characters
  // such as '/', '(', '+' or '.' cannot break or alter the expression.
  $quoted_key = preg_quote($key, '/');

  $match = array();
  if (!preg_match('/' . $boundary . '(' . $quoted_key . ')/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
    return FALSE;
  }

  // Extend the match to the end of the word it is part of.
  $newmatch = array();
  $pos = $match[1][1];
  if (preg_match('/' . $boundary . '/iu', $text, $newmatch, PREG_OFFSET_CAPTURE, $pos + strlen($key))) {
    $keyfound = substr($text, $pos, $newmatch[0][1] - $pos);
  }
  else {
    $keyfound = substr($text, $pos);
  }

  // Reduce the word that was found in the same way as the key.
  $foundstem = porterstemmer_stem($keyfound);
  $didit = FALSE;
  porterstemmer_suffix($foundstem, 'i', '', $didit, NULL, 2) or porterstemmer_suffix($foundstem, 'e', '', $didit, NULL, 2) or porterstemmer_suffix($foundstem, 'y', '', $didit, NULL, 2);

  // Strict comparison: with '==', numeric-looking strings such as '0' and
  // '00' would be compared as numbers and wrongly considered equal.
  if (drupal_strtolower($foundstem) === drupal_strtolower($key)) {
    return array(
      'where' => $pos,
      'keyword' => $keyfound,
    );
  }

  // Not a real match: keep looking after the word that was just rejected.
  return porterstemmer_sbp_excerpt_match($key, $text, $pos + strlen($keyfound), $boundary);
}
/**
 * Tells whether the PECL "stem" extension can be used for English stemming.
 *
 * The check is done once per request; later calls return the cached answer.
 *
 * @return bool
 *   TRUE if the 'stem' extension is loaded and stem_english() exists.
 */
function _porterstemmer_pecl_loaded() {
  // NULL means "not checked yet".
  static $available = NULL;

  if ($available === NULL) {
    $available = extension_loaded('stem') && function_exists('stem_english');
  }

  return $available;
}
// Regular expression fragments shared by the stemming functions. They are
// meant to be embedded in larger patterns, so they carry no delimiters.

// Character class matching one vowel; lower-case 'y' counts as a vowel.
define('PORTERSTEMMER_VOWEL', '[aeiouy]');
// Character class matching anything that is not a vowel. This includes the
// upper-case 'Y' that porterstemmer_prestemming() uses as a marker.
define('PORTERSTEMMER_NOT_VOWEL', '[^aeiouy]');
// Character class matching anything that is not a vowel, 'w', 'x' or 'Y'.
define('PORTERSTEMMER_NOT_VOWEL_WXY', '[^aeiouywxY]');
// Doubled letters that porterstemmer_step1b() reduces to a single letter.
define('PORTERSTEMMER_DOUBLE', '(bb|dd|ff|gg|mm|nn|pp|rr|tt)');
// Letters that must precede 'li' for porterstemmer_step2() to remove it.
define('PORTERSTEMMER_LI_END', '[cdeghkmnrt]');
// One or more characters that are neither ASCII letters nor apostrophes;
// used to separate words.
define('PORTERSTEMMER_BOUNDARY', "[^a-zA-Z']+");
/**
 * Stems a single word.
 *
 * The steps run in a fixed order. Every step returns TRUE to say "stop here",
 * in which case the remaining steps are skipped. Post-processing always runs.
 *
 * @param string $word
 *   Word to stem.
 *
 * @return string
 *   The stemmed word.
 */
function porterstemmer_stem($word) {
  // Start offsets of the R1 and R2 regions, filled in by the pre-stemming.
  $r1 = 0;
  $r2 = 0;

  if (!porterstemmer_prestemming($word, $r1, $r2)
    && !porterstemmer_exception1($word)
    && !porterstemmer_step0($word)
    && !porterstemmer_step1a($word)
    && !porterstemmer_exception2($word)
    && !porterstemmer_step1b($word, $r1)
    && !porterstemmer_step1c($word)
    && !porterstemmer_step2($word, $r1)
    && !porterstemmer_step3($word, $r1, $r2)
    && !porterstemmer_step4($word, $r2)) {
    porterstemmer_step5($word, $r1, $r2);
  }

  porterstemmer_poststemming($word);

  return $word;
}
/**
 * Checks whether a word is shorter than the minimum word size.
 *
 * The minimum is read from the 'minimum_word_size' variable (default 3), is
 * never allowed to be less than 2, and is cached in a static variable.
 *
 * @param string $word
 *   Word to check.
 * @param bool $reset
 *   TRUE to read the minimum size again instead of using the cached value.
 *
 * @return bool
 *   TRUE if the word has fewer characters than the minimum, FALSE otherwise.
 */
function porterstemmer_too_short($word, $reset = FALSE) {
  static $min_chars = 0;

  if ($reset || !$min_chars) {
    $min_chars = max(2, intval(variable_get('minimum_word_size', 3)));
  }

  return drupal_strlen($word) < $min_chars;
}
/**
 * Finishes a stemming step by accepting or rejecting its result.
 *
 * @param string $word
 *   Word being stemmed; overwritten with $tmp when $tmp is accepted.
 * @param string $tmp
 *   Result the step came up with.
 *
 * @return bool
 *   TRUE if $tmp is too short, in which case $word is left alone and stemming
 *   should stop. FALSE if $tmp was stored in $word.
 */
function porterstemmer_step_ending(&$word, $tmp) {
  $rejected = porterstemmer_too_short($tmp);
  if (!$rejected) {
    $word = $tmp;
  }
  return $rejected;
}
/**
 * Replaces an ending of a word, provided some conditions are met.
 *
 * @param string $word
 *   Word to modify.
 * @param string $oldend
 *   Ending to look for; used as-is inside a regular expression.
 * @param string $newend
 *   Text to put in place of the ending.
 * @param bool $didit
 *   Set to TRUE if the replacement was made; not touched otherwise.
 * @param string|null $other
 *   Optional complete regular expression the word must also match for the
 *   replacement to happen.
 * @param int $minlen
 *   Minimum length, in characters, the word must have for the replacement to
 *   happen.
 *
 * @return bool
 *   FALSE if the word does not end in $oldend. TRUE if it does, whether or
 *   not the replacement was actually made.
 */
function porterstemmer_suffix(&$word, $oldend, $newend, &$didit, $other = NULL, $minlen = 1) {
  $ending = '/' . $oldend . '$/';

  if (preg_match($ending, $word)) {
    // The ending is there; see whether anything forbids replacing it.
    $blocked = ($other && !preg_match($other, $word)) || drupal_strlen($word) < $minlen;
    if (!$blocked) {
      $word = preg_replace($ending, $newend, $word);
      $didit = TRUE;
    }
    return TRUE;
  }

  return FALSE;
}
/**
 * Tells whether a word counts as "short".
 *
 * A word is short if it is no longer than $r1 and it either consists of a
 * vowel followed by a non-vowel, or ends in non-vowel, vowel, non-vowel where
 * the last letter is not w, x or Y.
 *
 * @param string $word
 *   Word to test.
 * @param int $r1
 *   Start offset of the R1 region.
 *
 * @return bool
 *   TRUE if the word is short, FALSE otherwise.
 */
function porterstemmer_short_word($word, $r1) {
  if (drupal_strlen($word) > $r1) {
    return FALSE;
  }

  $whole_word = '/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL . '$/';
  $word_end = '/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL_WXY . '$/';

  return preg_match($whole_word, $word) || preg_match($word_end, $word);
}
/**
 * Prepares a word for stemming and locates its R1 and R2 regions.
 *
 * A leading apostrophe is removed, and every 'y' that starts the word or
 * follows a vowel is turned into 'Y' so that later steps do not treat it as a
 * vowel.
 *
 * @param string $word
 *   Word to prepare; modified unless it is too short.
 * @param int $r1
 *   Receives the start offset of R1.
 * @param int $r2
 *   Receives the start offset of R2.
 *
 * @return bool
 *   TRUE if the word is too short to be stemmed (nothing is modified), FALSE
 *   otherwise.
 */
function porterstemmer_prestemming(&$word, &$r1, &$r2) {
  if (porterstemmer_too_short($word)) {
    return TRUE;
  }

  $prepared = preg_replace("/^'/", '', $word);
  if (porterstemmer_too_short($prepared)) {
    return TRUE;
  }

  // Mark the consonant-like y's, one at a time, until nothing changes.
  $prepared = preg_replace('/^y/', 'Y', $prepared);
  $y_after_vowel = '/(' . PORTERSTEMMER_VOWEL . ')y/';
  do {
    $previous = $prepared;
    $prepared = preg_replace($y_after_vowel, '$1Y', $prepared, 1);
  } while ($previous != $prepared);
  $word = $prepared;

  // Both regions are empty (start at the end of the word) until found.
  $length = drupal_strlen($word);
  $r1 = $length;
  $r2 = $length;

  // A region starts right after the first non-vowel that follows a vowel.
  $region = '/^' . PORTERSTEMMER_NOT_VOWEL . '*' . PORTERSTEMMER_VOWEL . '+(' . PORTERSTEMMER_NOT_VOWEL . ')/';
  $found = array();
  if (preg_match('/^(gener|commun|arsen)/', $word, $found)) {
    // For these prefixes, R1 starts right after the prefix.
    $r1 = drupal_strlen($found[1]);
  }
  elseif (preg_match($region, $word, $found, PREG_OFFSET_CAPTURE)) {
    $r1 = $found[1][1] + 1;
  }

  // R2 is found by applying the same definition to the text of R1.
  $r1_text = drupal_substr($word, $r1);
  if ($r1_text && preg_match($region, $r1_text, $found, PREG_OFFSET_CAPTURE)) {
    $r2 = $r1 + $found[1][1] + 1;
  }

  return FALSE;
}
/**
 * Cleans up after stemming: turns every 'Y' marker back into 'y'.
 *
 * @param string $word
 *   Word to clean up, modified in place.
 */
function porterstemmer_poststemming(&$word) {
  $word = strtr($word, 'Y', 'y');
}
/**
 * Step 0: removes a trailing "'s'", "'s" or "'".
 *
 * @param string $word
 *   Word to stem, modified in place if the result is accepted.
 *
 * @return bool
 *   TRUE if stemming should stop, FALSE to go on with the next step.
 */
function porterstemmer_step0(&$word) {
  $candidate = $word;
  $didit = FALSE;

  // Longest ending first; stop at the first one that is present.
  foreach (array("'s'", "'s", "'") as $ending) {
    if (porterstemmer_suffix($candidate, $ending, '', $didit)) {
      break;
    }
  }

  return porterstemmer_step_ending($word, $candidate);
}
/**
 * Step 1a: handles the endings sses, ies, ied, ss, us and s.
 *
 * @param string $word
 *   Word to stem, modified in place if the result is accepted.
 *
 * @return bool
 *   TRUE if stemming should stop, FALSE to go on with the next step.
 */
function porterstemmer_step1a(&$word) {
  $candidate = $word;
  $didit = FALSE;

  if (porterstemmer_suffix($candidate, 'sses', 'ss', $didit)) {
    // Handled; nothing else to try.
  }
  elseif (porterstemmer_suffix($candidate, 'ies', 'ie', $didit, '/^.?ies$/')) {
    // 'ies' became 'ie' only if at most one letter precedes it; for longer
    // words it becomes 'i'.
    if (!$didit) {
      porterstemmer_suffix($candidate, 'ies', 'i', $didit);
    }
  }
  elseif (porterstemmer_suffix($candidate, 'ied', 'ie', $didit, '/^.?ied$/')) {
    // Same rule as for 'ies'.
    if (!$didit) {
      porterstemmer_suffix($candidate, 'ied', 'i', $didit);
    }
  }
  elseif (porterstemmer_suffix($candidate, 'ss', 'ss', $didit)) {
    // 'ss' is left as it is.
  }
  elseif (porterstemmer_suffix($candidate, 'us', 'us', $didit)) {
    // 'us' is left as it is.
  }
  else {
    // A final 's' goes away if a vowel occurs earlier in the word, but not
    // immediately before the 's'.
    porterstemmer_suffix($candidate, 's', '', $didit, '/' . PORTERSTEMMER_VOWEL . '.+s$/');
  }

  return porterstemmer_step_ending($word, $candidate);
}
/**
 * Step 1b: handles the endings eed, eedly, ed, edly, ing and ingly.
 *
 * 'eed' and 'eedly' become 'ee' when they lie within R1. The other endings
 * are removed when a vowel occurs before them; the word is then repaired:
 * 'at', 'bl' and 'iz' get an 'e' added, a doubled letter from
 * PORTERSTEMMER_DOUBLE loses its last letter, and a short word gets an 'e'
 * added.
 *
 * @param string $word
 *   Word to stem, modified in place if the result is accepted.
 * @param int $r1
 *   Start offset of the R1 region.
 *
 * @return bool
 *   TRUE if stemming should stop, FALSE to go on with the next step.
 */
function porterstemmer_step1b(&$word, $r1) {
  $tmp = $word;
  $didit = FALSE;
  $done = (porterstemmer_suffix($tmp, 'eedly', 'ee', $didit, NULL, $r1 + 5) or porterstemmer_suffix($tmp, 'eed', 'ee', $didit, NULL, $r1 + 3));

  // From here on, $didit only tells whether one of the ed/ing endings was
  // removed.
  $didit = FALSE;
  if (!$done) {
    porterstemmer_suffix($tmp, 'edly', '', $didit, '/' . PORTERSTEMMER_VOWEL . '.*edly$/') or porterstemmer_suffix($tmp, 'ed', '', $didit, '/' . PORTERSTEMMER_VOWEL . '.*ed$/') or porterstemmer_suffix($tmp, 'ingly', '', $didit, '/' . PORTERSTEMMER_VOWEL . '.*ingly$/') or porterstemmer_suffix($tmp, 'ing', '', $didit, '/' . PORTERSTEMMER_VOWEL . '.*ing$/');
  }

  if ($didit) {
    // The parentheses matter: '=' binds tighter than 'or', so without them
    // $done would only reflect the 'at' test and stay FALSE when the 'bl' or
    // 'iz' rule was the one that applied.
    $done = (porterstemmer_suffix($tmp, 'at', 'ate', $didit) or porterstemmer_suffix($tmp, 'bl', 'ble', $didit) or porterstemmer_suffix($tmp, 'iz', 'ize', $didit));
    if (!$done && preg_match('/' . PORTERSTEMMER_DOUBLE . '$/', $tmp)) {
      $tmp = drupal_substr($tmp, 0, -1);
      $done = TRUE;
    }
    if (!$done && porterstemmer_short_word($tmp, $r1)) {
      $tmp = $tmp . 'e';
    }
  }

  return porterstemmer_step_ending($word, $tmp);
}
/**
 * Step 1c: turns a final 'y' or 'Y' into 'i'.
 *
 * This only happens when the letter before it is a non-vowel that is not the
 * first letter of the word.
 *
 * @param string $word
 *   Word to stem, modified in place if the result is accepted.
 *
 * @return bool
 *   TRUE if stemming should stop, FALSE to go on with the next step.
 */
function porterstemmer_step1c(&$word) {
  $candidate = $word;
  $didit = FALSE;
  $condition = '/.' . PORTERSTEMMER_NOT_VOWEL . '[Yy]$/';

  if (!porterstemmer_suffix($candidate, 'Y', 'i', $didit, $condition)) {
    porterstemmer_suffix($candidate, 'y', 'i', $didit, $condition);
  }

  return porterstemmer_step_ending($word, $candidate);
}
/**
 * Step 2: replaces a number of longer endings that lie within R1.
 *
 * @param string $word
 *   Word to stem, modified in place if the result is accepted.
 * @param int $r1
 *   Start offset of the R1 region.
 *
 * @return bool
 *   TRUE if stemming should stop, FALSE to go on with the next step.
 */
function porterstemmer_step2(&$word, $r1) {
  $candidate = $word;
  $didit = FALSE;

  // Each rule: ending, replacement, optional extra condition. Rules are
  // tried in this order and the first ending that is present ends the search.
  $rules = array(
    array('ational', 'ate', NULL),
    array('fulness', 'ful', NULL),
    array('iveness', 'ive', NULL),
    array('ization', 'ize', NULL),
    array('ousness', 'ous', NULL),
    array('biliti', 'ble', NULL),
    array('lessli', 'less', NULL),
    array('tional', 'tion', NULL),
    array('aliti', 'al', NULL),
    array('ation', 'ate', NULL),
    array('alism', 'al', NULL),
    array('entli', 'ent', NULL),
    array('fulli', 'ful', NULL),
    array('iviti', 'ive', NULL),
    array('ousli', 'ous', NULL),
    array('abli', 'able', NULL),
    array('alli', 'al', NULL),
    array('ator', 'ate', NULL),
    array('anci', 'ance', NULL),
    array('enci', 'ence', NULL),
    array('izer', 'ize', NULL),
    array('bli', 'ble', NULL),
    array('ogi', 'og', '/logi$/'),
    array('li', '', '/' . PORTERSTEMMER_LI_END . 'li$/'),
  );

  foreach ($rules as $rule) {
    list($ending, $replacement, $condition) = $rule;
    // The ending lies within R1 when the word is at least as long as the
    // R1 offset plus the ending.
    if (porterstemmer_suffix($candidate, $ending, $replacement, $didit, $condition, $r1 + strlen($ending))) {
      break;
    }
  }

  return porterstemmer_step_ending($word, $candidate);
}
/**
 * Step 3: replaces or removes further endings that lie within R1 or R2.
 *
 * @param string $word
 *   Word to stem, modified in place if the result is accepted.
 * @param int $r1
 *   Start offset of the R1 region.
 * @param int $r2
 *   Start offset of the R2 region.
 *
 * @return bool
 *   TRUE if stemming should stop, FALSE to go on with the next step.
 */
function porterstemmer_step3(&$word, $r1, $r2) {
  $candidate = $word;
  $didit = FALSE;

  // Each rule: ending, replacement, start offset of the region the ending
  // has to lie in. Only 'ative' is tied to R2. Rules are tried in this order
  // and the first ending that is present ends the search.
  $rules = array(
    array('ational', 'ate', $r1),
    array('tional', 'tion', $r1),
    array('alize', 'al', $r1),
    array('ative', '', $r2),
    array('icate', 'ic', $r1),
    array('iciti', 'ic', $r1),
    array('ical', 'ic', $r1),
    array('ness', '', $r1),
    array('ful', '', $r1),
  );

  foreach ($rules as $rule) {
    list($ending, $replacement, $region_start) = $rule;
    if (porterstemmer_suffix($candidate, $ending, $replacement, $didit, NULL, $region_start + strlen($ending))) {
      break;
    }
  }

  return porterstemmer_step_ending($word, $candidate);
}
/**
 * Step 4: removes a number of endings that lie within R2.
 *
 * @param string $word
 *   Word to stem, modified in place if the result is accepted.
 * @param int $r2
 *   Start offset of the R2 region.
 *
 * @return bool
 *   TRUE if stemming should stop, FALSE to go on with the next step.
 */
function porterstemmer_step4(&$word, $r2) {
  $candidate = $word;
  $didit = FALSE;

  // Tried in this order; the first ending that is present ends the search.
  $endings = array(
    'ement',
    'able',
    'ance',
    'ence',
    'ible',
    'ment',
    'ant',
    'ate',
    'ent',
    'ion',
    'ism',
    'iti',
    'ive',
    'ize',
    'ous',
    'al',
    'er',
    'ic',
  );

  foreach ($endings as $ending) {
    // 'ion' is only removed when it follows an 's' or a 't'.
    $condition = ($ending === 'ion') ? '/[st]ion$/' : NULL;
    if (porterstemmer_suffix($candidate, $ending, '', $didit, $condition, $r2 + strlen($ending))) {
      break;
    }
  }

  return porterstemmer_step_ending($word, $candidate);
}
/**
 * Step 5: shortens a final 'll' to 'l', or removes a final 'e'.
 *
 * 'll' is shortened when its last letter lies within R2. Otherwise a final
 * 'e' is removed when it lies within R2, or when it lies within R1 and what
 * precedes it does not match one of the two short-syllable patterns.
 *
 * @param string $word
 *   Word to stem, modified in place if the result is accepted.
 * @param int $r1
 *   Start offset of the R1 region.
 * @param int $r2
 *   Start offset of the R2 region.
 *
 * @return bool
 *   TRUE if the result was rejected as too short, FALSE otherwise.
 */
function porterstemmer_step5(&$word, $r1, $r2) {
  $candidate = $word;
  $didit = FALSE;

  $ends_in_ll = porterstemmer_suffix($candidate, 'll', 'l', $didit, NULL, $r2 + 1);

  if (!$ends_in_ll && preg_match('/e$/', $candidate)) {
    $length = drupal_strlen($candidate);
    $in_r2 = $length > $r2;
    $in_r1 = $length > $r1;
    $after_short_syllable = preg_match('/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL . 'e$/', $candidate)
      || preg_match('/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL_WXY . 'e$/', $candidate);

    if ($in_r2 || ($in_r1 && !$after_short_syllable)) {
      $candidate = drupal_substr($candidate, 0, -1);
    }
  }

  return porterstemmer_step_ending($word, $candidate);
}
/**
 * Handles words with a fixed stem, before any regular step is run.
 *
 * @param string $word
 *   Word to look up; replaced by its fixed stem when it is in the list.
 *
 * @return bool
 *   TRUE if the word is in the list (stemming should stop), FALSE otherwise.
 */
function porterstemmer_exception1(&$word) {
  static $fixed_stems = array(
    // Irregular forms.
    'skis' => 'ski',
    'skies' => 'sky',
    'dying' => 'die',
    'lying' => 'lie',
    'tying' => 'tie',
    'idly' => 'idl',
    'gently' => 'gentl',
    'ugly' => 'ugli',
    'early' => 'earli',
    'only' => 'onli',
    'singly' => 'singl',
    // Words that must stay as they are.
    'sky' => 'sky',
    'news' => 'news',
    'howe' => 'howe',
    'atlas' => 'atlas',
    'cosmos' => 'cosmos',
    'bias' => 'bias',
    'andes' => 'andes',
  );

  if (!isset($fixed_stems[$word])) {
    return FALSE;
  }

  $word = $fixed_stems[$word];
  return TRUE;
}
/**
 * Tells whether a word must be left alone once step 1a is done.
 *
 * @param string $word
 *   Word to look up; never modified here.
 *
 * @return bool
 *   TRUE if the word is in the list (stemming should stop), FALSE otherwise.
 */
function porterstemmer_exception2(&$word) {
  static $leave_alone = array(
    'inning' => TRUE,
    'outing' => TRUE,
    'canning' => TRUE,
    'herring' => TRUE,
    'earring' => TRUE,
    'proceed' => TRUE,
    'exceed' => TRUE,
    'succeed' => TRUE,
  );

  return isset($leave_alone[$word]);
}