function bayesian_tokenize in Spam 5.3
Split content into an array of tokens.
2 string references to 'bayesian_tokenize'
- bayesian_spamapi in filters/bayesian/bayesian.module - Spam API Hook
- bayesian_spam_filter in filters/bayesian/bayesian.module - Determine whether or not the content is spam.
File
- filters/bayesian/bayesian.module, line 138
Code
/**
 * Split content into an array of tokens.
 *
 * The text extracted from the content is stripped of common HTML/URL noise
 * and of punctuation that is often used to obfuscate words, lowercased, and
 * then split on a fixed set of delimiter characters. Results are cached in a
 * static array for the duration of the request, keyed by type, content id
 * and tag.
 *
 * @param $content
 *   The content to tokenize, as an array or an object (objects are cast to
 *   arrays before the text is extracted).
 * @param $type
 *   The content type; passed to spam_invoke_module() and spam_get_text().
 * @param $fields
 *   Field description passed through to spam_get_text().
 * @param $extra
 *   Optional extra data passed through to the spam helper functions.
 * @param $tag
 *   Optional string prepended to every token produced by this function. It is
 *   also handed to the 'tokenize' API hook.
 *
 * @return
 *   An array of token strings. Always an array, even when the content yields
 *   no tokens.
 */
function bayesian_tokenize($content, $type, $fields, $extra = array(), $tag = NULL) {
  static $tokens = array();

  // The content id is resolved before the cast below, so the hook receives
  // the content exactly as the caller passed it in.
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  if (is_object($content)) {
    $content = (array) $content;
  }

  $cache_key = "{$type}-{$id}-{$tag}";
  if (!isset($tokens[$cache_key])) {
    // Start from an empty list so that content without any usable token is
    // cached (and returned) as an empty array rather than NULL.
    $toks = array();

    $string = spam_get_text($content, $type, $fields, $extra);
    // The parentheses act as the PCRE delimiters here.
    // NOTE: this pattern is case sensitive and runs before lowercasing, so
    // only lowercase schemes are removed.
    $URI = "(http://|https://|ftp://|mailto:)";
    // strip out unwanted html/url noise
    $sanitized = preg_replace("'(www\\.)|(</a>)|(href=)|(target=)|(src=)'i", '', $string);
    $sanitized = preg_replace($URI, '', $sanitized);
    // Strip out values that should not be considered part of tokens, so
    // things like '{viagra}' and 'vi.agra' are counted as hits towards
    // 'viagra'
    $sanitized = preg_replace("/[()\\{\\}\\[\\]#.,]/", '', $sanitized);
    // Force all tokens to lowercase, again to aggregate tokens. This both
    // lowers the total token number of rows in the spam_tokens table and
    // increases the strength of individual tokens by linking them to
    // capitalized versions.
    $sanitized = strtolower($sanitized);

    // The minimum length does not change while tokenizing, so look it up
    // once instead of once per token.
    $minimum_length = variable_get('bayesian_minimum_token_length', 3);

    // divide sanitized string into tokens
    $delimiters = " \t\n\r-_<>'\"`/|*%^&+=~:;?";
    $tok = strtok($sanitized, $delimiters);
    while ($tok !== FALSE) {
      // Only inspect the token if over minimum length.
      if (strlen($tok) >= $minimum_length) {
        // Cap the tagged token at 254 bytes, then HTML-escape it.
        $toks[] = htmlspecialchars(substr("{$tag}{$tok}", 0, 254));
      }
      $tok = strtok($delimiters);
    }

    // allow external module ability to extract additional tokens
    $hook = spam_invoke_api('tokenize', $string, $tag);
    if (is_array($hook) && !empty($hook['tokens']) && is_array($hook['tokens'])) {
      $toks = array_merge($toks, $hook['tokens']);
    }

    $tokens[$cache_key] = $toks;
  }

  return $tokens[$cache_key];
}