function spam_filter_bayesian_tokenize in Spam 6
Split content into an array of tokens.
2 string references to 'spam_filter_bayesian_tokenize'
- spam_filter_bayesian_spamapi in filters/spam_filter_bayesian/spam_filter_bayesian.module - Spam API Hook
- spam_filter_bayesian_spam_filter in filters/spam_filter_bayesian/spam_filter_bayesian.module - Determine whether or not the content is spam.
File
- filters/spam_filter_bayesian/spam_filter_bayesian.module, line 184 - Bayesian filter module. Copyright (c) 2007-2008 Jeremy Andrews <jeremy@tag1consulting.com>. All rights reserved.
Code
/**
 * Split content into an array of tokens.
 *
 * The result is cached in a static array for the duration of the request,
 * keyed by content type, content id and tag, so repeated calls for the same
 * combination do not re-tokenize the text.
 *
 * @param $content
 *   The content to tokenize. Objects are cast to arrays before the text is
 *   extracted with spam_get_text().
 * @param $type
 *   The content type, passed through to spam_invoke_module() and
 *   spam_get_text().
 * @param $fields
 *   Field description passed through to spam_get_text().
 * @param $extra
 *   Optional extra data passed through to spam_invoke_module() and
 *   spam_get_text().
 * @param $tag
 *   Optional string prepended to every token produced here; it is also handed
 *   to the 'tokenize' API hook.
 *
 * @return
 *   An array of tokens. Always an array; it is empty when the text yields no
 *   token of at least the configured minimum length and no hook contributes
 *   any.
 */
function spam_filter_bayesian_tokenize($content, $type, $fields, $extra = array(), $tag = NULL) {
  static $tokens = array();

  // The id is resolved from the content as passed in, before any object is
  // cast to an array.
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  if (is_object($content)) {
    $content = (array) $content;
  }

  $cache_key = "{$type}-{$id}-{$tag}";
  if (!isset($tokens[$cache_key])) {
    // Start from an empty list so the function returns an array (and
    // array_merge() below gets a valid first argument) even when no token
    // passes the minimum-length check.
    $toks = array();

    $string = spam_get_text($content, $type, $fields, $extra);

    // Force all tokens to lowercase, again to aggregate tokens. This both
    // lowers the total token number of rows in the spam_tokens table and
    // increases the strength of individual tokens by linking them to
    // capitalized versions.
    $sanitized = drupal_strtolower($string);

    // Strip out unwanted html/url noise.
    $sanitized = preg_replace("'(www\\.)|(</a>)|(href=)|(target=)|(src=)'", '', $sanitized);
    $sanitized = preg_replace("(http://|https://|ftp://|mailto:)", '', $sanitized);

    // Strip out values that should not be considered part of tokens, so
    // things like '{viagra}' and 'vi.agra' are counted as hits towards
    // 'viagra'.
    $sanitized = preg_replace("/[()\\{\\}\\[\\]#.,]/", '', $sanitized);

    // Simple CJK handling.
    if (variable_get('spam_filter_bayesian_overlap_cjk', TRUE)) {
      $sanitized = preg_replace_callback('/[' . SPAM_FILTER_BAYESIAN_PREG_CLASS_CJK . ']+/u', 'spam_filter_bayesian_expand_cjk', $sanitized);
    }

    // Divide the sanitized string into tokens.
    $delimiters = " \t\n\r-_<>'\"`/|*%^&+=~:;?";
    $min_length = variable_get('spam_filter_bayesian_minimum_token_length', 3);
    $tok = strtok($sanitized, $delimiters);
    while ($tok !== FALSE) {
      // Only keep the token if it reaches the minimum length.
      if (drupal_strlen($tok) >= $min_length) {
        // Cap the tagged token at 254 characters, then escape it. Note that
        // htmlspecialchars() can lengthen the string again when it contains
        // characters that are replaced by entities.
        $toks[] = htmlspecialchars(drupal_substr("{$tag}{$tok}", 0, 254));
      }
      $tok = strtok($delimiters);
    }

    // Allow external modules to extract additional tokens. Only merge when
    // the hook really returned a non-empty array of tokens.
    $hook = spam_invoke_api('tokenize', $string, $tag);
    if (isset($hook['tokens']) && is_array($hook['tokens']) && $hook['tokens']) {
      $toks = array_merge($toks, $hook['tokens']);
    }

    $tokens[$cache_key] = $toks;
  }

  return $tokens[$cache_key];
}