function spam_bayesian_filter in Spam 5
Simple Bayesian logic to determine the probability that the passed in array of tokens is spam.
Parameters
$tokens: An array of tokens.
Return value
A number from 1 to 99, rounded to one decimal place, which is the probability that the array of tokens passed to this function is spam.
1 call to spam_bayesian_filter()
- spam_content_filter in ./spam.module - Determine whether or not provided text is spam.
File
- ./spam.module, line 1181
Code
/**
 * Simple Bayesian logic to estimate the probability that a set of tokens is
 * spam.
 *
 * Each token is looked up in the {spam_tokens} table. Tokens that have no
 * stored (non-zero) probability are given the 'spam_default_probability'
 * setting (40 if unset). The tokens whose probability lies furthest from 50
 * are then averaged; at most 'spam_interesting_tokens' (15 if unset) tokens
 * take part in that average.
 *
 * @param $tokens
 *   An array of tokens. Duplicate tokens are counted once.
 *
 * @return
 *   The average spam probability of the examined tokens, rounded to one
 *   decimal place. Returns 0 when no tokens could be examined (empty input or
 *   a non-positive 'spam_interesting_tokens' setting) instead of dividing by
 *   zero.
 */
function spam_bayesian_filter($tokens = array()) {
  // Loop-invariant: probability assigned to tokens the database doesn't know.
  $default = variable_get('spam_default_probability', 40);

  /* Look up each token in the database to assign a probability that the token
   * is spam. If the token doesn't exist in the database, assign a default
   * probability. Finally, calculate how far this probability is away from
   * a median of 50%.
   *
   * Probability and drift are kept in two maps keyed by the token itself, so
   * that tokens containing a comma (or any other character) are not mangled
   * by packing and re-splitting a "token,probability" string.
   */
  $probabilities = array();
  $drift = array();
  foreach ($tokens as $token) {
    // TODO: Optimize, this is a lot of database queries!
    $row = db_fetch_object(db_query("SELECT probability FROM {spam_tokens} WHERE token = '%s'", $token));
    // db_fetch_object() yields FALSE when there is no matching row, so the
    // result must be checked before its property is read.
    $probability = ($row && $row->probability) ? $row->probability : $default;
    $probabilities[$token] = $probability;
    $drift[$token] = abs($probability - 50);
  }

  /* Sort token array so those tokens with the largest "drift" come last.
   * Drift is the distance from a median of 50%.
   */
  asort($drift);

  /* Take the n most "interesting" tokens from the end of the sorted array.
   * The larger a token's drift, the more interesting it is.
   */
  $candidates = array_keys($drift);
  $max = variable_get('spam_interesting_tokens', 15);
  $total = 0;
  $examined = 0;
  while ($examined < $max && count($candidates) > 0) {
    $token = array_pop($candidates);
    $total = $total + $probabilities[$token];
    spam_log(SPAM_DEBUG, t("bayesian filter: token '@token' spam probability @percent%", array(
      '@token' => $token,
      '@percent' => $probabilities[$token],
    )));
    $examined++;
  }

  // With nothing examined there is no average to compute; report 0 rather
  // than dividing by zero.
  $probability = ($examined > 0) ? round($total / $examined, 1) : 0.0;
  spam_log(SPAM_LOG, t("bayesian filter: examined @num tokens, spam probability of @percent%", array(
    '@num' => $examined,
    '@percent' => $probability,
  )));
  return $probability;
}