You are here

function bayesian_spam_filter in Spam 5.3

Determine whether or not the content is spam.

1 call to bayesian_spam_filter()
bayesian_spamapi in filters/bayesian/bayesian.module
Spam API Hook

File

filters/bayesian/bayesian.module, line 55

Code

/**
 * Determine whether or not the content is spam.
 *
 * Tokenizes the content, looks up the stored 'spam' probability of every
 * token, and averages the probabilities of the most "interesting" tokens,
 * i.e. those whose probability lies furthest from the neutral 50%.
 *
 * @param $content
 *   The content to score; passed through to the tokenizer and to the
 *   content type's 'content_id' hook.
 * @param $type
 *   The content type, used for the 'content_id' lookup and for logging.
 * @param $fields
 *   Field description passed through to the tokenizer.
 * @param $extra
 *   Optional extra data passed through to the tokenizer and 'content_id' hook.
 * @param $filter_test
 *   Not used by this filter; kept for API compatibility.
 *
 * @return
 *   An array with a 'total' key holding the overall probability. When tokens
 *   were scored it also has a 'bayesian' key: a list of
 *   array('token' => ..., 'probability' => ...) entries, most interesting
 *   first. When nothing could be scored, 'total' is the configured default
 *   probability.
 */
function bayesian_spam_filter($content, $type, $fields, $extra = array(), $filter_test = FALSE) {
  $class = 'spam';
  $action = array();
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  $tokenizer = variable_get('bayesian_tokenizer', 'bayesian_tokenize');
  $tokens = $tokenizer($content, $type, $fields, $extra);
  $default = variable_get('bayesian_default_probability', 40);

  // Map "token,probability" => drift, where drift is the distance of the
  // token's probability from a median of 50%. Keying on the pair collapses
  // repeated tokens into a single entry.
  $drift = array();
  if (is_array($tokens)) {
    foreach ($tokens as $token) {
      $row = db_fetch_object(db_query("SELECT probability FROM {bayesian_tokens} WHERE class = '%s' AND token = '%s'", $class, $token));
      // db_fetch_object() yields FALSE for an unknown token, so never touch
      // the result as an object before checking it.
      $probability = ($row && $row->probability) ? $row->probability : $default;
      $drift["{$token},{$probability}"] = abs($probability - 50);
    }
  }

  $max = (int) variable_get('bayesian_interesting_tokens', 15);
  if (empty($drift) || $max < 1) {
    // No tokens (or nothing to sample): return the default score rather than
    // dividing by zero below.
    $action['total'] = $default;
    return $action;
  }

  // asort() orders by ascending drift, so the most interesting tokens end up
  // at the end; reverse the keys and keep the first $max of them.
  asort($drift);
  $interesting = array_slice(array_reverse(array_keys($drift)), 0, $max);

  $total = 0;
  foreach ($interesting as $i => $pair) {
    // Split on the LAST comma: the token itself may contain commas, the
    // probability never does.
    $split = strrpos($pair, ',');
    $token = substr($pair, 0, $split);
    $token_probability = substr($pair, $split + 1);
    $total = $total + $token_probability;
    $action['bayesian'][$i] = array(
      'token' => $token,
      'probability' => $token_probability,
    );
    spam_log(SPAM_DEBUG, 'bayesian_spam_filter', t('interesting token [@count] (@token) probability(@probability)', array(
      '@token' => $token,
      '@probability' => $token_probability,
      '@count' => $i + 1,
    )), $type, $id);
  }

  $count = count($interesting);
  $probability = round($total / $count, 1);
  spam_log(SPAM_VERBOSE, 'bayesian_spam_filter', t('total(@total) count(@count) probability(@probability)', array(
    '@probability' => $probability,
    '@total' => $total,
    '@count' => $count,
  )), $type, $id);
  $action['total'] = $probability;
  return $action;
}