You are here

spam_filter_bayesian.module in Spam 6

Bayesian filter module Copyright(c) 2007-2008 Jeremy Andrews <jeremy@tag1consulting.com>. All rights reserved.

Provides a generic Bayesian filter for use with other modules. Defines hooks for use with the Spam API.

File

filters/spam_filter_bayesian/spam_filter_bayesian.module
View source
<?php

/**
 * @file
 * Bayesian filter module
 * Copyright(c) 2007-2008
 *  Jeremy Andrews <jeremy@tag1consulting.com>.  All rights reserved.
 *
 * Provides a generic Bayesian filter for use with other modules.
 *  Defines hooks for use with the Spam API.
 */

/**
 * Character ranges for CJK text that is a candidate for auto-splitting.
 *
 * Borrowed from the core search module. The value is a list of code point
 * ranges (kana and BMP ideographs used by Chinese, Japanese and Korean)
 * written so it can be dropped between the square brackets of a PCRE
 * character class that is compiled with the /u modifier.
 */
define('SPAM_FILTER_BAYESIAN_PREG_CLASS_CJK', '\\x{3041}-\\x{30ff}\\x{31f0}-\\x{31ff}\\x{3400}-\\x{4db5}\\x{4e00}-\\x{9fbb}\\x{f900}-\\x{fad9}');

/**
 * Implementation of hook_menu().
 *
 * Registers the Bayesian filter settings form as a local task below the
 * spam filter administration path.
 *
 * @return
 *   An array of menu items keyed by path.
 */
function spam_filter_bayesian_menu() {
  return array(
    'admin/settings/spam/filters/bayes' => array(
      'title' => 'Bayesian',
      // The page is a form, built by spam_filter_bayesian_admin_settings().
      'page callback' => 'drupal_get_form',
      'page arguments' => array('spam_filter_bayesian_admin_settings'),
      'access arguments' => array('administer spam'),
      'description' => 'Configure the bayesian spam filter module.',
      'type' => MENU_LOCAL_TASK,
    ),
  );
}

/**
 * Administrative interface for configuring the bayesian filter rules.
 *
 * Builds a settings form with a single checkbox that toggles the simple
 * CJK tokenizer (the 'spam_filter_bayesian_overlap_cjk' variable, enabled
 * by default).
 *
 * @return
 *   The form array as returned by system_settings_form().
 */
function spam_filter_bayesian_admin_settings() {
  $cjk_toggle = array(
    '#type' => 'checkbox',
    '#title' => t('Simple CJK handling'),
    '#default_value' => variable_get('spam_filter_bayesian_overlap_cjk', TRUE),
    '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Does not affect other languages.'),
  );

  return system_settings_form(array(
    'spam_filter_bayesian_overlap_cjk' => $cjk_toggle,
  ));
}

/**
 * Spam API Hook
 *
 * Dispatches on $op:
 * - 'filter': score the content with the bayesian filter (only when the
 *   spam module reports this filter as enabled).
 * - 'filter_module': return this module's machine name.
 * - 'filter_info': return descriptive information about the filter.
 * - 'filter_install': return the default install settings.
 * - 'mark_as_spam' / 'mark_as_not_spam': retrain the token table from
 *   $extra['content'] (only when this filter is enabled).
 *
 * @param $op
 *   The operation being performed.
 * @param $type
 *   The content type being acted upon.
 * @param $content
 *   The content being filtered.
 * @param $fields
 *   Field definitions passed through to the filter.
 * @param $extra
 *   Additional data; the mark_as_* operations read its 'id' and 'content'
 *   keys.
 *
 * @return
 *   Depends on $op; NULL for operations that produce no value or are not
 *   handled here.
 */
function spam_filter_bayesian_spamapi($op, $type = NULL, $content = array(), $fields = array(), $extra = NULL) {
  switch ($op) {
    case 'filter':
      if (!module_invoke('spam', 'filter_enabled', 'spam_filter_bayesian', $type, $content, $fields, $extra)) {
        return;
      }
      return spam_filter_bayesian_spam_filter($content, $type, $fields, $extra);

    case 'filter_module':
      return 'spam_filter_bayesian';

    case 'filter_info':
      return array(
        'name' => t('Bayesian filter'),
        // Machine name: never run it through t(), a translation would
        // change the identifier.
        'module' => 'spam_filter_bayesian',
        'description' => t('A bayesian spam filter.'),
        'help' => t('The bayesian filter can learn to tell the difference between valid content and spam content.'),
      );

    case 'filter_install':
      return array(
        'status' => SPAM_FILTER_ENABLED,
      );

    case 'mark_as_spam':
    case 'mark_as_not_spam':
      if (!module_invoke('spam', 'filter_enabled', 'spam_filter_bayesian', $type, $content, $fields, $extra)) {
        return;
      }
      spam_log(SPAM_DEBUG, 'spam_filter_bayesian_spamapi', t('@op', array(
        '@op' => $op,
      )), $type, $extra['id']);

      // Re-tokenize the stored content and feed the tokens back into the
      // probability table as a "yes" (spam) or "no" (not spam) vote.
      $fields = spam_invoke_module($type, 'filter_fields', $extra['content']);
      $tokenizer = variable_get('spam_filter_bayesian_tokenizer', 'spam_filter_bayesian_tokenize');
      $tokens = $tokenizer($extra['content'], $type, $fields, $extra);
      spam_filter_bayesian_tokens_update('spam', $tokens, $op == 'mark_as_spam', $type, $extra['id']);
      break;
  }
}

/**
 * Determine whether or not the content is spam.
 *
 * Tokenizes the content, looks up the stored probability of every token and
 * averages the probabilities of the most "interesting" tokens, i.e. those
 * whose probability lies furthest from 50.
 *
 * @param $content
 *   The content to score; handed to the tokenizer.
 * @param $type
 *   The content type, used to resolve the content id and for logging.
 * @param $fields
 *   Field definitions handed to the tokenizer.
 * @param $extra
 *   Additional data handed to the tokenizer and the content_id hook.
 * @param $filter_test
 *   Not used by this function.
 *
 * @return
 *   An array with a 'total' key holding the resulting probability. When
 *   tokens were scored it also has a 'spam_filter_bayesian' key listing each
 *   interesting token with its probability. When there is nothing to score,
 *   'total' is the configured default probability.
 */
function spam_filter_bayesian_spam_filter($content, $type, $fields, $extra = array(), $filter_test = FALSE) {
  $class = 'spam';
  $action = array();
  $default = variable_get('spam_filter_bayesian_default_probability', 40);
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  $tokenizer = variable_get('spam_filter_bayesian_tokenizer', 'spam_filter_bayesian_tokenize');
  $tokens = $tokenizer($content, $type, $fields, $extra);
  $max = variable_get('spam_filter_bayesian_interesting_tokens', 15);

  // Nothing to score (no tokens at all, an empty token list, or a token
  // limit below one): return the default score instead of dividing by zero.
  if (!is_array($tokens) || empty($tokens) || $max < 1) {
    $action['total'] = $default;
    return $action;
  }

  // Map "token,probability" => drift, where drift is the distance of the
  // token's probability from a median of 50%.
  $drift = array();
  foreach ($tokens as $token) {
    $row = db_fetch_object(db_query("SELECT probability FROM {spam_filter_bayesian_tokens} WHERE class = '%s' AND token = '%s'", $class, $token));
    // Tokens that were never seen before get the default probability.
    $probability = $row ? $row->probability : $default;
    $drift["{$token},{$probability}"] = abs($probability - 50);
  }

  // Sort ascending by drift so the most interesting tokens end up last and
  // can be popped off the end.
  asort($drift);
  $keys = array_keys($drift);

  $total = 0;
  $count = 0;
  while ($count < $max && !empty($keys)) {
    $pair = array_pop($keys);
    // Split on the LAST comma: the token itself may contain commas, the
    // probability never does.
    $split = strrpos($pair, ',');
    $token = substr($pair, 0, $split);
    $probability = substr($pair, $split + 1);

    $total = $total + $probability;
    $action['spam_filter_bayesian'][$count] = array(
      'token' => $token,
      'probability' => $probability,
    );
    spam_log(SPAM_DEBUG, 'spam_filter_bayesian_spam_filter', t('interesting token [@count] (@token) probability(@probability)', array(
      '@token' => $token,
      '@probability' => $probability,
      '@count' => $count + 1,
    )), $type, $id);
    $count++;
  }

  // $count is at least 1 here because $tokens was non-empty and $max >= 1.
  $probability = round($total / $count, 1);
  spam_log(SPAM_VERBOSE, 'spam_filter_bayesian_spam_filter', t('total(@total) count(@count) probability(@probability)', array(
    '@probability' => $probability,
    '@total' => $total,
    '@count' => $count,
  )), $type, $id);
  $action['total'] = $probability;
  return $action;
}

/**
 * Update token probabilities in database.
 *
 * For every token, either bumps the yes/no counter of the existing row and
 * recomputes its probability, or inserts a new row.
 *
 * @param $class
 *   The token class the rows are stored under.
 * @param $tokens
 *   Array of tokens to record; anything that is not an array is ignored.
 * @param $yes
 *   TRUE to count the tokens as a "yes" vote, FALSE for a "no" vote.
 * @param $type
 *   Content type, only used for logging.
 * @param $id
 *   Content id, only used for logging.
 */
function spam_filter_bayesian_tokens_update($class, $tokens, $yes, $type = NULL, $id = 0) {
  if (!is_array($tokens)) {
    return;
  }

  // How much this vote adds to each counter.
  $yes_step = $yes ? 1 : 0;
  $no_step = $yes ? 0 : 1;

  foreach ($tokens as $token) {
    $existing = db_fetch_object(db_query("SELECT probability, yes_count, no_count FROM {spam_filter_bayesian_tokens} WHERE class = '%s' AND token = '%s'", $class, $token));

    if (!$existing) {
      // First sighting of this token: start at one extreme of the scale.
      $probability = $yes ? 99 : 1;
      spam_log(SPAM_DEBUG, 'spam_filter_bayesian_tokens_update', t('insert token(@token) class(@class) probability(@probability)', array(
        '@token' => $token,
        '@class' => $class,
        '@probability' => $probability,
      )), $type, $id);
      db_query("INSERT INTO {spam_filter_bayesian_tokens} (class, token, yes_count, no_count, probability, last) VALUES('%s', '%s', %d, %d, %d, %d)", $class, $token, $yes_step, $no_step, $probability, time());
      continue;
    }

    // Known token: the new probability is the share of "yes" votes,
    // including the current one, expressed as a percentage.
    $votes = $existing->yes_count + $existing->no_count + 1;
    $yes_votes = $existing->yes_count + $yes_step;
    $probability = spam_sanitize_score($yes_votes / $votes * 100);
    spam_log(SPAM_DEBUG, 'spam_filter_bayesian_tokens_update', t('update token(@token) class(@class) yes(@yes) no(@no) prob(@prob): added @new', array(
      '@token' => $token,
      '@class' => $class,
      '@yes' => $yes_votes,
      '@no' => $existing->no_count + $no_step,
      '@prob' => $probability,
      '@new' => $yes ? 'yes' : 'no',
    )), $type, $id);

    if (!$yes) {
      db_query("UPDATE {spam_filter_bayesian_tokens} SET no_count = no_count + 1, probability = %d, last = %d WHERE class = '%s' AND token = '%s'", $probability, time(), $class, $token);
    }
    else {
      db_query("UPDATE {spam_filter_bayesian_tokens} SET yes_count = yes_count + 1, probability = %d, last = %d WHERE class = '%s' AND token = '%s'", $probability, time(), $class, $token);
    }
  }
}

/**
 * Split content into an array of tokens.
 *
 * The text is lowercased, stripped of URL/HTML noise and of punctuation
 * that should not break up a word, optionally expanded with the simple CJK
 * tokenizer, and then split on a fixed set of delimiters. Other modules can
 * contribute additional tokens through the 'tokenize' spam API operation.
 * Results are cached per request, keyed by content type, id and tag.
 *
 * @param $content
 *   The content to tokenize; objects are cast to arrays.
 * @param $type
 *   The content type.
 * @param $fields
 *   Field definitions handed to spam_get_text().
 * @param $extra
 *   Additional data handed to spam_get_text() and the content_id hook.
 * @param $tag
 *   Optional prefix prepended to every token produced by this function.
 *
 * @return
 *   An array of tokens, or NULL when the content produced no tokens at all
 *   (callers distinguish the two cases with is_array()).
 */
function spam_filter_bayesian_tokenize($content, $type, $fields, $extra = array(), $tag = NULL) {
  static $tokens = array();
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  if (is_object($content)) {
    $content = (array) $content;
  }

  // NOTE(review): contents of the same type that resolve to the same (or an
  // empty) id share one cache entry within a request -- confirm that the
  // content_id hook always yields a distinct id.
  $key = "{$type}-{$id}-{$tag}";
  if (!isset($tokens[$key])) {
    // Start from an empty list so hook-provided tokens can still be merged
    // in when the text itself yields no token of sufficient length.
    $toks = array();
    $string = spam_get_text($content, $type, $fields, $extra);

    // Force all tokens to lowercase, again to aggregate tokens.  This both
    // lowers the total token number of rows in the spam_tokens table and
    // increases the strength of individual tokens by linking them to
    // capitalized versions.
    $sanitized = drupal_strtolower($string);

    // strip out unwanted html/url noise
    $sanitized = preg_replace("'(www\\.)|(</a>)|(href=)|(target=)|(src=)'", '', $sanitized);
    $sanitized = preg_replace("(http://|https://|ftp://|mailto:)", '', $sanitized);

    // Strip out values that should not be considered part of tokens, so
    // things like '{viagra}' and 'vi.agra' are counted as hits towards
    // 'viagra'
    $sanitized = preg_replace("/[()\\{\\}\\[\\]#.,]/", '', $sanitized);

    // Simple CJK handling
    if (variable_get('spam_filter_bayesian_overlap_cjk', TRUE)) {
      $sanitized = preg_replace_callback('/[' . SPAM_FILTER_BAYESIAN_PREG_CLASS_CJK . ']+/u', 'spam_filter_bayesian_expand_cjk', $sanitized);
    }

    // divide sanitized string into tokens
    $delimiters = " \t\n\r-_<>'\"`/|*%^&+=~:;?";
    $tok = strtok($sanitized, $delimiters);
    $min_length = variable_get('spam_filter_bayesian_minimum_token_length', 3);
    while ($tok !== FALSE) {

      // Only inspect the token if over minimum length.
      if (drupal_strlen($tok) >= $min_length) {

        // Truncate over-long tokens (tag included) to 254 characters.
        $toks[] = htmlspecialchars(drupal_substr("{$tag}{$tok}", 0, 254));
      }
      $tok = strtok($delimiters);
    }

    // allow external module ability to extract additional tokens
    $hook = spam_invoke_api('tokenize', $string, $tag);
    if (isset($hook['tokens']) && $hook['tokens'] && is_array($hook['tokens'])) {
      $toks = array_merge($toks, $hook['tokens']);
    }

    // Keep reporting "no tokens" as NULL, which is what callers test for.
    $tokens[$key] = $toks ? $toks : NULL;
  }
  return $tokens[$key];
}

/**
 * From core search.module
 *
 * Basic CJK tokenizer, used as a preg_replace_callback() callback. Turns a
 * run of characters into consecutive, overlapping sequences that are each
 * 'spam_filter_bayesian_minimum_token_length' characters long, separated
 * and surrounded by spaces.
 *
 * @param $matches
 *   Match array from preg_replace_callback(); only element 0 is used.
 *
 * @return
 *   The space-delimited replacement string.
 */
function spam_filter_bayesian_expand_cjk($matches) {
  $size = variable_get('spam_filter_bayesian_minimum_token_length', 3);
  $remaining = $matches[0];
  $length = drupal_strlen($remaining);

  // Runs no longer than one token are passed through untouched.
  if ($length <= $size) {
    return " {$remaining} ";
  }

  $output = ' ';
  // Sliding window holding the characters of the token being assembled.
  $window = array();
  $position = 0;
  while ($position < $length) {
    // Take the first character, then drop exactly its bytes from the input.
    $char = drupal_substr($remaining, 0, 1);
    $remaining = substr($remaining, strlen($char));
    array_push($window, $char);

    // Once the window is full, emit it and slide forward by one character.
    if ($position + 1 >= $size) {
      $output .= implode('', $window) . ' ';
      array_shift($window);
    }
    $position++;
  }
  return $output;
}

// vim: ts=2 sw=2 et syntax=php

Functions

Namesort descending Description
spam_filter_bayesian_admin_settings Administrative interface for configuring the bayesian filter rules.
spam_filter_bayesian_expand_cjk From core search.module
spam_filter_bayesian_menu Drupal _menu() hook.
spam_filter_bayesian_spamapi Spam API Hook
spam_filter_bayesian_spam_filter Determine whether or not the content is spam.
spam_filter_bayesian_tokenize Split content into an array of tokens.
spam_filter_bayesian_tokens_update Update token probabilities in database.

Constants

Namesort descending Description
SPAM_FILTER_BAYESIAN_PREG_CLASS_CJK From core search module.