You are here

function spam_filter_bayesian_tokenize in Spam 6

Split content into an array of tokens.

2 string references to 'spam_filter_bayesian_tokenize'
spam_filter_bayesian_spamapi in filters/spam_filter_bayesian/spam_filter_bayesian.module
Spam API Hook
spam_filter_bayesian_spam_filter in filters/spam_filter_bayesian/spam_filter_bayesian.module
Determine whether or not the content is spam.

File

filters/spam_filter_bayesian/spam_filter_bayesian.module, line 184
Bayesian filter module Copyright(c) 2007-2008 Jeremy Andrews <jeremy@tag1consulting.com>. All rights reserved.

Code

/**
 * Split content into an array of tokens.
 *
 * The text is lowercased, stripped of URL/HTML noise and of punctuation that
 * is commonly used to obfuscate words, optionally passed through the CJK
 * expansion callback, and then split on a fixed set of delimiter characters.
 * Results are kept in a static cache keyed by content type, content id and
 * tag, so repeated calls with the same key reuse the first result.
 *
 * @param $content
 *   The content to tokenize; an object is cast to an array before its text
 *   is extracted.
 * @param $type
 *   The content type, passed to spam_invoke_module() and spam_get_text().
 * @param $fields
 *   Field definition passed through to spam_get_text().
 * @param $extra
 *   Optional extra data passed through to spam_invoke_module() and
 *   spam_get_text().
 * @param $tag
 *   Optional string prepended to every token produced here; it is also
 *   handed to the 'tokenize' API hook and is part of the cache key.
 *
 * @return
 *   An array of token strings. This is an empty array when the content
 *   yields no token of at least the configured minimum length and no hook
 *   contributes any.
 */
function spam_filter_bayesian_tokenize($content, $type, $fields, $extra = array(), $tag = NULL) {
  static $tokens = array();
  // The id is resolved before any object-to-array cast, so the module hook
  // receives the content in the form the caller supplied.
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  if (is_object($content)) {
    $content = (array) $content;
  }
  $cache_key = "{$type}-{$id}-{$tag}";
  if (!isset($tokens[$cache_key])) {
    // Always start from an empty list: without this, content producing no
    // qualifying tokens left the variable undefined, which broke the
    // array_merge() below and made this function return NULL.
    $toks = array();
    $string = spam_get_text($content, $type, $fields, $extra);

    // Force all tokens to lowercase, again to aggregate tokens.  This both
    // lowers the total token number of rows in the spam_tokens table and
    // increases the strength of individual tokens by linking them to
    // capitalized versions.
    $sanitized = drupal_strtolower($string);

    // strip out unwanted html/url noise
    $sanitized = preg_replace("'(www\\.)|(</a>)|(href=)|(target=)|(src=)'", '', $sanitized);
    $sanitized = preg_replace("(http://|https://|ftp://|mailto:)", '', $sanitized);

    // Strip out values that should not be considered part of tokens, so
    // things like '{viagra}' and 'vi.agra' are counted as hits towards
    // 'viagra'
    $sanitized = preg_replace("/[()\\{\\}\\[\\]#.,]/", '', $sanitized);

    // Simple CJK handling
    if (variable_get('spam_filter_bayesian_overlap_cjk', TRUE)) {
      $sanitized = preg_replace_callback('/[' . SPAM_FILTER_BAYESIAN_PREG_CLASS_CJK . ']+/u', 'spam_filter_bayesian_expand_cjk', $sanitized);
    }

    // divide sanitized string into tokens
    $delimiters = " \t\n\r-_<>'\"`/|*%^&+=~:;?";
    $min_length = variable_get('spam_filter_bayesian_minimum_token_length', 3);
    for ($tok = strtok($sanitized, $delimiters); $tok !== FALSE; $tok = strtok($delimiters)) {

      // Only inspect the token if over minimum length.
      if (drupal_strlen($tok) >= $min_length) {

        // Keep at most the first 254 characters of the tagged token, then
        // HTML-escape the result.
        $toks[] = htmlspecialchars(drupal_substr("{$tag}{$tok}", 0, 254));
      }
    }

    // allow external module ability to extract additional tokens; the hook
    // is given the original, unsanitized text.
    $hook = spam_invoke_api('tokenize', $string, $tag);
    if (isset($hook['tokens']) && $hook['tokens']) {
      $toks = array_merge($toks, $hook['tokens']);
    }
    $tokens[$cache_key] = $toks;
  }
  return $tokens[$cache_key];
}