You are here

function bayesian_tokenize in Spam 5.3

Split content into an array of tokens.

2 string references to 'bayesian_tokenize'
bayesian_spamapi in filters/bayesian/bayesian.module
Spam API Hook
bayesian_spam_filter in filters/bayesian/bayesian.module
Determine whether or not the content is spam.

File

filters/bayesian/bayesian.module, line 138

Code

/**
 * Split content into an array of tokens.
 *
 * The text extracted from the content is stripped of HTML/URL noise and of
 * punctuation that is commonly used to obfuscate words, lowercased, and then
 * split on a fixed set of delimiter characters.  Results are cached in a
 * static array for the duration of the request, keyed by type, content id
 * and tag.
 *
 * @param $content
 *   The content to tokenize, as an array or an object (objects are cast to
 *   arrays before the text is extracted).
 * @param $type
 *   The content type; passed to spam_invoke_module() to obtain the content
 *   id and to spam_get_text() to extract the text.
 * @param $fields
 *   Field information passed through to spam_get_text().
 * @param $extra
 *   Optional extra data passed through to spam_invoke_module() and
 *   spam_get_text().
 * @param $tag
 *   Optional string prepended to every token generated by this function.
 *   Tokens supplied by the 'tokenize' API hook are merged in as returned.
 *
 * @return
 *   An array of token strings.  Always an array; empty when the content
 *   yields no token of at least the configured minimum length and no hook
 *   contributes tokens.
 */
function bayesian_tokenize($content, $type, $fields, $extra = array(), $tag = NULL) {
  static $tokens = array();
  // The content id is resolved before any object-to-array cast, so the
  // invoked module receives the content in its original form.
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  if (is_object($content)) {
    $content = (array) $content;
  }
  $key = "{$type}-{$id}-{$tag}";
  if (!isset($tokens[$key])) {
    $string = spam_get_text($content, $type, $fields, $extra);

    // strip out unwanted html/url noise
    $sanitized = preg_replace("'(www\\.)|(</a>)|(href=)|(target=)|(src=)'i", '', $string);
    // Scheme prefixes are matched case-insensitively, like the noise pattern
    // above; otherwise 'HTTP://' would survive until the lowercase step and
    // produce a spurious 'http' token.
    $sanitized = preg_replace('@(?:http://|https://|ftp://|mailto:)@i', '', $sanitized);

    // Strip out values that should not be considered part of tokens, so
    // things like '{viagra}' and 'vi.agra' are counted as hits towards
    // 'viagra'
    $sanitized = preg_replace("/[()\\{\\}\\[\\]#.,]/", '', $sanitized);

    // Force all tokens to lowercase, again to aggregate tokens.  This both
    // lowers the total token number of rows in the spam_tokens table and
    // increases the strength of individual tokens by linking them to
    // capitalized versions.
    $sanitized = strtolower($sanitized);

    // Start from an empty list so content without any usable token still
    // yields (and caches) an array rather than an undefined value.
    $toks = array();

    // Look the setting up once, outside the loop: it does not change while
    // tokenizing, and strtok() keeps global state between calls.
    $min_length = variable_get('bayesian_minimum_token_length', 3);

    // divide sanitized string into tokens
    $delimiters = " \t\n\r-_<>'\"`/|*%^&+=~:;?";
    $tok = strtok($sanitized, $delimiters);
    while ($tok !== FALSE) {

      // Only inspect the token if over minimum length (the tag prefix does
      // not count towards the length).
      if (strlen($tok) >= $min_length) {

        // Keep at most 254 characters of the tagged token.
        $toks[] = htmlspecialchars(substr("{$tag}{$tok}", 0, 254));
      }
      $tok = strtok($delimiters);
    }

    // allow external module ability to extract additional tokens
    $hook = spam_invoke_api('tokenize', $string, $tag);
    if (is_array($hook) && !empty($hook['tokens'])) {
      $toks = array_merge($toks, $hook['tokens']);
    }
    $tokens[$key] = $toks;
  }
  return $tokens[$key];
}