You are here

function spam_content_filter in Spam 5

Same name and namespace in other branches
  1. 5.3 spam.module \spam_content_filter()
  2. 6 spam.module \spam_content_filter()

Determine whether or not provided text is spam.

Parameters

$source: Which module this text came from (e.g. 'comment', 'node', 'trackback', ...)

$id: Numeric identifier for the content (e.g. node id, comment id, etc.)

$header: Header portion of text to be filtered.

$body: Body portion of text to be filtered.

$callback: (optional) Function to call once the spam probability of the text has been determined. When NULL, spam_default_actions() is called instead.

Return value

TRUE (spam) or FALSE (not spam)

2 calls to spam_content_filter()
spam_comment in ./spam.module
Drupal _comment hook. Passes new comments to the spam filter.
spam_nodeapi in ./spam.module
Drupal _nodeapi hook. Passes new node content through the spam filter.

File

./spam.module, line 26

Code

/**
 * Determine whether or not provided text is spam.
 *
 * Runs the text through a chain of filters (comment sanity check, duplicate
 * detection, custom filters, spam-URL filter, URL-count limit, external
 * module hooks and finally the bayesian filter). Each later stage only runs
 * while the accumulated score is still below the 'spam_threshold' setting.
 * The resulting probability is stored in {spam_tracker}, logged, and handed
 * to the callback.
 *
 * @param $source
 *   Which module this text came from (e.g. 'comment', 'node', 'trackback').
 * @param $id
 *   Numeric identifier for the content (e.g. node id, comment id).
 * @param $header
 *   Header portion of text to be filtered.
 * @param $body
 *   Body portion of text to be filtered.
 * @param $callback
 *   (optional) Function to call with the computed probability. When NULL,
 *   spam_default_actions() is called instead.
 *
 * @return
 *   Whatever $callback (or spam_default_actions()) returns. Documented
 *   upstream as TRUE (spam) or FALSE (not spam); that contract is enforced by
 *   the callback, not by this function.
 */
function spam_content_filter($source, $id, $header, $body, $callback = NULL) {

  // globals used in logging
  global $ID, $SOURCE;
  $ID = $id;
  $SOURCE = $source;

  // Explicit starting values: every stage below reads these, but not every
  // stage runs, so none of them may be left undefined.
  $probability = 0;
  $weight = 0;
  $action = array();
  $tokens = array();

  // The threshold is loop-invariant for this call; read it once.
  $threshold = variable_get('spam_threshold', 80);

  // md5 hash used to catch same comment being posted over and over
  $hash = md5($header . $body);

  // sanity check comments
  if ($source == 'comment') {
    if (spam_validate_comment($id) == FALSE) {
      $probability = 99;
    }
  }

  // first filter, see if this is duplicated content
  if ($probability < $threshold) {
    if (spam_duplicate_filter($hash, $source, $id, $header, $body)) {
      $probability = 99;
    }
  }

  // second filter, see if this content matches custom filters
  if ($probability < $threshold) {
    // $action is handed to the custom filter, which presumably fills it in
    // by reference (verify against spam_custom_filter()'s signature).
    $weight = spam_custom_filter($header, $body, $action);
  }

  // third filter, see if content contains spam urls
  if ($probability + $weight < $threshold) {
    $weight += spam_url_filter($header . ' ' . $body);
  }

  // fourth filter, see if content contains too many urls
  if ($probability + $weight < $threshold) {

    // split content into tokens
    // NOTE(review): spam_urls_count() takes no arguments, so it presumably
    // relies on state gathered while tokenizing -- keep this call order.
    $tokens = spam_tokenize($header, 'header*');
    $tokens = array_merge($tokens, spam_tokenize($body));
    $weight += spam_url_limit(spam_urls_count());
  }

  // pass content through external filters, if any
  if ($probability + $weight < $threshold) {

    // external spam filters
    $hook = spam_invoke_hook('filter', $header . ' ' . $body, $tokens);
    if (!empty($hook['weight'])) {
      $weight += $hook['weight'];

      // external modules should do their own logging, but just in case...
      spam_log(SPAM_VERBOSE, t('spam_content_filter: external module added weight of @weight', array(
        '@weight' => $hook['weight'],
      )), $source, $id);
    }
  }

  // finally, if necessary pass content through bayesian filter
  if ($probability + $weight < $threshold) {
    $probability = spam_bayesian_filter($tokens);
  }

  // be sure probability is in valid range (1-99)
  $probability = spam_get_probability($probability, $weight);

  // $old is FALSE when this content has never been tracked, so test it with
  // empty() rather than dereferencing it directly. It is still passed on to
  // the callback unchanged.
  $old = db_fetch_object(db_query("SELECT * FROM {spam_tracker} WHERE source = '%s' AND id = %d", $source, $id));
  if (!empty($old->id)) {

    // content has been updated
    db_query("UPDATE {spam_tracker} SET probability = %d, hostname = '%s', hash = '%s', timestamp = %d WHERE source = '%s' AND id = '%d'", $probability, $_SERVER['REMOTE_ADDR'], $hash, time(), $source, $id);
  }
  else {

    // this is the first time we've filtered this content
    db_query("INSERT INTO {spam_tracker} (id, source, probability, hostname, hash, timestamp) VALUES(%d, '%s', %d, '%s', '%s', %d)", $id, $source, $probability, $_SERVER['REMOTE_ADDR'], $hash, time());
  }
  spam_log(SPAM_LOG, t('spam_content_filter: @prob% probability of being spam for @source "%header"', array(
    '@prob' => $probability,
    '@source' => $source,
    '%header' => $header,
  )), $source, $id);

  // pass probability to callback which will deal with it as is appropriate
  if ($callback != NULL) {
    $return = $callback($source, $id, $header, $body, $probability, $old, $action);
  }
  else {
    $return = spam_default_actions($source, $id, $header, $body, $probability, $old, $action);
  }
  if ($probability >= $threshold) {

    /* This ip should have already been blocked when the content was previewed,
     * but if the website is not configured to require previews this is our
     * only chance to try and delay the spammer.  Delaying them here is much
     * less effective, as the spam was already posted and many automatic tools
     * probably don't wait around at this point.
     */
    spam_ip_filter($source, $id, TRUE);
  }
  return $return;
}