You are here

spam_filter_url.module in Spam 6

URL filter plug in for the spam module. Copyright(c) 2007-2008 Jeremy Andrews <jeremy@tag1consulting.com>. All rights reserved.

File

filters/spam_filter_url/spam_filter_url.module
View source
<?php

/**
 * @file
 * URL filter plug in for the spam module.
 * Copyright(c) 2007-2008
 *  Jeremy Andrews <jeremy@tag1consulting.com>.  All rights reserved.
 *
 */
/**
 * Spam API hook for the URL filter.
 *
 * Dispatches the spam module operations this filter responds to.
 *
 * @param $op
 *   Operation name. Handled here: 'filter', 'filter_module', 'filter_info',
 *   'filter_install', 'mark_as_spam' and 'mark_as_not_spam'. Any other
 *   operation is ignored.
 * @param $type
 *   Content type identifier, passed through to the spam module helpers.
 * @param $content
 *   The content to scan when $op is 'filter'.
 * @param $fields
 *   Field map for $content when $op is 'filter'.
 * @param $extra
 *   Extra data. For the mark_as_* operations $extra['id'] and
 *   $extra['content'] are read.
 *
 * @return
 *   - 'filter': the action array built by spam_filter_url_spam_filter(), or
 *     nothing when the filter is disabled.
 *   - 'filter_module': the module machine name.
 *   - 'filter_info': array with 'name', 'module', 'description' and 'help'.
 *   - 'filter_install': array with 'status', 'gain' and 'weight'.
 *   - anything else: nothing.
 */
function spam_filter_url_spamapi($op, $type = NULL, $content = array(), $fields = array(), $extra = NULL) {
  switch ($op) {
    case 'filter':
      // Don't bother scanning unless the filter is actually enabled.
      if (!module_invoke('spam', 'filter_enabled', 'spam_filter_url', $type, $content, $fields, $extra)) {
        return;
      }
      return spam_filter_url_spam_filter($content, $type, $fields, $extra);

    case 'filter_module':
      return 'spam_filter_url';

    case 'filter_info':
      return array(
        'name' => t('URL filter'),
        // Machine name of the module: identifiers must not be translated.
        'module' => 'spam_filter_url',
        'description' => t('A spam url filter.'),
        'help' => t('The url filter blocks posts containing spam-URLs, automatically learned with the bayesian filter module.'),
      );

    case 'filter_install':
      return array(
        'status' => SPAM_FILTER_ENABLED,
        'gain' => 250,
        'weight' => -6,
      );

    case 'mark_as_spam':
    case 'mark_as_not_spam':
      // Don't bother updating URL statistics unless the filter is enabled.
      if (!module_invoke('spam', 'filter_enabled', 'spam_filter_url', $type, $content, $fields, $extra)) {
        return;
      }
      spam_log(SPAM_DEBUG, 'spam_filter_url_spamapi', t('@op', array(
        '@op' => $op,
      )), $type, $extra['id']);
      // The content being (un)marked arrives in $extra, so its field map is
      // looked up again rather than taken from the $fields argument.
      $fields = spam_invoke_module($type, 'filter_fields', $extra['content']);
      $spam_filter_urls = _spam_filter_url_extract($extra['content'], $type, $fields, $extra);
      spam_filter_url_update($spam_filter_urls, $op == 'mark_as_spam', $type, $extra['id']);
      break;
  }
}
/**
 * Extract the root domains of every URL, e-mail address and www address
 * found in the filterable fields of a piece of content.
 *
 * Results are cached in a static array keyed by "$type-$id", so each piece
 * of content is scanned (and its URLs counted through
 * _spam_filter_url_count()) at most once per request.
 *
 * @param $content
 *   The content as an array or object; objects are cast to arrays.
 * @param $type
 *   Content type identifier, used for the content id lookup and cache key.
 * @param $fields
 *   Array whose optional 'main' and 'other' entries list the keys of
 *   $content that hold text to scan.
 * @param $extra
 *   Extra data forwarded to the 'content_id' lookup.
 *
 * @return
 *   Array of lower-cased, HTML-escaped domain tokens, one per URL found
 *   (duplicates are kept).
 */
function _spam_filter_url_extract($content, $type, $fields, $extra = array()) {
  static $spam_filter_urls = array();
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  if (is_object($content)) {
    $content = (array) $content;
  }
  $cache_key = "{$type}-{$id}";
  if (isset($spam_filter_urls[$cache_key])) {
    return $spam_filter_urls[$cache_key];
  }

  // Every pattern below requires a delimiter in front of the URL, so lead
  // with a space; otherwise a URL at the very start of the first field is
  // never matched.
  $string = ' ';
  foreach (array('main', 'other') as $group) {
    if (isset($fields[$group]) && is_array($fields[$group])) {
      foreach ($fields[$group] as $field) {
        // Skip fields the content does not actually carry.
        if (isset($content["{$field}"])) {
          $string .= $content["{$field}"] . ' ';
        }
      }
    }
  }
  $URI = "(http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)";

  // Find all urls in content. Code based on _filter_url().
  // Match absolute URLs.
  preg_match_all("`(<p>|<li>|<br\\s*/?>|[ \n\r\t\\(])({$URI}([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\\s*/?>|[ \n\r\t\\)]))`i", $string, $matches_absolute);

  // Match e-mail addresses.
  preg_match_all("`(<p>|<li>|<br\\s*/?>|[ \n\r\t\\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\\.[A-Za-z]{2,4})([.,?!]*?)(?=(</p>|</li>|<br\\s*/?>|[ \n\r\t\\)]))`i", $string, $matches_email);

  // Match www domains/addresses.
  preg_match_all("`(<p>|<li>|[ \n\r\t\\(])(www\\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\\s*/?>|[ \n\r\t\\)]))`i", $string, $matches_www);
  $matches = array_merge($matches_absolute[2], $matches_email[2], $matches_www[2]);

  $u = array();
  foreach ($matches as $spam_filter_url) {
    // Strip the leading scheme. The matching above is case-insensitive, so
    // the stripping has to be as well, or "HTTP://..." yields no domain.
    $spam_filter_url = preg_replace("`^{$URI}`i", '', $spam_filter_url);

    // get full domain (ie www.sample.com)
    if (!preg_match("/^()?([^\\/\"\\']+)/i", $spam_filter_url, $domain)) {
      continue;
    }

    // get root domain (ie sample.com); hosts without a dot have none and
    // are skipped instead of being recorded as an empty token.
    if (!preg_match("/[^\\.\\/]+\\.[^\\.\\/]+\$/", $domain[2], $root)) {
      continue;
    }
    $spam_filter_url = htmlspecialchars(drupal_strtolower($root[0]));
    _spam_filter_url_count($spam_filter_url);
    $u[] = $spam_filter_url;
  }
  $spam_filter_urls[$cache_key] = $u;
  return $spam_filter_urls[$cache_key];
}

/**
 * Search for known spam urls in content.
 *
 * Three checks run in order, and the first that trips marks the content as
 * spam:
 *  - more URLs than the 'spam_filter_url_limit_total' setting allows,
 *  - one URL repeated more often than 'spam_filter_url_limit_repeat' allows,
 *  - a URL whose stored bayesian probability reaches 'spam_threshold'.
 * A limit of -1 disables the corresponding limit check.
 *
 * @param $content
 *   The content to scan.
 * @param $type
 *   Content type identifier.
 * @param $fields
 *   Field map describing which parts of $content hold text.
 * @param $extra
 *   Extra data forwarded to the content id lookup and URL extraction.
 * @param $filter_test
 *   Not used by this function.
 *
 * @return
 *   Array with a 'total' entry holding the configured
 *   'spam_filter_url_probability' when the content is considered spam and 0
 *   otherwise, plus an optional 'spam_filter_url' list describing what was
 *   found.
 */
function spam_filter_url_spam_filter($content, $type, $fields, $extra = array(), $filter_test = FALSE) {
  $action = array();
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  $spam = FALSE;
  $spam_filter_urls = _spam_filter_url_extract($content, $type, $fields, $extra);
  $probability = variable_get('spam_filter_url_probability', 99);
  if (is_array($spam_filter_urls) && !empty($spam_filter_urls)) {
    $count = _spam_filter_url_count();
    $limit = variable_get('spam_filter_url_limit_total', 10);
    if ($limit > -1 && $count['total'] > $limit) {
      spam_log(SPAM_VERBOSE, 'spam_filter_url_spam_filter', t('total urls(@total) > spam_filter_url_limit_total(@limit)', array(
        '@total' => $count['total'],
        '@limit' => $limit,
      )), $type, $id);
      $action['spam_filter_url'][] = array(
        'limit' => 'total',
        'total' => $count['total'],
      );
      $action['total'] = $probability;
      return $action;
    }
    $limit = variable_get('spam_filter_url_limit_repeat', 5);
    if ($limit > -1) {
      // Highest repeat count of any single URL. The 'total' bookkeeping
      // entry is removed explicitly rather than relying on sort order.
      $per_url = $count;
      unset($per_url['total']);
      $max = empty($per_url) ? 0 : max($per_url);
      if ($max > $limit) {
        spam_log(SPAM_VERBOSE, 'spam_filter_url_spam_filter', t('repeated urls(@total) > spam_filter_url_limit_repeat(@limit)', array(
          '@total' => $max,
          '@limit' => $limit,
        )), $type, $id);
        $action['spam_filter_url'][] = array(
          'limit' => 'repeat',
          'total' => $max,
        );
        $action['total'] = $probability;
        return $action;
      }
    }
    $threshold = variable_get('spam_threshold', SPAM_DEFAULT_THRESHOLD);
    foreach ($spam_filter_urls as $spam_filter_url) {
      $p = db_fetch_object(db_query("SELECT probability FROM {spam_filter_bayesian_tokens} WHERE class = 'spam_filter_url' AND token = '%s'", $spam_filter_url));
      // A URL that has never been learned has no row; report it with a NULL
      // probability instead of dereferencing the failed fetch.
      $url_probability = $p ? $p->probability : NULL;
      $action['spam_filter_url'][] = array(
        'spam_filter_url' => $spam_filter_url,
        'probability' => $url_probability,
      );
      if ($url_probability !== NULL && $url_probability >= $threshold) {
        spam_log(SPAM_VERBOSE, 'spam_filter_url_spam_filter', t('found spam url(@url) probability(@probability)', array(
          '@url' => $spam_filter_url,
          '@probability' => $url_probability,
        )), $type, $id);
        $spam = TRUE;
        break;
      }
      spam_log(SPAM_DEBUG, 'spam_filter_url_spam_filter', t('not spam url(@url) probability(@probability)', array(
        '@url' => $spam_filter_url,
        '@probability' => $url_probability,
      )), $type, $id);
    }
  }
  $action['total'] = $spam ? $probability : 0;
  return $action;
}

/**
 * Hand a list of URL tokens to the bayesian filter module for learning.
 *
 * @param $spam_filter_urls
 *   The URL tokens to update.
 * @param $yes
 *   Flag forwarded to the bayesian module; the caller in this file passes
 *   TRUE for 'mark_as_spam' and FALSE for 'mark_as_not_spam'.
 * @param $type
 *   Content type identifier.
 * @param $id
 *   Content id.
 */
function spam_filter_url_update($spam_filter_urls, $yes, $type, $id) {
  $invocation = array(
    'spam_filter_bayesian',
    'tokens_update',
    // Token class under which the URLs are stored.
    'spam_filter_url',
    $spam_filter_urls,
    $yes,
    $type,
    $id,
  );
  call_user_func_array('module_invoke', $invocation);
}

/**
 * Tally the URLs seen so far during the current request.
 *
 * The tally lives in a static array, so it accumulates across calls.
 *
 * @param $spam_filter_url
 *   A URL to add to the tally. When omitted (or otherwise loosely equal to
 *   NULL, such as an empty string) nothing is added and the current tally is
 *   simply returned.
 *
 * @return
 *   Array mapping each URL to the number of times it has been added, plus a
 *   'total' entry holding the number of URLs added overall.
 */
function _spam_filter_url_count($spam_filter_url = NULL) {
  static $tally = array(
    'total' => 0,
  );

  // Read-only call: report the tally without touching it.
  if ($spam_filter_url == NULL) {
    return $tally;
  }

  $key = "{$spam_filter_url}";
  $tally[$key] = empty($tally[$key]) ? 1 : $tally[$key] + 1;
  $tally['total'] += 1;
  return $tally;
}
/**
 * Menu hook: registers the URL filter settings page.
 *
 * @return
 *   Array with a single menu item at 'admin/settings/spam/filters/url' that
 *   renders the spam_filter_url_admin_settings form as a local task.
 */
function spam_filter_url_menu() {
  $settings_page = array(
    'title' => 'URL',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('spam_filter_url_admin_settings'),
    'access arguments' => array('administer spam'),
    'description' => 'Configure the url filter.',
    'type' => MENU_LOCAL_TASK,
  );
  return array(
    'admin/settings/spam/filters/url' => $settings_page,
  );
}
/**
 * Form builder for the URL filter settings page.
 *
 * Offers three select lists inside one fieldset: the maximum total number
 * of URLs per post, the maximum number of repeats of a single URL, and the
 * spam probability reported when either limit is exceeded. In both limit
 * lists the value -1 is labelled "Unlimited".
 *
 * @return
 *   The form array, passed through system_settings_form().
 */
function spam_filter_url_admin_settings() {
  $form = array();
  $form['short'] = array(
    '#type' => 'fieldset',
    // User-facing label, so translate it like the other titles in the form.
    '#title' => t('URL limits'),
    '#collapsible' => TRUE,
    '#collapsed' => FALSE,
  );

  $max_urls = drupal_map_assoc(array(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 40, 100));
  $max_urls[-1] = t('Unlimited');
  $form['short']['spam_filter_url_limit_total'] = array(
    '#type' => 'select',
    '#title' => t('Maximum total URLs per post'),
    '#options' => $max_urls,
    '#required' => TRUE,
    '#default_value' => variable_get('spam_filter_url_limit_total', 10),
  );

  $max_dups = drupal_map_assoc(array(-1, 0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 40, 100));
  $max_dups[-1] = t('Unlimited');
  $form['short']['spam_filter_url_limit_repeat'] = array(
    '#type' => 'select',
    '#title' => t('Maximum number of times the same URL can appear in a post'),
    '#options' => $max_dups,
    '#required' => TRUE,
    '#default_value' => variable_get('spam_filter_url_limit_repeat', 5),
  );

  $url_spam_prob = drupal_map_assoc(array(60, 65, 70, 75, 80, 85, 90, 95, 99));
  $form['short']['spam_filter_url_probability'] = array(
    '#type' => 'select',
    '#title' => t('Probability that content posted that exceeds the url limit or duplicate url limit is spam'),
    '#options' => $url_spam_prob,
    '#required' => TRUE,
    '#default_value' => variable_get('spam_filter_url_probability', 99),
  );
  return system_settings_form($form);
}
/**
 * Validation handler for the URL filter settings form.
 *
 * Rejects a repeated-URL limit that is larger than the total-URL limit. A
 * total limit of -1 means "Unlimited" and is therefore compatible with any
 * repeat limit.
 *
 * @param $form
 *   The form array (unused).
 * @param $form_state
 *   Form state; the submitted 'spam_filter_url_limit_total' and
 *   'spam_filter_url_limit_repeat' values are read from it.
 */
function spam_filter_url_admin_settings_validate($form, &$form_state) {
  $total_limit = (int) $form_state['values']['spam_filter_url_limit_total'];
  $repeat_limit = (int) $form_state['values']['spam_filter_url_limit_repeat'];

  // -1 is the "Unlimited" sentinel, not a number to compare against: only a
  // finite total limit can be smaller than the repeat limit.
  if ($total_limit > -1 && $repeat_limit > $total_limit) {
    form_set_error('spam_filter_url_limit_total', t('Total URL limit must be at least as big as the repeated URL limit.'));
  }
}
/**
 * Submit handler for the URL filter settings form.
 *
 * Either deletes the filter's stored settings (when the "Reset to defaults"
 * button was pressed) or stores the submitted values. All three settings
 * defined by the form are handled: the total limit, the repeat limit and
 * the spam probability.
 *
 * @param $form
 *   The form array (unused).
 * @param $form_state
 *   Form state; the pressed button is read from $form_state['values']['op']
 *   and the settings from the matching entries of $form_state['values'].
 */
function spam_filter_url_admin_settings_submit($form, &$form_state) {
  $settings = array(
    'spam_filter_url_limit_total',
    'spam_filter_url_limit_repeat',
    'spam_filter_url_probability',
  );
  $values = $form_state['values'];

  if (isset($values['op']) && $values['op'] == t('Reset to defaults')) {
    foreach ($settings as $name) {
      variable_del($name);
    }
    drupal_set_message(t('Configuration reset to defaults.'));
  }
  else {
    foreach ($settings as $name) {
      // Only store settings that were actually submitted.
      if (isset($values[$name])) {
        variable_set($name, $values[$name]);
      }
    }
    drupal_set_message(t('Configuration saved.'));
  }
}

Functions

Namesort descending Description
spam_filter_url_admin_settings
spam_filter_url_admin_settings_submit
spam_filter_url_admin_settings_validate
spam_filter_url_menu
spam_filter_url_spamapi @file URL filter plug in for the spam module. Copyright(c) 2007-2008 Jeremy Andrews <jeremy@tag1consulting.com>. All rights reserved.
spam_filter_url_spam_filter Search for known spam urls in content.
spam_filter_url_update Update url probabilities in database.
_spam_filter_url_count Keep track of the total number of URLs found in the current content.
_spam_filter_url_extract