You are here

function _spam_filter_surbl_url_extract in Spam 6

Extract the root domains of the URLs found in content.

1 call to _spam_filter_surbl_url_extract()
spam_filter_surbl_spam_filter in filters/spam_filter_surbl/spam_filter_surbl.module
Search for known spam urls in content.

File

filters/spam_filter_surbl/spam_filter_surbl.module, line 55
SURBL filter plug-in for the spam module. Copyright (c) 2007-2008 Jeremy Andrews <jeremy@tag1consulting.com>.

Code

/**
 * Extract the root domains of all URLs found in a piece of content.
 *
 * The named fields of the content are concatenated, scanned for http, https,
 * ftp and mailto URLs, and each URL is reduced to the last two dot-separated
 * labels of its host name (www.sample.com becomes sample.com). Results are
 * remembered in a static cache keyed by "{$type}-{$id}".
 *
 * @param $content
 *   The content to scan, as an array or an object indexed by field name.
 * @param $type
 *   The content type; passed to spam_invoke_module() and used in the cache
 *   key.
 * @param $fields
 *   Array with a 'main' list of field names and an optional 'other' list of
 *   field names. Fields that are missing from the content are skipped.
 * @param $extra
 *   Optional extra data handed through to spam_invoke_module().
 *
 * @return
 *   Array of lower-cased, HTML-escaped root domains, keyed by the md5 hash of
 *   the lower-cased domain so that each domain appears only once.
 */
function _spam_filter_surbl_url_extract($content, $type, $fields, $extra = array()) {
  static $urls = array();

  // The id must be resolved before an object is flattened to an array, since
  // the invoked module receives the content in its original form.
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  if (is_object($content)) {
    $content = (array) $content;
  }

  // NOTE(review): if 'content_id' can return an empty value (e.g. for content
  // that has not been saved yet), all such items share one cache entry --
  // confirm against spam_invoke_module().
  $key = "{$type}-{$id}";
  if (isset($urls[$key])) {
    return $urls[$key];
  }

  // Collect the names of every field to scan, tolerating absent lists.
  $names = array();
  foreach (array('main', 'other') as $group) {
    if (isset($fields[$group]) && is_array($fields[$group])) {
      $names = array_merge($names, $fields[$group]);
    }
  }

  // Concatenate the field values, skipping fields the content does not have
  // instead of triggering undefined-index notices.
  $string = '';
  foreach ($names as $field) {
    if (isset($content[$field]) && !is_array($content[$field])) {
      $string .= $content[$field] . ' ';
    }
  }

  // TODO: Improve this matching. mailto: addresses are reduced to the root
  // domain of the mailbox host.
  $URI = "(http://|https://|ftp://|mailto:)";

  // Find all urls in content.
  preg_match_all("!(<p>|[ \n\r\t\\(]*)({$URI}([a-zA-Z0-9@:%_~#?&=.,/;-]*[a-zA-Z0-9@:%_~#&=/;-]))([.,?]?)(?=(</p>|[ \n\r\t\\)]*))!i", $string, $matches);

  $u = array();
  foreach ($matches[2] as $url) {
    // Strip the leading scheme only. URLs are found case-insensitively, so
    // the scheme has to be removed case-insensitively as well.
    $url = preg_replace("'^{$URI}'i", '', $url);

    // Authority part: everything before the path, query string or fragment.
    if (!preg_match('!^[^/?#"\']+!', $url, $authority)) {
      continue;
    }
    $host = $authority[0];

    // Drop "user:password@" (or the mailbox name of a mailto: address).
    $at = strrpos($host, '@');
    if ($at !== FALSE) {
      $host = substr($host, $at + 1);
    }

    // Drop a trailing ":port".
    $host = preg_replace('/:[0-9]*$/', '', $host);

    // Get root domain (ie sample.com). Hosts without a dot, such as
    // "localhost", have no root domain and are ignored.
    if (!preg_match('![^./]+\.[^./]+$!', $host, $root)) {
      continue;
    }

    // Lower-case before hashing so that differently-cased spellings of the
    // same domain collapse into a single entry.
    $domain = drupal_strtolower($root[0]);
    $u[md5($domain)] = htmlspecialchars($domain);
  }

  $urls[$key] = $u;
  return $urls[$key];
}