You are here

function _url_extract in Spam 5.3

2 calls to _url_extract()
url_spamapi in filters/url/url.module
URL filter plug in for the spam module. Copyright(c) 2007-2008 Jeremy Andrews <jeremy@tag1consulting.com>. All rights reserved.
url_spam_filter in filters/url/url.module
Search for known spam urls in content.

File

filters/url/url.module, line 48

Code

/**
 * Extract the root domains of all URLs found in a piece of content.
 *
 * The selected fields are concatenated and scanned for http://, https://,
 * ftp:// and mailto: URIs. Each hit is reduced to its last two dot-separated
 * host labels (ie www.sample.com becomes sample.com), lower-cased and
 * HTML-escaped. Every domain found is passed to _url_count(), and the
 * resulting list is kept in a static cache keyed by content type and id so
 * the same content is only scanned (and counted) once.
 *
 * @param $content
 *   The content to scan, as an array or an object (objects are cast to
 *   arrays).
 * @param $type
 *   The content type; passed to spam_invoke_module() and used as part of the
 *   cache key.
 * @param $fields
 *   Array holding a 'main' list of field names and, optionally, an 'other'
 *   list of field names whose values should be scanned.
 * @param $extra
 *   Extra data handed through to spam_invoke_module().
 *
 * @return
 *   Array of root domains, one entry per URL found (duplicates are kept).
 *   An empty array is returned when the content holds no usable URLs.
 */
function _url_extract($content, $type, $fields, $extra = array()) {
  static $urls = array();
  $id = spam_invoke_module($type, 'content_id', $content, $extra);
  if (is_object($content)) {
    $content = (array) $content;
  }
  $key = "{$type}-{$id}";
  if (!isset($urls[$key])) {
    // Always start from an array so that content without URLs is cached and
    // returned as an empty list rather than as NULL.
    $found = array();

    // Concatenate the requested fields, tolerating missing field groups and
    // fields that are absent from this particular piece of content.
    $string = '';
    foreach (array('main', 'other') as $group) {
      if (isset($fields[$group]) && is_array($fields[$group])) {
        foreach ($fields[$group] as $field) {
          $string .= (isset($content[$field]) ? $content[$field] : '') . ' ';
        }
      }
    }

    // TODO: Improve this matching.  We don't actually extract mailto: urls.
    $URI = "(http://|https://|ftp://|mailto:)";

    // Find all urls in content.
    preg_match_all("!(<p>|[ \n\r\t\\(]*)({$URI}([a-zA-Z0-9@:%_~#?&=.,/;-]*[a-zA-Z0-9@:%_~#&=/;-]))([.,?]?)(?=(</p>|[ \n\r\t\\)]*))!i", $string, $matches);
    foreach ($matches[2] as $url) {
      // Strip the scheme. This must be case-insensitive, like the search
      // above, or URLs such as HTTP://Example.com keep their scheme and no
      // domain can be extracted from them.
      $url = preg_replace("'{$URI}'i", '', $url);

      // get full domain (ie www.sample.com)
      if (!preg_match("/^()?([^\\/\"\\']+)/i", $url, $domain)) {
        continue;
      }

      // get root domain (ie sample.com); hosts without a dot have none and
      // are skipped instead of being recorded as an empty string.
      if (!preg_match("/[^\\.\\/]+\\.[^\\.\\/]+\$/", $domain[2], $root)) {
        continue;
      }
      $url = htmlspecialchars(strtolower($root[0]));
      _url_count($url);
      $found[] = $url;
    }
    $urls[$key] = $found;
  }
  return $urls[$key];
}