function spam_tokenize in Spam 5
Divide a string into tokens.
Parameters
$string: A text string to tokenize.
$tag: An optional tag to prepend to each token.
Return value
An array of tokens that were obtained from the string.
6 calls to spam_tokenize()
- spam_content_filter in ./
spam.module - Determine whether or not provided text is spam.
- spam_notspam_comment in ./
spam.module - Mark the comment as not spam. This may cause the comment to become published depending on settings
- spam_notspam_node in ./
spam.module - Force a node to be marked as not spam. May not publish depending on settings
- spam_page in ./
spam.module - Drupal _page hook. Provides various spam actions based on the URL that is currently being accessed.
- spam_spam_comment in ./
spam.module - Mark the comment as spam. This may cause the comment to become unpublished depending on settings
File
- ./
spam.module, line 1086
Code
/**
 * Divide a string into tokens.
 *
 * Tokenizing happens in three passes:
 *  1. Word tokens: markup/URL noise and punctuation are stripped, the text is
 *     lowercased and split on a fixed delimiter set. Each word token is
 *     prefixed with $tag, cut to 254 bytes and HTML-escaped.
 *  2. URL tokens: every URL found in the original (unsanitized) string is
 *     added verbatim, passed to spam_urls_count(), and followed by a
 *     "URL*<root domain>" token when a root domain can be extracted.
 *  3. Hook tokens: whatever spam_invoke_hook('tokenize', $string) returns
 *     under the 'tokens' key is appended.
 *
 * @param $string
 *   A text string to tokenize.
 * @param $tag
 *   An optional tag to prepend to each word token. URL tokens and hook
 *   tokens are not tagged.
 *
 * @return
 *   An array of tokens that were obtained from the string.
 */
function spam_tokenize($string, $tag = NULL) {
  $tokens = array();
  $URI = "(http://|https://|ftp://|mailto:)";
  // URLs are detected case-insensitively in the second pass, so the scheme
  // must be stripped case-insensitively as well; otherwise 'HTTP://host'
  // leaves a stray 'http' word token and breaks the domain extraction.
  $scheme_regex = "!{$URI}!i";

  // Strip out unwanted html/url noise.
  $sanitized = preg_replace("'(www\\.)|(</a>)|(href=)|(target=)|(src=)'i", '', $string);
  $sanitized = preg_replace($scheme_regex, '', $sanitized);

  // Strip out values that should not be considered part of tokens, so
  // things like '{viagra}' and 'vi.agra' are counted as hits towards
  // 'viagra'.
  $sanitized = preg_replace("/[()\\{\\}\\[\\]#.,]/", '', $sanitized);

  // Force all word tokens to lowercase so that capitalized variants are
  // aggregated into a single token.
  $sanitized = strtolower($sanitized);

  // Divide the sanitized string into tokens.
  $delimiters = " \t\n\r-_<>'\"`/|*%^&+=~:;?";
  $tok = strtok($sanitized, $delimiters);
  while ($tok !== FALSE) {
    // Keep at most 254 bytes of the tagged token; the cut happens before
    // HTML escaping.
    $tokens[] = htmlspecialchars(substr("{$tag}{$tok}", 0, 254));
    $tok = strtok($delimiters);
  }

  // Second pass, grab urls from the unsanitized string.
  preg_match_all("!(<p>|[ \n\r\t\\(]*)({$URI}([a-zA-Z0-9@:%_~#?&=.,/;-]*[a-zA-Z0-9@:%_~#&=/;-]))([.,?]?)(?=(</p>|[ \n\r\t\\)]*))!i", $string, $urls);
  $found_urls = isset($urls[2]) ? $urls[2] : array();
  foreach ($found_urls as $url) {
    $tokens[] = $url;
    spam_urls_count($url);

    // Get the full domain (ie www.sample.com): everything after the scheme
    // up to the first slash or quote.
    $without_scheme = preg_replace($scheme_regex, '', $url);
    if (!preg_match("/^([^\\/\"\\']+)/", $without_scheme, $domain)) {
      // Nothing usable after the scheme (e.g. the URL is only a path).
      continue;
    }

    // Get the root domain (ie sample.com): the last two dot-separated labels.
    // Hosts without a dot have no root domain, so no URL* token is emitted
    // for them rather than a bare 'URL*'.
    if (preg_match("/[^\\.\\/]+\\.[^\\.\\/]+\$/", $domain[1], $root)) {
      $tokens[] = htmlspecialchars("URL*{$root[0]}");
    }
  }

  // Allow external modules to extract additional tokens. The hook result is
  // only merged when it actually carries a non-empty array of tokens.
  $hook = spam_invoke_hook('tokenize', $string);
  if (is_array($hook) && !empty($hook['tokens']) && is_array($hook['tokens'])) {
    $tokens = array_merge($tokens, $hook['tokens']);
  }

  return $tokens;
}