function _parser_common_syndication_download in FeedAPI 6
Same name and namespace in other branches
- 5 parser_common_syndication/parser_common_syndication.module \_parser_common_syndication_download()
Call one of the available feedapi_get hooks and pass back the downloaded data.
Return value
string - the downloaded data, FALSE - if the URL is not reachable
2 calls to _parser_common_syndication_download()
- parser_common_syndication_feedapi_feed in parser_common_syndication/
parser_common_syndication.module - Implementation of hook_feedapi_feed().
- _parser_common_syndication_feedapi_parse in parser_common_syndication/
parser_common_syndication.inc - Parse the feed into a data structure.
File
- parser_common_syndication/
parser_common_syndication.inc, line 250 - Downloading and parsing functions for Common Syndication Parser
Code
/**
 * Download a feed and pass back the downloaded data.
 *
 * Credentials embedded in the URL (user:pass@host) are extracted and handed to
 * _parser_common_syndication_feedapi_get(). When the response looks like an
 * HTML page, the function performs feed autodiscovery: it scans the <link>
 * tags for an allowed feed MIME type and downloads the first usable feed it
 * finds instead. Finally, xml:base attributes are renamed to base and a set
 * of embedded-content tags (script, object, embed, ...) is stripped.
 *
 * @param $url
 *   The URL of the feed, or of an HTML page that links to a feed.
 * @param $settings
 *   Optional settings array. Only the 'accept_invalid_cert' key is read here;
 *   it defaults to FALSE when missing.
 *
 * @return
 *   - A string holding the (filtered) downloaded data.
 *   - The object returned by _parser_common_syndication_feedapi_get(), passed
 *     back untouched, when that function returns an object.
 *   - FALSE when nothing could be downloaded.
 */
function _parser_common_syndication_download($url, $settings = NULL) {
  // Define the credentials up front so they exist even when the URL does not
  // pass validation.
  $username = NULL;
  $password = NULL;
  if (valid_url($url, TRUE)) {
    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      // A URL may carry a user name without a password.
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }

  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $downloaded_string = _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);

  // Cannot get the feed, pass the problem to one level up.
  if ($downloaded_string == FALSE) {
    return FALSE;
  }
  // Objects are not feed data; hand them back to the caller untouched.
  if (is_object($downloaded_string)) {
    return $downloaded_string;
  }

  // Do the autodiscovery at this level, pass back the real data.
  // Maybe it's HTML. If it's not HTML, it is not worth looking into the
  // downloaded string.
  if (stripos($downloaded_string, '<html') !== FALSE) {
    $allowed_mime = array(
      "text/xml",
      "application/rss+xml",
      "application/atom+xml",
      "application/rdf+xml",
      "application/xml",
    );

    // Get all the link tags.
    $matches = array();
    preg_match_all('/<link\\s+(.*?)\\s*\\/?>/si', $downloaded_string, $matches);

    foreach ($matches[1] as $link) {
      // Get the type attribute and check if the mime type is allowed.
      $mime = array();
      preg_match_all('/type\\s*=\\s*("|\')([A-Za-z\\/+]*)("|\')/si', $link, $mime);
      if (!in_array(array_pop($mime[2]), $allowed_mime)) {
        continue;
      }

      // Get the href attribute.
      $href = array();
      preg_match_all('/href\\s*=\\s*("|\')([=#\\?_:.0-9A-Za-z\\/+]*)("|\')/si', $link, $href);
      $rss_link = array_pop($href[2]);
      // Skip unusable links and links pointing back to the page itself (the
      // latter would recurse forever).
      if (!is_string($rss_link) || strlen($rss_link) == 0 || $rss_link == $url) {
        continue;
      }

      // Handle base url related stuff.
      $parsed_url = parse_url($rss_link);
      if (!isset($parsed_url['host'])) {
        // It's relative so make it absolute. The <base> tag lives in the
        // document, not inside the <link> tag, so search the whole download.
        $base_tag = array();
        preg_match_all('/<base href\\s*=\\s*("|\')([_:.0-9A-Za-z\\/+]*)("|\')/si', $downloaded_string, $base_tag);
        $base_url = array_pop($base_tag[2]);
        if (is_string($base_url) && strlen($base_url) > 0) {
          // Get from the HTML base tag.
          $rss_link = $base_url . $rss_link;
        }
        else {
          // Guess from the original URL.
          $original_url = parse_url($url);
          if (!isset($original_url['scheme']) || !isset($original_url['host'])) {
            // Nothing to build an absolute URL from; try the next link.
            continue;
          }

          $base_path = isset($original_url['path']) ? $original_url['path'] : '/';
          if (!isset($parsed_url['path']) || strlen($parsed_url['path']) == 0) {
            // Query-only or fragment-only reference: keep the page's path.
            $path = $base_path;
          }
          elseif (substr($parsed_url['path'], 0, 1) == '/') {
            // Host-relative path.
            $path = $parsed_url['path'];
          }
          else {
            // Document-relative path: resolve against the page's directory.
            $last_slash = strrpos($base_path, '/');
            $directory = ($last_slash === FALSE) ? '/' : substr($base_path, 0, $last_slash + 1);
            $path = $directory . $parsed_url['path'];
          }

          // Only emit the port, query and fragment parts that actually exist.
          $rss_link = $original_url['scheme'] . '://' . $original_url['host']
            . (isset($original_url['port']) ? ':' . $original_url['port'] : '')
            . $path
            . (isset($parsed_url['query']) ? '?' . $parsed_url['query'] : '')
            . (isset($parsed_url['fragment']) ? '#' . $parsed_url['fragment'] : '');
        }
      }

      // The recursive call already filters its result and reports failures as
      // FALSE (or passes back an object), so return it as is. The settings are
      // forwarded so options such as accept_invalid_cert keep applying.
      return _parser_common_syndication_download($rss_link, $settings);
    }
  }

  // Ugly hack to be able to retrieve the xml:base property, no way to access
  // xml:lang inside <feed>. preg_replace() returns NULL on a PCRE error, in
  // which case the unmodified download is kept.
  $rebased_string = preg_replace('/xml:base *=/', 'base=', $downloaded_string);
  if (is_string($rebased_string)) {
    $downloaded_string = $rebased_string;
  }

  // Filter out strange tags. Without this, the text would contain strange stuff.
  // @todo: make sure that these are not important for feed element mapper
  $downloaded_string_filtered = preg_replace(array(
    '@<script[^>]*?.*?</script>@si',
    '@<object[^>]*?.*?</object>@si',
    '@<embed[^>]*?.*?</embed>@si',
    '@<applet[^>]*?.*?</applet>@si',
    '@<noframes[^>]*?.*?</noframes>@si',
    '@<noscript[^>]*?.*?</noscript>@si',
    '@<noembed[^>]*?.*?</noembed>@si',
  ), '', $downloaded_string);

  // Fall back to the unfiltered data when filtering failed or removed
  // everything.
  return empty($downloaded_string_filtered) ? $downloaded_string : $downloaded_string_filtered;
}