parser_common_syndication.inc in FeedAPI 6
Downloading and parsing functions for Common Syndication Parser
File
parser_common_syndication/parser_common_syndication.inc (View source)
<?php
/**
* @file
* Downloading and parsing functions for Common Syndication Parser
*/
/**
 * Turn a feed into the structured object consumed by FeedAPI.
 *
 * @param $feed
 *   Either an already parsed SimpleXMLElement, or a feed object whose ->url
 *   property points at the document to download.
 * @return
 *   The parsed feed object, whatever object the download step handed back,
 *   or FALSE when the document could not be fetched, is malformed XML or is
 *   in an unsupported format.
 */
function _parser_common_syndication_feedapi_parse($feed) {
  if (is_a($feed, 'SimpleXMLElement')) {
    $xml = $feed;
  }
  else {
    $raw = _parser_common_syndication_download($feed->url);
    // FALSE means the download failed; an object is passed straight through.
    if ($raw === FALSE || is_object($raw)) {
      return $raw;
    }

    $without_libxml_flags = !defined('LIBXML_VERSION') || version_compare(phpversion(), '5.1.0', '<');
    if ($without_libxml_flags) {
      $xml = @simplexml_load_string($raw, NULL);
    }
    else {
      $xml = @simplexml_load_string($raw, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);
    }

    // Got a malformed XML.
    if ($xml === FALSE || is_null($xml)) {
      return FALSE;
    }
  }

  // Hand the document to the parser matching the detected format.
  switch (_parser_common_syndication_feed_format_detect($xml)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($xml);

    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($xml);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}
/**
 * Get the cached version of the <var>$url</var>.
 *
 * @param $url
 *   The feed URL; its md5 hash is the cache file name.
 * @return
 *   The unserialized cache content, or FALSE when no usable cache directory
 *   or cache file exists or the file cannot be read.
 */
function _parser_common_syndication_cache_get($url) {
  $cache_dir = _parser_common_syndication_sanitize_cache();
  // Without a cache directory the path would degrade to "/<md5>".
  if ($cache_dir === FALSE) {
    return FALSE;
  }
  $cache_file = $cache_dir . '/' . md5($url);
  if (!is_file($cache_file)) {
    return FALSE;
  }
  $file_content = file_get_contents($cache_file);
  if ($file_content === FALSE) {
    return FALSE;
  }
  // NOTE(review): unserialize() is only safe as long as nothing untrusted can
  // write into the cache directory - confirm the directory permissions.
  return unserialize($file_content);
}
/**
 * Store the parsed feed into the cache.
 *
 * @param $url
 *   The feed URL; its md5 hash is the cache file name.
 * @param $parsed_feed
 *   The value to serialize into the cache file.
 * @return
 *   TRUE when the cache file was written, FALSE otherwise.
 */
function _parser_common_syndication_cache_set($url, $parsed_feed) {
  $cache_dir = _parser_common_syndication_sanitize_cache();
  // Without a cache directory the path would degrade to "/<md5>".
  if ($cache_dir === FALSE) {
    return FALSE;
  }
  $cache_fp = fopen($cache_dir . '/' . md5($url), 'w');
  // fwrite()/fclose() on a failed handle only produce warnings; bail out.
  if ($cache_fp === FALSE) {
    return FALSE;
  }
  $written = fwrite($cache_fp, serialize($parsed_feed));
  fclose($cache_fp);
  return $written !== FALSE;
}
/**
 * Get the content from the given URL.
 *
 * Sends the stored ETag / Last-Modified validators so that unchanged feeds
 * answer with "304 Not Modified", and stores the validators of the new
 * response for the next call.
 *
 * @param $url
 *   A valid URL (not only web URLs).
 * @param $username
 *   If the URL use authentication, here you can supply the username for this.
 * @param $password
 *   If the URL use authentication, here you can supply the password for this.
 * @param $accept_invalid_cert
 *   When TRUE and cURL is used, the peer certificate is not verified.
 * @return
 *   The data pulled from the URL, the cached object (with ->from_cache set)
 *   when the server answered 304, or FALSE if nothing could be downloaded.
 */
function _parser_common_syndication_feedapi_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
  // Intra-page download cache, avoids downloading the same content twice
  // within one page request (compatible and parse calls may both ask).
  static $download_cache = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }

  // The validator table is keyed by the md5 hash of the URL.
  $url_key = md5($url);
  $curl = _parser_common_syndication_use_curl();

  // Only download and parse data if really needs refresh.
  // Based on "ETag"/"If-None-Match" and "Last-Modified"/"If-Modified-Since".
  $headers = array();
  $has_row = FALSE;
  $db_result = db_query("SELECT etag, last_modified FROM {parser_common_syndication} WHERE url = '%s'", $url_key);
  while ($validate = db_fetch_array($db_result)) {
    $has_row = TRUE;
    if (!empty($validate['etag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: ' . $validate['etag'];
      }
      else {
        $headers['If-None-Match'] = $validate['etag'];
      }
    }
    if (!empty($validate['last_modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: ' . $validate['last_modified'];
      }
      else {
        $headers['If-Modified-Since'] = $validate['last_modified'];
      }
    }
  }
  // The credentials must be sent on every request, including the very first
  // one when no validator row exists yet (cURL gets them via CURLOPT_USERPWD).
  if (!empty($username) && !$curl) {
    $headers['Authorization'] = 'Basic ' . base64_encode("{$username}:{$password}");
  }

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();
    $download = curl_init($url);
    curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
    if (!empty($username)) {
      curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
    }
    curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($download, CURLOPT_HEADER, TRUE);
    curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($download, CURLOPT_ENCODING, '');
    if ($accept_invalid_cert) {
      curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
    }
    $raw = curl_exec($download);
    $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
    $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);
    curl_close($download);

    $result->headers = array();
    $result->data = '';
    if (is_string($raw)) {
      $result->data = (string) substr($raw, $header_size);
      // When redirects are followed the header section holds one block per
      // response; only the last block belongs to the returned body.
      $header_blocks = preg_split("/\r\n\r\n|\n\n|\r\r/", trim((string) substr($raw, 0, $header_size)));
      $header_lines = preg_split("/\r\n|\n|\r/", array_pop($header_blocks));
      // Skip the HTTP response status line.
      array_shift($header_lines);
      foreach ($header_lines as $line) {
        $line = trim($line);
        if ($line === '' || strpos($line, ':') === FALSE) {
          continue;
        }
        list($name, $value) = explode(':', $line, 2);
        if (isset($result->headers[$name]) && $name == 'Set-Cookie') {
          // RFC 2109: the Set-Cookie response header comprises the token Set-
          // Cookie:, followed by a comma-separated list of one or more cookies.
          $result->headers[$name] .= ',' . trim($value);
        }
        else {
          $result->headers[$name] = trim($value);
        }
      }
    }
  }
  else {
    $result = drupal_http_request($url, $headers);
  }
  $result->code = isset($result->code) ? $result->code : 200;

  // In this case return the cached data.
  if ($result->code == 304) {
    $cached_data = _parser_common_syndication_cache_get($url);
    if (is_object($cached_data)) {
      $cached_data->from_cache = TRUE;
      return $cached_data;
    }
    // The cache file should exist and contain good data. As it does not,
    // forget the validators and repeat the download unconditionally, keeping
    // the caller's certificate setting.
    db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", $url_key);
    return _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);
  }

  // Header names are case-insensitive, so look the validators up that way.
  $validators = array('etag' => '', 'last-modified' => '');
  if (isset($result->headers) && is_array($result->headers)) {
    foreach ($result->headers as $name => $value) {
      $name = strtolower($name);
      if (isset($validators[$name])) {
        $validators[$name] = $value;
      }
    }
  }
  if ($has_row) {
    db_query("UPDATE {parser_common_syndication} SET etag = '%s', last_modified = '%s' WHERE url = '%s'", $validators['etag'], $validators['last-modified'], $url_key);
  }
  else {
    db_query("INSERT INTO {parser_common_syndication} (etag, last_modified, url) VALUES ('%s', '%s', '%s')", $validators['etag'], $validators['last-modified'], $url_key);
  }

  // drupal_http_request() does not set ->data on every failure.
  $data = empty($result->data) ? FALSE : $result->data;
  $download_cache[$url] = $data;
  return $data;
}
/**
 * Delete the cache validators and the cache file when a feed node is deleted.
 *
 * @param $node
 *   The node being acted on; only nodes carrying ->feed->url are handled.
 * @param $op
 *   The node operation; everything except 'delete' is ignored.
 */
function parser_common_syndication_nodeapi(&$node, $op) {
  if ($op != 'delete') {
    return;
  }
  if (!(isset($node->feed) || feedapi_enabled_type($node->type))) {
    return;
  }
  // A feed-enabled node type does not guarantee that this node has a feed.
  if (empty($node->feed->url)) {
    return;
  }
  // Rows are stored under md5($url), so the raw URL would never match.
  $url_key = md5($node->feed->url);
  db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", $url_key);
  $cache_dir = _parser_common_syndication_sanitize_cache();
  if ($cache_dir === FALSE) {
    return;
  }
  $cache_filename = $cache_dir . '/' . $url_key;
  if (file_exists($cache_filename)) {
    unlink($cache_filename);
  }
}
/**
 * Work out which syndication format a SimpleXML document uses.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed.
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF", or FALSE if
 *   the document is not an object or matches none of these.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }
  $attributes = $xml->attributes();
  $root = strtolower($xml->getName());

  // Atom: a <feed> root holding at least one <entry>.
  if ($root == "feed" && isset($xml->entry)) {
    return "atom1.0";
  }
  // RSS 1.0: an RDF root holding a <channel>.
  if ($root == "rdf" && isset($xml->channel)) {
    return "RDF";
  }
  // The remaining formats share the <rss> root and differ by version.
  if ($root == "rss") {
    $labels = array(
      "2.0" => "RSS2.0",
      "0.91" => "RSS0.91",
      "0.92" => "RSS0.92",
    );
    foreach ($labels as $version => $label) {
      if ($attributes["version"] == $version) {
        return $label;
      }
    }
  }
  return FALSE;
}
/**
 * Download a feed, following feed autodiscovery links found in HTML pages.
 *
 * @param $url
 *   The URL to download. Credentials embedded in a valid absolute URL are
 *   used for authentication.
 * @param $settings
 *   Optional settings; 'accept_invalid_cert' is passed to the download.
 * @return
 *   string - the downloaded data with script-like tags removed,
 *   object - the object handed back by the download step,
 *   FALSE - if the URL is not reachable.
 */
function _parser_common_syndication_download($url, $settings = NULL) {
  // Handle password protected feeds.
  $username = NULL;
  $password = NULL;
  if (valid_url($url, TRUE)) {
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }
  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $downloaded_string = _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);
  // Cannot get the feed, pass the problem to one level up.
  if ($downloaded_string == FALSE) {
    return FALSE;
  }
  if (is_object($downloaded_string)) {
    return $downloaded_string;
  }

  // Do the autodiscovery at this level, pass back the real data.
  // Maybe it's HTML. If it's not HTML, not worth to take a look into the downloaded string.
  if (strpos(strtolower($downloaded_string), "<html") !== FALSE) {
    $allowed_mime = array(
      "text/xml",
      "application/rss+xml",
      "application/atom+xml",
      "application/rdf+xml",
      "application/xml",
    );
    $matches = array();
    // Get all the links tag
    preg_match_all('/<link\\s+(.*?)\\s*\\/?>/si', $downloaded_string, $matches);
    foreach ($matches[1] as $link) {
      $mime = array();
      // Get the type attribute and check if the mime type is allowed.
      preg_match_all('/type\\s*=\\s*("|\')([A-Za-z\\/+]*)("|\')/si', $link, $mime);
      if (!in_array(strtolower((string) array_pop($mime[2])), $allowed_mime, TRUE)) {
        continue;
      }
      $href = array();
      // Get the href attribute. Accept everything up to the closing quote so
      // that URLs containing "-", "&" or "%" are not silently skipped.
      preg_match_all('/href\\s*=\\s*("|\')([^"\'<>\\s]*)\\1/si', $link, $href);
      $rss_link = array_pop($href[2]);
      if (!is_string($rss_link) || strlen($rss_link) == 0 || $rss_link == $url) {
        continue;
      }
      // Attribute values are HTML-escaped.
      $rss_link = str_replace('&amp;', '&', $rss_link);
      $rss_link = _parser_common_syndication_absolute_link($rss_link, $url, $downloaded_string);
      // Never fetch the page itself again, and only follow links that are
      // valid absolute URLs.
      if ($rss_link == $url || !valid_url($rss_link, TRUE)) {
        continue;
      }
      // The nested call already returns filtered data (or FALSE / an object),
      // so hand its result back untouched.
      return _parser_common_syndication_download($rss_link, $settings);
    }
  }

  // Ugly hack to be able to retrieve the xml:base property, no way to access xml:lang inside <feed>
  $downloaded_string = preg_replace('/xml:base *=/', 'base=', $downloaded_string);
  // Filter out strange tags. Without this, the text would contain strange stuff.
  // @todo: make sure that these are not important for feed element mapper
  $downloaded_string_filtered = preg_replace(array(
    '@<script[^>]*?.*?</script>@si',
    '@<object[^>]*?.*?</object>@si',
    '@<embed[^>]*?.*?</embed>@si',
    '@<applet[^>]*?.*?</applet>@si',
    '@<noframes[^>]*?.*?</noframes>@si',
    '@<noscript[^>]*?.*?</noscript>@si',
    '@<noembed[^>]*?.*?</noembed>@si',
  ), '', $downloaded_string);
  return empty($downloaded_string_filtered) ? $downloaded_string : $downloaded_string_filtered;
}

/**
 * Make a link found in an HTML page absolute.
 *
 * @param $link
 *   The (possibly relative) link taken from the page.
 * @param $page_url
 *   The URL the page was downloaded from.
 * @param $html
 *   The page source, searched for a <base href="..."> tag.
 * @return
 *   The absolute URL, or $link unchanged when it cannot be resolved.
 */
function _parser_common_syndication_absolute_link($link, $page_url, $html) {
  $parsed_link = parse_url($link);
  if (!is_array($parsed_link) || isset($parsed_link['scheme'])) {
    return $link;
  }

  // Relative links resolve against <base href> when the page declares an
  // absolute one, otherwise against the page URL.
  $reference = parse_url($page_url);
  $base_tag = array();
  if (preg_match('/<base\\s[^>]*?href\\s*=\\s*("|\')([^"\'<>\\s]+)\\1/si', $html, $base_tag)) {
    $base_parts = parse_url($base_tag[2]);
    if (is_array($base_parts) && isset($base_parts['scheme']) && isset($base_parts['host'])) {
      $reference = $base_parts;
    }
  }
  if (!is_array($reference) || empty($reference['scheme']) || empty($reference['host'])) {
    return $link;
  }

  // Protocol-relative link ("//host/path"): only the scheme is missing.
  if (isset($parsed_link['host'])) {
    return $reference['scheme'] . ':' . $link;
  }

  $absolute = $reference['scheme'] . '://' . $reference['host'];
  if (isset($reference['port'])) {
    $absolute .= ':' . $reference['port'];
  }

  $reference_path = (isset($reference['path']) && $reference['path'] !== '') ? $reference['path'] : '/';
  $path = isset($parsed_link['path']) ? $parsed_link['path'] : '';
  if ($path === '') {
    // Query-only or fragment-only link: keep the whole reference path.
    $path = $reference_path;
  }
  elseif ($path[0] != '/') {
    // Relative to the directory of the reference path.
    $slash = strrpos($reference_path, '/');
    $path = ($slash === FALSE ? '/' : substr($reference_path, 0, $slash + 1)) . $path;
  }
  $absolute .= $path;

  if (isset($parsed_link['query'])) {
    $absolute .= '?' . $parsed_link['query'];
  }
  if (isset($parsed_link['fragment'])) {
    $absolute .= '#' . $parsed_link['fragment'];
  }
  return $absolute;
}
/**
 * Parse atom feeds.
 *
 * @param $feed_XML
 *   The SimpleXMLElement of the <feed> root.
 * @return
 *   An object with title, description, options->link and items; every item
 *   carries title, description and options (original_author, timestamp,
 *   original_url, guid, tags, domains).
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = new stdClass();

  // The "base" attribute is looked up on the root element; relative links
  // are prefixed with it when it is a valid absolute URL.
  $base_nodes = $feed_XML->xpath("@base");
  $base = (is_array($base_nodes) && count($base_nodes) > 0) ? (string) reset($base_nodes) : '';
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title
  $parsed_source->title = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";
  $parsed_source->options = new stdClass();
  $parsed_source->options->link = _parser_common_syndication_link($feed_XML->link);
  if (valid_url($parsed_source->options->link) && !valid_url($parsed_source->options->link, TRUE) && !empty($base)) {
    $parsed_source->options->link = $base . $parsed_source->options->link;
  }

  // The feed level author is the last resort for entries without one.
  $feed_author = !empty($feed_XML->author->name) ? "{$feed_XML->author->name}" : '';

  $parsed_source->items = array();
  foreach ($feed_XML->entry as $news) {
    $guid = !empty($news->id) ? "{$news->id}" : NULL;
    // I don't know how standard this is, but sometimes the id is the URL.
    // It is only used when the entry has no usable <link>.
    $fallback_url = NULL;
    if ($guid !== NULL && valid_url($guid, TRUE)) {
      $fallback_url = $guid;
    }

    $additional_taxonomies = array();
    if (isset($news->category)) {
      $additional_taxonomies['ATOM Categories'] = array();
      $additional_taxonomies['ATOM Domains'] = array();
      foreach ($news->category as $category) {
        // Store the term first so that the index recorded for the domain
        // points at this category and not at the previous one.
        $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
              $additional_taxonomies['ATOM Domains'][$domain] = array();
            }
            $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // Prefer <content>, fall back to <summary>. Child elements are
    // serialized as XML and followed by the element's own text.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    if (!empty($news->content['src'])) {
      // some src elements in some valid atom feeds contained no urls at all
      $content_src = "{$news->content['src']}";
      if (valid_url($content_src, TRUE)) {
        $fallback_url = $content_src;
      }
    }

    // Decide the author per entry so that a value found for a previous entry
    // can not leak into this one.
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    else {
      $original_author = $feed_author;
    }

    // The entry's link wins; the content src / id URL is used only when the
    // entry has no link.
    $original_url = trim(_parser_common_syndication_link($news->link));
    if ($original_url === '' && $fallback_url !== NULL) {
      $original_url = trim($fallback_url);
    }

    // <published> first, then <issued>, then <updated>, which RFC 4287
    // requires on every entry.
    if (isset($news->published)) {
      $date = "{$news->published}";
    }
    elseif (isset($news->issued)) {
      $date = "{$news->issued}";
    }
    else {
      $date = "{$news->updated}";
    }

    $item = new stdClass();
    $item->title = _parser_common_syndication_title($title, $body);
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->original_author = $original_author;
    $item->options->timestamp = _parser_common_syndication_parse_date($date);
    $item->options->original_url = $original_url;
    if (valid_url($item->options->original_url) && !valid_url($item->options->original_url, TRUE) && !empty($base)) {
      $item->options->original_url = $base . $item->options->original_url;
    }
    $item->options->guid = $guid;
    $item->options->tags = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
    $item->options->domains = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @param $feed_XML
 *   The SimpleXMLElement of the RDF root.
 * @return
 *   An object with title, description, options->link and items, or FALSE
 *   when neither a channel nor any item was found in the RSS 1.0 namespace.
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi' => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd' => 'http://www.w3.org/2001/XMLSchema#',
    'owl' => 'http://www.w3.org/2002/07/owl#',
    'dc' => 'http://purl.org/dc/elements/1.1/',
    'dcterms' => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf' => 'http://xmlns.com/foaf/0.1/',
    'rss' => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element, with special handling
  // for PHP versions prior to 5.1.2 as they don't handle namespaces.
  $namespaces = version_compare(phpversion(), '5.1.2', '<') ? array() : $feed_XML->getNamespaces(TRUE);

  // Start from an empty result so the item loop below always has an object
  // to append to, even when the channel is missing.
  $channel_found = FALSE;
  $parsed_source = (object) array(
    'title' => '',
    'description' => '',
    'options' => (object) array(
      'link' => '',
    ),
    'items' => array(),
  );

  // Process the <rss:channel> resource containing feed metadata:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $channel_found = TRUE;
    $parsed_source->title = _parser_common_syndication_title((string) $rss_channel->title);
    $parsed_source->description = (string) $rss_channel->description;
    $parsed_source->options->link = (string) $rss_channel->link;
    // Only the first channel is used.
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {
    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      // Note that we attempt to normalize the found property name
      // namespaces to well-known 'standard' prefixes where possible, as the
      // feed may in principle use any arbitrary prefixes and we should
      // still be able to correctly handle it.
      $canonical_prefix = array_search($ns_uri, $canonical_namespaces);
      $ns_prefix = $canonical_prefix ? $canonical_prefix : $ns;
      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the
    // object that gets passed back to FeedAPI:
    $item = _parser_common_syndication_RDF10_item($rdf_data, (object) array(
      'title' => array(
        'rss:title',
        'dc:title',
      ),
      'description' => array(
        'rss:description',
        'dc:description',
        'content:encoded',
      ),
      'options' => (object) array(
        'guid' => 'rdf:about',
        'timestamp' => 'dc:date',
        'original_author' => array(
          'dc:creator',
          'dc:publisher',
        ),
        'original_url' => array(
          'rss:link',
          'rdf:about',
        ),
        'tags' => 'dc:subject',
      ),
    ));

    // Special handling for the title:
    $item->title = _parser_common_syndication_title($item->title, $item->description);

    // Parse any date/time values into Unix timestamps. The mapping yields an
    // array when the item has several dc:date values; use the first one.
    $date = $item->options->timestamp;
    if (is_array($date)) {
      $date = reset($date);
    }
    $item->options->timestamp = _parser_common_syndication_parse_date((string) $date);

    // If no author name found, use the feed title:
    if (empty($item->options->original_author)) {
      $item->options->original_author = $parsed_source->title;
    }

    // Add every found RDF property to the FeedAPI item in order for Feed
    // Element Mapper to be able to map these properties:
    $item->rdf = (object) array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      // Underscores look nicer than colons in the mapper UI.
      $rdf_property = str_replace(':', '_', $rdf_property);
      $item->rdf->{$rdf_property} = $rdf_value;
    }
    $parsed_source->items[] = $item;
  }

  // Nothing recognisable in the RSS 1.0 namespace: report a failure.
  if (!$channel_found && count($parsed_source->items) == 0) {
    return FALSE;
  }
  return $parsed_source;
}
/**
 * Look up the first RDF property that has values in the extracted data.
 *
 * @param $rdf_data
 *   Array of value lists keyed by "prefix:name".
 * @param $rdf_properties
 *   Either an array of property names, or the first of several property
 *   names passed as separate arguments. They are tried in the given order.
 * @return
 *   The values of the first property found, with empty strings removed and
 *   keys preserved, or NULL when none of the properties has values.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  if (is_array($rdf_properties)) {
    $candidates = $rdf_properties;
  }
  else {
    // Called with the property names as separate arguments.
    $candidates = func_get_args();
    array_shift($candidates);
  }
  foreach ($candidates as $name) {
    if (!$name || empty($rdf_data[$name])) {
      continue;
    }
    // strlen() as the callback drops empty strings only.
    return array_filter($rdf_data[$name], 'strlen');
  }
}
/**
 * Fill a mapping object with the values found in the RDF data.
 *
 * @param $rdf_data
 *   Array of value lists keyed by "prefix:name".
 * @param $mappings
 *   Object whose properties name the RDF property (or list of candidate
 *   properties) to read; nested objects are resolved recursively. The object
 *   is modified in place.
 * @return
 *   The mapping object with every property replaced by the looked-up value:
 *   the single value when at most one was found, otherwise the lookup result
 *   as is.
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  $fields = get_object_vars($mappings);
  foreach ($fields as $field => $source) {
    if (is_object($source)) {
      // Nested mapping, e.g. the "options" sub-object.
      $mappings->{$field} = _parser_common_syndication_RDF10_item($rdf_data, $source);
      continue;
    }
    $found = _parser_common_syndication_RDF10_property($rdf_data, $source);
    if (is_array($found) && count($found) <= 1) {
      // Collapse single-element results to the bare value.
      $mappings->{$field} = reset($found);
    }
    else {
      $mappings->{$field} = $found;
    }
  }
  return (object) $mappings;
}
/**
 * Parse RSS2.0 feeds (also used for RSS 0.91 and 0.92).
 *
 * @param $feed_XML
 *   The SimpleXMLElement of the <rss> root.
 * @return
 *   An object with title, description, options->link and items; every item
 *   carries title, description and options (original_author, timestamp,
 *   original_url, guid, domains, tags).
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title.
  $parsed_source->title = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source->description = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  $parsed_source->options = new stdClass();
  // Detect the link.
  $parsed_source->options->link = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source->items = array();

  // The channel title stands in for the author of items without one.
  $channel_author = !empty($feed_XML->channel->title) ? "{$feed_XML->channel->title}" : '';
  $has_namespace_support = version_compare(phpversion(), '5.1.2', '>');

  $rss_items = $feed_XML->xpath('//item');
  if (!is_array($rss_items)) {
    $rss_items = array();
  }
  foreach ($rss_items as $news) {
    $categories = $news->xpath('category');
    // Get children for current namespace. Reset for every item so that the
    // content:encoded of a previous item can not leak into this one.
    $content = array();
    if ($has_namespace_support) {
      $content = (array) $news->children('http://purl.org/rss/1.0/modules/content/');
    }
    $news = (array) $news;

    $guid = isset($news['guid']) ? "{$news['guid']}" : NULL;
    $title = isset($news['title']) ? "{$news['title']}" : '';

    // $body stays NULL until a description is found; it must not keep the
    // value of the previous item.
    $body = NULL;
    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }
    // Some sources use content:encoded as description i.e. PostNuke PageSetter module.
    if (isset($news['encoded'])) {
      // content:encoded for PHP < 5.1.2.
      if (strlen((string) $body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    if (isset($content['encoded'])) {
      // content:encoded for PHP >= 5.1.2.
      if (strlen((string) $body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }
    if (!isset($body)) {
      $body = $title;
    }

    $original_author = !empty($news['author']) ? "{$news['author']}" : $channel_author;
    $original_url = !empty($news['link']) ? "{$news['link']}" : '';

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    if (is_array($categories)) {
      foreach ($categories as $category) {
        $additional_taxonomies['RSS Categories'][] = "{$category}";
        if (isset($category['domain'])) {
          $domain = "{$category['domain']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
              $additional_taxonomies['RSS Domains'][$domain] = array();
            }
            // Index of the category that was just appended.
            $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
          }
        }
      }
    }

    $item = new stdClass();
    $item->title = _parser_common_syndication_title($title, $body);
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->original_author = $original_author;
    if (isset($news['pubDate'])) {
      $item->options->timestamp = _parser_common_syndication_parse_date("{$news['pubDate']}");
    }
    else {
      $item->options->timestamp = time();
    }
    $item->options->original_url = trim($original_url);
    $item->options->guid = $guid;
    $item->options->domains = $additional_taxonomies['RSS Domains'];
    $item->options->tags = $additional_taxonomies['RSS Categories'];
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
/**
 * Locate the cache directory, trying to create it when it is missing.
 *
 * @return
 *   The path of the writable cache directory, or FALSE when there is none
 *   after the creation attempt.
 */
function _parser_common_syndication_sanitize_cache() {
  $directory = file_directory_path() . '/parser_common_syndication_cache';
  // Fast path: the directory is already there and usable.
  if (is_dir($directory) && is_writeable($directory)) {
    return $directory;
  }

  $directory = file_create_path($directory);
  $missing = !file_exists($directory);
  if ($missing && is_writable(file_directory_path())) {
    mkdir($directory);
  }
  return is_writeable($directory) ? $directory : FALSE;
}
/**
 * Convert a date string taken from a feed into a timestamp.
 *
 * @param $date_str
 *   The date string in various formats.
 * @return
 *   The timestamp of the string, or the current time if neither strtotime()
 *   nor the W3C date/time parser can handle it.
 */
function _parser_common_syndication_parse_date($date_str) {
  $timestamp = strtotime($date_str);
  // strtotime() signals failure with FALSE; -1 is treated as a failure too.
  $unparsed = ($timestamp === FALSE || $timestamp == -1);
  if ($unparsed) {
    $timestamp = _parser_common_syndication_parse_w3cdtf($date_str);
  }
  if ($timestamp === FALSE) {
    return time();
  }
  return $timestamp;
}
/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * Seconds are optional and may carry a decimal fraction, which is ignored.
 * A value without time zone designator is interpreted as GMT.
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  if (!is_scalar($date_str)) {
    return FALSE;
  }
  // Groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":ss[.fraction]",
  // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
  $pattern = '/(\\d{4})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2})(:(\\d{2})(?:\\.\\d+)?)?(?:([-+])(\\d{2}):?(\\d{2})|(Z))?/';
  $match = array();
  if (!preg_match($pattern, (string) $date_str, $match)) {
    return FALSE;
  }

  // Trailing groups that did not take part in the match are absent.
  $seconds = (isset($match[7]) && $match[7] !== '') ? (int) $match[7] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // Z is zulu time, aka GMT: nothing to adjust. Otherwise apply the offset
  // when one was given.
  $is_zulu = isset($match[11]) && $match[11] == 'Z';
  if (!$is_zulu && !empty($match[8])) {
    $offset_secs = ((int) $match[9] * 60 + (int) $match[10]) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }
  return $epoch;
}
/**
 * Extract the link that points to the original content (back to site or original article)
 *
 * The first link whose relation is "alternate" wins. A link without a rel
 * attribute counts as "alternate", as RFC 4287 requires. When there is no
 * such link, the href of the last link is returned.
 *
 * @param $links
 *   Array (or traversable list) of SimpleXML link elements.
 * @return
 *   The href of the selected link, or an empty string when there is no link
 *   or the selected link has no href.
 */
function _parser_common_syndication_link($links) {
  $to_link = '';
  // Anything that can not be iterated holds no links.
  if (!is_array($links) && !($links instanceof Traversable)) {
    return $to_link;
  }
  foreach ($links as $link) {
    $attributes = $link->attributes();
    $to_link = isset($attributes["href"]) ? "{$attributes["href"]}" : "";
    // A missing rel attribute means rel="alternate".
    $rel = isset($attributes["rel"]) ? "{$attributes["rel"]}" : 'alternate';
    if ($rel == 'alternate') {
      break;
    }
  }
  return $to_link;
}
/**
 * Prepare raw data to be a title.
 *
 * @param $title
 *   The title found in the feed; returned unchanged when it is not empty.
 * @param $body
 *   Optional body text used to build a title when $title is empty.
 * @return
 *   $title, or up to the first three words of $body joined by single spaces
 *   when $title is empty and $body is not.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Split on whitespace and commas, ignoring empty pieces so that leading
    // separators do not produce an empty first word. A limit of 4 keeps the
    // remainder of a long body in one piece, which is then dropped.
    $words = preg_split("/[\\s,]+/", $body, 4, PREG_SPLIT_NO_EMPTY);
    // Bodies with fewer than three words simply yield a shorter title.
    $title = is_array($words) ? implode(' ', array_slice($words, 0, 3)) : '';
  }
  return $title;
}
Functions
Name | Description |
---|---|
parser_common_syndication_nodeapi | Delete cache validating functions when feed is deleted |
_parser_common_syndication_atom10_parse | Parse atom feeds. |
_parser_common_syndication_cache_get | Get the cached version of the <var>$url</var> |
_parser_common_syndication_cache_set | Store the parsed feed into the cache |
_parser_common_syndication_download | Call one of the possible feedapi_get hook and pass back the downloaded data |
_parser_common_syndication_feedapi_get | Get the content from the given URL. |
_parser_common_syndication_feedapi_parse | Parse the feed into a data structure. |
_parser_common_syndication_feed_format_detect | Determine the feed format of a SimpleXML parsed object structure. |
_parser_common_syndication_link | Extract the link that points to the original content (back to site or original article) |
_parser_common_syndication_parse_date | Parse a date that comes from a feed. |
_parser_common_syndication_parse_w3cdtf | Parse the W3C date/time format, a subset of ISO 8601. |
_parser_common_syndication_RDF10_item | |
_parser_common_syndication_RDF10_parse | Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format. |
_parser_common_syndication_RDF10_property | |
_parser_common_syndication_RSS20_parse | Parse RSS2.0 feeds. |
_parser_common_syndication_sanitize_cache | Set the default caching directory if the current setting is not useable |
_parser_common_syndication_title | Prepare raw data to be a title |