/**
 * @file
 * Downloading and parsing functions for Common Syndication Parser.
 *
 * Adapted from the FeedAPI common syndication parser.
 *
 * @todo Restructure. OO could work wonders here.
 * @todo Write unit tests.
 * @todo Keep in Feeds project or host on Drupal?
 */

/**
 * Parse the feed into a data structure.
 *
 * @param string $string
 *   The raw feed document as an XML string.
 *
 * @return array|false
 *   The structured data extracted from the feed, or FALSE on failure.
 */
function common_syndication_parser_parse($string) {
  // SimpleXML can only deal with XML declaration at the start of the document,
  // so remove any surrounding whitespace.
  $string = trim($string);

  // An empty document can never be a feed; bail out before involving libxml.
  if ($string === '') {
    return FALSE;
  }

  // Collect libxml errors internally rather than silencing the call with "@":
  // malformed feeds stay quiet while unrelated PHP errors remain visible.
  $previous_setting = libxml_use_internal_errors(TRUE);
  $xml = simplexml_load_string($string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);
  libxml_clear_errors();
  libxml_use_internal_errors($previous_setting);

  // Got a malformed XML.
  if ($xml === FALSE || is_null($xml)) {
    return FALSE;
  }

  // Dispatch to the parser matching the detected format. The RSS 0.91 and
  // 0.92 flavours are handled by the RSS 2.0 parser.
  switch (_parser_common_syndication_feed_format_detect($xml)) {
    case 'atom1.0':
      return _parser_common_syndication_atom10_parse($xml);

    case 'RSS2.0':
    case 'RSS0.91':
    case 'RSS0.92':
      return _parser_common_syndication_RSS20_parse($xml);

    case 'RDF':
      return _parser_common_syndication_RDF10_parse($xml);
  }

  // Unknown or unsupported format.
  return FALSE;
}

/**
 * Determine the feed format of a SimpleXML parsed object structure.
 *
 * @param SimpleXMLElement $xml
 *   SimpleXML-preprocessed feed.
 *
 * @return string|false
 *   The feed format short description or FALSE if not compatible.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }

  // The root element name identifies the format family; RSS flavours are
  // further told apart by the root element's "version" attribute.
  $attr = $xml->attributes();
  $type = strtolower($xml->getName());
  $version = isset($attr['version']) ? (string) $attr['version'] : '';

  if ($type == 'feed' && isset($xml->entry)) {
    return 'atom1.0';
  }

  if ($type == 'rdf' && isset($xml->channel)) {
    return 'RDF';
  }

  if ($type == 'rss') {
    $rss_formats = array(
      '2.0' => 'RSS2.0',
      '0.91' => 'RSS0.91',
      '0.92' => 'RSS0.92',
    );
    foreach ($rss_formats as $number => $format) {
      // Loose comparison on purpose: keeps the tolerance of the historical
      // check (e.g. version="2.00" still counts as 2.0).
      if ($version == $number) {
        return $format;
      }
    }
  }

  return FALSE;
}

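/**
 * Parse Atom 1.0 feeds.
 *
 * @param SimpleXMLElement $feed_XML
 *   The SimpleXML-parsed feed document.
 *
 * @return array
 *   The parsed feed with the keys 'title', 'description', 'link' and 'items'.
 */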
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = array();

  // NOTE(review): the namespace URI was blank in the reviewed source; it is
  // restored here to the GeoRSS-Simple namespace - confirm against upstream.
  $ns = array(
    "georss" => "http://www.georss.org/georss",
  );

  // Base URL used to absolutize relative links found in the document.
  $base = _parser_common_syndication_atom10_parse_base_url($feed_XML);

  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";

  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";

  // Detect the link; a valid but relative link is prefixed with the base URL.
  $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
  if ($base && !valid_url($parsed_source['link'], TRUE) && valid_url($parsed_source['link'])) {
    $parsed_source['link'] = $base . $parsed_source['link'];
  }

  $parsed_source['items'] = array();

  foreach ($feed_XML->entry as $news) {
    // GeoRSS children of the entry, keyed by element name.
    $georss = (array) $news->children($ns["georss"]);

    $geoname = '';
    if (isset($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    $latlon = $lat = $lon = NULL;
    if (isset($georss['point'])) {
      // A georss:point holds "latitude longitude" separated by a space.
      $latlon = explode(' ', $georss['point']);
      $lat = "{$latlon[0]}";
      $lon = "{$latlon[1]}";
      if (!$geoname) {
        $geoname = "{$lat} {$lon}";
      }
    }

    $additional_taxonomies = array();
    if (isset($news->category)) {
      $additional_taxonomies['ATOM Categories'] = array();
      $additional_taxonomies['ATOM Domains'] = array();
      foreach ($news->category as $category) {
        // Store the term first so that the domain entry below references the
        // index of this very category rather than the previous one.
        $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
              $additional_taxonomies['ATOM Domains'][$domain] = array();
            }
            $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // Build the body from <content>, falling back to <summary>. Child elements
    // are serialized as markup, followed by the element's own text.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    // Author precedence: entry source, then the entry itself, then the feed.
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    elseif (!empty($feed_XML->author->name)) {
      $original_author = "{$feed_XML->author->name}";
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;

    // Fall back to updated for timestamp if both published and issued are
    // empty.
    if (isset($news->published)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->published}");
    }
    elseif (isset($news->issued)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->issued}");
    }
    elseif (isset($news->updated)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
    }

    $item['guid'] = (string) $news->id;

    // Resolve the item URL: link element, then content src, then the GUID.
    $item['url'] = _parser_common_syndication_link($news->link);
    if (!$item['url'] && !empty($news->content['src']) && valid_url($news->content['src'], TRUE)) {
      $item['url'] = (string) $news->content['src'];
    }
    if (!strlen($item['url']) && $item['guid'] && valid_url($item['guid'], TRUE)) {
      $item['url'] = $item['guid'];
    }

    // A valid but relative URL is prefixed with the entry's own base URL when
    // it has one, otherwise with the feed's base URL.
    if (!valid_url($item['url'], TRUE) && valid_url($item['url'])) {
      if ($item_base = _parser_common_syndication_atom10_parse_base_url($news)) {
        $item['url'] = $item_base . $item['url'];
      }
      elseif ($base) {
        $item['url'] = $base . $item['url'];
      }
    }

    // Fall back on URL if GUID is empty.
    if (!strlen($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    $item['geolocations'] = array();
    if ($lat && $lon) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['tags'] = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
    $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}

/**
 * Finds the base URL of an Atom document.
 *
 * @param SimpleXMLElement $xml
 *   The XML document.
 *
 * @return string|false
 *   Returns the base URL or FALSE on failure.
 */
function _parser_common_syndication_atom10_parse_base_url(SimpleXMLElement $xml) {
  // Prefer an explicit xml:base attribute, then an un-namespaced "base" one.
  $base = $xml->attributes('xml', TRUE)->base;
  if (!$base) {
    $base = $xml['base'];
  }

  // Work on a plain string from here on instead of a SimpleXMLElement.
  $base = (string) $base;
  if ($base && valid_url($base, TRUE)) {
    // Normalize to exactly one trailing slash.
    return rtrim($base, '/') . '/';
  }

  // Otherwise try to build a base from the self link, then from the
  // alternate link.
  foreach (array('self', 'alternate') as $rel) {
    $links = $xml->xpath('*[local-name() = "link" and @rel="' . $rel . '" and @href]');
    if (!is_array($links)) {
      // xpath() does not return an array on failure; nothing to inspect.
      continue;
    }
    foreach ($links as $link) {
      $href = (string) $link['href'];
      if (valid_url($href, TRUE)) {
        return _parser_common_syndication_string_url_path($href);
      }
    }
  }

  return FALSE;
}

/**
 * Removes the path parts of an absolute URL.
 *
 * @param string $url
 *   The absolute URL.
 *
 * @return string
 *   The absolute URL with the path stripped, ending in a slash.
 */
function _parser_common_syndication_string_url_path($url) {
  // Skip past the "//" that follows the scheme. When there is none, searching
  // starts at offset 2, exactly as the historical FALSE + 2 arithmetic did.
  $scheme_separator = strpos($url, '//');
  $offset = ($scheme_separator === FALSE ? 0 : $scheme_separator) + 2;

  // An offset beyond the end of the string makes strpos() throw on PHP 8;
  // such a URL cannot contain a path anyway.
  if ($offset > strlen($url)) {
    return $url . '/';
  }

  // Cut right after the first slash following the host, or append a slash
  // when the URL has no path at all.
  $pos = strpos($url, '/', $offset);
  return $pos ? substr($url, 0, $pos + 1) : $url . '/';
}

/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces.
  // NOTE(review): the URIs were blank in the reviewed source and have been
  // restored from the corresponding vocabularies - confirm against upstream.
  static $canonical_namespaces = array(
    'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi' => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd' => 'http://www.w3.org/2001/XMLSchema#',
    'owl' => 'http://www.w3.org/2002/07/owl#',
    'dc' => 'http://purl.org/dc/elements/1.1/',
    'dcterms' => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf' => 'http://xmlns.com/foaf/0.1/',
    'rss' => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element.
  $namespaces = $feed_XML->getNamespaces(TRUE);

  // Map every namespace URI used by the feed to the prefix under which its
  // properties are stored. Prefixes are normalized to the well-known
  // 'standard' ones where possible, as the feed may in principle use any
  // arbitrary prefixes and we should still be able to correctly handle it.
  $prefixes = array();
  foreach ($namespaces as $ns => $ns_uri) {
    $canonical_prefix = array_search($ns_uri, $canonical_namespaces);
    $prefixes[$ns_uri] = $canonical_prefix ? $canonical_prefix : $ns;
  }

  // Start from an empty result so that a document without an <rss:channel>
  // still yields a well-formed structure.
  $parsed_source = array(
    'title' => '',
    'description' => '',
    'link' => '',
    'items' => array(),
  );

  // Process the <rss:channel> resource containing feed metadata:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source = array(
      'title' => _parser_common_syndication_title((string) $rss_channel->title),
      'description' => (string) $rss_channel->description,
      'link' => (string) $rss_channel->link,
      'items' => array(),
    );
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($prefixes as $ns_uri => $ns_prefix) {
      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the
    // result object. For each key the first property that has data wins.
    // NOTE(review): the property lists of the first four mappings were
    // truncated in the reviewed source and are restored here - confirm.
    $item = _parser_common_syndication_RDF10_item($rdf_data, array(
      'title' => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'url' => array('rss:link', 'rdf:about'),
      'author_name' => array('dc:creator', 'dc:publisher'),
      'guid' => 'rdf:about',
      'timestamp' => 'dc:date',
      'tags' => 'dc:subject',
    ));

    // Special handling for the title:
    $item['title'] = _parser_common_syndication_title($item['title'], $item['description']);

    // Parse any date/time values into Unix timestamps:
    $item['timestamp'] = _parser_common_syndication_parse_date($item['timestamp']);

    // If no GUID found, use the URL of the feed.
    if (empty($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    // Add every found RDF property to the feed item.
    $item['rdf'] = array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      // Looks nicer in the mapper UI.
      // @todo Revisit, not used with feedapi mapper anymore.
      $rdf_property = str_replace(':', '_', $rdf_property);
      $item['rdf'][$rdf_property] = $rdf_value;
    }

    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}

/**
 * Returns the values of the first listed RDF property that has data.
 *
 * @param array $rdf_data
 *   Property values keyed by property name; each entry is a list of strings.
 * @param array|string $rdf_properties
 *   Property names to try in order. Instead of an array, the names may also
 *   be passed as separate arguments.
 *
 * @return array|null
 *   The non-empty string values of the first matching property (original
 *   keys preserved), or NULL when none of the properties has data.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  // Support the variadic calling style: f($data, 'a', 'b').
  if (!is_array($rdf_properties)) {
    $rdf_properties = array_slice(func_get_args(), 1);
  }

  foreach ($rdf_properties as $rdf_property) {
    if ($rdf_property && !empty($rdf_data[$rdf_property])) {
      // Remove empty strings.
      return array_filter($rdf_data[$rdf_property], 'strlen');
    }
  }

  return NULL;
}

/**
 * Builds a feed item from RDF data according to a set of mappings.
 *
 * @param array $rdf_data
 *   Property values keyed by property name; each entry is a list of strings.
 * @param array $mappings
 *   Item keys mapped to a property name or to a list of candidate property
 *   names, as accepted by _parser_common_syndication_RDF10_property().
 *
 * @return array
 *   The item keys mapped to the resolved value: the property lookup result
 *   itself when it is not an array or holds several values, otherwise the
 *   first element of the result array.
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  $item = array();
  foreach ($mappings as $key => $rdf_properties) {
    $values = _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties);
    if (!is_array($values) || count($values) > 1) {
      // No match, or several values: keep the lookup result as it is.
      $item[$key] = $values;
    }
    else {
      // Collapse a single-value list to its only element.
      $item[$key] = reset($values);
    }
  }
  return $item;
}

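/**
 * Parse RSS 2.0 feeds.
 *
 * @param SimpleXMLElement $feed_XML
 *   The SimpleXML-parsed feed document.
 *
 * @return array
 *   The parsed feed with the keys 'title', 'description', 'link' and 'items'.
 */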
function _parser_common_syndication_RSS20_parse($feed_XML) {
  // NOTE(review): the namespace URIs were blank in the reviewed source; they
  // are restored here to the usual module namespaces - confirm upstream.
  $ns = array(
    "content" => "http://purl.org/rss/1.0/modules/content/",
    "dc" => "http://purl.org/dc/elements/1.1/",
    "georss" => "http://www.georss.org/georss",
  );

  $parsed_source = array();

  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";

  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";

  // Detect the link.
  $parsed_source['link'] = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";

  $parsed_source['items'] = array();

  // xpath() does not return an array on failure; treat that as "no items".
  $news_items = $feed_XML->xpath('//item');
  if (!is_array($news_items)) {
    $news_items = array();
  }

  foreach ($news_items as $news) {
    $title = $body = $original_author = $original_url = $guid = '';

    // Get optional source url.
    $source_url = (string) $news->source['url'];

    // Keep the category elements as objects: their "domain" attribute is
    // lost once the item is cast to an array below.
    $category = $news->xpath('category');

    // Get children for current namespace.
    $content = (array) $news->children($ns["content"]);
    $dc = (array) $news->children($ns["dc"]);
    $georss = (array) $news->children($ns["georss"]);

    $news = (array) $news;
    $news['category'] = $category;

    if (isset($news['title'])) {
      $title = "{$news['title']}";
    }

    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }

    // Some sources use content:encoded as description i.e.
    // PostNuke PageSetter module. The longer of the two texts wins.
    // content:encoded for PHP < 5.1.2.
    if (isset($news['encoded'])) {
      if (strlen($body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }

    // content:encoded for PHP >= 5.1.2.
    if (isset($content['encoded'])) {
      if (strlen($body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }

    // NOTE(review): a former "!isset($body)" fallback to the title could
    // never fire because $body is always initialized above. It was dropped,
    // so the description stays empty when the feed provides none.

    if (!empty($news['author'])) {
      $original_author = "{$news['author']}";
    }
    elseif (!empty($dc["creator"])) {
      $original_author = (string) $dc["creator"];
    }

    // The link doubles as GUID unless an explicit <guid> is present.
    if (!empty($news['link'])) {
      $original_url = "{$news['link']}";
      $guid = $original_url;
    }

    if (!empty($news['guid'])) {
      $guid = "{$news['guid']}";
    }

    // Initialize the geo variables BEFORE reading georss:featureName, so the
    // feature name is no longer wiped out right after being read.
    $lat = $lon = $latlon = $geoname = NULL;
    if (!empty($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }
    if (!empty($georss['point'])) {
      // A georss:point holds "latitude longitude" separated by a space.
      $latlon = explode(' ', $georss['point']);
      $lat = "{$latlon[0]}";
      $lon = "{$latlon[1]}";
      if (!$geoname) {
        $geoname = "{$lat} {$lon}";
      }
    }

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    if (isset($news['category']) && is_array($news['category'])) {
      foreach ($news['category'] as $category) {
        $additional_taxonomies['RSS Categories'][] = "{$category}";
        if (isset($category['domain'])) {
          $domain = "{$category['domain']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
              $additional_taxonomies['RSS Domains'][$domain] = array();
            }
            // Reference the category just added by its index.
            $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
          }
        }
      }
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;

    // Timestamp precedence: pubDate, then dc:date, then the current time.
    if (!empty($news['pubDate'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($news['pubDate']);
    }
    elseif (!empty($dc['date'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($dc['date']);
    }
    else {
      $item['timestamp'] = time();
    }

    $item['url'] = trim($original_url);
    $item['guid'] = $guid;

    if (!empty($news['source'])) {
      $item['source:title'] = $news['source'];
    }
    else {
      $item['source:title'] = NULL;
    }
    $item['source:url'] = trim($source_url);

    $item['geolocations'] = array();
    if (isset($geoname, $lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['domains'] = $additional_taxonomies['RSS Domains'];
    $item['tags'] = $additional_taxonomies['RSS Categories'];
    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}

/**
 * Parse a date that comes from a feed.
 *
 * @param string $date_str
 *   The date string in various formats.
 *
 * @return int
 *   The timestamp of the string or the current time if it can't be parsed.
 */
function _parser_common_syndication_parse_date($date_str) {
  // PHP < 5.3 doesn't like the GMT- notation for parsing timezones.
  $date_str = str_replace(array('GMT-', 'GMT+'), array('-', '+'), $date_str);

  // First attempt: PHP's generic date parser.
  $parsed_date = strtotime($date_str);

  // Second attempt: the W3C date/time format.
  if ($parsed_date === FALSE || $parsed_date == -1) {
    $parsed_date = _parser_common_syndication_parse_w3cdtf($date_str);
  }

  if ($parsed_date === FALSE || $parsed_date == -1) {
    // PHP does not support the UT timezone. Fake it. The system that generated
    // this, Google Groups, probably meant UTC.
    $date_str = strtolower(trim($date_str));
    if (substr($date_str, -3) == ' ut') {
      // Appending "c" turns the trailing "ut" into "utc".
      $parsed_date = strtotime($date_str . 'c');
    }
  }

  // Give up gracefully: unparsable dates become "now".
  return $parsed_date === FALSE ? time() : $parsed_date;
}

/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * PHP date parsing functions do not handle this format.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * @param string $date_str
 *   A potentially W3C DTF date.
 *
 * @return int|false
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":seconds",
  // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
  // Trailing optional groups that did not take part in the match are absent
  // from $match, hence the isset() checks below.
  if (!preg_match('/(\\d{4})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2})(:(\\d{2}))?(?:([-+])(\\d{2}):?(\\d{2})|(Z))?/', $date_str, $match)) {
    return FALSE;
  }

  $year = (int) $match[1];
  $month = (int) $match[2];
  $day = (int) $match[3];
  $hours = (int) $match[4];
  $minutes = (int) $match[5];
  // Seconds are optional in the format.
  $seconds = (isset($match[7]) && $match[7] !== '') ? (int) $match[7] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime($hours, $minutes, $seconds, $month, $day, $year);

  // Only an explicit numeric offset shifts the result. "Z" (zulu time, aka
  // GMT) and a missing designator both leave the GMT epoch untouched.
  if (isset($match[8]) && $match[8] !== '') {
    $tz_mod = $match[8];
    $tz_hour = isset($match[9]) ? (int) $match[9] : 0;
    $tz_min = isset($match[10]) ? (int) $match[10] : 0;
    $offset_secs = ($tz_hour * 60 + $tz_min) * 60;

    // Is timezone ahead of GMT?  If yes, subtract offset.
    if ($tz_mod == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}

/**
 * Extract the link that points to the original content.
 *
 * That is the link back to the site or to the original article.
 *
 * @param SimpleXMLElement|array $links
 *   The link elements to inspect, as SimpleXML objects.
 *
 * @return string
 *   A URL if found. An empty string otherwise.
 */
function _parser_common_syndication_link($links) {
  $to_link = '';

  // Nothing to iterate over (e.g. NULL): there is no link.
  if (!is_object($links) && !is_array($links)) {
    return $to_link;
  }

  foreach ($links as $link) {
    $attributes = $link->attributes();
    $to_link = isset($attributes["href"]) ? "{$attributes["href"]}" : "";

    // A rel="alternate" link is the one pointing at the original content, so
    // stop there. Without such a link the last link's href is used.
    if (isset($attributes["rel"]) && "{$attributes["rel"]}" == 'alternate') {
      break;
    }
  }

  return trim($to_link);
}

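/**
 * Prepare raw data to be a title.
 *
 * @param string $title
 *   The raw title.
 * @param string|false $body
 *   Optional body text used to derive a title when $title is empty.
 *
 * @return string
 *   The given title, or the first three words of the body when it is empty.
 */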
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Explode to words and use the first 3 words. Empty tokens are skipped so
    // that leading whitespace or commas in the body do not eat up a word slot.
    $words = preg_split('/[\\s,]+/', strip_tags($body), -1, PREG_SPLIT_NO_EMPTY);
    $title = implode(' ', array_slice($words, 0, 3));
  }
  return $title;
}


