You are here

function _parser_common_syndication_download in FeedAPI 6

Same name and namespace in other branches
  1. 5 parser_common_syndication/parser_common_syndication.module \_parser_common_syndication_download()

Call one of the available feedapi_get hooks and pass back the downloaded data.

Return value

string - the downloaded data; FALSE - if the URL is not reachable. If the download function returns an object, that object is passed back unchanged.

2 calls to _parser_common_syndication_download()
parser_common_syndication_feedapi_feed in parser_common_syndication/parser_common_syndication.module
Implementation of hook_feedapi_feed().
_parser_common_syndication_feedapi_parse in parser_common_syndication/parser_common_syndication.inc
Parse the feed into a data structure.

File

parser_common_syndication/parser_common_syndication.inc, line 250
Downloading and parsing functions for Common Syndication Parser

Code

/**
 * Download a feed, following HTML feed autodiscovery when necessary.
 *
 * The data is fetched with _parser_common_syndication_feedapi_get(). When the
 * response looks like an HTML page, the first <link> tag that advertises a
 * feed MIME type is resolved to an absolute URL and downloaded instead.
 *
 * @param $url
 *   The URL to download. Credentials embedded in the URL
 *   (scheme://user:pass@host/) are extracted and handed to the download
 *   function.
 * @param $settings
 *   Optional array of settings. Only the 'accept_invalid_cert' key is read
 *   here; the array is also forwarded to the autodiscovery download.
 *
 * @return
 *   - A string: the downloaded data with "xml:base=" rewritten to "base=" and
 *     script/object/embed/applet/noframes/noscript/noembed elements removed.
 *   - FALSE if nothing could be downloaded.
 *   - Whatever object the download function returned, passed back unchanged.
 */
function _parser_common_syndication_download($url, $settings = NULL) {
  // Define the credentials up front so they exist even when the URL does not
  // validate and the block below is skipped.
  $username = $password = NULL;
  if (valid_url($url, TRUE)) {

    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      // A URL may carry a user name without a password.
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }
  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $downloaded_string = _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);

  // Cannot get the feed, pass the problem to one level up.
  if ($downloaded_string == FALSE) {
    return FALSE;
  }

  // Objects are handed back untouched; the string processing below does not
  // apply to them.
  if (is_object($downloaded_string)) {
    return $downloaded_string;
  }

  // Do the autodiscovery at this level, pass back the real data.
  // Maybe it's HTML. If it's not HTML, not worth to take a look into the downloaded string.
  if (strpos(strtolower($downloaded_string), "<html") !== FALSE) {
    $allowed_mime = array(
      "text/xml",
      "application/rss+xml",
      "application/atom+xml",
      "application/rdf+xml",
      "application/xml",
    );
    $matches = array();

    // Get all the links tag
    preg_match_all('/<link\\s+(.*?)\\s*\\/?>/si', $downloaded_string, $matches);
    foreach ($matches[1] as $link) {
      $mime = array();

      // Get the type attribute and check if the mime type is allowed.
      // MIME types are case-insensitive, so compare in lower case.
      preg_match_all('/type\\s*=\\s*("|\')([A-Za-z\\/+]*)("|\')/si', $link, $mime);
      $type = array_pop($mime[2]);
      if (!is_string($type) || !in_array(strtolower($type), $allowed_mime, TRUE)) {
        continue;
      }

      // Get the href attribute. The character class also accepts "-", "%",
      // "&", ";", "~" and ",", which are common in feed URLs.
      $href = array();
      preg_match_all('/href\\s*=\\s*("|\')([-=#\\?&;%~,_:.0-9A-Za-z\\/+]*)("|\')/si', $link, $href);
      $rss_link = array_pop($href[2]);
      if (!is_string($rss_link) || strlen($rss_link) == 0) {
        continue;
      }

      // Ampersands are entity-encoded inside HTML attributes.
      $rss_link = str_replace('&amp;', '&', $rss_link);
      if ($rss_link == $url) {
        continue;
      }

      // Handle base url related stuff.
      $parsed_url = parse_url($rss_link);
      if (!is_array($parsed_url)) {
        continue;
      }
      if (!isset($parsed_url['host'])) {

        // It's relative so make it absolute. Resolve against the original
        // URL unless the document declares an absolute <base href>. The base
        // tag has to be searched in the whole document, not in the <link>
        // attributes.
        $base = parse_url($url);
        $base_tag = array();
        preg_match_all('/<base\\s+[^>]*?href\\s*=\\s*("|\')([-%~_:.0-9A-Za-z\\/+]*)("|\')/si', $downloaded_string, $base_tag);
        $base_href = array_pop($base_tag[2]);
        if (is_string($base_href) && strlen($base_href) > 0) {
          $base_parts = parse_url($base_href);
          if (is_array($base_parts) && isset($base_parts['scheme']) && isset($base_parts['host'])) {
            $base = $base_parts;
          }
        }

        // Without a scheme and a host there is nothing to resolve against.
        if (!is_array($base) || !isset($base['scheme']) || !isset($base['host'])) {
          continue;
        }

        $base_path = isset($base['path']) ? $base['path'] : '/';
        if (!isset($parsed_url['path']) || $parsed_url['path'] === '') {
          // Query-only or fragment-only reference: keep the base path.
          $path = $base_path;
        }
        elseif (strpos($parsed_url['path'], '/') === 0) {
          // Host-relative reference.
          $path = $parsed_url['path'];
        }
        else {
          // Path-relative reference: resolve against the base directory.
          $slash = strrpos($base_path, '/');
          $path = ($slash === FALSE ? '/' : substr($base_path, 0, $slash + 1)) . $parsed_url['path'];
        }

        // Only append the optional components that are actually present.
        $rss_link = $base['scheme'] . '://' . $base['host'];
        if (isset($base['port'])) {
          $rss_link .= ':' . $base['port'];
        }
        $rss_link .= $path;
        if (isset($parsed_url['query'])) {
          $rss_link .= '?' . $parsed_url['query'];
        }
        if (isset($parsed_url['fragment'])) {
          $rss_link .= '#' . $parsed_url['fragment'];
        }

        // A relative link may resolve to the page itself; following it would
        // recurse without end.
        if ($rss_link == $url) {
          continue;
        }
      }

      // Forward the settings so that e.g. accept_invalid_cert also applies
      // to the discovered feed.
      $discovered = _parser_common_syndication_download($rss_link, $settings);
      if (!is_string($discovered)) {
        // FALSE (download failed) or an object: hand it back as it is rather
        // than running the string filters on it.
        return $discovered;
      }
      $downloaded_string = $discovered;
      break;
    }
  }

  // Ugly hack to be able to retrieve the xml:base property, no way to access xml:lang inside <feed>
  // preg_replace() returns NULL on error; keep the unmodified data then.
  $rebased = preg_replace('/xml:base *=/', 'base=', $downloaded_string);
  if (is_string($rebased)) {
    $downloaded_string = $rebased;
  }

  // Filter out strange tags. Without this, the text would contain strange stuff.
  // @todo: make sure that these are not important for feed element mapper
  $downloaded_string_filtered = preg_replace(array(
    '@<script[^>]*?.*?</script>@si',
    '@<object[^>]*?.*?</object>@si',
    '@<embed[^>]*?.*?</embed>@si',
    '@<applet[^>]*?.*?</applet>@si',
    '@<noframes[^>]*?.*?</noframes>@si',
    '@<noscript[^>]*?.*?</noscript>@si',
    '@<noembed[^>]*?.*?</noembed>@si',
  ), '', $downloaded_string);

  // Fall back to the unfiltered data when filtering failed or emptied it.
  return empty($downloaded_string_filtered) ? $downloaded_string : $downloaded_string_filtered;
}