You are here

function _parser_common_syndication_download in FeedAPI 5

Same name and namespace in other branches
  1. 6 parser_common_syndication/parser_common_syndication.inc \_parser_common_syndication_download()

Calls one of the available feedapi_get hooks and passes back the downloaded data.

Return value

string - the downloaded data; FALSE - if the URL is not reachable. If the download helper returns an object, that object is passed back unchanged.

2 calls to _parser_common_syndication_download()
parser_common_syndication_feedapi_feed in parser_common_syndication/parser_common_syndication.module
Implementation of hook_feedapi_feed().
_parser_common_syndication_feedapi_parse in parser_common_syndication/parser_common_syndication.module
Parse the feed into a data structure.

File

parser_common_syndication/parser_common_syndication.module, line 250
Parse the incoming URL with SimpleXML then provide a data structure of the feed. Requires PHP5 because of SimpleXML.

Code

/**
 * Download the feed at $url and pass back its (lightly filtered) content.
 *
 * The transfer itself is delegated to
 * _parser_common_syndication_feedapi_get(). When the response looks like an
 * HTML page, its <link> tags are scanned for a feed with an allowed mime type
 * and, if one is found, that feed is downloaded instead through a recursive
 * call (feed autodiscovery).
 *
 * @param $url
 *   The URL to download. If it passes valid_url() and carries credentials
 *   (user:pass@host), they are extracted and handed to the download helper.
 * @param $op
 *   Passed through unchanged to _parser_common_syndication_feedapi_get().
 *
 * @return
 *   string - the downloaded data, with script/object/embed/applet/noframes/
 *   noscript/noembed elements removed and "xml:base=" renamed to "base=";
 *   object - if the download helper returned an object, it is passed back
 *   as is;
 *   FALSE - if nothing could be downloaded.
 */
function _parser_common_syndication_download($url, $op) {
  // Initialise the credentials up front so they are defined even when the URL
  // does not pass validation.
  $password = $username = NULL;
  if (valid_url($url, TRUE)) {

    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      // A URL may carry a user name without a password.
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }
  $downloaded_string = _parser_common_syndication_feedapi_get($url, $username, $password, $op);

  // Cannot get the feed, pass the problem one level up.
  if ($downloaded_string == "") {
    return FALSE;
  }
  else {
    // The download helper may return an object; pass it back untouched.
    if (is_object($downloaded_string)) {
      return $downloaded_string;
    }
  }

  // Do the autodiscovery at this level, pass back the real data.
  // If the content is not HTML, it is not worth looking for <link> tags.
  if (stripos($downloaded_string, "<html") !== FALSE) {
    $allowed_mime = array(
      "text/xml",
      "application/rss+xml",
      "application/atom+xml",
      "application/rdf+xml",
      "application/xml",
    );
    $matches = array();

    // Collect the attribute part of every <link> tag.
    preg_match_all('/<link\\s+(.*?)\\s*\\/?>/si', $downloaded_string, $matches);
    $links = $matches[1];
    foreach ($links as $link) {
      $mime = array();

      // Get the type attribute and check if the mime type is allowed.
      preg_match_all('/type\\s*=\\s*("|' . "'" . ')([A-Za-z\\/+]*)("|' . "'" . ')/si', $link, $mime);
      if (in_array(array_pop($mime[2]), $allowed_mime, TRUE)) {
        $href = array();

        // Get the href attribute.
        // NOTE(review): the character class below does not include '-', '%',
        // '&' or '~', so hrefs containing them are not matched -- confirm
        // whether that is intended.
        preg_match_all('/href\\s*=\\s*("|' . "'" . ')([=#\\?_:.0-9A-Za-z\\/+]*)("|' . "'" . ')/si', $link, $href);
        $rss_link = array_pop($href[2]);
        // Only a direct self-reference is excluded here.
        if (is_string($rss_link) && strlen($rss_link) > 0 && $rss_link != $url) {

          // Handle base url related stuff.
          $parsed_url = parse_url($rss_link);
          if (!isset($parsed_url['host'])) {

            // It's relative so make it absolute. The <base> tag lives in the
            // document, not inside the <link> tag, so search the whole page.
            $base_tag = array();
            preg_match_all('/<base href\\s*=\\s*("|' . "'" . ')([_:.0-9A-Za-z\\/+]*)("|' . "'" . ')/si', $downloaded_string, $base_tag);
            $base_url = array_pop($base_tag[2]);
            if (is_string($base_url) && strlen($base_url) > 0) {

              // Get from the HTML base tag.
              $rss_link = $base_url . $rss_link;
            }
            else {

              // Guess from the original URL.
              $original_url = parse_url($url);
              if (isset($original_url['scheme']) && isset($original_url['host'])) {
                $original_path = isset($original_url['path']) ? $original_url['path'] : '/';
                $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
                if ($path === '') {
                  // Query- or fragment-only reference: keep the page's path.
                  $path = $original_path;
                }
                elseif ($path[0] !== '/') {
                  // Path relative to the directory of the original URL.
                  $last_slash = strrpos($original_path, '/');
                  $directory = ($last_slash === FALSE) ? '/' : substr($original_path, 0, $last_slash + 1);
                  $path = $directory . $path;
                }
                // Only emit the optional components that are really present,
                // otherwise stray ':', '?' and '#' end up in the URL.
                $rss_link = $original_url['scheme'] . '://' . $original_url['host']
                  . (isset($original_url['port']) ? ':' . $original_url['port'] : '')
                  . $path
                  . (isset($parsed_url['query']) ? '?' . $parsed_url['query'] : '')
                  . (isset($parsed_url['fragment']) ? '#' . $parsed_url['fragment'] : '');
              }
            }
          }
          $downloaded_string = _parser_common_syndication_download($rss_link, $op);
          // The recursive call reports failure as FALSE and may pass back an
          // object; neither may be run through the string filters below.
          if (!is_string($downloaded_string)) {
            return $downloaded_string;
          }
          break;
        }
      }
    }
  }

  // Filter out strange tags. Without this, the text would contain strange stuff.
  // @todo: make sure that these are not important for feed element mapper
  $downloaded_string = preg_replace(array(
    '@<script[^>]*?.*?</script>@si',
    '@<object[^>]*?.*?</object>@si',
    '@<embed[^>]*?.*?</embed>@si',
    '@<applet[^>]*?.*?</applet>@si',
    '@<noframes[^>]*?.*?</noframes>@si',
    '@<noscript[^>]*?.*?</noscript>@si',
    '@<noembed[^>]*?.*?</noembed>@si',
  ), '', $downloaded_string);

  // Ugly hack to be able to retrieve the xml:base property, no way to access xml:lang inside <feed>
  $downloaded_string = preg_replace('/xml:base *=/', 'base=', $downloaded_string);
  return $downloaded_string;
}