You are here

cf_http.module in Common Functionality 7

Same filename and directory in other branches
  1. 7.2 modules/cf_http/cf_http.module

File

modules/cf_http/cf_http.module
View source
<?php

/**
 * Implements hook_init().
 *
 * Attaches the module's html header stylesheet to the page. A static flag
 * makes sure the stylesheet is added at most once per request.
 */
function cf_http_init() {
  static $stylesheet_added = FALSE;
  if ($stylesheet_added) {
    return;
  }
  $module_path = drupal_get_path('module', 'cf_http');
  drupal_add_css($module_path . '/includes/cf_http_html_headers.css');
  $stylesheet_added = TRUE;
}

/**
 * Reads an http page at the given path and returns an unprocessed response.
 *
 * Why:
 *   Custom php scripts need a straight-forward and easy way to pull data from
 *   another website.
 *   This is useful as an alternative to iframes: remote content can still be
 *   shown when the remote url is down (via caching), at the cost of having to
 *   process, update, and possibly cache remote images and links manually.
 *
 * @param string $server
 *   Hostname or ip address of the server.
 *   Should not contain http:// or similar prefixes. Any 'word://' prefix is
 *   passed to fsockopen() as given but is removed from the Host header.
 * @param string $path
 *   The file/path on the server to request.
 * @param int $port
 *   (optional) port number of the page to read (defaults to 80).
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return array
 *   An array containing the connection status and return http response.
 *   The array keys:
 *   - connected: A boolean with TRUE representing that the connection to the
 *     server was established and the response was read to the end without
 *     timing out, FALSE otherwise.
 *   - response: The http response as returned by the target server.
 *     This http response must be processed. It may be partial when
 *     'connected' is FALSE because of a read timeout.
 *   - is_local: The value of stream_is_local() for the opened socket.
 */
function cf_http_get_response($server, $path, $port = 80, array $function_history = array()) {
  cf_error_append_history($function_history, __FUNCTION__);
  $results = array(
    'connected' => FALSE,
    'response' => '',
    'is_local' => FALSE,
  );
  if (cf_is_empty_or_non_string($function_history, 'server', $server, WATCHDOG_ERROR)) {
    return $results;
  }
  if (cf_is_empty_or_non_string($function_history, 'path', $path, WATCHDOG_ERROR)) {
    return $results;
  }
  if (!is_numeric($port)) {
    cf_error_not_numeric($function_history, 'port');
    return $results;
  }

  // Work out the Host header value before connecting, so that a failure here
  // cannot leave an open socket behind.
  $server_address = preg_replace('@^\\w+://@i', '', $server);
  if (!is_string($server_address)) {
    return $results;
  }

  $fp = fsockopen($server, $port, $errno, $errstr, 8);
  if (!$fp) {
    return $results;
  }

  // NOTE(review): 'Accept-Encoding: deflate' invites a compressed body, but
  // nothing in this module inflates it - confirm that callers handle that.
  $request = 'GET ' . $path . ' HTTP/1.1' . "\r\n" . 'Host: ' . $server_address . "\r\n" . 'Accept-Encoding: deflate' . "\r\n" . 'Connection: Close' . "\r\n\r\n";
  if (fwrite($fp, $request) === FALSE) {
    fclose($fp);
    return $results;
  }
  stream_set_timeout($fp, 4);
  $results['is_local'] = stream_is_local($fp);

  $timed_out = FALSE;
  while (!feof($fp)) {
    $line = fgets($fp, 8192);
    if ($line !== FALSE) {
      $results['response'] .= $line;
    }
    $info = stream_get_meta_data($fp);
    if ($info['timed_out']) {
      $timed_out = TRUE;
      break;
    }
  }
  fclose($fp);

  // A timed out read leaves 'connected' FALSE; the partial response is still
  // handed back to the caller.
  $results['connected'] = !$timed_out;
  return $results;
}

/**
 * Validate http responses by checking the status line.
 *
 * Originally From: http://php.net/manual/en/function.fsockopen.php#85572
 *
 * The first header must be an HTTP/1.0 or HTTP/1.1 status line whose status
 * code is 100 or 200. The reason phrase after the code is not inspected, as
 * servers are free to send any phrase or none at all.
 *
 * @param array $headers
 *   An array of http headers to validate. The status line is expected at
 *   key 0.
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return bool
 *   A boolean with TRUE representing that the headers are valid, FALSE
 *   otherwise.
 */
function cf_http_validate_response(array $headers, array $function_history = array()) {
  // A non-empty array is not guaranteed to have a string at key 0.
  if (!isset($headers[0]) || !is_string($headers[0])) {
    return FALSE;
  }
  $status_line = trim($headers[0]);
  return preg_match('@^http/1\.[01]\s+(?:100|200)(?:\s|$)@i', $status_line) === 1;
}

/**
 * Search through an array of http headers for 400 and 500 http codes.
 *
 * A header is reported when it is a status line such as
 * 'HTTP/1.1 404 Not Found', or when it starts directly with a 4xx/5xx code
 * such as '503 Service Unavailable'.
 *
 * @param array $headers
 *   An array of http headers. Passed by reference; this function does not
 *   modify it, but the returned 'value' is bound by reference to the matching
 *   entry.
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return array
 *   An array with the following structure:
 *   - error_code: the error code of the first error found (a numeric string
 *     as captured from the header), the integer 0 otherwise.
 *   - key: array key of the header with the error, '' when none was found.
 *   - value: the header with the error, '' when none was found.
 */
function cf_http_headers_errors(array &$headers, array $function_history = array()) {
  // Iterate over the keys instead of by reference, so that no foreach
  // reference is left dangling once the function returns.
  foreach (array_keys($headers) as $key) {
    if (!is_string($headers[$key])) {
      continue;
    }
    $matches = array();
    // The 'HTTP/x.y ' prefix is optional so that both full status lines and
    // bare 'code reason' lines are recognized.
    if (preg_match('@^(?:HTTP/\d+(?:\.\d+)?\s+)?([45]\d\d)(?:\s|$)@i', $headers[$key], $matches) > 0) {
      return array(
        'error_code' => $matches[1],
        'key' => $key,
        'value' => &$headers[$key],
      );
    }
  }
  return array(
    'error_code' => 0,
    'key' => '',
    'value' => '',
  );
}

/**
 * Unchunk http content.
 *
 * Originally From: http://php.net/manual/en/function.fsockopen.php#85572
 *
 * Decodes a body sent with the chunked transfer coding: every chunk is a
 * hexadecimal size line (optionally followed by ';extension'), a line break,
 * and that many bytes of data. A chunk of size 0 ends the body; anything
 * after it is ignored.
 *
 * @param string $document
 *   A string holding the chunked body of an http response.
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return array
 *   An array containing the unchunk status and unchunked string.
 *   The array keys:
 *   - unchunked: A boolean with TRUE representing that the document string was
 *     successfully unchunked, FALSE otherwise (not a string, empty, a size
 *     line that is not hexadecimal, or a chunk shorter than its size).
 *   - document: The complete (unchunked) document. On failure this holds
 *     whatever was decoded before the problem was found.
 */
function cf_http_unchunk_response($document, array $function_history = array()) {
  $results = array(
    'unchunked' => FALSE,
    'document' => '',
  );
  if (!is_string($document) || $document === '') {
    return $results;
  }
  $eol = "\r\n";
  $eol_length = strlen($eol);
  $remaining = ltrim($document);
  while ($remaining !== '') {
    $position = strpos($remaining, $eol);

    // The very last size line may have lost its line break to an earlier
    // trim() of the response.
    $size_line = ($position === FALSE) ? $remaining : substr($remaining, 0, $position);

    // Chunk extensions (';name=value') are not part of the size.
    $size_parts = explode(';', $size_line, 2);
    $size = trim($size_parts[0]);
    if (preg_match('/^[0-9a-f]+$/i', $size) !== 1) {
      return $results;
    }
    $length = hexdec($size);
    if ($length == 0) {
      // The zero sized chunk terminates the body.
      break;
    }
    if ($position === FALSE) {
      // A size without any data following it.
      return $results;
    }
    $data_start = $position + $eol_length;
    if ($length > strlen($remaining) - $data_start) {
      // The chunk claims more data than is available.
      return $results;
    }
    $length = (int) $length;
    $results['document'] .= substr($remaining, $data_start, $length);

    // Skip the line break that separates the data from the next size line.
    $remaining = ltrim((string) substr($remaining, $data_start + $length));
  }
  $results['unchunked'] = TRUE;
  return $results;
}

/**
 * Accepts and processes provided http content.
 *
 * This process splits the response into headers and document, checks for a
 * valid http response, and unchunks the document if needed.
 *
 * Originally From: http://php.net/manual/en/function.fsockopen.php#85572
 *
 * @param string $http_response
 *   An http response string.
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return array
 *   An array containing the parse status and the parts of the http response.
 *   The array keys:
 *   - parsed: A boolean with TRUE representing that the http response string
 *     was successfully parsed, FALSE otherwise.
 *   - header: Always an empty string; kept for backwards compatibility.
 *   - headers: An array of the http header lines, split on "\n" (the lines
 *     may still end in "\r"). An empty array when nothing was parsed.
 *   - document: The document from the http response, without headers.
 *   - http_error: The result of cf_http_headers_errors() when the headers did
 *     not validate, otherwise an array with error_code 0 and empty key/value.
 */
function cf_http_parse_response($http_response, array $function_history = array()) {
  cf_error_append_history($function_history, __FUNCTION__);
  $results = array(
    'parsed' => FALSE,
    'header' => '',
    'headers' => array(),
    'document' => '',
    'http_error' => array(
      'error_code' => 0,
      'key' => '',
      'value' => '',
    ),
  );
  if (cf_is_empty_or_non_string($function_history, 'http_response', $http_response, WATCHDOG_ERROR)) {
    return $results;
  }

  // The headers end at the first blank line; everything after it belongs to
  // the document, including any blank lines of its own. Interim (1xx)
  // responses that are directly followed by another status line are skipped.
  $document = trim($http_response);
  do {
    $hunks = explode("\r\n\r\n", $document, 2);
    if (count($hunks) < 2) {
      return $results;
    }
    $header = $hunks[0];
    $document = $hunks[1];
    $is_interim = preg_match('@^HTTP/\d+(?:\.\d+)?\s+1\d\d(?:\s|$)@i', $header) > 0;
    $has_followup = preg_match('@^HTTP/\d+(?:\.\d+)?\s+\d{3}(?:\s|$)@i', $document) > 0;
  } while ($is_interim && $has_followup);

  $results['headers'] = explode("\n", $header);
  $results['document'] = $document;
  unset($hunks);
  unset($header);
  unset($document);

  if (!cf_http_validate_response($results['headers'], $function_history)) {
    $results['http_error'] = cf_http_headers_errors($results['headers'], $function_history);
  }

  // Header names are case-insensitive and the lines may carry a trailing
  // "\r". The non-standard 'Transfer-Coding' spelling is still accepted.
  $is_chunked = FALSE;
  foreach ($results['headers'] as $header_line) {
    if (preg_match('/^Transfer-(?:En)?coding\s*:.*\bchunked\b/i', trim($header_line)) > 0) {
      $is_chunked = TRUE;
      break;
    }
  }

  $was_unchunked = FALSE;
  if ($is_chunked) {
    $result = cf_http_unchunk_response($results['document'], $function_history);
    if ($result['unchunked']) {
      $results['document'] = $result['document'];
      $was_unchunked = TRUE;
    }
  }
  $results['document'] = trim($results['document']);

  if ($is_chunked && !$was_unchunked) {
    // Best effort for a chunked document that could not be decoded: drop a
    // leading chunk size line and the trailing zero sized chunk.
    $results['document'] = preg_replace("/^[[:alnum:]]+\r\n/i", '', $results['document']);
    $results['document'] = preg_replace("/\r\n0\$/i", '', $results['document']);
    if (!is_string($results['document'])) {
      $results['document'] = '';
      return $results;
    }
  }
  $results['parsed'] = TRUE;
  return $results;
}

/**
 * Breaks apart an html formatted document string.
 *
 * The string is broken into an array containing two parts: 'head' and 'body'.
 * All other elements before, between, or after the html <head> and <body> tags
 * are lost/ignored.
 *
 * @param string $document
 *   An html document string.
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return array
 *   An array containing the split status and the two document parts.
 *   The array keys:
 *   - split: A boolean with TRUE representing that both the head and the body
 *     were found, FALSE otherwise.
 *   - head: The content between the <head> tags. This is also filled in when
 *     the head was found but the body was not.
 *   - body: The content between the <body> tags.
 */
function cf_http_split_response($document, array $function_history = array()) {
  cf_error_append_history($function_history, __FUNCTION__);
  $results = array(
    'split' => FALSE,
    'head' => '',
    'body' => '',
  );
  if (cf_is_empty_or_non_string($function_history, 'document', $document, WATCHDOG_ERROR)) {
    return $results;
  }

  // The sections are looked up in this order; the first one that is missing
  // ends the search.
  $section_patterns = array(
    'head' => '/<head[^>]*>(.*)<\\/head>/sim',
    'body' => '/<body[^>]*>(.*)<\\/body>/sim',
  );
  foreach ($section_patterns as $section => $pattern) {
    $captured = array();
    $found = preg_match($pattern, $document, $captured);
    if (!($found > 0 && isset($captured[1]))) {
      return $results;
    }
    $results[$section] = $captured[1];
  }
  $results['split'] = TRUE;
  return $results;
}

/**
 * Fix relative urls pulled from the remote server.
 *
 * The urls found in src and href attributes are turned into absolute urls.
 * Urls that already name a host or a scheme (http://..., //host/...,
 * mailto:..., javascript:...) are left untouched.
 *
 * @param string $text
 *   The html document text whose urls are to be altered.
 * @param string $server
 *   The hostname or ip address of the server to use when generating absolute
 *   urls.
 *   This must not contain the 'http://' prefixes nor the suffixes such as '/'
 *   or ':80'.
 * @param string $relative_path
 *   Urls that do not start with '/' get this path, followed by '/', placed
 *   between the server and the url.
 * @param string $scheme
 *   (optional) The 'http' at the front of most urls.
 *   A common alternative is 'https'.
 * @param string $suffix
 *   (optional) Currently not used by this function; kept so that existing
 *   callers keep working.
 * @param int $port
 *   (optional) The port number of the web-server.
 *   In almost all cases this should be 80.
 *   If $scheme is set to 'https', then normally this should instead be 443.
 *   The port is left out of the generated urls when it is the default port
 *   of the scheme.
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return array
 *   An array containing the adjust status and the adjusted text.
 *   The array keys:
 *   - adjusted: A boolean with TRUE representing that the text was processed,
 *     FALSE otherwise ($text is not a string).
 *   - text: The complete html text with all links adjusted to absolute paths.
 */
function cf_http_adjust_urls($text, $server, $relative_path, $scheme = 'http', $suffix = '/', $port = 80, array $function_history = array()) {
  cf_error_append_history($function_history, __FUNCTION__);
  $results = array(
    'adjusted' => FALSE,
    'text' => $text,
  );
  if (!is_string($text)) {
    return $results;
  }

  // Every generated url starts the same way, so build that part only once.
  $is_default_port = ($scheme == 'http' && $port == 80) || ($scheme == 'https' && $port == 443);
  $base_url = $scheme . '://' . $server;
  if (!$is_default_port && !empty($port)) {
    $base_url .= ':' . $port;
  }

  foreach (array('src', 'href') as $attribute) {
    $matches = array();
    $found = preg_match_all('/(<[^>]*' . $attribute . '\\s*=\\s*)(["|\'])([^>]*)>/i', $text, $matches);
    if (!$found) {
      continue;
    }
    foreach (array_keys($matches[0]) as $key) {
      $prefix = $matches[1][$key];
      $quote = $matches[2][$key];

      // The third group holds the rest of the tag; the url ends at the
      // closing quote.
      $parts = explode($quote, $matches[3][$key], 2);
      $original_url = $parts[0];

      $parsed_url = parse_url($original_url);
      if (!is_array($parsed_url) || isset($parsed_url['host']) || isset($parsed_url['scheme'])) {
        // Malformed, already absolute, or not a web url at all.
        continue;
      }

      $generated_url = $base_url;
      $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
      if ($path === '' || $path[0] !== '/') {
        $generated_url .= $relative_path . '/';
      }
      $generated_url .= $path;
      if (isset($parsed_url['query']) && $parsed_url['query'] !== '') {
        $generated_url .= '?' . $parsed_url['query'];
      }
      if (isset($parsed_url['fragment']) && $parsed_url['fragment'] !== '') {
        $generated_url .= '#' . $parsed_url['fragment'];
      }

      // A literal, case-sensitive replace: urls may hold any regular
      // expression character, and urls that differ only by case are
      // different urls.
      $search = $prefix . $quote . $original_url . $quote;
      $replacement = $prefix . $quote . $generated_url . $quote;
      $results['text'] = str_replace($search, $replacement, $results['text']);
    }
  }
  $results['adjusted'] = TRUE;
  return $results;
}

/**
 * Reads and processes a website page at the given path.
 *
 * Fetches the raw response with cf_http_get_response() and, when the
 * connection succeeded, parses it with cf_http_parse_response().
 *
 * @param string $server
 *   Hostname or ip address of the server.
 *   Should not contain http:// or similar prefixes.
 * @param string $path
 *   The file/path on the server to read.
 * @param int $port
 *   (optional) Port number of the page to read (defaults to 80).
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return array
 *   An array containing the read status and the parsed http response.
 *   The array keys:
 *   - read: A boolean with TRUE representing that the read was successful and
 *     FALSE otherwise.
 *   - headers: The http headers as parsed from the response; an empty string
 *     when the read failed.
 *   - document: The document as parsed from the http response.
 *   - http_error: The 'http_error' array as parsed from the response; an
 *     array with error_code 0 and empty key/value when the read failed.
 */
function cf_http_get_webpage($server, $path, $port = 80, array $function_history = array()) {
  cf_error_append_history($function_history, __FUNCTION__);
  $results = array(
    'read' => FALSE,
    'headers' => '',
    'document' => '',
    'http_error' => array(
      'error_code' => 0,
      'key' => '',
      'value' => '',
    ),
  );

  $response = cf_http_get_response($server, $path, $port, $function_history);
  if (!$response['connected']) {
    return $results;
  }

  $parsed = cf_http_parse_response($response['response'], $function_history);
  if (!$parsed['parsed']) {
    return $results;
  }

  foreach (array('headers', 'document', 'http_error') as $part) {
    $results[$part] = $parsed[$part];
  }
  $results['read'] = TRUE;
  return $results;
}

/**
 * Reduces the level of the html header tags (h1 to h6) in the given text.
 *
 * Every <hN> tag becomes <h(N + $depth)>. Each rewritten tag keeps its own
 * attributes and gets the class 'cf_http-was_hN' added, so that the original
 * level can still be styled. Levels that would end up below h6 become <div>
 * tags.
 *
 * @param string $text
 *   The html text whose header tags are to be reduced.
 * @param int $depth
 *   The amount of shrinkage to perform. Any number from 0 to 6, where 0 only
 *   adds the 'cf_http-was_hN' classes.
 * @param bool $preserve
 *   A boolean representing whether or not to preserve the header structure
 *   when the depth of a given header is reduced to a number greater than 6.
 *   If TRUE, the resulting <div> gets the 'cf_http-was_hN' class; if FALSE,
 *   it becomes a <div> without that class.
 * @param array $function_history
 *   (optional) An array of function names, ie:
 *   array('0' => 'my_function_name').
 *
 * @return array
 *   An array containing the reduce status and the reduced text.
 *   The array keys:
 *   - reduced: A boolean with TRUE representing that the text was successfully
 *     reduced and FALSE otherwise.
 *   - text: The html text with all html headers reduced by $depth, or an
 *     empty string on failure.
 */
function cf_http_reduce_html_headers($text, $depth = 1, $preserve = TRUE, array $function_history = array()) {
  cf_error_append_history($function_history, __FUNCTION__);
  $results = array(
    'reduced' => FALSE,
    'text' => '',
  );
  if (!is_string($text)) {
    cf_error_not_string($function_history, 'text');
    return $results;
  }
  if (!is_numeric($depth) || $depth < 0 || $depth > 6) {
    return $results;
  }
  $depth = (int) $depth;

  $reduced_text = $text;

  // The deepest level goes first, so that a tag that was just rewritten is
  // never picked up again by a later pass.
  for ($number = 6; $number >= 1; $number--) {
    $reduced = $number + $depth;
    $tag = ($reduced > 6) ? 'div' : 'h' . $reduced;
    $add_marker = ($tag != 'div' || $preserve);

    // Splitting with the delimiters captured puts every opening tag at an
    // odd index, which allows each tag to be rewritten with its own
    // attributes.
    $pieces = preg_split('/(<h' . $number . '[^>]*>)/i', $reduced_text, -1, PREG_SPLIT_DELIM_CAPTURE);
    if (!is_array($pieces)) {
      return $results;
    }
    $piece_count = count($pieces);
    for ($index = 1; $index < $piece_count; $index += 2) {
      // Strip the leading '<hN' and the trailing '>'.
      $attributes = rtrim((string) substr($pieces[$index], 3, -1));
      if ($add_marker) {
        $attributes = _cf_http_reduce_html_headers_add_class($attributes, 'cf_http-was_h' . $number);
      }
      $pieces[$index] = '<' . $tag . $attributes . '>';
    }
    $reduced_text = implode('', $pieces);

    // With the marker class, closing tags are only rewritten when an opening
    // tag of this level was found.
    if ($piece_count > 1 || !$add_marker) {
      $reduced_text = preg_replace('/<\\/h' . $number . '>/i', '</' . $tag . '>', $reduced_text);
      if (!is_string($reduced_text)) {
        return $results;
      }
    }
  }

  $results['text'] = $reduced_text;
  $results['reduced'] = TRUE;
  return $results;
}

/**
 * Adds a class name to the attribute string of an html tag.
 *
 * @param string $attributes
 *   Everything between the tag name and the closing '>' of an html tag.
 * @param string $class
 *   The class name to add.
 *
 * @return string
 *   The attribute string with $class appended to the first quoted class
 *   attribute, or with a new class attribute added when there is none.
 */
function _cf_http_reduce_html_headers_add_class($attributes, $class) {
  $found = array();
  if (preg_match('/(?:^|\s)class\s*=\s*(["\'])(.*?)\1/is', $attributes, $found, PREG_OFFSET_CAPTURE) > 0) {
    $current = $found[2][0];
    $insert_at = $found[2][1] + strlen($current);
    $separator = (trim($current) === '') ? '' : ' ';
    return substr_replace($attributes, $separator . $class, $insert_at, 0);
  }
  return $attributes . ' class="' . $class . '"';
}

Functions

Namesort descending Description
cf_http_adjust_urls Fix relative urls pulled from the remote server.
cf_http_get_response Reads an http page at the given path and returns an unprocessed response.
cf_http_get_webpage Reads and processes a website page at the given path.
cf_http_headers_errors Search through an array of http errors for common 400 and 500 http codes.
cf_http_init Implements hook_init().
cf_http_parse_response Accepts and processes provided http content.
cf_http_reduce_html_headers Reduces the level of the html header tags (h1 to h6) in the given text.
cf_http_split_response Breaks apart an html formatted document string.
cf_http_unchunk_response Unchunk http content.
cf_http_validate_response Validate http responses by checking header.