 * @file
 * Grabs images for items imported using the feeds module.
 * Grabs images for each feed item from their respective web pages and stores
 * them in an image field.
 * Requires the Feeds module.




 * Implements hook_menu().
function feeds_imagegrabber_menu() {
  // Registers a single admin page for the default options of this module.
  // NOTE(review): closing delimiters are absent in this view of the file, so
  // the nesting of the array below can only be read from the indentation.
  $items = array();
  $items['admin/config/content/feeds_imagegrabber'] = array(
    'title' => 'Feeds Image Grabber',
    // The page is rendered through the form API.
    'page callback' => 'drupal_get_form',
    // NOTE(review): the form id passed to drupal_get_form() is not visible
    // here; presumably it is the settings form builder defined in this file
    // (feeds_imagegrabber_admin) - confirm against the complete source.
    'page arguments' => array(
    // Only users holding this permission may open the page.
    'access arguments' => array(
      'administer site configuration',
    'description' => 'Configure default options for Feeds Image Grabber',
  return $items;

 * Implements hook_form_BASE_FORM_ID_alter().
function feeds_imagegrabber_form_node_form_alter(&$form, &$form_state, $form_id) {
  // Adds the Feeds Image Grabber fieldset and its validator to a node form,
  // but only when feeds_get_importer_id() returns a value for the node type.
  // NOTE(review): closing braces and array terminators are absent in this
  // view of the file; indentation is the only guide to nesting.
  if ($importer_id = feeds_get_importer_id($form['#node']->type)) {

    // Use the values from $form_state if available
    if (isset($form_state['values']['feeds_imagegrabber'])) {
      $settings = $form_state['values']['feeds_imagegrabber'];
    // Otherwise use the settings stored for this node. A node without a nid,
    // or a lookup that returns FALSE, gets the default settings instead.
    elseif (!isset($form['#node']->nid) || ($settings = feeds_imagegrabber_get_settings($form['#node']->nid)) === FALSE) {
      $settings = feeds_imagegrabber_get_default_settings();
    // Collapsed fieldset that holds the elements added by
    // feeds_imagegrabber_form(); submitted values are read back from the
    // feeds_imagegrabber key (see the first branch above).
    $form['feeds_imagegrabber'] = array(
      '#type' => 'fieldset',
      '#title' => t('Feeds Image Grabber'),
      '#tree' => TRUE,
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
    // Append the shared setting elements and register the validator.
    feeds_imagegrabber_form($form, $settings);
    $form['#validate'][] = 'feeds_imagegrabber_form_validate';

 * Validates the feeds_imagegrabber form of the node.
function feeds_imagegrabber_form_validate($form, &$form_state) {
  // Validates the Feeds Image Grabber values submitted with a form.
  //
  // @param $form
  //   The form array. Only the id_class_desc element is used, so that its
  //   value can be reset through form_set_value().
  // @param $form_state
  //   The form state. Values are read from its feeds_imagegrabber tree.
  //
  // Errors are reported through form_set_error(); nothing is returned.
  $values = isset($form_state['values']['feeds_imagegrabber']) ? $form_state['values']['feeds_imagegrabber'] : array();

  // Nothing is validated while the grabber is switched off (or absent).
  if (!isset($values['enabled']) || $values['enabled'] != 1) {
    return;
  }

  $id_class = isset($values['id_class']) ? $values['id_class'] : 0;
  $id_class_desc = isset($values['id_class_desc']) ? $values['id_class_desc'] : '';

  if ($id_class) {
    // An id (1) or a class (2) was selected, so a descriptor is mandatory and
    // must look like a valid HTML id or class list. The branches are chained
    // because only one message per element is useful to the user.
    if (empty($id_class_desc)) {
      form_set_error('feeds_imagegrabber][id_class_desc', t('Specify the id/class of the desired tag.'));
    }
    elseif ($id_class == 1 && !preg_match('/^[a-zA-Z]+[_a-zA-Z0-9-]*$/', $id_class_desc)) {
      form_set_error('feeds_imagegrabber][id_class_desc', t('Only alphabets, digits, hyphens and underscores are allowed in HTML id'));
    }
    elseif ($id_class == 2 && !preg_match('/^[a-zA-Z]+[_a-zA-Z0-9- ]*$/', $id_class_desc)) {
      form_set_error('feeds_imagegrabber][id_class_desc', t('Only alphabets, digits, hyphens, spaces and underscores are allowed in HTML class'));
    }
  }
  else {
    // No tag filter selected: discard whatever was typed in the descriptor.
    form_set_value($form['feeds_imagegrabber']['id_class_desc'], '', $form_state);
  }

  // The execution time must be a whole number between 10 and 75.
  $exec_time = isset($values['exec_time']) ? $values['exec_time'] : NULL;
  if (!is_numeric($exec_time) || $exec_time < 10 || $exec_time != round($exec_time) || $exec_time > 75) {
    form_set_error('feeds_imagegrabber][exec_time', t('Select the correct option for FIG execution time.'));
  }
}

 * Implements hook_node_insert().
function feeds_imagegrabber_node_insert($node) {
  // NOTE(review): no body is visible for this hook in this view of the file.
  // Presumably it stores the settings the same way as
  // feeds_imagegrabber_node_update() - confirm against the complete source.

 * Implements hook_node_update().
function feeds_imagegrabber_node_update($node) {
  // Builds the settings record of a feed node from $node->feeds_imagegrabber.
  // NOTE(review): the statements that persist the record are not visible in
  // this view (only a ->condition() fragment remains), and closing delimiters
  // are absent. Confirm against the complete source before changing anything.
  if ($importer_id = feeds_get_importer_id($node->type)) {
    // Grabber enabled for this node: copy every submitted option and tie the
    // record to the node id.
    if (isset($node->feeds_imagegrabber['enabled']) && $node->feeds_imagegrabber['enabled']) {
      $settings = array(
        'enabled' => $node->feeds_imagegrabber['enabled'],
        'id_class' => $node->feeds_imagegrabber['id_class'],
        'id_class_desc' => $node->feeds_imagegrabber['id_class_desc'],
        'exec_time' => $node->feeds_imagegrabber['exec_time'],
        'feeling_lucky' => $node->feeds_imagegrabber['feeling_lucky'],
        'feed_nid' => $node->nid,
    else {
        // Otherwise only a disabled flag is involved; presumably it is
        // written to the record matching this feed_nid - TODO confirm.
        'enabled' => 0,
        ->condition('feed_nid', $node->nid)

 * Implements hook_node_delete().
function feeds_imagegrabber_node_delete($node) {
  // NOTE(review): the start and the execution of the query are not visible in
  // this view; only the condition on the node id remains. Presumably the
  // settings record of the deleted node is removed - confirm.
    ->condition('feed_nid', $node->nid)




 * Implements hook_feeds_processor_targets_alter().
 * @see FeedsNodeProcessor::getMappingTargets().
function feeds_imagegrabber_feeds_processor_targets_alter(&$targets, $entity_type, $bundle_name) {
  // Offers an extra mapping target, keyed "<field name>:fig", for each
  // matching field instance of the given entity type and bundle.
  foreach (field_info_instances($entity_type, $bundle_name) as $name => $instance) {
    $info = field_info_field($name);
    // NOTE(review): the list of accepted field types is not visible in this
    // view; presumably it contains the image field type - confirm.
    if (in_array($info['type'], array(
    ))) {
      $targets[$name . ':fig'] = array(
        // The label is escaped before it is shown in the mapping UI.
        'name' => check_plain($instance['label']) . ' (FIG)',
        // Feeds hands the mapped value to this callback.
        'callback' => 'feeds_imagegrabber_feeds_set_target',
        'description' => t('The @label field of the node.', array(
          '@label' => $instance['label'],

 * Callback for mapping. Here is where the actual mapping happens.
function feeds_imagegrabber_feeds_set_target($source, $entity, $target, $page_url) {
  // Feeds mapping callback: scrapes the page at $page_url for images and
  // stores the accepted files in the field named by $target.
  // NOTE(review): several statement lines and all closing braces are absent
  // in this view of the file, so some conditions below appear without a body.
  // Do not restructure this function without the complete source.
  if (empty($page_url)) {
  // Settings are looked up by the feed node that produced this item.
  $feed_nid = $entity->feeds_item->feed_nid;
  $settings = feeds_imagegrabber_get_settings($feed_nid);

  // Fall back to the defaults.
  if (!$settings) {
    $settings = feeds_imagegrabber_get_default_settings();
  // Grabbing is skipped unless the effective settings enable it.
  if (!$settings || !$settings['enabled']) {
    return FALSE;
  // Load the helper library; a failure is logged and aborts the mapping.
  if (!feeds_imagegrabber_include_library('', 'feeds_imagegrabber')) {
    watchdog('feeds_imagegrabber', 'url conversion script is missing. ', array(), WATCHDOG_ERROR, 'admin/reports/dblog/feeds_imagegrabber');
    return FALSE;
  // Targets are registered as "<field name>:fig"; keep only the field name.
  list($field_name) = explode(':', $target);
  list($entity_id, $vid, $bundle_name) = entity_extract_ids($entity->feeds_item->entity_type, $entity);
  $instance_info = field_info_instance($entity->feeds_item->entity_type, $field_name, $bundle_name);
  $info = field_info_field($field_name);
  // Effective size limit: the smaller of the PHP upload limit and the
  // max_filesize setting of the field instance, when one is set.
  $max_filesize = parse_size(file_upload_max_size());
  if (!empty($instance_info['settings']['max_filesize']) && parse_size($instance_info['settings']['max_filesize']) < $max_filesize) {
    $max_filesize = parse_size($instance_info['settings']['max_filesize']);
  // exec_time is applied as a percentage of max_execution_time; 10 seconds
  // are used when PHP reports no execution time limit (0).
  $max_exec_time = ini_get('max_execution_time');
  $timeout = $max_exec_time == 0 ? 10 : $settings['exec_time'] * $max_exec_time / 100;
  // Remember the page timer so the time spent scraping can be deducted below.
  $page_time = timer_read('page') / 1000;
  // encode_url() is optional; presumably it comes from the included library.
  if (function_exists('encode_url')) {
    $page_url = encode_url($page_url);
  if (valid_url($page_url)) {
    // $page_url is passed by reference and may be changed by the scraper.
    $xml = feeds_imagegrabber_webpage_scraper($page_url, $settings['id_class'], $settings['id_class_desc'], $timeout);
    if ($xml == FALSE) {
    // Deduct the time that elapsed since $page_time was read.
    $timeout = $timeout - timer_read('page') / 1000 + $page_time;
    $options = array(
      'max_imagesize' => $max_filesize,
      'timeout' => $timeout,
      'feeling_lucky' => $settings['feeling_lucky'],
      'cardinality' => $info['cardinality'],
    $images = feeds_imagegrabber_scrape_images($xml, $page_url, $settings, $options);
    if ($images == FALSE || count($images) <= 0) {
    // Reverse the candidate list while preserving its url keys.
    $images = array_reverse($images, TRUE);
    $data = array();
    if (!empty($entity->uid)) {
      $data[$entity->feeds_item->entity_type] = $entity;
    // Start from the values already present on the field, if any.
    $field = isset($entity->{$field_name}) ? $entity->{$field_name} : array();
    $target_dir = file_field_widget_uri($info, $instance_info, $data);
    $image_count = 0;
    foreach ($images as $url => $size) {

      // We don't need to pass an encoded url to feeds.
      // Feeds will handle encoding spaces for retrieval and replacing spaces
      // with underscores when saving to the file system.
      $url = rawurldecode($url);
      $enclosure = new FeedsEnclosure($url, 'application/octet-stream');
      // A file is accepted only if it downloads, is recognised as an image
      // and produces no errors from the validators of the field instance.
      if (($file = $enclosure
        ->getFile($target_dir)) && ($file = feeds_imagegrabber_is_image($file)) && !count(feeds_imagegrabber_widget_file_validator($file, $instance_info))) {
        $field['und'][$image_count] = (array) $file;
        $field['und'][$image_count]['display'] = 1;
        // NOTE(review): no increment of $image_count is visible in this view;
        // presumably it sits on a missing line - confirm.
        if ($image_count == $info['cardinality']) {
    // Write the field back only when at least one image was accepted.
    if ($image_count) {
      $entity->{$field_name} = $field;




 * Retrieve settings for a feed node from the database.
 * @param $feed_nid
 *   The nid of the feed node.
 * @return
 *   An array of settings or FALSE if settings not found.
function feeds_imagegrabber_get_settings($feed_nid) {
  // Reads the stored options of one feed node from the feeds_imagegrabber
  // table; the node id is bound as a query placeholder.
  // NOTE(review): the call that fetches the row from the query result is not
  // visible in this view. Callers compare the result against FALSE, so it
  // presumably yields an array or FALSE - confirm.
  $settings = db_query("SELECT enabled, id_class, id_class_desc, feeling_lucky, exec_time FROM {feeds_imagegrabber} WHERE feed_nid = :feed_nid", array(
    ':feed_nid' => $feed_nid,
  return $settings;

 * Retrieve the default settings for a feed node from the database.
 * @return
 *   An array of settings.
function feeds_imagegrabber_get_default_settings() {
  // Returns the site-wide default options, read through variable_get().
  //
  // @return
  //   An array that always carries the keys enabled, id_class, id_class_desc,
  //   exec_time and feeling_lucky.
  $defaults = array(
    'enabled' => 0,
    'id_class' => 0,
    'id_class_desc' => '',
    'exec_time' => 10,
    'feeling_lucky' => 0,
  );
  $stored = variable_get('feeds_imagegrabber', $defaults);

  // Callers index every key without checking, so a stored value that is not
  // an array is ignored and a partial one is completed from the built-in
  // defaults. Stored keys win over the defaults.
  if (!is_array($stored)) {
    return $defaults;
  }
  return $stored + $defaults;
}

 * Implementation of the default settings admin form.
function feeds_imagegrabber_admin($form, &$form_state) {
  // Form builder for the default settings page.
  // NOTE(review): array terminators and the closing brace are absent in this
  // view of the file.
  // The incoming $form is discarded and rebuilt from scratch.
  $form = array();
  $settings = feeds_imagegrabber_get_default_settings();
  // Container for the elements added by feeds_imagegrabber_form() below.
  $form['feeds_imagegrabber'] = array(
    '#type' => 'fieldset',
    '#title' => t('Default Settings'),
    '#tree' => TRUE,
    '#collapsible' => FALSE,
  // Reuse the same elements and the same validator as the node form.
  feeds_imagegrabber_form($form, $settings);
  $form['#validate'][] = 'feeds_imagegrabber_form_validate';
  // system_settings_form() wraps the form; presumably it stores the submitted
  // tree in the variable read by feeds_imagegrabber_get_default_settings().
  return system_settings_form($form);

 * Appends the form with the Feeds Image Grabber form using the passed default
 * settings.
 * @param $form
 *   The form to append under the 'feeds_imagegrabber' fieldset.
 * @param $default_settings
 *   The default values of the form elements.
function feeds_imagegrabber_form(&$form, $default_settings) {
  // Adds the five setting elements under $form['feeds_imagegrabber'], each
  // pre-filled from the matching key of $default_settings. $form is modified
  // in place; no return statement is visible.
  // NOTE(review): array terminators and the closing brace are absent in this
  // view of the file.
  $form['feeds_imagegrabber']['enabled'] = array(
    '#type' => 'checkbox',
    '#title' => t('Enable Feeds Image Grabber'),
    '#description' => t('Check if you want to download images from URL of the feed items.'),
    '#default_value' => $default_settings['enabled'],
  // The option order matters: the validator and the scraper treat the values
  // as 0 (whole page), 1 (id) and 2 (class).
  $form['feeds_imagegrabber']['id_class'] = array(
    '#type' => 'radios',
    '#title' => t('Search for the images between the tag which is identified by'),
    '#options' => array(
      t('None, search the whole web-page for the images.'),
      t('an Id'),
      t('a Class'),
    '#default_value' => $default_settings['id_class'],
  $form['feeds_imagegrabber']['id_class_desc'] = array(
    '#type' => 'textfield',
    '#title' => t('<i>Id</i> or <i>Class</i> of the HTML tag (Leave empty if you selected <i>None</i> above.)'),
    '#default_value' => $default_settings['id_class_desc'],
    '#description' => t('Separate multiple classes with spaces (as present in the HTML)'),
    '#maxlength' => 100,
  // 0 selects images by size, 1 by position in the page.
  $form['feeds_imagegrabber']['feeling_lucky'] = array(
    '#type' => 'radios',
    '#title' => t('Feeling lucky, huh?'),
    '#options' => array(
      t('No, select the images based of their size (large to small) between the tag.'),
      t('Yes, select the images based on their position (top to bottom) between the tag. (Recommended)'),
    '#default_value' => $default_settings['feeling_lucky'],
  $form['feeds_imagegrabber']['exec_time'] = array(
    '#type' => 'select',
    '#title' => t('Execution time[%]'),
    // NOTE(review): the list of selectable percentages is not visible in this
    // view; the validator accepts whole numbers from 10 to 75 - confirm.
    '#options' => drupal_map_assoc(array(
    '#default_value' => $default_settings['exec_time'],
    '#description' => t('Select the percentage of maximum PHP execution time to take while grabbing images for a feed item.'),

 * Validates the size of a file accessible through an HTTP URL.
 * @param $file_url
 *   A string specifying the formatted file url.
 * @param $max_size
 *   Maximum size of the file to be downloaded.
 * @param $timeout
 *   A float representing the maximum number of seconds the function call may
 *   take. The default is 10 seconds. If a timeout occurs, the return code is
 *   set to the HTTP_REQUEST_TIMEOUT constant.
 * @param $max_redirects
 *   An integer representing how many times a redirect may be followed.
 *   Defaults to 3.
 * @return
 *   An integer code containing filesize in case the file exists and conforms to
 *   the size limit, -1 otherwise.
function feeds_imagegrabber_validate_download_size($file_url, $max_size, $timeout = 10, $max_redirects = 3) {
  // Asks the server for the size of a remote file with a HEAD request and
  // checks it against a limit, without downloading the file.
  //
  // @param $file_url
  //   The url of the file.
  // @param $max_size
  //   Largest acceptable size in bytes.
  // @param $timeout
  //   Seconds the request may take.
  // @param $max_redirects
  //   How many redirects may be followed.
  //
  // @return
  //   The size in bytes as an integer, or -1 when the request did not answer
  //   200, when no usable Content-Length header was sent, or when the file is
  //   larger than $max_size.
  $response = drupal_http_request($file_url, array(
    'headers' => array(),
    'method' => 'HEAD',
    'data' => NULL,
    'max_redirects' => $max_redirects,
    'timeout' => $timeout,
  ));

  if (!isset($response->code) || $response->code != 200 || !isset($response->headers) || !is_array($response->headers)) {
    return -1;
  }

  //Bug #882992, some servers may return keys with different case.
  $headers = array_change_key_case($response->headers);
  if (!isset($headers['content-length']) || !is_scalar($headers['content-length'])) {
    return -1;
  }

  // Accept plain digit strings only. A loose comparison would let a garbage
  // value through as if it were a small size.
  $raw_length = trim((string) $headers['content-length']);
  if ($raw_length === '' || !ctype_digit($raw_length)) {
    return -1;
  }

  $length = (int) $raw_length;
  return $length <= $max_size ? $length : -1;
}

 * Scrape the webpage using the id or the css class of a tag and returns the
 * HTML between the tag.
 * @param $page_url
 *   A string specifying the page url to scrape. If there is a redirect, it is
 *   changed to the redirect_url.
 * @param $itype
 *   A positive integer value representing the identifier type for the tag:
 *     - 0 : selects content between <body> </body>.
 *     - 1 : selects content between the tag identified by an ID.
 *     - 2 : selects content between the first tag identified by a CSS class.
 * @param $ivalue
 *   A string specifying the ID or the CSS class.
 * @param $timeout
 *   A float representing the maximum number of seconds the function call may
 *   take. The default is 15 seconds. If a timeout occurs, the return code is
 *   set to the HTTP_REQUEST_TIMEOUT constant.
 * @param $max_redirects
 *   An integer representing how many times a redirect may be followed.
 *   Defaults to 3.
 * @param $error_log
 *   An array which contains the error codes and messages in case the functions
 *   fails.
 * @return
 *   FALSE on failure, OR content between the tags as XML on success.
function feeds_imagegrabber_webpage_scraper(&$page_url, $itype, $ivalue = '', $timeout = 15, $max_redirects = 3, &$error_log = array()) {
  // Downloads a page, selects one element of it and returns that element
  // serialized as XML. Every failure path fills $error_log with a code and a
  // message and returns FALSE.
  // NOTE(review): the DOM calls that follow "@$doc" and "$items" and all
  // closing braces are absent in this view of the file. Do not restructure
  // this function without the complete source.
  $options = array(
    'headers' => array(),
    'method' => 'GET',
    'data' => NULL,
    'max_redirects' => $max_redirects,
    'timeout' => $timeout,
  $result = drupal_http_request($page_url, $options);
  // After a redirect the caller's url is replaced, because $page_url is a
  // reference. NOTE(review): the list of redirect codes is not visible here.
  if (isset($result->redirect_code) && in_array($result->redirect_code, array(
  ))) {
    $page_url = $result->redirect_url;
  // Anything other than a 200 answer is reported with the HTTP code.
  if ($result->code != 200) {
    $error_log['code'] = $result->code;
    $error_log['error'] = "unable to retrieve content from web page";
    return FALSE;
  if (empty($result->data) || drupal_strlen($result->data) <= 0) {
    $error_log['code'] = -1;
    $error_log['error'] = "no data available on url";
    return FALSE;
  // Parser warnings are suppressed with @; only a FALSE result is an error.
  $doc = new DOMDocument();
  if (@$doc
    ->loadHTML($result->data) === FALSE) {
    $error_log['code'] = -2;
    $error_log['error'] = "unable to parse the html content";
    return FALSE;
  // $itype 0: the documented contract selects the body element; the lookup
  // itself is on a line missing from this view.
  if ($itype == 0) {
    $items = @$doc
    if ($items != NULL && $items->length > 0) {
      $dist = $items
    else {
      $dist = NULL;
  // $itype 1: lookup by id (call not visible in this view).
  elseif ($itype == 1) {
    $dist = @$doc
  // $itype 2: lookup by CSS class through XPath.
  elseif ($itype == 2) {
    $xpath = new DOMXPath($doc);

    //Normalize whitespaces.
    $ivalue = preg_replace('/\\s\\s+/', ' ', trim($ivalue));
    // Matches elements whose class attribute contains $ivalue as a whole,
    // space-delimited token sequence. $ivalue is interpolated into the
    // expression unescaped. NOTE(review): this relies on the caller having
    // validated the value - confirm.
    $items = $xpath
      ->query("//*[@class and contains(concat(' ',normalize-space(@class),' '), ' {$ivalue} ')]");
    if ($items != NULL && $items->length > 0) {
      $dist = $items
    else {
      $dist = NULL;
  else {

    //not supported yet
    $dist = NULL;
  if ($dist == NULL) {
    $error_log['code'] = -3;
    $error_log['error'] = "tag not found";
    return FALSE;
  // Serialize only the selected node, not the whole document.
  $content = '';
  if (($content = @$dist->ownerDocument
    ->saveXML($dist)) === FALSE) {
    $error_log['code'] = -4;
    $error_log['error'] = "error converting content to XML";
    return FALSE;
  return $content;

 * Scrape images from HTML/XML content.
function feeds_imagegrabber_scrape_images($content, $base_url, $settings, array $options = array(), &$error_log = array()) {
  // Collects image urls from XML or HTML content and returns them as an array
  // keyed by url, with the reported size as value. Returns FALSE when the
  // content cannot be parsed.
  // NOTE(review): several statement lines and all closing braces are absent
  // in this view of the file, so some conditions below appear without a body.
  // Do not restructure this function without the complete source.

  // Merge the default options.
  $options += array(
    'expression' => "//img",
    'getsize' => TRUE,
    'max_imagesize' => 512000,
    'timeout' => 10,
    'max_redirects' => 3,
    'cardinality' => 1,
  // Try the content as XML first and fall back to the HTML parser; parser
  // warnings are suppressed with @.
  $doc = new DOMDocument();
  if (@$doc
    ->loadXML($content) === FALSE && @$doc
    ->loadHTML($content) === FALSE) {
    $error_log['code'] = -5;
    $error_log['error'] = "unable to parse the xml//html content";
    return FALSE;
  $xpath = new DOMXPath($doc);
  // NOTE(review): the XPath call is not visible; presumably it evaluates
  // $options['expression'] - confirm.
  $hrefs = @$xpath
  // NOTE(review): body not visible; a timer named after this function is read
  // below, so it is presumably started here - confirm.
  if ($options['getsize']) {
  $images = array();
  $imagesize = 0;
  for ($i = 0; $i < $hrefs->length; $i++) {
    // NOTE(review): the expression that reads the url from the node is not
    // visible in this view.
    $url = $hrefs
    if (!isset($url) || empty($url) || $url == '') {
    // Both helpers are optional and only used when they are defined.
    if (function_exists('encode_url')) {
      $url = encode_url($url);
    if (function_exists('url_to_absolute')) {
      $url = url_to_absolute($base_url, $url);
    if ($url == FALSE) {
    if ($options['getsize']) {
      // Each size check gets whatever remains of the time budget; -1 means
      // the image was rejected.
      if (($imagesize = feeds_imagegrabber_validate_download_size($url, $options['max_imagesize'], $options['timeout'] - timer_read(__FUNCTION__) / 1000)) != -1) {
        $images[$url] = $imagesize;
        // With feeling_lucky the search can end once enough images for the
        // field were found (statement not visible in this view).
        if ($settings['feeling_lucky'] && count($images) == $options['cardinality']) {
      // Budget used up: record a timeout in $error_log.
      if ($options['timeout'] - timer_read(__FUNCTION__) / 1000 <= 0) {
        $error_log['code'] = HTTP_REQUEST_TIMEOUT;
        $error_log['error'] = "timeout occured while scraping the content";
    else {
      // Sizes are not checked: every url is kept with the initial size of 0.
      $images[$url] = $imagesize;
      if ($settings['feeling_lucky'] && count($images) == $options['cardinality']) {
  return $images;

 * Checks that a file is an image.
 * This check allows the image formats identified by drupal i.e. jpeg, png, and
 * gif. If the filename is missing an extension but is a valid image, its actual
 * extension is added to it and the original file is renamed.
 * @param $file
 *   The file object.
 * @return
 *   The final file object, it may be modified or FALSE on failure.
function feeds_imagegrabber_is_image(&$file) {
  // Returns the file object when the file is a recognised image, renaming it
  // first when its extension is missing or not an accepted one. Returns FALSE
  // otherwise.
  // NOTE(review): closing braces are absent in this view of the file;
  // indentation is the only guide to nesting.
  $extensions = 'jpeg jpg png gif';
  if ($file && ($filepath = $file->uri)) {
    // The content decides whether this is an image, not the file name.
    $info = feeds_imagegrabber_get_image_info($filepath);
    if ($info && !empty($info['extension'])) {
      // The current extension is already acceptable: nothing to rename.
      if (!count(file_validate_extensions($file, $extensions))) {
        return $file;
      else {
        $basename = basename($filepath);
        $directory = drupal_dirname($filepath);
        // A dot at position 0 counts as "no extension" because strrpos()
        // then returns 0, which is falsy.
        if ($pos = strrpos($basename, '.')) {
          $name = substr($basename, 0, $pos);
          $ext = substr($basename, $pos);
          // Keep an accepted extension when the current one starts with it,
          // otherwise use the extension detected from the content.
          $regex = '/\\.(' . preg_replace('/ +/', '|', preg_quote($extensions)) . ')/i';
          if (preg_match($regex, $ext, $matches)) {
            $ext = $matches[1];
          else {
            $ext = $info['extension'];
          $basename = $name . '.' . $ext;
        else {
          $basename .= '.' . $info['extension'];
        // The name did not change, so no move is needed.
        if ($basename == basename($filepath)) {
          return $file;
        // Move the file to a free name in the same directory. $file is a
        // reference, so the caller's variable is replaced as well.
        $dest = file_create_filename($basename, $directory);
        if ($file = file_move($file, $dest)) {
          return $file;
  return FALSE;

 * Get the file info for an image file.
 * Drupal only supports GIF, JPG and PNG file formats.
 * This is similar to drupal's image_get_info() except the file need not be an
 * uploaded one.
 * @return
 *   FALSE, if the file could not be found or is not an image. Otherwise, a
 *   keyed array containing information about the image:
 *    'width'     - Width in pixels.
 *    'height'    - Height in pixels.
 *    'extension' - Commonly used file extension for the image.
 *    'mime_type' - MIME type ('image/jpeg', 'image/gif', 'image/png').
function feeds_imagegrabber_get_image_info($file) {
  // Returns width, height, extension and mime type of an image file, or FALSE
  // when the path is not a regular file or getimagesize() cannot read it.
  // NOTE(review): array terminators and closing braces are absent in this
  // view of the file.
  if (!is_file($file)) {
    return FALSE;
  $details = FALSE;
  // Warnings are suppressed with @; a failed read leaves $details at FALSE.
  $data = @getimagesize($file);
  if (isset($data) && is_array($data)) {
    // Index 2 of the getimagesize() result is the image type code; the keys
    // 1, 2 and 3 are mapped to the three supported extensions.
    $extensions = array(
      '1' => 'gif',
      '2' => 'jpg',
      '3' => 'png',
    // Any other image type yields an empty extension.
    $extension = array_key_exists($data[2], $extensions) ? $extensions[$data[2]] : '';
    $details = array(
      'width' => $data[0],
      'height' => $data[1],
      'extension' => $extension,
      'mime_type' => $data['mime'],
  return $details;

 * Includes a required library file.
 * @param $file
 *   The filename to load from.
 * @param $library
 *   The name of the library. If libraries module is installed, 
 *   feeds_imagegrabber_include_library() will look for libraries with this name
 *   managed by libraries module.
function feeds_imagegrabber_include_library($file, $library) {
  // Loads a library file with require_once from the first location that has
  // it.
  //
  // @param $file
  //   The filename to load, relative to the library directory.
  // @param $library
  //   The library name passed to libraries_get_path() when the libraries
  //   module is enabled.
  //
  // @return
  //   TRUE when the file was found and included, FALSE otherwise.
  $directories = array();

  // A directory managed by the libraries module is tried first. An empty
  // result is skipped so that it cannot turn into the path "/<file>".
  if (module_exists('libraries')) {
    $managed = libraries_get_path($library);
    if (!empty($managed)) {
      $directories[] = $managed;
    }
  }

  // Then the module directory itself and its libraries sub-directory.
  $module_path = drupal_get_path('module', 'feeds_imagegrabber');
  $directories[] = $module_path;
  $directories[] = $module_path . "/libraries";

  foreach ($directories as $directory) {
    $path = $directory . "/{$file}";
    // is_file() rather than file_exists(): an empty $file makes $path a
    // directory, and require_once on a directory is a fatal error.
    if (is_file($path)) {
      require_once DRUPAL_ROOT . '/' . $path;
      return TRUE;
    }
  }
  return FALSE;
}

 * Validate the file object with field instance specific validators.
 * @param $file
 *   The file object to be validated.
 * @param $instance
 *   The instance info of the image field.
 * @return
 *   An array containing the errors (if any) which occurs while running the
 *   validators.
function feeds_imagegrabber_widget_file_validator($file, $instance) {
  // Runs the validators of a field instance against a file and returns the
  // list of errors from file_validate().
  // NOTE(review): array terminators and closing braces are absent in this
  // view of the file.
  $settings = $instance['settings'];

  // Create the list of validators.
  $image_validators = array();
  $image_validators['file_validate_name_length'] = array();
  // The resolution check is added only when a limit is configured.
  // NOTE(review): the validator arguments are not visible in this view;
  // presumably the max and min resolution settings - confirm.
  if ($settings['max_resolution'] || $settings['min_resolution']) {
    $image_validators['file_validate_image_resolution'] = array(
  // Upload validators derived from the instance are merged in; on a key clash
  // the image validators defined above win.
  $file_validators = file_field_widget_upload_validators(NULL, $instance);
  $validators = array_merge($file_validators, $image_validators);
  $errors = file_validate($file, $validators);
  return $errors;


