You are here

biblio_advanced_import.module in Biblio Advanced Import 7

Same filename and directory in other branches
  1. 6 biblio_advanced_import.module

Biblio add-on.

Instead of creating duplicate biblio records, existing ones could be updated or the import could be skipped depending on a configurable duplicate detection strategy.

@author Markus Kalkbrenner | Cocomore AG

File

biblio_advanced_import.module
View source
<?php

/**
 * @file
 * Biblio add-on.
 *
 * Instead of creating duplicate biblio records,
 * existing ones could be updated or the import could
 * be skipped depending on a configurable duplicate
 * detection strategy.
 *
 * @see biblio.module
 *
 * @author Markus Kalkbrenner | Cocomore AG
 *   @see http://drupal.org/user/124705
 */

/**
 * Implements hook_menu().
 *
 * Registers the "Advanced Import" settings form as a local task at
 * admin/config/content/biblio/advanced_import. The form builder lives in
 * biblio_advanced_import.admin.inc and requires the 'administer biblio'
 * permission.
 */
function biblio_advanced_import_menu() {
  $settings_tab = array(
    'title' => 'Advanced Import',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('biblio_advanced_import_settings_form'),
    'access arguments' => array('administer biblio'),
    'file' => 'biblio_advanced_import.admin.inc',
    'type' => MENU_LOCAL_TASK,
    'weight' => 2,
  );

  return array(
    'admin/config/content/biblio/advanced_import' => $settings_tab,
  );
}

/**
 * Implements hook_node_presave().
 *
 * Acts only on biblio nodes that do not have a nid yet. The steps are:
 *  - apply the optional data clean-ups and, if enabled, generate a citekey;
 *  - search for existing biblio nodes matching the configured duplicate
 *    detection criteria (md5 hash, ISBN, ISSN, DOI, PubMed id);
 *  - depending on the configured duplicate strategy either let the new node
 *    be created, merge the imported values into the existing node(s)
 *    (optionally as a new revision), or "skip" the import by replacing the
 *    node being saved with an unmodified existing duplicate.
 *
 * @param $node
 *   The node that is about to be saved. Its properties are replaced in place
 *   when a duplicate takes over.
 */
function biblio_advanced_import_node_presave($node) {
  // Existing nodes and other content types are none of our business.
  if ('biblio' != $node->type || !empty($node->nid)) {
    return;
  }

  $verbose_messages = (bool) variable_get('biblio_advanced_import_verbose_messages', '0');
  biblio_advanced_import_pitfall_workarounds($node);

  if (variable_get('biblio_auto_citekey', 1)) {
    // On new entries, override citekeys generated by parsers, depending on
    // settings.
    $citekey = biblio_advanced_import_create_citekey($node);
    if ($citekey) {
      $node->biblio_citekey = $citekey;
    }
  }

  // Base query: current revisions of biblio records only.
  $query = db_select('biblio', 'b');
  $query->innerJoin('node', 'n', 'b.nid = n.nid AND b.vid = n.vid');
  $query->fields('b', array('nid'));

  $skip = FALSE;
  $revision = FALSE;
  switch (variable_get('biblio_advanced_import_duplicate_strategy', 'create duplicate')) {
    case 'create duplicate':
      if ($verbose_messages) {
        drupal_set_message(t('Creating duplicate of node %node_title', array(
          '%node_title' => $node->title,
        )));
      }
      return;

    case 'skip import':
      // There's no way to stop an already running node_save()
      // in a safe way without breaking a batch process.
      // So we do a little trick to realize the 'skip import':
      // We simply replace the current node to be saved by the
      // unmodified oldest duplicate (lowest nid) and save this one instead.
      $skip = TRUE;
      $query
        ->orderBy('b.nid', 'ASC')
        ->range(0, 1);
      break;

    case 'new rev latest':
      $revision = TRUE;
      // no break
    case 'update latest':
      $query
        ->orderBy('b.nid', 'DESC')
        ->range(0, 1);
      break;

    case 'new rev oldest':
      $revision = TRUE;
      // no break
    case 'update oldest':
      $query
        ->orderBy('b.nid', 'ASC')
        ->range(0, 1);
      break;

    case 'new rev all':
      $revision = TRUE;
      // no break
    case 'update all':
      break;
  }

  // Build the OR-ed list of duplicate detection criteria.
  $condition_exists = FALSE;
  $or_condition = db_or();
  $detection_fields = variable_get('biblio_advanced_import_detect_duplicate_strategy', array(
    'md5' => 'md5',
  ));
  foreach ($detection_fields as $field) {
    switch ((string) $field) {
      case 'md5':
        $or_condition->condition('b.biblio_md5', biblio_advanced_import_hash($node));
        $condition_exists = TRUE;
        break;

      case 'isbn':
      case 'issn':
      case 'doi':
        $field_property = 'biblio_' . $field;
        if (!empty($node->{$field_property})) {
          $or_condition->condition('b.' . $field_property, $node->{$field_property});
          $condition_exists = TRUE;
        }
        break;

      case 'pubmed':
        if (module_exists('biblio_pm') && !empty($node->biblio_pubmed_id)) {
          // LEFT join: the pubmed id is only one of several OR-ed criteria,
          // so records without a biblio_pubmed row must stay in the result
          // set to be matched by the other criteria.
          $query->leftJoin('biblio_pubmed', 'bp', 'b.nid = bp.nid');
          $or_condition->condition('bp.biblio_pubmed_id', $node->biblio_pubmed_id);
          $condition_exists = TRUE;
        }
        break;
    }
  }

  if (!$condition_exists) {
    return;
  }

  $query->condition($or_condition);
  $result = $query->execute();

  // The merge strategy does not change while we iterate; read it once.
  $strategy = variable_get('biblio_advanced_import_merge_strategy', 'override');
  $is_first_duplicate = TRUE;
  // Snapshot of the imported values; $node itself may get overwritten below.
  $node_new = (array) $node;

  while ($row = $result->fetchObject()) {
    // there are duplicates:
    $node_old = node_load($row->nid);
    if (!$node_old) {
      // The record vanished or could not be loaded; nothing to merge into.
      continue;
    }

    // we need to set this or the node module will throw notices
    // (if this node becomes the one to be really saved instead of the original one)
    $node_old->is_new = FALSE;

    if (!$skip) {
      // update an existing biblio node with new data
      if ($verbose_messages) {
        drupal_set_message(t('Updating node %node_title (node %nid)', array(
          '%nid' => $node_old->nid,
          '%node_title' => $node_old->title,
        )));
      }

      $merge = FALSE;
      foreach ($node_new as $key => $value) {
        // Only biblio fields (except the citekey), contributors and the
        // title take part in the merge.
        $is_mergeable = (strpos($key, 'biblio') === 0 && 'biblio_citekey' != $key)
          || strpos($key, 'contributors') === 0
          || 'title' == $key;
        if (!$is_mergeable) {
          continue;
        }

        $apply = 'override' == $strategy
          || ('override but keep additional' == $strategy && !empty($value))
          || ('add new' == $strategy && !empty($value) && empty($node_old->{$key}))
          || ('override existing non empty' == $strategy && !empty($node_old->{$key}))
          || ('override existing non empty with non empty' == $strategy && !empty($value) && !empty($node_old->{$key}));

        if ($apply && (!property_exists($node_old, $key) || $node_old->{$key} != $value)) {
          $node_old->{$key} = $value;
          $merge = TRUE;
        }
      }

      // Only create a revision when something actually changed.
      if ($revision && $merge) {
        $node_old->revision = TRUE;
        $node_old->log = t('New revision created automatically by Biblio Advanced Import.');
      }
    }
    else {
      // There's no way to stop an already running node_save()
      // in a safe way without breaking the batch process.
      // So we use a little trick to implement the 'skip import':
      // We replace the current node to be saved with the
      // unmodified first duplicate and let drupal save that one instead.
      if ($verbose_messages) {
        drupal_set_message(t('Skipping update of node %node_title (node %nid)', array(
          '%nid' => $node_old->nid,
          '%node_title' => $node_old->title,
        )));
      }
    }

    if ($is_first_duplicate) {
      // the content of the node being saved gets replaced with the values from the first duplicate node
      // (replacing the whole object with the loaded node did not seem to work
      // so we do it property by property ...)
      $is_first_duplicate = FALSE;

      // clear existing object
      foreach (get_object_vars($node) as $key => $value) {
        unset($node->{$key});
      }

      // copy values over
      foreach (get_object_vars($node_old) as $key => $value) {
        $node->{$key} = $value;
      }
    }
    else {
      // save any other existing duplicates, with values updated
      node_save($node_old);
    }
  }
}

/**
 * Implements hook_node_insert().
 *
 * Stores the duplicate detection hash for a freshly inserted biblio node.
 */
function biblio_advanced_import_node_insert($node) {
  if ('biblio' != $node->type) {
    return;
  }
  biblio_advanced_import_update_hash($node);
}

/**
 * Implements hook_node_update().
 *
 * Recomputes the duplicate detection hash whenever a biblio node is updated.
 */
function biblio_advanced_import_node_update($node) {
  if ('biblio' != $node->type) {
    return;
  }
  biblio_advanced_import_update_hash($node);
}

/**
 * Builds the duplicate detection hash of a biblio node.
 *
 * The hash covers the node properties selected in the
 * 'biblio_advanced_import_duplicate_criteria' variable (title and
 * biblio_year by default). Scalar values are truncated to 256 characters,
 * lower-cased and stripped of all whitespace; arrays and objects are
 * serialized.
 *
 * @see biblio_advanced_import_settings_form()
 *
 * @param $node
 *   a biblio node
 *
 * @return
 *   a md5 hash
 */
function biblio_advanced_import_hash($node) {
  $criteria = variable_get('biblio_advanced_import_duplicate_criteria', array(
    'title' => 'title',
    'biblio_year' => 'biblio_year',
  ));

  $pieces = array();
  foreach ($criteria as $property) {
    // Criteria that are not selected carry an empty value.
    if (!$property) {
      continue;
    }

    $value = isset($node->{$property}) ? $node->{$property} : '';

    if ('biblio_year' == $property) {
      // If the year is empty, it will be set to 9999 by biblio on save.
      // 9999 => "Submitted"
      // Therefore we have to do the same here to not break duplicate detection.
      // Furthermore, on load this magic value is replaced with a translated
      // version of the string "Submitted"; we hit this case when we rehash
      // after a configuration change. All these cases are standardized on
      // 9999 to arrive at consistent hash values.
      if (empty($value) || $value == t('Submitted')) {
        $value = 9999;
      }
    }

    if (!$value) {
      continue;
    }

    if (is_array($value) || is_object($value)) {
      $pieces[] = serialize($value);
      continue;
    }

    $shortened = mb_substr($value, 0, 256);
    $pieces[] = preg_replace("/\\s+/", '', mb_strtolower($shortened));
  }

  return md5(strtolower(implode('', $pieces)));
}

/**
 * Recomputes a biblio node's hash and persists it.
 *
 * The hash is set on the node object and written to the {biblio} row
 * identified by the node's nid and vid.
 *
 * @see biblio_advanced_import_settings_form()
 *
 * @param $node
 *   a biblio node
 */
function biblio_advanced_import_update_hash(&$node) {
  $node->biblio_md5 = biblio_advanced_import_hash($node);

  $update = db_update('biblio');
  $update->fields(array('biblio_md5' => $node->biblio_md5));
  $update->condition('nid', $node->nid);
  $update->condition('vid', $node->vid);
  $update->execute();
}

/**
 * Creates a citekey for a biblio node according to the module settings.
 *
 * A citekey is only built when the creation strategy is 'fields'. It then
 * consists of the optional biblio citekey prefix and the configured node
 * properties, joined by a pipe symbol and limited to 255 characters. If the
 * resulting citekey already exists in {biblio}, it is either dropped
 * ('skip') or extended by a persistent counter ('append counter').
 *
 * @see biblio_advanced_import_settings_form()
 * @see biblio_advanced_import_settings_form_submit()
 *
 * @param $node
 *   a biblio node
 *
 * @return
 *   The citekey, or an empty string if none was created.
 */
function biblio_advanced_import_create_citekey($node) {
  $citekey = '';

  if ('fields' == variable_get('biblio_advanced_import_citekey_creation_strategy', 'biblio')) {
    $parts = array();

    $prefix = variable_get('biblio_citekey_prefix', '');
    if (!empty($prefix)) {
      $parts[] = $prefix;
    }

    $selected_fields = variable_get('biblio_advanced_import_citekey_creation', array(
      'title' => 'title',
      'biblio_year' => 'biblio_year',
    ));
    foreach ($selected_fields as $field) {
      if (empty($field) || empty($node->{$field})) {
        continue;
      }
      $parts[] = $node->{$field};
    }

    // biblio stores citekey as varchar(255), we need to make sure it fits
    // or a PDO Exception is thrown
    $joined = mb_substr(implode('|', $parts), 0, 255);

    // strip trailing pipe symbol(s), if any, then surrounding whitespace
    $citekey = trim(preg_replace('@\\|+$@', '', $joined));
  }

  if (!$citekey) {
    return $citekey;
  }

  $already_used = db_query("SELECT 1 FROM {biblio} WHERE biblio_citekey = :biblio_citekey", array(
    ':biblio_citekey' => $citekey,
  ))->fetchField();

  if ($already_used) {
    $on_duplicate = variable_get('biblio_advanced_import_duplicate_citekey_strategy', 'skip');
    if ('skip' == $on_duplicate) {
      $citekey = '';
    }
    elseif ('append counter' == $on_duplicate) {
      $counter = variable_get('biblio_advanced_import_citekey_creation_counter', 0) + 1;
      variable_set('biblio_advanced_import_citekey_creation_counter', $counter);

      // biblio stores citekey as varchar(255), so the citekey is shortened
      // far enough that the pipe and the counter still fit
      $citekey = mb_substr($citekey, 0, 254 - strlen($counter)) . '|' . $counter;
    }
  }

  return $citekey;
}

/**
 * Alters the biblio admin settings form.
 *
 * Judging by its name this implements hook_form_FORM_ID_alter() for the
 * 'biblio_admin_settings' form. When the citekey creation strategy of this
 * module is 'fields', the citekey elements of that form are turned into
 * 'value' elements that keep their current default value.
 *
 * @param $form
 *   The form array, altered in place.
 * @param $form_state
 *   The current form state (unused).
 */
function biblio_advanced_import_form_biblio_admin_settings_alter(&$form, &$form_state) {
  if ('fields' != variable_get('biblio_advanced_import_citekey_creation_strategy', 'biblio')) {
    return;
  }

  $citekey_elements = array(
    'biblio_citekey_field1',
    'biblio_citekey_field2',
    'biblio_citekey_phpcode',
  );
  foreach ($citekey_elements as $element) {
    $form['citekey'][$element]['#type'] = 'value';
    $form['citekey'][$element]['#value'] = $form['citekey'][$element]['#default_value'];
  }
}

/**
 * Applies optional data cleanup / normalization to a biblio node.
 *
 * Each clean-up is controlled by its own variable and can be activated on
 * the "advanced import" tab; the default for all of them is 'as is':
 *  - biblio_advanced_import_fix_issn: 'normalize' keeps only a checksum-valid
 *    ISSN (8 characters, no hyphen) and removes anything else;
 *    'normalize from isbn' additionally takes the ISSN from the ISBN field
 *    when the ISSN field is empty.
 *  - biblio_advanced_import_fix_isbn: 'remove' drops ISBNs that the ISBNtest
 *    class does not accept; 'convert 13' keeps ISBN-13 / GTIN-14 only.
 *  - biblio_advanced_import_fix_doi: 'one valid' keeps the first DOI found.
 *  - biblio_advanced_import_fix_title: 'mendeley bibtex' strips one pair of
 *    enclosing curly braces.
 *  - biblio_advanced_import_fix_url: 'one valid' keeps the first URL found.
 *
 * @param $node
 *   The biblio node to clean up; modified in place.
 */
function biblio_advanced_import_pitfall_workarounds(&$node) {
  switch (variable_get('biblio_advanced_import_fix_issn', 'as is')) {
    case 'as is':
      break;

    case 'normalize from isbn':
      // Only borrow the ISBN if there is no ISSN yet and there is actually
      // an ISBN to borrow from.
      if (empty($node->biblio_issn) && !empty($node->biblio_isbn)) {
        // RIS format does not distinguish between ISBN and ISSN
        $node->biblio_issn = $node->biblio_isbn;
      }
      // no break
    case 'normalize':
      // @see http://en.wikipedia.org/wiki/International_Standard_Serial_Number
      if (!empty($node->biblio_issn)) {
        $valid_issn = FALSE;
        // Seven digits followed by a check character (digit or X); the
        // hyphen in the middle is optional.
        if (preg_match("@\\b([0-9]{4})-?([0-9]{3}[0-9X])\\b@i", $node->biblio_issn, $matches)) {
          $issn = strtoupper($matches[1] . $matches[2]);
          $sum = 0;
          for ($i = 0; $i < 7; $i++) {
            $sum += (int) $issn[$i] * (8 - $i);
          }
          // The outer modulo maps a remainder of 0 to check digit 0
          // instead of 11.
          $checksum = (11 - $sum % 11) % 11;
          $expected = (10 == $checksum) ? 'X' : (string) $checksum;
          // Strict string comparison so that 'X' is never treated as 0.
          if ($expected === $issn[7]) {
            $valid_issn = $issn;
          }
        }

        if (FALSE !== $valid_issn) {
          $node->biblio_issn = $valid_issn;
        }
        else {
          unset($node->biblio_issn);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_isbn', 'as is')) {
    case 'as is':
      break;

    case 'remove':
      // @see http://en.wikipedia.org/wiki/International_Standard_Book_Number
      if (!empty($node->biblio_isbn)) {
        // Extract the ISBN-looking part (digits and hyphens, optionally
        // ending in the ISBN-10 check character X). If nothing matches, hand
        // over the raw value and let the validator decide.
        // NOTE(review): assumes ISBNtest::set_isbn() copes with hyphenated
        // input, as the 'convert 13' branch already relies on - confirm.
        $candidate = $node->biblio_isbn;
        if (preg_match("@[0-9][0-9\\-]{8,}[0-9X]@i", $node->biblio_isbn, $matches)) {
          $candidate = $matches[0];
        }
        module_load_include('class.php', 'biblio_advanced_import', 'lib/isbntest');
        $currISBN = new ISBNtest();
        $currISBN->set_isbn($candidate);
        if ($currISBN->valid_isbn10() || $currISBN->valid_isbn13() || $currISBN->valid_gtin14()) {
          $node->biblio_isbn = $currISBN->get_gtin14();
        }
        else {
          unset($node->biblio_isbn);
        }
      }
      break;

    case 'convert 13':
      // @see http://en.wikipedia.org/wiki/International_Standard_Book_Number
      if (!empty($node->biblio_isbn)) {
        if (preg_match("@[0-9\\-]{10,}@", $node->biblio_isbn, $matches)) {
          module_load_include('class.php', 'biblio_advanced_import', 'lib/isbntest');
          $currISBN = new ISBNtest();
          $currISBN->set_isbn($matches[0]);
          if ($currISBN->valid_isbn13()) {
            $node->biblio_isbn = $currISBN->get_isbn13();
          }
          elseif ($currISBN->valid_gtin14()) {
            $node->biblio_isbn = $currISBN->get_gtin14();
          }
          else {
            unset($node->biblio_isbn);
          }
        }
        else {
          unset($node->biblio_isbn);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_doi', 'as is')) {
    case 'as is':
      break;

    case 'one valid':
      // @see http://en.wikipedia.org/wiki/Digital_object_identifier
      if (!empty($node->biblio_doi)) {
        if (preg_match("@10\\.\\d{4,}/[^\\s]+@i", $node->biblio_doi, $matches)) {
          $node->biblio_doi = $matches[0];
        }
        else {
          unset($node->biblio_doi);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_title', 'as is')) {
    case 'as is':
      break;

    case 'mendeley bibtex':
      if (!empty($node->title)) {
        // strip off enclosing curly braces, but only a matching pair
        $node->title = preg_replace('@^\\{(.*)\\}$@', '$1', $node->title);
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_url', 'as is')) {
    case 'as is':
      break;

    case 'one valid':
      if (!empty($node->biblio_url)) {
        if (preg_match("@(http|https)://[^\\s]+@i", $node->biblio_url, $matches)) {
          // ris import runs together lists of urls without a delimiter, so
          // cut the match in front of every scheme and keep the first url.
          // The scheme is lower-cased first (whatever its original case) so
          // that the split below is reliable.
          $lowered = str_ireplace(array(
            'http://',
            'https://',
          ), array(
            'http://',
            'https://',
          ), $matches[0]);
          // Splitting on the full "http(s)://" scheme (not on a bare "http")
          // keeps urls intact that merely contain "http" in their path.
          $urls = preg_split('@(?=https?://)@', $lowered, -1, PREG_SPLIT_NO_EMPTY);
          $node->biblio_url = $urls[0];
        }
        else {
          unset($node->biblio_url);
        }
      }
      break;
  }
}

Functions

Name (sorted descending) | Description
biblio_advanced_import_create_citekey Helper function to create a configurable biblio node citekey.
biblio_advanced_import_form_biblio_admin_settings_alter @todo Please document this function.
biblio_advanced_import_hash Helper function to create a hash from a biblio node depending on a configurable duplicate detection strategy.
biblio_advanced_import_menu Implements hook_menu().
biblio_advanced_import_node_insert Implements hook_node_insert().
biblio_advanced_import_node_presave Implements hook_node_presave().
biblio_advanced_import_node_update Implements hook_node_update().
biblio_advanced_import_pitfall_workarounds This function implements some optional data cleanup / normalization that can be activated on the "advanced import" tab.
biblio_advanced_import_update_hash Helper function to update the hash of a biblio node.