You are here

biblio_advanced_import.module in Biblio Advanced Import 6

Same filename and directory in other branches
  1. 7 biblio_advanced_import.module

Biblio add-on.

Instead of creating duplicate biblio records, existing ones could be updated or the import could be skipped depending on a configurable duplicate detection strategy.

@author Markus Kalkbrenner | Cocomore AG

File

biblio_advanced_import.module
View source
<?php

/**
 * @file
 * Biblio add-on.
 *
 * Instead of creating duplicate biblio records,
 * existing ones could be updated or the import could
 * be skipped depending on a configurable duplicate
 * detection strategy.
 *
 * @see biblio.module
 *
 * @author Markus Kalkbrenner | Cocomore AG
 *   @see http://drupal.org/user/124705
 */

/**
 * Implements hook_menu().
 *
 * Registers the settings form of this module as a local task at
 * admin/settings/biblio/advanced_import.
 *
 * @return
 *   An array of menu items keyed by path.
 */
function biblio_advanced_import_menu() {
  $settings_tab = array();
  $settings_tab['title'] = 'Advanced Import';

  // The page is a plain form; the form builder lives in the admin include.
  $settings_tab['page callback'] = 'drupal_get_form';
  $settings_tab['page arguments'] = array('biblio_advanced_import_settings_form');
  $settings_tab['access arguments'] = array('administer biblio');
  $settings_tab['file'] = 'biblio_advanced_import.admin.inc';

  $settings_tab['type'] = MENU_LOCAL_TASK;
  $settings_tab['weight'] = 2;

  return array(
    'admin/settings/biblio/advanced_import' => $settings_tab,
  );
}

/**
 * Implements hook_nodeapi().
 *
 * On 'presave' of a new biblio node the configured import workarounds and
 * the citekey override are applied. Afterwards existing duplicates are
 * looked up and, depending on the configured duplicate strategy, the node
 * about to be saved is replaced by an existing duplicate (optionally merged
 * with the imported data) instead of being stored as a new record.
 *
 * On 'insert' and 'update' the stored duplicate detection hash of a biblio
 * node is refreshed.
 *
 * @see biblio_advanced_import_settings_form()
 *
 * @param $node
 *   The node the operation is performed on. May be replaced by an existing
 *   duplicate during 'presave'.
 * @param $op
 *   The node operation.
 * @param $a1
 *   Unused.
 * @param $a2
 *   Unused.
 */
function biblio_advanced_import_nodeapi(&$node, $op, $a1, $a2) {
  switch ($op) {
    case 'presave':
      // Only new biblio nodes are candidates for duplicate handling.
      if ('biblio' == $node->type && empty($node->nid)) {
        biblio_advanced_import_pitfall_workarounds($node);

        if (variable_get('biblio_auto_citekey', 1)) {
          // On new entries, override citekeys generated by parsers,
          // depending on settings.
          $citekey = biblio_advanced_import_create_citekey($node);
          if ($citekey) {
            $node->biblio_citekey = $citekey;
          }
        }

        $order = '';
        $limit = '';
        $skip = FALSE;
        $revision = FALSE;
        switch (variable_get('biblio_advanced_import_duplicate_strategy', 'create duplicate')) {
          case 'create duplicate':
            // Nothing to detect, let the node be saved as is.
            return;

          case 'skip import':
            // There's no way to stop an already running node_save()
            // in a safe way without breaking a batch process.
            // So we do a little trick to realize the 'skip import':
            // We simply replace the current node to be saved by the
            // unmodified duplicate with the highest nid and save this
            // one instead.
            $skip = TRUE;
            $order = 'ORDER BY nid DESC';
            $limit = 'LIMIT 1';
            break;

          case 'new rev latest':
            $revision = TRUE;
            // Intentional fall through.
          case 'update latest':
            $order = 'ORDER BY nid DESC';
            $limit = 'LIMIT 1';
            break;

          case 'new rev oldest':
            $revision = TRUE;
            // Intentional fall through.
          case 'update oldest':
            $order = 'ORDER BY nid ASC';
            $limit = 'LIMIT 1';
            break;

          case 'new rev all':
            $revision = TRUE;
            // Intentional fall through.
          case 'update all':
            break;
        }

        // Build the duplicate detection conditions. Values are passed as
        // query arguments instead of being embedded in the SQL string, so
        // that db_query() does not interpret '%' sequences or curly braces
        // contained in a value (e.g. in a DOI) as placeholders or table
        // names.
        $or_fields = array();
        $args = array();
        $joins = array();
        $detection = variable_get('biblio_advanced_import_detect_duplicate_strategy', array(
          'md5' => 'md5',
        ));
        foreach ($detection as $field) {
          switch ((string) $field) {
            case 'md5':
              $or_fields[] = "biblio_md5 = '%s'";
              $args[] = biblio_advanced_import_hash($node);
              break;

            case 'isbn':
            case 'issn':
            case 'doi':
              $field_property = 'biblio_' . $field;
              if (!empty($node->{$field_property})) {
                $or_fields[] = $field_property . " = '%s'";
                $args[] = $node->{$field_property};
              }
              break;

            case 'pubmed':
              if (module_exists('biblio_pm') && !empty($node->biblio_pubmed_id)) {
                $joins[] = 'JOIN {biblio_pubmed} USING (nid)';
                $or_fields[] = "biblio_pubmed_id = '%s'";
                $args[] = $node->biblio_pubmed_id;
              }
              break;
          }
        }

        if (!empty($or_fields)) {
          $query = 'SELECT nid FROM {biblio} JOIN {node} USING (nid, vid) '
            . implode(' ', $joins)
            . ' WHERE ' . implode(' OR ', $or_fields)
            . " {$order} {$limit}";

          // The strategy does not change while iterating over duplicates.
          $strategy = variable_get('biblio_advanced_import_merge_strategy', 'override');

          // The first duplicate replaces the node currently being saved,
          // every further duplicate has to be saved explicitly.
          $biblio_advanced_import_recursion = FALSE;

          if ($result = db_query($query, $args)) {
            $node_new = (array) $node;
            while ($row = db_fetch_object($result)) {
              // There are duplicates.
              $node_old = node_load($row->nid);
              if (!$node_old) {
                // The duplicate could not be loaded; never replace the
                // node being saved by an invalid value.
                continue;
              }

              // In 'skip import' mode the duplicate is kept unmodified.
              if (!$skip) {
                // Update an existing biblio node with new data.
                $merge = FALSE;
                foreach ($node_new as $key => $value) {
                  $mergeable = (strpos($key, 'biblio') === 0 && 'biblio_citekey' != $key)
                    || strpos($key, 'contributors') === 0
                    || 'title' == $key;
                  if (!$mergeable) {
                    continue;
                  }

                  $old_value = isset($node_old->{$key}) ? $node_old->{$key} : NULL;
                  $apply = 'override' == $strategy
                    || ('override but keep additional' == $strategy && !empty($value))
                    || ('add new' == $strategy && !empty($value) && empty($old_value))
                    || ('override existing non empty' == $strategy && !empty($old_value))
                    || ('override existing non empty with non empty' == $strategy && !empty($value) && !empty($old_value));

                  if ($apply && $old_value != $value) {
                    $node_old->{$key} = $value;
                    $merge = TRUE;
                  }
                }

                // Only create a revision if something actually changed.
                if ($revision && $merge) {
                  $node_old->revision = TRUE;
                  $node_old->log = t('New revision created automatically by Biblio Advanced Import.');
                }
              }

              if ($biblio_advanced_import_recursion) {
                node_save($node_old);
              }
              else {
                $biblio_advanced_import_recursion = TRUE;
                $node = $node_old;
              }
            }
          }
        }
      }
      break;

    case 'insert':
    case 'update':
      if ('biblio' == $node->type) {
        biblio_advanced_import_update_hash($node);
      }
      break;
  }
}

/**
 * Builds the duplicate detection hash of a biblio node.
 *
 * The hash is calculated from the node properties selected as duplicate
 * criteria in the module settings (title and year by default).
 *
 * @see biblio_advanced_import_settings_form()
 *
 * @param $node
 *   a biblio node
 *
 * @return
 *   a md5 hash
 */
function biblio_advanced_import_hash($node) {
  $default_criteria = array(
    'title' => 'title',
    'biblio_year' => 'biblio_year',
  );
  $criteria = variable_get('biblio_advanced_import_duplicate_criteria', $default_criteria);

  $fragments = array();
  foreach ($criteria as $property) {
    // Criteria that are not selected are stored as empty values.
    if (!$property) {
      continue;
    }

    $value = isset($node->{$property}) ? $node->{$property} : '';

    // If the year is empty, it will be set to 9999 by biblio on save.
    // 9999 => "Submitted"
    // Therefore we have to do the same here to not break duplicate detection.
    if ('biblio_year' == $property && empty($value)) {
      $value = 9999;
    }

    if (!$value) {
      continue;
    }

    if (is_array($value) || is_object($value)) {
      $fragments[] = serialize($value);
    }
    else {
      // Scalars are compared case insensitive, ignoring any whitespace and
      // everything beyond the first 256 characters.
      $normalized = mb_strtolower(mb_substr($value, 0, 256));
      $fragments[] = preg_replace("/\\s+/", '', $normalized);
    }
  }

  return md5(strtolower(implode('', $fragments)));
}

/**
 * Recalculates the duplicate detection hash of a biblio node and stores it.
 *
 * The hash is set on the node object and written to the {biblio} row
 * matching the node's nid and vid.
 *
 * @see biblio_advanced_import_settings_form()
 *
 * @param $node
 *   a biblio node
 */
function biblio_advanced_import_update_hash(&$node) {
  $hash = biblio_advanced_import_hash($node);
  $node->biblio_md5 = $hash;

  $sql = "UPDATE {biblio} SET biblio_md5 = '%s' WHERE nid = %d AND vid = %d";
  db_query($sql, $hash, $node->nid, $node->vid);
}

/**
 * Creates a citekey for a biblio node according to the module settings.
 *
 * With the 'fields' creation strategy the citekey is assembled from the
 * configured citekey prefix and node properties, separated by pipe symbols.
 * With any other strategy no citekey is created here.
 *
 * If the created citekey is already in use, it is either dropped or a
 * counter is appended, depending on the duplicate citekey strategy.
 *
 * @see biblio_advanced_import_settings_form()
 * @see biblio_advanced_import_settings_form_submit()
 *
 * @param $node
 *   a biblio node
 *
 * @return
 *   The citekey, or an empty string if none should be set.
 */
function biblio_advanced_import_create_citekey($node) {
  $citekey = '';

  if ('fields' == variable_get('biblio_advanced_import_citekey_creation_strategy', 'biblio')) {
    $parts = array();

    $prefix = variable_get('biblio_citekey_prefix', '');
    if (!empty($prefix)) {
      $parts[] = $prefix;
    }

    $properties = variable_get('biblio_advanced_import_citekey_creation', array(
      'title' => 'title',
      'biblio_year' => 'biblio_year',
    ));
    foreach ($properties as $property) {
      if (empty($property) || empty($node->{$property})) {
        continue;
      }
      $parts[] = $node->{$property};
    }

    // biblio stores citekey as varchar(255), we make sure it fits
    $joined = mb_substr(implode('|', $parts), 0, 255);

    // strip trailing pipe symbol, if any
    $citekey = trim(preg_replace('@\\|+$@', '', $joined));
  }

  if (!$citekey) {
    return $citekey;
  }

  $in_use = db_result(db_query("SELECT 1 FROM {biblio} WHERE biblio_citekey = '%s'", $citekey));
  if (!$in_use) {
    return $citekey;
  }

  $duplicate_strategy = variable_get('biblio_advanced_import_duplicate_citekey_strategy', 'skip');
  if ('skip' == $duplicate_strategy) {
    $citekey = '';
  }
  elseif ('append counter' == $duplicate_strategy) {
    $counter = variable_get('biblio_advanced_import_citekey_creation_counter', 0) + 1;
    variable_set('biblio_advanced_import_citekey_creation_counter', $counter);

    // biblio stores citekey as varchar(255), so we have to ensure that the counter is saved
    $citekey = mb_substr($citekey, 0, 254 - strlen($counter)) . '|' . $counter;
  }

  return $citekey;
}
/**
 * Implements hook_form_FORM_ID_alter().
 *
 * If citekeys are created from fields by this module, the citekey elements
 * 'biblio_citekey_field1', 'biblio_citekey_field2' and
 * 'biblio_citekey_phpcode' of the form are turned into 'value' elements
 * carrying their current default value.
 */
function biblio_advanced_import_form_biblio_admin_settings_alter(&$form, &$form_state) {
  if ('fields' != variable_get('biblio_advanced_import_citekey_creation_strategy', 'biblio')) {
    return;
  }

  $elements = array(
    'biblio_citekey_field1',
    'biblio_citekey_field2',
    'biblio_citekey_phpcode',
  );
  foreach ($elements as $element) {
    $form['citekey'][$element]['#type'] = 'value';
    $form['citekey'][$element]['#value'] = $form['citekey'][$element]['#default_value'];
  }
}
/**
 * Cleans up identifier, title and URL properties of an imported biblio node.
 *
 * Each clean-up is controlled by its own setting and does nothing when that
 * setting is 'as is' (the default):
 * - biblio_advanced_import_fix_issn: normalize the ISSN to eight characters
 *   without hyphen and remove it if its check digit is wrong; optionally
 *   take the ISSN from the ISBN property first.
 * - biblio_advanced_import_fix_isbn: validate or convert the ISBN using the
 *   ISBNtest class and remove it if it is invalid.
 * - biblio_advanced_import_fix_doi: reduce the DOI property to the first
 *   DOI found in it, or remove it.
 * - biblio_advanced_import_fix_title: strip one pair of enclosing curly
 *   braces from the title.
 * - biblio_advanced_import_fix_url: reduce the URL property to the first
 *   http(s) URL found in it, or remove it.
 *
 * @param $node
 *   The biblio node to clean up. It is modified in place.
 */
function biblio_advanced_import_pitfall_workarounds(&$node) {
  switch (variable_get('biblio_advanced_import_fix_issn', 'as is')) {
    case 'as is':
      break;

    case 'normalize from isbn':
      // RIS format does not distinguish between ISBN and ISSN, so the ISSN
      // may have ended up in the ISBN property. Only take it from there if
      // no ISSN has been provided explicitly.
      if (empty($node->biblio_issn) && !empty($node->biblio_isbn)) {
        $node->biblio_issn = $node->biblio_isbn;
      }

    // no break
    case 'normalize':
      if (!empty($node->biblio_issn)) {
        $issn = _biblio_advanced_import_normalize_issn($node->biblio_issn);
        if (FALSE !== $issn) {
          $node->biblio_issn = $issn;
        }
        else {
          unset($node->biblio_issn);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_isbn', 'as is')) {
    case 'as is':
      break;

    case 'remove':
      // @see http://en.wikipedia.org/wiki/International_Standard_Book_Number
      if (!empty($node->biblio_isbn)) {
        module_load_include('class.php', 'biblio_advanced_import', 'lib/isbntest');
        $currISBN = new ISBNtest();
        // Validate the ISBN of the node itself. No pattern has been matched
        // in this branch, so there is no match array to read from.
        $currISBN->set_isbn($node->biblio_isbn);
        if ($currISBN->valid_isbn10() || $currISBN->valid_isbn13() || $currISBN->valid_gtin14()) {
          // NOTE(review): a valid ISBN is stored in its GTIN-14 form here;
          // confirm that this conversion is intended for 'remove'.
          $node->biblio_isbn = $currISBN->get_gtin14();
        }
        else {
          unset($node->biblio_isbn);
        }
      }
      break;

    case 'convert 13':
      // @see http://en.wikipedia.org/wiki/International_Standard_Book_Number
      if (!empty($node->biblio_isbn)) {
        if (preg_match("@[0-9\\-]{10,}@", $node->biblio_isbn, $matches)) {
          module_load_include('class.php', 'biblio_advanced_import', 'lib/isbntest');
          $currISBN = new ISBNtest();
          $currISBN->set_isbn($matches[0]);
          if ($currISBN->valid_isbn13()) {
            $node->biblio_isbn = $currISBN->get_isbn13();
          }
          elseif ($currISBN->valid_gtin14()) {
            $node->biblio_isbn = $currISBN->get_gtin14();
          }
          else {
            unset($node->biblio_isbn);
          }
        }
        else {
          unset($node->biblio_isbn);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_doi', 'as is')) {
    case 'as is':
      break;

    case 'one valid':
      // @see http://en.wikipedia.org/wiki/Digital_object_identifier
      if (!empty($node->biblio_doi)) {
        if (preg_match("@10\\.\\d{4,}/[^\\s]+@i", $node->biblio_doi, $matches)) {
          $node->biblio_doi = $matches[0];
        }
        else {
          unset($node->biblio_doi);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_title', 'as is')) {
    case 'as is':
      break;

    case 'mendeley bibtex':
      if (!empty($node->title)) {
        // strip off enclosing curly braces, but only a matching pair
        $node->title = preg_replace('@^\\{(.*)\\}$@', '$1', $node->title);
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_url', 'as is')) {
    case 'as is':
      break;

    case 'one valid':
      if (!empty($node->biblio_url)) {
        // ris import runs together lists of urls without a delimiter, so
        // the first URL ends where the next scheme starts. The scheme is
        // matched case insensitive and stored in lower case.
        if (preg_match('@(https?)(://[^\s]+?)(?=https?://|\s|$)@i', $node->biblio_url, $matches)) {
          $node->biblio_url = strtolower($matches[1]) . $matches[2];
        }
        else {
          unset($node->biblio_url);
        }
      }
      break;
  }
}

/**
 * Extracts an ISSN from a string and verifies its check digit.
 *
 * @see http://en.wikipedia.org/wiki/International_Standard_Serial_Number
 *
 * @param $raw
 *   A string containing an ISSN, with or without hyphen.
 *
 * @return
 *   The ISSN as eight upper case characters without hyphen, or FALSE if no
 *   ISSN was found or its check digit is wrong.
 */
function _biblio_advanced_import_normalize_issn($raw) {
  // Only the check digit (last position) may be an 'X'.
  if (!preg_match('@\b([0-9]{4})-?([0-9]{3}[0-9X])\b@i', $raw, $matches)) {
    return FALSE;
  }
  $issn = strtoupper($matches[1] . $matches[2]);

  // Weighted sum of the first seven digits, weights 8 down to 2.
  $sum = 0;
  for ($i = 0; $i < 7; $i++) {
    $sum += (int) $issn[$i] * (8 - $i);
  }

  // A remainder of 0 means check digit 0 (not 11), 10 is written as 'X'.
  $checksum = (11 - $sum % 11) % 11;
  $check_digit = (10 == $checksum) ? 'X' : (string) $checksum;

  return ($check_digit === $issn[7]) ? $issn : FALSE;
}

Functions

Namesort descending Description
biblio_advanced_import_create_citekey Helper function to create a configurable biblio node citekey.
biblio_advanced_import_form_biblio_admin_settings_alter
biblio_advanced_import_hash Helper function to create a hash from a biblio node depending on a configurable duplicate detection strategy.
biblio_advanced_import_menu Implements hook_menu().
biblio_advanced_import_nodeapi Implements hook_nodeapi().
biblio_advanced_import_pitfall_workarounds
biblio_advanced_import_update_hash Helper function to update the hash of a biblio node.