biblio_advanced_import.module in Biblio Advanced Import 7
Same filename and directory in other branches
Biblio add-on.
Instead of creating duplicate biblio records, existing ones could be updated or the import could be skipped depending on a configurable duplicate detection strategy.
@author Markus Kalkbrenner | Cocomore AG
File
biblio_advanced_import.module — View source
<?php
/**
* @file
* Biblio add-on.
*
* Instead of creating duplicate biblio records,
* existing ones could be updated or the import could
* be skipped depending on a configurable duplicate
* detection strategy.
*
* @see biblio.module
*
* @author Markus Kalkbrenner | Cocomore AG
* @see http://drupal.org/user/124705
*/
/**
 * Implements hook_menu().
 *
 * Registers the "Advanced Import" settings tab below the biblio
 * configuration page. The form itself lives in the admin include file.
 */
function biblio_advanced_import_menu() {
  $items = array();
  $items['admin/config/content/biblio/advanced_import'] = array(
    'title' => 'Advanced Import',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('biblio_advanced_import_settings_form'),
    'access arguments' => array('administer biblio'),
    'file' => 'biblio_advanced_import.admin.inc',
    'type' => MENU_LOCAL_TASK,
    'weight' => 2,
  );
  return $items;
}
/**
 * Implements hook_node_presave().
 *
 * Intercepts biblio nodes that are about to be created (no nid yet, i.e.
 * imports), normalizes their data, optionally generates a citekey, and then
 * applies the configured duplicate strategy: create the duplicate anyway,
 * skip the import, or update the latest/oldest/all existing duplicates
 * (optionally as new node revisions).
 */
function biblio_advanced_import_node_presave($node) {
  // Only act on biblio nodes that are being created for the first time.
  if ('biblio' == $node->type && empty($node->nid)) {
    $verbose_messages = (bool) variable_get('biblio_advanced_import_verbose_messages', '0');
    // Apply the optional data cleanup / normalization workarounds
    // (ISSN, ISBN, DOI, title, URL) before any duplicate detection.
    biblio_advanced_import_pitfall_workarounds($node);
    if (variable_get('biblio_auto_citekey', 1)) {
      // On new entries, override citekeys generated by parsers, depending
      // on settings.
      $citekey = biblio_advanced_import_create_citekey($node);
      if ($citekey) {
        $node->biblio_citekey = $citekey;
      }
    }
    // Base query for locating duplicates: biblio rows joined to their
    // current node revision. Detection conditions are attached below.
    $query = db_select('biblio', 'b');
    $alias = $query
      ->innerJoin('node', 'n', 'b.nid = n.nid AND b.vid = n.vid');
    $query
      ->fields('b', array(
        'nid',
      ));
    $skip = FALSE;
    $revision = FALSE;
    // Configure ordering/limit and flags for the selected strategy.
    // NOTE: the 'new rev *' cases intentionally fall through to the
    // corresponding 'update *' cases after setting $revision.
    switch (variable_get('biblio_advanced_import_duplicate_strategy', 'create duplicate')) {
      case 'create duplicate':
        if ($verbose_messages) {
          drupal_set_message(t('Creating duplicate of node %node_title', array(
            '%node_title' => $node->title,
          )));
        }
        // Default behavior: let node_save() proceed untouched.
        return;

      case 'skip import':
        // There's no way to stop an already running node_save()
        // in a safe way without breaking a batch process.
        // So we do a little trick to realize the 'skip import':
        // We simply replace the current node to be saved by the
        // unmodified oldest duplicate and save this one instead.
        $skip = TRUE;
        $query
          ->orderBy('b.nid', 'DESC')
          ->range(0, 1);
        break;

      case 'new rev latest':
        $revision = TRUE;
        // Intentional fall-through.
      case 'update latest':
        $query
          ->orderBy('b.nid', 'DESC')
          ->range(0, 1);
        break;

      case 'new rev oldest':
        $revision = TRUE;
        // Intentional fall-through.
      case 'update oldest':
        $query
          ->orderBy('b.nid', 'ASC')
          ->range(0, 1);
        break;

      case 'new rev all':
        $revision = TRUE;
        // Intentional fall-through.
      case 'update all':
        // No ordering/limit: every duplicate will be updated.
        break;
    }
    // Assemble the OR-combined duplicate-detection conditions: a match on
    // any enabled criterion marks an existing node as a duplicate.
    $condition_exists = FALSE;
    $or_condition = db_or();
    foreach (variable_get('biblio_advanced_import_detect_duplicate_strategy', array(
      'md5' => 'md5',
    )) as $field) {
      switch ((string) $field) {
        case 'md5':
          // Hash over the configured duplicate-criteria fields.
          $or_condition
            ->condition('b.biblio_md5', biblio_advanced_import_hash($node));
          $condition_exists = TRUE;
          break;

        case 'isbn':
        case 'issn':
        case 'doi':
          // Identifier fields only participate in detection when the
          // incoming node actually carries a value for them.
          $field_property = 'biblio_' . $field;
          if (!empty($node->{$field_property})) {
            $or_condition
              ->condition('b.' . $field_property, $node->{$field_property});
            $condition_exists = TRUE;
          }
          break;

        case 'pubmed':
          // PubMed IDs live in a separate table provided by biblio_pm.
          if (module_exists('biblio_pm') && !empty($node->biblio_pubmed_id)) {
            $query
              ->innerJoin('biblio_pubmed', 'bp', 'b.nid = bp.nid');
            $or_condition
              ->condition('bp.biblio_pubmed_id', $node->biblio_pubmed_id);
            $condition_exists = TRUE;
          }
          break;
      }
    }
    if ($condition_exists) {
      $query
        ->condition($or_condition);
      $result = $query
        ->execute();
      $is_first_duplicate = TRUE;
      // Snapshot the incoming values: $node itself may be overwritten
      // below when it is replaced by an existing duplicate.
      $node_new = (array) $node;
      while ($row = $result
        ->fetchObject()) {
        // There are duplicates:
        $node_old = node_load($row->nid);
        // We need to set this or the node module will throw notices
        // (if this node becomes the one to be really saved instead of the
        // original one).
        $node_old->is_new = FALSE;
        if (!$skip) {
          // Update an existing biblio node with new data.
          if ($verbose_messages) {
            drupal_set_message(t('Updating node %node_title (node %nid)', array(
              '%nid' => $node_old->nid,
              '%node_title' => $node_old->title,
            )));
          }
          $merge = FALSE;
          // Merge only biblio_* properties (except the citekey),
          // contributors and the title, per the configured merge strategy.
          foreach ($node_new as $key => $value) {
            // NOTE: && binds tighter than ||, so this reads as
            // (biblio_* && not citekey) || contributors* || title.
            if (strpos($key, 'biblio') === 0 && 'biblio_citekey' != $key || strpos($key, 'contributors') === 0 || 'title' == $key) {
              $strategy = variable_get('biblio_advanced_import_merge_strategy', 'override');
              // Each || term below has the form
              // ('strategy name' == $strategy && its extra conditions).
              if ('override' == $strategy || 'override but keep additional' == $strategy && !empty($value) || 'add new' == $strategy && !empty($value) && empty($node_old->{$key}) || 'override existing non empty' == $strategy && !empty($node_old->{$key}) || 'override existing non empty with non empty' == $strategy && !empty($value) && !empty($node_old->{$key})) {
                if (!property_exists($node_old, $key) || $node_old->{$key} != $value) {
                  $node_old->{$key} = $value;
                  // Remember that something actually changed, so a new
                  // revision is only created when needed.
                  $merge = TRUE;
                }
              }
            }
          }
          if ($revision && $merge) {
            $node_old->revision = TRUE;
            $node_old->log = t('New revision created automatically by Biblio Advanced Import.');
          }
        }
        else {
          // There's no way to stop an already running node_save()
          // in a safe way without breaking the batch process.
          // So we use a little trick to implement the 'skip import':
          // We replace the current node to be saved with the
          // unmodified first duplicate and let drupal save that one instead.
          if ($verbose_messages) {
            drupal_set_message(t('Skipping update of node %node_title (node %nid)', array(
              '%nid' => $node_old->nid,
              '%node_title' => $node_old->title,
            )));
          }
        }
        if ($is_first_duplicate) {
          // The content of the node being saved gets replaced with the
          // values from the first duplicate node (replacing the whole
          // object with the loaded node did not seem to work, so we do it
          // property by property ...).
          $is_first_duplicate = FALSE;
          // Clear existing object.
          foreach (get_object_vars($node) as $key => $value) {
            unset($node->{$key});
          }
          // Copy values over.
          foreach (get_object_vars($node_old) as $key => $value) {
            $node->{$key} = $value;
          }
        }
        else {
          // Save any other existing duplicates, with values updated.
          node_save($node_old);
        }
      }
    }
  }
}
/**
 * Implements hook_node_insert().
 *
 * Persists the duplicate-detection hash right after a biblio node has
 * been created.
 */
function biblio_advanced_import_node_insert($node) {
  if ($node->type == 'biblio') {
    biblio_advanced_import_update_hash($node);
  }
}
/**
 * Implements hook_node_update().
 *
 * Keeps the duplicate-detection hash in sync whenever a biblio node is
 * saved again.
 */
function biblio_advanced_import_node_update($node) {
  if ($node->type == 'biblio') {
    biblio_advanced_import_update_hash($node);
  }
}
/**
 * Computes the duplicate-detection hash for a biblio node.
 *
 * The hash covers the fields selected in the module settings (the
 * "duplicate criteria"); by default the title and the publication year.
 *
 * @see biblio_advanced_import_settings_form()
 *
 * @param $node
 *   A biblio node object.
 *
 * @return
 *   An MD5 hash over the normalized criteria values.
 */
function biblio_advanced_import_hash($node) {
  $criteria = variable_get('biblio_advanced_import_duplicate_criteria', array(
    'title' => 'title',
    'biblio_year' => 'biblio_year',
  ));
  $hash_source = '';
  foreach ($criteria as $field) {
    if (!$field) {
      continue;
    }
    $value = isset($node->{$field}) ? $node->{$field} : '';
    // biblio stores an empty year as the magic value 9999, which is
    // rendered as the translated string "Submitted" on load. Normalize all
    // of these representations to 9999 so hashes stay stable across
    // save/load cycles and across configuration-triggered rehashes.
    if ('biblio_year' == $field && (empty($value) || $value == t('Submitted'))) {
      $value = 9999;
    }
    if ($value) {
      if (is_array($value) || is_object($value)) {
        $hash_source .= serialize($value);
      }
      else {
        // Cap the length, lowercase and strip all whitespace so cosmetic
        // differences do not defeat duplicate detection.
        $hash_source .= preg_replace("/\\s+/", '', mb_strtolower(mb_substr($value, 0, 256)));
      }
    }
  }
  return md5(strtolower($hash_source));
}
/**
 * Recomputes and persists the duplicate-detection hash of a biblio node.
 *
 * Updates both the in-memory node object and the matching row in the
 * {biblio} table.
 *
 * @see biblio_advanced_import_settings_form()
 *
 * @param $node
 *   A biblio node object; must carry nid and vid.
 */
function biblio_advanced_import_update_hash(&$node) {
  $hash = biblio_advanced_import_hash($node);
  $node->biblio_md5 = $hash;
  db_update('biblio')
    ->fields(array('biblio_md5' => $hash))
    ->condition('nid', $node->nid)
    ->condition('vid', $node->vid)
    ->execute();
}
/**
 * Builds a citekey for a biblio node according to the module settings.
 *
 * With the 'fields' strategy the citekey is assembled from a configurable
 * list of node fields joined by "|", optionally prefixed. If the resulting
 * key already exists in the {biblio} table, the duplicate-citekey strategy
 * decides whether to drop it ('skip') or to append a module-wide counter.
 *
 * @see biblio_advanced_import_settings_form()
 * @see biblio_advanced_import_settings_form_submit()
 *
 * @param $node
 *   A biblio node object.
 *
 * @return
 *   The generated citekey, or an empty string if none should be set.
 */
function biblio_advanced_import_create_citekey($node) {
  $citekey = '';
  if ('fields' == variable_get('biblio_advanced_import_citekey_creation_strategy', 'biblio')) {
    $parts = array();
    $prefix = variable_get('biblio_citekey_prefix', '');
    if (!empty($prefix)) {
      $parts[] = $prefix;
    }
    $fields = variable_get('biblio_advanced_import_citekey_creation', array(
      'title' => 'title',
      'biblio_year' => 'biblio_year',
    ));
    foreach ($fields as $field) {
      if (!empty($field) && !empty($node->{$field})) {
        $parts[] = $node->{$field};
      }
    }
    // biblio stores the citekey as varchar(255); longer values would raise
    // a PDO exception on save, so truncate up front.
    $citekey = mb_substr(implode('|', $parts), 0, 255);
    // Drop any trailing pipe symbols left over from truncation, then trim.
    $citekey = trim(preg_replace('@\\|+$@', '', $citekey));
  }
  if ($citekey) {
    $exists = db_query("SELECT 1 FROM {biblio} WHERE biblio_citekey = :biblio_citekey", array(
      ':biblio_citekey' => $citekey,
    ))->fetchField();
    if ($exists) {
      $strategy = variable_get('biblio_advanced_import_duplicate_citekey_strategy', 'skip');
      if ('skip' == $strategy) {
        // Return nothing so the colliding citekey is not applied.
        $citekey = '';
      }
      elseif ('append counter' == $strategy) {
        // A persistent module-wide counter guarantees uniqueness.
        $counter = variable_get('biblio_advanced_import_citekey_creation_counter', 0) + 1;
        variable_set('biblio_advanced_import_citekey_creation_counter', $counter);
        // Truncate so that key + '|' + counter still fits varchar(255).
        $citekey = mb_substr($citekey, 0, 254 - strlen($counter)) . '|' . $counter;
      }
    }
  }
  return $citekey;
}
/**
 * Implements hook_form_FORM_ID_alter() for biblio_admin_settings.
 *
 * When this module generates citekeys from configured fields, the
 * corresponding citekey settings of biblio core are locked: their form
 * elements are converted to 'value' elements carrying the current
 * defaults, so users cannot change them in the core settings form.
 */
function biblio_advanced_import_form_biblio_admin_settings_alter(&$form, &$form_state) {
  if (variable_get('biblio_advanced_import_citekey_creation_strategy', 'biblio') == 'fields') {
    $locked = array(
      'biblio_citekey_field1',
      'biblio_citekey_field2',
      'biblio_citekey_phpcode',
    );
    foreach ($locked as $name) {
      $form['citekey'][$name]['#type'] = 'value';
      $form['citekey'][$name]['#value'] = $form['citekey'][$name]['#default_value'];
    }
  }
}
/**
 * Optional data cleanup / normalization for imported biblio nodes.
 *
 * Each workaround is activated independently on the "Advanced Import"
 * settings tab ('as is' disables it):
 * - ISSN: copy from the ISBN field (RIS does not distinguish the two)
 *   and/or normalize to the 8-character form, validating the check digit.
 * - ISBN: normalize any valid ISBN-10/13 or GTIN-14, dropping bad values.
 * - DOI: reduce to the first syntactically valid DOI.
 * - Title: strip one pair of enclosing curly braces (Mendeley BibTeX).
 * - URL: keep only the first http(s) URL (RIS concatenates URL lists).
 *
 * @param $node
 *   A biblio node object, modified in place.
 */
function biblio_advanced_import_pitfall_workarounds(&$node) {
  switch (variable_get('biblio_advanced_import_fix_issn', 'as is')) {
    case 'as is':
      break;

    case 'normalize from isbn':
      // RIS format does not distinguish between ISBN and ISSN, so serials
      // may arrive with their ISSN stored in the ISBN field.
      // Bug fix: the original condition used "||", which overwrote an
      // already-present ISSN whenever an ISBN existed and could copy an
      // undefined ISBN value. Only copy when the ISSN is missing and an
      // ISBN is actually available.
      if (empty($node->biblio_issn) && !empty($node->biblio_isbn)) {
        $node->biblio_issn = $node->biblio_isbn;
      }
      // Intentional fall-through: validate the (possibly copied) ISSN.

    case 'normalize':
      // @see http://en.wikipedia.org/wiki/International_Standard_Serial_Number
      if (!empty($node->biblio_issn)) {
        if (preg_match("@\\b([0-9]{4})-?([0-9X]{4})\\b@i", $node->biblio_issn, $matches)) {
          $issn = strtoupper($matches[1] . $matches[2]);
          // Verify the check digit (position 8): weighted sum of the first
          // seven digits with weights 8..2, modulo 11.
          $sum = 0;
          for ($i = 0; $i < 7; $i++) {
            $sum += $issn[$i] * (8 - $i);
          }
          // Bug fix: without the final "% 11", a sum divisible by 11
          // produced a checksum of 11 instead of 0, rejecting valid ISSNs
          // whose check digit is '0'.
          $checksum = (11 - $sum % 11) % 11;
          if ($checksum == $issn[7] || 10 == $checksum && 'X' == $issn[7]) {
            $node->biblio_issn = $issn;
          }
          else {
            unset($node->biblio_issn);
          }
        }
        else {
          unset($node->biblio_issn);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_isbn', 'as is')) {
    case 'as is':
      break;

    case 'remove':
      // Normalize any valid ISBN-10/13 or GTIN-14 to its GTIN-14 form.
      // @see http://en.wikipedia.org/wiki/International_Standard_Book_Number
      if (!empty($node->biblio_isbn)) {
        // Bug fix: this branch used $matches[0] without running its own
        // preg_match(), so it read stale data from the ISSN handling above
        // (or an undefined variable). Extract the candidate here, mirroring
        // the 'convert 13' branch below.
        if (preg_match("@[0-9\\-]{10,}@", $node->biblio_isbn, $matches)) {
          module_load_include('class.php', 'biblio_advanced_import', 'lib/isbntest');
          $currISBN = new ISBNtest();
          $currISBN->set_isbn($matches[0]);
          if ($currISBN->valid_isbn10() || $currISBN->valid_isbn13() || $currISBN->valid_gtin14()) {
            $node->biblio_isbn = $currISBN->get_gtin14();
          }
          else {
            unset($node->biblio_isbn);
          }
        }
        else {
          unset($node->biblio_isbn);
        }
      }
      break;

    case 'convert 13':
      // Prefer the ISBN-13 form; keep valid GTIN-14s as GTIN-14.
      // @see http://en.wikipedia.org/wiki/International_Standard_Book_Number
      if (!empty($node->biblio_isbn)) {
        if (preg_match("@[0-9\\-]{10,}@", $node->biblio_isbn, $matches)) {
          module_load_include('class.php', 'biblio_advanced_import', 'lib/isbntest');
          $currISBN = new ISBNtest();
          $currISBN->set_isbn($matches[0]);
          if ($currISBN->valid_isbn13()) {
            $node->biblio_isbn = $currISBN->get_isbn13();
          }
          elseif ($currISBN->valid_gtin14()) {
            $node->biblio_isbn = $currISBN->get_gtin14();
          }
          else {
            unset($node->biblio_isbn);
          }
        }
        else {
          unset($node->biblio_isbn);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_doi', 'as is')) {
    case 'as is':
      break;

    case 'one valid':
      // Keep only the first syntactically valid DOI, dropping surrounding
      // prefixes ("doi:") and trailing text.
      // @see http://en.wikipedia.org/wiki/Digital_object_identifier
      if (!empty($node->biblio_doi)) {
        if (preg_match("@10\\.\\d{4,}/[^\\s]+@i", $node->biblio_doi, $matches)) {
          $node->biblio_doi = $matches[0];
        }
        else {
          unset($node->biblio_doi);
        }
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_title', 'as is')) {
    case 'as is':
      break;

    case 'mendeley bibtex':
      if (!empty($node->title)) {
        // Strip off enclosing curly braces, but only a matching pair.
        $node->title = preg_replace('@^\\{(.*)\\}$@', '$1', $node->title);
      }
      break;
  }

  switch (variable_get('biblio_advanced_import_fix_url', 'as is')) {
    case 'as is':
      break;

    case 'one valid':
      if (!empty($node->biblio_url)) {
        if (preg_match("@(http|https)://[^\\s]+@i", $node->biblio_url, $matches)) {
          // RIS import runs together lists of URLs without a delimiter;
          // lowercase the scheme, split on it and keep only the first URL.
          $urls = explode('http', str_replace(array(
            'HTTP:',
            'HTTPS:',
          ), array(
            'http:',
            'https:',
          ), $matches[0]));
          $node->biblio_url = 'http' . $urls[1];
        }
        else {
          unset($node->biblio_url);
        }
      }
      break;
  }
}
Functions
Name | Description
---|---
biblio_advanced_import_create_citekey | Helper function to create a configurable biblio node citekey. |
biblio_advanced_import_form_biblio_admin_settings_alter | @todo Please document this function. |
biblio_advanced_import_hash | Helper function to create a hash from a biblio node depending on a configurable duplicate detection strategy. |
biblio_advanced_import_menu | Implements hook_menu(). |
biblio_advanced_import_node_insert | Implements hook_node_insert(). |
biblio_advanced_import_node_presave | Implements hook_node_presave(). |
biblio_advanced_import_node_update | Implements hook_node_update(). |
biblio_advanced_import_pitfall_workarounds | This function implements some optional data cleanup / normalization that can be activated on the "advanced import" tab. |
biblio_advanced_import_update_hash | Helper function to update the hash of a biblio node. |