You are here

csv_format.inc in Taxonomy import/export via XML 7

Include routines for CSV parsing and taxonomy/term creation.

File

formats/csv_format.inc
View source
<?php

/* double-commented to avoid conflict with svn
 */

/**
 * @file
 *   Include routines for CSV parsing and taxonomy/term creation.
 */

/**
 * Scan the input CSV file and create a taxonomy structure out of it.
 *
 * See the sample files for the expected format of the CSV.
 *
 * This scan process takes many rows of discrete 'statements' (subject,
 * predicate, object) and combines them into one interleaved description of
 * many dependent terms. It does this in three passes:
 * - The first to collect and enumerate the terms being used.
 * - The second to retrieve or create the terms.
 * - The third to link the dependencies together.
 *
 * A row holding a single value is treated as a plain term name. Rows with two
 * values, comment rows and rows with an empty subject are ignored.
 *
 * The wording used in the source CSV may vary depending on your sources, add
 * extra terminology to the provided taxonomy_xml_relationship_synonyms()
 * function to adapt other words.
 *
 * @param $data
 *   The raw CSV text, one statement per line. Taken by reference but only
 *   read here.
 * @param $vid
 *   The ID of the vocabulary to import the terms into.
 *
 * @return
 *   An array of term objects keyed by the subject text used in the CSV, or
 *   NULL when no usable vocabulary was given.
 */
function taxonomy_xml_csv_parse(&$data, $vid) {
  // Unset the global variables before we use them:
  unset($GLOBALS['element'], $GLOBALS['term'], $GLOBALS['tag']);
  $terms = array();
  $new_terms = array();
  $skipped_terms = array();
  if (!$vid) {
    drupal_set_message(t('No vocab to import into. Either make one or choose one.'));
    return;
  }
  $vocabulary = taxonomy_vocabulary_load($vid);
  if (!$vocabulary) {
    // Without this check the import would fail later when reading
    // $vocabulary->vid for every term.
    drupal_set_message(t('Unable to load vocabulary %vid to import into.', array(
      '%vid' => $vid,
    )), 'error');
    return;
  }

  // Map each relationship predicate to its reciprocal, in both directions.
  $inverses = array(
    TAXONOMY_XML_PARENT => TAXONOMY_XML_CHILD,
    TAXONOMY_XML_RELATED => TAXONOMY_XML_RELATED,
  );
  $inverses = array_merge($inverses, array_flip($inverses));
  $rows = explode("\n", $data);
  drupal_set_message(t('%rowcount rows of data', array(
    '%rowcount' => count($rows),
  )));

  // PHASE 1
  //
  // Enumerate all terms and their properties
  // This goes through all the input and sets up an array of placeholders for
  // the terms, before actually creating any.
  $predicate_synonyms = taxonomy_xml_relationship_synonyms();
  foreach ($rows as $row) {
    $triple = csv_string_to_array($row);
    if (!is_array($triple)) {
      // Comment lines (and anything else that did not parse into an array of
      // values) carry no statement. Checking here avoids calling count() on a
      // non-array.
      continue;
    }
    if (count($triple) == 1) {
      // Assume it's just a simple list : "I am"
      $triple = array(
        $triple[0],
        TAXONOMY_XML_NAME,
        $triple[0],
      );
    }
    elseif (count($triple) < 3) {
      // This line contains no triple.
      continue;
    }
    $subject = trim($triple[0], '"');
    $predicate = $original_predicate = trim($triple[1], '"');
    $object = trim($triple[2], "\n\r\"");
    if (!$subject) {
      continue;
    }

    // Translate terminology synonyms to the real predicate, because the
    // source data can be inconsistent.
    if (isset($predicate_synonyms[$predicate])) {
      $predicate = $predicate_synonyms[$predicate];
    }

    // As we are dealing with handles bound by reference into $terms, break the
    // bindings left over from the previous row to avoid inadvertent re-use.
    unset($term);
    unset($other_term);
    $term = isset($terms[$subject]) ? $terms[$subject] : NULL;
    if (!$term) {
      // Start by looking for it. This either creates a term placeholder or
      // gives us a handle on an existing one.
      $term = _taxonomy_xml_get_term_placeholder($subject, $vid);
      $terms[$subject] =& $term;
    }

    // Set its property as an array value. Allow duplicates, we will filter
    // later.
    if (!isset($term->predicates[$predicate]) || !is_array($term->predicates[$predicate])) {
      $term->predicates[$predicate] = array();
    }
    $term->predicates[$predicate][] = $object;

    // Also set up reciprocal links with the opposite term.
    // We use reciprocals because we allow either broader or narrower terms,
    // but don't require both.
    if (isset($inverses[$predicate])) {
      $inverse = $inverses[$predicate];

      // Ensure the other word exists. Fetch it or make a placeholder.
      $other_term = isset($terms[$object]) ? $terms[$object] : NULL;
      if (!$other_term) {
        $other_term = _taxonomy_xml_get_term_placeholder($object, $vid);
        $terms[$object] =& $other_term;
      }

      // Set the inverse property on it, referring back to the current subject.
      if (!isset($other_term->predicates[$inverse]) || !is_array($other_term->predicates[$inverse])) {
        $other_term->predicates[$inverse] = array();
      }
      $other_term->predicates[$inverse][] = $subject;
    }
    else {
      // This predicate has no inverse, it's not a relationship, it's flat data.
      switch ($predicate) {
        case TAXONOMY_XML_NAME:
          $term->name = $object;
          break;

        case TAXONOMY_XML_DESCRIPTION:
          // Multiple descriptions roll up into one big string. The property
          // may not exist yet on a fresh placeholder, hence empty().
          $term->description = !empty($term->description) ? $term->description . "\n" . $object : $object;
          break;

        case TAXONOMY_XML_HAS_SYNONYM:
          // This strong term also uses the weak one as a synonym.
          // Synonyms are just extra text labels.
          $term->synonyms_array[] = $object;
          break;

        case TAXONOMY_XML_SYNONYM_OF:
          // This weak term is just another word for the referred to one.
          // It's not really a full term. Do nothing now, tag the strong term
          // later. It may not exist yet.
          break;

        default:
          // The values come straight from the imported file, so they are
          // passed through '@' placeholders to be escaped before display.
          drupal_set_message(t("Not quite sure what '@original' ('@predicate') in '@row' means. You may add this term to the translation array in the module code to make it become useful.", array(
            '@original' => $original_predicate,
            '@predicate' => $predicate,
            '@row' => $row,
          )));
      }
    }
    $terms[$subject] =& $term;
  }
  // Break the reference bindings so later assignments to these names cannot
  // overwrite entries in $terms.
  unset($term);
  unset($other_term);
  drupal_set_message(t("Processing statements about %count terms", array(
    '%count' => count($terms),
  )));

  // Note the $terms array is all handles, not copies. Changes to them happen
  // everywhere.
  // PHASE 2
  // Ordered all the input, go through and actually add terms to Drupal (if
  // needed).
  //
  // Ensure a definition exists for them, make one if needed, retrieve the id.
  foreach ($terms as $name => $term) {
    // Validate before touching any property of the term.
    if (!is_object($term)) {
      drupal_set_message(t("Having difficulty analyzing term info '@name':@info", array(
        '@name' => $name,
        '@info' => print_r($term, 1),
      )), 'error');

      // Bad data got this far. Ignore.
      continue;
    }
    drupal_set_message(t("Processing term %name (%termname) %tid", array(
      '%name' => $name,
      '%termname' => $term->name,
      '%tid' => isset($term->tid) ? $term->tid : 'new',
    )));
    $term->vid = $vocabulary->vid;

    // If the first pass was indexed on identifier, not name, we would not have
    // retrieved it. Try again.
    if ($loaded_term = taxonomy_xml_get_term_by_name_from_vocab($term->name, $vid)) {
      // Found one by name this time, merge data with it and keep a handle on
      // it.
      foreach ($term as $att => $val) {
        $loaded_term->{$att} = $val;
      }
      $term = $loaded_term;
      $terms[$name] = $term;
    }
    if (empty($term->tid)) {
      if (count($term->predicates) == 1 && isset($term->predicates[TAXONOMY_XML_SYNONYM_OF])) {
        // If a term was only listed to be a synonym, don't really make it.
        drupal_set_message(t("The term %name is just a synonym for %strong_term  - not a true term.", array(
          '%name' => $term->name,
          '%strong_term' => print_r($term->predicates[TAXONOMY_XML_SYNONYM_OF], 1),
        )));

        // Ensure the stronger term knows. The strong term only exists in
        // $terms if some other row mentioned it, so check before writing to
        // it rather than assigning a property on a missing entry.
        foreach ($term->predicates[TAXONOMY_XML_SYNONYM_OF] as $strong_term) {
          if (isset($terms[$strong_term]) && is_object($terms[$strong_term])) {
            $terms[$strong_term]->synonyms_array[] = $term->name;
          }
          else {
            drupal_set_message(t('Cannot attach synonym %name to the unknown term %strong_term.', array(
              '%name' => $term->name,
              '%strong_term' => $strong_term,
            )), 'warning');
          }
        }

        // And now it's attached to its stronger term, we can forget it.
        unset($terms[$name]);
      }
      else {
        // Make new term! Terms without any synonym get an empty string.
        $synonyms = isset($term->synonyms_array) ? (array) $term->synonyms_array : array();
        $term->synonyms = implode("\n", array_unique($synonyms));
        taxonomy_term_save($term);

        // Even though $term was created and possibly modified by reference,
        // it SHOULD still retain all the raw data we had it hold.
        // @todo unit test this
        $new_terms[] = $term->name;
      }
    }
    else {
      // Term already existed. Just make a note.
      $skipped_terms[] = $term->name;
    }
  }

  drupal_set_message(t('Created all %count needed terms, now linking them together.', array(
    '%count' => count($terms),
  )));

  // PHASE 3
  // Third time through, set the related terms and structure,
  // and save again
  taxonomy_xml_set_term_relations($terms);
  if ($new_terms) {
    // Term names come from the imported file; escape them before embedding
    // them in the HTML message.
    drupal_set_message(t('Added term(s)') . ' <i>' . implode(', ', array_map('check_plain', $new_terms)) . '.</i> ');
  }
  else {
    drupal_set_message(t('No new terms added.'));
  }
  if ($skipped_terms) {
    drupal_set_message(t('Did not need to re-create %skipped_count duplicate/existing term(s)', array(
      '%skipped_count' => count($skipped_terms),
    )));
  }
  return $terms;
}

/**
 * Split one line of CSV text into an array of trimmed, unquoted values.
 *
 * Values are separated by commas. A comma inside a double-quoted value does
 * not split it. Each value is whitespace-trimmed and, when it is wrapped in a
 * pair of double quotes, the wrapping quotes are removed.
 *
 * @param $str
 *   A single line of CSV text.
 *
 * @return
 *   An array of values. The array is empty when the line is a comment (its
 *   first character is '#' or ';') or when the line could not be split.
 */
function csv_string_to_array($str) {
  $str = (string) $str;

  // Comment lines hold no values. Return an empty array rather than nothing
  // so that callers can always count() or iterate the result. '#' and ';' are
  // single-byte characters, so looking at the first byte is enough.
  if (isset($str[0]) && ($str[0] === '#' || $str[0] === ';')) {
    return array();
  }

  // Split on every comma that is followed by an even number of double quotes,
  // i.e. every comma that is not inside a quoted value.
  $expr = "/,(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))/";
  $results = preg_split($expr, $str);
  if ($results === FALSE) {
    // The regular expression engine gave up on this line (for example when a
    // backtracking limit is hit); treat it as carrying no values.
    return array();
  }
  $results = array_map('trim', $results);

  // Strip one pair of wrapping double quotes from each value. Fall back to
  // the trimmed values if the replacement itself fails.
  $unquoted = preg_replace("/^\"(.*)\"\$/", "\$1", $results);
  return is_array($unquoted) ? $unquoted : $results;
}

Functions

Namesort descending Description
csv_string_to_array Given a CSV string that may or may not contain quoted values, Split it into an array of values.
taxonomy_xml_csv_parse Scan the input CSV file and create a taxonomy structure out of it.