You are here

function taxonomy_xml_rdf_parse in Taxonomy import/export via XML 6

Same name and namespace in other branches
  1. 5.2 rdf_format.inc \taxonomy_xml_rdf_parse()
  2. 5 rdf_format.inc \taxonomy_xml_rdf_parse()
  3. 6.2 rdf_format.inc \taxonomy_xml_rdf_parse()
  4. 7 formats/rdf_format.inc \taxonomy_xml_rdf_parse()

Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.

See formats.html readme for information about the RDF input supported.

Targets include: ICRA Content Rating http://www.icra.org/vocabulary/, WordNet Lexicon http://wordnet.princeton.edu/, SUMO http://www.ontologyportal.org/

... and the ontologies found at http://www.schemaweb.info/ that implement appropriate parts of the RDF Schema "rdfs" (eg Classes with subclassOf)

Parameters

$data the string containing XML/RDF.

$vid int Vocab ID. May be modified by ref if this process creates a new vocab to use.

$url optional source URL this RDF came from, if needed to resolve GUIDs etc. Cannot work for uploads.

File

./rdf_format.inc, line 51
Include routines for RDF parsing and taxonomy/term creation.

Code

/**
 * Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.
 *
 * The input is parsed into triples with the ARC RDF/XML parser, the triples
 * are grouped into resources by rdf type, resources of known "vocabulary"
 * types are absorbed as vocabularies (only when no $vid was supplied), and
 * resources of known "term" types are saved as taxonomy terms. Once every
 * term has been saved, taxonomy_xml_set_term_relations() is called to link
 * them together.
 *
 * @param $data
 *   The string containing XML/RDF.
 * @param $vid
 *   Vocabulary ID. When 0 on entry, the vocabulary definitions found in the
 *   source (or a placeholder) are used and $vid is overwritten by reference
 *   with the value returned by taxonomy_xml_absorb_vocabulary_definitions().
 * @param $url
 *   Optional source URL this RDF came from. An untyped resource whose
 *   identifier equals this URL is imported as a term.
 *
 * @return
 *   Array of term objects keyed by their source identifier, or nothing (NULL)
 *   if the parser did not return an array of triples.
 */
function taxonomy_xml_rdf_parse(&$data, &$vid, $url = NULL) {
  drupal_set_message(t("Parsing RDF"));

  // Use ARC parser
  include_once "arc/ARC_rdfxml_parser.php";
  $parser_args = array(
    "bnode_prefix" => "genid",
    "base" => "",
  );
  $parser = new ARC_rdfxml_parser($parser_args);
  $triples = $parser->parse_data($data);
  if (!is_array($triples)) {
    // On failure the parser result is reported as the error message.
    drupal_set_message(t("Problem parsing input %message", array(
      '%message' => $triples,
    )), 'error');
    return;
  }
  drupal_set_message(t("%count data triples (atomic statements) found in the source RDF doc", array(
    '%count' => count($triples),
  )));

  // The RDF input may come in several flavours,
  // Resources of the following 'types' may be cast into taxonomy terms for our purposes.
  // That is, an rdf:Class is a Drupal:term
  //
  // Add to this list as needed
  //
  $term_types = array(
    TAXONOMY_XML_RDF_NS . 'Property',
    TAXONOMY_XML_DC_NS . 'subject',
    TAXONOMY_XML_RDFS_NS . 'Class',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'Word',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'NounWordSense',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'NounSynset',
    TAXONOMY_XML_CONTENTLABEL_NS . 'Category',
    TAXONOMY_XML_SKOS_NS . 'Concept',
    'urn:lsid:ubio.org:classificationbank',
  );

  // A Drupal 'vocabulary' is represented by an owl:Ontology
  // or other similar shaped constructs
  $vocabulary_types = array(
    TAXONOMY_XML_OWL_NS . 'Ontology',
    TAXONOMY_XML_RDF_NS . 'Description',
    'http://www.w3.org/2001/12/Glossary',
    TAXONOMY_XML_TDWG_NS . 'Collection',
  );
  $resources_by_type = taxonomy_xml_convert_triples_to_sorted_objects($triples);

  // The resources are all initialized as data objects.
  // Resource types we expect to be dealing with are just vocabs and terms.
  drupal_set_message(t("Found %count different <strong>kinds</strong> of resources in the input : %types", array(
    '%count' => count($resources_by_type),
    '%types' => join(', ', array_keys($resources_by_type)),
  )));

  $vocabularies = array();
  if ($vid == 0) {

    // We've been asked to use the vocab described in the source file.
    // If the vid has already been set, we ignore vocab definitions found in the file
    // Scan the sorted objects for vocabulary definitions
    // Hopefully there's only one vocab per file, but loop anyway
    foreach ($vocabulary_types as $vocabulary_type) {
      if (isset($resources_by_type[$vocabulary_type]) && is_array($resources_by_type[$vocabulary_type])) {
        foreach ($resources_by_type[$vocabulary_type] as $uri => &$vocabulary_handle) {
          $vocabularies[$uri] =& $vocabulary_handle;
        }
        // Drop the loop variable so it no longer aliases the last element.
        unset($vocabulary_handle);
      }
    }
    drupal_set_message(t("Found %count resources to be used as vocabulary definitions", array(
      '%count' => count($vocabularies),
    )));
    if (!$vocabularies) {

      // Create a placeholder.
      $vocabularies[] = array(
        'name' => 'Imported Vocabulary',
      );
    }
    $vid = taxonomy_xml_absorb_vocabulary_definitions($vocabularies);

    // $vocabularies now contains a keyed array of target vocabularies the terms may be put into
    // $vid is the default one (most common is one vocab per input file) to be used unless otherwise defined per-term.
  }
  else {

    // Else using a form-selected vocab.
    $vocabularies[$vid] = taxonomy_vocabulary_load($vid);
  }

  // Gather the resources that will become terms.
  // Slightly long way (not using array_merge), as I need to merge indexed and by reference
  $terms = array();
  foreach ($term_types as $term_type) {
    if (isset($resources_by_type[$term_type]) && is_array($resources_by_type[$term_type])) {
      foreach ($resources_by_type[$term_type] as $uri => &$term_handle) {

        // Grab name/label early for debugging and indexing
        $predicates = $term_handle->predicates;
        if (isset($predicates['label'])) {
          $term_handle->name = $predicates['label'][0];
        }
        $terms[$uri] =& $term_handle;
      }
      // Drop the loop variable so it no longer aliases the last element.
      unset($term_handle);
    }
  }

  // Some of the RDF documents I've been fed DO NOT DEFINE A TYPE for their primary subject.
  // Neither
  // http://www.ubio.org/authority/metadata.php nor
  // http://biocol.org/ nor
  // http://lsid.tdwg.org/
  // return RDF that says WHAT the data is. Those that use LSIDs have a type encoded in the Identifier itself :-/
  // I end up with a collection of data but no idea what it's really talking about.
  // But IF an entity is rdf:about="THIS URL" then we will take a leap and assume that is our target lump of data.
  // ... this worked for biocol input
  //
  // The untyped bucket is absent when every resource declared a type, so
  // check for it before reading to avoid an undefined-index notice.
  $untyped_resources = isset($resources_by_type[TAXONOMY_XML_UNTYPED]) ? (array) $resources_by_type[TAXONOMY_XML_UNTYPED] : array();
  foreach ($untyped_resources as $identifier => $untyped_lump) {
    if ($identifier == $url) {

      // Looks like this was the specific thing we were looking for
      $terms[$identifier] = $untyped_lump;
    }
  }
  drupal_set_message(t("Found %count resources to be imported as terms into vocabulary %vid", array(
    '%count' => count($terms),
    '%vid' => $vid,
  )));

  // $predicate_synonyms is a translation array to match rdf-speak with Drupal concepts
  $predicate_synonyms = taxonomy_xml_relationship_synonyms();

  //
  // START MAKING TERMS
  //
  foreach ($terms as $identifier => &$term) {
    if (!isset($term->vid)) {

      // This is just a default fallback. Imported terms should really have already chosen their vid.
      $term->vid = $vid;
    }

    // When running in batch, children will have a hard time finding their
    // parents if they only know them by source-localized ID (probably a URI)
    // and the destination-taxonomy (here) HASN'T REMEMBERED THAT INFO.
    // Because taxonomy.module just doesn't.
    // We require some other module (taxonomy_enhancer is good) to save that
    // metadata for us so the child can find its target later.
    // This is our 'identifier' - the REMOTE identifier not the local one.
    if (!isset($term->uri)) {
      $term->uri = $identifier;
    }

    // Build term from data
    // Convert all input predicates into attributes on the object
    // the taxonomy.module will understand
    taxonomy_xml_canonicize_predicates($term);

    // Ensure name is valid. empty() treats an unset name like a blank one
    // without raising an undefined-property notice.
    if (empty($term->name)) {

      // Look, if we don't even have a name, creating a term is a waste of time.
      // RDF feeds commonly consist of a bunch of pointers, we can't invent placeholders until we know a little more.
      // Let's not do this. (A previous fallback derived a name from
      // basename($identifier); it is deliberately not used.)
      unset($terms[$identifier]);
      continue;
    }

    // See if a definition already exists in the DB. Build on that.
    // NOTE(review): this looks up in the default $vid while the re-retrieval
    // after saving uses $term->vid - confirm whether a per-term vid should be
    // used here too.
    $existing_term = _taxonomy_xml_get_term_placeholder($term->name, $vid);

    // Merge the old term objects properties into this one. Really just want its tid, but there may be more info I should not lose.
    // New input takes precedence over older data.
    // Skip falsy results: casting a scalar FALSE to array would yield
    // array(0 => FALSE) and add a junk property to the term.
    if ($existing_term) {
      foreach ((array) $existing_term as $key => $value) {
        if (!isset($term->{$key})) {
          $term->{$key} = $value;
        }
      }
    }

    // The term object is now as tidy as it can be as a self-contained entity.
    if (variable_get('taxonomy_xml_reuseids', FALSE)) {

      // MAINTAIN IDS
      // Because this is likely to be used with a site-cloning set-up, it would help if we tried to match IDs
      // OTOH, doing so could be very messy for other situations.
      // So,
      //  iff there is no pre-existing term with this id,
      //  create this one as a clone with the old ID.
      // This requires a little DB sneakiness.
      if (!empty($term->internal_id) && !taxonomy_get_term($term->internal_id)) {
        $term->tid = $term->internal_id;
        drupal_set_message(t("Doing sneaky import of %term_name re-using the internal id = %term_id", array(
          '%term_name' => $term->name,
          '%term_id' => $term->internal_id,
        )));

        // Description and weight are optional in the source; fall back to the
        // values the placeholders would have produced for an unset property.
        $insert_description = isset($term->description) ? $term->description : '';
        $insert_weight = isset($term->weight) ? $term->weight : 0;
        db_query("INSERT INTO {term_data} (tid, name, description, vid, weight) VALUES (%d, '%s', '%s', %d, %d)", $term->tid, $term->name, $insert_description, $term->vid, $insert_weight);

        // sequences is gone in D6. Will inserting beyond the auto-increment self-correct?
        $current_id = db_last_insert_id('term_data', 'tid');
        if ($current_id < $term->tid) {

          // This is probably now MYSQL specific.
          db_query("ALTER TABLE {term_data} AUTO_INCREMENT = %d;", $term->tid);
        }
      }
    }

    // Here's where last-minute data storage done by other modules gets set up
    module_invoke_all('taxonomy_term_presave', $term);

    // taxonomy_save_term() works on an array, so hand it a copy of the object.
    $save_term = (array) $term;
    $status = taxonomy_save_term($save_term);

    // Re-retrieve the new term definition, just in case anything extra happened to it during processing
    $new_term = taxonomy_xml_get_term_by_name_from_vocab($term->name, $term->vid);
    if (!$new_term) {
      drupal_set_message(t("It seems like we failed to create and retrieve a term called %term_name", array(
        '%term_name' => $term->name,
      )), 'error');
    }
    else {

      // Merge retrieved values back over our main definition so the handles are up-to-date
      foreach ((array) $new_term as $key => $value) {
        $term->{$key} = $value;
      }
    }
    if ($status == SAVED_NEW) {

      // Just remember this is fresh - for useful feedback messages.
      $term->taxonomy_xml_new_term = TRUE;
    }

    // It's possible that not all the referenced items were available in the current document/loop
    // Add referred items to the import queue for later processing
    taxonomy_xml_add_all_children_to_queue($term);

    // A flag to avoid double-processing
    $term->taxonomy_xml_presaved = TRUE;
  }
  // end term-construction loop. Release the by-reference loop variable so
  // later code cannot accidentally write through it into $terms.
  unset($term);

  // Now the terms are all happily created, create their relationships
  // Couldn't do so until they had all been given tids.
  taxonomy_xml_set_term_relations($terms);

  return $terms;
}