You are here

function taxonomy_xml_rdf_parse in Taxonomy import/export via XML 7

Same name and namespace in other branches
  1. 5.2 rdf_format.inc \taxonomy_xml_rdf_parse()
  2. 5 rdf_format.inc \taxonomy_xml_rdf_parse()
  3. 6.2 rdf_format.inc \taxonomy_xml_rdf_parse()
  4. 6 rdf_format.inc \taxonomy_xml_rdf_parse()

Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.

See formats.html readme for information about the RDF input supported.

Targets include: ICRA Content Rating (http://www.icra.org/vocabulary/), WordNet Lexicon (http://wordnet.princeton.edu/), SUMO (http://www.ontologyportal.org/), and Freebase

... and the ontologies found at http://www.schemaweb.info/ that implement appropriate parts of the RDF Schema "rdfs" (eg Classes with subclassOf)

This function takes care of parsing the RDF syntax into attributes (predicates). Actual term creation and logic is done by taxonomy_xml.module, mostly in taxonomy_xml_rdf_make_term() and taxonomy_xml_canonisize_predicates().

Parameters

string $data: the string containing XML/RDF

int $vid: Vocab ID. May be modified by ref if this process creates a new vocab to use.

string $url: optional source URL this RDF came from if needed to resolve GUIDs etc. Cannot work for uploads.

Return value

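A list of resulting terms. FALSE if no vocabulary could be determined; NULL if the requested anchor was not found or no resources could be parsed from the input.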

File

formats/rdf_format.inc, line 131

Code

/**
 * Reads RDF taxonomies and vocabularies, creating vocabs and terms as needed.
 *
 * This function only sorts the parsed RDF resources into vocabularies and
 * terms and drives the import; the per-term work is delegated to
 * taxonomy_xml_set_term_guid(), taxonomy_xml_rdf_make_term() and
 * taxonomy_xml_set_term_relations().
 *
 * @param string $data
 *   The string containing XML/RDF. Taken by reference; this function itself
 *   only hands it on to taxonomy_xml_rdf_parse_data_into_index().
 * @param int $vid
 *   Vocab ID. Taken by reference: when it equals
 *   TAXONOMY_XML_DETERMINED_BY_SOURCE_FILE it is overwritten with the value
 *   returned by taxonomy_xml_absorb_vocabulary_definitions().
 * @param string $url
 *   Optional source URL this RDF came from. If it carries a '#fragment', only
 *   the statements indexed under that exact URL are imported.
 *
 * @return array|false|null
 *   The term objects that were processed, keyed by their source identifier.
 *   FALSE if no target vocabulary could be determined. NULL if the requested
 *   anchor was not present in the document or if no resources at all were
 *   parsed out of the input.
 */
function taxonomy_xml_rdf_parse(&$data, &$vid, $url = NULL) {

  // See if it's really a different file we need to parse.
  // Split explicitly rather than suppressing the notice that list() raises
  // when the URL has no '#fragment'; $anchor is NULL in that case.
  $url_parts = explode('#', (string) $url);
  $resource_url = $url_parts[0];
  $anchor = isset($url_parts[1]) ? $url_parts[1] : NULL;
  $index = taxonomy_xml_rdf_parse_data_into_index($data, $url);

  // If a specific ID was defined in the file, this means we just need to load
  // that one. This will help break things up for batches, and also allow us to
  // grab only sub-trees from big files.
  if (!empty($anchor)) {
    watchdog('taxonomy_xml', "\n      We were only asked about #%anchor in this document.\n      Reducing the data down to statements about that.", array(
      '%anchor' => $anchor,
    ), WATCHDOG_DEBUG);

    // Test for the entry BEFORE narrowing the index. Once the index has been
    // rebuilt as a one-element array it can never be empty, so checking
    // afterwards would never detect a missing anchor.
    if (empty($index[$url])) {
      watchdog('taxonomy_xml', "Found no information about  %anchor in the document !resource_url", array(
        '%anchor' => $anchor,
        '!resource_url' => l($resource_url, $resource_url),
      ), WATCHDOG_WARNING);
      return NULL;
    }
    $index = array(
      $url => $index[$url],
    );
  }
  $resources_by_type = taxonomy_xml_convert_index_to_sorted_objects($index);

  // The resources are all initialized as data objects.
  // The predicates have NOT been flattened yet.
  // Resource types we expect to be dealing with are just vocabs and terms.
  // Debug only:
  if (!$anchor) {

    // Message is just noise if using anchors.
    watchdog('taxonomy_xml', "\n      Found %count different <strong>kinds</strong> of resources\n      in the named input : %types\n      ", array(
      '%count' => count($resources_by_type),
      '%types' => implode(', ', array_keys($resources_by_type)),
    ), WATCHDOG_INFO);
  }

  // Debug only:
  if (!empty($resources_by_type[TAXONOMY_XML_UNTYPED])) {

    // Just FYI, make a note about the quality of data found.
    // Do not complain about URLs - this is quite normal.
    watchdog('taxonomy_xml', "\n      Found %count Unsorted (untyped) resources.\n      An untyped entity is the subject of a statement,\n      but I don't know what <em>type</em> of thing they are.\n      Not sure what I'll do with these.\n      They are just things that have had statements made about them ..\n      that I don't recognise.\n      Probably just extra data found in the input and ignored.\n      <br/>ID was: %unknown", array(
      '%count' => count($resources_by_type[TAXONOMY_XML_UNTYPED]),
      '%unknown' => join(', ', array_keys($resources_by_type[TAXONOMY_XML_UNTYPED])),
    ), WATCHDOG_DEBUG);
  }

  // Nothing at all was parsed: log it and give up (returns NULL).
  if (count($resources_by_type) == 0) {
    watchdog('taxonomy_xml', "\n      It sure doesn't look like this is any useful sort of RDF source.\n      Zero resource entities were parsed out of it.\n      Probably need to do content-negotiation or something,\n      and check the validity of the file. Aborting.", array(
      '%url' => '',
    ), WATCHDOG_ERROR);
    return;
  }

  // Almost ready to build.
  // Prepare destination VOCAB.
  $vocabulary_types = taxonomy_xml_rdf_vocabulary_types();
  if ($vid == TAXONOMY_XML_DETERMINED_BY_SOURCE_FILE) {

    // No vid was pre-selected, so look for vocabulary definitions in the
    // file. (When a vid has already been set, the else branch below is taken
    // and vocab definitions found in the file are ignored.)
    // Scan the sorted objects for vocabulary definitions.
    // Hopefully there's only one vocab per file, but loop anyway.
    $vocabularies = array();
    foreach ($vocabulary_types as $vocabulary_type) {
      if (isset($resources_by_type[$vocabulary_type]) && is_array($resources_by_type[$vocabulary_type])) {
        foreach ($resources_by_type[$vocabulary_type] as $guid => &$vocabulary_handle) {
          $vocabularies[$guid] =& $vocabulary_handle;
        }
        // Drop the loop alias so a later write to the variable name cannot
        // clobber the last element. The entries in $vocabularies stay bound.
        unset($vocabulary_handle);
      }
    }
    drupal_set_message(t("Found %count resources to be used as vocabulary definitions", array(
      '%count' => count($vocabularies),
    )));
    if (!$vocabularies) {

      // Create a placeholder.
      $vocabularies[] = (object) array(
        'name' => 'Imported Vocabulary',
      );
    }
    $vid = taxonomy_xml_absorb_vocabulary_definitions($vocabularies);

    // $vocabularies now contains a keyed array of target vocabularies
    // the terms may be put into.
    // $vid is the default one (most common is one vocab per input file)
    // to be used unless otherwise defined per-term.
    if (empty($vid)) {
      drupal_set_message(t("No vocabulary to add terms to, aborting."), 'error');
      return FALSE;
    }
  }
  else {

    // Else using a form-selected vocab.
    $vocabularies[$vid] = taxonomy_vocabulary_load($vid);
  }

  // VOCAB set up, start on TERMS...
  // Note that when 'identifier' is used as a key here, it means the identifier
  // according to the source document - usually a URI.
  // A term identifier is a string distinct from the local term id.
  // Gather the resources that will become terms.
  // Slightly long way (not using array_merge),
  // as I need to merge indexed and by reference.
  $terms = array();
  $term_types = taxonomy_xml_rdf_term_types();
  foreach ($term_types as $term_type) {
    if (isset($resources_by_type[$term_type]) && is_array($resources_by_type[$term_type])) {
      foreach ($resources_by_type[$term_type] as $guid => &$term_handle) {
        $terms[$guid] =& $term_handle;
      }
      // Release the loop alias; $terms keeps its own reference bindings.
      unset($term_handle);
    }
  }

  // Some of the RDF documents I've been fed DO NOT DEFINE A TYPE
  // for their primary subject.
  // Neither
  // http://www.ubio.org/authority/metadata.php nor
  // http://biocol.org/ nor
  // http://lsid.tdwg.org/
  // return RDF that says WHAT the data is.
  // Those that use LSIDs have a type encoded in the Identifier itself :-/
  // I end up with a collection of data but no idea what it's really
  // talking about.
  // But IF an entity is rdf:about="THIS URL" then we will take a leap
  // and assume that is our target lump of data.
  // ... this worked for biocol input.
  $untyped_resources = isset($resources_by_type[TAXONOMY_XML_UNTYPED]) ? (array) $resources_by_type[TAXONOMY_XML_UNTYPED] : array();
  foreach ($untyped_resources as $identifier => $untyped_lump) {
    if ($identifier == $url) {

      // Looks like this was the specific thing we were looking for.
      watchdog('taxonomy_xml', "Trying to import an <em>untyped</em> data object in the hopes that it is the term we asked for. This may be incorrect, but it's all the document gave us. We asked, and got: '%identifier' .", array(
        '%identifier' => $identifier,
      ), WATCHDOG_NOTICE);
      watchdog('taxonomy_xml', "Untyped data object (possibly wrong) '%identifier' = <pre>%data</pre> .", array(
        '%identifier' => $identifier,
        '%data' => print_r($untyped_lump, 1),
      ), WATCHDOG_DEBUG);
      $terms[$identifier] = $untyped_lump;
    }
  }

  // Special case for Freebase.
  taxonomy_xml_rdf_process_freebase_vocab($resources_by_type, $vid);

  // Freebase. Sub-terms are listed, but point to the parent, not vice-versa.
  // NOTE(review): the comment above says Freebase but the call below is the
  // dbpedia handler - confirm which source this remark describes.
  taxonomy_xml_rdf_process_dbpedia($resources_by_type, $terms);
  if (!$anchor) {

    // Shh. (Stay quiet when only a single anchored resource was requested.)
    drupal_set_message(t("Found %count resources to be imported as terms into vocabulary %vid", array(
      '%count' => count($terms),
      '%vid' => $vid,
    )));
  }

  //
  // START MAKING TERMS
  //
  foreach ($terms as $guid => &$term) {
    if (empty($term)) {
      watchdog('taxonomy_xml', "An empty term '%guid' was in the array of terms to create. This should not have happened, fix the input upstream. Ignoring.", array(
        '%guid' => $guid,
      ), WATCHDOG_NOTICE);
      continue;
    }
    if (!isset($term->vid)) {

      // This is just a default fallback.
      // Imported terms should really have already chosen their vid.
      $term->vid = $vid;
    }
    taxonomy_xml_set_term_guid($term, $guid);
    taxonomy_xml_rdf_make_term($term);
  }
  // Break the reference left behind by the by-reference loop above.
  unset($term);

  // Now the terms are all happily created, create their relationships
  // Couldn't do so until they had all been given tids.
  taxonomy_xml_set_term_relations($terms);

  // Note this will not yet affect terms that have been queued
  // for later processing.
  // Such terms will need to attach themselves to the parent terms themselves.
  return $terms;
}