You are here

function taxonomy_xml_rdf_parse in Taxonomy import/export via XML 6.2

Same name and namespace in other branches
  1. 5.2 rdf_format.inc \taxonomy_xml_rdf_parse()
  2. 5 rdf_format.inc \taxonomy_xml_rdf_parse()
  3. 6 rdf_format.inc \taxonomy_xml_rdf_parse()
  4. 7 formats/rdf_format.inc \taxonomy_xml_rdf_parse()

Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.

See formats.html readme for information about the RDF input supported.

Targets include: ICRA Content Rating (http://www.icra.org/vocabulary/), WordNet Lexicon (http://wordnet.princeton.edu/), SUMO (http://www.ontologyportal.org/), and Freebase

... and the ontologies found at http://www.schemaweb.info/ that implement appropriate parts of the RDF Schema "rdfs" (eg Classes with subclassOf)

This function takes care of the parsing of RDF syntax into attributes (predicates). Actual term creation and logic is done by taxonomy_xml.module, mostly in taxonomy_xml_rdf_make_term() and taxonomy_xml_canonisize_predicates().

Parameters

$data the string containing XML/RDF:

$vid int Vocab ID. May be modified by reference if this process creates a new vocab to use.

$url optional source URL this RDF came from, if needed to resolve GUIDs etc. Cannot work for uploads.

Return value

a list of resulting terms. FALSE on failure.

1 string reference to 'taxonomy_xml_rdf_parse'
taxonomy_xml_rdf_format_info in ./rdf_format.inc
Return information about this format

File

./rdf_format.inc, line 94
Include routines for RDF parsing and taxonomy/term creation. @author dman http://coders.co.nz

Code

/**
 * Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.
 *
 * The RDF is parsed into triples, the triples are grouped into resources by
 * rdf:type, and resources of known 'vocabulary' and 'term' types are handed
 * on to the taxonomy_xml helpers for creation and linking.
 *
 * @param $data
 *   The string containing XML/RDF. Passed by reference.
 * @param $vid
 *   Vocabulary ID. When 0, the vocabulary is derived from the definitions
 *   found in the input (or a placeholder is created) and $vid is overwritten
 *   by reference with the resulting ID.
 * @param $url
 *   Optional source URL this RDF came from. If it carries a '#fragment', only
 *   the statements about that resource are processed.
 *
 * @return
 *   The array of resulting term objects keyed by identifier, or FALSE on
 *   failure.
 */
function taxonomy_xml_rdf_parse(&$data, &$vid, $url = NULL) {

  // See if it's really a different file we need to parse.
  // explode() replaces split(), which no longer exists in PHP 7+. array_pad()
  // guarantees both list() slots exist when the URL has no '#fragment', so no
  // error suppression is needed.
  list($resource_url, $anchor) = array_pad(explode('#', (string) $url), 2, NULL);
  $triples = taxonomy_xml_rdf_parse_data_into_triples($data, $resource_url);
  if (empty($triples)) {
    drupal_set_message(t("No data extracted from input %url.", array(
      '%url' => $resource_url,
    )));
    return FALSE;
  }

  // If a specific ID was defined in the file, this means we just need to load
  // that one. This will help break things up for batches, and also allow us to
  // grab only sub-trees from big files.
  if (!empty($anchor)) {
    watchdog('taxonomy_xml', "\n      We were only asked about #%anchor in this document.\n      Reducing the data down to statements about that.", array(
      '%anchor' => $anchor,
    ), WATCHDOG_DEBUG);
    $triples = taxonomy_xml_rdf_get_statements_about($url, $triples);
    if (empty($triples)) {
      watchdog('taxonomy_xml', "Found no information about  %anchor in the document !resource_url", array(
        '%anchor' => $anchor,
        '!resource_url' => l($resource_url, $resource_url),
      ), WATCHDOG_WARNING);
    }
  }

  // The RDF input may come in several flavours.
  // Resources of the following 'types' may be cast into taxonomy terms for
  // our purposes. That is, an rdf:Class is a Drupal:term.
  //
  // These are the things to look for. Add to this list as needed.
  $term_types = array(
    TAXONOMY_XML_RDF_NS . 'Property',
    TAXONOMY_XML_DC_NS . 'subject',
    TAXONOMY_XML_RDFS_NS . 'Class',
    TAXONOMY_XML_OWL_NS . 'Class',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'Word',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'NounWordSense',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'NounSynset',
    TAXONOMY_XML_CONTENTLABEL_NS . 'Category',
    TAXONOMY_XML_SKOS_NS . 'Concept',
    TAXONOMY_XML_SKOSREF_NS . 'Concept',
    'urn:lsid:ubio.org:classificationbank',
    'http://prismstandard.org/namespaces/2.0/pcv/Descriptor',
    // A freebase core 'topic'.
    // freebase 'topic' is a superclass of useful things like 'music.genre'.
    TAXONOMY_XML_FB_NS . 'common.topic',
    // @see http://www.alexandria.ucsb.edu/gazetteer/FeatureTypes/FTT_metadata.htm
    'http://www.esri.com/metadata/catalog/adl/#PT',
  );

  // A Drupal 'vocabulary' is represented by an owl:Ontology
  // or other similar shaped constructs.
  $vocabulary_types = array(
    TAXONOMY_XML_OWL_NS . 'Ontology',
    TAXONOMY_XML_RDF_NS . 'Description',
    'http://www.w3.org/2001/12/Glossary',
    TAXONOMY_XML_TDWG_NS . 'Collection',
    TAXONOMY_XML_SKOS_NS . 'ConceptScheme',
    TAXONOMY_XML_SKOSREF_NS . 'ConceptScheme',
    // Resources that are of type fb:type_profile are often collections of
    // 'topics', thus they are analogous to our 'vocabulary'.
    TAXONOMY_XML_FB_NS . 'freebase.type_profile',
    TAXONOMY_XML_FB_NS . 'base.ontologies.ontology_class',
  );

  // Group the statements about things together.
  // This will flatten the structure a little, and discards namespaces.
  $resources_by_type = taxonomy_xml_convert_triples_to_sorted_objects($triples);

  // The resources are all initialized as data objects.
  // Resource types we expect to be dealing with are just vocabs and terms.
  if (!$anchor) {

    // Message is just noise if using anchors.
    watchdog('taxonomy_xml', "\n      Found %count different <strong>kinds</strong> of resources\n      in the named input : %types\n      ", array(
      '%count' => count($resources_by_type),
      '%types' => join(', ', array_keys($resources_by_type)),
    ), WATCHDOG_INFO);
  }
  if (count($resources_by_type) == 0) {
    watchdog('taxonomy_xml', "\n      It sure doesn't look like this is any useful sort of RDF source.\n      Probably need to do content-negotiation or something. Aborting.", array(), WATCHDOG_WARNING);

    // Documented contract is FALSE on failure (this used to return NULL).
    return FALSE;
  }

  $vocab_uri = NULL;
  $vocabularies = array();
  if ($vid == 0) {

    // We've been asked to use the vocab described in the source file.
    // If the vid has already been set, we ignore vocab definitions found in
    // the file.
    // Scan the sorted objects for vocabulary definitions.
    // Hopefully there's only one vocab per file, but loop anyway.
    foreach ($vocabulary_types as $vocabulary_type) {
      if (isset($resources_by_type[$vocabulary_type]) && is_array($resources_by_type[$vocabulary_type])) {
        foreach ($resources_by_type[$vocabulary_type] as $vocab_uri => &$vocabulary_handle) {
          $vocabularies[$vocab_uri] =& $vocabulary_handle;
        }

        // Break the loop reference so later code cannot write through it.
        unset($vocabulary_handle);
      }
    }
    drupal_set_message(t("Found %count resources to be used as vocabulary definitions", array(
      '%count' => count($vocabularies),
    )));
    if (!$vocabularies) {

      // Create a placeholder.
      $vocabularies[] = (object) array(
        'name' => 'Imported Vocabulary',
      );
    }
    $vid = taxonomy_xml_absorb_vocabulary_definitions($vocabularies);

    // $vocabularies now contains a keyed array of target vocabularies the
    // terms may be put into.
    // $vid is the default one (most common is one vocab per input file) to be
    // used unless otherwise defined per-term.
    if (empty($vid)) {
      drupal_set_message(t("No vocabulary to add terms to, aborting."), 'error');
      return FALSE;
    }
  }
  else {

    // Else using a form-selected vocab.
    $vocabularies[$vid] = taxonomy_vocabulary_load($vid);

    // Note that a pre-made vocab already in the system will not have
    // predicates any more. Don't count on them.
  }
  foreach ($vocabularies as $vocabulary) {
    module_invoke_all('taxonomy_xml_vocabulary_presave', $vocabulary);
  }

  //
  // VOCAB set up, start on TERMS...
  //

  // Gather the resources that will become terms.
  // Slightly long way (not using array_merge), as we need to merge indexed
  // and by reference.
  $terms = array();
  foreach ($term_types as $term_type) {
    if (isset($resources_by_type[$term_type]) && is_array($resources_by_type[$term_type])) {
      foreach ($resources_by_type[$term_type] as $guid => &$term_handle) {

        // Grab name/label early for debugging and indexing.
        // Work on a copy so reset() does not disturb the stored array.
        $predicates = isset($term_handle->predicates) ? $term_handle->predicates : NULL;
        if (isset($predicates['label'])) {
          $term_handle->name = reset($predicates['label']);
        }
        $terms[$guid] =& $term_handle;
      }
      unset($term_handle);
    }
  }

  // A FB import MAY also tell us a vocabulary is a top-level term.
  // FB allows it to be both. We don't, it breaks things.
  // $vocab_uri holds the last vocabulary URI seen above, or NULL when none
  // was found; skip the check in that case rather than probing a NULL key.
  if ($vocab_uri !== NULL && isset($terms[$vocab_uri])) {
    watchdog('taxonomy_xml', 'Vocab %vocab_uri was allegedly both a vocab and a term. Drupal can not handle that. Simplifing', array(
      '%vocab_uri' => $vocab_uri,
    ), WATCHDOG_NOTICE);
    unset($terms[$vocab_uri]);
  }

  // Some of the RDF documents I've been fed DO NOT DEFINE A TYPE for their
  // primary subject. Neither
  // http://www.ubio.org/authority/metadata.php nor
  // http://biocol.org/ nor
  // http://lsid.tdwg.org/
  // return RDF that says WHAT the data is. Those that use LSIDs have a type
  // encoded in the Identifier itself :-/
  // I end up with a collection of data but no idea what it's really talking
  // about. But IF an entity is rdf:about="THIS URL" then we will take a leap
  // and assume that is our target lump of data.
  // ... this worked for biocol input.
  $untyped_resources = isset($resources_by_type[TAXONOMY_XML_UNTYPED]) ? (array) $resources_by_type[TAXONOMY_XML_UNTYPED] : array();
  foreach ($untyped_resources as $identifier => $untyped_lump) {
    if ($identifier == $url) {

      // Looks like this was the specific thing we were looking for.
      $terms[$identifier] = $untyped_lump;
    }
  }

  // FREEBASE only.
  // Special case for freebase.
  // If we are reading a top-level topic type page
  // eg http://www.freebase.com/tools/explore/music/genre
  // type = fb:type_profile
  // then it may contain a list of 'instances' which represent our desired
  // member terms.
  $fb_vocab_type = TAXONOMY_XML_FB_NS . 'freebase.type_profile';
  $fb_vocabularies = isset($resources_by_type[$fb_vocab_type]) ? (array) $resources_by_type[$fb_vocab_type] : array();
  foreach ($fb_vocabularies as $vocab_uri => $vocabulary) {

    // Always an array, so the foreach and count() below are safe.
    $instances = isset($vocabulary->predicates['type.type.instance']) ? (array) $vocabulary->predicates['type.type.instance'] : array();
    if (!empty($instances)) {

      // I've got a list of URIs that represent terms, but not even a name for
      // them. The system will still hopefully be able to work it out from
      // just that.
      watchdog('taxonomy_xml', "\n        FREEBASE: Each <em>instance</em> listed in a freebase <em>type profile</em>\n        will be imported as a term.", array(), WATCHDOG_INFO);
      foreach ($instances as $term_guid) {
        $placeholder_term = (object) array(
          'guid' => $term_guid,
          'vid' => $vid,
        );
        $terms[$term_guid] = $placeholder_term;

        // Queue a full lookup of this item.
        taxonomy_xml_add_term_to_batch_queue($placeholder_term);
        watchdog('taxonomy_xml', "Queuing a full retrieval of term !term_uri it for later retrieval and import", array(
          '!term_uri' => l($term_guid, $term_guid),
        ), WATCHDOG_INFO);
      }
    }

    // Extra diagnostic - freebase-specific.
    // The predicate value is a list; compare its first entry numerically.
    // Comparing the array itself against an int is always TRUE in PHP, which
    // made this warning fire unconditionally.
    if (isset($vocabulary->predicates['freebase.type_profile.instance_count'])) {
      $claimed = $vocabulary->predicates['freebase.type_profile.instance_count'];
      $claimed_count = is_array($claimed) ? reset($claimed) : $claimed;
      if ((int) $claimed_count > count($instances)) {
        watchdog('taxonomy_xml', "\n          FREEBASE: The topic set definition claims there are %instance_count\n          topic instances in the set, but I can see only %actual_count.\n          Some data may be missing from this doc that I am unable to retrieve.\n          ", array(
          '%instance_count' => $claimed_count,
          '%actual_count' => count($instances),
        ), WATCHDOG_WARNING);
      }
    }

    // Resources that are being processed as vocabs are NOT also terms.
    // But the freebase schema labels topic sets as 'topics' themselves.
    // Unset this so as not to make a vocab definition a member of itself.
    // NOTE(review): $terms was already gathered above, so this only affects
    // $resources_by_type - confirm whether $terms should be pruned here too.
    unset($resources_by_type[TAXONOMY_XML_FB_NS . 'common.topic'][$vocab_uri]);
  }
  if (!$anchor) {

    // Shh. Only report the count when not processing a single anchor.
    drupal_set_message(t("Found %count resources to be imported as terms into vocabulary %vid", array(
      '%count' => count($terms),
      '%vid' => $vid,
    )));
  }

  //
  // START MAKING TERMS
  //
  foreach ($terms as $identifier => &$term) {
    $term->identifier = $identifier;
    if (!isset($term->vid)) {

      // This is just a default fallback. Imported terms should really have
      // already chosen their vid.
      $term->vid = $vid;
    }
    taxonomy_xml_rdf_make_term($term);
  }

  // Drop the dangling reference to the last element of $terms.
  unset($term);

  // Now the terms are all happily created, create their relationships.
  // Couldn't do so until they had all been given tids.
  taxonomy_xml_set_term_relations($terms);

  // Note this will not yet affect terms that have been queued for later
  // processing. Such terms will need to attach themselves to the parent terms
  // themselves.
  foreach ($vocabularies as $vocabulary) {
    module_invoke_all('taxonomy_xml_vocabulary_postsave', $vocabulary);
  }
  return $terms;
}