function taxonomy_xml_rdf_parse in Taxonomy import/export via XML 6
Same name and namespace in other branches
- 5.2 rdf_format.inc \taxonomy_xml_rdf_parse()
- 5 rdf_format.inc \taxonomy_xml_rdf_parse()
- 6.2 rdf_format.inc \taxonomy_xml_rdf_parse()
- 7 formats/rdf_format.inc \taxonomy_xml_rdf_parse()
Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.
See formats.html readme for information about the RDF input supported.
Targets include: ICRA Content Rating (http://www.icra.org/vocabulary/), WordNet Lexicon (http://wordnet.princeton.edu/), SUMO (http://www.ontologyportal.org/)
... and the ontologies found at http://www.schemaweb.info/ that implement appropriate parts of the RDF Schema "rdfs" (eg Classes with subclassOf)
Parameters
$data: the string containing XML/RDF.
$vid: int Vocab ID. May be modified by reference if this process creates a new vocab to use.
$url: optional source URL this RDF came from, if needed to resolve GUIDs etc. Cannot work for uploads.
File
- ./
rdf_format.inc, line 51 - Include routines for RDF parsing and taxonomy/term creation.
Code
/**
 * Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.
 *
 * The input is parsed into triples with the ARC RDF/XML parser, the triples
 * are grouped into resources by type, and every resource whose type is in the
 * list of term-like types below is saved as a taxonomy term. Once all terms
 * have been saved, their relationships are set in a second pass.
 *
 * @param $data
 *   The string containing XML/RDF.
 * @param $vid
 *   Vocabulary ID. When it is 0 the vocabulary definitions found in the input
 *   (or a placeholder named 'Imported Vocabulary') are used, and $vid is
 *   overwritten by reference with the value returned by
 *   taxonomy_xml_absorb_vocabulary_definitions().
 * @param $url
 *   Optional source URL this RDF came from. An untyped resource whose
 *   identifier is exactly this URL is treated as a term.
 *
 * @return
 *   The array of term objects keyed by their source identifier, or NULL when
 *   the parser did not return an array of triples.
 */
function taxonomy_xml_rdf_parse(&$data, &$vid, $url = NULL) {
  drupal_set_message(t("Parsing RDF"));

  // Use ARC parser.
  include_once "arc/ARC_rdfxml_parser.php";
  $parser_args = array(
    "bnode_prefix" => "genid",
    "base" => "",
  );
  $parser = new ARC_rdfxml_parser($parser_args);
  $triples = $parser->parse_data($data);
  if (!is_array($triples)) {
    // Anything other than an array is reported to the user as the parser's
    // error message.
    drupal_set_message(t("Problem parsing input %message", array(
      '%message' => $triples,
    )), 'error');
    return;
  }
  drupal_set_message(t("%count data triples (atomic statements) found in the source RDF doc", array(
    '%count' => count($triples),
  )));

  // The RDF input may come in several flavours.
  // Resources of the following 'types' may be cast into taxonomy terms for
  // our purposes. That is, an rdf:Class is a Drupal:term.
  // Add to this list as needed.
  $term_types = array(
    TAXONOMY_XML_RDF_NS . 'Property',
    TAXONOMY_XML_DC_NS . 'subject',
    TAXONOMY_XML_RDFS_NS . 'Class',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'Word',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'NounWordSense',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'NounSynset',
    TAXONOMY_XML_CONTENTLABEL_NS . 'Category',
    TAXONOMY_XML_SKOS_NS . 'Concept',
    'urn:lsid:ubio.org:classificationbank',
  );

  // A Drupal 'vocabulary' is represented by an owl:Ontology
  // or other similar shaped constructs.
  $vocabulary_types = array(
    TAXONOMY_XML_OWL_NS . 'Ontology',
    TAXONOMY_XML_RDF_NS . 'Description',
    'http://www.w3.org/2001/12/Glossary',
    TAXONOMY_XML_TDWG_NS . 'Collection',
  );

  // The resources are all initialized as data objects, grouped by type.
  // Resource types we expect to be dealing with are just vocabs and terms.
  $resources_by_type = taxonomy_xml_convert_triples_to_sorted_objects($triples);
  drupal_set_message(t("Found %count different <strong>kinds</strong> of resources in the input : %types", array(
    '%count' => count($resources_by_type),
    '%types' => implode(', ', array_keys($resources_by_type)),
  )));

  $vocabularies = array();
  if ($vid == 0) {
    // We've been asked to use the vocab described in the source file.
    // If the vid had already been set, vocab definitions found in the file
    // would be ignored.
    // Scan the sorted objects for vocabulary definitions.
    // Hopefully there's only one vocab per file, but loop anyway.
    foreach ($vocabulary_types as $vocabulary_type) {
      if (isset($resources_by_type[$vocabulary_type]) && is_array($resources_by_type[$vocabulary_type])) {
        foreach ($resources_by_type[$vocabulary_type] as $uri => &$vocabulary_handle) {
          $vocabularies[$uri] =& $vocabulary_handle;
        }
        // Drop the loop variable so it no longer aliases the last element.
        unset($vocabulary_handle);
      }
    }
    drupal_set_message(t("Found %count resources to be used as vocabulary definitions", array(
      '%count' => count($vocabularies),
    )));
    if (!$vocabularies) {
      // Create a placeholder.
      $vocabularies[] = array(
        'name' => 'Imported Vocabulary',
      );
    }
    // $vid becomes the default vocabulary (most common is one vocab per input
    // file) to be used unless otherwise defined per-term.
    $vid = taxonomy_xml_absorb_vocabulary_definitions($vocabularies);
  }
  else {
    // Else using a form-selected vocab.
    $vocabularies[$vid] = taxonomy_vocabulary_load($vid);
  }

  // Gather the resources that will become terms.
  // Slightly long way (not using array_merge), as the merge must be indexed
  // by URI and by reference.
  $terms = array();
  foreach ($term_types as $term_type) {
    if (isset($resources_by_type[$term_type]) && is_array($resources_by_type[$term_type])) {
      foreach ($resources_by_type[$term_type] as $uri => &$term_handle) {
        // Grab name/label early for debugging and indexing.
        $predicates = $term_handle->predicates;
        if (isset($predicates['label'])) {
          $term_handle->name = $predicates['label'][0];
        }
        $terms[$uri] =& $term_handle;
      }
      unset($term_handle);
    }
  }

  // Some RDF documents DO NOT DEFINE A TYPE for their primary subject.
  // Neither
  // http://www.ubio.org/authority/metadata.php nor
  // http://biocol.org/ nor
  // http://lsid.tdwg.org/
  // return RDF that says WHAT the data is. Those that use LSIDs have a type
  // encoded in the identifier itself.
  // But IF an entity is rdf:about="THIS URL" then we take a leap and assume
  // that is our target lump of data (this worked for biocol input).
  //
  // The match is only attempted when a URL was actually supplied and is done
  // on the string form: a loose == would let a NULL $url match an identifier
  // of 0 or '', and on older PHP an integer key 0 matches any non-numeric URL.
  if ($url !== NULL && $url !== '') {
    $untyped_resources = isset($resources_by_type[TAXONOMY_XML_UNTYPED]) ? (array) $resources_by_type[TAXONOMY_XML_UNTYPED] : array();
    foreach ($untyped_resources as $identifier => $untyped_lump) {
      if ((string) $identifier === (string) $url) {
        // Looks like this was the specific thing we were looking for.
        $terms[$identifier] = $untyped_lump;
      }
    }
  }

  drupal_set_message(t("Found %count resources to be imported as terms into vocabulary %vid", array(
    '%count' => count($terms),
    '%vid' => $vid,
  )));

  // $predicate_synonyms is a translation array to match rdf-speak with
  // Drupal concepts. NOTE(review): not read again in this function; kept in
  // case the call has side effects - confirm before removing.
  $predicate_synonyms = taxonomy_xml_relationship_synonyms();

  //
  // START MAKING TERMS
  //
  foreach ($terms as $identifier => &$term) {
    if (!isset($term->vid)) {
      // This is just a default fallback. Imported terms should really have
      // already chosen their vid.
      $term->vid = $vid;
    }

    // When running in batch, children will have a hard time finding their
    // parents if they only know them by source-localized ID (probably a URI)
    // and the destination taxonomy HASN'T REMEMBERED THAT INFO, because
    // taxonomy.module just doesn't. Some other module (taxonomy_enhancer is
    // good) is needed to save that metadata so the child can find its target
    // later. This is the REMOTE identifier, not the local one.
    if (!isset($term->uri)) {
      $term->uri = $identifier;
    }

    // Build term from data: convert all input predicates into attributes on
    // the object that taxonomy.module will understand.
    taxonomy_xml_canonicize_predicates($term);

    // Ensure name is valid.
    if (empty($term->name)) {
      // Without even a name, creating a term is a waste of time. RDF feeds
      // commonly consist of a bunch of pointers; placeholders can't be
      // invented until a little more is known, so skip this one entirely
      // rather than guessing a name from the identifier.
      unset($terms[$identifier]);
      continue;
    }

    // See if a definition already exists in the DB. Build on that.
    // Look in the term's own vocabulary (which defaults to $vid above) so the
    // lookup agrees with the re-fetch after saving.
    $existing_term = _taxonomy_xml_get_term_placeholder($term->name, $term->vid);
    if ($existing_term) {
      // Merge the old term's properties into this one. Really just want its
      // tid, but there may be more info that should not be lost.
      // New input takes precedence over older data.
      foreach ((array) $existing_term as $key => $value) {
        if (!isset($term->{$key})) {
          $term->{$key} = $value;
        }
      }
    }
    // The term object is now as tidy as it can be as a self-contained entity.

    if (variable_get('taxonomy_xml_reuseids', FALSE)) {
      // MAINTAIN IDS
      // Because this is likely to be used with a site-cloning set-up, it
      // helps to try to match IDs. OTOH, doing so could be very messy for
      // other situations. So, iff there is no pre-existing term with this id,
      // create this one as a clone with the old ID.
      // This requires a little DB sneakiness.
      if (!empty($term->internal_id) && !taxonomy_get_term($term->internal_id)) {
        $term->tid = $term->internal_id;
        drupal_set_message(t("Doing sneaky import of %term_name re-using the internal id = %term_id", array(
          '%term_name' => $term->name,
          '%term_id' => $term->internal_id,
        )));
        // Missing optional fields fall back to an empty description and a
        // zero weight instead of raising undefined-property notices.
        $description = isset($term->description) ? $term->description : '';
        $weight = isset($term->weight) ? $term->weight : 0;
        db_query("INSERT INTO {term_data} (tid, name, description, vid, weight) VALUES (%d, '%s', '%s', %d, %d)", $term->tid, $term->name, $description, $term->vid, $weight);
        // The sequences table is gone in D6. Will inserting beyond the
        // auto-increment self-correct? Bump it explicitly to be safe.
        $current_id = db_last_insert_id('term_data', 'tid');
        if ($current_id < $term->tid) {
          // This is probably now MYSQL specific.
          db_query("ALTER TABLE {term_data} AUTO_INCREMENT = %d;", $term->tid);
        }
      }
    }

    // Here's where last-minute data storage done by other modules gets set up.
    module_invoke_all('taxonomy_term_presave', $term);

    // taxonomy_save_term() is given an array copy of the term.
    $save_term = (array) $term;
    $status = taxonomy_save_term($save_term);

    // Re-retrieve the new term definition, just in case anything extra
    // happened to it during processing.
    $new_term = taxonomy_xml_get_term_by_name_from_vocab($term->name, $term->vid);
    if (!$new_term) {
      drupal_set_message(t("It seems like we failed to create and retrieve a term called %term_name", array(
        '%term_name' => $term->name,
      )), 'error');
    }
    else {
      // Merge retrieved values back over our main definition so the handles
      // are up-to-date. Skipped on failure: casting FALSE to an array would
      // otherwise add a junk property named "0" to the term.
      foreach ((array) $new_term as $key => $value) {
        $term->{$key} = $value;
      }
    }

    if ($status == SAVED_NEW) {
      // Just remember this is fresh - for useful feedback messages.
      $term->taxonomy_xml_new_term = TRUE;
    }

    // It's possible that not all the referenced items were available in the
    // current document/loop. Add referred items to the import queue for
    // later processing.
    taxonomy_xml_add_all_children_to_queue($term);

    // A flag to avoid double-processing.
    $term->taxonomy_xml_presaved = TRUE;
  }
  // End of term-construction loop; release the by-reference loop variable.
  unset($term);

  // Now the terms are all happily created, create their relationships.
  // Couldn't do so until they had all been given tids.
  taxonomy_xml_set_term_relations($terms);

  return $terms;
}