function taxonomy_xml_tcs_parse in Taxonomy import/export via XML 6
Same name and namespace in other branches
- 6.2 tcs_format.inc \taxonomy_xml_tcs_parse()
- 7 formats/tcs_format.inc \taxonomy_xml_tcs_parse()
Reads a TCS file and creates the term definitions found in it.
Implementation of the taxonomy_xml_HOOK_parse() callback.
This passes through several times, first finding all TaxonConcepts mentioned in the Doc, then finding if they refer to each other, then creating or retrieving and merging existing term data, then looping again to ensure they each point at each other. At no point can we assume to have all term items in memory, it just tries to instantiate them incrementally, depending on the information available.
Parameters
$data XML string representing the TCS file to be parsed:
$vid Vocabulary ID the terms are to be created under:
$url The source URL of the TCS doc. Used to create URIs from document IDs.
Return value
An array of the terms created in this parsing process.
File
- ./
tcs_format.inc, line 32 - Include routines for the Taxon Concepts Schema as used by "the Encyclopedia of Life" original XML parsing and taxonomy/term creation. and others.
Code
/**
 * Reads a TCS document and creates the term definitions found in it.
 *
 * Implementation of the taxonomy_xml_HOOK_parse() callback.
 *
 * Works in several passes:
 *  1. Find every TaxonConcept in the document and build a placeholder term
 *     object for it (id, name, name references, raw relationships, raw XML).
 *  2. For each known term, canonicize its predicates, merge in any existing
 *     term of the same name, save it, and re-read the saved values.
 *  3. Hand the whole list to taxonomy_xml_set_term_relations() for linking.
 *
 * The list of terms is shared with taxonomy_xml_current_terms(), so it may
 * already contain terms from earlier documents; those flagged as presaved
 * are skipped in pass 2.
 *
 * @param $data
 *   XML string representing the TCS file to be parsed.
 * @param $vid
 *   Vocabulary ID the terms are to be created under. When 0, a placeholder
 *   vocabulary named 'Taxa' is used instead.
 * @param $url
 *   The source URL of the TCS doc. Stored as each new term's uri, and used as
 *   an extra index into the term list when the doc holds exactly one concept.
 *
 * @return
 *   The array of terms known after this parsing pass, keyed by identifier.
 *   Nothing (NULL) is returned if the XML could not be parsed.
 */
function taxonomy_xml_tcs_parse(&$data, $vid = 0, $url = '') {
  if ($vid == 0) {
    // We've been asked to use the vocab described in the source file.
    // However TCS files do not define a vocab, so create/use a placeholder.
    drupal_set_message(t("No vocabulary specified in the form, using a default one."));
    $vocabulary = _taxonomy_xml_get_vocabulary_placeholder('Taxa');
    $vid = $vocabulary->vid;
  }

  // Use the DOM, not the parser, it's quicker (to code).
  $xmldoc = new DOMDocument();
  if (!$xmldoc->loadXML($data)) {
    // Report the source URL; it is the only identifier we have for the data.
    trigger_error("Failed to parse in xml source. [{$url}]", E_USER_WARNING);
    return;
  }

  // Scan for 'TaxonConcepts' which are our prime elements.
  $xp = new DOMXPath($xmldoc);
  // A prefix is NEEDED when the document declares a default namespace.
  // Kept as a switch from development - may be unwanted.
  $fakenamespace = TRUE;
  if ($fakenamespace) {
    $prefix = "tcs:";
    $xp->registerNamespace('tcs', TAXONOMY_XML_TCS_NS);
  }
  else {
    $prefix = '';
  }

  $concepts = $xp->query("//{$prefix}TaxonConcept");
  if (!$concepts->length) {
    drupal_set_message('No TaxonConcepts found in this doc. Namespace problems? Wrong format?', 'error');
  }

  // If we were loading a remote file, and the file contains only one Concept,
  // then the file URI represents the concept. Use ->length rather than
  // count(): DOMNodeList is only Countable from PHP 7.2, and count() on a
  // non-countable object yields 1 regardless of the real size.
  $url_identifies_concept = ($url && $concepts->length == 1);

  //
  // BEGIN the first loop, finding terms in this document.
  //
  // Remembering all terms is memory-intensive, but may be more efficient in
  // batch jobs. Use a static list where possible. EXPERIMENTAL
  $terms =& taxonomy_xml_current_terms();

  foreach ($concepts as $concept) {
    // Start constructing a (new?) term. namerefs is initialised here so the
    // lookup loop below is safe for concepts that carry no Name element.
    $term = (object) array(
      'predicates' => array(),
      'namerefs' => array(),
    );

    // Concepts must have IDs (?)
    // It may be a local numeric id or a URI, depending on what we are given.
    // In any case remember it and compare it with later references to it.
    $term->id = $concept->getAttribute('id');

    // The URIs being used in the current data service are a bit vague.
    // As a TCS file may contain more than one 'TaxonConcept', the service URL
    // alone does not really identify it; it should have an ID in the string.
    $term->uri = $url;

    $termnames = $xp->query("{$prefix}Name", $concept);
    foreach ($termnames as $termname) {
      $term->name = $termname->textContent;
      // TCS doesn't provide names so much as references to names. Make a note.
      $term->namerefs[] = $termname->getAttribute('ref');
    }

    // Find ALL its relationships. Store them in an array for later linking.
    $relationships = $xp->query("{$prefix}TaxonRelationships/{$prefix}TaxonRelationship", $concept);
    foreach ($relationships as $rel) {
      $reltype = $rel->getAttribute('type');
      $reltags = $xp->query("{$prefix}ToTaxonConcept", $rel);
      foreach ($reltags as $reltag) {
        // Following the spec, a reference may be by id, by name, or both.
        // Trust the ref-id where possible, otherwise fall back to the name.
        $ref = $reltag->getAttribute('ref');
        $reftarget = $ref ? $ref : $reltag->textContent;
        // $term->relationships is here for other storage mechanisms
        // (term_relations) to investigate and serialize if they can.
        $term->relationships[$reltype][$reftarget] = $reftarget;
        $term->predicates[$reltype][$reftarget] = $reftarget;
      }
    }

    // TCS has a number of extra metadata fields which we want, BUT a clean
    // Drupal install does not have any way of serializing it. This import
    // makes no judgement about how that data shall be stored: passing a
    // handle on the full XML node proved too memory-intensive, so instead
    // the term carries the XML it came from and other hooks can parse it
    // on save.
    $term->xml = $xmldoc->saveXML($concept);

    // TCS stores its name information in another node. Attach that raw XML
    // too, again without analysis. In theory namerefs may point to
    // TaxonNames in other documents; that is not supported yet.
    foreach ($term->namerefs as $nameref) {
      $taxon_names = $xp->query("//{$prefix}TaxonName[@id='{$nameref}']");
      foreach ($taxon_names as $taxon_name) {
        $term->taxon_names[$nameref] = $xmldoc->saveXML($taxon_name);
      }
    }

    // Add this term to our list, indexed as best we can.
    $terms[$term->id] = $term;

    // Index the single concept under the document URL as a duplicate handle.
    // Not strictly precise - it should be the #ID inside the doc - but this
    // is the way the current web services work.
    if ($url_identifies_concept) {
      $terms[$url] =& $terms[$term->id];
    }
  }

  // The first placeholder term definitions are set up.
  // They may want to refer to each other, so now scan the refs for known
  // referees and actually create them so we have tids to link.
  //
  // $terms may also include pre-existing terms, included for cross-reference
  // and linking.
  foreach ($terms as $identifier => &$term) {
    // Skip duplicates (some dupes may exist due to the use of handles).
    if (!empty($term->taxonomy_xml_presaved)) {
      continue;
    }
    if (!isset($term->uri)) {
      $term->uri = $identifier;
    }

    // Translate the predicate statements into the syntax we need.
    taxonomy_xml_canonicize_predicates($term);

    // Data is now massaged and referring to itself correctly.
    // Start creating terms so we can retrieve term ids. Ensure name is valid.
    if (empty($term->name)) {
      drupal_set_message(t("Problem, we were unable to find a specific label for the term referred to as %uri. Guessing that %name will be good enough.", array(
        '%uri' => $term->uri,
        '%name' => isset($term->name) ? $term->name : '',
      )));
    }

    // See if a definition already exists in the DB and build on that,
    // otherwise start a new prototype. This does a get by name; if we had a
    // better GUID to look up, we should try that instead.
    $existing_term = _taxonomy_xml_get_term_placeholder(isset($term->name) ? $term->name : NULL, $vid);

    // Merge the old term's properties into this one. Really we just want its
    // tid, but there may be more info we should not lose. Our new input takes
    // precedence over older data.
    foreach ((array) $existing_term as $key => $value) {
      if (!isset($term->{$key})) {
        $term->{$key} = $value;
      }
    }

    // The term object is now as tidy as it can be as a self-contained entity.
    // Here's where last-minute data storage done by other modules gets set up.
    module_invoke_all('taxonomy_term_presave', $term);

    $save_term = (array) $term;
    $status = taxonomy_save_term($save_term);

    // Re-retrieve the new term definition, just in case anything extra
    // happened to it during processing.
    $new_term = taxonomy_xml_get_term_by_name_from_vocab($term->name, $term->vid);
    if (!$new_term) {
      drupal_set_message(t("It seems like we failed to create and retrieve a term called %term_name", array(
        '%term_name' => $term->name,
      )), 'error');
    }

    // Merge retrieved values back over our main definition so the handles are
    // up-to-date. There is a lot of copying back and forth because we are
    // temporarily retaining more information in $term than the system knows
    // how to handle.
    foreach ((array) $new_term as $key => $value) {
      $term->{$key} = $value;
    }

    if ($status == SAVED_NEW) {
      // Just remember this is fresh - for useful feedback messages.
      $term->taxonomy_xml_new_term = TRUE;
    }

    // It's possible that not all the referenced items were available in the
    // current document/loop. Queue referred items for later processing.
    taxonomy_xml_add_all_children_to_queue($term);

    // A flag to avoid double-processing.
    $term->taxonomy_xml_presaved = TRUE;
  }
  // Break the by-reference binding so later use of $term cannot silently
  // overwrite the last entry of the shared $terms list.
  unset($term);

  // All terms exist now; set the relations between them.
  taxonomy_xml_set_term_relations($terms);

  return $terms;
}