rdf_format.inc in Taxonomy import/export via XML 7
File
formats/rdf_format.inc (View source)
<?php
/**
* @file
* Include routines for RDF parsing and taxonomy/term creation.
*
* RDF here is based on the W3C examples (using RDFS), but also incorporates
* support for the SKOS Dialect as well.
*
* Note the use of the word 'node' here almost always refers to XML nodes, not
* Drupal nodes.
*/
// Load the 'rdf_utils' include file belonging to the taxonomy_xml module.
module_load_include('inc', 'taxonomy_xml', 'rdf_utils');
/**
 * Constants for rules when recursing.
 *
 * In this file they are accepted as the $recursion_behaviour argument of
 * taxonomy_xml_rdf_add_terms(). Note that the switch in that function only
 * has cases for TAXONOMY_XML_NO_CHILDREN and TAXONOMY_XML_CHILDREN_REF_ONLY.
 */
/**
 * When dumping a term, don't list child terms.
 */
define('TAXONOMY_XML_NO_CHILDREN', 0);
/**
 * When dumping a term, just list child term URIs.
 */
define('TAXONOMY_XML_CHILDREN_REF_ONLY', 1);
/**
 * When dumping a term, fully describe immediate child terms.
 * (URI ref under them).
 */
define('TAXONOMY_XML_CHILDREN_DETAILS', 3);
/**
 * When dumping a term, fully describe all child terms.
 */
define('TAXONOMY_XML_CHILDREN_RECURSIVE', 4);
/**
 * Sub-hook: describe the RDF syntax provided by this include.
 *
 * @return array
 *   An associative array with two entries:
 *   - description: human-readable explanation of the format.
 *   - mime: the MIME type associated with the format.
 *
 * @see taxonomy_xml_HOOK_format_info()
 */
function taxonomy_xml_rdf_format_info() {
  $info = array();
  $info['description'] = "RDF is recommended for portability with external databases, although it is verbose and sometimes unreadable to humans. The RDF used here is based on Drupal 7 'rdf_mapping' used internally by entities and RDFa";
  $info['mime'] = 'application/rdf+xml';
  return $info;
}
/**
 * List the resource 'types' that may be imported as taxonomy terms.
 *
 * RDF input comes in several flavours. A resource whose type is one of the
 * URIs returned here may be cast into a taxonomy term for our purposes
 * (that is, an rdf:Class is treated as a Drupal term).
 *
 * Add to this list as needed as examples come in from the wild.
 *
 * @return array
 *   A numerically indexed list of type URIs, in a fixed order.
 */
function taxonomy_xml_rdf_term_types() {
  $types = array();
  // Generic schema vocabularies.
  $types[] = TAXONOMY_XML_SKOS_NS . 'Concept';
  $types[] = TAXONOMY_XML_RDF_NS . 'Property';
  $types[] = TAXONOMY_XML_DC_NS . 'subject';
  $types[] = TAXONOMY_XML_RDFS_NS . 'Class';
  $types[] = TAXONOMY_XML_OWL_NS . 'Class';
  // WordNet.
  $types[] = TAXONOMY_XML_W3C_WN_SCHEMA . 'Word';
  $types[] = TAXONOMY_XML_W3C_WN_SCHEMA . 'NounWordSense';
  $types[] = TAXONOMY_XML_W3C_WN_SCHEMA . 'NounSynset';
  // Source-specific types.
  $types[] = TAXONOMY_XML_CONTENTLABEL_NS . 'Category';
  $types[] = 'urn:lsid:ubio.org:classificationbank';
  $types[] = TAXONOMY_XML_FB_NS . 'common.topic';
  return $types;
}
/**
 * List the resource types that may behave like 'vocabularies'.
 *
 * A Drupal 'vocabulary' is represented by an owl:Ontology or another
 * similarly shaped construct, such as a SKOS ConceptScheme.
 *
 * @return array
 *   A numerically indexed list of type URIs, in a fixed order.
 */
function taxonomy_xml_rdf_vocabulary_types() {
  $types = array();
  $types[] = TAXONOMY_XML_SKOS_NS . 'ConceptScheme';
  $types[] = TAXONOMY_XML_OWL_NS . 'Ontology';
  // Used by e.g. SIOC.
  $types[] = TAXONOMY_XML_RDF_NS . 'Description';
  $types[] = 'http://www.w3.org/2001/12/Glossary';
  $types[] = TAXONOMY_XML_TDWG_NS . 'Collection';
  // Resources of type fb:type_profile are often collections of 'topics',
  // which makes them analogous to our 'vocabulary'.
  $types[] = TAXONOMY_XML_FB_NS . 'freebase.type_profile';
  return $types;
}
/**
 * Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.
 *
 * See formats.html readme for information about the RDF input supported.
 *
 * Targets include:
 * - ICRA Content Rating http://www.icra.org/vocabulary/
 * - WordNet Lexicon http://wordnet.princeton.edu/
 * - SUMO http://www.ontologyportal.org/
 * - Freebase
 *
 * ... and the ontologies found at http://www.schemaweb.info/ that implement
 * appropriate parts of the RDF Schema "rdfs" (eg Classes with subclassOf)
 *
 * This function takes care of the parsing of RDF syntax into attributes
 * (predicates). Actual term creation is delegated to
 * taxonomy_xml_rdf_make_term(), which in turn calls
 * taxonomy_xml_canonicize_predicates().
 *
 * If $url carries a '#fragment', only the statements indexed under the full
 * $url are processed; when the document has no such subject a warning is
 * logged and NULL is returned.
 *
 * @param string $data
 *   The string containing XML/RDF.
 * @param int $vid
 *   Vocab ID. May be modified by ref if this process creates a
 *   new vocab to use.
 * @param string $url
 *   Optional source URL this RDF came from if needed to resolve GUIDs
 *   etc. Cannot work for uploads.
 *
 * @return array|false|null
 *   The list of resulting term objects keyed by their source identifier.
 *   FALSE if the input could not be indexed or no vocabulary could be
 *   determined; NULL if the document contained nothing usable.
 */
function taxonomy_xml_rdf_parse(&$data, &$vid, $url = NULL) {
  // See if it's really a different file we need to parse: split an optional
  // '#fragment' off the source URL.
  $url_parts = explode('#', (string) $url);
  $resource_url = $url_parts[0];
  $anchor = isset($url_parts[1]) ? $url_parts[1] : NULL;

  $index = taxonomy_xml_rdf_parse_data_into_index($data, $url);
  if (!is_array($index)) {
    // The indexer returns FALSE when the parser is unavailable (it logs the
    // reason itself). There is nothing to iterate over.
    return FALSE;
  }

  // If a specific ID was defined in the file, this means we just need to load
  // that one. This will help break things up for batches, and also allow us to
  // grab only sub-trees from big files.
  if (!empty($anchor)) {
    watchdog('taxonomy_xml', "\n We were only asked about #%anchor in this document.\n Reducing the data down to statements about that.", array(
      '%anchor' => $anchor,
    ), WATCHDOG_DEBUG);
    // Check BEFORE narrowing the index: once wrapped in a one-element array
    // the index can never be 'empty', so a later check would never fire.
    if (empty($index[$url])) {
      watchdog('taxonomy_xml', "Found no information about %anchor in the document !resource_url", array(
        '%anchor' => $anchor,
        '!resource_url' => l($resource_url, $resource_url),
      ), WATCHDOG_WARNING);
      return NULL;
    }
    $index = array(
      $url => $index[$url],
    );
  }

  // The resources are all initialized as data objects.
  // The predicates have NOT been flattened yet.
  // Resource types we expect to be dealing with are just vocabs and terms.
  $resources_by_type = taxonomy_xml_convert_index_to_sorted_objects($index);

  // Debug only. The message is just noise if using anchors.
  if (empty($anchor)) {
    watchdog('taxonomy_xml', "\n Found %count different <strong>kinds</strong> of resources\n in the named input : %types\n ", array(
      '%count' => count($resources_by_type),
      '%types' => implode(', ', array_keys($resources_by_type)),
    ), WATCHDOG_INFO);
  }
  // Debug only.
  if (!empty($resources_by_type[TAXONOMY_XML_UNTYPED])) {
    // Just FYI, make a note about the quality of data found.
    // Do not complain about URLs - this is quite normal.
    watchdog('taxonomy_xml', "\n Found %count Unsorted (untyped) resources.\n An untyped entity is the subject of a statement,\n but I don't know what <em>type</em> of thing they are.\n Not sure what I'll do with these.\n They are just things that have had statements made about them ..\n that I don't recognise.\n Probably just extra data found in the input and ignored.\n <br/>ID was: %unknown", array(
      '%count' => count($resources_by_type[TAXONOMY_XML_UNTYPED]),
      '%unknown' => implode(', ', array_keys($resources_by_type[TAXONOMY_XML_UNTYPED])),
    ), WATCHDOG_DEBUG);
  }
  if (count($resources_by_type) == 0) {
    watchdog('taxonomy_xml', "\n It sure doesn't look like this is any useful sort of RDF source.\n Zero resource entities were parsed out of it.\n Probably need to do content-negotiation or something,\n and check the validity of the file. Aborting.", array(), WATCHDOG_ERROR);
    return NULL;
  }

  // Almost ready to build.
  // Prepare destination VOCAB.
  $vocabularies = array();
  if ($vid == TAXONOMY_XML_DETERMINED_BY_SOURCE_FILE) {
    // No vocabulary was chosen up front, so scan the sorted objects for
    // vocabulary definitions.
    // Hopefully there's only one vocab per file, but loop anyway.
    foreach (taxonomy_xml_rdf_vocabulary_types() as $vocabulary_type) {
      if (isset($resources_by_type[$vocabulary_type]) && is_array($resources_by_type[$vocabulary_type])) {
        foreach ($resources_by_type[$vocabulary_type] as $guid => &$vocabulary_handle) {
          $vocabularies[$guid] =& $vocabulary_handle;
        }
        // Drop the loop variable so it cannot alias the last element later.
        unset($vocabulary_handle);
      }
    }
    drupal_set_message(t("Found %count resources to be used as vocabulary definitions", array(
      '%count' => count($vocabularies),
    )));
    if (!$vocabularies) {
      // Create a placeholder.
      $vocabularies[] = (object) array(
        'name' => 'Imported Vocabulary',
      );
    }
    // $vid becomes the default vocabulary (most common is one vocab per
    // input file) to be used unless otherwise defined per-term.
    $vid = taxonomy_xml_absorb_vocabulary_definitions($vocabularies);
    if (empty($vid)) {
      drupal_set_message(t("No vocabulary to add terms to, aborting."), 'error');
      return FALSE;
    }
  }
  else {
    // A vocabulary was already selected (e.g. in the form), so vocab
    // definitions found in the file are ignored.
    $vocabularies[$vid] = taxonomy_vocabulary_load($vid);
  }

  // VOCAB set up, start on TERMS...
  // Note that when 'identifier' is used as a key here, it means the identifier
  // according to the source document - usually a URI.
  // A term identifier is a string distinct from the local term id.
  // Gather the resources that will become terms.
  // Slightly long way (not using array_merge), as the entries are merged
  // indexed and by reference.
  $terms = array();
  foreach (taxonomy_xml_rdf_term_types() as $term_type) {
    if (isset($resources_by_type[$term_type]) && is_array($resources_by_type[$term_type])) {
      foreach ($resources_by_type[$term_type] as $guid => &$term_handle) {
        $terms[$guid] =& $term_handle;
      }
      unset($term_handle);
    }
  }

  // Some of the RDF documents I've been fed DO NOT DEFINE A TYPE
  // for their primary subject.
  // Neither
  // http://www.ubio.org/authority/metadata.php nor
  // http://biocol.org/ nor
  // http://lsid.tdwg.org/
  // return RDF that says WHAT the data is.
  // Those that use LSIDs have a type encoded in the Identifier itself :-/
  // I end up with a collection of data but no idea what it's really
  // talking about.
  // But IF an entity is rdf:about="THIS URL" then we will take a leap
  // and assume that is our target lump of data.
  // ... this worked for biocol input.
  if (!empty($resources_by_type[TAXONOMY_XML_UNTYPED])) {
    foreach ($resources_by_type[TAXONOMY_XML_UNTYPED] as $identifier => $untyped_lump) {
      if ($identifier == $url) {
        // Looks like this was the specific thing we were looking for.
        watchdog('taxonomy_xml', "Trying to import an <em>untyped</em> data object in the hopes that it is the term we asked for. This may be incorrect, but it's all the document gave us. We asked, and got: '%identifier' .", array(
          '%identifier' => $identifier,
        ), WATCHDOG_NOTICE);
        watchdog('taxonomy_xml', "Untyped data object (possibly wrong) '%identifier' = <pre>%data</pre> .", array(
          '%identifier' => $identifier,
          '%data' => print_r($untyped_lump, 1),
        ), WATCHDOG_DEBUG);
        $terms[$identifier] = $untyped_lump;
      }
    }
  }

  // Special case for Freebase: instances listed in a type profile are queued
  // for later retrieval.
  taxonomy_xml_rdf_process_freebase_vocab($resources_by_type, $vid);
  // Special case for DBpedia: sub-terms are listed, but point to the parent,
  // not vice-versa.
  taxonomy_xml_rdf_process_dbpedia($resources_by_type, $terms);

  if (empty($anchor)) {
    // Shh when only a single anchor was requested.
    drupal_set_message(t("Found %count resources to be imported as terms into vocabulary %vid", array(
      '%count' => count($terms),
      '%vid' => $vid,
    )));
  }

  //
  // START MAKING TERMS
  //
  foreach ($terms as $guid => &$term) {
    if (empty($term)) {
      watchdog('taxonomy_xml', "An empty term '%guid' was in the array of terms to create. This should not have happened, fix the input upstream. Ignoring.", array(
        '%guid' => $guid,
      ), WATCHDOG_NOTICE);
      continue;
    }
    if (!isset($term->vid)) {
      // This is just a default fallback.
      // Imported terms should really have already chosen their vid.
      $term->vid = $vid;
    }
    taxonomy_xml_set_term_guid($term, $guid);
    taxonomy_xml_rdf_make_term($term);
  }
  unset($term);

  // Now the terms are all happily created, create their relationships.
  // Couldn't do so until they had all been given tids.
  // Note this will not yet affect terms that have been queued for later
  // processing. Such terms will need to attach themselves to the parent
  // terms themselves.
  taxonomy_xml_set_term_relations($terms);
  return $terms;
}
/**
 * FREEBASE only.
 *
 * If we are reading a top-level topic type page
 * eg http://www.freebase.com/tools/explore/music/genre
 * type = fb:type_profile
 * then it may contain a list of 'instances' which represent our desired
 * member terms.
 *
 * Each instance is turned into a placeholder term (guid + vid only) and
 * handed to taxonomy_xml_add_term_to_batch_queue() for later retrieval.
 * The placeholders are only queued; nothing is returned or added to
 * $resources_by_type.
 *
 * @param array $resources_by_type
 *   Resource objects grouped by type URI, then keyed by identifier. Only the
 *   freebase type_profile group is inspected.
 * @param int $vid
 *   The vocabulary ID assigned to every queued placeholder term.
 */
function taxonomy_xml_rdf_process_freebase_vocab(&$resources_by_type, $vid) {
  $fb_vocab_type = TAXONOMY_XML_FB_NS . 'freebase.type_profile';
  if (empty($resources_by_type[$fb_vocab_type])) {
    return;
  }
  foreach ($resources_by_type[$fb_vocab_type] as $vocabulary) {
    if (empty($vocabulary->predicates)) {
      trigger_error("Something wrong with the vocabulary we are trying to process, it has no predicates");
      // Nothing to read instances or counts from.
      continue;
    }
    $predicates = $vocabulary->predicates;

    // Normalise to an array so it is always safe to loop over and count.
    $instances = isset($predicates['type.type.instance']) ? (array) $predicates['type.type.instance'] : array();
    if ($instances) {
      // I've got a list of URIs that represent terms,
      // but not even a name for them.
      // The system will still hopefully be able to work it out from just that.
      watchdog('taxonomy_xml', "\n FREEBASE: Each <em>instance</em> listed in a freebase <em>type profile</em>\n will be imported as a term.", array(), WATCHDOG_INFO);
      // Loop over all term 'instances' mentioned by the vocab.
      foreach ($instances as $term_guid) {
        $placeholder_term = (object) array(
          'guid' => $term_guid,
          'vid' => $vid,
        );
        // Queue a full lookup of this item.
        taxonomy_xml_add_term_to_batch_queue($placeholder_term);
        watchdog('taxonomy_xml', "Queuing a full retrieval of term !term_guid it for later retrieval and import", array(
          '!term_guid' => l($term_guid, $term_guid),
        ), WATCHDOG_INFO);
      }
    }

    // Extra diagnostic - freebase-specific.
    // Predicate values arrive as arrays, so take the first value and compare
    // it numerically. Comparing the array itself against an integer is always
    // 'greater' in PHP, which made this warning fire unconditionally.
    $claimed_values = isset($predicates['freebase.type_profile.instance_count']) ? (array) $predicates['freebase.type_profile.instance_count'] : array();
    $claimed_count = $claimed_values ? reset($claimed_values) : 0;
    if ((int) $claimed_count > count($instances)) {
      watchdog('taxonomy_xml', "\n FREEBASE: The topic set definition claims there are %instance_count\n topic instances in the set, but I can see only %actual_count.\n Some data may be missing from this doc that I am unable to retrieve.\n ", array(
        '%instance_count' => $claimed_count,
        '%actual_count' => count($instances),
      ), WATCHDOG_WARNING);
    }
  }
}
/**
 * Special handling for dbpedia data.
 *
 * eg
 * http://dbpedia.org/page/Category:Rock_music_genres
 *
 * DBpedia does not list sub-terms as 'narrower' on the parent. Instead each
 * sub-term is listed individually and tagged with the parent as 'broader'.
 * That means the same thing, but the parent does not know about its children.
 * To support this, every resource (of any type) that is not yet in $terms
 * and carries a skos:broader predicate is added to $terms, so it gets
 * processed as a term.
 *
 * @param array $resources_by_type
 *   Resource objects grouped by type, then keyed by identifier.
 * @param array $terms
 *   The terms gathered so far, keyed by identifier. Extended in place.
 *
 * @return array
 *   The (possibly extended) $terms array.
 */
function taxonomy_xml_rdf_process_dbpedia(&$resources_by_type, &$terms) {
  watchdog('taxonomy_xml', "Processing dbpedia special case", array(), WATCHDOG_INFO);
  // Should the predicates be canonicized before this lookup?
  $broader_key = TAXONOMY_XML_SKOS_NS . 'broader';
  foreach ($resources_by_type as $resources) {
    foreach ($resources as $guid => $resource) {
      // Skip what is already known to be a term, and anything that does not
      // point at a broader parent.
      if (isset($terms[$guid]) || !isset($resource->predicates[$broader_key])) {
        continue;
      }
      // Not listed as a term, but it DOES name something else as its broader
      // term, therefore it really is a term (of unknown type).
      watchdog('taxonomy_xml', "Although not listed as a a term, %guid has something as a 'broader' parent. So it probably is a term after all. Adding it to the list", array(
        '%guid' => $guid,
      ), WATCHDOG_INFO);
      $terms[$guid] = $resource;
    }
  }
  return $terms;
}
/**
 * Invoke the ARC parser on the given data.
 *
 * Keeps a one-entry cache: if a non-empty $url equals the $url of the
 * previous successful call, the previously built index is returned without
 * parsing again. Only the most recent index is held, not a true cache array.
 * This is optimal for one big file being requested repeatedly (an all-in-one
 * taxonomy), and does NOT fill up with crud if lots of different files are
 * requested once (as happens when spidering).
 *
 * @param string $data
 *   The RDF source text.
 * @param string $url
 *   The base URL handed to the parser, also used as the cache key.
 *
 * @return array|false
 *   An indexed set of triples; an empty array if the parser did not produce
 *   an array; FALSE if the ARC2 library could not be loaded.
 */
function taxonomy_xml_rdf_parse_data_into_index($data, $url) {
  static $cached_index, $cached_url;
  $is_cached = !empty($url) && $url == $cached_url;
  if ($is_cached) {
    // Re-using parser cache.
    return $cached_index;
  }

  watchdog('taxonomy_xml', "Parsing RDF", array(), WATCHDOG_INFO);
  // Use ARC parser.
  if (!rdf_load_arc2()) {
    watchdog('taxonomy_xml', "ARC2 Parser was unavailable", array(), WATCHDOG_ERROR);
    return FALSE;
  }

  $parser = ARC2::getRDFParser();
  $parser->parse($url, $data);
  $errors = $parser->getErrors();
  if ($errors) {
    // Errors are logged, but whatever was parsed is still used below.
    watchdog('taxonomy_xml', "ARC2 Parser returned an error : %error", array(
      '%error' => print_r($errors, 1),
    ), WATCHDOG_ERROR);
  }

  // GetSimpleIndex flattens multiple values,
  // eg different language labels for the same concept.
  // Cannot retrieve that info if we use this method.
  // @todo bug in arc? attributes - eg href - nested in rdf seem to get damaged.
  // may be only when tagged as XMLLiteral rdf:parseType="Literal"
  $index = $parser->getSimpleIndex();
  if (is_array($index)) {
    watchdog('taxonomy_xml', "\n %count data objects (subjects) found in the source RDF doc", array(
      '%count' => count($index),
    ), WATCHDOG_INFO);
    // Remember this result for the next call with the same URL.
    $cached_url = $url;
    $cached_index = $index;
    return $index;
  }

  drupal_set_message(t("Problem parsing input %message", array(
    '%message' => $index,
  )), 'error');
  return array();
}
/**
 * Sort a list of data objects into groups by 'type'.
 *
 * Arc2 indexing has done most of the flattening for us, we just need to throw
 * these things into different bags.
 *
 * Every subject becomes an object carrying 'predicates' (the raw statements)
 * and 'identifier' (the subject key). 'name' is set only when the statements
 * contain a TAXONOMY_XML_NAME entry, and 'type' only when they contain
 * TAXONOMY_XML_TYPE.
 *
 * @param array $index
 *   Statements keyed by subject identifier, then by predicate.
 *
 * @return array
 *   Resource objects keyed by type, then by subject identifier. Subjects
 *   without a type are filed under TAXONOMY_XML_UNTYPED.
 */
function taxonomy_xml_convert_index_to_sorted_objects(&$index) {
  $sorted = array();
  foreach ($index as $guid => $statements) {
    // A proto-resource object stores all the statements in 'predicates',
    // to be inspected later. The predicates from ARC2 are full namespaced
    // URIs; remember to flatten them if comparing with canonic types later.
    $resource = new stdClass();
    $resource->predicates = $statements;
    // This remote URI is the key used for real indexing when matching up
    // children and parents.
    $resource->identifier = $guid;

    // We need to know that a thing has a name for later. No name is guessed
    // from the URI here: a value set now would take priority over a better
    // name that may be deduced later during synonym collapsing.
    if (isset($statements[TAXONOMY_XML_NAME])) {
      $resource->name = reset($statements[TAXONOMY_XML_NAME]);
    }

    if (!isset($statements[TAXONOMY_XML_TYPE])) {
      // No idea what this is, remember it anyway.
      $sorted[TAXONOMY_XML_UNTYPED][$guid] = $resource;
      continue;
    }

    // Types may be multiple (see http://rdfs.org/sioc/types for one such),
    // and always arrive as an array. The same object is filed under every
    // type, bound by reference; its 'type' ends up as the last one listed.
    unset($shared);
    $shared = $resource;
    foreach ($statements[TAXONOMY_XML_TYPE] as $type) {
      $resource->type = $type;
      $sorted[$type][$guid] =& $shared;
    }
    // Break the binding before the next subject re-uses the variable.
    unset($shared);
  }
  return $sorted;
}
/**
 * Create the placeholder and fill in the values for this term.
 *
 * NOT its relationships yet.
 *
 * The term's predicates are canonicized, a name is ensured (falling back to
 * one derived from the identifier), values from an already existing term are
 * merged in where the new data has gaps, and the term is saved. Referenced
 * children are then queued for later processing.
 *
 * @param object $term
 *   The term data object, modified in place.
 *
 * @return object|null
 *   The saved term, or NULL when it has no identifier or no usable name.
 */
function taxonomy_xml_rdf_make_term(&$term) {
  $guid = taxonomy_xml_get_term_guid($term);
  if (empty($guid)) {
    watchdog('taxonomy_xml', "\n Attempting to make term, but no identifier is available. Can't do that. Skipping it. <pre>!term</pre>", array(
      '!term' => print_r($term, 1),
    ), WATCHDOG_ERROR);
    return NULL;
  }

  // When running in batch, children will have a hard time finding their
  // parents if they only know them by source-localized ID (probably a URI)
  // and the destination taxonomy (here) hasn't remembered that info, because
  // taxonomy.module just doesn't. Some other method (fields on terms) has to
  // save that metadata so the child can find its target later.
  // The identifier used here is the REMOTE identifier, not the local one.
  //
  // Build the term from data: convert all input predicates into attributes
  // on the object that taxonomy.module will understand.
  taxonomy_xml_canonicize_predicates($term);

  // A name is required.
  if (empty($term->name)) {
    // Fall back to a name derived (roughly) from the URI identifier.
    // Not always meaningful, but all we have in some contexts.
    $term->name = taxonomy_xml_shortname($guid);
    if (!empty($term->name)) {
      // The guessed name still causes problems when queuing data about terms
      // that are not yet loaded - such as those that are ONLY referenced by
      // URI with no human name (Freenet). Munged names are temporary until
      // the full data is retrieved.
      watchdog('taxonomy_xml', "\n We were unable to find a specific label for the term\n referred to as %identifier.\n Guessing that %name will be good enough.", array(
        '%identifier' => $guid,
        '%name' => $term->name,
      ), WATCHDOG_NOTICE);
    }
    else {
      // Still not set? This should be impossible - all subjects must have a
      // URI or identifier - but who knows what weirdness the input gave us.
      drupal_set_message(t("\n A term called %identifier didn't produce any readable name to use. ", array(
        '%identifier' => $guid,
      )), 'error');
      watchdog('taxonomy_xml', "\n Invalid term object, not enough data : NO NAME <pre>!term</pre>", array(
        '!term' => print_r($term, 1),
      ), WATCHDOG_ERROR);
      return;
    }
  }

  // See if a definition for this term already exists in the DB, first by
  // identifier, then by name. Build on that.
  $known_term = taxonomy_xml_get_term_by_guid($guid, $term->vid);
  if (!$known_term) {
    $known_term = _taxonomy_xml_get_term_placeholder($term->name, $term->vid, variable_get('taxonomy_xml_duplicate', FALSE));
  }

  // Merge the old term's properties into this one. Really only its tid is
  // wanted, but there may be more info that should not be lost.
  // New input takes precedence; old data just fills in the gaps.
  foreach ((array) $known_term as $property => $old_value) {
    if (isset($term->{$property})) {
      continue;
    }
    $term->{$property} = $old_value;
  }

  // The term object is now as tidy as it can be as a self-contained entity.
  // It may be premature to save this term if we don't know its parent yet:
  // the system will default to parent=0, which causes bad structure later on.
  if (!isset($term->parents)) {
    watchdog('taxonomy_xml', "About to save a term '%name' with no parent, this could be a problem later, but probably just means it's root-level", array(
      '%name' => $term->name,
    ), WATCHDOG_INFO);
  }

  // The object is being passed around as a handle, so nothing important is
  // expected to be lost from it by saving.
  if (taxonomy_term_save($term) == SAVED_NEW) {
    // Just remember this is fresh - for useful feedback messages.
    $term->taxonomy_xml_new_term = TRUE;
  }

  // It's possible that not all the referenced items were available in the
  // current document/loop. Add referred items to the import queue for later
  // processing.
  taxonomy_xml_add_all_children_to_queue($term);
  // A flag to avoid double-processing.
  $term->taxonomy_xml_presaved = TRUE;
  return $term;
}
/**
 * Return an XML/RDF document representing this vocab.
 *
 * I'd like to use ARC libraries, but it doesn't appear to include an RDF
 * serializer output method, only an input parser...
 *
 * Uses PHP DOM to create DOM document and nodes.
 *
 * We use namespaces carefully here, although it may create wordy output if the
 * DOM is not optimizing the declarations for us. Still, best to be explicit, it
 * would seem.
 *
 * The URI used to refer to other resources is based on the source document
 * location, eg
 * http://this.server/taxonomy/vocabulary/{vid}/rdf#{tid}
 *
 * Preamble should look something like:
 *
 * <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
 *   xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
 *   xmlns:owl="http://www.w3.org/2002/07/owl#"
 *
 * @param int|object $vocabulary
 *   A vocabulary object, or a numeric vocabulary ID to load.
 * @param int $parent
 *   The term ID under which to generate the tree, passed to
 *   taxonomy_get_tree(). 0 exports the whole vocabulary.
 * @param int $depth
 *   Unused. Kept so existing callers keep working: it used to be forwarded
 *   as the fourth argument of taxonomy_get_tree(), which in Drupal 7 is the
 *   $load_entities flag rather than a depth.
 * @param int|null $max_depth
 *   The number of tree levels to return, passed to taxonomy_get_tree().
 *   NULL returns all levels.
 *
 * @return string|false
 *   An XML document string, or FALSE if the vocabulary could not be loaded.
 */
function taxonomy_xml_rdf_create($vocabulary, $parent = 0, $depth = -1, $max_depth = NULL) {
  $requested = $vocabulary;
  if (is_numeric($vocabulary)) {
    $vocabulary = taxonomy_vocabulary_load($vocabulary);
  }
  if (empty($vocabulary)) {
    watchdog('taxonomy_xml', "Unable to load vocabulary %vocabulary for RDF export", array(
      '%vocabulary' => print_r($requested, 1),
    ), WATCHDOG_ERROR);
    return FALSE;
  }

  $domcontainer = taxonomy_xml_rdf_document();
  $dom = $domcontainer->ownerDocument;
  // Define the vocab.
  taxonomy_xml_rdf_add_vocab($domcontainer, $vocabulary);

  // Now start adding terms.
  // They are listed as siblings, not children of the ontology.
  // Always ask for fully loaded entities; with the default $depth of -1 the
  // old call already did so, because -1 was read as a truthy $load_entities.
  $tree = taxonomy_get_tree($vocabulary->vid, $parent, $max_depth, TRUE);
  taxonomy_xml_rdf_add_terms($domcontainer, $tree);

  $result = $dom->saveXML();
  // Minor layout tweak for readability,
  // singletons go on their own lines.
  $result = preg_replace('|(<[^<]*/>)|', "\$1\n", $result);
  // Nested tags go onto new lines.
  $result = preg_replace('|><|', ">\n<", $result);
  return $result;
}
/**
 * Insert a vocabulary definition into the given document element.
 *
 * Only the definition itself is written, not the vocabulary's terms.
 * Nothing is returned; the DOM document is modified directly.
 *
 * @param DOMElement $domcontainer
 *   The element that receives the definition. Modified by ref.
 * @param object $vocabulary
 *   A vocab object.
 */
function taxonomy_xml_rdf_add_vocab(DOMElement &$domcontainer, $vocabulary) {
  $document = $domcontainer->ownerDocument;
  $namespaces = rdf_get_namespaces();

  // Describe the vocabulary itself.
  $vocabnode = rdf_entity_to_xml($vocabulary, $document, $domcontainer);
  if (!$vocabnode) {
    trigger_error("Failed to create vocabnode using XML methods", E_USER_ERROR);
    return;
  }

  // If this was a canonic vocab, a full external URI would be used as the
  // identifier. For our own, the vocabulary path serves as a URI (or URN).
  $vocabulary_uri = taxonomy_xml_get_vocabulary_uri($vocabulary);
  $vocabnode->setAttributeNS($namespaces['rdf'], 'rdf:about', $vocabulary_uri);

  // Stamp the definition with the time of this request.
  $generated_at = xmlentities(format_date(REQUEST_TIME, 'long'));
  $version_info = $document->createElementNS($namespaces['owl'], 'owl:versionInfo', $generated_at);
  $vocabnode->appendChild($version_info);
}
/**
 * Append definitions of a list of terms on to a DOM container.
 *
 * This term dump will directly reflect any rdf_mapping retrieved from
 * the Drupal 'entity' schema, which is based on SKOS.
 *
 * <skos:Concept rdf:ID="term-1764"
 *   rdf:about="http://taxonomy.drupal7.gadget/taxonomy/term/1764">
 *   <rdfs:label>Corporate management (Internal)</rdfs:label>
 *   <skos:prefLabel>Corporate management (Internal)</skos:prefLabel>
 *   <skos:definition>
 *     Managing the organisation's own corporate body
 *   </skos:definition>
 *   <skos:member>3</skos:member>
 *   <skos:broader>1763</skos:broader>
 * </skos:Concept>
 *
 * No return, it acts on the DOM document directly.
 *
 * @param DOMElement $domcontainer
 *   The element that receives one child element per term.
 * @param array|object $termlist
 *   A FLAT array of all terms, internally cross-referenced to each other
 *   (via their 'parents' property) defining the tree structure. A single
 *   term object is also accepted.
 * @param int $recursion_behaviour
 *   One of the TAXONOMY_XML_*CHILDREN* constants. Only
 *   TAXONOMY_XML_NO_CHILDREN and TAXONOMY_XML_CHILDREN_REF_ONLY are acted
 *   on here; any other value adds no child information.
 */
function taxonomy_xml_rdf_add_terms(DOMElement &$domcontainer, $termlist, $recursion_behaviour = TAXONOMY_XML_CHILDREN_REF_ONLY) {
  if (!$termlist) {
    return;
  }
  $dom = $domcontainer->ownerDocument;
  $ns = rdf_get_namespaces();
  // Allow submission of a single term.
  if (!is_array($termlist)) {
    $termlist = array(
      $termlist,
    );
  }
  // D7 hook_taxonomy_term_load actually takes an array, not a singular.
  module_invoke_all('taxonomy_term_load', $termlist);

  foreach ($termlist as $term) {
    // rdf_entity_to_xml does a direct mapping from data structure to XML,
    // so picks up most of the default values, using rdf_mapping.
    // The term SHOULD have its entity mapping details attached by now. If
    // not, do a full reload (inefficient).
    if (empty($term->rdf_mapping)) {
      $reloaded = taxonomy_term_load($term->tid);
      if ($reloaded) {
        // The reloaded entity does not carry the tree's 'parents' list, so
        // copy it across or the skos:broader links below would be lost.
        // Clone first so the shared, cached entity object is not modified.
        if (isset($term->parents) && !isset($reloaded->parents)) {
          $reloaded = clone $reloaded;
          $reloaded->parents = $term->parents;
        }
        $term = $reloaded;
      }
    }

    $termnode = rdf_entity_to_xml($term, $dom, $domcontainer);
    if (!$termnode) {
      watchdog('taxonomy_xml', "Failed to create an XML entry for term, <pre>!data</pre>", array(
        '!data' => print_r($term, 1),
      ), WATCHDOG_ERROR);
      continue;
    }

    // Set either the local or (preferably) the canonic remote URI
    // as the element's 'about' attribute.
    $termnode->setAttributeNS($ns['rdf'], 'rdf:about', taxonomy_xml_get_term_guid($term));

    // Reference the vocabulary this term is a member of.
    $vocab_ref = $dom->createElementNS($ns['skos'], 'skos:member');
    $vocabulary = taxonomy_vocabulary_load($term->vid);
    $vocab_ref->setAttributeNS($ns['rdf'], 'rdf:resource', taxonomy_xml_get_vocabulary_uri($term->vid));
    if ($vocabulary) {
      // setAttributeNS takes care of escaping the value.
      $vocab_ref->setAttributeNS($ns['rdf'], 'rdf:value', $vocabulary->name);
    }
    $termnode->appendChild($vocab_ref);

    // Reference each parent as skos:broader. A parent id of 0 means 'root'.
    if (!empty($term->parents)) {
      foreach ((array) $term->parents as $parent_id) {
        if (!$parent_id) {
          continue;
        }
        $parent = taxonomy_term_load($parent_id);
        if (!$parent) {
          // The parent could not be loaded; there is nothing to point at.
          continue;
        }
        $parent_node = $dom->createElementNS($ns['skos'], 'skos:broader');
        $parent_node->setAttributeNS($ns['rdf'], 'rdf:resource', taxonomy_xml_get_term_guid($parent));
        $parent_node->setAttributeNS($ns['rdf'], 'rdf:value', $parent->name);
        $termnode->appendChild($parent_node);
      }
    }

    // Now add the children also.
    switch ($recursion_behaviour) {
      case TAXONOMY_XML_NO_CHILDREN:
        break;

      case TAXONOMY_XML_CHILDREN_REF_ONLY:
        // Only the immediate children, referenced as skos:narrower.
        $max_depth = 1;
        $tree = taxonomy_get_tree($term->vid, $term->tid, $max_depth);
        foreach ($tree as $child) {
          $child_node = $dom->createElementNS($ns['skos'], 'skos:narrower');
          $child_node->setAttributeNS($ns['rdf'], 'rdf:resource', taxonomy_xml_get_term_guid($child));
          $child_node->setAttributeNS($ns['rdf'], 'rdf:value', $child->name);
          $termnode->appendChild($child_node);
        }
        break;
    }

    // Workaround for large vocabs - renew the time limit after every term.
    drupal_set_time_limit(10);
  }
  // Done all terms in list.
}
/**
 * Return a term as RDF-XML.
 *
 * A sub-hook implementation of taxonomy_xml_{format}_create_term()
 *
 * @see taxonomy_xml_export_term()
 *
 * @param int|object $term
 *   Either a term object, or a term id.
 * @param int $depth
 *   How far currently recursed into the tree. Not used by this function.
 * @param int|null $max_depth
 *   How far to recurse into this item's children. Not used by this function.
 *
 * @return string|false
 *   An XML string, or FALSE if the term could not be loaded.
 */
function taxonomy_xml_rdf_create_term($term, $depth = -1, $max_depth = NULL) {
  // Keep what was asked for: $term is overwritten by the load below, and the
  // original value is what makes a failure log useful.
  $requested = $term;
  $term = is_numeric($term) ? taxonomy_term_load($term) : $term;
  if (empty($term)) {
    // Placeholder values must be strings, so render the arguments first.
    watchdog('taxonomy_xml', "NULL term loaded <pre>!data</pre>", array(
      '!data' => print_r(array($requested, $depth, $max_depth), 1),
    ), WATCHDOG_ERROR);
    return FALSE;
  }

  $domcontainer = taxonomy_xml_rdf_document();
  $dom = $domcontainer->ownerDocument;
  // Although we were only asked for one term,
  // child or parent terms may be mentioned when the entry is built.
  taxonomy_xml_rdf_add_terms($domcontainer, $term);

  $result = $dom->saveXML();
  // Minor layout tweak for readability.
  $result = preg_replace('|(<[^<]*/[^>]*>)|', "\$1\n", $result);
  $result = preg_replace('|><|', ">\n<", $result);
  return $result;
}
Functions
Name |
Description |
---|---|
taxonomy_xml_convert_index_to_sorted_objects | Sort a list of data objects into groups by 'type'. |
taxonomy_xml_rdf_add_terms | Append definitions of a list of terms on to a DOM container. |
taxonomy_xml_rdf_add_vocab | Create a vocabulary definition (just the def, not its terms) and insert it into the given document element. |
taxonomy_xml_rdf_create | Return an XML/RDF document representing this vocab |
taxonomy_xml_rdf_create_term | Return a term as RDF-XML. |
taxonomy_xml_rdf_format_info | Sub-hook. |
taxonomy_xml_rdf_make_term | Create the placeholder and fill in the values for this term. |
taxonomy_xml_rdf_parse | Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed. |
taxonomy_xml_rdf_parse_data_into_index | Invoke the ARC parser on the given data. |
taxonomy_xml_rdf_process_dbpedia | Special handling for dbpedia data. |
taxonomy_xml_rdf_process_freebase_vocab | FREEBASE only. |
taxonomy_xml_rdf_term_types | Return a list of 'types' of things that we may import as 'terms'. |
taxonomy_xml_rdf_vocabulary_types | Return a list of types of things that may behave like 'vocabularies'. |
Constants
Name |
Description |
---|---|
TAXONOMY_XML_CHILDREN_DETAILS | When dumping a term, Fully describe immediate child terms. (URI ref under them). |
TAXONOMY_XML_CHILDREN_RECURSIVE | When dumping a term, Fully describe all child terms. |
TAXONOMY_XML_CHILDREN_REF_ONLY | When dumping a term, Just list child term URIs. |
TAXONOMY_XML_NO_CHILDREN | When dumping a term, don't list child terms. |