You are here

function taxonomy_xml_rdf_parse in Taxonomy import/export via XML 5.2

Same name and namespace in other branches
  1. 5 rdf_format.inc \taxonomy_xml_rdf_parse()
  2. 6.2 rdf_format.inc \taxonomy_xml_rdf_parse()
  3. 6 rdf_format.inc \taxonomy_xml_rdf_parse()
  4. 7 formats/rdf_format.inc \taxonomy_xml_rdf_parse()

Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.

See formats.html readme for information about the RDF input supported.

Targets include: ICRA Content Rating (http://www.icra.org/vocabulary/), WordNet Lexicon (http://wordnet.princeton.edu/), SUMO (http://www.ontologyportal.org/)

... and the ontologies found at http://www.schemaweb.info/ that implement appropriate parts of the RDF Schema "rdfs" (eg Classes with subclassOf)

File

./rdf_format.inc, line 42
Include routines for RDF parsing and taxonomy/term creation.

Code

/**
 * Read in RDF taxonomies and vocabularies. Create vocabs and terms as needed.
 *
 * Parses the RDF/XML with the ARC parser, sorts the resulting resources by
 * rdf type, selects the ones that stand for vocabularies and terms, saves
 * every term and finally links the term relationships.
 *
 * @param $data
 *   The RDF/XML source text. Taken by reference and handed to the ARC parser.
 * @param $vid
 *   Vocabulary id requested by the caller. 0 means "use the vocabulary
 *   described in the source", in which case a placeholder vocabulary
 *   definition is queued if the source defines none.
 *   NOTE(review): the value is replaced by whatever
 *   taxonomy_xml_absorb_vocabulary_definitions() returns, even when a
 *   non-zero id was passed in - confirm this is intended.
 *
 * @return
 *   The number of terms processed, or FALSE when the ARC library is missing
 *   or the input could not be parsed.
 */
function taxonomy_xml_rdf_parse(&$data, $vid) {
  // Check for the parser file itself. The module directory always exists
  // while this code is running, so testing only that directory could never
  // detect a missing library.
  $arc_parser_file = drupal_get_path('module', 'taxonomy_xml') . '/arc/ARC_rdfxml_parser.php';
  if (!file_exists($arc_parser_file)) {
    drupal_set_message(t('This method requires the ARC library to be available. Please check the taxonomy_xml INSTALL.txt'));
    return false;
  }

  // Use ARC parser. Include exactly the file that was verified above.
  include_once './' . $arc_parser_file;
  $parser_args = array(
    "bnode_prefix" => "genid",
    "base" => "",
  );
  $parser = new ARC_rdfxml_parser($parser_args);
  $triples = $parser
    ->parse_data($data);
  if (!is_array($triples)) {
    // A non-array result is reported as the parser's problem message.
    drupal_set_message(t("Problem parsing input %message", array(
      '%message' => $triples,
    )), 'error');
    return false;
  }
  drupal_set_message(t("%count data triples (atomic statements) found in the source RDF doc", array(
    '%count' => count($triples),
  )));

  // The RDF input may come in several flavours,
  // Resources of the following 'types' may be cast into taxonomy terms for our purposes.
  // That is, an rdf:Class is a Drupal:term
  //
  // Add to this list as needed
  //
  $term_types = array(
    TAXONOMY_XML_RDF_NS . 'Property',
    'http://purl.org/dc/elements/1.1/subject',
    TAXONOMY_XML_RDFS_NS . 'Class',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'Word',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'NounWordSense',
    TAXONOMY_XML_W3C_WN_SCHEMA . 'NounSynset',
    'http://www.w3.org/2004/12/q/contentlabel#Category',
    TAXONOMY_XML_SKOS_NS . 'Concept',
  );

  // A Drupal 'vocabulary' is represented by an owl:Ontology
  // or other similar shaped constructs
  $vocabulary_types = array(
    TAXONOMY_XML_OWL_NS . 'Ontology',
    TAXONOMY_XML_RDF_NS . 'Description',
    'http://www.w3.org/2001/12/Glossary',
  );
  $resources_by_type = taxonomy_xml_convert_triples_to_sorted_objects($triples);

  // The resources are all initialized as data objects.
  // Resource types we expect to be dealing with are just vocabs and terms.
  drupal_set_message(t("Found %count different <strong>kinds</strong> of resources in the input : %types", array(
    '%count' => count($resources_by_type),
    '%types' => join(', ', array_keys($resources_by_type)),
  )));

  // Scan the sorted objects for vocabulary definitions
  // Hopefully there's only one vocab per file, but loop anyway
  $vocabularies = array();
  foreach ($vocabulary_types as $vocabulary_type) {
    if (isset($resources_by_type[$vocabulary_type]) && is_array($resources_by_type[$vocabulary_type])) {
      foreach ($resources_by_type[$vocabulary_type] as $uri => &$vocabulary_handle) {
        $vocabularies[$uri] =& $vocabulary_handle;
      }
      // Drop the loop variable's binding; the entries stored in
      // $vocabularies keep their own references.
      unset($vocabulary_handle);
    }
  }
  drupal_set_message(t("Found %count resources to be used as vocabulary definitions", array(
    '%count' => count($vocabularies),
  )));
  if ($vid == 0) {

    // We've been asked to use the vocab described in the source file.
    if (!$vocabularies) {

      // Create a placeholder.
      $vocabularies[] = array(
        'name' => 'Imported Vocabulary',
      );
    }
  }
  $vid = taxonomy_xml_absorb_vocabulary_definitions($vocabularies);

  // $vocabularies now contains a keyed array of target vocabularies the terms may be put into
  // $vid is the default one (most common is one vocab per input file) to be used unless otherwise defined per-term.
  // Gather the resources that will become terms.
  // Slightly long way (not using array_merge), as I need to merge indexed and by reference
  $terms = array();
  foreach ($term_types as $term_type) {
    if (isset($resources_by_type[$term_type]) && is_array($resources_by_type[$term_type])) {
      foreach ($resources_by_type[$term_type] as $uri => &$term_handle) {

        // Grab name/label early for debugging and indexing
        $predicates = $term_handle->predicates;
        if (isset($predicates['label'])) {
          $term_handle->name = $predicates['label'][0];
        }
        $terms[$uri] =& $term_handle;
      }
      unset($term_handle);
    }
  }
  drupal_set_message(t("Found %count resources to be imported as terms into vocabulary %vid", array(
    '%count' => count($terms),
    '%vid' => $vid,
  )));

  // $predicate_synonyms is a translation array to match rdf-speak with Drupal concepts
  $predicate_synonyms = taxonomy_xml_relationship_synonyms();

  //
  // START MAKING TERMS
  //
  foreach ($terms as $uri => &$term) {
    drupal_set_message(t("Reviewing term %uri '%name' and analyzing its properties", array(
      '%uri' => $uri,
      '%name' => isset($term->name) ? $term->name : '',
    )));
    if (!isset($term->vid)) {

      // This is just a default fallback. Imported terms should really have already chosen their vid.
      $term->vid = $vid;
    }

    // Build term from data
    // Convert all input predicates into attributes on the object the taxonomy.module will understand
    foreach ($term->predicates as $predicate => $values) {

      // First translate misc terminology synonyms to the canonical predicate used everywhere
      // This allows us to interpret several XML dialects at once
      if (isset($predicate_synonyms[$predicate]) && ($canonical = $predicate_synonyms[$predicate])) {
        $predicate = $canonical;
      }

      switch ($predicate) {
        case 'type':

        // These are already done. Ignore
        case 'subPropertyOf':

          // Useless, ignore also
          break;
        case TAXONOMY_XML_NAME:
          $term->name = taxonomy_xml_get_literal_string($values);
          break;
        case TAXONOMY_XML_DESCRIPTION:
          $term->description = taxonomy_xml_get_literal_string($values);
          break;
        case TAXONOMY_XML_PARENT:
        case TAXONOMY_XML_RELATED:
        case TAXONOMY_XML_CHILD:

          // A term relationship.
          // Record each referred URI under the canonical predicate, keyed by
          // the URI itself, to be linked in later.
          foreach ($values as $target_uri) {
            $term->predicates[$predicate][$target_uri] = $target_uri;
          }
          break;
        case TAXONOMY_XML_HAS_SYNONYM:
          $term->synonyms_array = isset($term->synonyms_array) ? array_merge($term->synonyms_array, $values) : $values;
          $term->synonyms = join("\n", array_unique($term->synonyms_array));
          break;
        case TAXONOMY_XML_IN_VOCABULARY:

          // This term need to be in the vocabulary referred to by this URI
          // check our known vocabs to see if they are recognised
          // Do we know a vocab with an ID matching this 'isdefinedby' value?
          foreach ($values as $value) {

            // probably just one...
            if (isset($vocabularies[$value]) && ($target_vocab = $vocabularies[$value])) {

              // I know this vocab! Only adopt its vid when it actually has
              // one, so the fallback assigned above is not wiped out.
              if (is_object($target_vocab) && isset($target_vocab->vid)) {
                $term->vid = $target_vocab->vid;
              }
            }
          }
          break;
        case 'unused':

          // Explicitly ignore these
          break;
        default:
          // Use real t() placeholders: the values come from the imported
          // document, so they must be escaped and must not be baked into the
          // translatable string.
          drupal_set_message(t("Dunno what to do with '%predicate' value of %values found in %uri", array(
            '%predicate' => $predicate,
            '%values' => print_r($values, 1),
            '%uri' => $uri,
          )));
      }
    }

    // Look for existing term matching this one and blend the properties
    // Ensure name is valid
    if (empty($term->name)) {

      // Fallback to a name, identifier derived (roughly) from the URI - not always meaningful, but all we have in some contexts.
      $term->name = basename($uri);
      drupal_set_message(t("Problem, we were unable to find a specific label for the term referred to as %uri. Guessing that %name will be good enough.", array(
        '%uri' => $uri,
        '%name' => $term->name,
      )));
    }
    if (empty($term->name)) {

      // Should never get here. Skip the term rather than saving a nameless one.
      drupal_set_message(t("Problem, this term %uri does not have a readable label.", array(
        '%uri' => $uri,
      )));
      continue;
    }

    // See if a definition already exists in the DB. Build on that.
    // Look in the term's own vocabulary, which may differ from the default
    // $vid when the source assigned the term to a specific vocabulary.
    $existing_term = _taxonomy_xml_get_term_placeholder($term->name, $term->vid);

    // Merge the old term objects properties into this one. Really just want its tid, but there may be more info I should not lose.
    // New input takes precedence over older data
    foreach ((array) $existing_term as $key => $value) {
      if (!isset($term->{$key})) {
        $term->{$key} = $value;
      }
    }

    // The term object is now as tidy as it can be as a self-contained entity.

    // MAINTAIN IDS
    // Because this is likely to be used with a site-cloning set-up, it would help if we tried to match IDs
    // OTOH, doing so could be very messy for other situations.
    // So,
    //  iff there is no pre-existing term with this id,
    //  create this one as a clone with the old ID.
    // This requires a little DB sneakiness.
    if (!empty($term->internal_id) && !taxonomy_get_term($term->internal_id)) {
      $term->tid = $term->internal_id;
      drupal_set_message(t("Doing sneaky import of %term_name re-using the internal id = %term_id", array(
        '%term_name' => $term->name,
        '%term_id' => $term->internal_id,
      )));
      db_query("INSERT INTO {term_data} (tid, name, description, vid, weight) VALUES (%d, '%s', '%s', %d, %d)", $term->tid, $term->name, $term->description, $term->vid, $term->weight);

      // Fudge the sequences table to patch the hack we just did, avoid over-writing later
      $current_id = db_result(db_query("SELECT id FROM {sequences} WHERE name = '%s'", '{term_data}_tid'));
      if ($current_id < $term->tid) {
        db_query("REPLACE INTO {sequences} VALUES ('%s', %d)", '{term_data}_tid', $term->tid);
      }
    }

    // taxonomy_save_term() is given an array copy of the term object.
    $save_term = (array) $term;
    taxonomy_save_term($save_term);

    // Re-retrieve the new term definition, just in case anything extra happened to it during processing
    $new_term = taxonomy_xml_get_term_by_name_from_vocab($term->name, $term->vid);
    if (!$new_term) {
      drupal_set_message(t("It seems like we failed to create and retrieve a term called %term_name", array(
        '%term_name' => $term->name,
      )), 'error');
    }

    // Merge retrieved values back over our main definition so the handles are up-to-date
    foreach ((array) $new_term as $key => $value) {
      $term->{$key} = $value;
    }
  }
  // Break the reference left behind by the by-reference loop. Without this,
  // reusing $term in a later by-value loop would overwrite the last entry of
  // $terms.
  unset($term);

  // end term-construction loop;

  // Now the terms are all happily created, create their relationships
  // Couldn't do so until they had all been given tids.
  taxonomy_xml_set_term_relations($terms);
  $term_list = array();
  foreach ($terms as $saved_term) {
    $term_list[] = l($saved_term->name, 'admin/content/taxonomy/edit/term/' . $saved_term->tid);
  }
  drupal_set_message(t('Updated %count term(s)', array(
    '%count' => count($terms),
  )) . ' <i>' . implode(', ', $term_list) . '.</i> ');

  // Load the default vocabulary so its name can be reported; fall back to
  // the id if it cannot be loaded.
  $vocabulary = taxonomy_get_vocabulary($vid);
  $vocabulary_name = ($vocabulary && isset($vocabulary->name)) ? $vocabulary->name : $vid;
  drupal_set_message(t("\n    Finished importing vocabulary %vocab_name. \n    You may now need to <a href='!settings_link'>Review the vocabulary settings</a> \n    or <a href='!list_link'>List the terms</a>", array(
    '%vocab_name' => $vocabulary_name,
    '!settings_link' => url('admin/content/taxonomy/edit/vocabulary/' . $vid),
    '!list_link' => url('admin/content/taxonomy/' . $vid),
  )));
  return count($term_list);
}