You are here

xml.inc in Migrate 7.2

Same filename and directory in other branches
  1. 6.2 plugins/sources/xml.inc

Support for migration from XML sources.

NOTE: There are two methods supported in this file.

1) List - ids are listed in an index xml file and the data for each item is stored in a separate xml file per item. Use MigrateSourceList class as the source.

2) MultiItems - ids are part of the item and all items are stored in a single xml file. Use MigrateSourceMultiItems class as the source.

Both of these methods are described in more detail in the wine migration example.

File

plugins/sources/xml.inc
View source
<?php

/**
 * @file
 * Support for migration from XML sources.
 *
 * NOTE: There are two methods supported in this file.
 *
 * 1) List - ids are listed in an index xml file and the data for each item is
 *      stored in a separate xml file per item. Use MigrateSourceList class
 *      as the source.
 *
 * 2) MultiItems - ids are part of the item and all items are stored in a
 *      single xml file. Use MigrateSourceMultiItems class as the source.
 *
 * Both of these methods are described in more detail in the wine migration
 * example.
 */

/* ========================================================================== */

/*                              List Method                                   */

/* ========================================================================== */

/**
 * Implementation of MigrateList, for retrieving a list of IDs to be migrated
 * from an XML document.
 */
class MigrateListXML extends MigrateList {

  /**
   * A URL pointing to an XML document containing a list of IDs to be processed.
   *
   * @var string
   */
  protected $listUrl;

  /**
   * An array of namespaces to explicitly register before Xpath queries.
   *
   * Keys are the prefixes and values the namespace URIs, as passed to
   * SimpleXMLElement::registerXPathNamespace().
   *
   * @var array
   */
  protected $namespaces;

  /**
   * Constructs the list source.
   *
   * @param string $list_url
   *   URL of the XML document listing the IDs.
   * @param array $namespaces
   *   Optional namespaces (prefix => URI) to register on the loaded document.
   */
  public function __construct($list_url, array $namespaces = array()) {
    parent::__construct();
    $this->listUrl = $list_url;
    $this->namespaces = $namespaces;

    // Suppress errors during parsing, so we can pick them up after.
    libxml_use_internal_errors(TRUE);
  }

  /**
   * {@inheritdoc}
   *
   * Our public face is the URL we're getting items from.
   */
  public function __toString() {
    return $this->listUrl;
  }

  /**
   * {@inheritdoc}
   *
   * Load the XML at the given URL, and return an array of the IDs found
   * within it.
   *
   * @return array|null
   *   The IDs found, or NULL when the document could not be loaded (the
   *   libxml errors are displayed in that case).
   */
  public function getIdList() {
    migrate_instrument_start("Retrieve {$this->listUrl}");
    $xml = simplexml_load_file($this->listUrl);
    migrate_instrument_stop("Retrieve {$this->listUrl}");

    if ($xml === FALSE) {
      $this->reportLoadFailure();
      return NULL;
    }

    $this->registerNamespaces($xml);
    return $this->getIDsFromXML($xml);
  }

  /**
   * Gets an array of the IDs found in a XML.
   *
   * Given an XML object, parse out the IDs for processing and return them as an
   * array. The default implementation assumes the IDs are simply the values of
   * the top-level elements - in most cases, you will need to override this to
   * reflect your particular XML structure.
   *
   * @param SimpleXMLElement $xml
   *   Object from which the IDs are read.
   *
   * @return array
   *   Extracted IDs, with duplicates removed.
   */
  protected function getIDsFromXML(SimpleXMLElement $xml) {
    $ids = array();
    foreach ($xml as $element) {
      $ids[] = (string) $element;
    }

    // Additionally, if there are any namespaces registered, try to parse
    // elements with namespaces as well.
    if ($namespaces = $xml->getNamespaces()) {
      foreach ($namespaces as $url) {
        foreach ($xml->children($url) as $element) {
          $ids[] = (string) $element;
        }
      }
    }
    return array_unique($ids);
  }

  /**
   * {@inheritdoc}
   *
   * Return a count of all available IDs from the source listing.
   * The default implementation assumes the count of top-level elements
   * reflects the number of IDs available - in many cases, you will need
   * to override this to reflect your particular XML structure.
   *
   * @return int
   *   Number of elements found; 0 when the document could not be loaded.
   */
  public function computeCount() {
    $xml = simplexml_load_file($this->listUrl);

    // A failed load yields FALSE, which must not reach registerNamespaces()
    // (typed SimpleXMLElement) or the method calls below.
    if ($xml === FALSE) {
      $this->reportLoadFailure();
      return 0;
    }
    $this->registerNamespaces($xml);

    // Number of sourceid elements beneath the top-level element.
    $count = count($xml);

    // Additionally, if there are any namespaces registered, try to count
    // elements with namespaces as well.
    if ($namespaces = $xml->getNamespaces()) {
      foreach ($namespaces as $url) {
        $count += count($xml->children($url));
      }
    }
    return $count;
  }

  /**
   * Explicitly register namespaces on an XML element.
   *
   * @param SimpleXMLElement $xml
   *   A SimpleXMLElement to register the namespaces on.
   */
  protected function registerNamespaces(SimpleXMLElement &$xml) {
    foreach ($this->namespaces as $prefix => $namespace) {
      $xml->registerXPathNamespace($prefix, $namespace);
    }
  }

  /**
   * Displays the pending libxml errors for a failed load of the list URL.
   *
   * The libxml error buffer is emptied afterwards so the same errors are not
   * picked up again by a later, unrelated report.
   */
  private function reportLoadFailure() {
    Migration::displayMessage(t('Loading of !listUrl failed:', array(
      '!listUrl' => $this->listUrl,
    )));
    foreach (libxml_get_errors() as $error) {
      Migration::displayMessage(MigrateItemsXML::parseLibXMLError($error));
    }
    libxml_clear_errors();
  }

}

/**
 * Implementation of MigrateItem, for retrieving a parsed XML document given
 * an ID provided by a MigrateList class.
 */
class MigrateItemXML extends MigrateItem {

  /**
   * URL template of the XML document holding the data for one item.
   *
   * The default constructItemUrl() substitutes the ':id' token in it.
   *
   * @var string
   */
  protected $itemUrl;

  /**
   * Namespaces (prefix => URI) to register before Xpath queries.
   *
   * @var array
   */
  protected $namespaces;

  /**
   * Constructs the item source.
   *
   * @param string $item_url
   *   URL (template) of the per-item XML document.
   * @param array $namespaces
   *   Optional namespaces (prefix => URI) to register on each loaded document.
   */
  public function __construct($item_url, array $namespaces = array()) {
    parent::__construct();
    $this->namespaces = $namespaces;
    $this->itemUrl = $item_url;

    // Collect libxml errors internally so they can be reported afterwards.
    libxml_use_internal_errors(TRUE);
  }

  /**
   * Registers the configured namespaces on an XML element.
   *
   * @param SimpleXMLElement $xml
   *   The element whose Xpath context receives the namespaces.
   */
  protected function registerNamespaces(SimpleXMLElement &$xml) {
    foreach ($this->namespaces as $prefix => $uri) {
      $xml->registerXPathNamespace($prefix, $uri);
    }
  }

  /**
   * {@inheritdoc}
   *
   * Loads the XML document for the given ID and wraps it in a row object.
   *
   * @param mixed $id
   *   ID of the item to load.
   *
   * @return stdClass|null
   *   Object whose xml property holds the loaded document, or NULL when the ID
   *   or the resulting URL is empty, or when loading failed (in which case an
   *   error message is saved to the current migration's map).
   */
  public function getItem($id) {
    // Nothing to do without an ID.
    if (empty($id)) {
      return NULL;
    }

    // Nor without a URL to fetch.
    $item_url = $this->constructItemUrl($id);
    if (empty($item_url)) {
      return NULL;
    }

    $document = $this->loadXmlUrl($item_url);
    if ($document === FALSE) {
      // Record the failure, together with the libxml details, against the ID.
      $current_migration = Migration::currentMigration();
      $text = t('Loading of !objecturl failed:', array(
        '!objecturl' => $item_url,
      ));
      foreach (libxml_get_errors() as $libxml_error) {
        $text .= "\n" . $libxml_error->message;
      }
      $current_migration->getMap()->saveMessage(array($id), $text, MigrationBase::MESSAGE_ERROR);
      libxml_clear_errors();
      return NULL;
    }

    $this->registerNamespaces($document);
    $row = new stdClass();
    $row->xml = $document;
    return $row;
  }

  /**
   * Builds the URL of the document for one item.
   *
   * The default implementation simply replaces the :id token in the URL with
   * the ID obtained from MigrateListXML. Override if the item URL is not so
   * easily expressed from the ID.
   *
   * @param mixed $id
   *   XML item ID.
   *
   * @return string
   *   The item URL with the token replaced.
   */
  protected function constructItemUrl($id) {
    $url = str_replace(':id', $id, $this->itemUrl);
    return $url;
  }

  /**
   * Loads the XML document at a URL.
   *
   * Default XML loader - just use Simplexml directly. This can be overridden
   * for preprocessing of XML (removal of unwanted elements, caching of XML if
   * the source service is slow, etc.)
   *
   * @param string $item_url
   *   URL to the XML file.
   *
   * @return SimpleXMLElement|false
   *   The loaded document, or FALSE when simplexml_load_file() fails.
   */
  protected function loadXmlUrl($item_url) {
    $document = simplexml_load_file($item_url);
    return $document;
  }

  /**
   * Implements MigrateItem::hash().
   *
   * $row->xml is a SimpleXMLElement, so the hash is computed from its XML
   * string form rather than from the object itself.
   *
   * @param stdClass $row
   *   Row object whose xml property holds the item's SimpleXMLElement.
   *
   * @return string
   *   MD5 hash of the serialized XML string.
   */
  public function hash($row) {
    migrate_instrument_start('MigrateItemXML::hash');
    $markup = $row->xml->asXML();
    $digest = md5(serialize($markup));
    migrate_instrument_stop('MigrateItemXML::hash');
    return $digest;
  }

}

/**
 * Field mapping that additionally carries an xpath, for XML sources.
 */
class MigrateXMLFieldMapping extends MigrateFieldMapping {

  /**
   * The xpath used to retrieve the data for this field from the XML.
   *
   * Stays unset (NULL) until xpath() is called.
   *
   * @var string
   */
  protected $xpath;

  /**
   * Sets the xpath for this field mapping.
   *
   * @param string $xpath
   *   The xpath expression locating the field's data.
   *
   * @return MigrateXMLFieldMapping
   *   This mapping, so calls can be chained.
   */
  public function xpath($xpath) {
    $this->xpath = $xpath;
    return $this;
  }

  /**
   * Returns the xpath previously set through xpath().
   *
   * @return string|null
   *   The xpath, or NULL when none has been set.
   */
  public function getXpath() {
    return $this->xpath;
  }

}

/**
 * Migrations using XML sources should extend this class instead of Migration.
 */
abstract class XMLMigration extends Migration {

  /**
   * {@inheritdoc}
   *
   * Overridden so that the mapping created is a MigrateXMLFieldMapping, which
   * can carry an xpath.
   *
   * @todo Find a cleaner way to just substitute a different mapping class.
   *
   * @param string|null $destination_field
   *   Machine name of the destination field.
   * @param string|null $source_field
   *   Name of the source field.
   * @param bool $warn_on_override
   *   Set to FALSE to prevent warnings when there's an existing mapping
   *   for this destination field.
   *
   * @return MigrateXMLFieldMapping
   *   The newly created mapping.
   */
  public function addFieldMapping($destination_field, $source_field = NULL, $warn_on_override = TRUE) {
    $has_destination = !is_null($destination_field);

    // Warn of duplicate mappings.
    if ($warn_on_override && $has_destination && isset($this->codedFieldMappings[$destination_field])) {
      $warning = t('!name addFieldMapping: !dest was previously mapped, overridden', array(
        '!name' => $this->machineName,
        '!dest' => $destination_field,
      ));
      self::displayMessage($warning, 'warning');
    }

    $mapping = new MigrateXMLFieldMapping($destination_field, $source_field);
    if ($has_destination) {
      // Keyed by destination, so a later mapping replaces an earlier one.
      $this->codedFieldMappings[$destination_field] = $mapping;
    }
    else {
      $this->codedFieldMappings[] = $mapping;
    }
    return $mapping;
  }

  /**
   * {@inheritdoc}
   *
   * A normal $data_row has all the input data as top-level fields - in this
   * case, however, the data is embedded within a SimpleXMLElement object in
   * $data_row->xml. Explode that out to the normal form, and pass on to the
   * normal implementation.
   */
  protected function applyMappings() {
    // We only know what data to pull from the xpaths in the mappings.
    foreach ($this->getFieldMappings() as $mapping) {
      $source = $mapping->getSourceField();

      // Skip mappings without a source, and sources that already have a value.
      if (!$source || isset($this->sourceValues->{$source})) {
        continue;
      }

      $xpath = $mapping->getXpath();
      if (!$xpath) {
        continue;
      }

      // Derived class may override applyXpath().
      $extracted = $this->applyXpath($this->sourceValues, $xpath);
      if (!is_null($extracted)) {
        $this->sourceValues->{$source} = $extracted;
      }
    }
    parent::applyMappings();
  }

  /**
   * Gets a value from the row's XML using an xpath.
   *
   * Default implementation - straightforward xpath application.
   *
   * @param stdClass $data_row
   *   Row whose xml property holds the SimpleXMLElement to query.
   * @param string $xpath
   *   The xpath used to find the value.
   *
   * @return string|array|null
   *   The string value of the single match, an array of string values when
   *   there are several matches, or NULL when nothing matched.
   */
  public function applyXpath($data_row, $xpath) {
    $matches = $data_row->xml->xpath($xpath);
    if (!$matches) {
      return NULL;
    }

    if (count($matches) > 1) {
      $values = array();
      foreach ($matches as $match) {
        $values[] = (string) $match;
      }
      return $values;
    }

    return (string) $matches[0];
  }

}

/* ========================================================================== */

/*                            MultiItems Method                               */

/* ========================================================================== */

/**
 * Implementation of MigrateItems, for providing a list of IDs and for
 * retrieving a parsed XML document given an ID from this list.
 */
class MigrateItemsXML extends MigrateItems {

  /**
   * An array with all urls to available xml files.
   *
   * @var array
   */
  protected $urls;

  /**
   * Define the current cursor over the urls array.
   *
   * @var string
   */
  protected $currentUrl;

  /**
   * An array of namespaces to explicitly register before Xpath queries.
   *
   * @var array
   */
  protected $namespaces;

  /**
   * Stores the loaded XML document from currentUrl.
   *
   * FALSE until a document has been loaded, or when the last load failed.
   *
   * @var SimpleXMLElement|false
   */
  protected $currentXml = FALSE;

  /**
   * To find the right url depending on the id, we'll build a map in the form of
   * an array('url1' => $ids, 'url2' => $ids, ...).
   *
   * Remains NULL until getIdList() has successfully loaded at least one url.
   *
   * @var array|null
   */
  protected $idsMap = NULL;

  /**
   * Stores the id list from all urls.
   *
   * Remains NULL until getIdList() has found at least one ID.
   *
   * @var array|null
   */
  protected $cacheIDs = NULL;

  /**
   * Items parsed from the document at currentUrl, keyed by item ID.
   *
   * Each value is a stdClass whose xml property holds the item's
   * SimpleXMLElement. NULL when nothing has been parsed yet.
   *
   * @var array|null
   */
  protected $currentItems = NULL;

  /**
   * xpath identifying the element used for each item.
   *
   * @var string
   */
  protected $itemXpath;

  /**
   * xpath identifying the subelement under itemXpath that holds the id for
   * each item.
   *
   * @var string
   */
  protected $itemIDXpath;

  /**
   * Gets xpath identifying the element used for each item.
   *
   * @return string
   *   xpath
   */
  public function getItemXpath() {
    return $this->itemXpath;
  }

  /**
   * Getter for itemIDXpath.
   *
   * @return string
   *   xpath, relative to an item element, of the item's ID.
   */
  public function getIDXpath() {
    return $this->itemIDXpath;
  }

  /**
   * Constructs the multi-item source.
   *
   * @param string|array $urls
   *   One URL, or an array of URLs, of the XML documents holding the items.
   * @param string $item_xpath
   *   xpath identifying the element used for each item.
   * @param string $item_id_xpath
   *   xpath, relative to an item element, of the item's ID.
   * @param array $namespaces
   *   Optional namespaces (prefix => URI) to register before Xpath queries.
   */
  public function __construct($urls, $item_xpath = 'item', $item_id_xpath = 'id', array $namespaces = array()) {
    parent::__construct();
    if (!is_array($urls)) {
      $urls = array($urls);
    }
    $this->urls = $urls;
    $this->itemXpath = $item_xpath;
    $this->itemIDXpath = $item_id_xpath;
    $this->namespaces = $namespaces;

    // Suppress errors during parsing, so we can pick them up after.
    libxml_use_internal_errors(TRUE);
  }

  /**
   * Explicitly register namespaces on an XML element.
   *
   * @param SimpleXMLElement $xml
   *   A SimpleXMLElement to register the namespaces on.
   */
  protected function registerNamespaces(SimpleXMLElement &$xml) {
    foreach ($this->namespaces as $prefix => $namespace) {
      $xml->registerXPathNamespace($prefix, $namespace);
    }
  }

  /**
   * Our public face is the URL list we're getting items from.
   *
   * @return string
   *   HTML listing the urls followed by the item and item ID xpaths.
   */
  public function __toString() {
    $urls = implode('</li><li>', $this->urls);

    // Prepare a list of urls.
    $output = '<b>urls</b> = <ul><li>' . $urls . '</li></ul>';
    $output .= '<br />';

    // Add selection rules to the end.
    $output .= '<b>item xpath</b> = ' . $this->itemXpath . ' | ';
    $output .= '<b>item ID xpath</b> = ' . $this->itemIDXpath;
    return $output;
  }

  /**
   * Load and return the xml from currentUrl.
   *
   * The document is (re)loaded on every call while currentUrl is set; when
   * currentUrl is empty the previously stored value is returned untouched.
   * On a failed load the libxml errors are displayed. They are deliberately
   * left in the libxml buffer, because getItem() appends them to the message
   * it saves.
   *
   * @return SimpleXMLElement|false
   *   The loaded document, or FALSE when loading failed or nothing has been
   *   loaded yet.
   */
  public function &xml() {
    if (!empty($this->currentUrl)) {
      $this->currentXml = simplexml_load_file($this->currentUrl);
      if ($this->currentXml === FALSE) {
        Migration::displayMessage(t('Loading of !currentUrl failed:', array(
          '!currentUrl' => $this->currentUrl,
        )));
        foreach (libxml_get_errors() as $error) {
          Migration::displayMessage(self::parseLibXMLError($error));
        }
      }
      else {
        $this->registerNamespaces($this->currentXml);
      }
    }
    return $this->currentXml;
  }

  /**
   * Parses a LibXMLError to a error message string.
   *
   * @param LibXMLError $error
   *   Error reported by libxml.
   *
   * @return string
   *   Message holding the error level, code, text, line, column and file.
   */
  public static function parseLibXMLError(LibXMLError $error) {
    $error_code_name = 'Unknown Error';
    switch ($error->level) {
      case LIBXML_ERR_WARNING:
        $error_code_name = t('Warning');
        break;

      case LIBXML_ERR_ERROR:
        $error_code_name = t('Error');
        break;

      case LIBXML_ERR_FATAL:
        $error_code_name = t('Fatal Error');
        break;
    }
    return t("!libxmlerrorcodename !libxmlerrorcode: !libxmlerrormessage\n" . "Line: !libxmlerrorline\n" . "Column: !libxmlerrorcolumn\n" . "File: !libxmlerrorfile", array(
      '!libxmlerrorcodename' => $error_code_name,
      '!libxmlerrorcode' => $error->code,
      '!libxmlerrormessage' => trim($error->message),
      '!libxmlerrorline' => $error->line,
      '!libxmlerrorcolumn' => $error->column,
      '!libxmlerrorfile' => $error->file ? $error->file : NULL,
    ));
  }

  /**
   * Load ID's from URLs.
   *
   * Load ids from all urls and map them in idsMap depending on the currentURL.
   *
   * After ids were fetched from all urls store them in cacheIDs and return the
   * whole list. Note that currentUrl is left pointing at the last url.
   *
   * @return array|null
   *   The unique IDs from all urls, or NULL when none were found.
   */
  public function getIdList() {
    $ids = array();
    foreach ($this->urls as $url) {
      migrate_instrument_start("Retrieve {$url}");

      // Make sure, to load new xml.
      $this->currentUrl = $url;
      $xml = $this->xml();
      if ($xml !== FALSE) {
        $url_ids = $this->getIdsFromXML($xml);
        $this->idsMap[$url] = $url_ids;
        $ids = array_merge($ids, $url_ids);
      }
      migrate_instrument_stop("Retrieve {$url}");
    }
    if (!empty($ids)) {
      $this->cacheIDs = array_unique($ids);
      return $this->cacheIDs;
    }
    return NULL;
  }

  /**
   * Given an XML object, parse out the IDs for processing and return them as
   * an array. The location of the IDs in the XML are based on the item xpath
   * and item ID xpath set in the constructor.
   *    eg, xpath = itemXpath . '/' . itemIDXpath
   *
   * Nothing is cached here; the document is queried on every call.
   *
   * @param SimpleXMLElement $xml
   *   The document to extract the IDs from.
   *
   * @return array
   *   The unique IDs found, as strings. Items without an ID are skipped.
   */
  protected function getIDsFromXML(SimpleXMLElement $xml) {
    $result = $xml->xpath($this->itemXpath);
    $ids = array();
    if ($result) {
      foreach ($result as $element) {
        if (!isset($element)) {
          continue;
        }

        // Namespaces must be reapplied after xpath().
        $this->registerNamespaces($element);
        $id = $this->getItemID($element);
        if (!is_null($id)) {
          $ids[] = (string) $id;
        }
      }
    }
    return array_unique($ids);
  }

  /**
   * Return a count of all available IDs from the source listing.
   *
   * @return int
   *   Count of available IDs; 0 when no IDs could be found.
   */
  public function computeCount() {
    if (!isset($this->cacheIDs)) {
      $this->getIdList();
    }

    // cacheIDs is still NULL when getIdList() found nothing; count(NULL)
    // raises a warning on PHP 7.2+ and a TypeError on PHP 8.
    return isset($this->cacheIDs) ? count($this->cacheIDs) : 0;
  }

  /**
   * Load the XML at the current URL, and return an array of its items.
   *
   * The items are always parsed afresh from the document just loaded.
   *
   * @return array|null
   *   Array of the items found within it, or NULL when the document could not
   *   be loaded or contains no items.
   */
  public function getAllItems() {
    $xml = $this->xml();
    if ($xml !== FALSE) {
      return $this->getItemsFromXML($xml, TRUE);
    }
    return NULL;
  }

  /**
   * Parses out the items from a given XML object.
   *
   * Given an XML object, parse out the items for processing and return them as
   * an array. The location of the items in the XML are based on the item xpath
   * set in the constructor. Items from currentUrl are cached. The list of items
   * is returned from the cache except when this is the first call
   * (ie, cache is NULL) OR the refresh parameter is TRUE.
   *
   * Items are cached as an array of key=ID and value=stdClass object with
   * attribute xml containing the xml SimpleXMLElement object of the item.
   *
   * @param SimpleXMLElement $xml
   *   XML to parse.
   * @param bool $refresh
   *   TRUE to parse the items again, FALSE to serve them from the cache when
   *   one is available.
   *
   * @return array|null
   *   Array of obtained items, or NULL when the item xpath matched nothing.
   */
  public function getItemsFromXML(SimpleXMLElement $xml, $refresh = FALSE) {
    // Serve from the cache only when no refresh was requested. The test used
    // to be inverted, which made a refresh return stale items.
    if ($refresh === FALSE && $this->currentItems != NULL) {
      return $this->currentItems;
    }
    $this->currentItems = NULL;
    $items = array();
    $result = $xml->xpath($this->itemXpath);
    if ($result) {
      foreach ($result as $item_xml) {
        if (!isset($item_xml)) {
          continue;
        }

        // Namespaces must be reapplied after xpath().
        $this->registerNamespaces($item_xml);
        $id = $this->getItemID($item_xml);
        $item = new stdClass();
        $item->xml = $item_xml;
        $items[$id] = $item;
      }
      $this->currentItems = $items;
      return $this->currentItems;
    }
    else {
      return NULL;
    }
  }

  /**
   * Get the item ID from the itemXML based on itemIDXpath.
   *
   * @param SimpleXMLElement $item_xml
   *   Element from which the ID is read.
   *
   * @return string|null
   *   The item ID, or NULL when the ID xpath matched nothing.
   */
  protected function getItemID($item_xml) {
    return $this->getElementValue($item_xml, $this->itemIDXpath);
  }

  /**
   * Get an element from the itemXML based on an xpath.
   *
   * @param SimpleXMLElement $item_xml
   *   Element from which the value is read.
   * @param string $xpath
   *   xpath used to locate the value.
   *
   * @return string|null
   *   String value of the first match, or NULL when nothing matched.
   */
  protected function getElementValue($item_xml, $xpath) {
    $value = NULL;
    if ($item_xml->asXML()) {
      $result = $item_xml->xpath($xpath);
      if ($result) {
        $value = (string) $result[0];
      }
    }
    return $value;
  }

  /**
   * Implementers are expected to return an object representing a source item.
   * Items from currentUrl are cached as an array of key=ID and value=stdClass
   * object with attribute xml containing the xml SimpleXMLElement object of the
   * item.
   *
   * @param mixed $id
   *   ID of the item to return.
   *
   * @return stdClass|null
   *   The item, or NULL when the ID is empty or the item could not be found
   *   (in which case an error message is saved to the current migration's
   *   map).
   */
  public function getItem($id) {
    // Make sure we actually have an ID.
    if (empty($id)) {
      return NULL;
    }

    $item = NULL;

    // If $id is in currentXml return the right item immediately.
    if (isset($this->currentItems[$id])) {
      $item = $this->currentItems[$id];
    }
    else {
      // Otherwise find the right url and get the items from.
      if ($this->idsMap === NULL) {
        // Populate the map.
        $this->getIdList();
      }

      // The map is still NULL when none of the urls could be loaded.
      if (is_array($this->idsMap)) {
        foreach ($this->idsMap as $url => $ids) {
          if (in_array($id, $ids, TRUE)) {
            $this->currentItems = NULL;
            $this->currentUrl = $url;
            $items = $this->getAllItems();

            // getAllItems() yields NULL when the url fails to load this time.
            $item = isset($items[$id]) ? $items[$id] : NULL;
          }
        }
      }
    }

    if (!empty($item)) {
      return $item;
    }
    else {
      $migration = Migration::currentMigration();
      $message = t('Loading of item XML for ID !id failed:', array(
        '!id' => $id,
      ));
      foreach (libxml_get_errors() as $error) {
        $message .= "\n" . $error->message;
      }
      $migration->getMap()->saveMessage(array($id), $message, MigrationBase::MESSAGE_ERROR);
      libxml_clear_errors();
      return NULL;
    }
  }

  /**
   * {@inheritdoc}
   *
   * $row->xml is a SimpleXMLElement, so the hash is computed from its XML
   * string form rather than from the object itself.
   *
   * @param stdClass $row
   *   Row object whose xml property holds the item's SimpleXMLElement.
   *
   * @return string
   *   MD5 hash of the serialized XML string.
   */
  public function hash($row) {
    migrate_instrument_start('MigrateItemXML::hash');
    $hash = md5(serialize($row->xml->asXML()));
    migrate_instrument_stop('MigrateItemXML::hash');
    return $hash;
  }

}

/**
 * Makes an XMLReader object iterable, returning elements matching a restricted
 * xpath-like syntax.
 */
class MigrateXMLReader implements Iterator {

  /**
   * The XMLReader we are encapsulating.
   *
   * @var XMLReader
   */
  public $reader;

  /**
   * URL of the source XML file.
   *
   * @var string
   */
  public $url;

  /**
   * Array of the element names from the query, 0-based from the first (root)
   * element. For example, '/file/article' would be stored as
   * array(0 => 'file', 1 => 'article').
   *
   * @var array
   */
  protected $elementsToMatch = array();

  /**
   * If the element query is filtering by an attribute name=value, the name of
   * the attribute in question.
   *
   * @var string
   */
  protected $attributeName = NULL;

  /**
   * If the element query is filtering by an attribute name=value, the value of
   * the attribute in question.
   *
   * @var string
   */
  protected $attributeValue = NULL;

  /**
   * Array representing the path to the current element as we traverse the XML.
   * For example, if in an XML string like '<file><article>...</article></file>'
   * we are positioned within the article element, currentPath will be
   * array(0 => 'file', 1 => 'article').
   *
   * @var array
   */
  protected $currentPath = array();

  /**
   * Query string used to retrieve the elements from the XML file.
   *
   * @var string
   */
  public $elementQuery;

  /**
   * Xpath query string used to retrieve the primary key value from each
   * element.
   *
   * @var string
   */
  public $idQuery;

  /**
   * Current element object when iterating.
   *
   * @var SimpleXMLElement
   */
  protected $currentElement = NULL;

  /**
   * Value of the ID for the current element when iterating.
   *
   * @var string
   */
  protected $currentId = NULL;

  /**
   * When matching element names, whether to compare to the namespace-prefixed
   * name, or the local name.
   *
   * @var bool
   */
  protected $prefixedName = FALSE;

  /**
   * Prepares our extensions to the XMLReader object.
   *
   * @param string $xml_url
   *   URL of the XML file to be parsed.
   * @param string $element_query
   *   Query string in a restricted xpath format, for selecting elements to be
   *   iterated: a leading slash, a slash-separated element path starting at
   *   the root element, and optionally one trailing [@attribute="value"]
   *   filter.
   * @param string $id_query
   *   Query string to the unique identifier for an element,
   *   relative to the root of that element. This supports the full
   *   xpath syntax.
   */
  public function __construct($xml_url, $element_query, $id_query) {
    $this->reader = new XMLReader();
    $this->url = $xml_url;
    $this->elementQuery = $element_query;
    $this->idQuery = $id_query;

    // Suppress errors during parsing, so we can pick them up after.
    libxml_use_internal_errors(TRUE);

    // Parse the element query. First capture group is the element path, second
    // (if present) is the attribute.
    preg_match_all('|^/([^\\[]+)(.*)$|', $element_query, $matches);
    $element_path = $matches[1][0];
    $this->elementsToMatch = explode('/', $element_path);
    $attribute_query = $matches[2][0];
    if ($attribute_query) {

      // Matches [@attribute="value"] (with either single- or double-quotes).
      preg_match_all('|^\\[@([^=]+)=[\'"](.*)[\'"]\\]$|', $attribute_query, $matches);
      $this->attributeName = $matches[1][0];
      $this->attributeValue = $matches[2][0];
    }

    // If the element path contains any colons, it must be specifying
    // namespaces, so we need to compare using the prefixed element
    // name in next().
    if (strpos($element_path, ':')) {
      $this->prefixedName = TRUE;
    }
  }

  /**
   * Implementation of Iterator::rewind().
   *
   * Reopens the source URL and positions on the first matching element. A
   * message is displayed when the URL cannot be opened.
   */
  #[\ReturnTypeWillChange]
  public function rewind() {
    // (Re)open the provided URL.
    $this->reader->close();
    $status = $this->reader->open($this->url, NULL, LIBXML_NOWARNING);

    // Reset our path tracker.
    $this->currentPath = array();
    if ($status) {
      // Load the first matching element and its ID.
      $this->next();
    }
    else {
      Migration::displayMessage(t('Could not open XML file !url', array(
        '!url' => $this->url,
      )));
    }
  }

  /**
   * Implementation of Iterator::next().
   *
   * Advances the reader to the next element matching the element query and
   * stores it, with its ID, as the current element. When the end of the
   * document is reached without a match, the current element and ID are left
   * NULL, which makes valid() return FALSE.
   *
   * @throws Exception
   *   When the ID xpath query does not return an array.
   */
  #[\ReturnTypeWillChange]
  public function next() {
    migrate_instrument_start('MigrateXMLReader::next');
    $this->currentElement = $this->currentId = NULL;

    // Loop over each node in the XML file, looking for elements at a path
    // matching the input query string (represented in $this->elementsToMatch).
    while ($this->reader->read()) {
      if ($this->reader->nodeType == XMLReader::ELEMENT) {
        if ($this->prefixedName) {
          $this->currentPath[$this->reader->depth] = $this->reader->name;
        }
        else {
          $this->currentPath[$this->reader->depth] = $this->reader->localName;
        }
        if ($this->currentPath == $this->elementsToMatch) {

          // We're positioned to the right element path - if filtering on an
          // attribute, check that as well before accepting this element.
          if (empty($this->attributeName) || $this->reader->getAttribute($this->attributeName) == $this->attributeValue) {

            // We've found a matching element - get a SimpleXML object
            // representing it. We must associate the DOMNode with a
            // DOMDocument to be able to import it into SimpleXML.
            // Despite appearances, this is almost twice as fast as
            // simplexml_load_string($this->readOuterXML());
            $node = $this->reader->expand();
            if ($node) {
              $dom = new DOMDocument();
              $node = $dom->importNode($node, TRUE);
              $dom->appendChild($node);
              $this->currentElement = simplexml_import_dom($node);
              $idnode = $this->currentElement->xpath($this->idQuery);
              if (is_array($idnode)) {
                $this->currentId = (string) reset($idnode);
              }
              else {
                throw new Exception(t('Failure retrieving ID, xpath: !xpath', array(
                  '!xpath' => $this->idQuery,
                )));
              }
              break;
            }
            else {
              foreach (libxml_get_errors() as $error) {
                $error_string = MigrateItemsXML::parseLibXMLError($error);
                if ($migration = Migration::currentMigration()) {
                  $migration->saveMessage($error_string);
                }
                else {
                  Migration::displayMessage($error_string);
                }
              }

              // Empty the buffer so these errors are not reported again the
              // next time an element fails to expand.
              libxml_clear_errors();
            }
          }
        }
      }
      elseif ($this->reader->nodeType == XMLReader::END_ELEMENT) {

        // Remove this element and any deeper ones from the current path.
        foreach ($this->currentPath as $depth => $name) {
          if ($depth >= $this->reader->depth) {
            unset($this->currentPath[$depth]);
          }
        }
      }
    }
    migrate_instrument_stop('MigrateXMLReader::next');
  }

  /**
   * Implementation of Iterator::current().
   *
   * @return null|SimpleXMLElement
   *   Current item
   */
  #[\ReturnTypeWillChange]
  public function current() {
    return $this->currentElement;
  }

  /**
   * Implementation of Iterator::key().
   *
   * @return null|string
   *   Current key
   */
  #[\ReturnTypeWillChange]
  public function key() {
    return $this->currentId;
  }

  /**
   * Implementation of Iterator::valid().
   *
   * @return bool
   *   Indicates if current element is valid
   */
  #[\ReturnTypeWillChange]
  public function valid() {
    return $this->currentElement instanceof SimpleXMLElement;
  }

}

/**
 * Implementation of MigrateSource, to handle imports from XML files.
 */
class MigrateSourceXML extends MigrateSource {

  /**
   * The reader iterating over the currently active source URL.
   *
   * Created lazily by getNextSource(); NULL until a source has been opened and
   * again after performRewind().
   *
   * @var MigrateXMLReader|null
   */
  protected $reader;

  /**
   * The MigrateXMLReader object serving as a cursor over the XML source.
   *
   * @return MigrateXMLReader|null
   *   The current reader, or NULL if no source has been opened yet.
   */
  public function getReader() {
    return $this->reader;
  }

  /**
   * The source URLs to load XML from, indexed 0..n-1.
   *
   * @var array
   */
  protected $sourceUrls = array();

  /**
   * Holds our current position within the $sourceUrls array.
   *
   * NULL means no source has been opened yet.
   *
   * @var int|null
   */
  protected $activeUrl = NULL;

  /**
   * An array of namespaces to explicitly register before Xpath queries.
   *
   * Keys are prefixes, values are namespace URIs.
   *
   * @var array
   */
  protected $namespaces;

  /**
   * Store the query string used to recognize elements being iterated
   * so we can create reader objects on the fly.
   *
   * @var string
   */
  protected $elementQuery = '';

  /**
   * Store the query string used to retrieve the primary key value from each
   * element so we can create reader objects on the fly.
   *
   * @var string
   */
  protected $idQuery = '';

  /**
   * Store the reader class used to query XML so we can create reader objects
   * on the fly.
   *
   * @var string
   */
  protected $readerClass = '';

  /**
   * List of available source fields.
   *
   * @var array
   */
  protected $fields = array();

  /**
   * Source constructor.
   *
   * @param string|array $urls
   *   URL(s) of the XML source data. An array is re-indexed, so any keys the
   *   caller used are discarded; the order is preserved.
   * @param string $element_query
   *   Query string used to recognize elements being iterated.
   * @param string $id_query
   *   Xpath query string used to retrieve the primary key value
   *   from each element.
   * @param array $fields
   *   Optional - keys are field names, values are descriptions. Use to override
   *   the default descriptions, or to add additional source fields which the
   *   migration will add via other means (e.g., prepareRow()).
   * @param array $options
   *   Options applied to this source. In addition to the standard MigrateSource
   *   options, we support:
   *   - reader_class: The reader class to instantiate for traversing the XML -
   *     defaults to MigrateXMLReader (any substitutions must be derived from
   *     MigrateXMLReader).
   * @param array $namespaces
   *   Optional - namespaces to register on each row's XML element before it is
   *   handed to the migration. Keys are prefixes, values are namespace URIs.
   */
  public function __construct($urls, $element_query, $id_query, array $fields = array(), array $options = array(), array $namespaces = array()) {
    parent::__construct($options);

    // $activeUrl is an integer position, so the list must be indexed 0..n-1
    // even when the caller passed a keyed or sparse array.
    $this->sourceUrls = is_array($urls) ? array_values($urls) : array($urls);
    $this->activeUrl = NULL;
    $this->elementQuery = $element_query;
    $this->idQuery = $id_query;
    $this->readerClass = empty($options['reader_class']) ? 'MigrateXMLReader' : $options['reader_class'];
    $this->fields = $fields;
    $this->namespaces = $namespaces;
  }

  /**
   * Explicitly register namespaces on an XML element.
   *
   * @param SimpleXMLElement $xml
   *   A SimpleXMLElement to register the namespaces on.
   */
  protected function registerNamespaces(SimpleXMLElement &$xml) {
    foreach ($this->namespaces as $prefix => $namespace) {
      $xml->registerXPathNamespace($prefix, $namespace);
    }
  }

  /**
   * Return a string representing the source query.
   *
   * @return string
   *   The source URLs, the item xpath and the item ID xpath.
   */
  public function __toString() {

    // Clump the urls into a string
    // This could cause a problem when using
    // a lot of urls, may need to hash.
    $urls = implode(', ', $this->sourceUrls);
    return 'urls = ' . $urls . ' | item xpath = ' . $this->elementQuery . ' | item ID xpath = ' . $this->idQuery;
  }

  /**
   * Returns a list of fields available to be mapped from the source query.
   *
   * @return array
   *   keys: machine names of the fields (to be passed to addFieldMapping)
   *   values: Human-friendly descriptions of the fields.
   */
  public function fields() {
    return $this->fields;
  }

  /**
   * Returns the active Url.
   *
   * @return string|null
   *   The URL currently being read, or NULL if no source has been opened yet.
   */
  public function activeUrl() {
    if (is_null($this->activeUrl)) {
      return NULL;
    }
    return $this->sourceUrls[$this->activeUrl];
  }

  /**
   * Return a count of all available source records.
   *
   * Every source URL is opened with a fresh reader and traversed completely.
   *
   * @return int
   *   Total number of matching elements across all source URLs.
   */
  public function computeCount() {
    $count = 0;
    foreach ($this->sourceUrls as $url) {
      $reader = new $this->readerClass($url, $this->elementQuery, $this->idQuery);
      $count += iterator_count($reader);
    }
    return $count;
  }

  /**
   * Implementation of MigrateSource::performRewind().
   */
  public function performRewind() {

    // Forget the current position and reader. The next call to getNextRow()
    // finds no reader and therefore starts over with the first source URL.
    $this->activeUrl = NULL;
    $this->reader = NULL;
  }

  /**
   * Implementation of MigrationSource::getNextRow().
   *
   * @return stdClass|null
   *   Data for the next row from the XML source files: the ID is stored under
   *   the name of the first source key and the element under ->xml. NULL when
   *   all sources are exhausted.
   */
  public function getNextRow() {
    migrate_instrument_start('MigrateSourceXML::next');
    $source_key = $this->activeMap->getSourceKey();
    $key_name = key($source_key);
    $row = NULL;

    // The reader is lazy loaded, so it may not be defined yet. When it is,
    // move it past the element returned by the previous call.
    if (isset($this->reader)) {
      $this->reader->next();
    }

    // Use the current reader if it still has an element; otherwise (no reader
    // yet, or the current source is at its end) try to open the next source.
    $has_element = (isset($this->reader) && $this->reader->valid()) || $this->getNextSource();
    if ($has_element) {
      $row = new stdClass();
      $row->{$key_name} = $this->reader->key();
      $row->xml = $this->reader->current();
      $this->registerNamespaces($row->xml);
    }
    migrate_instrument_stop('MigrateSourceXML::next');
    return $row;
  }

  /**
   * Advances the reader to the next source from $sourceUrls.
   *
   * Sources that yield no element are skipped. When no further source exists
   * (including the case of an empty URL list) the position and reader are
   * left untouched.
   *
   * @return bool
   *   TRUE if a valid source was loaded
   */
  public function getNextSource() {
    migrate_instrument_start('MigrateSourceXML::nextSource');

    // Return value.
    $status = FALSE;
    $url_count = count($this->sourceUrls);
    while (TRUE) {

      // Start at the first URL, or move on to the one after the active URL.
      $next_url = is_null($this->activeUrl) ? 0 : $this->activeUrl + 1;
      if ($next_url >= $url_count) {

        // No URL left to open. This also covers an empty URL list, so we
        // never build a reader for an offset that does not exist.
        break;
      }
      $this->activeUrl = $next_url;
      $this->reader = new $this->readerClass($this->sourceUrls[$this->activeUrl], $this->elementQuery, $this->idQuery);
      $this->reader->rewind();
      if ($this->reader->valid()) {

        // We have a valid source.
        $status = TRUE;
        break;
      }
    }
    migrate_instrument_stop('MigrateSourceXML::nextSource');
    return $status;
  }

  /**
   * {@inheritdoc}
   */
  protected function hash($row) {

    // $row->xml is a SimpleXMLElement. Hand its XML string to parent::hash()
    // instead of the row object, to prevent parent::hash() failing when it
    // tries to create the hash.
    return parent::hash($row->xml->asXML());
  }

}

Classes

Name | Description
MigrateItemsXML | Implementation of MigrateItems, for providing a list of IDs and for retrieving a parsed XML document given an ID from this list.
MigrateItemXML | Implementation of MigrateItem, for retrieving a parsed XML document given an ID provided by a MigrateList class.
MigrateListXML | Implementation of MigrateList, for retrieving a list of IDs to be migrated from an XML document.
MigrateSourceXML | Implementation of MigrateSource, to handle imports from XML files.
MigrateXMLFieldMapping | Adds xpath info to field mappings for XML sources.
MigrateXMLReader | Makes an XMLReader object iterable, returning elements matching a restricted xpath-like syntax.
XMLMigration | Migrations using XML sources should extend this class instead of Migration.