function feeds_oai_pmh_parse in Feeds OAI-PMH Fetcher and Parser 6
Same name and namespace in other branches
- 7 feeds_oai_pmh.inc \feeds_oai_pmh_parse()
Parse a raw response from an OAI-PMH endpoint into an array of items.
1 call to feeds_oai_pmh_parse()
- FeedsOAIParser::parse in ./
FeedsOAIParser.inc - Implementation of FeedsParser::parse().
File
- ./
feeds_oai_pmh.inc, line 200
Code
/**
 * Parse a raw response from an OAI-PMH endpoint into an array of items.
 *
 * Only Dublin Core (oai_dc) metadata is read. Records whose header carries
 * status="deleted" are skipped. The resumptionToken of the response is not
 * read here.
 *
 * @param $raw_xml
 *   The response body as an XML string.
 *
 * @return
 *   FALSE if the string cannot be loaded as XML or the response contains an
 *   <error> element. Otherwise an array with the single key 'items', holding
 *   a list of item arrays with these keys:
 *   - guid: the record identifier taken from the header.
 *   - datestamp: the header datestamp passed through strtotime().
 *   - title: the first dc:title as a string ('' when there is none).
 *   - metadata_record_url: a GetRecord URL built from the <request> element.
 *   - setspec_raw: de-duplicated list of the header's setSpec values.
 *   - creator, subject, description, publisher, contributor, date, type,
 *     format, identifier, source, language, relation, coverage, rights:
 *     de-duplicated list of values, present only when the record has at
 *     least one such element.
 *   - url: the 'identifier' values accepted by valid_url(), present only
 *     when at least one qualifies.
 */
function feeds_oai_pmh_parse($raw_xml) {
  $items = array();

  // Parse raw string into xml.
  $xml = simplexml_load_string($raw_xml);
  if (!$xml) {
    return FALSE;
  }

  // If the error element is set the repository rejected the request; stop
  // before trying to walk records that are not there.
  if (isset($xml->error)) {
    return FALSE;
  }

  // A valid response without a ListRecords/record list simply has no items.
  if (!isset($xml->ListRecords->record)) {
    return array(
      'items' => $items,
    );
  }

  // Base URL for requesting single metadata records.
  $record_request_base_url = (string) $xml->request
    . '?metadataPrefix=' . rawurlencode((string) $xml->request['metadataPrefix']);

  // Dublin Core elements collected as lists. 'title' is deliberately left
  // out: it is stored as a single string on the item instead.
  $elements = array(
    'creator',
    'subject',
    'description',
    'publisher',
    'contributor',
    'date',
    'type',
    'format',
    'identifier',
    'source',
    'language',
    'relation',
    'coverage',
    'rights',
  );

  foreach ($xml->ListRecords->record as $xml_item) {
    // TODO: Handle items marked "deleted" in repository, if so configured.
    // TODO: Handle updates to existing nodes.
    // Skip items marked "deleted".
    if ($xml_item->header["status"] == "deleted") {
      continue;
    }

    // Locate the Dublin Core children step by step: a record without a
    // <metadata> element, or with metadata in another format, must not turn
    // into a method call on NULL.
    $xml_dc_metadata = NULL;
    if (isset($xml_item->metadata)) {
      $oai_dc = $xml_item->metadata->children('http://www.openarchives.org/OAI/2.0/oai_dc/');
      if (is_object($oai_dc)) {
        $xml_dc_metadata = $oai_dc->children('http://purl.org/dc/elements/1.1/');
      }
    }
    $has_dc = is_object($xml_dc_metadata);

    $item = array(
      'guid' => (string) $xml_item->header->identifier,
      'datestamp' => strtotime((string) $xml_item->header->datestamp),
      'title' => $has_dc ? (string) $xml_dc_metadata->title : '',
    );

    // Add a direct URL to the metadata record. The identifier is
    // percent-encoded because OAI identifiers commonly contain characters
    // that are reserved inside a query string (':', '/', '&', '#', ...).
    $item['metadata_record_url'] = $record_request_base_url
      . '&verb=GetRecord&identifier=' . rawurlencode($item['guid']);

    // Parse the setSpec from each item's header.
    // Some implementations might repeat the same setSpec, so de-dupe.
    $set_spec_values = array();
    foreach ($xml_item->header->setSpec as $value) {
      $value = (string) $value;
      $set_spec_values[$value] = $value;
    }
    $item['setspec_raw'] = array_values($set_spec_values);

    // Parse elements, de-duplicating repeated values by keying on the value.
    if ($has_dc) {
      foreach ($elements as $element) {
        if (isset($xml_dc_metadata->{$element})) {
          $values = array();
          foreach ($xml_dc_metadata->{$element} as $value) {
            $value = (string) $value;
            // TODO: add callback functions to further process values (like
            // convert dates to timestamps, split subjects, etc.)
            $values[$value] = $value;
          }
          $item[$element] = array_values($values);
        }
      }
    }

    // Add "url" element from "identifier" items that are URLs. The
    // 'identifier' key only exists when the record had such elements.
    if (!empty($item['identifier'])) {
      foreach ($item['identifier'] as $value) {
        if (valid_url($value, TRUE)) {
          $item['url'][] = $value;
        }
      }
    }

    $items[] = $item;
  }

  return array(
    'items' => $items,
  );
}