View source
<?php
class EasyRdf_Parser_Rdfa extends EasyRdf_Parser {
const XML_NS = 'http://www.w3.org/XML/1998/namespace';
const RDF_XML_LITERAL = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral';
const TERM_REGEXP = '/^([a-zA-Z_])([0-9a-zA-Z_\\.-]*)$/';
public $debug = false;
/**
 * Constructor.
 *
 * Intentionally empty: this class performs no set-up of its own here.
 * NOTE(review): the parent constructor (if EasyRdf_Parser declares one) is
 * not invoked from here - confirm that is intended.
 */
public function __construct() {
}
/**
 * Add one triple to the target graph and update the running triple count.
 *
 * When debugging is enabled, a one-line description of the triple is printed
 * before it is handed to the graph.
 *
 * @param mixed  $resource Subject of the triple.
 * @param string $property Predicate of the triple.
 * @param array  $value    Object description; its 'type' and 'value' keys
 *                         are read for the debug output.
 *
 * @return mixed The value returned by the graph's add() call, which is also
 *               added onto $this->tripleCount.
 */
protected function addTriple($resource, $property, $value) {
    if ($this->debug) {
        print "Adding triple: {$resource} -> {$property} -> " . $value['type'] . ':' . $value['value'] . "\n";
    }

    $added = $this->graph->add($resource, $property, $value);
    $this->tripleCount += $added;

    return $added;
}
/**
 * Emit the triples for an RDF collection hanging off $subject / $property.
 *
 * Every item gets a fresh blank node: the previous node links to it (using
 * $property for the first link and 'rdf:rest' afterwards) and the blank node
 * carries the item via 'rdf:first'. The chain is closed with rdf:nil, so an
 * empty $list yields a single "$subject $property rdf:nil" triple.
 *
 * @param mixed  $subject  Node the list is attached to.
 * @param string $property Predicate linking $subject to the head of the list.
 * @param array  $list     Object descriptions, in list order.
 *
 * @return void
 */
protected function generateList($subject, $property, $list) {
    // $tail is the node the next link hangs off; $link is the predicate for it.
    $tail = $subject;
    $link = $property;

    foreach ($list as $item) {
        $cell = $this->graph->newBNodeId();

        $this->addTriple($tail, $link, array('type' => 'bnode', 'value' => $cell));
        $this->addTriple($cell, 'rdf:first', $item);

        $tail = $cell;
        $link = 'rdf:rest';
    }

    // Terminate the chain (or the bare property, for an empty list).
    $terminator = array(
        'type' => 'uri',
        'value' => EasyRdf_Namespace::expand('rdf:nil'),
    );
    $this->addTriple($tail, $link, $terminator);
}
/**
 * Append a value to the pending list kept for $property.
 *
 * The list mapping is an object whose dynamic properties are named after
 * predicates and hold arrays of values; the array is created on first use.
 *
 * @param object $listMapping Mapping object that is modified in place.
 * @param string $property    Predicate the value belongs to.
 * @param array  $value       Object description ('type' and 'value' keys are
 *                            read for the debug output).
 *
 * @return void
 */
protected function addToList($listMapping, $property, $value) {
    if ($this->debug) {
        print "Adding to list: {$property} -> " . $value['type'] . ':' . $value['value'] . "\n";
    }

    $items = isset($listMapping->{$property}) ? $listMapping->{$property} : array();
    $items[] = $value;
    $listMapping->{$property} = $items;
}
/**
 * Debug helper: print a one-line summary of a DOM node and its attributes.
 *
 * Output is the indentation ($depth spaces), a short label for the node
 * type, the node name, and then one further-indented "name => value" line
 * per attribute.
 *
 * @param DOMNode $node  Node to describe.
 * @param int     $depth Number of spaces to indent by.
 *
 * @return void
 *
 * @throws EasyRdf_Exception If the node type has no label below. The
 *                           indentation has already been printed by then.
 */
protected function printNode($node, $depth) {
    $labels = array(
        XML_ELEMENT_NODE => 'node',
        XML_ATTRIBUTE_NODE => 'attr',
        XML_TEXT_NODE => 'text',
        XML_CDATA_SECTION_NODE => 'cdata',
        XML_ENTITY_REF_NODE => 'entref',
        XML_ENTITY_NODE => 'entity',
        XML_PI_NODE => 'pi',
        XML_COMMENT_NODE => 'comment',
        XML_DOCUMENT_NODE => 'doc',
        XML_DOCUMENT_TYPE_NODE => 'doctype',
        XML_HTML_DOCUMENT_NODE => 'html',
    );

    $indent = str_repeat(' ', $depth);
    print $indent;

    if (!isset($labels[$node->nodeType])) {
        throw new EasyRdf_Exception("unknown node type: " . $node->nodeType);
    }
    print $labels[$node->nodeType];

    print ' ' . $node->nodeName . "\n";

    if (!$node->hasAttributes()) {
        return;
    }
    foreach ($node->attributes as $attr) {
        print $indent . ' ' . $attr->nodeName . " => " . $attr->nodeValue . "\n";
    }
}
/**
 * Guess an XML Schema datatype for a date/time-like string.
 *
 * The value is tested against a fixed sequence of patterns (date, time,
 * dateTime, duration, gYear, gYearMonth) and the datatype URI of the first
 * matching pattern is returned.
 *
 * @param string $value Lexical value to classify.
 *
 * @return string|null Datatype URI, or null when no pattern matches.
 */
protected function guessTimeDatatype($value) {
    // Ordered: the first matching pattern wins.
    $candidates = array(
        '/^-?\\d{4}-\\d{2}-\\d{2}(Z|[\\-\\+]\\d{2}:\\d{2})?$/'
            => 'http://www.w3.org/2001/XMLSchema#date',
        '/^\\d{2}:\\d{2}:\\d{2}(Z|[\\-\\+]\\d{2}:\\d{2})?$/'
            => 'http://www.w3.org/2001/XMLSchema#time',
        '/^-?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(Z|[\\-\\+]\\d{2}:\\d{2})?$/'
            => 'http://www.w3.org/2001/XMLSchema#dateTime',
        '/^P(\\d+Y)?(\\d+M)?(\\d+D)?T?(\\d+H)?(\\d+M)?(\\d+S)?$/'
            => 'http://www.w3.org/2001/XMLSchema#duration',
        '/^\\d{4}$/'
            => 'http://www.w3.org/2001/XMLSchema#gYear',
        '/^\\d{4}-\\d{2}$/'
            => 'http://www.w3.org/2001/XMLSchema#gYearMonth',
    );

    foreach ($candidates as $pattern => $datatype) {
        if (preg_match($pattern, $value)) {
            return $datatype;
        }
    }

    return null;
}
/**
 * Build the evaluation context used for the root of a document.
 *
 * The context starts with the parser's base URI as subject, the XHTML
 * vocabulary registered under the empty prefix, and three predefined terms
 * (describedby, license, role). Everything else is empty or null.
 *
 * @return array The initial evaluation context.
 */
protected function initialContext() {
    return array(
        'prefixes' => array(
            '' => 'http://www.w3.org/1999/xhtml/vocab#',
        ),
        'vocab' => null,
        'subject' => $this->baseUri,
        'property' => null,
        'object' => null,
        'terms' => array(
            'describedby' => 'http://www.w3.org/2007/05/powder-s#describedby',
            'license' => 'http://www.w3.org/1999/xhtml/vocab#license',
            'role' => 'http://www.w3.org/1999/xhtml/vocab#role',
        ),
        'incompleteRels' => array(),
        'incompleteRevs' => array(),
        'listMapping' => null,
        'lang' => null,
        'path' => '',
        'xmlns' => array(),
    );
}
/**
 * Expand a CURIE ("prefix:local") into a full URI.
 *
 * The prefix is lower-cased and resolved in this order:
 *  1. "_"            - blank node; the label is passed to remapBnode().
 *  2. empty prefix   - the context's current vocabulary, when one is set.
 *  3. a prefix registered in $context['prefixes'].
 *  4. a namespace visible from $node (lookupNamespaceURI()).
 *  5. a non-empty prefix known to EasyRdf_Namespace.
 *
 * The empty-prefix checks compare against '' strictly. empty() would also
 * treat the prefix "0" as empty, so "0:foo" would be expanded against the
 * vocabulary instead of going through the normal prefix look-ups.
 *
 * @param DOMNode $node    Node used for the XML namespace look-up (step 4).
 * @param array   $context Evaluation context ('vocab' and 'prefixes' are read).
 * @param string  $value   Candidate CURIE.
 *
 * @return mixed The expanded URI (or remapped blank node), or null when
 *               $value is not a CURIE or its prefix cannot be resolved.
 */
protected function expandCurie($node, &$context, $value) {
    if (!preg_match('/^(\\w*?):(.*)$/', $value, $matches)) {
        return null;
    }

    list(, $prefix, $local) = $matches;
    $prefix = strtolower($prefix);

    if ($prefix === '_') {
        // Blank node: everything after "_:" is the label.
        return $this->remapBnode(substr($value, 2));
    }

    if ($prefix === '' and $context['vocab']) {
        return $context['vocab'] . $local;
    }

    if (isset($context['prefixes'][$prefix])) {
        return $context['prefixes'][$prefix] . $local;
    }

    $uri = $node->lookupNamespaceURI($prefix);
    if ($uri) {
        return $uri . $local;
    }

    if ($prefix !== '') {
        $uri = EasyRdf_Namespace::get($prefix);
        if ($uri) {
            return $uri . $local;
        }
    }

    return null;
}
/**
 * Turn an attribute token into a URI.
 *
 * Handling, in order:
 *  - "[curie]" (safe CURIE): the bracket content is expanded as a CURIE.
 *  - a bare term, when $isProp is set: resolved against the current
 *    vocabulary, else looked up (lower-cased) in the context's terms;
 *    null if neither applies.
 *  - a "_:" blank node where a property is expected: null.
 *  - anything else: tried as a CURIE; failing that, returned unchanged if
 *    it is an absolute URI; otherwise null for properties, or resolved
 *    against the base URI when one is set.
 *
 * @param DOMNode $node    Node passed on for namespace look-ups.
 * @param array   $context Evaluation context ('vocab' and 'terms' are read).
 * @param string  $value   Token to process.
 * @param bool    $isProp  True when the token is used in a property-like
 *                         position (terms allowed, relative URIs rejected).
 *
 * @return mixed The resulting URI, or null when none can be produced.
 */
protected function processUri($node, &$context, $value, $isProp = false) {
    // Safe CURIE: [prefix:local]
    if (preg_match('/^\\[(.*)\\]$/', $value, $matches)) {
        return $this->expandCurie($node, $context, $matches[1]);
    }

    // Bare term in a property position.
    if (preg_match(self::TERM_REGEXP, $value) and $isProp) {
        $term = strtolower($value);
        if ($context['vocab']) {
            return $context['vocab'] . $value;
        }
        if (isset($context['terms'][$term])) {
            return $context['terms'][$term];
        }
        return null;
    }

    // Blank nodes are not usable as properties.
    if (substr($value, 0, 2) === '_:' and $isProp) {
        return null;
    }

    $expanded = $this->expandCurie($node, $context, $value);
    if ($expanded) {
        return $expanded;
    }

    $parsed = new EasyRdf_ParsedUri($value);
    if ($parsed->isAbsolute()) {
        return $value;
    }
    if ($isProp) {
        return null;
    }
    if ($this->baseUri) {
        return $this->baseUri->resolve($parsed);
    }

    return null;
}
/**
 * Expand a whitespace-separated list of property tokens into URIs.
 *
 * Each token is processed as a property (terms allowed); tokens that do not
 * yield a URI are silently dropped.
 *
 * @param DOMNode $node    Node passed on for namespace look-ups.
 * @param array   $context Evaluation context (received by value).
 * @param string  $values  Raw attribute value; a falsy value gives an empty list.
 *
 * @return array List of expanded URIs, in token order.
 */
protected function processUriList($node, $context, $values) {
    $uris = array();

    if ($values) {
        $tokens = preg_split('/\\s+/', $values);
        foreach ($tokens as $token) {
            $expanded = $this->processUri($node, $context, $token, true);
            if ($expanded) {
                $uris[] = $expanded;
            }
        }
    }

    return $uris;
}
/**
 * Return the URI from the first usable attribute among the given names.
 *
 * Attributes are tried in the order given; an attribute that is missing, or
 * whose value does not produce a URI, is skipped.
 *
 * @param DOMElement   $node       Element to read attributes from.
 * @param array        $context    Evaluation context passed to processUri().
 * @param string|array $attributes One attribute name, or a list of names.
 *
 * @return mixed The first URI found, or null when there is none.
 */
protected function getUriAttribute($node, &$context, $attributes) {
    $names = is_array($attributes) ? $attributes : array($attributes);

    foreach ($names as $name) {
        if (!$node->hasAttribute($name)) {
            continue;
        }

        $uri = $this->processUri($node, $context, $node->getAttribute($name));
        if ($uri) {
            return $uri;
        }
    }

    return null;
}
/**
 * Process one DOM node (and, recursively, its element children), adding the
 * triples it describes to the graph.
 *
 * For element nodes the method:
 *  - updates the context from @vocab, xmlns:* declarations, @prefix and the
 *    language attributes;
 *  - works out the subject, object and typed resource from @about,
 *    @resource, @href, @src, @typeof, @rel, @rev and @property;
 *  - emits rdf:type, @rel/@rev and @property triples (or queues values on a
 *    list mapping when @inlist is present);
 *  - completes any incomplete @rel/@rev triples inherited from the parent.
 * Afterwards it recurses into child elements with a derived context and
 * finally turns any list mapping created at this node into RDF lists.
 *
 * NOTE(review): $context is received by reference and all children of a node
 * share one $newContext, so changes a child makes to 'vocab', 'prefixes' or
 * 'path' stay visible to its later siblings. Confirm this is intended.
 *
 * @param DOMNode $node    Node to process.
 * @param array   $context Evaluation context; modified in place.
 * @param int     $depth   Nesting depth (1 for the node passed in by parse()).
 *
 * @return void
 */
protected function processNode($node, &$context, $depth = 1) {
    if ($this->debug) {
        $this->printNode($node, $depth);
    }

    // Local evaluation state for this node.
    $skip = false;
    $subject = null;
    $typedResource = null;
    $object = null;
    $rels = array();
    $revs = array();
    $lang = $context['lang'];
    $incompleteRels = array();
    $incompleteRevs = array();

    if ($node->nodeType === XML_ELEMENT_NODE) {
        $context['path'] .= '/' . $node->nodeName;

        // @content and @datatype distinguish "absent" (null) from "empty";
        // @property and @typeof treat an empty attribute as absent.
        $content = $node->hasAttribute('content') ? $node->getAttribute('content') : null;
        $datatype = $node->hasAttribute('datatype') ? $node->getAttribute('datatype') : null;
        $property = $node->getAttribute('property') ? $node->getAttribute('property') : null;
        $typeof = $node->getAttribute('typeof') ? $node->getAttribute('typeof') : null;

        // @vocab: set (or, when empty, reset) the default vocabulary.
        if ($node->hasAttribute('vocab')) {
            $context['vocab'] = $node->getAttribute('vocab');
            if ($context['vocab']) {
                $this->addTriple($this->baseUri, 'rdfa:usesVocabulary', array(
                    'type' => 'uri',
                    'value' => $context['vocab'],
                ));
            }
        }

        // xmlns:* declarations made on this element, limited to the prefixes
        // collected for the document in $context['xmlns'].
        foreach ($context['xmlns'] as $prefix => $uri) {
            if ($node->hasAttribute('xmlns:' . $prefix)) {
                $context['prefixes'][$prefix] = $node->getAttribute('xmlns:' . $prefix);
                if ($this->debug) {
                    print "Prefix (xmlns): {$prefix} => {$uri}\n";
                }
            }
        }

        // @prefix: whitespace-separated "prefix: uri" pairs.
        if ($node->hasAttribute('prefix')) {
            // Empty tokens are dropped: with leading whitespace the first
            // token would be '', which shifts every prefix/URI pairing by one
            // and loses all mappings.
            $mappings = preg_split(
                '/\\s+/',
                $node->getAttribute('prefix'),
                -1,
                PREG_SPLIT_NO_EMPTY
            );
            while (count($mappings)) {
                $prefix = strtolower(array_shift($mappings));
                $uri = array_shift($mappings);

                if (substr($prefix, -1) === ':') {
                    $prefix = substr($prefix, 0, -1);
                } else {
                    // Not of the form "prefix:" - ignore this pair.
                    continue;
                }

                if ($prefix === '_') {
                    // "_" is reserved for blank nodes and cannot be remapped.
                    continue;
                } elseif (!empty($prefix)) {
                    $context['prefixes'][$prefix] = $uri;
                    if ($this->debug) {
                        print "Prefix: {$prefix} => {$uri}\n";
                    }
                }
            }
        }

        // Language: xml:lang takes precedence over a plain lang attribute.
        if ($node->hasAttributeNS(self::XML_NS, 'lang')) {
            $lang = $node->getAttributeNS(self::XML_NS, 'lang');
        } elseif ($node->hasAttribute('lang')) {
            $lang = $node->getAttribute('lang');
        }

        // When @property is combined with @rel/@rev, only tokens containing
        // a colon are kept in @rel/@rev; the attribute is removed entirely
        // if it contains no colon at all. This rewrites the DOM node.
        foreach (array('rel', 'rev') as $attr) {
            if ($node->hasAttribute('property') and $node->hasAttribute($attr)) {
                if (strpos($node->getAttribute($attr), ':') === false) {
                    $node->removeAttribute($attr);
                } else {
                    $curies = array();
                    foreach (preg_split('/\\s+/', $node->getAttribute($attr)) as $token) {
                        // Compare against false: a token such as ":next" has
                        // its colon at position 0, which is falsy.
                        if (strpos($token, ':') !== false) {
                            $curies[] = $token;
                        }
                    }
                    $node->setAttribute($attr, implode(' ', $curies));
                }
            }
        }

        $rels = $this->processUriList($node, $context, $node->getAttribute('rel'));
        $revs = $this->processUriList($node, $context, $node->getAttribute('rev'));

        if (!$node->hasAttribute('rel') and !$node->hasAttribute('rev')) {
            // No @rel/@rev: establish the subject (and possibly a typed resource).
            if ($property and is_null($content) and is_null($datatype)) {
                $subject = $this->getUriAttribute($node, $context, 'about');
                if ($typeof and !$subject) {
                    $typedResource = $this->getUriAttribute($node, $context, array(
                        'resource',
                        'href',
                        'src',
                    ));
                    if (!$typedResource) {
                        $typedResource = $this->graph->newBNodeId();
                    }
                    $object = $typedResource;
                }
            } else {
                $subject = $this->getUriAttribute($node, $context, array(
                    'about',
                    'resource',
                    'href',
                    'src',
                ));
            }

            // Fall-backs when no attribute supplied a subject.
            if (is_null($subject)) {
                if ($context['path'] === '/html/head') {
                    $subject = $context['object'];
                } elseif ($depth <= 2) {
                    $subject = $this->baseUri;
                } elseif ($typeof and !$property) {
                    $subject = $this->graph->newBNodeId();
                } else {
                    if (!$property) {
                        // Nothing of interest here: children inherit the
                        // parent's context unchanged.
                        $skip = true;
                    }
                    $subject = $context['object'];
                }
            }
        } else {
            // @rel/@rev present: @about gives the subject and
            // @resource/@href/@src the object.
            $subject = $this->getUriAttribute($node, $context, 'about');
            $object = $this->getUriAttribute($node, $context, array(
                'resource',
                'href',
                'src',
            ));

            if ($typeof) {
                if (!$object and !$subject) {
                    $object = $this->graph->newBNodeId();
                }
                $typedResource = $subject ? $subject : $object;
            }

            if (!$subject) {
                $subject = $context['object'];
            }
        }

        // rdf:type triples for @typeof.
        if ($typeof and $subject and !$typedResource) {
            $typedResource = $subject;
        }
        if ($typedResource) {
            foreach ($this->processUriList($node, $context, $typeof) as $type) {
                $this->addTriple($typedResource, 'rdf:type', array(
                    'type' => 'uri',
                    'value' => $type,
                ));
            }
        }

        // A new subject starts a fresh list mapping; otherwise the
        // parent's mapping is reused.
        if ($subject and $subject !== $context['subject']) {
            $listMapping = new StdClass();
        } else {
            $listMapping = $context['listMapping'];
        }

        if ($subject and $object) {
            // Complete @rel/@rev triples.
            foreach ($rels as $prop) {
                $obj = array(
                    'type' => 'uri',
                    'value' => $object,
                );
                if ($node->hasAttribute('inlist')) {
                    $this->addToList($listMapping, $prop, $obj);
                } else {
                    $this->addTriple($subject, $prop, $obj);
                }
            }
            foreach ($revs as $prop) {
                $this->addTriple($object, $prop, array(
                    'type' => 'uri',
                    'value' => $subject,
                ));
            }
        } elseif ($rels or $revs) {
            // @rel/@rev without an object: allocate a blank node and leave
            // the triples to be completed by descendants.
            $object = $this->graph->newBNodeId();
            if ($rels) {
                if ($node->hasAttribute('inlist')) {
                    foreach ($rels as $prop) {
                        if (!isset($listMapping->{$prop})) {
                            $listMapping->{$prop} = array();
                        }
                    }
                } else {
                    $incompleteRels = $rels;
                    if ($this->debug) {
                        print "Incomplete rels: " . implode(',', $rels) . "\n";
                    }
                }
            }
            if ($revs) {
                $incompleteRevs = $revs;
                if ($this->debug) {
                    print "Incomplete revs: " . implode(',', $revs) . "\n";
                }
            }
        }

        // @property: work out the value and emit it.
        if ($subject and $property) {
            $value = array();

            if ($datatype) {
                $datatype = $this->processUri($node, $context, $datatype, true);
            }

            if ($content !== null) {
                $value['value'] = $content;
            } elseif ($node->hasAttribute('datetime')) {
                $value['value'] = $node->getAttribute('datetime');
                $datetime = true;
            } elseif ($datatype === '') {
                $value['value'] = $node->textContent;
            } elseif ($datatype === self::RDF_XML_LITERAL) {
                $value['value'] = '';
                foreach ($node->childNodes as $child) {
                    $value['value'] .= $child->C14N();
                }
            } elseif (is_null($datatype) and empty($rels) and empty($revs)) {
                $value['value'] = $this->getUriAttribute($node, $context, array(
                    'resource',
                    'href',
                    'src',
                ));
                if ($value['value']) {
                    $value['type'] = 'uri';
                }
            }

            // "Missing" means unset/null or the empty string. empty() is not
            // used because it also rejects the literal "0", e.g. content="0".
            $valueMissing = (!isset($value['value']) or $value['value'] === '');
            if ($valueMissing and $typedResource and !$node->hasAttribute('about')) {
                $value['type'] = 'uri';
                $value['value'] = $typedResource;
            }

            $valueMissing = (!isset($value['value']) or $value['value'] === '');
            if ($valueMissing) {
                $value['value'] = $node->textContent;
            }

            if (empty($value['type'])) {
                $value['type'] = 'literal';
                if ($datatype) {
                    $value['datatype'] = $datatype;
                } elseif (isset($datetime) or $node->nodeName === 'time') {
                    $value['datatype'] = $this->guessTimeDatatype($value['value']);
                }
                // A language tag is only attached to untyped literals.
                if (empty($value['datatype']) and $lang) {
                    $value['lang'] = $lang;
                }
            }

            foreach ($this->processUriList($node, $context, $property) as $prop) {
                if ($node->hasAttribute('inlist')) {
                    $this->addToList($listMapping, $prop, $value);
                } elseif ($subject) {
                    $this->addTriple($subject, $prop, $value);
                }
            }
        }

        // Complete incomplete triples inherited from the parent, using this
        // node's subject as the other end.
        if (!$skip and $subject and ($context['incompleteRels'] or $context['incompleteRevs'])) {
            foreach ($context['incompleteRels'] as $prop) {
                $this->addTriple($context['subject'], $prop, array(
                    'type' => 'uri',
                    'value' => $subject,
                ));
            }
            foreach ($context['incompleteRevs'] as $prop) {
                $this->addTriple($subject, $prop, array(
                    'type' => 'uri',
                    'value' => $context['subject'],
                ));
            }
        }
    }

    // Recurse into child elements with a context derived from this node.
    if ($node->hasChildNodes()) {
        $newContext = $context;
        if (!$skip) {
            if ($object) {
                $newContext['object'] = $object;
            } elseif ($subject) {
                $newContext['object'] = $subject;
            } else {
                $newContext['object'] = $context['subject'];
            }
            if ($subject) {
                $newContext['subject'] = $subject;
            }
            $newContext['incompleteRels'] = $incompleteRels;
            $newContext['incompleteRevs'] = $incompleteRevs;
            // $listMapping is only defined when an element was processed above.
            if (isset($listMapping)) {
                $newContext['listMapping'] = $listMapping;
            }
        }
        $newContext['lang'] = $lang;

        foreach ($node->childNodes as $child) {
            if ($child->nodeType === XML_ELEMENT_NODE) {
                $this->processNode($child, $newContext, $depth + 1);
            }
        }
    }

    // Emit the lists for a mapping that was created at this node (a mapping
    // inherited from the parent is left for the parent to emit).
    if (!empty($listMapping)) {
        foreach ($listMapping as $prop => $list) {
            if ($context['listMapping'] !== $listMapping) {
                if ($this->debug) {
                    print "Need to create triples for {$prop} => " . count($list) . " items\n";
                }
                $this->generateList($subject, $prop, $list);
            }
        }
    }
}
/**
 * Parse an RDFa document into a graph.
 *
 * The data is first loaded as XML (without network access); if that fails
 * it is loaded as HTML. A <base href> element, namespaced XHTML or plain
 * HTML, overrides the base URI. Any fragment is then removed from the base
 * URI and the document is walked with processNode().
 *
 * libxml error buffering is switched on for the duration of the parse. The
 * previous setting is restored afterwards, and errors buffered by this call
 * are discarded, so that repeated parses neither change global libxml
 * behaviour nor accumulate error records.
 *
 * @param object $graph   Graph to load the data into.
 * @param string $data    The document to parse.
 * @param string $format  Format of the data; must be 'rdfa'.
 * @param string $baseUri Base URI of the document.
 *
 * @return int The value of $this->tripleCount after parsing.
 *
 * @throws EasyRdf_Exception If $format is not 'rdfa'.
 */
public function parse($graph, $data, $format, $baseUri) {
    parent::checkParseParams($graph, $data, $format, $baseUri);

    // Loose comparison is deliberate: $format is compared as a string
    // whatever its concrete type.
    if ($format != 'rdfa') {
        throw new EasyRdf_Exception("EasyRdf_Parser_Rdfa does not support: {$format}");
    }

    $context = $this->initialContext();

    $previousLibxmlSetting = libxml_use_internal_errors(true);
    try {
        $doc = new DOMDocument();

        // Try XML first (no network access); fall back to the HTML parser.
        if ($doc->loadXML($data, LIBXML_NONET)) {
            if ($this->debug) {
                print "Document was parsed as XML.";
            }
            // Collect every namespace declared anywhere in the document so
            // processNode() knows which xmlns:* attributes to look for.
            $sxe = simplexml_import_dom($doc);
            $context['xmlns'] = $sxe->getDocNamespaces(true);
            unset($context['xmlns']['']);
        } else {
            $doc->loadHTML($data);
            if ($this->debug) {
                print "Document was parsed as HTML.";
            }
        }

        // A <base href> overrides the base URI: XHTML namespace first,
        // then un-namespaced HTML.
        $xpath = new DOMXPath($doc);
        $xpath->registerNamespace('xh', "http://www.w3.org/1999/xhtml");

        $nodeList = $xpath->query('/xh:html/xh:head/xh:base');
        if ($node = $nodeList->item(0) and $href = $node->getAttribute('href')) {
            $this->baseUri = new EasyRdf_ParsedUri($href);
        }

        $nodeList = $xpath->query('/html/head/base');
        if ($node = $nodeList->item(0) and $href = $node->getAttribute('href')) {
            $this->baseUri = new EasyRdf_ParsedUri($href);
        }

        // Drop any fragment from the base URI.
        $this->baseUri->setFragment(null);

        $this->processNode($doc, $context);
    } catch (Exception $e) {
        // catch-and-rethrow instead of finally, so that the file keeps
        // working on PHP versions that predate finally.
        $this->restoreLibxmlErrors($previousLibxmlSetting);
        throw $e;
    }
    $this->restoreLibxmlErrors($previousLibxmlSetting);

    return $this->tripleCount;
}

/**
 * Put libxml error handling back to how it was before parse() changed it.
 *
 * If buffering was off before, the errors buffered during the parse are
 * discarded. If the caller already had buffering on, the buffer is left
 * alone so the caller's own error records are not lost.
 *
 * @param bool $previousSetting Value returned by libxml_use_internal_errors(true).
 *
 * @return void
 */
private function restoreLibxmlErrors($previousSetting) {
    if (!$previousSetting) {
        libxml_clear_errors();
    }
    libxml_use_internal_errors($previousSetting);
}
}