View source
<?php
class EasyRdf_Parser_Turtle extends EasyRdf_Parser_Ntriples {
/** Remaining unparsed input; read() consumes from its front and unread() prepends to it. */
protected $data;
/** Map of prefix label => namespace URI, populated by parsePrefixID(). */
protected $namespaces;
/** Subject of the statement currently being parsed (array with 'type' and 'value'), or null. */
protected $subject;
/** Predicate currently in effect (array with 'type' and 'value'), or null. */
protected $predicate;
/** Most recently parsed object (array with 'type' and 'value'), or null. */
protected $object;
/** Current line number; set to 1 by parse() and incremented by read() on "\n". */
protected $line;
/** Current column number; set to 1 by parse(), advanced by read() and rewound by unread(). */
protected $column;
/**
 * Constructor with an empty body.
 *
 * Parser state ($data, $namespaces, position counters, ...) is (re)initialised
 * at the start of every parse() call rather than here.
 */
public function __construct() {
}
/**
 * Parse a Turtle document and add its triples to a graph.
 *
 * @param object $graph   Graph that receives the triples
 * @param string $data    The Turtle document
 * @param string $format  Must be 'turtle'
 * @param string $baseUri Base URI handed to checkParseParams()
 *
 * @return integer The value of $this->tripleCount after parsing
 * @throws EasyRdf_Exception when $format is not 'turtle'
 */
public function parse($graph, $data, $format, $baseUri) {
    parent::checkParseParams($graph, $data, $format, $baseUri);

    if ($format != 'turtle') {
        throw new EasyRdf_Exception("EasyRdf_Parser_Turtle does not support: {$format}");
    }

    // Reset all per-document state before reading anything.
    $this->data = $data;
    $this->namespaces = array();
    $this->subject = $this->predicate = $this->object = null;
    $this->line = $this->column = 1;
    $this->resetBnodeMap();

    // Consume statements until only whitespace/comments (or nothing) remain.
    for ($next = $this->skipWSC(); $next != -1; $next = $this->skipWSC()) {
        $this->parseStatement();
    }

    return $this->tripleCount;
}
/**
 * Parse a single statement: either a directive (@prefix, @base, PREFIX, BASE)
 * or a block of triples.
 *
 * The first whitespace-delimited token decides which one it is. '@'-style
 * directives and triples must be followed by '.'; the bare PREFIX/BASE forms
 * are not checked for a terminator.
 */
protected function parseStatement() {
    // Collect everything up to the first whitespace character or end of input.
    $token = '';
    for ($char = $this->read(); $char != -1 && !self::isWhitespace($char); $char = $this->read()) {
        $token .= $char;
    }
    $this->unread($char);

    if (!preg_match('/^(@|prefix$|base$)/i', $token)) {
        // Not a directive: push the token back and parse it as triples.
        $this->unread($token);
        $this->parseTriples();
        $this->skipWSC();
        $this->verifyCharacterOrFail($this->read(), ".");
        return;
    }

    $this->parseDirective($token);
    $this->skipWSC();
    if ($token[0] == "@") {
        $this->verifyCharacterOrFail($this->read(), ".");
    }
}
/**
 * Dispatch a directive keyword to the matching parser.
 *
 * @param string $directive Keyword as read from the input; matched case-insensitively
 *
 * @throws EasyRdf_Parser_Exception when the keyword is empty or not recognised
 */
protected function parseDirective($directive) {
    $directive = strtolower($directive);

    switch ($directive) {
        case "prefix":
        case '@prefix':
            $this->parsePrefixID();
            return;

        case "base":
        case '@base':
            $this->parseBase();
            return;
    }

    if (mb_strlen($directive, "UTF-8") == 0) {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: directive name is missing, expected @prefix or @base", $this->line, $this->column);
    }

    throw new EasyRdf_Parser_Exception("Turtle Parse Error: unknown directive \"{$directive}\"", $this->line, $this->column);
}
/**
 * Parse the remainder of a prefix declaration: `label: <namespace>`.
 *
 * The label (possibly empty) is stored as a key of $this->namespaces with the
 * parsed URI as its value.
 *
 * @throws EasyRdf_Parser_Exception on end of input while reading the label
 */
protected function parsePrefixID() {
    $this->skipWSC();

    // Read the label up to ':' (left in the input) or whitespace (consumed).
    $label = '';
    for (;;) {
        $char = $this->read();
        if ($char == ':') {
            $this->unread($char);
            break;
        }
        if (self::isWhitespace($char)) {
            break;
        }
        if ($char == -1) {
            throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading prefix id", $this->line, $this->column);
        }
        $label .= $char;
    }

    $this->skipWSC();
    $this->verifyCharacterOrFail($this->read(), ":");
    $this->skipWSC();

    $uri = $this->parseURI();
    $this->namespaces[$label] = $uri['value'];
}
/**
 * Parse the URI of a base directive and make it the new base URI.
 *
 * The URI is read with parseURI(), so it is resolved against the base that
 * was in effect before this directive.
 */
protected function parseBase() {
    $this->skipWSC();
    $parsed = $this->parseURI();
    $this->baseUri = new EasyRdf_ParsedUri($parsed['value']);
}
/**
 * Parse one block of triples: a subject followed by its predicate/object list.
 *
 * A subject starting with '[' is handled here rather than in parseSubject():
 * '[' immediately followed (after whitespace/comments) by ']' becomes a fresh
 * blank node, anything else is parsed as a blank node property list.
 * $this->subject, $this->predicate and $this->object are reset to null
 * before returning.
 */
protected function parseTriples() {
$c = $this
->peek();
if ($c == '[') {
$c = $this
->read();
$this
->skipWSC();
$c = $this
->peek();
if ($c == ']') {
// Empty brackets: anonymous blank node used as the subject.
$c = $this
->read();
$this->subject = $this
->createBNode();
$this
->skipWSC();
$this
->parsePredicateObjectList();
}
else {
// Not empty: restore the '[' so parseImplicitBlank() can verify and consume it.
$this
->unread('[');
$this->subject = $this
->parseImplicitBlank();
}
$this
->skipWSC();
$c = $this
->peek();
// A predicate/object list is only parsed here when the statement does not end right away.
if ($c != '.') {
$this
->parsePredicateObjectList();
}
}
else {
$this
->parseSubject();
$this
->skipWSC();
$this
->parsePredicateObjectList();
}
// Clear the per-statement state.
$this->subject = null;
$this->predicate = null;
$this->object = null;
}
/**
 * Parse `predicate objectList ( ';' predicate objectList )*`.
 *
 * Repeated ';' separators are tolerated, and a ';' directly before '.' or ']'
 * ends the list. Each parsed predicate is stored in $this->predicate.
 */
protected function parsePredicateObjectList() {
    $this->predicate = $this->parsePredicate();
    $this->skipWSC();
    $this->parseObjectList();

    while (true) {
        if ($this->skipWSC() != ';') {
            break;
        }
        $this->read(); // consume the ';'

        $next = $this->skipWSC();
        if ($next == '.' || $next == ']') {
            // Trailing ';' before the end of the statement or blank node.
            break;
        }
        if ($next != ';') {
            $this->predicate = $this->parsePredicate();
            $this->skipWSC();
            $this->parseObjectList();
        }
        // Another ';' follows: go round again and consume it.
    }
}
/**
 * Parse one or more objects separated by ','.
 *
 * Each object is handled (and its triple emitted) by parseObject().
 */
protected function parseObjectList() {
    do {
        $this->parseObject();
        $separator = $this->skipWSC();
        if ($separator == ',') {
            $this->read(); // consume the ','
            $this->skipWSC();
        }
    } while ($separator == ',');
}
/**
 * Parse the subject of a statement and store it in $this->subject.
 *
 * Accepts a collection '(...)', a blank node property list '[...]', or any
 * value whose type is 'uri' or 'bnode'.
 *
 * @throws EasyRdf_Parser_Exception when the parsed value has another type
 */
protected function parseSubject() {
    $next = $this->peek();

    if ($next == '(') {
        $this->subject = $this->parseCollection();
        return;
    }
    if ($next == '[') {
        $this->subject = $this->parseImplicitBlank();
        return;
    }

    $candidate = $this->parseValue();
    if ($candidate['type'] != 'uri' && $candidate['type'] != 'bnode') {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: illegal subject type: " . $candidate['type'], $this->line, $this->column);
    }
    $this->subject = $candidate;
}
/**
 * Parse a predicate.
 *
 * The keyword 'a' followed by whitespace is returned as a 'uri' whose value is
 * EasyRdf_Namespace::get('rdf') . 'type'; the whitespace character is consumed.
 * Anything else is parsed as a value, which must be of type 'uri'.
 *
 * @return array Array with 'type' => 'uri' and 'value'
 * @throws EasyRdf_Parser_Exception when the value is not a URI
 */
protected function parsePredicate() {
    $first = $this->read();

    if ($first == 'a') {
        $second = $this->read();
        if (self::isWhitespace($second)) {
            return array(
                'type' => 'uri',
                'value' => EasyRdf_Namespace::get('rdf') . 'type',
            );
        }
        // Not the 'a' keyword after all: put the look-ahead back.
        $this->unread($second);
    }
    $this->unread($first);

    $candidate = $this->parseValue();
    if ($candidate['type'] != 'uri') {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: Illegal predicate type: " . $candidate['type'], $this->line, $this->column);
    }

    return $candidate;
}
/**
 * Parse one object, store it in $this->object and emit the triple formed with
 * the current subject and predicate.
 */
protected function parseObject() {
    switch ($this->peek()) {
        case '(':
            $this->object = $this->parseCollection();
            break;

        case '[':
            $this->object = $this->parseImplicitBlank();
            break;

        default:
            $this->object = $this->parseValue();
    }

    $this->addTriple(
        $this->subject['value'],
        $this->predicate['value'],
        $this->object
    );
}
/**
 * Parse a blank node written with square brackets.
 *
 * Handles both the empty form (`[]`, also with whitespace or comments between
 * the brackets, e.g. `[ ]`) and the property-list form
 * (`[ predicate object ; ... ]`). For the latter the new blank node becomes
 * the subject while its predicate/object list is parsed; the previous subject
 * and predicate are restored afterwards.
 *
 * @return array Blank node array as produced by createBNode()
 * @throws EasyRdf_Parser_Exception when '[' or the closing ']' is missing
 */
protected function parseImplicitBlank() {
    $this->verifyCharacterOrFail($this->read(), "[");
    $bnode = $this->createBNode();

    // Skip whitespace/comments before testing for ']' so that "[ ]" is
    // recognised as an empty blank node rather than a malformed property list.
    if ($this->skipWSC() == ']') {
        $this->read();
        return $bnode;
    }

    // Remember the enclosing statement's state; the nested list overwrites it.
    $oldSubject = $this->subject;
    $oldPredicate = $this->predicate;

    $this->subject = $bnode;
    $this->parsePredicateObjectList();
    $this->skipWSC();
    $this->verifyCharacterOrFail($this->read(), "]");

    $this->subject = $oldSubject;
    $this->predicate = $oldPredicate;

    return $bnode;
}
/**
 * Parse a collection `( item1 item2 ... )`.
 *
 * An empty collection yields a 'uri' whose value is
 * EasyRdf_Namespace::get('rdf') . 'nil'. Otherwise a chain of blank nodes is
 * built: each item is emitted through parseObject() with the '...first'
 * predicate, nodes are linked with '...rest' triples, and the last node's
 * 'rest' points at '...nil'. The caller's subject and predicate are saved
 * and restored around the items.
 *
 * @return array The nil URI array, or the blank node heading the list
 */
protected function parseCollection() {
$this
->verifyCharacterOrFail($this
->read(), "(");
$c = $this
->skipWSC();
if ($c == ')') {
// Empty list
$this
->read();
return array(
'type' => 'uri',
'value' => EasyRdf_Namespace::get('rdf') . 'nil',
);
}
else {
$listRoot = $this
->createBNode();
// Remember the enclosing statement's state; parseObject() below uses
// $this->subject / $this->predicate to emit the list triples.
$oldSubject = $this->subject;
$oldPredicate = $this->predicate;
$this->subject = $listRoot;
$this->predicate = array(
'type' => 'uri',
'value' => EasyRdf_Namespace::get('rdf') . 'first',
);
// First item
$this
->parseObject();
$bNode = $listRoot;
// Remaining items: link a new node to the previous one, then parse the
// item with the new node as subject.
while ($this
->skipWSC() != ')') {
$newNode = $this
->createBNode();
$this
->addTriple($bNode['value'], EasyRdf_Namespace::get('rdf') . 'rest', $newNode);
$this->subject = $bNode = $newNode;
$this
->parseObject();
}
// Consume the ')'
$this
->read();
// Terminate the list
$this
->addTriple($bNode['value'], EasyRdf_Namespace::get('rdf') . 'rest', array(
'type' => 'uri',
'value' => EasyRdf_Namespace::get('rdf') . 'nil',
));
$this->subject = $oldSubject;
$this->predicate = $oldPredicate;
return $listRoot;
}
}
/**
 * Parse a single value, choosing the concrete parser from the next character.
 *
 * '<' starts a URI, ':' or a prefix start character a prefixed name or
 * boolean, '_' a blank node label, a quote a literal, and a digit, '.', '+'
 * or '-' a number.
 *
 * @return array Value array produced by the selected parser
 * @throws EasyRdf_Parser_Exception on end of input or an unexpected character
 */
protected function parseValue() {
    $c = $this->peek();

    if ($c == '<') {
        return $this->parseURI();
    }
    if ($c == ':' || self::isPrefixStartChar($c)) {
        return $this->parseQNameOrBoolean();
    }
    if ($c == '_') {
        return $this->parseNodeID();
    }
    if ($c == '"' || $c == "'") {
        return $this->parseQuotedLiteral();
    }
    if (ctype_digit($c) || $c == '.' || $c == '+' || $c == '-') {
        return $this->parseNumber();
    }
    if ($c == -1) {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading value", $this->line, $this->column);
    }

    throw new EasyRdf_Parser_Exception("Turtle Parse Error: expected an RDF value here, found '{$c}'", $this->line, $this->column);
}
/**
 * Parse a quoted literal with an optional language tag or datatype.
 *
 * After the quoted string, '@' introduces a language tag and '^^' a datatype
 * (which must parse to a value of type 'uri'); otherwise a plain literal is
 * returned.
 *
 * @return array Array with 'type' => 'literal', 'value', and optionally
 *               'lang' or 'datatype'
 * @throws EasyRdf_Parser_Exception on a malformed language tag or datatype
 */
protected function parseQuotedLiteral() {
$label = $this
->parseQuotedString();
$c = $this
->peek();
if ($c == '@') {
// Language tag
$this
->read();
$lang = '';
$c = $this
->read();
if ($c == -1) {
throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading language", $this->line, $this->column);
}
elseif (!self::isLanguageStartChar($c)) {
throw new EasyRdf_Parser_Exception("Turtle Parse Error: expected a letter, found '{$c}'", $this->line, $this->column);
}
$lang .= $c;
$c = $this
->read();
// Read the rest of the tag up to whitespace, a statement/list delimiter or end of input.
while (!self::isWhitespace($c)) {
if ($c == '.' || $c == ';' || $c == ',' || $c == ')' || $c == ']' || $c == -1) {
break;
}
if (self::isLanguageChar($c)) {
$lang .= $c;
}
else {
throw new EasyRdf_Parser_Exception("Turtle Parse Error: illegal language tag char: '{$c}'", $this->line, $this->column);
}
$c = $this
->read();
}
// Put back the character that ended the tag.
$this
->unread($c);
return array(
'type' => 'literal',
'value' => $label,
'lang' => $lang,
);
}
elseif ($c == '^') {
// Datatype: the first '^' was only peeked, so consume it and require a second one.
$this
->read();
$this
->verifyCharacterOrFail($this
->read(), "^");
$datatype = $this
->parseValue();
if ($datatype['type'] == 'uri') {
return array(
'type' => 'literal',
'value' => $label,
'datatype' => $datatype['value'],
);
}
else {
throw new EasyRdf_Parser_Exception("Turtle Parse Error: illegal datatype type: " . $datatype['type'], $this->line, $this->column);
}
}
else {
// Plain literal
return array(
'type' => 'literal',
'value' => $label,
);
}
}
/**
 * Parse a quoted string, either short ("..." or '...') or long
 * ("""...""" or '''...'''), and return it with escapes resolved by
 * unescapeString().
 *
 * @return string The unescaped string contents, without the quotes
 * @throws EasyRdf_Parser_Exception when the input does not start with a quote
 */
protected function parseQuotedString() {
    // Only the two quote characters may open a string (a backslash may not).
    $c1 = $this->read();
    $this->verifyCharacterOrFail($c1, "\"'");

    // Two more identical quotes mean this is a long (triple-quoted) string.
    $c2 = $this->read();
    $c3 = $this->read();

    if ($c2 == $c1 && $c3 == $c1) {
        $result = $this->parseLongString($c2);
    }
    else {
        // Put the look-ahead back in reverse order. The end-of-input marker
        // (-1) is never pushed back, as it would become the text "-1".
        if ($c3 !== -1) {
            $this->unread($c3);
        }
        if ($c2 !== -1) {
            $this->unread($c2);
        }
        $result = $this->parseString($c1);
    }

    return $this->unescapeString($result);
}
/**
 * Read the body of a short string up to (and consuming) the closing quote.
 *
 * Backslash escapes are copied through unmodified, so an escaped quote does
 * not end the string.
 *
 * @param string $closingCharacter The quote character that ends the string
 *
 * @return string The raw string contents, escapes still in place
 * @throws EasyRdf_Parser_Exception on end of input before the closing quote
 */
protected function parseString($closingCharacter) {
    $buffer = '';

    for ($char = $this->read(); $char != $closingCharacter; $char = $this->read()) {
        if ($char == -1) {
            throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading string", $this->line, $this->column);
        }
        $buffer .= $char;

        if ($char == '\\') {
            // Keep the escaped character with its backslash.
            $escaped = $this->read();
            if ($escaped == -1) {
                throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading string", $this->line, $this->column);
            }
            $buffer .= $escaped;
        }
    }

    return $buffer;
}
/**
 * Read the body of a long (triple-quoted) string up to and including the
 * three closing quote characters, which are stripped from the result.
 *
 * Backslash escapes are copied through unmodified and do not count towards
 * the closing run.
 *
 * @param string $closingCharacter The quote character used by the delimiter
 *
 * @return string The raw string contents, escapes still in place
 * @throws EasyRdf_Parser_Exception on end of input before the closing quotes
 */
protected function parseLongString($closingCharacter) {
    $buffer = '';
    $closingRun = 0; // consecutive closing characters seen so far

    do {
        $char = $this->read();
        if ($char == -1) {
            throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading long string", $this->line, $this->column);
        }

        $closingRun = ($char == $closingCharacter) ? $closingRun + 1 : 0;
        $buffer .= $char;

        if ($char == '\\') {
            $escaped = $this->read();
            if ($escaped == -1) {
                throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading long string", $this->line, $this->column);
            }
            $buffer .= $escaped;
        }
    } while ($closingRun < 3);

    // Drop the three closing quotes that were accumulated with the contents.
    return mb_substr($buffer, 0, -3, "UTF-8");
}
/**
 * Parse a numeric literal: integer, decimal or double.
 *
 * The datatype starts as xsd 'integer', becomes 'decimal' when a fractional
 * part is read and 'double' when an exponent is read. A '.' that is followed
 * by whitespace or by the end of the input is treated as the statement
 * terminator, not as a decimal point, and is left in the input.
 *
 * @return array Array with 'type' => 'literal', 'value' and 'datatype'
 * @throws EasyRdf_Parser_Exception when no digits are present or the
 *                                  exponent has no digits
 */
protected function parseNumber() {
    $value = '';
    $datatype = EasyRdf_Namespace::get('xsd') . 'integer';

    $c = $this->read();

    // Optional sign
    if ($c == '+' || $c == '-') {
        $value .= $c;
        $c = $this->read();
    }

    // Integer part
    while (ctype_digit($c)) {
        $value .= $c;
        $c = $this->read();
    }

    if ($c == '.' || $c == 'e' || $c == 'E') {
        if ($c == '.') {
            $next = $this->peek();
            if ($next === -1 || self::isWhitespace($next)) {
                // The '.' ends the statement (e.g. "42." at the end of a line
                // or of the document). Without any digits there is no number.
                if ($value === '' || $value === '+' || $value === '-') {
                    throw new EasyRdf_Parser_Exception("Turtle Parse Error: object for statement missing", $this->line, $this->column);
                }
            }
            else {
                // Fractional part
                $value .= $c;
                $c = $this->read();
                while (ctype_digit($c)) {
                    $value .= $c;
                    $c = $this->read();
                }
                if (mb_strlen($value, "UTF-8") == 1) {
                    // Nothing but the '.' itself was read
                    throw new EasyRdf_Parser_Exception("Turtle Parse Error: object for statement missing", $this->line, $this->column);
                }
                $datatype = EasyRdf_Namespace::get('xsd') . 'decimal';
            }
        }
        else {
            if (mb_strlen($value, "UTF-8") == 0) {
                // An exponent marker without any preceding digits
                throw new EasyRdf_Parser_Exception("Turtle Parse Error: object for statement missing", $this->line, $this->column);
            }
        }

        // Exponent
        if ($c == 'e' || $c == 'E') {
            $datatype = EasyRdf_Namespace::get('xsd') . 'double';
            $value .= $c;
            $c = $this->read();
            if ($c == '+' || $c == '-') {
                $value .= $c;
                $c = $this->read();
            }
            if (!ctype_digit($c)) {
                throw new EasyRdf_Parser_Exception("Turtle Parse Error: exponent value missing", $this->line, $this->column);
            }
            $value .= $c;
            $c = $this->read();
            while (ctype_digit($c)) {
                $value .= $c;
                $c = $this->read();
            }
        }
    }

    // Return the look-ahead character; the end-of-input marker is not pushed
    // back because it would be re-read as the text "-1".
    if ($c !== -1) {
        $this->unread($c);
    }

    return array(
        'type' => 'literal',
        'value' => $value,
        'datatype' => $datatype,
    );
}
/**
 * Parse a URI reference enclosed in '<' and '>'.
 *
 * Escapes are resolved with unescapeString() and the result is passed through
 * resolve().
 *
 * @return array Array with 'type' => 'uri' and the resolved 'value'
 * @throws EasyRdf_Parser_Exception when '<' is missing or the input ends
 *                                  before '>'
 */
protected function parseURI() {
    $this->verifyCharacterOrFail($this->read(), "<");

    $raw = '';
    for ($char = $this->read(); $char != '>'; $char = $this->read()) {
        if ($char == -1) {
            throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading URI", $this->line, $this->column);
        }
        $raw .= $char;

        if ($char == '\\') {
            // Copy the escaped character along with its backslash.
            $escaped = $this->read();
            if ($escaped == -1) {
                throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading URI", $this->line, $this->column);
            }
            $raw .= $escaped;
        }
    }

    $unescaped = $this->unescapeString($raw);

    return array(
        'type' => 'uri',
        'value' => $this->resolve($unescaped),
    );
}
/**
 * Parse a prefixed name (`prefix:local` or `:local`) or one of the boolean
 * keywords `true` / `false`.
 *
 * @return array For a prefixed name: 'type' => 'uri' with the namespace and
 *               local name concatenated. For a boolean: 'type' => 'literal'
 *               with the xsd 'boolean' datatype.
 * @throws EasyRdf_Parser_Exception on end of input, an illegal start
 *                                  character, a missing ':' or an undefined
 *                                  prefix
 */
protected function parseQNameOrBoolean() {
    $c = $this->read();
    if ($c == -1) {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while readying value", $this->line, $this->column);
    }
    if ($c != ':' && !self::isPrefixStartChar($c)) {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: expected a ':' or a letter, found '{$c}'", $this->line, $this->column);
    }

    $namespace = null;
    if ($c == ':') {
        // Name in the default namespace
        if (isset($this->namespaces[''])) {
            $namespace = $this->namespaces[''];
        }
        else {
            throw new EasyRdf_Parser_Exception("Turtle Parse Error: default namespace used but not defined", $this->line, $this->column);
        }
    }
    else {
        // Read the prefix label
        $prefix = $c;
        $c = $this->read();
        while (self::isPrefixChar($c)) {
            $prefix .= $c;
            $c = $this->read();
        }

        if ($c != ':') {
            // No colon: the token may be a boolean keyword
            $value = $prefix;
            if ($value == "true" || $value == "false") {
                // Give back the character that ended the keyword, otherwise
                // a directly following '.', ';', ',' ... would be lost.
                if ($c !== -1) {
                    $this->unread($c);
                }
                return array(
                    'type' => 'literal',
                    'value' => $value,
                    'datatype' => EasyRdf_Namespace::get('xsd') . 'boolean',
                );
            }
        }

        $this->verifyCharacterOrFail($c, ":");

        if (isset($this->namespaces[$prefix])) {
            $namespace = $this->namespaces[$prefix];
        }
        else {
            throw new EasyRdf_Parser_Exception("Turtle Parse Error: namespace prefix '{$prefix}' used but not defined", $this->line, $this->column);
        }
    }

    // Read the (possibly empty) local name; '\' introduces an escaped character.
    $localName = '';
    $c = $this->read();
    if (self::isNameStartChar($c)) {
        if ($c == '\\') {
            $localName .= $this->readLocalEscapedChar();
        }
        else {
            $localName .= $c;
        }

        $c = $this->read();
        while (self::isNameChar($c)) {
            if ($c == '\\') {
                $localName .= $this->readLocalEscapedChar();
            }
            else {
                $localName .= $c;
            }
            $c = $this->read();
        }
    }

    // Return the look-ahead character; the end-of-input marker is not pushed
    // back because it would be re-read as the text "-1".
    if ($c !== -1) {
        $this->unread($c);
    }

    return array(
        'type' => 'uri',
        'value' => $namespace . $localName,
    );
}
/**
 * Read the character following a backslash inside a local name.
 *
 * @return string The escaped character, without the backslash
 * @throws EasyRdf_Parser_Exception when the character is not listed in
 *                                  self::$localEscapedChars
 */
protected function readLocalEscapedChar() {
    $c = $this->read();

    if (!self::isLocalEscapedChar($c)) {
        throw new EasyRdf_Parser_Exception("found '" . $c . "', expected one of: " . implode(', ', self::$localEscapedChars), $this->line, $this->column);
    }

    return $c;
}
/**
 * Parse a labelled blank node (`_:name`).
 *
 * @return array Array with 'type' => 'bnode' and the label mapped through
 *               remapBnode() as 'value'
 * @throws EasyRdf_Parser_Exception when the '_:' introducer is missing, the
 *                                  input ends, or the label starts with an
 *                                  illegal character
 */
protected function parseNodeID() {
    $this->verifyCharacterOrFail($this->read(), "_");
    $this->verifyCharacterOrFail($this->read(), ":");

    // First character of the label
    $c = $this->read();
    if ($c == -1) {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file while reading node id", $this->line, $this->column);
    }
    if (!self::isNameStartChar($c)) {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: expected a letter, found '{$c}'", $this->line, $this->column);
    }

    // Remaining characters of the label
    $name = $c;
    for ($c = $this->read(); self::isNameChar($c); $c = $this->read()) {
        $name .= $c;
    }
    $this->unread($c);

    return array(
        'type' => 'bnode',
        'value' => $this->remapBnode($name),
    );
}
/**
 * Resolve a URI reference against the current base URI.
 *
 * @param string $uri The URI reference as written in the document
 *
 * @return string The resolved URI, or $uri unchanged when no base URI is set
 */
protected function resolve($uri) {
    if (!$this->baseUri) {
        return $uri;
    }

    $resolved = $this->baseUri->resolve($uri);
    return $resolved->toString();
}
/**
 * Check that a character that was just read is one of the expected ones.
 *
 * @param string|integer $c        The character read, or -1 for end of input
 * @param string         $expected Every character of this string is acceptable
 *
 * @throws EasyRdf_Parser_Exception on end of input or when $c matches none of
 *                                  the expected characters
 */
protected function verifyCharacterOrFail($c, $expected) {
    if ($c == -1) {
        throw new EasyRdf_Parser_Exception("Turtle Parse Error: unexpected end of file", $this->line, $this->column);
    }

    if (strpbrk($c, $expected) !== false) {
        return;
    }

    // Build "expected 'x' or 'y', found 'z'"
    $quoted = array();
    $count = strlen($expected);
    for ($i = 0; $i < $count; $i++) {
        $quoted[] = '\'' . $expected[$i] . '\'';
    }
    $msg = 'expected ' . implode(" or ", $quoted);
    $msg .= ", found '{$c}'";

    throw new EasyRdf_Parser_Exception("Turtle Parse Error: {$msg}", $this->line, $this->column);
}
/**
 * Skip whitespace and '#' comments.
 *
 * @return string|integer The first character that is neither whitespace nor
 *                        part of a comment (it is pushed back, not consumed),
 *                        or -1 at end of input
 */
protected function skipWSC() {
    for (;;) {
        $char = $this->read();

        if ($char == '#') {
            $this->processComment();
            continue;
        }
        if (!self::isWhitespace($char)) {
            break;
        }
    }

    $this->unread($char);
    return $char;
}
/**
 * Consume the rest of a comment line, including its line terminator.
 *
 * Called after the '#' has been read. The comment ends at "\n", "\r\n", a
 * bare "\r", or the end of the input. The comment text itself is discarded.
 */
protected function processComment() {
    $c = $this->read();
    while ($c != -1 && $c != "\r" && $c != "\n") {
        $c = $this->read();
    }

    if ($c == "\r") {
        // "\r\n" counts as one terminator; after a bare "\r" the following
        // character belongs to the next line and is put back. The
        // end-of-input marker is never pushed back, since it would re-enter
        // the input as the text "-1".
        $c = $this->read();
        if ($c != "\n" && $c !== -1) {
            $this->unread($c);
        }
    }
}
/**
 * Consume and return the next character of the input.
 *
 * Updates the line/column counters: "\n" starts a new line, any other
 * character advances the column.
 *
 * @return string|integer The next UTF-8 character, or -1 at end of input
 */
protected function read() {
    // empty() also reports the string "0" as empty, which would turn a
    // remaining input of exactly "0" into a premature end of input.
    if (empty($this->data) && $this->data !== '0') {
        return -1;
    }

    $c = mb_substr($this->data, 0, 1, "UTF-8");
    if ($c == "\n") {
        $this->line += 1;
        $this->column = 1;
    }
    else {
        $this->column += 1;
    }

    // Drop the consumed character. PHP before 5.4.8 needs an explicit
    // length, as passing null there is not handled as "to the end".
    if (version_compare(PHP_VERSION, '5.4.8', '<')) {
        $this->data = mb_substr($this->data, 1, mb_strlen($this->data), "UTF-8");
    }
    else {
        $this->data = mb_substr($this->data, 1, null, "UTF-8");
    }

    return $c;
}
/**
 * Return the next character of the input without consuming it.
 *
 * @return string|integer The next UTF-8 character, or -1 at end of input
 */
protected function peek() {
    // empty() also reports the string "0" as empty, which would turn a
    // remaining input of exactly "0" into a premature end of input.
    if (empty($this->data) && $this->data !== '0') {
        return -1;
    }

    return mb_substr($this->data, 0, 1, "UTF-8");
}
/**
 * Push one or more characters back onto the front of the input.
 *
 * The end-of-input marker (integer -1) is ignored: prepending it would insert
 * the two-character text "-1" into the input, which later reads back as a
 * negative number instead of end of input.
 *
 * NOTE: only the column counter is rewound; pushing back a "\n" does not
 * restore the previous line number.
 *
 * @param string|integer $c The character(s) to push back, or -1
 */
protected function unread($c) {
    if ($c === -1) {
        return;
    }

    $this->column -= mb_strlen($c, "UTF-8");
    $this->data = $c . $this->data;
}
/**
 * Create a new blank node using an identifier obtained from the graph.
 *
 * @return array Array with 'type' => 'bnode' and the new identifier as 'value'
 */
protected function createBNode() {
    $identifier = $this->graph->newBNodeId();

    return array(
        'type' => 'bnode',
        'value' => $identifier,
    );
}
/**
 * Tell whether a character is a space, tab, line feed or carriage return.
 *
 * @param string|integer $c The character to test (or -1 for end of input)
 *
 * @return boolean
 */
public static function isWhitespace($c) {
    // Non-strict membership test, i.e. the same '==' comparison per element.
    return in_array($c, array(" ", "\t", "\n", "\r"));
}
/**
 * Tell whether a character may start a prefix label.
 *
 * NOTE(review): the test uses ord(), which returns the value of the first
 * byte only, so the ranges above 0xFF listed below can never match, and
 * multi-byte UTF-8 characters are classified by their lead byte rather than
 * by their code point.
 *
 * @param string|integer $c The character to test
 *
 * @return boolean
 */
public static function isPrefixStartChar($c) {
    static $ranges = array(
        array(0x41, 0x5a),
        array(0x61, 0x7a),
        array(0xc0, 0xd6),
        array(0xd8, 0xf6),
        array(0xf8, 0x2ff),
        array(0x370, 0x37d),
        array(0x37f, 0x1fff),
        array(0x200c, 0x200d),
        array(0x2070, 0x218f),
        array(0x2c00, 0x2fef),
        array(0x3001, 0xd7ff),
        array(0xf900, 0xfdcf),
        array(0xfdf0, 0xfffd),
        array(0x10000, 0xeffff),
    );

    $code = ord($c);
    foreach ($ranges as $range) {
        if ($code >= $range[0] && $code <= $range[1]) {
            return true;
        }
    }

    return false;
}
/**
 * Tell whether a character may start a local name or blank node label.
 *
 * Accepts '\', '_', ':', '%', digits and every prefix start character.
 *
 * @param string|integer $c The character to test
 *
 * @return boolean
 */
public static function isNameStartChar($c) {
    // Non-strict membership test, i.e. the same '==' comparison per element.
    if (in_array($c, array('\\', '_', ':', '%'))) {
        return true;
    }

    return ctype_digit($c) || self::isPrefixStartChar($c);
}
/**
 * Tell whether a character may appear inside a local name or blank node label.
 *
 * Accepts every name start character plus digits, '-', and a few ranges
 * compared against ord($c). NOTE(review): ord() yields a single byte value,
 * so the ranges above 0xFF cannot match.
 *
 * @param string|integer $c The character to test
 *
 * @return boolean
 */
public static function isNameChar($c) {
    $code = ord($c);

    if (self::isNameStartChar($c)) {
        return true;
    }
    if ($code >= 0x30 && $code <= 0x39) {
        return true;
    }
    if ($c == '-' || $code == 0xb7) {
        return true;
    }
    if ($code >= 0x300 && $code <= 0x36f) {
        return true;
    }

    return $code >= 0x203f && $code <= 0x2040;
}
/**
 * Characters accepted after a backslash inside a local name; consulted by
 * isLocalEscapedChar() and listed in the error raised by readLocalEscapedChar().
 */
private static $localEscapedChars = array(
'_',
'~',
'.',
'-',
'!',
'$',
'&',
'\'',
'(',
')',
'*',
'+',
',',
';',
'=',
'/',
'?',
'#',
'@',
'%',
);
/**
 * Tell whether a character is listed in self::$localEscapedChars, i.e. may
 * follow a backslash inside a local name.
 *
 * @param string|integer $c The character to test
 *
 * @return boolean
 */
public static function isLocalEscapedChar($c) {
    // array_search() without the strict flag compares like in_array() does.
    $position = array_search($c, self::$localEscapedChars);

    return $position !== false;
}
/**
 * Tell whether a character may appear inside a prefix label.
 *
 * Accepts '_', digits, every prefix start character, '-', and a few ranges
 * compared against ord($c). The last two ranges are tested against the
 * ordinal value, like the other ranges, not against the character string
 * itself.
 *
 * NOTE(review): ord() yields the first byte only, so ranges above 0xFF still
 * cannot match; classifying by full code point would need a coordinated
 * change in isPrefixStartChar() and isNameChar() as well.
 *
 * @param string|integer $c The character to test
 *
 * @return boolean
 */
public static function isPrefixChar($c) {
    $o = ord($c);

    return $c == '_'
        || $o >= 0x30 && $o <= 0x39
        || self::isPrefixStartChar($c)
        || $c == '-'
        || $o == 0xb7
        || $o >= 0x300 && $o <= 0x36f
        || $o >= 0x203f && $o <= 0x2040;
}
/**
 * Tell whether a character may start a language tag (an ASCII letter).
 *
 * @param string|integer $c The character to test
 *
 * @return boolean
 */
public static function isLanguageStartChar($c) {
    $code = ord($c);

    $isUpper = ($code >= 0x41 && $code <= 0x5a);
    $isLower = ($code >= 0x61 && $code <= 0x7a);

    return $isUpper || $isLower;
}
/**
 * Tell whether a character may appear inside a language tag
 * (an ASCII letter, a digit or '-').
 *
 * @param string|integer $c The character to test
 *
 * @return boolean
 */
public static function isLanguageChar($c) {
    $code = ord($c);

    $isUpper = ($code >= 0x41 && $code <= 0x5a);
    $isLower = ($code >= 0x61 && $code <= 0x7a);
    $isDigit = ($code >= 0x30 && $code <= 0x39);

    return $isUpper || $isLower || $isDigit || $c == '-';
}
}