modules/rdf/parser/turtle.zzm

rdf-0.0.3 source code

=encoding utf8

=head1 NAME

rdf/parser/turtle - Turtle parser.

=head1 SYNOPSIS

  from rdf/parser/turtle import TurtleParser;
  
  let parser := new TurtleParser();
  let quads := parser.parse_string("""
  @prefix ex: <http://example.com/> .
  ex:s ex:p "value" .
  """);


=head1 DESCRIPTION

C<TurtleParser> parses RDF 1.1 Turtle into default-graph quads. It
supports Turtle directives, SPARQL-style C<PREFIX> and C<BASE>, relative
IRI resolution, blank node property lists, collections, object lists,
language tags, datatypes, numerics, booleans, and single, double, and
long string forms.

Parser options are C<base>, used to resolve relative IRIs, and C<into>,
used to load parsed quads directly into a store.

=head1 EXPORTS

=head2 Classes

=over

=item C<TurtleParser>

=over

=item C<< parse_string(String text, ... options) >>

Parses C<text>. Returns an array of quads, or the supplied C<into> store
after adding the quads. Throws C<RDFSyntaxError> on invalid input.

=item C<< parse_file(path, ... options) >>

Reads UTF-8 from C<path> and parses it.

=item C<< parse_lines(Array lines, ... options) >>

Parses concatenated line chunks.

=item C<< parse_chunks(Array chunks, ... options) >>

Parses concatenated string or nested-array chunks.

=back

=back

=head1 COPYRIGHT AND LICENCE

B<< rdf/parser/turtle >> is copyright Toby Inkster.

It is free software; you may redistribute it and/or modify it under
the terms of either the Artistic License 1.0 or the GNU General Public
License version 2.

=cut

from rdf/parser import RdfParser;
from rdf/parser/common import RDFReader, _parser_options, _parser_result;
from rdf/term import RDF_NS, RDFIRI, rdf_iri, rdf_quad;
from std/string import ends_with, substr;

=begin comment

Forward declarations. The helpers below are mutually recursive
(_turtle_object calls _turtle_blank_node and _turtle_collection, which
in turn lead back to _turtle_predicate_object_list and _turtle_object),
so these names are declared here ahead of their definitions.

=end comment

=cut

function _turtle_blank_node;
function _turtle_collection;
function _turtle_object;
function _turtle_predicate_object_list;

=begin comment

_turtle_keyword_follows( reader, word ) returns true when the input at
the reader's current position starts, case-insensitively, with "word"
followed by a space, tab, line feed or carriage return, and false
otherwise. Requiring the whitespace stops a prefixed name whose prefix
merely begins with the keyword (for example "base:thing" or
"prefix:x") from being mistaken for a SPARQL-style directive.

_turtle_directive( reader ) skips leading whitespace and then tries to
consume one directive:

"@prefix" and "@base" directives must be terminated by ".".

SPARQL-style "PREFIX" and "BASE" directives (matched case-insensitively)
must not be followed by "."; if one is, the reader's _error method is
called with "Unexpected directive terminator". "BASE" is also accepted
when it is immediately followed by "<".

A prefix directive stores the namespace using
reader.get_prefixes().set(); a base directive calls reader.set_base().

Returns true if a directive was consumed and false otherwise.

=end comment

=cut

function _turtle_keyword_follows ( RDFReader reader, String word ) {
	if ( reader._starts_ci( word _ " " ) or
		reader._starts_ci( word _ "\t" ) or
		reader._starts_ci( word _ "\n" ) or
		reader._starts_ci( word _ "\r" ) ) {
		return true;
	}
	return false;
}

function _turtle_directive ( RDFReader reader ) {
	reader._skip_ws();

	let sparql_prefix := _turtle_keyword_follows( reader, "PREFIX" );
	if ( sparql_prefix or reader._starts("@prefix") ) {
		reader.advance( sparql_prefix ? 6 : 7 );
		reader._skip_ws();
		let name := reader._read_prefix_part();
		reader._expect(":");
		let iri := reader.read_iri();
		reader.get_prefixes().set( name, iri.get_value() );
		reader._skip_ws();
		if ( sparql_prefix ) {
			reader._error("Unexpected directive terminator")
				if reader._peek() eq ".";
		}
		else {
			reader._expect(".");
		}
		return true;
	}

	let sparql_base := false;
	if ( _turtle_keyword_follows( reader, "BASE" ) or
		reader._starts_ci("BASE<") ) {
		sparql_base := true;
	}
	if ( sparql_base or reader._starts("@base") ) {
		reader.advance( sparql_base ? 4 : 5 );
		reader._skip_ws();
		reader.set_base(reader.read_iri().get_value());
		reader._skip_ws();
		if ( sparql_base ) {
			reader._error("Unexpected directive terminator")
				if reader._peek() eq ".";
		}
		else {
			reader._expect(".");
		}
		return true;
	}

	return false;
}

=begin comment

_turtle_predicate( reader ) skips whitespace and reads one predicate.

The keyword "a" is returned as the IRI formed from RDF_NS and "type".
It is recognised when the "a" is followed by whitespace, or by one of
the characters < [ ( " ' # which can begin an object or a comment but
cannot continue a prefixed name (so "a<http://example.com/T>" is
accepted, while "a:b" is still read as a prefixed name).

Any other predicate is read with reader.read_term(); if the result is
not an RDFIRI the reader's _error method is called with
"Turtle predicate must be an IRI".

=end comment

=cut

function _turtle_predicate ( RDFReader reader ) {
	reader._skip_ws();
	if ( reader._peek() eq "a" and
		( reader._starts("a ") or reader._starts("a\t") or
		reader._starts("a\n") or reader._starts("a\r") or
		reader._starts("a<") or reader._starts("a[") or
		reader._starts("a(") or reader._starts("a\"") or
		reader._starts("a'") or reader._starts("a#") ) ) {
		reader.advance(1);
		return rdf_iri(RDF_NS _ "type");
	}
	let predicate := reader.read_term();
	reader._error("Turtle predicate must be an IRI")
		unless predicate instanceof RDFIRI;
	return predicate;
}

=begin comment

_turtle_object( reader, out, graph ) skips whitespace and reads one
object. An opening "[" is handed to _turtle_blank_node and an opening
"(" to _turtle_collection (both may push further quads onto "out");
anything else is read with reader.read_object(). Returns the object
term.

=end comment

=cut

function _turtle_object ( RDFReader reader, Array out, graph := null ) {
	reader._skip_ws();
	let lookahead := reader._peek();
	if ( lookahead eq "[" ) {
		return _turtle_blank_node( reader, out, graph );
	}
	if ( lookahead eq "(" ) {
		return _turtle_collection( reader, out, graph );
	}
	return reader.read_object();
}

=begin comment

_turtle_object_list( reader, subject, predicate, out, graph ) reads one
or more objects separated by "," and pushes one quad per object onto
"out", each built by rdf_quad from the given subject, predicate and
graph. Reading stops at the first non-whitespace character that is not
a comma; that character is left unconsumed.

=end comment

=cut

function _turtle_object_list ( RDFReader reader, subject, predicate,
	Array out, graph := null ) {
	while ( true ) {
		let parsed := _turtle_object( reader, out, graph );
		let quad := rdf_quad( subject, predicate, parsed, graph );
		out.push(quad);
		reader._skip_ws();
		last if reader._peek() ne ",";
		reader.advance(1);
	}
}

=begin comment

_turtle_predicate_object_list( reader, subject, out, graph ) reads
predicate / object-list pairs separated by ";" for the given subject,
pushing the resulting quads onto "out". Runs of repeated ";" are
consumed as a single separator, and a trailing ";" is tolerated: after
the separators, the loop ends if the next character is ".", "]" or "}".

=end comment

=cut

function _turtle_predicate_object_list ( RDFReader reader, subject,
	Array out, graph := null ) {
	while ( true ) {
		let verb := _turtle_predicate(reader);
		_turtle_object_list( reader, subject, verb, out, graph );
		reader._skip_ws();
		last if reader._peek() ne ";";
		while ( reader._peek() eq ";" ) {
			reader.advance(1);
			reader._skip_ws();
		}
		let upcoming := reader._peek();
		last if upcoming eq "." or upcoming eq "]" or upcoming eq "}";
	}
}

=begin comment

_turtle_blank_node( reader, out, graph ) reads a bracketed blank node.
It expects "[", obtains a new blank node from reader.fresh_blank(),
reads a predicate-object list with that node as subject unless the
brackets are empty, then expects "]". Returns the blank node.

=end comment

=cut

function _turtle_blank_node ( RDFReader reader, Array out, graph := null ) {
	reader._expect("[");
	let node := reader.fresh_blank();
	reader._skip_ws();
	_turtle_predicate_object_list( reader, node, out, graph )
		unless reader._peek() eq "]";
	reader._expect("]");
	return node;
}

=begin comment

_turtle_collection( reader, out, graph ) reads a parenthesised
collection. An empty collection "()" yields the IRI formed from RDF_NS
and "nil" and pushes no quads. Otherwise each item gets a list cell (a
blank node from reader.fresh_blank()); for every cell a "first" quad
pointing at the item and a "rest" quad pointing at the next cell, or at
"nil" for the last cell, are pushed onto "out". Returns the first cell.

=end comment

=cut

function _turtle_collection ( RDFReader reader, Array out, graph := null ) {
	reader._expect("(");
	reader._skip_ws();
	if ( reader._peek() eq ")" ) {
		reader.advance(1);
		return rdf_iri(RDF_NS _ "nil");
	}
	let head := reader.fresh_blank();
	let cell := head;
	while ( true ) {
		let member := _turtle_object( reader, out, graph );
		out.push(rdf_quad( cell, rdf_iri(RDF_NS _ "first"), member, graph ));
		reader._skip_ws();
		let closing := ( reader._peek() eq ")" );
		let tail := closing ? rdf_iri(RDF_NS _ "nil") : reader.fresh_blank();
		out.push(rdf_quad( cell, rdf_iri(RDF_NS _ "rest"), tail, graph ));
		if ( closing ) {
			reader.advance(1);
			return head;
		}
		cell := tail;
	}
}

=begin comment

_turtle_subject( reader, out, graph ) skips whitespace and reads the
subject of a statement. It returns a dict with two keys: "term", the
subject term, and "complete", which tells the caller whether the
statement may end without a following predicate-object list.

"complete" is true only for a bracketed blank node that contained at
least one property, such as "[ ex:p ex:o ]". An empty "[]" is an
ordinary anonymous blank node subject, so "complete" is false and the
caller goes on to read a predicate-object list; a bare "[] ." therefore
no longer parses silently to nothing. Collections and subjects read
with reader.read_subject() are likewise never complete.

=end comment

=cut

function _turtle_subject ( RDFReader reader, Array out, graph := null ) {
	reader._skip_ws();
	if ( reader._peek() eq "[" ) {
		reader._expect("[");
		let blank := reader.fresh_blank();
		reader._skip_ws();
		if ( reader._peek() eq "]" ) {
			reader._expect("]");
			return { term: blank, complete: false };
		}
		_turtle_predicate_object_list( reader, blank, out, graph );
		reader._expect("]");
		return { term: blank, complete: true };
	}
	if ( reader._peek() eq "(" ) {
		return { term: _turtle_collection( reader, out, graph ), complete: false };
	}
	return { term: reader.read_subject(), complete: false };
}

=begin comment

TurtleParser.parse_string( text, ... options ) creates an RDFReader
over "text" (using the "base" entry of _parser_options(options) and
with validate_decoded_iri enabled) and reads statements until the
reader's _peek method returns null. Each iteration consumes either a
directive or one subject with its predicate-object list followed by
"."; the list is skipped only when _turtle_subject reports the subject
as complete and the next character is ".". The collected quads are
passed, together with the options, to _parser_result, whose return
value is returned.

=end comment

=cut

class TurtleParser with RdfParser {
	method parse_string ( String text, ... PairList options ) {
		let opts := _parser_options(options);
		let reader := new RDFReader(
			source: text,
			base: opts{base},
			validate_decoded_iri: true,
		);
		let quads := [];
		while ( true ) {
			reader._skip_ws();
			last if reader._peek() == null;
			next if _turtle_directive(reader);
			let parsed := _turtle_subject( reader, quads );
			reader._skip_ws();
			_turtle_predicate_object_list( reader, parsed{term}, quads )
				unless parsed{complete} and reader._peek() eq ".";
			reader._expect(".");
		}
		return _parser_result( quads, options );
	}
}