modules/rdf/parser/html_rdfa.zzm

rdf-rdfa-0.0.1 source code

Package

Name
rdf-rdfa
Version
0.0.1
Uploaded
2026-06-13 00:17:04
Repository
https://github.com/tobyink/zuzu-rdf-rdfa
Dependencies
Metadata
zuzu-distribution.json
Archive
Download .tar.gz
=encoding utf8

=head1 NAME

rdf/parser/html_rdfa - HTML+RDFa 1.1 parser.

=head1 SYNOPSIS

  from rdf/parser/html_rdfa import HtmlRdfaParser;

  let parser := new HtmlRdfaParser();
  let quads := parser.parse_file(
    new Path("page.html"),
    base: "http://example.com/page.html",
  );

=head1 DESCRIPTION

C<HtmlRdfaParser> implements W3C HTML+RDFa 1.1 (Second Edition). The
input is parsed with the C<html/parser> HTML5 parser, so it accepts
real-world tag soup.

Host language behaviour on top of RDFa Core 1.1: the C<base> element
sets the base IRI; C<head> and C<body> inherit the parent object as
subject when no resource attributes are present; the XHTML vocabulary
terms are recognised in C<@rel> and C<@rev>; with C<@property> present,
plain terms in C<@rel>/C<@rev> are ignored; language comes from
C<lang> or C<xml:lang>; C<@datetime> (and C<time> element content)
produce typed date/time literals; C<rdfa:copy>/C<rdfa:Pattern>
property copying is applied.

Accepts the standard C<base> and C<into> parser options plus
C<vocab_expansion> and C<vocab_loader> (see L<rdf/parser/rdfa_core>).

=head1 EXPORTS

=head2 Classes

=over

=item C<HtmlRdfaParser>

The parser class; composes the C<RdfParser> trait so C<parse_string>,
C<parse_file>, C<parse_lines>, and C<parse_chunks> are all available.

=item C<HtmlRdfaHost>

The HTML host-language configuration, extending C<RdfaHost>.

=back

=head1 COPYRIGHT AND LICENCE

B<< rdf/parser/html_rdfa >> is copyright Toby Inkster.

It is free software; you may redistribute it and/or modify it under
the terms of either the Artistic License 1.0 or the GNU General Public
License version 2.

=cut

from rdf/parser import RdfParser;
from rdf/parser/rdfa_core import
	RdfaHost,
	RdfaParserOptions,
	RdfaProcessor,
	XHV_NS,
	_rdfa_resolve_against,
	_rdfa_strip_fragment;
from rdf/ns import XSD_NS;
from std/string import starts_with, substr;

// HTML+RDFa 1.1 reserved @rel/@rev values; keys lowercase.
// Each key maps to the IRI formed by joining XHV_NS with that same term.
// HtmlRdfaHost.rel_rev_terms() returns this table.
// NOTE(review): lowercase keys presumably allow case-insensitive lookup by
// the caller; confirm against rdf/parser/rdfa_core.
const XHV_TERMS := {
	"alternate":  XHV_NS _ "alternate",
	"appendix":   XHV_NS _ "appendix",
	"bookmark":   XHV_NS _ "bookmark",
	"chapter":    XHV_NS _ "chapter",
	"cite":       XHV_NS _ "cite",
	"contents":   XHV_NS _ "contents",
	"copyright":  XHV_NS _ "copyright",
	"first":      XHV_NS _ "first",
	"glossary":   XHV_NS _ "glossary",
	"help":       XHV_NS _ "help",
	"icon":       XHV_NS _ "icon",
	"index":      XHV_NS _ "index",
	"last":       XHV_NS _ "last",
	"license":    XHV_NS _ "license",
	"meta":       XHV_NS _ "meta",
	"next":       XHV_NS _ "next",
	"p3pv1":      XHV_NS _ "p3pv1",
	"prev":       XHV_NS _ "prev",
	"previous":   XHV_NS _ "previous",
	"role":       XHV_NS _ "role",
	"section":    XHV_NS _ "section",
	"start":      XHV_NS _ "start",
	"stylesheet": XHV_NS _ "stylesheet",
	"subsection": XHV_NS _ "subsection",
	"top":        XHV_NS _ "top",
	"up":         XHV_NS _ "up",
};

// Chooses the XSD datatype IRI for a date/time-like string from its lexical
// shape alone.
//
// value: the candidate text (callers pass an @datetime value or the text
//        content of a time element).
// Returns XSD_NS joined with "duration", "dateTime", "date", "time",
// "gYearMonth" or "gYear", or null when no pattern matches.
//
// Cases are tried in order. The duration pattern requires a well-formed
// designator sequence: matching only a leading "P" would type ordinary words
// such as "Pending" as xsd:duration. Because every duration component is
// optional in that pattern, the first case rejects a bare "P" and any value
// ending in a dangling "T" before it is reached.
//
// NOTE(review): the dateTime pattern is a prefix match, and the date and
// time patterns accept no timezone suffix; that behaviour is unchanged here.
function _html_datetime_datatype ( String value ) {
	switch ( value : ~ ) {
		case /^-?P(.*T)?$/:                         return null;
		case /^-?P(\d+Y)?(\d+M)?(\d+D)?(T(\d+H)?(\d+M)?(\d+(\.\d+)?S)?)?$/: return XSD_NS _ "duration";
		case /^-?\d\d\d\d\d*-\d\d-\d\dT\d\d:\d\d/:  return XSD_NS _ "dateTime";
		case /^-?\d\d\d\d\d*-\d\d-\d\d$/:           return XSD_NS _ "date";
		case /^\d\d:\d\d(:\d\d(\.\d+)?)?$/:         return XSD_NS _ "time";
		case /^-?\d\d\d\d\d*-\d\d$/:                return XSD_NS _ "gYearMonth";
		case /^-?\d\d\d\d\d*$/:                     return XSD_NS _ "gYear";
	}
	return null;
}

// HTML host-language configuration for the RDFa processor; overrides the
// RdfaHost hooks with HTML-specific behaviour.
class HtmlRdfaHost extends RdfaHost {
	// Returns one { ns, local, name, value } dict per attribute record of
	// node. A null namespace URI is reported as the empty string.
	method attr_records ( node ) {
		let rows := [];
		for ( let item in node.attributeRecords() ) {
			let uri := item{namespaceURI};
			if ( uri == null ) {
				uri := "";
			}
			let row := {
				ns: uri,
				local: item{localName},
				name: item{qualifiedName},
				value: item{value},
			};
			rows.push(row);
		}
		return rows;
	}

	// Plain attribute lookup by name on node.
	method get_attr ( node, String name ) {
		let found := node.getAttribute(name);
		return found;
	}

	// Language of node: the lang attribute when present, otherwise whatever
	// xml:lang yields (which may itself be null).
	method get_lang ( node ) {
		let html_lang := node.getAttribute("lang");
		if ( html_lang == null ) {
			return node.getAttribute("xml:lang");
		}
		return html_lang;
	}

	// Elements never change the base in HTML (no xml:base); the current
	// base is passed straight through.
	method element_base ( node, String current ) {
		return current;
	}

	// Document base IRI: the href of the first base element that has one,
	// resolved against default_base and with its fragment stripped. Falls
	// back to default_base when no such element exists.
	method doc_base ( root, String default_base ) {
		for ( let base_el in root.getElementsByTagName("base") ) {
			let target := base_el.getAttribute("href");
			if ( not (target == null) ) {
				let resolved := _rdfa_resolve_against( default_base, target );
				return _rdfa_strip_fragment(resolved);
			}
		}
		return default_base;
	}

	// True when node's tag name is "head" or "body".
	method is_head_or_body ( node ) {
		let tag := node.tagName();
		return tag in [ "head", "body" ];
	}

	// The reserved @rel/@rev term table.
	method rel_rev_terms () {
		// Going through a local sidesteps the zuzu-rust bug with returning
		// a global directly.
		let terms := XHV_TERMS;
		return terms;
	}

	// Property copying is switched on for HTML.
	method property_copying () {
		return true;
	}

	// Always true for HTML (see the module documentation on @rel/@rev
	// alongside @property).
	method rel_rev_needs_curie_with_property () {
		return true;
	}

	// Literal override for node as a { value, datatype } dict, or null for
	// no override. A datetime attribute always wins (its datatype may be
	// null); otherwise a time element's text is used only when it has a
	// recognised date/time shape.
	method value_override ( node ) {
		let attr_value := node.getAttribute("datetime");
		if ( not (attr_value == null) ) {
			let attr_type := _html_datetime_datatype(attr_value);
			return { value: attr_value, datatype: attr_type };
		}
		if ( node.tagName() eq "time" ) {
			let content := node.textContent();
			let content_type := _html_datetime_datatype(content);
			if ( not (content_type == null) ) {
				return { value: content, datatype: content_type };
			}
		}
		return null;
	}
}

// HTML+RDFa parser class. Composes the RdfParser trait and supplies
// parse_string.
class HtmlRdfaParser with RdfParser {
	// Parses text as HTML, runs an RdfaProcessor configured with an
	// HtmlRdfaHost over the document element, and returns the outcome of
	// passing the resulting quads to the parsed options' result().
	//
	// text:    the HTML source.
	// options: parser option pairs, interpreted by RdfaParserOptions.
	method parse_string ( String text, ... PairList options ) {
		from html/parser import HTML;
		let settings := RdfaParserOptions.from_pairs(options);
		let tree := HTML.parse(text);
		let html_host := new HtmlRdfaHost();
		let expansion := settings.get_vocab_expansion();
		let loader := settings.get_vocab_loader();
		let engine := new RdfaProcessor(
			host: html_host,
			vocab_expansion: expansion,
			vocab_loader: loader,
		);
		let root := tree.documentElement();
		let found := engine.run( root, settings.get_base() );
		return settings.result(found);
	}
}