=encoding utf8
=head1 NAME
rdf/parser/html_rdfa - HTML+RDFa 1.1 parser.
=head1 SYNOPSIS
from rdf/parser/html_rdfa import HtmlRdfaParser;
let parser := new HtmlRdfaParser();
let quads := parser.parse_file(
new Path("page.html"),
base: "http://example.com/page.html",
);
=head1 DESCRIPTION
C<HtmlRdfaParser> implements W3C HTML+RDFa 1.1 (Second Edition). The
input is parsed with the C<html/parser> HTML5 parser, so it accepts
real-world tag soup.
Host language behaviour on top of RDFa Core 1.1: the C<base> element
sets the base IRI; C<head> and C<body> inherit the parent object as
subject when no resource attributes are present; the XHTML vocabulary
terms are recognised in C<@rel> and C<@rev>; with C<@property> present,
plain terms in C<@rel>/C<@rev> are ignored; language comes from
C<lang> or C<xml:lang>; C<@datetime> (and C<time> element content)
produce typed date/time literals; C<rdfa:copy>/C<rdfa:Pattern>
property copying is applied.
Accepts the standard C<base> and C<into> parser options plus
C<vocab_expansion> and C<vocab_loader> (see L<rdf/parser/rdfa_core>).
=head1 EXPORTS
=head2 Classes
=over
=item C<HtmlRdfaParser>
The parser class; composes the C<RdfParser> trait so C<parse_string>,
C<parse_file>, C<parse_lines>, and C<parse_chunks> are all available.
=item C<HtmlRdfaHost>
The HTML host-language configuration, extending C<RdfaHost>.
=back
=head1 COPYRIGHT AND LICENCE
B<< rdf/parser/html_rdfa >> is copyright Toby Inkster.
It is free software; you may redistribute it and/or modify it under
the terms of either the Artistic License 1.0 or the GNU General Public
License version 2.
=cut
from rdf/parser import RdfParser;
from rdf/parser/rdfa_core import
RdfaHost,
RdfaParserOptions,
RdfaProcessor,
XHV_NS,
_rdfa_resolve_against,
_rdfa_strip_fragment;
from rdf/ns import XSD_NS;
from std/string import starts_with, substr;
// HTML+RDFa 1.1 reserved @rel/@rev values; keys lowercase.
// Each key is a bare term and each value is XHV_NS joined with that same
// term. HtmlRdfaHost.rel_rev_terms() hands this table to the RDFa
// processor.
const XHV_TERMS := {
"alternate": XHV_NS _ "alternate",
"appendix": XHV_NS _ "appendix",
"bookmark": XHV_NS _ "bookmark",
"chapter": XHV_NS _ "chapter",
"cite": XHV_NS _ "cite",
"contents": XHV_NS _ "contents",
"copyright": XHV_NS _ "copyright",
"first": XHV_NS _ "first",
"glossary": XHV_NS _ "glossary",
"help": XHV_NS _ "help",
"icon": XHV_NS _ "icon",
"index": XHV_NS _ "index",
"last": XHV_NS _ "last",
"license": XHV_NS _ "license",
"meta": XHV_NS _ "meta",
"next": XHV_NS _ "next",
"p3pv1": XHV_NS _ "p3pv1",
"prev": XHV_NS _ "prev",
"previous": XHV_NS _ "previous",
"role": XHV_NS _ "role",
"section": XHV_NS _ "section",
"start": XHV_NS _ "start",
"stylesheet": XHV_NS _ "stylesheet",
"subsection": XHV_NS _ "subsection",
"top": XHV_NS _ "top",
"up": XHV_NS _ "up",
};
// Chooses the XSD datatype IRI whose lexical shape `value` matches; used
// for @datetime values and <time> element content.
//
// Patterns are tried in order: duration, dateTime, date, time,
// gYearMonth, gYear. Returns XSD_NS joined with the datatype's local
// name, or null when no pattern matches.
//
// The duration patterns follow the xsd:duration lexical form: an optional
// sign, "P", then designators in the fixed order Y M D, optionally
// followed by "T" and H M S (only seconds may carry a fraction). At least
// one designator is required, and "T" must be followed by a time
// designator. Matching on the leading "P" alone would type any text that
// merely starts with "P" (e.g. "Past noon") as a duration.
//
// The dateTime pattern is deliberately a prefix match (no trailing `$`),
// so seconds, fractions and a timezone after HH:MM are accepted.
function _html_datetime_datatype ( String value ) {
    switch ( value : ~ ) {
        case /^-?P(\d+Y(\d+M)?(\d+D)?|\d+M(\d+D)?|\d+D)(T(\d+H(\d+M)?(\d+(\.\d+)?S)?|\d+M(\d+(\.\d+)?S)?|\d+(\.\d+)?S))?$/: return XSD_NS _ "duration";
        case /^-?PT(\d+H(\d+M)?(\d+(\.\d+)?S)?|\d+M(\d+(\.\d+)?S)?|\d+(\.\d+)?S)$/: return XSD_NS _ "duration";
        case /^-?\d\d\d\d\d*-\d\d-\d\dT\d\d:\d\d/: return XSD_NS _ "dateTime";
        case /^-?\d\d\d\d\d*-\d\d-\d\d$/: return XSD_NS _ "date";
        case /^\d\d:\d\d(:\d\d(\.\d+)?)?$/: return XSD_NS _ "time";
        case /^-?\d\d\d\d\d*-\d\d$/: return XSD_NS _ "gYearMonth";
        case /^-?\d\d\d\d\d*$/: return XSD_NS _ "gYear";
    }
    return null;
}
// HTML host-language configuration for the RDFa processor. Each method
// overrides a hook declared by RdfaHost with the HTML+RDFa 1.1 behaviour.
class HtmlRdfaHost extends RdfaHost {

    // Returns one dict per attribute of `node`, with keys `ns`, `local`,
    // `name` and `value`. A null namespace is normalised to "".
    method attr_records ( node ) {
        let out := [];
        for ( let record in node.attributeRecords() ) {
            let ns := record{namespaceURI};
            ns := "" if ns == null;
            out.push({
                ns: ns,
                local: record{localName},
                name: record{qualifiedName},
                value: record{value},
            });
        }
        return out;
    }

    // Looks up the attribute `name` on `node`; whatever getAttribute
    // returns for a missing attribute is passed through unchanged.
    method get_attr ( node, String name ) {
        return node.getAttribute(name);
    }

    // Returns the language declared on `node` itself, or the result of
    // getAttribute("lang") when no xml:lang is present.
    //
    // HTML+RDFa 1.1 section 3.1: when @lang and @xml:lang are both
    // specified on the same element, @xml:lang takes precedence, so it is
    // consulted first.
    method get_lang ( node ) {
        let xml_lang := node.getAttribute("xml:lang");
        return xml_lang if not (xml_lang == null);
        return node.getAttribute("lang");
    }

    // HTML uses the base element, not xml:base, so an individual element
    // never changes the base: `current` is returned untouched.
    method element_base ( node, String current ) {
        return current;
    }

    // Determines the document base IRI. The first `base` element (in the
    // order getElementsByTagName yields them) that carries an href wins:
    // its href is resolved against `default_base` and any fragment is
    // stripped. Without such an element, `default_base` is returned.
    method doc_base ( root, String default_base ) {
        for ( let el in root.getElementsByTagName("base") ) {
            let href := el.getAttribute("href");
            if ( not (href == null) ) {
                return _rdfa_strip_fragment(_rdfa_resolve_against( default_base, href ));
            }
        }
        return default_base;
    }

    // True when the element's tag name is exactly "head" or "body".
    // NOTE(review): relies on tagName() reporting lowercase names for
    // HTML elements - confirm against html/parser.
    method is_head_or_body ( node ) {
        return node.tagName() in [ "head", "body" ];
    }

    // Returns the table of reserved @rel/@rev terms (XHV_TERMS).
    method rel_rev_terms () {
        // Local copy works around the zuzu-rust bare-return-global bug.
        let out := XHV_TERMS;
        return out;
    }

    // HTML+RDFa applies rdfa:copy / rdfa:Pattern property copying.
    method property_copying () {
        return true;
    }

    // With @property present, plain terms in @rel/@rev are ignored, so
    // only CURIE/IRI values count there.
    method rel_rev_needs_curie_with_property () {
        return true;
    }

    // Supplies an HTML-specific literal for `node`, or null to leave the
    // default value handling in place.
    //
    //  * An element with @datetime yields that attribute's value; the
    //    datatype is whatever _html_datetime_datatype reports and is
    //    null when the value matches no date/time shape.
    //  * Otherwise a `time` element yields its text content, but only
    //    when that text matches a date/time shape.
    //
    // NOTE(review): HTML+RDFa gives @content priority over @datetime;
    // that check is not made here - presumably the processor handles it
    // before calling this hook. Confirm in rdf/parser/rdfa_core.
    method value_override ( node ) {
        let datetime := node.getAttribute("datetime");
        if ( not (datetime == null) ) {
            return { value: datetime, datatype: _html_datetime_datatype(datetime) };
        }
        if ( node.tagName() eq "time" ) {
            let text := node.textContent();
            let dt := _html_datetime_datatype(text);
            return { value: text, datatype: dt } if not (dt == null);
        }
        return null;
    }
}
// HTML+RDFa 1.1 parser. Only parse_string is defined here; the RdfParser
// trait is composed in for the remaining parse_* entry points.
class HtmlRdfaParser with RdfParser {

    // Parses `text` as HTML and runs the RDFa processor over the
    // document element, using the base IRI from `options`. The collected
    // quads are handed to the options object, which produces the return
    // value.
    method parse_string ( String text, ... PairList options ) {
        from html/parser import HTML;

        let settings := RdfaParserOptions.from_pairs(options);
        let document := HTML.parse(text);

        // Wire the HTML host into a fresh processor.
        let html_host := new HtmlRdfaHost();
        let engine := new RdfaProcessor(
            host: html_host,
            vocab_expansion: settings.get_vocab_expansion(),
            vocab_loader: settings.get_vocab_loader(),
        );

        let root := document.documentElement();
        let found := engine.run( root, settings.get_base() );
        return settings.result(found);
    }
}
modules/rdf/parser/html_rdfa.zzm
rdf-rdfa-0.0.1 source code
Package
- Name
- rdf-rdfa
- Version
- 0.0.1
- Uploaded
- 2026-06-13 00:17:04
- Repository
- https://github.com/tobyink/zuzu-rdf-rdfa
- Dependencies
- html/parser >= 0
- rdf >= 0
- std/data/xml >= 0
- std/io >= 0
- std/string >= 0
- Metadata
- zuzu-distribution.json
- Archive
- Download .tar.gz