=encoding utf8
=head1 NAME
html/parser - HTML5 parser entry points.
=head1 SYNOPSIS

  from html/parser import HTML, HTMLParser;

  let doc := HTML.parse("<!doctype html><title>Example</title>");
  let fragment := HTML.parse_fragment("<tr><td>x", context: "table");

  let parser := new HTMLParser();
  parser.parse("<p>Reusable</p>");
  let errors := parser.errors();

=head1 DESCRIPTION
This module is the main public entry point for the pure ZuzuScript HTML
parser. It parses full documents and context-sensitive fragments into
the DOM-like classes from C<html/dom>.
The parser implements the document and fragment tree-building behaviour
covered by the focused test suites and the claimed html5lib
tree-construction support level: document setup, in-body recovery,
active formatting reconstruction, adoption-agency recovery, forms,
buttons, void elements, plaintext, table foster parenting, select
recovery, template content, framesets, SVG/MathML namespaces, adjusted
foreign names, foreign attributes, integration points, and foreign CDATA
sections.
HTML parse errors are collected by default. Pass C<strict: true> to
throw after parsing if any parse errors were recorded. Strict mode does
not change recovery behaviour; it only turns a non-empty parse-error
list into an exception.
The C<scripting> option defaults to false. It affects C<noscript>
tokenization and html5lib scripting-mode variants. It does not execute
scripts or support script-driven DOM mutation during parsing.
C<HTML.load> and C<HTML.dump> are public methods, but neither is
implemented yet: both always throw an error saying so. Until they are
implemented, use C<HTML.parse> and C<toHTML> with explicit C<std/io>
file handling in application code.
=head1 EXPORTS
=head2 Parser Facade
=over
=item C<HTML>
Static parser facade.
=over
=item C<< HTML.parse(String html, ... options) -> HTMLDocument >>
Parse a full HTML document and return an C<HTMLDocument>. Options:
C<strict> and C<scripting>.
=item C<< HTML.parse_string(String html, ... options) -> HTMLDocument >>
Alias for C<HTML.parse>.
=item C<< HTML.parse_fragment(String html, ... options) -> HTMLDocumentFragment >>
Parse an HTML fragment and return an C<HTMLDocumentFragment>. C<context>
defaults to C<div>. It may be a tag-name string, the special strings
C<svg> or C<math>, or an C<HTMLElement>/C<HTMLTemplateElement>, including
elements created with C<createElementNS> for SVG or MathML contexts.
Options: C<context>, C<strict>, and C<scripting>.
=item C<< HTML.load(Path path, ... options) -> HTMLDocument >>
Not implemented. This method currently throws C<html/parser: load is not
implemented yet>.
=item C<< HTML.dump(Path path, HTMLDocument|HTMLNode value, Boolean pretty?) >>
Not implemented. This method currently throws C<html/parser: dump is not
implemented yet>.
=back
=back
=head2 Parser Class
=over
=item C<HTMLParser>
Reusable parser object with instance methods C<parse>, C<parse_string>,
C<parse_fragment>, C<load>, C<dump>, C<document>, C<errors>, and
C<parseErrors>. C<document()> returns the last parsed full document, or
the staging document from the last fragment parse. C<errors()> returns a
copy of the most recent parse-error list, so modifying the returned list
does not affect the parser. C<parseErrors()> is an alias for C<errors()>.
Before the first parse, C<document()> returns null and C<errors()>
returns an empty list.
=back
=head2 Re-exported DOM Classes
=over
=item C<HTMLDocument>, C<HTMLDocumentFragment>, C<HTMLElement>,
C<HTMLTemplateElement>, C<HTMLNode>, C<HTMLText>, C<HTMLComment>,
C<HTMLDoctype>
DOM classes from C<html/dom>. They are re-exported so code which imports
C<html/parser> can inspect or type-check parse results without a second
import.
=item C<DOMNode>, C<DOMDocument>, C<DOMElement>, C<DOMText>,
C<DOMComment>
DOM-compatible aliases from C<html/dom>.
=item C<HTML_NAMESPACE_URI>, C<SVG_NAMESPACE_URI>, C<MATHML_NAMESPACE_URI>,
C<XLINK_NAMESPACE_URI>, C<XML_NAMESPACE_URI>, C<XMLNS_NAMESPACE_URI>
Namespace URI constants from C<html/dom>.
=back
=head2 Re-exported Tokenizer Classes
=over
=item C<HTMLInputStream>, C<HTMLTokenizer>, C<HTMLToken>,
C<HTMLParseError>, C<HTMLNamedCharacterReferences>
Tokenizer-layer classes re-exported from C<html/tokenizer>. These are
available for focused tokenizer tests and tree-builder integration.
=back
=head2 Re-exported Tree-Builder Classes
=over
=item C<HTMLTreeBuilder>, C<HTMLTreeConstructionResult>,
C<HTMLTreeTestSerializer>
Tree-builder classes re-exported from C<html/treebuilder>. Most users
should prefer C<HTML> or C<HTMLParser>; these classes are available for
diagnostics, tests, and tooling that needs direct access to the
tree-construction layer.
=back
=head1 COPYRIGHT AND LICENCE
B<< html/parser >> is copyright Toby Inkster.
It is free software; you may redistribute it and/or modify it under
the terms of either the Artistic License 1.0 or the GNU General Public
License version 2.
=cut
from html/dom import
DOMComment,
DOMDocument,
DOMElement,
DOMNode,
DOMText,
HTMLComment,
HTMLDocument,
HTMLDocumentFragment,
HTMLDoctype,
HTML_NAMESPACE_URI,
HTMLElement,
MATHML_NAMESPACE_URI,
SVG_NAMESPACE_URI,
HTMLTemplateElement,
HTMLNode,
HTMLText,
XLINK_NAMESPACE_URI,
XML_NAMESPACE_URI,
XMLNS_NAMESPACE_URI;
from html/tokenizer import
HTMLInputStream,
HTMLNamedCharacterReferences,
HTMLParseError,
HTMLToken,
HTMLTokenizer;
from html/treebuilder import
HTMLTreeBuilder,
HTMLTreeConstructionResult,
HTMLTreeTestSerializer;
=begin comment

_html_parser_unimplemented(name)

Shared failure path for entry points that exist in the public API but
have no implementation yet. C<name> is the short name of the entry point
(this file passes "load" and "dump"); it is placed between the
"html/parser: " prefix and the " is not implemented yet" suffix. The
function always dies and never returns a value.

=end comment

=cut

function _html_parser_unimplemented ( String name ) {
	let message := "html/parser: " _ name _ " is not implemented yet";
	die message;
}
=begin comment

HTMLParser - reusable parser object.

State kept between calls:

  _last_result  the value returned by the most recent tree-builder run
  _document     that result's document()
  _errors       that result's errors()
  _scripting    the scripting flag taken from the most recent call

parse(html, options) reads the "scripting" and "strict" options (both
default to false and are normalised to true/false), runs a new
HTMLTreeBuilder over the input, stores the result, its document and its
error list on the parser, and returns the document.

parse_fragment(html, options) does the same but also reads "context"
(default "div"), hands html, the context and the scripting flag to the
builder's parseFragment, and returns the result's fragment().

In both methods the strict check runs only after the state has been
stored: when strict is true and the stored error list has a non-zero
length, the method dies with "html/parser: parse errors encountered".

load and dump always die through _html_parser_unimplemented.

errors() builds a new array holding the same elements as the stored
list; parseErrors() forwards to errors(). document() returns the stored
document, which is null until a parse has run.

__build__ replaces a null _errors with an empty array. It is presumably
invoked when the object is constructed - confirm against the language
runtime.

=end comment

=cut

class HTMLParser {
	let _last_result := null;
	let _document := null;
	let Array _errors := [];
	let Boolean _scripting := false;

	method __build__ () {
		_errors := [] if _errors ≡ null;
	}

	method parse ( String html, ... PairList options ) {
		let scripting_opt := options.get( "scripting", false );
		_scripting := scripting_opt ? true : false;
		let strict_opt := options.get( "strict", false );
		let is_strict := strict_opt ? true : false;

		let tree_builder := new HTMLTreeBuilder(
			_input: html,
			_scripting: _scripting,
		);

		let outcome := tree_builder.parse();
		_last_result := outcome;
		_document := outcome.document();
		_errors := outcome.errors();

		die "html/parser: parse errors encountered" if is_strict and _errors.length();
		return _document;
	}

	method parse_string ( String html, ... PairList options ) {
		let parsed := self.parse( html, ...options );
		return parsed;
	}

	method parse_fragment ( String html, ... PairList options ) {
		let scripting_opt := options.get( "scripting", false );
		_scripting := scripting_opt ? true : false;
		let strict_opt := options.get( "strict", false );
		let is_strict := strict_opt ? true : false;
		let fragment_context := options.get( "context", "div" );

		let tree_builder := new HTMLTreeBuilder(
			_input: html,
			_scripting: _scripting,
		);

		let outcome := tree_builder.parseFragment( html, fragment_context, _scripting );
		_last_result := outcome;
		_document := outcome.document();
		_errors := outcome.errors();

		die "html/parser: parse errors encountered" if is_strict and _errors.length();
		return _last_result.fragment();
	}

	method load ( path, ... PairList options ) {
		_html_parser_unimplemented("load");
	}

	method dump ( path, value, Boolean pretty := false ) {
		_html_parser_unimplemented("dump");
	}

	method document () {
		return _document;
	}

	method errors () {
		let snapshot := [];
		for ( let entry in _errors ) {
			snapshot.push(entry);
		}
		return snapshot;
	}

	method parseErrors () {
		let snapshot := self.errors();
		return snapshot;
	}
}
=begin comment

HTML - static facade over HTMLParser.

parse, parse_fragment, load and dump each create a new HTMLParser and
forward their arguments to the instance method of the same name,
returning whatever that method returns. The parser object itself is not
returned, so code that needs errors() or document() afterwards should
construct an HTMLParser directly.

parse_string forwards to parse through self.

load and dump forward to HTMLParser methods that always die, so calling
them always raises the "not implemented yet" error.

=end comment

=cut

class HTML {
	static method parse ( String html, ... PairList options ) {
		let parser := new HTMLParser();
		return parser.parse( html, ...options );
	}

	static method parse_string ( String html, ... PairList options ) {
		let parsed := self.parse( html, ...options );
		return parsed;
	}

	static method parse_fragment ( String html, ... PairList options ) {
		let parser := new HTMLParser();
		return parser.parse_fragment( html, ...options );
	}

	static method load ( path, ... PairList options ) {
		let parser := new HTMLParser();
		return parser.load( path, ...options );
	}

	static method dump ( path, value, Boolean pretty := false ) {
		let parser := new HTMLParser();
		return parser.dump( path, value, pretty );
	}
}
modules/html/parser.zzm
html-0.0.2 source code
Package
- Name: html
- Version: 0.0.2
- Uploaded: 2026-06-12 23:25:02
- Repository: https://github.com/tobyink/zuzu-html
- Dependencies: std/io >= 0, std/string >= 0
- Metadata: zuzu-distribution.json
- Archive: Download .tar.gz