modules/html/parser.zzm

=encoding utf8

=head1 NAME

html/parser - HTML5 parser entry points.

=head1 SYNOPSIS

  from html/parser import HTML, HTMLParser;
  
  let doc := HTML.parse("<!doctype html><title>Example</title>");
  let fragment := HTML.parse_fragment("<tr><td>x", context: "table");
  
  let parser := new HTMLParser();
  parser.parse("<p>Reusable</p>");
  let errors := parser.errors();

=head1 DESCRIPTION

This module is the main public entry point for the pure ZuzuScript HTML
parser. It parses full documents and context-sensitive fragments into
the DOM-like classes from C<html/dom>.

The parser implements the document and fragment tree-building behaviour
covered by the focused test suites and the claimed html5lib
tree-construction support level: document setup, in-body recovery,
active formatting reconstruction, adoption-agency recovery, forms,
buttons, void elements, plaintext, table foster parenting, select
recovery, template content, framesets, SVG/MathML namespaces, adjusted
foreign names, foreign attributes, integration points, and foreign CDATA
sections.

HTML parse errors are collected by default. Pass C<strict: true> to
throw after parsing if any parse errors were recorded. Strict mode does
not change recovery behaviour; it only turns a non-empty parse-error
list into an exception.

The C<scripting> option defaults to false. It affects C<noscript>
tokenization and html5lib scripting-mode variants. It does not execute
scripts or support script-driven DOM mutation during parsing.

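C<HTML.load> and C<HTML.dump> are part of the public interface but are
not implemented yet: calling either one throws an exception with a clear
"not implemented" message. Until they are implemented, use C<HTML.parse>
and C<toHTML> with explicit C<std/io> file handling in application code.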

=head1 EXPORTS

=head2 Parser Facade

=over

=item C<HTML>

Static parser facade.

=over

=item C<< HTML.parse(String html, ... options) -> HTMLDocument >>

Parse a full HTML document and return an C<HTMLDocument>. Options:
C<strict> and C<scripting>.

=item C<< HTML.parse_string(String html, ... options) -> HTMLDocument >>

Alias for C<HTML.parse>.

=item C<< HTML.parse_fragment(String html, ... options) -> HTMLDocumentFragment >>

Parse an HTML fragment and return an C<HTMLDocumentFragment>. C<context>
defaults to C<div>. It may be a tag-name string, the special strings
C<svg> or C<math>, or an C<HTMLElement>/C<HTMLTemplateElement>, including
elements created with C<createElementNS> for SVG or MathML contexts.
Options: C<context>, C<strict>, and C<scripting>.

=item C<< HTML.load(Path path, ... options) -> HTMLDocument >>

Not implemented. This method currently throws C<html/parser: load is not
implemented yet>.

=item C<< HTML.dump(Path path, HTMLDocument|HTMLNode value, Bool pretty?) >>

Not implemented. This method currently throws C<html/parser: dump is not
implemented yet>.

=back

=back

=head2 Parser Class

=over

=item C<HTMLParser>

Reusable parser object with instance methods C<parse>, C<parse_string>,
C<parse_fragment>, C<load>, C<dump>, C<document>, C<errors>, and
C<parseErrors>. C<document()> returns the document from the most recent
parse: the parsed document after C<parse>, or the staging document after
C<parse_fragment>. It returns C<null> if nothing has been parsed yet.
C<errors()> returns a copy of the most recent parse-error list.
C<parseErrors()> is an alias for C<errors()>.

=back

=head2 Re-exported DOM Classes

=over

=item C<HTMLDocument>, C<HTMLDocumentFragment>, C<HTMLElement>,
C<HTMLTemplateElement>, C<HTMLNode>, C<HTMLText>, C<HTMLComment>,
C<HTMLDoctype>

DOM classes from C<html/dom>. They are re-exported so code which imports
C<html/parser> can inspect or type-check parse results without a second
import.

=item C<DOMNode>, C<DOMDocument>, C<DOMElement>, C<DOMText>,
C<DOMComment>

DOM-compatible aliases from C<html/dom>.

=item C<HTML_NAMESPACE_URI>, C<SVG_NAMESPACE_URI>, C<MATHML_NAMESPACE_URI>,
C<XLINK_NAMESPACE_URI>, C<XML_NAMESPACE_URI>, C<XMLNS_NAMESPACE_URI>

Namespace URI constants from C<html/dom>.

=back

=head2 Re-exported Tokenizer Classes

=over

=item C<HTMLInputStream>, C<HTMLTokenizer>, C<HTMLToken>,
C<HTMLParseError>, C<HTMLNamedCharacterReferences>

Tokenizer-layer classes re-exported from C<html/tokenizer>. These are
available for focused tokenizer tests and tree-builder integration.

=back

=head2 Re-exported Tree-Builder Classes

=over

=item C<HTMLTreeBuilder>, C<HTMLTreeConstructionResult>,
C<HTMLTreeTestSerializer>

Tree-builder classes re-exported from C<html/treebuilder>. Most users
should prefer C<HTML> or C<HTMLParser>; these classes are available for
diagnostics, tests, and tooling that needs direct access to the
tree-construction layer.

=back

=head1 COPYRIGHT AND LICENCE

B<< html/parser >> is copyright Toby Inkster.

It is free software; you may redistribute it and/or modify it under
the terms of either the Artistic License 1.0 or the GNU General Public
License version 2.

=cut

from html/dom import
	DOMComment,
	DOMDocument,
	DOMElement,
	DOMNode,
	DOMText,
	HTMLComment,
	HTMLDocument,
	HTMLDocumentFragment,
	HTMLDoctype,
	HTML_NAMESPACE_URI,
	HTMLElement,
	MATHML_NAMESPACE_URI,
	SVG_NAMESPACE_URI,
	HTMLTemplateElement,
	HTMLNode,
	HTMLText,
	XLINK_NAMESPACE_URI,
	XML_NAMESPACE_URI,
	XMLNS_NAMESPACE_URI;
from html/tokenizer import
	HTMLInputStream,
	HTMLNamedCharacterReferences,
	HTMLParseError,
	HTMLToken,
	HTMLTokenizer;
from html/treebuilder import
	HTMLTreeBuilder,
	HTMLTreeConstructionResult,
	HTMLTreeTestSerializer;

=begin comment

_html_parser_unimplemented(name) is the shared failure path for public
entry points that exist but have no implementation yet. It never
returns normally: it always throws the message
"html/parser: <name> is not implemented yet", where <name> is the
string passed in.

=end comment

=cut

function _html_parser_unimplemented ( String name ) {
	let message := "html/parser: " _ name _ " is not implemented yet";
	die message;
}

=begin comment

HTMLParser keeps the outcome of its most recent parse so that callers
can query it afterwards:

  _last_result  result object returned by the tree builder
  _document     document taken from that result
  _errors       parse-error list taken from that result
  _scripting    scripting flag used for the most recent parse

parse and parse_fragment both normalise the "scripting" and "strict"
options to booleans, run an HTMLTreeBuilder, and store the values above
before the strict check runs. Because of that ordering, document() and
errors() still describe the failed parse after a strict-mode exception.
parse_fragment additionally reads the "context" option, which defaults
to "div", and returns the fragment from the result instead of the
document.

errors() builds a fresh array on every call, so pushing to the returned
array does not change the stored list. parseErrors() delegates to
errors(). load and dump always throw via _html_parser_unimplemented.

=end comment

=cut

class HTMLParser {
	let _last_result := null;
	let _document := null;
	let Array _errors := [];
	let Boolean _scripting := false;

	method __build__ () {
		_errors := [] if _errors ≡ null;
	}

	method parse ( String html, ... PairList options ) {
		_scripting := options.get( "scripting", false ) ? true : false;
		let is_strict := options.get( "strict", false ) ? true : false;

		let tree_builder := new HTMLTreeBuilder(
			_input: html,
			_scripting: _scripting,
		);
		let result := tree_builder.parse();

		_last_result := result;
		_document := result.document();
		_errors := result.errors();

		die "html/parser: parse errors encountered" if is_strict and _errors.length();
		return _document;
	}

	method parse_string ( String html, ... PairList options ) {
		let parsed := self.parse( html, ...options );
		return parsed;
	}

	method parse_fragment ( String html, ... PairList options ) {
		_scripting := options.get( "scripting", false ) ? true : false;
		let is_strict := options.get( "strict", false ) ? true : false;
		let fragment_context := options.get( "context", "div" );

		let tree_builder := new HTMLTreeBuilder(
			_input: html,
			_scripting: _scripting,
		);
		let result := tree_builder.parseFragment( html, fragment_context, _scripting );

		_last_result := result;
		_document := result.document();
		_errors := result.errors();

		die "html/parser: parse errors encountered" if is_strict and _errors.length();
		return result.fragment();
	}

	method load ( path, ... PairList options ) {
		_html_parser_unimplemented( "load" );
	}

	method dump ( path, value, Boolean pretty := false ) {
		_html_parser_unimplemented( "dump" );
	}

	method document () {
		return _document;
	}

	method errors () {
		let snapshot := [];
		for ( let entry in _errors ) {
			snapshot.push( entry );
		}
		return snapshot;
	}

	method parseErrors () {
		let snapshot := self.errors();
		return snapshot;
	}
}

=begin comment

HTML is a stateless static facade over HTMLParser. parse,
parse_fragment, load and dump each create a fresh HTMLParser and
forward their arguments to the instance method of the same name,
returning whatever that method returns. parse_string forwards to
parse on this class. Since load and dump on HTMLParser always throw,
the static versions throw as well.

=end comment

=cut

class HTML {
	static method parse ( String html, ... PairList options ) {
		let parser := new HTMLParser();
		return parser.parse( html, ...options );
	}

	static method parse_string ( String html, ... PairList options ) {
		let parsed := self.parse( html, ...options );
		return parsed;
	}

	static method parse_fragment ( String html, ... PairList options ) {
		let parser := new HTMLParser();
		return parser.parse_fragment( html, ...options );
	}

	static method load ( path, ... PairList options ) {
		let parser := new HTMLParser();
		return parser.load( path, ...options );
	}

	static method dump ( path, value, Boolean pretty := false ) {
		let parser := new HTMLParser();
		return parser.dump( path, value, pretty );
	}
}
modules/html/parser.zzm

Package