class DOMTreeBuilder in Zircon Profile 8
Same name and namespace in other branches
- 8.0 vendor/masterminds/html5/src/HTML5/Parser/DOMTreeBuilder.php \Masterminds\HTML5\Parser\DOMTreeBuilder
Create an HTML5 DOM tree from events.
This attempts to create a DOM from events emitted by a parser. This attempts (but does not guarantee) to up-convert older HTML documents to HTML5. It does this by applying HTML5's rules, but it will not change the architecture of the document itself.
Many of the error correction and quirks features suggested in the specification are implemented herein; however, not all of them are. Since we do not assume a graphical user agent, no presentation-specific logic is conducted during tree building.
FIXME: The present tree builder does not exactly follow the state machine rules for insert modes as outlined in the HTML5 spec. The processor needs to be rewritten to accommodate this. See, for example, the Go language HTML5 parser.
Hierarchy
- class \Masterminds\HTML5\Parser\DOMTreeBuilder implements EventHandler
Expanded class hierarchy of DOMTreeBuilder
3 files declare their use of DOMTreeBuilder
- DOMTreeBuilderTest.php in vendor/masterminds/html5/test/HTML5/Parser/DOMTreeBuilderTest.php - Test the Tree Builder.
- HTML5.php in vendor/masterminds/html5/src/HTML5.php
- TreeBuildingRulesTest.php in vendor/masterminds/html5/test/HTML5/Parser/TreeBuildingRulesTest.php - Test the Tree Builder's special-case rules.
File
- vendor/
masterminds/ html5/ src/ HTML5/ Parser/ DOMTreeBuilder.php, line 24
Namespace
Masterminds\HTML5\Parser
View source
class DOMTreeBuilder implements EventHandler {
/**
 * Namespace URIs used when creating elements and attributes.
 *
 * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0
 */
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml';
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML';
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg';
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink';
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace';
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';
// Option key: when truthy, unprefixed elements are created with
// createElement() instead of createElementNS() (see startTag()).
const OPT_DISABLE_HTML_NS = 'disable_html_ns';
// Option key: an existing document to build into; when absent the
// constructor creates a new document with an "html" doctype.
const OPT_TARGET_DOC = 'target_document';
// Option key: extra prefix => namespace URI pairs that the constructor merges
// into the initial namespace stack entry.
const OPT_IMPLICIT_NS = 'implicit_namespaces';
/**
 * Maps the element names that cause a switch of the default namespace to the
 * namespace URI they switch to.
 *
 * @var array
 */
protected $nsRoots = array(
'html' => self::NAMESPACE_HTML,
'svg' => self::NAMESPACE_SVG,
'math' => self::NAMESPACE_MATHML,
);
/**
 * Holds the namespaces that are always available (they do not require an
 * XMLNS declaration), keyed by prefix.
 *
 * @var array
 */
protected $implicitNamespaces = array(
'xml' => self::NAMESPACE_XML,
'xmlns' => self::NAMESPACE_XMLNS,
'xlink' => self::NAMESPACE_XLINK,
);
/**
 * Holds a stack of currently active namespaces.
 *
 * Each entry is a prefix => namespace URI array ('' is the default
 * namespace). New scopes are added with array_unshift(), so index 0 is always
 * the scope currently in effect.
 *
 * @var array
 */
protected $nsStack = array();
/**
 * Tracks the namespace scopes pushed by individual elements.
 *
 * Keyed by spl_object_hash() of the element; each value is
 * array(number of pushes, the element itself). endTag() uses the count to pop
 * the same number of entries from $nsStack.
 *
 * @var array
 */
protected $pushes = array();
/**
 * Insert modes. Defined in 8.2.5.
 *
 * The values are compared numerically elsewhere in this class (for example
 * "$this->insertMode > static::IM_INITIAL"), so their relative order is
 * significant and must not be changed casually.
 */
const IM_INITIAL = 0;
const IM_BEFORE_HTML = 1;
const IM_BEFORE_HEAD = 2;
const IM_IN_HEAD = 3;
const IM_IN_HEAD_NOSCRIPT = 4;
const IM_AFTER_HEAD = 5;
const IM_IN_BODY = 6;
const IM_TEXT = 7;
const IM_IN_TABLE = 8;
const IM_IN_TABLE_TEXT = 9;
const IM_IN_CAPTION = 10;
const IM_IN_COLUMN_GROUP = 11;
const IM_IN_TABLE_BODY = 12;
const IM_IN_ROW = 13;
const IM_IN_CELL = 14;
const IM_IN_SELECT = 15;
const IM_IN_SELECT_IN_TABLE = 16;
const IM_AFTER_BODY = 17;
const IM_IN_FRAMESET = 18;
const IM_AFTER_FRAMESET = 19;
const IM_AFTER_AFTER_BODY = 20;
const IM_AFTER_AFTER_FRAMESET = 21;
// Set by startTag() when an "svg" element is opened.
const IM_IN_SVG = 22;
// Set by startTag() when a "math" element is opened.
const IM_IN_MATHML = 23;
/**
 * Options passed to the constructor.
 *
 * @var array
 */
protected $options = array();
/**
 * Not referenced by the code of this class; kept for subclasses.
 *
 * @var array
 */
protected $stack = array();
/**
 * Pointer in the tag hierarchy: the node new children are appended to.
 */
protected $current;
/**
 * The document being built.
 */
protected $doc;
/**
 * The document fragment, set only when building a fragment.
 */
protected $frag;
/**
 * Optional instruction processor, see setInstructionProcessor().
 */
protected $processor;
/**
 * Rules engine for tags with special processing rules.
 *
 * Assigned in the constructor. Declared explicitly because creating it as a
 * dynamic property is deprecated as of PHP 8.2.
 */
protected $rules;
/**
 * The current insert mode (one of the IM_* constants).
 */
protected $insertMode = 0;
/**
 * Track if we are in an element that allows only inline child nodes
 * @var string|null
 */
protected $onlyInline;
/**
 * Quirks mode is enabled by default.
 * Any document that is missing the
 * DT will be considered to be in quirks mode.
 */
protected $quirks = true;
/**
 * Parse error messages collected by parseError().
 *
 * @var array
 */
protected $errors = array();
/**
 * Set up the target document, the tag rules engine and the namespace stack.
 *
 * @param bool $isFragment
 *   When true, a new DOMDocumentFragment becomes the insertion point and the
 *   builder starts in the IM_IN_BODY insert mode.
 * @param array $options
 *   Builder options. Keys read here: self::OPT_TARGET_DOC,
 *   self::OPT_IMPLICIT_NS and, as a fallback, the literal
 *   "implicitNamespaces" key.
 */
public function __construct($isFragment = false, array $options = array()) {
  $this->options = $options;
  $this->errors = array();

  if (!isset($options[self::OPT_TARGET_DOC])) {
    // XXX:
    // Create the doctype. For now, we are always creating HTML5
    // documents, and attempting to up-convert any older DTDs to HTML5.
    $factory = new \DOMImplementation();
    $doctype = $factory->createDocumentType('html');
    $this->doc = $factory->createDocument(null, null, $doctype);
  }
  else {
    $this->doc = $options[self::OPT_TARGET_DOC];
  }

  // Until an element is opened, new nodes are appended to the document.
  $this->current = $this->doc;

  // Create a rules engine for tags.
  $this->rules = new TreeBuildingRules($this->doc);

  // Caller-supplied namespaces: the OPT_IMPLICIT_NS key wins over the
  // "implicitNamespaces" key; without either, none are added.
  if (isset($this->options[self::OPT_IMPLICIT_NS])) {
    $callerNamespaces = $this->options[self::OPT_IMPLICIT_NS];
  }
  elseif (isset($this->options["implicitNamespaces"])) {
    $callerNamespaces = $this->options["implicitNamespaces"];
  }
  else {
    $callerNamespaces = array();
  }

  // Seed $nsStack with the default HTML5 namespaces plus the caller's ones.
  // Array union keeps the first occurrence of a key, so caller-supplied
  // entries take precedence over the built-in defaults.
  $builtIn = array(
    '' => self::NAMESPACE_HTML,
  ) + $this->implicitNamespaces;
  array_unshift($this->nsStack, $callerNamespaces + $builtIn);

  if (!$isFragment) {
    return;
  }

  // Fragment parsing: collect nodes in a fragment and start "in body".
  $this->insertMode = static::IM_IN_BODY;
  $this->frag = $this->doc->createDocumentFragment();
  $this->current = $this->frag;
}
/**
 * Get the document.
 *
 * @return \DOMDocument
 *   The document this builder writes to: either the one supplied through the
 *   OPT_TARGET_DOC option or the one created by the constructor.
 */
public function document() {
return $this->doc;
}
/**
 * Get the DOM fragment for the body.
 *
 * A fragment is returned (rather than a single node) because it may have
 * zero or more DOMNodes at its root.
 *
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
 *
 * @return \DOMDocumentFragment|null
 *   The fragment created by the constructor when $isFragment was true. This
 *   class assigns the property nowhere else, so it is null otherwise.
 */
public function fragment() {
return $this->frag;
}
/**
 * Provide an instruction processor.
 *
 * This is used for handling Processing Instructions as they are
 * inserted. If omitted, PIs are inserted directly into the DOM tree.
 *
 * @param \Masterminds\HTML5\InstructionProcessor $proc
 *   The processor that processingInstruction() delegates to.
 */
public function setInstructionProcessor(\Masterminds\HTML5\InstructionProcessor $proc) {
$this->processor = $proc;
}
/**
 * Handle a DOCTYPE token.
 *
 * This is used solely for setting quirks mode. Currently we don't try to
 * preserve the inbound DT. We convert it to HTML5.
 *
 * @param string $name
 *   The doctype name; only used in the parse error message.
 * @param int $idType
 *   Unused here.
 * @param string|null $id
 *   Unused here.
 * @param bool $quirks
 *   Whether the doctype puts the document into quirks mode.
 */
public function doctype($name, $idType = 0, $id = null, $quirks = false) {
  // A DOCTYPE is only legal before anything else has been processed. A
  // misplaced one is ignored entirely, so it must not alter quirks mode.
  if ($this->insertMode > static::IM_INITIAL) {
    $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name);
    return;
  }

  $this->quirks = $quirks;
  $this->insertMode = static::IM_BEFORE_HTML;
}
/**
 * Process the start tag.
 *
 * Creates the element (namespaced where applicable), copies its attributes,
 * inserts it at the current position and updates the insert mode and the
 * namespace stack.
 *
 * @todo - XMLNS namespace handling (we need to parse, even if it's not valid)
 * - XLink, MathML and SVG namespace handling
 * - Omission rules: 8.1.2.4 Optional tags
 *
 * @param string $name
 *   The tag name.
 * @param array $attributes
 *   Attribute name => value pairs.
 * @param bool $selfClosing
 *   Not read by this method.
 *
 * @return mixed
 *   The result of Elements::element($name): the element mask, which the
 *   tokenizer can use to set various processing rules.
 */
public function startTag($name, $attributes = array(), $selfClosing = false) {
$lname = $this
->normalizeTagName($name);
// Make sure we have an html element.
if (!$this->doc->documentElement && $name !== 'html' && !$this->frag) {
$this
->startTag('html');
}
// Set quirks mode if we're at IM_INITIAL with no doctype.
if ($this->insertMode == static::IM_INITIAL) {
$this->quirks = true;
$this
->parseError("No DOCTYPE specified.");
}
// SPECIAL TAG HANDLING:
// Spec says do this, and "don't ask."
// NOTE(review): only $name is renamed. $lname was computed above and is what
// the element is created with, so the node appears to keep the name "image"
// while the void-tag and rules checks below see "img" - confirm intent.
if ($name == 'image') {
$name = 'img';
}
// Autoclose p tags where appropriate.
if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
$this
->autoclose('p');
}
// Set insert mode:
switch ($name) {
case 'html':
$this->insertMode = static::IM_BEFORE_HEAD;
break;
case 'head':
if ($this->insertMode > static::IM_BEFORE_HEAD) {
$this
->parseError("Unexpected head tag outside of head context.");
}
else {
$this->insertMode = static::IM_IN_HEAD;
}
break;
case 'body':
$this->insertMode = static::IM_IN_BODY;
break;
case 'svg':
$this->insertMode = static::IM_IN_SVG;
break;
case 'math':
$this->insertMode = static::IM_IN_MATHML;
break;
case 'noscript':
if ($this->insertMode == static::IM_IN_HEAD) {
$this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
}
break;
}
// Special case handling for SVG.
if ($this->insertMode == static::IM_IN_SVG) {
$lname = Elements::normalizeSvgElement($lname);
}
// Number of namespace scopes this tag adds to $nsStack.
$pushes = 0;
// When we find a tag that appears in $nsRoots, we have to switch the default namespace.
if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) {
array_unshift($this->nsStack, array(
'' => $this->nsRoots[$lname],
) + $this->nsStack[0]);
$pushes++;
}
// Holds the value of the tag's own "xmlns" attribute, or false when it has none.
$needsWorkaround = false;
if (isset($this->options["xmlNamespaces"]) && $this->options["xmlNamespaces"]) {
// When xmlNamespaces is true and we find an 'xmlns' or 'xmlns:*' attribute, we add a new item to the $nsStack.
foreach ($attributes as $aName => $aVal) {
if ($aName === 'xmlns') {
$needsWorkaround = $aVal;
array_unshift($this->nsStack, array(
'' => $aVal,
) + $this->nsStack[0]);
$pushes++;
}
elseif ((($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '') === 'xmlns') {
array_unshift($this->nsStack, array(
substr($aName, $pos + 1) => $aVal,
) + $this->nsStack[0]);
$pushes++;
}
}
}
// A block-level tag ends the pending "inline children only" element.
if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
$this
->autoclose($this->onlyInline);
$this->onlyInline = null;
}
try {
$prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';
if ($needsWorkaround !== false) {
// The tag declares its own default namespace: parse a one-element XML
// document carrying that declaration and import its root element.
// NOTE(review): $lname and the namespace values are interpolated into the
// XML string without escaping - confirm the tokenizer guarantees they are
// safe to embed.
$xml = "<{$lname} xmlns=\"{$needsWorkaround}\" " . (strlen($prefix) && isset($this->nsStack[0][$prefix]) ? "xmlns:{$prefix}=\"" . $this->nsStack[0][$prefix] . "\"" : "") . "/>";
$frag = new \DOMDocument('1.0', 'UTF-8');
$frag
->loadXML($xml);
$ele = $this->doc
->importNode($frag->documentElement, true);
}
else {
// No namespace is applied when the prefix is unknown, or when the tag is
// unprefixed and the OPT_DISABLE_HTML_NS option is set.
if (!isset($this->nsStack[0][$prefix]) || $prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS]) {
$ele = $this->doc
->createElement($lname);
}
else {
$ele = $this->doc
->createElementNS($this->nsStack[0][$prefix], $lname);
}
}
} catch (\DOMException $e) {
$this
->parseError("Illegal tag name: <{$lname}>. Replaced with <invalid>.");
$ele = $this->doc
->createElement('invalid');
}
if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
$this->onlyInline = $lname;
}
// When we add some namespaces, we have to track them. Later, when endTag() is invoked, we have to remove them.
// When we are on a void tag, we do not need to care about namespace nesting.
if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
// PHP tends to free the memory used by DOM,
// to avoid spl_object_hash collisions we have to avoid garbage collection of $ele by storing it into $pushes
// see https://bugs.php.net/bug.php?id=67459
$this->pushes[spl_object_hash($ele)] = array(
$pushes,
$ele,
);
// SEE https://github.com/facebook/hhvm/issues/2962
if (defined('HHVM_VERSION')) {
$ele
->setAttribute('html5-php-fake-id-attribute', spl_object_hash($ele));
}
}
foreach ($attributes as $aName => $aVal) {
// xmlns attributes can't be set
if ($aName === 'xmlns') {
continue;
}
if ($this->insertMode == static::IM_IN_SVG) {
$aName = Elements::normalizeSvgAttribute($aName);
}
elseif ($this->insertMode == static::IM_IN_MATHML) {
$aName = Elements::normalizeMathMlAttribute($aName);
}
try {
$prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false;
if ($prefix === 'xmlns') {
$ele
->setAttributeNs(self::NAMESPACE_XMLNS, $aName, $aVal);
}
elseif ($prefix !== false && isset($this->nsStack[0][$prefix])) {
$ele
->setAttributeNs($this->nsStack[0][$prefix], $aName, $aVal);
}
else {
$ele
->setAttribute($aName, $aVal);
}
} catch (\DOMException $e) {
$this
->parseError("Illegal attribute name for tag {$name}. Ignoring: {$aName}");
continue;
}
// This is necessary on a non-DTD schema, like HTML5.
if ($aName == 'id') {
$ele
->setIdAttribute('id', true);
}
}
// Some elements have special processing rules. Handle those separately.
if ($this->rules
->hasRules($name) && $this->frag !== $this->current) {
$this->current = $this->rules
->evaluate($ele, $this->current);
}
else {
$this->current
->appendChild($ele);
// XXX: Need to handle self-closing tags and unary tags.
// Void tags never become the insertion point.
if (!Elements::isA($name, Elements::VOID_TAG)) {
$this->current = $ele;
}
}
// This is sort of a last-ditch attempt to correct for cases where no head/body
// elements are provided.
if ($this->insertMode <= static::IM_BEFORE_HEAD && $name != 'head' && $name != 'html') {
$this->insertMode = static::IM_IN_BODY;
}
// When we are on a void tag, we do not need to care about namespace nesting,
// but we have to remove the namespaces pushed to $nsStack.
if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) {
// Remove the namespaces defined by the current node.
for ($i = 0; $i < $pushes; $i++) {
array_shift($this->nsStack);
}
}
// Return the element mask, which the tokenizer can then use to set
// various processing rules.
return Elements::element($name);
}
/**
 * Process an end tag.
 *
 * Pops the namespace scopes pushed by the current element, closes the
 * nearest open element with a matching name and updates the insert mode.
 *
 * @param string $name
 *   The tag name.
 */
public function endTag($name) {
  $lname = $this->normalizeTagName($name);

  // Ignore closing tags for unary elements.
  if (Elements::isA($name, Elements::VOID_TAG)) {
    return;
  }

  if ($this->insertMode <= static::IM_BEFORE_HTML) {
    // 8.2.5.4.2
    if (in_array($name, array('html', 'br', 'head', 'title'), true)) {
      // Open the implied html element, then handle the end tag again now
      // that the insert mode has moved past IM_BEFORE_HTML.
      $this->startTag('html');
      $this->endTag($name);
      $this->insertMode = static::IM_BEFORE_HEAD;
      return;
    }

    // Ignore the tag.
    $this->parseError("Illegal closing tag at global scope.");
    return;
  }

  // Special case handling for SVG.
  if ($this->insertMode == static::IM_IN_SVG) {
    $lname = Elements::normalizeSvgElement($lname);
  }

  // Work out the key under which startTag() may have recorded namespace
  // pushes for the current node.
  // See https://github.com/facebook/hhvm/issues/2962
  if (defined('HHVM_VERSION') && ($cid = $this->current->getAttribute('html5-php-fake-id-attribute'))) {
    $this->current->removeAttribute('html5-php-fake-id-attribute');
  }
  else {
    $cid = spl_object_hash($this->current);
  }

  // XXX: Not sure whether we need this anymore.
  // if ($name != $lname) {
  // return $this->quirksTreeResolver($lname);
  // }

  // XXX: HTML has no parent. What do we do, though,
  // if this element appears in the wrong place?
  if ($lname == 'html') {
    return;
  }

  // Remove the namespaces defined by the current node.
  if (isset($this->pushes[$cid])) {
    for ($i = 0; $i < $this->pushes[$cid][0]; $i++) {
      array_shift($this->nsStack);
    }
    unset($this->pushes[$cid]);
  }

  if (!$this->autoclose($lname)) {
    $this->parseError('Could not find closing tag for ' . $lname);
  }

  switch ($lname) {
    case "head":
      $this->insertMode = static::IM_AFTER_HEAD;
      break;

    case "body":
      $this->insertMode = static::IM_AFTER_BODY;
      break;

    case "svg":
    // startTag() enters IM_IN_MATHML on a "math" element, so that is the name
    // that has to leave it again. "mathml" is kept for backward
    // compatibility.
    case "math":
    case "mathml":
      $this->insertMode = static::IM_IN_BODY;
      break;
  }
}
/**
 * Append a comment node at the current position.
 *
 * @param string $cdata
 *   The comment text.
 */
public function comment($cdata) {
  // TODO: Need to handle case where comment appears outside of the HTML tag.
  $commentNode = $this->doc->createComment($cdata);
  $this->current->appendChild($commentNode);
}
/**
 * Append a text node at the current position.
 *
 * Text that arrives before the head is discarded; a parse error is recorded
 * for it unless it consists only of whitespace.
 *
 * @param string $data
 *   The character data.
 */
public function text($data) {
  // XXX: Hmmm.... should we really be this strict?
  if ($this->insertMode < static::IM_IN_HEAD) {
    // Per '8.2.5.4.3 The "before head" insertion mode' the characters
    // " \t\n\r\f" should be ignored but no mention of a parse error. This is
    // practical as most documents contain these characters. Other text is not
    // expected here so recording a parse error is necessary.
    $dataTmp = trim($data, " \t\n\r\f");
    // Compare against '' explicitly: empty() would also treat the text "0"
    // as empty and drop it without recording an error.
    if ($dataTmp !== '') {
      $this->parseError("Unexpected text. Ignoring: " . $dataTmp);
    }
    return;
  }

  $node = $this->doc->createTextNode($data);
  $this->current->appendChild($node);
}
/**
 * Handle the end of the input. Currently does nothing.
 */
public function eof() {
// If the $current isn't the $root, do we need to do anything?
}
/**
 * Record a parse error.
 *
 * @param string $msg
 *   The error message.
 * @param int $line
 *   The line number reported in the message.
 * @param int $col
 *   The column number reported in the message.
 */
public function parseError($msg, $line = 0, $col = 0) {
  $entry = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
  $this->errors[] = $entry;
}
/**
 * Get the parse errors recorded so far.
 *
 * @return array
 *   The messages in the order they were recorded, each formatted by
 *   parseError() as "Line %d, Col %d: %s".
 */
public function getErrors() {
return $this->errors;
}
/**
 * Append a CDATA section at the current position.
 *
 * @param string $data
 *   The content of the CDATA section.
 */
public function cdata($data) {
  $section = $this->doc->createCDATASection($data);
  $this->current->appendChild($section);
}
/**
 * Handle a processing instruction.
 *
 * @param string $name
 *   The PI target.
 * @param string|null $data
 *   The PI content.
 */
public function processingInstruction($name, $data = null) {
  // XXX: Ignore initial XML declaration, per the spec.
  $isXmlDeclaration = $this->insertMode == static::IM_INITIAL && strtolower($name) == 'xml';
  if ($isXmlDeclaration) {
    return;
  }

  // Without a processor, this is just a dumb PI element.
  if (!isset($this->processor)) {
    $pi = $this->doc->createProcessingInstruction($name, $data);
    $this->current->appendChild($pi);
    return;
  }

  // Important: The processor may modify the current DOM tree however
  // it sees fit. A non-empty result becomes the new insertion point.
  $replacement = $this->processor->process($this->current, $name, $data);
  if (!empty($replacement)) {
    $this->current = $replacement;
  }
}
// ==========================================================================
// UTILITIES
// ==========================================================================
/**
 * Apply normalization rules to a tag name.
 *
 * Currently returns the name unchanged; prefix stripping is disabled (see
 * the comment in the body).
 *
 * See sections 2.9 and 8.1.2.
 *
 * @param string $name
 *   The tag name.
 * @return string The normalized tag name.
 */
protected function normalizeTagName($name) {
/*
 * Section 2.9 suggests that we should not do this.
 *
 * if (strpos($name, ':') !== false) {
 *   // We know from the grammar that there must be at least one other
 *   // char besides :, since : is not a legal tag start.
 *   $parts = explode(':', $name);
 *   return array_pop($parts);
 * }
 */
return $name;
}
/**
 * Placeholder for quirks-mode tree resolution; not implemented.
 *
 * The only reference to it in this class is in commented-out code in
 * endTag().
 *
 * @param string $name
 *   The tag name.
 *
 * @throws \Exception
 *   Always.
 */
protected function quirksTreeResolver($name) {
throw new \Exception("Not implemented.");
}
/**
 * Climb from the current node and close the nearest element named $tag.
 *
 * Closing means making that element's parent the new insertion point.
 *
 * @param string $tag
 *   The tag name to look for.
 *
 * @return bool
 *   True when a matching element was found and closed. False when a
 *   non-element node (or the top of the tree) is reached first.
 */
protected function autoclose($tag) {
  $node = $this->current;
  while (true) {
    // The document or fragment marks the end of the element chain.
    if ($node->nodeType != XML_ELEMENT_NODE) {
      return false;
    }
    if ($node->tagName == $tag) {
      $this->current = $node->parentNode;
      return true;
    }
    $node = $node->parentNode;
    if (!$node) {
      break;
    }
  }
  return false;
}
/**
 * Checks whether an element with the given tag name is currently open.
 *
 * The search starts at $this->current and walks up through its element
 * ancestors; it stops at the first node that is not an element.
 *
 * @param string $tagname
 *   The tag name to look for.
 *
 * @return bool
 *   True if $this->current or any element above it has that tag name.
 */
protected function isAncestor($tagname) {
  for ($node = $this->current; $node->nodeType === XML_ELEMENT_NODE; $node = $node->parentNode) {
    if ($node->tagName == $tagname) {
      return true;
    }
  }
  return false;
}
/**
 * Returns true if the immediate parent element is of the given tagname.
 *
 * The "parent" is $this->current, i.e. the node the next child would be
 * appended to.
 *
 * @param string $tagname
 *   The tag name to compare against.
 *
 * @return bool
 *   False when the current node is not an element (for example the document
 *   or a fragment, which have no tagName).
 */
protected function isParent($tagname) {
  $node = $this->current;
  return $node->nodeType === XML_ELEMENT_NODE && $node->tagName == $tagname;
}
}