<?php /* Copyright 2007 Jeroen van der Meer <http://jero.net/> Copyright 2009 Edward Z. Yang <edwardzyang@thewritingpot.com> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // Tags for FIX ME!!!: (in order of priority) // XXX - should be fixed NAO! // XERROR - with regards to parse errors // XSCRIPT - with regards to scripting mode // XENCODING - with regards to encoding (for reparsing tests) class HTML5_TreeBuilder { public $stack = array(); public $content_model; private $mode; private $original_mode; private $secondary_mode; private $dom; // Whether or not normal insertion of nodes should actually foster // parent (used in one case in spec) private $foster_parent = false; private $a_formatting = array(); private $head_pointer = null; private $form_pointer = null; private $flag_frameset_ok = true; private $flag_force_quirks = false; private $ignored = false; private $quirks_mode = null; // this gets to 2 when we want to ignore the next lf character, and // is decrement at the beginning of each processed token (this way, // code can check for (bool)$ignore_lf_token, but it phases out // appropriately) private $ignore_lf_token = 0; private $fragment = false; private $root; private $scoping = array('applet','button','caption','html','marquee','object','table','td','th', 'svg:foreignObject'); private $formatting = array('a','b','big','code','em','font','i','nobr','s','small','strike','strong','tt','u'); private $special = array('address','area','article','aside','base','basefont','bgsound', 'blockquote','body','br','center','col','colgroup','command','dd','details','dialog','dir','div','dl', 'dt','embed','fieldset','figure','footer','form','frame','frameset','h1','h2','h3','h4','h5', 'h6','head','header','hgroup','hr','iframe','img','input','isindex','li','link', 'listing','menu','meta','nav','noembed','noframes','noscript','ol', 'p','param','plaintext','pre','script','select','spacer','style', 'tbody','textarea','tfoot','thead','title','tr','ul','wbr'); // Tree construction modes const INITIAL = 0; const BEFORE_HTML = 1; const BEFORE_HEAD = 2; const IN_HEAD = 3; const IN_HEAD_NOSCRIPT = 4; const AFTER_HEAD = 5; const IN_BODY = 6; const IN_CDATA_RCDATA = 7; const IN_TABLE = 8; const IN_CAPTION = 9; const IN_COLUMN_GROUP = 10; const IN_TABLE_BODY = 11; const IN_ROW = 12; const IN_CELL = 13; const IN_SELECT = 14; const IN_SELECT_IN_TABLE= 15; const IN_FOREIGN_CONTENT= 16; const AFTER_BODY = 17; const IN_FRAMESET = 18; const AFTER_FRAMESET = 19; const AFTER_AFTER_BODY = 20; const AFTER_AFTER_FRAMESET = 21; /** * Converts a magic number to a readable name. Use for debugging. */ private function strConst($number) { static $lookup; if (!$lookup) { $r = new ReflectionClass('HTML5_TreeBuilder'); $lookup = array_flip($r->getConstants()); } return $lookup[$number]; } // The different types of elements. const SPECIAL = 100; const SCOPING = 101; const FORMATTING = 102; const PHRASING = 103; // Quirks modes in $quirks_mode const NO_QUIRKS = 200; const QUIRKS_MODE = 201; const LIMITED_QUIRKS_MODE = 202; // Marker to be placed in $a_formatting const MARKER = 300; // Namespaces for foreign content const NS_HTML = null; // to prevent DOM from requiring NS on everything const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; const NS_SVG = 'http://www.w3.org/2000/svg'; const NS_XLINK = 'http://www.w3.org/1999/xlink'; const NS_XML = 'http://www.w3.org/XML/1998/namespace'; const NS_XMLNS = 'http://www.w3.org/2000/xmlns/'; public function __construct() { $this->mode = self::INITIAL; $this->dom = new DOMDocument; $this->dom->encoding = 'UTF-8'; $this->dom->preserveWhiteSpace = true; $this->dom->substituteEntities = true; $this->dom->strictErrorChecking = false; } // Process tag tokens public function emitToken($token, $mode = null) { // XXX: ignore parse errors... why are we emitting them, again? if ($token['type'] === HTML5_Tokenizer::PARSEERROR) return; if ($mode === null) $mode = $this->mode; /* $backtrace = debug_backtrace(); if ($backtrace[1]['class'] !== 'HTML5_TreeBuilder') echo "--\n"; echo $this->strConst($mode); if ($this->original_mode) echo " (originally ".$this->strConst($this->original_mode).")"; echo "\n "; token_dump($token); $this->printStack(); $this->printActiveFormattingElements(); if ($this->foster_parent) echo " -> this is a foster parent mode\n"; */ if ($this->ignore_lf_token) $this->ignore_lf_token--; $this->ignored = false; // indenting is a little wonky, this can be changed later on switch ($mode) { case self::INITIAL: /* A character token that is one of U+0009 CHARACTER TABULATION, * U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020 SPACE */ if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Ignore the token. */ $this->ignored = true; } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) { if ( $token['name'] !== 'html' || !empty($token['public']) || !empty($token['system']) || $token !== 'about:legacy-compat' ) { /* If the DOCTYPE token's name is not a case-sensitive match * for the string "html", or if the token's public identifier * is not missing, or if the token's system identifier is * neither missing nor a case-sensitive match for the string * "about:legacy-compat", then there is a parse error (this * is the DOCTYPE parse error). */ // DOCTYPE parse error } /* Append a DocumentType node to the Document node, with the name * attribute set to the name given in the DOCTYPE token, or the * empty string if the name was missing; the publicId attribute * set to the public identifier given in the DOCTYPE token, or * the empty string if the public identifier was missing; the * systemId attribute set to the system identifier given in the * DOCTYPE token, or the empty string if the system identifier * was missing; and the other attributes specific to * DocumentType objects set to null and empty lists as * appropriate. Associate the DocumentType node with the * Document object so that it is returned as the value of the * doctype attribute of the Document object. */ if (!isset($token['public'])) $token['public'] = null; if (!isset($token['system'])) $token['system'] = null; // Yes this is hacky. I'm kind of annoyed that I can't appendChild // a doctype to DOMDocument. Maybe I haven't chanted the right // syllables. $impl = new DOMImplementation(); // This call can fail for particularly pathological cases (namely, // the qualifiedName parameter ($token['name']) could be missing. if ($token['name']) { $doctype = $impl->createDocumentType($token['name'], $token['public'], $token['system']); $this->dom->appendChild($doctype); } else { // It looks like libxml's not actually *able* to express this case. // So... don't. $this->dom->emptyDoctype = true; } $public = is_null($token['public']) ? false : strtolower($token['public']); $system = is_null($token['system']) ? false : strtolower($token['system']); $publicStartsWithForQuirks = array( "+//silmaril//dtd html pro v0r11 19970101//", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", "-//as//dtd html 3.0 aswedit + extensions//", "-//ietf//dtd html 2.0 level 1//", "-//ietf//dtd html 2.0 level 2//", "-//ietf//dtd html 2.0 strict level 1//", "-//ietf//dtd html 2.0 strict level 2//", "-//ietf//dtd html 2.0 strict//", "-//ietf//dtd html 2.0//", "-//ietf//dtd html 2.1e//", "-//ietf//dtd html 3.0//", "-//ietf//dtd html 3.2 final//", "-//ietf//dtd html 3.2//", "-//ietf//dtd html 3//", "-//ietf//dtd html level 0//", "-//ietf//dtd html level 1//", "-//ietf//dtd html level 2//", "-//ietf//dtd html level 3//", "-//ietf//dtd html strict level 0//", "-//ietf//dtd html strict level 1//", "-//ietf//dtd html strict level 2//", "-//ietf//dtd html strict level 3//", "-//ietf//dtd html strict//", "-//ietf//dtd html//", "-//metrius//dtd metrius presentational//", "-//microsoft//dtd internet explorer 2.0 html strict//", "-//microsoft//dtd internet explorer 2.0 html//", "-//microsoft//dtd internet explorer 2.0 tables//", "-//microsoft//dtd internet explorer 3.0 html strict//", "-//microsoft//dtd internet explorer 3.0 html//", "-//microsoft//dtd internet explorer 3.0 tables//", "-//netscape comm. corp.//dtd html//", "-//netscape comm. corp.//dtd strict html//", "-//o'reilly and associates//dtd html 2.0//", "-//o'reilly and associates//dtd html extended 1.0//", "-//o'reilly and associates//dtd html extended relaxed 1.0//", "-//spyglass//dtd html 2.0 extended//", "-//sq//dtd html 2.0 hotmetal + extensions//", "-//sun microsystems corp.//dtd hotjava html//", "-//sun microsystems corp.//dtd hotjava strict html//", "-//w3c//dtd html 3 1995-03-24//", "-//w3c//dtd html 3.2 draft//", "-//w3c//dtd html 3.2 final//", "-//w3c//dtd html 3.2//", "-//w3c//dtd html 3.2s draft//", "-//w3c//dtd html 4.0 frameset//", "-//w3c//dtd html 4.0 transitional//", "-//w3c//dtd html experimental 19960712//", "-//w3c//dtd html experimental 970421//", "-//w3c//dtd w3 html//", "-//w3o//dtd w3 html 3.0//", "-//webtechs//dtd mozilla html 2.0//", "-//webtechs//dtd mozilla html//", ); $publicSetToForQuirks = array( "-//w3o//dtd w3 html strict 3.0//", "-/w3c/dtd html 4.0 transitional/en", "html", ); $publicStartsWithAndSystemForQuirks = array( "-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//", ); $publicStartsWithForLimitedQuirks = array( "-//w3c//dtd xhtml 1.0 frameset//", "-//w3c//dtd xhtml 1.0 transitional//", ); $publicStartsWithAndSystemForLimitedQuirks = array( "-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//", ); // first, do easy checks if ( !empty($token['force-quirks']) || strtolower($token['name']) !== 'html' ) { $this->quirks_mode = self::QUIRKS_MODE; } else { do { if ($system) { foreach ($publicStartsWithAndSystemForQuirks as $x) { if (strncmp($public, $x, strlen($x)) === 0) { $this->quirks_mode = self::QUIRKS_MODE; break; } } if (!is_null($this->quirks_mode)) break; foreach ($publicStartsWithAndSystemForLimitedQuirks as $x) { if (strncmp($public, $x, strlen($x)) === 0) { $this->quirks_mode = self::LIMITED_QUIRKS_MODE; break; } } if (!is_null($this->quirks_mode)) break; } foreach ($publicSetToForQuirks as $x) { if ($public === $x) { $this->quirks_mode = self::QUIRKS_MODE; break; } } if (!is_null($this->quirks_mode)) break; foreach ($publicStartsWithForLimitedQuirks as $x) { if (strncmp($public, $x, strlen($x)) === 0) { $this->quirks_mode = self::LIMITED_QUIRKS_MODE; } } if (!is_null($this->quirks_mode)) break; if ($system === "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") { $this->quirks_mode = self::QUIRKS_MODE; break; } foreach ($publicStartsWithForQuirks as $x) { if (strncmp($public, $x, strlen($x)) === 0) { $this->quirks_mode = self::QUIRKS_MODE; break; } } if (is_null($this->quirks_mode)) { $this->quirks_mode = self::NO_QUIRKS; } } while (false); } $this->mode = self::BEFORE_HTML; } else { // parse error /* Switch the insertion mode to "before html", then reprocess the * current token. */ $this->mode = self::BEFORE_HTML; $this->quirks_mode = self::QUIRKS_MODE; $this->emitToken($token); } break; case self::BEFORE_HTML: /* A DOCTYPE token */ if($token['type'] === HTML5_Tokenizer::DOCTYPE) { // Parse error. Ignore the token. $this->ignored = true; /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the Document object with the data attribute set to the data given in the comment token. */ $comment = $this->dom->createComment($token['data']); $this->dom->appendChild($comment); /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE */ } elseif($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Ignore the token. */ $this->ignored = true; /* A start tag whose tag name is "html" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] == 'html') { /* Create an element for the token in the HTML namespace. Append it * to the Document object. Put this element in the stack of open * elements. */ $html = $this->insertElement($token, false); $this->dom->appendChild($html); $this->stack[] = $html; $this->mode = self::BEFORE_HEAD; } else { /* Create an html element. Append it to the Document object. Put * this element in the stack of open elements. */ $html = $this->dom->createElementNS(self::NS_HTML, 'html'); $this->dom->appendChild($html); $this->stack[] = $html; /* Switch the insertion mode to "before head", then reprocess the * current token. */ $this->mode = self::BEFORE_HEAD; $this->emitToken($token); } break; case self::BEFORE_HEAD: /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE */ if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Ignore the token. */ $this->ignored = true; /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertComment($token['data']); /* A DOCTYPE token */ } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) { /* Parse error. Ignore the token */ $this->ignored = true; // parse error /* A start tag token with the tag name "html" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') { /* Process the token using the rules for the "in body" * insertion mode. */ $this->processWithRulesFor($token, self::IN_BODY); /* A start tag token with the tag name "head" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') { /* Insert an HTML element for the token. */ $element = $this->insertElement($token); /* Set the head element pointer to this new element node. */ $this->head_pointer = $element; /* Change the insertion mode to "in head". */ $this->mode = self::IN_HEAD; /* An end tag whose tag name is one of: "head", "body", "html", "br" */ } elseif( $token['type'] === HTML5_Tokenizer::ENDTAG && ( $token['name'] === 'head' || $token['name'] === 'body' || $token['name'] === 'html' || $token['name'] === 'br' )) { /* Act as if a start tag token with the tag name "head" and no * attributes had been seen, then reprocess the current token. */ $this->emitToken(array( 'name' => 'head', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); $this->emitToken($token); /* Any other end tag */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG) { /* Parse error. Ignore the token. */ $this->ignored = true; } else { /* Act as if a start tag token with the tag name "head" and no * attributes had been seen, then reprocess the current token. * Note: This will result in an empty head element being * generated, with the current token being reprocessed in the * "after head" insertion mode. */ $this->emitToken(array( 'name' => 'head', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); $this->emitToken($token); } break; case self::IN_HEAD: /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE. */ if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Insert the character into the current node. */ $this->insertText($token['data']); /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertComment($token['data']); /* A DOCTYPE token */ } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) { /* Parse error. Ignore the token. */ $this->ignored = true; // parse error /* A start tag whose tag name is "html" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') { $this->processWithRulesFor($token, self::IN_BODY); /* A start tag whose tag name is one of: "base", "command", "link" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && ($token['name'] === 'base' || $token['name'] === 'command' || $token['name'] === 'link')) { /* Insert an HTML element for the token. Immediately pop the * current node off the stack of open elements. */ $this->insertElement($token); array_pop($this->stack); // YYY: Acknowledge the token's self-closing flag, if it is set. /* A start tag whose tag name is "meta" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'meta') { /* Insert an HTML element for the token. Immediately pop the * current node off the stack of open elements. */ $this->insertElement($token); array_pop($this->stack); // XERROR: Acknowledge the token's self-closing flag, if it is set. // XENCODING: If the element has a charset attribute, and its value is a // supported encoding, and the confidence is currently tentative, // then change the encoding to the encoding given by the value of // the charset attribute. // // Otherwise, if the element has a content attribute, and applying // the algorithm for extracting an encoding from a Content-Type to // its value returns a supported encoding encoding, and the // confidence is currently tentative, then change the encoding to // the encoding encoding. /* A start tag with the tag name "title" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'title') { $this->insertRCDATAElement($token); /* A start tag whose tag name is "noscript", if the scripting flag is enabled, or * A start tag whose tag name is one of: "noframes", "style" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && ($token['name'] === 'noscript' || $token['name'] === 'noframes' || $token['name'] === 'style')) { // XSCRIPT: Scripting flag not respected $this->insertCDATAElement($token); // XSCRIPT: Scripting flag disable not implemented /* A start tag with the tag name "script" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'script') { /* 1. Create an element for the token in the HTML namespace. */ $node = $this->insertElement($token, false); /* 2. Mark the element as being "parser-inserted" */ // Uhhh... XSCRIPT /* 3. If the parser was originally created for the HTML * fragment parsing algorithm, then mark the script element as * "already executed". (fragment case) */ // ditto... XSCRIPT /* 4. Append the new element to the current node and push it onto * the stack of open elements. */ end($this->stack)->appendChild($node); $this->stack[] = $node; // I guess we could squash these together /* 6. Let the original insertion mode be the current insertion mode. */ $this->original_mode = $this->mode; /* 7. Switch the insertion mode to "in CDATA/RCDATA" */ $this->mode = self::IN_CDATA_RCDATA; /* 5. Switch the tokeniser's content model flag to the CDATA state. */ $this->content_model = HTML5_Tokenizer::CDATA; /* An end tag with the tag name "head" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'head') { /* Pop the current node (which will be the head element) off the stack of open elements. */ array_pop($this->stack); /* Change the insertion mode to "after head". */ $this->mode = self::AFTER_HEAD; // Slight logic inversion here to minimize duplication /* A start tag with the tag name "head". */ /* An end tag whose tag name is not one of: "body", "html", "br" */ } elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') || ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] !== 'html' && $token['name'] !== 'body' && $token['name'] !== 'br')) { // Parse error. Ignore the token. $this->ignored = true; /* Anything else */ } else { /* Act as if an end tag token with the tag name "head" had been * seen, and reprocess the current token. */ $this->emitToken(array( 'name' => 'head', 'type' => HTML5_Tokenizer::ENDTAG )); /* Then, reprocess the current token. */ $this->emitToken($token); } break; case self::IN_HEAD_NOSCRIPT: if ($token['type'] === HTML5_Tokenizer::DOCTYPE) { // parse error } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') { $this->processWithRulesFor($token, self::IN_BODY); } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'noscript') { /* Pop the current node (which will be a noscript element) from the * stack of open elements; the new current node will be a head * element. */ array_pop($this->stack); $this->mode = self::IN_HEAD; } elseif ( ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) || ($token['type'] === HTML5_Tokenizer::COMMENT) || ($token['type'] === HTML5_Tokenizer::STARTTAG && ( $token['name'] === 'link' || $token['name'] === 'meta' || $token['name'] === 'noframes' || $token['name'] === 'style'))) { $this->processWithRulesFor($token, self::IN_HEAD); // inverted logic } elseif ( ($token['type'] === HTML5_Tokenizer::STARTTAG && ( $token['name'] === 'head' || $token['name'] === 'noscript')) || ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] !== 'br')) { // parse error } else { // parse error $this->emitToken(array( 'type' => HTML5_Tokenizer::ENDTAG, 'name' => 'noscript', )); $this->emitToken($token); } break; case self::AFTER_HEAD: /* Handle the token as follows: */ /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE */ if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Append the character to the current node. */ $this->insertText($token['data']); /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertComment($token['data']); } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) { // parse error } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') { $this->processWithRulesFor($token, self::IN_BODY); /* A start tag token with the tag name "body" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'body') { $this->insertElement($token); /* Set the frameset-ok flag to "not ok". */ $this->flag_frameset_ok = false; /* Change the insertion mode to "in body". */ $this->mode = self::IN_BODY; /* A start tag token with the tag name "frameset" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'frameset') { /* Insert a frameset element for the token. */ $this->insertElement($token); /* Change the insertion mode to "in frameset". */ $this->mode = self::IN_FRAMESET; /* A start tag token whose tag name is one of: "base", "link", "meta", "script", "style", "title" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'], array('base', 'link', 'meta', 'noframes', 'script', 'style', 'title'))) { // parse error /* Push the node pointed to by the head element pointer onto the * stack of open elements. */ $this->stack[] = $this->head_pointer; $this->processWithRulesFor($token, self::IN_HEAD); array_splice($this->stack, array_search($this->head_pointer, $this->stack, true), 1); // inversion of specification } elseif( ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') || ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] !== 'body' && $token['name'] !== 'html' && $token['name'] !== 'br')) { // parse error /* Anything else */ } else { $this->emitToken(array( 'name' => 'body', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); $this->flag_frameset_ok = true; $this->emitToken($token); } break; case self::IN_BODY: /* Handle the token as follows: */ switch($token['type']) { /* A character token */ case HTML5_Tokenizer::CHARACTER: case HTML5_Tokenizer::SPACECHARACTER: /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); /* Append the token's character to the current node. */ $this->insertText($token['data']); /* If the token is not one of U+0009 CHARACTER TABULATION, * U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020 * SPACE, then set the frameset-ok flag to "not ok". */ // i.e., if any of the characters is not whitespace if (strlen($token['data']) !== strspn($token['data'], HTML5_Tokenizer::WHITESPACE)) { $this->flag_frameset_ok = false; } break; /* A comment token */ case HTML5_Tokenizer::COMMENT: /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertComment($token['data']); break; case HTML5_Tokenizer::DOCTYPE: // parse error break; case HTML5_Tokenizer::STARTTAG: switch($token['name']) { case 'html': // parse error /* For each attribute on the token, check to see if the * attribute is already present on the top element of the * stack of open elements. If it is not, add the attribute * and its corresponding value to that element. */ foreach($token['attr'] as $attr) { if(!$this->stack[0]->hasAttribute($attr['name'])) { $this->stack[0]->setAttribute($attr['name'], $attr['value']); } } break; case 'base': case 'command': case 'link': case 'meta': case 'noframes': case 'script': case 'style': case 'title': /* Process the token as if the insertion mode had been "in head". */ $this->processWithRulesFor($token, self::IN_HEAD); break; /* A start tag token with the tag name "body" */ case 'body': /* Parse error. If the second element on the stack of open elements is not a body element, or, if the stack of open elements has only one node on it, then ignore the token. (fragment case) */ if(count($this->stack) === 1 || $this->stack[1]->tagName !== 'body') { $this->ignored = true; // Ignore /* Otherwise, for each attribute on the token, check to see if the attribute is already present on the body element (the second element) on the stack of open elements. If it is not, add the attribute and its corresponding value to that element. */ } else { foreach($token['attr'] as $attr) { if(!$this->stack[1]->hasAttribute($attr['name'])) { $this->stack[1]->setAttribute($attr['name'], $attr['value']); } } } break; case 'frameset': // parse error /* If the second element on the stack of open elements is * not a body element, or, if the stack of open elements * has only one node on it, then ignore the token. * (fragment case) */ if(count($this->stack) === 1 || $this->stack[1]->tagName !== 'body') { $this->ignored = true; // Ignore } elseif (!$this->flag_frameset_ok) { $this->ignored = true; // Ignore } else { /* 1. Remove the second element on the stack of open * elements from its parent node, if it has one. */ if($this->stack[1]->parentNode) { $this->stack[1]->parentNode->removeChild($this->stack[1]); } /* 2. Pop all the nodes from the bottom of the stack of * open elements, from the current node up to the root * html element. */ array_splice($this->stack, 1); $this->insertElement($token); $this->mode = self::IN_FRAMESET; } break; // in spec, there is a diversion here case 'address': case 'article': case 'aside': case 'blockquote': case 'center': case 'datagrid': case 'details': case 'dialog': case 'dir': case 'div': case 'dl': case 'fieldset': case 'figure': case 'footer': case 'header': case 'hgroup': case 'menu': case 'nav': case 'ol': case 'p': case 'section': case 'ul': /* If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen. */ if($this->elementInScope('p')) { $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Insert an HTML element for the token. */ $this->insertElement($token); break; /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" */ case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': /* If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen. */ if($this->elementInScope('p')) { $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); } /* If the current node is an element whose tag name is one * of "h1", "h2", "h3", "h4", "h5", or "h6", then this is a * parse error; pop the current node off the stack of open * elements. */ $peek = array_pop($this->stack); if (in_array($peek->tagName, array("h1", "h2", "h3", "h4", "h5", "h6"))) { // parse error } else { $this->stack[] = $peek; } /* Insert an HTML element for the token. */ $this->insertElement($token); break; case 'pre': case 'listing': /* If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen. */ if($this->elementInScope('p')) { $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); } $this->insertElement($token); /* If the next token is a U+000A LINE FEED (LF) character * token, then ignore that token and move on to the next * one. (Newlines at the start of pre blocks are ignored as * an authoring convenience.) */ $this->ignore_lf_token = 2; $this->flag_frameset_ok = false; break; /* A start tag whose tag name is "form" */ case 'form': /* If the form element pointer is not null, ignore the token with a parse error. */ if($this->form_pointer !== null) { $this->ignored = true; // Ignore. /* Otherwise: */ } else { /* If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen. */ if($this->elementInScope('p')) { $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Insert an HTML element for the token, and set the form element pointer to point to the element created. */ $element = $this->insertElement($token); $this->form_pointer = $element; } break; // condensed specification case 'li': case 'dd': case 'dt': /* 1. Set the frameset-ok flag to "not ok". */ $this->flag_frameset_ok = false; $stack_length = count($this->stack) - 1; for($n = $stack_length; 0 <= $n; $n--) { /* 2. Initialise node to be the current node (the bottommost node of the stack). */ $stop = false; $node = $this->stack[$n]; $cat = $this->getElementCategory($node); // for case 'li': /* 3. If node is an li element, then act as if an end * tag with the tag name "li" had been seen, then jump * to the last step. */ // for case 'dd': case 'dt': /* If node is a dd or dt element, then act as if an end * tag with the same tag name as node had been seen, then * jump to the last step. */ if(($token['name'] === 'li' && $node->tagName === 'li') || ($token['name'] !== 'li' && ($node->tagName === 'dd' || $node->tagName === 'dt'))) { // limited conditional $this->emitToken(array( 'type' => HTML5_Tokenizer::ENDTAG, 'name' => $node->tagName, )); break; } /* 4. If node is not in the formatting category, and is not in the phrasing category, and is not an address, div or p element, then stop this algorithm. */ if($cat !== self::FORMATTING && $cat !== self::PHRASING && $node->tagName !== 'address' && $node->tagName !== 'div' && $node->tagName !== 'p') { break; } /* 5. Otherwise, set node to the previous entry in the * stack of open elements and return to step 2. */ } /* 6. This is the last step. */ /* If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen. */ if($this->elementInScope('p')) { $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Finally, insert an HTML element with the same tag name as the token's. */ $this->insertElement($token); break; /* A start tag token whose tag name is "plaintext" */ case 'plaintext': /* If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen. */ if($this->elementInScope('p')) { $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Insert an HTML element for the token. */ $this->insertElement($token); $this->content_model = HTML5_Tokenizer::PLAINTEXT; break; // more diversions /* A start tag whose tag name is "a" */ case 'a': /* If the list of active formatting elements contains an element whose tag name is "a" between the end of the list and the last marker on the list (or the start of the list if there is no marker on the list), then this is a parse error; act as if an end tag with the tag name "a" had been seen, then remove that element from the list of active formatting elements and the stack of open elements if the end tag didn't already remove it (it might not have if the element is not in table scope). */ $leng = count($this->a_formatting); for($n = $leng - 1; $n >= 0; $n--) { if($this->a_formatting[$n] === self::MARKER) { break; } elseif($this->a_formatting[$n]->tagName === 'a') { $a = $this->a_formatting[$n]; $this->emitToken(array( 'name' => 'a', 'type' => HTML5_Tokenizer::ENDTAG )); if (in_array($a, $this->a_formatting)) { $a_i = array_search($a, $this->a_formatting, true); if($a_i !== false) array_splice($this->a_formatting, $a_i, 1); } if (in_array($a, $this->stack)) { $a_i = array_search($a, $this->stack, true); if ($a_i !== false) array_splice($this->stack, $a_i, 1); } break; } } /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); /* Insert an HTML element for the token. */ $el = $this->insertElement($token); /* Add that element to the list of active formatting elements. */ $this->a_formatting[] = $el; break; case 'b': case 'big': case 'code': case 'em': case 'font': case 'i': case 's': case 'small': case 'strike': case 'strong': case 'tt': case 'u': /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); /* Insert an HTML element for the token. */ $el = $this->insertElement($token); /* Add that element to the list of active formatting elements. */ $this->a_formatting[] = $el; break; case 'nobr': /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); /* If the stack of open elements has a nobr element in * scope, then this is a parse error; act as if an end tag * with the tag name "nobr" had been seen, then once again * reconstruct the active formatting elements, if any. */ if ($this->elementInScope('nobr')) { $this->emitToken(array( 'name' => 'nobr', 'type' => HTML5_Tokenizer::ENDTAG, )); $this->reconstructActiveFormattingElements(); } /* Insert an HTML element for the token. */ $el = $this->insertElement($token); /* Add that element to the list of active formatting elements. */ $this->a_formatting[] = $el; break; // another diversion /* A start tag token whose tag name is "button" */ case 'button': /* If the stack of open elements has a button element in scope, then this is a parse error; act as if an end tag with the tag name "button" had been seen, then reprocess the token. (We don't do that. Unnecessary.) (I hope you're right! -- ezyang) */ if($this->elementInScope('button')) { $this->emitToken(array( 'name' => 'button', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); /* Insert an HTML element for the token. */ $this->insertElement($token); /* Insert a marker at the end of the list of active formatting elements. */ $this->a_formatting[] = self::MARKER; $this->flag_frameset_ok = false; break; case 'applet': case 'marquee': case 'object': /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); /* Insert an HTML element for the token. */ $this->insertElement($token); /* Insert a marker at the end of the list of active formatting elements. */ $this->a_formatting[] = self::MARKER; $this->flag_frameset_ok = false; break; // spec diversion /* A start tag whose tag name is "table" */ case 'table': /* If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen. */ if($this->quirks_mode !== self::QUIRKS_MODE && $this->elementInScope('p')) { $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Insert an HTML element for the token. */ $this->insertElement($token); $this->flag_frameset_ok = false; /* Change the insertion mode to "in table". */ $this->mode = self::IN_TABLE; break; /* A start tag whose tag name is one of: "area", "basefont", "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ case 'area': case 'basefont': case 'bgsound': case 'br': case 'embed': case 'img': case 'input': case 'keygen': case 'spacer': case 'wbr': /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); /* Insert an HTML element for the token. */ $this->insertElement($token); /* Immediately pop the current node off the stack of open elements. */ array_pop($this->stack); // YYY: Acknowledge the token's self-closing flag, if it is set. $this->flag_frameset_ok = false; break; case 'param': case 'source': /* Insert an HTML element for the token. */ $this->insertElement($token); /* Immediately pop the current node off the stack of open elements. */ array_pop($this->stack); // YYY: Acknowledge the token's self-closing flag, if it is set. break; /* A start tag whose tag name is "hr" */ case 'hr': /* If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen. */ if($this->elementInScope('p')) { $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Insert an HTML element for the token. */ $this->insertElement($token); /* Immediately pop the current node off the stack of open elements. */ array_pop($this->stack); // YYY: Acknowledge the token's self-closing flag, if it is set. $this->flag_frameset_ok = false; break; /* A start tag whose tag name is "image" */ case 'image': /* Parse error. Change the token's tag name to "img" and reprocess it. (Don't ask.) */ $token['name'] = 'img'; $this->emitToken($token); break; /* A start tag whose tag name is "isindex" */ case 'isindex': /* Parse error. */ /* If the form element pointer is not null, then ignore the token. */ if($this->form_pointer === null) { /* Act as if a start tag token with the tag name "form" had been seen. */ /* If the token has an attribute called "action", set * the action attribute on the resulting form * element to the value of the "action" attribute of * the token. */ $attr = array(); $action = $this->getAttr($token, 'action'); if ($action !== false) { $attr[] = array('name' => 'action', 'value' => $action); } $this->emitToken(array( 'name' => 'form', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => $attr )); /* Act as if a start tag token with the tag name "hr" had been seen. */ $this->emitToken(array( 'name' => 'hr', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); /* Act as if a start tag token with the tag name "p" had been seen. */ $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); /* Act as if a start tag token with the tag name "label" had been seen. */ $this->emitToken(array( 'name' => 'label', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); /* Act as if a stream of character tokens had been seen. */ $prompt = $this->getAttr($token, 'prompt'); if ($prompt === false) { $prompt = 'This is a searchable index. '. 'Insert your search keywords here: '; } $this->emitToken(array( 'data' => $prompt, 'type' => HTML5_Tokenizer::CHARACTER, )); /* Act as if a start tag token with the tag name "input" had been seen, with all the attributes from the "isindex" token, except with the "name" attribute set to the value "isindex" (ignoring any explicit "name" attribute). */ $attr = array(); foreach ($token['attr'] as $keypair) { if ($keypair['name'] === 'name' || $keypair['name'] === 'action' || $keypair['name'] === 'prompt') continue; $attr[] = $keypair; } $attr[] = array('name' => 'name', 'value' => 'isindex'); $this->emitToken(array( 'name' => 'input', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => $attr )); /* Act as if an end tag token with the tag name "label" had been seen. */ $this->emitToken(array( 'name' => 'label', 'type' => HTML5_Tokenizer::ENDTAG )); /* Act as if an end tag token with the tag name "p" had been seen. */ $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::ENDTAG )); /* Act as if a start tag token with the tag name "hr" had been seen. */ $this->emitToken(array( 'name' => 'hr', 'type' => HTML5_Tokenizer::STARTTAG )); /* Act as if an end tag token with the tag name "form" had been seen. */ $this->emitToken(array( 'name' => 'form', 'type' => HTML5_Tokenizer::ENDTAG )); } else { $this->ignored = true; } break; /* A start tag whose tag name is "textarea" */ case 'textarea': $this->insertElement($token); /* If the next token is a U+000A LINE FEED (LF) * character token, then ignore that token and move on to * the next one. (Newlines at the start of textarea * elements are ignored as an authoring convenience.) * need flag, see also <pre> */ $this->ignore_lf_token = 2; $this->original_mode = $this->mode; $this->flag_frameset_ok = false; $this->mode = self::IN_CDATA_RCDATA; /* Switch the tokeniser's content model flag to the RCDATA state. */ $this->content_model = HTML5_Tokenizer::RCDATA; break; /* A start tag token whose tag name is "xmp" */ case 'xmp': /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); $this->flag_frameset_ok = false; $this->insertCDATAElement($token); break; case 'iframe': $this->flag_frameset_ok = false; $this->insertCDATAElement($token); break; case 'noembed': case 'noscript': // XSCRIPT: should check scripting flag $this->insertCDATAElement($token); break; /* A start tag whose tag name is "select" */ case 'select': /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); /* Insert an HTML element for the token. */ $this->insertElement($token); $this->flag_frameset_ok = false; /* If the insertion mode is one of in table", "in caption", * "in column group", "in table body", "in row", or "in * cell", then switch the insertion mode to "in select in * table". Otherwise, switch the insertion mode to "in * select". */ if ( $this->mode === self::IN_TABLE || $this->mode === self::IN_CAPTION || $this->mode === self::IN_COLUMN_GROUP || $this->mode ==+self::IN_TABLE_BODY || $this->mode === self::IN_ROW || $this->mode === self::IN_CELL ) { $this->mode = self::IN_SELECT_IN_TABLE; } else { $this->mode = self::IN_SELECT; } break; case 'option': case 'optgroup': if ($this->elementInScope('option')) { $this->emitToken(array( 'name' => 'option', 'type' => HTML5_Tokenizer::ENDTAG, )); } $this->reconstructActiveFormattingElements(); $this->insertElement($token); break; case 'rp': case 'rt': /* If the stack of open elements has a ruby element in scope, then generate * implied end tags. If the current node is not then a ruby element, this is * a parse error; pop all the nodes from the current node up to the node * immediately before the bottommost ruby element on the stack of open elements. */ if ($this->elementInScope('ruby')) { $this->generateImpliedEndTags(); } $peek = false; do { if ($peek) { // parse error } $peek = array_pop($this->stack); } while ($peek->tagName !== 'ruby'); $this->stack[] = $peek; // we popped one too many $this->insertElement($token); break; // spec diversion case 'math': $this->reconstructActiveFormattingElements(); $token = $this->adjustMathMLAttributes($token); $token = $this->adjustForeignAttributes($token); $this->insertForeignElement($token, self::NS_MATHML); if (isset($token['self-closing'])) { // XERROR: acknowledge the token's self-closing flag array_pop($this->stack); } if ($this->mode !== self::IN_FOREIGN_CONTENT) { $this->secondary_mode = $this->mode; $this->mode = self::IN_FOREIGN_CONTENT; } break; case 'svg': $this->reconstructActiveFormattingElements(); $token = $this->adjustSVGAttributes($token); $token = $this->adjustForeignAttributes($token); $this->insertForeignElement($token, self::NS_SVG); if (isset($token['self-closing'])) { // XERROR: acknowledge the token's self-closing flag array_pop($this->stack); } if ($this->mode !== self::IN_FOREIGN_CONTENT) { $this->secondary_mode = $this->mode; $this->mode = self::IN_FOREIGN_CONTENT; } break; case 'caption': case 'col': case 'colgroup': case 'frame': case 'head': case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr': // parse error break; /* A start tag token not covered by the previous entries */ default: /* Reconstruct the active formatting elements, if any. */ $this->reconstructActiveFormattingElements(); $this->insertElement($token); /* This element will be a phrasing element. */ break; } break; case HTML5_Tokenizer::ENDTAG: switch($token['name']) { /* An end tag with the tag name "body" */ case 'body': /* If the second element in the stack of open elements is not a body element, this is a parse error. Ignore the token. (innerHTML case) */ if(count($this->stack) < 2 || $this->stack[1]->tagName !== 'body') { $this->ignored = true; /* Otherwise, if there is a node in the stack of open * elements that is not either a dd element, a dt * element, an li element, an optgroup element, an * option element, a p element, an rp element, an rt * element, a tbody element, a td element, a tfoot * element, a th element, a thead element, a tr element, * the body element, or the html element, then this is a * parse error. */ } else { // XERROR: implement this check for parse error } /* Change the insertion mode to "after body". */ $this->mode = self::AFTER_BODY; break; /* An end tag with the tag name "html" */ case 'html': /* Act as if an end tag with tag name "body" had been seen, then, if that token wasn't ignored, reprocess the current token. */ $this->emitToken(array( 'name' => 'body', 'type' => HTML5_Tokenizer::ENDTAG )); if (!$this->ignored) $this->emitToken($token); break; case 'address': case 'article': case 'aside': case 'blockquote': case 'center': case 'datagrid': case 'details': case 'dir': case 'div': case 'dl': case 'fieldset': case 'figure': case 'footer': case 'header': case 'hgroup': case 'listing': case 'menu': case 'nav': case 'ol': case 'pre': case 'section': case 'ul': /* If the stack of open elements has an element in scope with the same tag name as that of the token, then generate implied end tags. */ if($this->elementInScope($token['name'])) { $this->generateImpliedEndTags(); /* Now, if the current node is not an element with the same tag name as that of the token, then this is a parse error. */ // XERROR: implement parse error logic /* If the stack of open elements has an element in scope with the same tag name as that of the token, then pop elements from this stack until an element with that tag name has been popped from the stack. */ do { $node = array_pop($this->stack); } while ($node->tagName !== $token['name']); } else { // parse error } break; /* An end tag whose tag name is "form" */ case 'form': /* Let node be the element that the form element pointer is set to. */ $node = $this->form_pointer; /* Set the form element pointer to null. */ $this->form_pointer = null; /* If node is null or the stack of open elements does not * have node in scope, then this is a parse error; ignore the token. */ if ($node === null || !in_array($node, $this->stack)) { // parse error $this->ignored = true; } else { /* 1. Generate implied end tags. */ $this->generateImpliedEndTags(); /* 2. If the current node is not node, then this is a parse error. */ if (end($this->stack) !== $node) { // parse error } /* 3. Remove node from the stack of open elements. */ array_splice($this->stack, array_search($node, $this->stack, true), 1); } break; /* An end tag whose tag name is "p" */ case 'p': /* If the stack of open elements has a p element in scope, then generate implied end tags, except for p elements. */ if($this->elementInScope('p')) { /* Generate implied end tags, except for elements with * the same tag name as the token. */ $this->generateImpliedEndTags(array('p')); /* If the current node is not a p element, then this is a parse error. */ // XERROR: implement /* Pop elements from the stack of open elements until * an element with the same tag name as the token has * been popped from the stack. */ do { $node = array_pop($this->stack); } while ($node->tagName !== 'p'); } else { // parse error $this->emitToken(array( 'name' => 'p', 'type' => HTML5_Tokenizer::STARTTAG, )); $this->emitToken($token); } break; /* An end tag whose tag name is "dd", "dt", or "li" */ case 'dd': case 'dt': case 'li': if($this->elementInScope($token['name'])) { $this->generateImpliedEndTags(array($token['name'])); /* If the current node is not an element with the same tag name as the token, then this is a parse error. */ // XERROR: implement parse error /* Pop elements from the stack of open elements until * an element with the same tag name as the token has * been popped from the stack. */ do { $node = array_pop($this->stack); } while ($node->tagName !== $token['name']); } else { // parse error } break; /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" */ case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); /* If the stack of open elements has in scope an element whose tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then generate implied end tags. */ if($this->elementInScope($elements)) { $this->generateImpliedEndTags(); /* Now, if the current node is not an element with the same tag name as that of the token, then this is a parse error. */ // XERROR: implement parse error /* If the stack of open elements has in scope an element whose tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then pop elements from the stack until an element with one of those tag names has been popped from the stack. */ do { $node = array_pop($this->stack); } while (!in_array($node->tagName, $elements)); } else { // parse error } break; /* An end tag whose tag name is one of: "a", "b", "big", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ case 'a': case 'b': case 'big': case 'code': case 'em': case 'font': case 'i': case 'nobr': case 's': case 'small': case 'strike': case 'strong': case 'tt': case 'u': // XERROR: generally speaking this needs parse error logic /* 1. Let the formatting element be the last element in the list of active formatting elements that: * is between the end of the list and the last scope marker in the list, if any, or the start of the list otherwise, and * has the same tag name as the token. */ while(true) { for($a = count($this->a_formatting) - 1; $a >= 0; $a--) { if($this->a_formatting[$a] === self::MARKER) { break; } elseif($this->a_formatting[$a]->tagName === $token['name']) { $formatting_element = $this->a_formatting[$a]; $in_stack = in_array($formatting_element, $this->stack, true); $fe_af_pos = $a; break; } } /* If there is no such node, or, if that node is also in the stack of open elements but the element is not in scope, then this is a parse error. Abort these steps. The token is ignored. */ if(!isset($formatting_element) || ($in_stack && !$this->elementInScope($token['name']))) { $this->ignored = true; break; /* Otherwise, if there is such a node, but that node is not in the stack of open elements, then this is a parse error; remove the element from the list, and abort these steps. */ } elseif(isset($formatting_element) && !$in_stack) { unset($this->a_formatting[$fe_af_pos]); $this->a_formatting = array_merge($this->a_formatting); break; } /* Otherwise, there is a formatting element and that * element is in the stack and is in scope. If the * element is not the current node, this is a parse * error. In any case, proceed with the algorithm as * written in the following steps. */ // XERROR: implement me /* 2. Let the furthest block be the topmost node in the stack of open elements that is lower in the stack than the formatting element, and is not an element in the phrasing or formatting categories. There might not be one. */ $fe_s_pos = array_search($formatting_element, $this->stack, true); $length = count($this->stack); for($s = $fe_s_pos + 1; $s < $length; $s++) { $category = $this->getElementCategory($this->stack[$s]); if($category !== self::PHRASING && $category !== self::FORMATTING) { $furthest_block = $this->stack[$s]; break; } } /* 3. If there is no furthest block, then the UA must skip the subsequent steps and instead just pop all the nodes from the bottom of the stack of open elements, from the current node up to the formatting element, and remove the formatting element from the list of active formatting elements. */ if(!isset($furthest_block)) { for($n = $length - 1; $n >= $fe_s_pos; $n--) { array_pop($this->stack); } unset($this->a_formatting[$fe_af_pos]); $this->a_formatting = array_merge($this->a_formatting); break; } /* 4. Let the common ancestor be the element immediately above the formatting element in the stack of open elements. */ $common_ancestor = $this->stack[$fe_s_pos - 1]; /* 5. Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list. */ $bookmark = $fe_af_pos; /* 6. Let node and last node be the furthest block. Follow these steps: */ $node = $furthest_block; $last_node = $furthest_block; while(true) { for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { /* 6.1 Let node be the element immediately prior to node in the stack of open elements. */ $node = $this->stack[$n]; /* 6.2 If node is not in the list of active formatting elements, then remove node from the stack of open elements and then go back to step 1. */ if(!in_array($node, $this->a_formatting, true)) { array_splice($this->stack, $n, 1); } else { break; } } /* 6.3 Otherwise, if node is the formatting element, then go to the next step in the overall algorithm. */ if($node === $formatting_element) { break; /* 6.4 Otherwise, if last node is the furthest block, then move the aforementioned bookmark to be immediately after the node in the list of active formatting elements. */ } elseif($last_node === $furthest_block) { $bookmark = array_search($node, $this->a_formatting, true) + 1; } /* 6.5 Create an element for the token for which * the element node was created, replace the entry * for node in the list of active formatting * elements with an entry for the new element, * replace the entry for node in the stack of open * elements with an entry for the new element, and * let node be the new element. */ // we don't know what the token is anymore $clone = $node->cloneNode(); $a_pos = array_search($node, $this->a_formatting, true); $s_pos = array_search($node, $this->stack, true); $this->a_formatting[$a_pos] = $clone; $this->stack[$s_pos] = $clone; $node = $clone; /* 6.6 Insert last node into node, first removing it from its previous parent node if any. */ if($last_node->parentNode !== null) { $last_node->parentNode->removeChild($last_node); } $node->appendChild($last_node); /* 6.7 Let last node be node. */ $last_node = $node; /* 6.8 Return to step 1 of this inner set of steps. */ } /* 7. If the common ancestor node is a table, tbody, * tfoot, thead, or tr element, then, foster parent * whatever last node ended up being in the previous * step, first removing it from its previous parent * node if any. */ if ($last_node->parentNode) { // common step $last_node->parentNode->removeChild($last_node); } if (in_array($common_ancestor->tagName, array('table', 'tbody', 'tfoot', 'thead', 'tr'))) { $this->fosterParent($last_node); /* Otherwise, append whatever last node ended up being * in the previous step to the common ancestor node, * first removing it from its previous parent node if * any. */ } else { $common_ancestor->appendChild($last_node); } /* 8. Create an element for the token for which the * formatting element was created. */ $clone = $formatting_element->cloneNode(); /* 9. Take all of the child nodes of the furthest block and append them to the element created in the last step. */ while($furthest_block->hasChildNodes()) { $child = $furthest_block->firstChild; $furthest_block->removeChild($child); $clone->appendChild($child); } /* 10. Append that clone to the furthest block. */ $furthest_block->appendChild($clone); /* 11. Remove the formatting element from the list of active formatting elements, and insert the new element into the list of active formatting elements at the position of the aforementioned bookmark. */ $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); array_splice($this->a_formatting, $fe_af_pos, 1); $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); $af_part2 = array_slice($this->a_formatting, $bookmark); $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); /* 12. Remove the formatting element from the stack of open elements, and insert the new element into the stack of open elements immediately below the position of the furthest block in that stack. */ $fe_s_pos = array_search($formatting_element, $this->stack, true); array_splice($this->stack, $fe_s_pos, 1); $fb_s_pos = array_search($furthest_block, $this->stack, true); $s_part1 = array_slice($this->stack, 0, $fb_s_pos + 1); $s_part2 = array_slice($this->stack, $fb_s_pos + 1); $this->stack = array_merge($s_part1, array($clone), $s_part2); /* 13. Jump back to step 1 in this series of steps. */ unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); } break; case 'applet': case 'button': case 'marquee': case 'object': /* If the stack of open elements has an element in scope whose tag name matches the tag name of the token, then generate implied tags. */ if($this->elementInScope($token['name'])) { $this->generateImpliedEndTags(); /* Now, if the current node is not an element with the same tag name as the token, then this is a parse error. */ // XERROR: implement logic /* Pop elements from the stack of open elements until * an element with the same tag name as the token has * been popped from the stack. */ do { $node = array_pop($this->stack); } while ($node->tagName !== $token['name']); /* Clear the list of active formatting elements up to the * last marker. */ $keys = array_keys($this->a_formatting, self::MARKER, true); $marker = end($keys); for($n = count($this->a_formatting) - 1; $n > $marker; $n--) { array_pop($this->a_formatting); } } else { // parse error } break; case 'br': // Parse error $this->emitToken(array( 'name' => 'br', 'type' => HTML5_Tokenizer::STARTTAG, )); break; /* An end tag token not covered by the previous entries */ default: for($n = count($this->stack) - 1; $n >= 0; $n--) { /* Initialise node to be the current node (the bottommost node of the stack). */ $node = $this->stack[$n]; /* If node has the same tag name as the end tag token, then: */ if($token['name'] === $node->tagName) { /* Generate implied end tags. */ $this->generateImpliedEndTags(); /* If the tag name of the end tag token does not match the tag name of the current node, this is a parse error. */ // XERROR: implement this /* Pop all the nodes from the current node up to node, including node, then stop these steps. */ // XSKETCHY do { $pop = array_pop($this->stack); } while ($pop !== $node); break; } else { $category = $this->getElementCategory($node); if($category !== self::FORMATTING && $category !== self::PHRASING) { /* Otherwise, if node is in neither the formatting category nor the phrasing category, then this is a parse error. Stop this algorithm. The end tag token is ignored. */ $this->ignored = true; break; // parse error } } /* Set node to the previous entry in the stack of open elements. Loop. */ } break; } break; } break; case self::IN_CDATA_RCDATA: if ( $token['type'] === HTML5_Tokenizer::CHARACTER || $token['type'] === HTML5_Tokenizer::SPACECHARACTER ) { $this->insertText($token['data']); } elseif ($token['type'] === HTML5_Tokenizer::EOF) { // parse error /* If the current node is a script element, mark the script * element as "already executed". */ // probably not necessary array_pop($this->stack); $this->mode = $this->original_mode; $this->emitToken($token); } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'script') { array_pop($this->stack); $this->mode = $this->original_mode; // we're ignoring all of the execution stuff } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG) { array_pop($this->stack); $this->mode = $this->original_mode; } break; case self::IN_TABLE: $clear = array('html', 'table'); /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE */ if($token['type'] === HTML5_Tokenizer::SPACECHARACTER && /* If the current table is tainted, then act as described in * the "anything else" entry below. */ // Note: hsivonen has a test that fails due to this line // because he wants to convince Hixie not to do taint !$this->currentTableIsTainted()) { /* Append the character to the current node. */ $this->insertText($token['data']); /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertComment($token['data']); } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) { // parse error /* A start tag whose tag name is "caption" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'caption') { /* Clear the stack back to a table context. */ $this->clearStackToTableContext($clear); /* Insert a marker at the end of the list of active formatting elements. */ $this->a_formatting[] = self::MARKER; /* Insert an HTML element for the token, then switch the insertion mode to "in caption". */ $this->insertElement($token); $this->mode = self::IN_CAPTION; /* A start tag whose tag name is "colgroup" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'colgroup') { /* Clear the stack back to a table context. */ $this->clearStackToTableContext($clear); /* Insert an HTML element for the token, then switch the insertion mode to "in column group". */ $this->insertElement($token); $this->mode = self::IN_COLUMN_GROUP; /* A start tag whose tag name is "col" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'col') { $this->emitToken(array( 'name' => 'colgroup', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); $this->emitToken($token); /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'], array('tbody', 'tfoot', 'thead'))) { /* Clear the stack back to a table context. */ $this->clearStackToTableContext($clear); /* Insert an HTML element for the token, then switch the insertion mode to "in table body". */ $this->insertElement($token); $this->mode = self::IN_TABLE_BODY; /* A start tag whose tag name is one of: "td", "th", "tr" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'], array('td', 'th', 'tr'))) { /* Act as if a start tag token with the tag name "tbody" had been seen, then reprocess the current token. */ $this->emitToken(array( 'name' => 'tbody', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); $this->emitToken($token); /* A start tag whose tag name is "table" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'table') { /* Parse error. Act as if an end tag token with the tag name "table" had been seen, then, if that token wasn't ignored, reprocess the current token. */ $this->emitToken(array( 'name' => 'table', 'type' => HTML5_Tokenizer::ENDTAG )); if (!$this->ignored) $this->emitToken($token); /* An end tag whose tag name is "table" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table') { /* If the stack of open elements does not have an element in table scope with the same tag name as the token, this is a parse error. Ignore the token. (fragment case) */ if(!$this->elementInScope($token['name'], true)) { $this->ignored = true; /* Otherwise: */ } else { do { $node = array_pop($this->stack); } while ($node->tagName !== 'table'); /* Reset the insertion mode appropriately. */ $this->resetInsertionMode(); } /* An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'))) { // Parse error. Ignore the token. } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && ($token['name'] === 'style' || $token['name'] === 'script')) { $this->processWithRulesFor($token, self::IN_HEAD); } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'input' && // assignment is intentional /* If the token does not have an attribute with the name "type", or * if it does, but that attribute's value is not an ASCII * case-insensitive match for the string "hidden", then: act as * described in the "anything else" entry below. */ ($type = $this->getAttr($token, 'type')) && strtolower($type) === 'hidden') { // I.e., if its an input with the type attribute == 'hidden' /* Otherwise */ // parse error $this->insertElement($token); array_pop($this->stack); } elseif ($token['type'] === HTML5_Tokenizer::EOF) { /* If the current node is not the root html element, then this is a parse error. */ if (end($this->stack)->tagName !== 'html') { // Note: It can only be the current node in the fragment case. // parse error } /* Stop parsing. */ /* Anything else */ } else { /* Parse error. Process the token as if the insertion mode was "in body", with the following exception: */ $old = $this->foster_parent; $this->foster_parent = true; $this->processWithRulesFor($token, self::IN_BODY); $this->foster_parent = $old; } break; case self::IN_CAPTION: /* An end tag whose tag name is "caption" */ if($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'caption') { /* If the stack of open elements does not have an element in table scope with the same tag name as the token, this is a parse error. Ignore the token. (fragment case) */ if(!$this->elementInScope($token['name'], true)) { $this->ignored = true; // Ignore /* Otherwise: */ } else { /* Generate implied end tags. */ $this->generateImpliedEndTags(); /* Now, if the current node is not a caption element, then this is a parse error. */ // XERROR: implement /* Pop elements from this stack until a caption element has been popped from the stack. */ do { $node = array_pop($this->stack); } while ($node->tagName !== 'caption'); /* Clear the list of active formatting elements up to the last marker. */ $this->clearTheActiveFormattingElementsUpToTheLastMarker(); /* Switch the insertion mode to "in table". */ $this->mode = self::IN_TABLE; } /* A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag name is "table" */ } elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'], array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'))) || ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) { /* Parse error. Act as if an end tag with the tag name "caption" had been seen, then, if that token wasn't ignored, reprocess the current token. */ $this->emitToken(array( 'name' => 'caption', 'type' => HTML5_Tokenizer::ENDTAG )); if (!$this->ignored) $this->emitToken($token); /* An end tag whose tag name is one of: "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th', 'thead', 'tr'))) { // Parse error. Ignore the token. $this->ignored = true; /* Anything else */ } else { /* Process the token as if the insertion mode was "in body". */ $this->processWithRulesFor($token, self::IN_BODY); } break; case self::IN_COLUMN_GROUP: /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE */ if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Append the character to the current node. */ $this->insertText($token['data']); /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertToken($token['data']); } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) { // parse error } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') { $this->processWithRulesFor($token, self::IN_BODY); /* A start tag whose tag name is "col" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'col') { /* Insert a col element for the token. Immediately pop the current node off the stack of open elements. */ $this->insertElement($token); array_pop($this->stack); // XERROR: Acknowledge the token's self-closing flag, if it is set. /* An end tag whose tag name is "colgroup" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'colgroup') { /* If the current node is the root html element, then this is a parse error, ignore the token. (fragment case) */ if(end($this->stack)->tagName === 'html') { $this->ignored = true; /* Otherwise, pop the current node (which will be a colgroup element) from the stack of open elements. Switch the insertion mode to "in table". */ } else { array_pop($this->stack); $this->mode = self::IN_TABLE; } /* An end tag whose tag name is "col" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'col') { /* Parse error. Ignore the token. */ $this->ignored = true; /* An end-of-file token */ /* If the current node is the root html element */ } elseif($token['type'] === HTML5_Tokenizer::EOF && end($this->stack)->tagName === 'html') { /* Stop parsing */ /* Anything else */ } else { /* Act as if an end tag with the tag name "colgroup" had been seen, and then, if that token wasn't ignored, reprocess the current token. */ $this->emitToken(array( 'name' => 'colgroup', 'type' => HTML5_Tokenizer::ENDTAG )); if (!$this->ignored) $this->emitToken($token); } break; case self::IN_TABLE_BODY: $clear = array('tbody', 'tfoot', 'thead', 'html'); /* A start tag whose tag name is "tr" */ if($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'tr') { /* Clear the stack back to a table body context. */ $this->clearStackToTableContext($clear); /* Insert a tr element for the token, then switch the insertion mode to "in row". */ $this->insertElement($token); $this->mode = self::IN_ROW; /* A start tag whose tag name is one of: "th", "td" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && ($token['name'] === 'th' || $token['name'] === 'td')) { /* Parse error. Act as if a start tag with the tag name "tr" had been seen, then reprocess the current token. */ $this->emitToken(array( 'name' => 'tr', 'type' => HTML5_Tokenizer::STARTTAG, 'attr' => array() )); $this->emitToken($token); /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('tbody', 'tfoot', 'thead'))) { /* If the stack of open elements does not have an element in table scope with the same tag name as the token, this is a parse error. Ignore the token. */ if(!$this->elementInScope($token['name'], true)) { // Parse error $this->ignored = true; /* Otherwise: */ } else { /* Clear the stack back to a table body context. */ $this->clearStackToTableContext($clear); /* Pop the current node from the stack of open elements. Switch the insertion mode to "in table". */ array_pop($this->stack); $this->mode = self::IN_TABLE; } /* A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */ } elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'], array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) || ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) { /* If the stack of open elements does not have a tbody, thead, or tfoot element in table scope, this is a parse error. Ignore the token. (fragment case) */ if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) { // parse error $this->ignored = true; /* Otherwise: */ } else { /* Clear the stack back to a table body context. */ $this->clearStackToTableContext($clear); /* Act as if an end tag with the same tag name as the current node ("tbody", "tfoot", or "thead") had been seen, then reprocess the current token. */ $this->emitToken(array( 'name' => end($this->stack)->tagName, 'type' => HTML5_Tokenizer::ENDTAG )); $this->emitToken($token); } /* An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "td", "th", "tr" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) { /* Parse error. Ignore the token. */ $this->ignored = true; /* Anything else */ } else { /* Process the token as if the insertion mode was "in table". */ $this->processWithRulesFor($token, self::IN_TABLE); } break; case self::IN_ROW: $clear = array('tr', 'html'); /* A start tag whose tag name is one of: "th", "td" */ if($token['type'] === HTML5_Tokenizer::STARTTAG && ($token['name'] === 'th' || $token['name'] === 'td')) { /* Clear the stack back to a table row context. */ $this->clearStackToTableContext($clear); /* Insert an HTML element for the token, then switch the insertion mode to "in cell". */ $this->insertElement($token); $this->mode = self::IN_CELL; /* Insert a marker at the end of the list of active formatting elements. */ $this->a_formatting[] = self::MARKER; /* An end tag whose tag name is "tr" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'tr') { /* If the stack of open elements does not have an element in table scope with the same tag name as the token, this is a parse error. Ignore the token. (fragment case) */ if(!$this->elementInScope($token['name'], true)) { // Ignore. $this->ignored = true; /* Otherwise: */ } else { /* Clear the stack back to a table row context. */ $this->clearStackToTableContext($clear); /* Pop the current node (which will be a tr element) from the stack of open elements. Switch the insertion mode to "in table body". */ array_pop($this->stack); $this->mode = self::IN_TABLE_BODY; } /* A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */ } elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'], array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) || ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) { /* Act as if an end tag with the tag name "tr" had been seen, then, if that token wasn't ignored, reprocess the current token. */ $this->emitToken(array( 'name' => 'tr', 'type' => HTML5_Tokenizer::ENDTAG )); if (!$this->ignored) $this->emitToken($token); /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('tbody', 'tfoot', 'thead'))) { /* If the stack of open elements does not have an element in table scope with the same tag name as the token, this is a parse error. Ignore the token. */ if(!$this->elementInScope($token['name'], true)) { $this->ignored = true; /* Otherwise: */ } else { /* Otherwise, act as if an end tag with the tag name "tr" had been seen, then reprocess the current token. */ $this->emitToken(array( 'name' => 'tr', 'type' => HTML5_Tokenizer::ENDTAG )); $this->emitToken($token); } /* An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "td", "th" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) { /* Parse error. Ignore the token. */ $this->ignored = true; /* Anything else */ } else { /* Process the token as if the insertion mode was "in table". */ $this->processWithRulesFor($token, self::IN_TABLE); } break; case self::IN_CELL: /* An end tag whose tag name is one of: "td", "th" */ if($token['type'] === HTML5_Tokenizer::ENDTAG && ($token['name'] === 'td' || $token['name'] === 'th')) { /* If the stack of open elements does not have an element in table scope with the same tag name as that of the token, then this is a parse error and the token must be ignored. */ if(!$this->elementInScope($token['name'], true)) { $this->ignored = true; /* Otherwise: */ } else { /* Generate implied end tags, except for elements with the same tag name as the token. */ $this->generateImpliedEndTags(array($token['name'])); /* Now, if the current node is not an element with the same tag name as the token, then this is a parse error. */ // XERROR: Implement parse error code /* Pop elements from this stack until an element with the same tag name as the token has been popped from the stack. */ do { $node = array_pop($this->stack); } while ($node->tagName !== $token['name']); /* Clear the list of active formatting elements up to the last marker. */ $this->clearTheActiveFormattingElementsUpToTheLastMarker(); /* Switch the insertion mode to "in row". (The current node will be a tr element at this point.) */ $this->mode = self::IN_ROW; } /* A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'], array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'))) { /* If the stack of open elements does not have a td or th element in table scope, then this is a parse error; ignore the token. (fragment case) */ if(!$this->elementInScope(array('td', 'th'), true)) { // parse error $this->ignored = true; /* Otherwise, close the cell (see below) and reprocess the current token. */ } else { $this->closeCell(); $this->emitToken($token); } /* An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('body', 'caption', 'col', 'colgroup', 'html'))) { /* Parse error. Ignore the token. */ $this->ignored = true; /* An end tag whose tag name is one of: "table", "tbody", "tfoot", "thead", "tr" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('table', 'tbody', 'tfoot', 'thead', 'tr'))) { /* If the stack of open elements does not have a td or th element in table scope, then this is a parse error; ignore the token. (innerHTML case) */ if(!$this->elementInScope(array('td', 'th'), true)) { // Parse error $this->ignored = true; /* Otherwise, close the cell (see below) and reprocess the current token. */ } else { $this->closeCell(); $this->emitToken($token); } /* Anything else */ } else { /* Process the token as if the insertion mode was "in body". */ $this->processWithRulesFor($token, self::IN_BODY); } break; case self::IN_SELECT: /* Handle the token as follows: */ /* A character token */ if( $token['type'] === HTML5_Tokenizer::CHARACTER || $token['type'] === HTML5_Tokenizer::SPACECHARACTER ) { /* Append the token's character to the current node. */ $this->insertText($token['data']); /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertComment($token['data']); } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) { // parse error } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') { $this->processWithRulesFor($token, self::INBODY); /* A start tag token whose tag name is "option" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'option') { /* If the current node is an option element, act as if an end tag with the tag name "option" had been seen. */ if(end($this->stack)->tagName === 'option') { $this->emitToken(array( 'name' => 'option', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Insert an HTML element for the token. */ $this->insertElement($token); /* A start tag token whose tag name is "optgroup" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'optgroup') { /* If the current node is an option element, act as if an end tag with the tag name "option" had been seen. */ if(end($this->stack)->tagName === 'option') { $this->emitToken(array( 'name' => 'option', 'type' => HTML5_Tokenizer::ENDTAG )); } /* If the current node is an optgroup element, act as if an end tag with the tag name "optgroup" had been seen. */ if(end($this->stack)->tagName === 'optgroup') { $this->emitToken(array( 'name' => 'optgroup', 'type' => HTML5_Tokenizer::ENDTAG )); } /* Insert an HTML element for the token. */ $this->insertElement($token); /* An end tag token whose tag name is "optgroup" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'optgroup') { /* First, if the current node is an option element, and the node immediately before it in the stack of open elements is an optgroup element, then act as if an end tag with the tag name "option" had been seen. */ $elements_in_stack = count($this->stack); if($this->stack[$elements_in_stack - 1]->tagName === 'option' && $this->stack[$elements_in_stack - 2]->tagName === 'optgroup') { $this->emitToken(array( 'name' => 'option', 'type' => HTML5_Tokenizer::ENDTAG )); } /* If the current node is an optgroup element, then pop that node from the stack of open elements. Otherwise, this is a parse error, ignore the token. */ if(end($this->stack)->tagName === 'optgroup') { array_pop($this->stack); } else { // parse error $this->ignored = true; } /* An end tag token whose tag name is "option" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'option') { /* If the current node is an option element, then pop that node from the stack of open elements. Otherwise, this is a parse error, ignore the token. */ if(end($this->stack)->tagName === 'option') { array_pop($this->stack); } else { // parse error $this->ignored = true; } /* An end tag whose tag name is "select" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'select') { /* If the stack of open elements does not have an element in table scope with the same tag name as the token, this is a parse error. Ignore the token. (fragment case) */ if(!$this->elementInScope($token['name'], true)) { $this->ignored = true; // parse error /* Otherwise: */ } else { /* Pop elements from the stack of open elements until a select element has been popped from the stack. */ do { $node = array_pop($this->stack); } while ($node->tagName !== 'select'); /* Reset the insertion mode appropriately. */ $this->resetInsertionMode(); } /* A start tag whose tag name is "select" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'select') { /* Parse error. Act as if the token had been an end tag with the tag name "select" instead. */ $this->emitToken(array( 'name' => 'select', 'type' => HTML5_Tokenizer::ENDTAG )); } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && ($token['name'] === 'input' || $token['name'] === 'textarea')) { // parse error $this->emitToken(array( 'name' => 'select', 'type' => HTML5_Tokenizer::ENDTAG )); $this->emitToken($token); } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'script') { $this->processWithRulesFor($token, self::IN_HEAD); } elseif($token['type'] === HTML5_Tokenizer::EOF) { // XERROR: If the current node is not the root html element, then this is a parse error. /* Stop parsing */ /* Anything else */ } else { /* Parse error. Ignore the token. */ $this->ignored = true; } break; case self::IN_SELECT_IN_TABLE: if($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'], array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) { // parse error $this->emitToken(array( 'name' => 'select', 'type' => HTML5_Tokenizer::ENDTAG, )); $this->emitToken($token); /* An end tag whose tag name is one of: "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'], array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) { /* Parse error. */ // parse error /* If the stack of open elements has an element in table scope with the same tag name as that of the token, then act as if an end tag with the tag name "select" had been seen, and reprocess the token. Otherwise, ignore the token. */ if($this->elementInScope($token['name'], true)) { $this->emitToken(array( 'name' => 'select', 'type' => HTML5_Tokenizer::ENDTAG )); $this->emitToken($token); } else { $this->ignored = true; } } else { $this->processWithRulesFor($token, self::IN_SELECT); } break; case self::IN_FOREIGN_CONTENT: if ($token['type'] === HTML5_Tokenizer::CHARACTER || $token['type'] === HTML5_Tokenizer::SPACECHARACTER) { $this->insertText($token['data']); } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) { $this->insertComment($token['data']); } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) { // XERROR: parse error } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'script' && end($this->stack)->tagName === 'script' && end($this->stack)->namespaceURI === self::NS_SVG) { array_pop($this->stack); // a bunch of script running mumbo jumbo } elseif ( ($token['type'] === HTML5_Tokenizer::STARTTAG && (( $token['name'] !== 'mglyph' && $token['name'] !== 'malignmark' && end($this->stack)->namespaceURI === self::NS_MATHML && in_array(end($this->stack)->tagName, array('mi', 'mo', 'mn', 'ms', 'mtext')) ) || ( $token['name'] === 'svg' && end($this->stack)->namespaceURI === self::NS_MATHML && end($this->stack)->tagName === 'annotation-xml' ) || ( end($this->stack)->namespaceURI === self::NS_SVG && in_array(end($this->stack)->tagName, array('foreignObject', 'desc', 'title')) ) || ( // XSKETCHY end($this->stack)->namespaceURI === self::NS_HTML )) ) || $token['type'] === HTML5_Tokenizer::ENDTAG ) { $this->processWithRulesFor($token, $this->secondary_mode); /* If, after doing so, the insertion mode is still "in foreign * content", but there is no element in scope that has a namespace * other than the HTML namespace, switch the insertion mode to the * secondary insertion mode. */ if ($this->mode === self::IN_FOREIGN_CONTENT) { $found = false; // this basically duplicates elementInScope() for ($i = count($this->stack) - 1; $i >= 0; $i--) { $node = $this->stack[$i]; if ($node->namespaceURI !== self::NS_HTML) { $found = true; break; } elseif (in_array($node->tagName, array('table', 'html', 'applet', 'caption', 'td', 'th', 'button', 'marquee', 'object')) || ($node->tagName === 'foreignObject' && $node->namespaceURI === self::NS_SVG)) { break; } } if (!$found) { $this->mode = $this->secondary_mode; } } } elseif ($token['type'] === HTML5_Tokenizer::EOF || ( $token['type'] === HTML5_Tokenizer::STARTTAG && (in_array($token['name'], array('b', "big", "blockquote", "body", "br", "center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", "p", "pre", "ruby", "s", "small", "span", "strong", "strike", "sub", "sup", "table", "tt", "u", "ul", "var")) || ($token['name'] === 'font' && ($this->getAttr($token, 'color') || $this->getAttr($token, 'face') || $this->getAttr($token, 'size')))))) { // XERROR: parse error do { $node = array_pop($this->stack); } while ($node->namespaceURI !== self::NS_HTML); $this->stack[] = $node; $this->mode = $this->secondary_mode; $this->emitToken($token); } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG) { static $svg_lookup = array( 'altglyph' => 'altGlyph', 'altglyphdef' => 'altGlyphDef', 'altglyphitem' => 'altGlyphItem', 'animatecolor' => 'animateColor', 'animatemotion' => 'animateMotion', 'animatetransform' => 'animateTransform', 'clippath' => 'clipPath', 'feblend' => 'feBlend', 'fecolormatrix' => 'feColorMatrix', 'fecomponenttransfer' => 'feComponentTransfer', 'fecomposite' => 'feComposite', 'feconvolvematrix' => 'feConvolveMatrix', 'fediffuselighting' => 'feDiffuseLighting', 'fedisplacementmap' => 'feDisplacementMap', 'fedistantlight' => 'feDistantLight', 'feflood' => 'feFlood', 'fefunca' => 'feFuncA', 'fefuncb' => 'feFuncB', 'fefuncg' => 'feFuncG', 'fefuncr' => 'feFuncR', 'fegaussianblur' => 'feGaussianBlur', 'feimage' => 'feImage', 'femerge' => 'feMerge', 'femergenode' => 'feMergeNode', 'femorphology' => 'feMorphology', 'feoffset' => 'feOffset', 'fepointlight' => 'fePointLight', 'fespecularlighting' => 'feSpecularLighting', 'fespotlight' => 'feSpotLight', 'fetile' => 'feTile', 'feturbulence' => 'feTurbulence', 'foreignobject' => 'foreignObject', 'glyphref' => 'glyphRef', 'lineargradient' => 'linearGradient', 'radialgradient' => 'radialGradient', 'textpath' => 'textPath', ); $current = end($this->stack); if ($current->namespaceURI === self::NS_MATHML) { $token = $this->adjustMathMLAttributes($token); } if ($current->namespaceURI === self::NS_SVG && isset($svg_lookup[$token['name']])) { $token['name'] = $svg_lookup[$token['name']]; } if ($current->namespaceURI === self::NS_SVG) { $token = $this->adjustSVGAttributes($token); } $token = $this->adjustForeignAttributes($token); $this->insertForeignElement($token, $current->namespaceURI); if (isset($token['self-closing'])) { array_pop($this->stack); // XERROR: acknowledge self-closing flag } } break; case self::AFTER_BODY: /* Handle the token as follows: */ /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE */ if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Process the token as it would be processed if the insertion mode was "in body". */ $this->processWithRulesFor($token, self::IN_BODY); /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the first element in the stack of open elements (the html element), with the data attribute set to the data given in the comment token. */ $comment = $this->dom->createComment($token['data']); $this->stack[0]->appendChild($comment); } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) { // parse error } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') { $this->processWithRulesFor($token, self::IN_BODY); /* An end tag with the tag name "html" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'html') { /* If the parser was originally created as part of the HTML * fragment parsing algorithm, this is a parse error; ignore * the token. (fragment case) */ $this->ignored = true; // XERROR: implement this $this->mode = self::AFTER_AFTER_BODY; } elseif($token['type'] === HTML5_Tokenizer::EOF) { /* Stop parsing */ /* Anything else */ } else { /* Parse error. Set the insertion mode to "in body" and reprocess the token. */ $this->mode = self::IN_BODY; $this->emitToken($token); } break; case self::IN_FRAMESET: /* Handle the token as follows: */ /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Append the character to the current node. */ $this->insertText($token['data']); /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertComment($token['data']); } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) { // parse error /* A start tag with the tag name "frameset" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'frameset') { $this->insertElement($token); /* An end tag with the tag name "frameset" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'frameset') { /* If the current node is the root html element, then this is a parse error; ignore the token. (fragment case) */ if(end($this->stack)->tagName === 'html') { $this->ignored = true; // Parse error } else { /* Otherwise, pop the current node from the stack of open elements. */ array_pop($this->stack); /* If the parser was not originally created as part of the HTML * fragment parsing algorithm (fragment case), and the current * node is no longer a frameset element, then switch the * insertion mode to "after frameset". */ $this->mode = self::AFTER_FRAMESET; } /* A start tag with the tag name "frame" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'frame') { /* Insert an HTML element for the token. */ $this->insertElement($token); /* Immediately pop the current node off the stack of open elements. */ array_pop($this->stack); // XERROR: Acknowledge the token's self-closing flag, if it is set. /* A start tag with the tag name "noframes" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'noframes') { /* Process the token using the rules for the "in head" insertion mode. */ $this->processwithRulesFor($token, self::IN_HEAD); } elseif($token['type'] === HTML5_Tokenizer::EOF) { // XERROR: If the current node is not the root html element, then this is a parse error. /* Stop parsing */ /* Anything else */ } else { /* Parse error. Ignore the token. */ $this->ignored = true; } break; case self::AFTER_FRAMESET: /* Handle the token as follows: */ /* A character token that is one of one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) { /* Append the character to the current node. */ $this->insertText($token['data']); /* A comment token */ } elseif($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the current node with the data attribute set to the data given in the comment token. */ $this->insertComment($token['data']); } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) { // parse error } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') { $this->processWithRulesFor($token, self::IN_BODY); /* An end tag with the tag name "html" */ } elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'html') { $this->mode = self::AFTER_AFTER_FRAMESET; /* A start tag with the tag name "noframes" */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'noframes') { $this->processWithRulesFor($token, self::IN_HEAD); } elseif($token['type'] === HTML5_Tokenizer::EOF) { /* Stop parsing */ /* Anything else */ } else { /* Parse error. Ignore the token. */ $this->ignored = true; } break; case self::AFTER_AFTER_BODY: /* A comment token */ if($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the Document object with the data attribute set to the data given in the comment token. */ $comment = $this->dom->createComment($token['data']); $this->dom->appendChild($comment); } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE || $token['type'] === HTML5_Tokenizer::SPACECHARACTER || ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html')) { $this->processWithRulesFor($token, self::IN_BODY); /* An end-of-file token */ } elseif($token['type'] === HTML5_Tokenizer::EOF) { /* OMG DONE!! */ } else { // parse error $this->mode = self::IN_BODY; $this->emitToken($token); } break; case self::AFTER_AFTER_FRAMESET: /* A comment token */ if($token['type'] === HTML5_Tokenizer::COMMENT) { /* Append a Comment node to the Document object with the data attribute set to the data given in the comment token. */ $comment = $this->dom->createComment($token['data']); $this->dom->appendChild($comment); } elseif($token['type'] === HTML5_Tokenizer::DOCTYPE || $token['type'] === HTML5_Tokenizer::SPACECHARACTER || ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html')) { $this->processWithRulesFor($token, self::IN_BODY); /* An end-of-file token */ } elseif($token['type'] === HTML5_Tokenizer::EOF) { /* OMG DONE!! */ } elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'nofrmaes') { $this->processWithRulesFor($token, self::IN_HEAD); } else { // parse error } break; } // end funky indenting } private function insertElement($token, $append = true) { $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); if (!empty($token['attr'])) { foreach($token['attr'] as $attr) { // mike@macgirvin.com 2011-11-17, check attribute name for // validity (ignoring extenders and combiners) as illegal chars in names // causes everything to abort $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name'],$matches); if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) { $el->setAttribute($attr['name'], $attr['value']); } } } if ($append) { $this->appendToRealParent($el); $this->stack[] = $el; } return $el; } private function insertText($data) { if ($data === '') return; if ($this->ignore_lf_token) { if ($data[0] === "\n") { $data = substr($data, 1); if ($data === false) return; } } $text = $this->dom->createTextNode($data); $this->appendToRealParent($text); } private function insertComment($data) { $comment = $this->dom->createComment($data); $this->appendToRealParent($comment); } private function appendToRealParent($node) { // this is only for the foster_parent case /* If the current node is a table, tbody, tfoot, thead, or tr element, then, whenever a node would be inserted into the current node, it must instead be inserted into the foster parent element. */ if(!$this->foster_parent || !in_array(end($this->stack)->tagName, array('table', 'tbody', 'tfoot', 'thead', 'tr'))) { end($this->stack)->appendChild($node); } else { $this->fosterParent($node); } } private function elementInScope($el, $table = false) { if(is_array($el)) { foreach($el as $element) { if($this->elementInScope($element, $table)) { return true; } } return false; } $leng = count($this->stack); for($n = 0; $n < $leng; $n++) { /* 1. Initialise node to be the current node (the bottommost node of the stack). */ $node = $this->stack[$leng - 1 - $n]; if($node->tagName === $el) { /* 2. If node is the target node, terminate in a match state. */ return true; // these are the common states for "in scope" and "in table scope" } elseif($node->tagName === 'table' || $node->tagName === 'html') { return false; // these are only valid for "in scope" } elseif(!$table && (in_array($node->tagName, array('applet', 'caption', 'td', 'th', 'button', 'marquee', 'object')) || $node->tagName === 'foreignObject' && $node->namespaceURI === self::NS_SVG)) { return false; } /* Otherwise, set node to the previous entry in the stack of open elements and return to step 2. (This will never fail, since the loop will always terminate in the previous step if the top of the stack is reached.) */ } } private function reconstructActiveFormattingElements() { /* 1. If there are no entries in the list of active formatting elements, then there is nothing to reconstruct; stop this algorithm. */ $formatting_elements = count($this->a_formatting); if($formatting_elements === 0) { return false; } /* 3. Let entry be the last (most recently added) element in the list of active formatting elements. */ $entry = end($this->a_formatting); /* 2. If the last (most recently added) entry in the list of active formatting elements is a marker, or if it is an element that is in the stack of open elements, then there is nothing to reconstruct; stop this algorithm. */ if($entry === self::MARKER || in_array($entry, $this->stack, true)) { return false; } for($a = $formatting_elements - 1; $a >= 0; true) { /* 4. If there are no entries before entry in the list of active formatting elements, then jump to step 8. */ if($a === 0) { $step_seven = false; break; } /* 5. Let entry be the entry one earlier than entry in the list of active formatting elements. */ $a--; $entry = $this->a_formatting[$a]; /* 6. If entry is neither a marker nor an element that is also in thetack of open elements, go to step 4. */ if($entry === self::MARKER || in_array($entry, $this->stack, true)) { break; } } while(true) { /* 7. Let entry be the element one later than entry in the list of active formatting elements. */ if(isset($step_seven) && $step_seven === true) { $a++; $entry = $this->a_formatting[$a]; } /* 8. Perform a shallow clone of the element entry to obtain clone. */ $clone = $entry->cloneNode(); /* 9. Append clone to the current node and push it onto the stack of open elements so that it is the new current node. */ $this->appendToRealParent($clone); $this->stack[] = $clone; /* 10. Replace the entry for entry in the list with an entry for clone. */ $this->a_formatting[$a] = $clone; /* 11. If the entry for clone in the list of active formatting elements is not the last entry in the list, return to step 7. */ if(end($this->a_formatting) !== $clone) { $step_seven = true; } else { break; } } } private function clearTheActiveFormattingElementsUpToTheLastMarker() { /* When the steps below require the UA to clear the list of active formatting elements up to the last marker, the UA must perform the following steps: */ while(true) { /* 1. Let entry be the last (most recently added) entry in the list of active formatting elements. */ $entry = end($this->a_formatting); /* 2. Remove entry from the list of active formatting elements. */ array_pop($this->a_formatting); /* 3. If entry was a marker, then stop the algorithm at this point. The list has been cleared up to the last marker. */ if($entry === self::MARKER) { break; } } } private function generateImpliedEndTags($exclude = array()) { /* When the steps below require the UA to generate implied end tags, then, if the current node is a dd element, a dt element, an li element, a p element, a td element, a th element, or a tr element, the UA must act as if an end tag with the respective tag name had been seen and then generate implied end tags again. */ $node = end($this->stack); $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude); while(in_array(end($this->stack)->tagName, $elements)) { array_pop($this->stack); } } private function getElementCategory($node) { if (!is_object($node)) debug_print_backtrace(); $name = $node->tagName; if(in_array($name, $this->special)) return self::SPECIAL; elseif(in_array($name, $this->scoping)) return self::SCOPING; elseif(in_array($name, $this->formatting)) return self::FORMATTING; else return self::PHRASING; } private function clearStackToTableContext($elements) { /* When the steps above require the UA to clear the stack back to a table context, it means that the UA must, while the current node is not a table element or an html element, pop elements from the stack of open elements. */ while(true) { $name = end($this->stack)->tagName; if(in_array($name, $elements)) { break; } else { array_pop($this->stack); } } } private function resetInsertionMode($context = null) { /* 1. Let last be false. */ $last = false; $leng = count($this->stack); for($n = $leng - 1; $n >= 0; $n--) { /* 2. Let node be the last node in the stack of open elements. */ $node = $this->stack[$n]; /* 3. If node is the first node in the stack of open elements, then * set last to true and set node to the context element. (fragment * case) */ if($this->stack[0]->isSameNode($node)) { $last = true; $node = $context; } /* 4. If node is a select element, then switch the insertion mode to "in select" and abort these steps. (fragment case) */ if($node->tagName === 'select') { $this->mode = self::IN_SELECT; break; /* 5. If node is a td or th element, then switch the insertion mode to "in cell" and abort these steps. */ } elseif($node->tagName === 'td' || $node->nodeName === 'th') { $this->mode = self::IN_CELL; break; /* 6. If node is a tr element, then switch the insertion mode to "in row" and abort these steps. */ } elseif($node->tagName === 'tr') { $this->mode = self::IN_ROW; break; /* 7. If node is a tbody, thead, or tfoot element, then switch the insertion mode to "in table body" and abort these steps. */ } elseif(in_array($node->tagName, array('tbody', 'thead', 'tfoot'))) { $this->mode = self::IN_TABLE_BODY; break; /* 8. If node is a caption element, then switch the insertion mode to "in caption" and abort these steps. */ } elseif($node->tagName === 'caption') { $this->mode = self::IN_CAPTION; break; /* 9. If node is a colgroup element, then switch the insertion mode to "in column group" and abort these steps. (innerHTML case) */ } elseif($node->tagName === 'colgroup') { $this->mode = self::IN_COLUMN_GROUP; break; /* 10. If node is a table element, then switch the insertion mode to "in table" and abort these steps. */ } elseif($node->tagName === 'table') { $this->mode = self::IN_TABLE; break; /* 11. If node is an element from the MathML namespace or the SVG * namespace, then switch the insertion mode to "in foreign * content", let the secondary insertion mode be "in body", and * abort these steps. */ } elseif($node->namespaceURI === self::NS_SVG || $node->namespaceURI === self::NS_MATHML) { $this->mode = self::IN_FOREIGN_CONTENT; $this->secondary_mode = self::IN_BODY; break; /* 12. If node is a head element, then switch the insertion mode to "in body" ("in body"! not "in head"!) and abort these steps. (fragment case) */ } elseif($node->tagName === 'head') { $this->mode = self::IN_BODY; break; /* 13. If node is a body element, then switch the insertion mode to "in body" and abort these steps. */ } elseif($node->tagName === 'body') { $this->mode = self::IN_BODY; break; /* 14. If node is a frameset element, then switch the insertion mode to "in frameset" and abort these steps. (fragment case) */ } elseif($node->tagName === 'frameset') { $this->mode = self::IN_FRAMESET; break; /* 15. If node is an html element, then: if the head element pointer is null, switch the insertion mode to "before head", otherwise, switch the insertion mode to "after head". In either case, abort these steps. (fragment case) */ } elseif($node->tagName === 'html') { $this->mode = ($this->head_pointer === null) ? self::BEFORE_HEAD : self::AFTER_HEAD; break; /* 16. If last is true, then set the insertion mode to "in body" and abort these steps. (fragment case) */ } elseif($last) { $this->mode = self::IN_BODY; break; } } } private function closeCell() { /* If the stack of open elements has a td or th element in table scope, then act as if an end tag token with that tag name had been seen. */ foreach(array('td', 'th') as $cell) { if($this->elementInScope($cell, true)) { $this->emitToken(array( 'name' => $cell, 'type' => HTML5_Tokenizer::ENDTAG )); break; } } } private function processWithRulesFor($token, $mode) { /* "using the rules for the m insertion mode", where m is one of these * modes, the user agent must use the rules described under the m * insertion mode's section, but must leave the insertion mode * unchanged unless the rules in m themselves switch the insertion mode * to a new value. */ return $this->emitToken($token, $mode); } private function insertCDATAElement($token) { $this->insertElement($token); $this->original_mode = $this->mode; $this->mode = self::IN_CDATA_RCDATA; $this->content_model = HTML5_Tokenizer::CDATA; } private function insertRCDATAElement($token) { $this->insertElement($token); $this->original_mode = $this->mode; $this->mode = self::IN_CDATA_RCDATA; $this->content_model = HTML5_Tokenizer::RCDATA; } private function getAttr($token, $key) { if (!isset($token['attr'])) return false; $ret = false; foreach ($token['attr'] as $keypair) { if ($keypair['name'] === $key) $ret = $keypair['value']; } return $ret; } private function getCurrentTable() { /* The current table is the last table element in the stack of open * elements, if there is one. If there is no table element in the stack * of open elements (fragment case), then the current table is the * first element in the stack of open elements (the html element). */ for ($i = count($this->stack) - 1; $i >= 0; $i--) { if ($this->stack[$i]->tagName === 'table') { return $this->stack[$i]; } } return $this->stack[0]; } private function getFosterParent() { /* The foster parent element is the parent element of the last table element in the stack of open elements, if there is a table element and it has such a parent element. If there is no table element in the stack of open elements (innerHTML case), then the foster parent element is the first element in the stack of open elements (the html element). Otherwise, if there is a table element in the stack of open elements, but the last table element in the stack of open elements has no parent, or its parent node is not an element, then the foster parent element is the element before the last table element in the stack of open elements. */ for($n = count($this->stack) - 1; $n >= 0; $n--) { if($this->stack[$n]->tagName === 'table') { $table = $this->stack[$n]; break; } } if(isset($table) && $table->parentNode !== null) { return $table->parentNode; } elseif(!isset($table)) { return $this->stack[0]; } elseif(isset($table) && ($table->parentNode === null || $table->parentNode->nodeType !== XML_ELEMENT_NODE)) { return $this->stack[$n - 1]; } } public function fosterParent($node) { $foster_parent = $this->getFosterParent(); $table = $this->getCurrentTable(); // almost equivalent to last table element, except it can be html /* When a node node is to be foster parented, the node node must be * inserted into the foster parent element, and the current table must * be marked as tainted. (Once the current table has been tainted, * whitespace characters are inserted into the foster parent element * instead of the current node.) */ $table->tainted = true; /* If the foster parent element is the parent element of the last table * element in the stack of open elements, then node must be inserted * immediately before the last table element in the stack of open * elements in the foster parent element; otherwise, node must be * appended to the foster parent element. */ if ($table->tagName === 'table' && $table->parentNode->isSameNode($foster_parent)) { $foster_parent->insertBefore($node, $table); } else { $foster_parent->appendChild($node); } } /** * For debugging, prints the stack */ private function printStack() { $names = array(); foreach ($this->stack as $i => $element) { $names[] = $element->tagName; } echo " -> stack [" . implode(', ', $names) . "]\n"; } /** * For debugging, prints active formatting elements */ private function printActiveFormattingElements() { if (!$this->a_formatting) return; $names = array(); foreach ($this->a_formatting as $node) { if ($node === self::MARKER) $names[] = 'MARKER'; else $names[] = $node->tagName; } echo " -> active formatting [" . implode(', ', $names) . "]\n"; } public function currentTableIsTainted() { return !empty($this->getCurrentTable()->tainted); } /** * Sets up the tree constructor for building a fragment. */ public function setupContext($context = null) { $this->fragment = true; if ($context) { $context = $this->dom->createElementNS(self::NS_HTML, $context); /* 4.1. Set the HTML parser's tokenization stage's content model * flag according to the context element, as follows: */ switch ($context->tagName) { case 'title': case 'textarea': $this->content_model = HTML5_Tokenizer::RCDATA; break; case 'style': case 'script': case 'xmp': case 'iframe': case 'noembed': case 'noframes': $this->content_model = HTML5_Tokenizer::CDATA; break; case 'noscript': // XSCRIPT: assuming scripting is enabled $this->content_model = HTML5_Tokenizer::CDATA; break; case 'plaintext': $this->content_model = HTML5_Tokenizer::PLAINTEXT; break; } /* 4.2. Let root be a new html element with no attributes. */ $root = $this->dom->createElementNS(self::NS_HTML, 'html'); $this->root = $root; /* 4.3 Append the element root to the Document node created above. */ $this->dom->appendChild($root); /* 4.4 Set up the parser's stack of open elements so that it * contains just the single element root. */ $this->stack = array($root); /* 4.5 Reset the parser's insertion mode appropriately. */ $this->resetInsertionMode($context); /* 4.6 Set the parser's form element pointer to the nearest node * to the context element that is a form element (going straight up * the ancestor chain, and including the element itself, if it is a * form element), or, if there is no such form element, to null. */ $node = $context; do { if ($node->tagName === 'form') { $this->form_pointer = $node; break; } } while ($node = $node->parentNode); } } public function adjustMathMLAttributes($token) { foreach ($token['attr'] as &$kp) { if ($kp['name'] === 'definitionurl') { $kp['name'] = 'definitionURL'; } } return $token; } public function adjustSVGAttributes($token) { static $lookup = array( 'attributename' => 'attributeName', 'attributetype' => 'attributeType', 'basefrequency' => 'baseFrequency', 'baseprofile' => 'baseProfile', 'calcmode' => 'calcMode', 'clippathunits' => 'clipPathUnits', 'contentscripttype' => 'contentScriptType', 'contentstyletype' => 'contentStyleType', 'diffuseconstant' => 'diffuseConstant', 'edgemode' => 'edgeMode', 'externalresourcesrequired' => 'externalResourcesRequired', 'filterres' => 'filterRes', 'filterunits' => 'filterUnits', 'glyphref' => 'glyphRef', 'gradienttransform' => 'gradientTransform', 'gradientunits' => 'gradientUnits', 'kernelmatrix' => 'kernelMatrix', 'kernelunitlength' => 'kernelUnitLength', 'keypoints' => 'keyPoints', 'keysplines' => 'keySplines', 'keytimes' => 'keyTimes', 'lengthadjust' => 'lengthAdjust', 'limitingconeangle' => 'limitingConeAngle', 'markerheight' => 'markerHeight', 'markerunits' => 'markerUnits', 'markerwidth' => 'markerWidth', 'maskcontentunits' => 'maskContentUnits', 'maskunits' => 'maskUnits', 'numoctaves' => 'numOctaves', 'pathlength' => 'pathLength', 'patterncontentunits' => 'patternContentUnits', 'patterntransform' => 'patternTransform', 'patternunits' => 'patternUnits', 'pointsatx' => 'pointsAtX', 'pointsaty' => 'pointsAtY', 'pointsatz' => 'pointsAtZ', 'preservealpha' => 'preserveAlpha', 'preserveaspectratio' => 'preserveAspectRatio', 'primitiveunits' => 'primitiveUnits', 'refx' => 'refX', 'refy' => 'refY', 'repeatcount' => 'repeatCount', 'repeatdur' => 'repeatDur', 'requiredextensions' => 'requiredExtensions', 'requiredfeatures' => 'requiredFeatures', 'specularconstant' => 'specularConstant', 'specularexponent' => 'specularExponent', 'spreadmethod' => 'spreadMethod', 'startoffset' => 'startOffset', 'stddeviation' => 'stdDeviation', 'stitchtiles' => 'stitchTiles', 'surfacescale' => 'surfaceScale', 'systemlanguage' => 'systemLanguage', 'tablevalues' => 'tableValues', 'targetx' => 'targetX', 'targety' => 'targetY', 'textlength' => 'textLength', 'viewbox' => 'viewBox', 'viewtarget' => 'viewTarget', 'xchannelselector' => 'xChannelSelector', 'ychannelselector' => 'yChannelSelector', 'zoomandpan' => 'zoomAndPan', ); foreach ($token['attr'] as &$kp) { if (isset($lookup[$kp['name']])) { $kp['name'] = $lookup[$kp['name']]; } } return $token; } public function adjustForeignAttributes($token) { static $lookup = array( 'xlink:actuate' => array('xlink', 'actuate', self::NS_XLINK), 'xlink:arcrole' => array('xlink', 'arcrole', self::NS_XLINK), 'xlink:href' => array('xlink', 'href', self::NS_XLINK), 'xlink:role' => array('xlink', 'role', self::NS_XLINK), 'xlink:show' => array('xlink', 'show', self::NS_XLINK), 'xlink:title' => array('xlink', 'title', self::NS_XLINK), 'xlink:type' => array('xlink', 'type', self::NS_XLINK), 'xml:base' => array('xml', 'base', self::NS_XML), 'xml:lang' => array('xml', 'lang', self::NS_XML), 'xml:space' => array('xml', 'space', self::NS_XML), 'xmlns' => array(null, 'xmlns', self::NS_XMLNS), 'xmlns:xlink' => array('xmlns', 'xlink', self::NS_XMLNS), ); foreach ($token['attr'] as &$kp) { if (isset($lookup[$kp['name']])) { $kp['name'] = $lookup[$kp['name']]; } } return $token; } public function insertForeignElement($token, $namespaceURI) { $el = $this->dom->createElementNS($namespaceURI, $token['name']); if (!empty($token['attr'])) { foreach ($token['attr'] as $kp) { $attr = $kp['name']; if (is_array($attr)) { $ns = $attr[2]; $attr = $attr[1]; } else { $ns = self::NS_HTML; } if (!$el->hasAttributeNS($ns, $attr)) { // XSKETCHY: work around godawful libxml bug if ($ns === self::NS_XLINK) { $el->setAttribute('xlink:'.$attr, $kp['value']); } elseif ($ns === self::NS_HTML) { // Another godawful libxml bug $el->setAttribute($attr, $kp['value']); } else { $el->setAttributeNS($ns, $attr, $kp['value']); } } } } $this->appendToRealParent($el); $this->stack[] = $el; // XERROR: see below /* If the newly created element has an xmlns attribute in the XMLNS * namespace whose value is not exactly the same as the element's * namespace, that is a parse error. Similarly, if the newly created * element has an xmlns:xlink attribute in the XMLNS namespace whose * value is not the XLink Namespace, that is a parse error. */ } public function save() { $this->dom->normalize(); if (!$this->fragment) { return $this->dom; } else { if ($this->root) { return $this->root->childNodes; } else { return $this->dom->childNodes; } } } }