diff options
author | Klaus Weidenbach <Klaus.Weidenbach@gmx.net> | 2017-03-18 17:50:05 +0100 |
---|---|---|
committer | Klaus Weidenbach <Klaus.Weidenbach@gmx.net> | 2017-03-26 00:41:27 +0100 |
commit | f718e2b0db0fe3477212a8dd6c3ec067f4432862 (patch) | |
tree | 8dfbd3b3d4bdcd967b50f1ee4655440bcdef5bb8 /library/HTMLPurifier/Lexer | |
parent | 2115eb26a7fd2ca937286bd4e98ab74c7d6e9525 (diff) | |
download | volse-hubzilla-f718e2b0db0fe3477212a8dd6c3ec067f4432862.tar.gz volse-hubzilla-f718e2b0db0fe3477212a8dd6c3ec067f4432862.tar.bz2 volse-hubzilla-f718e2b0db0fe3477212a8dd6c3ec067f4432862.zip |
:arrow_up: Update HTML Purifier library.
Updated HTML Purifier from 4.6.0 to 4.9.2 with better PHP7 compatibility.
Used composer to manage this library.
Diffstat (limited to 'library/HTMLPurifier/Lexer')
-rw-r--r-- | library/HTMLPurifier/Lexer/DOMLex.php | 280 | ||||
-rw-r--r-- | library/HTMLPurifier/Lexer/DirectLex.php | 539 | ||||
-rw-r--r-- | library/HTMLPurifier/Lexer/PH5P.php | 4788 |
3 files changed, 0 insertions, 5607 deletions
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php deleted file mode 100644 index 720754454..000000000 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ /dev/null @@ -1,280 +0,0 @@ -<?php - -/** - * Parser that uses PHP 5's DOM extension (part of the core). - * - * In PHP 5, the DOM XML extension was revamped into DOM and added to the core. - * It gives us a forgiving HTML parser, which we use to transform the HTML - * into a DOM, and then into the tokens. It is blazingly fast (for large - * documents, it performs twenty times faster than - * HTMLPurifier_Lexer_DirectLex,and is the default choice for PHP 5. - * - * @note Any empty elements will have empty tokens associated with them, even if - * this is prohibited by the spec. This is cannot be fixed until the spec - * comes into play. - * - * @note PHP's DOM extension does not actually parse any entities, we use - * our own function to do that. - * - * @warning DOM tends to drop whitespace, which may wreak havoc on indenting. - * If this is a huge problem, due to the fact that HTML is hand - * edited and you are unable to get a parser cache that caches the - * the output of HTML Purifier while keeping the original HTML lying - * around, you may want to run Tidy on the resulting output or use - * HTMLPurifier_DirectLex - */ - -class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer -{ - - /** - * @type HTMLPurifier_TokenFactory - */ - private $factory; - - public function __construct() - { - // setup the factory - parent::__construct(); - $this->factory = new HTMLPurifier_TokenFactory(); - } - - /** - * @param string $html - * @param HTMLPurifier_Config $config - * @param HTMLPurifier_Context $context - * @return HTMLPurifier_Token[] - */ - public function tokenizeHTML($html, $config, $context) - { - $html = $this->normalize($html, $config, $context); - - // attempt to armor stray angled brackets that cannot possibly - // form tags and thus are probably being used as emoticons - if ($config->get('Core.AggressivelyFixLt')) { - $char = '[^a-z!\/]'; - $comment = "/<!--(.*?)(-->|\z)/is"; - $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); - do { - $old = $html; - $html = preg_replace("/<($char)/i", '<\\1', $html); - } while ($html !== $old); - $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments - } - - // preprocess html, essential for UTF-8 - $html = $this->wrapHTML($html, $config, $context); - - $doc = new DOMDocument(); - $doc->encoding = 'UTF-8'; // theoretically, the above has this covered - - set_error_handler(array($this, 'muteErrorHandler')); - $doc->loadHTML($html); - restore_error_handler(); - - $tokens = array(); - $this->tokenizeDOM( - $doc->getElementsByTagName('html')->item(0)-> // <html> - getElementsByTagName('body')->item(0)-> // <body> - getElementsByTagName('div')->item(0), // <div> - $tokens - ); - return $tokens; - } - - /** - * Iterative function that tokenizes a node, putting it into an accumulator. - * To iterate is human, to recurse divine - L. Peter Deutsch - * @param DOMNode $node DOMNode to be tokenized. - * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. - * @return HTMLPurifier_Token of node appended to previously passed tokens. - */ - protected function tokenizeDOM($node, &$tokens) - { - $level = 0; - $nodes = array($level => new HTMLPurifier_Queue(array($node))); - $closingNodes = array(); - do { - while (!$nodes[$level]->isEmpty()) { - $node = $nodes[$level]->shift(); // FIFO - $collect = $level > 0 ? true : false; - $needEndingTag = $this->createStartNode($node, $tokens, $collect); - if ($needEndingTag) { - $closingNodes[$level][] = $node; - } - if ($node->childNodes && $node->childNodes->length) { - $level++; - $nodes[$level] = new HTMLPurifier_Queue(); - foreach ($node->childNodes as $childNode) { - $nodes[$level]->push($childNode); - } - } - } - $level--; - if ($level && isset($closingNodes[$level])) { - while ($node = array_pop($closingNodes[$level])) { - $this->createEndNode($node, $tokens); - } - } - } while ($level > 0); - } - - /** - * @param DOMNode $node DOMNode to be tokenized. - * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. - * @param bool $collect Says whether or start and close are collected, set to - * false at first recursion because it's the implicit DIV - * tag you're dealing with. - * @return bool if the token needs an endtoken - * @todo data and tagName properties don't seem to exist in DOMNode? - */ - protected function createStartNode($node, &$tokens, $collect) - { - // intercept non element nodes. WE MUST catch all of them, - // but we're not getting the character reference nodes because - // those should have been preprocessed - if ($node->nodeType === XML_TEXT_NODE) { - $tokens[] = $this->factory->createText($node->data); - return false; - } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { - // undo libxml's special treatment of <script> and <style> tags - $last = end($tokens); - $data = $node->data; - // (note $node->tagname is already normalized) - if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) { - $new_data = trim($data); - if (substr($new_data, 0, 4) === '<!--') { - $data = substr($new_data, 4); - if (substr($data, -3) === '-->') { - $data = substr($data, 0, -3); - } else { - // Highly suspicious! Not sure what to do... - } - } - } - $tokens[] = $this->factory->createText($this->parseData($data)); - return false; - } elseif ($node->nodeType === XML_COMMENT_NODE) { - // this is code is only invoked for comments in script/style in versions - // of libxml pre-2.6.28 (regular comments, of course, are still - // handled regularly) - $tokens[] = $this->factory->createComment($node->data); - return false; - } elseif ($node->nodeType !== XML_ELEMENT_NODE) { - // not-well tested: there may be other nodes we have to grab - return false; - } - - $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array(); - - // We still have to make sure that the element actually IS empty - if (!$node->childNodes->length) { - if ($collect) { - $tokens[] = $this->factory->createEmpty($node->tagName, $attr); - } - return false; - } else { - if ($collect) { - $tokens[] = $this->factory->createStart( - $tag_name = $node->tagName, // somehow, it get's dropped - $attr - ); - } - return true; - } - } - - /** - * @param DOMNode $node - * @param HTMLPurifier_Token[] $tokens - */ - protected function createEndNode($node, &$tokens) - { - $tokens[] = $this->factory->createEnd($node->tagName); - } - - - /** - * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array. - * - * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects. - * @return array Associative array of attributes. - */ - protected function transformAttrToAssoc($node_map) - { - // NamedNodeMap is documented very well, so we're using undocumented - // features, namely, the fact that it implements Iterator and - // has a ->length attribute - if ($node_map->length === 0) { - return array(); - } - $array = array(); - foreach ($node_map as $attr) { - $array[$attr->name] = $attr->value; - } - return $array; - } - - /** - * An error handler that mutes all errors - * @param int $errno - * @param string $errstr - */ - public function muteErrorHandler($errno, $errstr) - { - } - - /** - * Callback function for undoing escaping of stray angled brackets - * in comments - * @param array $matches - * @return string - */ - public function callbackUndoCommentSubst($matches) - { - return '<!--' . strtr($matches[1], array('&' => '&', '<' => '<')) . $matches[2]; - } - - /** - * Callback function that entity-izes ampersands in comments so that - * callbackUndoCommentSubst doesn't clobber them - * @param array $matches - * @return string - */ - public function callbackArmorCommentEntities($matches) - { - return '<!--' . str_replace('&', '&', $matches[1]) . $matches[2]; - } - - /** - * Wraps an HTML fragment in the necessary HTML - * @param string $html - * @param HTMLPurifier_Config $config - * @param HTMLPurifier_Context $context - * @return string - */ - protected function wrapHTML($html, $config, $context) - { - $def = $config->getDefinition('HTML'); - $ret = ''; - - if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) { - $ret .= '<!DOCTYPE html '; - if (!empty($def->doctype->dtdPublic)) { - $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" '; - } - if (!empty($def->doctype->dtdSystem)) { - $ret .= '"' . $def->doctype->dtdSystem . '" '; - } - $ret .= '>'; - } - - $ret .= '<html><head>'; - $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'; - // No protection if $html contains a stray </div>! - $ret .= '</head><body><div>' . $html . '</div></body></html>'; - return $ret; - } -} - -// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php deleted file mode 100644 index 746b6e315..000000000 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ /dev/null @@ -1,539 +0,0 @@ -<?php - -/** - * Our in-house implementation of a parser. - * - * A pure PHP parser, DirectLex has absolutely no dependencies, making - * it a reasonably good default for PHP4. Written with efficiency in mind, - * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it - * pales in comparison to HTMLPurifier_Lexer_DOMLex. - * - * @todo Reread XML spec and document differences. - */ -class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer -{ - /** - * @type bool - */ - public $tracksLineNumbers = true; - - /** - * Whitespace characters for str(c)spn. - * @type string - */ - protected $_whitespace = "\x20\x09\x0D\x0A"; - - /** - * Callback function for script CDATA fudge - * @param array $matches, in form of array(opening tag, contents, closing tag) - * @return string - */ - protected function scriptCallback($matches) - { - return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3]; - } - - /** - * @param String $html - * @param HTMLPurifier_Config $config - * @param HTMLPurifier_Context $context - * @return array|HTMLPurifier_Token[] - */ - public function tokenizeHTML($html, $config, $context) - { - // special normalization for script tags without any armor - // our "armor" heurstic is a < sign any number of whitespaces after - // the first script tag - if ($config->get('HTML.Trusted')) { - $html = preg_replace_callback( - '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si', - array($this, 'scriptCallback'), - $html - ); - } - - $html = $this->normalize($html, $config, $context); - - $cursor = 0; // our location in the text - $inside_tag = false; // whether or not we're parsing the inside of a tag - $array = array(); // result array - - // This is also treated to mean maintain *column* numbers too - $maintain_line_numbers = $config->get('Core.MaintainLineNumbers'); - - if ($maintain_line_numbers === null) { - // automatically determine line numbering by checking - // if error collection is on - $maintain_line_numbers = $config->get('Core.CollectErrors'); - } - - if ($maintain_line_numbers) { - $current_line = 1; - $current_col = 0; - $length = strlen($html); - } else { - $current_line = false; - $current_col = false; - $length = false; - } - $context->register('CurrentLine', $current_line); - $context->register('CurrentCol', $current_col); - $nl = "\n"; - // how often to manually recalculate. This will ALWAYS be right, - // but it's pretty wasteful. Set to 0 to turn off - $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval'); - - $e = false; - if ($config->get('Core.CollectErrors')) { - $e =& $context->get('ErrorCollector'); - } - - // for testing synchronization - $loops = 0; - - while (++$loops) { - // $cursor is either at the start of a token, or inside of - // a tag (i.e. there was a < immediately before it), as indicated - // by $inside_tag - - if ($maintain_line_numbers) { - // $rcursor, however, is always at the start of a token. - $rcursor = $cursor - (int)$inside_tag; - - // Column number is cheap, so we calculate it every round. - // We're interested at the *end* of the newline string, so - // we need to add strlen($nl) == 1 to $nl_pos before subtracting it - // from our "rcursor" position. - $nl_pos = strrpos($html, $nl, $rcursor - $length); - $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1); - - // recalculate lines - if ($synchronize_interval && // synchronization is on - $cursor > 0 && // cursor is further than zero - $loops % $synchronize_interval === 0) { // time to synchronize! - $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor); - } - } - - $position_next_lt = strpos($html, '<', $cursor); - $position_next_gt = strpos($html, '>', $cursor); - - // triggers on "<b>asdf</b>" but not "asdf <b></b>" - // special case to set up context - if ($position_next_lt === $cursor) { - $inside_tag = true; - $cursor++; - } - - if (!$inside_tag && $position_next_lt !== false) { - // We are not inside tag and there still is another tag to parse - $token = new - HTMLPurifier_Token_Text( - $this->parseData( - substr( - $html, - $cursor, - $position_next_lt - $cursor - ) - ) - ); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor); - } - $array[] = $token; - $cursor = $position_next_lt + 1; - $inside_tag = true; - continue; - } elseif (!$inside_tag) { - // We are not inside tag but there are no more tags - // If we're already at the end, break - if ($cursor === strlen($html)) { - break; - } - // Create Text of rest of string - $token = new - HTMLPurifier_Token_Text( - $this->parseData( - substr( - $html, - $cursor - ) - ) - ); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - } - $array[] = $token; - break; - } elseif ($inside_tag && $position_next_gt !== false) { - // We are in tag and it is well formed - // Grab the internals of the tag - $strlen_segment = $position_next_gt - $cursor; - - if ($strlen_segment < 1) { - // there's nothing to process! - $token = new HTMLPurifier_Token_Text('<'); - $cursor++; - continue; - } - - $segment = substr($html, $cursor, $strlen_segment); - - if ($segment === false) { - // somehow, we attempted to access beyond the end of - // the string, defense-in-depth, reported by Nate Abele - break; - } - - // Check if it's a comment - if (substr($segment, 0, 3) === '!--') { - // re-determine segment length, looking for --> - $position_comment_end = strpos($html, '-->', $cursor); - if ($position_comment_end === false) { - // uh oh, we have a comment that extends to - // infinity. Can't be helped: set comment - // end position to end of string - if ($e) { - $e->send(E_WARNING, 'Lexer: Unclosed comment'); - } - $position_comment_end = strlen($html); - $end = true; - } else { - $end = false; - } - $strlen_segment = $position_comment_end - $cursor; - $segment = substr($html, $cursor, $strlen_segment); - $token = new - HTMLPurifier_Token_Comment( - substr( - $segment, - 3, - $strlen_segment - 3 - ) - ); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment); - } - $array[] = $token; - $cursor = $end ? $position_comment_end : $position_comment_end + 3; - $inside_tag = false; - continue; - } - - // Check if it's an end tag - $is_end_tag = (strpos($segment, '/') === 0); - if ($is_end_tag) { - $type = substr($segment, 1); - $token = new HTMLPurifier_Token_End($type); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); - } - $array[] = $token; - $inside_tag = false; - $cursor = $position_next_gt + 1; - continue; - } - - // Check leading character is alnum, if not, we may - // have accidently grabbed an emoticon. Translate into - // text and go our merry way - if (!ctype_alpha($segment[0])) { - // XML: $segment[0] !== '_' && $segment[0] !== ':' - if ($e) { - $e->send(E_NOTICE, 'Lexer: Unescaped lt'); - } - $token = new HTMLPurifier_Token_Text('<'); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); - } - $array[] = $token; - $inside_tag = false; - continue; - } - - // Check if it is explicitly self closing, if so, remove - // trailing slash. Remember, we could have a tag like <br>, so - // any later token processing scripts must convert improperly - // classified EmptyTags from StartTags. - $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1); - if ($is_self_closing) { - $strlen_segment--; - $segment = substr($segment, 0, $strlen_segment); - } - - // Check if there are any attributes - $position_first_space = strcspn($segment, $this->_whitespace); - - if ($position_first_space >= $strlen_segment) { - if ($is_self_closing) { - $token = new HTMLPurifier_Token_Empty($segment); - } else { - $token = new HTMLPurifier_Token_Start($segment); - } - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); - } - $array[] = $token; - $inside_tag = false; - $cursor = $position_next_gt + 1; - continue; - } - - // Grab out all the data - $type = substr($segment, 0, $position_first_space); - $attribute_string = - trim( - substr( - $segment, - $position_first_space - ) - ); - if ($attribute_string) { - $attr = $this->parseAttributeString( - $attribute_string, - $config, - $context - ); - } else { - $attr = array(); - } - - if ($is_self_closing) { - $token = new HTMLPurifier_Token_Empty($type, $attr); - } else { - $token = new HTMLPurifier_Token_Start($type, $attr); - } - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); - } - $array[] = $token; - $cursor = $position_next_gt + 1; - $inside_tag = false; - continue; - } else { - // inside tag, but there's no ending > sign - if ($e) { - $e->send(E_WARNING, 'Lexer: Missing gt'); - } - $token = new - HTMLPurifier_Token_Text( - '<' . - $this->parseData( - substr($html, $cursor) - ) - ); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - } - // no cursor scroll? Hmm... - $array[] = $token; - break; - } - break; - } - - $context->destroy('CurrentLine'); - $context->destroy('CurrentCol'); - return $array; - } - - /** - * PHP 5.0.x compatible substr_count that implements offset and length - * @param string $haystack - * @param string $needle - * @param int $offset - * @param int $length - * @return int - */ - protected function substrCount($haystack, $needle, $offset, $length) - { - static $oldVersion; - if ($oldVersion === null) { - $oldVersion = version_compare(PHP_VERSION, '5.1', '<'); - } - if ($oldVersion) { - $haystack = substr($haystack, $offset, $length); - return substr_count($haystack, $needle); - } else { - return substr_count($haystack, $needle, $offset, $length); - } - } - - /** - * Takes the inside of an HTML tag and makes an assoc array of attributes. - * - * @param string $string Inside of tag excluding name. - * @param HTMLPurifier_Config $config - * @param HTMLPurifier_Context $context - * @return array Assoc array of attributes. - */ - public function parseAttributeString($string, $config, $context) - { - $string = (string)$string; // quick typecast - - if ($string == '') { - return array(); - } // no attributes - - $e = false; - if ($config->get('Core.CollectErrors')) { - $e =& $context->get('ErrorCollector'); - } - - // let's see if we can abort as quickly as possible - // one equal sign, no spaces => one attribute - $num_equal = substr_count($string, '='); - $has_space = strpos($string, ' '); - if ($num_equal === 0 && !$has_space) { - // bool attribute - return array($string => $string); - } elseif ($num_equal === 1 && !$has_space) { - // only one attribute - list($key, $quoted_value) = explode('=', $string); - $quoted_value = trim($quoted_value); - if (!$key) { - if ($e) { - $e->send(E_ERROR, 'Lexer: Missing attribute key'); - } - return array(); - } - if (!$quoted_value) { - return array($key => ''); - } - $first_char = @$quoted_value[0]; - $last_char = @$quoted_value[strlen($quoted_value) - 1]; - - $same_quote = ($first_char == $last_char); - $open_quote = ($first_char == '"' || $first_char == "'"); - - if ($same_quote && $open_quote) { - // well behaved - $value = substr($quoted_value, 1, strlen($quoted_value) - 2); - } else { - // not well behaved - if ($open_quote) { - if ($e) { - $e->send(E_ERROR, 'Lexer: Missing end quote'); - } - $value = substr($quoted_value, 1); - } else { - $value = $quoted_value; - } - } - if ($value === false) { - $value = ''; - } - return array($key => $this->parseData($value)); - } - - // setup loop environment - $array = array(); // return assoc array of attributes - $cursor = 0; // current position in string (moves forward) - $size = strlen($string); // size of the string (stays the same) - - // if we have unquoted attributes, the parser expects a terminating - // space, so let's guarantee that there's always a terminating space. - $string .= ' '; - - $old_cursor = -1; - while ($cursor < $size) { - if ($old_cursor >= $cursor) { - throw new Exception("Infinite loop detected"); - } - $old_cursor = $cursor; - - $cursor += ($value = strspn($string, $this->_whitespace, $cursor)); - // grab the key - - $key_begin = $cursor; //we're currently at the start of the key - - // scroll past all characters that are the key (not whitespace or =) - $cursor += strcspn($string, $this->_whitespace . '=', $cursor); - - $key_end = $cursor; // now at the end of the key - - $key = substr($string, $key_begin, $key_end - $key_begin); - - if (!$key) { - if ($e) { - $e->send(E_ERROR, 'Lexer: Missing attribute key'); - } - $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop - continue; // empty key - } - - // scroll past all whitespace - $cursor += strspn($string, $this->_whitespace, $cursor); - - if ($cursor >= $size) { - $array[$key] = $key; - break; - } - - // if the next character is an equal sign, we've got a regular - // pair, otherwise, it's a bool attribute - $first_char = @$string[$cursor]; - - if ($first_char == '=') { - // key="value" - - $cursor++; - $cursor += strspn($string, $this->_whitespace, $cursor); - - if ($cursor === false) { - $array[$key] = ''; - break; - } - - // we might be in front of a quote right now - - $char = @$string[$cursor]; - - if ($char == '"' || $char == "'") { - // it's quoted, end bound is $char - $cursor++; - $value_begin = $cursor; - $cursor = strpos($string, $char, $cursor); - $value_end = $cursor; - } else { - // it's not quoted, end bound is whitespace - $value_begin = $cursor; - $cursor += strcspn($string, $this->_whitespace, $cursor); - $value_end = $cursor; - } - - // we reached a premature end - if ($cursor === false) { - $cursor = $size; - $value_end = $cursor; - } - - $value = substr($string, $value_begin, $value_end - $value_begin); - if ($value === false) { - $value = ''; - } - $array[$key] = $this->parseData($value); - $cursor++; - } else { - // boolattr - if ($key !== '') { - $array[$key] = $key; - } else { - // purely theoretical - if ($e) { - $e->send(E_ERROR, 'Lexer: Missing attribute key'); - } - } - } - } - return $array; - } -} - -// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Lexer/PH5P.php b/library/HTMLPurifier/Lexer/PH5P.php deleted file mode 100644 index a4587e4cd..000000000 --- a/library/HTMLPurifier/Lexer/PH5P.php +++ /dev/null @@ -1,4788 +0,0 @@ -<?php - -/** - * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library. - * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts. - * - * @note - * Recent changes to PHP's DOM extension have resulted in some fatal - * error conditions with the original version of PH5P. Pending changes, - * this lexer will punt to DirectLex if DOM throws an exception. - */ - -class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex -{ - /** - * @param string $html - * @param HTMLPurifier_Config $config - * @param HTMLPurifier_Context $context - * @return HTMLPurifier_Token[] - */ - public function tokenizeHTML($html, $config, $context) - { - $new_html = $this->normalize($html, $config, $context); - $new_html = $this->wrapHTML($new_html, $config, $context); - try { - $parser = new HTML5($new_html); - $doc = $parser->save(); - } catch (DOMException $e) { - // Uh oh, it failed. Punt to DirectLex. - $lexer = new HTMLPurifier_Lexer_DirectLex(); - $context->register('PH5PError', $e); // save the error, so we can detect it - return $lexer->tokenizeHTML($html, $config, $context); // use original HTML - } - $tokens = array(); - $this->tokenizeDOM( - $doc->getElementsByTagName('html')->item(0)-> // <html> - getElementsByTagName('body')->item(0)-> // <body> - getElementsByTagName('div')->item(0) // <div> - , - $tokens - ); - return $tokens; - } -} - -/* - -Copyright 2007 Jeroen van der Meer <http://jero.net/> - -Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -*/ - -class HTML5 -{ - private $data; - private $char; - private $EOF; - private $state; - private $tree; - private $token; - private $content_model; - private $escape = false; - private $entities = array( - 'AElig;', - 'AElig', - 'AMP;', - 'AMP', - 'Aacute;', - 'Aacute', - 'Acirc;', - 'Acirc', - 'Agrave;', - 'Agrave', - 'Alpha;', - 'Aring;', - 'Aring', - 'Atilde;', - 'Atilde', - 'Auml;', - 'Auml', - 'Beta;', - 'COPY;', - 'COPY', - 'Ccedil;', - 'Ccedil', - 'Chi;', - 'Dagger;', - 'Delta;', - 'ETH;', - 'ETH', - 'Eacute;', - 'Eacute', - 'Ecirc;', - 'Ecirc', - 'Egrave;', - 'Egrave', - 'Epsilon;', - 'Eta;', - 'Euml;', - 'Euml', - 'GT;', - 'GT', - 'Gamma;', - 'Iacute;', - 'Iacute', - 'Icirc;', - 'Icirc', - 'Igrave;', - 'Igrave', - 'Iota;', - 'Iuml;', - 'Iuml', - 'Kappa;', - 'LT;', - 'LT', - 'Lambda;', - 'Mu;', - 'Ntilde;', - 'Ntilde', - 'Nu;', - 'OElig;', - 'Oacute;', - 'Oacute', - 'Ocirc;', - 'Ocirc', - 'Ograve;', - 'Ograve', - 'Omega;', - 'Omicron;', - 'Oslash;', - 'Oslash', - 'Otilde;', - 'Otilde', - 'Ouml;', - 'Ouml', - 'Phi;', - 'Pi;', - 'Prime;', - 'Psi;', - 'QUOT;', - 'QUOT', - 'REG;', - 'REG', - 'Rho;', - 'Scaron;', - 'Sigma;', - 'THORN;', - 'THORN', - 'TRADE;', - 'Tau;', - 'Theta;', - 'Uacute;', - 'Uacute', - 'Ucirc;', - 'Ucirc', - 'Ugrave;', - 'Ugrave', - 'Upsilon;', - 'Uuml;', - 'Uuml', - 'Xi;', - 'Yacute;', - 'Yacute', - 'Yuml;', - 'Zeta;', - 'aacute;', - 'aacute', - 'acirc;', - 'acirc', - 'acute;', - 'acute', - 'aelig;', - 'aelig', - 'agrave;', - 'agrave', - 'alefsym;', - 'alpha;', - 'amp;', - 'amp', - 'and;', - 'ang;', - 'apos;', - 'aring;', - 'aring', - 'asymp;', - 'atilde;', - 'atilde', - 'auml;', - 'auml', - 'bdquo;', - 'beta;', - 'brvbar;', - 'brvbar', - 'bull;', - 'cap;', - 'ccedil;', - 'ccedil', - 'cedil;', - 'cedil', - 'cent;', - 'cent', - 'chi;', - 'circ;', - 'clubs;', - 'cong;', - 'copy;', - 'copy', - 'crarr;', - 'cup;', - 'curren;', - 'curren', - 'dArr;', - 'dagger;', - 'darr;', - 'deg;', - 'deg', - 'delta;', - 'diams;', - 'divide;', - 'divide', - 'eacute;', - 'eacute', - 'ecirc;', - 'ecirc', - 'egrave;', - 'egrave', - 'empty;', - 'emsp;', - 'ensp;', - 'epsilon;', - 'equiv;', - 'eta;', - 'eth;', - 'eth', - 'euml;', - 'euml', - 'euro;', - 'exist;', - 'fnof;', - 'forall;', - 'frac12;', - 'frac12', - 'frac14;', - 'frac14', - 'frac34;', - 'frac34', - 'frasl;', - 'gamma;', - 'ge;', - 'gt;', - 'gt', - 'hArr;', - 'harr;', - 'hearts;', - 'hellip;', - 'iacute;', - 'iacute', - 'icirc;', - 'icirc', - 'iexcl;', - 'iexcl', - 'igrave;', - 'igrave', - 'image;', - 'infin;', - 'int;', - 'iota;', - 'iquest;', - 'iquest', - 'isin;', - 'iuml;', - 'iuml', - 'kappa;', - 'lArr;', - 'lambda;', - 'lang;', - 'laquo;', - 'laquo', - 'larr;', - 'lceil;', - 'ldquo;', - 'le;', - 'lfloor;', - 'lowast;', - 'loz;', - 'lrm;', - 'lsaquo;', - 'lsquo;', - 'lt;', - 'lt', - 'macr;', - 'macr', - 'mdash;', - 'micro;', - 'micro', - 'middot;', - 'middot', - 'minus;', - 'mu;', - 'nabla;', - 'nbsp;', - 'nbsp', - 'ndash;', - 'ne;', - 'ni;', - 'not;', - 'not', - 'notin;', - 'nsub;', - 'ntilde;', - 'ntilde', - 'nu;', - 'oacute;', - 'oacute', - 'ocirc;', - 'ocirc', - 'oelig;', - 'ograve;', - 'ograve', - 'oline;', - 'omega;', - 'omicron;', - 'oplus;', - 'or;', - 'ordf;', - 'ordf', - 'ordm;', - 'ordm', - 'oslash;', - 'oslash', - 'otilde;', - 'otilde', - 'otimes;', - 'ouml;', - 'ouml', - 'para;', - 'para', - 'part;', - 'permil;', - 'perp;', - 'phi;', - 'pi;', - 'piv;', - 'plusmn;', - 'plusmn', - 'pound;', - 'pound', - 'prime;', - 'prod;', - 'prop;', - 'psi;', - 'quot;', - 'quot', - 'rArr;', - 'radic;', - 'rang;', - 'raquo;', - 'raquo', - 'rarr;', - 'rceil;', - 'rdquo;', - 'real;', - 'reg;', - 'reg', - 'rfloor;', - 'rho;', - 'rlm;', - 'rsaquo;', - 'rsquo;', - 'sbquo;', - 'scaron;', - 'sdot;', - 'sect;', - 'sect', - 'shy;', - 'shy', - 'sigma;', - 'sigmaf;', - 'sim;', - 'spades;', - 'sub;', - 'sube;', - 'sum;', - 'sup1;', - 'sup1', - 'sup2;', - 'sup2', - 'sup3;', - 'sup3', - 'sup;', - 'supe;', - 'szlig;', - 'szlig', - 'tau;', - 'there4;', - 'theta;', - 'thetasym;', - 'thinsp;', - 'thorn;', - 'thorn', - 'tilde;', - 'times;', - 'times', - 'trade;', - 'uArr;', - 'uacute;', - 'uacute', - 'uarr;', - 'ucirc;', - 'ucirc', - 'ugrave;', - 'ugrave', - 'uml;', - 'uml', - 'upsih;', - 'upsilon;', - 'uuml;', - 'uuml', - 'weierp;', - 'xi;', - 'yacute;', - 'yacute', - 'yen;', - 'yen', - 'yuml;', - 'yuml', - 'zeta;', - 'zwj;', - 'zwnj;' - ); - - const PCDATA = 0; - const RCDATA = 1; - const CDATA = 2; - const PLAINTEXT = 3; - - const DOCTYPE = 0; - const STARTTAG = 1; - const ENDTAG = 2; - const COMMENT = 3; - const CHARACTR = 4; - const EOF = 5; - - public function __construct($data) - { - $this->data = $data; - $this->char = -1; - $this->EOF = strlen($data); - $this->tree = new HTML5TreeConstructer; - $this->content_model = self::PCDATA; - - $this->state = 'data'; - - while ($this->state !== null) { - $this->{$this->state . 'State'}(); - } - } - - public function save() - { - return $this->tree->save(); - } - - private function char() - { - return ($this->char < $this->EOF) - ? $this->data[$this->char] - : false; - } - - private function character($s, $l = 0) - { - if ($s + $l < $this->EOF) { - if ($l === 0) { - return $this->data[$s]; - } else { - return substr($this->data, $s, $l); - } - } - } - - private function characters($char_class, $start) - { - return preg_replace('#^([' . $char_class . ']+).*#s', '\\1', substr($this->data, $start)); - } - - private function dataState() - { - // Consume the next input character - $this->char++; - $char = $this->char(); - - if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) { - /* U+0026 AMPERSAND (&) - When the content model flag is set to one of the PCDATA or RCDATA - states: switch to the entity data state. Otherwise: treat it as per - the "anything else" entry below. */ - $this->state = 'entityData'; - - } elseif ($char === '-') { - /* If the content model flag is set to either the RCDATA state or - the CDATA state, and the escape flag is false, and there are at - least three characters before this one in the input stream, and the - last four characters in the input stream, including this one, are - U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, - and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */ - if (($this->content_model === self::RCDATA || $this->content_model === - self::CDATA) && $this->escape === false && - $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--' - ) { - $this->escape = true; - } - - /* In any case, emit the input character as a character token. Stay - in the data state. */ - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => $char - ) - ); - - /* U+003C LESS-THAN SIGN (<) */ - } elseif ($char === '<' && ($this->content_model === self::PCDATA || - (($this->content_model === self::RCDATA || - $this->content_model === self::CDATA) && $this->escape === false)) - ) { - /* When the content model flag is set to the PCDATA state: switch - to the tag open state. - - When the content model flag is set to either the RCDATA state or - the CDATA state and the escape flag is false: switch to the tag - open state. - - Otherwise: treat it as per the "anything else" entry below. */ - $this->state = 'tagOpen'; - - /* U+003E GREATER-THAN SIGN (>) */ - } elseif ($char === '>') { - /* If the content model flag is set to either the RCDATA state or - the CDATA state, and the escape flag is true, and the last three - characters in the input stream including this one are U+002D - HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"), - set the escape flag to false. */ - if (($this->content_model === self::RCDATA || - $this->content_model === self::CDATA) && $this->escape === true && - $this->character($this->char, 3) === '-->' - ) { - $this->escape = false; - } - - /* In any case, emit the input character as a character token. - Stay in the data state. */ - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => $char - ) - ); - - } elseif ($this->char === $this->EOF) { - /* EOF - Emit an end-of-file token. */ - $this->EOF(); - - } elseif ($this->content_model === self::PLAINTEXT) { - /* When the content model flag is set to the PLAINTEXT state - THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of - the text and emit it as a character token. */ - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => substr($this->data, $this->char) - ) - ); - - $this->EOF(); - - } else { - /* Anything else - THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that - otherwise would also be treated as a character token and emit it - as a single character token. Stay in the data state. */ - $len = strcspn($this->data, '<&', $this->char); - $char = substr($this->data, $this->char, $len); - $this->char += $len - 1; - - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => $char - ) - ); - - $this->state = 'data'; - } - } - - private function entityDataState() - { - // Attempt to consume an entity. - $entity = $this->entity(); - - // If nothing is returned, emit a U+0026 AMPERSAND character token. - // Otherwise, emit the character token that was returned. - $char = (!$entity) ? '&' : $entity; - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => $char - ) - ); - - // Finally, switch to the data state. - $this->state = 'data'; - } - - private function tagOpenState() - { - switch ($this->content_model) { - case self::RCDATA: - case self::CDATA: - /* If the next input character is a U+002F SOLIDUS (/) character, - consume it and switch to the close tag open state. If the next - input character is not a U+002F SOLIDUS (/) character, emit a - U+003C LESS-THAN SIGN character token and switch to the data - state to process the next input character. */ - if ($this->character($this->char + 1) === '/') { - $this->char++; - $this->state = 'closeTagOpen'; - - } else { - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => '<' - ) - ); - - $this->state = 'data'; - } - break; - - case self::PCDATA: - // If the content model flag is set to the PCDATA state - // Consume the next input character: - $this->char++; - $char = $this->char(); - - if ($char === '!') { - /* U+0021 EXCLAMATION MARK (!) - Switch to the markup declaration open state. */ - $this->state = 'markupDeclarationOpen'; - - } elseif ($char === '/') { - /* U+002F SOLIDUS (/) - Switch to the close tag open state. */ - $this->state = 'closeTagOpen'; - - } elseif (preg_match('/^[A-Za-z]$/', $char)) { - /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z - Create a new start tag token, set its tag name to the lowercase - version of the input character (add 0x0020 to the character's code - point), then switch to the tag name state. (Don't emit the token - yet; further details will be filled in before it is emitted.) */ - $this->token = array( - 'name' => strtolower($char), - 'type' => self::STARTTAG, - 'attr' => array() - ); - - $this->state = 'tagName'; - - } elseif ($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Parse error. Emit a U+003C LESS-THAN SIGN character token and a - U+003E GREATER-THAN SIGN character token. Switch to the data state. */ - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => '<>' - ) - ); - - $this->state = 'data'; - - } elseif ($char === '?') { - /* U+003F QUESTION MARK (?) - Parse error. Switch to the bogus comment state. */ - $this->state = 'bogusComment'; - - } else { - /* Anything else - Parse error. Emit a U+003C LESS-THAN SIGN character token and - reconsume the current input character in the data state. */ - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => '<' - ) - ); - - $this->char--; - $this->state = 'data'; - } - break; - } - } - - private function closeTagOpenState() - { - $next_node = strtolower($this->characters('A-Za-z', $this->char + 1)); - $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName; - - if (($this->content_model === self::RCDATA || $this->content_model === self::CDATA) && - (!$the_same || ($the_same && (!preg_match( - '/[\t\n\x0b\x0c >\/]/', - $this->character($this->char + 1 + strlen($next_node)) - ) || $this->EOF === $this->char))) - ) { - /* If the content model flag is set to the RCDATA or CDATA states then - examine the next few characters. If they do not match the tag name of - the last start tag token emitted (case insensitively), or if they do but - they are not immediately followed by one of the following characters: - * U+0009 CHARACTER TABULATION - * U+000A LINE FEED (LF) - * U+000B LINE TABULATION - * U+000C FORM FEED (FF) - * U+0020 SPACE - * U+003E GREATER-THAN SIGN (>) - * U+002F SOLIDUS (/) - * EOF - ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character - token, a U+002F SOLIDUS character token, and switch to the data state - to process the next input character. */ - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => '</' - ) - ); - - $this->state = 'data'; - - } else { - /* Otherwise, if the content model flag is set to the PCDATA state, - or if the next few characters do match that tag name, consume the - next input character: */ - $this->char++; - $char = $this->char(); - - if (preg_match('/^[A-Za-z]$/', $char)) { - /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z - Create a new end tag token, set its tag name to the lowercase version - of the input character (add 0x0020 to the character's code point), then - switch to the tag name state. (Don't emit the token yet; further details - will be filled in before it is emitted.) */ - $this->token = array( - 'name' => strtolower($char), - 'type' => self::ENDTAG - ); - - $this->state = 'tagName'; - - } elseif ($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Parse error. Switch to the data state. */ - $this->state = 'data'; - - } elseif ($this->char === $this->EOF) { - /* EOF - Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F - SOLIDUS character token. Reconsume the EOF character in the data state. */ - $this->emitToken( - array( - 'type' => self::CHARACTR, - 'data' => '</' - ) - ); - - $this->char--; - $this->state = 'data'; - - } else { - /* Parse error. Switch to the bogus comment state. */ - $this->state = 'bogusComment'; - } - } - } - - private function tagNameState() - { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Switch to the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif ($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif ($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the EOF - character in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } elseif ($char === '/') { - /* U+002F SOLIDUS (/) - Parse error unless this is a permitted slash. Switch to the before - attribute name state. */ - $this->state = 'beforeAttributeName'; - - } else { - /* Anything else - Append the current input character to the current tag token's tag name. - Stay in the tag name state. */ - $this->token['name'] .= strtolower($char); - $this->state = 'tagName'; - } - } - - private function beforeAttributeNameState() - { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Stay in the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif ($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif ($char === '/') { - /* U+002F SOLIDUS (/) - Parse error unless this is a permitted slash. Stay in the before - attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif ($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the EOF - character in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Start a new attribute in the current tag token. Set that attribute's - name to the current input character, and its value to the empty string. - Switch to the attribute name state. */ - $this->token['attr'][] = array( - 'name' => strtolower($char), - 'value' => null - ); - - $this->state = 'attributeName'; - } - } - - private function attributeNameState() - { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Stay in the before attribute name state. */ - $this->state = 'afterAttributeName'; - - } elseif ($char === '=') { - /* U+003D EQUALS SIGN (=) - Switch to the before attribute value state. */ - $this->state = 'beforeAttributeValue'; - - } elseif ($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif ($char === '/' && $this->character($this->char + 1) !== '>') { - /* U+002F SOLIDUS (/) - Parse error unless this is a permitted slash. Switch to the before - attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif ($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the EOF - character in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's name. - Stay in the attribute name state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['name'] .= strtolower($char); - - $this->state = 'attributeName'; - } - } - - private function afterAttributeNameState() - { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Stay in the after attribute name state. */ - $this->state = 'afterAttributeName'; - - } elseif ($char === '=') { - /* U+003D EQUALS SIGN (=) - Switch to the before attribute value state. */ - $this->state = 'beforeAttributeValue'; - - } elseif ($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif ($char === '/' && $this->character($this->char + 1) !== '>') { - /* U+002F SOLIDUS (/) - Parse error unless this is a permitted slash. Switch to the - before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif ($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the EOF - character in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Start a new attribute in the current tag token. Set that attribute's - name to the current input character, and its value to the empty string. - Switch to the attribute name state. */ - $this->token['attr'][] = array( - 'name' => strtolower($char), - 'value' => null - ); - - $this->state = 'attributeName'; - } - } - - private function beforeAttributeValueState() - { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Stay in the before attribute value state. */ - $this->state = 'beforeAttributeValue'; - - } elseif ($char === '"') { - /* U+0022 QUOTATION MARK (") - Switch to the attribute value (double-quoted) state. */ - $this->state = 'attributeValueDoubleQuoted'; - - } elseif ($char === '&') { - /* U+0026 AMPERSAND (&) - Switch to the attribute value (unquoted) state and reconsume - this input character. */ - $this->char--; - $this->state = 'attributeValueUnquoted'; - - } elseif ($char === '\'') { - /* U+0027 APOSTROPHE (') - Switch to the attribute value (single-quoted) state. */ - $this->state = 'attributeValueSingleQuoted'; - - } elseif ($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's value. - Switch to the attribute value (unquoted) state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - - $this->state = 'attributeValueUnquoted'; - } - } - - private function attributeValueDoubleQuotedState() - { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if ($char === '"') { - /* U+0022 QUOTATION MARK (") - Switch to the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif ($char === '&') { - /* U+0026 AMPERSAND (&) - Switch to the entity in attribute value state. */ - $this->entityInAttributeValueState('double'); - - } elseif ($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the character - in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's value. - Stay in the attribute value (double-quoted) state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - - $this->state = 'attributeValueDoubleQuoted'; - } - } - - private function attributeValueSingleQuotedState() - { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if ($char === '\'') { - /* U+0022 QUOTATION MARK (') - Switch to the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif ($char === '&') { - /* U+0026 AMPERSAND (&) - Switch to the entity in attribute value state. */ - $this->entityInAttributeValueState('single'); - - } elseif ($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the character - in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's value. - Stay in the attribute value (single-quoted) state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - - $this->state = 'attributeValueSingleQuoted'; - } - } - - private function attributeValueUnquotedState() - { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Switch to the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif ($char === '&') { - /* U+0026 AMPERSAND (&) - Switch to the entity in attribute value state. */ - $this->entityInAttributeValueState(); - - } elseif ($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's value. - Stay in the attribute value (unquoted) state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - - $this->state = 'attributeValueUnquoted'; - } - } - - private function entityInAttributeValueState() - { - // Attempt to consume an entity. - $entity = $this->entity(); - - // If nothing is returned, append a U+0026 AMPERSAND character to the - // current attribute's value. Otherwise, emit the character token that - // was returned. - $char = (!$entity) - ? '&' - : $entity; - - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - } - - private function bogusCommentState() - { - /* Consume every character up to the first U+003E GREATER-THAN SIGN - character (>) or the end of the file (EOF), whichever comes first. Emit - a comment token whose data is the concatenation of all the characters - starting from and including the character that caused the state machine - to switch into the bogus comment state, up to and including the last - consumed character before the U+003E character, if any, or up to the - end of the file otherwise. (If the comment was started by the end of - the file (EOF), the token is empty.) */ - $data = $this->characters('^>', $this->char); - $this->emitToken( - array( - 'data' => $data, - 'type' => self::COMMENT - ) - ); - - $this->char += strlen($data); - - /* Switch to the data state. */ - $this->state = 'data'; - - /* If the end of the file was reached, reconsume the EOF character. */ - if ($this->char === $this->EOF) { - $this->char = $this->EOF - 1; - } - } - - private function markupDeclarationOpenState() - { - /* If the next two characters are both U+002D HYPHEN-MINUS (-) - characters, consume those two characters, create a comment token whose - data is the empty string, and switch to the comment state. */ - if ($this->character($this->char + 1, 2) === '--') { - $this->char += 2; - $this->state = 'comment'; - $this->token = array( - 'data' => null, - 'type' => self::COMMENT - ); - - /* Otherwise if the next seven chacacters are a case-insensitive match - for the word "DOCTYPE", then consume those characters and switch to the - DOCTYPE state. */ - } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') { - $this->char += 7; - $this->state = 'doctype'; - - /* Otherwise, is is a parse error. Switch to the bogus comment state. - The next character that is consumed, if any, is the first character - that will be in the comment. */ - } else { - $this->char++; - $this->state = 'bogusComment'; - } - } - - private function commentState() - { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - /* U+002D HYPHEN-MINUS (-) */ - if ($char === '-') { - /* Switch to the comment dash state */ - $this->state = 'commentDash'; - - /* EOF */ - } elseif ($this->char === $this->EOF) { - /* Parse error. Emit the comment token. Reconsume the EOF character - in the data state. */ - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - /* Anything else */ - } else { - /* Append the input character to the comment token's data. Stay in - the comment state. */ - $this->token['data'] .= $char; - } - } - - private function commentDashState() - { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - /* U+002D HYPHEN-MINUS (-) */ - if ($char === '-') { - /* Switch to the comment end state */ - $this->state = 'commentEnd'; - - /* EOF */ - } elseif ($this->char === $this->EOF) { - /* Parse error. Emit the comment token. Reconsume the EOF character - in the data state. */ - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - /* Anything else */ - } else { - /* Append a U+002D HYPHEN-MINUS (-) character and the input - character to the comment token's data. Switch to the comment state. */ - $this->token['data'] .= '-' . $char; - $this->state = 'comment'; - } - } - - private function commentEndState() - { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if ($char === '>') { - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif ($char === '-') { - $this->token['data'] .= '-'; - - } elseif ($this->char === $this->EOF) { - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - } else { - $this->token['data'] .= '--' . $char; - $this->state = 'comment'; - } - } - - private function doctypeState() - { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - $this->state = 'beforeDoctypeName'; - - } else { - $this->char--; - $this->state = 'beforeDoctypeName'; - } - } - - private function beforeDoctypeNameState() - { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - // Stay in the before DOCTYPE name state. - - } elseif (preg_match('/^[a-z]$/', $char)) { - $this->token = array( - 'name' => strtoupper($char), - 'type' => self::DOCTYPE, - 'error' => true - ); - - $this->state = 'doctypeName'; - - } elseif ($char === '>') { - $this->emitToken( - array( - 'name' => null, - 'type' => self::DOCTYPE, - 'error' => true - ) - ); - - $this->state = 'data'; - - } elseif ($this->char === $this->EOF) { - $this->emitToken( - array( - 'name' => null, - 'type' => self::DOCTYPE, - 'error' => true - ) - ); - - $this->char--; - $this->state = 'data'; - - } else { - $this->token = array( - 'name' => $char, - 'type' => self::DOCTYPE, - 'error' => true - ); - - $this->state = 'doctypeName'; - } - } - - private function doctypeNameState() - { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - $this->state = 'AfterDoctypeName'; - - } elseif ($char === '>') { - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif (preg_match('/^[a-z]$/', $char)) { - $this->token['name'] .= strtoupper($char); - - } elseif ($this->char === $this->EOF) { - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - } else { - $this->token['name'] .= $char; - } - - $this->token['error'] = ($this->token['name'] === 'HTML') - ? false - : true; - } - - private function afterDoctypeNameState() - { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - // Stay in the DOCTYPE name state. - - } elseif ($char === '>') { - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif ($this->char === $this->EOF) { - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - } else { - $this->token['error'] = true; - $this->state = 'bogusDoctype'; - } - } - - private function bogusDoctypeState() - { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if ($char === '>') { - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif ($this->char === $this->EOF) { - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - } else { - // Stay in the bogus DOCTYPE state. - } - } - - private function entity() - { - $start = $this->char; - - // This section defines how to consume an entity. This definition is - // used when parsing entities in text and in attributes. - - // The behaviour depends on the identity of the next character (the - // one immediately after the U+0026 AMPERSAND character): - - switch ($this->character($this->char + 1)) { - // U+0023 NUMBER SIGN (#) - case '#': - - // The behaviour further depends on the character after the - // U+0023 NUMBER SIGN: - switch ($this->character($this->char + 1)) { - // U+0078 LATIN SMALL LETTER X - // U+0058 LATIN CAPITAL LETTER X - case 'x': - case 'X': - // Follow the steps below, but using the range of - // characters U+0030 DIGIT ZERO through to U+0039 DIGIT - // NINE, U+0061 LATIN SMALL LETTER A through to U+0066 - // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER - // A, through to U+0046 LATIN CAPITAL LETTER F (in other - // words, 0-9, A-F, a-f). - $char = 1; - $char_class = '0-9A-Fa-f'; - break; - - // Anything else - default: - // Follow the steps below, but using the range of - // characters U+0030 DIGIT ZERO through to U+0039 DIGIT - // NINE (i.e. just 0-9). - $char = 0; - $char_class = '0-9'; - break; - } - - // Consume as many characters as match the range of characters - // given above. - $this->char++; - $e_name = $this->characters($char_class, $this->char + $char + 1); - $entity = $this->character($start, $this->char); - $cond = strlen($e_name) > 0; - - // The rest of the parsing happens bellow. - break; - - // Anything else - default: - // Consume the maximum number of characters possible, with the - // consumed characters case-sensitively matching one of the - // identifiers in the first column of the entities table. - $e_name = $this->characters('0-9A-Za-z;', $this->char + 1); - $len = strlen($e_name); - - for ($c = 1; $c <= $len; $c++) { - $id = substr($e_name, 0, $c); - $this->char++; - - if (in_array($id, $this->entities)) { - if ($e_name[$c - 1] !== ';') { - if ($c < $len && $e_name[$c] == ';') { - $this->char++; // consume extra semicolon - } - } - $entity = $id; - break; - } - } - - $cond = isset($entity); - // The rest of the parsing happens bellow. - break; - } - - if (!$cond) { - // If no match can be made, then this is a parse error. No - // characters are consumed, and nothing is returned. - $this->char = $start; - return false; - } - - // Return a character token for the character corresponding to the - // entity name (as given by the second column of the entities table). - return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8'); - } - - private function emitToken($token) - { - $emit = $this->tree->emitToken($token); - - if (is_int($emit)) { - $this->content_model = $emit; - - } elseif ($token['type'] === self::ENDTAG) { - $this->content_model = self::PCDATA; - } - } - - private function EOF() - { - $this->state = null; - $this->tree->emitToken( - array( - 'type' => self::EOF - ) - ); - } -} - -class HTML5TreeConstructer -{ - public $stack = array(); - - private $phase; - private $mode; - private $dom; - private $foster_parent = null; - private $a_formatting = array(); - - private $head_pointer = null; - private $form_pointer = null; - - private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th'); - private $formatting = array( - 'a', - 'b', - 'big', - 'em', - 'font', - 'i', - 'nobr', - 's', - 'small', - 'strike', - 'strong', - 'tt', - 'u' - ); - private $special = array( - 'address', - 'area', - 'base', - 'basefont', - 'bgsound', - 'blockquote', - 'body', - 'br', - 'center', - 'col', - 'colgroup', - 'dd', - 'dir', - 'div', - 'dl', - 'dt', - 'embed', - 'fieldset', - 'form', - 'frame', - 'frameset', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'head', - 'hr', - 'iframe', - 'image', - 'img', - 'input', - 'isindex', - 'li', - 'link', - 'listing', - 'menu', - 'meta', - 'noembed', - 'noframes', - 'noscript', - 'ol', - 'optgroup', - 'option', - 'p', - 'param', - 'plaintext', - 'pre', - 'script', - 'select', - 'spacer', - 'style', - 'tbody', - 'textarea', - 'tfoot', - 'thead', - 'title', - 'tr', - 'ul', - 'wbr' - ); - - // The different phases. - const INIT_PHASE = 0; - const ROOT_PHASE = 1; - const MAIN_PHASE = 2; - const END_PHASE = 3; - - // The different insertion modes for the main phase. - const BEFOR_HEAD = 0; - const IN_HEAD = 1; - const AFTER_HEAD = 2; - const IN_BODY = 3; - const IN_TABLE = 4; - const IN_CAPTION = 5; - const IN_CGROUP = 6; - const IN_TBODY = 7; - const IN_ROW = 8; - const IN_CELL = 9; - const IN_SELECT = 10; - const AFTER_BODY = 11; - const IN_FRAME = 12; - const AFTR_FRAME = 13; - - // The different types of elements. - const SPECIAL = 0; - const SCOPING = 1; - const FORMATTING = 2; - const PHRASING = 3; - - const MARKER = 0; - - public function __construct() - { - $this->phase = self::INIT_PHASE; - $this->mode = self::BEFOR_HEAD; - $this->dom = new DOMDocument; - - $this->dom->encoding = 'UTF-8'; - $this->dom->preserveWhiteSpace = true; - $this->dom->substituteEntities = true; - $this->dom->strictErrorChecking = false; - } - - // Process tag tokens - public function emitToken($token) - { - switch ($this->phase) { - case self::INIT_PHASE: - return $this->initPhase($token); - break; - case self::ROOT_PHASE: - return $this->rootElementPhase($token); - break; - case self::MAIN_PHASE: - return $this->mainPhase($token); - break; - case self::END_PHASE : - return $this->trailingEndPhase($token); - break; - } - } - - private function initPhase($token) - { - /* Initially, the tree construction stage must handle each token - emitted from the tokenisation stage as follows: */ - - /* A DOCTYPE token that is marked as being in error - A comment token - A start tag token - An end tag token - A character token that is not one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE - An end-of-file token */ - if ((isset($token['error']) && $token['error']) || - $token['type'] === HTML5::COMMENT || - $token['type'] === HTML5::STARTTAG || - $token['type'] === HTML5::ENDTAG || - $token['type'] === HTML5::EOF || - ($token['type'] === HTML5::CHARACTR && isset($token['data']) && - !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) - ) { - /* This specification does not define how to handle this case. In - particular, user agents may ignore the entirety of this specification - altogether for such documents, and instead invoke special parse modes - with a greater emphasis on backwards compatibility. */ - - $this->phase = self::ROOT_PHASE; - return $this->rootElementPhase($token); - - /* A DOCTYPE token marked as being correct */ - } elseif (isset($token['error']) && !$token['error']) { - /* Append a DocumentType node to the Document node, with the name - attribute set to the name given in the DOCTYPE token (which will be - "HTML"), and the other attributes specific to DocumentType objects - set to null, empty lists, or the empty string as appropriate. */ - $doctype = new DOMDocumentType(null, null, 'HTML'); - - /* Then, switch to the root element phase of the tree construction - stage. */ - $this->phase = self::ROOT_PHASE; - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - } elseif (isset($token['data']) && preg_match( - '/^[\t\n\x0b\x0c ]+$/', - $token['data'] - ) - ) { - /* Append that character to the Document node. */ - $text = $this->dom->createTextNode($token['data']); - $this->dom->appendChild($text); - } - } - - private function rootElementPhase($token) - { - /* After the initial phase, as each token is emitted from the tokenisation - stage, it must be processed as described in this section. */ - - /* A DOCTYPE token */ - if ($token['type'] === HTML5::DOCTYPE) { - // Parse error. Ignore the token. - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the Document object with the data - attribute set to the data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - $this->dom->appendChild($comment); - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - } elseif ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Append that character to the Document node. */ - $text = $this->dom->createTextNode($token['data']); - $this->dom->appendChild($text); - - /* A character token that is not one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED - (FF), or U+0020 SPACE - A start tag token - An end tag token - An end-of-file token */ - } elseif (($token['type'] === HTML5::CHARACTR && - !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || - $token['type'] === HTML5::STARTTAG || - $token['type'] === HTML5::ENDTAG || - $token['type'] === HTML5::EOF - ) { - /* Create an HTMLElement node with the tag name html, in the HTML - namespace. Append it to the Document object. Switch to the main - phase and reprocess the current token. */ - $html = $this->dom->createElement('html'); - $this->dom->appendChild($html); - $this->stack[] = $html; - - $this->phase = self::MAIN_PHASE; - return $this->mainPhase($token); - } - } - - private function mainPhase($token) - { - /* Tokens in the main phase must be handled as follows: */ - - /* A DOCTYPE token */ - if ($token['type'] === HTML5::DOCTYPE) { - // Parse error. Ignore the token. - - /* A start tag token with the tag name "html" */ - } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') { - /* If this start tag token was not the first start tag token, then - it is a parse error. */ - - /* For each attribute on the token, check to see if the attribute - is already present on the top element of the stack of open elements. - If it is not, add the attribute and its corresponding value to that - element. */ - foreach ($token['attr'] as $attr) { - if (!$this->stack[0]->hasAttribute($attr['name'])) { - $this->stack[0]->setAttribute($attr['name'], $attr['value']); - } - } - - /* An end-of-file token */ - } elseif ($token['type'] === HTML5::EOF) { - /* Generate implied end tags. */ - $this->generateImpliedEndTags(); - - /* Anything else. */ - } else { - /* Depends on the insertion mode: */ - switch ($this->mode) { - case self::BEFOR_HEAD: - return $this->beforeHead($token); - break; - case self::IN_HEAD: - return $this->inHead($token); - break; - case self::AFTER_HEAD: - return $this->afterHead($token); - break; - case self::IN_BODY: - return $this->inBody($token); - break; - case self::IN_TABLE: - return $this->inTable($token); - break; - case self::IN_CAPTION: - return $this->inCaption($token); - break; - case self::IN_CGROUP: - return $this->inColumnGroup($token); - break; - case self::IN_TBODY: - return $this->inTableBody($token); - break; - case self::IN_ROW: - return $this->inRow($token); - break; - case self::IN_CELL: - return $this->inCell($token); - break; - case self::IN_SELECT: - return $this->inSelect($token); - break; - case self::AFTER_BODY: - return $this->afterBody($token); - break; - case self::IN_FRAME: - return $this->inFrameset($token); - break; - case self::AFTR_FRAME: - return $this->afterFrameset($token); - break; - case self::END_PHASE: - return $this->trailingEndPhase($token); - break; - } - } - } - - private function beforeHead($token) - { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data attribute - set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* A start tag token with the tag name "head" */ - } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') { - /* Create an element for the token, append the new element to the - current node and push it onto the stack of open elements. */ - $element = $this->insertElement($token); - - /* Set the head element pointer to this new element node. */ - $this->head_pointer = $element; - - /* Change the insertion mode to "in head". */ - $this->mode = self::IN_HEAD; - - /* A start tag token whose tag name is one of: "base", "link", "meta", - "script", "style", "title". Or an end tag with the tag name "html". - Or a character token that is not one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE. Or any other start tag token */ - } elseif ($token['type'] === HTML5::STARTTAG || - ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') || - ($token['type'] === HTML5::CHARACTR && !preg_match( - '/^[\t\n\x0b\x0c ]$/', - $token['data'] - )) - ) { - /* Act as if a start tag token with the tag name "head" and no - attributes had been seen, then reprocess the current token. */ - $this->beforeHead( - array( - 'name' => 'head', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - return $this->inHead($token); - - /* Any other end tag */ - } elseif ($token['type'] === HTML5::ENDTAG) { - /* Parse error. Ignore the token. */ - } - } - - private function inHead($token) - { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE. - - THIS DIFFERS FROM THE SPEC: If the current node is either a title, style - or script element, append the character to the current node regardless - of its content. */ - if (($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || ( - $token['type'] === HTML5::CHARACTR && in_array( - end($this->stack)->nodeName, - array('title', 'style', 'script') - )) - ) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data attribute - set to the data given in the comment token. */ - $this->insertComment($token['data']); - - } elseif ($token['type'] === HTML5::ENDTAG && - in_array($token['name'], array('title', 'style', 'script')) - ) { - array_pop($this->stack); - return HTML5::PCDATA; - - /* A start tag with the tag name "title" */ - } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') { - /* Create an element for the token and append the new element to the - node pointed to by the head element pointer, or, if that is null - (innerHTML case), to the current node. */ - if ($this->head_pointer !== null) { - $element = $this->insertElement($token, false); - $this->head_pointer->appendChild($element); - - } else { - $element = $this->insertElement($token); - } - - /* Switch the tokeniser's content model flag to the RCDATA state. */ - return HTML5::RCDATA; - - /* A start tag with the tag name "style" */ - } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') { - /* Create an element for the token and append the new element to the - node pointed to by the head element pointer, or, if that is null - (innerHTML case), to the current node. */ - if ($this->head_pointer !== null) { - $element = $this->insertElement($token, false); - $this->head_pointer->appendChild($element); - - } else { - $this->insertElement($token); - } - - /* Switch the tokeniser's content model flag to the CDATA state. */ - return HTML5::CDATA; - - /* A start tag with the tag name "script" */ - } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') { - /* Create an element for the token. */ - $element = $this->insertElement($token, false); - $this->head_pointer->appendChild($element); - - /* Switch the tokeniser's content model flag to the CDATA state. */ - return HTML5::CDATA; - - /* A start tag with the tag name "base", "link", or "meta" */ - } elseif ($token['type'] === HTML5::STARTTAG && in_array( - $token['name'], - array('base', 'link', 'meta') - ) - ) { - /* Create an element for the token and append the new element to the - node pointed to by the head element pointer, or, if that is null - (innerHTML case), to the current node. */ - if ($this->head_pointer !== null) { - $element = $this->insertElement($token, false); - $this->head_pointer->appendChild($element); - array_pop($this->stack); - - } else { - $this->insertElement($token); - } - - /* An end tag with the tag name "head" */ - } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') { - /* If the current node is a head element, pop the current node off - the stack of open elements. */ - if ($this->head_pointer->isSameNode(end($this->stack))) { - array_pop($this->stack); - - /* Otherwise, this is a parse error. */ - } else { - // k - } - - /* Change the insertion mode to "after head". */ - $this->mode = self::AFTER_HEAD; - - /* A start tag with the tag name "head" or an end tag except "html". */ - } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') || - ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html') - ) { - // Parse error. Ignore the token. - - /* Anything else */ - } else { - /* If the current node is a head element, act as if an end tag - token with the tag name "head" had been seen. */ - if ($this->head_pointer->isSameNode(end($this->stack))) { - $this->inHead( - array( - 'name' => 'head', - 'type' => HTML5::ENDTAG - ) - ); - - /* Otherwise, change the insertion mode to "after head". */ - } else { - $this->mode = self::AFTER_HEAD; - } - - /* Then, reprocess the current token. */ - return $this->afterHead($token); - } - } - - private function afterHead($token) - { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data attribute - set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* A start tag token with the tag name "body" */ - } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') { - /* Insert a body element for the token. */ - $this->insertElement($token); - - /* Change the insertion mode to "in body". */ - $this->mode = self::IN_BODY; - - /* A start tag token with the tag name "frameset" */ - } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') { - /* Insert a frameset element for the token. */ - $this->insertElement($token); - - /* Change the insertion mode to "in frameset". */ - $this->mode = self::IN_FRAME; - - /* A start tag token whose tag name is one of: "base", "link", "meta", - "script", "style", "title" */ - } elseif ($token['type'] === HTML5::STARTTAG && in_array( - $token['name'], - array('base', 'link', 'meta', 'script', 'style', 'title') - ) - ) { - /* Parse error. Switch the insertion mode back to "in head" and - reprocess the token. */ - $this->mode = self::IN_HEAD; - return $this->inHead($token); - - /* Anything else */ - } else { - /* Act as if a start tag token with the tag name "body" and no - attributes had been seen, and then reprocess the current token. */ - $this->afterHead( - array( - 'name' => 'body', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - return $this->inBody($token); - } - } - - private function inBody($token) - { - /* Handle the token as follows: */ - - switch ($token['type']) { - /* A character token */ - case HTML5::CHARACTR: - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Append the token's character to the current node. */ - $this->insertText($token['data']); - break; - - /* A comment token */ - case HTML5::COMMENT: - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $this->insertComment($token['data']); - break; - - case HTML5::STARTTAG: - switch ($token['name']) { - /* A start tag token whose tag name is one of: "script", - "style" */ - case 'script': - case 'style': - /* Process the token as if the insertion mode had been "in - head". */ - return $this->inHead($token); - break; - - /* A start tag token whose tag name is one of: "base", "link", - "meta", "title" */ - case 'base': - case 'link': - case 'meta': - case 'title': - /* Parse error. Process the token as if the insertion mode - had been "in head". */ - return $this->inHead($token); - break; - - /* A start tag token with the tag name "body" */ - case 'body': - /* Parse error. If the second element on the stack of open - elements is not a body element, or, if the stack of open - elements has only one node on it, then ignore the token. - (innerHTML case) */ - if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { - // Ignore - - /* Otherwise, for each attribute on the token, check to see - if the attribute is already present on the body element (the - second element) on the stack of open elements. If it is not, - add the attribute and its corresponding value to that - element. */ - } else { - foreach ($token['attr'] as $attr) { - if (!$this->stack[1]->hasAttribute($attr['name'])) { - $this->stack[1]->setAttribute($attr['name'], $attr['value']); - } - } - } - break; - - /* A start tag whose tag name is one of: "address", - "blockquote", "center", "dir", "div", "dl", "fieldset", - "listing", "menu", "ol", "p", "ul" */ - case 'address': - case 'blockquote': - case 'center': - case 'dir': - case 'div': - case 'dl': - case 'fieldset': - case 'listing': - case 'menu': - case 'ol': - case 'p': - case 'ul': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been - seen. */ - if ($this->elementInScope('p')) { - $this->emitToken( - array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - break; - - /* A start tag whose tag name is "form" */ - case 'form': - /* If the form element pointer is not null, ignore the - token with a parse error. */ - if ($this->form_pointer !== null) { - // Ignore. - - /* Otherwise: */ - } else { - /* If the stack of open elements has a p element in - scope, then act as if an end tag with the tag name p - had been seen. */ - if ($this->elementInScope('p')) { - $this->emitToken( - array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* Insert an HTML element for the token, and set the - form element pointer to point to the element created. */ - $element = $this->insertElement($token); - $this->form_pointer = $element; - } - break; - - /* A start tag whose tag name is "li", "dd" or "dt" */ - case 'li': - case 'dd': - case 'dt': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been - seen. */ - if ($this->elementInScope('p')) { - $this->emitToken( - array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - ) - ); - } - - $stack_length = count($this->stack) - 1; - - for ($n = $stack_length; 0 <= $n; $n--) { - /* 1. Initialise node to be the current node (the - bottommost node of the stack). */ - $stop = false; - $node = $this->stack[$n]; - $cat = $this->getElementCategory($node->tagName); - - /* 2. If node is an li, dd or dt element, then pop all - the nodes from the current node up to node, including - node, then stop this algorithm. */ - if ($token['name'] === $node->tagName || ($token['name'] !== 'li' - && ($node->tagName === 'dd' || $node->tagName === 'dt')) - ) { - for ($x = $stack_length; $x >= $n; $x--) { - array_pop($this->stack); - } - - break; - } - - /* 3. If node is not in the formatting category, and is - not in the phrasing category, and is not an address or - div element, then stop this algorithm. */ - if ($cat !== self::FORMATTING && $cat !== self::PHRASING && - $node->tagName !== 'address' && $node->tagName !== 'div' - ) { - break; - } - } - - /* Finally, insert an HTML element with the same tag - name as the token's. */ - $this->insertElement($token); - break; - - /* A start tag token whose tag name is "plaintext" */ - case 'plaintext': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been - seen. */ - if ($this->elementInScope('p')) { - $this->emitToken( - array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - return HTML5::PLAINTEXT; - break; - - /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", - "h5", "h6" */ - case 'h1': - case 'h2': - case 'h3': - case 'h4': - case 'h5': - case 'h6': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been seen. */ - if ($this->elementInScope('p')) { - $this->emitToken( - array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* If the stack of open elements has in scope an element whose - tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then - this is a parse error; pop elements from the stack until an - element with one of those tag names has been popped from the - stack. */ - while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { - array_pop($this->stack); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - break; - - /* A start tag whose tag name is "a" */ - case 'a': - /* If the list of active formatting elements contains - an element whose tag name is "a" between the end of the - list and the last marker on the list (or the start of - the list if there is no marker on the list), then this - is a parse error; act as if an end tag with the tag name - "a" had been seen, then remove that element from the list - of active formatting elements and the stack of open - elements if the end tag didn't already remove it (it - might not have if the element is not in table scope). */ - $leng = count($this->a_formatting); - - for ($n = $leng - 1; $n >= 0; $n--) { - if ($this->a_formatting[$n] === self::MARKER) { - break; - - } elseif ($this->a_formatting[$n]->nodeName === 'a') { - $this->emitToken( - array( - 'name' => 'a', - 'type' => HTML5::ENDTAG - ) - ); - break; - } - } - - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $el = $this->insertElement($token); - - /* Add that element to the list of active formatting - elements. */ - $this->a_formatting[] = $el; - break; - - /* A start tag whose tag name is one of: "b", "big", "em", "font", - "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ - case 'b': - case 'big': - case 'em': - case 'font': - case 'i': - case 'nobr': - case 's': - case 'small': - case 'strike': - case 'strong': - case 'tt': - case 'u': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $el = $this->insertElement($token); - - /* Add that element to the list of active formatting - elements. */ - $this->a_formatting[] = $el; - break; - - /* A start tag token whose tag name is "button" */ - case 'button': - /* If the stack of open elements has a button element in scope, - then this is a parse error; act as if an end tag with the tag - name "button" had been seen, then reprocess the token. (We don't - do that. Unnecessary.) */ - if ($this->elementInScope('button')) { - $this->inBody( - array( - 'name' => 'button', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Insert a marker at the end of the list of active - formatting elements. */ - $this->a_formatting[] = self::MARKER; - break; - - /* A start tag token whose tag name is one of: "marquee", "object" */ - case 'marquee': - case 'object': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Insert a marker at the end of the list of active - formatting elements. */ - $this->a_formatting[] = self::MARKER; - break; - - /* A start tag token whose tag name is "xmp" */ - case 'xmp': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Switch the content model flag to the CDATA state. */ - return HTML5::CDATA; - break; - - /* A start tag whose tag name is "table" */ - case 'table': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been seen. */ - if ($this->elementInScope('p')) { - $this->emitToken( - array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Change the insertion mode to "in table". */ - $this->mode = self::IN_TABLE; - break; - - /* A start tag whose tag name is one of: "area", "basefont", - "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ - case 'area': - case 'basefont': - case 'bgsound': - case 'br': - case 'embed': - case 'img': - case 'param': - case 'spacer': - case 'wbr': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Immediately pop the current node off the stack of open elements. */ - array_pop($this->stack); - break; - - /* A start tag whose tag name is "hr" */ - case 'hr': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been seen. */ - if ($this->elementInScope('p')) { - $this->emitToken( - array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Immediately pop the current node off the stack of open elements. */ - array_pop($this->stack); - break; - - /* A start tag whose tag name is "image" */ - case 'image': - /* Parse error. Change the token's tag name to "img" and - reprocess it. (Don't ask.) */ - $token['name'] = 'img'; - return $this->inBody($token); - break; - - /* A start tag whose tag name is "input" */ - case 'input': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an input element for the token. */ - $element = $this->insertElement($token, false); - - /* If the form element pointer is not null, then associate the - input element with the form element pointed to by the form - element pointer. */ - $this->form_pointer !== null - ? $this->form_pointer->appendChild($element) - : end($this->stack)->appendChild($element); - - /* Pop that input element off the stack of open elements. */ - array_pop($this->stack); - break; - - /* A start tag whose tag name is "isindex" */ - case 'isindex': - /* Parse error. */ - // w/e - - /* If the form element pointer is not null, - then ignore the token. */ - if ($this->form_pointer === null) { - /* Act as if a start tag token with the tag name "form" had - been seen. */ - $this->inBody( - array( - 'name' => 'body', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - /* Act as if a start tag token with the tag name "hr" had - been seen. */ - $this->inBody( - array( - 'name' => 'hr', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - /* Act as if a start tag token with the tag name "p" had - been seen. */ - $this->inBody( - array( - 'name' => 'p', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - /* Act as if a start tag token with the tag name "label" - had been seen. */ - $this->inBody( - array( - 'name' => 'label', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - /* Act as if a stream of character tokens had been seen. */ - $this->insertText( - 'This is a searchable index. ' . - 'Insert your search keywords here: ' - ); - - /* Act as if a start tag token with the tag name "input" - had been seen, with all the attributes from the "isindex" - token, except with the "name" attribute set to the value - "isindex" (ignoring any explicit "name" attribute). */ - $attr = $token['attr']; - $attr[] = array('name' => 'name', 'value' => 'isindex'); - - $this->inBody( - array( - 'name' => 'input', - 'type' => HTML5::STARTTAG, - 'attr' => $attr - ) - ); - - /* Act as if a stream of character tokens had been seen - (see below for what they should say). */ - $this->insertText( - 'This is a searchable index. ' . - 'Insert your search keywords here: ' - ); - - /* Act as if an end tag token with the tag name "label" - had been seen. */ - $this->inBody( - array( - 'name' => 'label', - 'type' => HTML5::ENDTAG - ) - ); - - /* Act as if an end tag token with the tag name "p" had - been seen. */ - $this->inBody( - array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - ) - ); - - /* Act as if a start tag token with the tag name "hr" had - been seen. */ - $this->inBody( - array( - 'name' => 'hr', - 'type' => HTML5::ENDTAG - ) - ); - - /* Act as if an end tag token with the tag name "form" had - been seen. */ - $this->inBody( - array( - 'name' => 'form', - 'type' => HTML5::ENDTAG - ) - ); - } - break; - - /* A start tag whose tag name is "textarea" */ - case 'textarea': - $this->insertElement($token); - - /* Switch the tokeniser's content model flag to the - RCDATA state. */ - return HTML5::RCDATA; - break; - - /* A start tag whose tag name is one of: "iframe", "noembed", - "noframes" */ - case 'iframe': - case 'noembed': - case 'noframes': - $this->insertElement($token); - - /* Switch the tokeniser's content model flag to the CDATA state. */ - return HTML5::CDATA; - break; - - /* A start tag whose tag name is "select" */ - case 'select': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Change the insertion mode to "in select". */ - $this->mode = self::IN_SELECT; - break; - - /* A start or end tag whose tag name is one of: "caption", "col", - "colgroup", "frame", "frameset", "head", "option", "optgroup", - "tbody", "td", "tfoot", "th", "thead", "tr". */ - case 'caption': - case 'col': - case 'colgroup': - case 'frame': - case 'frameset': - case 'head': - case 'option': - case 'optgroup': - case 'tbody': - case 'td': - case 'tfoot': - case 'th': - case 'thead': - case 'tr': - // Parse error. Ignore the token. - break; - - /* A start or end tag whose tag name is one of: "event-source", - "section", "nav", "article", "aside", "header", "footer", - "datagrid", "command" */ - case 'event-source': - case 'section': - case 'nav': - case 'article': - case 'aside': - case 'header': - case 'footer': - case 'datagrid': - case 'command': - // Work in progress! - break; - - /* A start tag token not covered by the previous entries */ - default: - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - $this->insertElement($token, true, true); - break; - } - break; - - case HTML5::ENDTAG: - switch ($token['name']) { - /* An end tag with the tag name "body" */ - case 'body': - /* If the second element in the stack of open elements is - not a body element, this is a parse error. Ignore the token. - (innerHTML case) */ - if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { - // Ignore. - - /* If the current node is not the body element, then this - is a parse error. */ - } elseif (end($this->stack)->nodeName !== 'body') { - // Parse error. - } - - /* Change the insertion mode to "after body". */ - $this->mode = self::AFTER_BODY; - break; - - /* An end tag with the tag name "html" */ - case 'html': - /* Act as if an end tag with tag name "body" had been seen, - then, if that token wasn't ignored, reprocess the current - token. */ - $this->inBody( - array( - 'name' => 'body', - 'type' => HTML5::ENDTAG - ) - ); - - return $this->afterBody($token); - break; - - /* An end tag whose tag name is one of: "address", "blockquote", - "center", "dir", "div", "dl", "fieldset", "listing", "menu", - "ol", "pre", "ul" */ - case 'address': - case 'blockquote': - case 'center': - case 'dir': - case 'div': - case 'dl': - case 'fieldset': - case 'listing': - case 'menu': - case 'ol': - case 'pre': - case 'ul': - /* If the stack of open elements has an element in scope - with the same tag name as that of the token, then generate - implied end tags. */ - if ($this->elementInScope($token['name'])) { - $this->generateImpliedEndTags(); - - /* Now, if the current node is not an element with - the same tag name as that of the token, then this - is a parse error. */ - // w/e - - /* If the stack of open elements has an element in - scope with the same tag name as that of the token, - then pop elements from this stack until an element - with that tag name has been popped from the stack. */ - for ($n = count($this->stack) - 1; $n >= 0; $n--) { - if ($this->stack[$n]->nodeName === $token['name']) { - $n = -1; - } - - array_pop($this->stack); - } - } - break; - - /* An end tag whose tag name is "form" */ - case 'form': - /* If the stack of open elements has an element in scope - with the same tag name as that of the token, then generate - implied end tags. */ - if ($this->elementInScope($token['name'])) { - $this->generateImpliedEndTags(); - - } - - if (end($this->stack)->nodeName !== $token['name']) { - /* Now, if the current node is not an element with the - same tag name as that of the token, then this is a parse - error. */ - // w/e - - } else { - /* Otherwise, if the current node is an element with - the same tag name as that of the token pop that element - from the stack. */ - array_pop($this->stack); - } - - /* In any case, set the form element pointer to null. */ - $this->form_pointer = null; - break; - - /* An end tag whose tag name is "p" */ - case 'p': - /* If the stack of open elements has a p element in scope, - then generate implied end tags, except for p elements. */ - if ($this->elementInScope('p')) { - $this->generateImpliedEndTags(array('p')); - - /* If the current node is not a p element, then this is - a parse error. */ - // k - - /* If the stack of open elements has a p element in - scope, then pop elements from this stack until the stack - no longer has a p element in scope. */ - for ($n = count($this->stack) - 1; $n >= 0; $n--) { - if ($this->elementInScope('p')) { - array_pop($this->stack); - - } else { - break; - } - } - } - break; - - /* An end tag whose tag name is "dd", "dt", or "li" */ - case 'dd': - case 'dt': - case 'li': - /* If the stack of open elements has an element in scope - whose tag name matches the tag name of the token, then - generate implied end tags, except for elements with the - same tag name as the token. */ - if ($this->elementInScope($token['name'])) { - $this->generateImpliedEndTags(array($token['name'])); - - /* If the current node is not an element with the same - tag name as the token, then this is a parse error. */ - // w/e - - /* If the stack of open elements has an element in scope - whose tag name matches the tag name of the token, then - pop elements from this stack until an element with that - tag name has been popped from the stack. */ - for ($n = count($this->stack) - 1; $n >= 0; $n--) { - if ($this->stack[$n]->nodeName === $token['name']) { - $n = -1; - } - - array_pop($this->stack); - } - } - break; - - /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", - "h5", "h6" */ - case 'h1': - case 'h2': - case 'h3': - case 'h4': - case 'h5': - case 'h6': - $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); - - /* If the stack of open elements has in scope an element whose - tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then - generate implied end tags. */ - if ($this->elementInScope($elements)) { - $this->generateImpliedEndTags(); - - /* Now, if the current node is not an element with the same - tag name as that of the token, then this is a parse error. */ - // w/e - - /* If the stack of open elements has in scope an element - whose tag name is one of "h1", "h2", "h3", "h4", "h5", or - "h6", then pop elements from the stack until an element - with one of those tag names has been popped from the stack. */ - while ($this->elementInScope($elements)) { - array_pop($this->stack); - } - } - break; - - /* An end tag whose tag name is one of: "a", "b", "big", "em", - "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ - case 'a': - case 'b': - case 'big': - case 'em': - case 'font': - case 'i': - case 'nobr': - case 's': - case 'small': - case 'strike': - case 'strong': - case 'tt': - case 'u': - /* 1. Let the formatting element be the last element in - the list of active formatting elements that: - * is between the end of the list and the last scope - marker in the list, if any, or the start of the list - otherwise, and - * has the same tag name as the token. - */ - while (true) { - for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) { - if ($this->a_formatting[$a] === self::MARKER) { - break; - - } elseif ($this->a_formatting[$a]->tagName === $token['name']) { - $formatting_element = $this->a_formatting[$a]; - $in_stack = in_array($formatting_element, $this->stack, true); - $fe_af_pos = $a; - break; - } - } - - /* If there is no such node, or, if that node is - also in the stack of open elements but the element - is not in scope, then this is a parse error. Abort - these steps. The token is ignored. */ - if (!isset($formatting_element) || ($in_stack && - !$this->elementInScope($token['name'])) - ) { - break; - - /* Otherwise, if there is such a node, but that node - is not in the stack of open elements, then this is a - parse error; remove the element from the list, and - abort these steps. */ - } elseif (isset($formatting_element) && !$in_stack) { - unset($this->a_formatting[$fe_af_pos]); - $this->a_formatting = array_merge($this->a_formatting); - break; - } - - /* 2. Let the furthest block be the topmost node in the - stack of open elements that is lower in the stack - than the formatting element, and is not an element in - the phrasing or formatting categories. There might - not be one. */ - $fe_s_pos = array_search($formatting_element, $this->stack, true); - $length = count($this->stack); - - for ($s = $fe_s_pos + 1; $s < $length; $s++) { - $category = $this->getElementCategory($this->stack[$s]->nodeName); - - if ($category !== self::PHRASING && $category !== self::FORMATTING) { - $furthest_block = $this->stack[$s]; - } - } - - /* 3. If there is no furthest block, then the UA must - skip the subsequent steps and instead just pop all - the nodes from the bottom of the stack of open - elements, from the current node up to the formatting - element, and remove the formatting element from the - list of active formatting elements. */ - if (!isset($furthest_block)) { - for ($n = $length - 1; $n >= $fe_s_pos; $n--) { - array_pop($this->stack); - } - - unset($this->a_formatting[$fe_af_pos]); - $this->a_formatting = array_merge($this->a_formatting); - break; - } - - /* 4. Let the common ancestor be the element - immediately above the formatting element in the stack - of open elements. */ - $common_ancestor = $this->stack[$fe_s_pos - 1]; - - /* 5. If the furthest block has a parent node, then - remove the furthest block from its parent node. */ - if ($furthest_block->parentNode !== null) { - $furthest_block->parentNode->removeChild($furthest_block); - } - - /* 6. Let a bookmark note the position of the - formatting element in the list of active formatting - elements relative to the elements on either side - of it in the list. */ - $bookmark = $fe_af_pos; - - /* 7. Let node and last node be the furthest block. - Follow these steps: */ - $node = $furthest_block; - $last_node = $furthest_block; - - while (true) { - for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { - /* 7.1 Let node be the element immediately - prior to node in the stack of open elements. */ - $node = $this->stack[$n]; - - /* 7.2 If node is not in the list of active - formatting elements, then remove node from - the stack of open elements and then go back - to step 1. */ - if (!in_array($node, $this->a_formatting, true)) { - unset($this->stack[$n]); - $this->stack = array_merge($this->stack); - - } else { - break; - } - } - - /* 7.3 Otherwise, if node is the formatting - element, then go to the next step in the overall - algorithm. */ - if ($node === $formatting_element) { - break; - - /* 7.4 Otherwise, if last node is the furthest - block, then move the aforementioned bookmark to - be immediately after the node in the list of - active formatting elements. */ - } elseif ($last_node === $furthest_block) { - $bookmark = array_search($node, $this->a_formatting, true) + 1; - } - - /* 7.5 If node has any children, perform a - shallow clone of node, replace the entry for - node in the list of active formatting elements - with an entry for the clone, replace the entry - for node in the stack of open elements with an - entry for the clone, and let node be the clone. */ - if ($node->hasChildNodes()) { - $clone = $node->cloneNode(); - $s_pos = array_search($node, $this->stack, true); - $a_pos = array_search($node, $this->a_formatting, true); - - $this->stack[$s_pos] = $clone; - $this->a_formatting[$a_pos] = $clone; - $node = $clone; - } - - /* 7.6 Insert last node into node, first removing - it from its previous parent node if any. */ - if ($last_node->parentNode !== null) { - $last_node->parentNode->removeChild($last_node); - } - - $node->appendChild($last_node); - - /* 7.7 Let last node be node. */ - $last_node = $node; - } - - /* 8. Insert whatever last node ended up being in - the previous step into the common ancestor node, - first removing it from its previous parent node if - any. */ - if ($last_node->parentNode !== null) { - $last_node->parentNode->removeChild($last_node); - } - - $common_ancestor->appendChild($last_node); - - /* 9. Perform a shallow clone of the formatting - element. */ - $clone = $formatting_element->cloneNode(); - - /* 10. Take all of the child nodes of the furthest - block and append them to the clone created in the - last step. */ - while ($furthest_block->hasChildNodes()) { - $child = $furthest_block->firstChild; - $furthest_block->removeChild($child); - $clone->appendChild($child); - } - - /* 11. Append that clone to the furthest block. */ - $furthest_block->appendChild($clone); - - /* 12. Remove the formatting element from the list - of active formatting elements, and insert the clone - into the list of active formatting elements at the - position of the aforementioned bookmark. */ - $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); - unset($this->a_formatting[$fe_af_pos]); - $this->a_formatting = array_merge($this->a_formatting); - - $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); - $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting)); - $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); - - /* 13. Remove the formatting element from the stack - of open elements, and insert the clone into the stack - of open elements immediately after (i.e. in a more - deeply nested position than) the position of the - furthest block in that stack. */ - $fe_s_pos = array_search($formatting_element, $this->stack, true); - $fb_s_pos = array_search($furthest_block, $this->stack, true); - unset($this->stack[$fe_s_pos]); - - $s_part1 = array_slice($this->stack, 0, $fb_s_pos); - $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack)); - $this->stack = array_merge($s_part1, array($clone), $s_part2); - - /* 14. Jump back to step 1 in this series of steps. */ - unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); - } - break; - - /* An end tag token whose tag name is one of: "button", - "marquee", "object" */ - case 'button': - case 'marquee': - case 'object': - /* If the stack of open elements has an element in scope whose - tag name matches the tag name of the token, then generate implied - tags. */ - if ($this->elementInScope($token['name'])) { - $this->generateImpliedEndTags(); - - /* Now, if the current node is not an element with the same - tag name as the token, then this is a parse error. */ - // k - - /* Now, if the stack of open elements has an element in scope - whose tag name matches the tag name of the token, then pop - elements from the stack until that element has been popped from - the stack, and clear the list of active formatting elements up - to the last marker. */ - for ($n = count($this->stack) - 1; $n >= 0; $n--) { - if ($this->stack[$n]->nodeName === $token['name']) { - $n = -1; - } - - array_pop($this->stack); - } - - $marker = end(array_keys($this->a_formatting, self::MARKER, true)); - - for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) { - array_pop($this->a_formatting); - } - } - break; - - /* Or an end tag whose tag name is one of: "area", "basefont", - "bgsound", "br", "embed", "hr", "iframe", "image", "img", - "input", "isindex", "noembed", "noframes", "param", "select", - "spacer", "table", "textarea", "wbr" */ - case 'area': - case 'basefont': - case 'bgsound': - case 'br': - case 'embed': - case 'hr': - case 'iframe': - case 'image': - case 'img': - case 'input': - case 'isindex': - case 'noembed': - case 'noframes': - case 'param': - case 'select': - case 'spacer': - case 'table': - case 'textarea': - case 'wbr': - // Parse error. Ignore the token. - break; - - /* An end tag token not covered by the previous entries */ - default: - for ($n = count($this->stack) - 1; $n >= 0; $n--) { - /* Initialise node to be the current node (the bottommost - node of the stack). */ - $node = end($this->stack); - - /* If node has the same tag name as the end tag token, - then: */ - if ($token['name'] === $node->nodeName) { - /* Generate implied end tags. */ - $this->generateImpliedEndTags(); - - /* If the tag name of the end tag token does not - match the tag name of the current node, this is a - parse error. */ - // k - - /* Pop all the nodes from the current node up to - node, including node, then stop this algorithm. */ - for ($x = count($this->stack) - $n; $x >= $n; $x--) { - array_pop($this->stack); - } - - } else { - $category = $this->getElementCategory($node); - - if ($category !== self::SPECIAL && $category !== self::SCOPING) { - /* Otherwise, if node is in neither the formatting - category nor the phrasing category, then this is a - parse error. Stop this algorithm. The end tag token - is ignored. */ - return false; - } - } - } - break; - } - break; - } - } - - private function inTable($token) - { - $clear = array('html', 'table'); - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Append the character to the current node. */ - $text = $this->dom->createTextNode($token['data']); - end($this->stack)->appendChild($text); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - end($this->stack)->appendChild($comment); - - /* A start tag whose tag name is "caption" */ - } elseif ($token['type'] === HTML5::STARTTAG && - $token['name'] === 'caption' - ) { - /* Clear the stack back to a table context. */ - $this->clearStackToTableContext($clear); - - /* Insert a marker at the end of the list of active - formatting elements. */ - $this->a_formatting[] = self::MARKER; - - /* Insert an HTML element for the token, then switch the - insertion mode to "in caption". */ - $this->insertElement($token); - $this->mode = self::IN_CAPTION; - - /* A start tag whose tag name is "colgroup" */ - } elseif ($token['type'] === HTML5::STARTTAG && - $token['name'] === 'colgroup' - ) { - /* Clear the stack back to a table context. */ - $this->clearStackToTableContext($clear); - - /* Insert an HTML element for the token, then switch the - insertion mode to "in column group". */ - $this->insertElement($token); - $this->mode = self::IN_CGROUP; - - /* A start tag whose tag name is "col" */ - } elseif ($token['type'] === HTML5::STARTTAG && - $token['name'] === 'col' - ) { - $this->inTable( - array( - 'name' => 'colgroup', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - $this->inColumnGroup($token); - - /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */ - } elseif ($token['type'] === HTML5::STARTTAG && in_array( - $token['name'], - array('tbody', 'tfoot', 'thead') - ) - ) { - /* Clear the stack back to a table context. */ - $this->clearStackToTableContext($clear); - - /* Insert an HTML element for the token, then switch the insertion - mode to "in table body". */ - $this->insertElement($token); - $this->mode = self::IN_TBODY; - - /* A start tag whose tag name is one of: "td", "th", "tr" */ - } elseif ($token['type'] === HTML5::STARTTAG && - in_array($token['name'], array('td', 'th', 'tr')) - ) { - /* Act as if a start tag token with the tag name "tbody" had been - seen, then reprocess the current token. */ - $this->inTable( - array( - 'name' => 'tbody', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - return $this->inTableBody($token); - - /* A start tag whose tag name is "table" */ - } elseif ($token['type'] === HTML5::STARTTAG && - $token['name'] === 'table' - ) { - /* Parse error. Act as if an end tag token with the tag name "table" - had been seen, then, if that token wasn't ignored, reprocess the - current token. */ - $this->inTable( - array( - 'name' => 'table', - 'type' => HTML5::ENDTAG - ) - ); - - return $this->mainPhase($token); - - /* An end tag whose tag name is "table" */ - } elseif ($token['type'] === HTML5::ENDTAG && - $token['name'] === 'table' - ) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. (innerHTML case) */ - if (!$this->elementInScope($token['name'], true)) { - return false; - - /* Otherwise: */ - } else { - /* Generate implied end tags. */ - $this->generateImpliedEndTags(); - - /* Now, if the current node is not a table element, then this - is a parse error. */ - // w/e - - /* Pop elements from this stack until a table element has been - popped from the stack. */ - while (true) { - $current = end($this->stack)->nodeName; - array_pop($this->stack); - - if ($current === 'table') { - break; - } - } - - /* Reset the insertion mode appropriately. */ - $this->resetInsertionMode(); - } - - /* An end tag whose tag name is one of: "body", "caption", "col", - "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ - } elseif ($token['type'] === HTML5::ENDTAG && in_array( - $token['name'], - array( - 'body', - 'caption', - 'col', - 'colgroup', - 'html', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr' - ) - ) - ) { - // Parse error. Ignore the token. - - /* Anything else */ - } else { - /* Parse error. Process the token as if the insertion mode was "in - body", with the following exception: */ - - /* If the current node is a table, tbody, tfoot, thead, or tr - element, then, whenever a node would be inserted into the current - node, it must instead be inserted into the foster parent element. */ - if (in_array( - end($this->stack)->nodeName, - array('table', 'tbody', 'tfoot', 'thead', 'tr') - ) - ) { - /* The foster parent element is the parent element of the last - table element in the stack of open elements, if there is a - table element and it has such a parent element. If there is no - table element in the stack of open elements (innerHTML case), - then the foster parent element is the first element in the - stack of open elements (the html element). Otherwise, if there - is a table element in the stack of open elements, but the last - table element in the stack of open elements has no parent, or - its parent node is not an element, then the foster parent - element is the element before the last table element in the - stack of open elements. */ - for ($n = count($this->stack) - 1; $n >= 0; $n--) { - if ($this->stack[$n]->nodeName === 'table') { - $table = $this->stack[$n]; - break; - } - } - - if (isset($table) && $table->parentNode !== null) { - $this->foster_parent = $table->parentNode; - - } elseif (!isset($table)) { - $this->foster_parent = $this->stack[0]; - - } elseif (isset($table) && ($table->parentNode === null || - $table->parentNode->nodeType !== XML_ELEMENT_NODE) - ) { - $this->foster_parent = $this->stack[$n - 1]; - } - } - - $this->inBody($token); - } - } - - private function inCaption($token) - { - /* An end tag whose tag name is "caption" */ - if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. (innerHTML case) */ - if (!$this->elementInScope($token['name'], true)) { - // Ignore - - /* Otherwise: */ - } else { - /* Generate implied end tags. */ - $this->generateImpliedEndTags(); - - /* Now, if the current node is not a caption element, then this - is a parse error. */ - // w/e - - /* Pop elements from this stack until a caption element has - been popped from the stack. */ - while (true) { - $node = end($this->stack)->nodeName; - array_pop($this->stack); - - if ($node === 'caption') { - break; - } - } - - /* Clear the list of active formatting elements up to the last - marker. */ - $this->clearTheActiveFormattingElementsUpToTheLastMarker(); - - /* Switch the insertion mode to "in table". */ - $this->mode = self::IN_TABLE; - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag - name is "table" */ - } elseif (($token['type'] === HTML5::STARTTAG && in_array( - $token['name'], - array( - 'caption', - 'col', - 'colgroup', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr' - ) - )) || ($token['type'] === HTML5::ENDTAG && - $token['name'] === 'table') - ) { - /* Parse error. Act as if an end tag with the tag name "caption" - had been seen, then, if that token wasn't ignored, reprocess the - current token. */ - $this->inCaption( - array( - 'name' => 'caption', - 'type' => HTML5::ENDTAG - ) - ); - - return $this->inTable($token); - - /* An end tag whose tag name is one of: "body", "col", "colgroup", - "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ - } elseif ($token['type'] === HTML5::ENDTAG && in_array( - $token['name'], - array( - 'body', - 'col', - 'colgroup', - 'html', - 'tbody', - 'tfoot', - 'th', - 'thead', - 'tr' - ) - ) - ) { - // Parse error. Ignore the token. - - /* Anything else */ - } else { - /* Process the token as if the insertion mode was "in body". */ - $this->inBody($token); - } - } - - private function inColumnGroup($token) - { - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Append the character to the current node. */ - $text = $this->dom->createTextNode($token['data']); - end($this->stack)->appendChild($text); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - end($this->stack)->appendChild($comment); - - /* A start tag whose tag name is "col" */ - } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') { - /* Insert a col element for the token. Immediately pop the current - node off the stack of open elements. */ - $this->insertElement($token); - array_pop($this->stack); - - /* An end tag whose tag name is "colgroup" */ - } elseif ($token['type'] === HTML5::ENDTAG && - $token['name'] === 'colgroup' - ) { - /* If the current node is the root html element, then this is a - parse error, ignore the token. (innerHTML case) */ - if (end($this->stack)->nodeName === 'html') { - // Ignore - - /* Otherwise, pop the current node (which will be a colgroup - element) from the stack of open elements. Switch the insertion - mode to "in table". */ - } else { - array_pop($this->stack); - $this->mode = self::IN_TABLE; - } - - /* An end tag whose tag name is "col" */ - } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') { - /* Parse error. Ignore the token. */ - - /* Anything else */ - } else { - /* Act as if an end tag with the tag name "colgroup" had been seen, - and then, if that token wasn't ignored, reprocess the current token. */ - $this->inColumnGroup( - array( - 'name' => 'colgroup', - 'type' => HTML5::ENDTAG - ) - ); - - return $this->inTable($token); - } - } - - private function inTableBody($token) - { - $clear = array('tbody', 'tfoot', 'thead', 'html'); - - /* A start tag whose tag name is "tr" */ - if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') { - /* Clear the stack back to a table body context. */ - $this->clearStackToTableContext($clear); - - /* Insert a tr element for the token, then switch the insertion - mode to "in row". */ - $this->insertElement($token); - $this->mode = self::IN_ROW; - - /* A start tag whose tag name is one of: "th", "td" */ - } elseif ($token['type'] === HTML5::STARTTAG && - ($token['name'] === 'th' || $token['name'] === 'td') - ) { - /* Parse error. Act as if a start tag with the tag name "tr" had - been seen, then reprocess the current token. */ - $this->inTableBody( - array( - 'name' => 'tr', - 'type' => HTML5::STARTTAG, - 'attr' => array() - ) - ); - - return $this->inRow($token); - - /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ - } elseif ($token['type'] === HTML5::ENDTAG && - in_array($token['name'], array('tbody', 'tfoot', 'thead')) - ) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. */ - if (!$this->elementInScope($token['name'], true)) { - // Ignore - - /* Otherwise: */ - } else { - /* Clear the stack back to a table body context. */ - $this->clearStackToTableContext($clear); - - /* Pop the current node from the stack of open elements. Switch - the insertion mode to "in table". */ - array_pop($this->stack); - $this->mode = self::IN_TABLE; - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */ - } elseif (($token['type'] === HTML5::STARTTAG && in_array( - $token['name'], - array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead') - )) || - ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table') - ) { - /* If the stack of open elements does not have a tbody, thead, or - tfoot element in table scope, this is a parse error. Ignore the - token. (innerHTML case) */ - if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) { - // Ignore. - - /* Otherwise: */ - } else { - /* Clear the stack back to a table body context. */ - $this->clearStackToTableContext($clear); - - /* Act as if an end tag with the same tag name as the current - node ("tbody", "tfoot", or "thead") had been seen, then - reprocess the current token. */ - $this->inTableBody( - array( - 'name' => end($this->stack)->nodeName, - 'type' => HTML5::ENDTAG - ) - ); - - return $this->mainPhase($token); - } - - /* An end tag whose tag name is one of: "body", "caption", "col", - "colgroup", "html", "td", "th", "tr" */ - } elseif ($token['type'] === HTML5::ENDTAG && in_array( - $token['name'], - array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr') - ) - ) { - /* Parse error. Ignore the token. */ - - /* Anything else */ - } else { - /* Process the token as if the insertion mode was "in table". */ - $this->inTable($token); - } - } - - private function inRow($token) - { - $clear = array('tr', 'html'); - - /* A start tag whose tag name is one of: "th", "td" */ - if ($token['type'] === HTML5::STARTTAG && - ($token['name'] === 'th' || $token['name'] === 'td') - ) { - /* Clear the stack back to a table row context. */ - $this->clearStackToTableContext($clear); - - /* Insert an HTML element for the token, then switch the insertion - mode to "in cell". */ - $this->insertElement($token); - $this->mode = self::IN_CELL; - - /* Insert a marker at the end of the list of active formatting - elements. */ - $this->a_formatting[] = self::MARKER; - - /* An end tag whose tag name is "tr" */ - } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. (innerHTML case) */ - if (!$this->elementInScope($token['name'], true)) { - // Ignore. - - /* Otherwise: */ - } else { - /* Clear the stack back to a table row context. */ - $this->clearStackToTableContext($clear); - - /* Pop the current node (which will be a tr element) from the - stack of open elements. Switch the insertion mode to "in table - body". */ - array_pop($this->stack); - $this->mode = self::IN_TBODY; - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */ - } elseif ($token['type'] === HTML5::STARTTAG && in_array( - $token['name'], - array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr') - ) - ) { - /* Act as if an end tag with the tag name "tr" had been seen, then, - if that token wasn't ignored, reprocess the current token. */ - $this->inRow( - array( - 'name' => 'tr', - 'type' => HTML5::ENDTAG - ) - ); - - return $this->inCell($token); - - /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ - } elseif ($token['type'] === HTML5::ENDTAG && - in_array($token['name'], array('tbody', 'tfoot', 'thead')) - ) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. */ - if (!$this->elementInScope($token['name'], true)) { - // Ignore. - - /* Otherwise: */ - } else { - /* Otherwise, act as if an end tag with the tag name "tr" had - been seen, then reprocess the current token. */ - $this->inRow( - array( - 'name' => 'tr', - 'type' => HTML5::ENDTAG - ) - ); - - return $this->inCell($token); - } - - /* An end tag whose tag name is one of: "body", "caption", "col", - "colgroup", "html", "td", "th" */ - } elseif ($token['type'] === HTML5::ENDTAG && in_array( - $token['name'], - array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr') - ) - ) { - /* Parse error. Ignore the token. */ - - /* Anything else */ - } else { - /* Process the token as if the insertion mode was "in table". */ - $this->inTable($token); - } - } - - private function inCell($token) - { - /* An end tag whose tag name is one of: "td", "th" */ - if ($token['type'] === HTML5::ENDTAG && - ($token['name'] === 'td' || $token['name'] === 'th') - ) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as that of the token, then this is a - parse error and the token must be ignored. */ - if (!$this->elementInScope($token['name'], true)) { - // Ignore. - - /* Otherwise: */ - } else { - /* Generate implied end tags, except for elements with the same - tag name as the token. */ - $this->generateImpliedEndTags(array($token['name'])); - - /* Now, if the current node is not an element with the same tag - name as the token, then this is a parse error. */ - // k - - /* Pop elements from this stack until an element with the same - tag name as the token has been popped from the stack. */ - while (true) { - $node = end($this->stack)->nodeName; - array_pop($this->stack); - - if ($node === $token['name']) { - break; - } - } - - /* Clear the list of active formatting elements up to the last - marker. */ - $this->clearTheActiveFormattingElementsUpToTheLastMarker(); - - /* Switch the insertion mode to "in row". (The current node - will be a tr element at this point.) */ - $this->mode = self::IN_ROW; - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "td", "tfoot", "th", "thead", "tr" */ - } elseif ($token['type'] === HTML5::STARTTAG && in_array( - $token['name'], - array( - 'caption', - 'col', - 'colgroup', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr' - ) - ) - ) { - /* If the stack of open elements does not have a td or th element - in table scope, then this is a parse error; ignore the token. - (innerHTML case) */ - if (!$this->elementInScope(array('td', 'th'), true)) { - // Ignore. - - /* Otherwise, close the cell (see below) and reprocess the current - token. */ - } else { - $this->closeCell(); - return $this->inRow($token); - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "td", "tfoot", "th", "thead", "tr" */ - } elseif ($token['type'] === HTML5::STARTTAG && in_array( - $token['name'], - array( - 'caption', - 'col', - 'colgroup', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr' - ) - ) - ) { - /* If the stack of open elements does not have a td or th element - in table scope, then this is a parse error; ignore the token. - (innerHTML case) */ - if (!$this->elementInScope(array('td', 'th'), true)) { - // Ignore. - - /* Otherwise, close the cell (see below) and reprocess the current - token. */ - } else { - $this->closeCell(); - return $this->inRow($token); - } - - /* An end tag whose tag name is one of: "body", "caption", "col", - "colgroup", "html" */ - } elseif ($token['type'] === HTML5::ENDTAG && in_array( - $token['name'], - array('body', 'caption', 'col', 'colgroup', 'html') - ) - ) { - /* Parse error. Ignore the token. */ - - /* An end tag whose tag name is one of: "table", "tbody", "tfoot", - "thead", "tr" */ - } elseif ($token['type'] === HTML5::ENDTAG && in_array( - $token['name'], - array('table', 'tbody', 'tfoot', 'thead', 'tr') - ) - ) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as that of the token (which can only - happen for "tbody", "tfoot" and "thead", or, in the innerHTML case), - then this is a parse error and the token must be ignored. */ - if (!$this->elementInScope($token['name'], true)) { - // Ignore. - - /* Otherwise, close the cell (see below) and reprocess the current - token. */ - } else { - $this->closeCell(); - return $this->inRow($token); - } - - /* Anything else */ - } else { - /* Process the token as if the insertion mode was "in body". */ - $this->inBody($token); - } - } - - private function inSelect($token) - { - /* Handle the token as follows: */ - - /* A character token */ - if ($token['type'] === HTML5::CHARACTR) { - /* Append the token's character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* A start tag token whose tag name is "option" */ - } elseif ($token['type'] === HTML5::STARTTAG && - $token['name'] === 'option' - ) { - /* If the current node is an option element, act as if an end tag - with the tag name "option" had been seen. */ - if (end($this->stack)->nodeName === 'option') { - $this->inSelect( - array( - 'name' => 'option', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* A start tag token whose tag name is "optgroup" */ - } elseif ($token['type'] === HTML5::STARTTAG && - $token['name'] === 'optgroup' - ) { - /* If the current node is an option element, act as if an end tag - with the tag name "option" had been seen. */ - if (end($this->stack)->nodeName === 'option') { - $this->inSelect( - array( - 'name' => 'option', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* If the current node is an optgroup element, act as if an end tag - with the tag name "optgroup" had been seen. */ - if (end($this->stack)->nodeName === 'optgroup') { - $this->inSelect( - array( - 'name' => 'optgroup', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* An end tag token whose tag name is "optgroup" */ - } elseif ($token['type'] === HTML5::ENDTAG && - $token['name'] === 'optgroup' - ) { - /* First, if the current node is an option element, and the node - immediately before it in the stack of open elements is an optgroup - element, then act as if an end tag with the tag name "option" had - been seen. */ - $elements_in_stack = count($this->stack); - - if ($this->stack[$elements_in_stack - 1]->nodeName === 'option' && - $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup' - ) { - $this->inSelect( - array( - 'name' => 'option', - 'type' => HTML5::ENDTAG - ) - ); - } - - /* If the current node is an optgroup element, then pop that node - from the stack of open elements. Otherwise, this is a parse error, - ignore the token. */ - if ($this->stack[$elements_in_stack - 1] === 'optgroup') { - array_pop($this->stack); - } - - /* An end tag token whose tag name is "option" */ - } elseif ($token['type'] === HTML5::ENDTAG && - $token['name'] === 'option' - ) { - /* If the current node is an option element, then pop that node - from the stack of open elements. Otherwise, this is a parse error, - ignore the token. */ - if (end($this->stack)->nodeName === 'option') { - array_pop($this->stack); - } - - /* An end tag whose tag name is "select" */ - } elseif ($token['type'] === HTML5::ENDTAG && - $token['name'] === 'select' - ) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. (innerHTML case) */ - if (!$this->elementInScope($token['name'], true)) { - // w/e - - /* Otherwise: */ - } else { - /* Pop elements from the stack of open elements until a select - element has been popped from the stack. */ - while (true) { - $current = end($this->stack)->nodeName; - array_pop($this->stack); - - if ($current === 'select') { - break; - } - } - - /* Reset the insertion mode appropriately. */ - $this->resetInsertionMode(); - } - - /* A start tag whose tag name is "select" */ - } elseif ($token['name'] === 'select' && - $token['type'] === HTML5::STARTTAG - ) { - /* Parse error. Act as if the token had been an end tag with the - tag name "select" instead. */ - $this->inSelect( - array( - 'name' => 'select', - 'type' => HTML5::ENDTAG - ) - ); - - /* An end tag whose tag name is one of: "caption", "table", "tbody", - "tfoot", "thead", "tr", "td", "th" */ - } elseif (in_array( - $token['name'], - array( - 'caption', - 'table', - 'tbody', - 'tfoot', - 'thead', - 'tr', - 'td', - 'th' - ) - ) && $token['type'] === HTML5::ENDTAG - ) { - /* Parse error. */ - // w/e - - /* If the stack of open elements has an element in table scope with - the same tag name as that of the token, then act as if an end tag - with the tag name "select" had been seen, and reprocess the token. - Otherwise, ignore the token. */ - if ($this->elementInScope($token['name'], true)) { - $this->inSelect( - array( - 'name' => 'select', - 'type' => HTML5::ENDTAG - ) - ); - - $this->mainPhase($token); - } - - /* Anything else */ - } else { - /* Parse error. Ignore the token. */ - } - } - - private function afterBody($token) - { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Process the token as it would be processed if the insertion mode - was "in body". */ - $this->inBody($token); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the first element in the stack of open - elements (the html element), with the data attribute set to the - data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - $this->stack[0]->appendChild($comment); - - /* An end tag with the tag name "html" */ - } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') { - /* If the parser was originally created in order to handle the - setting of an element's innerHTML attribute, this is a parse error; - ignore the token. (The element will be an html element in this - case.) (innerHTML case) */ - - /* Otherwise, switch to the trailing end phase. */ - $this->phase = self::END_PHASE; - - /* Anything else */ - } else { - /* Parse error. Set the insertion mode to "in body" and reprocess - the token. */ - $this->mode = self::IN_BODY; - return $this->inBody($token); - } - } - - private function inFrameset($token) - { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ - if ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* A start tag with the tag name "frameset" */ - } elseif ($token['name'] === 'frameset' && - $token['type'] === HTML5::STARTTAG - ) { - $this->insertElement($token); - - /* An end tag with the tag name "frameset" */ - } elseif ($token['name'] === 'frameset' && - $token['type'] === HTML5::ENDTAG - ) { - /* If the current node is the root html element, then this is a - parse error; ignore the token. (innerHTML case) */ - if (end($this->stack)->nodeName === 'html') { - // Ignore - - } else { - /* Otherwise, pop the current node from the stack of open - elements. */ - array_pop($this->stack); - - /* If the parser was not originally created in order to handle - the setting of an element's innerHTML attribute (innerHTML case), - and the current node is no longer a frameset element, then change - the insertion mode to "after frameset". */ - $this->mode = self::AFTR_FRAME; - } - - /* A start tag with the tag name "frame" */ - } elseif ($token['name'] === 'frame' && - $token['type'] === HTML5::STARTTAG - ) { - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Immediately pop the current node off the stack of open elements. */ - array_pop($this->stack); - - /* A start tag with the tag name "noframes" */ - } elseif ($token['name'] === 'noframes' && - $token['type'] === HTML5::STARTTAG - ) { - /* Process the token as if the insertion mode had been "in body". */ - $this->inBody($token); - - /* Anything else */ - } else { - /* Parse error. Ignore the token. */ - } - } - - private function afterFrameset($token) - { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ - if ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* An end tag with the tag name "html" */ - } elseif ($token['name'] === 'html' && - $token['type'] === HTML5::ENDTAG - ) { - /* Switch to the trailing end phase. */ - $this->phase = self::END_PHASE; - - /* A start tag with the tag name "noframes" */ - } elseif ($token['name'] === 'noframes' && - $token['type'] === HTML5::STARTTAG - ) { - /* Process the token as if the insertion mode had been "in body". */ - $this->inBody($token); - - /* Anything else */ - } else { - /* Parse error. Ignore the token. */ - } - } - - private function trailingEndPhase($token) - { - /* After the main phase, as each token is emitted from the tokenisation - stage, it must be processed as described in this section. */ - - /* A DOCTYPE token */ - if ($token['type'] === HTML5::DOCTYPE) { - // Parse error. Ignore the token. - - /* A comment token */ - } elseif ($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the Document object with the data - attribute set to the data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - $this->dom->appendChild($comment); - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - } elseif ($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) - ) { - /* Process the token as it would be processed in the main phase. */ - $this->mainPhase($token); - - /* A character token that is not one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE. Or a start tag token. Or an end tag token. */ - } elseif (($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || - $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG - ) { - /* Parse error. Switch back to the main phase and reprocess the - token. */ - $this->phase = self::MAIN_PHASE; - return $this->mainPhase($token); - - /* An end-of-file token */ - } elseif ($token['type'] === HTML5::EOF) { - /* OMG DONE!! */ - } - } - - private function insertElement($token, $append = true, $check = false) - { - // Proprietary workaround for libxml2's limitations with tag names - if ($check) { - // Slightly modified HTML5 tag-name modification, - // removing anything that's not an ASCII letter, digit, or hyphen - $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']); - // Remove leading hyphens and numbers - $token['name'] = ltrim($token['name'], '-0..9'); - // In theory, this should ever be needed, but just in case - if ($token['name'] === '') { - $token['name'] = 'span'; - } // arbitrary generic choice - } - - $el = $this->dom->createElement($token['name']); - - foreach ($token['attr'] as $attr) { - if (!$el->hasAttribute($attr['name'])) { - $el->setAttribute($attr['name'], $attr['value']); - } - } - - $this->appendToRealParent($el); - $this->stack[] = $el; - - return $el; - } - - private function insertText($data) - { - $text = $this->dom->createTextNode($data); - $this->appendToRealParent($text); - } - - private function insertComment($data) - { - $comment = $this->dom->createComment($data); - $this->appendToRealParent($comment); - } - - private function appendToRealParent($node) - { - if ($this->foster_parent === null) { - end($this->stack)->appendChild($node); - - } elseif ($this->foster_parent !== null) { - /* If the foster parent element is the parent element of the - last table element in the stack of open elements, then the new - node must be inserted immediately before the last table element - in the stack of open elements in the foster parent element; - otherwise, the new node must be appended to the foster parent - element. */ - for ($n = count($this->stack) - 1; $n >= 0; $n--) { - if ($this->stack[$n]->nodeName === 'table' && - $this->stack[$n]->parentNode !== null - ) { - $table = $this->stack[$n]; - break; - } - } - - if (isset($table) && $this->foster_parent->isSameNode($table->parentNode)) { - $this->foster_parent->insertBefore($node, $table); - } else { - $this->foster_parent->appendChild($node); - } - - $this->foster_parent = null; - } - } - - private function elementInScope($el, $table = false) - { - if (is_array($el)) { - foreach ($el as $element) { - if ($this->elementInScope($element, $table)) { - return true; - } - } - - return false; - } - - $leng = count($this->stack); - - for ($n = 0; $n < $leng; $n++) { - /* 1. Initialise node to be the current node (the bottommost node of - the stack). */ - $node = $this->stack[$leng - 1 - $n]; - - if ($node->tagName === $el) { - /* 2. If node is the target node, terminate in a match state. */ - return true; - - } elseif ($node->tagName === 'table') { - /* 3. Otherwise, if node is a table element, terminate in a failure - state. */ - return false; - - } elseif ($table === true && in_array( - $node->tagName, - array( - 'caption', - 'td', - 'th', - 'button', - 'marquee', - 'object' - ) - ) - ) { - /* 4. Otherwise, if the algorithm is the "has an element in scope" - variant (rather than the "has an element in table scope" variant), - and node is one of the following, terminate in a failure state. */ - return false; - - } elseif ($node === $node->ownerDocument->documentElement) { - /* 5. Otherwise, if node is an html element (root element), terminate - in a failure state. (This can only happen if the node is the topmost - node of the stack of open elements, and prevents the next step from - being invoked if there are no more elements in the stack.) */ - return false; - } - - /* Otherwise, set node to the previous entry in the stack of open - elements and return to step 2. (This will never fail, since the loop - will always terminate in the previous step if the top of the stack - is reached.) */ - } - } - - private function reconstructActiveFormattingElements() - { - /* 1. If there are no entries in the list of active formatting elements, - then there is nothing to reconstruct; stop this algorithm. */ - $formatting_elements = count($this->a_formatting); - - if ($formatting_elements === 0) { - return false; - } - - /* 3. Let entry be the last (most recently added) element in the list - of active formatting elements. */ - $entry = end($this->a_formatting); - - /* 2. If the last (most recently added) entry in the list of active - formatting elements is a marker, or if it is an element that is in the - stack of open elements, then there is nothing to reconstruct; stop this - algorithm. */ - if ($entry === self::MARKER || in_array($entry, $this->stack, true)) { - return false; - } - - for ($a = $formatting_elements - 1; $a >= 0; true) { - /* 4. If there are no entries before entry in the list of active - formatting elements, then jump to step 8. */ - if ($a === 0) { - $step_seven = false; - break; - } - - /* 5. Let entry be the entry one earlier than entry in the list of - active formatting elements. */ - $a--; - $entry = $this->a_formatting[$a]; - - /* 6. If entry is neither a marker nor an element that is also in - thetack of open elements, go to step 4. */ - if ($entry === self::MARKER || in_array($entry, $this->stack, true)) { - break; - } - } - - while (true) { - /* 7. Let entry be the element one later than entry in the list of - active formatting elements. */ - if (isset($step_seven) && $step_seven === true) { - $a++; - $entry = $this->a_formatting[$a]; - } - - /* 8. Perform a shallow clone of the element entry to obtain clone. */ - $clone = $entry->cloneNode(); - - /* 9. Append clone to the current node and push it onto the stack - of open elements so that it is the new current node. */ - end($this->stack)->appendChild($clone); - $this->stack[] = $clone; - - /* 10. Replace the entry for entry in the list with an entry for - clone. */ - $this->a_formatting[$a] = $clone; - - /* 11. If the entry for clone in the list of active formatting - elements is not the last entry in the list, return to step 7. */ - if (end($this->a_formatting) !== $clone) { - $step_seven = true; - } else { - break; - } - } - } - - private function clearTheActiveFormattingElementsUpToTheLastMarker() - { - /* When the steps below require the UA to clear the list of active - formatting elements up to the last marker, the UA must perform the - following steps: */ - - while (true) { - /* 1. Let entry be the last (most recently added) entry in the list - of active formatting elements. */ - $entry = end($this->a_formatting); - - /* 2. Remove entry from the list of active formatting elements. */ - array_pop($this->a_formatting); - - /* 3. If entry was a marker, then stop the algorithm at this point. - The list has been cleared up to the last marker. */ - if ($entry === self::MARKER) { - break; - } - } - } - - private function generateImpliedEndTags($exclude = array()) - { - /* When the steps below require the UA to generate implied end tags, - then, if the current node is a dd element, a dt element, an li element, - a p element, a td element, a th element, or a tr element, the UA must - act as if an end tag with the respective tag name had been seen and - then generate implied end tags again. */ - $node = end($this->stack); - $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude); - - while (in_array(end($this->stack)->nodeName, $elements)) { - array_pop($this->stack); - } - } - - private function getElementCategory($node) - { - $name = $node->tagName; - if (in_array($name, $this->special)) { - return self::SPECIAL; - } elseif (in_array($name, $this->scoping)) { - return self::SCOPING; - } elseif (in_array($name, $this->formatting)) { - return self::FORMATTING; - } else { - return self::PHRASING; - } - } - - private function clearStackToTableContext($elements) - { - /* When the steps above require the UA to clear the stack back to a - table context, it means that the UA must, while the current node is not - a table element or an html element, pop elements from the stack of open - elements. If this causes any elements to be popped from the stack, then - this is a parse error. */ - while (true) { - $node = end($this->stack)->nodeName; - - if (in_array($node, $elements)) { - break; - } else { - array_pop($this->stack); - } - } - } - - private function resetInsertionMode() - { - /* 1. Let last be false. */ - $last = false; - $leng = count($this->stack); - - for ($n = $leng - 1; $n >= 0; $n--) { - /* 2. Let node be the last node in the stack of open elements. */ - $node = $this->stack[$n]; - - /* 3. If node is the first node in the stack of open elements, then - set last to true. If the element whose innerHTML attribute is being - set is neither a td element nor a th element, then set node to the - element whose innerHTML attribute is being set. (innerHTML case) */ - if ($this->stack[0]->isSameNode($node)) { - $last = true; - } - - /* 4. If node is a select element, then switch the insertion mode to - "in select" and abort these steps. (innerHTML case) */ - if ($node->nodeName === 'select') { - $this->mode = self::IN_SELECT; - break; - - /* 5. If node is a td or th element, then switch the insertion mode - to "in cell" and abort these steps. */ - } elseif ($node->nodeName === 'td' || $node->nodeName === 'th') { - $this->mode = self::IN_CELL; - break; - - /* 6. If node is a tr element, then switch the insertion mode to - "in row" and abort these steps. */ - } elseif ($node->nodeName === 'tr') { - $this->mode = self::IN_ROW; - break; - - /* 7. If node is a tbody, thead, or tfoot element, then switch the - insertion mode to "in table body" and abort these steps. */ - } elseif (in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) { - $this->mode = self::IN_TBODY; - break; - - /* 8. If node is a caption element, then switch the insertion mode - to "in caption" and abort these steps. */ - } elseif ($node->nodeName === 'caption') { - $this->mode = self::IN_CAPTION; - break; - - /* 9. If node is a colgroup element, then switch the insertion mode - to "in column group" and abort these steps. (innerHTML case) */ - } elseif ($node->nodeName === 'colgroup') { - $this->mode = self::IN_CGROUP; - break; - - /* 10. If node is a table element, then switch the insertion mode - to "in table" and abort these steps. */ - } elseif ($node->nodeName === 'table') { - $this->mode = self::IN_TABLE; - break; - - /* 11. If node is a head element, then switch the insertion mode - to "in body" ("in body"! not "in head"!) and abort these steps. - (innerHTML case) */ - } elseif ($node->nodeName === 'head') { - $this->mode = self::IN_BODY; - break; - - /* 12. If node is a body element, then switch the insertion mode to - "in body" and abort these steps. */ - } elseif ($node->nodeName === 'body') { - $this->mode = self::IN_BODY; - break; - - /* 13. If node is a frameset element, then switch the insertion - mode to "in frameset" and abort these steps. (innerHTML case) */ - } elseif ($node->nodeName === 'frameset') { - $this->mode = self::IN_FRAME; - break; - - /* 14. If node is an html element, then: if the head element - pointer is null, switch the insertion mode to "before head", - otherwise, switch the insertion mode to "after head". In either - case, abort these steps. (innerHTML case) */ - } elseif ($node->nodeName === 'html') { - $this->mode = ($this->head_pointer === null) - ? self::BEFOR_HEAD - : self::AFTER_HEAD; - - break; - - /* 15. If last is true, then set the insertion mode to "in body" - and abort these steps. (innerHTML case) */ - } elseif ($last) { - $this->mode = self::IN_BODY; - break; - } - } - } - - private function closeCell() - { - /* If the stack of open elements has a td or th element in table scope, - then act as if an end tag token with that tag name had been seen. */ - foreach (array('td', 'th') as $cell) { - if ($this->elementInScope($cell, true)) { - $this->inCell( - array( - 'name' => $cell, - 'type' => HTML5::ENDTAG - ) - ); - - break; - } - } - } - - public function save() - { - return $this->dom; - } -} |