From 6348e70daa113e8b3203de8fbc919d08c90d972e Mon Sep 17 00:00:00 2001 From: Mike Macgirvin Date: Thu, 1 Jul 2010 16:48:07 -0700 Subject: Initial checkin --- library/HTML5/Tokenizer.php | 2307 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2307 insertions(+) create mode 100644 library/HTML5/Tokenizer.php (limited to 'library/HTML5/Tokenizer.php') diff --git a/library/HTML5/Tokenizer.php b/library/HTML5/Tokenizer.php new file mode 100644 index 000000000..06c73065f --- /dev/null +++ b/library/HTML5/Tokenizer.php @@ -0,0 +1,2307 @@ + +Copyright 2008 Edward Z. Yang +Copyright 2009 Geoffrey Sneddon + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*/ + +// Some conventions: +// /* */ indicates verbatim text from the HTML 5 specification +// // indicates regular comments + +// all flags are in hyphenated form + +class HTML5_Tokenizer { + /** + * Points to an InputStream object. + */ + protected $stream; + + /** + * Tree builder that the tokenizer emits token to. + */ + private $tree; + + /** + * Current content model we are parsing as. + */ + protected $content_model; + + /** + * Current token that is being built, but not yet emitted. Also + * is the last token emitted, if applicable. + */ + protected $token; + + // These are constants describing the content model + const PCDATA = 0; + const RCDATA = 1; + const CDATA = 2; + const PLAINTEXT = 3; + + // These are constants describing tokens + // XXX should probably be moved somewhere else, probably the + // HTML5 class. + const DOCTYPE = 0; + const STARTTAG = 1; + const ENDTAG = 2; + const COMMENT = 3; + const CHARACTER = 4; + const SPACECHARACTER = 5; + const EOF = 6; + const PARSEERROR = 7; + + // These are constants representing bunches of characters. + const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; + const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; + const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz'; + const DIGIT = '0123456789'; + const HEX = '0123456789ABCDEFabcdef'; + const WHITESPACE = "\t\n\x0c "; + + /** + * @param $data Data to parse + */ + public function __construct($data, $builder = null) { + $this->stream = new HTML5_InputStream($data); + if (!$builder) $this->tree = new HTML5_TreeBuilder; + $this->content_model = self::PCDATA; + } + + public function parseFragment($context = null) { + $this->tree->setupContext($context); + if ($this->tree->content_model) { + $this->content_model = $this->tree->content_model; + $this->tree->content_model = null; + } + $this->parse(); + } + + // XXX maybe convert this into an iterator? regardless, this function + // and the save function should go into a Parser facade of some sort + /** + * Performs the actual parsing of the document. + */ + public function parse() { + // Current state + $state = 'data'; + // This is used to avoid having to have look-behind in the data state. + $lastFourChars = ''; + /** + * Escape flag as specified by the HTML5 specification: "used to + * control the behavior of the tokeniser. It is either true or + * false, and initially must be set to the false state." + */ + $escape = false; + //echo "\n\n"; + while($state !== null) { + + /*echo $state . ' '; + switch ($this->content_model) { + case self::PCDATA: echo 'PCDATA'; break; + case self::RCDATA: echo 'RCDATA'; break; + case self::CDATA: echo 'CDATA'; break; + case self::PLAINTEXT: echo 'PLAINTEXT'; break; + } + if ($escape) echo " escape"; + echo "\n";*/ + + switch($state) { + case 'data': + + /* Consume the next input character */ + $char = $this->stream->char(); + $lastFourChars .= $char; + if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4); + + // see below for meaning + $hyp_cond = + !$escape && + ( + $this->content_model === self::RCDATA || + $this->content_model === self::CDATA + ); + $amp_cond = + !$escape && + ( + $this->content_model === self::PCDATA || + $this->content_model === self::RCDATA + ); + $lt_cond = + $this->content_model === self::PCDATA || + ( + ( + $this->content_model === self::RCDATA || + $this->content_model === self::CDATA + ) && + !$escape + ); + $gt_cond = + $escape && + ( + $this->content_model === self::RCDATA || + $this->content_model === self::CDATA + ); + + if($char === '&' && $amp_cond) { + /* U+0026 AMPERSAND (&) + When the content model flag is set to one of the PCDATA or RCDATA + states and the escape flag is false: switch to the + character reference data state. Otherwise: treat it as per + the "anything else" entry below. */ + $state = 'characterReferenceData'; + + } elseif( + $char === '-' && + $hyp_cond && + $lastFourChars === '' + ) { + /* If the content model flag is set to either the RCDATA state or + the CDATA state, and the escape flag is true, and the last three + characters in the input stream including this one are U+002D + HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"), + set the escape flag to false. */ + $escape = false; + + /* In any case, emit the input character as a character token. + Stay in the data state. */ + $this->emitToken(array( + 'type' => self::CHARACTER, + 'data' => '>' + )); + // We do the "any case" part as part of "anything else". + + } elseif($char === false) { + /* EOF + Emit an end-of-file token. */ + $state = null; + $this->tree->emitToken(array( + 'type' => self::EOF + )); + + } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + // Directly after emitting a token you switch back to the "data + // state". At that point spaceCharacters are important so they are + // emitted separately. + $chars = $this->stream->charsWhile(self::WHITESPACE); + $this->emitToken(array( + 'type' => self::SPACECHARACTER, + 'data' => $char . $chars + )); + $lastFourChars .= $chars; + if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4); + + } else { + /* Anything else + THIS IS AN OPTIMIZATION: Get as many character that + otherwise would also be treated as a character token and emit it + as a single character token. Stay in the data state. */ + + $mask = ''; + if ($hyp_cond) $mask .= '-'; + if ($amp_cond) $mask .= '&'; + if ($lt_cond) $mask .= '<'; + if ($gt_cond) $mask .= '>'; + + if ($mask === '') { + $chars = $this->stream->remainingChars(); + } else { + $chars = $this->stream->charsUntil($mask); + } + + $this->emitToken(array( + 'type' => self::CHARACTER, + 'data' => $char . $chars + )); + + $lastFourChars .= $chars; + if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4); + + $state = 'data'; + } + break; + + case 'characterReferenceData': + /* (This cannot happen if the content model flag + is set to the CDATA state.) */ + + /* Attempt to consume a character reference, with no + additional allowed character. */ + $entity = $this->consumeCharacterReference(); + + /* If nothing is returned, emit a U+0026 AMPERSAND + character token. Otherwise, emit the character token that + was returned. */ + // This is all done when consuming the character reference. + $this->emitToken(array( + 'type' => self::CHARACTER, + 'data' => $entity + )); + + /* Finally, switch to the data state. */ + $state = 'data'; + break; + + case 'tagOpen': + $char = $this->stream->char(); + + switch($this->content_model) { + case self::RCDATA: + case self::CDATA: + /* Consume the next input character. If it is a + U+002F SOLIDUS (/) character, switch to the close + tag open state. Otherwise, emit a U+003C LESS-THAN + SIGN character token and reconsume the current input + character in the data state. */ + // We consumed above. + + if($char === '/') { + $state = 'closeTagOpen'; + + } else { + $this->emitToken(array( + 'type' => self::CHARACTER, + 'data' => '<' + )); + + $this->stream->unget(); + + $state = 'data'; + } + break; + + case self::PCDATA: + /* If the content model flag is set to the PCDATA state + Consume the next input character: */ + // We consumed above. + + if($char === '!') { + /* U+0021 EXCLAMATION MARK (!) + Switch to the markup declaration open state. */ + $state = 'markupDeclarationOpen'; + + } elseif($char === '/') { + /* U+002F SOLIDUS (/) + Switch to the close tag open state. */ + $state = 'closeTagOpen'; + + } elseif('A' <= $char && $char <= 'Z') { + /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z + Create a new start tag token, set its tag name to the lowercase + version of the input character (add 0x0020 to the character's code + point), then switch to the tag name state. (Don't emit the token + yet; further details will be filled in before it is emitted.) */ + $this->token = array( + 'name' => strtolower($char), + 'type' => self::STARTTAG, + 'attr' => array() + ); + + $state = 'tagName'; + + } elseif('a' <= $char && $char <= 'z') { + /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z + Create a new start tag token, set its tag name to the input + character, then switch to the tag name state. (Don't emit + the token yet; further details will be filled in before it + is emitted.) */ + $this->token = array( + 'name' => $char, + 'type' => self::STARTTAG, + 'attr' => array() + ); + + $state = 'tagName'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Emit a U+003C LESS-THAN SIGN character token and a + U+003E GREATER-THAN SIGN character token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-tag-name-but-got-right-bracket' + )); + $this->emitToken(array( + 'type' => self::CHARACTER, + 'data' => '<>' + )); + + $state = 'data'; + + } elseif($char === '?') { + /* U+003F QUESTION MARK (?) + Parse error. Switch to the bogus comment state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-tag-name-but-got-question-mark' + )); + $this->token = array( + 'data' => '?', + 'type' => self::COMMENT + ); + $state = 'bogusComment'; + + } else { + /* Anything else + Parse error. Emit a U+003C LESS-THAN SIGN character token and + reconsume the current input character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-tag-name' + )); + $this->emitToken(array( + 'type' => self::CHARACTER, + 'data' => '<' + )); + + $state = 'data'; + $this->stream->unget(); + } + break; + } + break; + + case 'closeTagOpen': + if ( + $this->content_model === self::RCDATA || + $this->content_model === self::CDATA + ) { + /* If the content model flag is set to the RCDATA or CDATA + states... */ + $name = strtolower($this->stream->charsWhile(self::ALPHA)); + $following = $this->stream->char(); + $this->stream->unget(); + if ( + !$this->token || + $this->token['name'] !== $name || + $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false)) + ) { + /* if no start tag token has ever been emitted by this instance + of the tokenizer (fragment case), or, if the next few + characters do not match the tag name of the last start tag + token emitted (compared in an ASCII case-insensitive manner), + or if they do but they are not immediately followed by one of + the following characters: + + * U+0009 CHARACTER TABULATION + * U+000A LINE FEED (LF) + * U+000C FORM FEED (FF) + * U+0020 SPACE + * U+003E GREATER-THAN SIGN (>) + * U+002F SOLIDUS (/) + * EOF + + ...then emit a U+003C LESS-THAN SIGN character token, a + U+002F SOLIDUS character token, and switch to the data + state to process the next input character. */ + // XXX: Probably ought to replace in_array with $following === x ||... + + // We also need to emit $name now we've consumed that, as we + // know it'll just be emitted as a character token. + $this->emitToken(array( + 'type' => self::CHARACTER, + 'data' => 'token = array( + 'name' => $name, + 'type' => self::ENDTAG + ); + + // Change to tag name state. + $state = 'tagName'; + } + } elseif ($this->content_model === self::PCDATA) { + /* Otherwise, if the content model flag is set to the PCDATA + state [...]: */ + $char = $this->stream->char(); + + if ('A' <= $char && $char <= 'Z') { + /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z + Create a new end tag token, set its tag name to the lowercase version + of the input character (add 0x0020 to the character's code point), then + switch to the tag name state. (Don't emit the token yet; further details + will be filled in before it is emitted.) */ + $this->token = array( + 'name' => strtolower($char), + 'type' => self::ENDTAG + ); + + $state = 'tagName'; + + } elseif ('a' <= $char && $char <= 'z') { + /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z + Create a new end tag token, set its tag name to the + input character, then switch to the tag name state. + (Don't emit the token yet; further details will be + filled in before it is emitted.) */ + $this->token = array( + 'name' => $char, + 'type' => self::ENDTAG + ); + + $state = 'tagName'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-closing-tag-but-got-right-bracket' + )); + $state = 'data'; + + } elseif($char === false) { + /* EOF + Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F + SOLIDUS character token. Reconsume the EOF character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-closing-tag-but-got-eof' + )); + $this->emitToken(array( + 'type' => self::CHARACTER, + 'data' => 'stream->unget(); + $state = 'data'; + + } else { + /* Parse error. Switch to the bogus comment state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-closing-tag-but-got-char' + )); + $this->token = array( + 'data' => $char, + 'type' => self::COMMENT + ); + $state = 'bogusComment'; + } + } + break; + + case 'tagName': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Switch to the before attribute name state. */ + $state = 'beforeAttributeName'; + + } elseif($char === '/') { + /* U+002F SOLIDUS (/) + Switch to the self-closing start tag state. */ + $state = 'selfClosingStartTag'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif('A' <= $char && $char <= 'Z') { + /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z + Append the lowercase version of the current input + character (add 0x0020 to the character's code point) to + the current tag token's tag name. Stay in the tag name state. */ + $chars = $this->stream->charsWhile(self::UPPER_ALPHA); + + $this->token['name'] .= strtolower($char . $chars); + $state = 'tagName'; + + } elseif($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-tag-name' + )); + $this->emitToken($this->token); + + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Append the current input character to the current tag token's tag name. + Stay in the tag name state. */ + $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA); + + $this->token['name'] .= $char . $chars; + $state = 'tagName'; + } + break; + + case 'beforeAttributeName': + /* Consume the next input character: */ + $char = $this->stream->char(); + + // this conditional is optimized, check bottom + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the before attribute name state. */ + $state = 'beforeAttributeName'; + + } elseif($char === '/') { + /* U+002F SOLIDUS (/) + Switch to the self-closing start tag state. */ + $state = 'selfClosingStartTag'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif('A' <= $char && $char <= 'Z') { + /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z + Start a new attribute in the current tag token. Set that + attribute's name to the lowercase version of the current + input character (add 0x0020 to the character's code + point), and its value to the empty string. Switch to the + attribute name state.*/ + $this->token['attr'][] = array( + 'name' => strtolower($char), + 'value' => '' + ); + + $state = 'attributeName'; + + } elseif($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-attribute-name-but-got-eof' + )); + $this->emitToken($this->token); + + $this->stream->unget(); + $state = 'data'; + + } else { + /* U+0022 QUOTATION MARK (") + U+0027 APOSTROPHE (') + U+003D EQUALS SIGN (=) + Parse error. Treat it as per the "anything else" entry + below. */ + if($char === '"' || $char === "'" || $char === '=') { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'invalid-character-in-attribute-name' + )); + } + + /* Anything else + Start a new attribute in the current tag token. Set that attribute's + name to the current input character, and its value to the empty string. + Switch to the attribute name state. */ + $this->token['attr'][] = array( + 'name' => $char, + 'value' => '' + ); + + $state = 'attributeName'; + } + break; + + case 'attributeName': + // Consume the next input character: + $char = $this->stream->char(); + + // this conditional is optimized, check bottom + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Switch to the after attribute name state. */ + $state = 'afterAttributeName'; + + } elseif($char === '/') { + /* U+002F SOLIDUS (/) + Switch to the self-closing start tag state. */ + $state = 'selfClosingStartTag'; + + } elseif($char === '=') { + /* U+003D EQUALS SIGN (=) + Switch to the before attribute value state. */ + $state = 'beforeAttributeValue'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif('A' <= $char && $char <= 'Z') { + /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z + Append the lowercase version of the current input + character (add 0x0020 to the character's code point) to + the current attribute's name. Stay in the attribute name + state. */ + $chars = $this->stream->charsWhile(self::UPPER_ALPHA); + + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['name'] .= strtolower($char . $chars); + + $state = 'attributeName'; + + } elseif($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-attribute-name' + )); + $this->emitToken($this->token); + + $this->stream->unget(); + $state = 'data'; + + } else { + /* U+0022 QUOTATION MARK (") + U+0027 APOSTROPHE (') + Parse error. Treat it as per the "anything else" + entry below. */ + if($char === '"' || $char === "'") { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'invalid-character-in-attribute-name' + )); + } + + /* Anything else + Append the current input character to the current attribute's name. + Stay in the attribute name state. */ + $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA); + + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['name'] .= $char . $chars; + + $state = 'attributeName'; + } + + /* When the user agent leaves the attribute name state + (and before emitting the tag token, if appropriate), the + complete attribute's name must be compared to the other + attributes on the same token; if there is already an + attribute on the token with the exact same name, then this + is a parse error and the new attribute must be dropped, along + with the value that gets associated with it (if any). */ + // this might be implemented in the emitToken method + break; + + case 'afterAttributeName': + // Consume the next input character: + $char = $this->stream->char(); + + // this is an optimized conditional, check the bottom + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the after attribute name state. */ + $state = 'afterAttributeName'; + + } elseif($char === '/') { + /* U+002F SOLIDUS (/) + Switch to the self-closing start tag state. */ + $state = 'selfClosingStartTag'; + + } elseif($char === '=') { + /* U+003D EQUALS SIGN (=) + Switch to the before attribute value state. */ + $state = 'beforeAttributeValue'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif('A' <= $char && $char <= 'Z') { + /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z + Start a new attribute in the current tag token. Set that + attribute's name to the lowercase version of the current + input character (add 0x0020 to the character's code + point), and its value to the empty string. Switch to the + attribute name state. */ + $this->token['attr'][] = array( + 'name' => strtolower($char), + 'value' => '' + ); + + $state = 'attributeName'; + + } elseif($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-end-of-tag-but-got-eof' + )); + $this->emitToken($this->token); + + $this->stream->unget(); + $state = 'data'; + + } else { + /* U+0022 QUOTATION MARK (") + U+0027 APOSTROPHE (') + Parse error. Treat it as per the "anything else" + entry below. */ + if($char === '"' || $char === "'") { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'invalid-character-after-attribute-name' + )); + } + + /* Anything else + Start a new attribute in the current tag token. Set that attribute's + name to the current input character, and its value to the empty string. + Switch to the attribute name state. */ + $this->token['attr'][] = array( + 'name' => $char, + 'value' => '' + ); + + $state = 'attributeName'; + } + break; + + case 'beforeAttributeValue': + // Consume the next input character: + $char = $this->stream->char(); + + // this is an optimized conditional + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the before attribute value state. */ + $state = 'beforeAttributeValue'; + + } elseif($char === '"') { + /* U+0022 QUOTATION MARK (") + Switch to the attribute value (double-quoted) state. */ + $state = 'attributeValueDoubleQuoted'; + + } elseif($char === '&') { + /* U+0026 AMPERSAND (&) + Switch to the attribute value (unquoted) state and reconsume + this input character. */ + $this->stream->unget(); + $state = 'attributeValueUnquoted'; + + } elseif($char === '\'') { + /* U+0027 APOSTROPHE (') + Switch to the attribute value (single-quoted) state. */ + $state = 'attributeValueSingleQuoted'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Emit the current tag token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-attribute-value-but-got-right-bracket' + )); + $this->emitToken($this->token); + $state = 'data'; + + } elseif($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume + the character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-attribute-value-but-got-eof' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* U+003D EQUALS SIGN (=) + Parse error. Treat it as per the "anything else" entry below. */ + if($char === '=') { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'equals-in-unquoted-attribute-value' + )); + } + + /* Anything else + Append the current input character to the current attribute's value. + Switch to the attribute value (unquoted) state. */ + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char; + + $state = 'attributeValueUnquoted'; + } + break; + + case 'attributeValueDoubleQuoted': + // Consume the next input character: + $char = $this->stream->char(); + + if($char === '"') { + /* U+0022 QUOTATION MARK (") + Switch to the after attribute value (quoted) state. */ + $state = 'afterAttributeValueQuoted'; + + } elseif($char === '&') { + /* U+0026 AMPERSAND (&) + Switch to the character reference in attribute value + state, with the additional allowed character + being U+0022 QUOTATION MARK ("). */ + $this->characterReferenceInAttributeValue('"'); + + } elseif($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume the character + in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-attribute-value-double-quote' + )); + $this->emitToken($this->token); + + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Append the current input character to the current attribute's value. + Stay in the attribute value (double-quoted) state. */ + $chars = $this->stream->charsUntil('"&'); + + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char . $chars; + + $state = 'attributeValueDoubleQuoted'; + } + break; + + case 'attributeValueSingleQuoted': + // Consume the next input character: + $char = $this->stream->char(); + + if($char === "'") { + /* U+0022 QUOTATION MARK (') + Switch to the after attribute value state. */ + $state = 'afterAttributeValueQuoted'; + + } elseif($char === '&') { + /* U+0026 AMPERSAND (&) + Switch to the entity in attribute value state. */ + $this->characterReferenceInAttributeValue("'"); + + } elseif($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume the character + in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-attribute-value-single-quote' + )); + $this->emitToken($this->token); + + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Append the current input character to the current attribute's value. + Stay in the attribute value (single-quoted) state. */ + $chars = $this->stream->charsUntil("'&"); + + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char . $chars; + + $state = 'attributeValueSingleQuoted'; + } + break; + + case 'attributeValueUnquoted': + // Consume the next input character: + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Switch to the before attribute name state. */ + $state = 'beforeAttributeName'; + + } elseif($char === '&') { + /* U+0026 AMPERSAND (&) + Switch to the entity in attribute value state. */ + $this->characterReferenceInAttributeValue(); + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif ($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume + the character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-attribute-value-no-quotes' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* U+0022 QUOTATION MARK (") + U+0027 APOSTROPHE (') + U+003D EQUALS SIGN (=) + Parse error. Treat it as per the "anything else" + entry below. */ + if($char === '"' || $char === "'" || $char === '=') { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-character-in-unquoted-attribute-value' + )); + } + + /* Anything else + Append the current input character to the current attribute's value. + Stay in the attribute value (unquoted) state. */ + $chars = $this->stream->charsUntil("\t\n\x0c &>\"'="); + + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char . $chars; + + $state = 'attributeValueUnquoted'; + } + break; + + case 'afterAttributeValueQuoted': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Switch to the before attribute name state. */ + $state = 'beforeAttributeName'; + + } elseif ($char === '/') { + /* U+002F SOLIDUS (/) + Switch to the self-closing start tag state. */ + $state = 'selfClosingStartTag'; + + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif ($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-EOF-after-attribute-value' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Parse error. Reconsume the character in the before attribute + name state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-character-after-attribute-value' + )); + $this->stream->unget(); + $state = 'beforeAttributeName'; + } + break; + + case 'selfClosingStartTag': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Set the self-closing flag of the current tag token. + Emit the current tag token. Switch to the data state. */ + // not sure if this is the name we want + $this->token['self-closing'] = true; + /* When an end tag token is emitted with its self-closing flag set, + that is a parse error. */ + if ($this->token['type'] === self::ENDTAG) { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'self-closing-end-tag' + )); + } + $this->emitToken($this->token); + $state = 'data'; + + } elseif ($char === false) { + /* EOF + Parse error. Emit the current tag token. Reconsume the + EOF character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-eof-after-self-closing' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Parse error. Reconsume the character in the before attribute name state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-character-after-self-closing' + )); + $this->stream->unget(); + $state = 'beforeAttributeName'; + } + break; + + case 'bogusComment': + /* (This can only happen if the content model flag is set to the PCDATA state.) */ + /* Consume every character up to the first U+003E GREATER-THAN SIGN + character (>) or the end of the file (EOF), whichever comes first. Emit + a comment token whose data is the concatenation of all the characters + starting from and including the character that caused the state machine + to switch into the bogus comment state, up to and including the last + consumed character before the U+003E character, if any, or up to the + end of the file otherwise. (If the comment was started by the end of + the file (EOF), the token is empty.) */ + $this->token['data'] .= (string) $this->stream->charsUntil('>'); + $this->stream->char(); + + $this->emitToken($this->token); + + /* Switch to the data state. */ + $state = 'data'; + break; + + case 'markupDeclarationOpen': + // Consume for below + $hyphens = $this->stream->charsWhile('-', 2); + if ($hyphens === '-') { + $this->stream->unget(); + } + if ($hyphens !== '--') { + $alpha = $this->stream->charsWhile(self::ALPHA, 7); + } + + /* If the next two characters are both U+002D HYPHEN-MINUS (-) + characters, consume those two characters, create a comment token whose + data is the empty string, and switch to the comment state. */ + if($hyphens === '--') { + $state = 'commentStart'; + $this->token = array( + 'data' => '', + 'type' => self::COMMENT + ); + + /* Otherwise if the next seven characters are a case-insensitive match + for the word "DOCTYPE", then consume those characters and switch to the + DOCTYPE state. */ + } elseif(strtoupper($alpha) === 'DOCTYPE') { + $state = 'doctype'; + + // XXX not implemented + /* Otherwise, if the insertion mode is "in foreign content" + and the current node is not an element in the HTML namespace + and the next seven characters are an ASCII case-sensitive + match for the string "[CDATA[" (the five uppercase letters + "CDATA" with a U+005B LEFT SQUARE BRACKET character before + and after), then consume those characters and switch to the + CDATA section state (which is unrelated to the content model + flag's CDATA state). */ + + /* Otherwise, is is a parse error. Switch to the bogus comment state. + The next character that is consumed, if any, is the first character + that will be in the comment. */ + } else { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-dashes-or-doctype' + )); + $this->token = array( + 'data' => (string) $alpha, + 'type' => self::COMMENT + ); + $state = 'bogusComment'; + } + break; + + case 'commentStart': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if ($char === '-') { + /* U+002D HYPHEN-MINUS (-) + Switch to the comment start dash state. */ + $state = 'commentStartDash'; + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Emit the comment token. Switch to the + data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'incorrect-comment' + )); + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* EOF + Parse error. Emit the comment token. Reconsume the + EOF character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-comment' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Anything else + Append the input character to the comment token's + data. Switch to the comment state. */ + $this->token['data'] .= $char; + $state = 'comment'; + } + break; + + case 'commentStartDash': + /* Consume the next input character: */ + $char = $this->stream->char(); + if ($char === '-') { + /* U+002D HYPHEN-MINUS (-) + Switch to the comment end state */ + $state = 'commentEnd'; + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Emit the comment token. Switch to the + data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'incorrect-comment' + )); + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* Parse error. Emit the comment token. Reconsume the + EOF character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-comment' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + $this->token['data'] .= '-' . $char; + $state = 'comment'; + } + break; + + case 'comment': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === '-') { + /* U+002D HYPHEN-MINUS (-) + Switch to the comment end dash state */ + $state = 'commentEndDash'; + + } elseif($char === false) { + /* EOF + Parse error. Emit the comment token. Reconsume the EOF character + in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-comment' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Append the input character to the comment token's data. Stay in + the comment state. */ + $chars = $this->stream->charsUntil('-'); + + $this->token['data'] .= $char . $chars; + } + break; + + case 'commentEndDash': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === '-') { + /* U+002D HYPHEN-MINUS (-) + Switch to the comment end state */ + $state = 'commentEnd'; + + } elseif($char === false) { + /* EOF + Parse error. Emit the comment token. Reconsume the EOF character + in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-comment-end-dash' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Append a U+002D HYPHEN-MINUS (-) character and the input + character to the comment token's data. Switch to the comment state. */ + $this->token['data'] .= '-'.$char; + $state = 'comment'; + } + break; + + case 'commentEnd': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the comment token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif($char === '-') { + /* U+002D HYPHEN-MINUS (-) + Parse error. Append a U+002D HYPHEN-MINUS (-) character + to the comment token's data. Stay in the comment end + state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-dash-after-double-dash-in-comment' + )); + $this->token['data'] .= '-'; + + } elseif($char === false) { + /* EOF + Parse error. Emit the comment token. Reconsume the + EOF character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-comment-double-dash' + )); + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Parse error. Append two U+002D HYPHEN-MINUS (-) + characters and the input character to the comment token's + data. Switch to the comment state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-char-in-comment' + )); + $this->token['data'] .= '--'.$char; + $state = 'comment'; + } + break; + + case 'doctype': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Switch to the before DOCTYPE name state. */ + $state = 'beforeDoctypeName'; + + } else { + /* Anything else + Parse error. Reconsume the current character in the + before DOCTYPE name state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'need-space-after-doctype' + )); + $this->stream->unget(); + $state = 'beforeDoctypeName'; + } + break; + + case 'beforeDoctypeName': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the before DOCTYPE name state. */ + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Create a new DOCTYPE token. Set its + force-quirks flag to on. Emit the token. Switch to the + data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-doctype-name-but-got-right-bracket' + )); + $this->emitToken(array( + 'name' => '', + 'type' => self::DOCTYPE, + 'force-quirks' => true, + 'error' => true + )); + + $state = 'data'; + + } elseif('A' <= $char && $char <= 'Z') { + /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z + Create a new DOCTYPE token. Set the token's name to the + lowercase version of the input character (add 0x0020 to + the character's code point). Switch to the DOCTYPE name + state. */ + $this->token = array( + 'name' => strtolower($char), + 'type' => self::DOCTYPE, + 'error' => true + ); + + $state = 'doctypeName'; + + } elseif($char === false) { + /* EOF + Parse error. Create a new DOCTYPE token. Set its + force-quirks flag to on. Emit the token. Reconsume the + EOF character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-doctype-name-but-got-eof' + )); + $this->emitToken(array( + 'name' => '', + 'type' => self::DOCTYPE, + 'force-quirks' => true, + 'error' => true + )); + + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Create a new DOCTYPE token. Set the token's name to the + current input character. Switch to the DOCTYPE name state. */ + $this->token = array( + 'name' => $char, + 'type' => self::DOCTYPE, + 'error' => true + ); + + $state = 'doctypeName'; + } + break; + + case 'doctypeName': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Switch to the after DOCTYPE name state. */ + $state = 'afterDoctypeName'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current DOCTYPE token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif('A' <= $char && $char <= 'Z') { + /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z + Append the lowercase version of the input character + (add 0x0020 to the character's code point) to the current + DOCTYPE token's name. Stay in the DOCTYPE name state. */ + $this->token['name'] .= strtolower($char); + + } elseif($char === false) { + /* EOF + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype-name' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Append the current input character to the current + DOCTYPE token's name. Stay in the DOCTYPE name state. */ + $this->token['name'] .= $char; + } + + // XXX this is probably some sort of quirks mode designation, + // check tree-builder to be sure. In general 'error' needs + // to be specc'ified, this probably means removing it at the end + $this->token['error'] = ($this->token['name'] === 'HTML') + ? false + : true; + break; + + case 'afterDoctypeName': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the after DOCTYPE name state. */ + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current DOCTYPE token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif($char === false) { + /* EOF + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else */ + + $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5)); + if ($nextSix === 'PUBLIC') { + /* If the next six characters are an ASCII + case-insensitive match for the word "PUBLIC", then + consume those characters and switch to the before + DOCTYPE public identifier state. */ + $state = 'beforeDoctypePublicIdentifier'; + + } elseif ($nextSix === 'SYSTEM') { + /* Otherwise, if the next six characters are an ASCII + case-insensitive match for the word "SYSTEM", then + consume those characters and switch to the before + DOCTYPE system identifier state. */ + $state = 'beforeDoctypeSystemIdentifier'; + + } else { + /* Otherwise, this is the parse error. Set the DOCTYPE + token's force-quirks flag to on. Switch to the bogus + DOCTYPE state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-space-or-right-bracket-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->token['error'] = true; + $state = 'bogusDoctype'; + } + } + break; + + case 'beforeDoctypePublicIdentifier': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the before DOCTYPE public identifier state. */ + } elseif ($char === '"') { + /* U+0022 QUOTATION MARK (") + Set the DOCTYPE token's public identifier to the empty + string (not missing), then switch to the DOCTYPE public + identifier (double-quoted) state. */ + $this->token['public'] = ''; + $state = 'doctypePublicIdentifierDoubleQuoted'; + } elseif ($char === "'") { + /* U+0027 APOSTROPHE (') + Set the DOCTYPE token's public identifier to the empty + string (not missing), then switch to the DOCTYPE public + identifier (single-quoted) state. */ + $this->token['public'] = ''; + $state = 'doctypePublicIdentifierSingleQuoted'; + } elseif ($char === '>') { + /* Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-end-of-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* Parse error. Set the DOCTYPE token's force-quirks + flag to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Parse error. Set the DOCTYPE token's force-quirks flag + to on. Switch to the bogus DOCTYPE state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-char-in-doctype' + )); + $this->token['force-quirks'] = true; + $state = 'bogusDoctype'; + } + break; + + case 'doctypePublicIdentifierDoubleQuoted': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if ($char === '"') { + /* U+0022 QUOTATION MARK (") + Switch to the after DOCTYPE public identifier state. */ + $state = 'afterDoctypePublicIdentifier'; + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-end-of-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* EOF + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Anything else + Append the current input character to the current + DOCTYPE token's public identifier. Stay in the DOCTYPE + public identifier (double-quoted) state. */ + $this->token['public'] .= $char; + } + break; + + case 'doctypePublicIdentifierSingleQuoted': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if ($char === "'") { + /* U+0027 APOSTROPHE (') + Switch to the after DOCTYPE public identifier state. */ + $state = 'afterDoctypePublicIdentifier'; + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-end-of-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* EOF + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Anything else + Append the current input character to the current + DOCTYPE token's public identifier. Stay in the DOCTYPE + public identifier (double-quoted) state. */ + $this->token['public'] .= $char; + } + break; + + case 'afterDoctypePublicIdentifier': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the after DOCTYPE public identifier state. */ + } elseif ($char === '"') { + /* U+0022 QUOTATION MARK (") + Set the DOCTYPE token's system identifier to the + empty string (not missing), then switch to the DOCTYPE + system identifier (double-quoted) state. */ + $this->token['system'] = ''; + $state = 'doctypeSystemIdentifierDoubleQuoted'; + } elseif ($char === "'") { + /* U+0027 APOSTROPHE (') + Set the DOCTYPE token's system identifier to the + empty string (not missing), then switch to the DOCTYPE + system identifier (single-quoted) state. */ + $this->token['system'] = ''; + $state = 'doctypeSystemIdentifierSingleQuoted'; + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current DOCTYPE token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* Parse error. Set the DOCTYPE token's force-quirks + flag to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Anything else + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Switch to the bogus DOCTYPE state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-char-in-doctype' + )); + $this->token['force-quirks'] = true; + $state = 'bogusDoctype'; + } + break; + + case 'beforeDoctypeSystemIdentifier': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the before DOCTYPE system identifier state. */ + } elseif ($char === '"') { + /* U+0022 QUOTATION MARK (") + Set the DOCTYPE token's system identifier to the empty + string (not missing), then switch to the DOCTYPE system + identifier (double-quoted) state. */ + $this->token['system'] = ''; + $state = 'doctypeSystemIdentifierDoubleQuoted'; + } elseif ($char === "'") { + /* U+0027 APOSTROPHE (') + Set the DOCTYPE token's system identifier to the empty + string (not missing), then switch to the DOCTYPE system + identifier (single-quoted) state. */ + $this->token['system'] = ''; + $state = 'doctypeSystemIdentifierSingleQuoted'; + } elseif ($char === '>') { + /* Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-char-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* Parse error. Set the DOCTYPE token's force-quirks + flag to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Parse error. Set the DOCTYPE token's force-quirks flag + to on. Switch to the bogus DOCTYPE state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-char-in-doctype' + )); + $this->token['force-quirks'] = true; + $state = 'bogusDoctype'; + } + break; + + case 'doctypeSystemIdentifierDoubleQuoted': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if ($char === '"') { + /* U+0022 QUOTATION MARK (") + Switch to the after DOCTYPE system identifier state. */ + $state = 'afterDoctypeSystemIdentifier'; + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-end-of-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* EOF + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Anything else + Append the current input character to the current + DOCTYPE token's system identifier. Stay in the DOCTYPE + system identifier (double-quoted) state. */ + $this->token['system'] .= $char; + } + break; + + case 'doctypeSystemIdentifierSingleQuoted': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if ($char === "'") { + /* U+0027 APOSTROPHE (') + Switch to the after DOCTYPE system identifier state. */ + $state = 'afterDoctypeSystemIdentifier'; + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-end-of-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* EOF + Parse error. Set the DOCTYPE token's force-quirks flag + to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Anything else + Append the current input character to the current + DOCTYPE token's system identifier. Stay in the DOCTYPE + system identifier (double-quoted) state. */ + $this->token['system'] .= $char; + } + break; + + case 'afterDoctypeSystemIdentifier': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the after DOCTYPE system identifier state. */ + } elseif ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current DOCTYPE token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + } elseif ($char === false) { + /* Parse error. Set the DOCTYPE token's force-quirks + flag to on. Emit that DOCTYPE token. Reconsume the EOF + character in the data state. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'eof-in-doctype' + )); + $this->token['force-quirks'] = true; + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + } else { + /* Anything else + Parse error. Switch to the bogus DOCTYPE state. + (This does not set the DOCTYPE token's force-quirks + flag to on.) */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'unexpected-char-in-doctype' + )); + $state = 'bogusDoctype'; + } + break; + + case 'bogusDoctype': + /* Consume the next input character: */ + $char = $this->stream->char(); + + if ($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the DOCTYPE token. Switch to the data state. */ + $this->emitToken($this->token); + $state = 'data'; + + } elseif($char === false) { + /* EOF + Emit the DOCTYPE token. Reconsume the EOF character in + the data state. */ + $this->emitToken($this->token); + $this->stream->unget(); + $state = 'data'; + + } else { + /* Anything else + Stay in the bogus DOCTYPE state. */ + } + break; + + // case 'cdataSection': + + } + } + } + + /** + * Returns a serialized representation of the tree. + */ + public function save() { + return $this->tree->save(); + } + + /** + * Returns the input stream. + */ + public function stream() { + return $this->stream; + } + + private function consumeCharacterReference($allowed = false, $inattr = false) { + // This goes quite far against spec, and is far closer to the Python + // impl., mainly because we don't do the large unconsuming the spec + // requires. + + // All consumed characters. + $chars = $this->stream->char(); + + /* This section defines how to consume a character + reference. This definition is used when parsing character + references in text and in attributes. + + The behavior depends on the identity of the next character + (the one immediately after the U+0026 AMPERSAND character): */ + + if ( + $chars[0] === "\x09" || + $chars[0] === "\x0A" || + $chars[0] === "\x0C" || + $chars[0] === "\x20" || + $chars[0] === '<' || + $chars[0] === '&' || + $chars === false || + $chars[0] === $allowed + ) { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000C FORM FEED (FF) + U+0020 SPACE + U+003C LESS-THAN SIGN + U+0026 AMPERSAND + EOF + The additional allowed character, if there is one + Not a character reference. No characters are consumed, + and nothing is returned. (This is not an error, either.) */ + // We already consumed, so unconsume. + $this->stream->unget(); + return '&'; + } elseif ($chars[0] === '#') { + /* Consume the U+0023 NUMBER SIGN. */ + // Um, yeah, we already did that. + /* The behavior further depends on the character after + the U+0023 NUMBER SIGN: */ + $chars .= $this->stream->char(); + if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) { + /* U+0078 LATIN SMALL LETTER X + U+0058 LATIN CAPITAL LETTER X */ + /* Consume the X. */ + // Um, yeah, we already did that. + /* Follow the steps below, but using the range of + characters U+0030 DIGIT ZERO through to U+0039 DIGIT + NINE, U+0061 LATIN SMALL LETTER A through to U+0066 + LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER + A, through to U+0046 LATIN CAPITAL LETTER F (in other + words, 0123456789, ABCDEF, abcdef). */ + $char_class = self::HEX; + /* When it comes to interpreting the + number, interpret it as a hexadecimal number. */ + $hex = true; + } else { + /* Anything else */ + // Unconsume because we shouldn't have consumed this. + $chars = $chars[0]; + $this->stream->unget(); + /* Follow the steps below, but using the range of + characters U+0030 DIGIT ZERO through to U+0039 DIGIT + NINE (i.e. just 0123456789). */ + $char_class = self::DIGIT; + /* When it comes to interpreting the number, + interpret it as a decimal number. */ + $hex = false; + } + + /* Consume as many characters as match the range of characters given above. */ + $consumed = $this->stream->charsWhile($char_class); + if ($consumed === '' || $consumed === false) { + /* If no characters match the range, then don't consume + any characters (and unconsume the U+0023 NUMBER SIGN + character and, if appropriate, the X character). This + is a parse error; nothing is returned. */ + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-numeric-entity' + )); + return '&' . $chars; + } else { + /* Otherwise, if the next character is a U+003B SEMICOLON, + consume that too. If it isn't, there is a parse error. */ + if ($this->stream->char() !== ';') { + $this->stream->unget(); + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'numeric-entity-without-semicolon' + )); + } + + /* If one or more characters match the range, then take + them all and interpret the string of characters as a number + (either hexadecimal or decimal as appropriate). */ + $codepoint = $hex ? hexdec($consumed) : (int) $consumed; + + /* If that number is one of the numbers in the first column + of the following table, then this is a parse error. Find the + row with that number in the first column, and return a + character token for the Unicode character given in the + second column of that row. */ + $new_codepoint = HTML5_Data::getRealCodepoint($codepoint); + if ($new_codepoint) { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'illegal-windows-1252-entity' + )); + $codepoint = $new_codepoint; + } else { + /* Otherwise, if the number is in the range 0x0000 to 0x0008, + U+000B, U+000E to 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF , + 0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, + 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, + 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, + 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, + 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, + 0x10FFFE, or 0x10FFFF, or is higher than 0x10FFFF, then this + is a parse error; return a character token for the U+FFFD + REPLACEMENT CHARACTER character instead. */ + // && has higher precedence than || + if ( + $codepoint >= 0x0000 && $codepoint <= 0x0008 || + $codepoint === 0x000B || + $codepoint >= 0x000E && $codepoint <= 0x001F || + $codepoint >= 0x007F && $codepoint <= 0x009F || + $codepoint >= 0xD800 && $codepoint <= 0xDFFF || + $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF || + ($codepoint & 0xFFFE) === 0xFFFE || + $codepoint > 0x10FFFF + ) { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'illegal-codepoint-for-numeric-entity' + )); + $codepoint = 0xFFFD; + } + } + + /* Otherwise, return a character token for the Unicode + character whose code point is that number. */ + return HTML5_Data::utf8chr($codepoint); + } + + } else { + /* Anything else */ + + /* Consume the maximum number of characters possible, + with the consumed characters matching one of the + identifiers in the first column of the named character + references table (in a case-sensitive manner). */ + + // we will implement this by matching the longest + // alphanumeric + semicolon string, and then working + // our way backwards + $chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1); + $len = strlen($chars); + + $refs = HTML5_Data::getNamedCharacterReferences(); + $codepoint = false; + for($c = $len; $c > 0; $c--) { + $id = substr($chars, 0, $c); + if(isset($refs[$id])) { + $codepoint = $refs[$id]; + break; + } + } + + /* If no match can be made, then this is a parse error. + No characters are consumed, and nothing is returned. */ + if (!$codepoint) { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'expected-named-entity' + )); + return '&' . $chars; + } + + /* If the last character matched is not a U+003B SEMICOLON + (;), there is a parse error. */ + $semicolon = true; + if (substr($id, -1) !== ';') { + $this->emitToken(array( + 'type' => self::PARSEERROR, + 'data' => 'named-entity-without-semicolon' + )); + $semicolon = false; + } + + + /* If the character reference is being consumed as part of + an attribute, and the last character matched is not a + U+003B SEMICOLON (;), and the next character is in the + range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041 + LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z, + or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z, + then, for historical reasons, all the characters that were + matched after the U+0026 AMPERSAND (&) must be unconsumed, + and nothing is returned. */ + if ( + $inattr && !$semicolon && + strspn(substr($chars, $c, 1), self::ALPHA . self::DIGIT) + ) { + return '&' . $chars; + } + + /* Otherwise, return a character token for the character + corresponding to the character reference name (as given + by the second column of the named character references table). */ + return HTML5_Data::utf8chr($codepoint) . substr($chars, $c); + } + } + + private function characterReferenceInAttributeValue($allowed = false) { + /* Attempt to consume a character reference. */ + $entity = $this->consumeCharacterReference($allowed, true); + + /* If nothing is returned, append a U+0026 AMPERSAND + character to the current attribute's value. + + Otherwise, append the returned character token to the + current attribute's value. */ + $char = (!$entity) + ? '&' + : $entity; + + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char; + + /* Finally, switch back to the attribute value state that you + were in when were switched into this state. */ + } + + /** + * Emits a token, passing it on to the tree builder. + */ + protected function emitToken($token, $checkStream = true) { + if ($checkStream) { + // Emit errors from input stream. + while ($this->stream->errors) { + $this->emitToken(array_shift($this->stream->errors), false); + } + } + + // the current structure of attributes is not a terribly good one + $this->tree->emitToken($token); + + if(is_int($this->tree->content_model)) { + $this->content_model = $this->tree->content_model; + $this->tree->content_model = null; + + } elseif($token['type'] === self::ENDTAG) { + $this->content_model = self::PCDATA; + } + } +} + -- cgit v1.2.3