diff options
Diffstat (limited to 'library/HTMLPurifier/Lexer/DirectLex.php')
-rw-r--r-- | library/HTMLPurifier/Lexer/DirectLex.php | 539 |
1 files changed, 0 insertions, 539 deletions
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php deleted file mode 100644 index 746b6e315..000000000 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ /dev/null @@ -1,539 +0,0 @@ -<?php - -/** - * Our in-house implementation of a parser. - * - * A pure PHP parser, DirectLex has absolutely no dependencies, making - * it a reasonably good default for PHP4. Written with efficiency in mind, - * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it - * pales in comparison to HTMLPurifier_Lexer_DOMLex. - * - * @todo Reread XML spec and document differences. - */ -class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer -{ - /** - * @type bool - */ - public $tracksLineNumbers = true; - - /** - * Whitespace characters for str(c)spn. - * @type string - */ - protected $_whitespace = "\x20\x09\x0D\x0A"; - - /** - * Callback function for script CDATA fudge - * @param array $matches, in form of array(opening tag, contents, closing tag) - * @return string - */ - protected function scriptCallback($matches) - { - return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3]; - } - - /** - * @param String $html - * @param HTMLPurifier_Config $config - * @param HTMLPurifier_Context $context - * @return array|HTMLPurifier_Token[] - */ - public function tokenizeHTML($html, $config, $context) - { - // special normalization for script tags without any armor - // our "armor" heurstic is a < sign any number of whitespaces after - // the first script tag - if ($config->get('HTML.Trusted')) { - $html = preg_replace_callback( - '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si', - array($this, 'scriptCallback'), - $html - ); - } - - $html = $this->normalize($html, $config, $context); - - $cursor = 0; // our location in the text - $inside_tag = false; // whether or not we're parsing the inside of a tag - $array = array(); // result array - - // This is also treated to mean maintain *column* numbers too - $maintain_line_numbers = $config->get('Core.MaintainLineNumbers'); - - if ($maintain_line_numbers === null) { - // automatically determine line numbering by checking - // if error collection is on - $maintain_line_numbers = $config->get('Core.CollectErrors'); - } - - if ($maintain_line_numbers) { - $current_line = 1; - $current_col = 0; - $length = strlen($html); - } else { - $current_line = false; - $current_col = false; - $length = false; - } - $context->register('CurrentLine', $current_line); - $context->register('CurrentCol', $current_col); - $nl = "\n"; - // how often to manually recalculate. This will ALWAYS be right, - // but it's pretty wasteful. Set to 0 to turn off - $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval'); - - $e = false; - if ($config->get('Core.CollectErrors')) { - $e =& $context->get('ErrorCollector'); - } - - // for testing synchronization - $loops = 0; - - while (++$loops) { - // $cursor is either at the start of a token, or inside of - // a tag (i.e. there was a < immediately before it), as indicated - // by $inside_tag - - if ($maintain_line_numbers) { - // $rcursor, however, is always at the start of a token. - $rcursor = $cursor - (int)$inside_tag; - - // Column number is cheap, so we calculate it every round. - // We're interested at the *end* of the newline string, so - // we need to add strlen($nl) == 1 to $nl_pos before subtracting it - // from our "rcursor" position. - $nl_pos = strrpos($html, $nl, $rcursor - $length); - $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1); - - // recalculate lines - if ($synchronize_interval && // synchronization is on - $cursor > 0 && // cursor is further than zero - $loops % $synchronize_interval === 0) { // time to synchronize! - $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor); - } - } - - $position_next_lt = strpos($html, '<', $cursor); - $position_next_gt = strpos($html, '>', $cursor); - - // triggers on "<b>asdf</b>" but not "asdf <b></b>" - // special case to set up context - if ($position_next_lt === $cursor) { - $inside_tag = true; - $cursor++; - } - - if (!$inside_tag && $position_next_lt !== false) { - // We are not inside tag and there still is another tag to parse - $token = new - HTMLPurifier_Token_Text( - $this->parseData( - substr( - $html, - $cursor, - $position_next_lt - $cursor - ) - ) - ); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor); - } - $array[] = $token; - $cursor = $position_next_lt + 1; - $inside_tag = true; - continue; - } elseif (!$inside_tag) { - // We are not inside tag but there are no more tags - // If we're already at the end, break - if ($cursor === strlen($html)) { - break; - } - // Create Text of rest of string - $token = new - HTMLPurifier_Token_Text( - $this->parseData( - substr( - $html, - $cursor - ) - ) - ); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - } - $array[] = $token; - break; - } elseif ($inside_tag && $position_next_gt !== false) { - // We are in tag and it is well formed - // Grab the internals of the tag - $strlen_segment = $position_next_gt - $cursor; - - if ($strlen_segment < 1) { - // there's nothing to process! - $token = new HTMLPurifier_Token_Text('<'); - $cursor++; - continue; - } - - $segment = substr($html, $cursor, $strlen_segment); - - if ($segment === false) { - // somehow, we attempted to access beyond the end of - // the string, defense-in-depth, reported by Nate Abele - break; - } - - // Check if it's a comment - if (substr($segment, 0, 3) === '!--') { - // re-determine segment length, looking for --> - $position_comment_end = strpos($html, '-->', $cursor); - if ($position_comment_end === false) { - // uh oh, we have a comment that extends to - // infinity. Can't be helped: set comment - // end position to end of string - if ($e) { - $e->send(E_WARNING, 'Lexer: Unclosed comment'); - } - $position_comment_end = strlen($html); - $end = true; - } else { - $end = false; - } - $strlen_segment = $position_comment_end - $cursor; - $segment = substr($html, $cursor, $strlen_segment); - $token = new - HTMLPurifier_Token_Comment( - substr( - $segment, - 3, - $strlen_segment - 3 - ) - ); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment); - } - $array[] = $token; - $cursor = $end ? $position_comment_end : $position_comment_end + 3; - $inside_tag = false; - continue; - } - - // Check if it's an end tag - $is_end_tag = (strpos($segment, '/') === 0); - if ($is_end_tag) { - $type = substr($segment, 1); - $token = new HTMLPurifier_Token_End($type); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); - } - $array[] = $token; - $inside_tag = false; - $cursor = $position_next_gt + 1; - continue; - } - - // Check leading character is alnum, if not, we may - // have accidently grabbed an emoticon. Translate into - // text and go our merry way - if (!ctype_alpha($segment[0])) { - // XML: $segment[0] !== '_' && $segment[0] !== ':' - if ($e) { - $e->send(E_NOTICE, 'Lexer: Unescaped lt'); - } - $token = new HTMLPurifier_Token_Text('<'); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); - } - $array[] = $token; - $inside_tag = false; - continue; - } - - // Check if it is explicitly self closing, if so, remove - // trailing slash. Remember, we could have a tag like <br>, so - // any later token processing scripts must convert improperly - // classified EmptyTags from StartTags. - $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1); - if ($is_self_closing) { - $strlen_segment--; - $segment = substr($segment, 0, $strlen_segment); - } - - // Check if there are any attributes - $position_first_space = strcspn($segment, $this->_whitespace); - - if ($position_first_space >= $strlen_segment) { - if ($is_self_closing) { - $token = new HTMLPurifier_Token_Empty($segment); - } else { - $token = new HTMLPurifier_Token_Start($segment); - } - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); - } - $array[] = $token; - $inside_tag = false; - $cursor = $position_next_gt + 1; - continue; - } - - // Grab out all the data - $type = substr($segment, 0, $position_first_space); - $attribute_string = - trim( - substr( - $segment, - $position_first_space - ) - ); - if ($attribute_string) { - $attr = $this->parseAttributeString( - $attribute_string, - $config, - $context - ); - } else { - $attr = array(); - } - - if ($is_self_closing) { - $token = new HTMLPurifier_Token_Empty($type, $attr); - } else { - $token = new HTMLPurifier_Token_Start($type, $attr); - } - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); - } - $array[] = $token; - $cursor = $position_next_gt + 1; - $inside_tag = false; - continue; - } else { - // inside tag, but there's no ending > sign - if ($e) { - $e->send(E_WARNING, 'Lexer: Missing gt'); - } - $token = new - HTMLPurifier_Token_Text( - '<' . - $this->parseData( - substr($html, $cursor) - ) - ); - if ($maintain_line_numbers) { - $token->rawPosition($current_line, $current_col); - } - // no cursor scroll? Hmm... - $array[] = $token; - break; - } - break; - } - - $context->destroy('CurrentLine'); - $context->destroy('CurrentCol'); - return $array; - } - - /** - * PHP 5.0.x compatible substr_count that implements offset and length - * @param string $haystack - * @param string $needle - * @param int $offset - * @param int $length - * @return int - */ - protected function substrCount($haystack, $needle, $offset, $length) - { - static $oldVersion; - if ($oldVersion === null) { - $oldVersion = version_compare(PHP_VERSION, '5.1', '<'); - } - if ($oldVersion) { - $haystack = substr($haystack, $offset, $length); - return substr_count($haystack, $needle); - } else { - return substr_count($haystack, $needle, $offset, $length); - } - } - - /** - * Takes the inside of an HTML tag and makes an assoc array of attributes. - * - * @param string $string Inside of tag excluding name. - * @param HTMLPurifier_Config $config - * @param HTMLPurifier_Context $context - * @return array Assoc array of attributes. - */ - public function parseAttributeString($string, $config, $context) - { - $string = (string)$string; // quick typecast - - if ($string == '') { - return array(); - } // no attributes - - $e = false; - if ($config->get('Core.CollectErrors')) { - $e =& $context->get('ErrorCollector'); - } - - // let's see if we can abort as quickly as possible - // one equal sign, no spaces => one attribute - $num_equal = substr_count($string, '='); - $has_space = strpos($string, ' '); - if ($num_equal === 0 && !$has_space) { - // bool attribute - return array($string => $string); - } elseif ($num_equal === 1 && !$has_space) { - // only one attribute - list($key, $quoted_value) = explode('=', $string); - $quoted_value = trim($quoted_value); - if (!$key) { - if ($e) { - $e->send(E_ERROR, 'Lexer: Missing attribute key'); - } - return array(); - } - if (!$quoted_value) { - return array($key => ''); - } - $first_char = @$quoted_value[0]; - $last_char = @$quoted_value[strlen($quoted_value) - 1]; - - $same_quote = ($first_char == $last_char); - $open_quote = ($first_char == '"' || $first_char == "'"); - - if ($same_quote && $open_quote) { - // well behaved - $value = substr($quoted_value, 1, strlen($quoted_value) - 2); - } else { - // not well behaved - if ($open_quote) { - if ($e) { - $e->send(E_ERROR, 'Lexer: Missing end quote'); - } - $value = substr($quoted_value, 1); - } else { - $value = $quoted_value; - } - } - if ($value === false) { - $value = ''; - } - return array($key => $this->parseData($value)); - } - - // setup loop environment - $array = array(); // return assoc array of attributes - $cursor = 0; // current position in string (moves forward) - $size = strlen($string); // size of the string (stays the same) - - // if we have unquoted attributes, the parser expects a terminating - // space, so let's guarantee that there's always a terminating space. - $string .= ' '; - - $old_cursor = -1; - while ($cursor < $size) { - if ($old_cursor >= $cursor) { - throw new Exception("Infinite loop detected"); - } - $old_cursor = $cursor; - - $cursor += ($value = strspn($string, $this->_whitespace, $cursor)); - // grab the key - - $key_begin = $cursor; //we're currently at the start of the key - - // scroll past all characters that are the key (not whitespace or =) - $cursor += strcspn($string, $this->_whitespace . '=', $cursor); - - $key_end = $cursor; // now at the end of the key - - $key = substr($string, $key_begin, $key_end - $key_begin); - - if (!$key) { - if ($e) { - $e->send(E_ERROR, 'Lexer: Missing attribute key'); - } - $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop - continue; // empty key - } - - // scroll past all whitespace - $cursor += strspn($string, $this->_whitespace, $cursor); - - if ($cursor >= $size) { - $array[$key] = $key; - break; - } - - // if the next character is an equal sign, we've got a regular - // pair, otherwise, it's a bool attribute - $first_char = @$string[$cursor]; - - if ($first_char == '=') { - // key="value" - - $cursor++; - $cursor += strspn($string, $this->_whitespace, $cursor); - - if ($cursor === false) { - $array[$key] = ''; - break; - } - - // we might be in front of a quote right now - - $char = @$string[$cursor]; - - if ($char == '"' || $char == "'") { - // it's quoted, end bound is $char - $cursor++; - $value_begin = $cursor; - $cursor = strpos($string, $char, $cursor); - $value_end = $cursor; - } else { - // it's not quoted, end bound is whitespace - $value_begin = $cursor; - $cursor += strcspn($string, $this->_whitespace, $cursor); - $value_end = $cursor; - } - - // we reached a premature end - if ($cursor === false) { - $cursor = $size; - $value_end = $cursor; - } - - $value = substr($string, $value_begin, $value_end - $value_begin); - if ($value === false) { - $value = ''; - } - $array[$key] = $this->parseData($value); - $cursor++; - } else { - // boolattr - if ($key !== '') { - $array[$key] = $key; - } else { - // purely theoretical - if ($e) { - $e->send(E_ERROR, 'Lexer: Missing attribute key'); - } - } - } - } - return $array; - } -} - -// vim: et sw=4 sts=4 |