diff options
Diffstat (limited to 'library/HTMLPurifier/Lexer/DOMLex.php')
-rw-r--r-- | library/HTMLPurifier/Lexer/DOMLex.php | 161 |
1 files changed, 114 insertions, 47 deletions
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 20dc2ed48..720754454 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -27,16 +27,26 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer { + /** + * @type HTMLPurifier_TokenFactory + */ private $factory; - public function __construct() { + public function __construct() + { // setup the factory parent::__construct(); $this->factory = new HTMLPurifier_TokenFactory(); } - public function tokenizeHTML($html, $config, $context) { - + /** + * @param string $html + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return HTMLPurifier_Token[] + */ + public function tokenizeHTML($html, $config, $context) + { $html = $this->normalize($html, $config, $context); // attempt to armor stray angled brackets that cannot possibly @@ -65,30 +75,67 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer $tokens = array(); $this->tokenizeDOM( $doc->getElementsByTagName('html')->item(0)-> // <html> - getElementsByTagName('body')->item(0)-> // <body> - getElementsByTagName('div')->item(0) // <div> - , $tokens); + getElementsByTagName('body')->item(0)-> // <body> + getElementsByTagName('div')->item(0), // <div> + $tokens + ); return $tokens; } /** - * Recursive function that tokenizes a node, putting it into an accumulator. - * - * @param $node DOMNode to be tokenized. - * @param $tokens Array-list of already tokenized tokens. - * @param $collect Says whether or start and close are collected, set to - * false at first recursion because it's the implicit DIV - * tag you're dealing with. - * @returns Tokens of node appended to previously passed tokens. + * Iterative function that tokenizes a node, putting it into an accumulator. + * To iterate is human, to recurse divine - L. Peter Deutsch + * @param DOMNode $node DOMNode to be tokenized. 
+ * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. + * @return HTMLPurifier_Token of node appended to previously passed tokens. */ - protected function tokenizeDOM($node, &$tokens, $collect = false) { + protected function tokenizeDOM($node, &$tokens) + { + $level = 0; + $nodes = array($level => new HTMLPurifier_Queue(array($node))); + $closingNodes = array(); + do { + while (!$nodes[$level]->isEmpty()) { + $node = $nodes[$level]->shift(); // FIFO + $collect = $level > 0 ? true : false; + $needEndingTag = $this->createStartNode($node, $tokens, $collect); + if ($needEndingTag) { + $closingNodes[$level][] = $node; + } + if ($node->childNodes && $node->childNodes->length) { + $level++; + $nodes[$level] = new HTMLPurifier_Queue(); + foreach ($node->childNodes as $childNode) { + $nodes[$level]->push($childNode); + } + } + } + $level--; + if ($level && isset($closingNodes[$level])) { + while ($node = array_pop($closingNodes[$level])) { + $this->createEndNode($node, $tokens); + } + } + } while ($level > 0); + } + /** + * @param DOMNode $node DOMNode to be tokenized. + * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. + * @param bool $collect Says whether or start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @return bool if the token needs an endtoken + * @todo data and tagName properties don't seem to exist in DOMNode? + */ + protected function createStartNode($node, &$tokens, $collect) + { // intercept non element nodes. 
WE MUST catch all of them, // but we're not getting the character reference nodes because // those should have been preprocessed if ($node->nodeType === XML_TEXT_NODE) { $tokens[] = $this->factory->createText($node->data); - return; + return false; } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { // undo libxml's special treatment of <script> and <style> tags $last = end($tokens); @@ -106,59 +153,61 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } } $tokens[] = $this->factory->createText($this->parseData($data)); - return; + return false; } elseif ($node->nodeType === XML_COMMENT_NODE) { // this is code is only invoked for comments in script/style in versions // of libxml pre-2.6.28 (regular comments, of course, are still // handled regularly) $tokens[] = $this->factory->createComment($node->data); - return; - } elseif ( + return false; + } elseif ($node->nodeType !== XML_ELEMENT_NODE) { // not-well tested: there may be other nodes we have to grab - $node->nodeType !== XML_ELEMENT_NODE - ) { - return; + return false; } - $attr = $node->hasAttributes() ? - $this->transformAttrToAssoc($node->attributes) : - array(); + $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array(); // We still have to make sure that the element actually IS empty if (!$node->childNodes->length) { if ($collect) { $tokens[] = $this->factory->createEmpty($node->tagName, $attr); } + return false; } else { - if ($collect) { // don't wrap on first iteration + if ($collect) { $tokens[] = $this->factory->createStart( $tag_name = $node->tagName, // somehow, it get's dropped $attr ); } - foreach ($node->childNodes as $node) { - // remember, it's an accumulator. 
Otherwise, we'd have - // to use array_merge - $this->tokenizeDOM($node, $tokens, true); - } - if ($collect) { - $tokens[] = $this->factory->createEnd($tag_name); - } + return true; } + } + /** + * @param DOMNode $node + * @param HTMLPurifier_Token[] $tokens + */ + protected function createEndNode($node, &$tokens) + { + $tokens[] = $this->factory->createEnd($node->tagName); } + /** * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array. * - * @param $attribute_list DOMNamedNodeMap of DOMAttr objects. - * @returns Associative array of attributes. + * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects. + * @return array Associative array of attributes. */ - protected function transformAttrToAssoc($node_map) { + protected function transformAttrToAssoc($node_map) + { // NamedNodeMap is documented very well, so we're using undocumented // features, namely, the fact that it implements Iterator and // has a ->length attribute - if ($node_map->length === 0) return array(); + if ($node_map->length === 0) { + return array(); + } $array = array(); foreach ($node_map as $attr) { $array[$attr->name] = $attr->value; @@ -168,46 +217,64 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer /** * An error handler that mutes all errors + * @param int $errno + * @param string $errstr */ - public function muteErrorHandler($errno, $errstr) {} + public function muteErrorHandler($errno, $errstr) + { + } /** * Callback function for undoing escaping of stray angled brackets * in comments + * @param array $matches + * @return string */ - public function callbackUndoCommentSubst($matches) { - return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2]; + public function callbackUndoCommentSubst($matches) + { + return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . 
$matches[2]; } /** * Callback function that entity-izes ampersands in comments so that * callbackUndoCommentSubst doesn't clobber them + * @param array $matches + * @return string */ - public function callbackArmorCommentEntities($matches) { + public function callbackArmorCommentEntities($matches) + { return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2]; } /** * Wraps an HTML fragment in the necessary HTML + * @param string $html + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return string */ - protected function wrapHTML($html, $config, $context) { + protected function wrapHTML($html, $config, $context) + { $def = $config->getDefinition('HTML'); $ret = ''; if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) { $ret .= '<!DOCTYPE html '; - if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" '; - if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" '; + if (!empty($def->doctype->dtdPublic)) { + $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" '; + } + if (!empty($def->doctype->dtdSystem)) { + $ret .= '"' . $def->doctype->dtdSystem . '" '; + } $ret .= '>'; } $ret .= '<html><head>'; $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'; // No protection if $html contains a stray </div>! - $ret .= '</head><body><div>'.$html.'</div></body></html>'; + $ret .= '</head><body><div>' . $html . '</div></body></html>'; return $ret; } - } // vim: et sw=4 sts=4 |