aboutsummaryrefslogtreecommitdiffstats
path: root/library/HTMLPurifier/Lexer/DOMLex.php
diff options
context:
space:
mode:
Diffstat (limited to 'library/HTMLPurifier/Lexer/DOMLex.php')
-rw-r--r--library/HTMLPurifier/Lexer/DOMLex.php161
1 files changed, 114 insertions, 47 deletions
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 20dc2ed48..720754454 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -27,16 +27,26 @@
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{
+ /**
+ * @type HTMLPurifier_TokenFactory
+ */
private $factory;
- public function __construct() {
+ public function __construct()
+ {
// setup the factory
parent::__construct();
$this->factory = new HTMLPurifier_TokenFactory();
}
- public function tokenizeHTML($html, $config, $context) {
-
+ /**
+ * @param string $html
+ * @param HTMLPurifier_Config $config
+ * @param HTMLPurifier_Context $context
+ * @return HTMLPurifier_Token[]
+ */
+ public function tokenizeHTML($html, $config, $context)
+ {
$html = $this->normalize($html, $config, $context);
// attempt to armor stray angled brackets that cannot possibly
@@ -65,30 +75,67 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$tokens = array();
$this->tokenizeDOM(
$doc->getElementsByTagName('html')->item(0)-> // <html>
- getElementsByTagName('body')->item(0)-> // <body>
- getElementsByTagName('div')->item(0) // <div>
- , $tokens);
+ getElementsByTagName('body')->item(0)-> // <body>
+ getElementsByTagName('div')->item(0), // <div>
+ $tokens
+ );
return $tokens;
}
/**
- * Recursive function that tokenizes a node, putting it into an accumulator.
- *
- * @param $node DOMNode to be tokenized.
- * @param $tokens Array-list of already tokenized tokens.
- * @param $collect Says whether or start and close are collected, set to
- * false at first recursion because it's the implicit DIV
- * tag you're dealing with.
- * @returns Tokens of node appended to previously passed tokens.
+ * Iterative function that tokenizes a node, putting it into an accumulator.
+ * To iterate is human, to recurse divine - L. Peter Deutsch
+ * @param DOMNode $node DOMNode to be tokenized.
+ * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
+ * @return HTMLPurifier_Token of node appended to previously passed tokens.
*/
- protected function tokenizeDOM($node, &$tokens, $collect = false) {
+ protected function tokenizeDOM($node, &$tokens)
+ {
+ $level = 0;
+ $nodes = array($level => new HTMLPurifier_Queue(array($node)));
+ $closingNodes = array();
+ do {
+ while (!$nodes[$level]->isEmpty()) {
+ $node = $nodes[$level]->shift(); // FIFO
+ $collect = $level > 0 ? true : false;
+ $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+ if ($needEndingTag) {
+ $closingNodes[$level][] = $node;
+ }
+ if ($node->childNodes && $node->childNodes->length) {
+ $level++;
+ $nodes[$level] = new HTMLPurifier_Queue();
+ foreach ($node->childNodes as $childNode) {
+ $nodes[$level]->push($childNode);
+ }
+ }
+ }
+ $level--;
+ if ($level && isset($closingNodes[$level])) {
+ while ($node = array_pop($closingNodes[$level])) {
+ $this->createEndNode($node, $tokens);
+ }
+ }
+ } while ($level > 0);
+ }
+ /**
+ * @param DOMNode $node DOMNode to be tokenized.
+ * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
+ * @param bool $collect Says whether or start and close are collected, set to
+ * false at first recursion because it's the implicit DIV
+ * tag you're dealing with.
+ * @return bool if the token needs an endtoken
+ * @todo data and tagName properties don't seem to exist in DOMNode?
+ */
+ protected function createStartNode($node, &$tokens, $collect)
+ {
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
// those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE) {
$tokens[] = $this->factory->createText($node->data);
- return;
+ return false;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo libxml's special treatment of <script> and <style> tags
$last = end($tokens);
@@ -106,59 +153,61 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
}
}
$tokens[] = $this->factory->createText($this->parseData($data));
- return;
+ return false;
} elseif ($node->nodeType === XML_COMMENT_NODE) {
// this is code is only invoked for comments in script/style in versions
// of libxml pre-2.6.28 (regular comments, of course, are still
// handled regularly)
$tokens[] = $this->factory->createComment($node->data);
- return;
- } elseif (
+ return false;
+ } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
// not-well tested: there may be other nodes we have to grab
- $node->nodeType !== XML_ELEMENT_NODE
- ) {
- return;
+ return false;
}
- $attr = $node->hasAttributes() ?
- $this->transformAttrToAssoc($node->attributes) :
- array();
+ $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
// We still have to make sure that the element actually IS empty
if (!$node->childNodes->length) {
if ($collect) {
$tokens[] = $this->factory->createEmpty($node->tagName, $attr);
}
+ return false;
} else {
- if ($collect) { // don't wrap on first iteration
+ if ($collect) {
$tokens[] = $this->factory->createStart(
$tag_name = $node->tagName, // somehow, it get's dropped
$attr
);
}
- foreach ($node->childNodes as $node) {
- // remember, it's an accumulator. Otherwise, we'd have
- // to use array_merge
- $this->tokenizeDOM($node, $tokens, true);
- }
- if ($collect) {
- $tokens[] = $this->factory->createEnd($tag_name);
- }
+ return true;
}
+ }
+ /**
+ * @param DOMNode $node
+ * @param HTMLPurifier_Token[] $tokens
+ */
+ protected function createEndNode($node, &$tokens)
+ {
+ $tokens[] = $this->factory->createEnd($node->tagName);
}
+
/**
* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
*
- * @param $attribute_list DOMNamedNodeMap of DOMAttr objects.
- * @returns Associative array of attributes.
+ * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
+ * @return array Associative array of attributes.
*/
- protected function transformAttrToAssoc($node_map) {
+ protected function transformAttrToAssoc($node_map)
+ {
// NamedNodeMap is documented very well, so we're using undocumented
// features, namely, the fact that it implements Iterator and
// has a ->length attribute
- if ($node_map->length === 0) return array();
+ if ($node_map->length === 0) {
+ return array();
+ }
$array = array();
foreach ($node_map as $attr) {
$array[$attr->name] = $attr->value;
@@ -168,46 +217,64 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
/**
* An error handler that mutes all errors
+ * @param int $errno
+ * @param string $errstr
*/
- public function muteErrorHandler($errno, $errstr) {}
+ public function muteErrorHandler($errno, $errstr)
+ {
+ }
/**
* Callback function for undoing escaping of stray angled brackets
* in comments
+ * @param array $matches
+ * @return string
*/
- public function callbackUndoCommentSubst($matches) {
- return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
+ public function callbackUndoCommentSubst($matches)
+ {
+ return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
}
/**
* Callback function that entity-izes ampersands in comments so that
* callbackUndoCommentSubst doesn't clobber them
+ * @param array $matches
+ * @return string
*/
- public function callbackArmorCommentEntities($matches) {
+ public function callbackArmorCommentEntities($matches)
+ {
return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
}
/**
* Wraps an HTML fragment in the necessary HTML
+ * @param string $html
+ * @param HTMLPurifier_Config $config
+ * @param HTMLPurifier_Context $context
+ * @return string
*/
- protected function wrapHTML($html, $config, $context) {
+ protected function wrapHTML($html, $config, $context)
+ {
$def = $config->getDefinition('HTML');
$ret = '';
if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
$ret .= '<!DOCTYPE html ';
- if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
- if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
+ if (!empty($def->doctype->dtdPublic)) {
+ $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
+ }
+ if (!empty($def->doctype->dtdSystem)) {
+ $ret .= '"' . $def->doctype->dtdSystem . '" ';
+ }
$ret .= '>';
}
$ret .= '<html><head>';
$ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
// No protection if $html contains a stray </div>!
- $ret .= '</head><body><div>'.$html.'</div></body></html>';
+ $ret .= '</head><body><div>' . $html . '</div></body></html>';
return $ret;
}
-
}
// vim: et sw=4 sts=4