From a0052f0176bd079e6a94baec59fea2ec5a8d651e Mon Sep 17 00:00:00 2001 From: friendica Date: Thu, 1 Jan 2015 22:18:27 -0800 Subject: htmlpurifier update - compatibility issue with language library autoloader --- library/HTMLPurifier/ChildDef/Chameleon.php | 33 ++- library/HTMLPurifier/ChildDef/Custom.php | 56 ++-- library/HTMLPurifier/ChildDef/Empty.php | 22 +- library/HTMLPurifier/ChildDef/List.php | 86 ++++++ library/HTMLPurifier/ChildDef/Optional.php | 31 ++- library/HTMLPurifier/ChildDef/Required.php | 103 +++---- library/HTMLPurifier/ChildDef/StrictBlockquote.php | 96 ++++--- library/HTMLPurifier/ChildDef/Table.php | 302 +++++++++++++-------- 8 files changed, 494 insertions(+), 235 deletions(-) create mode 100644 library/HTMLPurifier/ChildDef/List.php (limited to 'library/HTMLPurifier/ChildDef') diff --git a/library/HTMLPurifier/ChildDef/Chameleon.php b/library/HTMLPurifier/ChildDef/Chameleon.php index 15c364ee3..7439be26b 100644 --- a/library/HTMLPurifier/ChildDef/Chameleon.php +++ b/library/HTMLPurifier/ChildDef/Chameleon.php @@ -14,33 +14,52 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef /** * Instance of the definition object to use when inline. Usually stricter. + * @type HTMLPurifier_ChildDef_Optional */ public $inline; /** * Instance of the definition object to use when block. + * @type HTMLPurifier_ChildDef_Optional */ public $block; + /** + * @type string + */ public $type = 'chameleon'; /** - * @param $inline List of elements to allow when inline. - * @param $block List of elements to allow when block. + * @param array $inline List of elements to allow when inline. + * @param array $block List of elements to allow when block. */ - public function __construct($inline, $block) { + public function __construct($inline, $block) + { $this->inline = new HTMLPurifier_ChildDef_Optional($inline); - $this->block = new HTMLPurifier_ChildDef_Optional($block); + $this->block = new HTMLPurifier_ChildDef_Optional($block); $this->elements = $this->block->elements; } - public function validateChildren($tokens_of_children, $config, $context) { + /** + * @param HTMLPurifier_Node[] $children + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return bool + */ + public function validateChildren($children, $config, $context) + { if ($context->get('IsInline') === false) { return $this->block->validateChildren( - $tokens_of_children, $config, $context); + $children, + $config, + $context + ); } else { return $this->inline->validateChildren( - $tokens_of_children, $config, $context); + $children, + $config, + $context + ); } } } diff --git a/library/HTMLPurifier/ChildDef/Custom.php b/library/HTMLPurifier/ChildDef/Custom.php index b68047b4b..128132e96 100644 --- a/library/HTMLPurifier/ChildDef/Custom.php +++ b/library/HTMLPurifier/ChildDef/Custom.php @@ -8,28 +8,42 @@ */ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef { + /** + * @type string + */ public $type = 'custom'; + + /** + * @type bool + */ public $allow_empty = false; + /** - * Allowed child pattern as defined by the DTD + * Allowed child pattern as defined by the DTD. + * @type string */ public $dtd_regex; + /** - * PCRE regex derived from $dtd_regex - * @private + * PCRE regex derived from $dtd_regex. + * @type string */ private $_pcre_regex; + /** * @param $dtd_regex Allowed child pattern from the DTD */ - public function __construct($dtd_regex) { + public function __construct($dtd_regex) + { $this->dtd_regex = $dtd_regex; $this->_compileRegex(); } + /** * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex) */ - protected function _compileRegex() { + protected function _compileRegex() + { $raw = str_replace(' ', '', $this->dtd_regex); if ($raw{0} != '(') { $raw = "($raw)"; @@ -57,33 +71,31 @@ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef $this->_pcre_regex = $reg; } - public function validateChildren($tokens_of_children, $config, $context) { + + /** + * @param HTMLPurifier_Node[] $children + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return bool + */ + public function validateChildren($children, $config, $context) + { $list_of_children = ''; $nesting = 0; // depth into the nest - foreach ($tokens_of_children as $token) { - if (!empty($token->is_whitespace)) continue; - - $is_child = ($nesting == 0); // direct - - if ($token instanceof HTMLPurifier_Token_Start) { - $nesting++; - } elseif ($token instanceof HTMLPurifier_Token_End) { - $nesting--; - } - - if ($is_child) { - $list_of_children .= $token->name . ','; + foreach ($children as $node) { + if (!empty($node->is_whitespace)) { + continue; } + $list_of_children .= $node->name . ','; } // add leading comma to deal with stray comma declarations $list_of_children = ',' . rtrim($list_of_children, ','); $okay = preg_match( - '/^,?'.$this->_pcre_regex.'$/', + '/^,?' . $this->_pcre_regex . '$/', $list_of_children ); - - return (bool) $okay; + return (bool)$okay; } } diff --git a/library/HTMLPurifier/ChildDef/Empty.php b/library/HTMLPurifier/ChildDef/Empty.php index 13171f665..a8a6cbdd2 100644 --- a/library/HTMLPurifier/ChildDef/Empty.php +++ b/library/HTMLPurifier/ChildDef/Empty.php @@ -9,10 +9,28 @@ */ class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef { + /** + * @type bool + */ public $allow_empty = true; + + /** + * @type string + */ public $type = 'empty'; - public function __construct() {} - public function validateChildren($tokens_of_children, $config, $context) { + + public function __construct() + { + } + + /** + * @param HTMLPurifier_Node[] $children + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return array + */ + public function validateChildren($children, $config, $context) + { return array(); } } diff --git a/library/HTMLPurifier/ChildDef/List.php b/library/HTMLPurifier/ChildDef/List.php new file mode 100644 index 000000000..891b9f6f5 --- /dev/null +++ b/library/HTMLPurifier/ChildDef/List.php @@ -0,0 +1,86 @@ + true, 'ul' => true, 'ol' => true); + + /** + * @param array $children + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return array + */ + public function validateChildren($children, $config, $context) + { + // Flag for subclasses + $this->whitespace = false; + + // if there are no tokens, delete parent node + if (empty($children)) { + return false; + } + + // the new set of children + $result = array(); + + // a little sanity check to make sure it's not ALL whitespace + $all_whitespace = true; + + $current_li = false; + + foreach ($children as $node) { + if (!empty($node->is_whitespace)) { + $result[] = $node; + continue; + } + $all_whitespace = false; // phew, we're not talking about whitespace + + if ($node->name === 'li') { + // good + $current_li = $node; + $result[] = $node; + } else { + // we want to tuck this into the previous li + // Invariant: we expect the node to be ol/ul + // ToDo: Make this more robust in the case of not ol/ul + // by distinguishing between existing li and li created + // to handle non-list elements; non-list elements should + // not be appended to an existing li; only li created + // for non-list. This distinction is not currently made. + if ($current_li === false) { + $current_li = new HTMLPurifier_Node_Element('li'); + $result[] = $current_li; + } + $current_li->children[] = $node; + $current_li->empty = false; // XXX fascinating! Check for this error elsewhere ToDo + } + } + if (empty($result)) { + return false; + } + if ($all_whitespace) { + return false; + } + return $result; + } +} + +// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/ChildDef/Optional.php b/library/HTMLPurifier/ChildDef/Optional.php index 32bcb9898..b9468063b 100644 --- a/library/HTMLPurifier/ChildDef/Optional.php +++ b/library/HTMLPurifier/ChildDef/Optional.php @@ -9,15 +9,34 @@ */ class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required { + /** + * @type bool + */ public $allow_empty = true; + + /** + * @type string + */ public $type = 'optional'; - public function validateChildren($tokens_of_children, $config, $context) { - $result = parent::validateChildren($tokens_of_children, $config, $context); - // we assume that $tokens_of_children is not modified + + /** + * @param array $children + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return array + */ + public function validateChildren($children, $config, $context) + { + $result = parent::validateChildren($children, $config, $context); + // we assume that $children is not modified if ($result === false) { - if (empty($tokens_of_children)) return true; - elseif ($this->whitespace) return $tokens_of_children; - else return array(); + if (empty($children)) { + return true; + } elseif ($this->whitespace) { + return $children; + } else { + return array(); + } } return $result; } diff --git a/library/HTMLPurifier/ChildDef/Required.php b/library/HTMLPurifier/ChildDef/Required.php index 4889f249b..0d1c8f5f3 100644 --- a/library/HTMLPurifier/ChildDef/Required.php +++ b/library/HTMLPurifier/ChildDef/Required.php @@ -7,17 +7,21 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef { /** * Lookup table of allowed elements. - * @public + * @type array */ public $elements = array(); + /** * Whether or not the last passed node was all whitespace. + * @type bool */ protected $whitespace = false; + /** - * @param $elements List of allowed element names (lowercase). + * @param array|string $elements List of allowed element names (lowercase). */ - public function __construct($elements) { + public function __construct($elements) + { if (is_string($elements)) { $elements = str_replace(' ', '', $elements); $elements = explode('|', $elements); @@ -27,29 +31,43 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef $elements = array_flip($elements); foreach ($elements as $i => $x) { $elements[$i] = true; - if (empty($i)) unset($elements[$i]); // remove blank + if (empty($i)) { + unset($elements[$i]); + } // remove blank } } $this->elements = $elements; } + + /** + * @type bool + */ public $allow_empty = false; + + /** + * @type string + */ public $type = 'required'; - public function validateChildren($tokens_of_children, $config, $context) { + + /** + * @param array $children + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return array + */ + public function validateChildren($children, $config, $context) + { // Flag for subclasses $this->whitespace = false; // if there are no tokens, delete parent node - if (empty($tokens_of_children)) return false; + if (empty($children)) { + return false; + } // the new set of children $result = array(); - // current depth into the nest - $nesting = 0; - - // whether or not we're deleting a node - $is_deleting = false; - // whether or not parsed character data is allowed // this controls whether or not we silently drop a tag // or generate escaped HTML from it @@ -58,58 +76,41 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef // a little sanity check to make sure it's not ALL whitespace $all_whitespace = true; - // some configuration - $escape_invalid_children = $config->get('Core.EscapeInvalidChildren'); - - // generator - $gen = new HTMLPurifier_Generator($config, $context); - - foreach ($tokens_of_children as $token) { - if (!empty($token->is_whitespace)) { - $result[] = $token; + $stack = array_reverse($children); + while (!empty($stack)) { + $node = array_pop($stack); + if (!empty($node->is_whitespace)) { + $result[] = $node; continue; } $all_whitespace = false; // phew, we're not talking about whitespace - $is_child = ($nesting == 0); - - if ($token instanceof HTMLPurifier_Token_Start) { - $nesting++; - } elseif ($token instanceof HTMLPurifier_Token_End) { - $nesting--; - } - - if ($is_child) { - $is_deleting = false; - if (!isset($this->elements[$token->name])) { - $is_deleting = true; - if ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text) { - $result[] = $token; - } elseif ($pcdata_allowed && $escape_invalid_children) { - $result[] = new HTMLPurifier_Token_Text( - $gen->generateFromToken($token) - ); + if (!isset($this->elements[$node->name])) { + // special case text + // XXX One of these ought to be redundant or something + if ($pcdata_allowed && $node instanceof HTMLPurifier_Node_Text) { + $result[] = $node; + continue; + } + // spill the child contents in + // ToDo: Make configurable + if ($node instanceof HTMLPurifier_Node_Element) { + for ($i = count($node->children) - 1; $i >= 0; $i--) { + $stack[] = $node->children[$i]; } continue; } + continue; } - if (!$is_deleting || ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text)) { - $result[] = $token; - } elseif ($pcdata_allowed && $escape_invalid_children) { - $result[] = - new HTMLPurifier_Token_Text( - $gen->generateFromToken($token) - ); - } else { - // drop silently - } + $result[] = $node; + } + if (empty($result)) { + return false; } - if (empty($result)) return false; if ($all_whitespace) { $this->whitespace = true; return false; } - if ($tokens_of_children == $result) return true; return $result; } } diff --git a/library/HTMLPurifier/ChildDef/StrictBlockquote.php b/library/HTMLPurifier/ChildDef/StrictBlockquote.php index dfae8a6e5..3270a46e1 100644 --- a/library/HTMLPurifier/ChildDef/StrictBlockquote.php +++ b/library/HTMLPurifier/ChildDef/StrictBlockquote.php @@ -5,75 +5,97 @@ */ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required { + /** + * @type array + */ protected $real_elements; + + /** + * @type array + */ protected $fake_elements; + + /** + * @type bool + */ public $allow_empty = true; + + /** + * @type string + */ public $type = 'strictblockquote'; + + /** + * @type bool + */ protected $init = false; /** + * @param HTMLPurifier_Config $config + * @return array * @note We don't want MakeWellFormed to auto-close inline elements since * they might be allowed. */ - public function getAllowedElements($config) { + public function getAllowedElements($config) + { $this->init($config); return $this->fake_elements; } - public function validateChildren($tokens_of_children, $config, $context) { - + /** + * @param array $children + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return array + */ + public function validateChildren($children, $config, $context) + { $this->init($config); // trick the parent class into thinking it allows more $this->elements = $this->fake_elements; - $result = parent::validateChildren($tokens_of_children, $config, $context); + $result = parent::validateChildren($children, $config, $context); $this->elements = $this->real_elements; - if ($result === false) return array(); - if ($result === true) $result = $tokens_of_children; + if ($result === false) { + return array(); + } + if ($result === true) { + $result = $children; + } $def = $config->getHTMLDefinition(); - $block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper); - $block_wrap_end = new HTMLPurifier_Token_End( $def->info_block_wrapper); - $is_inline = false; - $depth = 0; + $block_wrap_name = $def->info_block_wrapper; + $block_wrap = false; $ret = array(); - // assuming that there are no comment tokens - foreach ($result as $i => $token) { - $token = $result[$i]; - // ifs are nested for readability - if (!$is_inline) { - if (!$depth) { - if ( - ($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) || - (!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name])) - ) { - $is_inline = true; - $ret[] = $block_wrap_start; - } + foreach ($result as $node) { + if ($block_wrap === false) { + if (($node instanceof HTMLPurifier_Node_Text && !$node->is_whitespace) || + ($node instanceof HTMLPurifier_Node_Element && !isset($this->elements[$node->name]))) { + $block_wrap = new HTMLPurifier_Node_Element($def->info_block_wrapper); + $ret[] = $block_wrap; } } else { - if (!$depth) { - // starting tokens have been inline text / empty - if ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) { - if (isset($this->elements[$token->name])) { - // ended - $ret[] = $block_wrap_end; - $is_inline = false; - } - } + if ($node instanceof HTMLPurifier_Node_Element && isset($this->elements[$node->name])) { + $block_wrap = false; + } } - $ret[] = $token; - if ($token instanceof HTMLPurifier_Token_Start) $depth++; - if ($token instanceof HTMLPurifier_Token_End) $depth--; + if ($block_wrap) { + $block_wrap->children[] = $node; + } else { + $ret[] = $node; + } } - if ($is_inline) $ret[] = $block_wrap_end; return $ret; } - private function init($config) { + /** + * @param HTMLPurifier_Config $config + */ + private function init($config) + { if (!$this->init) { $def = $config->getHTMLDefinition(); // allow all inline elements diff --git a/library/HTMLPurifier/ChildDef/Table.php b/library/HTMLPurifier/ChildDef/Table.php index 34f0227dd..3e4a0f218 100644 --- a/library/HTMLPurifier/ChildDef/Table.php +++ b/library/HTMLPurifier/ChildDef/Table.php @@ -1,140 +1,222 @@ true, 'tbody' => true, 'thead' => true, - 'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true); - public function __construct() {} - public function validateChildren($tokens_of_children, $config, $context) { - if (empty($tokens_of_children)) return false; - // this ensures that the loop gets run one last time before closing - // up. It's a little bit of a hack, but it works! Just make sure you - // get rid of the token later. - $tokens_of_children[] = false; + /** + * @type array + */ + public $elements = array( + 'tr' => true, + 'tbody' => true, + 'thead' => true, + 'tfoot' => true, + 'caption' => true, + 'colgroup' => true, + 'col' => true + ); + + public function __construct() + { + } + + /** + * @param array $children + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return array + */ + public function validateChildren($children, $config, $context) + { + if (empty($children)) { + return false; + } // only one of these elements is allowed in a table $caption = false; - $thead = false; - $tfoot = false; + $thead = false; + $tfoot = false; + + // whitespace + $initial_ws = array(); + $after_caption_ws = array(); + $after_thead_ws = array(); + $after_tfoot_ws = array(); // as many of these as you want - $cols = array(); + $cols = array(); $content = array(); - $nesting = 0; // current depth so we can determine nodes - $is_collecting = false; // are we globbing together tokens to package - // into one of the collectors? - $collection = array(); // collected nodes - $tag_index = 0; // the first node might be whitespace, - // so this tells us where the start tag is - - foreach ($tokens_of_children as $token) { - $is_child = ($nesting == 0); - - if ($token === false) { - // terminating sequence started - } elseif ($token instanceof HTMLPurifier_Token_Start) { - $nesting++; - } elseif ($token instanceof HTMLPurifier_Token_End) { - $nesting--; - } + $tbody_mode = false; // if true, then we need to wrap any stray + // s with a . - // handle node collection - if ($is_collecting) { - if ($is_child) { - // okay, let's stash the tokens away - // first token tells us the type of the collection - switch ($collection[$tag_index]->name) { - case 'tr': - case 'tbody': - $content[] = $collection; - break; - case 'caption': - if ($caption !== false) break; - $caption = $collection; - break; - case 'thead': - case 'tfoot': - // access the appropriate variable, $thead or $tfoot - $var = $collection[$tag_index]->name; - if ($$var === false) { - $$var = $collection; - } else { - // transmutate the first and less entries into - // tbody tags, and then put into content - $collection[$tag_index]->name = 'tbody'; - $collection[count($collection)-1]->name = 'tbody'; - $content[] = $collection; - } - break; - case 'colgroup': - $cols[] = $collection; - break; - } - $collection = array(); - $is_collecting = false; - $tag_index = 0; + $ws_accum =& $initial_ws; + + foreach ($children as $node) { + if ($node instanceof HTMLPurifier_Node_Comment) { + $ws_accum[] = $node; + continue; + } + switch ($node->name) { + case 'tbody': + $tbody_mode = true; + // fall through + case 'tr': + $content[] = $node; + $ws_accum =& $content; + break; + case 'caption': + // there can only be one caption! + if ($caption !== false) break; + $caption = $node; + $ws_accum =& $after_caption_ws; + break; + case 'thead': + $tbody_mode = true; + // XXX This breaks rendering properties with + // Firefox, which never floats a to + // the top. Ever. (Our scheme will float the + // first to the top.) So maybe + // s that are not first should be + // turned into ? Very tricky, indeed. + if ($thead === false) { + $thead = $node; + $ws_accum =& $after_thead_ws; } else { - // add the node to the collection - $collection[] = $token; + // Oops, there's a second one! What + // should we do? Current behavior is to + // transmutate the first and last entries into + // tbody tags, and then put into content. + // Maybe a better idea is to *attach + // it* to the existing thead or tfoot? + // We don't do this, because Firefox + // doesn't float an extra tfoot to the + // bottom like it does for the first one. + $node->name = 'tbody'; + $content[] = $node; + $ws_accum =& $content; } - } - - // terminate - if ($token === false) break; - - if ($is_child) { - // determine what we're dealing with - if ($token->name == 'col') { - // the only empty tag in the possie, we can handle it - // immediately - $cols[] = array_merge($collection, array($token)); - $collection = array(); - $tag_index = 0; - continue; + break; + case 'tfoot': + // see above for some aveats + $tbody_mode = true; + if ($tfoot === false) { + $tfoot = $node; + $ws_accum =& $after_tfoot_ws; + } else { + $node->name = 'tbody'; + $content[] = $node; + $ws_accum =& $content; } - switch($token->name) { - case 'caption': - case 'colgroup': - case 'thead': - case 'tfoot': - case 'tbody': - case 'tr': - $is_collecting = true; - $collection[] = $token; - continue; - default: - if (!empty($token->is_whitespace)) { - $collection[] = $token; - $tag_index++; - } - continue; + break; + case 'colgroup': + case 'col': + $cols[] = $node; + $ws_accum =& $cols; + break; + case '#PCDATA': + // How is whitespace handled? We treat is as sticky to + // the *end* of the previous element. So all of the + // nonsense we have worked on is to keep things + // together. + if (!empty($node->is_whitespace)) { + $ws_accum[] = $node; } + break; } } - if (empty($content)) return false; - - $ret = array(); - if ($caption !== false) $ret = array_merge($ret, $caption); - if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); - if ($thead !== false) $ret = array_merge($ret, $thead); - if ($tfoot !== false) $ret = array_merge($ret, $tfoot); - foreach ($content as $token_array) $ret = array_merge($ret, $token_array); - if (!empty($collection) && $is_collecting == false){ - // grab the trailing space - $ret = array_merge($ret, $collection); + if (empty($content)) { + return false; + } + + $ret = $initial_ws; + if ($caption !== false) { + $ret[] = $caption; + $ret = array_merge($ret, $after_caption_ws); + } + if ($cols !== false) { + $ret = array_merge($ret, $cols); + } + if ($thead !== false) { + $ret[] = $thead; + $ret = array_merge($ret, $after_thead_ws); + } + if ($tfoot !== false) { + $ret[] = $tfoot; + $ret = array_merge($ret, $after_tfoot_ws); } - array_pop($tokens_of_children); // remove phantom token + if ($tbody_mode) { + // we have to shuffle tr into tbody + $current_tr_tbody = null; + + foreach($content as $node) { + switch ($node->name) { + case 'tbody': + $current_tr_tbody = null; + $ret[] = $node; + break; + case 'tr': + if ($current_tr_tbody === null) { + $current_tr_tbody = new HTMLPurifier_Node_Element('tbody'); + $ret[] = $current_tr_tbody; + } + $current_tr_tbody->children[] = $node; + break; + case '#PCDATA': + assert($node->is_whitespace); + if ($current_tr_tbody === null) { + $ret[] = $node; + } else { + $current_tr_tbody->children[] = $node; + } + break; + } + } + } else { + $ret = array_merge($ret, $content); + } - return ($ret === $tokens_of_children) ? true : $ret; + return $ret; } } -- cgit v1.2.3