diff options
author | Mike Macgirvin <mike@macgirvin.com> | 2010-09-08 20:14:17 -0700 |
---|---|---|
committer | Mike Macgirvin <mike@macgirvin.com> | 2010-09-08 20:14:17 -0700 |
commit | ffb1997902facb36b78a7cfa522f41f2b3d71cda (patch) | |
tree | e9fe47acf26c5fd2c742677f2610b60d3008eb26 /library/HTMLPurifier/ChildDef | |
parent | b49858b038a0a05bbe7685929e88071d0e125538 (diff) | |
download | volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.tar.gz volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.tar.bz2 volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.zip |
mistpark 2.0 infrastructure lands
Diffstat (limited to 'library/HTMLPurifier/ChildDef')
-rw-r--r-- | library/HTMLPurifier/ChildDef/Chameleon.php | 48 | ||||
-rw-r--r-- | library/HTMLPurifier/ChildDef/Custom.php | 90 | ||||
-rw-r--r-- | library/HTMLPurifier/ChildDef/Empty.php | 20 | ||||
-rw-r--r-- | library/HTMLPurifier/ChildDef/Optional.php | 26 | ||||
-rw-r--r-- | library/HTMLPurifier/ChildDef/Required.php | 117 | ||||
-rw-r--r-- | library/HTMLPurifier/ChildDef/StrictBlockquote.php | 88 | ||||
-rw-r--r-- | library/HTMLPurifier/ChildDef/Table.php | 142 |
7 files changed, 531 insertions, 0 deletions
diff --git a/library/HTMLPurifier/ChildDef/Chameleon.php b/library/HTMLPurifier/ChildDef/Chameleon.php new file mode 100644 index 000000000..15c364ee3 --- /dev/null +++ b/library/HTMLPurifier/ChildDef/Chameleon.php @@ -0,0 +1,48 @@ +<?php + +/** + * Definition that uses different definitions depending on context. + * + * The del and ins tags are notable because they allow different types of + * elements depending on whether or not they're in a block or inline context. + * Chameleon allows this behavior to happen by using two different + * definitions depending on context. While this is somewhat generalized, + * it is specifically intended for those two tags. + */ +class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef +{ + + /** + * Instance of the definition object to use when inline. Usually stricter. + */ + public $inline; + + /** + * Instance of the definition object to use when block. + */ + public $block; + + public $type = 'chameleon'; + + /** + * @param $inline List of elements to allow when inline. + * @param $block List of elements to allow when block. 
+ */ + public function __construct($inline, $block) { + $this->inline = new HTMLPurifier_ChildDef_Optional($inline); + $this->block = new HTMLPurifier_ChildDef_Optional($block); + $this->elements = $this->block->elements; + } + + public function validateChildren($tokens_of_children, $config, $context) { + if ($context->get('IsInline') === false) { + return $this->block->validateChildren( + $tokens_of_children, $config, $context); + } else { + return $this->inline->validateChildren( + $tokens_of_children, $config, $context); + } + } +} + +// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/ChildDef/Custom.php b/library/HTMLPurifier/ChildDef/Custom.php new file mode 100644 index 000000000..b68047b4b --- /dev/null +++ b/library/HTMLPurifier/ChildDef/Custom.php @@ -0,0 +1,90 @@ +<?php + +/** + * Custom validation class, accepts DTD child definitions + * + * @warning Currently this class is an all or nothing proposition, that is, + * it will only give a bool return value. + */ +class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef +{ + public $type = 'custom'; + public $allow_empty = false; + /** + * Allowed child pattern as defined by the DTD + */ + public $dtd_regex; + /** + * PCRE regex derived from $dtd_regex + * @private + */ + private $_pcre_regex; + /** + * @param $dtd_regex Allowed child pattern from the DTD + */ + public function __construct($dtd_regex) { + $this->dtd_regex = $dtd_regex; + $this->_compileRegex(); + } + /** + * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex) + */ + protected function _compileRegex() { + $raw = str_replace(' ', '', $this->dtd_regex); + if ($raw[0] != '(') { + $raw = "($raw)"; + } + $el = '[#a-zA-Z0-9_.-]+'; + $reg = $raw; + + // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M + // DOING! Seriously: if there are problems, please report them. 
+ + // collect all elements into the $elements array + preg_match_all("/$el/", $reg, $matches); + foreach ($matches[0] as $match) { + $this->elements[$match] = true; + } + + // set up all elements as parentheticals with leading commas + $reg = preg_replace("/$el/", '(,\\0)', $reg); + + // remove commas when they were not solicited + $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg); + + // remove all non-parenthetical commas: they are handled by first regex + $reg = preg_replace("/,\(/", '(', $reg); + + $this->_pcre_regex = $reg; + } + public function validateChildren($tokens_of_children, $config, $context) { + $list_of_children = ''; + $nesting = 0; // depth into the nest + foreach ($tokens_of_children as $token) { + if (!empty($token->is_whitespace)) continue; + + $is_child = ($nesting == 0); // direct child of the parent + + if ($token instanceof HTMLPurifier_Token_Start) { + $nesting++; + } elseif ($token instanceof HTMLPurifier_Token_End) { + $nesting--; + } + + if ($is_child) { + $list_of_children .= $token->name . ','; + } + } + // add leading comma to deal with stray comma declarations + $list_of_children = ',' . rtrim($list_of_children, ','); + $okay = + preg_match( + '/^,?'.$this->_pcre_regex.'$/', + $list_of_children + ); + + return (bool) $okay; + } +} + +// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/ChildDef/Empty.php b/library/HTMLPurifier/ChildDef/Empty.php new file mode 100644 index 000000000..13171f665 --- /dev/null +++ b/library/HTMLPurifier/ChildDef/Empty.php @@ -0,0 +1,20 @@ +<?php + +/** + * Definition that disallows all elements. + * @warning validateChildren() in this class is actually never called, because + * empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed + * before child definitions are parsed in earnest by + * HTMLPurifier_Strategy_FixNesting. 
+ */ +class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef +{ + public $allow_empty = true; + public $type = 'empty'; + public function __construct() {} + public function validateChildren($tokens_of_children, $config, $context) { + return array(); + } +} + +// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/ChildDef/Optional.php b/library/HTMLPurifier/ChildDef/Optional.php new file mode 100644 index 000000000..32bcb9898 --- /dev/null +++ b/library/HTMLPurifier/ChildDef/Optional.php @@ -0,0 +1,26 @@ +<?php + +/** + * Definition that allows a set of elements, and also permits having no children. + * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required, + * really, one shouldn't inherit from the other. Only altered behavior + * is to replace a returned false with true or an array. Thus, it will never + * return false. + */ +class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required +{ + public $allow_empty = true; + public $type = 'optional'; + public function validateChildren($tokens_of_children, $config, $context) { + $result = parent::validateChildren($tokens_of_children, $config, $context); + // we assume that $tokens_of_children is not modified + if ($result === false) { + if (empty($tokens_of_children)) return true; + elseif ($this->whitespace) return $tokens_of_children; + else return array(); + } + return $result; + } +} + +// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/ChildDef/Required.php b/library/HTMLPurifier/ChildDef/Required.php new file mode 100644 index 000000000..4889f249b --- /dev/null +++ b/library/HTMLPurifier/ChildDef/Required.php @@ -0,0 +1,117 @@ +<?php + +/** + * Definition that allows a set of elements, but disallows empty children. + */ +class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef +{ + /** + * Lookup table of allowed elements. + * @public + */ + public $elements = array(); + /** + * Whether or not the last passed node was all whitespace. 
+ */ + protected $whitespace = false; + /** + * @param $elements List of allowed element names (lowercase), or a '|'-separated string of them. + */ + public function __construct($elements) { + if (is_string($elements)) { + $elements = str_replace(' ', '', $elements); + $elements = explode('|', $elements); + } + $keys = array_keys($elements); + if ($keys == array_keys($keys)) { + $elements = array_flip($elements); + foreach ($elements as $i => $x) { + $elements[$i] = true; + if (empty($i)) unset($elements[$i]); // remove blank + } + } + $this->elements = $elements; + } + public $allow_empty = false; + public $type = 'required'; + public function validateChildren($tokens_of_children, $config, $context) { + // Flag for subclasses + $this->whitespace = false; + + // if there are no tokens, delete parent node + if (empty($tokens_of_children)) return false; + + // the new set of children + $result = array(); + + // current depth into the nest + $nesting = 0; + + // whether or not we're deleting a node + $is_deleting = false; + + // whether or not parsed character data is allowed + // this controls whether or not we silently drop a tag + // or generate escaped HTML from it + $pcdata_allowed = isset($this->elements['#PCDATA']); + + // a little sanity check to make sure it's not ALL whitespace + $all_whitespace = true; + + // some configuration + $escape_invalid_children = $config->get('Core.EscapeInvalidChildren'); + + // generator + $gen = new HTMLPurifier_Generator($config, $context); + + foreach ($tokens_of_children as $token) { + if (!empty($token->is_whitespace)) { + $result[] = $token; + continue; + } + $all_whitespace = false; // phew, we're not talking about whitespace + + $is_child = ($nesting == 0); + + if ($token instanceof HTMLPurifier_Token_Start) { + $nesting++; + } elseif ($token instanceof HTMLPurifier_Token_End) { + $nesting--; + } + + if ($is_child) { + $is_deleting = false; + if (!isset($this->elements[$token->name])) { + $is_deleting = true; + if ($pcdata_allowed && $token instanceof 
HTMLPurifier_Token_Text) { + $result[] = $token; + } elseif ($pcdata_allowed && $escape_invalid_children) { + $result[] = new HTMLPurifier_Token_Text( + $gen->generateFromToken($token) + ); + } + continue; + } + } + if (!$is_deleting || ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text)) { + $result[] = $token; + } elseif ($pcdata_allowed && $escape_invalid_children) { + $result[] = + new HTMLPurifier_Token_Text( + $gen->generateFromToken($token) + ); + } else { + // drop silently + } + } + if (empty($result)) return false; + if ($all_whitespace) { + $this->whitespace = true; + return false; + } + if ($tokens_of_children == $result) return true; + return $result; + } +} + +// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/ChildDef/StrictBlockquote.php b/library/HTMLPurifier/ChildDef/StrictBlockquote.php new file mode 100644 index 000000000..dfae8a6e5 --- /dev/null +++ b/library/HTMLPurifier/ChildDef/StrictBlockquote.php @@ -0,0 +1,88 @@ +<?php + +/** + * Takes the contents of blockquote when in strict and reformats for validation. + */ +class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required +{ + protected $real_elements; + protected $fake_elements; + public $allow_empty = true; + public $type = 'strictblockquote'; + protected $init = false; + + /** + * @note We don't want MakeWellFormed to auto-close inline elements since + * they might be allowed. 
+ */ + public function getAllowedElements($config) { + $this->init($config); + return $this->fake_elements; + } + + public function validateChildren($tokens_of_children, $config, $context) { + + $this->init($config); + + // trick the parent class into thinking it allows more + $this->elements = $this->fake_elements; + $result = parent::validateChildren($tokens_of_children, $config, $context); + $this->elements = $this->real_elements; + + if ($result === false) return array(); + if ($result === true) $result = $tokens_of_children; + + $def = $config->getHTMLDefinition(); + $block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper); + $block_wrap_end = new HTMLPurifier_Token_End( $def->info_block_wrapper); + $is_inline = false; + $depth = 0; + $ret = array(); + + // assuming that there are no comment tokens + foreach ($result as $i => $token) { + $token = $result[$i]; + // ifs are nested for readability + if (!$is_inline) { + if (!$depth) { + if ( + ($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) || + (!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name])) + ) { + $is_inline = true; + $ret[] = $block_wrap_start; + } + } + } else { + if (!$depth) { + // starting tokens have been inline text / empty + if ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) { + if (isset($this->elements[$token->name])) { + // inline run ended: close the wrapper + $ret[] = $block_wrap_end; + $is_inline = false; + } + } + } + } + $ret[] = $token; + if ($token instanceof HTMLPurifier_Token_Start) $depth++; + if ($token instanceof HTMLPurifier_Token_End) $depth--; + } + if ($is_inline) $ret[] = $block_wrap_end; + return $ret; + } + + private function init($config) { + if (!$this->init) { + $def = $config->getHTMLDefinition(); + // allow all inline elements + $this->real_elements = $this->elements; + $this->fake_elements = $def->info_content_sets['Flow']; + $this->fake_elements['#PCDATA'] = true; + $this->init = 
true; + } + } +} + +// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/ChildDef/Table.php b/library/HTMLPurifier/ChildDef/Table.php new file mode 100644 index 000000000..34f0227dd --- /dev/null +++ b/library/HTMLPurifier/ChildDef/Table.php @@ -0,0 +1,142 @@ +<?php + +/** + * Definition for tables + */ +class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef +{ + public $allow_empty = false; + public $type = 'table'; + public $elements = array('tr' => true, 'tbody' => true, 'thead' => true, + 'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true); + public function __construct() {} + public function validateChildren($tokens_of_children, $config, $context) { + if (empty($tokens_of_children)) return false; + + // this ensures that the loop gets run one last time before closing + // up. It's a little bit of a hack, but it works! Just make sure you + // get rid of the token later. + $tokens_of_children[] = false; + + // only one of these elements is allowed in a table + $caption = false; + $thead = false; + $tfoot = false; + + // as many of these as you want + $cols = array(); + $content = array(); + + $nesting = 0; // current depth so we can determine nodes + $is_collecting = false; // are we globbing together tokens to package + // into one of the collectors? 
+ $collection = array(); // collected nodes + $tag_index = 0; // the first node might be whitespace, + // so this tells us where the start tag is + + foreach ($tokens_of_children as $token) { + $is_child = ($nesting == 0); + + if ($token === false) { + // terminating sequence started + } elseif ($token instanceof HTMLPurifier_Token_Start) { + $nesting++; + } elseif ($token instanceof HTMLPurifier_Token_End) { + $nesting--; + } + + // handle node collection + if ($is_collecting) { + if ($is_child) { + // okay, let's stash the tokens away + // first token tells us the type of the collection + switch ($collection[$tag_index]->name) { + case 'tr': + case 'tbody': + $content[] = $collection; + break; + case 'caption': + if ($caption !== false) break; + $caption = $collection; + break; + case 'thead': + case 'tfoot': + // access the appropriate variable, $thead or $tfoot + $var = $collection[$tag_index]->name; + if ($$var === false) { + $$var = $collection; + } else { + // transmute the first and last entries into + // tbody tags, and then put into content + $collection[$tag_index]->name = 'tbody'; + $collection[count($collection)-1]->name = 'tbody'; + $content[] = $collection; + } + break; + case 'colgroup': + $cols[] = $collection; + break; + } + $collection = array(); + $is_collecting = false; + $tag_index = 0; + } else { + // add the node to the collection + $collection[] = $token; + } + } + + // terminate + if ($token === false) break; + + if ($is_child) { + // determine what we're dealing with + if ($token->name == 'col') { + // the only empty tag in the posse, we can handle it + // immediately + $cols[] = array_merge($collection, array($token)); + $collection = array(); + $tag_index = 0; + continue; + } + switch($token->name) { + case 'caption': + case 'colgroup': + case 'thead': + case 'tfoot': + case 'tbody': + case 'tr': + $is_collecting = true; + $collection[] = $token; + continue 2; + default: + if (!empty($token->is_whitespace)) { + $collection[] = $token; 
+ $tag_index++; + } + continue 2; + } + } + } + + if (empty($content)) return false; + + $ret = array(); + if ($caption !== false) $ret = array_merge($ret, $caption); + if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); + if ($thead !== false) $ret = array_merge($ret, $thead); + if ($tfoot !== false) $ret = array_merge($ret, $tfoot); + foreach ($content as $token_array) $ret = array_merge($ret, $token_array); + if (!empty($collection) && $is_collecting == false){ + // grab the trailing space + $ret = array_merge($ret, $collection); + } + + array_pop($tokens_of_children); // remove phantom token + + return ($ret === $tokens_of_children) ? true : $ret; + + } +} + +// vim: et sw=4 sts=4 |