aboutsummaryrefslogtreecommitdiffstats
path: root/library/HTMLPurifier/ChildDef
diff options
context:
space:
mode:
authorMike Macgirvin <mike@macgirvin.com>2010-09-08 20:14:17 -0700
committerMike Macgirvin <mike@macgirvin.com>2010-09-08 20:14:17 -0700
commitffb1997902facb36b78a7cfa522f41f2b3d71cda (patch)
treee9fe47acf26c5fd2c742677f2610b60d3008eb26 /library/HTMLPurifier/ChildDef
parentb49858b038a0a05bbe7685929e88071d0e125538 (diff)
downloadvolse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.tar.gz
volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.tar.bz2
volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.zip
mistpark 2.0 infrasturcture lands
Diffstat (limited to 'library/HTMLPurifier/ChildDef')
-rw-r--r--library/HTMLPurifier/ChildDef/Chameleon.php48
-rw-r--r--library/HTMLPurifier/ChildDef/Custom.php90
-rw-r--r--library/HTMLPurifier/ChildDef/Empty.php20
-rw-r--r--library/HTMLPurifier/ChildDef/Optional.php26
-rw-r--r--library/HTMLPurifier/ChildDef/Required.php117
-rw-r--r--library/HTMLPurifier/ChildDef/StrictBlockquote.php88
-rw-r--r--library/HTMLPurifier/ChildDef/Table.php142
7 files changed, 531 insertions, 0 deletions
diff --git a/library/HTMLPurifier/ChildDef/Chameleon.php b/library/HTMLPurifier/ChildDef/Chameleon.php
new file mode 100644
index 000000000..15c364ee3
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Chameleon.php
@@ -0,0 +1,48 @@
+<?php
+
+/**
+ * Definition that uses different definitions depending on context.
+ *
+ * The del and ins tags are notable because they allow different types of
+ * elements depending on whether or not they're in a block or inline context.
+ * Chameleon allows this behavior to happen by using two different
+ * definitions depending on context. While this somewhat generalized,
+ * it is specifically intended for those two tags.
+ */
+class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
+{
+
+ /**
+ * Instance of the definition object to use when inline. Usually stricter.
+ */
+ public $inline;
+
+ /**
+ * Instance of the definition object to use when block.
+ */
+ public $block;
+
+ public $type = 'chameleon';
+
+ /**
+ * @param $inline List of elements to allow when inline.
+ * @param $block List of elements to allow when block.
+ */
+ public function __construct($inline, $block) {
+ $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
+ $this->block = new HTMLPurifier_ChildDef_Optional($block);
+ $this->elements = $this->block->elements;
+ }
+
+ public function validateChildren($tokens_of_children, $config, $context) {
+ if ($context->get('IsInline') === false) {
+ return $this->block->validateChildren(
+ $tokens_of_children, $config, $context);
+ } else {
+ return $this->inline->validateChildren(
+ $tokens_of_children, $config, $context);
+ }
+ }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Custom.php b/library/HTMLPurifier/ChildDef/Custom.php
new file mode 100644
index 000000000..b68047b4b
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Custom.php
@@ -0,0 +1,90 @@
+<?php
+
+/**
+ * Custom validation class, accepts DTD child definitions
+ *
+ * @warning Currently this class is an all or nothing proposition, that is,
+ * it will only give a bool return value.
+ */
+class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
+{
+ public $type = 'custom';
+ public $allow_empty = false;
+ /**
+ * Allowed child pattern as defined by the DTD
+ */
+ public $dtd_regex;
+ /**
+ * PCRE regex derived from $dtd_regex
+ * @private
+ */
+ private $_pcre_regex;
+ /**
+ * @param $dtd_regex Allowed child pattern from the DTD
+ */
+ public function __construct($dtd_regex) {
+ $this->dtd_regex = $dtd_regex;
+ $this->_compileRegex();
+ }
+ /**
+ * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
+ */
+ protected function _compileRegex() {
+ $raw = str_replace(' ', '', $this->dtd_regex);
+ if ($raw{0} != '(') {
+ $raw = "($raw)";
+ }
+ $el = '[#a-zA-Z0-9_.-]+';
+ $reg = $raw;
+
+ // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
+ // DOING! Seriously: if there's problems, please report them.
+
+ // collect all elements into the $elements array
+ preg_match_all("/$el/", $reg, $matches);
+ foreach ($matches[0] as $match) {
+ $this->elements[$match] = true;
+ }
+
+ // setup all elements as parentheticals with leading commas
+ $reg = preg_replace("/$el/", '(,\\0)', $reg);
+
+ // remove commas when they were not solicited
+ $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);
+
+ // remove all non-paranthetical commas: they are handled by first regex
+ $reg = preg_replace("/,\(/", '(', $reg);
+
+ $this->_pcre_regex = $reg;
+ }
+ public function validateChildren($tokens_of_children, $config, $context) {
+ $list_of_children = '';
+ $nesting = 0; // depth into the nest
+ foreach ($tokens_of_children as $token) {
+ if (!empty($token->is_whitespace)) continue;
+
+ $is_child = ($nesting == 0); // direct
+
+ if ($token instanceof HTMLPurifier_Token_Start) {
+ $nesting++;
+ } elseif ($token instanceof HTMLPurifier_Token_End) {
+ $nesting--;
+ }
+
+ if ($is_child) {
+ $list_of_children .= $token->name . ',';
+ }
+ }
+ // add leading comma to deal with stray comma declarations
+ $list_of_children = ',' . rtrim($list_of_children, ',');
+ $okay =
+ preg_match(
+ '/^,?'.$this->_pcre_regex.'$/',
+ $list_of_children
+ );
+
+ return (bool) $okay;
+ }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Empty.php b/library/HTMLPurifier/ChildDef/Empty.php
new file mode 100644
index 000000000..13171f665
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Empty.php
@@ -0,0 +1,20 @@
+<?php
+
+/**
+ * Definition that disallows all elements.
+ * @warning validateChildren() in this class is actually never called, because
+ * empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
+ * before child definitions are parsed in earnest by
+ * HTMLPurifier_Strategy_FixNesting.
+ */
+class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
+{
+ public $allow_empty = true;
+ public $type = 'empty';
+ public function __construct() {}
+ public function validateChildren($tokens_of_children, $config, $context) {
+ return array();
+ }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Optional.php b/library/HTMLPurifier/ChildDef/Optional.php
new file mode 100644
index 000000000..32bcb9898
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Optional.php
@@ -0,0 +1,26 @@
+<?php
+
+/**
+ * Definition that allows a set of elements, and allows no children.
+ * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
+ * really, one shouldn't inherit from the other. Only altered behavior
+ * is to overload a returned false with an array. Thus, it will never
+ * return false.
+ */
+class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
+{
+ public $allow_empty = true;
+ public $type = 'optional';
+ public function validateChildren($tokens_of_children, $config, $context) {
+ $result = parent::validateChildren($tokens_of_children, $config, $context);
+ // we assume that $tokens_of_children is not modified
+ if ($result === false) {
+ if (empty($tokens_of_children)) return true;
+ elseif ($this->whitespace) return $tokens_of_children;
+ else return array();
+ }
+ return $result;
+ }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Required.php b/library/HTMLPurifier/ChildDef/Required.php
new file mode 100644
index 000000000..4889f249b
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Required.php
@@ -0,0 +1,117 @@
+<?php
+
+/**
+ * Definition that allows a set of elements, but disallows empty children.
+ */
+class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
+{
+ /**
+ * Lookup table of allowed elements.
+ * @public
+ */
+ public $elements = array();
+ /**
+ * Whether or not the last passed node was all whitespace.
+ */
+ protected $whitespace = false;
+ /**
+ * @param $elements List of allowed element names (lowercase).
+ */
+ public function __construct($elements) {
+ if (is_string($elements)) {
+ $elements = str_replace(' ', '', $elements);
+ $elements = explode('|', $elements);
+ }
+ $keys = array_keys($elements);
+ if ($keys == array_keys($keys)) {
+ $elements = array_flip($elements);
+ foreach ($elements as $i => $x) {
+ $elements[$i] = true;
+ if (empty($i)) unset($elements[$i]); // remove blank
+ }
+ }
+ $this->elements = $elements;
+ }
+ public $allow_empty = false;
+ public $type = 'required';
+ public function validateChildren($tokens_of_children, $config, $context) {
+ // Flag for subclasses
+ $this->whitespace = false;
+
+ // if there are no tokens, delete parent node
+ if (empty($tokens_of_children)) return false;
+
+ // the new set of children
+ $result = array();
+
+ // current depth into the nest
+ $nesting = 0;
+
+ // whether or not we're deleting a node
+ $is_deleting = false;
+
+ // whether or not parsed character data is allowed
+ // this controls whether or not we silently drop a tag
+ // or generate escaped HTML from it
+ $pcdata_allowed = isset($this->elements['#PCDATA']);
+
+ // a little sanity check to make sure it's not ALL whitespace
+ $all_whitespace = true;
+
+ // some configuration
+ $escape_invalid_children = $config->get('Core.EscapeInvalidChildren');
+
+ // generator
+ $gen = new HTMLPurifier_Generator($config, $context);
+
+ foreach ($tokens_of_children as $token) {
+ if (!empty($token->is_whitespace)) {
+ $result[] = $token;
+ continue;
+ }
+ $all_whitespace = false; // phew, we're not talking about whitespace
+
+ $is_child = ($nesting == 0);
+
+ if ($token instanceof HTMLPurifier_Token_Start) {
+ $nesting++;
+ } elseif ($token instanceof HTMLPurifier_Token_End) {
+ $nesting--;
+ }
+
+ if ($is_child) {
+ $is_deleting = false;
+ if (!isset($this->elements[$token->name])) {
+ $is_deleting = true;
+ if ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text) {
+ $result[] = $token;
+ } elseif ($pcdata_allowed && $escape_invalid_children) {
+ $result[] = new HTMLPurifier_Token_Text(
+ $gen->generateFromToken($token)
+ );
+ }
+ continue;
+ }
+ }
+ if (!$is_deleting || ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text)) {
+ $result[] = $token;
+ } elseif ($pcdata_allowed && $escape_invalid_children) {
+ $result[] =
+ new HTMLPurifier_Token_Text(
+ $gen->generateFromToken($token)
+ );
+ } else {
+ // drop silently
+ }
+ }
+ if (empty($result)) return false;
+ if ($all_whitespace) {
+ $this->whitespace = true;
+ return false;
+ }
+ if ($tokens_of_children == $result) return true;
+ return $result;
+ }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/StrictBlockquote.php b/library/HTMLPurifier/ChildDef/StrictBlockquote.php
new file mode 100644
index 000000000..dfae8a6e5
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/StrictBlockquote.php
@@ -0,0 +1,88 @@
+<?php
+
+/**
+ * Takes the contents of blockquote when in strict and reformats for validation.
+ */
+class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
+{
+ protected $real_elements;
+ protected $fake_elements;
+ public $allow_empty = true;
+ public $type = 'strictblockquote';
+ protected $init = false;
+
+ /**
+ * @note We don't want MakeWellFormed to auto-close inline elements since
+ * they might be allowed.
+ */
+ public function getAllowedElements($config) {
+ $this->init($config);
+ return $this->fake_elements;
+ }
+
+ public function validateChildren($tokens_of_children, $config, $context) {
+
+ $this->init($config);
+
+ // trick the parent class into thinking it allows more
+ $this->elements = $this->fake_elements;
+ $result = parent::validateChildren($tokens_of_children, $config, $context);
+ $this->elements = $this->real_elements;
+
+ if ($result === false) return array();
+ if ($result === true) $result = $tokens_of_children;
+
+ $def = $config->getHTMLDefinition();
+ $block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper);
+ $block_wrap_end = new HTMLPurifier_Token_End( $def->info_block_wrapper);
+ $is_inline = false;
+ $depth = 0;
+ $ret = array();
+
+ // assuming that there are no comment tokens
+ foreach ($result as $i => $token) {
+ $token = $result[$i];
+ // ifs are nested for readability
+ if (!$is_inline) {
+ if (!$depth) {
+ if (
+ ($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) ||
+ (!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name]))
+ ) {
+ $is_inline = true;
+ $ret[] = $block_wrap_start;
+ }
+ }
+ } else {
+ if (!$depth) {
+ // starting tokens have been inline text / empty
+ if ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) {
+ if (isset($this->elements[$token->name])) {
+ // ended
+ $ret[] = $block_wrap_end;
+ $is_inline = false;
+ }
+ }
+ }
+ }
+ $ret[] = $token;
+ if ($token instanceof HTMLPurifier_Token_Start) $depth++;
+ if ($token instanceof HTMLPurifier_Token_End) $depth--;
+ }
+ if ($is_inline) $ret[] = $block_wrap_end;
+ return $ret;
+ }
+
+ private function init($config) {
+ if (!$this->init) {
+ $def = $config->getHTMLDefinition();
+ // allow all inline elements
+ $this->real_elements = $this->elements;
+ $this->fake_elements = $def->info_content_sets['Flow'];
+ $this->fake_elements['#PCDATA'] = true;
+ $this->init = true;
+ }
+ }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Table.php b/library/HTMLPurifier/ChildDef/Table.php
new file mode 100644
index 000000000..34f0227dd
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Table.php
@@ -0,0 +1,142 @@
+<?php
+
+/**
+ * Definition for tables
+ */
+class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
+{
+ public $allow_empty = false;
+ public $type = 'table';
+ public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
+ 'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);
+ public function __construct() {}
+ public function validateChildren($tokens_of_children, $config, $context) {
+ if (empty($tokens_of_children)) return false;
+
+ // this ensures that the loop gets run one last time before closing
+ // up. It's a little bit of a hack, but it works! Just make sure you
+ // get rid of the token later.
+ $tokens_of_children[] = false;
+
+ // only one of these elements is allowed in a table
+ $caption = false;
+ $thead = false;
+ $tfoot = false;
+
+ // as many of these as you want
+ $cols = array();
+ $content = array();
+
+ $nesting = 0; // current depth so we can determine nodes
+ $is_collecting = false; // are we globbing together tokens to package
+ // into one of the collectors?
+ $collection = array(); // collected nodes
+ $tag_index = 0; // the first node might be whitespace,
+ // so this tells us where the start tag is
+
+ foreach ($tokens_of_children as $token) {
+ $is_child = ($nesting == 0);
+
+ if ($token === false) {
+ // terminating sequence started
+ } elseif ($token instanceof HTMLPurifier_Token_Start) {
+ $nesting++;
+ } elseif ($token instanceof HTMLPurifier_Token_End) {
+ $nesting--;
+ }
+
+ // handle node collection
+ if ($is_collecting) {
+ if ($is_child) {
+ // okay, let's stash the tokens away
+ // first token tells us the type of the collection
+ switch ($collection[$tag_index]->name) {
+ case 'tr':
+ case 'tbody':
+ $content[] = $collection;
+ break;
+ case 'caption':
+ if ($caption !== false) break;
+ $caption = $collection;
+ break;
+ case 'thead':
+ case 'tfoot':
+ // access the appropriate variable, $thead or $tfoot
+ $var = $collection[$tag_index]->name;
+ if ($$var === false) {
+ $$var = $collection;
+ } else {
+ // transmutate the first and less entries into
+ // tbody tags, and then put into content
+ $collection[$tag_index]->name = 'tbody';
+ $collection[count($collection)-1]->name = 'tbody';
+ $content[] = $collection;
+ }
+ break;
+ case 'colgroup':
+ $cols[] = $collection;
+ break;
+ }
+ $collection = array();
+ $is_collecting = false;
+ $tag_index = 0;
+ } else {
+ // add the node to the collection
+ $collection[] = $token;
+ }
+ }
+
+ // terminate
+ if ($token === false) break;
+
+ if ($is_child) {
+ // determine what we're dealing with
+ if ($token->name == 'col') {
+ // the only empty tag in the possie, we can handle it
+ // immediately
+ $cols[] = array_merge($collection, array($token));
+ $collection = array();
+ $tag_index = 0;
+ continue;
+ }
+ switch($token->name) {
+ case 'caption':
+ case 'colgroup':
+ case 'thead':
+ case 'tfoot':
+ case 'tbody':
+ case 'tr':
+ $is_collecting = true;
+ $collection[] = $token;
+ continue;
+ default:
+ if (!empty($token->is_whitespace)) {
+ $collection[] = $token;
+ $tag_index++;
+ }
+ continue;
+ }
+ }
+ }
+
+ if (empty($content)) return false;
+
+ $ret = array();
+ if ($caption !== false) $ret = array_merge($ret, $caption);
+ if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
+ if ($thead !== false) $ret = array_merge($ret, $thead);
+ if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
+ foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
+ if (!empty($collection) && $is_collecting == false){
+ // grab the trailing space
+ $ret = array_merge($ret, $collection);
+ }
+
+ array_pop($tokens_of_children); // remove phantom token
+
+ return ($ret === $tokens_of_children) ? true : $ret;
+
+ }
+}
+
+// vim: et sw=4 sts=4