mistpark 2.0 infrastructure lands

author: Mike Macgirvin <mike@macgirvin.com> 2010-09-08 20:14:17 -0700
committer: Mike Macgirvin <mike@macgirvin.com> 2010-09-08 20:14:17 -0700
commit: ffb1997902facb36b78a7cfa522f41f2b3d71cda (patch)
tree: e9fe47acf26c5fd2c742677f2610b60d3008eb26 /library/HTMLPurifier/ChildDef
parent: b49858b038a0a05bbe7685929e88071d0e125538 (diff)
download: volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.tar.gz
volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.tar.bz2
volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.zip
7 files changed, 531 insertions, 0 deletions
diff --git a/library/HTMLPurifier/ChildDef/Chameleon.php b/library/HTMLPurifier/ChildDef/Chameleon.php
new file mode 100644
index 000000000..15c364ee3
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Chameleon.php
@@ -0,0 +1,48 @@
+<?php
+
+/**
+ * Child definition whose rules depend on the surrounding context.
+ *
+ * The del and ins tags are notable because they allow different types of
+ * elements depending on whether or not they're in a block or inline context.
+ * Chameleon models this by holding two definitions and delegating to one of
+ * them on every validation.  While this is somewhat generalized,
+ * it is specifically intended for those two tags.
+ */
+class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
+{
+
+    /**
+     * Definition object applied in an inline context. Usually stricter.
+     */
+    public $inline;
+
+    /**
+     * Definition object applied in a block context.
+     */
+    public $block;
+
+    public $type = 'chameleon';
+
+    /**
+     * @param $inline List of elements to allow when inline.
+     * @param $block List of elements to allow when block.
+     */
+    public function __construct($inline, $block) {
+        $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
+        $this->block  = new HTMLPurifier_ChildDef_Optional($block);
+        $this->elements = $this->block->elements; // advertise the block set
+    }
+
+    public function validateChildren($tokens_of_children, $config, $context) {
+        // only a strict false for IsInline selects the block definition;
+        // every other value falls through to the inline one
+        $delegate = ($context->get('IsInline') === false)
+            ? $this->block
+            : $this->inline;
+        return $delegate->validateChildren(
+            $tokens_of_children, $config, $context);
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Custom.php b/library/HTMLPurifier/ChildDef/Custom.php
new file mode 100644
index 000000000..b68047b4b
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Custom.php
@@ -0,0 +1,90 @@
+<?php
+
+/**
+ * Custom validation class, accepts DTD child definitions
+ *
+ * @warning Currently this class is an all or nothing proposition, that is,
+ *          it will only give a bool return value.
+ */
+class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
+{
+    public $type = 'custom';
+    public $allow_empty = false;
+    /**
+     * Allowed child pattern as defined by the DTD
+     */
+    public $dtd_regex;
+    /**
+     * PCRE regex derived from $dtd_regex
+     * @private
+     */
+    private $_pcre_regex;
+    /**
+     * @param $dtd_regex Allowed child pattern from the DTD
+     */
+    public function __construct($dtd_regex) {
+        $this->dtd_regex = $dtd_regex;
+        $this->_compileRegex();
+    }
+    /**
+     * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
+     */
+    protected function _compileRegex() {
+        $raw = str_replace(' ', '', $this->dtd_regex);
+        if ($raw === '' || $raw[0] != '(') {
+            $raw = "($raw)";
+        }
+        $el = '[#a-zA-Z0-9_.-]+';
+        $reg = $raw;
+
+        // NOTE: this translation is heuristic and may mishandle unusual DTD
+        // patterns. Example: "(a,b?)" compiles to "((,a)(,b)?)".
+
+        // collect all elements into the $elements array
+        preg_match_all("/$el/", $reg, $matches);
+        foreach ($matches[0] as $match) {
+            $this->elements[$match] = true;
+        }
+
+        // setup all elements as parentheticals with leading commas
+        $reg = preg_replace("/$el/", '(,\\0)', $reg);
+
+        // remove commas when they were not solicited
+        $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);
+
+        // remove all non-parenthetical commas: they are handled by first regex
+        $reg = preg_replace("/,\(/", '(', $reg);
+
+        $this->_pcre_regex = $reg;
+    }
+    /**
+     * Validates the direct children against the compiled DTD pattern.
+     * Whitespace tokens are skipped and nested tokens are ignored.
+     * @return bool True if the comma-joined child names match, else false.
+     */
+    public function validateChildren($tokens_of_children, $config, $context) {
+        $list_of_children = '';
+        $nesting = 0; // depth into the nest
+        foreach ($tokens_of_children as $token) {
+            if (!empty($token->is_whitespace)) continue;
+
+            $is_child = ($nesting == 0); // direct
+
+            if ($token instanceof HTMLPurifier_Token_Start) {
+                $nesting++;
+            } elseif ($token instanceof HTMLPurifier_Token_End) {
+                $nesting--;
+            }
+
+            if ($is_child) {
+                $list_of_children .= $token->name . ',';
+            }
+        }
+        // add leading comma to deal with stray comma declarations
+        $list_of_children = ',' . rtrim($list_of_children, ',');
+        // preg_match yields 0 or false for a mismatch or error: both mean invalid
+        return (bool) preg_match('/^,?'.$this->_pcre_regex.'$/', $list_of_children);
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Empty.php b/library/HTMLPurifier/ChildDef/Empty.php
new file mode 100644
index 000000000..13171f665
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Empty.php
@@ -0,0 +1,20 @@
+<?php
+
+/**
+ * Definition that disallows all elements: validateChildren() always answers
+ * with an empty array, whatever tokens it is given.
+ * @warning validateChildren() in this class is actually never called: empty
+ *          elements are corrected in HTMLPurifier_Strategy_MakeWellFormed before
+ *          child definitions are parsed in earnest by HTMLPurifier_Strategy_FixNesting.
+ */
+class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
+{
+    public $allow_empty = true;
+    public $type = 'empty';
+    public function __construct() {}
+    public function validateChildren($tokens_of_children, $config, $context) {
+        return array();
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Optional.php b/library/HTMLPurifier/ChildDef/Optional.php
new file mode 100644
index 000000000..32bcb9898
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Optional.php
@@ -0,0 +1,26 @@
+<?php
+
+/**
+ * Definition that allows a set of elements, and also accepts having no children.
+ * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required;
+ *       really, one shouldn't inherit from the other.  The only altered
+ *       behavior is that a false returned by the parent is replaced with true
+ *       or an array, so this definition never returns false.
+ */
+class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
+{
+    public $allow_empty = true;
+    public $type = 'optional';
+    public function validateChildren($tokens_of_children, $config, $context) {
+        $verdict = parent::validateChildren($tokens_of_children, $config, $context);
+        // anything but false is passed through untouched
+        if ($verdict !== false) return $verdict;
+        // we assume that $tokens_of_children is not modified
+        if (empty($tokens_of_children)) {
+            return true;
+        }
+        return $this->whitespace ? $tokens_of_children : array();
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Required.php b/library/HTMLPurifier/ChildDef/Required.php
new file mode 100644
index 000000000..4889f249b
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Required.php
@@ -0,0 +1,117 @@
+<?php
+
+/**
+ * Definition that allows a set of elements, but disallows empty children.
+ */
+class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
+{
+    /**
+     * Lookup table of allowed elements (element name => true).
+     * @public
+     */
+    public $elements = array();
+    /**
+     * Whether or not the last passed node was all whitespace.
+     */
+    protected $whitespace = false;
+    /**
+     * @param $elements Allowed element names (lowercase): a '|'-separated string, a plain list, or a name => true lookup.
+     */
+    public function __construct($elements) {
+        if (is_string($elements)) {
+            $elements = str_replace(' ', '', $elements);
+            $elements = explode('|', $elements);
+        }
+        $keys = array_keys($elements);
+        if ($keys === array_keys($keys)) { // strict: only a real 0..n-1 list is flipped
+            $elements = array_flip($elements);
+            foreach ($elements as $i => $x) {
+                $elements[$i] = true;
+                if (empty($i)) unset($elements[$i]); // remove blank
+            }
+        }
+        $this->elements = $elements;
+    }
+    public $allow_empty = false;
+    public $type = 'required';
+    /** Filters the children; returns false (no usable children), true (unchanged) or the filtered token array. */
+    public function validateChildren($tokens_of_children, $config, $context) {
+        // reset the all-whitespace flag exposed to subclasses
+        $this->whitespace = false;
+
+        // if there are no tokens, delete parent node
+        if (empty($tokens_of_children)) return false;
+
+        // the new set of children
+        $result = array();
+
+        // current depth into the nest
+        $nesting = 0;
+
+        // whether or not we're deleting a node
+        $is_deleting = false;
+
+        // whether or not parsed character data is allowed
+        // this controls whether or not we silently drop a tag
+        // or generate escaped HTML from it
+        $pcdata_allowed = isset($this->elements['#PCDATA']);
+
+        // a little sanity check to make sure it's not ALL whitespace
+        $all_whitespace = true;
+
+        // some configuration
+        $escape_invalid_children = $config->get('Core.EscapeInvalidChildren');
+
+        // generator used to turn rejected tokens back into escaped text
+        $gen = new HTMLPurifier_Generator($config, $context);
+
+        foreach ($tokens_of_children as $token) {
+            if (!empty($token->is_whitespace)) {
+                $result[] = $token;
+                continue;
+            }
+            $all_whitespace = false; // phew, we're not talking about whitespace
+
+            $is_child = ($nesting == 0);
+
+            if ($token instanceof HTMLPurifier_Token_Start) {
+                $nesting++;
+            } elseif ($token instanceof HTMLPurifier_Token_End) {
+                $nesting--;
+            }
+
+            if ($is_child) {
+                $is_deleting = false;
+                if (!isset($this->elements[$token->name])) {
+                    $is_deleting = true;
+                    if ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text) {
+                        $result[] = $token;
+                    } elseif ($pcdata_allowed && $escape_invalid_children) {
+                        $result[] = new HTMLPurifier_Token_Text(
+                            $gen->generateFromToken($token)
+                        );
+                    }
+                    continue;
+                }
+            }
+            if (!$is_deleting || ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text)) {
+                $result[] = $token;
+            } elseif ($pcdata_allowed && $escape_invalid_children) {
+                $result[] = new HTMLPurifier_Token_Text(
+                    $gen->generateFromToken($token)
+                );
+            } else {
+                // drop silently
+            }
+        }
+        if (empty($result)) return false;
+        if ($all_whitespace) {
+            $this->whitespace = true;
+            return false;
+        }
+        if ($tokens_of_children == $result) return true; // nothing was changed
+        return $result;
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/StrictBlockquote.php b/library/HTMLPurifier/ChildDef/StrictBlockquote.php
new file mode 100644
index 000000000..dfae8a6e5
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/StrictBlockquote.php
@@ -0,0 +1,88 @@
+<?php
+
+/**
+ * Takes the contents of blockquote when in strict and reformats for validation.
+ */
+class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
+{
+    protected $real_elements;
+    protected $fake_elements;
+    public $allow_empty = true;
+    public $type = 'strictblockquote';
+    protected $init = false;
+
+    /**
+     * @note We don't want MakeWellFormed to auto-close inline elements since
+     *       they might be allowed.
+     */
+    public function getAllowedElements($config) {
+        $this->init($config);
+        return $this->fake_elements;
+    }
+
+    public function validateChildren($tokens_of_children, $config, $context) {
+
+        $this->init($config);
+
+        // validate against the wider set, then restore the real one
+        $this->elements = $this->fake_elements;
+        $children = parent::validateChildren($tokens_of_children, $config, $context);
+        $this->elements = $this->real_elements;
+
+        if ($children === false) return array();
+        if ($children === true) $children = $tokens_of_children;
+
+        $html_def = $config->getHTMLDefinition();
+        $block_wrap_start = new HTMLPurifier_Token_Start($html_def->info_block_wrapper);
+        $block_wrap_end   = new HTMLPurifier_Token_End(  $html_def->info_block_wrapper);
+        $is_inline = false;
+        $depth = 0;
+        $wrapped = array();
+
+        // assuming that there are no comment tokens
+        foreach ($children as $token) {
+            // wrapping decisions are only taken between top-level nodes
+            if (!$depth) {
+                if (!$is_inline) {
+                    // non-whitespace text, or an element that is not in
+                    // $this->elements, opens a wrapper
+                    if ($token instanceof HTMLPurifier_Token_Text) {
+                        $needs_wrap = !$token->is_whitespace;
+                    } else {
+                        $needs_wrap = !isset($this->elements[$token->name]);
+                    }
+                    if ($needs_wrap) {
+                        $is_inline = true;
+                        $wrapped[] = $block_wrap_start;
+                    }
+                } elseif (
+                    ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) &&
+                    isset($this->elements[$token->name])
+                ) {
+                    // an allowed element begins, so the open wrapper ends here
+                    $wrapped[] = $block_wrap_end;
+                    $is_inline = false;
+                }
+            }
+            $wrapped[] = $token;
+            // track nesting so only top-level nodes are examined
+            if ($token instanceof HTMLPurifier_Token_Start) $depth++;
+            if ($token instanceof HTMLPurifier_Token_End)   $depth--;
+        }
+        if ($is_inline) $wrapped[] = $block_wrap_end;
+        return $wrapped;
+    }
+
+    private function init($config) {
+        // set up once; later calls are no-ops
+        if ($this->init) return;
+        $def = $config->getHTMLDefinition();
+        // allow all inline elements
+        $this->real_elements = $this->elements;
+        $this->fake_elements = $def->info_content_sets['Flow'];
+        $this->fake_elements['#PCDATA'] = true;
+        $this->init = true;
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Table.php b/library/HTMLPurifier/ChildDef/Table.php
new file mode 100644
index 000000000..34f0227dd
--- /dev/null
+++ b/library/HTMLPurifier/ChildDef/Table.php
@@ -0,0 +1,142 @@
+<?php
+
+/**
+ * Definition for tables: regroups children as caption, cols, thead, tfoot, rows.
+ */
+class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
+{
+    public $allow_empty = false;
+    public $type = 'table';
+    public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
+        'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);
+    public function __construct() {}
+    /** Returns false if no row content was collected, true if nothing changed, else the regrouped tokens. */
+    public function validateChildren($tokens_of_children, $config, $context) {
+        if (empty($tokens_of_children)) return false;
+
+        // this ensures that the loop gets run one last time before closing
+        // up. It's a little bit of a hack, but it works! Just make sure you
+        // get rid of the token later.
+        $tokens_of_children[] = false;
+
+        // only one of these elements is allowed in a table
+        $caption = false;
+        $thead   = false;
+        $tfoot   = false;
+
+        // as many of these as you want
+        $cols    = array();
+        $content = array();
+
+        $nesting = 0; // current depth so we can determine nodes
+        $is_collecting = false; // are we globbing together tokens to package
+                                // into one of the collectors?
+        $collection = array(); // collected nodes
+        $tag_index = 0; // the first node might be whitespace,
+                            // so this tells us where the start tag is
+
+        foreach ($tokens_of_children as $token) {
+            $is_child = ($nesting == 0);
+
+            if ($token === false) {
+                // terminating sequence started
+            } elseif ($token instanceof HTMLPurifier_Token_Start) {
+                $nesting++;
+            } elseif ($token instanceof HTMLPurifier_Token_End) {
+                $nesting--;
+            }
+
+            // handle node collection
+            if ($is_collecting) {
+                if ($is_child) {
+                    // okay, let's stash the tokens away
+                    // first token tells us the type of the collection
+                    switch ($collection[$tag_index]->name) {
+                        case 'tr':
+                        case 'tbody':
+                            $content[] = $collection;
+                            break;
+                        case 'caption':
+                            if ($caption !== false) break;
+                            $caption = $collection;
+                            break;
+                        case 'thead':
+                        case 'tfoot':
+                            // access the appropriate variable, $thead or $tfoot
+                            $var = $collection[$tag_index]->name;
+                            if ($$var === false) {
+                                $$var = $collection;
+                            } else {
+                                // transmutate the first and last entries into
+                                // tbody tags, and then put into content
+                                $collection[$tag_index]->name = 'tbody';
+                                $collection[count($collection)-1]->name = 'tbody';
+                                $content[] = $collection;
+                            }
+                            break;
+                        case 'colgroup':
+                            $cols[] = $collection;
+                            break;
+                    }
+                    $collection = array();
+                    $is_collecting = false;
+                    $tag_index = 0;
+                } else {
+                    // add the node to the collection
+                    $collection[] = $token;
+                }
+            }
+
+            // terminate
+            if ($token === false) break;
+
+            if ($is_child) {
+                // determine what we're dealing with
+                if ($token->name == 'col') {
+                    // the only empty tag in the posse, we can handle it
+                    // immediately
+                    $cols[] = array_merge($collection, array($token));
+                    $collection = array();
+                    $tag_index = 0;
+                    continue;
+                }
+                switch($token->name) {
+                    case 'caption':
+                    case 'colgroup':
+                    case 'thead':
+                    case 'tfoot':
+                    case 'tbody':
+                    case 'tr':
+                        $is_collecting = true;
+                        $collection[] = $token;
+                        break; // nothing follows the switch in the loop body
+                    default:
+                        if (!empty($token->is_whitespace)) {
+                            $collection[] = $token;
+                            $tag_index++;
+                        }
+                        break;
+                }
+            }
+        }
+
+        if (empty($content)) return false;
+
+        $ret = array();
+        if ($caption !== false) $ret = array_merge($ret, $caption);
+        foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
+        if ($thead !== false)   $ret = array_merge($ret, $thead);
+        if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
+        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
+        if (!empty($collection) && !$is_collecting) {
+            // grab the trailing space
+            $ret = array_merge($ret, $collection);
+        }
+
+        array_pop($tokens_of_children); // remove phantom token
+
+        return ($ret === $tokens_of_children) ? true : $ret;
+    }
+}
+
+// vim: et sw=4 sts=4
author	Mike Macgirvin <mike@macgirvin.com>	2010-09-08 20:14:17 -0700
committer	Mike Macgirvin <mike@macgirvin.com>	2010-09-08 20:14:17 -0700
commit	ffb1997902facb36b78a7cfa522f41f2b3d71cda (patch)
tree	e9fe47acf26c5fd2c742677f2610b60d3008eb26 /library/HTMLPurifier/ChildDef
parent	b49858b038a0a05bbe7685929e88071d0e125538 (diff)
download	volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.tar.gz volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.tar.bz2 volse-hubzilla-ffb1997902facb36b78a7cfa522f41f2b3d71cda.zip