From ffb1997902facb36b78a7cfa522f41f2b3d71cda Mon Sep 17 00:00:00 2001 From: Mike Macgirvin Date: Wed, 8 Sep 2010 20:14:17 -0700 Subject: mistpark 2.0 infrasturcture lands --- library/HTMLPurifier/EntityParser.php | 144 ++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 library/HTMLPurifier/EntityParser.php (limited to 'library/HTMLPurifier/EntityParser.php') diff --git a/library/HTMLPurifier/EntityParser.php b/library/HTMLPurifier/EntityParser.php new file mode 100644 index 000000000..8c384472d --- /dev/null +++ b/library/HTMLPurifier/EntityParser.php @@ -0,0 +1,144 @@ + '"', + 38 => '&', + 39 => "'", + 60 => '<', + 62 => '>' + ); + + /** + * Stripped entity names to decimal conversion table for special entities. + */ + protected $_special_ent2dec = + array( + 'quot' => 34, + 'amp' => 38, + 'lt' => 60, + 'gt' => 62 + ); + + /** + * Substitutes non-special entities with their parsed equivalents. Since + * running this whenever you have parsed character is t3h 5uck, we run + * it before everything else. + * + * @param $string String to have non-special entities parsed. + * @returns Parsed string. + */ + public function substituteNonSpecialEntities($string) { + // it will try to detect missing semicolons, but don't rely on it + return preg_replace_callback( + $this->_substituteEntitiesRegex, + array($this, 'nonSpecialEntityCallback'), + $string + ); + } + + /** + * Callback function for substituteNonSpecialEntities() that does the work. + * + * @param $matches PCRE matches array, with 0 the entire match, and + * either index 1, 2 or 3 set with a hex value, dec value, + * or string (respectively). + * @returns Replacement string. + */ + + protected function nonSpecialEntityCallback($matches) { + // replaces all but big five + $entity = $matches[0]; + $is_num = (@$matches[0][1] === '#'); + if ($is_num) { + $is_hex = (@$entity[2] === 'x'); + $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; + + // abort for special characters + if (isset($this->_special_dec2str[$code])) return $entity; + + return HTMLPurifier_Encoder::unichr($code); + } else { + if (isset($this->_special_ent2dec[$matches[3]])) return $entity; + if (!$this->_entity_lookup) { + $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); + } + if (isset($this->_entity_lookup->table[$matches[3]])) { + return $this->_entity_lookup->table[$matches[3]]; + } else { + return $entity; + } + } + } + + /** + * Substitutes only special entities with their parsed equivalents. + * + * @notice We try to avoid calling this function because otherwise, it + * would have to be called a lot (for every parsed section). + * + * @param $string String to have non-special entities parsed. + * @returns Parsed string. + */ + public function substituteSpecialEntities($string) { + return preg_replace_callback( + $this->_substituteEntitiesRegex, + array($this, 'specialEntityCallback'), + $string); + } + + /** + * Callback function for substituteSpecialEntities() that does the work. + * + * This callback has same syntax as nonSpecialEntityCallback(). + * + * @param $matches PCRE-style matches array, with 0 the entire match, and + * either index 1, 2 or 3 set with a hex value, dec value, + * or string (respectively). + * @returns Replacement string. + */ + protected function specialEntityCallback($matches) { + $entity = $matches[0]; + $is_num = (@$matches[0][1] === '#'); + if ($is_num) { + $is_hex = (@$entity[2] === 'x'); + $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; + return isset($this->_special_dec2str[$int]) ? + $this->_special_dec2str[$int] : + $entity; + } else { + return isset($this->_special_ent2dec[$matches[3]]) ? + $this->_special_ent2dec[$matches[3]] : + $entity; + } + } + +} + +// vim: et sw=4 sts=4 -- cgit v1.2.3