diff options
Diffstat (limited to 'library/markdownify/parsehtml')
-rw-r--r-- | library/markdownify/parsehtml/parsehtml.php | 618 |
1 files changed, 0 insertions, 618 deletions
<?php
# Source: diff --git a/library/markdownify/parsehtml/parsehtml.php b/library/markdownify/parsehtml/parsehtml.php
# (deleted file mode 100644, index 1a8ecacda..000000000, hunk @@ -1,618 +0,0 @@)
/**
 * parseHTML is a HTML parser which works with PHP 4 and above.
 * It tries to handle invalid HTML to some degree.
 *
 * @version 1.0 beta
 * @author Milian Wolff (mail@milianw.de, http://milianw.de)
 * @license LGPL, see LICENSE_LGPL.txt and the summary below
 * @copyright (C) 2007 Milian Wolff
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
class parseHTML {
    /**
     * tags which are always empty (<br /> etc.)
     *
     * @var array<string>
     */
    var $emptyTags = array(
        'br',
        'hr',
        'input',
        'img',
        'area',
        'link',
        'meta',
        'param',
    );
    /**
     * tags with preformatted text
     * whitespace won't be touched inside them
     *
     * @var array<string>
     */
    var $preformattedTags = array(
        'script',
        'style',
        'pre',
        'code',
    );
    /**
     * suppress HTML tags inside preformatted tags (see above)
     *
     * @var bool
     */
    var $noTagsInCode = false;
    /**
     * html to be parsed
     *
     * @var string
     */
    var $html = '';
    /**
     * node type:
     *
     * - tag (see isStartTag)
     * - text (includes cdata)
     * - comment
     * - doctype
     * - pi (processing instruction)
     *
     * @var string
     */
    var $nodeType = '';
    /**
     * current node content, i.e. either a
     * simple string (text node), or something like
     * <tag attrib="value"...>
     *
     * @var string
     */
    var $node = '';
    /**
     * whether current node is an opening tag (<a>) or not (</a>)
     * set to NULL if current node is not a tag
     * NOTE: empty tags (<br />) set this to true as well!
     *
     * @var bool | null
     */
    var $isStartTag = null;
    /**
     * whether current node is an empty tag (<br />) or not (<a></a>)
     *
     * @var bool | null
     */
    var $isEmptyTag = null;
    /**
     * tag name
     *
     * @var string | null
     */
    var $tagName = '';
    /**
     * attributes of current tag
     *
     * @var array (attribName=>value) | null
     */
    var $tagAttributes = null;
    /**
     * whether the current tag is a block element
     *
     * @var bool | null
     */
    var $isBlockElement = null;

    /**
     * keep whitespace
     * counts how many preformatted tags are currently open
     *
     * @var int
     */
    var $keepWhitespace = 0;
    /**
     * whether a text node that consists of a single space shall be skipped
     * (true after block elements, comments, doctypes and cdata sections)
     *
     * Kept per instance so that one parser can not leak its whitespace
     * state into another parser.
     *
     * @var bool
     */
    var $skipWhitespace = true;
    /**
     * list of open tags
     * count this to get current depth
     *
     * @var array
     */
    var $openTags = array();
    /**
     * list of block elements
     *
     * @var array
     * TODO: what shall we do with <del> and <ins> ?!
     *       They are currently treated as inline elements (see below).
     */
    var $blockElements = array (
        # tag name => <bool> is block
        # block elements
        'address' => true,
        'blockquote' => true,
        'center' => true,
        'dir' => true,
        'div' => true,
        'dl' => true,
        'fieldset' => true,
        'form' => true,
        'h1' => true,
        'h2' => true,
        'h3' => true,
        'h4' => true,
        'h5' => true,
        'h6' => true,
        'hr' => true,
        'isindex' => true,
        'menu' => true,
        'noframes' => true,
        'noscript' => true,
        'ol' => true,
        'p' => true,
        'pre' => true,
        'table' => true,
        'ul' => true,
        # set table elements and list items to block as well
        'thead' => true,
        'tbody' => true,
        'tfoot' => true,
        'td' => true,
        'tr' => true,
        'th' => true,
        'li' => true,
        'dd' => true,
        'dt' => true,
        # header items and html / body as well
        'html' => true,
        'body' => true,
        'head' => true,
        'meta' => true,
        'link' => true,
        'style' => true,
        'title' => true,
        # unfancy media tags, when indented should be rendered as block
        'map' => true,
        'object' => true,
        'param' => true,
        'embed' => true,
        'area' => true,
        # inline elements
        'a' => false,
        'abbr' => false,
        'acronym' => false,
        'applet' => false,
        'b' => false,
        'basefont' => false,
        'bdo' => false,
        'big' => false,
        'br' => false,
        'button' => false,
        'cite' => false,
        'code' => false,
        # 'del' and 'ins' used to be listed as block elements as well; the
        # later (inline) entry always won, so only that one is kept
        'del' => false,
        'dfn' => false,
        'em' => false,
        'font' => false,
        'i' => false,
        'img' => false,
        'ins' => false,
        'input' => false,
        'iframe' => false,
        'kbd' => false,
        'label' => false,
        'q' => false,
        'samp' => false,
        'script' => false,
        'select' => false,
        'small' => false,
        'span' => false,
        'strong' => false,
        'sub' => false,
        'sup' => false,
        'textarea' => false,
        'tt' => false,
        'var' => false,
    );
    /**
     * get next node, set $this->html prior!
     *
     * Consumes the next node from the beginning of $this->html and stores it
     * in $this->node / $this->nodeType (plus the tag specific members).
     *
     * @param void
     * @return bool false once the whole html string has been consumed
     */
    function nextNode() {
        # do not use empty() here, the remaining input may be the string "0"
        if (!is_string($this->html) || $this->html === '') {
            # we are done with parsing the html string
            return false;
        }
        if ($this->isStartTag && !$this->isEmptyTag) {
            # the previous node opened a tag, remember it
            array_push($this->openTags, $this->tagName);
            if (in_array($this->tagName, $this->preformattedTags, true)) {
                # dont truncate whitespaces for <code> or <pre> contents
                $this->keepWhitespace++;
            }
        }

        if ($this->html[0] == '<') {
            $token = substr($this->html, 0, 9);
            if (substr($token, 0, 2) == '<?') {
                # xml prolog or other pi's
                /** TODO **/
                #trigger_error('this might need some work', E_USER_NOTICE);
                $this->setNode('pi', $this->offsetAfter('>'));
                return true;
            }
            if (substr($token, 0, 4) == '<!--') {
                # comment
                $pos = strpos($this->html, '-->');
                if ($pos === false) {
                    # could not find a closing -->, use next gt instead
                    # this is firefox' behaviour
                    $pos = $this->offsetAfter('>');
                } else {
                    $pos += 3;
                }
                $this->setNode('comment', $pos);

                $this->skipWhitespace = true;
                return true;
            }
            # the doctype keyword is case insensitive in HTML
            if (strtoupper($token) == '<!DOCTYPE') {
                # doctype
                $this->setNode('doctype', $this->offsetAfter('>'));

                $this->skipWhitespace = true;
                return true;
            }
            if ($token == '<![CDATA[') {
                # cdata, use text node

                # remove leading <![CDATA[
                $this->html = (string) substr($this->html, 9);

                $end = strpos($this->html, ']]>');
                if ($end === false) {
                    # unterminated section: everything that is left is text
                    $this->setNode('text', strlen($this->html));
                } else {
                    $this->setNode('text', $end + 3);
                    # remove trailing ]]>
                    $this->node = (string) substr($this->node, 0, -3);
                }
                $this->handleWhitespaces();

                $this->skipWhitespace = true;
                return true;
            }
            if ($this->parseTag()) {
                # seems to be a tag
                # whitespace after block elements is insignificant
                $this->skipWhitespace = $this->isBlockElement ? true : false;
                return true;
            }
        }
        if ($this->keepWhitespace) {
            $this->skipWhitespace = false;
        }
        # when we get here it seems to be a text node
        $pos = strpos($this->html, '<');
        if ($pos === false) {
            $pos = strlen($this->html);
        }
        $this->setNode('text', $pos);
        $this->handleWhitespaces();
        if ($this->skipWhitespace && $this->node == ' ') {
            # insignificant whitespace, silently go on with the next node
            return $this->nextNode();
        }
        $this->skipWhitespace = false;
        return true;
    }
    /**
     * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
     *
     * On failure the leading "<" of $this->html is escaped (see invalidTag)
     * so that the caller can go on and handle the input as a text node.
     *
     * @param void
     * @return bool true if $this->html started with a valid, known tag
     */
    function parseTag() {
        static $a_ord, $z_ord, $special_ords;
        if (!isset($a_ord)) {
            $a_ord = ord('a');
            $z_ord = ord('z');
            $special_ords = array(
                ord(':'), // for xml:lang
                ord('-'), // for http-equiv
            );
        }

        $tagName = '';

        $pos = 1;
        if (!isset($this->html[$pos])) {
            # a lonely "<" at the very end of the input
            $this->invalidTag();
            return false;
        }
        $isStartTag = $this->html[$pos] != '/';
        if (!$isStartTag) {
            $pos++;
        }
        # get tagName
        while (isset($this->html[$pos])) {
            $pos_ord = ord(strtolower($this->html[$pos]));
            if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || ($tagName !== '' && is_numeric($this->html[$pos]))) {
                $tagName .= $this->html[$pos];
                $pos++;
            } else {
                # step back onto the last char of the tag name
                $pos--;
                break;
            }
        }

        $tagName = strtolower($tagName);
        if ($tagName === '' || !isset($this->blockElements[$tagName])) {
            # something went wrong => invalid tag
            $this->invalidTag();
            return false;
        }
        if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
            # we suppress all HTML tags inside code tags
            $this->invalidTag();
            return false;
        }

        # get tag attributes
        /** TODO: in html 4 attributes do not need to be quoted **/
        $isEmptyTag = false;
        $attributes = array();
        $currAttrib = '';
        while (isset($this->html[$pos+1])) {
            $pos++;
            $char = $this->html[$pos];
            # the input may end right after $char
            $next = isset($this->html[$pos+1]) ? $this->html[$pos+1] : '';
            # close tag
            if ($char == '>' || $char.$next == '/>') {
                if ($char == '/') {
                    $isEmptyTag = true;
                    $pos++;
                }
                break;
            }

            $pos_ord = ord(strtolower($char));
            if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords, true)) {
                # attribute name
                $currAttrib .= $char;
            } elseif (in_array($char, array(' ', "\t", "\n", "\r"), true)) {
                # drop whitespace
            } elseif (in_array($char.$next, array('="', "='"), true)) {
                # get attribute value
                $pos++;
                $await = $this->html[$pos]; # single or double quote
                $pos++;
                $value = '';
                while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
                    $value .= $this->html[$pos];
                    $pos++;
                }
                $attributes[$currAttrib] = $value;
                $currAttrib = '';
            } else {
                $this->invalidTag();
                return false;
            }
        }
        if (!isset($this->html[$pos]) || $this->html[$pos] != '>') {
            # the tag was never closed (truncated input, open quote, ...)
            $this->invalidTag();
            return false;
        }

        if ($currAttrib !== '') {
            # html 4 allows something like <option selected> instead of <option selected="selected">
            $attributes[$currAttrib] = $currAttrib;
        }
        if (!$isStartTag) {
            if (!empty($attributes) || $tagName != end($this->openTags)) {
                # end tags must not contain any attributes
                # or maybe we did not expect a different tag to be closed
                $this->invalidTag();
                return false;
            }
            array_pop($this->openTags);
            if (in_array($tagName, $this->preformattedTags, true)) {
                $this->keepWhitespace--;
            }
        }
        $pos++;
        $this->node = substr($this->html, 0, $pos);
        # substr() may return false instead of '' on old PHP versions
        $this->html = (string) substr($this->html, $pos);
        $this->tagName = $tagName;
        $this->tagAttributes = $attributes;
        $this->isStartTag = $isStartTag;
        $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags, true);
        if ($this->isEmptyTag) {
            # might be not well formed
            $this->node = preg_replace('# */? *>$#', ' />', $this->node);
        }
        $this->nodeType = 'tag';
        $this->isBlockElement = $this->blockElements[$tagName];
        return true;
    }
    /**
     * handle invalid tags
     *
     * Escapes the leading "<" of $this->html. This guarantees that the
     * following text node does not stop at offset 0 again, which would
     * otherwise produce empty text nodes forever.
     *
     * @param void
     * @return void
     */
    function invalidTag() {
        $this->html = substr_replace($this->html, '&lt;', 0, 1);
    }
    /**
     * update all vars and make $this->html shorter
     *
     * @param string $type see description for $this->nodeType
     * @param int $pos to which position shall we cut?
     * @return void
     */
    function setNode($type, $pos) {
        if ($this->nodeType == 'tag') {
            # set tag specific vars to null
            # $type == tag should not be called here
            # see this::parseTag() for more
            $this->tagName = null;
            $this->tagAttributes = null;
            $this->isStartTag = null;
            $this->isEmptyTag = null;
            $this->isBlockElement = null;
        }
        $this->nodeType = $type;
        # substr() may return false instead of '' on old PHP versions
        $this->node = (string) substr($this->html, 0, $pos);
        $this->html = (string) substr($this->html, $pos);
    }
    /**
     * get the offset right behind the first occurrence of $needle in
     * $this->html, or the length of $this->html if $needle is missing
     *
     * Used for unterminated nodes: they simply extend to the end of input.
     *
     * @param string $needle
     * @return int
     */
    function offsetAfter($needle) {
        $pos = strpos($this->html, $needle);
        if ($pos === false) {
            return strlen($this->html);
        }
        return $pos + strlen($needle);
    }
    /**
     * check if $this->html begins with $str
     *
     * @param string $str
     * @return bool
     */
    function match($str) {
        return substr($this->html, 0, strlen($str)) == $str;
    }
    /**
     * truncate whitespaces
     *
     * @param void
     * @return void
     */
    function handleWhitespaces() {
        if ($this->keepWhitespace) {
            # <pre> or <code> before...
            return;
        }
        # truncate multiple whitespaces to a single one
        $this->node = preg_replace('#\s+#s', ' ', $this->node);
    }
    /**
     * normalize self::node
     *
     * Rebuilds the markup of the current tag from tagName / tagAttributes:
     * lower case tag name, double quoted attribute values.
     *
     * @param void
     * @return void
     */
    function normalizeNode() {
        $this->node = '<';
        if (!$this->isStartTag) {
            $this->node .= '/'.$this->tagName.'>';
            return;
        }
        $this->node .= $this->tagName;
        foreach ($this->tagAttributes as $name => $value) {
            # values get double quoted, so double quotes inside must be escaped
            $this->node .= ' '.$name.'="'.str_replace('"', '&quot;', $value).'"';
        }
        if ($this->isEmptyTag) {
            $this->node .= ' /';
        }
        $this->node .= '>';
    }
}

/**
 * indent a HTML string properly
 *
 * @param string $html
 * @param string $indent optional, string used for one level of indentation
 * @param bool $noTagsInCode optional, see parseHTML::$noTagsInCode
 * @return string
 */
function indentHTML($html, $indent = " ", $noTagsInCode = false) {
    $parser = new parseHTML;
    $parser->noTagsInCode = $noTagsInCode;
    $parser->html = $html;
    $html = '';
    $last = true; # last tag was block elem
    $indent_a = array();
    while ($parser->nextNode()) {
        if ($parser->nodeType == 'tag') {
            $parser->normalizeNode();
        }
        if ($parser->nodeType == 'tag' && $parser->isBlockElement) {
            $isPreOrCode = in_array($parser->tagName, array('code', 'pre'), true);
            if (!$parser->keepWhitespace && !$last && !$isPreOrCode) {
                # block elements start on a line of their own
                $html = rtrim($html)."\n";
            }
            if ($parser->isStartTag) {
                $html .= implode('', $indent_a);
                if (!$parser->isEmptyTag) {
                    array_push($indent_a, $indent);
                }
            } else {
                array_pop($indent_a);
                if (!$isPreOrCode) {
                    $html .= implode('', $indent_a);
                }
            }
            $html .= $parser->node;
            if (!$parser->keepWhitespace && !($isPreOrCode && $parser->isStartTag)) {
                $html .= "\n";
            }
            $last = true;
        } else {
            if ($parser->nodeType == 'tag' && $parser->tagName == 'br') {
                $html .= $parser->node."\n";
                $last = true;
                continue;
            } elseif ($last && !$parser->keepWhitespace) {
                $html .= implode('', $indent_a);
                $parser->node = ltrim($parser->node);
            }
            $html .= $parser->node;

            if (in_array($parser->nodeType, array('comment', 'pi', 'doctype'), true)) {
                $html .= "\n";
            } else {
                $last = false;
            }
        }
    }
    return $html;
}
/*
# testcase / example
error_reporting(E_ALL);

$html = '<p>Simple block on one line:</p>

<div>foo</div>

<p>And nested without indentation:</p>

<div>
<div>
<div>
foo
</div>
<div style=">"/>
</div>
<div>bar</div>
</div>

<p>And with attributes:</p>

<div>
 <div id="foo">
 </div>
</div>

<p>This was broken in 1.0.2b7:</p>

<div class="inlinepage">
<div class="toggleableend">
foo
</div>
</div>';
#$html = '<a href="asdfasdf" title=\'asdf\' foo="bar">asdf</a>';
echo indentHTML($html);
die();
*/