diff options
Diffstat (limited to 'include/markdownify/parsehtml/parsehtml.php')
-rw-r--r-- | include/markdownify/parsehtml/parsehtml.php | 618 |
1 files changed, 618 insertions, 0 deletions
<?php
/**
 * parseHTML is a HTML parser which works with PHP 4 and above.
 * It tries to handle invalid HTML to some degree.
 *
 * @version 1.0 beta
 * @author Milian Wolff (mail@milianw.de, http://milianw.de)
 * @license LGPL, see LICENSE_LGPL.txt and the summary below
 * @copyright (C) 2007 Milian Wolff
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
class parseHTML {
    /**
     * tags which are always empty (<br /> etc.)
     *
     * @var array<string>
     */
    var $emptyTags = array(
        'br',
        'hr',
        'input',
        'img',
        'area',
        'link',
        'meta',
        'param',
    );
    /**
     * tags with preformatted text
     * whitespace is not touched inside them
     *
     * @var array<string>
     */
    var $preformattedTags = array(
        'script',
        'style',
        'pre',
        'code',
    );
    /**
     * suppress HTML tags inside <code> (they are escaped and returned as text)
     *
     * @var bool
     */
    var $noTagsInCode = false;
    /**
     * html to be parsed; consumed from the front by every nextNode() call
     *
     * @var string
     */
    var $html = '';
    /**
     * node type:
     *
     * - tag (see isStartTag)
     * - text (includes cdata)
     * - comment
     * - doctype
     * - pi (processing instruction)
     *
     * @var string
     */
    var $nodeType = '';
    /**
     * current node content, i.e. either a
     * simple string (text node), or something like
     * <tag attrib="value"...>
     *
     * @var string
     */
    var $node = '';
    /**
     * whether current node is an opening tag (<a>) or not (</a>)
     * set to NULL if current node is not a tag
     * NOTE: empty tags (<br />) set this to true as well!
     *
     * @var bool | null
     */
    var $isStartTag = null;
    /**
     * whether current node is an empty tag (<br />) or not (<a></a>)
     *
     * @var bool | null
     */
    var $isEmptyTag = null;
    /**
     * tag name
     *
     * @var string | null
     */
    var $tagName = '';
    /**
     * attributes of current tag
     *
     * @var array (attribName=>value) | null
     */
    var $tagAttributes = null;
    /**
     * whether the current tag is a block element
     *
     * @var bool | null
     */
    var $isBlockElement = null;

    /**
     * keep whitespace; counts how many preformatted tags are currently open
     *
     * @var int
     */
    var $keepWhitespace = 0;
    /**
     * whether a text node consisting of a single space should be skipped.
     * Kept per instance so that one parser cannot leak its whitespace
     * state into another one.
     *
     * @var bool
     */
    var $skipWhitespace = true;
    /**
     * list of open tags
     * count this to get current depth
     *
     * @var array
     */
    var $openTags = array();
    /**
     * list of known elements; tags that are not listed here are treated as invalid
     *
     * @var array
     * TODO: what shall we do with <del> and <ins> ?!
     *       (they are currently treated as inline elements)
     */
    var $blockElements = array (
        # tag name => <bool> is block
        # block elements
        'address' => true,
        'blockquote' => true,
        'center' => true,
        'dir' => true,
        'div' => true,
        'dl' => true,
        'fieldset' => true,
        'form' => true,
        'h1' => true,
        'h2' => true,
        'h3' => true,
        'h4' => true,
        'h5' => true,
        'h6' => true,
        'hr' => true,
        'isindex' => true,
        'menu' => true,
        'noframes' => true,
        'noscript' => true,
        'ol' => true,
        'p' => true,
        'pre' => true,
        'table' => true,
        'ul' => true,
        # set table elements and list items to block as well
        'thead' => true,
        'tbody' => true,
        'tfoot' => true,
        'td' => true,
        'tr' => true,
        'th' => true,
        'li' => true,
        'dd' => true,
        'dt' => true,
        # header items and html / body as well
        'html' => true,
        'body' => true,
        'head' => true,
        'meta' => true,
        'link' => true,
        'style' => true,
        'title' => true,
        # unfancy media tags, when indented should be rendered as block
        'map' => true,
        'object' => true,
        'param' => true,
        'embed' => true,
        'area' => true,
        # inline elements
        'a' => false,
        'abbr' => false,
        'acronym' => false,
        'applet' => false,
        'b' => false,
        'basefont' => false,
        'bdo' => false,
        'big' => false,
        'br' => false,
        'button' => false,
        'cite' => false,
        'code' => false,
        'del' => false,
        'dfn' => false,
        'em' => false,
        'font' => false,
        'i' => false,
        'img' => false,
        'ins' => false,
        'input' => false,
        'iframe' => false,
        'kbd' => false,
        'label' => false,
        'q' => false,
        'samp' => false,
        'script' => false,
        'select' => false,
        'small' => false,
        'span' => false,
        'strong' => false,
        'sub' => false,
        'sup' => false,
        'textarea' => false,
        'tt' => false,
        'var' => false,
    );
    /**
     * get next node, set $this->html prior!
     *
     * Cuts the next node off the front of $this->html and updates
     * $this->node, $this->nodeType and the tag specific members.
     *
     * @param void
     * @return bool false once the whole html string has been consumed
     */
    function nextNode() {
        # do not use empty() here, it would treat the string "0" as "no input left";
        # the cast covers substr() returning false on older PHP versions
        if ((string)$this->html === '') {
            # we are done with parsing the html string
            return false;
        }
        if ($this->isStartTag && !$this->isEmptyTag) {
            # the previous node opened a tag, remember it
            array_push($this->openTags, $this->tagName);
            if (in_array($this->tagName, $this->preformattedTags)) {
                # dont truncate whitespaces for <code> or <pre> contents
                $this->keepWhitespace++;
            }
        }

        if ($this->html[0] == '<') {
            $token = substr($this->html, 0, 9);
            if (substr($token, 0, 2) == '<?') {
                # xml prolog or other pi's
                /** TODO **/
                #trigger_error('this might need some work', E_USER_NOTICE);
                $this->setNode('pi', $this->findNodeEnd('>'));
                return true;
            }
            if (substr($token, 0, 4) == '<!--') {
                # comment
                $pos = strpos($this->html, '-->');
                if ($pos === false) {
                    # could not find a closing -->, use next gt instead
                    # this is firefox' behaviour
                    $pos = $this->findNodeEnd('>');
                } else {
                    $pos += 3;
                }
                $this->setNode('comment', $pos);

                $this->skipWhitespace = true;
                return true;
            }
            if (strcasecmp($token, '<!DOCTYPE') == 0) {
                # doctype, the keyword is case-insensitive (<!doctype html>)
                $this->setNode('doctype', $this->findNodeEnd('>'));

                $this->skipWhitespace = true;
                return true;
            }
            if ($token == '<![CDATA[') {
                # cdata, use text node

                # remove leading <![CDATA[
                $this->html = (string)substr($this->html, 9);

                $end = strpos($this->html, ']]>');
                if ($end === false) {
                    # unterminated section: everything left is its content
                    $this->setNode('text', strlen($this->html));
                } else {
                    $this->setNode('text', $end + 3);
                    # remove trailing ]]>
                    $this->node = (string)substr($this->node, 0, -3);
                }
                $this->handleWhitespaces();

                $this->skipWhitespace = true;
                return true;
            }
            if ($this->parseTag()) {
                # seems to be a tag
                # handle whitespaces: a lone space after a block element is noise
                $this->skipWhitespace = $this->isBlockElement ? true : false;
                return true;
            }
            # parseTag() failed and escaped the leading '<', go on with a text node
        }
        if ($this->keepWhitespace) {
            $this->skipWhitespace = false;
        }
        # when we get here it seems to be a text node
        $pos = strpos($this->html, '<');
        if ($pos === false) {
            $pos = strlen($this->html);
        }
        $this->setNode('text', $pos);
        $this->handleWhitespaces();
        if ($this->skipWhitespace && $this->node == ' ') {
            return $this->nextNode();
        }
        $this->skipWhitespace = false;
        return true;
    }
    /**
     * find the offset right behind the first occurrence of $needle in $this->html
     *
     * @param string $needle
     * @return int length of $this->html if $needle does not occur at all
     */
    function findNodeEnd($needle) {
        $pos = strpos($this->html, $needle);
        if ($pos === false) {
            return strlen($this->html);
        }
        return $pos + strlen($needle);
    }
    /**
     * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
     *
     * Expects $this->html to start with '<'. On failure the leading '<' is
     * escaped via invalidTag() so the caller can treat the input as text.
     *
     * @param void
     * @return bool true if a valid tag was parsed and cut off $this->html
     */
    function parseTag() {
        static $a_ord, $z_ord, $special_ords, $whitespace;
        if (!isset($a_ord)) {
            $a_ord = ord('a');
            $z_ord = ord('z');
            $special_ords = array(
                ord(':'), // for xml:lang
                ord('-'), // for http-equiv
            );
            $whitespace = array(' ', "\t", "\n", "\r");
        }

        $tagName = '';

        $pos = 1;
        if (!isset($this->html[$pos])) {
            # a lonely '<' at the very end of the input
            $this->invalidTag();
            return false;
        }
        $isStartTag = $this->html[$pos] != '/';
        if (!$isStartTag) {
            $pos++;
        }
        # get tagName
        while (isset($this->html[$pos])) {
            $pos_ord = ord(strtolower($this->html[$pos]));
            if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || ($tagName !== '' && is_numeric($this->html[$pos]))) {
                $tagName .= $this->html[$pos];
                $pos++;
            } else {
                # step back onto the last char of the tag name
                $pos--;
                break;
            }
        }

        $tagName = strtolower($tagName);
        if ($tagName === '' || !isset($this->blockElements[$tagName])) {
            # something went wrong => invalid tag
            $this->invalidTag();
            return false;
        }
        if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
            # we suppress all HTML tags inside code tags
            $this->invalidTag();
            return false;
        }

        # get tag attributes
        /** TODO: in html 4 attributes do not need to be quoted **/
        $isEmptyTag = false;
        $attributes = array();
        $currAttrib = '';
        while (isset($this->html[$pos+1])) {
            $pos++;
            $char = $this->html[$pos];
            # close tag
            if ($char == '>' || $char.$this->html[$pos+1] == '/>') {
                if ($char == '/') {
                    $isEmptyTag = true;
                    $pos++;
                }
                break;
            }

            $pos_ord = ord(strtolower($char));
            if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
                # attribute name
                $currAttrib .= $char;
            } elseif (in_array($char, $whitespace, true)) {
                # drop whitespace, but do not glue two valueless attributes
                # together (<input checked disabled>)
                if ($currAttrib !== '') {
                    $peek = $pos + 1;
                    while (isset($this->html[$peek]) && in_array($this->html[$peek], $whitespace, true)) {
                        $peek++;
                    }
                    if (!isset($this->html[$peek]) || $this->html[$peek] != '=') {
                        $attributes[$currAttrib] = $currAttrib;
                        $currAttrib = '';
                    }
                }
            } elseif (in_array($char.$this->html[$pos+1], array('="', "='"))) {
                # get attribute value
                $pos++;
                $await = $this->html[$pos]; # single or double quote
                $pos++;
                $value = '';
                while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
                    $value .= $this->html[$pos];
                    $pos++;
                }
                $attributes[$currAttrib] = $value;
                $currAttrib = '';
            } else {
                $this->invalidTag();
                return false;
            }
        }
        if (!isset($this->html[$pos]) || $this->html[$pos] != '>') {
            # the input ended before the tag was closed
            $this->invalidTag();
            return false;
        }

        if ($currAttrib !== '') {
            # html 4 allows something like <option selected> instead of <option selected="selected">
            $attributes[$currAttrib] = $currAttrib;
        }
        if (!$isStartTag) {
            if (!empty($attributes) || $tagName != end($this->openTags)) {
                # end tags must not contain any attributes
                # or maybe we did not expect a different tag to be closed
                $this->invalidTag();
                return false;
            }
            array_pop($this->openTags);
            if (in_array($tagName, $this->preformattedTags)) {
                $this->keepWhitespace--;
            }
        }
        $pos++;
        $this->node = substr($this->html, 0, $pos);
        $this->html = (string)substr($this->html, $pos);
        $this->tagName = $tagName;
        $this->tagAttributes = $attributes;
        $this->isStartTag = $isStartTag;
        $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags);
        if ($this->isEmptyTag) {
            # might be not well formed
            $this->node = preg_replace('# */? *>$#', ' />', $this->node);
        }
        $this->nodeType = 'tag';
        $this->isBlockElement = $this->blockElements[$tagName];
        return true;
    }
    /**
     * handle invalid tags
     *
     * Escapes the leading '<' of $this->html so that the remainder is
     * consumed as text. The replacement must differ from '<', otherwise
     * nextNode() would return empty text nodes forever.
     *
     * @param void
     * @return void
     */
    function invalidTag() {
        $this->html = substr_replace($this->html, '&lt;', 0, 1);
    }
    /**
     * update all vars and make $this->html shorter
     *
     * @param string $type see description for $this->nodeType
     * @param int $pos to which position shall we cut?
     * @return void
     */
    function setNode($type, $pos) {
        if ($this->nodeType == 'tag') {
            # set tag specific vars to null
            # $type == tag should not be called here
            # see this::parseTag() for more
            $this->tagName = null;
            $this->tagAttributes = null;
            $this->isStartTag = null;
            $this->isEmptyTag = null;
            $this->isBlockElement = null;
        }
        $this->nodeType = $type;
        # the casts keep both members strings on PHP versions where substr() may return false
        $this->node = (string)substr($this->html, 0, $pos);
        $this->html = (string)substr($this->html, $pos);
    }
    /**
     * check if $this->html begins with $str
     *
     * @param string $str
     * @return bool
     */
    function match($str) {
        return substr($this->html, 0, strlen($str)) == $str;
    }
    /**
     * truncate whitespaces
     *
     * @param void
     * @return void
     */
    function handleWhitespaces() {
        if ($this->keepWhitespace) {
            # <pre> or <code> before...
            return;
        }
        # truncate multiple whitespaces to a single one
        $this->node = preg_replace('#\s+#s', ' ', $this->node);
    }
    /**
     * normalize self::node
     *
     * Rebuilds the markup of the current tag from tagName / tagAttributes
     * with double quoted attribute values. Only call this for tag nodes.
     *
     * @param void
     * @return void
     */
    function normalizeNode() {
        $this->node = '<';
        if (!$this->isStartTag) {
            $this->node .= '/'.$this->tagName.'>';
            return;
        }
        $this->node .= $this->tagName;
        foreach ($this->tagAttributes as $name => $value) {
            # values are wrapped in double quotes, so literal ones have to be escaped
            $this->node .= ' '.$name.'="'.str_replace('"', '&quot;', $value).'"';
        }
        if ($this->isEmptyTag) {
            $this->node .= ' /';
        }
        $this->node .= '>';
    }
}

/**
 * indent a HTML string properly
 *
 * Block elements are put on their own line and nested one $indent deeper
 * per open block element; the contents of preformatted tags are left alone.
 *
 * @param string $html
 * @param string $indent optional, string used for one indentation level
 * @param bool $noTagsInCode optional, see parseHTML::$noTagsInCode
 * @return string
 */
function indentHTML($html, $indent = " ", $noTagsInCode = false) {
    $parser = new parseHTML;
    $parser->noTagsInCode = $noTagsInCode;
    $parser->html = $html;
    $html = '';
    $last = true; # last tag was block elem
    $indent_a = array();
    while ($parser->nextNode()) {
        if ($parser->nodeType == 'tag') {
            $parser->normalizeNode();
        }
        if ($parser->nodeType == 'tag' && $parser->isBlockElement) {
            $isPreOrCode = in_array($parser->tagName, array('code', 'pre'));
            if (!$parser->keepWhitespace && !$last && !$isPreOrCode) {
                # finish the line of inline content in front of this block tag
                $html = rtrim($html)."\n";
            }
            if ($parser->isStartTag) {
                $html .= implode('', $indent_a);
                if (!$parser->isEmptyTag) {
                    array_push($indent_a, $indent);
                }
            } else {
                array_pop($indent_a);
                if (!$isPreOrCode) {
                    $html .= implode('', $indent_a);
                }
            }
            $html .= $parser->node;
            if (!$parser->keepWhitespace && !($isPreOrCode && $parser->isStartTag)) {
                $html .= "\n";
            }
            $last = true;
        } else {
            if ($parser->nodeType == 'tag' && $parser->tagName == 'br') {
                $html .= $parser->node."\n";
                $last = true;
                continue;
            } elseif ($last && !$parser->keepWhitespace) {
                # first inline node on a fresh line
                $html .= implode('', $indent_a);
                $parser->node = ltrim($parser->node);
            }
            $html .= $parser->node;

            if (in_array($parser->nodeType, array('comment', 'pi', 'doctype'))) {
                $html .= "\n";
            } else {
                $last = false;
            }
        }
    }
    return $html;
}
/*
# testcase / example
error_reporting(E_ALL);

$html = '<p>Simple block on one line:</p>

<div>foo</div>

<p>And nested without indentation:</p>

<div>
<div>
<div>
foo
</div>
<div style=">"/>
</div>
<div>bar</div>
</div>

<p>And with attributes:</p>

<div>
 <div id="foo">
 </div>
</div>

<p>This was broken in 1.0.2b7:</p>

<div class="inlinepage">
<div class="toggleableend">
foo
</div>
</div>';
#$html = '<a href="asdfasdf" title=\'asdf\' foo="bar">asdf</a>';
echo indentHTML($html);
die();
*/