aboutsummaryrefslogtreecommitdiffstats
path: root/library/markdownify/parsehtml/parsehtml.php
diff options
context:
space:
mode:
Diffstat (limited to 'library/markdownify/parsehtml/parsehtml.php')
-rw-r--r--library/markdownify/parsehtml/parsehtml.php618
1 files changed, 0 insertions, 618 deletions
diff --git a/library/markdownify/parsehtml/parsehtml.php b/library/markdownify/parsehtml/parsehtml.php
deleted file mode 100644
index 1a8ecacda..000000000
--- a/library/markdownify/parsehtml/parsehtml.php
+++ /dev/null
@@ -1,618 +0,0 @@
-<?php
-/**
- * parseHTML is a HTML parser which works with PHP 4 and above.
- * It tries to handle invalid HTML to some degree.
- *
- * @version 1.0 beta
- * @author Milian Wolff (mail@milianw.de, http://milianw.de)
- * @license LGPL, see LICENSE_LGPL.txt and the summary below
- * @copyright (C) 2007 Milian Wolff
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-class parseHTML {
- /**
- * tags which are always empty (<br /> etc.)
- *
- * @var array<string>
- */
- var $emptyTags = array(
- 'br',
- 'hr',
- 'input',
- 'img',
- 'area',
- 'link',
- 'meta',
- 'param',
- );
- /**
- * tags with preformatted text
- * whitespaces wont be touched in them
- *
- * @var array<string>
- */
- var $preformattedTags = array(
- 'script',
- 'style',
- 'pre',
- 'code',
- );
- /**
- * supress HTML tags inside preformatted tags (see above)
- *
- * @var bool
- */
- var $noTagsInCode = false;
- /**
- * html to be parsed
- *
- * @var string
- */
- var $html = '';
- /**
- * node type:
- *
- * - tag (see isStartTag)
- * - text (includes cdata)
- * - comment
- * - doctype
- * - pi (processing instruction)
- *
- * @var string
- */
- var $nodeType = '';
- /**
- * current node content, i.e. either a
- * simple string (text node), or something like
- * <tag attrib="value"...>
- *
- * @var string
- */
- var $node = '';
- /**
- * wether current node is an opening tag (<a>) or not (</a>)
- * set to NULL if current node is not a tag
- * NOTE: empty tags (<br />) set this to true as well!
- *
- * @var bool | null
- */
- var $isStartTag = null;
- /**
- * wether current node is an empty tag (<br />) or not (<a></a>)
- *
- * @var bool | null
- */
- var $isEmptyTag = null;
- /**
- * tag name
- *
- * @var string | null
- */
- var $tagName = '';
- /**
- * attributes of current tag
- *
- * @var array (attribName=>value) | null
- */
- var $tagAttributes = null;
- /**
- * wether the current tag is a block element
- *
- * @var bool | null
- */
- var $isBlockElement = null;
-
- /**
- * keep whitespace
- *
- * @var int
- */
- var $keepWhitespace = 0;
- /**
- * list of open tags
- * count this to get current depth
- *
- * @var array
- */
- var $openTags = array();
- /**
- * list of block elements
- *
- * @var array
- * TODO: what shall we do with <del> and <ins> ?!
- */
- var $blockElements = array (
- # tag name => <bool> is block
- # block elements
- 'address' => true,
- 'blockquote' => true,
- 'center' => true,
- 'del' => true,
- 'dir' => true,
- 'div' => true,
- 'dl' => true,
- 'fieldset' => true,
- 'form' => true,
- 'h1' => true,
- 'h2' => true,
- 'h3' => true,
- 'h4' => true,
- 'h5' => true,
- 'h6' => true,
- 'hr' => true,
- 'ins' => true,
- 'isindex' => true,
- 'menu' => true,
- 'noframes' => true,
- 'noscript' => true,
- 'ol' => true,
- 'p' => true,
- 'pre' => true,
- 'table' => true,
- 'ul' => true,
- # set table elements and list items to block as well
- 'thead' => true,
- 'tbody' => true,
- 'tfoot' => true,
- 'td' => true,
- 'tr' => true,
- 'th' => true,
- 'li' => true,
- 'dd' => true,
- 'dt' => true,
- # header items and html / body as well
- 'html' => true,
- 'body' => true,
- 'head' => true,
- 'meta' => true,
- 'link' => true,
- 'style' => true,
- 'title' => true,
- # unfancy media tags, when indented should be rendered as block
- 'map' => true,
- 'object' => true,
- 'param' => true,
- 'embed' => true,
- 'area' => true,
- # inline elements
- 'a' => false,
- 'abbr' => false,
- 'acronym' => false,
- 'applet' => false,
- 'b' => false,
- 'basefont' => false,
- 'bdo' => false,
- 'big' => false,
- 'br' => false,
- 'button' => false,
- 'cite' => false,
- 'code' => false,
- 'del' => false,
- 'dfn' => false,
- 'em' => false,
- 'font' => false,
- 'i' => false,
- 'img' => false,
- 'ins' => false,
- 'input' => false,
- 'iframe' => false,
- 'kbd' => false,
- 'label' => false,
- 'q' => false,
- 'samp' => false,
- 'script' => false,
- 'select' => false,
- 'small' => false,
- 'span' => false,
- 'strong' => false,
- 'sub' => false,
- 'sup' => false,
- 'textarea' => false,
- 'tt' => false,
- 'var' => false,
- );
- /**
- * get next node, set $this->html prior!
- *
- * @param void
- * @return bool
- */
- function nextNode() {
- if (empty($this->html)) {
- # we are done with parsing the html string
- return false;
- }
- static $skipWhitespace = true;
- if ($this->isStartTag && !$this->isEmptyTag) {
- array_push($this->openTags, $this->tagName);
- if (in_array($this->tagName, $this->preformattedTags)) {
- # dont truncate whitespaces for <code> or <pre> contents
- $this->keepWhitespace++;
- }
- }
-
- if ($this->html[0] == '<') {
- $token = substr($this->html, 0, 9);
- if (substr($token, 0, 2) == '<?') {
- # xml prolog or other pi's
- /** TODO **/
- #trigger_error('this might need some work', E_USER_NOTICE);
- $pos = strpos($this->html, '>');
- $this->setNode('pi', $pos + 1);
- return true;
- }
- if (substr($token, 0, 4) == '<!--') {
- # comment
- $pos = strpos($this->html, '-->');
- if ($pos === false) {
- # could not find a closing -->, use next gt instead
- # this is firefox' behaviour
- $pos = strpos($this->html, '>') + 1;
- } else {
- $pos += 3;
- }
- $this->setNode('comment', $pos);
-
- $skipWhitespace = true;
- return true;
- }
- if ($token == '<!DOCTYPE') {
- # doctype
- $this->setNode('doctype', strpos($this->html, '>')+1);
-
- $skipWhitespace = true;
- return true;
- }
- if ($token == '<![CDATA[') {
- # cdata, use text node
-
- # remove leading <![CDATA[
- $this->html = substr($this->html, 9);
-
- $this->setNode('text', strpos($this->html, ']]>')+3);
-
- # remove trailing ]]> and trim
- $this->node = substr($this->node, 0, -3);
- $this->handleWhitespaces();
-
- $skipWhitespace = true;
- return true;
- }
- if ($this->parseTag()) {
- # seems to be a tag
- # handle whitespaces
- if ($this->isBlockElement) {
- $skipWhitespace = true;
- } else {
- $skipWhitespace = false;
- }
- return true;
- }
- }
- if ($this->keepWhitespace) {
- $skipWhitespace = false;
- }
- # when we get here it seems to be a text node
- $pos = strpos($this->html, '<');
- if ($pos === false) {
- $pos = strlen($this->html);
- }
- $this->setNode('text', $pos);
- $this->handleWhitespaces();
- if ($skipWhitespace && $this->node == ' ') {
- return $this->nextNode();
- }
- $skipWhitespace = false;
- return true;
- }
- /**
- * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
- *
- * @param void
- * @return bool
- */
- function parseTag() {
- static $a_ord, $z_ord, $special_ords;
- if (!isset($a_ord)) {
- $a_ord = ord('a');
- $z_ord = ord('z');
- $special_ords = array(
- ord(':'), // for xml:lang
- ord('-'), // for http-equiv
- );
- }
-
- $tagName = '';
-
- $pos = 1;
- $isStartTag = $this->html[$pos] != '/';
- if (!$isStartTag) {
- $pos++;
- }
- # get tagName
- while (isset($this->html[$pos])) {
- $pos_ord = ord(strtolower($this->html[$pos]));
- if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
- $tagName .= $this->html[$pos];
- $pos++;
- } else {
- $pos--;
- break;
- }
- }
-
- $tagName = strtolower($tagName);
- if (empty($tagName) || !isset($this->blockElements[$tagName])) {
- # something went wrong => invalid tag
- $this->invalidTag();
- return false;
- }
- if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
- # we supress all HTML tags inside code tags
- $this->invalidTag();
- return false;
- }
-
- # get tag attributes
- /** TODO: in html 4 attributes do not need to be quoted **/
- $isEmptyTag = false;
- $attributes = array();
- $currAttrib = '';
- while (isset($this->html[$pos+1])) {
- $pos++;
- # close tag
- if ($this->html[$pos] == '>' || $this->html[$pos].$this->html[$pos+1] == '/>') {
- if ($this->html[$pos] == '/') {
- $isEmptyTag = true;
- $pos++;
- }
- break;
- }
-
- $pos_ord = ord(strtolower($this->html[$pos]));
- if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
- # attribute name
- $currAttrib .= $this->html[$pos];
- } elseif (in_array($this->html[$pos], array(' ', "\t", "\n"))) {
- # drop whitespace
- } elseif (in_array($this->html[$pos].$this->html[$pos+1], array('="', "='"))) {
- # get attribute value
- $pos++;
- $await = $this->html[$pos]; # single or double quote
- $pos++;
- $value = '';
- while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
- $value .= $this->html[$pos];
- $pos++;
- }
- $attributes[$currAttrib] = $value;
- $currAttrib = '';
- } else {
- $this->invalidTag();
- return false;
- }
- }
- if ($this->html[$pos] != '>') {
- $this->invalidTag();
- return false;
- }
-
- if (!empty($currAttrib)) {
- # html 4 allows something like <option selected> instead of <option selected="selected">
- $attributes[$currAttrib] = $currAttrib;
- }
- if (!$isStartTag) {
- if (!empty($attributes) || $tagName != end($this->openTags)) {
- # end tags must not contain any attributes
- # or maybe we did not expect a different tag to be closed
- $this->invalidTag();
- return false;
- }
- array_pop($this->openTags);
- if (in_array($tagName, $this->preformattedTags)) {
- $this->keepWhitespace--;
- }
- }
- $pos++;
- $this->node = substr($this->html, 0, $pos);
- $this->html = substr($this->html, $pos);
- $this->tagName = $tagName;
- $this->tagAttributes = $attributes;
- $this->isStartTag = $isStartTag;
- $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags);
- if ($this->isEmptyTag) {
- # might be not well formed
- $this->node = preg_replace('# */? *>$#', ' />', $this->node);
- }
- $this->nodeType = 'tag';
- $this->isBlockElement = $this->blockElements[$tagName];
- return true;
- }
- /**
- * handle invalid tags
- *
- * @param void
- * @return void
- */
- function invalidTag() {
- $this->html = substr_replace($this->html, '&lt;', 0, 1);
- }
- /**
- * update all vars and make $this->html shorter
- *
- * @param string $type see description for $this->nodeType
- * @param int $pos to which position shall we cut?
- * @return void
- */
- function setNode($type, $pos) {
- if ($this->nodeType == 'tag') {
- # set tag specific vars to null
- # $type == tag should not be called here
- # see this::parseTag() for more
- $this->tagName = null;
- $this->tagAttributes = null;
- $this->isStartTag = null;
- $this->isEmptyTag = null;
- $this->isBlockElement = null;
-
- }
- $this->nodeType = $type;
- $this->node = substr($this->html, 0, $pos);
- $this->html = substr($this->html, $pos);
- }
- /**
- * check if $this->html begins with $str
- *
- * @param string $str
- * @return bool
- */
- function match($str) {
- return substr($this->html, 0, strlen($str)) == $str;
- }
- /**
- * truncate whitespaces
- *
- * @param void
- * @return void
- */
- function handleWhitespaces() {
- if ($this->keepWhitespace) {
- # <pre> or <code> before...
- return;
- }
- # truncate multiple whitespaces to a single one
- $this->node = preg_replace('#\s+#s', ' ', $this->node);
- }
- /**
- * normalize self::node
- *
- * @param void
- * @return void
- */
- function normalizeNode() {
- $this->node = '<';
- if (!$this->isStartTag) {
- $this->node .= '/'.$this->tagName.'>';
- return;
- }
- $this->node .= $this->tagName;
- foreach ($this->tagAttributes as $name => $value) {
- $this->node .= ' '.$name.'="'.str_replace('"', '&quot;', $value).'"';
- }
- if ($this->isEmptyTag) {
- $this->node .= ' /';
- }
- $this->node .= '>';
- }
-}
-
-/**
- * indent a HTML string properly
- *
- * @param string $html
- * @param string $indent optional
- * @return string
- */
-function indentHTML($html, $indent = " ", $noTagsInCode = false) {
- $parser = new parseHTML;
- $parser->noTagsInCode = $noTagsInCode;
- $parser->html = $html;
- $html = '';
- $last = true; # last tag was block elem
- $indent_a = array();
- while($parser->nextNode()) {
- if ($parser->nodeType == 'tag') {
- $parser->normalizeNode();
- }
- if ($parser->nodeType == 'tag' && $parser->isBlockElement) {
- $isPreOrCode = in_array($parser->tagName, array('code', 'pre'));
- if (!$parser->keepWhitespace && !$last && !$isPreOrCode) {
- $html = rtrim($html)."\n";
- }
- if ($parser->isStartTag) {
- $html .= implode($indent_a);
- if (!$parser->isEmptyTag) {
- array_push($indent_a, $indent);
- }
- } else {
- array_pop($indent_a);
- if (!$isPreOrCode) {
- $html .= implode($indent_a);
- }
- }
- $html .= $parser->node;
- if (!$parser->keepWhitespace && !($isPreOrCode && $parser->isStartTag)) {
- $html .= "\n";
- }
- $last = true;
- } else {
- if ($parser->nodeType == 'tag' && $parser->tagName == 'br') {
- $html .= $parser->node."\n";
- $last = true;
- continue;
- } elseif ($last && !$parser->keepWhitespace) {
- $html .= implode($indent_a);
- $parser->node = ltrim($parser->node);
- }
- $html .= $parser->node;
-
- if (in_array($parser->nodeType, array('comment', 'pi', 'doctype'))) {
- $html .= "\n";
- } else {
- $last = false;
- }
- }
- }
- return $html;
-}
-/*
-# testcase / example
-error_reporting(E_ALL);
-
-$html = '<p>Simple block on one line:</p>
-
-<div>foo</div>
-
-<p>And nested without indentation:</p>
-
-<div>
-<div>
-<div>
-foo
-</div>
-<div style=">"/>
-</div>
-<div>bar</div>
-</div>
-
-<p>And with attributes:</p>
-
-<div>
- <div id="foo">
- </div>
-</div>
-
-<p>This was broken in 1.0.2b7:</p>
-
-<div class="inlinepage">
-<div class="toggleableend">
-foo
-</div>
-</div>';
-#$html = '<a href="asdfasdf" title=\'asdf\' foo="bar">asdf</a>';
-echo indentHTML($html);
-die();
-*/