diff options
author | Klaus Weidenbach <Klaus.Weidenbach@gmx.net> | 2017-03-02 23:25:04 +0100 |
---|---|---|
committer | Klaus Weidenbach <Klaus.Weidenbach@gmx.net> | 2017-03-05 01:14:15 +0100 |
commit | 6c79e0c077971029343b2dff30017571ea118438 (patch) | |
tree | 26809ee07eeee05240878bd08cfb4fdcf4bb450a /vendor/pixel418/markdownify/src/Parser.php | |
parent | 8e1716065ee01959fc799fa14ba627392a876afa (diff) | |
download | volse-hubzilla-6c79e0c077971029343b2dff30017571ea118438.tar.gz volse-hubzilla-6c79e0c077971029343b2dff30017571ea118438.tar.bz2 volse-hubzilla-6c79e0c077971029343b2dff30017571ea118438.zip |
:arrow_up: :hammer: Upgrade Markdownify library.
The current version 2.0.0 (alpha) throws deprecated warning with
PHP7.1 and PHPUnit.
Upgrade the HTML to Markdown converter for PHP to the current
Markdownify 2.2.1.
Used composer to manage this library.
Diffstat (limited to 'vendor/pixel418/markdownify/src/Parser.php')
-rw-r--r-- | vendor/pixel418/markdownify/src/Parser.php | 564 |
1 files changed, 564 insertions, 0 deletions
diff --git a/vendor/pixel418/markdownify/src/Parser.php b/vendor/pixel418/markdownify/src/Parser.php new file mode 100644 index 000000000..90fcdf9f8 --- /dev/null +++ b/vendor/pixel418/markdownify/src/Parser.php @@ -0,0 +1,564 @@ +<?php + +/* This file is part of the Markdownify project, which is under LGPL license */ + +namespace Markdownify; + +class Parser +{ + public static $skipWhitespace = true; + public static $a_ord; + public static $z_ord; + public static $special_ords; + + /** + * tags which are always empty (<br /> etc.) + * + * @var array<string> + */ + public $emptyTags = array( + 'br', + 'hr', + 'input', + 'img', + 'area', + 'link', + 'meta', + 'param', + ); + + /** + * tags with preformatted text + * whitespaces wont be touched in them + * + * @var array<string> + */ + public $preformattedTags = array( + 'script', + 'style', + 'pre', + 'code', + ); + + /** + * supress HTML tags inside preformatted tags (see above) + * + * @var bool + */ + public $noTagsInCode = false; + + /** + * html to be parsed + * + * @var string + */ + public $html = ''; + + /** + * node type: + * + * - tag (see isStartTag) + * - text (includes cdata) + * - comment + * - doctype + * - pi (processing instruction) + * + * @var string + */ + public $nodeType = ''; + + /** + * current node content, i.e. either a + * simple string (text node), or something like + * <tag attrib="value"...> + * + * @var string + */ + public $node = ''; + + /** + * wether current node is an opening tag (<a>) or not (</a>) + * set to NULL if current node is not a tag + * NOTE: empty tags (<br />) set this to true as well! + * + * @var bool | null + */ + public $isStartTag = null; + + /** + * wether current node is an empty tag (<br />) or not (<a></a>) + * + * @var bool | null + */ + public $isEmptyTag = null; + + /** + * tag name + * + * @var string | null + */ + public $tagName = ''; + + /** + * attributes of current tag + * + * @var array (attribName=>value) | null + */ + public $tagAttributes = null; + + /** + * whether or not the actual context is a inline context + * + * @var bool | null + */ + public $isInlineContext = null; + + /** + * whether the current tag is a block element + * + * @var bool | null + */ + public $isBlockElement = null; + + /** + * whether the previous tag (browser) is a block element + * + * @var bool | null + */ + public $isNextToInlineContext = null; + + /** + * keep whitespace + * + * @var int + */ + public $keepWhitespace = 0; + + /** + * list of open tags + * count this to get current depth + * + * @var array + */ + public $openTags = array(); + + /** + * list of block elements + * + * @var array + * TODO: what shall we do with <del> and <ins> ?! + */ + public $blockElements = array( + // tag name => <bool> is block + // block elements + 'address' => true, + 'blockquote' => true, + 'center' => true, + 'del' => true, + 'dir' => true, + 'div' => true, + 'dl' => true, + 'fieldset' => true, + 'form' => true, + 'h1' => true, + 'h2' => true, + 'h3' => true, + 'h4' => true, + 'h5' => true, + 'h6' => true, + 'hr' => true, + 'ins' => true, + 'isindex' => true, + 'menu' => true, + 'noframes' => true, + 'noscript' => true, + 'ol' => true, + 'p' => true, + 'pre' => true, + 'table' => true, + 'ul' => true, + // set table elements and list items to block as well + 'thead' => true, + 'tbody' => true, + 'tfoot' => true, + 'td' => true, + 'tr' => true, + 'th' => true, + 'li' => true, + 'dd' => true, + 'dt' => true, + // header items and html / body as well + 'html' => true, + 'body' => true, + 'head' => true, + 'meta' => true, + 'link' => true, + 'style' => true, + 'title' => true, + // unfancy media tags, when indented should be rendered as block + 'map' => true, + 'object' => true, + 'param' => true, + 'embed' => true, + 'area' => true, + // inline elements + 'a' => false, + 'abbr' => false, + 'acronym' => false, + 'applet' => false, + 'b' => false, + 'basefont' => false, + 'bdo' => false, + 'big' => false, + 'br' => false, + 'button' => false, + 'cite' => false, + 'code' => false, + 'del' => false, + 'dfn' => false, + 'em' => false, + 'font' => false, + 'i' => false, + 'img' => false, + 'ins' => false, + 'input' => false, + 'iframe' => false, + 'kbd' => false, + 'label' => false, + 'q' => false, + 'samp' => false, + 'script' => false, + 'select' => false, + 'small' => false, + 'span' => false, + 'strong' => false, + 'sub' => false, + 'sup' => false, + 'textarea' => false, + 'tt' => false, + 'var' => false, + ); + + /** + * get next node, set $this->html prior! + * + * @param void + * @return bool + */ + public function nextNode() + { + if (empty($this->html)) { + // we are done with parsing the html string + + return false; + } + + if ($this->isStartTag && !$this->isEmptyTag) { + array_push($this->openTags, $this->tagName); + if (in_array($this->tagName, $this->preformattedTags)) { + // dont truncate whitespaces for <code> or <pre> contents + $this->keepWhitespace++; + } + } + + if ($this->html[0] == '<') { + $token = substr($this->html, 0, 9); + if (substr($token, 0, 2) == '<?') { + // xml prolog or other pi's + /** TODO **/ + // trigger_error('this might need some work', E_USER_NOTICE); + $pos = strpos($this->html, '>'); + $this->setNode('pi', $pos + 1); + + return true; + } + if (substr($token, 0, 4) == '<!--') { + // comment + $pos = strpos($this->html, '-->'); + if ($pos === false) { + // could not find a closing -->, use next gt instead + // this is firefox' behaviour + $pos = strpos($this->html, '>') + 1; + } else { + $pos += 3; + } + $this->setNode('comment', $pos); + + static::$skipWhitespace = true; + + return true; + } + if ($token == '<!DOCTYPE') { + // doctype + $this->setNode('doctype', strpos($this->html, '>') + 1); + + static::$skipWhitespace = true; + + return true; + } + if ($token == '<![CDATA[') { + // cdata, use text node + + // remove leading <![CDATA[ + $this->html = substr($this->html, 9); + + $this->setNode('text', strpos($this->html, ']]>') + 3); + + // remove trailing ]]> and trim + $this->node = substr($this->node, 0, -3); + $this->handleWhitespaces(); + + static::$skipWhitespace = true; + + return true; + } + if ($this->parseTag()) { + // seems to be a tag + // handle whitespaces + if ($this->isBlockElement) { + static::$skipWhitespace = true; + } else { + static::$skipWhitespace = false; + } + + return true; + } + } + if ($this->keepWhitespace) { + static::$skipWhitespace = false; + } + // when we get here it seems to be a text node + $pos = strpos($this->html, '<'); + if ($pos === false) { + $pos = strlen($this->html); + } + $this->setNode('text', $pos); + $this->handleWhitespaces(); + if (static::$skipWhitespace && $this->node == ' ') { + return $this->nextNode(); + } + $this->isInlineContext = true; + static::$skipWhitespace = false; + + return true; + } + + /** + * parse tag, set tag name and attributes, see if it's a closing tag and so forth... + * + * @param void + * @return bool + */ + protected function parseTag() + { + if (!isset(static::$a_ord)) { + static::$a_ord = ord('a'); + static::$z_ord = ord('z'); + static::$special_ords = array( + ord(':'), // for xml:lang + ord('-'), // for http-equiv + ); + } + + $tagName = ''; + + $pos = 1; + $isStartTag = $this->html[$pos] != '/'; + if (!$isStartTag) { + $pos++; + } + // get tagName + while (isset($this->html[$pos])) { + $pos_ord = ord(strtolower($this->html[$pos])); + if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) { + $tagName .= $this->html[$pos]; + $pos++; + } else { + $pos--; + break; + } + } + + $tagName = strtolower($tagName); + if (empty($tagName) || !isset($this->blockElements[$tagName])) { + // something went wrong => invalid tag + $this->invalidTag(); + + return false; + } + if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) { + // we supress all HTML tags inside code tags + $this->invalidTag(); + + return false; + } + + // get tag attributes + /** TODO: in html 4 attributes do not need to be quoted **/ + $isEmptyTag = false; + $attributes = array(); + $currAttrib = ''; + while (isset($this->html[$pos + 1])) { + $pos++; + // close tag + if ($this->html[$pos] == '>' || $this->html[$pos] . $this->html[$pos + 1] == '/>') { + if ($this->html[$pos] == '/') { + $isEmptyTag = true; + $pos++; + } + break; + } + + $pos_ord = ord(strtolower($this->html[$pos])); + if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || in_array($pos_ord, static::$special_ords)) { + // attribute name + $currAttrib .= $this->html[$pos]; + } elseif (in_array($this->html[$pos], array(' ', "\t", "\n"))) { + // drop whitespace + } elseif (in_array($this->html[$pos] . $this->html[$pos + 1], array('="', "='"))) { + // get attribute value + $pos++; + $await = $this->html[$pos]; // single or double quote + $pos++; + $value = ''; + while (isset($this->html[$pos]) && $this->html[$pos] != $await) { + $value .= $this->html[$pos]; + $pos++; + } + $attributes[$currAttrib] = $value; + $currAttrib = ''; + } else { + $this->invalidTag(); + + return false; + } + } + if ($this->html[$pos] != '>') { + $this->invalidTag(); + + return false; + } + + if (!empty($currAttrib)) { + // html 4 allows something like <option selected> instead of <option selected="selected"> + $attributes[$currAttrib] = $currAttrib; + } + if (!$isStartTag) { + if (!empty($attributes) || $tagName != end($this->openTags)) { + // end tags must not contain any attributes + // or maybe we did not expect a different tag to be closed + $this->invalidTag(); + + return false; + } + array_pop($this->openTags); + if (in_array($tagName, $this->preformattedTags)) { + $this->keepWhitespace--; + } + } + $pos++; + $this->node = substr($this->html, 0, $pos); + $this->html = substr($this->html, $pos); + $this->tagName = $tagName; + $this->tagAttributes = $attributes; + $this->isStartTag = $isStartTag; + $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags); + if ($this->isEmptyTag) { + // might be not well formed + $this->node = preg_replace('# */? *>$#', ' />', $this->node); + } + $this->nodeType = 'tag'; + $this->isBlockElement = $this->blockElements[$tagName]; + $this->isNextToInlineContext = $isStartTag && $this->isInlineContext; + $this->isInlineContext = !$this->isBlockElement; + return true; + } + + /** + * handle invalid tags + * + * @param void + * @return void + */ + protected function invalidTag() + { + $this->html = substr_replace($this->html, '<', 0, 1); + } + + /** + * update all vars and make $this->html shorter + * + * @param string $type see description for $this->nodeType + * @param int $pos to which position shall we cut? + * @return void + */ + protected function setNode($type, $pos) + { + if ($this->nodeType == 'tag') { + // set tag specific vars to null + // $type == tag should not be called here + // see this::parseTag() for more + $this->tagName = null; + $this->tagAttributes = null; + $this->isStartTag = null; + $this->isEmptyTag = null; + $this->isBlockElement = null; + + } + $this->nodeType = $type; + $this->node = substr($this->html, 0, $pos); + $this->html = substr($this->html, $pos); + } + + /** + * check if $this->html begins with $str + * + * @param string $str + * @return bool + */ + protected function match($str) + { + return substr($this->html, 0, strlen($str)) == $str; + } + + /** + * truncate whitespaces + * + * @param void + * @return void + */ + protected function handleWhitespaces() + { + if ($this->keepWhitespace) { + // <pre> or <code> before... + + return; + } + // truncate multiple whitespaces to a single one + $this->node = preg_replace('#\s+#s', ' ', $this->node); + } + + /** + * normalize self::node + * + * @param void + * @return void + */ + protected function normalizeNode() + { + $this->node = '<'; + if (!$this->isStartTag) { + $this->node .= '/' . $this->tagName . '>'; + + return; + } + $this->node .= $this->tagName; + foreach ($this->tagAttributes as $name => $value) { + $this->node .= ' ' . $name . '="' . str_replace('"', '"', $value) . '"'; + } + if ($this->isEmptyTag) { + $this->node .= ' /'; + } + $this->node .= '>'; + } +} |