Merge branch 'master' of github.com:annando/friendica

Conflicts: include/oembed.php
author: Michael <icarus@dabo.de> 2012-06-18 08:51:12 +0200
committer: Michael <icarus@dabo.de> 2012-06-18 08:51:12 +0200
commit: 6eaca19f4195a5f8f2c1b6b1e51c6e52bad67d96 (patch)
tree: 2dd0bdf8d76dac54b69c2abdf7be661146fd630b /include/markdownify/parsehtml/parsehtml.php
parent: d53414a333964026c26c3acee0d953e742a9a633 (diff)
parent: cc56f33b80bdd871b8a1245e3c4bd3a31ed49330 (diff)
download: volse-hubzilla-6eaca19f4195a5f8f2c1b6b1e51c6e52bad67d96.tar.gz
volse-hubzilla-6eaca19f4195a5f8f2c1b6b1e51c6e52bad67d96.tar.bz2
volse-hubzilla-6eaca19f4195a5f8f2c1b6b1e51c6e52bad67d96.zip
1 files changed, 618 insertions, 0 deletions
diff --git a/include/markdownify/parsehtml/parsehtml.php b/include/markdownify/parsehtml/parsehtml.php
new file mode 100644
index 000000000..1a8ecacda
--- /dev/null
+++ b/include/markdownify/parsehtml/parsehtml.php
@@ -0,0 +1,618 @@
+<?php
+/**
+ * parseHTML is an HTML parser which works with PHP 4 and above.
+ * It tries to handle invalid HTML to some degree.
+ *
+ * @version 1.0 beta
+ * @author Milian Wolff (mail@milianw.de, http://milianw.de)
+ * @license LGPL, see LICENSE_LGPL.txt and the summary below
+ * @copyright (C) 2007  Milian Wolff
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+class parseHTML {
+  /**
+   * tags which are always empty (<br /> etc.)
+   *
+   * @var array<string>
+   */
+  var $emptyTags = array(
+    'br',
+    'hr',
+    'input',
+    'img',
+    'area',
+    'link',
+    'meta',
+    'param',
+  );
+  /**
+   * tags with preformatted text
+   * whitespace won't be touched inside them
+   *
+   * @var array<string>
+   */
+  var $preformattedTags = array(
+    'script',
+    'style',
+    'pre',
+    'code',
+  );
+  /**
+   * suppress HTML tags inside <code> elements (they are kept as escaped text)
+   *
+   * @var bool
+   */
+  var $noTagsInCode = false;
+  /**
+   * html to be parsed
+   *
+   * @var string
+   */
+  var $html = '';
+  /**
+   * node type:
+   *
+   * - tag (see isStartTag)
+   * - text (includes cdata)
+   * - comment
+   * - doctype
+   * - pi (processing instruction)
+   *
+   * @var string
+   */
+  var $nodeType = '';
+  /**
+   * current node content, i.e. either a
+   * simple string (text node), or something like
+   * <tag attrib="value"...>
+   *
+   * @var string
+   */
+  var $node = '';
+  /**
+   * whether current node is an opening tag (<a>) or not (</a>)
+   * set to NULL if current node is not a tag
+   * NOTE: empty tags (<br />) set this to true as well!
+   *
+   * @var bool | null
+   */
+  var $isStartTag = null;
+  /**
+   * whether current node is an empty tag (<br />) or not (<a></a>)
+   *
+   * @var bool | null
+   */
+  var $isEmptyTag = null;
+  /**
+   * tag name
+   *
+   * @var string | null
+   */
+  var $tagName = '';
+  /**
+   * attributes of current tag
+   *
+   * @var array (attribName=>value) | null
+   */
+  var $tagAttributes = null;
+  /**
+   * whether the current tag is a block element
+   *
+   * @var bool | null
+   */
+  var $isBlockElement = null;
+
+  /**
+   * keep whitespace
+   *
+   * @var int
+   */
+  var $keepWhitespace = 0;
+  /**
+   * list of open tags
+   * count this to get current depth
+   *
+   * @var array
+   */
+  var $openTags = array();
+  /**
+   * list of block elements
+   *
+   * @var array
+   * TODO: what shall we do with <del> and <ins> ?!
+   */
+  var $blockElements = array (
+    # tag name => <bool> is block
+    # block elements
+    'address' => true,
+    'blockquote' => true,
+    'center' => true,
+    'dir' => true,
+    'div' => true,
+    'dl' => true,
+    'fieldset' => true,
+    'form' => true,
+    'h1' => true,
+    'h2' => true,
+    'h3' => true,
+    'h4' => true,
+    'h5' => true,
+    'h6' => true,
+    'hr' => true,
+    'isindex' => true,
+    'menu' => true,
+    'noframes' => true,
+    'noscript' => true,
+    'ol' => true,
+    'p' => true,
+    'pre' => true,
+    'table' => true,
+    'ul' => true,
+    # set table elements and list items to block as well
+    'thead' => true,
+    'tbody' => true,
+    'tfoot' => true,
+    'td' => true,
+    'tr' => true,
+    'th' => true,
+    'li' => true,
+    'dd' => true,
+    'dt' => true,
+    # header items and html / body as well
+    'html' => true,
+    'body' => true,
+    'head' => true,
+    'meta' => true,
+    'link' => true,
+    'style' => true,
+    'title' => true,
+    # unfancy media tags, when indented should be rendered as block
+    'map' => true,
+    'object' => true,
+    'param' => true,
+    'embed' => true,
+    'area' => true,
+    # inline elements
+    # <del> and <ins> may be block or inline in HTML; they are handled as
+    # inline here and listed once only (PHP keeps the last value of a key)
+    'a' => false,
+    'abbr' => false,
+    'acronym' => false,
+    'applet' => false,
+    'b' => false,
+    'basefont' => false,
+    'bdo' => false,
+    'big' => false,
+    'br' => false,
+    'button' => false,
+    'cite' => false,
+    'code' => false,
+    'del' => false,
+    'dfn' => false,
+    'em' => false,
+    'font' => false,
+    'i' => false,
+    'img' => false,
+    'ins' => false,
+    'input' => false,
+    'iframe' => false,
+    'kbd' => false,
+    'label' => false,
+    'q' => false,
+    'samp' => false,
+    'script' => false,
+    'select' => false,
+    'small' => false,
+    'span' => false,
+    'strong' => false,
+    'sub' => false,
+    'sup' => false,
+    'textarea' => false,
+    'tt' => false,
+    'var' => false,
+  );
+  /**
+   * get next node, set $this->html prior!
+   *
+   * @param void
+   * @return bool
+   */
+  function nextNode() {
+    # Cuts the next node off the front of $this->html and exposes it through
+    # $this->nodeType / $this->node (tags also fill the tag* fields).
+    if (!is_string($this->html) || $this->html === '') {
+      # done with parsing; '' is tested because empty() is also true for "0"
+      return false;
+    }
+    # function-static: the flag is shared by all parseHTML instances
+    static $skipWhitespace = true;
+    if ($this->isStartTag && !$this->isEmptyTag) {
+      # the previous node opened an element, so it is on the stack from now on
+      array_push($this->openTags, $this->tagName);
+      if (in_array($this->tagName, $this->preformattedTags)) {
+        # dont truncate whitespaces for <code> or <pre> contents
+        $this->keepWhitespace++;
+      }
+    }
+
+    if ($this->html[0] == '<') {
+      $token = substr($this->html, 0, 9);
+      # fallback end for constructs whose terminator is missing
+      $length = strlen($this->html);
+      if (substr($token, 0, 2) == '<?') {
+        # xml prolog or other pi's, read up to the next gt
+        $pos = strpos($this->html, '>');
+        $this->setNode('pi', $pos === false ? $length : $pos + 1);
+        return true;
+      }
+      if (substr($token, 0, 4) == '<!--') {
+        # comment
+        $pos = strpos($this->html, '-->');
+        if ($pos !== false) {
+          $pos += 3;
+        } else {
+          # no closing -->: use the next gt instead (firefox' behaviour)
+          $pos = strpos($this->html, '>');
+          $pos = $pos === false ? $length : $pos + 1;
+        }
+        $this->setNode('comment', $pos);
+        $skipWhitespace = true;
+        return true;
+      }
+      if (strtoupper($token) == '<!DOCTYPE') {
+        # doctype, the keyword is matched case-insensitively (<!doctype html>)
+        $pos = strpos($this->html, '>');
+        $this->setNode('doctype', $pos === false ? $length : $pos + 1);
+        $skipWhitespace = true;
+        return true;
+      }
+      if ($token == '<![CDATA[') {
+        # cdata, use text node
+        # remove leading <![CDATA[
+        $this->html = substr($this->html, 9);
+        $pos = strpos($this->html, ']]>');
+        if ($pos === false) {
+          # unterminated section: all that is left is its content
+          $this->setNode('text', strlen($this->html));
+        } else {
+          $this->setNode('text', $pos);
+          # remove trailing ]]>
+          $this->html = substr($this->html, 3);
+        }
+        $this->handleWhitespaces();
+        $skipWhitespace = true;
+        return true;
+      }
+      if ($this->parseTag()) {
+        # seems to be a tag; whitespace after a block element gets skipped
+        $skipWhitespace = (bool) $this->isBlockElement;
+        return true;
+      }
+    }
+    if ($this->keepWhitespace) {
+      $skipWhitespace = false;
+    }
+    # when we get here it seems to be a text node
+    $pos = strpos($this->html, '<');
+    if ($pos === false) {
+      $pos = strlen($this->html);
+    }
+    $this->setNode('text', $pos);
+    $this->handleWhitespaces();
+    if ($skipWhitespace && $this->node == ' ') {
+      return $this->nextNode();
+    }
+    $skipWhitespace = false;
+    return true;
+  }
+  /**
+   * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
+   *
+   * @param void
+   * @return bool
+   */
+  function parseTag() {
+    # Reads the tag at the start of $this->html. On success it is cut off the
+    # input and the tag* fields are filled, otherwise the '<' gets escaped.
+    static $a_ord, $z_ord, $special_ords;
+    if (!isset($a_ord)) {
+      $a_ord = ord('a');
+      $z_ord = ord('z');
+      $special_ords = array(ord(':'), ord('-')); // for xml:lang and http-equiv
+    }
+    $tagName = '';
+    # a lone '<' at the end of the input has no second char to look at
+    $isStartTag = !isset($this->html[1]) || $this->html[1] != '/';
+    $pos = $isStartTag ? 1 : 2;
+    # get tagName
+    while (isset($this->html[$pos])) {
+      $pos_ord = ord(strtolower($this->html[$pos]));
+      if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
+        $tagName .= $this->html[$pos];
+        $pos++;
+      } else {
+        $pos--;
+        break;
+      }
+    }
+    $tagName = strtolower($tagName);
+    if (empty($tagName) || !isset($this->blockElements[$tagName])) {
+      # something went wrong => invalid tag
+      $this->invalidTag();
+      return false;
+    }
+    if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
+      # we suppress all HTML tags inside code tags
+      $this->invalidTag();
+      return false;
+    }
+    # get tag attributes
+    /** TODO: in html 4 attributes do not need to be quoted **/
+    $isEmptyTag = false;
+    $attributes = array();
+    $currAttrib = '';
+    $nameEnded = false; # true once whitespace followed $currAttrib
+    while (isset($this->html[$pos+1])) {
+      $pos++;
+      $char = $this->html[$pos];
+      $pair = $char.(isset($this->html[$pos+1]) ? $this->html[$pos+1] : '');
+      # close tag
+      if ($char == '>' || $pair == '/>') {
+        $isEmptyTag = $char == '/';
+        $pos += $isEmptyTag ? 1 : 0; # step over the slash onto the gt
+        break;
+      }
+      $pos_ord = ord(strtolower($char));
+      if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
+        # attribute name
+        if ($nameEnded) {
+          # a new name after whitespace (<option selected value="1">): keep the first as value-less attribute
+          $attributes[$currAttrib] = $currAttrib;
+          $currAttrib = '';
+          $nameEnded = false;
+        }
+        $currAttrib .= $char;
+      } elseif (in_array($char, array(' ', "\t", "\n", "\r"))) {
+        # drop whitespace (\r covers windows line breaks inside a tag)
+        $nameEnded = $currAttrib !== '';
+      } elseif (in_array($pair, array('="', "='"))) {
+        # get attribute value
+        $pos++;
+        $await = $this->html[$pos]; # single or double quote
+        $pos++;
+        $value = '';
+        while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
+          $value .= $this->html[$pos];
+          $pos++;
+        }
+        $attributes[$currAttrib] = $value;
+        $currAttrib = '';
+        $nameEnded = false;
+      } else {
+        $this->invalidTag();
+        return false;
+      }
+    }
+    if (!isset($this->html[$pos]) || $this->html[$pos] != '>') {
+      # the input ended before the tag was closed
+      $this->invalidTag();
+      return false;
+    }
+    if (!empty($currAttrib)) {
+      # html 4 allows something like <option selected> instead of <option selected="selected">
+      $attributes[$currAttrib] = $currAttrib;
+    }
+    if (!$isStartTag) {
+      if (!empty($attributes) || $tagName != end($this->openTags)) {
+        # end tags must not have attributes and must match the innermost open tag
+        $this->invalidTag();
+        return false;
+      }
+      array_pop($this->openTags);
+      if (in_array($tagName, $this->preformattedTags)) {
+        $this->keepWhitespace--;
+      }
+    }
+    $pos++;
+    $this->node = substr($this->html, 0, $pos);
+    $this->html = substr($this->html, $pos);
+    $this->tagName = $tagName;
+    $this->tagAttributes = $attributes;
+    $this->isStartTag = $isStartTag;
+    $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags);
+    if ($this->isEmptyTag) {
+      # might be not well formed
+      $this->node = preg_replace('# */? *>$#', ' />', $this->node);
+    }
+    $this->nodeType = 'tag';
+    $this->isBlockElement = $this->blockElements[$tagName];
+    return true;
+  }
+  /**
+   * handle invalid tags
+   *
+   * @param void
+   * @return void
+   */
+  function invalidTag() {
+    $this->html = '&lt;'.substr($this->html, 1); # escape the leading char so the rest is read as text
+  }
+  /**
+   * update all vars and make $this->html shorter
+   *
+   * @param string $type see description for $this->nodeType
+   * @param int $pos to which position shall we cut?
+   * @return void
+   */
+  function setNode($type, $pos) {
+    $source = $this->html;
+    if ($this->nodeType == 'tag') {
+      # leaving a tag node: its tag-only fields no longer apply
+      # (tag nodes themselves are set up in parseTag(), not here)
+      $this->isBlockElement = null;
+      $this->isEmptyTag = null;
+      $this->isStartTag = null;
+      $this->tagAttributes = null;
+      $this->tagName = null;
+    }
+    # split the input at $pos: the head is the node, the tail is still to parse
+    $this->node = substr($source, 0, $pos);
+    $this->html = substr($source, $pos);
+    $this->nodeType = $type;
+  }
+  /**
+   * check if $this->html begins with $str
+   *
+   * @param string $str
+   * @return bool
+   */
+  function match($str) {
+    return strncmp($this->html, $str, strlen($str)) === 0; # byte-wise; == would equate "1." and "01"
+  }
+  /**
+   * truncate whitespaces
+   *
+   * @param void
+   * @return void
+   */
+  function handleWhitespaces() {
+    # while $this->keepWhitespace is set (<pre>, <code>, ...) the text is
+    # left exactly as it was found
+    if (!$this->keepWhitespace) {
+      # elsewhere every run of whitespace collapses into a single blank
+      $this->node = preg_replace('#\s+#s', ' ', $this->node);
+    }
+  }
+  /**
+   * normalize self::node
+   *
+   * @param void
+   * @return void
+   */
+  function normalizeNode() {
+    # rebuilds $this->node from the parsed tag data, so the markup comes out
+    # in one canonical spelling whatever the source looked like
+    if (!$this->isStartTag) {
+      # closing tags carry neither attributes nor a trailing slash
+      $this->node = '</'.$this->tagName.'>';
+      return;
+    }
+    $markup = '<'.$this->tagName;
+    foreach ($this->tagAttributes as $attr => $raw) {
+      # values are always wrapped in ", so a literal " has to be escaped
+      $markup .= ' '.$attr.'="'.str_replace('"', '&quot;', $raw).'"';
+    }
+    $this->node = $markup.($this->isEmptyTag ? ' />' : '>');
+  }
+}
+
+/**
+ * indent a HTML string properly
+ *
+ * @param string $html
+ * @param string $indent optional
+ * @return string
+ */
+function indentHTML($html, $indent = "  ", $noTagsInCode = false) {
+  # $html is parsed node by node and written back with every block tag on
+  # a line of its own; $noTagsInCode is passed on to the parser unchanged
+  $parser = new parseHTML;
+  $parser->html = $html;
+  $parser->noTagsInCode = $noTagsInCode;
+  $out = '';
+  $depth = 0; # number of block elements currently open
+  $afterBlock = true; # the node written last was a block tag or a <br />
+  while ($parser->nextNode()) {
+    $isTag = $parser->nodeType == 'tag';
+    if ($isTag) {
+      $parser->normalizeNode();
+    }
+    if ($isTag && $parser->isBlockElement) {
+      $preLike = in_array($parser->tagName, array('code', 'pre'));
+      if (!$parser->keepWhitespace && !$afterBlock && !$preLike) {
+        $out = rtrim($out)."\n";
+      }
+      if ($parser->isStartTag) {
+        $out .= str_repeat($indent, $depth);
+        $depth += $parser->isEmptyTag ? 0 : 1;
+      } else {
+        $depth = max(0, $depth - 1);
+        $out .= $preLike ? '' : str_repeat($indent, $depth);
+      }
+      $out .= $parser->node;
+      if (!$parser->keepWhitespace && (!$preLike || !$parser->isStartTag)) {
+        $out .= "\n";
+      }
+      $afterBlock = true;
+      continue;
+    }
+    if ($isTag && $parser->tagName == 'br') {
+      $out .= $parser->node."\n";
+      $afterBlock = true;
+      continue;
+    }
+    if ($afterBlock && !$parser->keepWhitespace) {
+      # content right after a block tag or <br />: indent it, drop leading blanks
+      $out .= str_repeat($indent, $depth);
+      $parser->node = ltrim($parser->node);
+    }
+    $out .= $parser->node;
+    if (in_array($parser->nodeType, array('comment', 'pi', 'doctype'))) {
+      $out .= "\n";
+    } else {
+      $afterBlock = false;
+    }
+  }
+  return $out;
+}
+/*
+# testcase / example
+error_reporting(E_ALL);
+
+$html = '<p>Simple block on one line:</p>
+
+<div>foo</div>
+
+<p>And nested without indentation:</p>
+
+<div>
+<div>
+<div>
+foo
+</div>
+<div style=">"/>
+</div>
+<div>bar</div>
+</div>
+
+<p>And with attributes:</p>
+
+<div>
+    <div id="foo">
+    </div>
+</div>
+
+<p>This was broken in 1.0.2b7:</p>
+
+<div class="inlinepage">
+<div class="toggleableend">
+foo
+</div>
+</div>';
+#$html = '<a href="asdfasdf"       title=\'asdf\' foo="bar">asdf</a>';
+echo indentHTML($html);
+die();
+*/
author	Michael <icarus@dabo.de>	2012-06-18 08:51:12 +0200
committer	Michael <icarus@dabo.de>	2012-06-18 08:51:12 +0200
commit	6eaca19f4195a5f8f2c1b6b1e51c6e52bad67d96 (patch)
tree	2dd0bdf8d76dac54b69c2abdf7be661146fd630b /include/markdownify/parsehtml/parsehtml.php
parent	d53414a333964026c26c3acee0d953e742a9a633 (diff)
parent	cc56f33b80bdd871b8a1245e3c4bd3a31ed49330 (diff)
download	volse-hubzilla-6eaca19f4195a5f8f2c1b6b1e51c6e52bad67d96.tar.gz volse-hubzilla-6eaca19f4195a5f8f2c1b6b1e51c6e52bad67d96.tar.bz2 volse-hubzilla-6eaca19f4195a5f8f2c1b6b1e51c6e52bad67d96.zip