From 3968e77f9e15c149d8cafa1dac37411777bd58bd Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sun, 17 Jun 2012 19:49:05 +0200 Subject: New function to convert bbcode to markdown. --- include/markdownify/parsehtml/parsehtml.php | 618 ++++++++++++++++++++++++++++ 1 file changed, 618 insertions(+) create mode 100644 include/markdownify/parsehtml/parsehtml.php (limited to 'include/markdownify/parsehtml/parsehtml.php') diff --git a/include/markdownify/parsehtml/parsehtml.php b/include/markdownify/parsehtml/parsehtml.php new file mode 100644 index 000000000..1a8ecacda --- /dev/null +++ b/include/markdownify/parsehtml/parsehtml.php @@ -0,0 +1,618 @@ + etc.) + * + * @var array + */ + var $emptyTags = array( + 'br', + 'hr', + 'input', + 'img', + 'area', + 'link', + 'meta', + 'param', + ); + /** + * tags with preformatted text + * whitespaces wont be touched in them + * + * @var array + */ + var $preformattedTags = array( + 'script', + 'style', + 'pre', + 'code', + ); + /** + * supress HTML tags inside preformatted tags (see above) + * + * @var bool + */ + var $noTagsInCode = false; + /** + * html to be parsed + * + * @var string + */ + var $html = ''; + /** + * node type: + * + * - tag (see isStartTag) + * - text (includes cdata) + * - comment + * - doctype + * - pi (processing instruction) + * + * @var string + */ + var $nodeType = ''; + /** + * current node content, i.e. either a + * simple string (text node), or something like + * + * + * @var string + */ + var $node = ''; + /** + * wether current node is an opening tag () or not () + * set to NULL if current node is not a tag + * NOTE: empty tags (
) set this to true as well! + * + * @var bool | null + */ + var $isStartTag = null; + /** + * whether current node is an empty tag (
) or not () + * + * @var bool | null + */ + var $isEmptyTag = null; + /** + * tag name + * + * @var string | null + */ + var $tagName = ''; + /** + * attributes of current tag + * + * @var array (attribName=>value) | null + */ + var $tagAttributes = null; + /** + * wether the current tag is a block element + * + * @var bool | null + */ + var $isBlockElement = null; + + /** + * keep whitespace + * + * @var int + */ + var $keepWhitespace = 0; + /** + * list of open tags + * count this to get current depth + * + * @var array + */ + var $openTags = array(); + /** + * list of block elements + * + * @var array + * TODO: what shall we do with and ?! + */ + var $blockElements = array ( + # tag name => is block + # block elements + 'address' => true, + 'blockquote' => true, + 'center' => true, + 'del' => true, + 'dir' => true, + 'div' => true, + 'dl' => true, + 'fieldset' => true, + 'form' => true, + 'h1' => true, + 'h2' => true, + 'h3' => true, + 'h4' => true, + 'h5' => true, + 'h6' => true, + 'hr' => true, + 'ins' => true, + 'isindex' => true, + 'menu' => true, + 'noframes' => true, + 'noscript' => true, + 'ol' => true, + 'p' => true, + 'pre' => true, + 'table' => true, + 'ul' => true, + # set table elements and list items to block as well + 'thead' => true, + 'tbody' => true, + 'tfoot' => true, + 'td' => true, + 'tr' => true, + 'th' => true, + 'li' => true, + 'dd' => true, + 'dt' => true, + # header items and html / body as well + 'html' => true, + 'body' => true, + 'head' => true, + 'meta' => true, + 'link' => true, + 'style' => true, + 'title' => true, + # unfancy media tags, when indented should be rendered as block + 'map' => true, + 'object' => true, + 'param' => true, + 'embed' => true, + 'area' => true, + # inline elements + 'a' => false, + 'abbr' => false, + 'acronym' => false, + 'applet' => false, + 'b' => false, + 'basefont' => false, + 'bdo' => false, + 'big' => false, + 'br' => false, + 'button' => false, + 'cite' => false, + 'code' => false, + 'del' => 
false, + 'dfn' => false, + 'em' => false, + 'font' => false, + 'i' => false, + 'img' => false, + 'ins' => false, + 'input' => false, + 'iframe' => false, + 'kbd' => false, + 'label' => false, + 'q' => false, + 'samp' => false, + 'script' => false, + 'select' => false, + 'small' => false, + 'span' => false, + 'strong' => false, + 'sub' => false, + 'sup' => false, + 'textarea' => false, + 'tt' => false, + 'var' => false, + ); + /** + * Advance to the next node at the start of $this->html; set $this->html before the first call. + * + * If the node left by the previous call was a non-empty start tag, its name is first pushed onto $this->openTags, and $this->keepWhitespace is incremented when that tag is listed in $this->preformattedTags. + * Input starting with the < character is then tried, in order, as processing instruction, comment, doctype, a section terminated by ]]> (emitted as a text node) and finally as a tag via parseTag(); whatever remains is emitted as a text node reaching to the next < or to the end of the input. + * A text node equal to a single space after handleWhitespaces() is skipped by recursing while $skipWhitespace is set. The flag is set after comments, doctypes, ]]> sections and block-element tags; it is cleared after other tags, after any text node that is kept and, on the text path, whenever $this->keepWhitespace is non-zero. + * $skipWhitespace is a function-level static, so it keeps its value between calls. + * + * NOTE(review): in this copy the literals compared against $token in the first four branches, together with some of the code that followed them, are missing; the text looks as if it went through an HTML tag stripper. Restore those lines from the upstream file before relying on this excerpt. + * + * @param void + * @return bool false when $this->html is empty as judged by empty() (so the one-character string 0 also ends parsing), true once a node has been set + */ + function nextNode() { + if (empty($this->html)) { + # we are done with parsing the html string + return false; + } + static $skipWhitespace = true; + if ($this->isStartTag && !$this->isEmptyTag) { + array_push($this->openTags, $this->tagName); + if (in_array($this->tagName, $this->preformattedTags)) { + # dont truncate whitespaces for or
 contents
+        $this->keepWhitespace++;
+      }
+    }
+
+    if ($this->html[0] == '<') { # possible markup: pi, comment, doctype, ]]> section or tag
+      $token = substr($this->html, 0, 9); # first 9 bytes, compared against by the branches below
+      if (substr($token, 0, 2) == 'html, '>');
+        $this->setNode('pi', $pos + 1);
+        return true;
+      }
+      if (substr($token, 0, 4) == '');
+        if ($pos === false) {
+          # could not find a closing -->, use next gt instead
+          # this is firefox' behaviour
+          $pos = strpos($this->html, '>') + 1; # NOTE(review): a false result from strpos() is not checked here
+        } else {
+          $pos += 3; # presumably the length of the closing --> that was searched for - confirm against upstream
+        }
+        $this->setNode('comment', $pos);
+
+        $skipWhitespace = true;
+        return true;
+      }
+      if ($token == 'setNode('doctype', strpos($this->html, '>')+1);
+
+        $skipWhitespace = true;
+        return true;
+      }
+      if ($token == 'html = substr($this->html, 9);
+
+        $this->setNode('text', strpos($this->html, ']]>')+3); # length includes the ]]> terminator; NOTE(review): a missing ]]> is not handled
+
+        # strip the 3-byte ]]> from the end of the node text, then run the whitespace handling
+        $this->node = substr($this->node, 0, -3);
+        $this->handleWhitespaces();
+
+        $skipWhitespace = true;
+        return true;
+      }
+      if ($this->parseTag()) {
+        # parseTag() accepted the input as a tag
+        # a single-space text node right after a block element is skipped, after any other tag it is kept
+        if ($this->isBlockElement) {
+          $skipWhitespace = true;
+        } else {
+          $skipWhitespace = false;
+        }
+        return true;
+      }
+    }
+    if ($this->keepWhitespace) { # non-zero once a tag from $this->preformattedTags has been opened (see top of function)
+      $skipWhitespace = false;
+    }
+    # reached for plain text and whenever none of the markup branches above returned
+    $pos = strpos($this->html, '<');
+    if ($pos === false) {
+      $pos = strlen($this->html); # no further < in the input: the text runs to the end
+    }
+    $this->setNode('text', $pos);
+    $this->handleWhitespaces();
+    if ($skipWhitespace && $this->node == ' ') {
+      return $this->nextNode(); # drop the lone space and continue with the following node
+    }
+    $skipWhitespace = false;
+    return true;
+  }
+  /**
+   * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
+   *
+   * @param void
+   * @return bool
+   */
+  function parseTag() {
+    static $a_ord, $z_ord, $special_ords;
+    if (!isset($a_ord)) {
+      $a_ord = ord('a');
+      $z_ord = ord('z');
+      $special_ords = array(
+        ord(':'), // for xml:lang
+        ord('-'), // for http-equiv
+      );
+    }
+
+    $tagName = '';
+
+    $pos = 1;
+    $isStartTag = $this->html[$pos] != '/';
+    if (!$isStartTag) {
+      $pos++;
+    }
+    # get tagName
+    while (isset($this->html[$pos])) {
+      $pos_ord = ord(strtolower($this->html[$pos]));
+      if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
+        $tagName .= $this->html[$pos];
+        $pos++;
+      } else {
+        $pos--;
+        break;
+      }
+    }
+
+    $tagName = strtolower($tagName);
+    if (empty($tagName) || !isset($this->blockElements[$tagName])) {
+      # something went wrong => invalid tag
+      $this->invalidTag();
+      return false;
+    }
+    if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
+      # we supress all HTML tags inside code tags
+      $this->invalidTag();
+      return false;
+    }
+
+    # get tag attributes
+    /** TODO: in html 4 attributes do not need to be quoted **/
+    $isEmptyTag = false;
+    $attributes = array();
+    $currAttrib = '';
+    while (isset($this->html[$pos+1])) {
+      $pos++;
+      # close tag
+      if ($this->html[$pos] == '>' || $this->html[$pos].$this->html[$pos+1] == '/>') {
+        if ($this->html[$pos] == '/') {
+          $isEmptyTag = true;
+          $pos++;
+        }
+        break;
+      }
+
+      $pos_ord = ord(strtolower($this->html[$pos]));
+      if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
+        # attribute name
+        $currAttrib .= $this->html[$pos];
+      } elseif (in_array($this->html[$pos], array(' ', "\t", "\n"))) {
+        # drop whitespace
+      } elseif (in_array($this->html[$pos].$this->html[$pos+1], array('="', "='"))) {
+        # get attribute value
+        $pos++;
+        $await = $this->html[$pos]; # single or double quote
+        $pos++;
+        $value = '';
+        while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
+          $value .= $this->html[$pos];
+          $pos++;
+        }
+        $attributes[$currAttrib] = $value;
+        $currAttrib = '';
+      } else {
+        $this->invalidTag();
+        return false;
+      }
+    }
+    if ($this->html[$pos] != '>') {
+      $this->invalidTag();
+      return false;
+    }
+
+    if (!empty($currAttrib)) {
+      # html 4 allows something like