Replace Mardownify library with html-to-markdown library.

author: Klaus Weidenbach <Klaus.Weidenbach@gmx.net> 2017-05-23 00:32:11 +0200
committer: Klaus Weidenbach <Klaus.Weidenbach@gmx.net> 2017-05-23 00:32:11 +0200
commit: 547df2219ab4b870256f2ed90e36b97d8bf200bf (patch)
tree: 3e990b35eb939911bb7949c2f5d633fa3d788faf /vendor/league/html-to-markdown/src/HtmlConverter.php
parent: 50e9d024581ddf57f37a6302bc089a88237657bb (diff)
download: volse-hubzilla-547df2219ab4b870256f2ed90e36b97d8bf200bf.tar.gz
volse-hubzilla-547df2219ab4b870256f2ed90e36b97d8bf200bf.tar.bz2
volse-hubzilla-547df2219ab4b870256f2ed90e36b97d8bf200bf.zip
1 files changed, 231 insertions, 0 deletions
diff --git a/vendor/league/html-to-markdown/src/HtmlConverter.php b/vendor/league/html-to-markdown/src/HtmlConverter.php
new file mode 100644
index 000000000..db3c29e1c
--- /dev/null
+++ b/vendor/league/html-to-markdown/src/HtmlConverter.php
@@ -0,0 +1,231 @@
+<?php
+
+namespace League\HTMLToMarkdown;
+
+/**
+ * Class HtmlConverter
+ *
+ * A helper class to convert HTML to Markdown.
+ *
+ * @author Colin O'Dell <colinodell@gmail.com>
+ * @author Nick Cernis <nick@cern.is>
+ *
+ * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
+ *
+ * @license http://www.opensource.org/licenses/mit-license.php MIT
+ */
+class HtmlConverter
+{
+    /**
+     * @var Environment
+     */
+    protected $environment;
+
+    /**
+     * Constructor
+     *
+     * @param Environment|array $options Environment object or configuration options
+     */
+    public function __construct($options = array())
+    {
+        if ($options instanceof Environment) {
+            $this->environment = $options;
+        } elseif (is_array($options)) {
+            $defaults = array(
+                'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
+                'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
+                'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
+                'bold_style' => '**', // Set to '__' if you prefer the underlined style
+                'italic_style' => '_', // Set to '*' if you prefer the asterisk style
+                'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
+                'hard_break' => false,// Set to true to turn <br> into `\n` instead of `  \n`
+            );
+
+            $this->environment = Environment::createDefaultEnvironment($defaults);
+
+            $this->environment->getConfig()->merge($options);
+        }
+    }
+
+    /**
+     * @return Environment
+     */
+    public function getEnvironment()
+    {
+        return $this->environment;
+    }
+
+    /**
+     * @return Configuration
+     */
+    public function getConfig()
+    {
+        return $this->environment->getConfig();
+    }
+
+    /**
+     * Convert
+     *
+     * @see HtmlConverter::convert
+     *
+     * @param string $html
+     *
+     * @return string The Markdown version of the html
+     */
+    public function __invoke($html)
+    {
+        return $this->convert($html);
+    }
+
+    /**
+     * Convert
+     *
+     * Loads HTML and passes to getMarkdown()
+     *
+     * @param string $html
+     *
+     * @throws \InvalidArgumentException
+     *
+     * @return string The Markdown version of the html
+     */
+    public function convert($html)
+    {
+        if (trim($html) === '') {
+            return '';
+        }
+
+        $document = $this->createDOMDocument($html);
+
+        // Work on the entire DOM tree (including head and body)
+        if (!($root = $document->getElementsByTagName('html')->item(0))) {
+            throw new \InvalidArgumentException('Invalid HTML was provided');
+        }
+
+        $rootElement = new Element($root);
+        $this->convertChildren($rootElement);
+
+        // Store the now-modified DOMDocument as a string
+        $markdown = $document->saveHTML();
+
+        return $this->sanitize($markdown);
+    }
+
+    /**
+     * @param string $html
+     *
+     * @return \DOMDocument
+     */
+    private function createDOMDocument($html)
+    {
+        $document = new \DOMDocument();
+
+        if ($this->getConfig()->getOption('suppress_errors')) {
+            // Suppress conversion errors (from http://bit.ly/pCCRSX)
+            libxml_use_internal_errors(true);
+        }
+
+        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
+        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
+        $document->encoding = 'UTF-8';
+
+        if ($this->getConfig()->getOption('suppress_errors')) {
+            libxml_clear_errors();
+        }
+
+        return $document;
+    }
+
+    /**
+     * Convert Children
+     *
+     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
+     *
+     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
+     * starting with the innermost element and working up to the outermost element.
+     *
+     * @param ElementInterface $element
+     */
+    private function convertChildren(ElementInterface $element)
+    {
+        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
+        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
+        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
+            return;
+        }
+
+        // If the node has children, convert those to Markdown first
+        if ($element->hasChildren()) {
+            foreach ($element->getChildren() as $child) {
+                $this->convertChildren($child);
+            }
+        }
+
+        // Now that child nodes have been converted, convert the original node
+        $markdown = $this->convertToMarkdown($element);
+
+        // Create a DOM text node containing the Markdown equivalent of the original node
+
+        // Replace the old $node e.g. '<h3>Title</h3>' with the new $markdown_node e.g. '### Title'
+        $element->setFinalMarkdown($markdown);
+    }
+
+    /**
+     * Convert to Markdown
+     *
+     * Converts an individual node into a #text node containing a string of its Markdown equivalent.
+     *
+     * Example: An <h3> node with text content of 'Title' becomes a text node with content of '### Title'
+     *
+     * @param ElementInterface $element
+     *
+     * @return string The converted HTML as Markdown
+     */
+    protected function convertToMarkdown(ElementInterface $element)
+    {
+        $tag = $element->getTagName();
+
+        // Strip nodes named in remove_nodes
+        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
+        if (in_array($tag, $tags_to_remove)) {
+            return false;
+        }
+
+        $converter = $this->environment->getConverterByTag($tag);
+
+        return $converter->convert($element);
+    }
+
+    /**
+     * @param string $markdown
+     *
+     * @return string
+     */
+    protected function sanitize($markdown)
+    {
+        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
+        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
+        $markdown = trim($markdown); // Remove blank spaces at the beggining of the html
+
+        /*
+         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
+         * XML, html and body opening tags should be in that order. Same case with closing tags
+         */
+        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');
+
+        foreach ($unwanted as $tag) {
+            if (strpos($tag, '/') === false) {
+                // Opening tags
+                if (strpos($markdown, $tag) === 0) {
+                    $markdown = substr($markdown, strlen($tag));
+                }
+            } else {
+                // Closing tags
+                if (strpos($markdown, $tag) === strlen($markdown) - strlen($tag)) {
+                    $markdown = substr($markdown, 0, -strlen($tag));
+                }
+            }
+        }
+
+        return trim($markdown, "\n\r\0\x0B");
+    }
+}
author	Klaus Weidenbach <Klaus.Weidenbach@gmx.net>	2017-05-23 00:32:11 +0200
committer	Klaus Weidenbach <Klaus.Weidenbach@gmx.net>	2017-05-23 00:32:11 +0200
commit	547df2219ab4b870256f2ed90e36b97d8bf200bf (patch)
tree	3e990b35eb939911bb7949c2f5d633fa3d788faf /vendor/league/html-to-markdown/src/HtmlConverter.php
parent	50e9d024581ddf57f37a6302bc089a88237657bb (diff)
download	volse-hubzilla-547df2219ab4b870256f2ed90e36b97d8bf200bf.tar.gz volse-hubzilla-547df2219ab4b870256f2ed90e36b97d8bf200bf.tar.bz2 volse-hubzilla-547df2219ab4b870256f2ed90e36b97d8bf200bf.zip