* @author Nick Cernis * * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub. * * @license http://www.opensource.org/licenses/mit-license.php MIT */ class HtmlConverter { /** * @var Environment */ protected $environment; /** * Constructor * * @param Environment|array $options Environment object or configuration options */ public function __construct($options = array()) { if ($options instanceof Environment) { $this->environment = $options; } elseif (is_array($options)) { $defaults = array( 'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2 'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML 'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output. 'bold_style' => '**', // Set to '__' if you prefer the underlined style 'italic_style' => '_', // Set to '*' if you prefer the asterisk style 'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: 'meta style script' 'hard_break' => false,// Set to true to turn
into `\n` instead of ` \n` ); $this->environment = Environment::createDefaultEnvironment($defaults); $this->environment->getConfig()->merge($options); } } /** * @return Environment */ public function getEnvironment() { return $this->environment; } /** * @return Configuration */ public function getConfig() { return $this->environment->getConfig(); } /** * Convert * * @see HtmlConverter::convert * * @param string $html * * @return string The Markdown version of the html */ public function __invoke($html) { return $this->convert($html); } /** * Convert * * Loads HTML and passes to getMarkdown() * * @param string $html * * @throws \InvalidArgumentException * * @return string The Markdown version of the html */ public function convert($html) { if (trim($html) === '') { return ''; } $document = $this->createDOMDocument($html); // Work on the entire DOM tree (including head and body) if (!($root = $document->getElementsByTagName('html')->item(0))) { throw new \InvalidArgumentException('Invalid HTML was provided'); } $rootElement = new Element($root); $this->convertChildren($rootElement); // Store the now-modified DOMDocument as a string $markdown = $document->saveHTML(); return $this->sanitize($markdown); } /** * @param string $html * * @return \DOMDocument */ private function createDOMDocument($html) { $document = new \DOMDocument(); if ($this->getConfig()->getOption('suppress_errors')) { // Suppress conversion errors (from http://bit.ly/pCCRSX) libxml_use_internal_errors(true); } // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt) $document->loadHTML('' . $html); $document->encoding = 'UTF-8'; if ($this->getConfig()->getOption('suppress_errors')) { libxml_clear_errors(); } return $document; } /** * Convert Children * * Recursive function to drill into the DOM and convert each node into Markdown from the inside out. * * Finds children of each node and convert those to #text nodes containing their Markdown equivalent, * starting with the innermost element and working up to the outermost element. * * @param ElementInterface $element */ private function convertChildren(ElementInterface $element) { // Don't convert HTML code inside and
 blocks to Markdown - that should stay as HTML
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
            return;
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // Create a DOM text node containing the Markdown equivalent of the original node

        // Replace the old $node e.g. '

Title

' with the new $markdown_node e.g. '### Title' $element->setFinalMarkdown($markdown); } /** * Convert to Markdown * * Converts an individual node into a #text node containing a string of its Markdown equivalent. * * Example: An

node with text content of 'Title' becomes a text node with content of '### Title' * * @param ElementInterface $element * * @return string The converted HTML as Markdown */ protected function convertToMarkdown(ElementInterface $element) { $tag = $element->getTagName(); // Strip nodes named in remove_nodes $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes')); if (in_array($tag, $tags_to_remove)) { return false; } $converter = $this->environment->getConverterByTag($tag); return $converter->convert($element); } /** * @param string $markdown * * @return string */ protected function sanitize($markdown) { $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8'); $markdown = preg_replace('/]+>/', '', $markdown); // Strip doctype declaration $markdown = trim($markdown); // Remove blank spaces at the beggining of the html /* * Removing unwanted tags. Tags should be added to the array in the order they are expected. * XML, html and body opening tags should be in that order. Same case with closing tags */ $unwanted = array('', '', '', '', '', '', '', ' '); foreach ($unwanted as $tag) { if (strpos($tag, '/') === false) { // Opening tags if (strpos($markdown, $tag) === 0) { $markdown = substr($markdown, strlen($tag)); } } else { // Closing tags if (strpos($markdown, $tag) === strlen($markdown) - strlen($tag)) { $markdown = substr($markdown, 0, -strlen($tag)); } } } return trim($markdown, "\n\r\0\x0B"); } }