aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/league/html-to-markdown/src/HtmlConverter.php
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/league/html-to-markdown/src/HtmlConverter.php')
-rw-r--r--vendor/league/html-to-markdown/src/HtmlConverter.php148
1 files changed, 86 insertions, 62 deletions
diff --git a/vendor/league/html-to-markdown/src/HtmlConverter.php b/vendor/league/html-to-markdown/src/HtmlConverter.php
index 6f98e97b4..7162b256d 100644
--- a/vendor/league/html-to-markdown/src/HtmlConverter.php
+++ b/vendor/league/html-to-markdown/src/HtmlConverter.php
@@ -1,10 +1,10 @@
<?php
+declare(strict_types=1);
+
namespace League\HTMLToMarkdown;
/**
- * Class HtmlConverter
- *
* A helper class to convert HTML to Markdown.
*
* @author Colin O'Dell <colinodell@gmail.com>
@@ -16,25 +16,24 @@ namespace League\HTMLToMarkdown;
*/
class HtmlConverter implements HtmlConverterInterface
{
- /**
- * @var Environment
- */
+ /** @var Environment */
protected $environment;
/**
* Constructor
*
- * @param Environment|array $options Environment object or configuration options
+ * @param Environment|array<string, mixed> $options Environment object or configuration options
*/
- public function __construct($options = array())
+ public function __construct($options = [])
{
if ($options instanceof Environment) {
$this->environment = $options;
- } elseif (is_array($options)) {
- $defaults = array(
+ } elseif (\is_array($options)) {
+ $defaults = [
'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
+ 'strip_placeholder_links' => false, // Set to true to remove <a> that doesn't have href.
'bold_style' => '**', // DEPRECATED: Set to '__' if you prefer the underlined style
'italic_style' => '*', // DEPRECATED: Set to '_' if you prefer the underlined style
'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
@@ -42,7 +41,9 @@ class HtmlConverter implements HtmlConverterInterface
'list_item_style' => '-', // Set the default character for each <li> in a <ul>. Can be '-', '*', or '+'
'preserve_comments' => false, // Set to true to preserve comments, or set to an array of strings to preserve specific comments
'use_autolinks' => true, // Set to true to use simple link syntax if possible. Will always use []() if set to false
- );
+ 'table_pipe_escape' => '\|', // Replacement string for pipe characters inside markdown table cells
+ 'table_caption_side' => 'top', // Set to 'top' or 'bottom' to show <caption> content before or after table, null to suppress
+ ];
$this->environment = Environment::createDefaultEnvironment($defaults);
@@ -50,18 +51,12 @@ class HtmlConverter implements HtmlConverterInterface
}
}
- /**
- * @return Environment
- */
- public function getEnvironment()
+ public function getEnvironment(): Environment
{
return $this->environment;
}
- /**
- * @return Configuration
- */
- public function getConfig()
+ public function getConfig(): Configuration
{
return $this->environment->getConfig();
}
@@ -71,11 +66,9 @@ class HtmlConverter implements HtmlConverterInterface
*
* @see HtmlConverter::convert
*
- * @param string $html
- *
* @return string The Markdown version of the html
*/
- public function __invoke($html)
+ public function __invoke(string $html): string
{
return $this->convert($html);
}
@@ -85,22 +78,20 @@ class HtmlConverter implements HtmlConverterInterface
*
* Loads HTML and passes to getMarkdown()
*
- * @param string $html
- *
- * @throws \InvalidArgumentException
- *
* @return string The Markdown version of the html
+ *
+ * @throws \InvalidArgumentException|\RuntimeException
*/
- public function convert($html)
+ public function convert(string $html): string
{
- if (trim($html) === '') {
+ if (\trim($html) === '') {
return '';
}
$document = $this->createDOMDocument($html);
// Work on the entire DOM tree (including head and body)
- if (!($root = $document->getElementsByTagName('html')->item(0))) {
+ if (! ($root = $document->getElementsByTagName('html')->item(0))) {
throw new \InvalidArgumentException('Invalid HTML was provided');
}
@@ -110,52 +101,87 @@ class HtmlConverter implements HtmlConverterInterface
// Store the now-modified DOMDocument as a string
$markdown = $document->saveHTML();
+ if ($markdown === false) {
+ throw new \RuntimeException('Unknown error occurred during HTML to Markdown conversion');
+ }
+
return $this->sanitize($markdown);
}
- /**
- * @param string $html
- *
- * @return \DOMDocument
- */
- private function createDOMDocument($html)
+ private function createDOMDocument(string $html): \DOMDocument
{
$document = new \DOMDocument();
if ($this->getConfig()->getOption('suppress_errors')) {
// Suppress conversion errors (from http://bit.ly/pCCRSX)
- libxml_use_internal_errors(true);
+ \libxml_use_internal_errors(true);
}
// Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
$document->loadHTML('<?xml encoding="UTF-8">' . $html);
$document->encoding = 'UTF-8';
+ $this->replaceMisplacedComments($document);
+
if ($this->getConfig()->getOption('suppress_errors')) {
- libxml_clear_errors();
+ \libxml_clear_errors();
}
return $document;
}
/**
+ * Finds any comment nodes outside <html> element and moves them into <body>.
+ *
+ * @see https://github.com/thephpleague/html-to-markdown/issues/212
+ * @see https://3v4l.org/7bC33
+ */
+ private function replaceMisplacedComments(\DOMDocument $document): void
+ {
+ // Find any comment nodes at the root of the document.
+ $misplacedComments = (new \DOMXPath($document))->query('/comment()');
+ if ($misplacedComments === false) {
+ return;
+ }
+
+ $body = $document->getElementsByTagName('body')->item(0);
+ if ($body === null) {
+ return;
+ }
+
+ // Loop over comment nodes in reverse so we put them inside <body> in
+ // their original order.
+ for ($index = $misplacedComments->length - 1; $index >= 0; $index--) {
+ if ($body->firstChild === null) {
+ $body->insertBefore($misplacedComments[$index]);
+ } else {
+ $body->insertBefore($misplacedComments[$index], $body->firstChild);
+ }
+ }
+ }
+
+ /**
* Convert Children
*
* Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
*
* Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
* starting with the innermost element and working up to the outermost element.
- *
- * @param ElementInterface $element
*/
- private function convertChildren(ElementInterface $element)
+ private function convertChildren(ElementInterface $element): void
{
// Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
// except if the current node is a code tag, which needs to be converted by the CodeConverter.
- if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
+ if ($element->isDescendantOf(['pre', 'code']) && $element->getTagName() !== 'code') {
return;
}
+ // Give converter a chance to inspect/modify the DOM before children are converted
+ $converter = $this->environment->getConverterByTag($element->getTagName());
+ if ($converter instanceof PreConverterInterface) {
+ $converter->preConvert($element);
+ }
+
// If the node has children, convert those to Markdown first
if ($element->hasChildren()) {
foreach ($element->getChildren() as $child) {
@@ -179,18 +205,16 @@ class HtmlConverter implements HtmlConverterInterface
*
* Example: An <h3> node with text content of 'Title' becomes a text node with content of '### Title'
*
- * @param ElementInterface $element
- *
* @return string The converted HTML as Markdown
*/
- protected function convertToMarkdown(ElementInterface $element)
+ protected function convertToMarkdown(ElementInterface $element): string
{
$tag = $element->getTagName();
// Strip nodes named in remove_nodes
- $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
- if (in_array($tag, $tags_to_remove)) {
- return false;
+ $tagsToRemove = \explode(' ', $this->getConfig()->getOption('remove_nodes') ?? '');
+ if (\in_array($tag, $tagsToRemove, true)) {
+ return '';
}
$converter = $this->environment->getConverterByTag($tag);
@@ -198,38 +222,34 @@ class HtmlConverter implements HtmlConverterInterface
return $converter->convert($element);
}
- /**
- * @param string $markdown
- *
- * @return string
- */
- protected function sanitize($markdown)
+ protected function sanitize(string $markdown): string
{
- $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
- $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
- $markdown = trim($markdown); // Remove blank spaces at the beggining of the html
+ $markdown = \html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
+ $markdown = \preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
+ \assert($markdown !== null);
+ $markdown = \trim($markdown); // Remove blank spaces at the beginning of the html
/*
* Removing unwanted tags. Tags should be added to the array in the order they are expected.
* XML, html and body opening tags should be in that order. Same case with closing tags
*/
- $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');
+ $unwanted = ['<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;'];
foreach ($unwanted as $tag) {
- if (strpos($tag, '/') === false) {
+ if (\strpos($tag, '/') === false) {
// Opening tags
- if (strpos($markdown, $tag) === 0) {
- $markdown = substr($markdown, strlen($tag));
+ if (\strpos($markdown, $tag) === 0) {
+ $markdown = \substr($markdown, \strlen($tag));
}
} else {
// Closing tags
- if (strpos($markdown, $tag) === strlen($markdown) - strlen($tag)) {
- $markdown = substr($markdown, 0, -strlen($tag));
+ if (\strpos($markdown, $tag) === \strlen($markdown) - \strlen($tag)) {
+ $markdown = \substr($markdown, 0, -\strlen($tag));
}
}
}
- return trim($markdown, "\n\r\0\x0B");
+ return \trim($markdown, "\n\r\0\x0B");
}
/**
@@ -239,6 +259,10 @@ class HtmlConverter implements HtmlConverterInterface
* An example being:
*
* HtmlConverter::setOptions(['strip_tags' => true])->convert('<h1>test</h1>');
+ *
+ * @param array<string, mixed> $options
+ *
+ * @return $this
*/
public function setOptions(array $options)
{