diff options
author | git-marijus <mario@mariovavti.com> | 2017-05-23 21:39:47 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-05-23 21:39:47 +0200 |
commit | dea0d07b9af9a5927dd524a3e486317690a7e112 (patch) | |
tree | 3e990b35eb939911bb7949c2f5d633fa3d788faf /vendor/league/html-to-markdown/src | |
parent | 50e9d024581ddf57f37a6302bc089a88237657bb (diff) | |
parent | 547df2219ab4b870256f2ed90e36b97d8bf200bf (diff) | |
download | volse-hubzilla-dea0d07b9af9a5927dd524a3e486317690a7e112.tar.gz volse-hubzilla-dea0d07b9af9a5927dd524a3e486317690a7e112.tar.bz2 volse-hubzilla-dea0d07b9af9a5927dd524a3e486317690a7e112.zip |
Merge pull request #794 from dawnbreak/dev
Replace Markdownify library with html-to-markdown library.
Diffstat (limited to 'vendor/league/html-to-markdown/src')
23 files changed, 1581 insertions, 0 deletions
diff --git a/vendor/league/html-to-markdown/src/Configuration.php b/vendor/league/html-to-markdown/src/Configuration.php new file mode 100644 index 000000000..2943383aa --- /dev/null +++ b/vendor/league/html-to-markdown/src/Configuration.php @@ -0,0 +1,60 @@ +<?php + +namespace League\HTMLToMarkdown; + +class Configuration +{ + protected $config; + + /** + * @param array $config + */ + public function __construct(array $config = array()) + { + $this->config = $config; + } + + /** + * @param array $config + */ + public function merge(array $config = array()) + { + $this->config = array_replace_recursive($this->config, $config); + } + + /** + * @param array $config + */ + public function replace(array $config = array()) + { + $this->config = $config; + } + + /** + * @param string $key + * @param mixed $value + */ + public function setOption($key, $value) + { + $this->config[$key] = $value; + } + + /** + * @param string|null $key + * @param mixed|null $default + * + * @return mixed|null + */ + public function getOption($key = null, $default = null) + { + if ($key === null) { + return $this->config; + } + + if (!isset($this->config[$key])) { + return $default; + } + + return $this->config[$key]; + } +} diff --git a/vendor/league/html-to-markdown/src/ConfigurationAwareInterface.php b/vendor/league/html-to-markdown/src/ConfigurationAwareInterface.php new file mode 100644 index 000000000..8aca530be --- /dev/null +++ b/vendor/league/html-to-markdown/src/ConfigurationAwareInterface.php @@ -0,0 +1,11 @@ +<?php + +namespace League\HTMLToMarkdown; + +interface ConfigurationAwareInterface +{ + /** + * @param Configuration $config + */ + public function setConfig(Configuration $config); +} diff --git a/vendor/league/html-to-markdown/src/Converter/BlockquoteConverter.php b/vendor/league/html-to-markdown/src/Converter/BlockquoteConverter.php new file mode 100644 index 000000000..eb2d09d17 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/BlockquoteConverter.php @@ 
-0,0 +1,44 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class BlockquoteConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + // Contents should have already been converted to Markdown by this point, + // so we just need to add '>' symbols to each line. + + $markdown = ''; + + $quote_content = trim($element->getValue()); + + $lines = preg_split('/\r\n|\r|\n/', $quote_content); + + $total_lines = count($lines); + + foreach ($lines as $i => $line) { + $markdown .= '> ' . $line . "\n"; + if ($i + 1 === $total_lines) { + $markdown .= "\n"; + } + } + + return $markdown; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('blockquote'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/CodeConverter.php b/vendor/league/html-to-markdown/src/Converter/CodeConverter.php new file mode 100644 index 000000000..c8ec2c005 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/CodeConverter.php @@ -0,0 +1,62 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class CodeConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + $language = null; + + // Checking for language class on the code block + $classes = $element->getAttribute('class'); + + if ($classes) { + // Since tags can have more than one class, we need to find the one that starts with 'language-' + $classes = explode(' ', $classes); + foreach ($classes as $class) { + if (strpos($class, 'language-') !== false) { + // Found one, save it as the selected language and stop looping over the classes. 
+ // The space after the language avoids gluing the actual code with the language tag + $language = str_replace('language-', '', $class) . ' '; + break; + } + } + } + + $markdown = ''; + $code = html_entity_decode($element->getChildrenAsString()); + + // In order to remove the code tags we need to search for them and, in the case of the opening tag + // use a regular expression to find the tag and the other attributes it might have + $code = preg_replace('/<code\b[^>]*>/', '', $code); + $code = str_replace('</code>', '', $code); + + // Checking if the code has multiple lines + $lines = preg_split('/\r\n|\r|\n/', $code); + if (count($lines) > 1) { + // Multiple lines detected, adding three backticks and newlines + $markdown .= '```' . $language . "\n" . $code . "\n" . '```'; + } else { + // One line of code, wrapping it on one backtick. + $markdown .= '`' . $language . $code . '`'; + } + + return $markdown; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('code'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/CommentConverter.php b/vendor/league/html-to-markdown/src/Converter/CommentConverter.php new file mode 100644 index 000000000..55038b254 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/CommentConverter.php @@ -0,0 +1,26 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class CommentConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + return ''; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('#comment'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/ConverterInterface.php b/vendor/league/html-to-markdown/src/Converter/ConverterInterface.php new file mode 100644 index 000000000..8530559a0 --- /dev/null +++ 
b/vendor/league/html-to-markdown/src/Converter/ConverterInterface.php @@ -0,0 +1,20 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +interface ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element); + + /** + * @return string[] + */ + public function getSupportedTags(); +} diff --git a/vendor/league/html-to-markdown/src/Converter/DefaultConverter.php b/vendor/league/html-to-markdown/src/Converter/DefaultConverter.php new file mode 100644 index 000000000..964a71093 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/DefaultConverter.php @@ -0,0 +1,50 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\Configuration; +use League\HTMLToMarkdown\ConfigurationAwareInterface; +use League\HTMLToMarkdown\ElementInterface; + +class DefaultConverter implements ConverterInterface, ConfigurationAwareInterface +{ + const DEFAULT_CONVERTER = '_default'; + + /** + * @var Configuration + */ + protected $config; + + /** + * @param Configuration $config + */ + public function setConfig(Configuration $config) + { + $this->config = $config; + } + + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + // If strip_tags is false (the default), preserve tags that don't have Markdown equivalents, + // such as <span> nodes on their own. C14N() canonicalizes the node to a string. 
+ // See: http://www.php.net/manual/en/domnode.c14n.php + if ($this->config->getOption('strip_tags', false)) { + return $element->getValue(); + } + + return html_entity_decode($element->getChildrenAsString()); + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array(self::DEFAULT_CONVERTER); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/DivConverter.php b/vendor/league/html-to-markdown/src/Converter/DivConverter.php new file mode 100644 index 000000000..656a0ba4d --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/DivConverter.php @@ -0,0 +1,45 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\Configuration; +use League\HTMLToMarkdown\ConfigurationAwareInterface; +use League\HTMLToMarkdown\ElementInterface; + +class DivConverter implements ConverterInterface, ConfigurationAwareInterface +{ + /** + * @var Configuration + */ + protected $config; + + /** + * @param Configuration $config + */ + public function setConfig(Configuration $config) + { + $this->config = $config; + } + + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + if ($this->config->getOption('strip_tags', false)) { + return $element->getValue() . 
"\n\n"; + } + + return html_entity_decode($element->getChildrenAsString()); + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('div'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/EmphasisConverter.php b/vendor/league/html-to-markdown/src/Converter/EmphasisConverter.php new file mode 100644 index 000000000..67250769b --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/EmphasisConverter.php @@ -0,0 +1,57 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\Configuration; +use League\HTMLToMarkdown\ConfigurationAwareInterface; +use League\HTMLToMarkdown\ElementInterface; + +class EmphasisConverter implements ConverterInterface, ConfigurationAwareInterface +{ + /** + * @var Configuration + */ + protected $config; + + /** + * @param Configuration $config + */ + public function setConfig(Configuration $config) + { + $this->config = $config; + } + + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + $tag = $element->getTagName(); + $value = $element->getValue(); + + if (!trim($value)) { + return ''; + } + + if ($tag === 'i' || $tag === 'em') { + $style = $this->config->getOption('italic_style'); + } else { + $style = $this->config->getOption('bold_style'); + } + + $prefix = ltrim($value) !== $value ? ' ' : ''; + $suffix = rtrim($value) !== $value ? ' ' : ''; + + return $prefix . $style . trim($value) . $style . 
$suffix; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('em', 'i', 'strong', 'b'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/HardBreakConverter.php b/vendor/league/html-to-markdown/src/Converter/HardBreakConverter.php new file mode 100644 index 000000000..37cd44e73 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/HardBreakConverter.php @@ -0,0 +1,41 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\Configuration; +use League\HTMLToMarkdown\ConfigurationAwareInterface; +use League\HTMLToMarkdown\ElementInterface; + +class HardBreakConverter implements ConverterInterface, ConfigurationAwareInterface +{ + /** + * @var Configuration + */ + protected $config; + + /** + * @param Configuration $config + */ + public function setConfig(Configuration $config) + { + $this->config = $config; + } + + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + return $this->config->getOption('hard_break') ? 
"\n" : " \n"; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('br'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/HeaderConverter.php b/vendor/league/html-to-markdown/src/Converter/HeaderConverter.php new file mode 100644 index 000000000..d117e7d36 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/HeaderConverter.php @@ -0,0 +1,78 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\Configuration; +use League\HTMLToMarkdown\ConfigurationAwareInterface; +use League\HTMLToMarkdown\ElementInterface; + +class HeaderConverter implements ConverterInterface, ConfigurationAwareInterface +{ + const STYLE_ATX = 'atx'; + const STYLE_SETEXT = 'setext'; + + /** + * @var Configuration + */ + protected $config; + + /** + * @param Configuration $config + */ + public function setConfig(Configuration $config) + { + $this->config = $config; + } + + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + $level = (int) substr($element->getTagName(), 1, 1); + $style = $this->config->getOption('header_style', self::STYLE_SETEXT); + + if (($level === 1 || $level === 2) && !$element->isDescendantOf('blockquote') && $style === self::STYLE_SETEXT) { + return $this->createSetextHeader($level, $element->getValue()); + } + + return $this->createAtxHeader($level, $element->getValue()); + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); + } + + /** + * @param int $level + * @param string $content + * + * @return string + */ + private function createSetextHeader($level, $content) + { + $length = function_exists('mb_strlen') ? mb_strlen($content, 'utf-8') : strlen($content); + $underline = ($level === 1) ? '=' : '-'; + + return $content . "\n" . str_repeat($underline, $length) . 
"\n\n"; + } + + /** + * @param int $level + * @param string $content + * + * @return string + */ + private function createAtxHeader($level, $content) + { + $prefix = str_repeat('#', $level) . ' '; + + return $prefix . $content . "\n\n"; + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/HorizontalRuleConverter.php b/vendor/league/html-to-markdown/src/Converter/HorizontalRuleConverter.php new file mode 100644 index 000000000..8f54f9397 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/HorizontalRuleConverter.php @@ -0,0 +1,26 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class HorizontalRuleConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + return "- - - - - -\n\n"; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('hr'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/ImageConverter.php b/vendor/league/html-to-markdown/src/Converter/ImageConverter.php new file mode 100644 index 000000000..657c769c2 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/ImageConverter.php @@ -0,0 +1,35 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class ImageConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + $src = $element->getAttribute('src'); + $alt = $element->getAttribute('alt'); + $title = $element->getAttribute('title'); + + if ($title !== '') { + // No newlines added. <img> should be in a block-level element. + return '![' . $alt . '](' . $src . ' "' . $title . '")'; + } + + return '![' . $alt . '](' . $src . 
')'; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('img'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/LinkConverter.php b/vendor/league/html-to-markdown/src/Converter/LinkConverter.php new file mode 100644 index 000000000..f0765f38b --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/LinkConverter.php @@ -0,0 +1,52 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class LinkConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + $href = $element->getAttribute('href'); + $title = $element->getAttribute('title'); + $text = trim($element->getValue()); + + if ($title !== '') { + $markdown = '[' . $text . '](' . $href . ' "' . $title . '")'; + } elseif ($href === $text && $this->isValidAutolink($href)) { + $markdown = '<' . $href . '>'; + } else { + $markdown = '[' . $text . '](' . $href . 
')'; + } + + if (!$href) { + $markdown = html_entity_decode($element->getChildrenAsString()); + } + + return $markdown; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('a'); + } + + /** + * @param string $href + * + * @return bool + */ + private function isValidAutolink($href) + { + return preg_match('/^[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*/i', $href) === 1; + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/ListBlockConverter.php b/vendor/league/html-to-markdown/src/Converter/ListBlockConverter.php new file mode 100644 index 000000000..07a4c85a9 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/ListBlockConverter.php @@ -0,0 +1,26 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class ListBlockConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + return $element->getValue() . 
"\n"; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('ol', 'ul'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/ListItemConverter.php b/vendor/league/html-to-markdown/src/Converter/ListItemConverter.php new file mode 100644 index 000000000..dafec077c --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/ListItemConverter.php @@ -0,0 +1,47 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class ListItemConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + // If parent is an ol, use numbers, otherwise, use dashes + $list_type = $element->getParent()->getTagName(); + + // Add spaces to start for nested list items + $level = $element->getListItemLevel($element); + + $prefixForParagraph = str_repeat(' ', $level + 1); + $value = trim(implode("\n" . $prefixForParagraph, explode("\n", trim($element->getValue())))); + + // If list item is the first in a nested list, add a newline before it + $prefix = ''; + if ($level > 0 && $element->getSiblingPosition() === 1) { + $prefix = "\n"; + } + + if ($list_type === 'ul') { + return $prefix . '- ' . $value . "\n"; + } + + $number = $element->getSiblingPosition(); + + return $prefix . $number . '. ' . $value . 
"\n"; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('li'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/ParagraphConverter.php b/vendor/league/html-to-markdown/src/Converter/ParagraphConverter.php new file mode 100644 index 000000000..cf852bfcf --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/ParagraphConverter.php @@ -0,0 +1,124 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class ParagraphConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + $value = $element->getValue(); + + $markdown = ''; + + $lines = preg_split('/\r\n|\r|\n/', $value); + foreach ($lines as $line) { + /* + * Some special characters need to be escaped based on the position that they appear + * The following function will deal with those special cases. + */ + $markdown .= $this->escapeSpecialCharacters($line); + $markdown .= "\n"; + } + + return trim($markdown) !== '' ? rtrim($markdown) . "\n\n" : ''; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('p'); + } + + /** + * @param string $line + * + * @return string + */ + private function escapeSpecialCharacters($line) + { + $line = $this->escapeFirstCharacters($line); + $line = $this->escapeOtherCharacters($line); + $line = $this->escapeOtherCharactersRegex($line); + + return $line; + } + + /** + * @param string $line + * + * @return string + */ + private function escapeFirstCharacters($line) + { + $escapable = array( + '>', + '- ', + '+ ', + '--', + '~~~', + '---', + '- - -' + ); + + foreach ($escapable as $i) { + if (strpos(ltrim($line), $i) === 0) { + // Found a character that must be escaped, adding a backslash before + return '\\' . 
ltrim($line); + } + } + + return $line; + } + + /** + * @param string $line + * + * @return string + */ + private function escapeOtherCharacters($line) + { + $escapable = array( + '<!--' + ); + + foreach ($escapable as $i) { + if (strpos($line, $i) !== false) { + // Found an escapable character, escaping it + $line = substr_replace($line, '\\', strpos($line, $i), 0); + } + } + + return $line; + } + + /** + * @param string $line + * + * @return string + */ + private function escapeOtherCharactersRegex($line) + { + $regExs = array( + // Match numbers ending on ')' or '.' that are at the beginning of the line. + '/^[0-9]+(?=\)|\.)/' + ); + + foreach ($regExs as $i) { + if (preg_match($i, $line, $match)) { + // Matched an escapable character, adding a backslash on the string before the offending character + $line = substr_replace($line, '\\', strlen($match[0]), 0); + } + } + + return $line; + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/PreformattedConverter.php b/vendor/league/html-to-markdown/src/Converter/PreformattedConverter.php new file mode 100644 index 000000000..7a4ec3357 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/PreformattedConverter.php @@ -0,0 +1,59 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class PreformattedConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + $markdown = ''; + + $pre_content = html_entity_decode($element->getChildrenAsString()); + $pre_content = str_replace(array('<pre>', '</pre>'), '', $pre_content); + + /* + * Checking for the code tag. + * Usually pre tags are used along with code tags. This conditional will check for already converted code tags, + * which use backticks, and if those backticks are at the beginning and at the end of the string it means + * there's no more information to convert. 
+ */ + + $firstBacktick = strpos(trim($pre_content), '`'); + $lastBacktick = strrpos(trim($pre_content), '`'); + if ($firstBacktick === 0 && $lastBacktick === strlen(trim($pre_content)) - 1) { + return $pre_content; + } + + // If the execution reaches this point it means it's just a pre tag, with no code tag nested + + // Normalizing new lines + $pre_content = preg_replace('/\r\n|\r|\n/', PHP_EOL, $pre_content); + + // Checking if the string has multiple lines + $lines = preg_split('/\r\n|\r|\n/', $pre_content); + if (count($lines) > 1) { + // Multiple lines detected, adding three backticks and newlines + $markdown .= '```' . "\n" . $pre_content . "\n" . '```'; + } else { + // One line of code, wrapping it on one backtick. + $markdown .= '`' . $pre_content . '`'; + } + + return $markdown; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('pre'); + } +} diff --git a/vendor/league/html-to-markdown/src/Converter/TextConverter.php b/vendor/league/html-to-markdown/src/Converter/TextConverter.php new file mode 100644 index 000000000..d6d91e16f --- /dev/null +++ b/vendor/league/html-to-markdown/src/Converter/TextConverter.php @@ -0,0 +1,46 @@ +<?php + +namespace League\HTMLToMarkdown\Converter; + +use League\HTMLToMarkdown\ElementInterface; + +class TextConverter implements ConverterInterface +{ + /** + * @param ElementInterface $element + * + * @return string + */ + public function convert(ElementInterface $element) + { + $markdown = $element->getValue(); + + // Remove leftover \n at the beginning of the line + $markdown = ltrim($markdown, "\n"); + + // Replace sequences of invisible characters with spaces + $markdown = preg_replace('~\s+~u', ' ', $markdown); + + // Escape the following characters: '*', '_', '[', ']' and '\' + $markdown = preg_replace('~([*_\\[\\]\\\\])~u', '\\\\$1', $markdown); + + $markdown = preg_replace('~^#~u', '\\\\#', $markdown); + + if ($markdown === ' ') { + $next = $element->getNext(); + if (!$next || 
$next->isBlock()) { + $markdown = ''; + } + } + + return $markdown; + } + + /** + * @return string[] + */ + public function getSupportedTags() + { + return array('#text'); + } +} diff --git a/vendor/league/html-to-markdown/src/Element.php b/vendor/league/html-to-markdown/src/Element.php new file mode 100644 index 000000000..e1e9d1a09 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Element.php @@ -0,0 +1,257 @@ +<?php + +namespace League\HTMLToMarkdown; + +class Element implements ElementInterface +{ + /** + * @var \DOMNode + */ + protected $node; + + /** + * @var ElementInterface|null + */ + private $nextCached; + + public function __construct(\DOMNode $node) + { + $this->node = $node; + } + + /** + * @return bool + */ + public function isBlock() + { + switch ($this->getTagName()) { + case 'blockquote': + case 'body': + case 'code': + case 'div': + case 'h1': + case 'h2': + case 'h3': + case 'h4': + case 'h5': + case 'h6': + case 'hr': + case 'html': + case 'li': + case 'p': + case 'ol': + case 'ul': + return true; + default: + return false; + } + } + + /** + * @return bool + */ + public function isText() + { + return $this->getTagName() === '#text'; + } + + /** + * @return bool + */ + public function isWhitespace() + { + return $this->getTagName() === '#text' && trim($this->getValue()) === ''; + } + + /** + * @return string + */ + public function getTagName() + { + return $this->node->nodeName; + } + + /** + * @return string + */ + public function getValue() + { + return $this->node->nodeValue; + } + + /** + * @return ElementInterface|null + */ + public function getParent() + { + return new static($this->node->parentNode) ?: null; + } + + /** + * @return bool + */ + public function hasChildren() + { + return $this->node->hasChildNodes(); + } + + /** + * @return ElementInterface[] + */ + public function getChildren() + { + $ret = array(); + /** @var \DOMNode $node */ + foreach ($this->node->childNodes as $node) { + $ret[] = new static($node); + } + + return 
$ret; + } + + /** + * @return ElementInterface|null + */ + public function getNext() + { + if ($this->nextCached === null) { + $nextNode = $this->getNextNode($this->node); + if ($nextNode !== null) { + $this->nextCached = new static($nextNode); + } + } + + return $this->nextCached; + } + + /** + * @param \DomNode $node + * @param bool $checkChildren + * + * @return \DomNode|null + */ + private function getNextNode($node, $checkChildren = true) + { + if ($checkChildren && $node->firstChild) { + return $node->firstChild; + } + + if ($node->nextSibling) { + return $node->nextSibling; + } + + if ($node->parentNode) { + return $this->getNextNode($node->parentNode, false); + } + } + + /** + * @param string[]|string $tagNames + * + * @return bool + */ + public function isDescendantOf($tagNames) + { + if (!is_array($tagNames)) { + $tagNames = array($tagNames); + } + + for ($p = $this->node->parentNode; $p !== false; $p = $p->parentNode) { + if (is_null($p)) { + return false; + } + + if (in_array($p->nodeName, $tagNames)) { + return true; + } + } + + return false; + } + + /** + * @param string $markdown + */ + public function setFinalMarkdown($markdown) + { + $markdown_node = $this->node->ownerDocument->createTextNode($markdown); + $this->node->parentNode->replaceChild($markdown_node, $this->node); + } + + /** + * @return string + */ + public function getChildrenAsString() + { + return $this->node->C14N(); + } + + /** + * @return int + */ + public function getSiblingPosition() + { + $position = 0; + + // Loop through all nodes and find the given $node + foreach ($this->getParent()->getChildren() as $current_node) { + if (!$current_node->isWhitespace()) { + $position++; + } + + // TODO: Need a less-buggy way of comparing these + // Perhaps we can somehow ensure that we always have the exact same object and use === instead? 
+ if ($this->equals($current_node)) { + break; + } + } + + return $position; + } + + /** + * @return int + */ + public function getListItemLevel() + { + $level = 0; + $parent = $this->getParent(); + + while ($parent !== null && $parent->node->parentNode) { + if ($parent->getTagName() === 'li') { + $level++; + } + $parent = $parent->getParent(); + } + + return $level; + } + + /** + * @param string $name + * + * @return string + */ + public function getAttribute($name) + { + if ($this->node instanceof \DOMElement) { + return $this->node->getAttribute($name); + } + + return ''; + } + + /** + * @param ElementInterface $element + * + * @return bool + */ + public function equals(ElementInterface $element) + { + if ($element instanceof self) { + return $element->node === $this->node; + } + + return $element === $this; + } +} diff --git a/vendor/league/html-to-markdown/src/ElementInterface.php b/vendor/league/html-to-markdown/src/ElementInterface.php new file mode 100644 index 000000000..138ddf286 --- /dev/null +++ b/vendor/league/html-to-markdown/src/ElementInterface.php @@ -0,0 +1,80 @@ +<?php + +namespace League\HTMLToMarkdown; + +interface ElementInterface +{ + /** + * @return bool + */ + public function isBlock(); + + /** + * @return bool + */ + public function isText(); + + /** + * @return bool + */ + public function isWhitespace(); + + /** + * @return string + */ + public function getTagName(); + + /** + * @return string + */ + public function getValue(); + + /** + * @return ElementInterface|null + */ + public function getParent(); + + /** + * @param string|string[] $tagNames + * + * @return bool + */ + public function isDescendantOf($tagNames); + + /** + * @return bool + */ + public function hasChildren(); + + /** + * @return ElementInterface[] + */ + public function getChildren(); + + /** + * @return ElementInterface|null + */ + public function getNext(); + + /** + * @return int + */ + public function getSiblingPosition(); + + /** + * @return string + */ + public 
function getChildrenAsString(); + + /** + * @param string $markdown + */ + public function setFinalMarkdown($markdown); + + /** + * @param string $name + * + * @return string + */ + public function getAttribute($name); +} diff --git a/vendor/league/html-to-markdown/src/Environment.php b/vendor/league/html-to-markdown/src/Environment.php new file mode 100644 index 000000000..560cfe613 --- /dev/null +++ b/vendor/league/html-to-markdown/src/Environment.php @@ -0,0 +1,104 @@ +<?php + +namespace League\HTMLToMarkdown; + +use League\HTMLToMarkdown\Converter\BlockquoteConverter; +use League\HTMLToMarkdown\Converter\CodeConverter; +use League\HTMLToMarkdown\Converter\CommentConverter; +use League\HTMLToMarkdown\Converter\ConverterInterface; +use League\HTMLToMarkdown\Converter\DefaultConverter; +use League\HTMLToMarkdown\Converter\DivConverter; +use League\HTMLToMarkdown\Converter\EmphasisConverter; +use League\HTMLToMarkdown\Converter\HardBreakConverter; +use League\HTMLToMarkdown\Converter\HeaderConverter; +use League\HTMLToMarkdown\Converter\HorizontalRuleConverter; +use League\HTMLToMarkdown\Converter\ImageConverter; +use League\HTMLToMarkdown\Converter\LinkConverter; +use League\HTMLToMarkdown\Converter\ListBlockConverter; +use League\HTMLToMarkdown\Converter\ListItemConverter; +use League\HTMLToMarkdown\Converter\ParagraphConverter; +use League\HTMLToMarkdown\Converter\PreformattedConverter; +use League\HTMLToMarkdown\Converter\TextConverter; + +final class Environment +{ + /** + * @var Configuration + */ + protected $config; + + /** + * @var ConverterInterface[] + */ + protected $converters = array(); + + public function __construct(array $config = array()) + { + $this->config = new Configuration($config); + $this->addConverter(new DefaultConverter()); + } + + /** + * @return Configuration + */ + public function getConfig() + { + return $this->config; + } + + /** + * @param ConverterInterface $converter + */ + public function addConverter(ConverterInterface 
$converter) + { + if ($converter instanceof ConfigurationAwareInterface) { + $converter->setConfig($this->config); + } + + foreach ($converter->getSupportedTags() as $tag) { + $this->converters[$tag] = $converter; + } + } + + /** + * @param string $tag + * + * @return ConverterInterface + */ + public function getConverterByTag($tag) + { + if (isset($this->converters[$tag])) { + return $this->converters[$tag]; + } + + return $this->converters[DefaultConverter::DEFAULT_CONVERTER]; + } + + /** + * @param array $config + * + * @return Environment + */ + public static function createDefaultEnvironment(array $config = array()) + { + $environment = new static($config); + + $environment->addConverter(new BlockquoteConverter()); + $environment->addConverter(new CodeConverter()); + $environment->addConverter(new CommentConverter()); + $environment->addConverter(new DivConverter()); + $environment->addConverter(new EmphasisConverter()); + $environment->addConverter(new HardBreakConverter()); + $environment->addConverter(new HeaderConverter()); + $environment->addConverter(new HorizontalRuleConverter()); + $environment->addConverter(new ImageConverter()); + $environment->addConverter(new LinkConverter()); + $environment->addConverter(new ListBlockConverter()); + $environment->addConverter(new ListItemConverter()); + $environment->addConverter(new ParagraphConverter()); + $environment->addConverter(new PreformattedConverter()); + $environment->addConverter(new TextConverter()); + + return $environment; + } +} diff --git a/vendor/league/html-to-markdown/src/HtmlConverter.php b/vendor/league/html-to-markdown/src/HtmlConverter.php new file mode 100644 index 000000000..db3c29e1c --- /dev/null +++ b/vendor/league/html-to-markdown/src/HtmlConverter.php @@ -0,0 +1,231 @@ +<?php + +namespace League\HTMLToMarkdown; + +/** + * Class HtmlConverter + * + * A helper class to convert HTML to Markdown. 
+ * + * @author Colin O'Dell <colinodell@gmail.com> + * @author Nick Cernis <nick@cern.is> + * + * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub. + * + * @license http://www.opensource.org/licenses/mit-license.php MIT + */ +class HtmlConverter +{ + /** + * @var Environment + */ + protected $environment; + + /** + * Constructor + * + * @param Environment|array $options Environment object or configuration options + */ + public function __construct($options = array()) + { + if ($options instanceof Environment) { + $this->environment = $options; + } elseif (is_array($options)) { + $defaults = array( + 'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2 + 'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML + 'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output. + 'bold_style' => '**', // Set to '__' if you prefer the underlined style + 'italic_style' => '_', // Set to '*' if you prefer the asterisk style + 'remove_nodes' => '', // space-separated list of dom nodes that should be removed. 
example: 'meta style script'
+                'hard_break' => false,// Set to true to turn <br> into `\n` instead of `  \n`
+            );
+
+            $this->environment = Environment::createDefaultEnvironment($defaults);
+
+            $this->environment->getConfig()->merge($options);
+        }
+    }
+
+    /**
+     * @return Environment
+     */
+    public function getEnvironment()
+    {
+        return $this->environment;
+    }
+
+    /**
+     * @return Configuration
+     */
+    public function getConfig()
+    {
+        return $this->environment->getConfig();
+    }
+
+    /**
+     * Convert
+     *
+     * @see HtmlConverter::convert
+     *
+     * @param string $html
+     *
+     * @return string The Markdown version of the html
+     */
+    public function __invoke($html)
+    {
+        return $this->convert($html);
+    }
+
+    /**
+     * Convert
+     *
+     * Loads the HTML into a DOMDocument, passes its <html> root to convertChildren()
+     * and sanitizes the serialized result.
+     *
+     * @param string $html
+     *
+     * @throws \InvalidArgumentException if the loaded document has no <html> element
+     *
+     * @return string The Markdown version of the html
+     */
+    public function convert($html)
+    {
+        if (trim($html) === '') {
+            return '';
+        }
+
+        $document = $this->createDOMDocument($html);
+
+        // Work on the entire DOM tree (including head and body)
+        if (!($root = $document->getElementsByTagName('html')->item(0))) {
+            throw new \InvalidArgumentException('Invalid HTML was provided');
+        }
+
+        $rootElement = new Element($root);
+        $this->convertChildren($rootElement);
+
+        // Store the now-modified DOMDocument as a string
+        $markdown = $document->saveHTML();
+
+        return $this->sanitize($markdown);
+    }
+
+    /**
+     * Builds a DOMDocument from the given HTML, forcing UTF-8.
+     *
+     * When the 'suppress_errors' option is set, libxml's internal error handling is
+     * switched on only for the duration of the load; the previous setting is restored
+     * afterwards so the converter does not alter global libxml state for its caller.
+     *
+     * @param string $html
+     *
+     * @return \DOMDocument
+     */
+    private function createDOMDocument($html)
+    {
+        $document = new \DOMDocument();
+        $suppressErrors = $this->getConfig()->getOption('suppress_errors');
+        $previousState = null;
+
+        if ($suppressErrors) {
+            // Suppress conversion errors (from http://bit.ly/pCCRSX); the call returns the
+            // setting that was active before, which is put back once loading is done
+            $previousState = libxml_use_internal_errors(true);
+        }
+
+        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
+        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
+        $document->encoding = 'UTF-8';
+
+        if ($suppressErrors) {
+            libxml_clear_errors();
+            libxml_use_internal_errors($previousState);
+        }
+
+        return $document;
+    }
+
+    /**
+     * Convert Children
+     *
+     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
+     *
+     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
+     * starting with the innermost element and working up to the outermost element.
+     *
+     * @param ElementInterface $element
+     */
+    private function convertChildren(ElementInterface $element)
+    {
+        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
+        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
+        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
+            return;
+        }
+
+        // If the node has children, convert those to Markdown first
+        if ($element->hasChildren()) {
+            foreach ($element->getChildren() as $child) {
+                $this->convertChildren($child);
+            }
+        }
+
+        // Now that child nodes have been converted, convert the original node
+        $markdown = $this->convertToMarkdown($element);
+
+        // Create a DOM text node containing the Markdown equivalent of the original node
+
+        // Replace the old $node e.g. '<h3>Title</h3>' with the new $markdown_node e.g. '### Title'
+        $element->setFinalMarkdown($markdown);
+    }
+
+    /**
+     * Convert to Markdown
+     *
+     * Converts an individual node into a #text node containing a string of its Markdown equivalent.
+     *
+     * Example: An <h3> node with text content of 'Title' becomes a text node with content of '### Title'
+     *
+     * Nodes whose tag is listed in the 'remove_nodes' option are dropped by returning an empty string.
+     *
+     * @param ElementInterface $element
+     *
+     * @return string The converted HTML as Markdown
+     */
+    protected function convertToMarkdown(ElementInterface $element)
+    {
+        $tag = $element->getTagName();
+
+        // Strip nodes named in remove_nodes
+        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
+        if (in_array($tag, $tags_to_remove, true)) {
+            // Return an empty string rather than false so the documented string contract holds
+            return '';
+        }
+
+        $converter = $this->environment->getConverterByTag($tag);
+
+        return $converter->convert($element);
+    }
+
+    /**
+     * Cleans up the serialized document: decodes HTML entities, drops the doctype and
+     * strips the wrapper tags that DOMDocument adds around the converted content.
+     *
+     * @param string $markdown
+     *
+     * @return string
+     */
+    protected function sanitize($markdown)
+    {
+        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
+        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
+        $markdown = trim($markdown); // Remove blank spaces at the beginning of the html
+
+        /*
+         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
+         * XML, html and body opening tags should be in that order. Same case with closing tags
+         */
+        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');
+
+        foreach ($unwanted as $tag) {
+            if (strpos($tag, '/') === false) {
+                // Opening tags
+                if (strpos($markdown, $tag) === 0) {
+                    $markdown = substr($markdown, strlen($tag));
+                }
+            } else {
+                // Closing tags: compare the suffix directly. Searching with strpos() finds the
+                // first occurrence, so a trailing tag was left in place whenever the same text
+                // also appeared earlier in the document.
+                if (substr($markdown, -strlen($tag)) === $tag) {
+                    $markdown = substr($markdown, 0, -strlen($tag));
+                }
+            }
+        }
+
+        return trim($markdown, "\n\r\0\x0B");
+    }
+} |