1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
<?php
namespace Zotlabs\Lib;
/**
 * @brief MarkdownSoap class.
 *
 * Purifies Markdown text for storage.
 *
 * @code{.php}
 * $x = new MarkdownSoap($string_to_be_cleansed);
 * $text = $x->clean();
 * @endcode
 *
 * What clean() does, in order:
 * 1. extracts space-indented code blocks and replaces each one with a private
 *    "<token>;<base64>;" placeholder so the purifier never sees them
 * 2. runs the HTML purifier on the remaining content
 * 3. puts the original code blocks back
 * 4. runs htmlspecialchars() on the entire content for safe storage
 *
 * At render time:
 * @code{.php}
 * $markdown = \Zotlabs\Lib\MarkdownSoap::unescape($text);
 * $html = \Michelf\MarkdownExtra::DefaultTransform($markdown);
 * @endcode
 */
class MarkdownSoap {

    /**
     * @var string The raw Markdown text handed to the constructor.
     */
    private $str;

    /**
     * @var string Per-instance marker that prefixes every code block placeholder.
     */
    private $token;

    /**
     * @brief Stores the text to clean and picks a placeholder token.
     *
     * @param string $s The Markdown text that clean() will process.
     */
    public function __construct($s) {
        $this->str = $s;
        // random_string() is defined outside this file; presumably it returns
        // a random string of the requested length -- confirm against its definition.
        $this->token = random_string(20);
    }

    /**
     * @brief Runs the full pipeline: extract code, purify, put code back, escape.
     *
     * @return string The cleaned text, HTML-escaped for storage.
     */
    public function clean() {
        $x = $this->extract_code($this->str);
        $x = $this->purify($x);
        $x = $this->putback_code($x);
        $x = $this->escape($x);

        return $x;
    }

    /**
     * @brief Extracts code blocks and privately escapes them from processing.
     *
     * A code block is a run of lines that each start with four spaces, located
     * at the start of the text or after a blank line. Each block found is
     * replaced by the placeholder produced by encode_code().
     *
     * @see encode_code()
     * @see putback_code()
     *
     * @param string $s
     * @return string The text with code blocks replaced by placeholders, or
     *   $s unchanged if the regular expression engine reported an error.
     */
    public function extract_code($s) {
        $pattern = '{
                (?:\n\n|\A\n?)
                (                   # $1 = the code block -- one or more indented lines
                  (?>
                    [ ]{4}          # each line starts with four spaces; tabs are not matched
                    .*\n+
                  )+
                )
                ((?=^[ ]{0,4}\S)|\Z)  # Lookahead for non-space at line-start, or end of doc
            }xm';

        $text = preg_replace_callback($pattern, [ $this, 'encode_code' ], $s);

        // preg_replace_callback() returns null on a PCRE error (for example an
        // exhausted backtrack limit). Fall back to the untouched input rather
        // than silently discarding the whole document.
        if ($text === null) {
            return $s;
        }

        return $text;
    }

    /**
     * @brief Callback for extract_code(): builds the placeholder for one match.
     *
     * The complete match (including the blank line that introduced the block)
     * is base64-encoded so the placeholder contains no markup and no ';'.
     *
     * @param array $matches Match array supplied by preg_replace_callback().
     * @return string "<token>;<base64 of the whole match>;"
     */
    public function encode_code($matches) {
        return $this->token . ';' . base64_encode($matches[0]) . ';';
    }

    /**
     * @brief Callback for putback_code(): restores the text of one placeholder.
     *
     * @param array $matches Match array; index 1 holds the base64 payload.
     * @return string The decoded original text.
     */
    public function decode_code($matches) {
        return base64_decode($matches[1]);
    }

    /**
     * @brief Put back the code blocks.
     *
     * @see extract_code()
     * @see decode_code()
     *
     * @param string $s
     * @return string The text with every placeholder replaced by its original
     *   code block, or $s unchanged if the regular expression engine reported
     *   an error.
     */
    public function putback_code($s) {
        // Quote the token so it is always matched literally, whatever
        // characters it happens to contain.
        $pattern = '/' . preg_quote($this->token, '/') . ';(.*?);/';

        $text = preg_replace_callback($pattern, [ $this, 'decode_code' ], $s);

        if ($text === null) {
            return $s;
        }

        return $text;
    }

    /**
     * @brief Purifies the text while keeping Markdown autolinks intact.
     *
     * purify_html() is defined outside this file; presumably it strips unsafe
     * HTML -- confirm against its definition.
     *
     * @param string $s
     * @return string
     */
    public function purify($s) {
        $s = $this->protect_autolinks($s);
        $s = purify_html($s);
        $s = $this->unprotect_autolinks($s);

        return $s;
    }

    /**
     * @brief Rewrites "<http(s)://...>" autolinks as "[url](url)" Markdown links.
     *
     * The angle-bracket form looks like an HTML tag, so it is converted before
     * the text is handed to the purifier.
     *
     * @param string $s
     * @return string The rewritten text, or $s unchanged if the regular
     *   expression engine reported an error.
     */
    public function protect_autolinks($s) {
        $text = preg_replace('/\<(https?\:\/\/)(.*?)\>/', '[$1$2]($1$2)', $s);

        if ($text === null) {
            return $s;
        }

        return $text;
    }

    /**
     * @brief Counterpart of protect_autolinks(); currently returns its input unchanged.
     *
     * @param string $s
     * @return string
     */
    public function unprotect_autolinks($s) {
        return $s;
    }

    /**
     * @brief Converts special characters to HTML entities for storage.
     *
     * Existing entities are not encoded a second time (double_encode is false).
     *
     * @param string $s
     * @return string
     */
    public function escape($s) {
        return htmlspecialchars($s, ENT_QUOTES, 'UTF-8', false);
    }

    /**
     * @brief Converts special HTML entities back to characters.
     *
     * @param string $s
     * @return string
     */
    static public function unescape($s) {
        return htmlspecialchars_decode($s, ENT_QUOTES);
    }
}
|