From 32c4614a324f07133fee1163fff77aab5e9cc258 Mon Sep 17 00:00:00 2001 From: Max Kostikov Date: Fri, 2 Nov 2018 10:52:52 +0100 Subject: Workaround for incorrect ISO-8859-5 encoded content conversion --- Zotlabs/Module/Linkinfo.php | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Zotlabs/Module/Linkinfo.php b/Zotlabs/Module/Linkinfo.php index a0ad17e68..5c3946d4f 100644 --- a/Zotlabs/Module/Linkinfo.php +++ b/Zotlabs/Module/Linkinfo.php @@ -228,8 +228,11 @@ class Linkinfo extends \Zotlabs\Web\Controller { $header = $result['header']; $body = $result['body']; - - $body = mb_convert_encoding($body, 'UTF-8', (preg_match('/meta.+content=["|\']text\/html;\s+charset=([^"|\']+)/i', $body, $o) ? $o[1] : 'UTF-8')); + + $cp = (preg_match('/meta.+content=["|\']text\/html;\s+charset=([^"|\']+)/i', $body, $o) ? $o[1] : 'AUTO'); + if(strtoupper($cp) == 'ISO-8859-5') + $cp = 'AUTO'; + $body = mb_convert_encoding($body, 'UTF-8', $cp); $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); $doc = new \DOMDocument(); -- cgit v1.2.3 From 0d9d0a4b70a2dd73005089f1128f4d6fc10340ce Mon Sep 17 00:00:00 2001 From: Max Kostikov Date: Fri, 2 Nov 2018 22:37:53 +0100 Subject: more precise codepage detection --- Zotlabs/Module/Linkinfo.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Zotlabs/Module/Linkinfo.php b/Zotlabs/Module/Linkinfo.php index 5c3946d4f..4bd1deefb 100644 --- a/Zotlabs/Module/Linkinfo.php +++ b/Zotlabs/Module/Linkinfo.php @@ -229,9 +229,11 @@ class Linkinfo extends \Zotlabs\Web\Controller { $header = $result['header']; $body = $result['body']; - $cp = (preg_match('/meta.+content=["|\']text\/html;\s+charset=([^"|\']+)/i', $body, $o) ? $o[1] : 'AUTO'); - if(strtoupper($cp) == 'ISO-8859-5') - $cp = 'AUTO'; + // Check codepage in page or in HTTP headers if not exist + $cp = (preg_match('/meta.+content=["|\']text\/html;\s+charset=([^"|\']+)/i', $body, $o) ? $o[1] : ''); + if(empty($cp) || strtoupper($cp) == 'ISO-8859-5') + $cp = (preg_match('/Content-Type: text\/html;\s+charset=(.+)/im', $header, $o) ? $o[1] : 'AUTO'); + $body = mb_convert_encoding($body, 'UTF-8', $cp); $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); -- cgit v1.2.3 From a9bbe0b12882db9d9c3192c3cd0055a50e9c34d5 Mon Sep 17 00:00:00 2001 From: Max Kostikov Date: Fri, 2 Nov 2018 22:40:48 +0100 Subject: Update Linkinfo.php --- Zotlabs/Module/Linkinfo.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Zotlabs/Module/Linkinfo.php b/Zotlabs/Module/Linkinfo.php index 4bd1deefb..6b067f6fd 100644 --- a/Zotlabs/Module/Linkinfo.php +++ b/Zotlabs/Module/Linkinfo.php @@ -230,9 +230,9 @@ class Linkinfo extends \Zotlabs\Web\Controller { $body = $result['body']; // Check codepage in page or in HTTP headers if not exist - $cp = (preg_match('/meta.+content=["|\']text\/html;\s+charset=([^"|\']+)/i', $body, $o) ? $o[1] : ''); + $cp = (preg_match('/meta.+content=["|\']text\/html; charset=([^"|\']+)/i', $body, $o) ? $o[1] : ''); if(empty($cp) || strtoupper($cp) == 'ISO-8859-5') - $cp = (preg_match('/Content-Type: text\/html;\s+charset=(.+)/im', $header, $o) ? $o[1] : 'AUTO'); + $cp = (preg_match('/Content-Type: text\/html; charset=(.+)/im', $header, $o) ? $o[1] : 'AUTO'); $body = mb_convert_encoding($body, 'UTF-8', $cp); $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); -- cgit v1.2.3 From 2f3b4a7aafeaf92d9937aa71acce8e777c46f63d Mon Sep 17 00:00:00 2001 From: Max Kostikov Date: Sat, 3 Nov 2018 00:01:15 +0100 Subject: Update Linkinfo.php --- Zotlabs/Module/Linkinfo.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Zotlabs/Module/Linkinfo.php b/Zotlabs/Module/Linkinfo.php index 6b067f6fd..8ce82f7e9 100644 --- a/Zotlabs/Module/Linkinfo.php +++ b/Zotlabs/Module/Linkinfo.php @@ -229,10 +229,10 @@ class Linkinfo extends \Zotlabs\Web\Controller { $header = $result['header']; $body = $result['body']; - // Check codepage in page or in HTTP headers if not exist - $cp = (preg_match('/meta.+content=["|\']text\/html; charset=([^"|\']+)/i', $body, $o) ? $o[1] : ''); - if(empty($cp) || strtoupper($cp) == 'ISO-8859-5') - $cp = (preg_match('/Content-Type: text\/html; charset=(.+)/im', $header, $o) ? $o[1] : 'AUTO'); + // Check codepage in HTTP headers or HTML if not exist + $cp = trim((preg_match('/Content-Type: text\/html; charset=(.+)/im', $header, $o) ? $o[1] : '')); + if(empty($cp)) + $cp = (preg_match('/meta.+content=["|\']text\/html; charset=([^"|\']+)/i', $body, $o) ? $o[1] : 'AUTO'); $body = mb_convert_encoding($body, 'UTF-8', $cp); $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); -- cgit v1.2.3 From a7624ea48bb89122ef23d59d977642994feb6ad5 Mon Sep 17 00:00:00 2001 From: Max Kostikov Date: Sat, 3 Nov 2018 00:10:05 +0100 Subject: Update Linkinfo.php --- Zotlabs/Module/Linkinfo.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Zotlabs/Module/Linkinfo.php b/Zotlabs/Module/Linkinfo.php index 8ce82f7e9..623b574af 100644 --- a/Zotlabs/Module/Linkinfo.php +++ b/Zotlabs/Module/Linkinfo.php @@ -230,7 +230,7 @@ class Linkinfo extends \Zotlabs\Web\Controller { $body = $result['body']; // Check codepage in HTTP headers or HTML if not exist - $cp = trim((preg_match('/Content-Type: text\/html; charset=(.+)/im', $header, $o) ? $o[1] : '')); + $cp = (preg_match('/Content-Type: text\/html; charset=(.+)/i', $header, $o) ? trim($o[1]) : ''); if(empty($cp)) $cp = (preg_match('/meta.+content=["|\']text\/html; charset=([^"|\']+)/i', $body, $o) ? $o[1] : 'AUTO'); -- cgit v1.2.3 From 46b0510b994fb885cd3c64b703ce73c2cee6d6c1 Mon Sep 17 00:00:00 2001 From: Max Kostikov Date: Sat, 3 Nov 2018 10:41:31 +0100 Subject: Update Linkinfo.php --- Zotlabs/Module/Linkinfo.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Zotlabs/Module/Linkinfo.php b/Zotlabs/Module/Linkinfo.php index 623b574af..670967370 100644 --- a/Zotlabs/Module/Linkinfo.php +++ b/Zotlabs/Module/Linkinfo.php @@ -230,7 +230,7 @@ class Linkinfo extends \Zotlabs\Web\Controller { $body = $result['body']; // Check codepage in HTTP headers or HTML if not exist - $cp = (preg_match('/Content-Type: text\/html; charset=(.+)/i', $header, $o) ? trim($o[1]) : ''); + $cp = (preg_match('/Content-Type: text\/html; charset=(.+)\r\n/i', $header, $o) ? $o[1] : ''); if(empty($cp)) $cp = (preg_match('/meta.+content=["|\']text\/html; charset=([^"|\']+)/i', $body, $o) ? $o[1] : 'AUTO'); -- cgit v1.2.3