diff options
author | Friendika <info@friendika.com> | 2011-02-02 14:48:27 -0800 |
---|---|---|
committer | Friendika <info@friendika.com> | 2011-02-02 14:48:27 -0800 |
commit | ee45dee9324be48cd87a883405bdd9e11e3f39f5 (patch) | |
tree | ae4588ffabf2c2099e705f049bbf80d5ec494774 | |
parent | fce9988f73ee8ad3624586e6866a68a2ae952ef8 (diff) | |
download | volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.tar.gz volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.tar.bz2 volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.zip |
Suppress some scraping errors when confronted with hybrid/strange
feeds that provide an insufficient Content-Type header and choke the HTML parser.
-rw-r--r-- | boot.php | 3 | ||||
-rw-r--r-- | include/Scrape.php | 20 |
2 files changed, 21 insertions, 2 deletions
@@ -1366,6 +1366,7 @@ function lrdd($uri) { else { $html = fetch_url($uri); $headers = $a->get_curl_headers(); + logger('lrdd: headers=' . $headers, LOGGER_DEBUG); $lines = explode("\n",$headers); if(count($lines)) { foreach($lines as $line) { @@ -1377,6 +1378,8 @@ function lrdd($uri) { // don't try and run feeds through the html5 parser if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml')))) return array(); + if(stristr($html,'<rss') || stristr($html,'<feed')) + return array(); } } if(! isset($link)) { diff --git a/include/Scrape.php b/include/Scrape.php index bb42c3bdd..ff9899252 100644 --- a/include/Scrape.php +++ b/include/Scrape.php @@ -8,12 +8,18 @@ function scrape_dfrn($url) { $a = get_app(); $ret = array(); + + logger('scrape_dfrn: url=' . $url); + $s = fetch_url($url); if(! $s) return $ret; $headers = $a->get_curl_headers(); + logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG); + + $lines = explode("\n",$headers); if(count($lines)) { foreach($lines as $line) { @@ -93,12 +99,17 @@ function scrape_meta($url) { $a = get_app(); $ret = array(); + + logger('scrape_meta: url=' . $url); + $s = fetch_url($url); if(! $s) return $ret; $headers = $a->get_curl_headers(); + logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG); + $lines = explode("\n",$headers); if(count($lines)) { foreach($lines as $line) { @@ -135,6 +146,9 @@ function scrape_vcard($url) { $a = get_app(); $ret = array(); + + logger('scrape_vcard: url=' . $url); + $s = fetch_url($url); if(! $s) @@ -190,15 +204,17 @@ function scrape_feed($url) { return $ret; $headers = $a->get_curl_headers(); + logger('scrape_feed: headers=' . 
$headers, LOGGER_DEBUG); + $lines = explode("\n",$headers); if(count($lines)) { foreach($lines as $line) { if(stristr($line,'content-type:')) { - if(stristr($line,'application/atom+xml')) { + if(stristr($line,'application/atom+xml') || stristr($s,'<feed')) { $ret['feed_atom'] = $url; return $ret; } - if(stristr($line,'application/rss+xml')) { + if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) { $ret['feed_rss'] = $url; return ret; } |