suppress some scraping errors when confronted with hybrid/strange

feeds that provide an insufficient Content-Type header and choke the HTML parser.
author: Friendika <info@friendika.com> 2011-02-02 14:48:27 -0800
committer: Friendika <info@friendika.com> 2011-02-02 14:48:27 -0800
commit: ee45dee9324be48cd87a883405bdd9e11e3f39f5 (patch)
tree: ae4588ffabf2c2099e705f049bbf80d5ec494774
parent: fce9988f73ee8ad3624586e6866a68a2ae952ef8 (diff)
download: volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.tar.gz
volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.tar.bz2
volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.zip
2 files changed, 21 insertions, 2 deletions
diff --git a/boot.php b/boot.php
index e47f1834d..0dd507168 100644
--- a/boot.php
+++ b/boot.php
@@ -1366,6 +1366,7 @@ function lrdd($uri) {
 	else {
 		$html = fetch_url($uri);
 		$headers = $a->get_curl_headers();
+		logger('lrdd: headers=' . $headers, LOGGER_DEBUG);
 		$lines = explode("\n",$headers);
 		if(count($lines)) {
 			foreach($lines as $line) {				
@@ -1377,6 +1378,8 @@ function lrdd($uri) {
 				// don't try and run feeds through the html5 parser
 				if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
 					return array();
+				if(stristr($html,'<rss') || stristr($html,'<feed'))
+					return array();
 			}
 		}
 		if(! isset($link)) {
diff --git a/include/Scrape.php b/include/Scrape.php
index bb42c3bdd..ff9899252 100644
--- a/include/Scrape.php
+++ b/include/Scrape.php
@@ -8,12 +8,18 @@ function scrape_dfrn($url) {
 	$a = get_app();
 
 	$ret = array();
+
+	logger('scrape_dfrn: url=' . $url);
+
 	$s = fetch_url($url);
 
 	if(! $s) 
 		return $ret;
 
 	$headers = $a->get_curl_headers();
+	logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);
+
+
 	$lines = explode("\n",$headers);
 	if(count($lines)) {
 		foreach($lines as $line) {				
@@ -93,12 +99,17 @@ function scrape_meta($url) {
 	$a = get_app();
 
 	$ret = array();
+
+	logger('scrape_meta: url=' . $url);
+
 	$s = fetch_url($url);
 
 	if(! $s) 
 		return $ret;
 
 	$headers = $a->get_curl_headers();
+	logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);
+
 	$lines = explode("\n",$headers);
 	if(count($lines)) {
 		foreach($lines as $line) {				
@@ -135,6 +146,9 @@ function scrape_vcard($url) {
 	$a = get_app();
 
 	$ret = array();
+
+	logger('scrape_vcard: url=' . $url);
+
 	$s = fetch_url($url);
 
 	if(! $s) 
@@ -190,15 +204,17 @@ function scrape_feed($url) {
 		return $ret;
 
 	$headers = $a->get_curl_headers();
+	logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);
+
 	$lines = explode("\n",$headers);
 	if(count($lines)) {
 		foreach($lines as $line) {				
 			if(stristr($line,'content-type:')) {
-				if(stristr($line,'application/atom+xml')) {
+				if(stristr($line,'application/atom+xml') || stristr($s,'<feed')) {
 					$ret['feed_atom'] = $url;
 					return $ret;
 				}
- 				if(stristr($line,'application/rss+xml')) {
+ 				if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) {
 					$ret['feed_rss'] = $url;
 					return ret;
 				}
author	Friendika <info@friendika.com>	2011-02-02 14:48:27 -0800
committer	Friendika <info@friendika.com>	2011-02-02 14:48:27 -0800
commit	ee45dee9324be48cd87a883405bdd9e11e3f39f5 (patch)
tree	ae4588ffabf2c2099e705f049bbf80d5ec494774
parent	fce9988f73ee8ad3624586e6866a68a2ae952ef8 (diff)
download	volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.tar.gz volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.tar.bz2 volse-hubzilla-ee45dee9324be48cd87a883405bdd9e11e3f39f5.zip