aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Friendika <info@friendika.com> 2011-04-04 19:36:18 -0700
committer Friendika <info@friendika.com> 2011-04-04 19:36:18 -0700
commit 793967a1d3c23fcf1f3b00a2832f51e6f473f4bd (patch)
tree ec045409190c042c621874f10cf38cf00488b9b3
parent 178362e50b846aef1caf4e191ea0394c5d636857 (diff)
downloadvolse-hubzilla-793967a1d3c23fcf1f3b00a2832f51e6f473f4bd.tar.gz
volse-hubzilla-793967a1d3c23fcf1f3b00a2832f51e6f473f4bd.tar.bz2
volse-hubzilla-793967a1d3c23fcf1f3b00a2832f51e6f473f4bd.zip
better handling of troublesome feeds.
-rw-r--r-- boot.php 44
-rw-r--r-- include/Scrape.php 2
-rw-r--r-- include/items.php 23
-rw-r--r-- include/poller.php 2
-rw-r--r-- mod/dfrn_confirm.php 2
-rw-r--r-- mod/dfrn_poll.php 4
-rw-r--r-- mod/follow.php 14
7 files changed, 63 insertions, 28 deletions
diff --git a/boot.php b/boot.php
index 3b86d0dbe..f5c0e6f92 100644
--- a/boot.php
+++ b/boot.php
@@ -1478,7 +1478,9 @@ function lrdd($uri) {
return array();
logger('lrdd: host_meta: ' . $xml, LOGGER_DATA);
- $h = simplexml_load_string($xml);
+
+ $h = parse_xml_string($xml);
+
$arr = convert_xml_element_to_array($h);
if(isset($arr['xrd']['property'])) {
@@ -1550,16 +1552,19 @@ function lrdd($uri) {
$headers = $a->get_curl_headers();
logger('lrdd: headers=' . $headers, LOGGER_DEBUG);
- require_once('library/HTML5/Parser.php');
- $dom = @HTML5_Parser::parse($html);
-
- if($dom) {
- $items = $dom->getElementsByTagName('link');
- foreach($items as $item) {
- $x = $item->getAttribute('rel');
- if($x == "lrdd") {
- $pagelink = $item->getAttribute('href');
- break;
+ // don't try and parse raw xml as html
+ if(! strstr($html,'<?xml')) {
+ require_once('library/HTML5/Parser.php');
+ $dom = @HTML5_Parser::parse($html);
+
+ if($dom) {
+ $items = $dom->getElementsByTagName('link');
+ foreach($items as $item) {
+ $x = $item->getAttribute('rel');
+ if($x == "lrdd") {
+ $pagelink = $item->getAttribute('href');
+ break;
+ }
}
}
}
@@ -1638,7 +1643,7 @@ function fetch_xrd_links($url) {
return array();
logger('fetch_xrd_links: ' . $xml, LOGGER_DATA);
- $h = simplexml_load_string($xml);
+ $h = parse_xml_string($xml);
$arr = convert_xml_element_to_array($h);
$links = array();
@@ -2759,3 +2764,18 @@ function lang_selector() {
$o .= '</select></form></div>';
return $o;
}}
+
+
+if(! function_exists('parse_xml_string')) {
+/**
+ * Parse an XML document embedded in a string, tolerating leading junk.
+ *
+ * Everything before the first '<?xml' declaration is discarded before the
+ * remainder is handed to simplexml_load_string(). Parser diagnostics are
+ * collected through libxml's internal error buffer and written to the log
+ * at LOGGER_DATA level rather than being emitted as PHP warnings.
+ *
+ * @param string $s Raw text that may contain an XML document.
+ * @return SimpleXMLElement|false The parsed document, or false when $s has
+ *         no '<?xml' declaration or the XML could not be parsed.
+ */
+function parse_xml_string($s) {
+	// No XML declaration at all: nothing we are willing to parse.
+	$start = strpos($s,'<?xml');
+	if($start === false)
+		return false;
+	$s2 = substr($s,$start);
+
+	// Collect parse errors internally; remember the caller's setting so the
+	// process-wide libxml state can be restored before returning.
+	$prev = libxml_use_internal_errors(true);
+	$x = simplexml_load_string($s2);
+
+	// LibXMLError objects have no __toString(), so concatenating one directly
+	// is itself an error; log the individual fields instead.
+	foreach(libxml_get_errors() as $err)
+		logger('libxml: parse: ' . $err->code . ' at ' . $err->line . ':' . $err->column . ' : ' . trim($err->message), LOGGER_DATA);
+
+	libxml_clear_errors();
+	libxml_use_internal_errors($prev);
+	return $x;
+}}
diff --git a/include/Scrape.php b/include/Scrape.php
index ff9899252..21820ddaf 100644
--- a/include/Scrape.php
+++ b/include/Scrape.php
@@ -216,7 +216,7 @@ function scrape_feed($url) {
}
if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) {
$ret['feed_rss'] = $url;
- return ret;
+ return $ret;
}
}
}
diff --git a/include/items.php b/include/items.php
index 1dd39d2ba..a9ac85969 100644
--- a/include/items.php
+++ b/include/items.php
@@ -180,7 +180,7 @@ function construct_activity_object($item) {
if($item['object']) {
$o = '<as:object>' . "\r\n";
- $r = @simplexml_load_string($item['object']);
+ $r = parse_xml_string($item['object']);
if($r->type)
$o .= '<as:object-type>' . xmlify($r->type) . '</as:object-type>' . "\r\n";
if($r->id)
@@ -206,7 +206,7 @@ function construct_activity_target($item) {
if($item['target']) {
$o = '<as:target>' . "\r\n";
- $r = @simplexml_load_string($item['target']);
+ $r = parse_xml_string($item['target']);
if($r->type)
$o .= '<as:object-type>' . xmlify($r->type) . '</as:object-type>' . "\r\n";
if($r->id)
@@ -241,8 +241,14 @@ function get_atom_elements($feed,$item) {
$res = array();
$author = $item->get_author();
- $res['author-name'] = unxmlify($author->get_name());
- $res['author-link'] = unxmlify($author->get_link());
+ if($author) {
+ $res['author-name'] = unxmlify($author->get_name());
+ $res['author-link'] = unxmlify($author->get_link());
+ }
+ else {
+ $res['author-name'] = unxmlify($feed->get_title());
+ $res['author-link'] = unxmlify($feed->get_permalink());
+ }
$res['uri'] = unxmlify($item->get_id());
$res['title'] = unxmlify($item->get_title());
$res['body'] = unxmlify($item->get_content());
@@ -343,7 +349,6 @@ function get_atom_elements($feed,$item) {
// the wild, by sanitising it and converting supported tags to bbcode before we rip out any remaining
// html.
-
if((strpos($res['body'],'<') !== false) || (strpos($res['body'],'>') !== false)) {
$res['body'] = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s',
@@ -783,7 +788,7 @@ function dfrn_deliver($owner,$contact,$atom, $dissolve = false) {
return 3;
}
- $res = simplexml_load_string($xml);
+ $res = parse_xml_string($xml);
if((intval($res->status) != 0) || (! strlen($res->challenge)) || (! strlen($res->dfrn_id)))
return (($res->status) ? $res->status : 3);
@@ -878,7 +883,7 @@ function dfrn_deliver($owner,$contact,$atom, $dissolve = false) {
return 3;
}
- $res = simplexml_load_string($xml);
+ $res = parse_xml_string($xml);
return $res->status;
@@ -916,6 +921,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee
if($feed->error())
logger('consume_feed: Error parsing XML: ' . $feed->error());
+ $permalink = $feed->get_permalink();
// Check at the feed level for updated contact name and/or photo
@@ -1230,6 +1236,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee
// Head post of a conversation. Have we seen it? If not, import it.
$item_id = $item->get_id();
+
$datarray = get_atom_elements($feed,$item);
$r = q("SELECT `uid`, `last-child`, `edited`, `body` FROM `item` WHERE `uri` = '%s' AND `uid` = %d LIMIT 1",
@@ -1275,7 +1282,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee
if(! is_array($contact))
return;
- if($contact['network'] === 'stat') {
+ if($contact['network'] === 'stat' || stristr($permalink,'twitter.com')) {
if(strlen($datarray['title']))
unset($datarray['title']);
$datarray['last-child'] = 1;
diff --git a/include/poller.php b/include/poller.php
index 3b80c1c04..9362c28b3 100644
--- a/include/poller.php
+++ b/include/poller.php
@@ -203,7 +203,7 @@ function poller_run($argv, $argc){
}
- $res = simplexml_load_string($xml);
+ $res = parse_xml_string($xml);
if(intval($res->status) == 1) {
logger("poller: $url replied status 1 - marking for death ");
diff --git a/mod/dfrn_confirm.php b/mod/dfrn_confirm.php
index 1bf1ba954..2db745d25 100644
--- a/mod/dfrn_confirm.php
+++ b/mod/dfrn_confirm.php
@@ -240,7 +240,7 @@ function dfrn_confirm_post(&$a,$handsfree = null) {
notice( t('Unexpected response from remote site: ') . EOL . $leading_junk . EOL );
}
- $xml = simplexml_load_string($res);
+ $xml = parse_xml_string($res);
$status = (int) $xml->status;
$message = unxmlify($xml->message); // human readable text of what may have gone wrong.
switch($status) {
diff --git a/mod/dfrn_poll.php b/mod/dfrn_poll.php
index 5149dc3b2..2ccfadd03 100644
--- a/mod/dfrn_poll.php
+++ b/mod/dfrn_poll.php
@@ -69,7 +69,7 @@ function dfrn_poll_init(&$a) {
if(strlen($s)) {
- $xml = simplexml_load_string($s);
+ $xml = parse_xml_string($s);
if((int) $xml->status == 1) {
$_SESSION['authenticated'] = 1;
@@ -468,7 +468,7 @@ function dfrn_poll_content(&$a) {
if(strlen($s) && strstr($s,'<?xml')) {
- $xml = simplexml_load_string($s);
+ $xml = parse_xml_string($s);
logger('dfrn_poll: profile: parsed xml: ' . print_r($xml,true), LOGGER_DATA);
diff --git a/mod/follow.php b/mod/follow.php
index 4ce3ccb82..06e81ceed 100644
--- a/mod/follow.php
+++ b/mod/follow.php
@@ -15,7 +15,8 @@ function follow_post(&$a) {
$email_conversant = false;
if($url) {
- $links = @lrdd($url);
+ $links = lrdd($url);
+
if(count($links)) {
foreach($links as $link) {
if($link['@attributes']['rel'] === NAMESPACE_DFRN)
@@ -107,7 +108,7 @@ function follow_post(&$a) {
if((! isset($vcard)) && (! $poll)) {
$ret = scrape_feed($url);
-
+ logger('mod_follow: scrape_feed returns: ' . print_r($ret,true), LOGGER_DATA);
if(count($ret) && ($ret['feed_atom'] || $ret['feed_rss'])) {
$poll = ((x($ret,'feed_atom')) ? unamp($ret['feed_atom']) : unamp($ret['feed_rss']));
$vcard = array();
@@ -156,7 +157,14 @@ function follow_post(&$a) {
}
if((! $vcard['photo']) && strlen($email))
$vcard['photo'] = gravatar_img($email);
-
+ if($poll === $profile)
+ $lnk = $feed->get_permalink();
+ if(isset($lnk) && strlen($lnk))
+ $profile = $lnk;
+ if(! (x($vcard,'fn')))
+ $vcard['fn'] = notags($feed->get_title());
+ if(! (x($vcard,'fn')))
+ $vcard['fn'] = notags($feed->get_description());
$network = 'feed';
$priority = 2;
}