diff options
author | Friendika <info@friendika.com> | 2011-04-04 19:36:18 -0700 |
---|---|---|
committer | Friendika <info@friendika.com> | 2011-04-04 19:36:18 -0700 |
commit | 793967a1d3c23fcf1f3b00a2832f51e6f473f4bd (patch) | |
tree | ec045409190c042c621874f10cf38cf00488b9b3 | |
parent | 178362e50b846aef1caf4e191ea0394c5d636857 (diff) | |
download | volse-hubzilla-793967a1d3c23fcf1f3b00a2832f51e6f473f4bd.tar.gz volse-hubzilla-793967a1d3c23fcf1f3b00a2832f51e6f473f4bd.tar.bz2 volse-hubzilla-793967a1d3c23fcf1f3b00a2832f51e6f473f4bd.zip |
better handling of troublesome feeds.
-rw-r--r-- | boot.php | 44 | ||||
-rw-r--r-- | include/Scrape.php | 2 | ||||
-rw-r--r-- | include/items.php | 23 | ||||
-rw-r--r-- | include/poller.php | 2 | ||||
-rw-r--r-- | mod/dfrn_confirm.php | 2 | ||||
-rw-r--r-- | mod/dfrn_poll.php | 4 | ||||
-rw-r--r-- | mod/follow.php | 14 |
7 files changed, 63 insertions, 28 deletions
@@ -1478,7 +1478,9 @@ function lrdd($uri) { return array(); logger('lrdd: host_meta: ' . $xml, LOGGER_DATA); - $h = simplexml_load_string($xml); + + $h = parse_xml_string($xml); + $arr = convert_xml_element_to_array($h); if(isset($arr['xrd']['property'])) { @@ -1550,16 +1552,19 @@ function lrdd($uri) { $headers = $a->get_curl_headers(); logger('lrdd: headers=' . $headers, LOGGER_DEBUG); - require_once('library/HTML5/Parser.php'); - $dom = @HTML5_Parser::parse($html); - - if($dom) { - $items = $dom->getElementsByTagName('link'); - foreach($items as $item) { - $x = $item->getAttribute('rel'); - if($x == "lrdd") { - $pagelink = $item->getAttribute('href'); - break; + // don't try and parse raw xml as html + if(! strstr($html,'<?xml')) { + require_once('library/HTML5/Parser.php'); + $dom = @HTML5_Parser::parse($html); + + if($dom) { + $items = $dom->getElementsByTagName('link'); + foreach($items as $item) { + $x = $item->getAttribute('rel'); + if($x == "lrdd") { + $pagelink = $item->getAttribute('href'); + break; + } } } } @@ -1638,7 +1643,7 @@ function fetch_xrd_links($url) { return array(); logger('fetch_xrd_links: ' . $xml, LOGGER_DATA); - $h = simplexml_load_string($xml); + $h = parse_xml_string($xml); $arr = convert_xml_element_to_array($h); $links = array(); @@ -2759,3 +2764,18 @@ function lang_selector() { $o .= '</select></form></div>'; return $o; }} + + +if(! function_exists('parse_xml_string')) { +function parse_xml_string($s) { + if(! strstr($s,'<?xml')) + return false; + $s2 = substr($s,strpos($s,'<?xml')); + libxml_use_internal_errors(true); + $x = @simplexml_load_string($s2); + if(count(libxml_get_errors())) + foreach(libxml_get_errors() as $err) + logger('libxml: parse: ' . $err, LOGGER_DATA); + libxml_clear_errors(); + return $x; +}} diff --git a/include/Scrape.php b/include/Scrape.php index ff9899252..21820ddaf 100644 --- a/include/Scrape.php +++ b/include/Scrape.php @@ -216,7 +216,7 @@ function scrape_feed($url) { } if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) { $ret['feed_rss'] = $url; - return ret; + return $ret; } } } diff --git a/include/items.php b/include/items.php index 1dd39d2ba..a9ac85969 100644 --- a/include/items.php +++ b/include/items.php @@ -180,7 +180,7 @@ function construct_activity_object($item) { if($item['object']) { $o = '<as:object>' . "\r\n"; - $r = @simplexml_load_string($item['object']); + $r = parse_xml_string($item['object']); if($r->type) $o .= '<as:object-type>' . xmlify($r->type) . '</as:object-type>' . "\r\n"; if($r->id) @@ -206,7 +206,7 @@ function construct_activity_target($item) { if($item['target']) { $o = '<as:target>' . "\r\n"; - $r = @simplexml_load_string($item['target']); + $r = parse_xml_string($item['target']); if($r->type) $o .= '<as:object-type>' . xmlify($r->type) . '</as:object-type>' . "\r\n"; if($r->id) @@ -241,8 +241,14 @@ function get_atom_elements($feed,$item) { $res = array(); $author = $item->get_author(); - $res['author-name'] = unxmlify($author->get_name()); - $res['author-link'] = unxmlify($author->get_link()); + if($author) { + $res['author-name'] = unxmlify($author->get_name()); + $res['author-link'] = unxmlify($author->get_link()); + } + else { + $res['author-name'] = unxmlify($feed->get_title()); + $res['author-link'] = unxmlify($feed->get_permalink()); + } $res['uri'] = unxmlify($item->get_id()); $res['title'] = unxmlify($item->get_title()); $res['body'] = unxmlify($item->get_content()); @@ -343,7 +349,6 @@ function get_atom_elements($feed,$item) { // the wild, by sanitising it and converting supported tags to bbcode before we rip out any remaining // html. - if((strpos($res['body'],'<') !== false) || (strpos($res['body'],'>') !== false)) { $res['body'] = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s', @@ -783,7 +788,7 @@ function dfrn_deliver($owner,$contact,$atom, $dissolve = false) { return 3; } - $res = simplexml_load_string($xml); + $res = parse_xml_string($xml); if((intval($res->status) != 0) || (! strlen($res->challenge)) || (! strlen($res->dfrn_id))) return (($res->status) ? $res->status : 3); @@ -878,7 +883,7 @@ function dfrn_deliver($owner,$contact,$atom, $dissolve = false) { return 3; } - $res = simplexml_load_string($xml); + $res = parse_xml_string($xml); return $res->status; @@ -916,6 +921,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee if($feed->error()) logger('consume_feed: Error parsing XML: ' . $feed->error()); + $permalink = $feed->get_permalink(); // Check at the feed level for updated contact name and/or photo @@ -1230,6 +1236,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee // Head post of a conversation. Have we seen it? If not, import it. $item_id = $item->get_id(); + $datarray = get_atom_elements($feed,$item); $r = q("SELECT `uid`, `last-child`, `edited`, `body` FROM `item` WHERE `uri` = '%s' AND `uid` = %d LIMIT 1", @@ -1275,7 +1282,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee if(! is_array($contact)) return; - if($contact['network'] === 'stat') { + if($contact['network'] === 'stat' || stristr($permalink,'twitter.com')) { if(strlen($datarray['title'])) unset($datarray['title']); $datarray['last-child'] = 1; diff --git a/include/poller.php b/include/poller.php index 3b80c1c04..9362c28b3 100644 --- a/include/poller.php +++ b/include/poller.php @@ -203,7 +203,7 @@ function poller_run($argv, $argc){ } - $res = simplexml_load_string($xml); + $res = parse_xml_string($xml); if(intval($res->status) == 1) { logger("poller: $url replied status 1 - marking for death "); diff --git a/mod/dfrn_confirm.php b/mod/dfrn_confirm.php index 1bf1ba954..2db745d25 100644 --- a/mod/dfrn_confirm.php +++ b/mod/dfrn_confirm.php @@ -240,7 +240,7 @@ function dfrn_confirm_post(&$a,$handsfree = null) { notice( t('Unexpected response from remote site: ') . EOL . $leading_junk . EOL ); } - $xml = simplexml_load_string($res); + $xml = parse_xml_string($res); $status = (int) $xml->status; $message = unxmlify($xml->message); // human readable text of what may have gone wrong. switch($status) { diff --git a/mod/dfrn_poll.php b/mod/dfrn_poll.php index 5149dc3b2..2ccfadd03 100644 --- a/mod/dfrn_poll.php +++ b/mod/dfrn_poll.php @@ -69,7 +69,7 @@ function dfrn_poll_init(&$a) { if(strlen($s)) { - $xml = simplexml_load_string($s); + $xml = parse_xml_string($s); if((int) $xml->status == 1) { $_SESSION['authenticated'] = 1; @@ -468,7 +468,7 @@ function dfrn_poll_content(&$a) { if(strlen($s) && strstr($s,'<?xml')) { - $xml = simplexml_load_string($s); + $xml = parse_xml_string($s); logger('dfrn_poll: profile: parsed xml: ' . print_r($xml,true), LOGGER_DATA); diff --git a/mod/follow.php b/mod/follow.php index 4ce3ccb82..06e81ceed 100644 --- a/mod/follow.php +++ b/mod/follow.php @@ -15,7 +15,8 @@ function follow_post(&$a) { $email_conversant = false; if($url) { - $links = @lrdd($url); + $links = lrdd($url); + if(count($links)) { foreach($links as $link) { if($link['@attributes']['rel'] === NAMESPACE_DFRN) @@ -107,7 +108,7 @@ function follow_post(&$a) { if((! isset($vcard)) && (! $poll)) { $ret = scrape_feed($url); - + logger('mod_follow: scrape_feed returns: ' . print_r($ret,true), LOGGER_DATA); if(count($ret) && ($ret['feed_atom'] || $ret['feed_rss'])) { $poll = ((x($ret,'feed_atom')) ? unamp($ret['feed_atom']) : unamp($ret['feed_rss'])); $vcard = array(); @@ -156,7 +157,14 @@ function follow_post(&$a) { } if((! $vcard['photo']) && strlen($email)) $vcard['photo'] = gravatar_img($email); - + if($poll === $profile) + $lnk = $feed->get_permalink(); + if(isset($lnk) && strlen($lnk)) + $profile = $lnk; + if(! (x($vcard,'fn'))) + $vcard['fn'] = notags($feed->get_title()); + if(! (x($vcard,'fn'))) + $vcard['fn'] = notags($feed->get_description()); $network = 'feed'; $priority = 2; } |