diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/Scrape.php | 58 | ||||
-rw-r--r-- | include/items.php | 50 |
2 files changed, 76 insertions, 32 deletions
diff --git a/include/Scrape.php b/include/Scrape.php index e4f7a0878..bb42c3bdd 100644 --- a/include/Scrape.php +++ b/include/Scrape.php @@ -5,12 +5,25 @@ require_once('library/HTML5/Parser.php'); if(! function_exists('scrape_dfrn')) { function scrape_dfrn($url) { + $a = get_app(); + $ret = array(); $s = fetch_url($url); if(! $s) return $ret; + $headers = $a->get_curl_headers(); + $lines = explode("\n",$headers); + if(count($lines)) { + foreach($lines as $line) { + // don't try and run feeds through the html5 parser + if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml')))) + return ret; + } + } + + $dom = HTML5_Parser::parse($s); if(! $dom) @@ -77,12 +90,26 @@ function validate_dfrn($a) { if(! function_exists('scrape_meta')) { function scrape_meta($url) { + $a = get_app(); + $ret = array(); $s = fetch_url($url); if(! $s) return $ret; + $headers = $a->get_curl_headers(); + $lines = explode("\n",$headers); + if(count($lines)) { + foreach($lines as $line) { + // don't try and run feeds through the html5 parser + if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml')))) + return ret; + } + } + + + $dom = HTML5_Parser::parse($s); if(! $dom) @@ -105,12 +132,24 @@ function scrape_meta($url) { if(! function_exists('scrape_vcard')) { function scrape_vcard($url) { + $a = get_app(); + $ret = array(); $s = fetch_url($url); if(! $s) return $ret; + $headers = $a->get_curl_headers(); + $lines = explode("\n",$headers); + if(count($lines)) { + foreach($lines as $line) { + // don't try and run feeds through the html5 parser + if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml')))) + return ret; + } + } + $dom = HTML5_Parser::parse($s); if(! $dom) @@ -142,12 +181,31 @@ function scrape_vcard($url) { if(! function_exists('scrape_feed')) { function scrape_feed($url) { + $a = get_app(); + $ret = array(); $s = fetch_url($url); if(! $s) return $ret; + $headers = $a->get_curl_headers(); + $lines = explode("\n",$headers); + if(count($lines)) { + foreach($lines as $line) { + if(stristr($line,'content-type:')) { + if(stristr($line,'application/atom+xml')) { + $ret['feed_atom'] = $url; + return $ret; + } + if(stristr($line,'application/rss+xml')) { + $ret['feed_rss'] = $url; + return ret; + } + } + } + } + $dom = HTML5_Parser::parse($s); if(! $dom) diff --git a/include/items.php b/include/items.php index 5747d301d..c29ad9e44 100644 --- a/include/items.php +++ b/include/items.php @@ -388,8 +388,8 @@ function get_atom_elements($feed,$item) { $have_real_body = true; $res['body'] = $rawenv[0]['data']; $res['body'] = str_replace(array(' ',"\t","\r","\n"), array('','','',''),$res['body']); - $res['body'] = base64url_decode($res['body']); - $res['realbody'] = true; + // make sure nobody is trying to sneak some html tags by us + $res['body'] = notags(base64url_decode($res['body'])); } $maxlen = get_max_import_size(); @@ -407,7 +407,7 @@ function get_atom_elements($feed,$item) { // html. - if((! $have_real_body) || (strpos($res['body'],'<')) || (strpos($res['body'],'>'))) { + if((strpos($res['body'],'<') !== false) || (strpos($res['body'],'>') !== false)) { $res['body'] = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s', '[youtube]$1[/youtube]', $res['body']); @@ -426,10 +426,7 @@ function get_atom_elements($feed,$item) { $res['body'] = html2bbcode($res['body']); } - else - $res['body'] = escape_tags($res['body']); - $allow = $item->get_item_tags(NAMESPACE_DFRN,'comment-allow'); if($allow && $allow[0]['data'] == 1) $res['last-child'] = 1; @@ -455,14 +452,16 @@ function get_atom_elements($feed,$item) { $rawedited = $item->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_10,'updated'); if($rawedited) - $res['edited'] = unxmlify($rawcreated[0]['data']); + $res['edited'] = unxmlify($rawedited[0]['data']); + if((x($res,'edited')) && (! (x($res,'created')))) + $res['created'] = $res['edited']; if(! $res['created']) - $res['created'] = $item->get_date(); + $res['created'] = $item->get_date('c'); if(! $res['edited']) - $res['edited'] = $item->get_date(); + $res['edited'] = $item->get_date('c'); $rawowner = $item->get_item_tags(NAMESPACE_DFRN, 'owner'); @@ -526,7 +525,7 @@ function get_atom_elements($feed,$item) { $body = $rawobj[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['summary'][0]['data']; // preserve a copy of the original body content in case we later need to parse out any microformat information, e.g. events $res['object'] .= '<orig>' . xmlify($body) . '</orig>' . "\n"; - if((strpos($body,'<')) || (strpos($body,'>'))) { + if((strpos($body,'<') !== false) || (strpos($body,'>') !== false)) { $body = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s', '[youtube]$1[/youtube]', $body); @@ -538,8 +537,6 @@ function get_atom_elements($feed,$item) { $body = $purifier->purify($body); $body = html2bbcode($body); } - else - $body = escape_tags($body); $res['object'] .= '<content>' . $body . '</content>' . "\n"; } @@ -567,7 +564,7 @@ function get_atom_elements($feed,$item) { $body = $rawobj[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['summary'][0]['data']; // preserve a copy of the original body content in case we later need to parse out any microformat information, e.g. events $res['object'] .= '<orig>' . xmlify($body) . '</orig>' . "\n"; - if((strpos($body,'<')) || (strpos($body,'>'))) { + if((strpos($body,'<') !== false) || (strpos($body,'>') !== false)) { $body = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s', '[youtube]$1[/youtube]', $body); @@ -579,8 +576,6 @@ function get_atom_elements($feed,$item) { $body = $purifier->purify($body); $body = html2bbcode($body); } - else - $body = escape_tags($body); $res['target'] .= '<content>' . $body . '</content>' . "\n"; } @@ -629,6 +624,13 @@ function item_store($arr) { if(! x($arr,'type')) $arr['type'] = 'remote'; + + // Shouldn't happen but we want to make absolutely sure it doesn't leak from a plugin. + + if((strpos($arr['body'],'<') !== false) || (strpos($arr['body'],'>') !== false)) + $arr['body'] = strip_tags($arr['body']); + + $arr['wall'] = ((x($arr,'wall')) ? intval($arr['wall']) : 0); $arr['uri'] = ((x($arr,'uri')) ? notags(trim($arr['uri'])) : random_string()); $arr['author-name'] = ((x($arr,'author-name')) ? notags(trim($arr['author-name'])) : ''); @@ -657,23 +659,7 @@ function item_store($arr) { $arr['deny_cid'] = ((x($arr,'deny_cid')) ? trim($arr['deny_cid']) : ''); $arr['deny_gid'] = ((x($arr,'deny_gid')) ? trim($arr['deny_gid']) : ''); $arr['private'] = ((x($arr,'private')) ? intval($arr['private']) : 0 ); - $arr['body'] = ((x($arr,'body')) ? escape_tags(trim($arr['body'])) : ''); - - // The content body may have been through a lot of filtering and transport escaping by now. - // We don't want to skip any filters, however a side effect of all this filtering - // is that ampersands and <> may have been double encoded, depending on which filter chain - // they came through. The presence of $res['realbody'] means we have something encoded in a - // transport safe manner at the source and does not require any filter corrections. - - if(x($arr,'realbody')) - unset($arr['realbody']); - else { - $arr['body'] = str_replace( - array('&amp;', '&gt;', '&lt;', '&quot;'), - array('&' , '>' , '<', '"'), - $arr['body'] - ); - } + $arr['body'] = ((x($arr,'body')) ? trim($arr['body']) : ''); if($arr['parent-uri'] === $arr['uri']) { $parent_id = 0; |