aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r-- include/Scrape.php 58
-rw-r--r-- include/items.php 50
2 files changed, 76 insertions, 32 deletions
diff --git a/include/Scrape.php b/include/Scrape.php
index e4f7a0878..bb42c3bdd 100644
--- a/include/Scrape.php
+++ b/include/Scrape.php
@@ -5,12 +5,25 @@ require_once('library/HTML5/Parser.php');
if(! function_exists('scrape_dfrn')) {
function scrape_dfrn($url) {
+ $a = get_app();
+
$ret = array();
$s = fetch_url($url);
if(! $s)
return $ret;
+ $headers = $a->get_curl_headers();
+ $lines = explode("\n",$headers);
+ if(count($lines)) {
+ foreach($lines as $line) {
+ // feeds are not html - skip the html5 parser and hand back the (still empty) result array
+ if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+ return $ret;
+ }
+ }
+
+
$dom = HTML5_Parser::parse($s);
if(! $dom)
@@ -77,12 +90,26 @@ function validate_dfrn($a) {
if(! function_exists('scrape_meta')) {
function scrape_meta($url) {
+ $a = get_app();
+
$ret = array();
$s = fetch_url($url);
if(! $s)
return $ret;
+ $headers = $a->get_curl_headers();
+ $lines = explode("\n",$headers);
+ if(count($lines)) {
+ foreach($lines as $line) {
+ // feeds are not html - skip the html5 parser and hand back the (still empty) result array
+ if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+ return $ret;
+ }
+ }
+
+
+
$dom = HTML5_Parser::parse($s);
if(! $dom)
@@ -105,12 +132,24 @@ function scrape_meta($url) {
if(! function_exists('scrape_vcard')) {
function scrape_vcard($url) {
+ $a = get_app();
+
$ret = array();
$s = fetch_url($url);
if(! $s)
return $ret;
+ $headers = $a->get_curl_headers();
+ $lines = explode("\n",$headers);
+ if(count($lines)) {
+ foreach($lines as $line) {
+ // feeds are not html - skip the html5 parser and hand back the (still empty) result array
+ if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+ return $ret;
+ }
+ }
+
$dom = HTML5_Parser::parse($s);
if(! $dom)
@@ -142,12 +181,31 @@ function scrape_vcard($url) {
if(! function_exists('scrape_feed')) {
function scrape_feed($url) {
+ $a = get_app();
+
$ret = array();
$s = fetch_url($url);
if(! $s)
return $ret;
+ $headers = $a->get_curl_headers();
+ $lines = explode("\n",$headers);
+ if(count($lines)) {
+ foreach($lines as $line) {
+ if(stristr($line,'content-type:')) { // the url is itself a feed: report it by type, no html parsing needed
+ if(stristr($line,'application/atom+xml')) {
+ $ret['feed_atom'] = $url;
+ return $ret;
+ }
+ if(stristr($line,'application/rss+xml')) {
+ $ret['feed_rss'] = $url;
+ return $ret;
+ }
+ }
+ }
+ }
+
$dom = HTML5_Parser::parse($s);
if(! $dom)
diff --git a/include/items.php b/include/items.php
index 5747d301d..c29ad9e44 100644
--- a/include/items.php
+++ b/include/items.php
@@ -388,8 +388,8 @@ function get_atom_elements($feed,$item) {
$have_real_body = true;
$res['body'] = $rawenv[0]['data'];
$res['body'] = str_replace(array(' ',"\t","\r","\n"), array('','','',''),$res['body']);
- $res['body'] = base64url_decode($res['body']);
- $res['realbody'] = true;
+ // make sure nobody is trying to sneak some html tags by us
+ $res['body'] = notags(base64url_decode($res['body']));
}
$maxlen = get_max_import_size();
@@ -407,7 +407,7 @@ function get_atom_elements($feed,$item) {
// html.
- if((! $have_real_body) || (strpos($res['body'],'<')) || (strpos($res['body'],'>'))) {
+ if((strpos($res['body'],'<') !== false) || (strpos($res['body'],'>') !== false)) {
$res['body'] = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s',
'[youtube]$1[/youtube]', $res['body']);
@@ -426,10 +426,7 @@ function get_atom_elements($feed,$item) {
$res['body'] = html2bbcode($res['body']);
}
- else
- $res['body'] = escape_tags($res['body']);
-
$allow = $item->get_item_tags(NAMESPACE_DFRN,'comment-allow');
if($allow && $allow[0]['data'] == 1)
$res['last-child'] = 1;
@@ -455,14 +452,16 @@ function get_atom_elements($feed,$item) {
$rawedited = $item->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_10,'updated');
if($rawedited)
- $res['edited'] = unxmlify($rawcreated[0]['data']);
+ $res['edited'] = unxmlify($rawedited[0]['data']);
+ if((x($res,'edited')) && (! (x($res,'created'))))
+ $res['created'] = $res['edited'];
if(! $res['created'])
- $res['created'] = $item->get_date();
+ $res['created'] = $item->get_date('c');
if(! $res['edited'])
- $res['edited'] = $item->get_date();
+ $res['edited'] = $item->get_date('c');
$rawowner = $item->get_item_tags(NAMESPACE_DFRN, 'owner');
@@ -526,7 +525,7 @@ function get_atom_elements($feed,$item) {
$body = $rawobj[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['summary'][0]['data'];
// preserve a copy of the original body content in case we later need to parse out any microformat information, e.g. events
$res['object'] .= '<orig>' . xmlify($body) . '</orig>' . "\n";
- if((strpos($body,'<')) || (strpos($body,'>'))) {
+ if((strpos($body,'<') !== false) || (strpos($body,'>') !== false)) {
$body = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s',
'[youtube]$1[/youtube]', $body);
@@ -538,8 +537,6 @@ function get_atom_elements($feed,$item) {
$body = $purifier->purify($body);
$body = html2bbcode($body);
}
- else
- $body = escape_tags($body);
$res['object'] .= '<content>' . $body . '</content>' . "\n";
}
@@ -567,7 +564,7 @@ function get_atom_elements($feed,$item) {
$body = $rawobj[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['summary'][0]['data'];
// preserve a copy of the original body content in case we later need to parse out any microformat information, e.g. events
$res['object'] .= '<orig>' . xmlify($body) . '</orig>' . "\n";
- if((strpos($body,'<')) || (strpos($body,'>'))) {
+ if((strpos($body,'<') !== false) || (strpos($body,'>') !== false)) {
$body = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s',
'[youtube]$1[/youtube]', $body);
@@ -579,8 +576,6 @@ function get_atom_elements($feed,$item) {
$body = $purifier->purify($body);
$body = html2bbcode($body);
}
- else
- $body = escape_tags($body);
$res['target'] .= '<content>' . $body . '</content>' . "\n";
}
@@ -629,6 +624,13 @@ function item_store($arr) {
if(! x($arr,'type'))
$arr['type'] = 'remote';
+
+ // Shouldn't happen but we want to make absolutely sure it doesn't leak from a plugin. The body may still be unset here; it is only defaulted further down.
+
+ if((isset($arr['body'])) && ((strpos($arr['body'],'<') !== false) || (strpos($arr['body'],'>') !== false)))
+ $arr['body'] = strip_tags($arr['body']);
+
+
$arr['wall'] = ((x($arr,'wall')) ? intval($arr['wall']) : 0);
$arr['uri'] = ((x($arr,'uri')) ? notags(trim($arr['uri'])) : random_string());
$arr['author-name'] = ((x($arr,'author-name')) ? notags(trim($arr['author-name'])) : '');
@@ -657,23 +659,7 @@ function item_store($arr) {
$arr['deny_cid'] = ((x($arr,'deny_cid')) ? trim($arr['deny_cid']) : '');
$arr['deny_gid'] = ((x($arr,'deny_gid')) ? trim($arr['deny_gid']) : '');
$arr['private'] = ((x($arr,'private')) ? intval($arr['private']) : 0 );
- $arr['body'] = ((x($arr,'body')) ? escape_tags(trim($arr['body'])) : '');
-
- // The content body may have been through a lot of filtering and transport escaping by now.
- // We don't want to skip any filters, however a side effect of all this filtering
- // is that ampersands and <> may have been double encoded, depending on which filter chain
- // they came through. The presence of $res['realbody'] means we have something encoded in a
- // transport safe manner at the source and does not require any filter corrections.
-
- if(x($arr,'realbody'))
- unset($arr['realbody']);
- else {
- $arr['body'] = str_replace(
- array('&amp;amp;', '&amp;gt;', '&amp;lt;', '&amp;quot;'),
- array('&amp;' , '&gt;' , '&lt;', '&quot;'),
- $arr['body']
- );
- }
+ $arr['body'] = ((x($arr,'body')) ? trim($arr['body']) : '');
if($arr['parent-uri'] === $arr['uri']) {
$parent_id = 0;