From 2d9718fee9f5c07fe8098b892f9b096fcc68c43e Mon Sep 17 00:00:00 2001 From: Friendika Date: Sun, 19 Dec 2010 19:04:37 -0800 Subject: do a slightly better job at finding relevant content from scraping submitted links --- mod/parse_url.php | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) (limited to 'mod/parse_url.php') diff --git a/mod/parse_url.php b/mod/parse_url.php index aa71893ab..acfe624cb 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -6,6 +6,8 @@ function parse_url_content(&$a) { $url = trim($_GET['url']); + $text = null; + $template = "%s%s"; if($url) @@ -34,15 +36,38 @@ function parse_url_content(&$a) { } } - $items = $dom->getElementsByTagName('p'); - if($items) { - foreach($items as $item) { - $text = $item->textContent; - $text = strip_tags($text); - if(strlen($text) < 100) - continue; - $text = substr($text,0,250) . '...' ; - break; + + $divs = $dom->getElementsByTagName('div'); + if($divs) { + foreach($divs as $div) { + $class = $div->getAttribute('class'); + if($class && stristr($class,'article')) { + $items = $div->getElementsByTagName('p'); + if($items) { + foreach($items as $item) { + $text = $item->textContent; + $text = strip_tags($text); + if(strlen($text) < 100) + continue; + $text = substr($text,0,250) . '...' ; + break; + } + } + } + } + } + + if(! $text) { + $items = $dom->getElementsByTagName('p'); + if($items) { + foreach($items as $item) { + $text = $item->textContent; + $text = strip_tags($text); + if(strlen($text) < 100) + continue; + $text = substr($text,0,250) . '...' ; + break; + } } } -- cgit v1.2.3