diff options
author | Friendika <info@friendika.com> | 2010-12-19 19:04:37 -0800 |
---|---|---|
committer | Friendika <info@friendika.com> | 2010-12-19 19:04:37 -0800 |
commit | 2d9718fee9f5c07fe8098b892f9b096fcc68c43e (patch) | |
tree | a714631ea94bd57a7becd04f8081edf8cf1b2b3e | |
parent | 24a9a41f969cfd7625847c77a949e4b279f68406 (diff) | |
download | volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.tar.gz volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.tar.bz2 volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.zip |
Do a slightly better job of finding relevant content when scraping submitted links
-rw-r--r-- | mod/parse_url.php | 43 |
1 files changed, 34 insertions, 9 deletions
diff --git a/mod/parse_url.php b/mod/parse_url.php index aa71893ab..acfe624cb 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -6,6 +6,8 @@ function parse_url_content(&$a) { $url = trim($_GET['url']); + $text = null; + $template = "<a href=\"%s\" >%s</a>%s"; if($url) @@ -34,15 +36,38 @@ function parse_url_content(&$a) { } } - $items = $dom->getElementsByTagName('p'); - if($items) { - foreach($items as $item) { - $text = $item->textContent; - $text = strip_tags($text); - if(strlen($text) < 100) - continue; - $text = substr($text,0,250) . '...' ; - break; + + $divs = $dom->getElementsByTagName('div'); + if($divs) { + foreach($divs as $div) { + $class = $div->getAttribute('class'); + if($class && stristr($class,'article')) { + $items = $div->getElementsByTagName('p'); + if($items) { + foreach($items as $item) { + $text = $item->textContent; + $text = strip_tags($text); + if(strlen($text) < 100) + continue; + $text = substr($text,0,250) . '...' ; + break; + } + } + } + } + } + + if(! $text) { + $items = $dom->getElementsByTagName('p'); + if($items) { + foreach($items as $item) { + $text = $item->textContent; + $text = strip_tags($text); + if(strlen($text) < 100) + continue; + $text = substr($text,0,250) . '...' ; + break; + } } } |