Do a slightly better job of finding relevant content when scraping submitted links

author: Friendika <info@friendika.com> 2010-12-19 19:04:37 -0800
committer: Friendika <info@friendika.com> 2010-12-19 19:04:37 -0800
commit: 2d9718fee9f5c07fe8098b892f9b096fcc68c43e (patch)
tree: a714631ea94bd57a7becd04f8081edf8cf1b2b3e /mod/parse_url.php
parent: 24a9a41f969cfd7625847c77a949e4b279f68406 (diff)
download: volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.tar.gz
volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.tar.bz2
volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.zip
1 file changed, 34 insertions, 9 deletions
diff --git a/mod/parse_url.php b/mod/parse_url.php
index aa71893ab..acfe624cb 100644
--- a/mod/parse_url.php
+++ b/mod/parse_url.php
@@ -6,6 +6,8 @@ function parse_url_content(&$a) {
 
 	$url = trim($_GET['url']);
 
+	$text = null;
+
 	$template = "<a href=\"%s\" >%s</a>%s";
 
 	if($url) 
@@ -34,15 +36,38 @@ function parse_url_content(&$a) {
 		}
 	}
 
-	$items = $dom->getElementsByTagName('p');
-	if($items) {
-		foreach($items as $item) {
-			$text = $item->textContent;
-			$text = strip_tags($text);
-			if(strlen($text) < 100)
-				continue;
-			$text = substr($text,0,250) . '...' ;
-			break;
+
+	$divs = $dom->getElementsByTagName('div');
+	if($divs) {
+		foreach($divs as $div) {
+			$class = $div->getAttribute('class');
+			if($class && stristr($class,'article')) {
+				$items = $div->getElementsByTagName('p');
+				if($items) {
+					foreach($items as $item) {
+						$text = $item->textContent;
+						$text = strip_tags($text);
+						if(strlen($text) < 100)
+							continue;
+						$text = substr($text,0,250) . '...' ;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	if(! $text) {
+		$items = $dom->getElementsByTagName('p');
+		if($items) {
+			foreach($items as $item) {
+				$text = $item->textContent;
+				$text = strip_tags($text);
+				if(strlen($text) < 100)
+					continue;
+				$text = substr($text,0,250) . '...' ;
+				break;
+			}
 		}
 	}
author	Friendika <info@friendika.com>	2010-12-19 19:04:37 -0800
committer	Friendika <info@friendika.com>	2010-12-19 19:04:37 -0800
commit	2d9718fee9f5c07fe8098b892f9b096fcc68c43e (patch)
tree	a714631ea94bd57a7becd04f8081edf8cf1b2b3e /mod/parse_url.php
parent	24a9a41f969cfd7625847c77a949e4b279f68406 (diff)
download	volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.tar.gz volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.tar.bz2 volse-hubzilla-2d9718fee9f5c07fe8098b892f9b096fcc68c43e.zip