aboutsummaryrefslogtreecommitdiffstats
path: root/mod/parse_url.php
blob: b10d11c4bdc44ca686229f64a59bc6a20ad19940 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
<?php

require_once('library/HTML5/Parser.php');
require_once('library/HTMLPurifier.auto.php');

/**
 * Pick the first paragraph that looks like body text and turn it into an excerpt.
 *
 * A paragraph is skipped when its text contains "<script" or when, after
 * strip_tags(), it is shorter than 100 bytes. The first paragraph that
 * survives is cut to 250 characters and suffixed with "...".
 *
 * @param DOMNodeList|array $paragraphs nodes exposing a textContent property
 * @return string the excerpt, or '' when no paragraph qualifies
 */
function parse_url_excerpt($paragraphs) {
	foreach($paragraphs as $paragraph) {
		$candidate = $paragraph->textContent;

		// Never use a paragraph that carries script markup as a preview.
		if(stristr($candidate,'<script'))
			continue;

		$candidate = strip_tags($candidate);

		// Short paragraphs are not treated as article body text.
		if(strlen($candidate) < 100)
			continue;

		// DOM text is UTF-8; cut on character boundaries when mbstring is
		// available so a multi-byte character is never split in half.
		if(function_exists('mb_substr'))
			return mb_substr($candidate,0,250,'UTF-8') . '...';

		return substr($candidate,0,250) . '...';
	}

	return '';
}

/**
 * Echo an HTML link preview for the hex-encoded URL passed in $_GET['url'].
 *
 * Flow:
 *  1. Decode the URL and offer it to the 'parse_link' hook; if a hook fills
 *     in 'text', that text is echoed as-is.
 *  2. Otherwise fetch the URL, purify the markup, parse it into a DOM and
 *     look for a title plus a first body paragraph.
 *  3. Echo "<br /><a href=URL>TITLE-or-URL</a>[<blockquote>excerpt</blockquote>]<br />".
 *
 * Every exit path ends with killme(); nothing is returned. The URL, title and
 * excerpt originate from the request / the remote page and are HTML-escaped
 * before being echoed.
 *
 * @param mixed $a passed by reference by the caller; not used in this function
 * @return void
 */
function parse_url_content(&$a) {

	// The parameter may be missing or an array (url[]=...); treat both as empty.
	$raw = (isset($_GET['url']) && is_string($_GET['url'])) ? $_GET['url'] : '';

	logger('parse_url: ' . $raw);

	// hex2bin() can return false on malformed input; normalise that to ''.
	$decoded = hex2bin($raw);
	$url = is_string($decoded) ? trim($decoded) : '';

	logger('parse_url: ' . $url);

	// Initialised up front so the final output never reads an undefined value.
	$text  = '';
	$title = '';

	$template = "<br /><a href=\"%s\" >%s</a>%s<br />";

	// Escaping note: the ISO-8859-1 charset makes htmlspecialchars() work
	// byte-wise. It only rewrites the ASCII characters & < > " ' and therefore
	// never rejects (and blanks) input that is not valid UTF-8, while leaving
	// multi-byte UTF-8 sequences untouched.
	$safe_url = htmlspecialchars($url, ENT_QUOTES, 'ISO-8859-1');

	$arr = array('url' => $url, 'text' => '');

	call_hooks('parse_link', $arr);

	// A hook that produced its own preview wins; its output is emitted unchanged.
	if(strlen($arr['text'])) {
		echo $arr['text'];
		killme();
	}

	if($url) {
		$s = fetch_url($url);
	} else {
		echo '';
		killme();
	}

	logger('parse_url: data: ' . $s, LOGGER_DATA);

	// Nothing fetched: fall back to a bare link.
	if(! $s) {
		echo sprintf($template,$safe_url,$safe_url,'');
		killme();
	}

	// Cheap title grab from the raw markup, before the purifier gets a chance
	// to drop the <title> element. strpos() may legitimately return 0, so the
	// result is compared against false rather than tested for truthiness.
	$title_pos = strpos($s,'<title>');
	if($title_pos !== false) {
		$title = (string) substr($s,$title_pos + 7,64);
		$tag_pos = strpos($title,'<');
		if($tag_pos !== false)
			$title = strip_tags(substr($title,0,$tag_pos));
	}

	$config = HTMLPurifier_Config::createDefault();
	$config->set('Cache.DefinitionImpl', null);

	$purifier = new HTMLPurifier($config);
	$s = $purifier->purify($s);

//	logger('parse_url: purified: ' . $s, LOGGER_DATA);

	// Parser diagnostics are deliberately suppressed; failure is handled below.
	$dom = @HTML5_Parser::parse($s);

	if(! $dom) {
		echo sprintf($template,$safe_url,$safe_url,'');
		killme();
	}

	// A <title> element in the parsed document overrides the raw-markup guess.
	$items = $dom->getElementsByTagName('title');

	if($items) {
		foreach($items as $item) {
			$title = trim($item->textContent);
			break;
		}
	}

	// Prefer paragraphs inside a <div> whose class mentions "article" or "content".
	$divs = $dom->getElementsByTagName('div');
	if($divs) {
		foreach($divs as $div) {
			$class = $div->getAttribute('class');
			if($class && (stristr($class,'article') || stristr($class,'content'))) {
				$text = parse_url_excerpt($div->getElementsByTagName('p'));
			}
			if($text)
				break;
		}
	}

	// Otherwise consider every paragraph in the document.
	if(! $text) {
		$text = parse_url_excerpt($dom->getElementsByTagName('p'));
	}

	if(strlen($text)) {
		$text = '<br /><br /><blockquote>'
			. htmlspecialchars($text, ENT_QUOTES, 'ISO-8859-1')
			. '</blockquote><br />';
	}

	$title = str_replace("\n",'',$title);

	// The title may come from raw markup that already contains entities, so
	// existing entities are not encoded a second time (double_encode = false).
	// strlen() is used so that a title of "0" is not mistaken for "no title".
	$label = strlen($title)
		? htmlspecialchars($title, ENT_QUOTES, 'ISO-8859-1', false)
		: $safe_url;

	echo sprintf($template,$safe_url,$label,$text);
	killme();
}