<?php
namespace ticky;
/**
 * Static helpers for fetching a remote page and scraping links and
 * delimited fragments (title / time / content) out of its HTML.
 */
class collection {
    /**
     * URL of the page most recently requested through get_content().
     * get_all_url() uses it as the base for resolving relative links.
     */
    public static $url;

    /**
     * Downloads a page and remembers its URL as the current base URL.
     *
     * Uses cURL when the extension is loaded, otherwise file_get_contents().
     *
     * @param string $url Address to fetch.
     * @return string Trimmed response body, or '' when the request failed.
     */
    public static function get_content($url) {
        self::$url = $url;
        if (extension_loaded('curl')) {
            $ch = curl_init();
            if ($ch === false) {
                return '';
            }
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            // NOTE(review): certificate and host verification are switched off,
            // so HTTPS responses are not authenticated. Kept as-is because
            // enabling them would change which sites can be fetched.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = @file_get_contents($url);
        }
        // Both transports return false on failure; normalise that to ''.
        return is_string($content) ? trim($content) : '';
    }

    /**
     * Returns the text located between the first $start marker and the first
     * $end marker that follows it. Line breaks are removed from the document
     * and from both markers before matching.
     *
     * @param string $html  Document to search.
     * @param string $start Opening marker.
     * @param string $end   Closing marker.
     * @return string '' when $html is empty or $start is not found; $html
     *                unchanged when either marker is empty; otherwise the
     *                trimmed fragment (everything after $start when $end is
     *                not found).
     */
    public static function get_sub_content($html, $start, $end) {
        if (empty($html)) {
            return '';
        }
        if ($start == '' || $end == '') {
            return $html;
        }
        $line_breaks = array("\r", "\n");
        $start = trim(str_replace($line_breaks, '', $start));
        $end = trim(str_replace($line_breaks, '', $end));
        // A marker made only of whitespace would become an empty explode()
        // separator; treat it like a missing marker instead.
        if ($start === '' || $end === '') {
            return $html;
        }
        $flat = str_replace($line_breaks, '', $html);
        // Limit 2: split on the first occurrence only, so a repeated start
        // marker inside the fragment does not cut the result short.
        $after_start = explode($start, $flat, 2);
        if (!isset($after_start[1])) {
            return '';
        }
        $fragment = explode($end, $after_start[1], 2);
        return trim($fragment[0]);
    }

    /**
     * Collects the distinct links (<a href=...>) found in a document.
     *
     * Relative links are resolved against self::$url. Links without an href,
     * without visible text, or resolving to an empty URL are skipped. When a
     * URL appears several times only its first occurrence is kept.
     *
     * @param string $html        Document to scan.
     * @param string $url_contain If non-empty, keep only hrefs containing this text.
     * @param string $url_except  If non-empty, drop hrefs containing this text.
     * @return array array('url' => string[], 'title' => string[]) with parallel
     *               entries, or an empty array when no link qualifies.
     */
    public static function get_all_url($html, $url_contain = '', $url_except = '') {
        $html = str_replace(array("\r", "\n"), '', $html);
        // Lazy match up to the nearest </a>; the link text may start with any
        // character (including "a" or "/").
        if (!preg_match_all('/<a\s([^>]*)>(.*?)<\/a>/is', $html, $out)) {
            return array();
        }
        $arr = array();
        $seen = array();
        foreach ($out[1] as $k => $attributes) {
            if (!preg_match('/href=[\'"]?([^\'" ]*)[\'"]?/i', $attributes, $match_out)) {
                continue;
            }
            $href = $match_out[1];
            if ($url_contain && strpos($href, $url_contain) === false) {
                continue;
            }
            if ($url_except && strpos($href, $url_except) !== false) {
                continue;
            }
            $link = self::url_check($href, self::$url);
            $title = strip_tags($out[2][$k]);
            // Strict comparison so that a title of "0" is not discarded.
            if (empty($link) || $title === '') {
                continue;
            }
            if (isset($seen[$link])) {
                continue;
            }
            $seen[$link] = true;
            $arr['url'][] = $link;
            $arr['title'][] = $title;
        }
        return $arr;
    }

    /**
     * Extracts title, publication time and content from a page.
     *
     * Recognised $config keys: 'title_rule', 'time_rule', 'content_rule'
     * (each array(start, end) markers for get_sub_content()) and
     * 'title_html_rule', 'time_html_rule', 'content_html_rule' (each a list of
     * regex fragments whose matches are removed from the extracted text).
     * Missing keys are treated as empty.
     *
     * @param string $html   Page source.
     * @param array  $config Extraction rules, see above.
     * @return array array('title' => string, 'inputtime' => int, 'content' => string).
     *               'inputtime' is the current time when no time rule is set
     *               or the extracted text cannot be parsed by strtotime().
     */
    public static function get_filter_html($html, $config = array()) {
        // NOW_TIME is defined elsewhere in the project; fall back to time()
        // so the class also works where the constant is not defined.
        $now = defined('NOW_TIME') ? NOW_TIME : time();

        $data = array();
        $data['title'] = self::extract_field($html, $config, 'title');

        $data['inputtime'] = $now;
        $time_start = isset($config['time_rule'][0]) ? $config['time_rule'][0] : '';
        $time_end = isset($config['time_rule'][1]) ? $config['time_rule'][1] : '';
        if ($time_start != '' && $time_end != '') {
            $time_text = self::extract_field($html, $config, 'time');
            if (!empty($time_text)) {
                $timestamp = strtotime($time_text);
                // strtotime() returns false for text it cannot parse.
                if ($timestamp !== false) {
                    $data['inputtime'] = $timestamp;
                }
            }
        }

        $data['content'] = self::extract_field($html, $config, 'content');
        return $data;
    }

    /**
     * Splits a string on a separator after removing line breaks, dropping a
     * single trailing empty element.
     *
     * @param string $separator Delimiter passed to explode(); must not be empty.
     * @param string $string    Text to split.
     * @return array The pieces, or array('', '') when $string is empty.
     */
    public static function myexp($separator, $string) {
        if (empty($string)) {
            return array('', '');
        }
        $string = str_replace(array("\r", "\n"), '', $string);
        $arr = explode($separator, $string);
        if (end($arr) === '') {
            array_pop($arr);
        }
        return $arr;
    }

    /**
     * Removes every match of the given regex fragments from a string.
     *
     * Each fragment is wrapped as /fragment/i. Rules are applied one at a
     * time so that a single invalid rule is skipped instead of discarding
     * the whole text.
     *
     * @param string $html   Text to clean.
     * @param array  $config List of regex fragments (without delimiters).
     * @return string $html unchanged when $config is not a non-empty array,
     *                otherwise the cleaned, trimmed text.
     */
    protected static function replace_item($html, $config) {
        if (!is_array($config) || empty($config)) {
            return $html;
        }
        foreach ($config as $rule) {
            if (!is_string($rule) || $rule === '') {
                continue;
            }
            // Escape only slashes that are not already escaped, i.e. those
            // preceded by an even number of backslashes.
            $escaped = preg_replace('~(?<!\\\\)((?:\\\\\\\\)*)/~', '$1\\\\/', $rule);
            $cleaned = preg_replace('/' . $escaped . '/i', '', $html);
            // preg_replace() returns null when the pattern is invalid.
            if ($cleaned !== null) {
                $html = $cleaned;
            }
        }
        return trim($html);
    }

    /**
     * Turns a link found in a page into an absolute URL.
     *
     * @param string $url     Link as written in the page.
     * @param string $baseurl URL of the page the link was found on.
     * @return string '' for empty links and javascript:/mailto:/tel:/data:
     *                links; the link unchanged when it already contains
     *                '://' or when $baseurl has no scheme and host;
     *                otherwise the link resolved against $baseurl.
     */
    protected static function url_check($url, $baseurl) {
        $url = (string) $url;
        if ($url === '' || preg_match('/^(?:javascript|mailto|tel|data):/i', $url)) {
            return '';
        }
        if (strpos($url, '://') !== false) {
            return $url;
        }
        $urlinfo = parse_url((string) $baseurl);
        if (!is_array($urlinfo) || empty($urlinfo['scheme']) || empty($urlinfo['host'])) {
            return $url;
        }
        // Protocol-relative link: only the scheme comes from the base.
        if (substr($url, 0, 2) === '//') {
            return $urlinfo['scheme'] . ':' . $url;
        }
        $origin = $urlinfo['scheme'] . '://' . $urlinfo['host'];
        if (isset($urlinfo['port'])) {
            $origin .= ':' . $urlinfo['port'];
        }
        if ($url[0] === '/') {
            return $origin . $url;
        }
        $path = isset($urlinfo['path']) ? $urlinfo['path'] : '/';
        if (substr($path, -1) === '/') {
            $directory = $path;
        } else {
            // dirname() yields "/" (or "\" on Windows) for a root-level file;
            // normalise so the result always has exactly one trailing slash.
            $directory = rtrim(str_replace('\\', '/', dirname($path)), '/') . '/';
        }
        return $origin . $directory . $url;
    }

    /**
     * Applies the "<name>_rule" markers and "<name>_html_rule" clean-up
     * patterns from $config to $html, treating missing keys as empty.
     */
    private static function extract_field($html, $config, $name) {
        $start = isset($config[$name . '_rule'][0]) ? $config[$name . '_rule'][0] : '';
        $end = isset($config[$name . '_rule'][1]) ? $config[$name . '_rule'][1] : '';
        $cleanup = isset($config[$name . '_html_rule']) ? $config[$name . '_html_rule'] : array();
        return self::replace_item(self::get_sub_content($html, $start, $end), $cleanup);
    }
}