<?php
/* 海龙挖掘机 2.0正式版
* 正文提取,分析,可自动判断编码,自动转码
* 原理:根据代码块加权的原理,首先将HTML分成若干个小块,然后对每个小块进行评分。
* 取分数在3分以上的代码块中的内容返回
* 加分项 1 含有标点符号
* 2 含有<p>标签
* 3 含有<br>标签
* 减分项 1 含有li标签
* 2 不包含任何标点符号
* 3 含有关键词javascript
* 4 不包含任何中文的,直接删除
* 5 有<li><a这样标签
* 实例:
* $he = new HtmlExtractor();
* $str = $he->text($html);
* 其中$html是某个网页的HTML代码,$str是返回的正文,正文编码是utf-8的
*/
/**
 * Extracts the main body text of an HTML page by scoring markup blocks.
 *
 * The page is stripped of scripts, styles and HTML comments, normalised to
 * UTF-8, split into tag-delimited blocks, and every block that is either
 * dense in Chinese characters or reaches a heuristic score of 3 is kept
 * (with only <p> and <br> tags preserved).
 *
 * Usage:
 *   $he  = new HtmlExtractor();
 *   $str = $he->text($html);
 */
class HtmlExtractor {
    /**
     * Counts the Chinese characters in a string.
     *
     * Valid UTF-8 input is counted per code point (U+4E00 - U+9FFF). Input
     * that is not valid UTF-8 falls back to the legacy GB2312 byte-pair
     * pattern, so callers that pass raw GBK text keep getting a count.
     *
     * @param string $str Text to inspect.
     * @return int Number of Chinese characters found.
     */
    function chineseCount($str){
        // An empty pattern with the "u" modifier only succeeds on well-formed UTF-8.
        if(preg_match('//u', $str)){
            return (int) preg_match_all('/[\x{4E00}-\x{9FFF}]/u', $str, $ff);
        }
        return (int) preg_match_all("/[\xB0-\xF7][\xA1-\xFE]/", $str, $ff);
    }

    /**
     * Returns the document as UTF-8, converting from GBK when necessary.
     *
     * @param string $str Raw HTML.
     * @return string The HTML, unchanged if already UTF-8, otherwise converted from GBK.
     */
    function getutf8($str){
        // A document that is well-formed UTF-8 as a whole must not be re-decoded
        // as GBK, even if its first 500 bytes of text happen to be plain ASCII.
        if(preg_match('//u', $str)){
            return $str;
        }
        // Otherwise fall back to sampling the beginning of the visible text.
        if($this->is_utf8(substr(strip_tags($str), 0, 500))){
            return $str;
        }
        return $this->auto_charset($str, "gbk", "utf-8");
    }

    /**
     * Heuristic UTF-8 detector based on three-byte sequences.
     *
     * Reports true when the string starts with, ends with, or contains two or
     * more consecutive three-byte sequences whose lead byte is 0xE4-0xE9 and
     * whose two following bytes are 0x80-0xBF.
     *
     * @param string $string Text sample.
     * @return bool True when such a sequence is found.
     */
    function is_utf8($string)
    {
        $cjk = "[".chr(228)."-".chr(233)."][".chr(128)."-".chr(191)."]{2}";
        $pattern = "/^(?:".$cjk.")|(?:".$cjk.")$|(?:".$cjk."){2,}/";
        return preg_match($pattern, $string) == true;
    }

    /**
     * Converts a string, or every key and value of an array, between charsets.
     *
     * Uses mb_convert_encoding when available, then iconv; when neither
     * function exists the input is returned unchanged. Empty values,
     * non-string scalars and identical source/target charsets are returned
     * as they are.
     *
     * @param mixed  $fContents String or (nested) array to convert.
     * @param string $from      Source charset ("UTF8" is accepted for "utf-8").
     * @param string $to        Target charset ("UTF8" is accepted for "utf-8").
     * @return mixed The converted value, of the same shape as the input.
     */
    function auto_charset($fContents,$from,$to){
        $from = strtoupper($from) == 'UTF8' ? 'utf-8' : $from;
        $to = strtoupper($to) == 'UTF8' ? 'utf-8' : $to;

        $nothingToDo = strtoupper($from) === strtoupper($to)
            || empty($fContents)
            || (is_scalar($fContents) && !is_string($fContents));
        if($nothingToDo){
            return $fContents;
        }

        if(is_string($fContents)){
            if(function_exists('mb_convert_encoding')){
                return mb_convert_encoding($fContents, $to, $from);
            }
            if(function_exists('iconv')){
                return iconv($from, $to, $fContents);
            }
            return $fContents;
        }

        if(is_array($fContents)){
            foreach($fContents as $key => $val){
                $_key = $this->auto_charset($key, $from, $to);
                $fContents[$_key] = $this->auto_charset($val, $from, $to);
                // Drop the old entry only when the key text really changed.
                if($key !== $_key){
                    unset($fContents[$key]);
                }
            }
            return $fContents;
        }

        return $fContents;
    }

    /**
     * Extracts the body text from an HTML document.
     *
     * @param string $str HTML source of a page (UTF-8 or GBK).
     * @return string Concatenation of the accepted blocks, keeping only <p> and <br> tags.
     */
    function text($str){
        $str = $this->clear($str);
        $str = $this->getutf8($str);
        $divList = $this->divList($str);
        // preg_match_all may leave the match list empty when PCRE fails.
        $blocks = isset($divList[0]) ? $divList[0] : array();

        $content = array();
        foreach($blocks as $block){
            if($this->isChineseDense($block) || $this->makeScore($block) >= 3){
                $content[] = strip_tags($block, "<p><br>");
            }
        }
        return implode("", $content);
    }

    /**
     * Tells whether at least 40% of a block (measured in three-byte slots)
     * is Chinese text and that text is not made up solely of links.
     *
     * @param string $block One HTML block.
     * @return bool
     */
    private function isChineseDense($block){
        $length = strlen($block);
        if($length === 0){
            return false;
        }
        return $this->chineseCount($block) / ($length / 3) >= 0.4
            && $this->checkHref($block);
    }

    /**
     * Checks that a block still holds Chinese text once its links are removed.
     *
     * NOTE(review): the link pattern is greedy, so everything between the first
     * opening anchor and the last closing anchor is removed, including text
     * that sits between two links. Kept as is; confirm whether that is intended.
     *
     * @param string $str One HTML block.
     * @return bool True when the block has no link, or has Chinese text outside the links.
     */
    private function checkHref($str){
        $linkPattern = "'<a[^>]*?>(.*)</a>'si";
        if(!preg_match($linkPattern, $str)){
            return true;
        }
        $withoutLinks = preg_replace($linkPattern, "", $str);
        return $this->chineseCount($withoutLinks) > 0;
    }

    /**
     * Computes the heuristic score of a block: bonuses minus penalties.
     *
     * @param string $str One HTML block.
     * @return int The score; text() keeps blocks scoring 3 or more.
     */
    function makeScore($str){
        $score = 0;
        // Bonus: punctuation marks.
        $score += $this->score1($str);
        // Bonus: paragraph elements.
        $score += $this->score2($str);
        // Bonus: line-break tags.
        $score += $this->score3($str);
        // Penalty: list items.
        $score -= $this->score4($str);
        // Penalty: no punctuation at all.
        $score -= $this->score5($str);
        // Penalty: the keyword "javascript".
        $score -= $this->score6($str);
        // Penalty: list items that contain a link.
        $score -= $this->score7($str);
        return $score;
    }

    /**
     * Counts punctuation marks in a block.
     *
     * Parentheses are escaped so they match literally instead of forming an
     * empty group that would match at every offset. Both the ASCII and the
     * full-width forms of comma, exclamation mark, parentheses and semicolon
     * are recognised.
     *
     * @param string $str             One HTML block.
     * @param bool   $includeBrackets Also count the lenticular brackets.
     * @return int Number of punctuation marks found.
     */
    private function countPunctuation($str, $includeBrackets = false){
        $marks = ',|,|。|!|!|\(|\)|(|)|“|”|;|;|《|》|、';
        if($includeBrackets){
            $marks .= '|【|】';
        }
        return (int) preg_match_all('/(?:'.$marks.')/', $str, $out);
    }

    /**
     * Bonus of two points per punctuation mark.
     *
     * @param string $str One HTML block.
     * @return int
     */
    private function score1($str){
        return $this->countPunctuation($str) * 2;
    }

    /**
     * Bonus of two points per paragraph element.
     *
     * @param string $str One HTML block.
     * @return int
     */
    private function score2($str){
        $count = preg_match_all("'<p[^>]*?>.*?</p>'si", $str, $out);
        return $count * 2;
    }

    /**
     * Bonus of two points per line-break tag, with or without a closing
     * slash and optional whitespace before it.
     *
     * @param string $str One HTML block.
     * @return int
     */
    private function score3($str){
        $count = preg_match_all('~<br\s*/?>~i', $str, $out);
        return $count * 2;
    }

    /**
     * Penalty of two points per complete list item.
     *
     * @param string $str One HTML block.
     * @return int
     */
    private function score4($str){
        $count = preg_match_all("'<li[^>]*?>.*?</li>'si", $str, $out);
        return $count * 2;
    }

    /**
     * Penalty of two points when the block holds no punctuation at all
     * (lenticular brackets count as punctuation here).
     *
     * @param string $str One HTML block.
     * @return int 2 when no punctuation is present, otherwise 0.
     */
    private function score5($str){
        return $this->countPunctuation($str, true) === 0 ? 2 : 0;
    }

    /**
     * Penalty of one point per occurrence of the word "javascript".
     *
     * @param string $str One HTML block.
     * @return int
     */
    private function score6($str){
        $count = preg_match_all("'javascript'si", $str, $out);
        return $count;
    }

    /**
     * Penalty of two points per list item that is followed by a link.
     *
     * @param string $str One HTML block.
     * @return int
     */
    private function score7($str){
        $count = preg_match_all("'<li[^>]*?>.*?<a'si", $str, $out);
        return $count * 2;
    }

    /**
     * Removes script elements, style elements and HTML comments.
     *
     * @param string $str Raw HTML.
     * @return string The HTML without those parts.
     */
    private function clear($str){
        $patterns = array(
            "'<script[^>]*?>.*?</script>'si",
            "'<style[^>]*?>.*?</style>'si",
            "'<!--.*?-->'s",
        );
        foreach($patterns as $pattern){
            $cleaned = preg_replace($pattern, "", $str);
            // preg_replace yields null on a PCRE failure; keep the text in that case.
            if($cleaned !== null){
                $str = $cleaned;
            }
        }
        return $str;
    }

    /**
     * Splits the document into blocks that run from an opening tag to the
     * next closing tag. Tags whose name starts with the letter "a" never
     * start a block.
     *
     * @param string $str Cleaned HTML.
     * @return array The preg_match_all result; index 0 holds the blocks.
     */
    private function divList($str){
        preg_match_all("'<[^a][^>]*?>.*?</[^>]*?>'si", $str, $divlist);
        return $divlist;
    }
}