0){
sleep($GLOBALS['config']['caiji']['html_interval']);
return true;
}
}
}
/**
 * Extract the main article body from an HTML document.
 *
 * Delegates to \util\Readability (constructed with the 'utf-8' charset
 * argument) and returns the trimmed 'content' entry of its result.
 *
 * @param string $html Raw HTML of the page.
 * @return string|null Trimmed content; '' when the extractor result has no
 *                     'content' entry; null when the extractor throws.
 */
public function get_content($html){
    try {
        $cread=new \util\Readability($html,'utf-8');
        $data=$cread->getContent();
    }catch (\Exception $ex){
        // Best-effort extraction: a failed parse is reported as "no content".
        return null;
    }
    // The extractor's return shape is not visible from here, so do not index
    // into it blindly: a missing entry yields '' instead of a notice plus
    // trim(null).
    if(!isset($data['content'])){
        return '';
    }
    return trim((string)$data['content']);
}
/**
 * Extract the page title from an HTML document.
 *
 * Sources are tried in this order:
 *  1. the <h1> element, but only when the page contains exactly one <h1>
 *     and its text is not blank (whitespace / &nbsp; only);
 *  2. an <h1> or <h2> whose id/class attribute contains "title" or "article";
 *  3. the <title> element, cut before the first "-", "_" or "|" separator
 *     (which usually introduces the site name).
 *
 * @param string $html Raw HTML of the page.
 * @return string Tag-free, trimmed title; '' when nothing matched.
 */
public function get_title($html){
    $title=null;
    if(preg_match_all('/<h1[^<>]*?>(?P<content>[\s\S]+?)<\/h1>/i', $html,$headings)){
        // Several <h1> elements make the heading ambiguous, so it is ignored.
        if(count($headings['content'])===1){
            $title=strip_tags(reset($headings['content']));
            if(preg_match('/^((\&nbsp\;)|\s)*$/i', $title)){
                // Heading holds nothing but whitespace or &nbsp; entities.
                $title=null;
            }
        }
    }
    if(empty($title)){
        $pattern=array(
            // \1 closes the same heading level that was opened.
            '<(h[12])\b[^<>]*?(id|class)=[\'\"]{0,1}[^\'\"<>]*(title|article)[^<>]*>(?P<content>[\s\S]+?)<\/\1>',
            '<title>(?P<content>[\s\S]+?)([\-\_\|][\s\S]+?)*<\/title>'
        );
        $title=$this->return_preg_match($pattern, $html);
    }
    return trim(strip_tags($title));
}
/**
 * Extract the value of the page's <meta name="keywords"> tag.
 *
 * Both attribute orders (name before content, content before name) are
 * recognised; attribute values must be quoted.
 *
 * @param string $html Raw HTML of the page.
 * @return string Tag-free, trimmed keywords; '' when the tag is absent.
 */
public function get_keywords($html){
    $patterns=array(
        '<meta[^<>]*?name=[\'\"]keywords[\'\"][^<>]*?content=[\'\"](?P<content>[\s\S]*?)[\'\"]',
        '<meta[^<>]*?content=[\'\"](?P<content>[\s\S]*?)[\'\"][^<>]*?name=[\'\"]keywords[\'\"]'
    );
    $data=$this->return_preg_match($patterns, $html);
    return trim(strip_tags($data));
}
/**
 * Extract the value of the page's <meta name="description"> tag.
 *
 * Both attribute orders (name before content, content before name) are
 * recognised; attribute values must be quoted.
 *
 * @param string $html Raw HTML of the page.
 * @return string Tag-free, trimmed description; '' when the tag is absent.
 */
public function get_description($html){
    $patterns=array(
        '<meta[^<>]*?name=[\'\"]description[\'\"][^<>]*?content=[\'\"](?P<content>[\s\S]*?)[\'\"]',
        '<meta[^<>]*?content=[\'\"](?P<content>[\s\S]*?)[\'\"][^<>]*?name=[\'\"]description[\'\"]'
    );
    $data=$this->return_preg_match($patterns, $html);
    return trim(strip_tags($data));
}
/**
 * Return the value captured by the first pattern that matches.
 *
 * Each pattern is a regex body without delimiters; it is wrapped in "/.../i"
 * before matching. Patterns are tried in order and the search stops at the
 * first one that matches.
 *
 * @param string|array $pattern Regex body, or a list of regex bodies.
 * @param string       $content Subject text to search.
 * @param string       $reg_key Name (or index) of the capture group to return.
 * @return string Captured value, or '' when nothing matched or the captured
 *                value is empty.
 */
public function return_preg_match($pattern,$content,$reg_key='content'){
    // Treat a single pattern as a one-element list so both cases share a loop.
    $candidates=is_array($pattern)?$pattern:array($pattern);
    $captured=false;
    foreach ($candidates as $regex){
        if(!preg_match('/'.$regex.'/i', $content,$hit)){
            continue;
        }
        $captured=$hit[$reg_key];
        break;
    }
    if(empty($captured)){
        return '';
    }
    return $captured;
}
/**
 * Determine the base URL that relative links on a page are resolved against.
 *
 * If the HTML declares <base href="...">, that value is used. Otherwise the
 * base is derived from the page URL: a trailing "#..." / "?..." part is
 * removed and, when the URL has a path ending in a file-like segment
 * ("name.ext"), that last segment is dropped. Trailing slashes are removed
 * from the result.
 *
 * @param string $url  URL the page was fetched from.
 * @param string $html Raw HTML of the page.
 * @return string|null Base URL without trailing slash, or null when empty.
 */
public function match_base_url($url,$html){
    if(preg_match('/<base[^<>]*href=[\'\"](?P<base>[^\<\>\"\']*?)[\'\"]/i', $html,$base_url)){
        $base_url=$base_url['base'];
    }else{
        $base_url=preg_replace('/[\#\?][^\/]*$/', '', $url);
        // Only strip the last segment when there is a real path after the
        // host (optionally with :port); otherwise "example.com" itself would
        // look like a file name and be removed.
        $has_path=preg_match('/^\w+\:\/\/([\w\-]+\.){1,}[\w]+(\:\d+)?\/.+/',$base_url);
        if($has_path&&preg_match('/\.[a-z]+$/i', $base_url)){
            // Case-insensitive like the check above, so "INDEX.HTML" is stripped too.
            $base_url=preg_replace('/\/[^\/]*\.[a-z]+$/i', '', $base_url);
        }
    }
    $base_url=rtrim($base_url,'/');
    return $base_url?$base_url:null;
}
/**
 * Extract the origin (scheme, host and optional port) from a URL.
 *
 * The host must contain at least one dot. An explicit ":port" is kept so
 * that root-relative links resolve against the right server.
 *
 * @param string $url Absolute URL.
 * @return string|null e.g. "http://www.example.com" or
 *                     "http://example.com:8080"; null when $url does not
 *                     start with scheme://host.
 */
public function match_domain_url($url){
    if(!preg_match('/^\w+\:\/\/([\w\-]+\.){1,}[\w]+(\:\d+)?/', $url,$matches)){
        return null;
    }
    $domain_url=rtrim($matches[0],'/');
    return $domain_url?$domain_url:null;
}
/**
 * Turn a link found on a page into an absolute URL.
 *
 * - "scheme://..."        returned unchanged;
 * - "//host/..."          protocol-relative: prefixed with the scheme of
 *                         $domain_url (or $base_url), "https" if unknown;
 * - "/path"               appended to $domain_url;
 * - "javascript:..." / "#..."  not a fetchable link, returns '';
 * - anything else         treated as relative and appended to $base_url.
 *
 * @param string      $url        Link to complete.
 * @param string|null $base_url   Base URL of the page, without trailing slash.
 * @param string|null $domain_url Origin of the page (scheme://host[:port]).
 * @return string Absolute URL, or '' for javascript:/fragment-only links.
 */
public function create_complete_url($url,$base_url,$domain_url){
    if(preg_match('/^\w+\:\/\//', $url)){
        return $url;
    }
    if(strpos($url,'//')===0){
        // A protocol-relative link inherits the scheme of the referring page.
        $scheme='https';
        foreach (array($domain_url,$base_url) as $reference){
            if(is_string($reference)&&preg_match('/^(\w+)\:\/\//', $reference,$parts)){
                $scheme=$parts[1];
                break;
            }
        }
        return $scheme.':'.$url;
    }
    if(strpos($url,'/')===0){
        return $domain_url.'/'.ltrim($url,'/');
    }
    // Require the colon: a relative file such as "javascript-intro.html"
    // is a real link and must not be discarded.
    if(stripos($url,'javascript:')===0||strpos($url,'#')===0){
        return '';
    }
    return $base_url.'/'.ltrim($url,'/');
}
}
?>