0){ sleep($GLOBALS['config']['caiji']['html_interval']); return true; } } }

    /**
     * Get the main content of a page by running it through \util\Readability.
     *
     * @param string $html Page HTML; handed to Readability with the 'utf-8' charset.
     * @return string|null Trimmed content ('' when the result has no 'content' entry),
     *                     or null when Readability throws an \Exception.
     */
    public function get_content($html){
        try {
            $cread=new \util\Readability($html,'utf-8');
            $data=$cread->getContent();
        }catch (\Exception $ex){
            // Extraction is best-effort: a failure is reported as "no content".
            return null;
        }
        // NOTE(review): getContent() presumably returns an array with a 'content' key - confirm.
        // The isset() guard avoids an undefined-index notice / trim(null) when it is absent.
        return isset($data['content'])?trim($data['content']):'';
    }

    /**
     * Get the title of a page.
     *
     * Order of preference:
     *  1. the page's only <h1> (skipped when there are several, or when it is blank);
     *  2. an <h1>/<h2> whose id/class contains "title" or "article";
     *  3. the <title> element, cut at the first "-", "_" or "|" separator.
     *
     * @param string $html Page HTML.
     * @return string Title with tags stripped and trimmed; '' when nothing matched.
     */
    public function get_title($html){
        $title=null;
        // A single <h1> is taken as the title; several <h1> elements are ambiguous and ignored.
        if(preg_match_all('/<h1[^<>]*?>(?P<content>[\s\S]+?)<\/h1>/i', $html,$matches)&&count($matches['content'])===1){
            $candidate=strip_tags(reset($matches['content']));
            // Reject headings made only of whitespace, &nbsp; entities or raw UTF-8 NBSP bytes.
            if(!preg_match('/^(?:&nbsp;|\xC2\xA0|\s)*$/i', $candidate)){
                $title=$candidate;
            }
        }
        if(empty($title)){
            $pattern=array(
                // \1 closes the same heading level (h1/h2) that was opened.
                '<(h[12])\b[^<>]*?(id|class)=[\'\"]{0,1}[^\'\"<>]*(title|article)[^<>]*>(?P<content>[\s\S]+?)<\/\1>',
                // Text of <title> up to the first separator; the optional group swallows the rest.
                '<title\b[^<>]*>(?P<content>[\s\S]+?)([\-\_\|][\s\S]+?)*<\/title>'
            );
            $title=$this->return_preg_match($pattern, $html);
        }
        return trim(strip_tags($title));
    }

    /**
     * Get the value of the page's <meta name="keywords"> tag.
     *
     * Both attribute orders (name before content, content before name) are supported.
     * The value ends at the same quote character that opened it, so an apostrophe
     * inside a double-quoted value does not truncate it.
     *
     * @param string $html Page HTML.
     * @return string Keywords with tags stripped and trimmed; '' when the tag is missing.
     */
    public function get_keywords($html){
        $patterns=array(
            '<meta[^<>]*?name=[\'\"]keywords[\'\"][^<>]*?content=([\'\"])(?P<content>[\s\S]*?)\1',
            '<meta[^<>]*?content=([\'\"])(?P<content>[\s\S]*?)\1[^<>]*?name=[\'\"]keywords[\'\"]'
        );
        $data=$this->return_preg_match($patterns, $html);
        return trim(strip_tags($data));
    }

    /**
     * Get the value of the page's <meta name="description"> tag.
     *
     * Both attribute orders (name before content, content before name) are supported.
     * The value ends at the same quote character that opened it, so an apostrophe
     * inside a double-quoted value does not truncate it.
     *
     * @param string $html Page HTML.
     * @return string Description with tags stripped and trimmed; '' when the tag is missing.
     */
    public function get_description($html){
        $patterns=array(
            '<meta[^<>]*?name=[\'\"]description[\'\"][^<>]*?content=([\'\"])(?P<content>[\s\S]*?)\1',
            '<meta[^<>]*?content=([\'\"])(?P<content>[\s\S]*?)\1[^<>]*?name=[\'\"]description[\'\"]'
        );
        $data=$this->return_preg_match($patterns, $html);
        return trim(strip_tags($data));
    }

    /**
     * Return the value captured by the first rule that matches.
     *
     * Each rule is wrapped as /rule/i, so rules must not contain an unescaped "/".
     * The first rule that matches decides the result, even if its capture is empty.
     *
     * @param string|array $pattern Rule, or list of rules tried in order (regex body without delimiters).
     * @param string       $content Source content to search.
     * @param string       $reg_key Key (named group) of the capture to return.
     * @return string The captured value, or '' when nothing matched or the capture is empty().
     */
    public function return_preg_match($pattern,$content,$reg_key='content'){
        $patterns=is_array($pattern)?$pattern:array($pattern);
        $cont='';
        foreach ($patterns as $patt){
            if(preg_match('/'.$patt.'/i', $content,$match)){
                // A rule without the requested group yields '' instead of an undefined-index notice.
                $cont=isset($match[$reg_key])?$match[$reg_key]:'';
                break;
            }
        }
        return empty($cont)?'':$cont;
    }

    /**
     * Match the base (root directory) URL of a page.
     * @param string $url  URL of the page
     * @param string $html HTML of the page
     * @return
string|null Base URL without a trailing slash, or null when nothing usable is left.
     */
    public function match_base_url($url,$html){
        if(preg_match('/<base[^<>]*href=[\'\"](?P<base>[^\<\>\"\']*?)[\'\"]/i', $html,$m)){
            // An explicit <base href="..."> in the page takes precedence over the page URL.
            $base_url=$m['base'];
        }else{
            // Drop everything from the first "?" or "#", even when the query/fragment contains "/".
            $base_url=preg_replace('/[\#\?].*$/s', '', $url);
            // When the path ends in "name.ext", treat that segment as a file and remove it.
            // The host check accepts an optional ":port"; both extension regexes are case-insensitive.
            if(preg_match('/^\w+\:\/\/([\w\-]+\.){1,}[\w]+(\:\d+)?\/.+/',$base_url)&&preg_match('/\.[a-z]+$/i', $base_url)){
                $base_url=preg_replace('/\/[^\/]*\.[a-z]+$/i', '', $base_url);
            }
        }
        $base_url=rtrim($base_url,'/');
        return $base_url?$base_url:null;
    }

    /**
     * Match the scheme + host (+ optional port) part of a URL.
     *
     * The host must contain at least one dot, so e.g. "http://localhost" yields null.
     *
     * @param string $url Absolute URL.
     * @return string|null e.g. "http://www.example.com:8080", or null when $url does not match.
     */
    public function match_domain_url($url){
        // Keep an explicit port: it is part of the origin that root-relative links resolve against.
        if(preg_match('/^\w+\:\/\/([\w\-]+\.){1,}[\w]+(\:\d+)?/', $url,$m)){
            return $m[0];
        }
        return null;
    }

    /**
     * Build a complete URL from a link found in a page.
     *
     * - "scheme://..."         returned unchanged
     * - "//host/path"          prefixed with the scheme of $domain_url (or $base_url; "https" if neither has one)
     * - "/path"                appended to $domain_url
     * - "javascript:..." / "#" not a fetchable link, returns ''
     * - anything else          appended to $base_url
     *
     * @param string $url        Link to complete.
     * @param string $base_url   Base (directory) URL of the page, without trailing slash.
     * @param string $domain_url Scheme + host of the page, without trailing slash.
     * @return string Complete URL, or '' for javascript:/fragment-only links.
     */
    public function create_complete_url($url,$base_url,$domain_url){
        if(preg_match('/^\w+\:\/\//', $url)){
            return $url;
        }
        if(strpos($url,'//')===0){
            // Protocol-relative link: inherit the page's scheme rather than forcing https.
            $scheme='https';
            if(preg_match('/^(\w+)\:\/\//', (string)$domain_url,$m)||preg_match('/^(\w+)\:\/\//', (string)$base_url,$m)){
                $scheme=$m[1];
            }
            return $scheme.':'.$url;
        }
        if(strpos($url,'/')===0){
            return $domain_url.'/'.ltrim($url,'/');
        }
        // Require the colon so relative paths such as "javascript/app.js" are not discarded.
        if(stripos($url,'javascript:')===0||strpos($url,'#')===0){
            return '';
        }
        return $base_url.'/'.ltrim($url,'/');
    }
}
?>