skycaiji/SkycaijiApp/admin/event/CpatternBase.php

1402 lines
38 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
/*
|--------------------------------------------------------------------------
| SkyCaiji (蓝天采集器)
|--------------------------------------------------------------------------
| Copyright (c) 2018 https://www.skycaiji.com All rights reserved.
|--------------------------------------------------------------------------
| 使用协议 https://www.skycaiji.com/licenses
|--------------------------------------------------------------------------
*/
namespace skycaiji\admin\event;
use skycaiji\admin\model\CacheModel;
class CpatternBase extends Collector{
public $collector;
public $config;
public $release;
public $first_loop_field=null;
public $field_val_list=array();
public $collect_num=0;
public $collected_field_list=array();
public $used_source_urls=array();
public $used_level_urls=array();
public $used_cont_urls=array();
public $original_source_urls=null;
public $level_urls_list=array();
public $cont_urls_list=array();
public $exclude_cont_urls=array();
public $relation_url_list=array();
public $used_paging_urls=array();
public $cur_level_urls=array();
public $cur_source_url='';
public $html_cache_list=array();
public $show_opened_tools=false;
public function setConfig($config){}
public function init($config){}
public function collect($num=10){}
/*对象销毁时处理*/
public function __destruct(){
if(!empty($this->used_cont_urls)){
$usedContUrls=array_keys($this->used_cont_urls);
if(!empty($usedContUrls)&&is_array($usedContUrls)){
$total=count($usedContUrls);
$limit=800;
$batch=ceil($total/$limit);
for($i=1;$i<=$batch;$i++){
$list=array_slice($usedContUrls,($i-1)*$limit,$limit);
if(!empty($list)){
CacheModel::getInstance('cont_url')->db()->where('cname','in',$list)->delete();
}
}
}
}
}
/*规则匹配区域*/
public function rule_match_area($config,$html){
if(!empty($config['reg_area'])){
if(empty($config['reg_area_module'])){
if(preg_match('/'.$config['reg_area'].'/i',$html,$area_cont)){
if(isset($area_cont['match'])){
$html=$area_cont['match'];
}else{
$html=$area_cont[0];
}
}else{
$html='';
}
}elseif('json'==$config['reg_area_module']){
$html=$this->rule_module_json_data(array('json'=>$config['reg_area'],'json_arr'=>'jsonencode'),$html);
}elseif('xpath'==$config['reg_area_module']){
$html=$this->rule_module_xpath_data(array('xpath'=>$config['reg_area'],'xpath_attr'=>'outerHtml'),$html);
}else{
$html='';
}
}
return $html;
}
/**
* 规则匹配网址
* @param array $config 配置参数
* @param string $html 源码
* @param bool $whole 完全匹配模式
*
*/
public function rule_match_urls($config,$html,$whole=false){
$cont_urls=array();
if(!empty($config['reg_url'])&&!empty($config['url_merge'])){
$sign_match=$this->sign_addslashes(cp_sign('match','(?P<num>\d*)'));
if(preg_match_all('/'.$sign_match.'/i', $config['url_merge'],$match_signs)){
$url_merge=true;
if(empty($config['reg_url_module'])){
if(preg_match('/\(\?P<match\d*>/i', $config['reg_url'])){
if(preg_match_all('/'.$config['reg_url'].'/i',$html,$cont_urls,PREG_SET_ORDER)){
if($config['url_merge']==cp_sign('match')){
$url_merge=false;
foreach ($cont_urls as $k=>$v){
$cont_urls[$k]=$v['match'];
}
}
}
}else{
if($whole){
if(preg_match_all('/'.$config['reg_url'].'/i',$html,$cont_urls)){
$cont_urls=$cont_urls[0];
if($config['url_merge']==cp_sign('match')){
$url_merge=false;
}else{
foreach ($cont_urls as $k=>$v){
$cont_urls[$k]=array(
'match'=>$v
);
}
}
}
}
}
}elseif(in_array($config['reg_url_module'],array('xpath','json'))){
if('xpath'==$config['reg_url_module']){
$cont_urls=$this->rule_module_xpath_data ( array (
'xpath' => $config['reg_url'],
'xpath_attr' => 'href',
'xpath_multi'=>true,
'xpath_multi_type'=>'loop'
),$html);
$cont_urls=is_array($cont_urls)?$cont_urls:array();
}elseif('json'==$config['reg_url_module']){
$cont_urls=$this->rule_module_json_data(array('json'=>$config['reg_url'],'json_arr'=>'_original_'),$html);
if(empty($cont_urls)){
$cont_urls=array();
}elseif(!is_array($cont_urls)){
$cont_urls=array($cont_urls);
}
}
if($config['url_merge']==cp_sign('match')){
$url_merge=false;
}else{
foreach ($cont_urls as $k=>$v){
$cont_urls[$k]=array(
'match'=>$v
);
}
}
}
if($url_merge){
foreach ($cont_urls as $k=>$v){
$re_match=array();
foreach($match_signs['num'] as $ms_k=>$ms_v){
$re_match[$ms_k]=$v['match'.$ms_v];
}
$cont_urls[$k]=str_replace($match_signs[0], $re_match, $config['url_merge']);
}
}
}
}
$cont_urls=is_array($cont_urls)?array_unique($cont_urls):array();
$cont_urls=array_values($cont_urls);
return $cont_urls;
}
public function match_rule($html,$rule,$merge,$multi=false,$multi_str=''){
$val='';
$sign_match=$this->sign_addslashes(cp_sign('match','(?P<num>\d*)'));
if(!empty($rule)&&preg_match_all('/'.$sign_match.'/i',$merge,$match_signs)){
$multiStr='';
if(!empty($multi)){
preg_match_all('/'.$rule.'/i',$html,$match_conts,PREG_SET_ORDER);
$multiStr=str_replace(array('\r','\n'), array("\r","\n"), $multi_str);
}else{
if(preg_match('/'.$rule.'/i', $html,$match_cont)){
$match_conts=array($match_cont);
}
}
$curI=0;
foreach ($match_conts as $match_cont){
$curI++;
$re_match=array();
foreach($match_signs['num'] as $ms_k=>$ms_v){
$re_match[$ms_k]=$match_cont['match'.$ms_v];
}
$val.=($curI<=1?'':$multiStr).str_replace($match_signs[0], $re_match, $merge);
}
}
return $val;
}
/**
* 规则匹配,方法可调用,$field_params传入规则参数
* @param array $field_params
* @param string $html
* @return string
*/
public function field_module_rule($field_params,&$html){
$val='';
$sign_match=$this->sign_addslashes(cp_sign('match','(?P<num>\d*)'));
if(!empty($field_params['reg_rule'])&&preg_match_all('/'.$sign_match.'/i', $field_params['rule_merge'],$match_signs)){
$multiStr='';
$is_loop=false;
if(!empty($field_params['rule_multi'])){
preg_match_all('/'.$field_params['reg_rule'].'/i',$html,$match_conts,PREG_SET_ORDER);
$is_loop='loop'==$field_params['rule_multi_type']?true:false;
if($is_loop){
if(empty($this->first_loop_field)){
$this->first_loop_field=$field_params['name'];
}
$val=array();
}else{
$multiStr=str_replace(array('\r','\n'), array("\r","\n"), $field_params['rule_multi_str']);
}
}else{
if(preg_match('/'.$field_params['reg_rule'].'/i', $html,$match_cont)){
$match_conts=array($match_cont);
}
}
$curI=0;
if(is_array($match_conts)){
foreach ($match_conts as $match_cont){
$curI++;
$re_match=array();
foreach($match_signs['num'] as $ms_k=>$ms_v){
$re_match[$ms_k]=$match_cont['match'.$ms_v];
}
$contVal=str_replace($match_signs[0], $re_match, $field_params['rule_merge']);
if($is_loop){
$val[]=$contVal;
}else{
$val.=($curI<=1?'':$multiStr).$contVal;
}
}
}
}
return $val;
}
/**
* xpath规则方法可调用$field_params传入规则参数
* @param array $field_params
* @param string $html
* @return string
*/
public function field_module_xpath($field_params,$html){
if(!empty($field_params['xpath_multi'])){
if('loop'==$field_params['xpath_multi_type']){
if(empty($this->first_loop_field)){
$this->first_loop_field=$field_params['name'];
}
}
}
return $this->rule_module_xpath_data($field_params,$html);
}
public function rule_module_xpath_data($field_params,$html){
$vals='';
if(!empty($field_params['xpath'])){
$dom=new \DOMDocument;
$libxml_previous_state = libxml_use_internal_errors(true);
@$dom->loadHTML('<meta http-equiv="Content-Type" content="text/html;charset=utf-8">'.$html);
$dom->normalize();
$xPath = new \DOMXPath($dom);
$xpath_attr=strtolower($field_params['xpath_attr']);
$xpath_attr='custom'==$xpath_attr?strtolower($field_params['xpath_attr_custom']):$xpath_attr;
$normal_attr=true;
if(in_array($xpath_attr,array('innerhtml','outerhtml','text'))){
$normal_attr=false;
}
$xpath_q=trim($field_params['xpath']);
if(!empty($xpath_attr)){
if(preg_match('/\/\@[\w\-]+$/', $xpath_q)){
$xpath_q=preg_replace('/\@[\w\-]+$/', '', $xpath_q);
}
if($normal_attr){
$xpath_q=$xpath_q.(preg_match('/\/$/', $xpath_q)?'':'/').'@'.$xpath_attr;
}
}else{
if(!preg_match('/\/\@[\w\-]+$/', $xpath_q)){
$xpath_attr='innerhtml';
$normal_attr=false;
}
}
$nodes = $xPath->query($xpath_q);
$multiStr='';
$is_loop=false;
if(!empty($field_params['xpath_multi'])){
$is_loop='loop'==$field_params['xpath_multi_type']?true:false;
if($is_loop){
$vals=array();
}else{
$multiStr=str_replace(array('\r','\n'), array("\r","\n"), $field_params['xpath_multi_str']);
}
}
$curI=0;
foreach ($nodes as $node){
$curI++;
$val=($curI<=1?'':$multiStr);
if($normal_attr){
$val.=$node->nodeValue;
}else{
switch ($xpath_attr){
case 'innerhtml':
$nchilds = $node->childNodes;
foreach ($nchilds as $nchild){
$val .= $nchild->ownerDocument->saveHTML($nchild);
}
break;
case 'outerhtml':$val.=$node->ownerDocument->saveHTML($node);break;
case 'text':
$nchilds = $node->childNodes;
foreach ($nchilds as $nchild){
$val .= $nchild->ownerDocument->saveHTML($nchild);
}
$val=$this->filter_html_tags($val, array('style','script','object'));
$val=strip_tags($val);
break;
}
}
if($is_loop){
$vals[]=$val;
}else{
$vals.=$val;
}
if(empty($field_params['xpath_multi'])){
break;
}
}
libxml_clear_errors();
}
return $vals;
}
/*自动获取*/
public function field_module_auto($field_params,&$html,$cur_url){
switch (strtolower($field_params['auto'])){
case 'title':$val=$this->get_title($html);break;
case 'content':$val=$this->get_content($html);break;
case 'keywords':$val=$this->get_keywords($html);break;
case 'description':$val=$this->get_description($html);break;
case 'url':$val=$cur_url;break;
}
return $val;
}
public function field_module_words($field_params){
return $field_params['words'];
}
public function field_module_num($field_params){
$start=intval($field_params['num_start']);
$end=intval($field_params['num_end']);
return rand($start, $end);
}
public function field_module_time($field_params){
$val='';
$start=empty($field_params['time_start'])?NOW_TIME:strtotime($field_params['time_start']);
$end=empty($field_params['time_end'])?NOW_TIME:strtotime($field_params['time_end']);
$time=rand($start, $end);
if(empty($field_params['time_stamp'])){
$fmt=empty($field_params['time_format'])?'Y-m-d H:i':
str_replace(array('[年]','[月]','[日]','[时]','[分]','[秒]'), array('Y','m','d','H','i','s'), $field_params['time_format']);
$val=date($fmt,$time);
}else{
$val=$time;
}
return $val;
}
public function field_module_list($field_params){
static $list=array();
$key=md5($field_params['list']);
if(!isset($list[$key])){
if(preg_match_all('/[^\r\n]+/', $field_params['list'],$str_list)){
$str_list=$str_list[0];
}else{
$str_list=array();
}
$list[$key]=$str_list;
}
$str_list=$list[$key];
$val='';
if(!empty($str_list)){
$randi=array_rand($str_list,1);
$val=$str_list[$randi];
}
return $val;
}
public function field_module_merge($field_params,$val_list){
$val='';
if(preg_match_all('/\[\x{5b57}\x{6bb5}\:(.+?)\]/u', $field_params['merge'],$match_fields)){
$val=$field_params['merge'];
for($i=0;$i<count($match_fields[0]);$i++){
$val=str_replace($match_fields[0][$i],$val_list[$match_fields[1][$i]]['value'],$val);
}
}
return $val;
}
/**
* json提取方法可调用$field_params传入规则参数
* @param array $field_params
* @param string $html
* @return string
*/
public static $jsonpRegExp='/^(\s*[\$\w\-]+\s*[\{\(])+(?P<json>[\s\S]+)\}\s*\)\s*[\;]{0,1}/i';
public function field_module_json($field_params,$html,$cur_url=''){
static $jsonList=array();
$jsonKey=!empty($cur_url)?md5($cur_url):md5($html);
if(!isset($jsonList[$jsonKey])){
$jsonList[$jsonKey]=json_decode($html,true);
if(empty($jsonList[$jsonKey])&&preg_match(self::$jsonpRegExp, $html,$json)){
$json=trim($json['json']).'}';
$jsonList[$jsonKey]=json_decode($json,true);
}
}
$jsonArrType=$field_params['json_arr'];
if($field_params['json_loop']){
$field_params['json_arr']='_original_';
}
$val=$this->rule_module_json_data($field_params,$jsonList[$jsonKey]);
if($field_params['json_loop']){
if(is_array($val)){
$field_params['json_arr']=$jsonArrType;
foreach ($val as $k=>$v){
$val[$k]=$this->rule_module_json_data_convert($v,$field_params);
}
if(empty($this->first_loop_field)){
$this->first_loop_field=$field_params['name'];
}
}
}
return $val;
}
public function rule_module_json_data($field_params,$jsonArrOrStr){
$jsonArr=array();
if(is_array($jsonArrOrStr)){
$jsonArr=&$jsonArrOrStr;
}else{
$jsonArr=json_decode($jsonArrOrStr,true);
if(empty($jsonArr)&&preg_match(self::$jsonpRegExp,$jsonArrOrStr,$jsonArrOrStr)){
$jsonArr=trim($jsonArrOrStr['json']).'}';
$jsonArr=json_decode($jsonArr,true);
}
unset($jsonArrOrStr);
}
$val='';
if(!empty($jsonArr)){
if(!empty($field_params['json'])){
$jsonFmt=str_replace(array('"',"'",'[',' '), '', $field_params['json']);
$jsonFmt=str_replace(']','.',$jsonFmt);
$jsonFmt=trim($jsonFmt,'.');
$jsonFmt=explode('.', $jsonFmt);
$jsonFmt=array_values($jsonFmt);
if(!empty($jsonFmt)){
$val=$jsonArr;
$prevKey='';
foreach ($jsonFmt as $i=>$key){
if($prevKey=='*'){
$new_field_params=$field_params;
$new_field_params['json']=array_slice($jsonFmt, $i);
$new_field_params['json']=implode('.', $new_field_params['json']);
foreach ($val as $vk=>$vv){
$val[$vk]=$this->rule_module_json_data($new_field_params,$vv);
}
break;
}else{
if($key!='*'){
$val=$val[$key];
}
}
$prevKey=$key;
}
}
}
}
return $this->rule_module_json_data_convert($val, $field_params);
}
public function rule_module_json_data_convert($val,$field_params){
if(is_array($val)){
$json_arr=strtolower($field_params['json_arr']);
if(empty($json_arr)){
$json_arr='implode';
}
switch ($json_arr){
case 'implode':$arrImplode=str_replace(array('\r','\n'), array("\r","\n"), $field_params['json_arr_implode']);$val=array_implode($arrImplode,$val);break;
case 'jsonencode':$val=json_encode($val);break;
case 'serialize':$val=serialize($val);break;
case '_original_': break;
}
}
return $val;
}
/*字段提取内容*/
public function field_module_extract($field_params,$extract_field_val,$base_url,$domain_url){
$field_html=$extract_field_val['value'];
if(empty($field_html)){
return '';
}
$val='';
$extract_module=strtolower($field_params['extract_module']);
switch ($extract_module){
case 'cover':
if(!empty($extract_field_val['img'])){
$val=reset($extract_field_val['img']);
}else{
if(preg_match('/<img[^<>]*\bsrc=[\'\"](?P<url>[^\'\"]+?)[\'\"]/i',$field_html,$cover)){
$cover=$cover['url'];
$cover=$this->create_complete_url($cover, $base_url, $domain_url);
$val=$cover;
}
}
break;
case 'phone':
$field_html=$this->filter_html_tags($field_html,'style,script,object');
$field_html=strip_tags($field_html);
if(preg_match('/\d{11}/', $field_html,$phone)){
$val=$phone[0];
}
break;
case 'email':
$field_html=$this->filter_html_tags($field_html,'style,script,object');
$field_html=strip_tags($field_html);
if(preg_match('/[\w\-]+\@[\w\-\.]+/i', $field_html,$email)){
$val=$email[0];
}
break;
case 'rule':
$val=$this->field_module_rule(array('reg_rule'=>$field_params['reg_extract_rule']), $field_html);
if(empty($val)){
if(preg_match('/'.$field_params['reg_extract_rule'].'/i', $field_html,$val)){
$val=$val[0];
}
}
break;
case 'xpath':
$val=$this->field_module_xpath(array('xpath'=>$field_params['extract_xpath'],'xpath_attr'=>$field_params['extract_xpath_attr'],'xpath_attr_custom'=>$field_params['extract_xpath_attr_custom']), $field_html);
break;
case 'json':
$val=$this->field_module_json(array('json'=>$field_params['extract_json'],'json_arr'=>$field_params['extract_json_arr'],'json_arr_implode'=>$field_params['extract_json_arr_implode']), $field_html);
break;
}
return $val;
}
/*数据处理方法*/
public function process_f_html($fieldVal,$params){
$htmlAllow=array_filter(explode(',',$params['html_allow']));
$htmlFilter=array_filter(explode(',',$params['html_filter']));
if(!empty($htmlAllow)){
$htmlAllowStr='';
foreach ($htmlAllow as $v){
$htmlAllowStr.='<'.$v.'>';
}
$fieldVal=strip_tags($fieldVal,$htmlAllowStr);
}
if(!empty($htmlFilter)){
if(in_array('all', $htmlFilter)){
$fieldVal=$this->filter_html_tags($fieldVal, array('style','script','object'));
$fieldVal=strip_tags($fieldVal);
}else{
$fieldVal=$this->filter_html_tags($fieldVal, $htmlFilter);
}
}
return $fieldVal;
}
public function process_f_replace($fieldVal,$params){
return preg_replace('/'.$params['replace_from'].'/i',$params['replace_to'], $fieldVal);
}
public function process_f_tool($fieldVal,$params){
if(in_array('format', $params['tool_list'])){
$fieldVal=$this->filter_html_tags($fieldVal,array('style','script'));
$fieldVal=preg_replace('/\b(id|class|style|width|height|align)\s*=\s*([\'\"])[^\<\>\'\"]+?\\2(?=\s|$|\/|>)/i', ' ', $fieldVal);
}
if(in_array('trim', $params['tool_list'])){
$fieldVal=trim($fieldVal);
}
if(in_array('is_img', $params['tool_list'])){
if(!empty($GLOBALS['config']['caiji']['download_img'])){
$fieldVal=preg_replace('/(\bhttp[s]{0,1}\:\/\/[^\s]+)/i','{[img]}'."$1".'{[/img]}',$fieldVal);
}
}
return $fieldVal;
}
public function process_f_translate($fieldVal,$params){
if(!empty($GLOBALS['config']['translate'])&&!empty($GLOBALS['config']['translate']['open'])){
$fieldVal=\util\Translator::translate($fieldVal, $params['translate_from'], $params['translate_to']);
}
return $fieldVal;
}
public function process_f_batch($fieldVal,$params){
static $batch_list=array();
if(!empty($params['batch_list'])){
$listMd5=md5($params['batch_list']);
if(!isset($batch_list[$listMd5])){
if(preg_match_all('/([^\r\n]+?)\=([^\r\n]+)/', $params['batch_list'],$mlist)){
$batch_re=$mlist[1];
$batch_to=$mlist[2];
$batch_list[$listMd5]=array($batch_re,$batch_to);
}
}else{
$batch_re=$batch_list[$listMd5][0];
$batch_to=$batch_list[$listMd5][1];
}
$batch_re=is_array($batch_re)?$batch_re:null;
$batch_to=is_array($batch_to)?$batch_to:null;
if(!empty($batch_re)&&count($batch_re)==count($batch_to)){
$fieldVal=str_replace($batch_re, $batch_to, $fieldVal);
}
}
return $fieldVal;
}
public function process_f_substr($fieldVal,$params){
$params['substr_len']=intval($params['substr_len']);
if($params['substr_len']>0){
if(mb_strlen($fieldVal,'utf-8')>$params['substr_len']){
$fieldVal=mb_substr($fieldVal,0,$params['substr_len'],'utf-8').$params['substr_end'];
}
}
return $fieldVal;
}
public function process_f_func($fieldVal,$params){
if(!empty($params['func_name'])){
if(!function_exists($params['func_name'])){
$this->error('数据处理》无效的函数:'.$params['func_name']);
}else{
if(array_key_exists($params['func_name'], config('allow_process_func'))||array_key_exists($params['func_name'], config('EXTEND_PROCESS_FUNC'))){
static $func_param_list=array();
$funcParam=null;
if(empty($params['func_param'])){
$funcParam=array($fieldVal);
}else{
$fparamMd5=md5($params['func_param']);
if(!isset($func_param_list[$fparamMd5])){
if(preg_match_all('/[^\r\n]+/', $params['func_param'],$mfuncParam)){
$func_param_list[$fparamMd5]=$mfuncParam[0];
}
}
$funcParam=$func_param_list[$fparamMd5];
foreach ($funcParam as $k=>$v){
$funcParam[$k]=str_replace('###', $fieldVal, $v);
}
}
if(!empty($funcParam)&&is_array($funcParam)){
try {
$fieldVal=call_user_func_array($params['func_name'], $funcParam);
}catch (\Exception $ex){
}
}
}else{
$this->error('数据处理》未配置函数:'.$params['func_name']);
}
}
}
return $fieldVal;
}
public function process_f_filter($fieldVal,$params,$curUrlMd5,$loopIndex,$contUrlMd5){
static $key_list=array();
if(!empty($params['filter_list'])){
$listMd5=md5($params['filter_list']);
if(!isset($key_list[$listMd5])){
$filterList=explode("\r\n", $params['filter_list']);
$filterList=array_filter($filterList);
$key_list[$listMd5]=$filterList;
}else{
$filterList=$key_list[$listMd5];
}
$filterList=is_array($filterList)?$filterList:array();
if(!empty($params['filter_pass'])){
if($params['filter_pass']=='1'){
foreach ($filterList as $filterStr){
if(stripos($fieldVal,$filterStr)!==false){
$fieldVal='';
break;
}
}
}elseif($params['filter_pass']=='2'){
foreach ($filterList as $filterStr){
if(stripos($fieldVal,$filterStr)!==false){
if(!isset($this->exclude_cont_urls[$contUrlMd5])){
$this->exclude_cont_urls[$contUrlMd5]=array();
}
if(empty($this->first_loop_field)){
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5]='filter:'.$filterStr;
}else{
if(!isset($this->exclude_cont_urls[$contUrlMd5][$curUrlMd5])){
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5]=array();
}
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5][$loopIndex]='filter:'.$filterStr;
}
break;
}
}
}elseif($params['filter_pass']=='3'){
$hasKey=false;
foreach ($filterList as $filterStr){
if(stripos($fieldVal,$filterStr)!==false){
$hasKey=true;
break;
}
}
if(!$hasKey){
$fieldVal='';
}
}elseif($params['filter_pass']=='4'){
$hasKey=false;
foreach ($filterList as $filterStr){
if(stripos($fieldVal,$filterStr)!==false){
$hasKey=true;
break;
}
}
if(!$hasKey){
if(!isset($this->exclude_cont_urls[$contUrlMd5])){
$this->exclude_cont_urls[$contUrlMd5]=array();
}
if(empty($this->first_loop_field)){
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5]='filter:';
}else{
if(!isset($this->exclude_cont_urls[$contUrlMd5][$curUrlMd5])){
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5]=array();
}
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5][$loopIndex]='filter:';
}
}
}
}else{
$fieldVal=str_ireplace($filterList, $params['filter_replace'], $fieldVal);
}
}
return $fieldVal;
}
public function process_f_if($fieldVal,$params,$curUrlMd5,$loopIndex,$contUrlMd5){
static $func_list=array();
if(is_array($params['if_logic'])&&!empty($params['if_logic'])){
$resultOr=array();
$resultAnd=array();
foreach($params['if_logic'] as $ifk=>$iflv){
if(empty($iflv)||empty($params['if_cond'][$ifk])){
continue;
}
$ifVal=$params['if_val'][$ifk];
$ifCond=$params['if_cond'][$ifk];
$result=false;
switch($ifCond){
case 'regexp':
if(preg_match('/'.$ifVal.'/', $fieldVal)){
$result=true;
}
break;
case 'func':
if(!empty($ifVal)){
$funcMd5=md5($ifVal);
if(!isset($func_list[$funcMd5])){
if(preg_match_all('/[^\r\n]+/',$ifVal,$funcParam)){
$funcParam=$funcParam[0];
}else{
$funcParam=array($ifVal);
}
$func_list[$funcMd5]=$funcParam;
}else{
$funcParam=$func_list[$funcMd5];
}
$funcName=$funcParam[0];
$isTurn=false;
if(strpos($funcName,'!')===0){
$funcName=substr($funcName, 1);
$isTurn=true;
}
unset($funcParam[0]);
if(empty($funcParam)){
$funcParam=array($fieldVal);
}else{
foreach($funcParam as $k=>$v){
$funcParam[$k]=str_replace('###', $fieldVal, $v);
}
}
if(!function_exists($funcName)){
$this->error('数据处理》条件判断》无效的函数:'.$funcName);
}else{
if(array_key_exists($funcName, config('allow_process_if'))||array_key_exists($funcName, config('EXTEND_PROCESS_IF'))){
try {
$result=call_user_func_array($funcName, $funcParam);
if($isTurn){
$result=$result?false:true;
}
}catch (\Exception $ex){
$this->error('数据处理》条件判断》函数'.$funcName.'运行错误,'.$ex->getMessage());
}
}else{
$this->error('数据处理》条件判断》未配置函数:'.$funcName);
}
}
}
break;
case 'has':$result=stripos($fieldVal,$ifVal)!==false?true:false;break;
case 'nhas':$result=stripos($fieldVal,$ifVal)===false?true:false;break;
case 'eq':$result=$fieldVal==$ifVal?true:false;break;
case 'neq':$result=$fieldVal!=$ifVal?true:false;break;
case 'heq':$result=$fieldVal===$ifVal?true:false;break;
case 'nheq':$result=$fieldVal!==$ifVal?true:false;break;
case 'gt':$result=$fieldVal>$ifVal?true:false;break;
case 'egt':$result=$fieldVal>=$ifVal?true:false;break;
case 'lt':$result=$fieldVal<$ifVal?true:false;break;
case 'elt':$result=$fieldVal<=$ifVal?true:false;break;
case 'time_eq':
case 'time_egt':
case 'time_elt':
$fieldTime=is_numeric($fieldVal)?$fieldVal:strtotime($fieldVal);
$valTime=is_numeric($ifVal)?$ifVal:strtotime($ifVal);
if($ifCond=='time_eq'){
$result=$fieldTime==$valTime?true:false;
}elseif($ifCond=='time_egt'){
$result=$fieldTime>=$valTime?true:false;
}elseif($ifCond=='time_elt'){
$result=$fieldTime<=$valTime?true:false;
}
break;
}
if('or'==$iflv){
if(!empty($resultAnd)){
$resultOr[]=$resultAnd;
}
$resultAnd=array();
$resultOr[]=$result;
}elseif('and'==$iflv){
$resultAnd[]=$result;
}
}
if(!empty($resultAnd)){
$resultOr[]=$resultAnd;
}
if(is_array($resultOr)&&!empty($resultOr)){
$isTrue=false;
foreach ($resultOr as $results){
if(is_array($results)){
$andResult=true;
foreach ($results as $result){
if(!$result){
$andResult=false;
break;
}
}
$results=$andResult;
}
if($results){
$isTrue=true;
break;
}
}
$exclude='';
switch ($params['if_type']){
case '1':$exclude=$isTrue?'':'if:1';break;
case '2':$exclude=$isTrue?'if:2':'';break;
case '3':$exclude=!$isTrue?'':'if:3';break;
case '4':$exclude=!$isTrue?'if:4':'';break;
}
if($exclude){
if(!isset($this->exclude_cont_urls[$contUrlMd5])){
$this->exclude_cont_urls[$contUrlMd5]=array();
}
if(empty($this->first_loop_field)){
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5]=$exclude;
}else{
if(!isset($this->exclude_cont_urls[$contUrlMd5][$curUrlMd5])){
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5]=array();
}
$this->exclude_cont_urls[$contUrlMd5][$curUrlMd5][$loopIndex]=$exclude;
}
}
}
}
return $fieldVal;
}
/*数据处理*/
public function process_field($fieldVal,$process,$curUrlMd5,$loopIndex,$contUrlMd5){
if(empty($process)){
return $fieldVal;
}
static $funcs=array('filter','if');
foreach ($process as $params){
if(empty($this->first_loop_field)){
if(isset($this->exclude_cont_urls[$contUrlMd5][$curUrlMd5])){
return $fieldVal;
}
}else{
if(isset($this->exclude_cont_urls[$contUrlMd5][$curUrlMd5][$loopIndex])){
return $fieldVal;
}
}
$funcName='process_f_'.$params['module'];
if(method_exists($this, $funcName)){
if(in_array($params['module'],$funcs)){
$fieldVal=$this->$funcName($fieldVal,$params,$curUrlMd5,$loopIndex,$contUrlMd5);
}else{
$fieldVal=$this->$funcName($fieldVal,$params);
}
}
}
return $fieldVal;
}
/**
* 拼接默认设置
* @param unknown $reg 规则
* @param unknown $merge 拼接字符串
*/
public function set_merge_default($reg,$merge){
if(empty($merge)){
$merge='';
if(!empty($reg)){
if(preg_match_all('/\<match(?P<num>\d*)\>/i', $reg,$match_signs)){
foreach ($match_signs['num'] as $snum){
$merge.=cp_sign('match',$snum);
}
}
}
}
return $merge;
}
/**
* 转换起始网址
* @param string $url
* @return multitype:mixed |unknown
*/
public function convert_source_url($url){
$urls=array();
if(preg_match('/\{param\:(?P<type>[a-z]+)\,(?P<val>.*?)\}/i', $url,$match)){
$fmtUrl=preg_replace('/\{param\:.*?\}/i', '__set:param__', $url);
$type=strtolower($match['type']);
$val=explode("\t", $match['val']);
if($type=='num'){
$num_start = intval($val[0]);
$num_end = intval($val[1]);
$num_end = max ($num_start,$num_end);
$num_inc = max ( 1, intval($val[2]));
$num_desc =$val[3]?1:0;
if($num_desc){
for($i=$num_end;$i>=$num_start;$i--){
$urls[]=str_replace('__set:param__', $num_start+($i-$num_start)*$num_inc, $fmtUrl);
}
}else{
for($i=$num_start;$i<=$num_end;$i++){
$urls[]=str_replace('__set:param__', $num_start+($i-$num_start)*$num_inc, $fmtUrl);
}
}
}elseif($type=='letter'){
$letter_start=ord($val[0]);
$letter_end=ord($val[1]);
$letter_end=max($letter_start,$letter_end);
$letter_desc=$val[2]?1:0;
if($letter_desc){
for($i=$letter_end;$i>=$letter_start;$i--) {
$urls[]=str_replace('__set:param__', chr($i), $fmtUrl);
}
}else{
for($i=$letter_start;$i<=$letter_end;$i++) {
$urls[]=str_replace('__set:param__', chr($i), $fmtUrl);
}
}
}elseif($type=='custom'){
foreach ($val as $v){
$urls[]=str_replace('__set:param__', $v, $fmtUrl);
}
}
return $urls;
}if(preg_match('/\{json\:([^\}]*)\}/i',$url,$match)){
$url=preg_replace('/\{json\:([^\}]*)\}/i','',$url);
$jsonRule=trim($match[1]);
if(is_null($jsonRule)||$jsonRule==''){
$jsonRule='*';
}
$jsonData=$this->get_html($url);
if(!empty($jsonData)){
$urls=$this->rule_module_json_data(array('json'=>$jsonRule,'json_arr'=>'_original_'),$jsonData);
if(empty($urls)){
$urls=array();
}
if(!is_array($urls)){
$urls=array($urls);
}
foreach ($urls as $k=>$v){
if(!is_string($v)||!preg_match('/^\w+\:\/\//i', $v)){
unset($urls[$k]);
}
}
if(!empty($urls)&&is_array($urls)){
$urls=array_unique($urls);
$urls=array_values($urls);
}
return $urls;
}
}elseif(preg_match('/[\r\n]/', $url)){
if(preg_match_all('/^\w+\:\/\/[^\r\n]+/im',$url,$urls)){
$urls=array_unique($urls[0]);
$urls=array_values($urls);
}
return $urls;
}else{
return $url;
}
}
/*排除内容网址的提示信息*/
public function exclude_url_msg($val){
$val=explode(':', $val);
$type='';
if(is_array($val)){
$type=$val[0];
$val=$val[1];
}else{
$type=$val;
$val='';
}
$msg='排除网址';
if($type=='filter'){
if(empty($val)){
$msg='关键词过滤';
}else{
$msg='关键词过滤:'.$val;
}
}elseif($type=='if'){
$msg='条件';
switch ($val){
case '1':$msg.='假';break;
case '2':$msg.='真';break;
case '3':$msg.='假';break;
case '4':$msg.='真';break;
}
if(lang('?p_m_if_'.$val)){
$msg.=''.lang('p_m_if_'.$val);
}
}
return $msg;
}
/*转换(*)通配符*/
public function convert_sign_wildcard($str){
return str_replace(lang('sign_wildcard'), '[\s\S]*?', $str);
}
/*转换[参数]*/
public function convert_sign_match($str){
$str=preg_replace('/\(\?<(content|match)/i', '(?P<match', $str);
$sign_match=$this->sign_addslashes(cp_sign('match','(?P<num>\d*)'));
$str=preg_replace_callback('/(\={0,1})(\s*)([\'\"]{0,1})'.$sign_match.'\3/', function($matches){
$ruleStr=$matches[1].$matches[2].$matches[3].'(?P<match'.$matches['num'].'>';
if(!empty($matches[1])&&!empty($matches[3])){
$ruleStr.='[^\<\>]*?)';
}else{
$ruleStr.='[\s\S]*?)';
}
$ruleStr.=$matches[3];
return $ruleStr;
}, $str);
return $str;
}
public function sign_addslashes($str){
$str=str_replace(array('[',']'), array('\[','\]'), $str);
return $str;
}
/*过滤html标签*/
public function filter_html_tags($content,$tags){
$tags=$this->clear_tags($tags);
$arr1=$arr2=array();
foreach ($tags as $tag){
$tag=strtolower($tag);
if($tag=='script'||$tag=='style'||$tag=='object'){
$arr1[$tag]=$tag;
}else{
$arr2[$tag]=$tag;
}
}
if($arr1){
$content=preg_replace('/<('.implode('|', $arr1).')[^<>]*>[\s\S]*?<\/\1>/i', '', $content);
}
if($arr2){
$content=preg_replace('/<[\/]*('.implode('|', $arr2).')[^<>]*>/i', '', $content);
}
return $content;
}
/*过滤标签*/
public function clear_tags($tags){
if(!is_array($tags)){
$tags = preg_replace('/[\s\,\x{ff0c}]+/u', ',', $tags);
$tags=explode(',', $tags);
}
if(!empty($tags)&&is_array($tags)){
$tags=array_filter($tags);
$tags=array_unique($tags);
$tags=array_values($tags);
}else{
$tags=array();
}
return $tags;
}
/*获取源码*/
public function get_html($url,$open_cache=false,$is_post=false){
if($open_cache&&!empty($this->html_cache_list[$url])){
return $this->html_cache_list[$url];
}
$pageRenderTool=null;
if($this->config['page_render']){
$pageRenderTool=$GLOBALS['config']['page_render']['tool'];
if(empty($pageRenderTool)){
$this->error('页面渲染未设置,请检查<a href="'.url('Setting/page_render').'" target="_blank">渲染设置</a>','Setting/page_render');
return null;
}
}
$html=null;
$headers=array();
$options=array();
if($this->config['request_headers']['open']){
if(!empty($this->config['request_headers']['useragent'])){
$options['useragent']=$this->config['request_headers']['useragent'];
}
if(!empty($this->config['request_headers']['cookie'])){
$headers['cookie']=$this->config['request_headers']['cookie'];
}
if(!empty($this->config['request_headers']['referer'])){
$headers['referer']=$this->config['request_headers']['referer'];
}
if(!empty($this->config['request_headers']['custom_names'])){
foreach ($this->config['request_headers']['custom_names'] as $k=>$v){
if(!empty($v)){
$headers[$v]=$this->config['request_headers']['custom_vals'][$k];
}
}
}
}
$mproxy=model('Proxyip');
$proxy_ip=null;
if(!empty($GLOBALS['config']['proxy']['open'])){
$proxy_ip=$mproxy->get_usable_ip();
$proxyIp=$mproxy->to_proxy_ip($proxy_ip);
if(!empty($proxyIp)){
$options['proxy']=$proxyIp;
}
}
$urlPost=null;
if($is_post){
$urlPost=strpos($url, '?');
if($urlPost!==false){
$urlPost=substr($url, $urlPost+1);
$url=preg_replace('/\?.*$/', '', $url);
}else{
$urlPost='';
}
}
if($pageRenderTool){
if(!empty($options['useragent'])){
$headers['user-agent']=$options['useragent'];
unset($options['useragent']);
}
if(!empty($options['proxy'])){
$options['proxy']=$proxy_ip;
}
if($pageRenderTool=='chrome'){
$chromeConfig=$GLOBALS['config']['page_render']['chrome'];
try {
$chromeSocket=new \util\ChromeSocket($chromeConfig['host'],$chromeConfig['port'],$GLOBALS['config']['page_render']['timeout'],$chromeConfig['filename']);
$chromeSocket->newTab();
$chromeSocket->websocket(null);
if($is_post){
$html=$chromeSocket->getRenderHtml($url,$headers,$options,$this->config['charset'],$urlPost);
}else{
$html=$chromeSocket->getRenderHtml($url,$headers,$options);
}
}catch (\Exception $ex){
$this->error('页面渲染失败,请检查<a href="'.url('Setting/page_render').'" target="_blank">渲染设置</a>','Setting/page_render');
return null;
}
}else{
$this->error('渲染工具不可用,请检查<a href="'.url('Setting/page_render').'" target="_blank">渲染设置</a>','Setting/page_render');
return null;
}
}else{
if($is_post){
$html=get_html($url,$headers,$options,$this->config['charset'],$urlPost);
}else{
$html=get_html($url,$headers,$options,$this->config['charset']);
}
}
if($html==null){
if(!empty($proxy_ip)){
$mproxy->set_ip_failed($proxy_ip);
}
return null;
}
if($this->config['url_complete']){
$base_url=$this->match_base_url($url, $html);
$domain_url=$this->match_domain_url($url, $html);
$html=preg_replace_callback('/(?<=\bhref\=[\'\"])([^\'\"]*)(?=[\'\"])/i',function($matche) use ($base_url,$domain_url){
return \skycaiji\admin\event\Cpattern::create_complete_url($matche[1], $base_url, $domain_url);
},$html);
$html=preg_replace_callback('/(?<=\bsrc\=[\'\"])([^\'\"]*)(?=[\'\"])/i',function($matche) use ($base_url,$domain_url){
return \skycaiji\admin\event\Cpattern::create_complete_url($matche[1], $base_url, $domain_url);
},$html);
}
if($open_cache){
$this->html_cache_list[$url]=$html;
}
return $html;
}
}
?>