used_cont_urls)){ $usedContUrls=array_keys($this->used_cont_urls); if(!empty($usedContUrls)&&is_array($usedContUrls)){ $total=count($usedContUrls); $limit=800; $batch=ceil($total/$limit); for($i=1;$i<=$batch;$i++){ $list=array_slice($usedContUrls,($i-1)*$limit,$limit); if(!empty($list)){ CacheModel::getInstance('cont_url')->db()->where('cname','in',$list)->delete(); } } } } }

    /**
     * Normalise the config array posted from the settings page.
     *
     * - casts the numeric switches to int;
     * - drops custom request headers and POST parameters whose name is empty and
     *   re-indexes the remaining name/value lists;
     * - keeps only source URLs that start with a scheme ("xxx://"); a multi-line
     *   entry is reduced to its unique matching lines joined with "\r\n";
     * - decodes the rule lists with url_b64decode() + json_decode().
     *
     * @param array $config raw config from the form
     * @return array normalised config
     */
    public function setConfig($config){
        // Remove entries whose name is empty from a pair of parallel name/value lists.
        $prunePairs=function($names,$vals){
            $names=is_array($names)?$names:array();
            $vals=is_array($vals)?$vals:array();
            foreach ($names as $k=>$v){
                if(empty($v)){
                    unset($names[$k]);
                    unset($vals[$k]);
                }
            }
            return array(array_values($names),array_values($vals));
        };
        // Each list item arrives as an encoded JSON string.
        $decode=function($encoded){
            return json_decode(url_b64decode($encoded),true);
        };

        foreach (array('url_complete','url_reverse','page_render','url_repeat') as $switch){
            $config[$switch]=intval(isset($config[$switch])?$config[$switch]:0);
        }

        if(!empty($config['request_headers'])){
            $pairs=$prunePairs(
                isset($config['request_headers']['custom_names'])?$config['request_headers']['custom_names']:null,
                isset($config['request_headers']['custom_vals'])?$config['request_headers']['custom_vals']:null
            );
            $config['request_headers']['custom_names']=$pairs[0];
            $config['request_headers']['custom_vals']=$pairs[1];
        }

        // A missing or non-array source_url now yields an empty list instead of
        // iterating / array_unique()-ing a non-array.
        $sourceUrls=array();
        if(isset($config['source_url'])&&is_array($config['source_url'])){
            foreach ($config['source_url'] as $v){
                if(preg_match('/[\r\n]/', $v)){
                    if(preg_match_all('/^\w+\:\/\/[^\r\n]+/im',$v,$v_urls)){
                        $sourceUrls[]=implode("\r\n", array_values(array_unique($v_urls[0])));
                    }
                }elseif(preg_match('/^\w+\:\/\/.+/i', $v)){
                    $sourceUrls[]=$v;
                }
            }
        }
        $config['source_url']=array_values(array_filter(array_unique($sourceUrls)));

        if(!empty($config['field_list'])&&is_array($config['field_list'])){
            foreach ($config['field_list'] as $k=>$v){
                $config['field_list'][$k]=$decode($v);
            }
        }
        if(!empty($config['field_process'])&&is_array($config['field_process'])){
            foreach ($config['field_process'] as $k=>$v){
                $config['field_process'][$k]=$this->setProcess($decode($v));
            }
        }

        // The common process list is read from the request itself, not from $config.
        $config['common_process']=$this->setProcess(input('process/a',null,'trim'));

        foreach (array('paging_fields','level_urls','relation_urls') as $listKey){
            if(!empty($config[$listKey])&&is_array($config[$listKey])){
                foreach ($config[$listKey] as $k=>$v){
                    $config[$listKey][$k]=$decode($v);
                }
            }
        }

        $config['url_post']=intval(isset($config['url_post'])?$config['url_post']:0);
        if(!empty($config['url_posts'])){
            $pairs=$prunePairs(
                isset($config['url_posts']['names'])?$config['url_posts']['names']:null,
                isset($config['url_posts']['vals'])?$config['url_posts']['vals']:null
            );
            $config['url_posts']['names']=$pairs[0];
            $config['url_posts']['vals']=$pairs[1];
        }
        return $config;
    }

    public function init($collData){ $collData['config']=unserialize($collData['config']); $this->collector=$collData; $releData=model('Release')->where(array('task_id'=>$collData['task_id']))->find(); if(!empty($releData)){ $releData=$releData->toArray(); } $this->release=$releData; $keyConfig='collector_config_'.$collData['id']; $cacheConfig=cache($keyConfig); if(empty($cacheConfig)||$cacheConfig['update_time']!=$collData['uptime']){ $config=$this->initConfig($collData['config']); 
cache($keyConfig,array('update_time'=>$collData['uptime'],'config'=>$config)); }else{ $config=$cacheConfig['config']; } $this->config=$config; } public function initConfig($config){ $newConfig=array(); $newConfig['charset'] = $config['charset']=='custom' ? $config ['charset_custom'] : $config ['charset']; $newConfig['charset']= empty($newConfig['charset'])?'auto':$newConfig['charset']; if(!empty($config['area'])){ if(empty($config['area_module'])){ $newConfig['reg_source_cont']=$this->convert_sign_match($config['area']); } }elseif(!empty($config['area_start'])||!empty($config['area_end'])) { $newConfig['reg_source_cont'] = $config['area_start'] . (!empty($config['area_end']) ? '(?P[\s\S]+?)' : '(?P[\s\S]+)') . $config['area_end']; } if(!empty($newConfig['reg_source_cont'])){ $newConfig['reg_source_cont'] = str_replace ( '(*)', '[\s\S]*?', $newConfig['reg_source_cont'] ); $newConfig['reg_source_cont'] = preg_replace ( '/\\\*([\'\/])/', "\\\\$1", $newConfig['reg_source_cont'] ); }elseif(!empty($config['area_module'])){ $newConfig['reg_source_cont']=$config['area']; } if(empty($config['url_rule_module'])){ if(!empty($config['url_rule'])){ $newConfig['reg_source_cont_url']=$this->convert_sign_match($config['url_rule']); $newConfig['reg_source_cont_url']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $newConfig['reg_source_cont_url']); $newConfig['reg_source_cont_url']=str_replace ( '(*)', '[\s\S]*?', $newConfig['reg_source_cont_url'] ); }else{ $newConfig['reg_source_cont_url']='\bhref=[\'\"](?P[^\'\"\<\>]+?)[\'\"]'; } $config['url_merge']=$this->set_merge_default($newConfig['reg_source_cont_url'], $config['url_merge']); }elseif('xpath'==$config['url_rule_module']){ if(!empty($config['url_rule'])){ $newConfig['reg_source_cont_url']=$config['url_rule']; }else{ $newConfig['reg_source_cont_url']='//a'; } $config['url_merge']=$this->set_merge_default('(?P.+)', $config['url_merge']); }elseif('json'==$config['url_rule_module']){ 
$newConfig['reg_source_cont_url']=$config['url_rule']; $config['url_merge']=$this->set_merge_default('(?P.+)', $config['url_merge']); } if(!empty($config['url_must'])){ $newConfig['url_must']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $config['url_must']); $newConfig['url_must']=str_replace('(*)', '[\s\S]*?', $newConfig['url_must']); } if(!empty($config['url_ban'])){ $newConfig['url_ban']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $config['url_ban']); $newConfig['url_ban']=str_replace('(*)', '[\s\S]*?', $newConfig['url_ban']); } if(!empty($config['level_urls'])){ $config['new_level_urls']=array(); foreach ($config['level_urls'] as $luk=>$luv){ if(!empty($luv['area'])){ if(empty($luv['area_module'])){ $luv['reg_area']=$this->convert_sign_match($luv['area']); $luv['reg_area']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $luv['reg_area']); $luv['reg_area']=str_replace('(*)', '[\s\S]*?', $luv['reg_area']); }else{ $luv['reg_area']=$luv['area']; } $luv['reg_area_module']=$luv['area_module']; } if(empty($luv['url_rule_module'])){ if(!empty($luv['url_rule'])){ $luv['reg_url']=$this->convert_sign_match($luv['url_rule']); $luv['reg_url']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $luv['reg_url']); $luv['reg_url']=str_replace ( '(*)', '[\s\S]*?', $luv['reg_url'] ); }else{ $luv['reg_url']='\bhref=[\'\"](?P[^\'\"\<\>]+?)[\'\"]'; } $luv['url_merge']=$this->set_merge_default($luv['reg_url'], $luv['url_merge']); }elseif('xpath'==$luv['url_rule_module']){ if(!empty($luv['url_rule'])){ $luv['reg_url']=$luv['url_rule']; }else{ $luv['reg_url']='//a'; } $luv['url_merge']=$this->set_merge_default('(?P.+)', $luv['url_merge']); }elseif('json'==$luv['url_rule_module']){ $luv['reg_url']=$luv['url_rule']; $luv['url_merge']=$this->set_merge_default('(?P.+)', $luv['url_merge']); } $luv['reg_url_module']=$luv['url_rule_module']; if(!empty($luv['url_must'])){ $luv['url_must']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $luv['url_must']); $luv['url_must']=str_replace('(*)', '[\s\S]*?', $luv['url_must']); } 
if(!empty($luv['url_ban'])){ $luv['url_ban']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $luv['url_ban']); $luv['url_ban']=str_replace('(*)', '[\s\S]*?', $luv['url_ban']); } $config['level_urls'][$luk]=$luv; $config['new_level_urls'][$luv['name']]=$luv; } } $relation_urls=array(); if(!empty($config['relation_urls'])){ foreach ($config['relation_urls'] as $ruv){ if(empty($ruv['url_rule_module'])){ $ruv['reg_url']=$this->convert_sign_match($ruv['url_rule']); $ruv['reg_url']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $ruv['reg_url']); $ruv['reg_url']=str_replace('(*)', '[\s\S]*?', $ruv['reg_url']); $ruv['url_merge']=$this->set_merge_default($ruv['reg_url'], $ruv['url_merge']); }elseif(in_array($ruv['url_rule_module'],array('xpath','json'))){ $ruv['reg_url']=$ruv['url_rule']; $ruv['url_merge']=$this->set_merge_default('(?P.+)', $ruv['url_merge']); } $ruv['reg_url_module']=$ruv['url_rule_module']; $relation_urls[$ruv['name']]=$ruv; } } $relation_depth_urls=array(); foreach ($relation_urls as $ruv){ $rDepth=0; $rFuName=$ruv['page']; if(empty($rFuName)){ $rDepth=0; }else{ $passRelation=false; $rFuPage=$rFuName; do{ if(empty($relation_urls[$rFuPage])){ $passRelation=true; break; } $rFuPage=$relation_urls[$rFuPage]['page']; if($rFuPage==$rFuName){ $passRelation=true; break; } $rDepth++; }while(!empty($rFuPage)); if($passRelation){ continue; } } $relation_depth_urls[$rDepth][$ruv['name']]=$ruv; } ksort($relation_depth_urls); $config['new_relation_urls']=array(); foreach ($relation_depth_urls as $rurls){ if(is_array($rurls)){ $config['new_relation_urls']=array_merge($config['new_relation_urls'],$rurls); } } if(!empty($config['field_list'])){ foreach ($config['field_list'] as $fk=>$fv){ if('rule'==$fv['module']){ $fv['reg_rule']=$this->convert_sign_match($fv['rule']); $fv['reg_rule']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $fv['reg_rule']); $fv['reg_rule']=str_replace('(*)', '[\s\S]*?', $fv['reg_rule']); $fv['rule_merge']=$this->set_merge_default($fv['reg_rule'], 
$fv['rule_merge']); }elseif('extract'==$fv['module']){ if(!empty($fv['extract_rule'])){ $fv['reg_extract_rule']=$this->convert_sign_match($fv['extract_rule']); $fv['reg_extract_rule']=preg_replace('/\\\*([\'\/])/', "\\\\$1",$fv['reg_extract_rule']); $fv['reg_extract_rule']=str_replace('(*)', '[\s\S]*?', $fv['reg_extract_rule']); $fv['extract_rule_merge']=$this->set_merge_default($fv['reg_extract_rule'], ''); } } $config['field_list'][$fk]=$fv; } } if(!empty($config['field_process'])){ foreach ($config['field_process'] as $k=>$v){ $config['field_process'][$k]=$this->initProcess($v); } } if(!empty($config['common_process'])){ $config['common_process']=$this->initProcess($config['common_process']); } if(!empty($config['paging']['area'])){ if(empty($config['paging']['area_module'])){ $config['paging']['reg_area']=$this->convert_sign_match($config['paging']['area']); $config['paging']['reg_area']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $config['paging']['reg_area']); $config['paging']['reg_area']=str_replace('(*)', '[\s\S]*?', $config['paging']['reg_area']); $config['paging']['reg_area_merge']=$this->set_merge_default($config['paging']['reg_area'], ''); }else{ $config['paging']['reg_area']=$config['paging']['area']; } $config['paging']['reg_area_module']=$config['paging']['area_module']; } if(!empty($config['paging']['url_rule'])){ if(empty($config['paging']['url_rule_module'])){ $config['paging']['reg_url']=$this->convert_sign_match($config['paging']['url_rule']); $config['paging']['reg_url']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $config['paging']['reg_url']); $config['paging']['reg_url']=str_replace ( '(*)', '[\s\S]*?', $config['paging']['reg_url'] ); $config['paging']['url_merge']=$this->set_merge_default($config['paging']['reg_url'], $config['paging']['url_merge']); if(empty($config['paging']['url_merge'])){ $config['paging']['url_merge']=cp_sign('match'); } }else{ $config['paging']['reg_url']=$config['paging']['url_rule']; 
$config['paging']['url_merge']=$this->set_merge_default('(?P.+)', $config['paging']['url_merge']); } $config['paging']['reg_url_module']=$config['paging']['url_rule_module']; } if(!empty($config['paging']['url_must'])){ $config['paging']['url_must']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $config['paging']['url_must']); $config['paging']['url_must']=str_replace('(*)', '[\s\S]*?', $config['paging']['url_must']); } if(!empty($config['paging']['url_ban'])){ $config['paging']['url_ban']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $config['paging']['url_ban']); $config['paging']['url_ban']=str_replace('(*)', '[\s\S]*?', $config['paging']['url_ban']); } $module_normal_fields=array(); $module_extract_fields=array(); $module_merge_fields=array(); if(!empty($config['field_list'])){ foreach ($config['field_list'] as $fk=>$fv){ $fieldModule=strtolower($fv['module']); $fieldConfig=array('field'=>$fv,'process'=>$config['field_process'][$fk]); if('extract'==$fieldModule){ $module_extract_fields[$fv['name']]=$fieldConfig; }elseif('merge'==$fieldModule){ $module_merge_fields[$fv['name']]=$fieldConfig; }else{ $module_normal_fields[$fv['name']]=$fieldConfig; } } } $config['new_field_list']=array_merge($module_normal_fields,$module_extract_fields,$module_merge_fields); $new_paging_fields=array( 'normal'=>array(), 'extract'=>array(), 'merge'=>array(), ); if(!empty($config['paging_fields'])){ foreach ($config['paging_fields'] as $pfk=>$pfield){ $pfield['delimiter']=str_replace(array('\r','\n'), array("\r","\n"), $pfield['delimiter']); $config['paging_fields'][$pfk]=$pfield; if(!empty($module_normal_fields[$pfield['field']])){ $new_paging_fields['normal'][$pfield['field']]=$pfield; }elseif(!empty($module_extract_fields[$pfield['field']])){ $new_paging_fields['extract'][$pfield['field']]=$pfield; }elseif(!empty($module_merge_fields[$pfield['field']])){ $new_paging_fields['merge'][$pfield['field']]=$pfield; } } } 
$config['new_paging_fields']=array_merge($new_paging_fields['normal'],$new_paging_fields['extract'],$new_paging_fields['merge']); $config=array_merge($config,$newConfig); return $config; }

    /**
     * Shared routine: fetch a page and extract the URLs found on it.
     *
     * Steps: read the page source with get_html(); if an area rule is set, narrow
     * the source to that area (regex, json or xpath); match URLs with
     * rule_match_urls(); unless completion is switched off, pass each URL through
     * create_complete_url(); apply the must/ban filters; finally apply the
     * collector-level "reverse" and "append post parameters" options.
     *
     * @param string $source_url page to fetch
     * @param array  $config     rule set; keys read here: reg_area, reg_area_module,
     *                           url_must, url_ban (rule_match_urls() reads more)
     * @param bool   $is_level   true when extracting multi-level URLs; only changes the messages
     * @return mixed re-indexed list of URLs, or whatever $this->error() returns on failure
     */
    public function _get_urls($source_url,$config,$is_level=false){
        // From here on $is_level is only a message prefix.
        $is_level=$is_level?'多级':'';

        $html=$this->get_html($source_url);
        if(empty($html)){
            return $this->error($is_level.'页面为空');
        }
        $base_url=$this->match_base_url($source_url, $html);
        $domain_url=$this->match_domain_url($source_url);

        if(!empty($config['reg_area'])){
            $area_module=isset($config['reg_area_module'])?$config['reg_area_module']:'';
            if(empty($area_module)){
                // Regex area: prefer the named group "match", else the whole match.
                if(preg_match('/'.$config['reg_area'].'/i',$html,$source_cont)){
                    $html=isset($source_cont['match'])?$source_cont['match']:$source_cont[0];
                }else{
                    $html='';
                }
            }elseif('json'==$area_module){
                $html=$this->rule_module_json_data(array('json'=>$config['reg_area'],'json_arr'=>'jsonencode'),json_decode($html,true));
            }elseif('xpath'==$area_module){
                $html=$this->rule_module_xpath_data(array('xpath'=>$config['reg_area'],'xpath_attr'=>'outerHtml'),$html);
            }else{
                $html='';
            }
            if(empty($html)){
                return $this->error("未提取到{$is_level}区域内容!");
            }
        }

        $matched_urls=$this->rule_match_urls($config, $html);

        // "url_op" takes precedence over the "url_complete" switch.
        $op_not_complete=false;
        if(isset($this->config['url_op'])){
            $op_not_complete=is_array($this->config['url_op'])&&in_array('not_complete',$this->config['url_op'],true);
        }elseif(isset($this->config['url_complete'])){
            $op_not_complete=!$this->config['url_complete'];
        }

        $cont_urls=array();
        foreach ($matched_urls as $cont_url){
            if(!$op_not_complete){
                $cont_url=$this->create_complete_url($cont_url, $base_url, $domain_url);
            }
            if(!empty($config['url_must'])&&!preg_match('/'.$config['url_must'].'/i', $cont_url)){
                continue;
            }
            if(!empty($config['url_ban'])&&preg_match('/'.$config['url_ban'].'/i', $cont_url)){
                continue;
            }
            // Strict comparison: a space at position 0 must also reject the URL.
            if(!empty($cont_url)&&strpos($cont_url,' ')===false){
                $cont_urls[]=$cont_url;
            }
        }

        if(empty($cont_urls)){
            return $this->error("未获取到".($is_level?$is_level:'内容')."网址!");
        }

        if(!empty($this->config['url_reverse'])){
            $cont_urls=array_reverse($cont_urls);
        }
        if(!empty($this->config['url_post'])){
            $postParams=array();
            if(!empty($this->config['url_posts']['names'])){
                foreach ($this->config['url_posts']['names'] as $k=>$v){
                    if(!empty($v)){
                        $postVal=isset($this->config['url_posts']['vals'][$k])?$this->config['url_posts']['vals'][$k]:'';
                        $postParams[]=$v.'='.rawurlencode($postVal);
                    }
                }
            }
            if(!empty($postParams)){
                // The parameters are appended to each URL as a query string.
                $postParams=implode('&', $postParams);
                foreach ($cont_urls as $k=>$v){
                    $cont_urls[$k]=$v.(strpos($v,'?')===false?'?':'&').$postParams;
                }
            }
        }
        return array_values($cont_urls);
    }

    /**
     * Get the content-page URLs found on a start page, using the collector's
     * top-level area/URL rules.
     *
     * @param string $source_url start page URL
     * @return mixed see _get_urls()
     */
    public function getContUrls($source_url){
        if(empty($source_url)){
            return $this->error('请输入起始网址');
        }
        // rule key expected by _get_urls() => key in $this->config
        $keyMap=array(
            'reg_area'=>'reg_source_cont',
            'reg_area_module'=>'area_module',
            'reg_url'=>'reg_source_cont_url',
            'reg_url_module'=>'url_rule_module',
            'url_merge'=>'url_merge',
            'url_must'=>'url_must',
            'url_ban'=>'url_ban',
        );
        $config=array();
        foreach ($keyMap as $ruleKey=>$configKey){
            $config[$ruleKey]=isset($this->config[$configKey])?$this->config[$configKey]:null;
        }
        return $this->_get_urls($source_url, $config);
    }

    /**
     * Get the URLs of one multi-level page.
     *
     * @param string $parent_url URL of the parent page
     * @param int    $level      1-based level; values below 1 are treated as 1
     * @return mixed see _get_urls()
     */
    public function getLevelUrls($parent_url,$level=1){
        $level=$level>1?$level:1;
        $config=isset($this->config['level_urls'][$level-1])?$this->config['level_urls'][$level-1]:null;
        if(empty($config)){
            return $this->error('没有'.($level).'级网址规则');
        }
        if(empty($config['reg_url'])){
            return $this->error('必须填写多级“提取网址规则”');
        }
        if(empty($parent_url)){
            return $this->error('请输入父级网址');
        }
        return $this->_get_urls($parent_url, $config,true);
    }

    /**
     * Match URLs in a piece of source using one rule set.
     *
     * @param array  $config rule set; reads reg_url, reg_url_module and url_merge
     * @param string $html   source to search
     * @param bool   $whole  whole-match mode: when the regex rule has no named group,
     *                       use the whole match (without it such a rule yields nothing)
     * @return array de-duplicated, re-indexed list of URLs
     */
    public function rule_match_urls($config,$html,$whole=false){
        $cont_urls=array();
        if(!empty($config['reg_url'])&&!empty($config['url_merge'])){
            // Find the match placeholders in the merge template; the named group
            // "num" captures the suffix that selects group "match<num>".
            $sign_match=$this->sign_addslashes(cp_sign('match','(?P<num>\d*)'));
            if(preg_match_all('/'.$sign_match.'/i', $config['url_merge'],$match_signs)){
                $url_merge=true;
                // A template that is exactly the bare placeholder needs no merging.
                $plain_sign=($config['url_merge']==cp_sign('match'));
                $reg_url_module=isset($config['reg_url_module'])?$config['reg_url_module']:'';

                if(empty($reg_url_module)){
                    if(preg_match('/\(\?P/i', $config['reg_url'])){
                        // Rule with named groups: one entry per match set.
                        if(preg_match_all('/'.$config['reg_url'].'/i',$html,$cont_urls,PREG_SET_ORDER)){
                            if($plain_sign){
                                $url_merge=false;
                                foreach ($cont_urls as $k=>$v){
                                    $cont_urls[$k]=isset($v['match'])?$v['match']:'';
                                }
                            }
                        }
                    }elseif($whole){
                        if(preg_match_all('/'.$config['reg_url'].'/i',$html,$cont_urls)){
                            $cont_urls=$cont_urls[0];
                            if($plain_sign){
                                $url_merge=false;
                            }else{
                                foreach ($cont_urls as $k=>$v){
                                    $cont_urls[$k]=array('match'=>$v);
                                }
                            }
                        }
                    }
                }elseif('xpath'==$reg_url_module||'json'==$reg_url_module){
                    if('xpath'==$reg_url_module){
                        $cont_urls=$this->rule_module_xpath_data(array(
                            'xpath'=>$config['reg_url'],
                            'xpath_attr'=>'href',
                            'xpath_multi'=>true,
                            'xpath_multi_type'=>'loop'
                        ),$html);
                        $cont_urls=is_array($cont_urls)?$cont_urls:array();
                    }else{
                        $cont_urls=$this->rule_module_json_data(array('json'=>$config['reg_url'],'json_arr'=>'_original_'),json_decode($html,true));
                        if(empty($cont_urls)){
                            $cont_urls=array();
                        }elseif(!is_array($cont_urls)){
                            $cont_urls=array($cont_urls);
                        }
                    }
                    if($plain_sign){
                        $url_merge=false;
                    }else{
                        foreach ($cont_urls as $k=>$v){
                            $cont_urls[$k]=array('match'=>$v);
                        }
                    }
                }

                if($url_merge&&is_array($cont_urls)){
                    // Substitute every placeholder in the template with its group value.
                    foreach ($cont_urls as $k=>$v){
                        $re_match=array();
                        foreach($match_signs['num'] as $ms_k=>$ms_v){
                            $re_match[$ms_k]=(is_array($v)&&isset($v['match'.$ms_v]))?$v['match'.$ms_v]:'';
                        }
                        $cont_urls[$k]=str_replace($match_signs[0], $re_match, $config['url_merge']);
                    }
                }
            }
        }
        $cont_urls=is_array($cont_urls)?array_unique($cont_urls):array();
        return array_values($cont_urls);
    }

    /* Get paging links */ public function getPagingUrls($from_url,$html,$is_test=false){ $paging_urls=array(); if($this->config['paging']['open']){ if(empty($html)){ $html=$this->get_html($from_url); } if(!empty($this->config['paging']['reg_url'])){ if(!empty($this->config['new_paging_fields'])){ $base_url=$this->match_base_url($from_url, $html); $domain_url=$this->match_domain_url($from_url); $paging_area=''; 
if(!empty($this->config['paging']['reg_area'])){ if(empty($this->config['paging']['reg_area_module'])){ $sign_match=$this->sign_addslashes(cp_sign('match','(?P\d*)')); if(preg_match_all('/'.$sign_match.'/i', $this->config['paging']['reg_area_merge'],$match_signs)){ if(preg_match('/'.$this->config['paging']['reg_area'].'/i',$html,$m_paging_area)){ $re_match=array(); foreach($match_signs['num'] as $ms_k=>$ms_v){ $re_match[$ms_k]=$m_paging_area['match'.$ms_v]; } $paging_area=str_replace($match_signs[0], $re_match, $this->config['paging']['reg_area_merge']); } }else{ if(preg_match('/'.$this->config['paging']['reg_area'].'/i',$html,$m_paging_area)){ $paging_area=$m_paging_area[0]; } } }elseif('json'==$this->config['paging']['reg_area_module']){ $paging_area=$this->rule_module_json_data(array('json'=>$this->config['paging']['reg_area'],'json_arr'=>'jsonencode'),json_decode($html,true)); }elseif('xpath'==$this->config['paging']['reg_area_module']){ $paging_area=$this->rule_module_xpath_data(array('xpath'=>$this->config['paging']['reg_area'],'xpath_attr'=>'outerHtml'),$html); } }else{ $paging_area=$html; } if(!empty($paging_area)){ if(!empty($this->config['paging']['url_complete'])){ $paging_area=preg_replace_callback('/(?<=\bhref\=[\'\"])([^\'\"]*)(?=[\'\"])/i',function($matche_p_a) use ($base_url,$domain_url){ return \skycaiji\admin\event\Cpattern::create_complete_url($matche_p_a[1], $base_url, $domain_url); },$paging_area); } $m_paging_urls=$this->rule_match_urls($this->config['paging'],$paging_area,true); foreach ($m_paging_urls as $purl){ if(!empty($this->config['paging']['url_must'])){ if(!preg_match('/'.$this->config['paging']['url_must'].'/i', $purl)){ continue; } } if(!empty($this->config['paging']['url_ban'])){ if(preg_match('/'.$this->config['paging']['url_ban'].'/i', $purl)){ continue; } } if($from_url==$purl){ continue; } if(strpos($purl,' ')==false){ $paging_urls[]=$purl; } } if(!empty($paging_urls)){ $paging_urls=array_filter($paging_urls); 
$paging_urls=array_unique($paging_urls); $paging_urls=array_values($paging_urls); }else{ if($is_test){ return $this->error('未获取到分页链接,请检查分页链接规则'); } } }else{ if($is_test){ return $this->error('未获取到分页区域,请检查分页区域规则'); } } }else{ if($is_test){ return $this->error('请添加分页内容字段'); } } }else{ if($is_test){ return $this->error('必须填写分页链接规则'); } } }else{ if($is_test){ return $this->error('未开启分页'); } } return $paging_urls; } /*设置字段值*/ public function setField($field_config,$cont_url,$html){ $cont_url_md5=md5($cont_url); $field_process=$field_config['process']; $field_params=$field_config['field']; $module=strtolower($field_params['module']); if(!empty($field_params['source'])&&in_array($module, array('rule','xpath','json','auto'))){ $field_source_url=''; $source_echo_msg='——采集'; if('source_url'==$field_params['source']){ $field_source_url=$this->cur_source_url; $source_echo_msg.='起始页'; }elseif(preg_match('/^relation_url:(.+)$/i', $field_params['source'],$relationName)){ $relationName=$relationName[1]; $field_source_url=$this->getRelationUrl($relationName, $cont_url, $html); $source_echo_msg.="关联页“{$relationName}”"; }elseif(preg_match('/^level_url:(.+)$/i', $field_params['source'],$levelName)){ $levelName=$levelName[1]; if(empty($this->config['new_level_urls'][$levelName])){ return; } if(empty($this->cur_level_urls[$levelName])){ return; } $field_source_url=$this->cur_level_urls[$levelName]; $source_echo_msg.="多级页“{$levelName}”"; } if(empty($field_source_url)){ return; } if($field_source_url!=$cont_url){ $cont_url=$field_source_url; $this->echo_msg($source_echo_msg.":{$field_source_url}",'black'); $html=$this->get_html($field_source_url,true); } } static $fieldArr1=array('words','num','time','list'); static $fieldArr2=array('auto','json'); static $baseUrls=array(); static $domainUrls=array(); $urlMd5=md5($cont_url); if(empty($baseUrls[$urlMd5])){ $baseUrls[$urlMd5]=$this->match_base_url($cont_url, $html); } if(empty($domainUrls[$urlMd5])){ 
$domainUrls[$urlMd5]=$this->match_domain_url($cont_url); } $base_url=$baseUrls[$urlMd5]; $domain_url=$domainUrls[$urlMd5]; $val=''; $field_func='field_module_'.$module; if(method_exists($this, $field_func)){ if('extract'==$module){ if(is_array($this->field_val_list[$field_params['extract']]['values'][$cont_url_md5])){ $val=array(); foreach ($this->field_val_list[$field_params['extract']]['values'][$cont_url_md5] as $k=>$v){ $extract_field_val=array( 'value'=>$v, 'img'=>$this->field_val_list[$field_params['extract']]['imgs'][$cont_url_md5][$k], ); $val[$k]=$this->field_module_extract($field_params, $extract_field_val, $base_url, $domain_url); } }else{ $extract_field_val=array( 'value'=>$this->field_val_list[$field_params['extract']]['values'][$cont_url_md5], 'img'=>$this->field_val_list[$field_params['extract']]['imgs'][$cont_url_md5], ); $val=$this->field_module_extract($field_params, $extract_field_val, $base_url, $domain_url); } }elseif('merge'==$module){ if(empty($this->first_loop_field)){ $cur_field_val_list=array(); foreach ($this->field_val_list as $k=>$v){ $cur_field_val_list[$k]=array( 'value'=>$v['values'][$cont_url_md5], 'img'=>$v['imgs'][$cont_url_md5] ); } $val=$this->field_module_merge($field_params,$cur_field_val_list); }else{ $val=array(); foreach ($this->field_val_list[$this->first_loop_field]['values'][$cont_url_md5] as $v_k=>$v_v){ $cur_field_val_list=array(); foreach ($this->field_val_list as $k=>$v){ $cur_field_val_list[$k]=array( 'value'=>(is_array($v['values'][$cont_url_md5])?$v['values'][$cont_url_md5][$v_k]:$v['values'][$cont_url_md5]), 'img'=>(is_array($v['imgs'][$cont_url_md5][$v_k])?$v['imgs'][$cont_url_md5][$v_k]:$v['imgs'][$cont_url_md5]) ); } $val[$v_k]=$this->field_module_merge($field_params,$cur_field_val_list); } } }elseif(in_array($module,$fieldArr1)){ $val=$this->$field_func($field_params); }elseif(in_array($module,$fieldArr2)){ $val=$this->$field_func($field_params,$html,$cont_url); }else{ 
$val=$this->$field_func($field_params,$html); } } $vals=null; if(is_array($val)){ $is_loop=true; $vals=array_values($val); }else{ $is_loop=false; $vals=array($val); } $field_name=$field_params['name']; if(!isset($this->field_val_list[$field_name])){ $this->field_val_list[$field_name]=array('values'=>array(),'imgs'=>array()); } foreach ($vals as $v_k=>$val){ if(!empty($field_process)){ $val=$this->processField($val,$field_process); } if(!empty($this->config['common_process'])){ $val=$this->processField($val,$this->config['common_process']); } $val=preg_replace_callback('/(?<=\bhref\=[\'\"])([^\'\"]*)(?=[\'\"])/i',function($matche) use ($base_url,$domain_url){ return \skycaiji\admin\event\Cpattern::create_complete_url($matche[1], $base_url, $domain_url); },$val); $val=preg_replace_callback('/(?<=\bsrc\=[\'\"])([^\'\"]*)(?=[\'\"])/i',function($matche) use ($base_url,$domain_url){ return \skycaiji\admin\event\Cpattern::create_complete_url($matche[1], $base_url, $domain_url); },$val); if($is_loop){ if(!isset($this->field_val_list[$field_name]['values'][$cont_url_md5])){ $this->field_val_list[$field_name]['values'][$cont_url_md5]=array(); $this->field_val_list[$field_name]['imgs'][$cont_url_md5]=array(); } $this->field_val_list[$field_name]['values'][$cont_url_md5][$v_k]=$val; }else{ $this->field_val_list[$field_name]['values'][$cont_url_md5]=$val; } if(!empty($GLOBALS['config']['caiji']['download_img'])&&!empty($val)){ $valImgs=array(); if(preg_match_all('/]*\bsrc=[\'\"]*(\w+\:\/\/[^\'\"\s]+)[\'\"]*/i',$val,$imgUrls)){ $valImgs=is_array($imgUrls[1])?$imgUrls[1]:array(); } if('extract'==$module&&'cover'==$field_params['extract_module']){ $valImgs=array_merge($valImgs,array($val)); } $noImgVal=preg_replace_callback('/\{\[img\]\}(http[s]{0,1}\:\/\/[^\s]+?)\{\[\/img\]\}/i',function($matche) use (&$valImgs){ $valImgs[]=$matche[1]; return $matche[1]; },$val); if($noImgVal!=$val){ if($is_loop){ $this->field_val_list[$field_name]['values'][$cont_url_md5][$v_k]=$noImgVal; }else{ 
$this->field_val_list[$field_name]['values'][$cont_url_md5]=$noImgVal; } } if(!empty($valImgs)){ $valImgs=array_unique($valImgs); $valImgs=array_values($valImgs); if($is_loop){ $this->field_val_list[$field_name]['imgs'][$cont_url_md5][$v_k]=$valImgs; }else{ $this->field_val_list[$field_name]['imgs'][$cont_url_md5]=$valImgs; } } } } }

    /**
     * Collect the paging fields from one paging page, then recurse into the
     * first paging link of that page that has not been used yet.
     *
     * Does nothing once the number of used paging URLs for this content page
     * has reached config['paging']['max'] (when that limit is set).
     *
     * @param string $cont_url content page the paging belongs to
     * @param string $page_url paging page to collect; must start with a scheme
     * @return mixed the result of $this->error() on invalid input or a failed
     *               fetch, otherwise nothing
     */
    public function setPagingFields($cont_url,$page_url){
        if(empty($page_url)){
            return $this->error('请输入分页网址');
        }
        if(!preg_match('/^\w+\:\/\//',$page_url)){
            return $this->error($page_url.'网址不完整');
        }
        // Hash only after validation so md5() never receives an empty value.
        $contMd5=md5($cont_url);
        $pageMd5=md5($page_url);

        // No paging URL has been recorded yet on the first call for a content
        // page; count() on the unset entry would fail on PHP 8.
        $usedCount=0;
        if(isset($this->used_paging_urls[$contMd5])&&is_array($this->used_paging_urls[$contMd5])){
            $usedCount=count($this->used_paging_urls[$contMd5]);
        }
        if(!empty($this->config['paging']['max'])&&$usedCount>=$this->config['paging']['max']){
            return;
        }

        $this->set_html_interval();
        $this->echo_msg("——采集分页:{$page_url}",'black');
        $html=$this->get_html($page_url);
        if(empty($html)){
            return $this->error('分页获取失败:'.$page_url);
        }

        if(!isset($this->used_paging_urls[$contMd5][$pageMd5])){
            $this->used_paging_urls[$contMd5][$pageMd5]=$page_url;
            foreach ($this->config['new_paging_fields'] as $v){
                $this->setField($this->config['new_field_list'][$v['field']],$page_url,$html);
            }
        }

        $paging_urls=$this->getPagingUrls($page_url,$html);
        if(empty($paging_urls)){
            return;
        }
        // Follow the first link that is neither used nor the content page itself.
        foreach ($paging_urls as $purl){
            if(!isset($this->used_paging_urls[$contMd5][md5($purl)])&&$cont_url!=$purl){
                if(!empty($purl)){
                    $this->setPagingFields($cont_url,$purl);
                }
                break;
            }
        }
    }

    /**
     * Extract a relation-page URL from source using one relation rule.
     *
     * @param string $html   source to search
     * @param array  $config relation rule; reads reg_url_module, reg_url and url_merge
     * @return mixed the value produced by the regex, json or xpath helper;
     *               an empty string when the module is none of those
     */
    public function relation_match_url($html,$config){
        // Default so an unknown module no longer returns an undefined variable.
        $url='';
        $module=isset($config['reg_url_module'])?$config['reg_url_module']:'';
        if(empty($module)){
            $url=$this->match_rule($html,$config['reg_url'],$config['url_merge']);
        }elseif('json'==$module){
            $url=$this->rule_module_json_data(array('json'=>$config['reg_url'],'json_arr'=>'jsonencode'),json_decode($html,true));
        }elseif('xpath'==$module){
            $url=$this->rule_module_xpath_data(array('xpath'=>$config['reg_url'],'xpath_attr'=>'href'),$html);
        }
        return $url;
    }

    public function match_rule($html,$rule,$merge,$multi=false,$multi_str=''){ 
$val=''; $sign_match=$this->sign_addslashes(cp_sign('match','(?P\d*)')); if(!empty($rule)&&preg_match_all('/'.$sign_match.'/i',$merge,$match_signs)){ $multiStr=''; if(!empty($multi)){ preg_match_all('/'.$rule.'/i',$html,$match_conts,PREG_SET_ORDER); $multiStr=str_replace(array('\r','\n'), array("\r","\n"), $multi_str); }else{ if(preg_match('/'.$rule.'/i', $html,$match_cont)){ $match_conts=array($match_cont); } } $curI=0; foreach ($match_conts as $match_cont){ $curI++; $re_match=array(); foreach($match_signs['num'] as $ms_k=>$ms_v){ $re_match[$ms_k]=$match_cont['match'.$ms_v]; } $val.=($curI<=1?'':$multiStr).str_replace($match_signs[0], $re_match, $merge); } } return $val; } /** * 获取关联页网址 * @param unknown $name 关联页名称 * @param unknown $cont_url 内容页网址 * @param unknown $html 内容页源码 * @return string */ public function getRelationUrl($name,$cont_url,$html){ if(empty($html)){ $html=$this->get_html($cont_url,true); } if(empty($html)){ return ''; } $relation_url=$this->config['new_relation_urls'][$name]; if(empty($relation_url)){ return ''; } $page=$relation_url['page']; $pass=false; $depth_pages=array(); $depth=0; while(!empty($page)){ if($page==$name){ $pass=true; break; } if(empty($this->config['new_relation_urls'][$page])){ $pass=true; break; } $depth++; $depth_pages[$depth]=$page; $page=$this->config['new_relation_urls'][$page]['page']; } if($pass){ return ''; } $relationUrl=$this->relation_match_url($html,$relation_url); $this->relation_url_list[$cont_url][$relation_url['page']]=$relationUrl; if(!empty($depth_pages)){ krsort($depth_pages); foreach ($depth_pages as $page){ if(empty($relationUrl)){ return ''; } if(!isset($this->relation_url_list[$cont_url][$page])){ $relationHtml=$this->get_html($relationUrl,true); if(empty($relationHtml)){ return ''; } $relationUrl=$this->relation_match_url($relationHtml,$this->config['new_relation_urls'][$page]); $this->relation_url_list[$cont_url][$page]=$relationUrl; }else{ $relationUrl=$this->relation_url_list[$cont_url][$page]; } } 
} return $relationUrl; } /*获取内容页字段列表,这里是入口*/ public function getFields($cont_url){ $this->field_val_list=array(); $this->first_loop_field=null; if(empty($cont_url)){ return $this->error('请输入内容页网址'); } if(!preg_match('/^\w+\:\/\//',$cont_url)){ return $this->error($cont_url.'网址不完整'); } $html=$this->get_html($cont_url,false,$this->config['url_post']); if(empty($html)){ return $this->error('抓取页面失败'); } foreach($this->config['new_field_list'] as $field_config){ $this->setField($field_config,$cont_url,$html); } $paging_urls=$this->getPagingUrls($cont_url,$html); if(!empty($paging_urls)){ $this->setPagingFields($cont_url,reset($paging_urls)); } $val_list=array(); if(!empty($this->field_val_list)){ if(empty($this->first_loop_field)){ foreach ($this->field_val_list as $fieldName=>$fieldVal){ $val_values=array_filter($fieldVal['values']); $val_values=implode($this->config['new_paging_fields'][$fieldName]['delimiter'], $val_values); $val_imgs=array(); if(!empty($fieldVal['imgs'])){ foreach ($fieldVal['imgs'] as $v){ if(!empty($v)){ if(is_array($v)){ $val_imgs=array_merge($val_imgs,$v); }else{ $val_imgs[]=$v; } } } if(!empty($val_imgs)){ $val_imgs=array_unique($val_imgs); $val_imgs=array_filter($val_imgs); $val_imgs=array_values($val_imgs); } } $val_list[$fieldName]=array('value'=>$val_values,'img'=>$val_imgs); } }else{ foreach ($this->field_val_list[$this->first_loop_field]['values'] as $page_key=>$page_vals){ if(empty($page_vals)){ continue; } foreach ($page_vals as $loop_index=>$loop_val){ $vals=array(); foreach ($this->field_val_list as $fieldName=>$fieldVals){ if(is_array($fieldVals['values'][$page_key])){ $val_values=$fieldVals['values'][$page_key][$loop_index]; $val_imgs=$fieldVals['imgs'][$page_key][$loop_index]; }else{ $val_values=$fieldVals['values'][$page_key]; $val_imgs=$fieldVals['imgs'][$page_key]; } if(!empty($val_imgs)){ $val_imgs=array_unique($val_imgs); $val_imgs=array_filter($val_imgs); $val_imgs=array_values($val_imgs); } 
$vals[$fieldName]=array('value'=>$val_values,'img'=>$val_imgs);
                        }
                        $val_list[]=$vals;
                    }
                }
            }
        }
        return $val_list?$val_list:array();
    }
    /**
     * Regex rule module; callable directly with the rule parameters in $field_params.
     *
     * Keys read from $field_params: 'reg_rule' (regex body), 'rule_merge' (template containing
     * match placeholders), 'rule_multi' (match all occurrences), 'rule_multi_type'
     * ('loop' returns one entry per match), 'rule_multi_str' (separator when not looping)
     * and 'name' (recorded as the first loop field).
     *
     * @param array $field_params rule parameters
     * @param string $html source to match against (taken by reference, not modified here)
     * @return string|array merged string, or a list of merged strings in loop mode
     */
    public function field_module_rule($field_params,&$html){
        $val='';
        // The placeholder regex must expose the number as the named group "num":
        // $match_signs['num'] is read below, and an unnamed "(?P" is a PCRE compile error.
        $sign_match=$this->sign_addslashes(cp_sign('match','(?P<num>\d*)'));
        // Callers such as field_module_extract() pass only 'reg_rule', so default the template.
        $rule_merge=isset($field_params['rule_merge'])?$field_params['rule_merge']:'';
        if(!empty($field_params['reg_rule'])&&preg_match_all('/'.$sign_match.'/i', $rule_merge,$match_signs)){
            $multiStr='';
            $is_loop=false;
            // Always defined, so a failed single match simply yields no iterations.
            $match_conts=array();
            if(!empty($field_params['rule_multi'])){
                preg_match_all('/'.$field_params['reg_rule'].'/i',$html,$match_conts,PREG_SET_ORDER);
                $is_loop='loop'==$field_params['rule_multi_type']?true:false;
                if($is_loop){
                    if(empty($this->first_loop_field)){
                        $this->first_loop_field=$field_params['name'];
                    }
                    $val=array();
                }else{
                    // Turn the literal sequences \r and \n into real CR / LF characters.
                    $multiStr=str_replace(array('\r','\n'), array("\r","\n"), $field_params['rule_multi_str']);
                }
            }else{
                if(preg_match('/'.$field_params['reg_rule'].'/i', $html,$match_cont)){
                    $match_conts=array($match_cont);
                }
            }
            $curI=0;
            if(is_array($match_conts)){
                foreach ($match_conts as $match_cont){
                    $curI++;
                    $re_match=array();
                    foreach($match_signs['num'] as $ms_k=>$ms_v){
                        $re_match[$ms_k]=$match_cont['match'.$ms_v];
                    }
                    $contVal=str_replace($match_signs[0], $re_match, $rule_merge);
                    if($is_loop){
                        $val[]=$contVal;
                    }else{
                        // Matches after the first are prefixed with the separator.
                        $val.=($curI<=1?'':$multiStr).$contVal;
                    }
                }
            }
        }
        return $val;
    }
    /**
     * XPath rule module; callable directly with the rule parameters in $field_params.
     *
     * Records the field as the first loop field when 'xpath_multi' is set with
     * 'xpath_multi_type' == 'loop', then delegates to rule_module_xpath_data().
     *
     * @param array $field_params rule parameters
     * @param string $html source to query
     * @return string|array see rule_module_xpath_data()
     */
    public function field_module_xpath($field_params,$html){
        if(!empty($field_params['xpath_multi'])){
            if('loop'==$field_params['xpath_multi_type']){
                if(empty($this->first_loop_field)){
                    $this->first_loop_field=$field_params['name'];
                }
            }
        }
        return $this->rule_module_xpath_data($field_params,$html);
    }
    /**
     * Evaluate the XPath rule in $field_params against $html.
     *
     * @param array $field_params keys read: 'xpath', 'xpath_attr', 'xpath_attr_custom',
     *                            'xpath_multi', 'xpath_multi_type', 'xpath_multi_str'
     * @param string $html source to query
     * @return string|array concatenated node values, or a list of values in loop mode
     */
    public function rule_module_xpath_data($field_params,$html){
        $vals='';
        if(!empty($field_params['xpath'])){
            $dom=new \DOMDocument;
            // Parser warnings for malformed markup are deliberately suppressed.
            @$dom->loadHTML(''.$html);
            $dom->normalize();
$xPath = new \DOMXPath($dom);
            $xpath_attr=strtolower($field_params['xpath_attr']);
            $xpath_attr='custom'==$xpath_attr?strtolower($field_params['xpath_attr_custom']):$xpath_attr;
            // "Normal" attributes are read through an @attr step; the three pseudo
            // attributes below are produced by serialising the node instead.
            $normal_attr=true;
            if(in_array($xpath_attr,array('innerhtml','outerhtml','text'))){
                $normal_attr=false;
            }
            $xpath_q=trim($field_params['xpath']);
            if(!empty($xpath_attr)){
                // An explicit attribute setting overrides a trailing /@attr in the expression.
                if(preg_match('/\/\@[\w\-]+$/', $xpath_q)){
                    $xpath_q=preg_replace('/\@[\w\-]+$/', '', $xpath_q);
                }
                if($normal_attr){
                    $xpath_q=$xpath_q.(preg_match('/\/$/', $xpath_q)?'':'/').'@'.$xpath_attr;
                }
            }else{
                // No attribute configured and none in the expression: default to inner HTML.
                if(!preg_match('/\/\@[\w\-]+$/', $xpath_q)){
                    $xpath_attr='innerhtml';
                    $normal_attr=false;
                }
            }
            $nodes = $xPath->query($xpath_q);
            $multiStr='';
            $is_loop=false;
            if(!empty($field_params['xpath_multi'])){
                $is_loop='loop'==$field_params['xpath_multi_type']?true:false;
                if($is_loop){
                    $vals=array();
                }else{
                    // Turn the literal sequences \r and \n into real CR / LF characters.
                    $multiStr=str_replace(array('\r','\n'), array("\r","\n"), $field_params['xpath_multi_str']);
                }
            }
            $curI=0;
            // DOMXPath::query() returns false for a malformed expression; iterate only over a
            // real node list so a bad user-supplied XPath yields an empty result, not a warning.
            if($nodes instanceof \DOMNodeList){
                foreach ($nodes as $node){
                    $curI++;
                    $val=($curI<=1?'':$multiStr);
                    if($normal_attr){
                        $val.=$node->nodeValue;
                    }else{
                        switch ($xpath_attr){
                            case 'innerhtml':
                                $nchilds = $node->childNodes;
                                foreach ($nchilds as $nchild){
                                    $val .= $nchild->ownerDocument->saveHTML($nchild);
                                }
                                break;
                            case 'outerhtml':$val.=$node->ownerDocument->saveHTML($node);break;
                            case 'text':
                                // Inner HTML with style/script/object blocks removed and tags stripped.
                                $nchilds = $node->childNodes;
                                foreach ($nchilds as $nchild){
                                    $val .= $nchild->ownerDocument->saveHTML($nchild);
                                }
                                $val=$this->filter_html_tags($val, array('style','script','object'));
                                $val=strip_tags($val);
                                break;
                        }
                    }
                    if($is_loop){
                        $vals[]=$val;
                    }else{
                        $vals.=$val;
                    }
                    // Without the multi option only the first node is used.
                    if(empty($field_params['xpath_multi'])){
                        break;
                    }
                }
            }
        }
        return $vals;
    }
    /**
     * Automatic extraction module: picks a value according to $field_params['auto']
     * ('title', 'content', 'keywords', 'description' or 'url').
     *
     * @param array $field_params module parameters
     * @param string $html page source (taken by reference, not modified here)
     * @param string $cur_url URL of the current page, returned for 'url'
     * @return mixed the extracted value, or '' when 'auto' names no known kind
     */
    public function field_module_auto($field_params,&$html,$cur_url){
        // Defined up front so an unknown 'auto' value no longer returns an undefined variable.
        $val='';
        switch (strtolower($field_params['auto'])){
            case 'title':$val=$this->get_title($html);break;
            case 'content':$val=$this->get_content($html);break;
            case 'keywords':$val=$this->get_keywords($html);break;
            case 'description':$val=$this->get_description($html);break;
            case
'url':$val=$cur_url;break;
        }
        return $val;
    }
    /** Fixed text module: returns $field_params['words'] unchanged. */
    public function field_module_words($field_params){
        return $field_params['words'];
    }
    /** Random number module: returns rand() between the integer values of 'num_start' and 'num_end'. */
    public function field_module_num($field_params){
        $start=intval($field_params['num_start']);
        $end=intval($field_params['num_end']);
        return rand($start, $end);
    }
    /**
     * Random time module.
     *
     * Picks a random timestamp between 'time_start' and 'time_end' (each defaults to NOW_TIME
     * when empty). Returns the raw timestamp when 'time_stamp' is set, otherwise a date()
     * string built from 'time_format' (default 'Y-m-d H:i').
     */
    public function field_module_time($field_params){
        $val='';
        $start=empty($field_params['time_start'])?NOW_TIME:strtotime($field_params['time_start']);
        $end=empty($field_params['time_end'])?NOW_TIME:strtotime($field_params['time_end']);
        $time=rand($start, $end);
        if(empty($field_params['time_stamp'])){
            // The bracketed placeholders in 'time_format' are mapped to date() format characters.
            $fmt=empty($field_params['time_format'])?'Y-m-d H:i':
                str_replace(array('[年]','[月]','[日]','[时]','[分]','[秒]'), array('Y','m','d','H','i','s'), $field_params['time_format']);
            $val=date($fmt,$time);
        }else{
            $val=$time;
        }
        return $val;
    }
    /** Random list module: returns one random non-empty line of $field_params['list'], or '' when there is none. */
    public function field_module_list($field_params){
        $val='';
        if(preg_match_all('/[^\r\n]+/', $field_params['list'],$str_list)){
            $str_list=$str_list[0];
            $randi=array_rand($str_list,1);
            $val=$str_list[$randi];
        }
        return $val;
    }
    /**
     * Merge module: starts from $field_params['merge'], in which bracketed placeholders
     * (the two characters U+5B57 U+6BB5, a colon and a name) are located by the regex below.
     *
     * NOTE(review): the body is truncated in this copy of the file — the "for" statement below
     * runs straight into what looks like the tail of a separate JSON module method. The missing
     * text must be restored from the upstream source; it is left untouched here.
     */
    public function field_module_merge($field_params,$val_list){
        $val='';
        if(preg_match_all('/\[\x{5b57}\x{6bb5}\:(.+?)\]/u', $field_params['merge'],$match_fields)){
            $val=$field_params['merge'];
            for($i=0;$irule_module_json_data($field_params,$jsonList[$jsonKey]);
        return $val;
    }
    /**
     * Walk a decoded JSON structure along the path in $field_params['json'].
     *
     * The path is normalised to dot-separated keys (quotes, spaces and "[" removed, "]"
     * replaced by "."). A "*" key keeps the current array and applies the remaining path to
     * every element through a recursive call.
     *
     * @param array $field_params keys read: 'json', 'json_arr', 'json_arr_implode'
     * @param mixed $jsonArr decoded JSON data
     * @return mixed the selected value; arrays are converted according to 'json_arr'
     */
    public function rule_module_json_data($field_params,$jsonArr){
        $val='';
        if(!empty($jsonArr)){
            if(!empty($field_params['json'])){
                $jsonFmt=str_replace(array('"',"'",'[',' '), '', $field_params['json']);
                $jsonFmt=str_replace(']','.',$jsonFmt);
                $jsonFmt=trim($jsonFmt,'.');
                $jsonFmt=explode('.', $jsonFmt);
                $jsonFmt=array_values($jsonFmt);
                if(!empty($jsonFmt)){
                    $val=$jsonArr;
                    $prevKey='';
                    foreach ($jsonFmt as $i=>$key){
                        if($prevKey=='*'){
                            // The previous key was a wildcard: apply the rest of the path to each element.
                            $new_field_params=$field_params;
                            $new_field_params['json']=array_slice($jsonFmt, $i);
                            $new_field_params['json']=implode('.', $new_field_params['json']);
                            foreach ($val as $vk=>$vv){
$val[$vk]=$this->rule_module_json_data($new_field_params,$vv);
                            }
                            break;
                        }else{
                            if($key!='*'){
                                $val=$val[$key];
                            }
                        }
                        $prevKey=$key;
                    }
                }
            }
        }
        if(is_array($val)){
            // Convert an array result according to 'json_arr' (default: implode).
            $json_arr=strtolower($field_params['json_arr']);
            if(empty($json_arr)){
                $json_arr='implode';
            }
            switch ($json_arr){
                case 'implode':$arrImplode=str_replace(array('\r','\n'), array("\r","\n"), $field_params['json_arr_implode']);$val=array_implode($arrImplode,$val);break;
                case 'jsonencode':$val=json_encode($val);break;
                case 'serialize':$val=serialize($val);break;
                case '_original_': break;
            }
        }
        return $val;
    }
    /**
     * Extract a value from another field's already collected content.
     *
     * @param array $field_params keys read: 'extract_module' plus the settings of the chosen module
     * @param array $extract_field_val collected field, with keys 'value' and (optionally) 'img'
     * @param string $base_url base URL passed to create_complete_url() for the cover image
     * @param string $domain_url domain URL passed to create_complete_url() for the cover image
     * @return mixed the extracted value, or '' when the source field is empty or nothing matches
     */
    public function field_module_extract($field_params,$extract_field_val,$base_url,$domain_url){
        $field_html=$extract_field_val['value'];
        if(empty($field_html)){
            return '';
        }
        $val='';
        $extract_module=strtolower($field_params['extract_module']);
        switch ($extract_module){
            case 'cover':
                if(!empty($extract_field_val['img'])){
                    // Prefer the first image already collected for the field.
                    $val=reset($extract_field_val['img']);
                }else{
                    // $cover['url'] is read below, so the capture must be the named group "url";
                    // without the name the pattern does not compile. The "<img" prefix assumes
                    // the rule targets image tags — NOTE(review): confirm against upstream.
                    if(preg_match('/<img[^<>]*\bsrc=[\'\"](?P<url>[^\'\"]+?)[\'\"]/i',$field_html,$cover)){
                        $cover=$cover['url'];
                        $cover=$this->create_complete_url($cover, $base_url, $domain_url);
                        $val=$cover;
                    }
                }
                break;
            case 'phone':
                // First run of 11 digits in the text content.
                $field_html=$this->filter_html_tags($field_html,'style,script,object');
                $field_html=strip_tags($field_html);
                if(preg_match('/\d{11}/', $field_html,$phone)){
                    $val=$phone[0];
                }
                break;
            case 'email':
                $field_html=$this->filter_html_tags($field_html,'style,script,object');
                $field_html=strip_tags($field_html);
                if(preg_match('/[\w\-]+\@[\w\-\.]+/i', $field_html,$email)){
                    $val=$email[0];
                }
                break;
            case 'rule':
                $val=$this->field_module_rule(array('reg_rule'=>$field_params['reg_extract_rule']), $field_html);
                if(empty($val)){
                    // Fall back to the whole regex match. A separate matches variable is used so
                    // a failed match leaves '' instead of the empty array preg_match() writes.
                    $val='';
                    if(preg_match('/'.$field_params['reg_extract_rule'].'/i', $field_html,$rule_match)){
                        $val=$rule_match[0];
                    }
                }
                break;
            case 'xpath':
                $val=$this->field_module_xpath(array('xpath'=>$field_params['extract_xpath'],'xpath_attr'=>$field_params['extract_xpath_attr'],'xpath_attr_custom'=>$field_params['extract_xpath_attr_custom']), $field_html);
                break;
            case
'json':
                $val=$this->field_module_json(array('json'=>$field_params['extract_json'],'json_arr'=>$field_params['extract_json_arr'],'json_arr_implode'=>$field_params['extract_json_arr_implode']), $field_html);
                break;
        }
        return $val;
    }
    /**
     * Apply a list of processing steps to a field value, in order.
     *
     * Each step is an array whose 'module' key selects the operation: 'html', 'replace',
     * 'filter', 'tool', 'translate', 'batch', 'substr' or 'func'.
     *
     * @param mixed $fieldVal value to process; returned unchanged when it or $process is empty
     * @param array $process list of step definitions
     * @return mixed the processed value
     */
    public function processField($fieldVal,$process){
        if(empty($fieldVal)||empty($process)){
            return $fieldVal;
        }
        foreach ($process as $params){
            if('html'==$params['module']){
                // 'html_allow' / 'html_filter' are comma separated tag names.
                $htmlAllow=array_filter(explode(',',$params['html_allow']));
                $htmlFilter=array_filter(explode(',',$params['html_filter']));
                if(!empty($htmlAllow)){
                    // Keep only the allowed tags.
                    $htmlAllowStr='';
                    foreach ($htmlAllow as $v){
                        $htmlAllowStr.='<'.$v.'>';
                    }
                    $fieldVal=strip_tags($fieldVal,$htmlAllowStr);
                }
                if(!empty($htmlFilter)){
                    if(in_array('all', $htmlFilter)){
                        // 'all': drop style/script/object blocks, then strip every tag.
                        $fieldVal=$this->filter_html_tags($fieldVal, array('style','script','object'));
                        $fieldVal=strip_tags($fieldVal);
                    }else{
                        $fieldVal=$this->filter_html_tags($fieldVal, $htmlFilter);
                    }
                }
            }elseif('replace'==$params['module']){
                // 'replace_from' is used as the body of a case-insensitive regex.
                $fieldVal=preg_replace('/'.$params['replace_from'].'/i',$params['replace_to'], $fieldVal);
            }elseif('filter'==$params['module']){
                if(!empty($params['filter_list'])){
                    // One keyword per CRLF-separated line.
                    $filterList=explode("\r\n", $params['filter_list']);
                    $filterList=array_filter($filterList);
                    if(!empty($params['filter_pass'])){
                        // Blank the whole value as soon as any keyword occurs.
                        foreach ($filterList as $filterStr){
                            if(stripos($fieldVal,$filterStr)!==false){
                                $fieldVal='';
                                break;
                            }
                        }
                    }else{
                        $fieldVal=str_ireplace($filterList, $params['filter_replace'], $fieldVal);
                    }
                }
            }elseif('tool'==$params['module']){
                if(in_array('format', $params['tool_list'])){
                    // Remove style/script tags and quoted style/width/height/align attributes.
                    $fieldVal=$this->filter_html_tags($fieldVal,array('style','script'));
                    $fieldVal=preg_replace('/\b(style|width|height|align)\s*=\s*([\'\"])[^\<\>\'\"]+?\\2(?=\s|$|\/|>)/i', ' ', $fieldVal);
                }
                if(in_array('trim', $params['tool_list'])){
                    $fieldVal=trim($fieldVal);
                }
                if(in_array('is_img', $params['tool_list'])){
                    if(!empty($GLOBALS['config']['caiji']['download_img'])){
                        // Wrap every http(s) URL in {[img]}…{[/img]} markers.
                        $fieldVal=preg_replace('/(\bhttp[s]{0,1}\:\/\/[^\s]+)/i','{[img]}'."$1".'{[/img]}',$fieldVal);
                    }
                }
}elseif('translate'==$params['module']){
                // Translate only when the translator is configured and switched on.
                if(!empty($GLOBALS['config']['translate'])&&!empty($GLOBALS['config']['translate']['open'])){
                    $fieldVal=\util\Translator::translate($fieldVal, $params['translate_from'], $params['translate_to']);
                }
            }elseif('batch'==$params['module']){
                // Parsed "from=to" lists are memoised per list text for the lifetime of the request.
                static $batch_list=array();
                // Reset per step: these locals previously leaked from an earlier 'batch' step (or
                // were undefined) whenever the current list contained no "from=to" line.
                $batch_re=null;
                $batch_to=null;
                if(!empty($params['batch_list'])){
                    $listMd5=md5($params['batch_list']);
                    if(!isset($batch_list[$listMd5])){
                        if(preg_match_all('/([^\r\n]+?)\=([^\r\n]+)/', $params['batch_list'],$mlist)){
                            $batch_re=$mlist[1];
                            $batch_to=$mlist[2];
                            $batch_list[$listMd5]=array($batch_re,$batch_to);
                        }
                    }else{
                        $batch_re=$batch_list[$listMd5][0];
                        $batch_to=$batch_list[$listMd5][1];
                    }
                    $batch_re=is_array($batch_re)?$batch_re:null;
                    $batch_to=is_array($batch_to)?$batch_to:null;
                    if(!empty($batch_re)&&count($batch_re)==count($batch_to)){
                        $fieldVal=str_replace($batch_re, $batch_to, $fieldVal);
                    }
                }
            }elseif('substr'==$params['module']){
                // Cut to 'substr_len' characters (UTF-8 aware) and append 'substr_end' when cut.
                $params['substr_len']=intval($params['substr_len']);
                if($params['substr_len']>0){
                    if(mb_strlen($fieldVal,'utf-8')>$params['substr_len']){
                        $fieldVal=mb_substr($fieldVal,0,$params['substr_len'],'utf-8').$params['substr_end'];
                    }
                }
            }elseif('func'==$params['module']){
                if(!empty($params['func_name'])&&function_exists($params['func_name'])){
                    // Only functions listed in one of the two configured allow-lists may be called.
                    if(array_key_exists($params['func_name'], config('allow_process_func'))||array_key_exists($params['func_name'], config('EXTEND_PROCESS_FUNC'))){
                        // Parsed parameter lists are memoised per parameter text.
                        static $func_param_list=array();
                        $funcParam=null;
                        if(empty($params['func_param'])){
                            // No parameter template: the field value is the only argument.
                            $funcParam=array($fieldVal);
                        }else{
                            $fparamMd5=md5($params['func_param']);
                            if(!isset($func_param_list[$fparamMd5])){
                                // One argument per non-empty line.
                                if(preg_match_all('/[^\r\n]+/', $params['func_param'],$mfuncParam)){
                                    $func_param_list[$fparamMd5]=$mfuncParam[0];
                                }
                            }
                            // Nothing is cached when the template held only line breaks; use an empty
                            // list (the call is then skipped) instead of reading an unset index.
                            $funcParam=isset($func_param_list[$fparamMd5])?$func_param_list[$fparamMd5]:array();
                            foreach ($funcParam as $k=>$v){
                                // '###' in an argument stands for the current field value.
                                $funcParam[$k]=str_replace('###', $fieldVal, $v);
                            }
                        }
                        if(!empty($funcParam)&&is_array($funcParam)){
                            try {
                                $fieldVal=call_user_func_array($params['func_name'], $funcParam);
                            }catch (\Exception $ex){
                                // Best effort: a failing user function leaves the value unchanged.
                            }
                        }
                    }
                }
            }
        }
return $fieldVal;
    }
    /**
     * Normalise a list of processing steps; used when the config is saved.
     *
     * Lower-cases 'module', removes tags and quotes from 'title', turns the tag lists of
     * 'html' steps into comma separated strings, and de-duplicates the lines of 'filter'
     * steps. The list is re-indexed before it is returned.
     *
     * @param mixed $processList list of step definitions (may be empty)
     * @return mixed the normalised list, or the input unchanged when empty
     */
    public function setProcess($processList){
        if(!empty($processList)){
            foreach ($processList as $k=>$v){
                $v['module']=strtolower($v['module']);
                if(!empty($v['title'])){
                    $v['title']=str_replace(array("'",'"'),'',strip_tags($v['title']));
                }
                if('html'==$v['module']){
                    $v['html_allow']=$this->clear_tags($v['html_allow']);
                    $v['html_allow']=implode(',', $v['html_allow']);
                    $v['html_filter']=$this->clear_tags($v['html_filter']);
                    $v['html_filter']=implode(',', $v['html_filter']);
                }elseif('filter'==$v['module']){
                    if(preg_match_all('/[^\r\n]+/', $v['filter_list'],$filterList)){
                        $filterList=array_filter(array_unique($filterList[0]));
                        $v['filter_list']=implode("\r\n",$filterList);
                    }
                    $v['filter_list']=trim($v['filter_list']);
                }
                $processList[$k]=$v;
            }
            $processList=array_values($processList);
        }
        return $processList;
    }
    /**
     * Prepare processing steps for execution; used when the config is initialised.
     *
     * For 'replace' steps, any run of backslashes in front of ' or / in 'replace_from' is
     * reduced to exactly one backslash, and the wildcard "(*)" becomes "[\s\S]*?".
     *
     * @param mixed $processList list of step definitions (may be empty)
     * @return mixed the prepared list
     */
    public function initProcess($processList){
        if(!empty($processList)){
            foreach ($processList as $k=>$v){
                if('replace'==$v['module']){
                    $v['replace_from']=preg_replace('/\\\*([\'\/])/', "\\\\$1", $v['replace_from']);
                    $v['replace_from']=str_replace('(*)', '[\s\S]*?', $v['replace_from']);
                }
                $processList[$k]=$v;
            }
        }
        return $processList;
    }
    /**
     * Collect the URLs found on a page for one level of a multi-level crawl.
     *
     * @param mixed $source_url page to analyse
     * @param int $curLevel 1-based level; a value of 0 or less collects content URLs instead
     * @return array 'urls' (collected URLs), 'levelName' (name of the level, null when there is
     *               none) and 'nextLevel' (number of the following level, 0 when there is none)
     */
    public function get_level_urls($source_url,$curLevel=1){
        $curLevel=$curLevel>0?$curLevel:0;
        // Initialised for both branches: it used to be undefined when $curLevel was 0.
        $nextLevel=0;
        if($curLevel>0){
            if(!empty($this->config['level_urls'])){
                if(!empty($this->config['level_urls'][$curLevel-1])){
                    // Entries are 0-based, so index $curLevel is the level after the current one.
                    if(!empty($this->config['level_urls'][$curLevel])){
                        $nextLevel=$curLevel+1;
                    }
                }
            }
            $cont_urls=$this->getLevelUrls($source_url,$curLevel);
        }else{
            $cont_urls=$this->getContUrls($source_url);
        }
        // Level 0 has no config entry (index -1); report null rather than read a missing index.
        $levelName=isset($this->config['level_urls'][$curLevel-1]['name'])?$this->config['level_urls'][$curLevel-1]['name']:null;
        return array('urls'=>$cont_urls,'levelName'=>$levelName,'nextLevel'=>$nextLevel);
    }
    /**
     * Report how many of the given content URLs are new and return the unused ones.
     *
     * A URL counts as used when its md5 is a key of $this->used_cont_urls or when it is
     * returned by the Collected model (skipped if 'url_repeat' is enabled).
     *
     * @param array $cont_urls content URLs to check
     * @param string $echo_str prefix for the progress message
     * @return array unused URLs keyed by md5(url)
     */
    public function _collect_unused_cont_urls($cont_urls=array(),$echo_str=''){
        $mcollected=model('Collected');
        $count_conts=count($cont_urls);
        if($this->config['url_repeat']){
            // Repeats are allowed: skip the database lookup.
            $db_cont_urls=array();
        }else{
$db_cont_urls=$mcollected->getUrlByUrl($cont_urls);
        }
        $unused_cont_urls=array();
        $count_used=0;
        if(!empty($cont_urls)){
            foreach ($cont_urls as $cont_url){
                if(array_key_exists(md5($cont_url), $this->used_cont_urls)){
                    // Already handled earlier (tracked in $this->used_cont_urls).
                    $count_used++;
                }elseif(in_array($cont_url, $db_cont_urls)){
                    // Already present in the result of getUrlByUrl().
                    $count_used++;
                }else{
                    $unused_cont_urls[md5($cont_url)]=$cont_url;
                }
            }
        }
        if($count_used>0){
            $count_used=min(count($cont_urls),$count_used);
            $this->echo_msg($echo_str.'采集到'.$count_conts.'条网址,'.$count_used.'条重复,'.(count($unused_cont_urls)).'条有效','black');
        }else{
            $this->echo_msg($echo_str.'采集到'.$count_conts.'条有效网址','black');
        }
        return $unused_cont_urls;
    }
    /**
     * Crawl one level of a multi-level source and recurse into the next level, or collect
     * content pages when there is no further level.
     *
     * @param mixed $source_url page to analyse at this level
     * @param int $level 1-based level number (values below 1 are raised to 1)
     * @return mixed 'completed' when collection must stop; no explicit value is visible otherwise
     */
    public function _collect_level($source_url,$level=1){
        $end_echo='';
        $level=max(1,$level);
        $level_str='';
        // NOTE(review): this loop has an empty body and the strings around it are empty or a bare
        // line break; presumably markup used for indenting the output was lost — confirm upstream.
        for($i=1;$i<$level;$i++){
        }
        $next_level_str=$level_str;
        if($level<=1){
            $this->cur_level_urls=array();
        }
        $this->echo_msg('','',true,'
');
        $level_data=$this->get_level_urls($source_url,$level);
        $this->echo_msg($level_str.'抓取到'.$level.'级“'.$this->config['level_urls'][$level-1]['name'].'”网址'.count($level_data['urls']).'条','black');
        $mcollected=model('Collected');
        $mcacheLevel=CacheModel::getInstance('level_url');
        if(!empty($level_data['urls'])){
            // Key each URL as "level_<n>:<url>"; the md5 of that key is the cache name.
            $level_urls=array();
            foreach ($level_data['urls'] as $level_url){
                $level_urls["level_{$level}:{$level_url}"]=$level_url;
            }
            // The configured interval is multiplied by 60 and compared with differences of
            // NOW_TIME and 'dateline' — presumably minutes converted to seconds; confirm.
            $level_interval=$GLOBALS['config']['caiji']['interval']*60;
            $time_interval_list=array();
            $cacheLevels=$mcacheLevel->db()->where(array('cname'=>array('in',array_map('md5', array_keys($level_urls)))))->column('dateline','cname');
            if(!empty($cacheLevels)){
                $count_db_used=0;
                // 'undb': not in the cache; 'db': cached but older than the interval.
                $sortLevels=array('undb'=>array(),'db'=>array());
                foreach ($level_urls as $level_key=>$level_url){
                    $md5_level_key=md5($level_key);
                    if(!isset($cacheLevels[$md5_level_key])){
                        $sortLevels['undb'][$level_key]=$level_url;
                    }else{
                        $time_interval=abs(NOW_TIME-$cacheLevels[$md5_level_key]);
                        if($time_interval<$level_interval){
                            // Cached within the interval: mark as used and skip.
                            $this->used_level_urls[$level_key]=1;
                            $count_db_used++;
                            $time_interval_list[]=$time_interval;
                        }else{
                            $sortLevels['db'][$level_key]=$level_url;
                        }
                    }
                }
                if($count_db_used>0){
                    $this->echo_msg($level_str.$count_db_used.'条已采集网址被过滤,下次采集需等待'.($level_interval-max($time_interval_list)).'秒,设置间隔','black');
                    if(count($level_urls)<=$count_db_used){
                        // Every URL of this level was filtered out.
                        $this->echo_msg($level_str.$level.'级“'.$this->config['level_urls'][$level-1]['name'].'”网址采集完毕!','green',true,$end_echo);
                        return 'completed';
                    }
                }
                // URLs not in the cache are processed before those whose cache entry expired.
                $level_urls=array_merge($sortLevels['undb'],$sortLevels['db']);
                unset($sortLevels);
                unset($cacheLevels);
            }
            $level_data['urls']=$level_urls;
        }
        $finished_source=true;
        $cur_level_i=0;;
        if(!empty($level_data['urls'])){
            foreach ($level_data['urls'] as $level_key=>$level_url){
                $cur_level_i++;
                if(array_key_exists($level_key,$this->used_level_urls)){
                    continue;
                }
                // Remember the URL currently being processed for this level name.
                $this->cur_level_urls[$this->config['level_urls'][$level-1]['name']]=$level_url;
$this->echo_msg("{$next_level_str}分析第{$level}级:{$level_url}",'black');
                if($level_data['nextLevel']>0){
                    // Another level follows: recurse, and propagate a 'completed' signal.
                    $return_msg=$this->_collect_level($level_url,$level_data['nextLevel']);
                    if($return_msg=='completed'){
                        $this->echo_msg('','',true,$end_echo);
                        return $return_msg;
                    }
                }else{
                    // Last level: collect content URLs from this page and then their fields.
                    $cont_urls=$this->getContUrls($level_url);
                    $cont_urls=$this->_collect_unused_cont_urls($cont_urls,$next_level_str);
                    $this->cont_urls_list[$level_key]=$cont_urls;
                    $this->_collect_fields($next_level_str);
                }
                if($this->collect_num>0){
                    if(count($this->collected_field_list)>=$this->collect_num){
                        // NOTE(review): the next statement is truncated in this copy of the file (a
                        // comparison on $cur_level_i runs into an assignment to used_level_urls, and
                        // $source_key is never assigned in the visible code). The missing text must be
                        // restored from the upstream source; it is left untouched here.
                        if($cur_level_iused_level_urls[$source_key]=1;
            $mcacheLevel->setCache(md5($source_key),$source_key);
            if($level<=1){
                // At the first level the source URL itself is also marked as used and cached.
                $mcacheSource=CacheModel::getInstance('source_url');
                $this->used_source_urls[$source_url]=1;
                $mcacheSource->setCache(md5($source_url),$source_url);
            }
        }
        $this->echo_msg('','',true,$end_echo);
    }
    /**
     * Collect the fields of every pending content URL in $this->cont_urls_list.
     *
     * Results are appended to $this->collected_field_list; handled URLs are recorded in
     * $this->used_cont_urls.
     *
     * @param string $echo_str prefix for progress messages
     */
    public function _collect_fields($echo_str=''){
        $mcollected=model('Collected');
        $mcacheSource=CacheModel::getInstance('source_url');
        $mcacheLevel=CacheModel::getInstance('level_url');
        $mcacheCont=CacheModel::getInstance('cont_url');
        foreach ($this->cont_urls_list as $cont_key=>$cont_urls){
            // 0: source URLs used directly as content URLs; 2: key starts with 'level_'; 1: any other key.
            $source_type=0;
            if('_source_is_url_'==$cont_key){
                $source_type=0;
            }elseif(strpos($cont_key,'level_')===0){
                $source_type=2;
            }else{
                $source_type=1;
            }
            // Skip groups whose key is already marked as used.
            if($source_type==2){
                if(array_key_exists($cont_key,$this->used_level_urls)){
                    continue;
                }
            }else{
                if(array_key_exists($cont_key,$this->used_source_urls)){
                    continue;
                }
            }
            $finished_cont=true;
            $cur_c_i=0;
            foreach ($cont_urls as $cont_url){
                $cur_c_i+=1;
                $md5_cont_url=md5($cont_url);
                if(array_key_exists($md5_cont_url,$this->used_cont_urls)){
                    continue;
                }
                if($this->config['url_repeat']||$mcollected->getCountByUrl($cont_url)<=0){
                    if(!empty($this->collected_field_list)){
                        // After set_html_interval() returns true, check the URL count again
                        // before collecting it.
                        if($this->set_html_interval()===true){
                            if(!$this->config['url_repeat']&&$mcollected->getCountByUrl($cont_url)>0){
                                $this->used_cont_urls[$md5_cont_url]=1;
                                continue;
                            }
                        }
                    }
if($mcacheCont->getCount($md5_cont_url)>0){
                        // The content-URL cache already has this URL: mark it used and skip it.
                        $this->used_cont_urls[$md5_cont_url]=1;
                        continue;
                    }
                    // Record the URL in the cache before collecting it.
                    $mcacheCont->setCache($md5_cont_url, 1);
                    $this->echo_msg($echo_str."采集内容页:{$cont_url}",'black');
                    $field_vals_list=$this->getFields($cont_url);
                    // getFields() sets first_loop_field when the page produced loop records.
                    $is_loop=empty($this->first_loop_field)?false:true;
                    if(!empty($field_vals_list)){
                        $is_real_time=false;
                        if(!empty($GLOBALS['config']['caiji']['real_time'])&&!empty($GLOBALS['real_time_release'])){
                            $is_real_time=true;
                        }
                        if(!$is_loop){
                            // A single record: wrap it so both modes are handled as lists below.
                            $field_vals_list=array($field_vals_list);
                        }else{
                            // Each loop record is identified by "<url>#<md5 of the serialized record>".
                            $loop_cont_urls=array();
                            foreach ($field_vals_list as $k=>$field_vals){
                                $loop_cont_urls[$k]=$cont_url.'#'.md5(serialize($field_vals));
                            }
                            if(!empty($loop_cont_urls)){
                                $loop_exists_urls=$mcollected->getUrlByUrl($loop_cont_urls);
                                if(!empty($loop_exists_urls)){
                                    // Drop loop records whose identifier was returned by getUrlByUrl().
                                    $loop_exists_urls=array_flip($loop_exists_urls);
                                    foreach ($loop_cont_urls as $k=>$loop_cont_url){
                                        if(isset($loop_exists_urls[$loop_cont_url])){
                                            unset($field_vals_list[$k]);
                                        }
                                    }
                                    $field_vals_list=array_values($field_vals_list);
                                    $this->echo_msg($echo_str.'已过滤'.count($loop_exists_urls).'条重复数据','black');
                                }
                            }
                        }
                        foreach ($field_vals_list as $field_vals){
                            $collected_data=array('url'=>$cont_url,'fields'=>$field_vals);
                            if($is_loop){
                                $collected_data['url'].='#'.md5(serialize($field_vals));
                            }
                            $collected_error='';
                            if(!empty($this->config['field_title'])){
                                $collected_data['title']=$field_vals[$this->config['field_title']]['value'];
                            }
                            if(!empty($collected_data['title'])){
                                // Reject records whose title is already counted by the Collected model.
                                if($mcollected->getCountByTitle($collected_data['title'])>0){
                                    $collected_error='标题重复:'.mb_substr($collected_data['title'],0,300,'utf-8');
                                }
                            }
                            if(empty($collected_error)){
                                if($is_real_time){
                                    // Real-time mode: export now and keep only the URL in the result list.
                                    $GLOBALS['real_time_release']->export(array($collected_data));
                                    unset($collected_data['fields']);
                                    unset($collected_data['title']);
                                }
                                $this->collected_field_list[]=$collected_data;
                            }else{
                                // Record the failure for this URL.
                                controller('ReleaseBase','event')->record_collected($collected_data['url'],
array('id'=>0,'error'=>$collected_error),array('task_id'=>$this->collector['task_id'],'module'=>$this->release['module'])
                                );
                            }
                        }
                    }
                    if($is_loop){
                        // For loop pages the page URL itself is recorded as well.
                        controller('ReleaseBase','event')->record_collected(
                            $cont_url,array('id'=>1,'target'=>'','desc'=>'循环入库'),array('task_id'=>$this->collector['task_id'],'module'=>$this->release['module']),null,false
                        );
                    }
                }
                $this->used_cont_urls[$md5_cont_url]=1;
                if($this->collect_num>0){
                    if(count($this->collected_field_list)>=$this->collect_num){
                        // NOTE(review): the next statement is truncated in this copy of the file (a
                        // comparison on $cur_c_i runs into a setCache() call). The missing text must be
                        // restored from the upstream source; it is left untouched here.
                        if($cur_c_isetCache(md5($cont_key),$cont_key);
                }elseif($source_type==2){
                    $mcacheLevel->setCache(md5($cont_key),$cont_key);
                }
                if($source_type==2){
                    $this->used_level_urls[$cont_key]=1;
                }else{
                    $this->used_source_urls[$cont_key]=1;
                }
            }
            if($this->collect_num>0&&count($this->collected_field_list)>=$this->collect_num){
                break;
            }
        }
    }
    /**
     * Run one collection pass.
     *
     * @param int $num number of records to collect; values of 0 or less disable the limit checks
     * @return mixed 'completed' when there is nothing (more) to collect, otherwise
     *               $this->collected_field_list
     */
    public function collect($num=10){
        // collect() keeps per-instance state (show_opened_tools, original_source_urls) so that it
        // can be called repeatedly; define the flag only once to avoid a redefinition error.
        if(!defined('IS_COLLECTING')){
            define('IS_COLLECTING', 1);
        }
        @session_start();
        \think\Session::pause();
        if(!$this->show_opened_tools){
            // Announce the enabled features once per instance.
            $opened_tools=array();
            if($this->config['page_render']){
                $opened_tools[]='页面渲染';
            }
            if($GLOBALS['config']['caiji']['download_img']){
                $opened_tools[]='图片本地化';
            }
            if($GLOBALS['config']['proxy']['open']){
                $opened_tools[]='代理';
            }
            if(!empty($opened_tools)){
                $this->echo_msg('开启功能:'.implode('、', $opened_tools),'black');
            }
            if($num>0){
                $this->echo_msg('预计采集'.$num.'条数据','black');
            }
            $this->show_opened_tools=true;
        }
        $this->collect_num=$num;
        $this->collected_field_list=array();
        $source_is_url=intval($this->config['source_is_url']);
        if(!isset($this->original_source_urls)){
            // Expand the configured source URLs once; they are keyed by md5(url).
            $this->original_source_urls=array();
            foreach ( $this->config ['source_url'] as $k => $v ) {
                if(empty($v)){
                    continue;
                }
                $return_s_urls = $this->convert_source_url ( $v );
                if (is_array ( $return_s_urls )) {
                    foreach ($return_s_urls as $r_s_u){
                        $this->original_source_urls[md5($r_s_u)]=$r_s_u;
                    }
                } else {
                    $this->original_source_urls[md5($return_s_urls)]=$return_s_urls;
                }
            }
        }
        if(empty($this->original_source_urls)){
            $this->echo_msg('没有起始页网址!');
            return 'completed';
        }
if($source_is_url){
            // Source URLs are used directly as content URLs and tracked under one pseudo key.
            if(isset($this->used_source_urls['_source_is_url_'])){
                $this->echo_msg('所有起始页采集完毕!','green');
                return 'completed';
            }
        }else{
            if(count($this->original_source_urls)<=count($this->used_source_urls)){
                $this->echo_msg('所有起始页采集完毕!','green');
                return 'completed';
            }
        }
        // The configured interval is multiplied by 60 and compared with differences of
        // NOW_TIME and 'dateline' — presumably minutes converted to seconds; confirm.
        $source_interval=$GLOBALS['config']['caiji']['interval']*60;
        $time_interval_list=array();
        $source_urls=array();
        $mcacheSource=CacheModel::getInstance('source_url');
        if($source_is_url){
            $source_urls=$this->original_source_urls;
        }else{
            // Look up the cache entries of all source URLs (cache names are the md5 keys).
            $cacheSources=$mcacheSource->db()->where(array('cname'=>array('in',array_keys($this->original_source_urls))))->column('dateline','cname');
            if(!empty($cacheSources)){
                $count_db_used=0;
                // 'undb': not in the cache; 'db': cached but older than the interval.
                $sortSources=array('undb'=>array(),'db'=>array());
                foreach ($this->original_source_urls as $sKey=>$sVal){
                    if(!isset($cacheSources[$sKey])){
                        $sortSources['undb'][$sKey]=$sVal;
                    }else{
                        $time_interval=abs(NOW_TIME-$cacheSources[$sKey]);
                        if($time_interval<$source_interval){
                            // Cached within the interval: mark as used and skip.
                            $this->used_source_urls[$sVal]=1;
                            $count_db_used++;
                            $time_interval_list[]=$time_interval;
                        }else{
                            $sortSources['db'][$sKey]=$sVal;
                        }
                    }
                }
                if($count_db_used>0){
                    $this->echo_msg($count_db_used.'条已采集起始网址被过滤,下次采集需等待'.($source_interval-max($time_interval_list)).'秒,设置间隔','black');
                    if(count($this->original_source_urls)<=count($this->used_source_urls)){
                        $this->echo_msg('所有起始页采集完毕!','green');
                        return 'completed';
                    }
                }
                // URLs not in the cache are processed before those whose cache entry expired.
                $source_urls=array_merge($sortSources['undb'],$sortSources['db']);
                unset($sortSources);
                unset($cacheSources);
            }else{
                $source_urls=$this->original_source_urls;
            }
        }
        $mcollected=model('Collected');
        if($source_is_url){
            // Queue all source URLs as one content-URL group under the pseudo key.
            $this->cont_urls_list['_source_is_url_']=array_values($source_urls);
            $source_urls=array('_source_is_url_'=>'_source_is_url_');
        }
        foreach ($source_urls as $key_source_url=>$source_url){
            $this->cur_source_url=$source_url;
            if(array_key_exists($source_url,$this->used_source_urls)){
                continue;
            }
            if($source_is_url){
                $this->echo_msg("起始页已转换为内容页网址",'black');
            }else{
                $this->echo_msg("采集起始页:{$source_url}",'green');
            }
if($source_is_url){
                $this->_collect_fields();
            }else{
                if(!empty($this->config['level_urls'])){
                    // Multi-level source: walk the levels starting at level 1.
                    $this->echo_msg('开始分析多级网址','black');
                    $return_msg=$this->_collect_level($source_url,1);
                    if($return_msg=='completed'){
                        return $return_msg;
                    }
                }else{
                    $cont_urls=$this->getContUrls($source_url);
                    $this->cont_urls_list[$source_url]=$this->_collect_unused_cont_urls($cont_urls);
                    $this->_collect_fields();
                }
            }
            // Stop as soon as the requested number of records has been reached.
            if($this->collect_num>0&&count($this->collected_field_list)>=$this->collect_num){
                break;
            }
        }
        return $this->collected_field_list;
    }
    /**
     * Build a default merge template when none is given.
     *
     * Every named group "<matchN>" found in the rule contributes one placeholder
     * cp_sign('match', N), in order of appearance.
     *
     * @param mixed $reg regex rule
     * @param mixed $merge merge template; returned unchanged when not empty
     * @return string the merge template
     */
    public function set_merge_default($reg,$merge){
        if(empty($merge)){
            $merge='';
            if(!empty($reg)){
                // The previous pattern ('/\\d*)\>/i') had an unbalanced ")" and no "num" group, so
                // it could not compile although $match_signs['num'] is read below.
                // NOTE(review): the "<match" prefix is reconstructed from the 'match'.N group names
                // used by the rule modules — confirm against the upstream source.
                if(preg_match_all('/\<match(?P<num>\d*)\>/i', $reg,$match_signs)){
                    foreach ($match_signs['num'] as $snum){
                        $merge.=cp_sign('match',$snum);
                    }
                }
            }
        }
        return $merge;
    }
    /**
     * Expand a source URL definition into concrete URLs.
     *
     * Supports one "{param:TYPE,VALUE}" marker, where VALUE is tab separated:
     *  - num:    start, end, step, descending flag
     *  - letter: start character, end character, descending flag
     *  - custom: list of literal values
     * A multi-line input is split into its "scheme://" lines; anything else is returned as is.
     *
     * @param string $url source URL definition
     * @return array|string list of URLs, or the original string when no expansion applies
     */
    public function convert_source_url($url){
        $urls=array();
        // Named groups "type" and "val" are required: both are read from $match below.
        if(preg_match('/\{param\:(?P<type>[a-z]+)\,(?P<val>.*?)\}/i', $url,$match)){
            $fmtUrl=preg_replace('/\{param\:.*?\}/i', '__set:param__', $url);
            $type=strtolower($match['type']);
            $val=explode("\t", $match['val']);
            if($type=='num'){
                // Missing trailing values fall back to 0 / false instead of reading unset indexes.
                $num_start = intval($val[0]);
                $num_end = intval(isset($val[1])?$val[1]:0);
                $num_end = max ($num_start,$num_end);
                $num_inc = max ( 1, intval(isset($val[2])?$val[2]:0));
                $num_desc =!empty($val[3])?1:0;
                if($num_desc){
                    for($i=$num_end;$i>=$num_start;$i--){
                        $urls[]=str_replace('__set:param__', $num_start+($i-$num_start)*$num_inc, $fmtUrl);
                    }
                }else{
                    for($i=$num_start;$i<=$num_end;$i++){
                        $urls[]=str_replace('__set:param__', $num_start+($i-$num_start)*$num_inc, $fmtUrl);
                    }
                }
            }elseif($type=='letter'){
                // Letters are iterated by character code.
                $letter_start=ord($val[0]);
                $letter_end=ord(isset($val[1])?$val[1]:'');
                $letter_end=max($letter_start,$letter_end);
                $letter_desc=!empty($val[2])?1:0;
                if($letter_desc){
                    for($i=$letter_end;$i>=$letter_start;$i--) {
                        $urls[]=str_replace('__set:param__', chr($i), $fmtUrl);
                    }
                }else{
                    for($i=$letter_start;$i<=$letter_end;$i++) {
                        $urls[]=str_replace('__set:param__', chr($i), $fmtUrl);
}
                }
            }elseif($type=='custom'){
                // One URL per listed value.
                foreach ($val as $v){
                    $urls[]=str_replace('__set:param__', $v, $fmtUrl);
                }
            }
            return $urls;
        }elseif(preg_match('/[\r\n]/', $url)){
            // Multi-line input: keep the unique lines that start with "scheme://".
            if(preg_match_all('/^\w+\:\/\/[^\r\n]+/im',$url,$urls)){
                $urls=array_unique($urls[0]);
                $urls=array_values($urls);
            }
            return $urls;
        }else{
            return $url;
        }
    }
    /** Replace the wildcard sign (lang('sign_wildcard')) with the lazy regex "[\s\S]*?". */
    public function convert_sign_wildcard($str){
        return str_replace(lang('sign_wildcard'), '[\s\S]*?', $str);
    }
    /**
     * Convert match placeholders in a rule into named regex groups.
     *
     * The previous text of this method was not valid PHP: a string literal was cut off in the
     * middle of the first statement and the callback emitted an unnamed "(?P" group.
     * NOTE(review): the statements below are reconstructed from the surviving fragments and from
     * how the rule modules read captures ('match'.N) — confirm against the upstream source.
     *
     * @param string $str rule text
     * @return string rule text with placeholders turned into (?P<matchN>…) groups
     */
    public function convert_sign_match($str){
        // Normalise "(?<content" / "(?<match" to the "(?P<" spelling.
        $str=preg_replace('/\(\?<(content|match)/i', '(?P<\1', $str);
        $sign_match=$this->sign_addslashes(cp_sign('match','(?P<num>\d*)'));
        $str=preg_replace_callback('/(\={0,1})(\s*)([\'\"]{0,1})'.$sign_match.'\3/', function($matches){
            $ruleStr=$matches[1].$matches[2].$matches[3].'(?P<match'.$matches['num'].'>';
            if(!empty($matches[1])&&!empty($matches[3])){
                // Placeholder used as a quoted attribute value: must not cross tag boundaries.
                $ruleStr.='[^\<\>]*?)';
            }else{
                $ruleStr.='[\s\S]*?)';
            }
            $ruleStr.=$matches[3];
            return $ruleStr;
        }, $str);
        return $str;
    }
    /** Escape "[" and "]" so the string can be embedded in a regex. */
    public function sign_addslashes($str){
        $str=str_replace(array('[',']'), array('\[','\]'), $str);
        return $str;
    }
    /**
     * Remove HTML tags from $content.
     *
     * script/style/object are removed together with their content; for any other tag only the
     * opening and closing tags are removed.
     *
     * @param string $content HTML text
     * @param mixed $tags tag names: an array, or a string accepted by clear_tags()
     * @return string the filtered text
     */
    public function filter_html_tags($content,$tags){
        $tags=$this->clear_tags($tags);
        $arr1=$arr2=array();
        foreach ($tags as $tag){
            $tag=strtolower($tag);
            if($tag=='script'||$tag=='style'||$tag=='object'){
                $arr1[$tag]=$tag;
            }else{
                $arr2[$tag]=$tag;
            }
        }
        if($arr1){
            $content=preg_replace('/<('.implode('|', $arr1).')[^<>]*>[\s\S]*?<\/\1>/i', '', $content);
        }
        if($arr2){
            $content=preg_replace('/<[\/]*('.implode('|', $arr2).')[^<>]*>/i', '', $content);
        }
        return $content;
    }
    /**
     * Normalise a tag list.
     *
     * A string is split on whitespace, ASCII commas and full-width commas (U+FF0C); the result
     * is filtered for empty entries, de-duplicated and re-indexed.
     *
     * @param mixed $tags tag names as an array or a delimited string
     * @return array list of tag names (empty array when nothing remains)
     */
    public function clear_tags($tags){
        if(!is_array($tags)){
            $tags = preg_replace('/[\s\,\x{ff0c}]+/u', ',', $tags);
            $tags=explode(',', $tags);
        }
        if(!empty($tags)&&is_array($tags)){
            $tags=array_filter($tags);
            $tags=array_unique($tags);
            $tags=array_values($tags);
        }else{
            $tags=array();
        }
        return $tags;
    }
    /**
     * Download the source of a page.
     *
     * @param string $url page URL
     * @param bool $open_cache read from and write to $this->html_cache_list
     * @param mixed $is_post when truthy, the query string of $url is sent as the POST body
     * @return mixed the page source, or null on failure
     */
    public function get_html($url,$open_cache=false,$is_post=false){
        if($open_cache&&!empty($this->html_cache_list[$url])){
            return $this->html_cache_list[$url];
        }
        $pageRenderTool=null;
if($this->config['page_render']){
            // Page rendering is enabled for this collector: a render tool must be configured.
            $pageRenderTool=$GLOBALS['config']['page_render']['tool'];
            if(empty($pageRenderTool)){
                $this->error('页面渲染未设置,请检查渲染设置','Setting/page_render');
                return null;
            }
        }
        $html=null;
        $headers=array();
        $options=array();
        if($this->config['request_headers']['open']){
            // Custom request headers: user agent, cookie, referer and free-form name/value pairs.
            if(!empty($this->config['request_headers']['useragent'])){
                $options['useragent']=$this->config['request_headers']['useragent'];
            }
            if(!empty($this->config['request_headers']['cookie'])){
                $headers['cookie']=$this->config['request_headers']['cookie'];
            }
            if(!empty($this->config['request_headers']['referer'])){
                $headers['referer']=$this->config['request_headers']['referer'];
            }
            if(!empty($this->config['request_headers']['custom_names'])){
                foreach ($this->config['request_headers']['custom_names'] as $k=>$v){
                    if(!empty($v)){
                        // Names and values are parallel arrays sharing the same index.
                        $headers[$v]=$this->config['request_headers']['custom_vals'][$k];
                    }
                }
            }
        }
        $mproxy=model('Proxyip');
        $proxy_ip=null;
        if(!empty($GLOBALS['config']['proxy']['open'])){
            // $proxy_ip is the value from get_usable_ip(); $proxyIp is its to_proxy_ip() form.
            $proxy_ip=$mproxy->get_usable_ip();
            $proxyIp=$mproxy->to_proxy_ip($proxy_ip);
            if(!empty($proxyIp)){
                $options['proxy']=$proxyIp;
            }
        }
        $urlPost=null;
        if($is_post){
            // POST mode: the query string becomes the request body and is removed from $url.
            $urlPost=strpos($url, '?');
            if($urlPost!==false){
                $urlPost=substr($url, $urlPost+1);
                $url=preg_replace('/\?.*$/', '', $url);
            }else{
                $urlPost='';
            }
        }
        if($pageRenderTool){
            if(!empty($options['useragent'])){
                // For rendering the user agent is passed as a header instead of an option.
                $headers['user-agent']=$options['useragent'];
                unset($options['useragent']);
            }
            if(!empty($options['proxy'])){
                // For rendering the get_usable_ip() value is passed instead of the converted one.
                $options['proxy']=$proxy_ip;
            }
            if($pageRenderTool=='chrome'){
                $chromeConfig=$GLOBALS['config']['page_render']['chrome'];
                try {
                    $chromeSocket=new \util\ChromeSocket($chromeConfig['host'],$chromeConfig['port'],$GLOBALS['config']['page_render']['timeout'],$chromeConfig['filename']);
                    $chromeSocket->newTab();
                    $chromeSocket->websocket(null);
                    if($is_post){
                        $html=$chromeSocket->getRenderHtml($url,$headers,$options,$this->config['charset'],$urlPost);
                    }else{
                        $html=$chromeSocket->getRenderHtml($url,$headers,$options);
                    }
                }catch (\Exception $ex){
$this->error('页面渲染失败,请检查渲染设置','Setting/page_render');
                    return null;
                }
            }else{
                // 'chrome' is the only render tool handled here.
                $this->error('渲染工具不可用,请检查渲染设置','Setting/page_render');
                return null;
            }
        }else{
            // Plain download through the global get_html() helper.
            if($is_post){
                $html=get_html($url,$headers,$options,$this->config['charset'],$urlPost);
            }else{
                $html=get_html($url,$headers,$options,$this->config['charset']);
            }
        }
        if($html==null){
            // Loose comparison: an empty string also counts as a failed download.
            if(!empty($proxy_ip)){
                $mproxy->set_ip_failed($proxy_ip);
            }
            return null;
        }
        if($this->config['url_complete']){
            // Rewrite every href="…" and src="…" value through create_complete_url().
            $base_url=$this->match_base_url($url, $html);
            $domain_url=$this->match_domain_url($url, $html);
            $html=preg_replace_callback('/(?<=\bhref\=[\'\"])([^\'\"]*)(?=[\'\"])/i',function($matche) use ($base_url,$domain_url){
                return \skycaiji\admin\event\Cpattern::create_complete_url($matche[1], $base_url, $domain_url);
            },$html);
            $html=preg_replace_callback('/(?<=\bsrc\=[\'\"])([^\'\"]*)(?=[\'\"])/i',function($matche) use ($base_url,$domain_url){
                return \skycaiji\admin\event\Cpattern::create_complete_url($matche[1], $base_url, $domain_url);
            },$html);
        }
        if($open_cache){
            // In POST mode $url has already lost its query string, so that shortened URL is the cache key.
            $this->html_cache_list[$url]=$html;
        }
        return $html;
    }
}
?>