');
$level_data=$this->collLevelUrls($source_url,$level);
$this->echo_msg($level_str.'抓取到'.$level.'级“'.$this->config['level_urls'][$level-1]['name'].'”网址'.count($level_data['urls']).'条','black');
$mcollected=model('Collected');
$mcacheLevel=CacheModel::getInstance('level_url');
if(!empty($level_data['urls'])){
$level_urls=array();
foreach ($level_data['urls'] as $level_url){
$level_urls["level_{$level}:{$level_url}"]=$level_url;
}
$level_interval=$GLOBALS['config']['caiji']['interval']*60;
$time_interval_list=array();
$cacheLevels=$mcacheLevel->db()->where(array('cname'=>array('in',array_map('md5', array_keys($level_urls)))))->column('dateline','cname');
if(!empty($cacheLevels)){
$count_db_used=0;
$sortLevels=array('undb'=>array(),'db'=>array());
foreach ($level_urls as $level_key=>$level_url){
$md5_level_key=md5($level_key);
if(!isset($cacheLevels[$md5_level_key])){
$sortLevels['undb'][$level_key]=$level_url;
}else{
$time_interval=abs(NOW_TIME-$cacheLevels[$md5_level_key]);
if($time_interval<$level_interval){
$this->used_level_urls[$level_key]=1;
$count_db_used++;
$time_interval_list[]=$time_interval;
}else{
$sortLevels['db'][$level_key]=$level_url;
}
}
}
if($count_db_used>0){
$this->echo_msg($level_str.$count_db_used.'条已采集网址被过滤,下次采集需等待'.($level_interval-max($time_interval_list)).'秒,
设置间隔','black');
if(count($level_urls)<=$count_db_used){
$this->echo_msg($level_str.$level.'级“'.$this->config['level_urls'][$level-1]['name'].'”网址采集完毕!','green',true,$end_echo);
return 'completed';
}
}
$level_urls=array_merge($sortLevels['undb'],$sortLevels['db']);
unset($sortLevels);
unset($cacheLevels);
}
$level_data['urls']=$level_urls;
}
$finished_source=true;
$cur_level_i=0;;
if(!empty($level_data['urls'])){
foreach ($level_data['urls'] as $level_key=>$level_url){
$cur_level_i++;
if(array_key_exists($level_key,$this->used_level_urls)){
continue;
}
$this->cur_level_urls[$this->config['level_urls'][$level-1]['name']]=$level_url;
$this->echo_msg("{$next_level_str}分析第{$level}级:
{$level_url}",'black');
if($level_data['nextLevel']>0){
$return_msg=$this->_collect_level($level_url,$level_data['nextLevel']);
if($return_msg=='completed'){
$this->echo_msg('','',true,$end_echo);
return $return_msg;
}
}else{
$cont_urls=$this->getContUrls($level_url);
$cont_urls=$this->_collect_unused_cont_urls($cont_urls,$next_level_str);
$this->cont_urls_list[$level_key]=$cont_urls;
$this->_collect_fields($next_level_str);
}
if($this->collect_num>0){
if(count($this->collected_field_list)>=$this->collect_num){
if($cur_level_i
used_level_urls[$source_key]=1;
$mcacheLevel->setCache(md5($source_key),$source_key);
if($level<=1){
$mcacheSource=CacheModel::getInstance('source_url');
$this->used_source_urls[$source_url]=1;
$mcacheSource->setCache(md5($source_url),$source_url);
}
}
$this->echo_msg('','',true,$end_echo);
}
/**
 * Collect the field list for every pending content URL in $this->cont_urls_list.
 *
 * $this->cont_urls_list maps a group key to a list of content-page URLs. Each
 * group whose key is not yet marked as used is walked URL by URL: duplicates
 * are skipped, the page is fetched through getFields(), and every accepted
 * row is appended to $this->collected_field_list (rejected rows are recorded
 * or echoed instead). Collection stops once $this->collect_num rows have been
 * gathered (when collect_num > 0).
 *
 * NOTE(review): the line beginning `if($cur_c_i` near the end of this method
 * is truncated in this copy of the file (text between `$cur_c_i` and
 * `setCache(` is missing), so the method does not parse as shown. Restore the
 * missing statements from the upstream source before changing any logic here.
 *
 * @param string $echo_str Prefix prepended to the progress messages sent to echo_msg().
 */
public function _collect_fields($echo_str=''){
// Model for already-collected records plus the three URL caches used below.
$mcollected=model('Collected');
$mcacheSource=CacheModel::getInstance('source_url');
$mcacheLevel=CacheModel::getInstance('level_url');
$mcacheCont=CacheModel::getInstance('cont_url');
foreach ($this->cont_urls_list as $cont_key=>$cont_urls){
// Classify the group by its key: 0 = the literal '_source_is_url_' marker,
// 2 = key starts with 'level_', 1 = anything else
// (presumably a plain source URL - TODO confirm against the caller).
$source_type=0;
if('_source_is_url_'==$cont_key){
$source_type=0;
}elseif(strpos($cont_key,'level_')===0){
$source_type=2;
}else{
$source_type=1;
}
// Skip groups whose key has already been marked as used in this run.
if($source_type==2){
if(array_key_exists($cont_key,$this->used_level_urls)){
continue;
}
}else{
if(array_key_exists($cont_key,$this->used_source_urls)){
continue;
}
}
// $finished_cont is only read (if at all) in the truncated part of this method;
// $cur_c_i is the 1-based position of the current URL within the group.
$finished_cont=true;
$cur_c_i=0;
foreach ($cont_urls as $cont_url){
$cur_c_i+=1;
$md5_cont_url=md5($cont_url);
// Already handled earlier in this run.
if(array_key_exists($md5_cont_url,$this->used_cont_urls)){
continue;
}
// Only collect when repeats are allowed or no collected record exists for this URL.
if($this->config['url_repeat']||$mcollected->getCountByUrl($cont_url)<=0){
if(!empty($this->collected_field_list)){
// Something was already collected, so set_html_interval() is invoked before the
// next page; when it returns true the duplicate check is repeated
// (presumably because time has passed while waiting - TODO confirm).
if($this->set_html_interval()===true){
if(!$this->config['url_repeat']&&$mcollected->getCountByUrl($cont_url)>0){
$this->used_cont_urls[$md5_cont_url]=1;
continue;
}
}
}
// When the request carries a 'backstage' input, look up this task's dateline in
// the backstage_task cache and terminate the script if the entry is missing or
// newer than the runtime value stored in $GLOBALS.
if(input('?backstage')){
$backstageDate=CacheModel::getInstance('backstage_task')->db()->field('dateline')->where('cname',$this->collector['task_id'])->find();
if(empty($backstageDate)||$GLOBALS['backstage_task_runtime']<$backstageDate['dateline']){
unset($GLOBALS['backstage_task_ids'][$this->collector['task_id']]);
exit('终止进程');
}
}
// Skip URLs that already have an entry in the cont_url cache, otherwise create one
// before fetching (presumably a guard against concurrent collection - TODO confirm).
if($mcacheCont->getCount($md5_cont_url)>0){
$this->used_cont_urls[$md5_cont_url]=1;
continue;
}
$mcacheCont->setCache($md5_cont_url, 1);
$this->echo_msg($echo_str."采集内容页:{$cont_url}",'black');
$field_vals_list=$this->getFields($cont_url);
// Loop mode is on whenever $this->first_loop_field is non-empty.
$is_loop=empty($this->first_loop_field)?false:true;
if(!empty($field_vals_list)){
// Real-time release requires both the config switch and a releaser object in $GLOBALS.
$is_real_time=false;
if(!empty($GLOBALS['config']['caiji']['real_time'])&&!empty($GLOBALS['real_time_release'])){
$is_real_time=true;
}
if(!$is_loop){
// Non-loop mode: wrap the single result so the loop below handles both modes.
$field_vals_list=array($field_vals_list);
}else{
// Loop mode: give every row its own URL of the form "<page url>#<md5 of serialized row>".
$loop_cont_urls=array();
foreach ($field_vals_list as $k=>$field_vals){
$loop_cont_urls[$k]=$cont_url.'#'.md5(serialize($field_vals));
}
if(!empty($loop_cont_urls)){
// Drop rows whose per-row URL is already returned by getUrlByUrl().
$loop_exists_urls=$mcollected->getUrlByUrl($loop_cont_urls);
if(!empty($loop_exists_urls)){
$loop_exists_urls=array_flip($loop_exists_urls);
foreach ($loop_cont_urls as $k=>$loop_cont_url){
if(isset($loop_exists_urls[$loop_cont_url])){
unset($field_vals_list[$k]);
}
}
$this->echo_msg($echo_str.'已过滤'.count($loop_exists_urls).'条重复数据','black');
}
}
// Report how many entries are listed under this URL in $this->exclude_cont_urls.
if(isset($this->exclude_cont_urls[$md5_cont_url])){
$excludeNum=0;
foreach($this->exclude_cont_urls[$md5_cont_url] as $k=>$v){
$excludeNum+=count($v);
}
$this->echo_msg($echo_str.'通过数据处理排除了'.$excludeNum.'条数据','black');
}
// Re-index after the unset() calls above.
$field_vals_list=array_values($field_vals_list);
}
foreach ($field_vals_list as $field_vals){
$collected_error='';
$collected_data=array('url'=>$cont_url,'fields'=>$field_vals);
if($is_loop){
// Same per-row URL scheme as used for the duplicate filter above.
$collected_data['url'].='#'.md5(serialize($field_vals));
}else{
// Non-loop mode: an exclusion entry for this URL becomes the row's error message.
if(isset($this->exclude_cont_urls[$md5_cont_url])){
$collected_error=reset($this->exclude_cont_urls[$md5_cont_url]);
$collected_error=$this->exclude_url_msg($collected_error);
}
}
if(empty($collected_error)){
// Take the title from the configured title field, then reject rows whose title
// already exists according to getCountByTitle().
if(!empty($this->config['field_title'])){
$collected_data['title']=$field_vals[$this->config['field_title']]['value'];
}
if(!empty($collected_data['title'])){
if($mcollected->getCountByTitle($collected_data['title'])>0){
$collected_error='标题重复:'.mb_substr($collected_data['title'],0,300,'utf-8');
}
}
}
if(empty($collected_error)){
if($is_real_time){
// Export immediately, then keep only the URL in the collected list.
$GLOBALS['real_time_release']->export(array($collected_data));
unset($collected_data['fields']);
unset($collected_data['title']);
}
$this->collected_field_list[]=$collected_data;
}else{
// Rejected row: record the error when repeats are disallowed, otherwise just echo it.
if(!$this->config['url_repeat']){
controller('ReleaseBase','event')->record_collected($collected_data['url'],
array('id'=>0,'error'=>$collected_error),array('task_id'=>$this->collector['task_id'],'module'=>$this->release['module'])
);
}else{
$this->echo_msg($collected_error);
}
}
}
}
if($is_loop){
// Loop mode: also record the page URL itself via record_collected().
controller('ReleaseBase','event')->record_collected(
$cont_url,array('id'=>1,'target'=>'','desc'=>'循环入库'),array('task_id'=>$this->collector['task_id'],'module'=>$this->release['module']),null,false
);
}
}else{
$this->echo_msg('已采集过该网址','black');
}
// Mark the URL as handled for the rest of this run.
$this->used_cont_urls[$md5_cont_url]=1;
// NOTE(review): the innermost `if` below is truncated in this copy of the file - the
// text between `$cur_c_i` and `setCache(` is missing (it presumably held the rest of
// the condition, the end of the inner foreach and the start of the cache-writing
// branch). Confirm against the upstream source.
if($this->collect_num>0){
if(count($this->collected_field_list)>=$this->collect_num){
if($cur_c_isetCache(md5($cont_key),$cont_key);
}elseif($source_type==2){
$mcacheLevel->setCache(md5($cont_key),$cont_key);
}
// Mark the group key as used so later calls skip it.
if($source_type==2){
$this->used_level_urls[$cont_key]=1;
}else{
$this->used_source_urls[$cont_key]=1;
}
}
// Stop walking further groups once the requested number of rows has been collected.
if($this->collect_num>0&&count($this->collected_field_list)>=$this->collect_num){
break;
}
}
}
}
?>