在好例子网,分享、交流、成长!
您当前所在位置:首页Perl 开发实例Perl网络编程 → perl脚本爬虫程序,支持爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等

perl脚本爬虫程序,支持爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等

Perl网络编程

下载此实例
  • 开发语言:Perl
  • 实例大小:0.18M
  • 下载次数:30
  • 浏览次数:1372
  • 发布时间:2013-01-08
  • 实例类别:Perl网络编程
  • 发 布 人:星火燎原
  • 文件格式:.rar
  • 所需积分:2
 相关标签: 采集

实例介绍

【实例简介】

采用perl脚本写的爬虫程序,可以爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等
【实例截图】


【核心代码】

抓取代码一:

 

#!/usr/bin/perl -w

package Parser_ifeng;
use Exporter;
our @ISA=qw(Exporter);

our @EXPORT=qw(parser_ifeng);

#use strict;
use warnings;
use LWP::UserAgent;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;

#binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
#binmode(STDERR, ':encoding(utf8)');


my $workdir="/home/webscience/proj/";
my $pmdir="$workdir/pm/";
my $logdir="$workdir/log/";
my $logfile="$logdir/ifeng.log";
my $logfp;
my $tmpdir="$workdir/tmp/";
my $tmpfile="$tmpdir/ifeng.tmp";
my $tmpfile1="$tmpdir/ifeng1.tmp";
my %board_list=();
my $main_url="http://www.tianya.cn/bbs/index.shtml";



sub parser_ifeng
{

	#初始化日志文件
	open($logfp,">>$logfile") or die "can't open file:$!";

	$board_list{'点击排行榜'}="http://news.ifeng.com/toprank/day/";
	$board_list{'评论排行榜'}="http://news.ifeng.com/comment/day/";
	$board_list{'表情_感动'}="http://news.ifeng.com/mood/day/1/";
	$board_list{'表情_同情'}="http://news.ifeng.com/mood/day/2/";
	$board_list{'表情_无聊'}="http://news.ifeng.com/mood/day/3/";
	$board_list{'表情_愤怒'}="http://news.ifeng.com/mood/day/4/";
	$board_list{'表情_搞笑'}="http://news.ifeng.com/mood/day/5/";
	$board_list{'表情_难过'}="http://news.ifeng.com/mood/day/6/";
	$board_list{'表情_高兴'}="http://news.ifeng.com/mood/day/7/";
	$board_list{'表情_路过'}="http://news.ifeng.com/mood/day/8/";

        my $key;
        my $value;
	local %url_hash;#用于去除重复的新闻
        while(($key, $value) = each %board_list){
		my %board;
		$board{'board_name'}=$key;
		$board{'board_url'}=$value;
		$board{'parent_name'}="凤凰网";
		#获取版块id,若是新版块,则写入数据库
		#------
		my ($fid, $parent_id) = insert2forum(%board);
		$board{'board_id'} = $fid;
		$board{'parent_id'} = $parent_id;
		parser_board(%board);
	}
	
	phash(%board_list);#test
}


sub parser_board(%)
{
	my %attr=();
	my $token;
	my $page;
	my $p;
	my %board=@_;
	my $url=$board{'board_url'};
	my @poster_list;
	
        my $tmp;
	my $i=0;
	#获取当前时间
	my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
	$t_year =1900;
	$t_mon =1;
	#unix时间
	local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);

	print "board_url=$url\n";
        $page=get($url);

        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);	
	
	#解析新闻列表
        while ($token = $p->get_tag("div")) {
                %attr=%{$token->[1]};
		if(!$attr{'class'} || $attr{'class'} ne 'listPublic')
			{next;}

		$token = $p->get_tag("a");
                %attr=%{$token->[1]};

		#去除重复的新闻	
		my $href=$attr{'href'};
		if($url_hash{"$href"})
			{next;	}

		$i  ;
		$poster_list[$i]->{'poster_url'}=$attr{'href'};
		$poster_list[$i]->{'title_name'}=$p->get_trimmed_text("/a");
		$poster_list[$i]->{'board_name'}=$board{'board_name'};
		$poster_list[$i]->{'board_id'} = $board{'board_id'};	#Add board id
		$poster_list[$i]->{'parent_id'} = $board{'parent_id'};	#Add board id
		$url_hash{"$href"}=$poster_list[$i]->{'title_name'};
			
#		phash(%{$poster_list[$i]});exit;
        }

	$i=1;
	foreach $poster (@poster_list)
	{
		if(!$poster || $poster eq ""){next;}
		phash(%$poster);
		
		my ($code, $poster, $reply) = parser_poster(\%$poster);
		#if($code == 0){
		#	insert2topic($board{'board_id'}, \%$poster, \@$reply);
		#}		
		$i  ;	
	}
}

#获取主贴信息
sub parser_poster
{
	my %attr=();
	my $page;
	my $tmp;
	my $p;
	my $token;	
	my $poster=$_[0];
	local @reply;
	my $url=$poster->{'poster_url'};
	
        $page=get($url);
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);	

	#定位到论坛开始 主贴内容开始标识<div id="artical_sth">	
        while($token=$p->get_tag('div')){
                %attr=%{$token->[1]};
                if($attr{'id'} && $attr{'id'} eq 'artical_sth')
                        {last;}
        }
	if(!$token){
		wlog("$url 格式错误,跳过此帖");	
		return -1;
	}

	#抓取发帖时间
	$token=$p->get_tag('span');
	$tmp=$p->get_trimmed_text('/span');
	$pattern='([\d]*)年([\d]*)月([\d]*)日 ([\d]*):([\d]*)';
	if(!($tmp =~ m/$pattern/)){
		wlog("$url 时间获取失败,跳过此帖");	
		return -1;
	}
 	$poster->{'post_time'}="$1-$2-$3 $4:$5:00";		
	
	#抓取新闻来源
	$token=$p->get_tag('span');
	$poster->{'info_source'}=$p->get_trimmed_text('/span');

	#抓取评论数
	$token=$p->get_tag('span');
	$poster->{'reply_num'}=$p->get_trimmed_text('/span');

	#查找主贴内容 定位到<div id="clear">	
	while($token=$p->get_token()){
		if($token->[0] eq "C"&& $token->[1] eq "<!--mainContent begin-->" )
			{last;}
	}
	if(!$token){
		wlog("$url 抓取内容失败,跳过此帖");	
		return -1;
	}
	$poster->{'content'}=$p->get_trimmed_text('div');
	
	#将帖子内容和回复数据写入数据库
	#insert2topic("topic", %$poster);
	phash(%$poster);#test
	#sqldeal(\%$poster,\@reply);
	insert2news(\%$poster);
	return 0;
}

#获取回复信息
sub get_reply()
{
	my $p=$_[0];
	my $tmp;
	my $page;
	my $i=1;
	my $url;
	my %attr;
	my %attr1;
	my $token;
	my $token1;

R_BEGIN: while($token=$p->get_tag('table','div')){#'td')){
		%attr=%{$token->[1]};
		#<td  id="pageDivBottom">,定位下一页url
		if($token->[0] eq 'div' && $attr{'id'} && $attr{'id'} eq 'adsp_content_banner_2'){
####		if($token->[0] eq 'td' && $attr{'id'} && $attr{'id'} eq 'pageDivBottom'){
			while($token=$p->get_tag('a')){
				if($p->get_trimmed_text('/a') eq '下一页'){
					%attr=%{$token->[1]};
					$url=$attr{'href'};
				        $page=get($url);
				        utf8::decode($page);
				        $p= HTML::TokeParser->new(\$page);

					#重新定位到<div id="pContentDiv">	
					while($token=$p->get_tag('div')){
						%attr=%{$token->[1]};
						if($attr{'id'} && $attr{'id'} eq 'pContentDiv')
							{next R_BEGIN;}
					}
				}
			}		
			last R_BEGIN;#没有下一页,已经是最后一页,退出R_BEGIN
		}elsif($token->[0] eq 'div'){
			next;
		}

                ##抓取回复, get_tag('table')
                #获取回帖用户id
                $p->get_tag('a');
		$reply[$i]->{'uid'}=$p->get_trimmed_text('/a');

                #获取回帖时间
                $tmp=$p->get_trimmed_text('/td');
                $patern='回复日期:([\d\D]*)';
                if($tmp=~m/$patern/)
                        {$reply[$i]->{'reply_time'}=$1;}

                #获取回帖内容
                $reply[$i]->{'content'}=$p->get_trimmed_text('table','div');
                $i  ;
	}
}

#打印reply数组内容
sub preply(@)
{
	my @reply=@_;
	my $i;

	for($i=1;$i<=$#reply;$i  )
	{print "[$i]\n";phash(%{$reply[$i]});}
}

#写入日志到
sub wlog($)
{
	my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
	$t_year =1900;
	$t_mon =1;
        print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
        print  "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}

#打印hash表内容
sub phash(%)
{
	my $key;
	my $value;
	my %hash =@_;	
	my $i=1;

	while(($key, $value) = each %hash){   print $i  ,"  $key = $value\n"; } #test
	print "\n";
}

sub sqldeal(\%\@)
{
	my %poster =%{$_[0]};
	my @reply =@{$_[1]};

#	phash(%$poster);#test
#	preply(@reply);#test

}

1;

猫扑采集代码二:

 

#!/usr/bin/perl
package Parser_mop;

use Exporter;
our @ISA = qw{Exporter};
our @EXPORT = qw{parser_mop};

use warnings;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;

binmode STDOUT,":utf8";

my $workdir = "/home/webscience/proj/";
my $logfile = "$workdir/log/mop.log";
my $logfh;
my $preurl = "http://dzh2.mop.com/static/rrlistSubLeft_0_";
my $mainurl = "http://dzh2.mop.com/static/rrlistSubLeft_0_0.html";
my $token;
my %board_list = ();
my @poster_list = ();
my %poster = ();
my %attr = ();
my %board = ();
my $i = 0;
my $tmp;

sub parser_mop
{
    my $key;
    my $value;   

    $board_list{"人肉搜索"} = "http://dzh2.mop.com/static/rrlistSubLeft_0_0.html";   

    
    #initial log file
    open($logfh,">>$logfile") or die "cannot open file: $!\n";    

     while(($key, $value) = each %board_list){
    	my %board;
	$board{'parent_name'} = '猫扑';    
        $board{'board_name'} = $key;
        $board{'board_url'} = $value;
         
	my ($fid, $parent_id) = insert2forum(%board);
	$board{'board_id'} = $fid;
	$board{'parent_id'} = $parent_id;
	parser_board(%board);
    }
  
}

sub parser_board
{
    my %attr = ();
    my $token;
    my $page;
    my $p;
    my %board = @_;
    my $boardurl = $board{'board_url'};
    my @poster_list;
    my $url = $boardurl;
    my $backurl = ".html";

    for($count = 0; $count <= 4; $count  )
    {   
         my $tmpurl = "$count$backurl";
         $url = "$preurl$tmpurl";
         print $url;
         print "\n";

         $page = get($url); # or die "get $url failed.\n";
         if(!$page){last;}
         utf8::decode($page);
         $p = HTML::TokeParser->new(\$page);
        
        #parser news list
        #定位到论坛开始
        $i = 0;
        while($token=$p->get_tag("font")){
	     if($p->get_trimmed_text("/font") eq '我的回答'){last;}
        }
        if(!$token){
	     wlog("$url 无法定位帖子列表");	
	     return -1;
        }

        
        #解析帖子列表
        while($token = $p->get_tag("a")){
             %attr = %{$token->[1]};
             $pattern = '([\d] )条';
             $tmp = $p->get_trimmed_text("/a");
             if($tmp =~ m/$pattern/)
             {
           
                 $poster_list[$i]->{'poster_url'} = "$attr{'href'}";               
                 $poster_list[$i]->{'title_name'} = $tmp;
                 $poster_list[$i]->{'reply_num'} = $1;  
                 $poster_list[$i]->{'click_num'} = 0;  
                 $poster_list[$i]->{'board_name'} = $board{'board_name'};
                 $poster_list[$i]->{'board_id'} = $board{'board_id'};
	         $poster_list[$i]->{'parent_id'} = $board{'parent_id'}; 
                  $i  ;    
             }
             next;            
                        
        }

    }
    
    
     foreach $poster (@poster_list){
           if(!$poster || $poster eq ""){next;}
  #         phash(%$poster);
          
           #my($code, $poster, $reply) = 
           parser_poster(%$poster);
           
     }      

    
}

sub parser_poster
{
   
     my %poster = @_;
     my $poster_url = $poster{'poster_url'};
   #  print "$poster_url";
     my @reply;
     $i = 0;

     my $page = get($poster_url);
     utf8::decode($page);
     my $p = HTML::TokeParser->new(\$page);
   #  print "$page";
     
     #main poster
     while($token = $p->get_tag("strong"))
     {
          %attr = %{$token->[1]};
          if(!$attr{'class'} || $attr{'class'} ne "red"){next;}
          $tmp = $p->get_trimmed_text("/strong");
          @mplist = split(/:/,$tmp,2);
#         if(!$mplist || $mplist[0] ne "悬赏值"){next;}
          $pattern = ':([\d] )';
          if($tmp =~ m/$pattern/){
                $poster->{'mp'} = $1;               
          }
          else { last;}
          while($token = $p->get_tag("span"))
          {
                %attr = %{$token->[1]};
                 if(!$attr{'id'} || $attr{'id'} ne "span_Body"){next;}
          }
          $poster->{'content'} = $p->get_trimmed_text("/span"); 
          #post time
          $token = $p->get_tag("em");
          $tmp = $p->get_trimmed_text("/em");
          $pattern = '([\d]{4})-([\d] )-([\d] )([\s] )([\d] ):([\d] ):([\d]) ';
          if($tmp =~ m/$pattern/){ $poster->{'post_time'} = "$1-$2-$3 $5:$6:$7";}

          #post author
          $token = $p->get_tag("span");
          $poster->{'author'} = $p->get_trimmed_text("/span");       
                         	
         
          last;
               
          
     }
     if(!$token){
          wlog("$poster_url error skip this page");
          return -1;
      }

     $tmpp = $p;
     #get reply info
     #reply 1
     while($token = $p->get_tag("p"))
     {
          %attr = %{$token->[1]};
          if(!$attr{'id'} || $attr{'id'} ne "id_topicreply0"){next;}
          $token = $p->get_tag("font");
          if(!$token){last;}
          %attr = %{$token->[1]};
          $reply[$i]->{'reply_time'} = $attr{'title'};
          $token = $p->get_tag("span");          
          $reply[$i]->{'content'} = $p->get_trimmed_text("/span");
          $token = $p->get_tag("span"); 
          $token = $p->get_tag("span"); 
          $reply[$i]->{'uid'} = $p->get_trimmed_text("/span");         
          $i  ;
          last;
     }
     
     $p = $tmpp;
     while($token = $p->get_tag("hr"))
     {          
          %attr = %{$token->[1]};
          if(!$attr{'class'} || $attr{'class'} ne "hr2"){next;}
          $token = $p->get_tag("font");
          if(!$token){last;}
          %attr = %{$token->[1]};
          $reply[$i]->{'reply_time'} = $attr{'title'};
          $token = $p->get_tag("span");          
          $reply[$i]->{'content'} = $p->get_trimmed_text("/span");
          $token = $p->get_tag("span"); 
          $token = $p->get_tag("span"); 
          $reply[$i]->{'uid'} = $p->get_trimmed_text("/span");         
          $i  ;
          
     }

     
     phash(%$poster);
     preply(@reply);
     #database 
     insert2topic(\%$poster,\@reply); 
     return 0;
}

#print reply content
sub preply(@)
{
   
   my @reply = @_;
   for(my $i = 0; $i < $#reply; $i  )
   {
        print "[$i]";
        phash (%{$reply[$i]});
   }
}
#print hash
sub phash(%)
{
    my $key;
    my $value = "";
    my %hash =@_;	
    $i=1;
    while(($key, $value) = each %hash){   print $i  ,"  $key = $value  \n"; } #test
    print "\n";
}
sub wlog($)
{
     ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
     $t_year =1900;
     $t_mon =1;
     print $logfh "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
     print  "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}


1;


 

新浪采集代码三:

 

#!/usr/bin/perl
package Parser_mil_news_sina;

use Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(parser_mil_news_sina);

use warnings;
#use strict;
use LWP::Simple;
use LWP::UserAgent;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;

binmode STDOUT,":utf8";

my $workdir = "/home/webscience/proj/";
my $logdir = "$workdir/log/";
my $logfile = "$logdir/news_sina.log";
my $logfh;
my $preurl = "http://mil.news.sina.com.cn/";
my %board_list = ();
my %board_all = ();


sub parser_mil_news_sina
{ 
     open($logfh,">>logfile") or die "cannot open the log file.\n";
     my $url = "http://mil.news.sina.com.cn/hotnews/Daily/index.shtml";
     my %board;
     $board{'parent_name'} = '军事论坛_新浪网';    
     $board{'board_name'} = '军事热门帖子';
     $board{'board_url'} = $url;
         
     my ($fid, $parent_id) = insert2forum(%board);
     $board{'board_id'} = $fid;
     $board{'parent_id'} = $parent_id;
     parser_board(%board);
     
}

sub parser_board(%)
{
    my %attr = ();
    my $token;
    my $page;
    my $p;
    my %board = @_;
    my $url = $board{'board_url'};
    my @poster_list = ();
   # my $poster ;

    my $i = 0;               
    #军事热门帖子            
    $page = get($url); # or die "get $url failed.\n"; 
    utf8::decode($page);
    $p = HTML::TokeParser->new(\$page);
        
    #parser news list
    while($token = $p->get_tag("tbody")){
          while($token = $p->get_token()){
               if($token->[0] eq "C" && $token->[1] eq "<!-- 帖子排行列表 begin -->")
               {last;}
          }
          if(!$token){
               wlog("$url get 帖子排行列表 failed.");
               return -1;
          }  

         while($token = $p->get_tag("td")){
              %attr = %{$token->[1]};
              if(!$attr{'class'} || $attr{'class'} ne 'linkNews'){ next;}
                 
              $i  ;
              $token = $p->get_tag("a");
              %attr = %{$token->[1]};
              $poster_list[$i]->{'poster_url'} = $attr{'href'};                                 
              $poster_list[$i]->{'title_name'} = $p->get_trimmed_text("/a");
              $token = $p->get_tag("td");
              my $text = $p->get_trimmed_text("/td");
              if($text =~ /:/){$poster_list[$i]->{'post_time'} = "2010-".$text;
                  $poster_list[$i]->{'click_num'} = 0;}
              else {$poster_list[$i]->{'click_num'} = $text;
                    $poster_list[$i]->{'post_time'} = "";}
              
              $poster_list[$i]->{'board_name'} = $board{'board_name'};
              $poster_list[$i]->{'board_id'} = $board{'board_id'};
              $poster_list[$i]->{'parent_id'} = $board{'parent_id'};	#Add board id         
         }
    }
    
             
    

     $i = 1;
     foreach my $poster (@poster_list){
           if(!$poster || $poster eq ""){next;}
           phash(%$poster);
          
           my($code, $poster, $reply) = parser_poster(\%$poster);
           $i  ;
           
     }      
     
      
}
#get main poster content
sub parser_poster
{
     my %attr = ();
     my $page;
     my $p;
     my $token;
     my $poster = $_[0];
     my $url = $poster->{'poster_url'};
   # my $post_time = $poster->{'poster_url'};
     $poster->{'content'} = "content missed";
     $poster->{'keyword'} = "no keyword";     
     $poster->{'reply_num'} = 0;
     if($poster->{'post_time'} eq "")
     {   exit;}
    
     #get current time
     my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
     $t_year =1900;
     $t_mon =1;
     #unix时间
  #   local my $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);


     $page = get($url);
     utf8::decode($page);
     $p = HTML::TokeParser->new(\$page);
     if(!$p){ print "$url cannot get to";return -1;}

     #local to author
     while($token = $p->get_tag("td")){
         %attr = %{$token->[1]};
         print "$attr{'class'}"; 
         if(!$attr{'class'}  && $attr{'class'} ne "postauthor"){ next;} 
             
             $token = $p->get_tag();
             $token = $p->get_tag("a");
              $poster->{'author'} = $p->get_trimmed_text('/a');
              print "author: $poster->{'author'}";
              print "\n";
              last;
             
             if(!$token){
                  wlog("$url get author failed.");
                  return -1;
             }                        
         last;  
     }
     
     
     while($token = $p->get_tag("td")){
           %attr = %{$token->[1]};
           if($attr{'class'}  && $attr{'class'} eq "postcontent")
           {
                while($token = $p->get_tag("em")){
                      %attr = %{$token->[1]};
                      if(my $text = $p->get_trimmed_text("/em") eq "小")
                      {
                           $poster->{'post_time'} = $p->get_text();
                           last;
                      }
                }
                 #news count
                while($token = $p->get_tag("span")){
                     if($token = $p->get_tag("font"))
                     {last;}
                }
                if(!$token){
                      wlog("$url get news count failed.");
                      return -1;
                 }    
                 $poster->{'click_num'} = $p->get_trimmed_text('/font'); 
                if($token = $p->get_tag("font"))
                {    $poster->{'reply_num'} = $p->get_trimmed_text('/font');}
                  #news content 
                  while($token = $p->get_tag("p")){
                        %attr = %{$token->[1]};
                         if($attr{'align'} && $attr{'align'} eq "center")
                         {
                               $poster->{'content'} = $p->get_trimmed_text("font");
                         }
                  }
                 last;
            }
      }
     
  
        
       
               
            
       #click num > 5000
       
       insert2news(\%$poster);
       phash(%$poster);  
      
	 
}

#写入日志到
sub wlog($)
{
	($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
	$t_year =1900;
	$t_mon =1;
        print $logfh "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
        print  "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}

#打印hash表内容
sub phash(%)
{
	my $key;
	my $value;
	my %hash =@_;	
	my $i=1;

	while(($key, $value) = each %hash){   print $i  ,"  $key = $value\n"; } #test
	print "\n";
}



1;





















163代码四:

 

#!/usr/bin/perl -w

package Parser_163;
use Exporter;
our @ISA=qw(Exporter);

our @EXPORT=qw(parser_163);

#use strict;
use warnings;
use LWP::UserAgent;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;

#binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
#binmode(STDERR, ':encoding(utf8)');


my $workdir="/home/webscience/proj/";
my $pmdir="$workdir/pm/";
my $logdir="$workdir/log/";
my $logfile="$logdir/163.log";
my $logfp;
my $tmpdir="$workdir/tmp/";
my $tmpfile="$tmpdir/163.tmp";
my %board_list=();
my $main_url="http://news.163.com/special/0001386F/rank_whole.html";



sub parser_163
{

	#初始化日志文件
	open($logfp,">>$logfile") or die "can't open file:$!";
	wlog("开始抓取163新闻\n");

	local %url_hash;#用于去除重复的新闻
	my %board;
	$board{'board_name'}="163新闻";
	$board{'board_url'}=$main_url;
	$board{'parent_name'}='163新闻';
	#获取版块id,若是新版块,则写入数据库
	#------
	my ($fid, $parent_id) = insert2forum(%board);
	$board{'board_id'} = $fid;
	$board{'parent_id'} = -1;
	parser_board(%board);
	
}


sub parser_board(%)
{
	my %attr=();
	my $token;
	my $page;
	my $p;
	my %board=@_;
	my $url=$board{'board_url'};
	my @poster_list;
	
        my $tmp;
	my $i=0;
	#获取当前时间
	my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
	$t_year =1900;
	$t_mon =1;
	#unix时间
	local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);

        $page=get($url);
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);	
	
	#解析新闻列表,定位到<div class="tabContents active">
        while ($token = $p->get_tag("div")) {
                %attr=%{$token->[1]};
                if($attr{'class'} && $attr{'class'} eq 'tabContents active')
                        {last;}
	}
        while ($token = $p->get_tag("a","div")) {
		#以<div class="nav_channel_end">为终止标记
                %attr=%{$token->[1]};
                if($attr{'class'} && $attr{'class'} eq 'nav_channel_end')
                        {last;}
		if($token->[0] eq "div")
			{next;}
		#去除重复的新闻	
		my $href=$attr{'href'};
		if($url_hash{"$href"})
			{next;	}

		$i  ;
		$poster_list[$i]->{'poster_url'}=$attr{'href'};
		$poster_list[$i]->{'title_name'}=$p->get_trimmed_text('/a');;
		$poster_list[$i]->{'board_name'}=$board{'board_name'};
		$poster_list[$i]->{'board_id'} = $board{'board_id'};	#Add board id
		$poster_list[$i]->{'parent_id'} = $board{'parent_id'};	#Add board id
		$url_hash{"$href"}=$poster_list[$i]->{'title_name'};
			
#		phash(%{$poster_list[$i]});
        }

	$i=1;
	foreach $poster (@poster_list)
	{
		if(!$poster || $poster eq ""){next;}
		my ($code, $poster, $reply) = parser_poster(\%$poster);
		#if($code == 0){
		#	insert2topic($board{'board_id'}, \%$poster, \@$reply);
		#}		
		$i  ;	
	}
}

#获取主贴信息
sub parser_poster
{
	my %attr=();
	my $page;
	my $tmp;
	my $p;
	my $token;	
	local $poster=$_[0];
	local @reply;
	my $url=$poster->{'poster_url'};
	
        $page=get($url);
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);	

	#定位到论坛开始 主贴内容开始标识<div class="sourceTime"	
        while($token=$p->get_tag('div')){
                %attr=%{$token->[1]};
                if($attr{'class'} && $attr{'class'} eq 'sourceTime')
                        {last;}
        }
	if(!$token){
		wlog("$url 格式错误,跳过此帖");	
		return -1;
	}

	#抓取新闻来源
	$token=$p->get_tag('span');
	$poster->{'info_source'}=$p->get_trimmed_text('/span');
	
	#抓取发帖时间
	$token=$p->get_tag('div');
	$tmp=$p->get_trimmed_text('/div');
	$pattern='([\d]*)年([\d]*)月([\d]*)日([\d]*):([\d]*)';
	if(!($tmp =~ m/$pattern/)){
		wlog("$url 时间获取失败,跳过此帖");	
		return -1;
	}
 	$poster->{'post_time'}="$1-$2-$3 $4:$5:00";		
	
	#获取评论url,获取评论数
	while($token=$p->get_tag('a')){
		$tmp=$p->get_trimmed_text('/a');
		if($tmp eq '我来说两句'){
			%attr=%{$token->[1]};		
			$reply_url=$attr{'href'};
		}
	}
	get_reply($reply_url);

	#查找主贴内容 定位到<div class="text clear" id="contentText"	
        $p= HTML::TokeParser->new(\$page);	
        while($token=$p->get_tag('div')){
                %attr=%{$token->[1]};
                if($attr{'id'} && $attr{'id'} eq 'contentText')
                        {last;}
	}
	if(!$token){
		wlog("$url 抓取内容失败,跳过此帖");	
		return -1;
	}
	$poster->{'content'}=$p->get_trimmed_text('script');
	
	#将帖子内容和回复数据写入数据库
	#insert2topic("topic", %$poster);
	#phash(%$poster);#test
	#preply(@reply);#exit;
	#sqldeal(\%$poster,\@reply);
	insert2news(\%$poster);
	return 0;
}

#获取回复信息
sub get_reply()
{
	my $p;
	my $tmp;
	my $page;
	my $i=1;
	my $url=$_[0];
	my $isnext=0;
	my %attr;
	my $token;
	
	#获取评论数
	$page=get($url);
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);	
	while($token=$p->get_tag('span')){
                %attr=%{$token->[1]};
		#定位到<span id="ptdown">
		if($attr{'id'} && $attr{'id'} eq "ptdown"){
			$tmp=$p->get_trimmed_text('/span');
			$tmp =~ m/共有原创评论([\d]*)条/;
			$poster->{'reply_num'} = $1;
			return 0;
		}
	}
}

#打印reply数组内容
sub preply(@)
{
	my @reply=@_;
	my $i;

	for($i=1;$i<=$#reply;$i  )
	{print "[$i]\n";phash(%{$reply[$i]});}
}

#写入日志到
sub wlog($)
{
	my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
	$t_year =1900;
	$t_mon =1;
        print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
        print  "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}

#打印hash表内容
sub phash(%)
{
	my $key;
	my $value;
	my %hash =@_;	
	my $i=1;

	while(($key, $value) = each %hash){   print $i  ,"  $key = $value\n"; } #test
	print "\n";
}

sub sqldeal(\%\@)
{
	my %poster =%{$_[0]};
	my @reply =@{$_[1]};

#	phash(%$poster);#test
#	preply(@reply);#test

}

1;

搜狐采集代码五:

 

#!/usr/bin/perl -w

package Parser_sohu;
use Exporter;
our @ISA=qw(Exporter);

our @EXPORT=qw(parser_sohu);

#use strict;
use warnings;
use LWP::UserAgent;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;

#binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
#binmode(STDERR, ':encoding(utf8)');


my $workdir="/home/webscience/proj/";
my $pmdir="$workdir/pm/";
my $logdir="$workdir/log/";
my $logfile="$logdir/sohu.log";
my $logfp;
my $tmpdir="$workdir/tmp/";
my $tmpfile="$tmpdir/sohu.tmp";
my %board_list=();
my $main_url="http://comment.news.sohu.com/djpm/";



sub parser_sohu
{

	#初始化日志文件
	open($logfp,">>$logfile") or die "can't open file:$!";
	wlog("开始抓取sohu新闻\n");

	local %url_hash;#用于去除重复的新闻
	my %board;
	$board{'board_name'}="搜狐新闻";
	$board{'board_url'}=$main_url;
	$board{'parent_name'}='搜狐新闻';
	#获取版块id,若是新版块,则写入数据库
	#------
	my ($fid, $parent_id) = insert2forum(%board);
	$board{'board_id'} = $fid;
	$board{'parent_id'} = -1;
	parser_board(%board);
	
}


sub parser_board(%)
{
	my %attr=();
	my $token;
	my $page;
	my $p;
	my %board=@_;
	my $url=$board{'board_url'};
	my @poster_list;
	
        my $tmp;
	my $i=0;
	#获取当前时间
	my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
	$t_year =1900;
	$t_mon =1;
	#unix时间
	local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);

        $page=get($url);
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);	
	
	#解析新闻列表,定位到<tbody>
        while ($token = $p->get_tag("/tbody")) {
        	for($j=1;$j<=10;$j  ){
			$token = $p->get_tag("tr");
			$token = $p->get_tag("a");
        	        %attr=%{$token->[1]};

			#去除重复的新闻	
			my $href=$attr{'href'};
			if($url_hash{"$href"})
				{next;	}

			$i  ;
			$poster_list[$i]->{'poster_url'}=$attr{'href'};
			if($attr{'title'}){
				$poster_list[$i]->{'title_name'}=$attr{'title'};
			}else{
				$poster_list[$i]->{'title_name'}=$p->get_trimmed_text('/a');;
			}
			$poster_list[$i]->{'board_name'}=$board{'board_name'};
			$poster_list[$i]->{'board_id'} = $board{'board_id'};	#Add board id
			$poster_list[$i]->{'parent_id'} = $board{'parent_id'};	#Add board id
			$url_hash{"$href"}=$poster_list[$i]->{'title_name'};
			
#			phash(%{$poster_list[$i]});
		}
        }

	$i=1;
	foreach $poster (@poster_list)
	{
		if(!$poster || $poster eq ""){next;}
		
		my ($code, $poster, $reply) = parser_poster(\%$poster);
		#if($code == 0){
		#	insert2topic($board{'board_id'}, \%$poster, \@$reply);
		#}		
		$i  ;	
	}
}

#获取主贴信息
sub parser_poster
{
	my %attr=();
	my $page;
	my $tmp;
	my $p;
	my $token;	
	local $poster=$_[0];
	local @reply;
	my $url=$poster->{'poster_url'};
	
        $page=get($url);
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);	

	#定位到论坛开始 主贴内容开始标识<div class="sourceTime"	
        while($token=$p->get_tag('div')){
                %attr=%{$token->[1]};
                if($attr{'class'} && $attr{'class'} eq 'sourceTime')
                        {last;}
        }
	if(!$token){
		wlog("$url 格式错误,跳过此帖");	
		return -1;
	}

	#抓取新闻来源
	$token=$p->get_tag('span');
	$poster->{'info_source'}=$p->get_trimmed_text('/span');
	
	#抓取发帖时间
	$token=$p->get_tag('div');
	$tmp=$p->get_trimmed_text('/div');
	$pattern='([\d]*)年([\d]*)月([\d]*)日([\d]*):([\d]*)';
	if(!($tmp =~ m/$pattern/)){
		wlog("$url 时间获取失败,跳过此帖");	
		return -1;
	}
 	$poster->{'post_time'}="$1-$2-$3 $4:$5:00";		
	
	#获取评论url,获取评论数
	while($token=$p->get_tag('a')){
		$tmp=$p->get_trimmed_text('/a');
		if($tmp eq '我来说两句'){
			%attr=%{$token->[1]};		
			$reply_url=$attr{'href'};
		}
	}
	get_reply($reply_url);

	#查找主贴内容 定位到<div class="text clear" id="contentText"	
        $p= HTML::TokeParser->new(\$page);	
        while($token=$p->get_tag('div')){
                %attr=%{$token->[1]};
                if($attr{'id'} && $attr{'id'} eq 'contentText')
                        {last;}
	}
	if(!$token){
		wlog("$url 抓取内容失败,跳过此帖");	
		return -1;
	}
	$poster->{'content'}=$p->get_trimmed_text('script');
	
	#将帖子内容和回复数据写入数据库
	#insert2topic("topic", %$poster);
#	phash(%$poster);#test
	#preply(@reply);#exit;
	#sqldeal(\%$poster,\@reply);
	insert2news(\%$poster);
	return 0;
}

#获取回复信息
sub get_reply()
{
	my $p;
	my $tmp;
	my $page;
	my $i=1;
	my $url=$_[0];
	my $isnext=0;
	my %attr;
	my $token;
	
	#获取评论数
	$page=get($url);
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);	
	while($token=$p->get_tag('span')){
                %attr=%{$token->[1]};
		#定位到<span id="allCount">
		if($attr{'id'} && $attr{'id'} eq "allCount"){
			$tmp=$p->get_trimmed_text('/span');
			$tmp =~ m/([\d]*)条/;
			$poster->{'reply_num'} = $1;
			return 0;
		}
	}
}

#打印reply数组内容
sub preply(@)
{
	my @reply=@_;
	my $i;

	for($i=1;$i<=$#reply;$i  )
	{print "[$i]\n";phash(%{$reply[$i]});}
}

#写入日志到
sub wlog($)
{
	my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
	$t_year =1900;
	$t_mon =1;
        print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
        print  "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}

#打印hash表内容
sub phash(%)
{
	my $key;
	my $value;
	my %hash =@_;	
	my $i=1;

	while(($key, $value) = each %hash){   print $i  ,"  $key = $value\n"; } #test
	print "\n";
}

sub sqldeal(\%\@)
{
	my %poster =%{$_[0]};
	my @reply =@{$_[1]};

#	phash(%$poster);#test
#	preply(@reply);#test

}

1;

 


 

标签: 采集

实例下载地址

perl脚本爬虫程序,支持爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等

不能下载?内容有错? 点击这里报错 + 投诉 + 提问

好例子网口号:伸出你的我的手 — 分享

相关软件

相关文章

网友评论

发表评论

(您的评论需要经过审核才能显示)

查看所有0条评论>>

小贴士

感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。

  • 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
  • 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
  • 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
  • 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。

关于好例子网

本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内给予删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整权、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二与二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责申明参见 版权及免责申明

;
报警