实例介绍
【实例简介】
采用perl脚本写的爬虫程序,可以爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等
【实例截图】

【核心代码】
抓取代码一:
#!/usr/bin/perl -w
package Parser_ifeng;
use Exporter;
our @ISA=qw(Exporter);
our @EXPORT=qw(parser_ifeng);
#use strict;
use warnings;
use LWP::UserAgent;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;
#binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
#binmode(STDERR, ':encoding(utf8)');
my $workdir="/home/webscience/proj/";
my $pmdir="$workdir/pm/";
my $logdir="$workdir/log/";
my $logfile="$logdir/ifeng.log";
my $logfp;
my $tmpdir="$workdir/tmp/";
my $tmpfile="$tmpdir/ifeng.tmp";
my $tmpfile1="$tmpdir/ifeng1.tmp";
my %board_list=();
my $main_url="http://www.tianya.cn/bbs/index.shtml";
sub parser_ifeng
{
#初始化日志文件
open($logfp,">>$logfile") or die "can't open file:$!";
$board_list{'点击排行榜'}="http://news.ifeng.com/toprank/day/";
$board_list{'评论排行榜'}="http://news.ifeng.com/comment/day/";
$board_list{'表情_感动'}="http://news.ifeng.com/mood/day/1/";
$board_list{'表情_同情'}="http://news.ifeng.com/mood/day/2/";
$board_list{'表情_无聊'}="http://news.ifeng.com/mood/day/3/";
$board_list{'表情_愤怒'}="http://news.ifeng.com/mood/day/4/";
$board_list{'表情_搞笑'}="http://news.ifeng.com/mood/day/5/";
$board_list{'表情_难过'}="http://news.ifeng.com/mood/day/6/";
$board_list{'表情_高兴'}="http://news.ifeng.com/mood/day/7/";
$board_list{'表情_路过'}="http://news.ifeng.com/mood/day/8/";
my $key;
my $value;
local %url_hash;#用于去除重复的新闻
while(($key, $value) = each %board_list){
my %board;
$board{'board_name'}=$key;
$board{'board_url'}=$value;
$board{'parent_name'}="凤凰网";
#获取版块id,若是新版块,则写入数据库
#------
my ($fid, $parent_id) = insert2forum(%board);
$board{'board_id'} = $fid;
$board{'parent_id'} = $parent_id;
parser_board(%board);
}
phash(%board_list);#test
}
sub parser_board(%)
{
my %attr=();
my $token;
my $page;
my $p;
my %board=@_;
my $url=$board{'board_url'};
my @poster_list;
my $tmp;
my $i=0;
#获取当前时间
my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
#unix时间
local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);
print "board_url=$url\n";
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
#解析新闻列表
while ($token = $p->get_tag("div")) {
%attr=%{$token->[1]};
if(!$attr{'class'} || $attr{'class'} ne 'listPublic')
{next;}
$token = $p->get_tag("a");
%attr=%{$token->[1]};
#去除重复的新闻
my $href=$attr{'href'};
if($url_hash{"$href"})
{next; }
$i ;
$poster_list[$i]->{'poster_url'}=$attr{'href'};
$poster_list[$i]->{'title_name'}=$p->get_trimmed_text("/a");
$poster_list[$i]->{'board_name'}=$board{'board_name'};
$poster_list[$i]->{'board_id'} = $board{'board_id'}; #Add board id
$poster_list[$i]->{'parent_id'} = $board{'parent_id'}; #Add board id
$url_hash{"$href"}=$poster_list[$i]->{'title_name'};
# phash(%{$poster_list[$i]});exit;
}
$i=1;
foreach $poster (@poster_list)
{
if(!$poster || $poster eq ""){next;}
phash(%$poster);
my ($code, $poster, $reply) = parser_poster(\%$poster);
#if($code == 0){
# insert2topic($board{'board_id'}, \%$poster, \@$reply);
#}
$i ;
}
}
#获取主贴信息
sub parser_poster
{
my %attr=();
my $page;
my $tmp;
my $p;
my $token;
my $poster=$_[0];
local @reply;
my $url=$poster->{'poster_url'};
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
#定位到论坛开始 主贴内容开始标识<div id="artical_sth">
while($token=$p->get_tag('div')){
%attr=%{$token->[1]};
if($attr{'id'} && $attr{'id'} eq 'artical_sth')
{last;}
}
if(!$token){
wlog("$url 格式错误,跳过此帖");
return -1;
}
#抓取发帖时间
$token=$p->get_tag('span');
$tmp=$p->get_trimmed_text('/span');
$pattern='([\d]*)年([\d]*)月([\d]*)日 ([\d]*):([\d]*)';
if(!($tmp =~ m/$pattern/)){
wlog("$url 时间获取失败,跳过此帖");
return -1;
}
$poster->{'post_time'}="$1-$2-$3 $4:$5:00";
#抓取新闻来源
$token=$p->get_tag('span');
$poster->{'info_source'}=$p->get_trimmed_text('/span');
#抓取评论数
$token=$p->get_tag('span');
$poster->{'reply_num'}=$p->get_trimmed_text('/span');
#查找主贴内容 定位到<div id="clear">
while($token=$p->get_token()){
if($token->[0] eq "C"&& $token->[1] eq "<!--mainContent begin-->" )
{last;}
}
if(!$token){
wlog("$url 抓取内容失败,跳过此帖");
return -1;
}
$poster->{'content'}=$p->get_trimmed_text('div');
#将帖子内容和回复数据写入数据库
#insert2topic("topic", %$poster);
phash(%$poster);#test
#sqldeal(\%$poster,\@reply);
insert2news(\%$poster);
return 0;
}
#获取回复信息
sub get_reply()
{
my $p=$_[0];
my $tmp;
my $page;
my $i=1;
my $url;
my %attr;
my %attr1;
my $token;
my $token1;
R_BEGIN: while($token=$p->get_tag('table','div')){#'td')){
%attr=%{$token->[1]};
#<td id="pageDivBottom">,定位下一页url
if($token->[0] eq 'div' && $attr{'id'} && $attr{'id'} eq 'adsp_content_banner_2'){
#### if($token->[0] eq 'td' && $attr{'id'} && $attr{'id'} eq 'pageDivBottom'){
while($token=$p->get_tag('a')){
if($p->get_trimmed_text('/a') eq '下一页'){
%attr=%{$token->[1]};
$url=$attr{'href'};
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
#重新定位到<div id="pContentDiv">
while($token=$p->get_tag('div')){
%attr=%{$token->[1]};
if($attr{'id'} && $attr{'id'} eq 'pContentDiv')
{next R_BEGIN;}
}
}
}
last R_BEGIN;#没有下一页,已经是最后一页,退出R_BEGIN
}elsif($token->[0] eq 'div'){
next;
}
##抓取回复, get_tag('table')
#获取回帖用户id
$p->get_tag('a');
$reply[$i]->{'uid'}=$p->get_trimmed_text('/a');
#获取回帖时间
$tmp=$p->get_trimmed_text('/td');
$patern='回复日期:([\d\D]*)';
if($tmp=~m/$patern/)
{$reply[$i]->{'reply_time'}=$1;}
#获取回帖内容
$reply[$i]->{'content'}=$p->get_trimmed_text('table','div');
$i ;
}
}
#打印reply数组内容
sub preply(@)
{
my @reply=@_;
my $i;
for($i=1;$i<=$#reply;$i )
{print "[$i]\n";phash(%{$reply[$i]});}
}
#写入日志到
sub wlog($)
{
my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}
#打印hash表内容
sub phash(%)
{
my $key;
my $value;
my %hash =@_;
my $i=1;
while(($key, $value) = each %hash){ print $i ," $key = $value\n"; } #test
print "\n";
}
sub sqldeal(\%\@)
{
my %poster =%{$_[0]};
my @reply =@{$_[1]};
# phash(%$poster);#test
# preply(@reply);#test
}
1;
猫扑采集代码二:
#!/usr/bin/perl
package Parser_mop;
use Exporter;
our @ISA = qw{Exporter};
our @EXPORT = qw{parser_mop};
use warnings;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;
binmode STDOUT,":utf8";
my $workdir = "/home/webscience/proj/";
my $logfile = "$workdir/log/mop.log";
my $logfh;
my $preurl = "http://dzh2.mop.com/static/rrlistSubLeft_0_";
my $mainurl = "http://dzh2.mop.com/static/rrlistSubLeft_0_0.html";
my $token;
my %board_list = ();
my @poster_list = ();
my %poster = ();
my %attr = ();
my %board = ();
my $i = 0;
my $tmp;
sub parser_mop
{
my $key;
my $value;
$board_list{"人肉搜索"} = "http://dzh2.mop.com/static/rrlistSubLeft_0_0.html";
#initial log file
open($logfh,">>$logfile") or die "cannot open file: $!\n";
while(($key, $value) = each %board_list){
my %board;
$board{'parent_name'} = '猫扑';
$board{'board_name'} = $key;
$board{'board_url'} = $value;
my ($fid, $parent_id) = insert2forum(%board);
$board{'board_id'} = $fid;
$board{'parent_id'} = $parent_id;
parser_board(%board);
}
}
sub parser_board
{
my %attr = ();
my $token;
my $page;
my $p;
my %board = @_;
my $boardurl = $board{'board_url'};
my @poster_list;
my $url = $boardurl;
my $backurl = ".html";
for($count = 0; $count <= 4; $count )
{
my $tmpurl = "$count$backurl";
$url = "$preurl$tmpurl";
print $url;
print "\n";
$page = get($url); # or die "get $url failed.\n";
if(!$page){last;}
utf8::decode($page);
$p = HTML::TokeParser->new(\$page);
#parser news list
#定位到论坛开始
$i = 0;
while($token=$p->get_tag("font")){
if($p->get_trimmed_text("/font") eq '我的回答'){last;}
}
if(!$token){
wlog("$url 无法定位帖子列表");
return -1;
}
#解析帖子列表
while($token = $p->get_tag("a")){
%attr = %{$token->[1]};
$pattern = '([\d] )条';
$tmp = $p->get_trimmed_text("/a");
if($tmp =~ m/$pattern/)
{
$poster_list[$i]->{'poster_url'} = "$attr{'href'}";
$poster_list[$i]->{'title_name'} = $tmp;
$poster_list[$i]->{'reply_num'} = $1;
$poster_list[$i]->{'click_num'} = 0;
$poster_list[$i]->{'board_name'} = $board{'board_name'};
$poster_list[$i]->{'board_id'} = $board{'board_id'};
$poster_list[$i]->{'parent_id'} = $board{'parent_id'};
$i ;
}
next;
}
}
foreach $poster (@poster_list){
if(!$poster || $poster eq ""){next;}
# phash(%$poster);
#my($code, $poster, $reply) =
parser_poster(%$poster);
}
}
sub parser_poster
{
my %poster = @_;
my $poster_url = $poster{'poster_url'};
# print "$poster_url";
my @reply;
$i = 0;
my $page = get($poster_url);
utf8::decode($page);
my $p = HTML::TokeParser->new(\$page);
# print "$page";
#main poster
while($token = $p->get_tag("strong"))
{
%attr = %{$token->[1]};
if(!$attr{'class'} || $attr{'class'} ne "red"){next;}
$tmp = $p->get_trimmed_text("/strong");
@mplist = split(/:/,$tmp,2);
# if(!$mplist || $mplist[0] ne "悬赏值"){next;}
$pattern = ':([\d] )';
if($tmp =~ m/$pattern/){
$poster->{'mp'} = $1;
}
else { last;}
while($token = $p->get_tag("span"))
{
%attr = %{$token->[1]};
if(!$attr{'id'} || $attr{'id'} ne "span_Body"){next;}
}
$poster->{'content'} = $p->get_trimmed_text("/span");
#post time
$token = $p->get_tag("em");
$tmp = $p->get_trimmed_text("/em");
$pattern = '([\d]{4})-([\d] )-([\d] )([\s] )([\d] ):([\d] ):([\d]) ';
if($tmp =~ m/$pattern/){ $poster->{'post_time'} = "$1-$2-$3 $5:$6:$7";}
#post author
$token = $p->get_tag("span");
$poster->{'author'} = $p->get_trimmed_text("/span");
last;
}
if(!$token){
wlog("$poster_url error skip this page");
return -1;
}
$tmpp = $p;
#get reply info
#reply 1
while($token = $p->get_tag("p"))
{
%attr = %{$token->[1]};
if(!$attr{'id'} || $attr{'id'} ne "id_topicreply0"){next;}
$token = $p->get_tag("font");
if(!$token){last;}
%attr = %{$token->[1]};
$reply[$i]->{'reply_time'} = $attr{'title'};
$token = $p->get_tag("span");
$reply[$i]->{'content'} = $p->get_trimmed_text("/span");
$token = $p->get_tag("span");
$token = $p->get_tag("span");
$reply[$i]->{'uid'} = $p->get_trimmed_text("/span");
$i ;
last;
}
$p = $tmpp;
while($token = $p->get_tag("hr"))
{
%attr = %{$token->[1]};
if(!$attr{'class'} || $attr{'class'} ne "hr2"){next;}
$token = $p->get_tag("font");
if(!$token){last;}
%attr = %{$token->[1]};
$reply[$i]->{'reply_time'} = $attr{'title'};
$token = $p->get_tag("span");
$reply[$i]->{'content'} = $p->get_trimmed_text("/span");
$token = $p->get_tag("span");
$token = $p->get_tag("span");
$reply[$i]->{'uid'} = $p->get_trimmed_text("/span");
$i ;
}
phash(%$poster);
preply(@reply);
#database
insert2topic(\%$poster,\@reply);
return 0;
}
#print reply content
sub preply(@)
{
my @reply = @_;
for(my $i = 0; $i < $#reply; $i )
{
print "[$i]";
phash (%{$reply[$i]});
}
}
#print hash
sub phash(%)
{
my $key;
my $value = "";
my %hash =@_;
$i=1;
while(($key, $value) = each %hash){ print $i ," $key = $value \n"; } #test
print "\n";
}
sub wlog($)
{
($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
print $logfh "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}
1;
新浪采集代码三:
#!/usr/bin/perl
package Parser_mil_news_sina;
use Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(parser_mil_news_sina);
use warnings;
#use strict;
use LWP::Simple;
use LWP::UserAgent;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;
binmode STDOUT,":utf8";
my $workdir = "/home/webscience/proj/";
my $logdir = "$workdir/log/";
my $logfile = "$logdir/news_sina.log";
my $logfh;
my $preurl = "http://mil.news.sina.com.cn/";
my %board_list = ();
my %board_all = ();
sub parser_mil_news_sina
{
open($logfh,">>logfile") or die "cannot open the log file.\n";
my $url = "http://mil.news.sina.com.cn/hotnews/Daily/index.shtml";
my %board;
$board{'parent_name'} = '军事论坛_新浪网';
$board{'board_name'} = '军事热门帖子';
$board{'board_url'} = $url;
my ($fid, $parent_id) = insert2forum(%board);
$board{'board_id'} = $fid;
$board{'parent_id'} = $parent_id;
parser_board(%board);
}
sub parser_board(%)
{
my %attr = ();
my $token;
my $page;
my $p;
my %board = @_;
my $url = $board{'board_url'};
my @poster_list = ();
# my $poster ;
my $i = 0;
#军事热门帖子
$page = get($url); # or die "get $url failed.\n";
utf8::decode($page);
$p = HTML::TokeParser->new(\$page);
#parser news list
while($token = $p->get_tag("tbody")){
while($token = $p->get_token()){
if($token->[0] eq "C" && $token->[1] eq "<!-- 帖子排行列表 begin -->")
{last;}
}
if(!$token){
wlog("$url get 帖子排行列表 failed.");
return -1;
}
while($token = $p->get_tag("td")){
%attr = %{$token->[1]};
if(!$attr{'class'} || $attr{'class'} ne 'linkNews'){ next;}
$i ;
$token = $p->get_tag("a");
%attr = %{$token->[1]};
$poster_list[$i]->{'poster_url'} = $attr{'href'};
$poster_list[$i]->{'title_name'} = $p->get_trimmed_text("/a");
$token = $p->get_tag("td");
my $text = $p->get_trimmed_text("/td");
if($text =~ /:/){$poster_list[$i]->{'post_time'} = "2010-".$text;
$poster_list[$i]->{'click_num'} = 0;}
else {$poster_list[$i]->{'click_num'} = $text;
$poster_list[$i]->{'post_time'} = "";}
$poster_list[$i]->{'board_name'} = $board{'board_name'};
$poster_list[$i]->{'board_id'} = $board{'board_id'};
$poster_list[$i]->{'parent_id'} = $board{'parent_id'}; #Add board id
}
}
$i = 1;
foreach my $poster (@poster_list){
if(!$poster || $poster eq ""){next;}
phash(%$poster);
my($code, $poster, $reply) = parser_poster(\%$poster);
$i ;
}
}
#get main poster content
sub parser_poster
{
my %attr = ();
my $page;
my $p;
my $token;
my $poster = $_[0];
my $url = $poster->{'poster_url'};
# my $post_time = $poster->{'poster_url'};
$poster->{'content'} = "content missed";
$poster->{'keyword'} = "no keyword";
$poster->{'reply_num'} = 0;
if($poster->{'post_time'} eq "")
{ exit;}
#get current time
my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
#unix时间
# local my $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);
$page = get($url);
utf8::decode($page);
$p = HTML::TokeParser->new(\$page);
if(!$p){ print "$url cannot get to";return -1;}
#local to author
while($token = $p->get_tag("td")){
%attr = %{$token->[1]};
print "$attr{'class'}";
if(!$attr{'class'} && $attr{'class'} ne "postauthor"){ next;}
$token = $p->get_tag();
$token = $p->get_tag("a");
$poster->{'author'} = $p->get_trimmed_text('/a');
print "author: $poster->{'author'}";
print "\n";
last;
if(!$token){
wlog("$url get author failed.");
return -1;
}
last;
}
while($token = $p->get_tag("td")){
%attr = %{$token->[1]};
if($attr{'class'} && $attr{'class'} eq "postcontent")
{
while($token = $p->get_tag("em")){
%attr = %{$token->[1]};
if(my $text = $p->get_trimmed_text("/em") eq "小")
{
$poster->{'post_time'} = $p->get_text();
last;
}
}
#news count
while($token = $p->get_tag("span")){
if($token = $p->get_tag("font"))
{last;}
}
if(!$token){
wlog("$url get news count failed.");
return -1;
}
$poster->{'click_num'} = $p->get_trimmed_text('/font');
if($token = $p->get_tag("font"))
{ $poster->{'reply_num'} = $p->get_trimmed_text('/font');}
#news content
while($token = $p->get_tag("p")){
%attr = %{$token->[1]};
if($attr{'align'} && $attr{'align'} eq "center")
{
$poster->{'content'} = $p->get_trimmed_text("font");
}
}
last;
}
}
#click num > 5000
insert2news(\%$poster);
phash(%$poster);
}
#写入日志到
sub wlog($)
{
($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
print $logfh "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}
#打印hash表内容
sub phash(%)
{
my $key;
my $value;
my %hash =@_;
my $i=1;
while(($key, $value) = each %hash){ print $i ," $key = $value\n"; } #test
print "\n";
}
1;
163代码四:
#!/usr/bin/perl -w
package Parser_163;
use Exporter;
our @ISA=qw(Exporter);
our @EXPORT=qw(parser_163);
#use strict;
use warnings;
use LWP::UserAgent;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;
#binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
#binmode(STDERR, ':encoding(utf8)');
my $workdir="/home/webscience/proj/";
my $pmdir="$workdir/pm/";
my $logdir="$workdir/log/";
my $logfile="$logdir/163.log";
my $logfp;
my $tmpdir="$workdir/tmp/";
my $tmpfile="$tmpdir/163.tmp";
my %board_list=();
my $main_url="http://news.163.com/special/0001386F/rank_whole.html";
sub parser_163
{
#初始化日志文件
open($logfp,">>$logfile") or die "can't open file:$!";
wlog("开始抓取163新闻\n");
local %url_hash;#用于去除重复的新闻
my %board;
$board{'board_name'}="163新闻";
$board{'board_url'}=$main_url;
$board{'parent_name'}='163新闻';
#获取版块id,若是新版块,则写入数据库
#------
my ($fid, $parent_id) = insert2forum(%board);
$board{'board_id'} = $fid;
$board{'parent_id'} = -1;
parser_board(%board);
}
sub parser_board(%)
{
my %attr=();
my $token;
my $page;
my $p;
my %board=@_;
my $url=$board{'board_url'};
my @poster_list;
my $tmp;
my $i=0;
#获取当前时间
my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
#unix时间
local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
#解析新闻列表,定位到<div class="tabContents active">
while ($token = $p->get_tag("div")) {
%attr=%{$token->[1]};
if($attr{'class'} && $attr{'class'} eq 'tabContents active')
{last;}
}
while ($token = $p->get_tag("a","div")) {
#以<div class="nav_channel_end">为终止标记
%attr=%{$token->[1]};
if($attr{'class'} && $attr{'class'} eq 'nav_channel_end')
{last;}
if($token->[0] eq "div")
{next;}
#去除重复的新闻
my $href=$attr{'href'};
if($url_hash{"$href"})
{next; }
$i ;
$poster_list[$i]->{'poster_url'}=$attr{'href'};
$poster_list[$i]->{'title_name'}=$p->get_trimmed_text('/a');;
$poster_list[$i]->{'board_name'}=$board{'board_name'};
$poster_list[$i]->{'board_id'} = $board{'board_id'}; #Add board id
$poster_list[$i]->{'parent_id'} = $board{'parent_id'}; #Add board id
$url_hash{"$href"}=$poster_list[$i]->{'title_name'};
# phash(%{$poster_list[$i]});
}
$i=1;
foreach $poster (@poster_list)
{
if(!$poster || $poster eq ""){next;}
my ($code, $poster, $reply) = parser_poster(\%$poster);
#if($code == 0){
# insert2topic($board{'board_id'}, \%$poster, \@$reply);
#}
$i ;
}
}
#获取主贴信息
sub parser_poster
{
my %attr=();
my $page;
my $tmp;
my $p;
my $token;
local $poster=$_[0];
local @reply;
my $url=$poster->{'poster_url'};
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
#定位到论坛开始 主贴内容开始标识<div class="sourceTime"
while($token=$p->get_tag('div')){
%attr=%{$token->[1]};
if($attr{'class'} && $attr{'class'} eq 'sourceTime')
{last;}
}
if(!$token){
wlog("$url 格式错误,跳过此帖");
return -1;
}
#抓取新闻来源
$token=$p->get_tag('span');
$poster->{'info_source'}=$p->get_trimmed_text('/span');
#抓取发帖时间
$token=$p->get_tag('div');
$tmp=$p->get_trimmed_text('/div');
$pattern='([\d]*)年([\d]*)月([\d]*)日([\d]*):([\d]*)';
if(!($tmp =~ m/$pattern/)){
wlog("$url 时间获取失败,跳过此帖");
return -1;
}
$poster->{'post_time'}="$1-$2-$3 $4:$5:00";
#获取评论url,获取评论数
while($token=$p->get_tag('a')){
$tmp=$p->get_trimmed_text('/a');
if($tmp eq '我来说两句'){
%attr=%{$token->[1]};
$reply_url=$attr{'href'};
}
}
get_reply($reply_url);
#查找主贴内容 定位到<div class="text clear" id="contentText"
$p= HTML::TokeParser->new(\$page);
while($token=$p->get_tag('div')){
%attr=%{$token->[1]};
if($attr{'id'} && $attr{'id'} eq 'contentText')
{last;}
}
if(!$token){
wlog("$url 抓取内容失败,跳过此帖");
return -1;
}
$poster->{'content'}=$p->get_trimmed_text('script');
#将帖子内容和回复数据写入数据库
#insert2topic("topic", %$poster);
#phash(%$poster);#test
#preply(@reply);#exit;
#sqldeal(\%$poster,\@reply);
insert2news(\%$poster);
return 0;
}
#获取回复信息
sub get_reply()
{
my $p;
my $tmp;
my $page;
my $i=1;
my $url=$_[0];
my $isnext=0;
my %attr;
my $token;
#获取评论数
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
while($token=$p->get_tag('span')){
%attr=%{$token->[1]};
#定位到<span id="ptdown">
if($attr{'id'} && $attr{'id'} eq "ptdown"){
$tmp=$p->get_trimmed_text('/span');
$tmp =~ m/共有原创评论([\d]*)条/;
$poster->{'reply_num'} = $1;
return 0;
}
}
}
#打印reply数组内容
sub preply(@)
{
my @reply=@_;
my $i;
for($i=1;$i<=$#reply;$i )
{print "[$i]\n";phash(%{$reply[$i]});}
}
#写入日志到
sub wlog($)
{
my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}
#打印hash表内容
sub phash(%)
{
my $key;
my $value;
my %hash =@_;
my $i=1;
while(($key, $value) = each %hash){ print $i ," $key = $value\n"; } #test
print "\n";
}
sub sqldeal(\%\@)
{
my %poster =%{$_[0]};
my @reply =@{$_[1]};
# phash(%$poster);#test
# preply(@reply);#test
}
1;
搜狐采集代码五:
#!/usr/bin/perl -w
package Parser_sohu;
use Exporter;
our @ISA=qw(Exporter);
our @EXPORT=qw(parser_sohu);
#use strict;
use warnings;
use LWP::UserAgent;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;
#binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
#binmode(STDERR, ':encoding(utf8)');
my $workdir="/home/webscience/proj/";
my $pmdir="$workdir/pm/";
my $logdir="$workdir/log/";
my $logfile="$logdir/sohu.log";
my $logfp;
my $tmpdir="$workdir/tmp/";
my $tmpfile="$tmpdir/sohu.tmp";
my %board_list=();
my $main_url="http://comment.news.sohu.com/djpm/";
sub parser_sohu
{
#初始化日志文件
open($logfp,">>$logfile") or die "can't open file:$!";
wlog("开始抓取sohu新闻\n");
local %url_hash;#用于去除重复的新闻
my %board;
$board{'board_name'}="搜狐新闻";
$board{'board_url'}=$main_url;
$board{'parent_name'}='搜狐新闻';
#获取版块id,若是新版块,则写入数据库
#------
my ($fid, $parent_id) = insert2forum(%board);
$board{'board_id'} = $fid;
$board{'parent_id'} = -1;
parser_board(%board);
}
sub parser_board(%)
{
my %attr=();
my $token;
my $page;
my $p;
my %board=@_;
my $url=$board{'board_url'};
my @poster_list;
my $tmp;
my $i=0;
#获取当前时间
my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
#unix时间
local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
#解析新闻列表,定位到<tbody>
while ($token = $p->get_tag("/tbody")) {
for($j=1;$j<=10;$j ){
$token = $p->get_tag("tr");
$token = $p->get_tag("a");
%attr=%{$token->[1]};
#去除重复的新闻
my $href=$attr{'href'};
if($url_hash{"$href"})
{next; }
$i ;
$poster_list[$i]->{'poster_url'}=$attr{'href'};
if($attr{'title'}){
$poster_list[$i]->{'title_name'}=$attr{'title'};
}else{
$poster_list[$i]->{'title_name'}=$p->get_trimmed_text('/a');;
}
$poster_list[$i]->{'board_name'}=$board{'board_name'};
$poster_list[$i]->{'board_id'} = $board{'board_id'}; #Add board id
$poster_list[$i]->{'parent_id'} = $board{'parent_id'}; #Add board id
$url_hash{"$href"}=$poster_list[$i]->{'title_name'};
# phash(%{$poster_list[$i]});
}
}
$i=1;
foreach $poster (@poster_list)
{
if(!$poster || $poster eq ""){next;}
my ($code, $poster, $reply) = parser_poster(\%$poster);
#if($code == 0){
# insert2topic($board{'board_id'}, \%$poster, \@$reply);
#}
$i ;
}
}
#获取主贴信息
sub parser_poster
{
my %attr=();
my $page;
my $tmp;
my $p;
my $token;
local $poster=$_[0];
local @reply;
my $url=$poster->{'poster_url'};
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
#定位到论坛开始 主贴内容开始标识<div class="sourceTime"
while($token=$p->get_tag('div')){
%attr=%{$token->[1]};
if($attr{'class'} && $attr{'class'} eq 'sourceTime')
{last;}
}
if(!$token){
wlog("$url 格式错误,跳过此帖");
return -1;
}
#抓取新闻来源
$token=$p->get_tag('span');
$poster->{'info_source'}=$p->get_trimmed_text('/span');
#抓取发帖时间
$token=$p->get_tag('div');
$tmp=$p->get_trimmed_text('/div');
$pattern='([\d]*)年([\d]*)月([\d]*)日([\d]*):([\d]*)';
if(!($tmp =~ m/$pattern/)){
wlog("$url 时间获取失败,跳过此帖");
return -1;
}
$poster->{'post_time'}="$1-$2-$3 $4:$5:00";
#获取评论url,获取评论数
while($token=$p->get_tag('a')){
$tmp=$p->get_trimmed_text('/a');
if($tmp eq '我来说两句'){
%attr=%{$token->[1]};
$reply_url=$attr{'href'};
}
}
get_reply($reply_url);
#查找主贴内容 定位到<div class="text clear" id="contentText"
$p= HTML::TokeParser->new(\$page);
while($token=$p->get_tag('div')){
%attr=%{$token->[1]};
if($attr{'id'} && $attr{'id'} eq 'contentText')
{last;}
}
if(!$token){
wlog("$url 抓取内容失败,跳过此帖");
return -1;
}
$poster->{'content'}=$p->get_trimmed_text('script');
#将帖子内容和回复数据写入数据库
#insert2topic("topic", %$poster);
# phash(%$poster);#test
#preply(@reply);#exit;
#sqldeal(\%$poster,\@reply);
insert2news(\%$poster);
return 0;
}
#获取回复信息
sub get_reply()
{
my $p;
my $tmp;
my $page;
my $i=1;
my $url=$_[0];
my $isnext=0;
my %attr;
my $token;
#获取评论数
$page=get($url);
utf8::decode($page);
$p= HTML::TokeParser->new(\$page);
while($token=$p->get_tag('span')){
%attr=%{$token->[1]};
#定位到<span id="allCount">
if($attr{'id'} && $attr{'id'} eq "allCount"){
$tmp=$p->get_trimmed_text('/span');
$tmp =~ m/([\d]*)条/;
$poster->{'reply_num'} = $1;
return 0;
}
}
}
#打印reply数组内容
sub preply(@)
{
my @reply=@_;
my $i;
for($i=1;$i<=$#reply;$i )
{print "[$i]\n";phash(%{$reply[$i]});}
}
#写入日志到
sub wlog($)
{
my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
$t_year =1900;
$t_mon =1;
print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}
#打印hash表内容
sub phash(%)
{
my $key;
my $value;
my %hash =@_;
my $i=1;
while(($key, $value) = each %hash){ print $i ," $key = $value\n"; } #test
print "\n";
}
sub sqldeal(\%\@)
{
my %poster =%{$_[0]};
my @reply =@{$_[1]};
# phash(%$poster);#test
# preply(@reply);#test
}
1;
标签: 采集
小贴士
感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。
- 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
- 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
- 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
- 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。
关于好例子网
本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内给予删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整权、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二与二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责申明参见 版权及免责申明


网友评论
我要评论