实例介绍
【实例简介】
功能介绍:
这是一款基于 ASP.NET 的网络数据抓取程序,界面功能较为完善。
通过搜索网站或贴吧,根据你需求的关键字抓取网站上的信息,搜索可做扩展!
页面搜索功能设置:搜索网站选择框,采集信息的显示条数及关键字输入框;
显示列表:编号、信息来源、标题、抓取内容、点击率、抓取时间等字段!
【实例截图】
【核心代码】
/// <summary>
/// Click handler for the crawl button. Validates the selected source, the keyword and
/// the requested item count, crawls the selected site until that many new articles have
/// been stored (or the site stops returning results), then rebinds both result lists.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void btnCrawl_Click(object sender, EventArgs e)
{
    btnCrawl.Enabled = false;
    try
    {
        string news = PageOperate.GetNullToString(ddlNewsSource.SelectedValue);
        string words = PageOperate.GetNullToString(txtWords.Text.Trim());
        int nums = PageOperate.GetIntValue(ddlNums.SelectedValue);

        if (news == "")
        {
            PageOperate.AlertAndRedirect("请选择新闻来源!", "Default.aspx");
            return;
        }
        if (words == "")
        {
            PageOperate.AlertAndRedirect("请填写关键字!", "Default.aspx");
            return;
        }
        if (nums == 0)
        {
            PageOperate.AlertAndRedirect("请选择抓取的数量!", "Default.aspx");
            return;
        }

        lblShow.Text = "正在抓取.....";

        try
        {
            if (news == "新浪")
            {
                CrawlSina(words, nums);
            }
            else if (news == "百度贴吧")
            {
                CrawlBaiduTieba(words, nums);
            }
            else if (news == "搜狗")
            {
                CrawlSogou(words, nums);
            }
        }
        catch (Exception)
        {
            // Report the failure the same way for every source instead of silently
            // swallowing it; articles stored before the failure are still listed below.
            lblShow.Text = "抓取数据出现异常!";
            lblShow.ForeColor = System.Drawing.Color.Red;
        }

        // Latest 10 items.
        BindData(repTop10, 10);
        // Full data list.
        BindData(repData, 1000);
    }
    finally
    {
        // Re-enable the button on every exit path so it never stays disabled.
        btnCrawl.Enabled = true;
    }
}

/// <summary>Crawls Sina news search results for the keyword and stores up to <paramref name="nums"/> new articles.</summary>
private void CrawlSina(string words, int nums)
{
    // URL-encode the keyword as gb2312 so it is not garbled in the query string.
    string keyword = HttpUtility.UrlEncode(words, System.Text.Encoding.GetEncoding("gb2312"));
    int crawlNum = 0;

    for (int page = 1; ; page++)
    {
        string url = "http://search.sina.com.cn/?q=" + keyword + "&range=title&c=news&sort=time&col=&source=&from=&country=&size=&time=&a=&page=" + page + "&pf=2131425478&ps=2134309112&dpc=1";
        string html = GetHtml(url);
        if (html == "error")
        {
            return;
        }

        string matHtml = ResolverAndOutput(html, "", "", "<h2><a href=\"http://(?<content>.+?)</span></h2>", 1, false);
        string[] itemArray = matHtml.Replace("~", "").Split('$');
        if (itemArray.Length <= 1)
        {
            // No items on this page: stop instead of requesting further pages forever.
            return;
        }

        // The last element is skipped, as in the original loop bound
        // (presumably an empty tail after the final '$' - TODO confirm).
        for (int j = 0; j < itemArray.Length - 1; j++)
        {
            lblShow.Text = "分析到第" + page + " 页,第" + (j + 1) + "条数据,已采集" + crawlNum + "条数据!";

            // Stop as soon as the requested number of articles has been stored.
            if (crawlNum == nums)
            {
                return;
            }

            // Title with HTML tags removed.
            string title = CutString(itemArray[j], "target=\"_blank\">", "</a>");
            title = PageOperate.CutHTML(title).Trim();
            if (IsTitleStored(title))
            {
                continue;
            }

            // Link to the article page.
            string conUrl = CutString(itemArray[j], "<h2><a href=\"", "\" target=\"_blank\"");
            if (conUrl.LastIndexOf('/') < 0)
            {
                continue;
            }

            string conPage = GetHtml(conUrl);
            if (conPage == "error")
            {
                continue;
            }

            string conHtml;
            if (conPage.IndexOf("<div id=\"divContent\"") > 0)
            {
                conHtml = ExtractArticleBody(conPage, "<div id=\"divContent\"", "<div id=\"divAttachment\">");
            }
            else if (conPage.IndexOf("<div class=\"blkContainerSblkCon BSHARE_POP\"") > 0)
            {
                // Alternative layout: pick whichever known end marker the page contains.
                if (conPage.IndexOf("<div class=\"se_edit\"") > 0)
                {
                    conHtml = ExtractArticleBody(conPage, "<div class=\"blkContainerSblkCon BSHARE_POP\"", "<div class=\"se_edit\"");
                }
                else if (conPage.IndexOf("<div class=\"wb_rec\" id=\"wb_rec\" style") > 0)
                {
                    conHtml = ExtractArticleBody(conPage, "<div class=\"blkContainerSblkCon BSHARE_POP\"", "<div class=\"wb_rec\" id=\"wb_rec\" style");
                }
                else if (conPage.IndexOf("<iframe width=\"100%\" scrolling=\"no\" height=\"35\"") > 0)
                {
                    conHtml = ExtractArticleBody(conPage, "<div class=\"blkContainerSblkCon BSHARE_POP\"", "<iframe width=\"100%\" scrolling=\"no\" height=\"35\"");
                }
                else
                {
                    continue;
                }
            }
            else
            {
                continue;
            }

            if (conHtml == null)
            {
                continue;
            }

            conHtml = Server.HtmlEncode(conHtml);
            if (AddData("新浪", title, conHtml, "Corp") > 0)
            {
                crawlNum++;
            }
        }
    }
}

/// <summary>Crawls the Baidu Tieba thread list for the keyword and stores up to <paramref name="nums"/> new posts.</summary>
private void CrawlBaiduTieba(string words, int nums)
{
    // URL-encode the keyword as gb2312 so it is not garbled in the query string.
    string keyword = HttpUtility.UrlEncode(words, System.Text.Encoding.GetEncoding("gb2312"));
    int crawlNum = 0;

    // Tieba pages by offset: pn grows in steps of 50.
    for (int pn = 0; ; pn = pn + 50)
    {
        string url = "http://tieba.baidu.com/f?kw=" + keyword + "&pn=" + pn;
        string html = GetHtml(url);
        if (html == "error")
        {
            return;
        }

        string matHtml = ResolverAndOutput(html, "", "", "<div class=\"threadlist_text threadlist_title(?<content>.+?)</a><span ></span></div>", 1, false);
        string[] itemArray = matHtml.Replace("~", "").Split('$');
        if (itemArray.Length <= 1)
        {
            // No items on this page: stop instead of requesting further pages forever.
            return;
        }

        for (int j = 0; j < itemArray.Length - 1; j++)
        {
            lblShow.Text = "分析到第" + ((pn / 50) + 1) + " 页,第" + (j + 1) + "条数据,已采集" + crawlNum + "条数据!";

            // Stop as soon as the requested number of posts has been stored.
            if (crawlNum == nums)
            {
                return;
            }

            // Title with HTML tags removed.
            string title = CutString(itemArray[j], "class=\"j_th_tit\">", "</a>");
            title = PageOperate.CutHTML(title).Trim();
            if (IsTitleStored(title))
            {
                continue;
            }

            // Thread links are site-relative, so prefix the host.
            string conUrl = CutString(itemArray[j], "<a href=\"", "\" title=\"");
            conUrl = "http://tieba.baidu.com" + conUrl;
            if (conUrl.LastIndexOf('/') < 0)
            {
                continue;
            }

            string conPage = GetHtml(conUrl);
            if (conPage == "error")
            {
                continue;
            }

            if (conPage.IndexOf("<cc><div id=\"post_content_") <= 0)
            {
                continue;
            }

            // Keep only the first matched post of the thread.
            string conHtml = ResolverAndOutput(conPage, "", "", "<cc><div id=\"post_content_(?<content>.+?)</div></cc>", 1, false);
            string[] conArray = conHtml.Replace("~", "").Split('$');
            conHtml = Server.HtmlEncode(conArray[0]);

            if (AddData("百度贴吧", title, conHtml, "Corp") > 0)
            {
                crawlNum++;
            }
        }
    }
}

/// <summary>Crawls Sogou news search results for the keyword and stores up to <paramref name="nums"/> new articles.</summary>
private void CrawlSogou(string words, int nums)
{
    // URL-encode the keyword as gb2312 so it is not garbled in the query string.
    string keyword = HttpUtility.UrlEncode(words, System.Text.Encoding.GetEncoding("gb2312"));
    int crawlNum = 0;

    for (int page = 1; ; page++)
    {
        string url = "http://news.sogou.com/news?query=" + keyword + "&sut=2543&sst0=1396574960819&mode=2&x=30&y=9&page=" + page + "&w=01029901&dr=1";
        string html = GetHtml(url);
        if (html == "error")
        {
            return;
        }

        string matHtml = ResolverAndOutput(html, "", "", "<h3 class=\"pt\">(?<content>.+?)</h3>", 1, false);
        string[] itemArray = matHtml.Replace("~", "").Split('$');
        if (itemArray.Length <= 1)
        {
            // No items on this page: stop instead of requesting further pages forever.
            return;
        }

        for (int j = 0; j < itemArray.Length - 1; j++)
        {
            lblShow.Text = "分析到第" + page + " 页,第" + (j + 1) + "条数据,已采集" + crawlNum + "条数据!";

            // Stop as soon as the requested number of articles has been stored.
            if (crawlNum == nums)
            {
                return;
            }

            // Title with HTML tags removed.
            string title = CutString(itemArray[j], "target=\"_blank\">", "</a>");
            title = PageOperate.CutHTML(title).Trim();
            if (IsTitleStored(title))
            {
                continue;
            }

            // Link to the article page.
            string conUrl = CutString(itemArray[j], "<a class=\"pp\" href=\"", "\" id=\"uigs_");
            if (conUrl.LastIndexOf('/') < 0)
            {
                continue;
            }

            string conPage = GetHtml(conUrl);
            if (conPage == "error")
            {
                continue;
            }

            string conHtml;
            if (conPage.IndexOf("<div class=\"text clear\" id=\"contentText\"") > 0)
            {
                // Sohu news layout.
                if (conPage.IndexOf("<div class=\"autoShare clear\">") > 0)
                {
                    conHtml = ExtractArticleBody(conPage, "<div class=\"text clear\" id=\"contentText\"", "<div class=\"autoShare clear\">");
                }
                else if (conPage.IndexOf("<div class=\"original-title\">") > 0)
                {
                    conHtml = ExtractArticleBody(conPage, "<div class=\"text clear\" id=\"contentText\"", "<div class=\"original-title\">");
                }
                else
                {
                    continue;
                }
            }
            else if (conPage.IndexOf("<div id=\"Cnt-Main-Article-QQ\"") > 0)
            {
                // Tencent news layout.
                conHtml = ExtractArticleBody(conPage, "<div id=\"Cnt-Main-Article-QQ\"", "<span style=\"width:0;height:0;");
            }
            else if (conPage.IndexOf("<div id=\"endText\">") > 0)
            {
                // NetEase news layout.
                conHtml = ExtractArticleBody(conPage, "<div id=\"endText\">", "<div class=\"sharecommend-wrap clearfix\">");
            }
            else
            {
                continue;
            }

            if (conHtml == null)
            {
                continue;
            }

            conHtml = Server.HtmlEncode(conHtml);
            if (AddData("搜狗", title, conHtml, "Corp") > 0)
            {
                crawlNum++;
            }
        }
    }
}

/// <summary>Returns true when an Article row with exactly this title already exists.</summary>
private bool IsTitleStored(string title)
{
    // The title is scraped from a remote page and lands inside a SQL string literal,
    // so double any single quote to keep it from terminating the literal.
    // NOTE(review): GetPager takes a raw WHERE fragment; a parameterized lookup would be
    // safer if the data layer offers one - confirm it does not already escape.
    string safeTitle = title.Replace("'", "''");
    DataTable dt = BLL.Pager.GetPager("Id,Title", "Article", "Title = '" + safeTitle + "'");
    return dt.Rows.Count > 0;
}

/// <summary>
/// Runs ResolverAndOutput with the pattern startMarker + "(?&lt;content&gt;.+?)" + endMarker
/// and returns the result truncated at the first occurrence of endMarker, or null when
/// the result does not contain endMarker.
/// </summary>
private string ExtractArticleBody(string page, string startMarker, string endMarker)
{
    string matched = ResolverAndOutput(page, "", "", startMarker + "(?<content>.+?)" + endMarker, 1, false);
    int endIndex = matched.IndexOf(endMarker);
    if (endIndex < 0)
    {
        return null;
    }
    return matched.Substring(0, endIndex);
}
小贴士
感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。
- 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
- 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
- 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
- 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。
关于好例子网
本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内给予删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整性、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二条与第二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责申明参见 版权及免责申明
网友评论
我要评论