实例介绍
【实例简介】
【实例截图】
【核心代码】
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using CollectBlogData.Models;
namespace CollectBlogData.Utility
{
public sealed class HttpUtility
{
/// <summary>
/// Downloads the cnblogs home page (i.e. the first page of the post list).
/// </summary>
/// <returns>The response body decoded as UTF-8 text.</returns>
/// <exception cref="WebException">The request failed or the server returned an error status.</exception>
public static string HttpGetHtml()
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.cnblogs.com/");
    request.Method = "GET";
    request.Accept = "text/plain, */*; q=0.01";
    request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
    // ContentLength is intentionally left at its default: a GET carries no body, and
    // assigning any value marks the request as an upload.
    request.KeepAlive = false;
    request.Host = "www.cnblogs.com";
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0";

    // Dispose the response, its stream and the reader so the connection is released
    // even when reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Requests one page of the cnblogs post list by POSTing the paging parameters as JSON.
/// </summary>
/// <param name="pageIndex">
/// Page number to fetch. It is inserted verbatim (unquoted) as the JSON "PageIndex" value,
/// so it must be the text of a number.
/// </param>
/// <returns>The response body decoded as UTF-8 text.</returns>
/// <exception cref="WebException">The request failed or the server returned an error status.</exception>
public static string HttpGetPageHtml(string pageIndex)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.cnblogs.com/mvc/AggSite/PostList.aspx");
    request.Method = "POST";
    request.Accept = "text/plain, */*; q=0.01";
    // Preferred response languages.
    request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
    // Acceptable response character sets.
    request.Headers.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");
    // The body below is JSON encoded as UTF-8.
    request.ContentType = "application/json; charset=UTF-8";
    // Do not keep the TCP connection alive after this request.
    request.KeepAlive = false;
    request.Host = "www.cnblogs.com";
    // Client identification sent to the server.
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.1.3.5000 Chrome/26.0.1410.43 Safari/537.1";

    // The site expects its paging parameters as a JSON object in the POST body.
    string postData =
        "{\"CategoryType\":\"SiteHome\",\"ParentCategoryId\":0,\"CategoryId\":808,\"PageIndex\":"
        + pageIndex
        + ",\"ItemListActionName\":\"PostList\"}";
    byte[] postBytes = Encoding.UTF8.GetBytes(postData);
    request.ContentLength = postBytes.Length;

    // The request stream must be closed before GetResponse is called.
    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(postBytes, 0, postBytes.Length);
    }

    // Dispose the response, its stream and the reader so the connection is released
    // even when reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
    {
        return reader.ReadToEnd();
    }
}
// Patterns used by GetArticles. They are constructed once and reused for every call
// instead of being rebuilt for each article. All use Singleline so '.' also matches '\n'.

// One post entry: from the opening post_item div up to (not including) its trailing clear div.
private static readonly Regex PostItemRegex = new Regex(
    "<div class=\"post_item\">(?<content>.*?)(?=<div class=\"clear\">" + @"</div>\s*</div>)",
    RegexOptions.Singleline);

// Recommendation ("digg") count inside the digg div.
private static readonly Regex DiggRegex = new Regex(
    "<div class=\"digg\">.*<span.*>(?<digNum>.*)" + @"</span>" + ".*<div class=\"post_item_body\">",
    RegexOptions.Singleline);

// The <h3> element that wraps the title link.
private static readonly Regex TitleBlockRegex = new Regex(
    "<h3>(?<a>.*)</h3>", RegexOptions.Singleline);

// href and inner text of the title link (applied to the <h3> content).
private static readonly Regex TitleLinkRegex = new Regex(
    "<a\\s.*href=\"(?<href>.*?)\".*>(?<summary>.*)</a>", RegexOptions.Singleline);

// The <img> tag nested in a link (the author avatar).
private static readonly Regex AuthorImgTagRegex = new Regex(
    "<a.*>(?<img><img[^>].*>)</a>", RegexOptions.Singleline);

// The src attribute value of that <img> tag (surrounding quotes are trimmed by the caller).
private static readonly Regex ImgSrcRegex = new Regex(
    @"<img.*src=(?<path>.*)\s .*/>", RegexOptions.Singleline);

// A link that carries both href and target attributes (the author's blog link).
private static readonly Regex AuthorLinkRegex = new Regex(
    "<a\\s*?href=\"(?<href>.*)\"\\s*?target=\"(?<target>.*?)\">.*</a>",
    RegexOptions.Singleline);

// Everything inside the summary paragraph.
private static readonly Regex SummaryBlockRegex = new Regex(
    "<p class=\"post_item_summary\">(?<summary>.*)</p>", RegexOptions.Singleline);

// The text that follows a closing </a> inside the summary paragraph.
private static readonly Regex SummaryTextRegex = new Regex(
    "(?<indroduct>(?<=</a>).*)", RegexOptions.Singleline);

// Publisher name and publish time in the post footer.
private static readonly Regex FooterRegex = new Regex(
    "<div class=\"post_item_foot\">\\s*<a.*?>(?<publishName>.*)</a>(?<publishTime>.*)<span class=\"article_comment\">",
    RegexOptions.Singleline);

// Comment-count link text.
private static readonly Regex CommentRegex = new Regex(
    "<span class=\"article_comment\"><a.*>(?<comment>.*)</a></span><span class=\"article_view\">",
    RegexOptions.Singleline);

// Read-count link text.
private static readonly Regex ReadRegex = new Regex(
    "<span\\s*class=\"article_view\"><a.*>(?<readNum>.*)</a>", RegexOptions.Singleline);

/// <summary>
/// Extracts the articles contained in a post-list HTML fragment using regular expressions.
/// </summary>
/// <param name="htmlString">HTML of a post-list page.</param>
/// <returns>
/// One <see cref="Article"/> per matched post_item block, in document order. The list is
/// empty when <paramref name="htmlString"/> is null, empty, or contains no post_item block.
/// A field whose pattern does not match is set to an empty string.
/// </returns>
public static List<Article> GetArticles(string htmlString)
{
    List<Article> articleList = new List<Article>();
    if (string.IsNullOrEmpty(htmlString))
    {
        return articleList;
    }

    // Matches() yields an empty collection when nothing matches, so no IsMatch pre-scan is needed.
    foreach (Match item in PostItemRegex.Matches(htmlString))
    {
        string itemHtml = item.Value;
        Article article = new Article();

        // Recommendation count.
        article.DiggNum = DiggRegex.Match(itemHtml).Groups["digNum"].Value;

        // Title: isolate the <h3> content first, then read the link's href and text from it.
        string titleHtml = TitleBlockRegex.Match(itemHtml).Groups["a"].Value;
        Match titleLink = TitleLinkRegex.Match(titleHtml);
        article.AritcleUrl = titleLink.Groups["href"].Value;
        article.AritcleTitle = titleLink.Groups["summary"].Value;

        // Author image: isolate the <img> tag first, then read its src without the quotes.
        string imgTag = AuthorImgTagRegex.Match(itemHtml).Groups["img"].Value;
        article.AuthorImg = ImgSrcRegex.Match(imgTag).Groups["path"].Value.Trim('"');

        // Author blog URL.
        article.AuthorUrl = AuthorLinkRegex.Match(itemHtml).Groups["href"].Value;

        // Summary: take the whole summary paragraph, then keep the text after the closing </a>.
        string summaryHtml = SummaryBlockRegex.Match(itemHtml).Groups["summary"].Value;
        article.AritcleSummary = SummaryTextRegex.Match(summaryHtml).Groups["indroduct"].Value;

        // Publisher and publish time come from one footer match.
        Match footer = FooterRegex.Match(itemHtml);
        article.Author = footer.Groups["publishName"].Value;
        article.PublishTime = footer.Groups["publishTime"].Value.Trim();

        // Comment and read counters.
        article.CommentNum = CommentRegex.Match(itemHtml).Groups["comment"].Value;
        article.ReadNum = ReadRegex.Match(itemHtml).Groups["readNum"].Value;

        articleList.Add(article);
    }

    return articleList;
}
/// <summary>
/// Removes every tab, carriage return and line feed from the text and replaces each
/// double quote (") with a single quote (').
/// </summary>
/// <param name="htmlString">Text to clean.</param>
/// <returns>The cleaned text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlString"/> is null.</exception>
public static string ClearSpecialTag(string htmlString)
{
    if (htmlString == null)
    {
        throw new ArgumentNullException("htmlString");
    }

    // These are literal character substitutions, so plain string replacement is used
    // rather than a regex pass per character.
    return htmlString
        .Replace("\n", string.Empty)
        .Replace("\t", string.Empty)
        .Replace("\r", string.Empty)
        .Replace('"', '\'');
}
}
}
好例子网口号:伸出你的我的手 — 分享!
网友评论
小贴士
感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。
- 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
- 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
- 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
- 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。
关于好例子网
本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内予以删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整性、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二与二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责声明参见 版权及免责声明


支持(0) 盖楼(回复)