实例介绍
【实例简介】用C#写的蜘蛛程序,也叫小偷程序。
"蜘蛛"(Spider)是Internet上一种很有用的程序,搜索引擎利用蜘蛛程序将Web页面收集到数据库,企业利用蜘蛛程序监视竞争对手的网站并跟踪变动,个人用户用蜘蛛程序下载Web页面以便脱机使用,开发者利用蜘蛛程序扫描自己的Web检查无效的链接……对于不同的用户,蜘蛛程序有不同的用途。那么,蜘蛛程序到底是怎样工作的呢?
蜘蛛是一种半自动的程序,就像现实当中的蜘蛛在它的Web(蜘蛛网)上旅行一样,蜘蛛程序也按照类似的方式在Web链接织成的网上旅行。蜘蛛程序之所以是半自动的,是因为它总是需要一个初始链接(出发点),但此后的运行情况就要由它自己决定了:蜘蛛程序会扫描起始页面包含的链接,然后访问这些链接指向的页面,再分析和追踪那些页面包含的链接。从理论上看,最终蜘蛛程序会访问到Internet上的每一个页面,因为Internet上几乎每一个页面总是被其他或多或少的页面引用。
【实例截图】
【核心代码】
namespace Spider
{
    /// <summary>
    /// Performs all of the work of a single spider thread: waiting for a
    /// URL to become available, downloading it, and then processing the page.
    /// </summary>
    public class DocumentWorker
    {
        /// <summary>
        /// The URI this worker is currently processing. It is also the base
        /// against which links found in the page are resolved and whose host
        /// discovered links must match.
        /// </summary>
        private Uri m_uri;

        /// <summary>
        /// The spider that this thread "works for".
        /// </summary>
        private Spider m_spider;

        /// <summary>
        /// The thread that is being used.
        /// </summary>
        private Thread m_thread;

        /// <summary>
        /// The thread number, used to identify this worker.
        /// </summary>
        private int m_number;

        /// <summary>
        /// The name for default documents.
        /// </summary>
        public const string IndexFile = "index.html";

        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="spider">The spider that owns this worker.</param>
        public DocumentWorker(Spider spider)
        {
            m_spider = spider;
        }

        /// <summary>
        /// Takes a URI such as /images/blank.gif and converts it into the
        /// name of a file for local storage below the spider's output path.
        /// If the directory structure to hold this file does not exist, it
        /// is created by this method.
        /// </summary>
        /// <param name="uri">The URI of the file about to be stored.</param>
        /// <returns>The full local file name for the URI.</returns>
        private string convertFilename(Uri uri)
        {
            string result = m_spider.OutputPath;

            // Add an ending backslash if needed.
            if (result[result.Length - 1] != '\\')
            {
                result = result + "\\";
            }

            // Strip the query string, if any.
            string path = uri.PathAndQuery;
            int queryIndex = path.IndexOf("?");
            if (queryIndex != -1)
            {
                path = path.Substring(0, queryIndex);
            }

            // A path whose last segment has no dot is treated as a directory
            // that is missing its trailing '/', so the default document name
            // is appended.
            int lastSlash = path.LastIndexOf('/');
            int lastDot = path.LastIndexOf('.');
            if (path[path.Length - 1] != '/' && lastSlash > lastDot)
            {
                path += "/" + IndexFile;
            }

            // Split into the directory part (kept in path, including the
            // trailing '/') and the actual file name.
            lastSlash = path.LastIndexOf('/');
            string filename = "";
            if (lastSlash != -1)
            {
                filename = path.Substring(1 + lastSlash);
                path = path.Substring(0, 1 + lastSlash);
                if (filename.Equals(""))
                {
                    filename = IndexFile;
                }
            }

            // Mirror every URI directory segment below the output path.
            // Scanning starts at index 1 to skip the leading '/'.
            int segmentStart = 1;
            int segmentEnd = path.IndexOf('/', segmentStart);
            while (segmentEnd != -1)
            {
                result += path.Substring(segmentStart, segmentEnd - segmentStart);
                result += "\\";
                segmentStart = segmentEnd + 1;
                segmentEnd = path.IndexOf('/', segmentStart);
            }

            // CreateDirectory creates every missing directory along the
            // path, so one call covers the output root and all subfolders.
            Directory.CreateDirectory(result);

            // Attach the file name.
            result += filename;
            return result;
        }

        /// <summary>
        /// Saves the body of a response to disk as a binary file.
        /// Does nothing when the spider has no output path.
        /// </summary>
        /// <param name="response">The response whose stream is saved.</param>
        private void SaveBinaryFile(WebResponse response)
        {
            if (string.IsNullOrEmpty(m_spider.OutputPath))
            {
                return;
            }

            string filename = convertFilename(response.ResponseUri);
            byte[] buffer = new byte[1024];

            // The using blocks close both streams even if a read or write throws.
            using (Stream outStream = File.Create(filename))
            using (Stream inStream = response.GetResponseStream())
            {
                int count;
                while ((count = inStream.Read(buffer, 0, buffer.Length)) > 0)
                {
                    outStream.Write(buffer, 0, count);
                }
            }
        }

        /// <summary>
        /// Saves a text file under the local name derived from the current URI.
        /// Does nothing when the spider has no output path.
        /// </summary>
        /// <param name="buffer">The text to save.</param>
        private void SaveTextFile(string buffer)
        {
            if (string.IsNullOrEmpty(m_spider.OutputPath))
            {
                return;
            }

            string filename = convertFilename(m_uri);
            using (StreamWriter outStream = new StreamWriter(filename))
            {
                outStream.Write(buffer);
            }
        }

        /// <summary>
        /// Downloads the current URI. Responses whose content type does not
        /// start with "text/" are saved as binary files; text responses are
        /// saved as text and returned for link extraction.
        /// </summary>
        /// <returns>
        /// The text of the page with each line terminated by "\r\n", or null
        /// if the response was not text or the download failed.
        /// </returns>
        private string GetPage()
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);
                using (WebResponse response = request.GetResponse())
                {
                    if (!response.ContentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
                    {
                        SaveBinaryFile(response);
                        return null;
                    }

                    // A StringBuilder avoids re-copying the whole page on every line.
                    System.Text.StringBuilder buffer = new System.Text.StringBuilder();
                    using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                    {
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            buffer.Append(line).Append("\r\n");
                        }
                    }

                    string page = buffer.ToString();
                    SaveTextFile(page);
                    return page;
                }
            }
            catch (WebException e)
            {
                System.Console.WriteLine("下载失败,错误:" + e);
                return null;
            }
            catch (IOException e)
            {
                System.Console.WriteLine("下载失败,错误:" + e);
                return null;
            }
        }

        /// <summary>
        /// Processes each link encountered. The link is handed to the spider
        /// for later spidering if it is an http or https document and is on
        /// the same host as the current URI. Whether it has been visited
        /// before is left to the spider class.
        /// </summary>
        /// <param name="link">The URL to process.</param>
        private void ProcessLink(string link)
        {
            Uri url;

            // Fully expand this URL if it was a relative link.
            try
            {
                url = new Uri(m_uri, link);
            }
            catch (UriFormatException e)
            {
                System.Console.WriteLine("Invalid URI:" + link + " Error:" + e.Message);
                return;
            }

            bool isHttp = string.Equals(url.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase);
            bool isHttps = string.Equals(url.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase);
            if (!isHttp && !isHttps)
            {
                return;
            }

            // Comment out this check if you would like to spider
            // the whole Internet (yeah right, but it will try).
            if (!string.Equals(url.Host, m_uri.Host, StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            m_spider.addURI(url);
        }

        /// <summary>
        /// Scans a downloaded page and processes the HREF and SRC attribute
        /// of every tag the parser reports.
        /// </summary>
        /// <param name="page">The text of the page to scan.</param>
        private void ProcessPage(string page)
        {
            ParseHTML parse = new ParseHTML();
            parse.Source = page;

            while (!parse.Eof())
            {
                char ch = parse.Parse();

                // Presumably a zero return signals that a tag was parsed;
                // confirm against ParseHTML.
                if (ch == 0)
                {
                    Attribute a = parse.GetTag()["HREF"];
                    if (a != null)
                    {
                        ProcessLink(a.Value);
                    }

                    a = parse.GetTag()["SRC"];
                    if (a != null)
                    {
                        ProcessLink(a.Value);
                    }
                }
            }
        }

        /// <summary>
        /// The main loop for the spider threads. Obtains URLs from the
        /// spider and processes them until the spider signals quit.
        /// </summary>
        public void Process()
        {
            while (!m_spider.Quit)
            {
                m_uri = m_spider.ObtainWork();

                m_spider.SpiderDone.WorkerBegin();
                try
                {
                    System.Console.WriteLine("Download(" + this.Number + "):" + m_uri);
                    string page = GetPage();
                    if (page != null)
                    {
                        ProcessPage(page);
                    }
                }
                finally
                {
                    // Keep WorkerBegin/WorkerEnd paired even if processing throws.
                    m_spider.SpiderDone.WorkerEnd();
                }
            }
        }

        /// <summary>
        /// Starts the worker thread, which runs <see cref="Process"/>.
        /// </summary>
        public void start()
        {
            ThreadStart ts = new ThreadStart(this.Process);
            m_thread = new Thread(ts);
            m_thread.Start();
        }

        /// <summary>
        /// The thread number. Used only to identify this thread.
        /// </summary>
        public int Number
        {
            get
            {
                return m_number;
            }

            set
            {
                m_number = value;
            }
        }
    }
}
相关软件
小贴士
感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。
- 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
- 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
- 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
- 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。
关于好例子网
本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内予以删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整性、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二条与第二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责声明参见 版权及免责声明


网友评论
我要评论