实例介绍
【实例简介】
【实例截图】
【核心代码】
/**
 * @Author:zhaojunlike
 * @Github:https://github.com/zhaojunlike
 * @Do: Crawl every image gallery of a target website.
 * Created by zhaojunlike on 6/4/2017.
 *
 * This section wires up the third-party clients and declares the constants
 * shared by the crawler below. Each statement sits on its own line so that
 * the `//` comments no longer swallow the code that follows them.
 */
const http = require("http");
const querystring = require("querystring");
const restify = require('restify-clients');
const log4js = require("log4js");
const request = require('request');
const cheerio = require('cheerio');
const redis = require("redis");
const process = require("process");
const download = require('download');

// Redis connection used for every work queue of the crawler.
const redisConn = redis.createClient({
    host: "192.168.99.100",
    // host: "redis-db",
    port: "6379",
});

const url = require('url');
const fs = require('fs');
const path = require('path');

// Request headers; the token is only required by the author's own server.
// NOTE(review): the token is hard-coded here — consider loading it from the
// environment instead of committing it to source control.
const RequestHeaders = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3047.4 Safari/537.36",
    "Host": "www.mzitu.com",
    'token': '4F39500149264DE474AA8FA4C67379D1',
};

// String client for the website being crawled.
const webClient = restify.createStringClient({
    url: 'http://www.mzitu.com',
    headers: RequestHeaders
});

// String client for the author's server-side API.
const serverApiClient = restify.createStringClient({
    url: 'http://localhost:8080',
    //url: 'http://192.168.99.100:81',
    headers: RequestHeaders
});

const RedisConfig = {};

// Regular expressions used to pick fragments out of fetched HTML.
const RegxConfig = {
    index_tag: /<dl(.*?)class="tags">([\s\S]*?)<\/dl>/g,
};

// Redis keys of the queues and counters shared between crawl stages.
const CacheKeys = {
    index_tag: "index_tag_queue",
    tag_list: "tag_list_queue",
    taotu_list: "taotu_list",
    page_count: "page_count",
    page_detail: "page_detail_queue",
    img_download_url: "img_queue",
};

// Absolute URLs of the remote site.
const RemoteConfig = {
    host: 'http://www.mzitu.com',
    zhuanti: 'http://www.mzitu.com/zhuanti/',
    page: 'http://www.mzitu.com/page/',
};

// Endpoints exposed by the server-side API.
const ServerApi = {
    DocumentAdd: "/Document_add.action",
    PictureAdd: "/Picture_add.action",
    // Fetch the most recently crawled object; used as the marker for incremental crawling.
    DocumentLast: "/Document_last.action",
    // Check whether a gallery has already been crawled.
    DocumentCheck: "/Document_check.action"
};

// Mutable crawler state flags plus the base directory for downloaded files.
let SpiderIDLE = {
    start: false,
    index_success: false,
    img_page_success: false,
    img_down_success: false,
    img_taotu_success: false,
    //BASE_PATH: "../storage/download",
    BASE_PATH: "./download"
};
/**
 * Small filesystem / URL helpers used by the crawler.
 */
const Tools = {
    /**
     * Split the path component of a URI into its directory and file name.
     *
     * @param {string} uri - URI to split.
     * @returns {{filename: string, filepath: string}} Last path segment and
     *     the remaining leading segments joined with "/".
     */
    parseUri: function (uri) {
        const filePath = url.parse(uri).path;
        const segments = filePath.split('/');
        return {
            filename: segments.pop(),
            filepath: segments.join("/")
        };
    },

    /**
     * Make sure a directory (and all of its parents) exists.
     *
     * The recursive mkdir keeps the leading "/" of absolute paths and uses an
     * octal mode (the previous decimal 777 was not rwxrwxrwx).
     *
     * @param {string} dirPath - Directory to create if missing.
     * @returns {boolean} true when the directory exists afterwards, false
     *     when it could not be created.
     */
    checkDir: function (dirPath) {
        if (fs.existsSync(dirPath)) {
            return true;
        }
        try {
            fs.mkdirSync(dirPath, { recursive: true, mode: 0o777 });
            return true;
        } catch (err) {
            console.log(err);
            return false;
        }
    }
};

// Full-site crawler. Every stage communicates through Redis queues/counters.
const Spider = {
    /**
     * Fetch the topic index page and push every tag found onto the
     * index_tag queue.
     */
    start: function () {
        webClient.get('/zhuanti/', function (err, req, res, data) {
            if (err) {
                console.log(err);
                return;
            }
            const $ = cheerio.load(data);
            $(".postlist .tags dd").each(function () {
                const $this = $(this);
                const tag = {
                    title: $this.find('img').attr("alt"),
                    banner: $this.find('img').attr("src"),
                    url: $this.find("a").attr("href")
                };
                // Push the tag onto the queue.
                redisConn.rpush(CacheKeys.index_tag, JSON.stringify(tag), function (err, reply) {
                    console.log(err, reply);
                });
            });
        });
    },

    /**
     * 1. Fetch the home page and store the listing page count in Redis.
     *
     * Each numbered pagination link overwrites the stored value, so the
     * number of the last matching link is what ends up stored.
     *
     * @param {function(string)} callback - Receives the stored page count.
     *     It is not invoked when the home page request fails.
     */
    getPageList: function (callback) {
        webClient.get('/', function (err, req, res, data) {
            if (err) {
                console.log(err);
                return;
            }
            const $ = cheerio.load(data);
            $(".nav-links a[class='page-numbers']").each(function () {
                const html = $(this).html() || "";
                const matched = html.match(/\d+/);
                const page = matched ? parseInt(matched[0], 10) : NaN;
                if (page) {
                    redisConn.set(CacheKeys.page_count, String(page));
                }
            });
            redisConn.get(CacheKeys.page_count, function (err, reply) {
                callback(reply);
            });
        });
    },

    /**
     * 2. Take the next listing page (by decrementing the page counter),
     * parse every gallery on it and push it onto the page_detail queue.
     *
     * @param {function(Object)} callback - Invoked once per queued gallery
     *     with the gallery document.
     */
    getImgPage: function (callback) {
        redisConn.decr(CacheKeys.page_count, function (err, reply) {
            if (err || !reply) {
                return;
            }
            if (parseInt(reply, 10) <= 1) {
                console.log("页面套图数据已经采集完毕了!!!!");
                // Mark the stage as finished so the timer stops decrementing
                // the counter below zero.
                SpiderIDLE.img_page_success = true;
                return;
            }
            console.log(`开始采集页面:${reply}`);
            // Crawl this listing page.
            webClient.get(`/page/${reply}/`, function (err, req, res, data) {
                if (err || !data) {
                    return;
                }
                const $ = cheerio.load(data);
                $(".postlist #pins li").each(function () {
                    const $this = $(this);
                    const href = $this.find("a").attr("href");
                    const idMatch = href ? href.match(/\d+/) : null;
                    if (!idMatch) {
                        // Entry without a numeric gallery id: nothing to crawl.
                        return;
                    }
                    const doc = {
                        title: $this.find("img").attr("alt"),
                        url: href,
                        remote_path: '/' + idMatch[0],
                        content: $this.find("img").attr("data-original"),
                        page_num: reply,
                        category_id: reply,
                        create_time: $this.find(".time").html(),
                        update_time: "",
                        good_count: 0,
                        // Placeholder view count; the site value is not scraped.
                        view_count: Math.random() * 1000000,
                        remote_id: idMatch[0],
                    };
                    // Queue the gallery. TODO: write to the gateway API.
                    redisConn.rpush(CacheKeys.page_detail, JSON.stringify(doc), function (err, reply) {
                        callback(doc);
                    });
                });
            });
        });
    },

    /**
     * 3. Pop one gallery from the page_detail queue, read how many pages it
     * has and queue one download job per page.
     *
     * Pages are generated starting at 2, as in the original code.
     * NOTE(review): page 1 is never queued here — confirm this is intended.
     *
     * @param {function} callback - Currently unused.
     */
    getTaoTuImgs: function (callback) {
        redisConn.lpop(CacheKeys.page_detail, function (err, reply) {
            if (err || !reply) return;
            const doc = JSON.parse(reply);
            const idMatch = String(doc.url).match(/\d+/);
            if (!idMatch) return;
            const rePath = doc.remote_path = '/' + idMatch[0];
            console.log("PATH:", rePath);
            webClient.get(rePath, function (err, req, res, data) {
                if (err || !data) return;
                const $ = cheerio.load(data);
                const pageCount = $(".main .pagenavi a span").eq(-2).html();
                doc.detail_count = pageCount;
                // Generate the per-image page links.
                console.log(`获取套图:${doc.category_id},${pageCount}张`);
                const total = parseInt(pageCount, 10) || 0;
                for (let i = 2; i <= total; i++) {
                    const img = {
                        category_id: doc.category_id,
                        img_url: doc.url + "/" + i,
                        remote_id: doc.remote_id,
                        remote_path: doc.remote_path + "/" + i
                    };
                    // Many images may be pushed here in one go.
                    redisConn.rpush(CacheKeys.img_download_url, JSON.stringify(img), function (err, reply) {
                        console.log(`加入套图:${img.remote_id}`, err);
                    });
                }
            });
        });
    },

    /**
     * 4. Pop one download job, resolve the real image URL from its page and
     * download the image below BASE_PATH/images.
     *
     * @param {function(Object)} callback - Invoked with the job (extended
     *     with url_img, path and location) after a successful download.
     */
    downloadYY: function (callback) {
        redisConn.lpop(CacheKeys.img_download_url, function (err, reply) {
            if (err || !reply) {
                return;
            }
            const img = JSON.parse(reply);
            webClient.get(img.remote_path, function (err, req, res, data) {
                if (err) {
                    console.log(err);
                    return;
                }
                if (!data) return;
                const $ = cheerio.load(data);
                // Locate the image and download it.
                const urlImg = $(".main .main-image img").attr("src");
                if (!urlImg) {
                    // Page without the expected image element: skip the job.
                    return;
                }
                img.url_img = urlImg;
                img.path = `/${img.category_id}/`;
                const fileDetail = Tools.parseUri(urlImg);
                const savePath = `${SpiderIDLE.BASE_PATH}/images/${img.category_id}${fileDetail.filepath}`;
                img.location = `/images/${img.category_id}${fileDetail.filepath}/${fileDetail.filename}`;
                Tools.checkDir(savePath);
                download(urlImg, savePath).then(function () {
                    console.log("下载真实大图:", urlImg, ",存储:", img.location);
                    callback(img);
                }).catch(function (downloadErr) {
                    // Avoid an unhandled rejection when a download fails.
                    console.log(downloadErr);
                });
            });
        });
    },

    downloadThumbs: function () {
    },

    /**
     * Flush the current Redis database.
     */
    clearRedis: function () {
        redisConn.flushdb(function (err) {
            console.log("清空Redis Cache成功", err);
        });
    },
};

// Incremental crawler. TODO version2.0
const IncSpider = {
    run: function (callback) {
    }
};

// Main loop: every 100 ms drive one step of each crawl stage once the
// page count has been loaded (SpiderIDLE.start).
const SpiderTimer = setInterval(function () {
    if (SpiderIDLE.start !== true) {
        return;
    }
    if (SpiderIDLE.img_page_success !== true) {
        Spider.getImgPage(function (document) {
            const urlImg = document.content;
            if (!urlImg) {
                // Gallery without a banner image: nothing to download.
                return;
            }
            // 1. Download the banner and store it locally.
            const fileDetail = Tools.parseUri(urlImg);
            const savePath = SpiderIDLE.BASE_PATH + '/banner' + fileDetail.filepath;
            Tools.checkDir(savePath);
            download(urlImg, savePath).then(function () {
                console.log("DownloadThumbsImg:", urlImg, "SavePath:", savePath);
            }).catch(function (downloadErr) {
                console.log(downloadErr);
            });
            // 2. Prepare the record for the server; this is page-level data only.
            document.content = "/banner" + fileDetail.filepath + "/" + fileDetail.filename;
            document.view_count = Math.trunc(document.view_count);
            //TODO submit to the server
        });
    }
    Spider.downloadYY(function (picture) {
        //TODO submit to the server
    });
    Spider.getTaoTuImgs(function (document) {
        //TODO submit to the server
    });
}, 100);

Spider.getPageList(function (count) {
    console.log(`一共有:${count}个页面需要采集`);
    SpiderIDLE.start = true;
});

// pm2 exit
// NOTE(review): Node only runs synchronous work in "exit" listeners, so the
// asynchronous flushdb below may not complete — confirm whether a
// SIGINT/SIGTERM handler is wanted instead.
process.on("exit", function () {
    Spider.clearRedis();
    redisConn.end(true);
    clearInterval(SpiderTimer);
});
好例子网口号:伸出你的我的手 — 分享!
相关软件
小贴士
感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。
- 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
- 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
- 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
- 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。
关于好例子网
本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内给予删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整性、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二与二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责申明参见 版权及免责申明
网友评论
我要评论