Nodejs用cheerio爬取《妹子图》网站图片

高级JavaScript组件

下载此实例

开发语言：js
实例大小：2.20M
下载次数：13
浏览次数：496
发布时间：2018-05-12
实例类别：高级JavaScript组件
发布人：crazycode
文件格式：.zip
所需积分：2

相关标签：图片 cheerio 爬取采集

网友评论 | 举报 | 投诉 | 收藏该页

下载此实例

实例介绍

[下载地址]

【实例简介】

【实例截图】

from clipboard

【核心代码】

/**
 * @Author:zhaojunlike
 * @Github:https://github.com/zhaojunlike
 * @Do： 采集某网站整站图集
 * Created by zhaojunlike on 6/4/2017.
 */
const http = require("http");
const querystring = require("querystring");
const restify = require('restify-clients');
const log4js = require("log4js");
const request = require('request');
const cheerio = require('cheerio');
const redis = require("redis");
const process = require("process");
const download = require('download');
// Shared Redis connection used for every work queue and counter of the crawler.
// An alternative host, "redis-db", was kept commented out in the original source.
const redisConn = redis.createClient({ port: "6379", host: "192.168.99.100" });
const url = require('url');
const fs = require('fs');
const path = require('path');
// Request headers. The `token` entry is only required by our own backend API.
// NOTE(review): the token is hard-coded in source; consider loading it from
// configuration or the environment instead.
const RequestHeaders = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3047.4 Safari/537.36",
    "Host": "www.mzitu.com",
    'token': '4F39500149264DE474AA8FA4C67379D1',
};
// Headers sent to the scraped site: same as RequestHeaders but WITHOUT the
// backend token, so the credential is never disclosed to a third party.
const WebRequestHeaders = {
    "User-Agent": RequestHeaders["User-Agent"],
    "Host": RequestHeaders["Host"],
};
// Client for the site being scraped.
const webClient = restify.createStringClient({
    url: 'http://www.mzitu.com',
    headers: WebRequestHeaders
});
// Client for our own backend API.
// An alternative URL, 'http://192.168.99.100:81', was kept commented out in the original source.
// NOTE(review): this client still sends "Host: www.mzitu.com" to the backend,
// exactly as before — confirm the backend really expects that header.
const serverApiClient = restify.createStringClient({
    url: 'http://localhost:8080',
    headers: RequestHeaders
});
// Placeholder for Redis-specific settings (currently empty).
const RedisConfig = {};

// Regular expressions used when parsing pages.
const RegxConfig = {
    // Matches a <dl ... class="tags"> ... </dl> block, capturing attributes and inner HTML.
    index_tag: /<dl(.*?)class="tags">([\s\S]*?)<\/dl>/g,
};

// Redis key names for the queues and counters shared between crawl stages.
const CacheKeys = {
    index_tag: 'index_tag_queue',
    tag_list: 'tag_list_queue',
    taotu_list: 'taotu_list',
    page_count: 'page_count',
    page_detail: 'page_detail_queue',
    img_download_url: 'img_queue',
};

// Absolute URLs of the remote site.
const RemoteConfig = {
    host: "http://www.mzitu.com",
    zhuanti: "http://www.mzitu.com/zhuanti/",
    page: "http://www.mzitu.com/page/",
};

// Endpoint paths of the backend API.
const ServerApi = {
    DocumentAdd: '/Document_add.action',
    PictureAdd: '/Picture_add.action',
    // Returns the most recently collected document; used as the marker for incremental crawling.
    DocumentLast: '/Document_last.action',
    // Checks whether an album has already been collected.
    DocumentCheck: '/Document_check.action',
};
// Mutable crawler state flags plus the local download root.
// `start` gates the scheduler; `img_page_success` is set once the index pages are exhausted.
// An alternative BASE_PATH, "../storage/download", was kept commented out in the original source.
let SpiderIDLE = {
    BASE_PATH: './download',
    start: false,
    index_success: false,
    img_page_success: false,
    img_down_success: false,
    img_taotu_success: false,
};

// Small path/URL helpers used by the crawler.
const Tools = {
    /**
     * Splits the path portion of a URL into its directory and file name.
     *
     * Only the pathname is considered, so a query string or fragment never
     * ends up inside the returned file name.
     *
     * @param {string} uri - URL (or bare path) of a remote file.
     * @returns {{filename: string, filepath: string}} `filename` is the last
     *   path segment; `filepath` is everything before it, without a trailing slash.
     * @throws {TypeError} If `uri` is not a string (raised by `url.parse`).
     */
    parseUri: function (uri) {
        const pathname = url.parse(uri).pathname || "";
        const segments = pathname.split('/');
        const filename = segments.pop();
        return {
            filename: filename,
            filepath: segments.join("/")
        };
    },
    /**
     * Makes sure a directory exists, creating missing parent directories.
     *
     * @param {string} dirPath - Relative or absolute directory path.
     * @returns {boolean} `true` if the directory exists afterwards, `false`
     *   if it could not be created (the error is logged).
     */
    checkDir: function (dirPath) {
        try {
            // 0o777 is the octal permission mask; the process umask still applies.
            fs.mkdirSync(dirPath, { recursive: true, mode: 0o777 });
            return true;
        } catch (err) {
            console.error("checkDir failed:", dirPath, err);
            return false;
        }
    }
};

/**
 * Returns the first run of decimal digits found in `text`.
 *
 * @param {*} text - Usually a URL or an HTML fragment.
 * @returns {?string} The digits, or `null` when `text` is not a string or has none.
 */
function firstDigits(text) {
    if (typeof text !== "string") {
        return null;
    }
    const found = text.match(/\d+/);
    return found ? found[0] : null;
}

// Site-wide crawler.
//
// The stages hand work to each other through Redis:
//   getPageList  -> stores the number of index pages under CacheKeys.page_count
//   getImgPage   -> decrements that counter, scrapes one index page, queues its albums
//   getTaoTuImgs -> pops one album and queues one entry per picture page
//   downloadYY   -> pops one picture page, resolves the real image URL and downloads it
const Spider = {
    /**
     * Scrapes the topic ("zhuanti") page and pushes every tag found there
     * onto the CacheKeys.index_tag list as JSON.
     */
    start: function () {
        webClient.get('/zhuanti/', function (err, req, res, data) {
            if (err) {
                console.error("Spider.start request failed:", err);
                return;
            }
            const $ = cheerio.load(data);
            $(".postlist .tags dd").each(function (index, item) {
                const $item = $(item);
                const $img = $item.find('img');
                const tag = {
                    title: $img.attr("alt"),
                    banner: $img.attr("src"),
                    url: $item.find("a").attr("href")
                };
                // Append to the queue.
                redisConn.rpush(CacheKeys.index_tag, JSON.stringify(tag), function (err, reply) {
                    console.log(err, reply);
                });
            });
        });
    },
    /**
     * Stage 1: reads the home page, stores the highest page number found in
     * the pagination links under CacheKeys.page_count, then reports the
     * stored value.
     *
     * @param {function(?string)} callback - Receives the value read back from
     *   Redis. Not invoked when the home page request fails.
     */
    getPageList: function (callback) {
        webClient.get('/', function (err, req, res, data) {
            if (err) {
                console.error("Spider.getPageList request failed:", err);
                return;
            }
            const $ = cheerio.load(data);
            let lastPage = 0;
            $(".nav-links a[class='page-numbers']").each(function (index, item) {
                const pageNo = parseInt(firstDigits($(item).html()), 10);
                if (pageNo > lastPage) {
                    lastPage = pageNo;
                }
            });
            if (lastPage > 0) {
                redisConn.set(CacheKeys.page_count, String(lastPage));
            }
            redisConn.get(CacheKeys.page_count, function (err, reply) {
                if (err) {
                    console.error("Spider.getPageList could not read page count:", err);
                }
                callback(reply);
            });
        });
    },
    /**
     * Stage 2: takes the next index page number from the Redis counter,
     * scrapes that page and queues every album on it.
     *
     * NOTE(review): the counter is decremented before use and crawling stops
     * at 1, so the highest page and page 1 are never requested — confirm
     * whether that is intended.
     *
     * @param {function(Object)} callback - Invoked once per album with the
     *   album record, after it has been pushed to CacheKeys.page_detail.
     */
    getImgPage: function (callback) {
        redisConn.decr(CacheKeys.page_count, function (err, reply) {
            if (err) {
                console.error("Spider.getImgPage could not read page counter:", err);
                return false;
            }
            const pageNo = parseInt(reply, 10);
            if (Number.isNaN(pageNo) || pageNo <= 1) {
                console.log("页面套图数据已经采集完毕了!!!!");
                SpiderIDLE.img_page_success = true;
                return false;
            }
            console.log(`开始采集页面:${reply}`);
            // Scrape this index page.
            webClient.get(`/page/${reply}/`, function (err, req, res, data) {
                if (err) {
                    console.error(`Spider.getImgPage request for page ${reply} failed:`, err);
                    return false;
                }
                if (!data) {
                    return false;
                }
                const $ = cheerio.load(data);
                $(".postlist #pins li").each(function (index, item) {
                    const $item = $(item);
                    const $img = $item.find("img");
                    const albumUrl = $item.find("a").attr("href");
                    const remoteId = firstDigits(albumUrl);
                    if (remoteId === null) {
                        // Entry without a numeric album link: nothing to crawl, move on.
                        return;
                    }
                    const document = {
                        title: $img.attr("alt"),
                        url: albumUrl,
                        remote_path: '/' + remoteId,
                        content: $img.attr("data-original"),
                        page_num: reply,
                        category_id: reply,
                        create_time: $item.find(".time").html(),
                        update_time: "",
                        good_count: 0,
                        view_count: Math.random() * 1000000,
                        remote_id: remoteId,
                    };
                    // Queue the album for stage 3.
                    // TODO: write to the gateway API.
                    redisConn.rpush(CacheKeys.page_detail, JSON.stringify(document), function (err, reply) {
                        if (err) {
                            console.error("Spider.getImgPage could not queue album:", err);
                        }
                        callback(document);
                    });
                });
            });

        });
    },
    /**
     * Stage 3: pops one album from CacheKeys.page_detail, reads how many
     * picture pages it has and queues pages 2..N on CacheKeys.img_download_url.
     *
     * NOTE(review): numbering starts at 2, so the album's first picture page
     * is never queued — confirm whether that is intended.
     *
     * @param {function(Object)} [callback] - Invoked with the album record
     *   once its picture pages have been queued.
     */
    getTaoTuImgs: function (callback) {
        redisConn.lpop(CacheKeys.page_detail, function (err, reply) {
            if (err) {
                console.error("Spider.getTaoTuImgs could not pop album:", err);
                return;
            }
            if (!reply) {
                return;
            }
            const document = JSON.parse(reply);
            const remoteId = firstDigits(document.url);
            if (remoteId === null) {
                console.error("Spider.getTaoTuImgs: album without numeric id:", document.url);
                return;
            }
            const rePath = '/' + remoteId;
            document.remote_path = rePath;
            console.log("PATH:", rePath);
            webClient.get(rePath, function (err, req, res, data) {
                if (err) {
                    console.error(`Spider.getTaoTuImgs request for ${rePath} failed:`, err);
                    return;
                }
                if (!data) {
                    return;
                }
                const $ = cheerio.load(data);
                const pageCount = $(".main .pagenavi a span").eq(-2).html();
                document.detail_count = pageCount;
                // Picture page links are generated from the album URL.
                console.log(`获取套图：${document.category_id},${pageCount}张`);
                // NaN (no pagination found) simply skips the loop.
                const lastPicture = parseInt(pageCount, 10);
                for (let i = 2; i <= lastPicture; i++) {
                    const img = {
                        category_id: document.category_id,
                        img_url: document.url + "/" + i,
                        remote_id: document.remote_id,
                        remote_path: document.remote_path + "/" + i
                    };
                    // Several picture pages may be pushed in one go here.
                    redisConn.rpush(CacheKeys.img_download_url, JSON.stringify(img), function (err, reply) {
                        console.log(`加入套图:${img.remote_id}`, err);
                    });
                }
                if (typeof callback === "function") {
                    callback(document);
                }
            });
        });
    },
    /**
     * Stage 4: pops one picture page from CacheKeys.img_download_url, finds
     * the full-size image on it and downloads it below
     * `${SpiderIDLE.BASE_PATH}/images/<category_id>/...`.
     *
     * @param {function(Object)} callback - Invoked with the picture record
     *   (including `url_img`, `path` and `location`) after a successful download.
     */
    downloadYY: function (callback) {
        redisConn.lpop(CacheKeys.img_download_url, function (err, reply) {
            if (err || !reply) {
                return false;
            }
            const img = JSON.parse(reply);
            webClient.get(img.remote_path, function (err, req, res, data) {
                if (err) {
                    console.log(err);
                    return;
                }
                if (!data) {
                    return;
                }
                const $ = cheerio.load(data);
                // Locate the image and download it.
                const urlImg = $(".main .main-image img").attr("src");
                if (!urlImg) {
                    console.error("Spider.downloadYY: no image found on", img.remote_path);
                    return;
                }
                img.url_img = urlImg;
                img.path = `/${img.category_id}/`;

                const fileDetail = Tools.parseUri(urlImg);
                const savePath = `${SpiderIDLE.BASE_PATH}/images/${img.category_id}${fileDetail.filepath}`;
                img.location = `/images/${img.category_id}${fileDetail.filepath}/${fileDetail.filename}`;
                Tools.checkDir(savePath);
                download(urlImg, savePath).then(function () {
                    console.log("下载真实大图:", urlImg, ",存储:", img.location);
                    callback(img);
                }).catch(function (downloadErr) {
                    console.error("Spider.downloadYY: download failed:", urlImg, downloadErr);
                });
            });
        });

    },
    /** Placeholder: thumbnail download is not implemented here. */
    downloadThumbs: function () {

    },
    /** Flushes the currently selected Redis database and logs the outcome. */
    clearRedis: function () {
        redisConn.flushdb(function (err) {
            console.log("清空Redis Cache成功", err);
        });
    },
};

// Incremental crawler — TODO version2.0 (not implemented yet).
const IncSpider = {
    /**
     * Placeholder entry point for the incremental crawl; currently does nothing.
     *
     * @param {Function} callback - Unused for now.
     */
    run(callback) {
        // Intentionally empty until version 2.0.
    },
};

/**
 * Stores an album's banner image locally and rewrites the album record so it
 * points at the local copy.
 *
 * @param {Object} document - Album record produced by Spider.getImgPage;
 *   `content` holds the remote banner URL on entry and the local path on exit,
 *   `view_count` is truncated to an integer.
 */
function saveAlbumBanner(document) {
    const urlImg = document.content;
    if (!urlImg) {
        console.error("saveAlbumBanner: album has no banner image:", document.url);
        return;
    }
    // 1. Download the banner into local storage.
    const fileDetail = Tools.parseUri(urlImg);
    const savePath = SpiderIDLE.BASE_PATH + '/banner' + fileDetail.filepath;
    Tools.checkDir(savePath);
    download(urlImg, savePath).then(function () {
        console.log("DownloadThumbsImg:", urlImg, "SavePath:", savePath);
    }).catch(function (downloadErr) {
        console.error("saveAlbumBanner: download failed:", urlImg, downloadErr);
    });

    // 2. Prepare the record for submission to the server (page-level data only).
    document.content = "/banner" + fileDetail.filepath + "/" + fileDetail.filename;
    // Math.trunc instead of parseInt: parseInt stringifies first and misreads
    // very small floats written in exponent notation.
    document.view_count = Math.trunc(document.view_count);
    // TODO: submit to the server.
}

/**
 * One scheduler tick: advances every crawl stage by one unit of work.
 * Does nothing until SpiderIDLE.start has been switched on.
 *
 * @returns {(boolean|undefined)} `false` while the crawler has not been started.
 */
function runSpiderTick() {
    if (SpiderIDLE.start !== true) {
        return false;
    }
    if (SpiderIDLE.img_page_success !== true) {
        Spider.getImgPage(saveAlbumBanner);
    }
    Spider.downloadYY(function (picture) {
        // TODO: submit to the server.
    });
    Spider.getTaoTuImgs(function (document) {
        // TODO: submit to the server.
    });
}

// Drives the crawler: one tick every 100 ms.
const SpiderTimer = setInterval(runSpiderTick, 100);

// Bootstrap: find out how many index pages exist, then let the scheduler run.
Spider.getPageList((count) => {
    console.log(`一共有:${count}个页面需要采集`);
    SpiderIDLE.start = true;
});


// Shutdown handling (e.g. when the process is stopped by pm2).
//
// The 'exit' event does not fire when the process is terminated by the
// default SIGINT/SIGTERM handling, and asynchronous work started inside an
// 'exit' listener is not guaranteed to run. The signals are therefore handled
// explicitly so the Redis cleanup has a chance to complete before exiting.
let spiderShutdownStarted = false;

/**
 * Stops the scheduler, clears the Redis cache and exits once Redis has
 * acknowledged the QUIT command (commands are sent in order, so the flush
 * issued by Spider.clearRedis is sent first). Safe to call more than once.
 */
function shutdownSpider() {
    if (spiderShutdownStarted) {
        return;
    }
    spiderShutdownStarted = true;
    clearInterval(SpiderTimer);
    Spider.clearRedis();
    // Fallback so an unreachable Redis cannot keep the process alive forever.
    const forceExit = setTimeout(function () {
        process.exit(1);
    }, 3000);
    forceExit.unref();
    redisConn.quit(function () {
        clearTimeout(forceExit);
        process.exit(0);
    });
}

process.on("SIGINT", shutdownSpider);
process.on("SIGTERM", shutdownSpider);

// Best-effort cleanup for any other exit path (original behaviour).
process.on("exit", function () {
    if (spiderShutdownStarted) {
        return;
    }
    Spider.clearRedis();
    redisConn.end(true);
    clearInterval(SpiderTimer);
});

标签： 图片 cheerio 爬取采集

实例下载地址

Nodejs用cheerio爬取《妹子图》网站图片

点此下载实例

不能下载？内容有错？点击这里报错 + 投诉 + 提问

好例子网口号：伸出你的我的手 — 分享！

网友评论

我要评论

小贴士

感谢您为本站写下的评论，您的评论对其它用户来说具有重要的参考价值，所以请认真填写。

类似“顶”、“沙发”之类没有营养的文字，对勤劳贡献的楼主来说是令人沮丧的反馈信息。
相信您也不想看到一排文字/表情墙，所以请不要反馈意义不大的重复字符，也请尽量不要纯表情的回复。
提问之前请再仔细看一遍楼主的说明，或许是您遗漏了。
请勿到处挖坑绊人、招贴广告。既占空间让人厌烦，又没人会搭理，于人于己都无利。

关于好例子网

本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享，对搜索内容的合法性不具有预见性、识别性、控制性，仅供学习研究，请务必在下载后24小时内予以删除，不得用于其他任何用途，否则后果自负。基于互联网的特殊性，平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整性、有效性等进行实质审查；无论平台是否已进行审查，用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场，基于网友分享，根据中国法律《信息网络传播权保护条例》第二十二条与第二十三条之规定，若资源存在侵权或相关问题请联系本站客服人员，点此联系我们。关于更多版权及免责声明参见版权及免责声明。

Nodejs用cheerio爬取《妹子图》网站图片

同类人气实例

实例介绍

实例下载地址

Nodejs用cheerio爬取《妹子图》网站图片

相关软件

相关文章

网友评论

小贴士

关于好例子网

下载周排行

下载总排行