为了账号安全，请及时绑定邮箱和手机。立即绑定

进击Node.js基础(二)1-4 Promise重写爬虫源码

2016.06.11 14:44 3626浏览
var http = require('http');
var Promise = require('bluebird'); // 第三方 Promises 模块
var cheerio = require('cheerio');  // 爬虫分析模块
var BufferHelper = require('bufferhelper'); // buffer 组装模块
var iconv = require('iconv-lite'); // 字符转码模块

// Root address of a course page; a course ID is appended to form the full URL.
var baseUrl = 'http://www.imooc.com/learn/';
// IDs of the courses to crawl.
var courseIds = [348, 637, 259, 75, 197];

// Start one download per course ID immediately and keep the returned
// promises, in the same order as courseIds.
var pagesArr = courseIds.map(function (cid) {
    return grabPageAsync(baseUrl + cid);
});

/**
 * Download the page at `url` over HTTP and decode its body as UTF-8.
 *
 * Progress is logged to the console when the request starts, succeeds or fails.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string>} Resolves with the decoded HTML once the response
 *     has ended; rejects with the error emitted by the request or by the
 *     response stream.
 */
function grabPageAsync(url) {
    return new Promise(function (resolve, reject) {
        console.log('正在爬取 ' + url);

        // Single failure path shared by the request and the response stream.
        function fail(err) {
            console.log('爬取 ' + url + ' 失败');
            reject(err);
        }

        http.get(url, function (res) {
            var bufferHelper = new BufferHelper();

            res.on('data', function (chunk) {
                bufferHelper.concat(chunk);
            });

            // A failure while the body is streaming is reported on the
            // response, not on the request, so it needs its own listener.
            res.on('error', fail);

            res.on('end', function () {
                console.log('爬取 ' + url + ' 成功');

                // Decode only after all chunks have been joined so that a
                // multi-byte character split across chunks is not corrupted.
                // iconv.decode already yields a string.
                var html = iconv.decode(bufferHelper.toBuffer(), 'UTF-8');
                resolve(html);
            });
        }).on('error', fail);
    });
}

// Once every page has been downloaded, extract the course data from each
// page and print the result.
Promise
    .all(pagesArr)
    .then(function (pages) {
        var coursesData = pages.map(function (html) {
            return filterChapters(html);
        });

        printCourseInfo(coursesData);
    })
    .catch(function (err) {
        // Without this handler a failed download or a parsing error would
        // only surface as an unhandled promise rejection.
        console.error('课程信息处理失败: ' + (err && err.message ? err.message : err));
    });

/**
 * Extract the course title, a numeric counter and the chapter/video list
 * from the HTML of one course page.
 *
 * @param {string} html - Markup of a course page.
 * @returns {{title: string, number: number, videos: Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}}
 *     `title` is the text of `.hd .l`; `number` is the base-10 integer parsed
 *     from the fourth `.meta-value strong` element (NaN when it is absent or
 *     not numeric); `videos` holds one entry per `.chapter` element.
 */
function filterChapters(html) {
    var $ = cheerio.load(html);
    var courseData = {
        title: $('.hd .l').text(),
        // NOTE(review): assumes the fourth ".meta-value strong" is the learner
        // count shown by printCourseInfo — confirm against the page markup.
        number: parseInt($($(".meta-value strong")[3]).text().trim(), 10),
        videos: []
    };

    $('.chapter').each(function () {
        var $chapter = $(this);
        var chapterData = {
            chapterTitle: $chapter.find('strong').text(),
            videos: []
        };

        $chapter.find('.video').children('li').each(function () {
            var $link = $(this).find('.studyvideo');

            // Skip list items that carry no video link instead of throwing
            // on a missing href.
            if ($link.length === 0) {
                return;
            }

            // The id is whatever follows "video/" in the link; it stays
            // undefined when the link has no href or no such segment.
            var href = $link.attr('href') || '';
            chapterData.videos.push({
                title: $link.text(),
                id: href.split('video/')[1]
            });
        });

        courseData.videos.push(chapterData);
    });

    return courseData;
}

/**
 * Print every course with its chapters and video titles to the console.
 *
 * @param {Array<{number: *, title: *, videos: Array<{chapterTitle: *, videos: Array<{title: string}>}>}>} coursesData
 *     Courses to print. Anything that is not a non-empty array prints a
 *     "no course information" notice instead.
 */
function printCourseInfo(coursesData) {
    var typeTag = Object.prototype.toString.call(coursesData);
    var hasCourses = typeTag === '[object Array]' && coursesData.length > 0;

    if (!hasCourses) {
        console.log('暂无课程信息');
        return;
    }

    coursesData.forEach(function (course) {
        var heading = '\n\n【' + course.number + '】人学过《' + course.title + '》';
        console.log(heading);
        console.log('----------------------------------------------');

        course.videos.forEach(function (chapter) {
            console.log('\n' + chapter.chapterTitle);

            chapter.videos.forEach(function (lesson) {
                // Titles are trimmed and indented by one space under the chapter.
                console.log(' ' + lesson.title.trim());
            });
        });
    });
}
点击查看更多内容
23人点赞

若觉得本文不错,就分享一下吧!

评论

相关文章推荐

正在加载中
意见反馈 邀请有奖 帮助中心 APP下载
官方微信

举报

0/150
提交
取消