为了账号安全,请及时绑定邮箱和手机立即绑定

拉勾网全站爬虫

2019.07.19 12:49 415浏览

从主页出发,分析完几个大类型后进入相应链接对子信息进行抓取,必要时将上级信息通过meta传递给下级

图片描述

1. 提取8个大类型,抓取类型下所有职位链接以及名称

image.png

    def parse(self, response):
        """Start one listing crawl for each category panel on the home page.

        For every ``<div class="menu_sub dn">`` panel this reads the first
        ``dl/dt/span`` title and the first ``dl/dd/a`` link, then requests
        that link with the title attached as ``meta['cate_name']``.

        Args:
            response: Home-page response exposing ``xpath``.

        Yields:
            scrapy.Request: One request per panel that contains a link,
            handled by ``parse_job_list``.
        """
        # One panel per main category (the original author counted 8).
        for main_cate in response.xpath('//div[@class="menu_sub dn"]'):
            cate_name = main_cate.xpath('./dl/dt/span/text()').extract_first()
            # NOTE(review): extract_first() follows only the FIRST link of each
            # panel; confirm whether every ./dl/dd/a link should be crawled.
            cate_url = main_cate.xpath('./dl/dd/a/@href').extract_first()
            if not cate_url:
                # A panel without links would crash scrapy.Request(url=None).
                continue
            yield scrapy.Request(
                url=cate_url,
                callback=self.parse_job_list,
                dont_filter=True,
                meta={'cate_name': cate_name},
            )

2. 针对类型下的不同职位,获取该职位的所有职位概况
image.png

       def parse_job_list(self, response):
           """Follow every job posting on a listing page, then the next page.

           Args:
               response: Listing-page response. Its ``meta`` dict is forwarded
                   unchanged to every request issued here.

           Yields:
               scrapy.Request: One request per posting link (handled by
               ``parse_job_detail``), plus one for the next listing page when
               the pager offers a real URL.
           """
           for position in response.xpath('//ul[@class="item_con_list"]/li'):
               position_url = position.xpath('.//a[@class="position_link"]/@href').extract_first()
               if not position_url:
                   # No link in this entry: scrapy.Request(url=None) would raise.
                   continue
               yield scrapy.Request(url=position_url, callback=self.parse_job_detail, meta=response.meta)

           # Extract the URL of the next page.
           next_page_url = response.xpath(
               '//div[@class="pager_container"]/a[contains(text(), "下一页")]/@href').extract_first()

           # "javascript:;" is a placeholder href, not a page to fetch
           # (presumably the disabled "next" link on the last page).
           if next_page_url and next_page_url != "javascript:;":
               # dont_filter=True bypasses the duplicate filter so listing pages
               # are re-fetched (original author's note: enables incremental crawls).
               yield scrapy.Request(url=next_page_url, dont_filter=True, callback=self.parse_job_list, meta=response.meta)

3. 进入每项所展示职位的链接,获取具体信息

image.png

    def parse_job_detail(self, response):
        """Build a ``LagouFullItem`` from one job-detail page.

        A field whose markup is absent is returned as ``None`` (or as an
        empty string for the joined-text fields) instead of raising, so one
        missing element no longer discards the whole posting.

        Args:
            response: Job-detail response. ``meta['cate_name']`` is read for
                the category name attached by the earlier callbacks.

        Yields:
            LagouFullItem: The scraped posting.
        """

        def first(query):
            """Return the first result of ``query``, or None when it matches nothing."""
            return response.xpath(query).extract_first()

        def joined(separator, query):
            """Join every result of ``query`` with ``separator``."""
            return separator.join(response.xpath(query).extract())

        def stripped(value):
            """Strip ``value`` unless it is None."""
            return None if value is None else value.strip()

        def request_field(position):
            """Return the ``position``-th job_request span without its '/' separators."""
            raw = first('//dd[@class="job_request"]/p/span[{}]/text()'.format(position))
            return None if raw is None else raw.replace('/', '').strip()

        # Decode the page once; all four company-feature regexes scan it.
        html = response.body.decode('utf-8')

        def feature(pattern):
            """Return the stripped first capture of ``pattern`` in the page, or None."""
            found = re.search(pattern, html, re.S)
            return found.group(1).strip() if found else None

        job_name = first('//div[@class="job-name"]/span[@class="name"]/text()')

        # Job summary.
        salary_range = request_field(1)
        working_city = request_field(2)
        experience_required = request_field(3)
        education_required = request_field(4)
        job_type = request_field(5)
        position_label = joined(',', '//ul[contains(@class, "position-label")]/li/text()')
        publish_raw = first('//p[@class="publish_time"]/text()')
        # Keep only the text before the first non-breaking space.
        publish_time = None if publish_raw is None else publish_raw.split('\xa0')[0]

        # Job details.
        job_advantage = first('//dd[@class="job-advantage"]/p/text()')
        job_detail = joined('\n', '//div[@class="job-detail"]/p//text()').replace('\xa0', '')

        # Detailed work address: city link, district link, then the free text.
        working_city_temp = first('//div[@class="work_addr"]/a[1]/text()')
        working_district_temp = first('//div[@class="work_addr"]/a[2]/text()')
        working_address_temp = joined('', '//div[@class="work_addr"]/text()').replace('-', '').strip()
        working_address = "{}-{}-{}".format(working_city_temp, working_district_temp, working_address_temp)

        # Company information.
        company_lagou_url = first('//dl[@class="job_company"]/dt/a/@href')
        company_name = stripped(first('//dl[@class="job_company"]/dt/a/div/h2/em/text()'))
        # Each feature is the text between its icon and the following <span>.
        # Company field.
        company_field = feature('<i class="icon-glyph-fourSquare"></i>(.*?)<span')
        # Financing status.
        financing_status = feature('<i class="icon-glyph-trend"></i>(.*?)<span')
        # Company head count.
        company_size = feature('<i class="icon-glyph-figure"></i>(.*?)<span')
        # Company home page (text of the link next to the icon).
        company_url = feature('<i class="icon-glyph-home"></i>.*?<a.*?>(.*?)</a>.*?<span')

        item = LagouFullItem(
            cate_name=response.meta.get("cate_name"),
            job_name=job_name,
            salary_range=salary_range,
            working_city=working_city,
            experience_required=experience_required,
            education_required=education_required,
            job_type=job_type,
            position_label=position_label,
            publish_time=publish_time,
            job_advantage=job_advantage,
            job_detail=job_detail,
            working_address=working_address,
            company_lagou_url=company_lagou_url,
            company_name=company_name,
            company_field=company_field,
            financing_status=financing_status,
            company_size=company_size,
            company_url=company_url
        )

        yield item

4. 中间件设置

针对拉勾的反爬,可以对 User-Agent 和代理进行设置。User-Agent 可以使用之前整理的 fake,也可以自定义一组数据后随机选取。代理方面可以自行搭建代理服务器:市面上的免费代理大多不太稳定,在大量抓取数据时效果可能不会很好。可以自己对代理的获取做一些配置,比如像这里这样定期更新代理、当代理不能用时将其打入黑名单,或者按照分数评级来分配代理等等。嫌麻烦的话,也可以去 GitHub 找一些大佬们写好的代理池,能用就 OK。

    def process_response(self, request, response, spider):
        """Pass healthy responses through; on a bad one rotate the proxy and retry.

        A response counts as bad when its status is not 200 or its URL
        contains "verify" (the original author's captcha check).

        Args:
            request: The request that produced ``response``.
            response: The downloaded response.
            spider: The running spider (unused).

        Returns:
            ``response`` unchanged when it is healthy, otherwise a copy of
            ``request`` with ``dont_filter=True`` so it is downloaded again.
        """
        # This hook must hand back a Response or a Request; the original fell
        # through and returned None for every healthy response.
        if response.status == 200 and "verify" not in response.url:
            return response

        # .get(): the request may carry no proxy, and logging must not raise.
        logger.warning(
            "Proxy {}, 链接 {} 出错, 状态码为 {}".format(request.meta.get('proxy'), request.url, response.status))
        self.lock.acquire()
        try:
            # A proxy not yet blacklisted is failing for the first time, so
            # rotate now; later failures of the same proxy skip the rotation.
            if request.meta.get('proxy') not in self.blacked_proxies:
                # NOTE(review): this blacklists self.proxy, not the proxy stored
                # on the request; confirm the two always match.
                self.blacked_proxies.add(self.proxy)
                logger.debug("blacked proxies: %s", self.blacked_proxies)
                self.user_agent = get_random_ua()
                self.proxy = get_random_ip()
        finally:
            # Release even if fetching a new UA/proxy raises.
            self.lock.release()
        request.meta["proxy"] = None
        # NOTE(review): setdefault() leaves an existing User-Agent untouched;
        # confirm that is intended rather than a forced reset.
        request.headers.setdefault('User-Agent', None)
        # NOTE(review): there is no retry limit on this path.
        return request.replace(dont_filter=True)

    def process_exception(self, request, exception, spider):
        """Rotate the proxy and retry when a download fails with a listed exception.

        Args:
            request: The request whose download raised.
            exception: The raised exception.
            spider: The running spider (unused).

        Returns:
            A copy of ``request`` with ``dont_filter=True`` when ``exception``
            is an instance of ``self.exception_list``; otherwise None.
        """
        # The original retried on EVERY exception because its return sat
        # outside this check, re-queuing unrelated errors without limit.
        if not isinstance(exception, self.exception_list):
            return None

        # .get(): the request may carry no proxy, and logging must not raise.
        logger.warning("Proxy {} 链接出错 {}".format(request.meta.get('proxy'), exception))
        self.lock.acquire()
        try:
            # A proxy not yet blacklisted is failing for the first time, so
            # rotate now; later failures of the same proxy skip the rotation.
            if request.meta.get('proxy') not in self.blacked_proxies:
                # NOTE(review): this blacklists self.proxy, not the proxy stored
                # on the request; confirm the two always match.
                self.blacked_proxies.add(self.proxy)
                logger.debug("blacked proxies: %s", self.blacked_proxies)
                self.user_agent = get_random_ua()
                self.proxy = get_random_ip()
        finally:
            # Release even if fetching a new UA/proxy raises.
            self.lock.release()
        request.meta["proxy"] = None
        # NOTE(review): setdefault() leaves an existing User-Agent untouched;
        # confirm that is intended rather than a forced reset.
        request.headers.setdefault('User-Agent', None)

        return request.replace(dont_filter=True)

5. 写入数据库,做后续操作

class LagouFullPipeline(object):
    """Item pipeline that inserts every scraped job into the ``positions`` table.

    One MySQL connection is opened on first use and reused for all items;
    it is closed in ``close_spider``. The original opened a new connection
    for every item and never closed it.
    """

    # Item keys in the column order of the INSERT statement below.
    COLUMNS = (
        'cate_name', 'job_name', 'salary_range', 'working_city',
        'experience_required', 'education_required', 'job_type',
        'position_label', 'publish_time', 'job_advantage', 'job_detail',
        'working_address', 'company_lagou_url', 'company_name',
        'company_field', 'financing_status', 'company_size', 'company_url',
    )

    # Shared connection; stays None until the first item arrives.
    _conn = None

    def _connection(self):
        """Return the open connection, creating it on first use."""
        if self._conn is None:
            # NOTE(review): credentials are hard-coded in source; move them to
            # project settings or the environment.
            self._conn = pymysql.connect(host="localhost", user="root", password="299521", port=3306, db='lagou_test')
        return self._conn

    def close_spider(self, spider):
        """Close the connection, if one was opened, when the spider finishes."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it unchanged.

        Args:
            item: Mapping that provides every key listed in ``COLUMNS``.
            spider: The running spider (unused).

        Returns:
            The same ``item``.

        Raises:
            KeyError: If ``item`` lacks one of the expected keys.
            Exception: Any database error, re-raised after a rollback.
        """
        sql = """insert into positions(cate_name, job_name, salary_range, working_city, experience_required, education_required,
              job_type,position_label,publish_time, job_advantage,  job_detail, working_address, company_lagou_url,
              company_name, company_field, financing_status,company_size, company_url ) 
              values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        # Read the values before touching the database so a malformed item
        # fails early.
        values = tuple(item[column] for column in self.COLUMNS)

        conn = self._connection()
        try:
            # The cursor is closed on leaving the block.
            with conn.cursor() as cur:
                cur.execute(sql, values)
            conn.commit()
        except Exception:
            # Discard the failed statement so the shared connection stays usable.
            conn.rollback()
            raise
        return item

点击查看更多内容

本文首次发布于慕课网 ,转载请注明出处,谢谢合作

1人点赞

若觉得本文不错,就分享一下吧!

评论

相关文章推荐

正在加载中
意见反馈 邀请有奖 帮助中心 APP下载
官方微信

举报

0/150
提交
取消