为了账号安全,请及时绑定邮箱和手机立即绑定
  • 有人知道数据溢出怎么办嘛,我按教程写的爬虫页面提示栈溢出
  • #-*-coding:utf-8-*-
    import json
    import time
    import multiprocessing
    import requests
    from lxml import etree
    
    class HandelLaGou(object):
        def __init__(self):
            self.lagou_session = requests.session()
            self.header = {
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.18 Safari/525.19'
            }
            self.city_list = ""
    
        def handle_city(self):
            city_url = "https://www.lagou.com/jobs/allCity.html"
            city_result = self.handle_request(method="GET", url=city_url)
            list = city_result.xpath('//ul[contains(@class, "city_list")]/li/a/text()')
            self.city_list = [x for x in list]
            self.lagou_session.cookies.clear()
    
        def handle_city_job(self, city):
            first_request_url = "https://www.lagou.com/jobs/list_python?&px=default&city=%s"%city
            first_response = self.handle_request(method="GET",url=first_request_url)
            try:
                total_page = first_response.xpath('//span[contains(@class, "span totalNum")]/text()')
                print(total_page)
            except:
                return
            else:
                for i in range(1, int(total_page[0])+1):
                    data = {
                        "pn":i,
                        "kd":"python"
                    }
                    print(i)
                    page_url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false"%city
                    referer_url = "https://www.lagou.com/jobs/list_python?&px=default&city=%s"%city
                    self.header['Referer'] = referer_url.encode('utf-8')
                    response = self.handle_request(method="POST", url=page_url, data=data, info=city)
                    lagou_data = json.loads(response)
                    job_list = lagou_data['content']['positionResult']['result']
                    for job in job_list:
                        print(job)
    
        def handle_request(self, method, url , data=None, info=None):
            while True:
                if method == "GET":
                    response = self.lagou_session.get(url=url, headers=self.header)
                    item = etree.HTML(response.text)
                elif method == "POST":
                    response = self.lagou_session.post(url=url, headers=self.header, data=data)
                    item = response.text
                if '频繁' in response.text:
                    self.lagou_session.cookies.clear()
                    first_request_url = "https://www.lagou.com/jobs/list_python?&px=default&city=%s" %info
                    self.handle_request(method="GET", url=first_request_url)
                    time.sleep(15)
                    continue
                return item
    
    if __name__ == '__main__':
        lagou = HandelLaGou()
        lagou.handle_city()
        pool = multiprocessing.Pool(2)
        for city in lagou.city_list:
            pool.apply_async(lagou.handle_city_job, args=(city,))
        pool.close()
        pool.join()


  • import requests
    from lxml import etree
    
    class HandelLaGou(object):
        def __init__(self):
            self.lagou_session = requests.session()
            self.hander = {
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.18 Safari/525.19'
            }
            self.city_list = ""
    
        def handle_city(self):
            city_url = "https://www.lagou.com/jobs/allCity.html"
            city_result = self.handle_request(method="GET", url=city_url)
            list = city_result.xpath('//ul[contains(@class, "city_list")]/li/a/text()')
            self.city_list = [x for x in list]
    
        def handle_city_job(self, city):
            
    
    
        def handle_request(self, method, url , data=None, info=None):
            if method == "GET":
                response = self.lagou_session.get(url=url, headers=self.hander)
                item = etree.HTML(response.text)
                return item
    
    
    if __name__ == '__main__':
        lagou = HandelLaGou()
        lagou.handle_city()
        print(lagou.city_list)


  • import re
    
    import requests
    #在此处设置取消警告信息
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    
    # 这是另一个大佬的办法,可惜过于复杂,我没看懂,简简单单才是真
    # requests.packages.urllib3.disable_warnings()
    # requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'
    # try:
    #     requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST += 'HIGH:!DH:!aNULL'
    # except AttributeError:
    #     # no pyopenssl support used / needed / available
    #     pass
    
    class Handle_Lagou(object):
        def __init__(self):
            #使用session保存cookies信息
            self.lagou_session = requests.session()
            self.header = {
                'Connection': 'close',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }
            self.city_list=""
        #获取全国所有城市列表的方法
        def handle_city(self):
            city_search = re.compile(r'zhaopin/">(.*?)</a>')
            city_url = "https://www.lagou.com/jobs/allCity.html"
            city_result = self.handle_request(method="GET",url=city_url)
            self.city_list = city_search.findall(city_result)
    
        def handle_request(self,method,url,data=None,info=None):
            if method == "GET":
                # 在此处设置verify = False
                response = self.lagou_session.get(url=url,headers=self.header,verify=False)
            return  response.text
    
    if __name__=='__main__':
        lagou = Handle_Lagou()
        lagou.handle_city()
        print(lagou.city_list)


举报

0/150
提交
取消

邀请2人参与享拼团价

  • ¥6.60

    优惠价

  • ¥1.00

    3人拼团

原价¥69.90

课程须知
必备基础:Python语法基础,函数,面向对象编程 Mysql数据库,了解前端,Flask框架基础
老师告诉你能学到什么?
1.数据抓取:Requests抓取岗位信息、多进程加速抓取、代理隐藏 2.数据存储:数据表设计、保存到Mysql数据库 3.数据可视化:快速掌握Echarts、Echarts绘制图形、Echarts生成云图
加群二维码
  • 慕课Python核心用户群
  • 群号:824631704
  • 付费用户专享
  • 技术学习型社群

微信扫码,参与3人拼团

意见反馈 帮助中心 APP下载
官方微信
友情提示:

您好,此课程属于迁移课程,您已购买该课程,无需重复购买,感谢您对慕课网的支持!