
The crawler fails when run. The output is only:

craw 1 : https://baike.baidu.com/item/Python/407313?fr=aladdin
craw failed


qq_str_6 2018-04-01 15:40:20
Using Python 2.7.

# spider_main.py (main module)
# coding: utf8
from baike_spider import url_manager, html_downloader, html_outputer, \
    html_parser


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s ' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count = count + 1
            except:
                print 'craw failed'
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)


# url_manager.py
# coding: utf8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


# html_parser.py
# coding: utf8
import re
import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # /item/Python/407313?fr=aladdin
        links = soup.find_all('a', href=re.compile(r"/item/Python/\d+\?fr=aladdin"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_date(self, page_url, soup):
        res_date = {}
        # url
        res_date['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_date['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_date['summary'] = summary_node.get_text()
        return res_date

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
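One detail that stands out: parse calls self._get_new_data(page_url, soup), but the helper above is defined as _get_new_date, so every call to parse raises AttributeError. Because craw in spider_main.py wraps the whole loop body in a bare except, that error only surfaces as the "craw failed" line. Below is a small, self-contained debugging sketch (Python 2, not part of the original code) showing how printing the traceback inside the except block reveals the real exception:

# coding: utf8
# Debugging sketch: a bare except hides the real error (here, a misspelled
# method name); traceback.print_exc() shows exactly what failed and where.
import traceback


class Demo(object):
    def _get_new_date(self):            # defined with ..._date
        return {}

    def parse(self):
        return self._get_new_data()     # called with ..._data -> AttributeError


if __name__ == "__main__":
    try:
        Demo().parse()
    except:
        traceback.print_exc()           # e.g. AttributeError: ... has no attribute '_get_new_data'
        print 'craw failed'

Adding traceback.print_exc() (or logging the exception) inside the except block of craw would point directly at the failing line instead of hiding it.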
# html_downloader.py
# coding: utf8
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
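spider_main.py also imports html_outputer and calls collect_data and output_html on it, but that module is not shown above. For reference, here is a minimal sketch of an outputer matching that interface (Python 2); the output.html filename, the table layout, and the utf-8 encoding of the fields are assumptions rather than the original code:

# coding: utf8
# html_outputer.py -- minimal sketch matching the collect_data / output_html
# interface used by SpiderMain; filename and HTML layout are assumptions.


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        # data is the dict built by the parser: {'url', 'title', 'summary'}
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write('<html><body><table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')
        fout.write('</table></body></html>')
        fout.close()

The None guard in collect_data simply mirrors the defensive checks used elsewhere in the code (for example in add_new_url).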

No answers yet.
