为了账号安全,请及时绑定邮箱和手机立即绑定

为什么这段Python爬虫代码跑不通?

/ 猿问

为什么这段Python爬虫代码跑不通?

zhouss 2018-08-17 14:56:52
import html_downloader
import html_parser
import html_outputer
class SpiderMain(object):	
def __init__(self):		
self.urls = url_manager.UrlManager()		
self.downloader = html_downloader.HtmlDownloader()		
self.parser = html_parser.HtmlParser()		
self.outputer = html_outputer.HtmlOuputer()	
def craw(self, root_url):		
count = 1		
self.urls.add_new_url(root_url)		
while self.urls.has_new_url():			
try:				
new_url = self.urls.get_new_url()				
print('craw %d : %s' %(count, new_url))				
html_cont = self.downloader.download(new_url)				
new.urls, new_data = self.parser.parser(new_url, html_cont)				
self.urls.add_new_url(new_urls)				
self.outputer.collect_data(new_data)				
if count == 1000:					
break				
count = count + 1			
except:				
print('craw failed')		
self.outputer.output_html()
if__name__ == '__main__'
root_url = 'https://baike.baidu.com/item/python'
obj_spider = SpiderMain()
obj_spider.craw(root_url)


import urllib.requestclass HtmlDownloader(object):	
def downloadn(self, url):		
if url is None:			
return None		
response = urllib.request.urlopen(url)		
if response.getcode() != 200:			
return None		
return response.read()		


class UrlManager(object):	
def __init__(self):		
self.new_urls = set()		
self.old_urls = set()			
def add_new_url(self, url):		
if url is None:			
return		
if url not in self.new_urls and url not in self.old_urls:			
self.new_urls.add(url)				
def add_new_url(self, url):		
if urls is None or len(urls) == 0:			
return		
for url in urls:			
self.add_new_url(url)				
def has_new_url(self):		
return len(self.new_urls) != 0			
def get_new_url(self):		
new_url = self.new_urls.pop()		
self.old_urls.add(new_url)		
return new_url


from bs4 import BeautifulSoup
import re
import urllib.parse
class HtmlParser(object):	
def _get_new_urls(self, page_url, soup):		
new_url = ser()		
links = soup.find_all('a', href=re.compile(r'/item/'))		
for link in links:			
new_url = link['href']			
new_full_url = urllib.parse.urljoin(page_url, new_url)			
new.urls.add(new_full_url)		
return new_urls			
def _get_new_data(self, page_url, soup):		
res_data = {}				
# url		
res_data['url'] = page_url				
title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')		
res_data['title'] = title_node.get_text()				
# lemma-summary		
summary_node = soup.find('div", class_='lemma-summary')		
res_data['summary'] = summary_node.get_text()				
retrun res_data			
def parser(self, page_url, html_cont):		
if page_url is None or html_cont is None:			
retrun					
soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')		
new_urls = self._get_new_urls(page_url, soup)		
new_data = self._get_new_data(page_url, soup)		
retrun new_urls, new_data	


class HtmlOutputer(object):	
def __init__(self):		
self.datas = []			
def collect_data(self, data):		
if data is None:			
retrun		
self.datas.append(data)			
def output_html(self):		
fout = open('output.html', 'w', encoding='utf-8')				
fout.write('<html>')		
fout.write('<head><meta http-equiv=\'content-type\' content=\'text/html;charset=utf-8\'></head>')		fout.write('<body>')		fout.write('<table>')				for data in self.datas:			fout.write('<tr>')			fout.write('<td>%s</td>' % data['url'])			fout.write('<td>%s</td>' % data['summary'])			fout.write('</tr>')					fout.write('</table>')		fout.write('</body>')		fout.write('</html>')		fout.close()


查看完整描述

1 回答

?
鱼右

你这么贴代码, 别人怎么看呢

查看完整回答
反对 回复 2018-08-31

添加回答

回复

举报

0/150
提交
取消
意见反馈 帮助中心 APP下载
官方微信