global name 'BeautifulSoup' is not defined
#coding:utf8
from bs4 import BeautifulSoup
import urlparse
import re
class HtmlParser(object):
    """Extract follow-up links and entry data from a downloaded HTML page."""

    # Matches hrefs containing "/item/"; compiled once instead of per call.
    _ITEM_HREF_RE = re.compile(r"/item/(.*)")

    def parse(self, page_url, html_cont):
        """Parse one page and return a ``(new_urls, new_data)`` pair.

        Args:
            page_url: URL the page was downloaded from; used to resolve
                relative links and stored in the result under ``'url'``.
            html_cont: Raw HTML of the page.

        Returns:
            A 2-tuple ``(new_urls, new_data)``. ``new_urls`` is a set of
            absolute URLs; ``new_data`` is a dict with the keys ``'url'``,
            ``'title'`` and ``'summary'``. When either argument is ``None``
            the result is ``(set(), None)``.
        """
        if page_url is None or html_cont is None:
            # Always hand back a 2-tuple: callers unpack the result directly
            # (``new_urls, new_data = parser.parse(...)``), and a bare None
            # would raise TypeError there.
            return set(), None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute URLs of all ``/item/`` links in *soup*.

        Each matching ``href`` is resolved against *page_url*, so relative
        links become full URLs.
        """
        links = soup.find_all('a', href=self._ITEM_HREF_RE)
        return set(urlparse.urljoin(page_url, link['href']) for link in links)

    def _get_new_data(self, page_url, soup):
        """Collect the page URL, title and summary text into a dict.

        A title or summary node that is not present in the page yields an
        empty string for that key instead of raising ``AttributeError``.
        """
        res_data = {'url': page_url}

        # The title is the <h1> nested inside the title <dd>; either of the
        # two lookups returns None when the node is absent.
        title_box = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        title_node = title_box.find("h1") if title_box is not None else None
        res_data['title'] = self._node_text(title_node)

        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = self._node_text(summary_node)
        return res_data

    @staticmethod
    def _node_text(node):
        """Return the text of *node*, or an empty string when it is None."""
        return node.get_text() if node is not None else u""


# Original poster's note (translated): I added `from bs4 import BeautifulSoup`
# to html_parser, but it still raises the error. Strangely, the earlier
# BeautifulSoup test program runs without any problem. Could someone help
# figure out what is going on?
# NOTE(review): the traceback shows source lines that do not belong to the
# `parse` frame it reports, which suggests the interactive session is still
# running a copy of this module imported before the import line was added --
# restarting the session or reloading the module should be tried; confirm.
错误如下:
NameError                                 Traceback (most recent call last)
/Users/yang/PythonSource/pachong/spider_main.py in <module>()
34 root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
35 obj_spider = SpiderMain()
---> 36 obj_spider.craw(root_url)
37
/Users/yang/PythonSource/pachong/spider_main.py in craw(self, root_url)
21 html_cont = self.downloader.download(new_url)
22 print new_url
---> 23 new_urls, new_data = self.parser.parse(new_url, html_cont)
24 self.urls.add_new_urls(new_urls)
25 self.outputer.collect_data(new_data)
/Users/yang/PythonSource/pachong/html_parser.py in parse(self, page_url, html_cont)
31 title_node = soup.find('dd',class_= "lemmaWgt-lemmaTitle-title").find("h1")
32 res_data['title'] = title_node.get_text()
---> 33
34 summary_node = soup.find('div', class_ = "lemma-summary")
35 res_data['summary'] = summary_node.get_text()
NameError: global name 'BeautifulSoup' is not defined