Python 怎么调用 bs4？
2016-05-09
__author__ = 'xray'
# coding: utf8
from bs4 import BeautifulSoup
import re, urlparse
class HtmlParser(object):
    """Extract follow-up links and page data from a fetched HTML page.

    The selectors are hard-coded for one page layout: links whose href
    matches ``/view/<digits>.html``, a ``dd.LemmaWgt-LemmaTitle-title``
    element wrapping the ``h1`` title, and a ``div.lemma-summary`` element.
    """

    def get_new_urls(self, page_url, soup):
        """Collect the absolute URLs of all matching links in *soup*.

        :param page_url: URL of the page being parsed; every href is
            resolved against it with ``urljoin``.
        :param soup: parsed document exposing ``find_all``.
        :return: set of URL strings; empty when no link matches.
        """
        links = soup.find_all('a', href=re.compile(r'/view/\d+\.html'))
        return {urlparse.urljoin(page_url, link['href']) for link in links}

    def get_new_data(self, page_url, soup):
        """Build the data record for one page.

        :param page_url: URL of the page being parsed.
        :param soup: parsed document exposing ``find``.
        :return: dict with the keys ``'url'``, ``'title'`` and
            ``'summary'``. A field whose element is not found is set to
            ``''`` instead of raising ``AttributeError``.
        """
        res_data = {'url': page_url}

        # The title lives in an <h1> nested inside the title container;
        # either level may be absent on pages with a different layout.
        title_box = soup.find('dd', class_='LemmaWgt-LemmaTitle-title')
        title_node = title_box.find('h1') if title_box is not None else None
        res_data['title'] = self._node_text(title_node)

        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = self._node_text(summary_node)
        return res_data

    def parse(self, page_url, html_cont):
        """Parse *html_cont* and return ``(new_urls, new_data)``.

        :param page_url: URL the content was fetched from.
        :param html_cont: raw HTML of the page.
        :return: tuple of the link set and the data dict, or ``None`` when
            either argument is ``None``.
        """
        if page_url is None or html_cont is None:
            return None

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self.get_new_urls(page_url, soup)
        new_data = self.get_new_data(page_url, soup)
        return new_urls, new_data

    @staticmethod
    def _node_text(node):
        """Return the text of *node*, or ``''`` when *node* is ``None``."""
        return node.get_text() if node is not None else ''