-
Sample practice code:

# coding:utf-8
from bs4 import BeautifulSoup
import re

# The quoted passage comes from http://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree;
# because of the note length limit, the short text cannot be pasted here directly
html_doc = ""

soup = BeautifulSoup(html_doc, 'html.parser', from_encoding="utf-8")

print u"Get all links"
links = soup.find_all('a')
for link in links:
    print link.name, link['href'], link.get_text()

print u"Get the lacie link"
link_node = soup.find('a', href="http://example.com/lacie")
print link_node.name, link_node['href'], link_node.get_text()

print u"Regex matching"
link_node = soup.find('a', href=re.compile(r"ill"))
print link_node.name, link_node['href'], link_node.get_text()

print u"Get the p paragraph text"
p_node = soup.find('p', class_="title")
print p_node.name, p_node.get_text()
-
Web page parser:
Purpose:
(1) Extract the new list of URLs to be crawled;
(2) Parse out the valuable data.
Two approaches:
1. Regular-expression matching (fuzzy matching on the string);
2. Structured parsing (treat the document as an object and parse the whole content as a DOM tree):
(1) html.parser
(2) Beautiful Soup
(3) lxml
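The note only names lxml; as a small illustration of the same structured-parsing idea, here is a minimal sketch (it assumes the third-party lxml package is installed, and the HTML string is made up for the example):

from lxml import etree  # third-party package: pip install lxml

# a made-up document standing in for a downloaded page
html = "<html><body><a href='http://example.com/a'>A</a></body></html>"
tree = etree.HTML(html)  # parse the document into a DOM tree
for a in tree.xpath("//a"):  # structured lookup instead of string matching
    print(a.get("href"), a.text)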
-
Python 2's urllib2.urlopen has been discontinued in Python 3.
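The Python 3 replacement lives in urllib.request; a minimal equivalent call looks like this:

import urllib.request

# urllib.request.urlopen replaces Python 2's urllib2.urlopen
response = urllib.request.urlopen("http://www.baidu.com")
print(response.getcode())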
-
Python 3:
# coding:utf-8
import urllib.request
from http import cookiejar
url = "http://www.baidu.com"
print("第一种方法")
response1 = urllib.request.urlopen(url)
print(response1.getcode())
print(len(response1.read()))
print("第二种方法")
request = urllib.request.Request(url)
request.add_header("user-agent", "Mozilla/5.0")
response2 = urllib.request.urlopen(request)
print(response2.getcode())
print(len(response2.read()))
print("第三种方法")
cj= cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)
response3 = urllib.request.urlopen(url)
print(response3.getcode())
print(cj)
print(len(response3.read()))
-
This version is written in Python 3 and works.
import urllib.request
import http.cookiejar

url = "http://www.baidu.com"
print("--------------------------------------first------------------------------------------------------------")
response1 = urllib.request.urlopen(url)
print(response1.getcode())
print(len(response1.read()))
print("---------------------------------------second---------------------------------------------------------")
request = urllib.request.Request(url)
request.add_header("user-agent", "Mozilla/5.0")
response2 = urllib.request.urlopen(request)
print(response2.getcode())
print(len(response2.read()))
print("--------------------------------------third-----------------------------------------------------------")
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)
response3 = urllib.request.urlopen(url)
print(response3.getcode())
print(len(response3.read()))
print(cj)
-
Runs successfully under Python 3.7:
from urllib import request
from http import cookiejar
url = "http://www.baidu.com"
print("方法1")
response1 = request.urlopen(url)
print(response1.getcode())
print(len(response1.read()))
print("\n方法2")
rq = request.Request(url)
rq.add_header("agent", "Mozilla/5.0")
response2 = request.urlopen(rq)
print(response2.getcode())
print(len(response2.read()))
print("\n方法3")
cj = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cj))
request.install_opener(opener)
response3 = request.urlopen(url)
print(response3.getcode())
print(cj)
print(len(response3.read()))
-
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup  # import the BeautifulSoup page-parsing library
import re  # import the regular-expression library

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# 1. Create the BeautifulSoup object
soup = BeautifulSoup(html_doc,       # the HTML document string
                     'html.parser',  # the HTML parser
                     from_encoding='utf-8'  # the encoding of the HTML document
                     )

# 2. Search for nodes (find_all, find)
print('Get all links')
links = soup.find_all('a')
for link in links:
    # 3. Access node content
    print(link.name, link['href'], link.get_text())

print('Get the Lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('Regex matching')
link_node = soup.find('a', href=re.compile(r"ill"))
print(link_node.name, link_node['href'], link_node.get_text())

print('Get the p paragraph text')
link_node = soup.find('p', class_="title")
print(link_node.name, link_node.get_text())
-
The video is very good. Following the code from the video and looking into the multithreading libraries threading and Queue, I put together a multithreaded version: https://github.com/dengshilong/baike_spider/blob/master/spider_thread.py
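For reference, a minimal sketch of the producer/consumer pattern such a version is built on (Python 3 names: threading plus queue.Queue; the worker logic here is illustrative, not the code from the linked repository):

import threading
import queue
import urllib.request

url_queue = queue.Queue()
url_queue.put("http://www.baidu.com")

def worker():
    while True:
        try:
            url = url_queue.get(timeout=3)  # give up once the queue stays empty
        except queue.Empty:
            return
        try:
            html = urllib.request.urlopen(url).read()
            print(url, len(html))  # real code would parse out new URLs here
        except Exception as e:
            print(url, "failed:", e)
        finally:
            url_queue.task_done()

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()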
Python 3 crawler example source code: https://github.com/fifths/python_baike_spider.git
-
The reason the first record appears and the second one fails:
links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
Baidu Baike has changed its pages.
We only need to update the crawl rule: change view to item and drop the /\d+\.htm part, and it will match the new URLs.
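Concretely, the updated rule would look something like this (a sketch; the exact pattern depends on the current page structure, and the HTML string here is made up for illustration):

import re
from bs4 import BeautifulSoup

# a made-up snippet standing in for a downloaded Baidu Baike page
html = '<a href="/item/Python">Python</a><a href="/view/123.htm">old</a>'
soup = BeautifulSoup(html, 'html.parser')

# old rule, matching the retired URL scheme:
# links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))

# updated rule: "view" becomes "item" and the \d+\.htm part is dropped
links = soup.find_all('a', href=re.compile(r'/item/'))
print([link['href'] for link in links])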
I consolidated the instructor's several files into a single .py, which makes it easier to crawl other sites.
#!/usr/bin/python
#coding=utf-8
import urllib2, re, urlparse
from bs4 import BeautifulSoup

start_string = """var spider_vue = new Vue({
el:'#spider',
data:{
items:[
"""
end_string = """]}})"""

def craw(root_url):
    new_urls = set()
    old_urls = set()
    count = 0
    fout = open('js/spider-vue.js', "w+")
    fout.write(start_string)
    new_urls.add(root_url)
    while new_urls is not None and len(new_urls) != 0 and count < 5:
        count += 1
        url = new_urls.pop()
        old_urls.add(url)
        urls, new_data = parse(url)
        for url in urls:
            if url not in new_urls and url not in old_urls:
                new_urls.add(url)
        fout.write("""{
title:'%s',
url:'%s',
summary:'%s'
}""" % (new_data['title'].encode('utf-8'),
        new_data['url'].encode('utf-8'),
        '\\n'.join((new_data['summary'].encode('utf-8').split('\n'))[0:-1])))
        print '\n'.join((new_data['summary'].encode('utf-8').split('\n'))[0:-1])
        if count < 5:
            fout.write(",\n")
    fout.write(end_string)

# <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
def parse(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    try:
        _as = soup.find_all('a', href=re.compile(r'(.*)/item/(.*)'))
    except BaseException, e:
        print e
    p_urls = []
    for a in _as:
        p_urls.append(urlparse.urljoin(root_url, a['href']))
    data = {'url': url}
    data['title'] = soup.select("dd.lemmaWgt-lemmaTitle-title h1")[0].get_text()
    data['summary'] = soup.select("div.lemma-summary")[0].get_text()
    return p_urls, data

def main():
    craw(root_url)

root_url = "http://baike.baidu.com/item/Python"
main()
-
In Python 3.x, the urllib and urllib2 libraries were merged into a single urllib library:
urllib2.urlopen() became urllib.request.urlopen()
urllib2.Request() became urllib.request.Request()
-
Crawler scheduler: starts the crawler, stops the crawler, and monitors how the crawler is running.
URL manager: manages the URLs to be crawled and the URLs already crawled; a URL waiting to be crawled can be taken out and handed to the page downloader.
Page downloader: downloads the page the URL points to, stores it as a string, and passes it to the page parser.
Page parser: parsing a page yields (1) valuable data and (2) the URLs each page contains pointing to other pages, which once parsed out can be fed back into the URL manager.
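To make the URL manager's bookkeeping concrete, here is a minimal sketch using two sets (the class and method names are illustrative, not necessarily the course's exact code):

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # only queue URLs we have never seen before
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        url = self.new_urls.pop()  # take one URL out for the downloader
        self.old_urls.add(url)     # remember it so it is not crawled twice
        return url

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/item/Python")
print(manager.get_new_url())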
urllib2 download method 2: submit the data the user needs to send to the server.
Use urllib2's Request method to build a request object from the url; the request object can carry data and HTTP headers to the server. Then pass the request object to the urlopen method to submit the download request:

import urllib2

request = urllib2.Request(url)  # create the Request object
request.add_data('a', '1')  # add data
request.add_header('User-Agent', 'Mozilla/5.0')  # add an HTTP header; here we masquerade as a Mozilla browser
response = urllib2.urlopen(request)  # send the request and get the result

urllib2 download method 3: add handlers for special scenarios.
HTTPCookieProcessor: for pages that require a user login to access;
ProxyHandler: for pages that require a proxy to access;
HTTPSHandler: for pages served over the HTTPS encryption protocol;
HTTPRedirectHandler: for pages that automatically redirect between URLs.
Pass these handlers to urllib2's build_opener(handler) method to create an opener object, then install it with urllib2's install_opener(opener) method; urllib2 then has the ability to handle these scenarios. After that, call urllib2's urlopen(url) method on a url or request as usual to download the page. For example, to add cookie handling:

import urllib2, cookielib

cj = cookielib.CookieJar()  # create a cookie container
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))  # create an opener
urllib2.install_opener(opener)  # install the opener for urllib2
response = urllib2.urlopen("http://www.baidu.com/")  # access the page with cookie-enabled urllib2
-
from bs4 import BeautifulSoup
from urllib import request
import re

print(BeautifulSoup)
url = 'http://www.baidu.com'
response1 = request.urlopen(url)
print(response1.getcode())
content = str(response1.read(), 'utf-8')
soup = BeautifulSoup(
    content,
    'html.parser',
    from_encoding='utf8'
)
print(soup)
# find all nodes whose tag is a
a = soup.find_all('a')
print(a)
# find all nodes whose tag is a and whose class is XXX
a1 = soup.find_all('a', class_='toindex')
print(a1)
# find all nodes whose tag is a and whose text is 登录 ("log in")
a2 = soup.find_all('a', string='登录')
print(a2)
# find all nodes whose tag is a and whose text matches the regex
a3 = soup.find_all('a', string=re.compile(r'百'))
print(a3)
-
http://www.imooc.com/opus/resource?opus_id=1932&tree=imooc%2Fbaike_spider