Crawling all images from news.baidu.com
Why does my crawler only fetch four images, when the page visibly contains many more?
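Most likely because urllib only receives the initial static HTML: many of the images on news.baidu.com are inserted by JavaScript after the page loads, or kept in attributes such as data-src for lazy loading, so the static HTML carries only a handful of real src attributes. A minimal sketch to check this hypothesis, independent of the script below (the data-src attribute name is an assumption about how the page defers its images):

# Count the img tags present in the static HTML that urllib actually
# receives -- no JavaScript runs here, so lazy-loaded images are missing.
import urllib.request
from bs4 import BeautifulSoup

html = urllib.request.urlopen('http://news.baidu.com').read()
soup = BeautifulSoup(html, 'html.parser')
print(len(soup.find_all('img', src=True)))            # img tags with a real src
print(len(soup.find_all(attrs={'data-src': True})))   # lazy-load candidates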
import re
import urllib.request
from bs4 import BeautifulSoup

# Download the page: fetch the raw HTML for the given url
def getHtmlContent(url):
    print('Fetching page content')
    page = urllib.request.urlopen(url)
    # Return the raw bytes; BeautifulSoup detects the encoding itself
    return page.read()

# Parse the URLs of all jpg images out of the HTML
def getJPGs(html):
    soup = BeautifulSoup(html, 'html.parser')
    print('Collecting all img tags')
    jpgs = soup.find_all('img', src=re.compile(r'.+\..+'))
    newJpgs = []
    print('Iterating over the img src values:')
    for jpg in jpgs:
        print(jpg['src'], '\n')
        if jpg['src'].find('http') == -1:
            # Protocol-relative src such as //img.baidu.com/...:
            # prepend http: to make it a full URL
            print('src contains no http://, prepending http:')
            print('http:' + jpg['src'])
            newJpgs.append('http:' + jpg['src'])
        else:
            newJpgs.append(jpg['src'])
    print('The resulting newJpgs list:')
    print(newJpgs)
    return newJpgs

# Download a single image by its URL and save it under the specified file name
def downloadJPGs(imgUrl, fileName):
    print(imgUrl)
    print(fileName)
    urllib.request.urlretrieve(imgUrl, fileName)

# Batch-download the images and save them locally
# (the target directory must already exist)
def batchDownLoadJPGs(imgUrls, fileName='F:/python/baidu_news_jpg/'):
    for count, imgUrl in enumerate(imgUrls, 1):
        print('Downloading JPG image number', count)
        downloadJPGs(imgUrl, ''.join([fileName, '{0}.jpg'.format(count)]))
    print('All images on the page have been downloaded')

# Wrapper: download all images from one page
def download(url):
    html = getHtmlContent(url)
    imgUrls = getJPGs(html)
    batchDownLoadJPGs(imgUrls)

def main():
    #url = 'http://ent.ifeng.com/a/20180911/43100024_0.shtml?_zbs_baidu_news#p=1'
    #url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%202.7%20%20format'
    url = 'http://news.baidu.com'
    download(url)

if __name__ == '__main__':
    main()
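If the goal really is every image the browser shows, the page has to be rendered first so the JavaScript-inserted src attributes exist in the HTML. A minimal sketch using selenium (assuming selenium and a matching ChromeDriver are installed; this is not part of the original script):

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://news.baidu.com')
html = driver.page_source   # HTML after JavaScript has executed
driver.quit()
# This html can then be fed into getJPGs() from the script above,
# where the same regex filter will now see the lazy-loaded images too.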