无法使用 BeautifulSoup 抓取图像 url

我的抓取代码如下：

from bs4 import BeautifulSoup
import re

root_tag=["article",{"class":"story"}]
image_tag=["img",{"":""},"org-src"]
header=["h3",{"class":"story-title"}]
news_tag=["a",{"":""},"href"]
txt_data=["p",{"":""}]

import requests

ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
headers = {'User-Agent': ua2,
           'Accept': 'text/html,application/xhtml+xml,application/xml;' \
                     'q=0.9,image/webp,*/*;q=0.8'}
session = requests.Session()
response = session.get("website-link", headers=headers)
webContent = response.content
bs = BeautifulSoup(webContent, 'lxml')
all_tab_data = bs.findAll(root_tag[0], root_tag[1])
output=[]
for div in all_tab_data:
    image_url = None
    div_img = str(div)
    match = re.search(r"(http(s?):)([/|.|\w|\s|-])*\.(?:jpg|gif|png|jpeg)", div_img)
    print(match)
    # match = re.search(r"([^\\s]+(\\.(?i)(jpg|png|gif|bmp))$)",div)
    if match != None:
        image_url = str(match.group(0))
    else:
        image_url = div.find(image_tag[0], image_tag[1]).get(image_tag[2])
    if image_url !=None:
        if image_url[0] == '/' and image_url[1] != '/':
            image_url = main_url + image_url
        if image_url[0] == '/' and image_url[1] == '/':
            image_url="https://" + image_url[2:]
    output.append(image_url)

它只输出了一个 image_url，随后就报错：AttributeError: 'NoneType' object has no attribute 'get'

查看完整描述

1 回答

扬帆大鱼

TA贡献1799条经验获得超9个赞

建议直接复用解析库（BeautifulSoup）来提取图片标签，而不是用正则表达式自己解析这些片段。可以参考下面的做法：

# Collect the image URLs of every story <article> in a previously saved page.
#
# The HTML is read from the local file 'output' instead of being fetched on
# every run; the commented-out section below shows how that file was produced.
# The resulting URLs are accumulated in the module-level list `output`.

from bs4 import BeautifulSoup
import re

# Each selector is [tag name, attribute filter, (optional) attribute to read].
# NOTE(review): the {"": ""} filter looks like it is meant as "no attribute
# constraint" -- confirm it matches every tag with the installed bs4 version.
root_tag = ["article", {"class": "story"}]
image_tag = ["img", {"": ""}, "org-src"]
header = ["h3", {"class": "story-title"}]
news_tag = ["a", {"": ""}, "href"]
txt_data = ["p", {"": ""}]

# One-off download step that created the 'output' file read below.
# import requests
# ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
# ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
# headers = {'User-Agent': ua2,
#            'Accept': 'text/html,application/xhtml+xml,application/xml;' \
#                      'q=0.9,image/webp,*/*;q=0.8'}
# session = requests.Session()
# response = session.get("https://www.reuters.com/energy-environment", headers=headers)
# webContent = response.content
# file = open('output', 'wb')
# file.write(webContent)
# file.close()

# The context manager closes the file even if reading fails.
with open('output', 'r') as file:
    webContent = file.read()

bs = BeautifulSoup(webContent, 'html.parser')
all_tab_data = bs.find_all(*root_tag)

output = []
for div in all_tab_data:
    # Search the already-parsed article tag directly; serialising it and
    # parsing it a second time is unnecessary.
    # Only the tag name and attribute filter are passed: unpacking the whole
    # selector would hand "org-src" to find_all() as its `recursive` argument.
    for img in div.find_all(image_tag[0], image_tag[1]):
        # Images without the attribute are skipped, so an article with no
        # usable image never triggers an AttributeError on None.
        image_url = img.get(image_tag[2])
        if image_url is not None:
            output.append(image_url)

反对回复 2023-07-18

热搜

最近搜索清空

无法使用 BeautifulSoup 抓取图像 url

无法使用 BeautifulSoup 抓取图像 url

1 回答

添加回答