3 回答

TA贡献1848条经验 获得超10个赞
稍加修改的方法应该可以让您从该站点获得所有所需的内容,而不会出现任何问题。您需要做的就是在 get_links() 方法中将所有目标链接收集为列表,并通过 return 或 yield 发出以 get_inner_content() 为回调的请求。您还可以禁用图像加载以使脚本稍快一些。
以下尝试应该为您提供所有结果:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.crawler import CrawlerProcess
class FortuneSpider(scrapy.Spider):
    """Render the Fortune 500 list page with Selenium, collect every company
    detail link, then scrape each company's CEO name from its detail page.
    """

    name = 'fortune'
    url = 'http://fortune.com/fortune500/list/'

    def start_requests(self):
        # Disable image loading (content-setting value 2 = block) so the
        # browser renders pages faster.
        option = webdriver.ChromeOptions()
        chrome_prefs = {}
        option.experimental_options["prefs"] = chrome_prefs
        chrome_prefs["profile.default_content_settings"] = {"images": 2}
        chrome_prefs["profile.managed_default_content_settings"] = {"images": 2}
        self.driver = webdriver.Chrome(options=option)
        self.wait = WebDriverWait(self.driver, 10)
        yield scrapy.Request(self.url, callback=self.get_links)

    def get_links(self, response):
        # Load the JS-driven listing page in the browser, wait for the result
        # anchors, and hand every detail-page URL back to Scrapy.
        self.driver.get(response.url)
        item_links = [
            item.get_attribute("href")
            for item in self.wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR,
                 '[class*="searchResults__title--"] a[class*="searchResults__cellWrapper--"]')))
        ]
        return [scrapy.Request(link, callback=self.get_inner_content) for link in item_links]

    def get_inner_content(self, response):
        # Render the detail page and read the CEO cell of its data table.
        self.driver.get(response.url)
        chief_executive = self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//tr[td[.="CEO"]]//td[contains(@class,"dataTable__value--")]/div'))).text
        yield {'CEO': chief_executive}

    def closed(self, reason):
        # BUGFIX: the Chrome driver created in start_requests() was never shut
        # down, leaking a browser process per run. Scrapy calls closed() when
        # the spider finishes.
        self.driver.quit()
if __name__ == "__main__":
    # Run the spider stand-alone through Scrapy's in-process crawler.
    crawler = CrawlerProcess()
    crawler.crawl(FortuneSpider)
    crawler.start()
或使用yield:
def get_links(self, response):
    """Render the listing page and yield one Request per company link."""
    self.driver.get(response.url)
    anchors = self.wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR,
         '[class*="searchResults__title--"] a[class*="searchResults__cellWrapper--"]')))
    for anchor in anchors:
        yield scrapy.Request(anchor.get_attribute("href"), callback=self.get_inner_content)

TA贡献1863条经验 获得超2个赞
要从网页 https://fortune.com/fortune500/search/ 解析不同公司 CEO 的姓名,仅使用 Selenium 本身就足够了。您需要:
滚动到网页上的最后一项。
收集href属性并存储在列表中。
将焦点切换到新打开的选项卡,并配合 WebDriverWait 使用 visibility_of_element_located() 等待目标元素可见。
您可以使用以下Locator Strategies:
代码块:
# -*- coding: UTF-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Open the Fortune 500 search page, collect every company link, then open
# each link in a new tab and print the CEO name from its data table.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get("https://fortune.com/fortune500/search/")
# Scroll to the footer marker so the lazily-loaded result list is fully rendered.
driver.execute_script("arguments[0].scrollIntoView(true);", WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//span[text()='Explore Lists from Other Years']"))))
my_hrefs = [my_elem.get_attribute("href") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//a[starts-with(@class, 'searchResults__cellWrapper--') and contains(@href, 'fortune500')][.//span/div]")))]
windows_before = driver.current_window_handle
for my_href in my_hrefs:
    # Open each company page in a fresh tab, scrape it, then close the tab.
    driver.execute_script("window.open('" + my_href + "');")
    WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
    windows_after = driver.window_handles
    new_window = [x for x in windows_after if x != windows_before][0]
    # BUGFIX: driver.switch_to_window() was deprecated and removed in
    # Selenium 4; use driver.switch_to.window() (as already done below for
    # the parent window).
    driver.switch_to.window(new_window)
    print(WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//table/tbody/tr//td[starts-with(@class, 'dataTable__value')]/div"))).text)
    driver.close()  # close the window
    driver.switch_to.window(windows_before)  # switch_to the parent_window_handle
driver.quit()
控制台输出:
C. Douglas McMillon
Darren W. Woods
Timothy D. Cook
Warren E. Buffett
Jeffrey P. Bezos
David S. Wichmann
Brian S. Tyler
Larry J. Merlo
Randall L. Stephenson
Steven H. Collis
Michael K. Wirth
James P. Hackett
Mary T. Barra
W. Craig Jelinek
Larry Page
Michael C. Kaufmann
Stefano Pessina
James Dimon
Hans E. Vestberg
W. Rodney McMullen
H. Lawrence Culp Jr.
Hugh R. Frater
Greg C. Garland
Joseph W. Gorder
Brian T. Moynihan
Satya Nadella
Craig A. Menear
Dennis A. Muilenburg
C. Allen Parker
Michael L. Corbat
Gary R. Heminger
Brian L. Roberts
Gail K. Boudreaux
Michael S. Dell
Marc Doyle
Michael L. Tipsord
Alex Gorsky
Virginia M. Rometty
Brian C. Cornell
Donald H. Layton
David P. Abney
Marvin R. Ellison
Robert H. Swan
Michel A. Khalaf
David S. Taylor
Gregory J. Hayes
Frederick W. Smith
Ramon L. Laguarta
Juan R. Luciano
.
.

TA贡献1824条经验 获得超5个赞
以下是如何在不使用 Selenium 的情况下更快、更轻松地获取公司详细信息的方法。
请看我是如何获取 company_name 和 change_the_world 的;其他详细信息也可以用同样的方式提取。
import requests
from bs4 import BeautifulSoup
import re
import html
# Query Fortune's JSON search API directly (no browser needed), then pull each
# company's CEO out of the preloaded state embedded in its detail page.
with requests.Session() as session:
    listing = session.get("https://content.fortune.com/wp-json/irving/v1/data/franchise-search-results?list_id=2611932")
    for item in listing.json()[1]["items"]:
        # Each record exposes its attributes as a list of {key, value} pairs;
        # take the first entry matching the wanted key.
        fields = item["fields"]
        company_name = html.unescape(next(f["value"] for f in fields if f["key"] == 'name'))
        change_the_world = next(f["value"] for f in fields if f["key"] == 'change-the-world-y-n')
        detail_page = session.get(item["permalink"])
        preload_data = BeautifulSoup(detail_page.text, "html.parser").select_one("#preload").text
        ceo = re.search('"ceo","value":"(.*?)"', preload_data).group(1)
        print(f"Company: {company_name}, CEO: {ceo}, Change The World: {change_the_world}")
结果:
公司:Carvana,首席执行官:Ernest C. Garcia,Change The World:否
公司:ManTech International,首席执行官:Kevin M. Phillips,Change The World:否
公司:NuStar Energy,首席执行官:Bradley C. Barron,Change The World:否
公司:Shutterfly,首席执行官:Ryan O'Hara,Change The World:否
公司:Spire,首席执行官:Suzanne Sitherwood,Change The World:否
公司:Align Technology,首席执行官:Joseph M. Hogan,Change The World:否
公司:Herc Holdings,首席执行官:Lawrence H. Silber,Change The World:否
...
添加回答
举报