为了账号安全,请及时绑定邮箱和手机立即绑定

如何从一个链接中生成一个已解析的项目,并从同一项目列表中的其他链接生成其他已解析的项目

如何从一个链接中生成一个已解析的项目,并从同一项目列表中的其他链接生成其他已解析的项目

杨__羊羊 2021-12-26 15:11:01
问题是我一直在从一个地方列表中迭代来刮取纬度经度和海拔高度。问题是当我得到我刮回来的东西时,我无法将它与我当前的 df 链接起来,因为我迭代的名称可能已被修改或跳过。我设法得到了我所看到的名称,但由于它是从其他项目的链接外部解析的,因此无法正常工作。import scrapyimport pandas as pdfrom ..items import latlonglocItemdf = pd.read_csv('wine_df_final.csv')df = df[pd.notnull(df.real_place)]real_place = list(set(df.real_place))class latlonglocSpider(scrapy.Spider):    name = 'latlonglocs'    start_urls = []    for place in real_place:        baseurl =  place.replace(',', '').replace(' ', '+')        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'        start_urls.append(cleaned_href)    def parse(self, response):        items = latlonglocItem()        items['base_name'] = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():            if href.startswith('/url?q=https://www.distancesto'):                yield response.follow(href, self.parse_distancesto)            else:                pass        yield items    def parse_distancesto(self, response):        items = latlonglocItem()        try:            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()            yield items        except Exception:            pass#output appellation      base_name       elevation    latitude    longitude                  Chalone, USA Santa Cruz, USA                  56.81        35           9.23 发生的事情是我解析了我寻找的内容,然后它进入链接并解析其余信息。但是,显然在我的数据框中,我得到了与其余项目完全无关的名称,即使这样也很难找到匹配项。我希望将信息传递给另一个函数,以便将所有项目一起生成。
查看完整描述

2 回答

?
温温酱

TA贡献1752条经验 获得超4个赞

这样做可能有效。我在代码中加了一些注释,说明我做了什么,以及我对你想要实现的目标的理解。


import scrapy

import pandas as pd

from ..items import latlonglocItem



# Load the wine dataset and keep only the rows whose place is known.
df = pd.read_csv('wine_df_final.csv')
has_place = df['real_place'].notna()
df = df[has_place]

# Distinct place names to search for; going through a set removes
# duplicates, so the order of the resulting list is arbitrary.
real_place = [*set(df['real_place'])]



class latlonglocSpider(scrapy.Spider):
    """Search Google for each place and scrape its coordinates.

    One Google search URL is built per entry of the module-level
    ``real_place`` list. ``parse`` follows the result links that point at
    distancesto.com and hands the searched name to ``parse_distancesto``,
    so every coordinate item carries the name it was requested for.
    """

    name = 'latlonglocs'
    start_urls = []

    # Build one search URL per place: commas are dropped and spaces become
    # '+' so the name can be embedded in the query string.
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def __init__(self, *args, **kwargs):
        """Create the spider, forwarding all arguments to ``scrapy.Spider``."""
        super().__init__(*args, **kwargs)
        # Base name seen by the most recent parse() call. Only used as a
        # fallback when parse_distancesto() is called without ``base_name``;
        # it is shared by all in-flight requests, so it is not reliable on
        # its own when several search pages are processed concurrently.
        self.base_name = None

    def parse(self, response):
        """Handle a Google result page.

        Yields one request per distancesto.com result link, each carrying
        the searched name, followed by an item holding only ``base_name``.
        """
        # The searched name is whatever precedes ' coordinates' in the title.
        base_name = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        self.base_name = base_name

        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                # Attach the name to this specific request instead of relying
                # on spider-wide state that another response may overwrite.
                yield response.follow(
                    href,
                    self.parse_distancesto,
                    cb_kwargs={'base_name': base_name},
                )

        items = latlonglocItem()
        items['base_name'] = base_name
        yield items

    def parse_distancesto(self, response, base_name=None):
        """Handle a distancesto.com page and yield one complete item.

        ``base_name`` is the searched name passed along by ``parse``. When it
        is omitted, the spider-wide ``self.base_name`` (or an empty string)
        is used instead.
        """
        if base_name is None:
            base_name = self.base_name or ""

        items = latlonglocItem()
        try:
            items['base_name'] = base_name
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
        except Exception:
            # Still best-effort (no item for this page), but leave a trace
            # in the log instead of dropping the failure silently.
            self.logger.exception('Could not build an item from %s', response.url)
            return

        yield items


查看完整回答
反对 回复 2021-12-26
?
慕尼黑的夜晚无繁华

TA贡献1864条经验 获得超6个赞

import scrapy

import pandas as pd

from ..items import latlonglocItem



# Load the wine dataset and keep only the rows whose place is known.
df = pd.read_csv('wine_df_final.csv')
has_place = df['real_place'].notna()
df = df[has_place]

# Distinct place names to search for; going through a set removes
# duplicates, so the order of the resulting list is arbitrary.
real_place = [*set(df['real_place'])]



class latlonglocSpider(scrapy.Spider):
    """Search Google for each place and scrape its coordinates.

    One Google search URL is built per entry of the module-level
    ``real_place`` list. ``parse`` follows the result links that point at
    distancesto.com and hands the searched name to ``parse_distancesto``,
    which yields a single item combining that name with the scraped data.
    """

    name = 'latlonglocs'
    start_urls = []

    # Build one search URL per place: commas are dropped and spaces become
    # '+' so the name can be embedded in the query string.
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def __init__(self, *args, **kwargs):
        """Create the spider, forwarding all arguments to ``scrapy.Spider``."""
        super().__init__(*args, **kwargs)
        # Base name seen by the most recent parse() call. Only used as a
        # fallback when parse_distancesto() is called without ``base_name``;
        # it is shared by all in-flight requests, so it is not reliable on
        # its own when several search pages are processed concurrently.
        self.base_name = None

    def parse(self, response):
        """Handle a Google result page.

        Yields one request per distancesto.com result link; each request
        carries the searched name so the callback can store it in its item.
        """
        # The searched name is whatever precedes ' coordinates' in the title.
        # It is the same for every link on the page, so compute it once.
        base_name = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        self.base_name = base_name

        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                # Attach the name to this specific request instead of relying
                # on spider-wide state that another response may overwrite.
                yield response.follow(
                    href,
                    self.parse_distancesto,
                    cb_kwargs={'base_name': base_name},
                )

    def parse_distancesto(self, response, base_name=None):
        """Handle a distancesto.com page and yield one complete item.

        ``base_name`` is the searched name passed along by ``parse``. When it
        is omitted, the spider-wide ``self.base_name`` (or an empty string)
        is used instead.
        """
        if base_name is None:
            base_name = self.base_name or ""

        items = latlonglocItem()
        try:
            items['base_name'] = base_name
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
        except Exception:
            # Still best-effort (no item for this page), but leave a trace
            # in the log instead of dropping the failure silently.
            self.logger.exception('Could not build an item from %s', response.url)
            return

        yield items

要让这种写法正常工作,必须把并发请求数(CONCURRENT_REQUESTS)设置为 1,并且要在循环内部给 base_name 赋值。


查看完整回答
反对 回复 2021-12-26
  • 2 回答
  • 0 关注
  • 206 浏览
慕课专栏
更多

添加回答

举报

0/150
提交
取消
微信客服

购课补贴
联系客服咨询优惠详情

帮助反馈 APP下载

慕课网APP
您的移动学习伙伴

公众号

扫描二维码
关注慕课网微信公众号