英文:
(Scrapy) How to pass a variable to the ItemLoader
问题
我正在学习 Scrapy,在跟着教程操作时遇到了一个问题,一直找不到解决办法。
我有这个 Item:
import scrapy
class ChocolateProduct(scrapy.Item):
    """Container for the data scraped for a single chocolate product."""

    name = scrapy.Field()   # product title
    price = scrapy.Field()  # price text
    url = scrapy.Field()    # link to the product page
这是ItemLoader:
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader that cleans up the scraped product fields."""

    # Collapse every field to a single value instead of a list.
    default_output_processor = TakeFirst()

    # Keep only the text that follows the last pound sign.
    price_in = MapCompose(lambda raw_price: raw_price.split('£')[-1])

    # Prefix each link with the site root (hard-coded in this version).
    url_in = MapCompose(lambda href: 'https://www.chocolate.co.uk' + href)
以及这个爬虫(spider):
import scrapy
from urllib.parse import urlparse
from scrapeops_guide.itemsloaders import ChocolateProductLoader
from scrapeops_guide.items import ChocolateProduct
class ChocolateSpider(scrapy.Spider):
    """Crawl the chocolate.co.uk catalogue and yield one item per product."""

    name = 'chocolate_spider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response, **kwargs):
        """Yield a loaded ``ChocolateProduct`` per product, then follow pagination."""
        # Site root (scheme + host) of the current page. It is computed here
        # but not handed to the loader, which still hard-codes its own prefix.
        parsed = urlparse(response.url)
        url = f'{parsed.scheme}://{parsed.netloc}'

        # The regex captures whatever follows the visually hidden
        # "Sale price" label inside the price span.
        price_pattern = (
            '<span class="price">\n '
            '<span class="visually-hidden">Sale price'
            '</span>(.*)</span>'
        )

        for product in response.css('product-item'):
            loader = ChocolateProductLoader(
                item=ChocolateProduct(),
                selector=product
            )
            loader.add_css('name', 'a.product-item-meta__title::text')
            loader.add_css('price', 'span.price', re=price_pattern)
            loader.add_css('url', 'div.product-item-meta a::attr(href)')
            yield loader.load_item()

        # Keep crawling while the page advertises a "next" link.
        next_link = response.css('a[rel="next"]::attr(href)').get()
        if next_link:
            yield response.follow(next_link, callback=self.parse)
我想做的是不在 ItemLoader 中使用固定字符串,而是让它变成动态的。例如,用爬虫(spider)中的值(url)替换 ItemLoader 中的固定字符串(url_in)。
我已经尝试向 ChocolateProductLoader 传递一个参数,该参数可以在 Loader 的 context 属性中找到,但我无法在 Loader 内部访问它。我也愿意尝试其他能达到同样效果的方法。
英文:
I'm learning about Scrapy and can't find a solution for a question that I have following a tutorial.
I have this Item:
import scrapy
class ChocolateProduct(scrapy.Item):
    """Container for the data scraped for a single chocolate product."""

    name = scrapy.Field()   # product title
    price = scrapy.Field()  # price text
    url = scrapy.Field()    # link to the product page
This ItemLoader:
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader that cleans up the scraped product fields."""

    # Collapse every field to a single value instead of a list.
    default_output_processor = TakeFirst()

    # Keep only the text that follows the last pound sign.
    price_in = MapCompose(lambda raw_price: raw_price.split('£')[-1])

    # Prefix each link with the site root (hard-coded in this version).
    url_in = MapCompose(lambda href: 'https://www.chocolate.co.uk' + href)
And this spider:
import scrapy
from urllib.parse import urlparse
from scrapeops_guide.itemsloaders import ChocolateProductLoader
from scrapeops_guide.items import ChocolateProduct
class ChocolateSpider(scrapy.Spider):
    """Crawl the chocolate.co.uk catalogue and yield one item per product."""

    name = 'chocolate_spider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response, **kwargs):
        """Yield a loaded ``ChocolateProduct`` per product, then follow pagination."""
        # Site root (scheme + host) of the current page. It is computed here
        # but not handed to the loader, which still hard-codes its own prefix.
        parsed = urlparse(response.url)
        url = f'{parsed.scheme}://{parsed.netloc}'

        # The regex captures whatever follows the visually hidden
        # "Sale price" label inside the price span.
        price_pattern = (
            '<span class="price">\n '
            '<span class="visually-hidden">Sale price'
            '</span>(.*)</span>'
        )

        for product in response.css('product-item'):
            loader = ChocolateProductLoader(
                item=ChocolateProduct(),
                selector=product
            )
            loader.add_css('name', 'a.product-item-meta__title::text')
            loader.add_css('price', 'span.price', re=price_pattern)
            loader.add_css('url', 'div.product-item-meta a::attr(href)')
            yield loader.load_item()

        # Keep crawling while the page advertises a "next" link.
        next_link = response.css('a[rel="next"]::attr(href)').get()
        if next_link:
            yield response.follow(next_link, callback=self.parse)
What I'm trying to do is avoid using a fixed string inside the ItemLoader and make it dynamic instead. For example, replace the fixed string in the ItemLoader (url_in) with the value from the spider (url).
I already tried passing a parameter to the ChocolateProductLoader that is available in the context attribute of the Loader but I'm unable to access it inside the Loader. I'm open to other ways to achieve the same solution.
答案1
得分: 0
我找到了一个解决方案。我发现,如果在 ItemLoader 中定义一个以对应处理器名称(这里是 url_in)命名的方法,它就会被执行。因此,我在创建 Loader 实例时加入了 url 参数:
# Build the loader for one product node. The extra `url` keyword is what the
# loader later reads back via self.context.get("url").
chocolate = ChocolateProductLoader(
item=ChocolateProduct(),
selector=product,
url=f'{url.scheme}://{url.netloc}'  # assumes `url` is a urlparse() result here — TODO confirm
)
然后,我更改了ItemLoader的代码如下:
from urllib.parse import urlparse
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader whose ``url`` input processor reads its base from the context.

    The spider passes ``url=...`` when it instantiates the loader; that value
    is read back here through ``self.context``.
    """

    # Collapse every field to a single value instead of a list.
    default_output_processor = TakeFirst()

    # Keep only the text that follows the last pound sign.
    price_in = MapCompose(lambda x: x.split('£')[-1])

    def url_in(self, values):
        """Resolve every collected link against the ``url`` stored in the context.

        Args:
            values: iterable of href strings collected for the ``url`` field.

        Returns:
            A list with one resolved URL per input value.
        """
        # Imported locally so this block does not depend on the file header.
        from urllib.parse import urljoin

        # A missing context value falls back to '' so hrefs pass through
        # unchanged instead of being prefixed with the text "None".
        base_url = self.context.get('url') or ''

        # urljoin leaves already-absolute hrefs alone, which plain string
        # concatenation would corrupt.
        return [urljoin(base_url, value) for value in values]
我的最终代码如下:
Item:
无更改
ItemLoader:
from urllib.parse import urlparse
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader whose ``url`` input processor resolves links against the page URL.

    The spider passes ``url=response.url`` when it instantiates the loader;
    that value is read back here through ``self.context``.
    """

    # Collapse every field to a single value instead of a list.
    default_output_processor = TakeFirst()

    # Keep only the text that follows the last pound sign.
    price_in = MapCompose(lambda x: x.split('£')[-1])

    def url_in(self, values):
        """Resolve every collected link against the ``url`` stored in the context.

        Args:
            values: iterable of href strings collected for the ``url`` field.

        Returns:
            A list with one resolved URL per input value.
        """
        # Imported locally so this block does not depend on the file header.
        from urllib.parse import urljoin

        # A missing context value falls back to '' so hrefs pass through
        # unchanged; urlparse(None) would have produced a "b''://b''" prefix.
        page_url = self.context.get('url') or ''

        # urljoin handles root-relative, page-relative and already-absolute
        # hrefs, whereas scheme+netloc concatenation only handles the first.
        return [urljoin(page_url, value) for value in values]
Spider:
import scrapy
from scrapeops_guide.itemsloaders import ChocolateProductLoader
from scrapeops_guide.items import ChocolateProduct
class ChocolateSpider(scrapy.Spider):
    """Crawl the chocolate.co.uk catalogue and yield one item per product."""

    name = 'chocolate_spider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response, **kwargs):
        """Yield a loaded ``ChocolateProduct`` per product, then follow pagination."""
        # The regex captures whatever follows the visually hidden
        # "Sale price" label inside the price span.
        price_pattern = (
            '<span class="price">\n '
            '<span class="visually-hidden">Sale price'
            '</span>(.*)</span>'
        )

        for product in response.css('.product-item'):
            # The page URL travels to the loader as an extra keyword argument.
            loader = ChocolateProductLoader(
                item=ChocolateProduct(),
                selector=product,
                url=response.url
            )
            loader.add_css('name', 'a.product-item-meta__title::text')
            loader.add_css('price', 'span.price', re=price_pattern)
            loader.add_css('url', 'div.product-item-meta a::attr(href)')
            yield loader.load_item()

        # Keep crawling while the page advertises a "next" link.
        next_link = response.css('a[rel="next"]::attr(href)').get()
        if next_link:
            yield response.follow(next_link, callback=self.parse)
希望这些代码能够满足您的需求。
英文:
I found a solution. I realized that if I define a method on the ItemLoader with the name of the processor I want (here, url_in), it gets executed. So I added the url argument to my Loader instance:
# Build the loader for one product node. The extra `url` keyword is what the
# loader later reads back via self.context.get("url").
chocolate = ChocolateProductLoader(
item=ChocolateProduct(),
selector=product,
url=f'{url.scheme}://{url.netloc}'  # assumes `url` is a urlparse() result here — TODO confirm
)
and the ItemLoader code I changed to this:
from urllib.parse import urlparse
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader whose ``url`` input processor reads its base from the context.

    The spider passes ``url=...`` when it instantiates the loader; that value
    is read back here through ``self.context``.
    """

    # Collapse every field to a single value instead of a list.
    default_output_processor = TakeFirst()

    # Keep only the text that follows the last pound sign.
    price_in = MapCompose(lambda x: x.split('£')[-1])

    def url_in(self, values):
        """Resolve every collected link against the ``url`` stored in the context.

        Args:
            values: iterable of href strings collected for the ``url`` field.

        Returns:
            A list with one resolved URL per input value.
        """
        # Imported locally so this block does not depend on the file header.
        from urllib.parse import urljoin

        # A missing context value falls back to '' so hrefs pass through
        # unchanged instead of being prefixed with the text "None".
        base_url = self.context.get('url') or ''

        # urljoin leaves already-absolute hrefs alone, which plain string
        # concatenation would corrupt.
        return [urljoin(base_url, value) for value in values]
My final code looks like this:
Item:
No changes
ItemLoader:
from urllib.parse import urlparse
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader whose ``url`` input processor resolves links against the page URL.

    The spider passes ``url=response.url`` when it instantiates the loader;
    that value is read back here through ``self.context``.
    """

    # Collapse every field to a single value instead of a list.
    default_output_processor = TakeFirst()

    # Keep only the text that follows the last pound sign.
    price_in = MapCompose(lambda x: x.split('£')[-1])

    def url_in(self, values):
        """Resolve every collected link against the ``url`` stored in the context.

        Args:
            values: iterable of href strings collected for the ``url`` field.

        Returns:
            A list with one resolved URL per input value.
        """
        # Imported locally so this block does not depend on the file header.
        from urllib.parse import urljoin

        # A missing context value falls back to '' so hrefs pass through
        # unchanged; urlparse(None) would have produced a "b''://b''" prefix.
        page_url = self.context.get('url') or ''

        # urljoin handles root-relative, page-relative and already-absolute
        # hrefs, whereas scheme+netloc concatenation only handles the first.
        return [urljoin(page_url, value) for value in values]
Spider:
import scrapy
from scrapeops_guide.itemsloaders import ChocolateProductLoader
from scrapeops_guide.items import ChocolateProduct
class ChocolateSpider(scrapy.Spider):
    """Crawl the chocolate.co.uk catalogue and yield one item per product."""

    name = 'chocolate_spider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response, **kwargs):
        """Yield a loaded ``ChocolateProduct`` per product, then follow pagination."""
        # The regex captures whatever follows the visually hidden
        # "Sale price" label inside the price span.
        price_pattern = (
            '<span class="price">\n '
            '<span class="visually-hidden">Sale price'
            '</span>(.*)</span>'
        )

        for product in response.css('.product-item'):
            # The page URL travels to the loader as an extra keyword argument.
            loader = ChocolateProductLoader(
                item=ChocolateProduct(),
                selector=product,
                url=response.url
            )
            loader.add_css('name', 'a.product-item-meta__title::text')
            loader.add_css('price', 'span.price', re=price_pattern)
            loader.add_css('url', 'div.product-item-meta a::attr(href)')
            yield loader.load_item()

        # Keep crawling while the page advertises a "next" link.
        next_link = response.css('a[rel="next"]::attr(href)').get()
        if next_link:
            yield response.follow(next_link, callback=self.parse)
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论