Using FormRequest to extract data via HTTP POST
Question
Hey guys,
I want to crawl the details of all the products on the site https://bitsclassic.com/fa/ with Scrapy.
To get the URLs of the products, I have to send a POST request to the web service https://bitsclassic.com/fa/Product/ProductList.
I tried this, but it produces no output!
How do I send the POST request?
import re

import scrapy
from scrapy import FormRequest

from ..items import BitsclassicItem  # adjust to your project's items module


class BitsclassicSpider(scrapy.Spider):
    name = "bitsclassic"
    start_urls = ['https://bitsclassic.com/fa']

    def parse(self, response):
        """
        This method is the default callback function that will be
        executed when the spider starts crawling the website.
        """
        category_urls = response.css('ul.children a::attr(href)').getall()[1:]
        for category_url in category_urls:
            yield scrapy.Request(category_url, callback=self.parse_category)

    def parse_category(self, response):
        """
        This method is the callback function for the category requests.
        """
        category_id = re.search(r"/(\d+)-", response.url).group(1)
        num_products = 1000

        # Create the form data for the POST request
        form_data = {
            'Cats': str(category_id),
            'Size': str(num_products)
        }

        # Send a POST request to retrieve the product list
        yield FormRequest(
            url='https://bitsclassic.com/fa/Product/ProductList',
            method='POST',
            formdata=form_data,
            callback=self.parse_page
        )

    def parse_page(self, response):
        """
        This method is the callback function for the product page requests.
        """
        # Extract data from the response using XPath or CSS selectors
        title = response.css('p[itemrolep="name"]::text').get()
        url = response.url
        categories = response.xpath('//div[@class="con-main"]//a/text()').getall()
        price = response.xpath('//div[@id="priceBox"]//span[@data-role="price"]/text()').get()

        # Process the extracted data
        if price is not None:
            price = price.strip()
            product_exist = True
        else:
            price = None
            product_exist = False

        # Create a new item with the extracted data
        item = BitsclassicItem()
        item["title"] = title.strip()
        item["categories"] = categories[3:-1]
        item["product_exist"] = product_exist
        item["price"] = price
        item["url"] = response.url
        item["domain"] = "bitsclassic.com/fa"

        # Yield the item to pass it to the next pipeline stage
        yield item
Is the way I'm making the request correct?
Answer 1
Score: 1
The request is fine.
You have a couple of other problems:

- The response you're getting from the form request is a JSON response, and you need to treat it as JSON rather than as an HTML response (see the short check sketched after this list).
- You only get the first item from each page. You need to use a for loop over the products.
- There are some things you can do to improve your code; I did some of them.
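A quick way to confirm the first point is to log what the endpoint actually returns. This is a minimal debugging sketch of a callback that could temporarily replace parse_page in the original spider; it assumes the payload is a JSON object, and the 'Html' key is an assumption about this particular endpoint, taken from the revised code below (response.json() requires Scrapy 2.2+):

    def parse_page(self, response):
        # The endpoint answers with JSON, not an HTML page, so CSS/XPath
        # selectors applied directly to `response` match nothing.
        self.logger.info("Content-Type: %s", response.headers.get("Content-Type"))
        data = response.json()  # parse the JSON body
        self.logger.info("Top-level JSON keys: %s", list(data))
        # Assumption: 'Html' holds the rendered product markup
        self.logger.info("Html present: %s", bool(data.get("Html")))

The full revised spider: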
import scrapy
from scrapy import FormRequest
from scrapy.http import HtmlResponse

from ..items import BitsclassicItem  # adjust to your project's items module


class BitsclassicSpider(scrapy.Spider):
    name = "bitsclassic"
    start_urls = ['https://bitsclassic.com/fa']

    def parse(self, response):
        """
        This method is the default callback function that will be
        executed when the spider starts crawling the website.
        """
        category_urls = response.css('ul.children a')
        for category in category_urls[1:]:
            category_url = category.css('::attr(href)').get()
            category_id = category.re(r"/(\d+)-")[0]
            yield scrapy.Request(category_url, callback=self.parse_category,
                                 cb_kwargs={'category_id': category_id})

    def parse_category(self, response, category_id):
        """
        This method is the callback function for the category requests.
        """
        # Create the form data for the POST request
        form_data = {
            'Cats': str(category_id),
            'Size': '12',  # products per page
        }
        page = 1
        form_data['Page'] = str(page)
        yield FormRequest(
            url='https://bitsclassic.com/fa/Product/ProductList',
            method='POST',
            formdata=form_data,
            callback=self.parse_page,
            cb_kwargs={'url': response.url, 'form_data': form_data, 'page': page}
        )

    def parse_page(self, response, url, form_data, page):
        """
        This method is the callback function for the product list requests.
        """
        json_data = response.json()
        if not json_data:
            return
        html = json_data.get('Html', '')
        if not html.strip():
            return

        # Wrap the HTML fragment from the JSON payload in an HtmlResponse
        # so the usual selectors work on it.
        html_res = HtmlResponse(url=url, body=html, encoding='utf-8')
        for product in html_res.xpath('//div[@itemrole="item"]'):
            # Use relative XPath (.//) so each field comes from the current
            # product, not from the first match in the whole document.
            title = product.css('span[itemrole="name"]::text').get(default='').strip()
            # you need to check how to get the categories
            # categories = product.xpath('.//div[@class="con-main"]//a/text()').getall()
            price = product.xpath('.//span[@class="price"]/text()').get(default='').strip()
            product_url = product.xpath('.//a[@itemrole="productLink"]/@href').get()

            # Create a new item with the extracted data
            item = BitsclassicItem()
            item["title"] = title
            # item["categories"] = categories[3:-1]
            item["product_exist"] = bool(price)
            item["price"] = price
            item["url"] = product_url
            item["domain"] = "bitsclassic.com/fa"
            yield item

        # Pagination: keep requesting the next page until the endpoint
        # returns an empty result.
        page += 1
        form_data['Page'] = str(page)
        yield FormRequest(
            url='https://bitsclassic.com/fa/Product/ProductList',
            method='POST',
            formdata=form_data,
            callback=self.parse_page,
            cb_kwargs={'url': url, 'form_data': form_data, 'page': page}
        )
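Both versions of the code reference BitsclassicItem without showing its definition. A minimal sketch of what that item class could look like, with the field names taken from the spider code above (the module path depends on your project layout):

import scrapy

class BitsclassicItem(scrapy.Item):
    # Fields populated by the spider above
    title = scrapy.Field()
    categories = scrapy.Field()
    product_exist = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()
    domain = scrapy.Field()

With the item defined, the spider can be run from the project directory with, for example, scrapy crawl bitsclassic -o products.json to append the scraped items to a JSON feed file.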