Scrapy chain of requests to combine items from multiple requests
Question
I am trying to combine the item with its `similarIdeas` field, which is a list. Right now I am using `requests` to get the data, but I need to yield those requests by chaining them and yield one single item, and I am not sure how to do it correctly. Here is my code using the `requests` module:
```python
import scrapy
from scrapy.selector import Selector
import json
import math
import requests
class HouzzScraper(scrapy.Spider):
name = "houzz"
# custom settings
custom_settings = {
"LOG_FILE": "houzz_spider.log",
"IMAGES_STORE": "houzz_images",
"FEEDS": {
"houzz.json": {
"format": "json",
}
},
"ITEM_PIPELINES": {
"houzz_crawler.pipelines.HouzzImagePipeline": 1,
},
}
headers = {
"authority": "www.houzz.com",
"accept": "*/*",
"accept-language": "en,ru;q=0.9",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"origin": "https://www.houzz.com",
"referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
"rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
"sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
"x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
"x-hz-request": "true",
"x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
"x-ol-exp-name": "Photo - View",
"x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
"x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-ol-product": "Houzz",
"x-ol-product-variant": "Houzz US",
"x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-requested-with": "XMLHttpRequest",
}
cookies = {
"v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
"vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
"_gcl_au": "1.1.17413922.1683311086",
"crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
"_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
"_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
"g_state": '{"i_p":1684144918349,"i_l":3}',
"browseResultSetGridWidth": "554",
"_gid": "GA1.2.1176067560.1683652076",
"ln_or": "eyIzODE1NzE2IjoiZCJ9",
"_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
"jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
"documentWidth": "1318",
"_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
"_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
"_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
"IR_gbd": "houzz.com",
"IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
"_ga": "GA1.2.1658927820.1683311086",
"_dc_gtm_UA-3519678-1": "1",
"_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
"hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
}
base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_ideas
)
def parse_ideas(self, response):
ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
total_photos = int(
response.css("span.hz-top-pagination__text ::text")
.extract()[4]
.replace(",", "")
)
photos_per_page = int(
response.css("span.hz-top-pagination__text ::text").extract()[2]
)
for idea in ideas:
yield scrapy.Request(
url=idea, headers=self.headers, callback=self.parse_project_url
)
def parse_project_url(self, response):
data = response.css('script[id="hz-ctx"] ::text').get()
json_data = json.loads(data)
space_id = json_data["data"]["pageContentData"]["spaceId"]
space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
project_id = space["projectId"]
space_url = space["url"]
raw_project_url = (
space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
)
project_url = raw_project_url + "~" + str(project_id)
yield scrapy.Request(
url=project_url, headers=self.headers, callback=self.parse_project_idea
)
def parse_project_idea(self, response):
idea_board = response.css(
"div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
).extract()
for idea_link in idea_board:
yield scrapy.Request(
url=idea_link,
dont_filter=True,
headers=self.headers,
callback=self.parse_idea_details,
)
def parse_idea_details(self, response):
item = {}
item["ideaUrl"] = response.url
item["Title"] = response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get()
subtitle = response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get()
item["subTitle"] = subtitle
item["spaceDescription"] = response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get()
item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
item["Tags"] = [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
]
item["starRating"] = len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
)
item["numberOfReviews"] = response.css(
"span.hz-star-rate__review-string::text"
).get()
item["image_urls"] = response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract()
item["similarIdeas"] = []
spaceId = response.url.split("~")[-1]
data = {
"spaceId": spaceId,
"fromItem": "0",
"itemsPerPage": "10",
"contentDescriptor": '{"t":1,"et":12,"id":6258114}',
}
resp = requests.post(
self.similar_ideas_api_url,
cookies=self.cookies,
headers=self.headers,
data=data,
)
data = resp.json()["spaceData"]["spaces"]
space_keys = list(data.keys())
space_urls = [data[key]["url"] for key in space_keys]
for s_url in space_urls:
space_response = requests.get(url=s_url, headers=self.headers)
similar_space = Selector(text=space_response.text)
item["similarIdeas"].append(
{
"ideaUrl": space_response.url,
"Title": similar_space.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get(),
"SubTitle": similar_space.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get(),
"uploadedBy": similar_space.css(
"div.vph-owner-info__details ::text"
).get(),
"Tags": [
{"tag": t}
for t in similar_space.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
],
"starRating": len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
),
"numberOfReviews": response.css(
"span.hz-star-rate__review-string::text"
).get(),
"image_urls": similar_space.css(
"div.view-photo-image-pane > img::attr(src)"
).extract(),
}
)
yield item
```
I tried to join the requests by adding the `item` to `meta` in the requests, but it didn't work as expected: it only collects one `similarIdeas` entry when there should be at least 24 or 25, and it produces a lot of duplicates even though I haven't set `dont_filter=True`. Here is my code trying to chain the requests:
```python
import scrapy
from scrapy.selector import Selector
import json
import math
import requests
class HouzzSimilar(scrapy.Spider):
name = "houzz_s"
# custom settings
custom_settings = {
"LOG_FILE": "houzz_spider.log",
"IMAGES_STORE": "houzz_images",
"FEEDS": {
"houzz.json": {
"format": "json",
}
},
"ITEM_PIPELINES": {
"houzz_crawler.pipelines.HouzzImagePipeline": 1,
},
}
headers = {
"authority": "www.houzz.com",
"accept": "*/*",
"accept-language": "en,ru;q=0.9",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"origin": "https://www.houzz.com",
"referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
"rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
"sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
"x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
"x-hz-request": "true",
"x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
"x-ol-exp-name": "Photo - View",
"x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
"x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-ol-product": "Houzz",
"x-ol-product-variant": "Houzz US",
"x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-requested-with": "XMLHttpRequest",
}
cookies = {
"v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
"vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
"_gcl_au": "1.1.17413922.1683311086",
"crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
"_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
"_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
"g_state": '{"i_p":1684144918349,"i_l":3}',
"browseResultSetGridWidth": "554",
"_gid": "GA1.2.1176067560.1683652076",
"ln_or": "eyIzODE1NzE2IjoiZCJ9",
"_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
"jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
"documentWidth": "1318",
"_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
"_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
"_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
"IR_gbd": "houzz.com",
"IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
"_ga": "GA1.2.1658927820.1683311086",
"_dc_gtm_UA-3519678-1": "1",
"_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
"hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
}
base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_ideas
)
def parse_ideas(self, response):
ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
total_photos = int(
response.css("span.hz-top-pagination__text ::text")
.extract()[4]
.replace(",", "")
)
photos_per_page = int(
response.css("span.hz-top-pagination__text ::text").extract()[2]
)
for idea in ideas:
yield scrapy.Request(
url=idea, headers=self.headers, callback=self.parse_project_url
)
def parse_project_url(self, response):
data = response.css('script[id="hz-ctx"] ::text').get()
json_data = json.loads(data)
space_id = json_data["data"]["pageContentData"]["spaceId"]
space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
project_id = space["projectId"]
space_url = space["url"]
raw_project_url = (
space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
)
project_url = raw_project_url + "~" + str(project_id)
yield scrapy.Request(
url=project_url, headers=self.headers, callback=self.parse_project_idea
)
def parse_project_idea(self, response):
idea_board = response.css(
"div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
).extract()
for idea_link in idea_board:
yield scrapy.Request(
url=idea_link,
headers=self.headers,
callback=self.parse_idea_details,
)
def parse_idea_details(self, response):
item = {}
item["ideaUrl"] = response.url
item["Title"] = response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get()
subtitle = response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get()
item["subTitle"] = subtitle
item["spaceDescription"] = response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get()
item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
item["Tags"] = [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
]
item["starRating"] = len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
)
item["numberOfReviews"] = response.css(
"span.hz-star-rate__review-string::text"
).get()
item["image_urls"] = response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract()
item["similarIdeas"] = []
spaceId = response.url.split("~")[-1]
body = f"spaceId={spaceId}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"
yield scrapy.Request(
url=self.similar_ideas_api_url,
method="POST",
cookies=self.cookies,
headers=self.headers,
body=body,
meta={"item": item},
callback=self.get_similar_ideas_urls,
)
def get_similar_ideas_urls(self, response):
data = response.json()["spaceData"]["spaces"]
space_keys = list(data.keys())
space_urls = [data[key]["url"] for key in space_keys]
item = response.meta.get("item")
yield scrapy.Request(
url=space_urls[0],
headers=self.headers,
meta={"item": item, "space_urls": space_urls[1:]},
callback=self.parse_similar_ideas,
)
def parse_similar_ideas(self, response):
item = response.meta.get("item")
space_urls = response.meta.get("space_urls")
item["similarIdeas"].append(
{
"ideaUrl": response.url,
"Title": response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get(),
"subTitle": response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get(),
"spaceDescription": response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get(),
"uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
"Tags": [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
],
"starRating": len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
),
"numberOfReviews": response.css(
"span.hz-star-rate__review-string::text"
).get(),
"image_urls": response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract(),
}
)
if len(space_urls) > 0:
yield scrapy.Request(
url=space_urls.pop(0),
headers=self.headers,
meta={"item": item, "space_urls": space_urls[1:]},
callback=self.parse_similar_ideas,
)
yield item
```
My expected output: https://jsoneditoronline.org/#left=cloud.6a9b829e90014b55975756556c3d0f2d
Answer 1
Score: 1
Your second example is really close. There are just a couple of things I would recommend, and one thing that is missing:

- You should use `cb_kwargs` to pass data between callback methods instead of the `meta` dict. Both will work, but `cb_kwargs` is what Scrapy recommends in this situation, and I believe it makes the code more readable and requires fewer lines (a condensed sketch of the pattern appears just before the full example below).
- When running your second example I ran into quite a few situations where the duplicates filter was triggered. When chaining requests like this, a single filtered request will likely mean the item is never yielded. To avoid this, make your `space_urls` variable a set instead of a list so that each URL is unique, and also add the `dont_filter` argument to the requests in your `parse_similar_ideas` method.
- The last and most important point is that you are yielding an item at the end of every call to `parse_similar_ideas`, which means you yield the same item once for every single URL in the `space_urls` list, with the only difference being the number of entries in its `similarIdeas` field. What you actually want is to yield the item only once there are no more URLs left in `space_urls`, so that it is yielded a single time at the very end of the chain.
The example below implements the points above and produces the output you are expecting. You will want to add your `custom_settings` back to the example, though; that block is repeated after the code for reference.
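Boiled down, the pattern looks roughly like the minimal sketch below before the full spider. It is a hypothetical, stripped-down example: the `ChainSpider` name, the `a.detail` selector, and the example.com URL are placeholders, not part of your Houzz code.

```python
import scrapy


class ChainSpider(scrapy.Spider):
    """Hypothetical minimal spider showing the chain-and-accumulate pattern."""

    name = "chain_example"
    start_urls = ["https://example.com/listing"]  # placeholder start page

    def parse(self, response):
        # Build the base item and collect the URLs whose data should be merged into it.
        item = {"ideaUrl": response.url, "similarIdeas": []}
        urls = {
            response.urljoin(href)
            for href in response.css("a.detail::attr(href)").getall()
        }
        if not urls:
            yield item
            return
        yield scrapy.Request(
            urls.pop(),
            callback=self.parse_detail,
            cb_kwargs={"item": item, "urls": urls},  # cb_kwargs instead of meta
            dont_filter=True,  # keep the chain alive even if the URL was already seen
        )

    def parse_detail(self, response, item=None, urls=None):
        # Append this page's data to the item carried along the chain.
        item["similarIdeas"].append({"ideaUrl": response.url})
        if urls:
            # More URLs left: request the next one and pass the partial item forward.
            yield scrapy.Request(
                urls.pop(),
                callback=self.parse_detail,
                cb_kwargs={"item": item, "urls": urls},
                dont_filter=True,
            )
        else:
            # Chain exhausted: yield the finished item exactly once.
            yield item
```

The important part is that `parse_detail` only yields the item once the set of remaining URLs is empty; every earlier call just appends to the item and re-yields a request carrying it forward in `cb_kwargs`.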
```python
import scrapy
import json
class HouzzSimilar(scrapy.Spider):
name = "houzz"
headers = {
"authority": "www.houzz.com",
"accept": "*/*",
"accept-language": "en,ru;q=0.9",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"origin": "https://www.houzz.com",
"referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
"rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
"sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
"x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
"x-hz-request": "true",
"x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
"x-ol-exp-name": "Photo - View",
"x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
"x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-ol-product": "Houzz",
"x-ol-product-variant": "Houzz US",
"x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-requested-with": "XMLHttpRequest",
}
cookies = {
"v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
"vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
"_gcl_au": "1.1.17413922.1683311086",
"crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
"_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
"_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
"g_state": '{"i_p":1684144918349,"i_l":3}',
"browseResultSetGridWidth": "554",
"_gid": "GA1.2.1176067560.1683652076",
"ln_or": "eyIzODE1NzE2IjoiZCJ9",
"_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
"jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
"documentWidth": "1318",
"_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
"_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
"_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
"IR_gbd": "houzz.com",
"IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
"_ga": "GA1.2.1658927820.1683311086",
"_dc_gtm_UA-3519678-1": "1",
"_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
"hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
}
base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_ideas
)
def parse_ideas(self, response):
ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
total_photos = int(
response.css("span.hz-top-pagination__text ::text")
.extract()[4]
.replace(",", "")
)
photos_per_page = int(
response.css("span.hz-top-pagination__text ::text").extract()[2]
)
for idea in ideas:
yield scrapy.Request(
url=idea, headers=self.headers, callback=self.parse_project_url
)
def parse_project_url(self, response):
data = response.css('script[id="hz-ctx"] ::text').get()
json_data = json.loads(data)
space_id = json_data["data"]["pageContentData"]["spaceId"]
space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
project_id = space["projectId"]
space_url = space["url"]
raw_project_url = (
space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
)
project_url = raw_project_url + "~" + str(project_id)
yield scrapy.Request(
url=project_url, headers=self.headers, callback=self.parse_project_idea
)
def parse_project_idea(self, response):
idea_board = response.css(
"div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
).extract()
for idea_link in idea_board:
yield scrapy.Request(
url=idea_link,
headers=self.headers,
callback=self.parse_idea_details,
)
def parse_idea_details(self, response):
item = {}
item["ideaUrl"] = response.url
item["Title"] = response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get()
subtitle = response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get()
item["subTitle"] = subtitle
item["spaceDescription"] = response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get()
item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
item["Tags"] = [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
]
item["starRating"] = len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
)
item["numberOfReviews"] = response.css(
"span.hz-star-rate__review-string::text"
).get()
item["image_urls"] = response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract()
item["similarIdeas"] = []
spaceId = response.url.split("~")[-1]
body = f"spaceId={spaceId}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"
yield scrapy.Request(
url=self.similar_ideas_api_url,
method="POST",
cookies=self.cookies,
headers=self.headers,
body=body,
cb_kwargs={"item": item}, # <-- cb_kwargs
callback=self.get_similar_ideas_urls,
)
def get_similar_ideas_urls(self, response, item=None):
data = response.json()["spaceData"]["spaces"]
space_keys = list(data.keys())
space_urls = set([data[key]["url"] for key in space_keys]) # <- set
yield scrapy.Request(
url=space_urls.pop(),
headers=self.headers,
cb_kwargs={"item": item, "space_urls": space_urls},
callback=self.parse_similar_ideas,
)
def parse_similar_ideas(self, response, item=None, space_urls=None):
item["similarIdeas"].append(
{
"ideaUrl": response.url,
"Title": response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get(),
"subTitle": response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get(),
"spaceDescription": response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get(),
"uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
"Tags": [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
],
"starRating": len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
),
"numberOfReviews": response.css(
"span.hz-star-rate__review-string::text"
).get(),
"image_urls": response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract(),
}
)
if len(space_urls) > 0:
yield scrapy.Request(
url=space_urls.pop(),
headers=self.headers,
cb_kwargs={"item": item, "space_urls": space_urls},
dont_filter=True, # <--- add this
callback=self.parse_similar_ideas,
)
else: # <--- this was the piece you were missing
yield item
```
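For reference, the `custom_settings` block mentioned above is the one from your original spiders; paste it back into the class body:

```python
# custom_settings copied from the original spiders in the question;
# place this inside the HouzzSimilar class body.
custom_settings = {
    "LOG_FILE": "houzz_spider.log",
    "IMAGES_STORE": "houzz_images",
    "FEEDS": {
        "houzz.json": {
            "format": "json",
        }
    },
    "ITEM_PIPELINES": {
        "houzz_crawler.pipelines.HouzzImagePipeline": 1,
    },
}
```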