How can I fix the 'arrays must be in the same length' error in my Python web scraping code using Selenium and BeautifulSoup?
Question
# Ensure all lists have the same length
length = min(len(hotels), len(links), len(prices))
hotels = hotels[:length]
links = links[:length]
prices = prices[:length]
# Take the data into a dataframe
df = pd.DataFrame({'Hotel': hotels, 'Link': links, 'Prices': prices})
print(df)
# Df to csv
df.to_csv('hotels_list30.csv', index=True)
import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
# Function that opens the URL with Selenium and returns the page source
def getPageBySel(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--remote-debugging-port=9222")
    options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    page = driver.page_source
    driver.quit()
    return page
offset=0
hotelsArr = { f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}'
}
x=0
hotels=[]
links=[]
prices=[]
for x in hotelsArr:
    offset = 0
    page = x
    # Loop that runs over the first 40 pages (offset+25 each time)
    while offset < 980:
        while True:  # Loop that makes sure the page loaded successfully
            temp = getPageBySel(page)
            soup = BeautifulSoup(temp, 'html.parser')
            if len(soup("h3", {"class": "a4225678b2"})) > 0:
                break
            else:
                time.sleep(1.5)
        for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6"):
            # Find the price
            price = element.get_text(strip=True) if element else 'N/A'
            # Append the price to the prices list
            prices.append(price)
        # Extract the hotel name, link, and price
        for element in soup("h3", {"class": "a4225678b2"}):
            # Find the hotel link
            link = element('a')[0]['href']
            # Find the hotel name
            name = element.select_one('.fcab3ed991.a23c043802').get_text() if element else 'N/A'
            # Append the data to the respective lists
            hotels.append(name)
            links.append(link)
        offset = offset + 25  # Move to the next page
# Ensure all lists have the same length
length = len(hotels)
if len(links) < length:
    length = len(links)
if len(prices) < length:
    length = len(prices)
#Take the data into dataframe
df = pd.DataFrame({
'Hotel':hotels, 'Link':links, 'Prices':prices
})
print(df)
#Df to csv
df.to_csv('hotels_list30.csv', index=True)
I get an error in this code saying that all the arrays must be of the same length. How can I fix this problem? I've tried everything, including the length check at the end and putting 'N/A' in the blank cells.
I've tried to solve this but it didn't work. I need to get a DataFrame with hotel, link, and price columns through scraping; maybe I've made a mistake there, which is why I also included the Booking.com links.
Answer 1
Score: 1
There are a few things that need to be fixed in your code.
First, you are not changing the offset value in the URL, so it keeps scraping the same URL again and again until the offset condition breaks. I have modified the code and added the line below to update it:
page = page.replace(f"offset={offset - 25}", f"offset={offset}")
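This string replacement works because the offset parameter is always present in these URLs. If you prefer not to rely on exact string matching, a small sketch using the standard library's urllib.parse to rewrite the offset query parameter could look like this (with_offset is a hypothetical helper, not part of the original code):

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def with_offset(url, new_offset):
    # Parse the URL, overwrite the 'offset' query parameter, and rebuild the URL
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query['offset'] = [str(new_offset)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))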
Also, instead of creating a driver instance again and again for every offset you scrape, you should use a single one, which saves time. Change the getPageBySel method as shown below:
def get_driver():
    global driver
    if not driver:
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument("--window-size=1920x1080")
        # options.headless=True
        driver = webdriver.Chrome(options=options)
    return driver

def getPageBySel(url):
    driver = get_driver()
    print(url)
    driver.get(url)
    page = driver.page_source
    return page
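One side effect of sharing a single driver is that nothing calls driver.quit() any more (the original getPageBySel did), so the browser stays open after the script finishes. A small addition for the very end of the script, assuming the module-level driver variable used in the full code below, could be:

# Close the shared browser once all scraping is done (uses the global driver set by get_driver)
if driver:
    driver.quit()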
Also, you have hardcoded the maximum offset value to 980, which won't work when the number of results is smaller than that: it turns the loop below into an infinite loop, because the number of matched elements will always be zero once the offset has exceeded the actual number of results.
while True:  # Loop that makes sure the page loaded successfully
    temp = getPageBySel(page)
    soup = BeautifulSoup(temp, 'html.parser')
    if len(soup("h3", {"class": "a4225678b2"})) > 0:
        break
    else:
        time.sleep(1.5)
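If you do keep a fixed upper bound instead, it is safer to give this inner loop a retry limit so it cannot spin forever when a page has no results. A minimal sketch reusing getPageBySel and the same h3 selector (wait_for_results is a hypothetical helper, not part of this answer's code):

def wait_for_results(page, max_retries=5):
    # Re-fetch the page a limited number of times instead of looping forever
    for _ in range(max_retries):
        soup = BeautifulSoup(getPageBySel(page), 'html.parser')
        if len(soup("h3", {"class": "a4225678b2"})) > 0:
            return soup
        time.sleep(1.5)
    return None  # caller should stop paging when this returns None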
So you should derive this maximum value from the number of results on the page, as below, where we pass the page URL and read the total number of pages from the pagination buttons:
def getOffset(x):
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Extract the integer values from the elements
    values = [int(link.text) for link in links]
    # Find the integer with the highest value
    highest_value = max(values)
    return highest_value
Then we set the maximum offset as:
maxValue = getOffset(x) *25
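Note that max() raises a ValueError if the pagination buttons are not found (for example, when there is only a single page of results), so a defensive variant of getOffset, assuming the same XPath, might filter to numeric button labels and fall back to one page (getOffsetSafe is a hypothetical variant, not part of the original answer):

def getOffsetSafe(x):
    driver = get_driver()
    driver.get(x)
    buttons = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Keep only buttons whose text is a number; fall back to a single page if none are found
    values = [int(b.text) for b in buttons if b.text.strip().isdigit()]
    return max(values) if values else 1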
Full Code
import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
driver = None
def get_driver():
    global driver
    if not driver:
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument("--window-size=1920x1080")
        # options.headless=True
        driver = webdriver.Chrome(options=options)
    return driver

def getPageBySel(url):
    driver = get_driver()
    print(url)
    driver.get(url)
    page = driver.page_source
    return page

def getOffset(x):
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Extract the integer values from the elements
    values = [int(link.text) for link in links]
    # Find the integer with the highest value
    highest_value = max(values)
    return highest_value
offset = 0
hotelsArr = {
f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}'
}
x = 0
hotels = []
links = []
prices = []
for x in hotelsArr:
    maxValue = getOffset(x) * 25
    offset = 0
    page = x
    # Loop over all result pages (offset+25 each time)
    while offset < maxValue:
        while True:  # Loop that makes sure the page loaded successfully
            temp = getPageBySel(page)
            soup = BeautifulSoup(temp, 'html.parser')
            if len(soup("h3", {"class": "a4225678b2"})) > 0:
                break
            else:
                time.sleep(1.5)
        for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6"):
            # Find the price
            price = element.get_text(strip=True) if element else 'N/A'
            # Append the price to the prices list
            prices.append(price)
        # Extract the hotel name, link, and price
        for element in soup("h3", {"class": "a4225678b2"}):
            # Find the hotel link
            link = element('a')[0]['href']
            # Find the hotel name
            name = element.select_one('.fcab3ed991.a23c043802').get_text() if element else 'N/A'
            # Append the data to the respective lists
            hotels.append(name)
            links.append(link)
        offset += 25  # Move to the next page
        page = page.replace(f"offset={offset - 25}", f"offset={offset}")  # Update the URL with the new offset
# Ensure all lists have the same length
length = len(hotels)
if len(links) < length:
    length = len(links)
if len(prices) < length:
    length = len(prices)
# Take the data into dataframe
df = pd.DataFrame({
'Hotel': hotels, 'Link': links, 'Prices': prices
})
print(df)
# Df to csv
df.to_csv('hotels_list30.csv', index=True)
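Even with the pagination fixed, the three lists can still end up with different lengths if a result card has no visible price, and that mismatch is exactly what makes pd.DataFrame raise the "arrays must all be same length" error. One minimal way to guard against it before building the DataFrame, along the lines the question already attempted, is to trim every list to the shortest one (a sketch, not part of the original answer):

# Trim all lists to the shortest length so pd.DataFrame accepts them
length = min(len(hotels), len(links), len(prices))
df = pd.DataFrame({
    'Hotel': hotels[:length],
    'Link': links[:length],
    'Prices': prices[:length],
})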