How can I fix the 'arrays must be in the same length' error in my Python web scraping code using Selenium and BeautifulSoup?
Question
# Ensure all lists have the same length
length = min(len(hotels), len(links), len(prices))
hotels = hotels[:length]
links = links[:length]
prices = prices[:length]
# Take the data into a dataframe
df = pd.DataFrame({'Hotel': hotels, 'Link': links, 'Prices': prices})
print(df)
# Df to csv
df.to_csv('hotels_list30.csv', index=True)
import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
# Function that opens the URL with Selenium and returns the page source
def getPageBySel(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--remote-debugging-port=9222")
    options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    page = driver.page_source
    driver.quit()
    return page
offset=0
hotelsArr = { f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}'
}
x=0
hotels=[]
links=[]
prices=[]
for x in hotelsArr:
    offset = 0
    page = x
    # Loop that runs over the first 40 pages (offset+25 each time)
    while offset < 980:
        while True:  # Loop that makes sure the page loaded successfully
            temp = getPageBySel(page)
            soup = BeautifulSoup(temp, 'html.parser')
            if len(soup("h3", {"class": "a4225678b2"})) > 0:
                break
            else:
                time.sleep(1.5)
        for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6"):
            # Find the price
            price = element.get_text(strip=True) if element else 'N/A'
            # Append the price to the prices list
            prices.append(price)
        # Extract the hotel name, link, and price
        for element in soup("h3", {"class": "a4225678b2"}):
            # Find the hotel link
            link = element('a')[0]['href']
            # Find the hotel name
            name = element.select_one('.fcab3ed991.a23c043802').get_text() if element else 'N/A'
            # Append the data to the respective lists
            hotels.append(name)
            links.append(link)
        offset = offset + 25  # Move to the next page
# Ensure all lists have the same length
length = len(hotels)
if len(links) < length:
    length = len(links)
if len(prices) < length:
    length = len(prices)
#Take the data into dataframe
df = pd.DataFrame({
'Hotel':hotels, 'Link':links, 'Prices':prices
})
print(df)
#Df to csv
df.to_csv('hotels_list30.csv', index=True)
I get an error in this code saying that all the arrays must be of the same length. How can I fix this problem? I've tried everything, including the length check at the end and putting 'N/A' in the blank cells.
I've tried to solve this but it didn't work. I need to get a DataFrame with hotel, link, and price columns through scraping; maybe I've made a mistake there, which is why I also included the Booking.com links.
Answer 1
Score: 1
There are a few things that need to be fixed in your code.
First, you are not changing the offset value in the URL, so it keeps scraping the same URL again and again until the offset condition breaks. I have modified the code and added the line below to update it:
page = page.replace(f"offset={offset - 25}", f"offset={offset}")
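This string replacement works because the offset parameter is always present in these URLs. If you prefer not to rely on exact string matching, a small sketch using the standard library's urllib.parse to rewrite the offset query parameter could look like this (with_offset is a hypothetical helper, not part of the original code):

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def with_offset(url, new_offset):
    # Parse the URL, overwrite the 'offset' query parameter, and rebuild the URL
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query['offset'] = [str(new_offset)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))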
Also, instead of creating a driver instance again and again for every offset you scrape, you should use a single one, which saves time. Change the getPageBySel method as shown below:
def get_driver():
    global driver
    if not driver:
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument("--window-size=1920x1080")
        # options.headless=True
        driver = webdriver.Chrome(options=options)
    return driver

def getPageBySel(url):
    driver = get_driver()
    print(url)
    driver.get(url)
    page = driver.page_source
    return page
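One side effect of sharing a single driver is that nothing calls driver.quit() any more (the original getPageBySel did), so the browser stays open after the script finishes. A small addition for the very end of the script, assuming the module-level driver variable used in the full code below, could be:

# Close the shared browser once all scraping is done (uses the global driver set by get_driver)
if driver:
    driver.quit()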
Also, you have hardcoded the maximum offset value to 980, which won't work when the number of results is smaller than that: it turns the loop below into an infinite loop, because the number of matched elements will always be zero once the offset has exceeded the actual number of results.
while True:  # Loop that makes sure the page loaded successfully
    temp = getPageBySel(page)
    soup = BeautifulSoup(temp, 'html.parser')
    if len(soup("h3", {"class": "a4225678b2"})) > 0:
        break
    else:
        time.sleep(1.5)
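If you do keep a fixed upper bound instead, it is safer to give this inner loop a retry limit so it cannot spin forever when a page has no results. A minimal sketch reusing getPageBySel and the same h3 selector (wait_for_results is a hypothetical helper, not part of this answer's code):

def wait_for_results(page, max_retries=5):
    # Re-fetch the page a limited number of times instead of looping forever
    for _ in range(max_retries):
        soup = BeautifulSoup(getPageBySel(page), 'html.parser')
        if len(soup("h3", {"class": "a4225678b2"})) > 0:
            return soup
        time.sleep(1.5)
    return None  # caller should stop paging when this returns None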
So you should derive this maximum value from the number of results on the page, as below, where we pass the page URL and read the total number of pages from the pagination buttons:
def getOffset(x):
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Extract the integer values from the elements
    values = [int(link.text) for link in links]
    # Find the integer with the highest value
    highest_value = max(values)
    return highest_value
Then we set the maximum offset as:
maxValue = getOffset(x) *25
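Note that max() raises a ValueError if the pagination buttons are not found (for example, when there is only a single page of results), so a defensive variant of getOffset, assuming the same XPath, might filter to numeric button labels and fall back to one page (getOffsetSafe is a hypothetical variant, not part of the original answer):

def getOffsetSafe(x):
    driver = get_driver()
    driver.get(x)
    buttons = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Keep only buttons whose text is a number; fall back to a single page if none are found
    values = [int(b.text) for b in buttons if b.text.strip().isdigit()]
    return max(values) if values else 1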
Full Code
import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
driver = None
def get_driver():
    global driver
    if not driver:
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument("--window-size=1920x1080")
        # options.headless=True
        driver = webdriver.Chrome(options=options)
    return driver

def getPageBySel(url):
    driver = get_driver()
    print(url)
    driver.get(url)
    page = driver.page_source
    return page

def getOffset(x):
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Extract the integer values from the elements
    values = [int(link.text) for link in links]
    # Find the integer with the highest value
    highest_value = max(values)
    return highest_value
offset = 0
hotelsArr = {
f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}'
}
x = 0
hotels = []
links = []
prices = []
for x in hotelsArr:
    maxValue = getOffset(x) * 25
    offset = 0
    page = x
    # Loop over all result pages (offset+25 each time)
    while offset < maxValue:
        while True:  # Loop that makes sure the page loaded successfully
            temp = getPageBySel(page)
            soup = BeautifulSoup(temp, 'html.parser')
            if len(soup("h3", {"class": "a4225678b2"})) > 0:
                break
            else:
                time.sleep(1.5)
        for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6"):
            # Find the price
            price = element.get_text(strip=True) if element else 'N/A'
            # Append the price to the prices list
            prices.append(price)
        # Extract the hotel name, link, and price
        for element in soup("h3", {"class": "a4225678b2"}):
            # Find the hotel link
            link = element('a')[0]['href']
            # Find the hotel name
            name = element.select_one('.fcab3ed991.a23c043802').get_text() if element else 'N/A'
            # Append the data to the respective lists
            hotels.append(name)
            links.append(link)
        offset += 25  # Move to the next page
        page = page.replace(f"offset={offset - 25}", f"offset={offset}")  # Update the URL with the new offset
# Ensure all lists have the same length
length = len(hotels)
if len(links) < length:
    length = len(links)
if len(prices) < length:
    length = len(prices)
# Take the data into dataframe
df = pd.DataFrame({
'Hotel': hotels, 'Link': links, 'Prices': prices
})
print(df)
# Df to csv
df.to_csv('hotels_list30.csv', index=True)
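Even with the pagination fixed, the three lists can still end up with different lengths if a result card has no visible price, and that mismatch is exactly what makes pd.DataFrame raise the "arrays must all be same length" error. One minimal way to guard against it before building the DataFrame, along the lines the question already attempted, is to trim every list to the shortest one (a sketch, not part of the original answer):

# Trim all lists to the shortest length so pd.DataFrame accepts them
length = min(len(hotels), len(links), len(prices))
df = pd.DataFrame({
    'Hotel': hotels[:length],
    'Link': links[:length],
    'Prices': prices[:length],
})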