How can I fix the 'arrays must be in the same length' error in my Python web scraping code using Selenium and BeautifulSoup?

Question

Here is the fix I tried (truncating all lists to the same length before building the DataFrame):

# Ensure all lists have the same length
length = min(len(hotels), len(links), len(prices))
hotels = hotels[:length]
links = links[:length]
prices = prices[:length]

# Take the data into a dataframe
df = pd.DataFrame({'Hotel': hotels, 'Link': links, 'Prices': prices})
print(df)

# Df to csv
df.to_csv('hotels_list30.csv', index=True)
And here is my full code:
import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
#Function that open the url with selenium, and return the page source
def getPageBySel(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--remote-debugging-port=9222")
    options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    page = driver.page_source
    driver.quit()
    return page

offset=0
hotelsArr = { f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}'
}
x = 0
hotels = []
links = []
prices = []
for x in hotelsArr:
    offset = 0
    page = x
    # Loop that run on the first 40 pages (offset+25 each time)
    while offset < 980:
        while True:  # Loop that make sure that the page loaded successfully
            temp = getPageBySel(page)
            soup = BeautifulSoup(temp, 'html.parser')
            if len(soup("h3", {"class": "a4225678b2"})) > 0:
                break
            else:
                time.sleep(1.5)
        for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6"):
            # Find the price
            price = element.get_text(strip=True) if element else 'N/A'
            # Append the price to the prices list
            prices.append(price)
        # Extract the hotel name, link, and price
        for element in soup("h3", {"class": "a4225678b2"}):
            # Find the hotel link
            link = element('a')[0]['href']
            # Find the hotel name
            name = element.select_one('.fcab3ed991.a23c043802').get_text() if element else 'N/A'
            # Append the data to the respective lists
            hotels.append(name)
            links.append(link)
        offset = offset + 25  # Move to the next page

# Ensure all lists have the same length
length = len(hotels)
if len(links) < length:
    length = len(links)
if len(prices) < length:
    length = len(prices)

# Take the data into a dataframe
df = pd.DataFrame({
    'Hotel': hotels, 'Link': links, 'Prices': prices
})
print(df)

# Df to csv
df.to_csv('hotels_list30.csv', index=True)

I get an error in this code saying all the arrays must be of the same length. How can I fix this problem? I've tried everything: the length check at the end, and putting 'N/A' in the blank cells.

I've tried to solve this but it didn't work. I need to get a DataFrame with hotel, link, and price columns through scraping; maybe I've made a mistake there, which is why I've also included the Booking.com links.
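
For reference, the 'N/A' padding I tried looked roughly like this (a sketch of the idea, not my exact code): pad every list up to the longest length before building the DataFrame.

# Pad the shorter lists with 'N/A' so all three end up the same length
max_len = max(len(hotels), len(links), len(prices))
hotels += ['N/A'] * (max_len - len(hotels))
links += ['N/A'] * (max_len - len(links))
prices += ['N/A'] * (max_len - len(prices))
df = pd.DataFrame({'Hotel': hotels, 'Link': links, 'Prices': prices})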

Answer 1

Score: 1


There are a few things that need to be fixed in your code.

First, you are not changing the offset value in the URL, so it keeps scraping the same URL again and again until the offset condition breaks. I have modified the code and added the line below to change it:

 page = page.replace(f"offset={offset - 25}", f"offset={offset}")
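
As a side note, the string replace relies on the URL containing the exact offset={offset - 25} substring. A more robust sketch using only the standard library rebuilds the query string instead (with_offset is a hypothetical helper, not part of the answer's code):

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def with_offset(url, offset):
    # Return `url` with its `offset` query parameter set to the given value
    parts = urlparse(url)
    query = parse_qs(parts.query, keep_blank_values=True)
    query['offset'] = [str(offset)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

# usage: page = with_offset(x, offset)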

Also, instead of creating a new driver instance for every offset you scrape, you should reuse a single one, which saves time.
Change the getPageBySel method as shown below:

def get_driver():
    global driver
    if not driver:
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument("--window-size=1920x1080")
        # options.headless=True
        driver = webdriver.Chrome(options=options)
    return driver


def getPageBySel(url):
    driver = get_driver()
    print(url)
    driver.get(url)
    page = driver.page_source
    return page

You have also hardcoded the maximum offset value to 980, which won't work when there are fewer records than that: it turns the loop below into an infinite loop, because the number of matching elements will always be zero once the offset has exceeded the actual number of results.

while True:  # Loop that make sure that the page loaded successfully
    temp = getPageBySel(page)
    soup = BeautifulSoup(temp, 'html.parser')
    if len(soup("h3", {"class": "a4225678b2"})) > 0:
        break
    else:
        time.sleep(1.5)
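
This polling loop can also be written with Selenium's explicit waits, which the original code already imports (WebDriverWait, expected_conditions) but never uses. A sketch, assuming the same h3.a4225678b2 result cards; it raises a TimeoutException after the timeout instead of waiting forever:

def getPageBySelWithWait(url, timeout=15):
    # Load the url and wait until at least one result card is present
    driver = get_driver()
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "h3.a4225678b2"))
    )
    return driver.page_source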

To avoid hardcoding the limit, you should derive the maximum offset from the number of results on the page, as below, where we pass the page URL and get the total number of pages:

def getOffset(x):
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Extract the integer values from the elements
    values = [int(link.text) for link in links]
    # Find the integer with the highest value
    highest_value = max(values)
    return highest_value
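
One caveat: max(values) raises a ValueError when no pagination buttons match that XPath, for example when a search has only one page of results or Booking.com changes its generated class names. A guarded sketch (getOffsetSafe is a hypothetical variant) that falls back to one page:

def getOffsetSafe(x):
    # Like getOffset, but returns 1 when no pagination buttons are found
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    values = [int(link.text) for link in links if link.text.strip().isdigit()]
    return max(values) if values else 1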

Then we set the maximum offset value as:

maxValue = getOffset(x) * 25

Full Code

import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
driver = None

def get_driver():
    global driver
    if not driver:
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument("--window-size=1920x1080")
        # options.headless=True
        driver = webdriver.Chrome(options=options)
    return driver


def getPageBySel(url):
    driver = get_driver()
    print(url)
    driver.get(url)
    page = driver.page_source
    return page


def getOffset(x):
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Extract the integer values from the elements
    values = [int(link.text) for link in links]
    # Find the integer with the highest value
    highest_value = max(values)
    return highest_value

offset = 0
hotelsArr = {
f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}'
}
x = 0
hotels = []
links = []
prices = []
for x in hotelsArr:
    maxValue = getOffset(x) * 25
    offset = 0
    page = x
    # Loop that runs through the pages (offset+25 each time)
    while offset < maxValue:
        while True:  # Loop that make sure that the page loaded successfully
            temp = getPageBySel(page)
            soup = BeautifulSoup(temp, 'html.parser')
            if len(soup("h3", {"class": "a4225678b2"})) > 0:
                break
            else:
                time.sleep(1.5)
        for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6"):
            # Find the price
            price = element.get_text(strip=True) if element else 'N/A'
            # Append the price to the prices list
            prices.append(price)
        # Extract the hotel name, link, and price
        for element in soup("h3", {"class": "a4225678b2"}):
            # Find the hotel link
            link = element('a')[0]['href']
            # Find the hotel name
            name = element.select_one('.fcab3ed991.a23c043802').get_text() if element else 'N/A'
            # Append the data to the respective lists
            hotels.append(name)
            links.append(link)
        offset += 25  # Move to the next page
        page = page.replace(f"offset={offset - 25}", f"offset={offset}")  # Update the URL with the new offset

# Ensure all lists have the same length
length = len(hotels)
if len(links) < length:
    length = len(links)
if len(prices) < length:
    length = len(prices)

# Take the data into a dataframe
df = pd.DataFrame({
    'Hotel': hotels, 'Link': links, 'Prices': prices
})
print(df)

# Df to csv
df.to_csv('hotels_list30.csv', index=True)

With this I was able to scrape a CSV with around 3,699 records.
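
More fundamentally, the "all arrays must be of the same length" error happens because prices and hotel names are collected in two separate loops, so any result card without a price puts the lists out of step. A sketch that walks each result card once and appends 'N/A' for missing fields keeps all three lists the same length by construction (the div[data-testid="property-card"] selector is an assumption; Booking.com's generated class names change over time):

# Sketch: one pass per result card so hotels, links and prices never drift apart
# The 'property-card' test id is an assumption and may need updating
for card in soup.select('div[data-testid="property-card"]'):
    title = card.select_one('h3.a4225678b2')
    price = card.select_one('.fcab3ed991.fbd1d3018c.e729ed5ab6')
    hotels.append(title.get_text(strip=True) if title else 'N/A')
    links.append(title.a['href'] if title and title.a else 'N/A')
    prices.append(price.get_text(strip=True) if price else 'N/A')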