How to scrape all of the pages on Amazon from a search result with Python
You can modify your code to scrape all the pages from the Amazon search results by adjusting the logic in your while loop. Here's the modified code:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By  # needed for the find_elements call below (Selenium 4 API)
import pandas as pd
import time

def get_url(search_term, page):
    template = 'https://www.amazon.com/s?k={}&page={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term, page)
    return url

# ... (rest of your code remains the same up to the while loop)

def scrape_amazon(search_term):
    driver = webdriver.Firefox()
    records = []
    page = 1
    while True:
        url = get_url(search_term, page)  # Generate the URL for the current page
        driver.get(url)
        time.sleep(2)

        # Rest of your code for scraping results on the current page

        # Check if there is a "Next" button on the page
        pagination_next = driver.find_elements(By.CSS_SELECTOR, '.s-pagination-item.s-pagination-button')
        if not any(a.text == 'Next' for a in pagination_next):
            break  # Stop scraping if there are no more pages
        page += 1

    driver.close()

    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

# ... (rest of your code remains the same)
This code will incrementally change the page number in the URL and continue scraping until there are no more "Next" buttons, effectively scraping all the pages of search results.
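For illustration, here is the URL this get_url builds for, say, page 3 of the 'ultrawide monitor' search used later on this page (a quick sketch, not part of the original code):

print(get_url('ultrawide monitor', 3))
# https://www.amazon.com/s?k=ultrawide+monitor&page=3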
Question
I am trying to scrape all of the pages from a search result on Amazon with Python. However, the code below only returns the listings on page 1. Can anyone offer some suggestions on how to gather the other pages? Setting a specific range doesn't work either.
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time

def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url

def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')
    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''
    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''
    review_count_element = item.find('span', {'class': 'a-size-base', 'dir': 'auto'})
    review_count = review_count_element.text.strip() if review_count_element else ''
    result = (description, price, rating, review_count, url)
    return result

def scrape_amazon(search_term):
    driver = webdriver.Firefox()
    records = []
    page = 1
    url = get_url(search_term)
    driver.get(url)
    time.sleep(2)  # Add a short delay to let the page load
    while True:
        # Scroll to the bottom of the page to load more items
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Add a short delay to let the page load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")
        # Check if there is a "Next" button on the page
        pagination_next = []
        for x in soup.find_all('a', {'class': 's-pagination-item' 's-pagination-button'}):
            pagination_next.append(x)
        print(pagination_next)
        if not any(a.get_text() == 'Next' for a in pagination_next):
            break  # Stop scraping if there are no more pages
        page += 1
    driver.close()
    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

# Get user input for the search term
search_term = 'ultrawide monitor'

# Scrape Amazon for the search term
df = scrape_amazon(search_term)

# Export DataFrame to Excel
df.to_excel('output.xlsx', index=False)
Please list any ways that I can make this code scrape all of the pages from the search.
Answer 1
Score: 2
You can use the code below. It uses an updated review-count locator and changes the logic so that the loop breaks when the "Next" button is no longer shown; otherwise it clicks through to the next page and scrapes it. I have also used undetected-chromedriver for Selenium and passed a fake user agent to avoid bot detection.
import undetected_chromedriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as ExpectedConditions
import pandas as pd
import time
from fake_useragent import UserAgent
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url

def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')
    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''
    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''
    review_count_element = item.find('span', {'class': 'a-size-base s-underline-text'})
    review_count = review_count_element.text.strip() if review_count_element else ''
    result = (description, price, rating, review_count, url)
    return result

def scrape_amazon(search_term):
    ua = UserAgent()
    options = Options()
    options.add_argument(f"user-agent={ua.random}")
    driver = undetected_chromedriver.Chrome(options=options)
    url = get_url(search_term)
    driver.get(url)
    time.sleep(5)
    records = []
    while True:
        # Scroll to the bottom of the page to load more items
        # Add a short delay to let the page load
        time.sleep(5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")
        # Check if there is a "Next" button on the page
        try:
            nextButton = driver.find_element(By.XPATH, '//a[text()="Next"]')
            driver.execute_script("arguments[0].scrollIntoView();", nextButton)
            WebDriverWait(driver, 10).until(ExpectedConditions.element_to_be_clickable(nextButton))
            nextButton.click()
        except NoSuchElementException:
            print("Breaking as Last page Reached")
            break
    driver.close()

    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

# Get user input for the search term
search_term = 'ultrawide monitor'

# Scrape Amazon for the search term
df = scrape_amazon(search_term)

# Export DataFrame to Excel
df.to_excel('output.xlsx', index=False)
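A possible variation (a minimal sketch, not taken from the answer above): the same end-of-results check can be written without the try/except, because find_elements returns an empty list instead of raising NoSuchElementException when nothing matches the XPath. This assumes the same driver session and imports as the code above:

# Alternative "Next" check: an empty list means there is no "Next" link,
# i.e. the last page of results has been reached.
next_links = driver.find_elements(By.XPATH, '//a[text()="Next"]')
if not next_links:
    print("Breaking as Last page Reached")  # same exit condition as in the answer
else:
    driver.execute_script("arguments[0].scrollIntoView();", next_links[0])
    next_links[0].click()

Two practical notes: undetected-chromedriver and fake-useragent are separate installs from PyPI, and df.to_excel('output.xlsx', index=False) needs an Excel writer engine such as openpyxl installed.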