How to scrape all of the pages on amazon from a search result with python


Question


I am trying to scrape all of the pages from a search result on Amazon with Python. However, the code below only returns the listings on page 1. Can anyone offer some suggestions on how to gather the other pages? Setting a specific page range didn't work either.

from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time

def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url

def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')

    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''

    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''

    review_count_element = item.find('span', {'class': 'a-size-base', 'dir': 'auto'})
    review_count = review_count_element.text.strip() if review_count_element else ''

    result = (description, price, rating, review_count, url)
    return result

def scrape_amazon(search_term):
    driver = webdriver.Firefox()
    records = []
    page = 1

    url = get_url(search_term)
    driver.get(url)
    time.sleep(2)  # Add a short delay to let the page load

    while True:
        # Scroll to the bottom of the page to load more items
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Add a short delay to let the page load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})

        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")

        # Check if there is a "Next" button on the page
        pagination_next = []
        for x in soup.find_all('a', {'class': 's-pagination-item' 's-pagination-button'}):
            pagination_next.append(x)
        print(pagination_next)
        if not any(a.get_text() == 'Next' for a in pagination_next):
            break  # Stop scraping if there are no more pages

        page += 1

    driver.close()

    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

# Get user input for the search term
search_term = 'ultrawide monitor'

# Scrape Amazon for the search term
df = scrape_amazon(search_term)

# Export DataFrame to Excel
df.to_excel('output.xlsx', index=False)

Please list any ways that I can make this code scrape all of the pages from the search.

Answer 1

Score: 2

You can use the code below. It updates the review-count locator and changes the loop logic so that it breaks out when the "Next" button is no longer shown, and otherwise clicks through to the next page and keeps scraping. I have also used undetected-chromedriver for Selenium and passed a fake user agent to avoid bot detection.

import undetected_chromedriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as ExpectedConditions
import pandas as pd
import time
from fake_useragent import UserAgent

from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait


def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url


def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')

    price_parent = item.find('span', 'a-price')
    price = (price_parent.find('span', 'a-offscreen').text.strip()
             if price_parent and price_parent.find('span', 'a-offscreen') else '')

    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''

    review_count_element = item.find('span', {'class': 'a-size-base s-underline-text'})
    review_count = review_count_element.text.strip() if review_count_element else ''

    result = (description, price, rating, review_count, url)
    return result


def scrape_amazon(search_term):
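    # Launch undetected-chromedriver with a randomized user agent to reduce bot detection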
    ua = UserAgent()
    options = Options()
    options.add_argument(f"user-agent={ua.random}")
    driver = undetected_chromedriver.Chrome(options=options)
    url = get_url(search_term)
    driver.get(url)
    time.sleep(5)
    records = []
    while True:

        time.sleep(5)
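        # Scroll to the bottom of the page so lazily loaded results render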
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})

        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")

        try:
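            # Locate the "Next" link, scroll it into view, wait until it is clickable, then click it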
            nextButton = driver.find_element(By.XPATH, '//a[text()="Next"]')
            driver.execute_script("arguments[0].scrollIntoView();", nextButton)
            WebDriverWait(driver, 10).until(ExpectedConditions.element_to_be_clickable(nextButton))
            nextButton.click()
        except NoSuchElementException:
            print("Breaking as Last page Reached")
            break

    driver.close()

    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

search_term = 'ultrawide monitor'

df = scrape_amazon(search_term)

df.to_excel('output.xlsx', index=False)


Output:

(screenshot of the scraped results omitted)
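As an alternative to clicking the "Next" button, pagination can also be driven by the page number in Amazon's search URL. Below is a minimal sketch of that approach; the &page= query parameter, the get_page_url/scrape_amazon_by_page names, and the reuse of scrape_records from the answer above are assumptions for illustration, not part of the answer itself.

# Sketch only: paginate by changing the page number in the search URL.
# Assumes the Amazon search URL accepts a page query parameter and that
# scrape_records() defined in the answer above is available in scope.
from bs4 import BeautifulSoup
from selenium import webdriver
import time

def get_page_url(search_term, page):
    template = 'https://www.amazon.com/s?k={}&page={}'
    return template.format(search_term.replace(' ', '+'), page)

def scrape_amazon_by_page(search_term, max_pages=20):
    driver = webdriver.Firefox()
    records = []
    for page in range(1, max_pages + 1):
        driver.get(get_page_url(search_term, page))
        time.sleep(2)  # crude wait for the page to load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        if not results:
            break  # no results returned, so assume we are past the last page
        for item in results:
            try:
                records.append(scrape_records(item))
            except Exception as e:
                print(f"Error scraping item: {e}")
    driver.close()
    return records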
