
How to scrape all of the pages on Amazon from a search result with Python


You can modify your code to scrape all the pages from the Amazon search results by adjusting the logic in your while loop. Here's the modified code:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

def get_url(search_term, page):
    template = 'https://www.amazon.com/s?k={}&page={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term, page)
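    # e.g. get_url('ultrawide monitor', 3)
    # -> 'https://www.amazon.com/s?k=ultrawide+monitor&page=3'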
    return url

# ... (rest of your code remains the same up to the while loop)

def scrape_amazon(search_term):
    driver = webdriver.Firefox()
    records = []
    page = 1

    while True:
        url = get_url(search_term, page)  # Generate URL for the current page
        driver.get(url)
        time.sleep(2)

        # Rest of your code for scraping results on the current page

        # Check if there is a "Next" button on the page
        pagination_next = driver.find_elements(By.CSS_SELECTOR, '.s-pagination-item.s-pagination-button')
        if not any(a.text == 'Next' for a in pagination_next):
            break  # Stop scraping if there are no more pages

        page += 1

    driver.close()

    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

# ... (rest of your code remains the same)

This code increments the page number in the URL and keeps scraping until no "Next" button is found, so it covers every page of the search results.
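If matching the button text turns out to be brittle (for example, on non-English Amazon locales), an alternative stop check is to inspect the page itself. The sketch below is not part of the original answer: it assumes Amazon currently marks result cards with data-component-type="s-search-result" and the greyed-out "Next" control with the classes s-pagination-next and s-pagination-disabled, and the helper name is_last_page is made up for illustration.

from bs4 import BeautifulSoup

def is_last_page(page_source):
    # Heuristic stop condition: True when the current page looks like the
    # final page of results. Assumes Amazon's current markup (verify against
    # the live page before relying on it).
    soup = BeautifulSoup(page_source, 'html.parser')
    results = soup.find_all('div', {'data-component-type': 's-search-result'})
    if not results:
        return True  # no result cards at all: we have run past the last page
    # The disabled "Next" control only appears on the final page
    return soup.select_one('.s-pagination-next.s-pagination-disabled') is not None

Inside the while loop, `if is_last_page(driver.page_source): break` would then replace the text comparison on the pagination buttons.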

Original question:

I am trying to scrape all of the pages from a search result on Amazon with Python. However, the code below only returns the listings on page 1. Can anyone offer some suggestions on how to gather the other pages? Setting a specific range doesn't work either.

from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time

def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url

def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')

    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''

    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''

    review_count_element = item.find('span', {'class': 'a-size-base', 'dir': 'auto'})
    review_count = review_count_element.text.strip() if review_count_element else ''

    result = (description, price, rating, review_count, url)
    return result

def scrape_amazon(search_term):
    driver = webdriver.Firefox()
    records = []
    page = 1

    url = get_url(search_term)
    driver.get(url)
    time.sleep(2)  # Add a short delay to let the page load

    while True:
        # Scroll to the bottom of the page to load more items
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Add a short delay to let the page load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})

        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")

        # Check if there is a "Next" button on the page
        pagination_next = []
        for x in soup.find_all('a', {'class': 's-pagination-item' 's-pagination-button'}):
            pagination_next.append(x)
        print(pagination_next)
        if not any(a.get_text() == 'Next' for a in pagination_next):
            break  # Stop scraping if there are no more pages

        page += 1

    driver.close()

    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

# Get user input for the search term
search_term = 'ultrawide monitor'

# Scrape Amazon for the search term
df = scrape_amazon(search_term)

# Export DataFrame to Excel
df.to_excel('output.xlsx', index=False)

Please list any ways that I can make this code scrape all of the pages from the search.

Answer 1

Score: 2

You can use the code below. The review-count locator has been updated, and the loop logic changed so it breaks when the "Next" button is no longer shown; otherwise it clicks through to the next page and keeps scraping. It also uses undetected-chromedriver for Selenium and passes a fake user agent to avoid bot detection:

import undetected_chromedriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as ExpectedConditions
import pandas as pd
import time
from fake_useragent import UserAgent

from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait


def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url


def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')

    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span',
                                                                                                        'a-offscreen') else ''
    
    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''

    review_count_element = item.find('span', {'class': 'a-size-base s-underline-text'})
    review_count = review_count_element.text.strip() if review_count_element else ''

    result = (description, price, rating, review_count, url)
    return result


def scrape_amazon(search_term):
    ua = UserAgent()
    options = Options()
    options.add_argument(f"user-agent={ua.random}")
    driver = undetected_chromedriver.Chrome(options=options)
    url = get_url(search_term)
    driver.get(url)
    time.sleep(5)
    records = []
    while True:

        # Add a short delay to let the page load
        time.sleep(5)
        # Scroll to the bottom of the page to load more items
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})

        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")

        # Check if there is a "Next" button on the page
        try:
            nextButton = driver.find_element(By.XPATH, '//a[text()="Next"]')
            driver.execute_script("arguments[0].scrollIntoView();", nextButton)
            WebDriverWait(driver, 10).until(ExpectedConditions.element_to_be_clickable(nextButton))
            nextButton.click()
        except NoSuchElementException:
            print("Breaking as Last page Reached")
            break

    driver.close()

    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

search_term = 'ultrawide monitor'

df = scrape_amazon(search_term)

df.to_excel('output.xlsx', index=False)


Output: (screenshot of the scraped results in the original answer)
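A possible refinement, not part of the original answer: the fixed time.sleep(5) calls can be replaced with an explicit wait on the result cards, reusing imports the answer already has. The helper name wait_for_results below is made up for illustration, and the selector assumes result cards still carry data-component-type="s-search-result" as in the code above.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ExpectedConditions
from selenium.webdriver.support.wait import WebDriverWait

def wait_for_results(driver, timeout=15):
    # Block until at least one search-result card is present instead of
    # sleeping for a fixed five seconds; raises TimeoutException if nothing
    # renders within `timeout` seconds.
    WebDriverWait(driver, timeout).until(
        ExpectedConditions.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'div[data-component-type="s-search-result"]')
        )
    )

Calling wait_for_results(driver) right after driver.get(url), and again after each nextButton.click(), lets the loop continue as soon as the page is ready rather than after a fixed delay.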
