How to scrape all of the pages on Amazon from a search result with Python


You can modify the code from the question below to scrape all the pages of the Amazon search results by adjusting the logic in the while loop. Here is the modified code:

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.by import By  # needed for find_elements below
    import pandas as pd
    import time

    def get_url(search_term, page):
        template = 'https://www.amazon.com/s?k={}&page={}'
        search_term = search_term.replace(' ', '+')
        url = template.format(search_term, page)
        return url

    # ... (rest of your code remains the same up to the while loop)

    def scrape_amazon(search_term):
        driver = webdriver.Firefox()
        records = []
        page = 1
        while True:
            url = get_url(search_term, page)  # Generate URL for the current page
            driver.get(url)
            time.sleep(2)
            # Rest of your code for scraping results on the current page
            # Check if there is a "Next" button on the page
            pagination_next = driver.find_elements(By.CSS_SELECTOR, '.s-pagination-item.s-pagination-button')
            if not any(a.text == 'Next' for a in pagination_next):
                break  # Stop scraping if there are no more pages
            page += 1
        driver.close()
        # Process the records
        df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
        return df

    # ... (rest of your code remains the same)

This code increments the page number in the URL and keeps scraping until no "Next" button is found, so it covers every page of the search results.
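
If you would rather use a fixed page range than an open-ended loop, the same idea works with range(); the key change in either case is that get_url must accept the page number. Below is a minimal sketch under that assumption (the max_pages cap of 5 is an arbitrary placeholder, and the &page= query parameter is the same one used above):

    # Hypothetical helper: build one URL per results page up to an assumed cap,
    # then reuse the existing per-page scraping logic for each URL.
    def get_url(search_term, page):
        template = 'https://www.amazon.com/s?k={}&page={}'
        return template.format(search_term.replace(' ', '+'), page)

    def build_page_urls(search_term, max_pages=5):
        return [get_url(search_term, page) for page in range(1, max_pages + 1)]

    for url in build_page_urls('ultrawide monitor'):
        print(url)  # e.g. https://www.amazon.com/s?k=ultrawide+monitor&page=3
        # driver.get(url)  # then scrape the page as shown above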

Question

I am trying to scrape all of the pages from a search result on Amazon with Python. However, the code below only returns the listings on page 1. Can anyone offer some suggestions on how to gather the other pages? Setting a specific range doesn't work either.

    from bs4 import BeautifulSoup
    from selenium import webdriver
    import pandas as pd
    import time

    def get_url(search_term):
        template = 'https://www.amazon.com/s?k={}'
        search_term = search_term.replace(' ', '+')
        url = template.format(search_term)
        return url

    def scrape_records(item):
        atag = item.h2.a
        description = atag.text.strip()
        url = 'https://amazon.com' + atag.get('href')
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''
        rating_element = item.find('span', {'class': 'a-icon-alt'})
        rating = rating_element.text.strip() if rating_element else ''
        review_count_element = item.find('span', {'class': 'a-size-base', 'dir': 'auto'})
        review_count = review_count_element.text.strip() if review_count_element else ''
        result = (description, price, rating, review_count, url)
        return result

    def scrape_amazon(search_term):
        driver = webdriver.Firefox()
        records = []
        page = 1
        url = get_url(search_term)
        driver.get(url)
        time.sleep(2)  # Add a short delay to let the page load
        while True:
            # Scroll to the bottom of the page to load more items
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Add a short delay to let the page load
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in results:
                try:
                    record = scrape_records(item)
                    records.append(record)
                except Exception as e:
                    print(f"Error scraping item: {e}")
            # Check if there is a "Next" button on the page
            pagination_next = []
            for x in soup.find_all('a', {'class': 's-pagination-item' 's-pagination-button'}):
                pagination_next.append(x)
            print(pagination_next)
            if not any(a.get_text() == 'Next' for a in pagination_next):
                break  # Stop scraping if there are no more pages
            page += 1
        driver.close()
        # Process the records
        df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
        return df

    # Get user input for the search term
    search_term = 'ultrawide monitor'
    # Scrape Amazon for the search term
    df = scrape_amazon(search_term)
    # Export DataFrame to Excel
    df.to_excel('output.xlsx', index=False)

Please list any ways that I can make this code scrape all of the pages from the search.

Answer 1

Score: 2


You can use the code below. I updated the review-count locator and changed the logic to break out of the loop when the "Next" button is not shown; otherwise it clicks through to the next page and keeps scraping. I also used undetected-chromedriver for Selenium and pass a fake user agent to avoid bot detection.

    import undetected_chromedriver
    from bs4 import BeautifulSoup
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support import expected_conditions as ExpectedConditions
    import pandas as pd
    import time
    from fake_useragent import UserAgent
    from selenium.common import NoSuchElementException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.wait import WebDriverWait

    def get_url(search_term):
        template = 'https://www.amazon.com/s?k={}'
        search_term = search_term.replace(' ', '+')
        url = template.format(search_term)
        return url

    def scrape_records(item):
        atag = item.h2.a
        description = atag.text.strip()
        url = 'https://amazon.com' + atag.get('href')
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''
        rating_element = item.find('span', {'class': 'a-icon-alt'})
        rating = rating_element.text.strip() if rating_element else ''
        review_count_element = item.find('span', {'class': 'a-size-base s-underline-text'})
        review_count = review_count_element.text.strip() if review_count_element else ''
        result = (description, price, rating, review_count, url)
        return result

    def scrape_amazon(search_term):
        ua = UserAgent()
        options = Options()
        options.add_argument(f"user-agent={ua.random}")
        driver = undetected_chromedriver.Chrome(options=options)
        url = get_url(search_term)
        driver.get(url)
        time.sleep(5)
        records = []
        while True:
            # Scroll to the bottom of the page to load more items
            # Add a short delay to let the page load
            time.sleep(5)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in results:
                try:
                    record = scrape_records(item)
                    records.append(record)
                except Exception as e:
                    print(f"Error scraping item: {e}")
            # Check if there is a "Next" button on the page
            try:
                nextButton = driver.find_element(By.XPATH, '//a[text()="Next"]')
                driver.execute_script("arguments[0].scrollIntoView();", nextButton)
                WebDriverWait(driver, 10).until(ExpectedConditions.element_to_be_clickable(nextButton))
                nextButton.click()
            except NoSuchElementException:
                print("Breaking as Last page Reached")
                break
        driver.close()
        # Process the records
        df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
        return df

    # Get user input for the search term
    search_term = 'ultrawide monitor'
    # Scrape Amazon for the search term
    df = scrape_amazon(search_term)
    # Export DataFrame to Excel
    df.to_excel('output.xlsx', index=False)

Output: (screenshot of the scraped results in the original answer)
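
As a side note, if locating the "Next" link by XPath ever proves brittle, one alternative stopping condition is to use find_elements, which returns an empty list instead of raising NoSuchElementException. The sketch below is only an illustration: it reuses the s-pagination-item / s-pagination-button class names already mentioned in this thread, which Amazon may rename at any time, and the click_next_if_present helper is hypothetical and expects the driver created in scrape_amazon above.

    from selenium.webdriver.common.by import By

    def click_next_if_present(driver):
        # find_elements returns [] when nothing matches, so no try/except is needed
        links = driver.find_elements(By.CSS_SELECTOR, 'a.s-pagination-item.s-pagination-button')
        for link in links:
            if link.text.strip() == 'Next':
                driver.execute_script("arguments[0].scrollIntoView();", link)
                link.click()
                return True
        return False  # no "Next" link found, so the last page has been reached

    # Inside the while loop, the pagination step would then become:
    #     if not click_next_if_present(driver):
    #         break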

huangapple
  • Posted on 2023-05-22 08:34:06
  • When reposting, please keep the original link: https://go.coder-hub.com/76302452.html