Could not extract data from website using selenium
Question
I am trying to extract data from <https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw==>, which contains multiple sub-pages, but each sub-page doesn't have a separate link I can use to extract its data.
So I use Selenium to load the website dynamically and navigate to each page. However, when I try to extract data from the second page, it only returns the first page's content.
This is the code I used to run the program.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
import time
from pathlib import Path

disable_warnings(InsecureRequestWarning)
agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",}

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url='https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw=='
path='C:/Users/dell/Desktop/Data/DataScraping/chrome_driver/chromedriver'
service = Service(path)
driver = webdriver.Chrome(service=service)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

def get_data():
    start = time.process_time()
    url=main_url
    product_name=[]
    product_price=[]
    count=0
    all_pages=10 #this number is only for testing purpose
    print('Get Data Processing .....')
    for i in range(all_pages):
        if(count==0):
            add_boxs_v1=soup.find_all(class_='veg')
            for product in add_boxs_v1:
                product_name.append(product.find('p').text)
            add_boxs_v2=soup.find_all(class_='strike1')
            for price in add_boxs_v2:
                product_price.append(price.find('h4').text)
            count+=1
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='selectPage(page + 1, $event)']"))).click()
        time.sleep(5)
    print('done')
    df=pd.DataFrame({'Product_name':product_name,'Price':product_price})
    return df

df=get_data()
df.head()
Could someone please point out which step in this process I got wrong?
Answer 1
Score: 2
You are only getting the first page because your page_source is captured once, before the loop, so it only ever contains the first page. After every click you need to capture the current page_source again: move the page_source call inside the for loop so that you parse the latest page source on every iteration.
url = 'https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw=='
path = 'C:/Users/dell/Desktop/Data/DataScraping/chrome_driver/chromedriver'
service = Service(path)
driver = webdriver.Chrome(service=service)
driver.get(url)

def get_data():
    start = time.process_time()
    product_name = []
    product_price = []
    all_pages = 10  # this number is only for testing purpose
    print('Get Data Processing .....')
    for i in range(all_pages):
        # capture the page source that is currently loaded, on every iteration
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        add_boxs_v1 = soup.find_all(class_='veg')
        for product in add_boxs_v1:
            product_name.append(product.find('p').text)
        add_boxs_v2 = soup.find_all(class_='strike1')
        for price in add_boxs_v2:
            product_price.append(price.find('h4').text)
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='selectPage(page + 1, $event)']"))).click()
        time.sleep(5)
    print('done')
    df = pd.DataFrame({'Product_name': product_name, 'Price': product_price})
    return df

df = get_data()
df.head()
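If you would rather not re-parse the page with BeautifulSoup at all, here is a minimal sketch of the same loop done purely with Selenium lookups. It reuses the .veg / .strike1 class names and the ng-click XPath from the question; everything else is an assumption. In particular, the EC.staleness_of wait only helps if the product list is re-rendered with new DOM nodes when the page changes; otherwise fall back to a fixed sleep.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
driver.get('https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw==')

product_name, product_price = [], []
all_pages = 10  # test value, as in the question

for _ in range(all_pages):
    # wait until at least one product card is rendered, then read the current page
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'veg')))
    cards = driver.find_elements(By.CLASS_NAME, 'veg')
    strikes = driver.find_elements(By.CLASS_NAME, 'strike1')
    product_name += [c.find_element(By.TAG_NAME, 'p').text for c in cards]
    product_price += [s.find_element(By.TAG_NAME, 'h4').text for s in strikes]

    # remember one element from this page, click "next", then wait until that
    # element has gone stale, i.e. the list has been replaced by the next page
    marker = cards[0]
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, "//a[@ng-click='selectPage(page + 1, $event)']"))).click()
    WebDriverWait(driver, 20).until(EC.staleness_of(marker))

driver.quit()

Waiting for the old element to go stale replaces the fixed time.sleep(5) with a condition that ends as soon as the next page has actually loaded.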
Answer 2
Score: 1
Below is my simple code for moving to page 2 from your URL. The other pages follow the same CSS pattern, so you don't need to worry about them. I tested it and it works. You just need to integrate the data-extraction part from your own code.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
# create a new Chrome browser instance
browser = webdriver.Chrome(ChromeDriverManager().install())
# navigate to the website
browser.get("https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw==")
time.sleep(10)
# Get the css_path of the button (page 2).
# For each page you will increase the number inside li:nth-child(4)
# For example li:nth-child(5), li:nth-child(6)
css_path = """
#divProducts > div.divPagingProd > ul > li:nth-child(4) > a
"""
# Find the button and scroll down to that button, then click
button = browser.find_element(By.CSS_SELECTOR, css_path)
browser.execute_script("arguments[0].scrollIntoView();", button)
browser.execute_script("arguments[0].click();", button)
time.sleep(10)
browser.quit()
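To show where the extraction from the question would plug in, here is a hedged sketch that combines this pagination approach with the .veg / .strike1 lookups from the question. The li:nth-child offset and the fixed sleeps are taken from this answer, the driver setup mirrors the Selenium 3 style used above, and the page count and variable names are only illustrative.

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

# Selenium 3-style setup, mirroring the answer above; with Selenium 4 you would
# wrap the driver path in a Service object instead
browser = webdriver.Chrome(ChromeDriverManager().install())
browser.get("https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw==")
time.sleep(10)

names, prices = [], []
pages_to_read = 5  # illustrative value

for page in range(pages_to_read):
    # scrape whatever is currently rendered (same selectors as the question)
    for card in browser.find_elements(By.CLASS_NAME, "veg"):
        names.append(card.find_element(By.TAG_NAME, "p").text)
    for strike in browser.find_elements(By.CLASS_NAME, "strike1"):
        prices.append(strike.find_element(By.TAG_NAME, "h4").text)

    if page < pages_to_read - 1:
        # page 2 sits at li:nth-child(4), page 3 at li:nth-child(5), and so on
        css_path = f"#divProducts > div.divPagingProd > ul > li:nth-child({page + 4}) > a"
        button = browser.find_element(By.CSS_SELECTOR, css_path)
        browser.execute_script("arguments[0].scrollIntoView();", button)
        browser.execute_script("arguments[0].click();", button)
        time.sleep(10)

browser.quit()
df = pd.DataFrame({'Product_name': names, 'Price': prices})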