英文:
Selenium page navigation keeps looping (Python)
问题
我刚刚开始使用Selenium来从网页上抓取表格。因此,我使用Selenium实现了网页的导航。但是,当我运行代码时,结果一直在循环。我很确定我的代码写错了。我应该如何修复代码,以使Selenium导航正常工作?
# Question code as posted (indentation was lost when the page was extracted).
# Scrapes table rows from the listing page into a CSV file and tries to follow
# the pagination link with Selenium.
import requests
import csv
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time # import the time module (added)
browser = webdriver.Chrome()
browser.get('https://dir.businessworld.com.my/15/posts/16-Computers-The-Internet')
# NOTE(review): the page source is parsed once here, before the loop. Nothing
# below re-assigns `soup`, so every pass of the while loop iterates the same
# parsed rows even after the next link has been clicked.
soup = bs(browser.page_source, 'html.parser') # explicitly select the 'html.parser' parser
filename = "C:/Users/User/Desktop/test.csv"
csv_writer = csv.writer(open(filename, 'w', newline='')) # newline='' prevents extra blank lines between rows
pages_remaining = True
while pages_remaining:
for tr in soup.find_all("tr"):
data = []
# Header cells: a row that contains <th> elements is written as a header row.
for th in tr.find_all("th"):
data.append(th.text.strip())
if data:
print("Inserting headers: {}".format(','.join(data)))
csv_writer.writerow(data)
continue
# Data cells: prefer the link text when the cell contains an <a>.
for td in tr.find_all("td"):
if td.a:
data.append(td.a.text.strip())
else:
data.append(td.text.strip())
if data:
print("Inserting data: {}".format(','.join(data)))
csv_writer.writerow(data)
try:
# Locate the pagination link by absolute XPath and click it.
next_link = browser.find_element_by_xpath('//*[@id="content"]/div[3]/table/tbody/tr/td[2]/table/tbody/tr/td[6]/a')
next_link.click()
time.sleep(30)
# NOTE(review): NoSuchElementException is not imported anywhere in this snippet.
except NoSuchElementException:
pages_remaining = False
请注意,我对代码进行了一些修改,包括指定了HTML解析器,修复了CSV写入的newline问题,并添加了时间模块以等待页面加载。希望这可以帮助您解决问题。
英文:
I've just started using Selenium to scrape a table from a web page, and I implemented the page navigation with Selenium. However, the results keep looping when I run the code. I'm pretty sure I wrote the code incorrectly. What should I fix so that the Selenium navigation works?
# Question code as posted (indentation was lost when the page was extracted).
# Scrapes table rows from the listing page into a CSV file and tries to follow
# the pagination link with Selenium.
import requests
import csv
from bs4 import BeautifulSoup as bs
from selenium import webdriver
browser=webdriver.Chrome()
browser.get('https://dir.businessworld.com.my/15/posts/16-Computers-The-Internet')
# url = requests.get("https://dir.businessworld.com.my/15/posts/16-Computers-The-Internet/")
# NOTE(review): the page source is parsed once here, before the loop. Nothing
# below re-assigns `soup`, so every pass of the while loop iterates the same
# parsed rows.
soup=bs(browser.page_source)
filename = "C:/Users/User/Desktop/test.csv"
csv_writer = csv.writer(open(filename, 'w'))
pages_remaining = True
while pages_remaining:
for tr in soup.find_all("tr"):
data = []
# for headers ( entered only once - the first time - )
for th in tr.find_all("th"):
data.append(th.text)
if data:
print("Inserting headers : {}".format(','.join(data)))
csv_writer.writerow(data)
continue
# Data cells: prefer the link text when the cell contains an <a>.
for td in tr.find_all("td"):
if td.a:
data.append(td.a.text.strip())
else:
data.append(td.text.strip())
if data:
print("Inserting data: {}".format(','.join(data)))
csv_writer.writerow(data)
try:
#Checks if there are more pages with links
# NOTE(review): `driver` is never defined in this snippet (the WebDriver
# object above is named `browser`), and the XPath string ends with a stray " ]".
next_link = driver.find_element_by_xpath('//*[@id="content"]/div[3]/table/tbody/tr/td[2]/table/tbody/tr/td[6]/a ]')
next_link.click()
# NOTE(review): `time` is not imported in this snippet.
time.sleep(30)
# NOTE(review): NoSuchElementException is not imported in this snippet.
except NoSuchElementException:
# NOTE(review): this sets `rows_remaining`, but the loop condition tests
# `pages_remaining`, so this assignment cannot end the loop.
rows_remaining = False
答案1
得分: 1
检查页面上是否存在“下一页”按钮,然后单击,否则退出循环。
if len(browser.find_elements_by_xpath("//a[contains(.,'下一页')]")) > 0:
browser.find_element_by_xpath("//a[contains(.,'下一页')]").click()
else:
break
不需要使用 time.sleep(),改用 WebDriverWait 即可。
# Wait (timeout argument: 10) until the element matching the CSS selector
# "table.postlisting" is visible before reading the page.
WebDriverWait(browser, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table.postlisting")))
英文:
Check whether a Next button is present on the page; if it is, click it, otherwise exit the while loop.
if len(browser.find_elements_by_xpath("//a[contains(.,'Next')]"))>0:
browser.find_element_by_xpath("//a[contains(.,'Next')]").click()
else:
break
There is no need to use time.sleep(); use WebDriverWait() instead.
Code:
import csv
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Scrape every page of the listing table into a CSV file.
#
# For each page: wait for the listing table, parse the *current* page source,
# write its rows, then click "Next" and wait for the old table to be replaced.
# The loop ends when there is no "Next" link or the page stops changing.
from selenium.common.exceptions import TimeoutException

browser = webdriver.Chrome()
browser.get('https://dir.businessworld.com.my/15/posts/16-Computers-The-Internet')

wait = WebDriverWait(browser, 10)
table_locator = (By.CSS_SELECTOR, "table.postlisting")
next_locator = (By.XPATH, "//a[contains(.,'Next')]")
filename = "C:/Users/User/Desktop/test.csv"

try:
    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows); the with-block guarantees the file is flushed and closed.
    with open(filename, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Header rows already written, so a header repeated on every page is
        # only emitted once.
        seen_headers = set()

        while True:
            table = wait.until(EC.visibility_of_element_located(table_locator))

            # Re-parse on every iteration: parsing once before the loop means
            # the same first page is processed forever.
            soup = bs(browser.page_source, 'html.parser')

            for tr in soup.find_all("tr"):
                headers = [th.text for th in tr.find_all("th")]
                if headers:
                    if tuple(headers) not in seen_headers:
                        seen_headers.add(tuple(headers))
                        print("Inserting headers : {}".format(','.join(headers)))
                        csv_writer.writerow(headers)
                    continue

                # Prefer the link text when a cell wraps its content in <a>.
                data = [
                    td.a.text.strip() if td.a else td.text.strip()
                    for td in tr.find_all("td")
                ]
                if data:
                    print("Inserting data: {}".format(','.join(data)))
                    csv_writer.writerow(data)

            # find_elements() returns [] when there is no match, so a single
            # lookup both tests for and retrieves the link.
            next_links = browser.find_elements(*next_locator)
            if not next_links:
                break
            next_links[0].click()

            try:
                # The old table element goes stale once the next page has
                # replaced it; only then is page_source safe to re-read.
                wait.until(EC.staleness_of(table))
            except TimeoutException:
                # The click did not load a new page: treat as the last page.
                break
finally:
    # Always release the browser process, even if scraping fails midway.
    browser.quit()
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论