英文:
Load More using Selenium on Webscraping
问题
I was trying to do webscraping on Reuters for nlp analysis and most of it is working, but I am unable to get the code to click the "load more" button for more news articles. Below is the code currently being used:
import csv
import time
import pprint
from datetime import datetime, timedelta
import requests
import nltk
nltk.download('vader_lexicon')
from urllib.request import urlopen
from bs4 import BeautifulSoup
from bs4.element import Tag
comp_name = 'Apple'
url = 'https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all'
res = requests.get(url.format(1))
soup = BeautifulSoup(res.text,"lxml")
for item in soup.find_all("h3",{"class":"search-result-title"}):
s = str(item)
article_addr = s.partition('a href="')[2].partition('">')[0]
headline = s.partition('a href="')[2].partition('">')[2].partition('</a></h3>')[0]
article_link = 'https://www.reuters.com' + article_addr
try:
resp = requests.get(article_addr)
except Exception as e:
try:
resp = requests.get(article_link)
except Exception as e:
continue
sauce = BeautifulSoup(resp.text,"lxml")
dateTag = sauce.find("div",{"class":"ArticleHeader_date"})
contentTag = sauce.find("div",{"class":"StandardArticleBody_body"})
date = None
title = None
content = None
if isinstance(dateTag,Tag):
date = dateTag.get_text().partition('/')[0]
if isinstance(contentTag,Tag):
content = contentTag.get_text().strip()
time.sleep(3)
link_soup = BeautifulSoup(content)
sentences = link_soup.findAll("p")
print(date, headline, article_link)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import time
browser = webdriver.Safari()
browser.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
try:
element = WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.ID,'Id_Of_Element')))
except TimeoutException:
print("Time out!")
(Note: I've fixed the typo in the import statement for Keys
and added necessary imports for the Selenium code.)
英文:
I was trying to do webscraping on Reuters for nlp analysis and most of it is working, but I am unable to get the code to click the "load more" button for more news articles. Below is the code currently being used:
import csv
import time
import pprint
from datetime import datetime, timedelta
import requests
import nltk
nltk.download('vader_lexicon')
from urllib.request import urlopen
from bs4 import BeautifulSoup
from bs4.element import Tag
comp_name = 'Apple'
url = 'https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all'
res = requests.get(url.format(1))
soup = BeautifulSoup(res.text,"lxml")
for item in soup.find_all("h3",{"class":"search-result-title"}):
s = str(item)
article_addr = s.partition('a href="')[2].partition('">')[0]
headline = s.partition('a href="')[2].partition('">')[2].partition('</a></h3>')[0]
article_link = 'https://www.reuters.com' + article_addr
try:
resp = requests.get(article_addr)
except Exception as e:
try:
resp = requests.get(article_link)
except Exception as e:
continue
sauce = BeautifulSoup(resp.text,"lxml")
dateTag = sauce.find("div",{"class":"ArticleHeader_date"})
contentTag = sauce.find("div",{"class":"StandardArticleBody_body"})
date = None
title = None
content = None
if isinstance(dateTag,Tag):
date = dateTag.get_text().partition('/')[0]
if isinstance(contentTag,Tag):
content = contentTag.get_text().strip()
time.sleep(3)
link_soup = BeautifulSoup(content)
sentences = link_soup.findAll("p")
print(date, headline, article_link)
from selenium import webdriver
from selenium.webdriver.common.keys import keys
import time
browser = webdriver.Safari()
browser.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
try:
element = WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.ID,'Id_Of_Element')))
except TimeoutException:
print("Time out!")
答案1
得分: 3
要点击文本为 LOAD MORE RESULTS 的元素，您需要使用 WebDriverWait 等待 element_to_be_clickable()，并且可以使用以下定位策略：
- 代码块:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
comp_name = 'Apple'
driver.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
while True:
try:
driver.execute_script("return arguments[0].scrollIntoView(true);", WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='search-result-more-txt']"))))
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-result-more-txt']"))).click()
print("LOAD MORE RESULTS 按钮已点击")
except TimeoutException:
print("没有更多LOAD MORE RESULTS按钮可点击")
break
driver.quit()
- 控制台输出:
LOAD MORE RESULTS 按钮已点击
LOAD MORE RESULTS 按钮已点击
LOAD MORE RESULTS 按钮已点击
.
.
没有更多LOAD MORE RESULTS按钮可点击
参考
您可以在以下链接中找到相关的详细讨论:
英文:
To click the element with text as LOAD MORE RESULTS you need to induce WebDriverWait for the element_to_be_clickable()
and you can use the following Locator Strategies:
-
Code Block:
from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC options = webdriver.ChromeOptions() options.add_argument("start-maximized") options.add_experimental_option("excludeSwitches", ["enable-automation"]) options.add_experimental_option('useAutomationExtension', False) driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe') comp_name = 'Apple' driver.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all') while True: try: driver.execute_script("return arguments[0].scrollIntoView(true);", WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='search-result-more-txt']")))) WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-result-more-txt']"))).click() print("LOAD MORE RESULTS button clicked") except TimeoutException: print("No more LOAD MORE RESULTS button to be clicked") break driver.quit()
-
Console Output:
LOAD MORE RESULTS button clicked LOAD MORE RESULTS button clicked LOAD MORE RESULTS button clicked . . No more LOAD MORE RESULTS button to be clicked
Reference
You can find a relevant detailed discussion in:
答案2
得分: 0
以下是翻译后的内容：
要点击“LOAD MORE RESULTS”按钮,请使用`WebDriverWait()`和`element_to_be_clickable()`函数。
使用while循环并检查计数器小于11以点击10次。
我已在Chrome上进行了测试,因为我没有Safari浏览器,但它也应该可以工作。
```python
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
comp_name = "Apple"
browser = webdriver.Chrome()
browser.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
# 接受条款按钮
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button#_evidon-banner-acceptbutton"))).click()
i = 1
while i < 11:
try:
element = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-result-more-txt' and text()='LOAD MORE RESULTS']")))
element.location_once_scrolled_into_view
browser.execute_script("arguments[0].click();", element)
print(i)
i = i + 1
except TimeoutException:
print("超时!")
英文:
To click on LOAD MORE RESULTS
induce WebDriverWait
() and element_to_be_clickable
()
Use while loop and check the counter<11 to click on 10 times.
I have tested on Chrome since I don't have safari browser however it should work too.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
comp_name="Apple"
browser = webdriver.Chrome()
browser.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
#Accept the trems button
WebDriverWait(browser,10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,"button#_evidon-banner-acceptbutton"))).click()
i=1
while i<11:
try:
element = WebDriverWait(browser,10).until(EC.element_to_be_clickable((By.XPATH,"//div[@class='search-result-more-txt' and text()='LOAD MORE RESULTS']")))
element.location_once_scrolled_into_view
browser.execute_script("arguments[0].click();", element)
print(i)
i=i+1
except TimeoutException:
print("Time out!")
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论