使用Selenium进行网页爬取时加载更多

huangapple go评论132阅读模式
英文:

Load More using Selenium on Webscraping

问题

I was trying to do web scraping on Reuters for NLP analysis, and most of it is working, but I am unable to get the code to click the "load more" button to fetch more news articles. Below is the code currently being used:

import csv
import time
import pprint
from datetime import datetime, timedelta
import requests
import nltk
nltk.download('vader_lexicon')
from urllib.request import urlopen
from bs4 import BeautifulSoup
from bs4.element import Tag

comp_name = 'Apple'
url = 'https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all'

# Download the first page of search results and visit every article it lists.
# The search URL contains no "{}" placeholder, so it is requested as-is.
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.find_all("h3", {"class": "search-result-title"}):
    # Read the link from the parsed <a> tag instead of slicing the tag's
    # string form, which breaks as soon as the anchor carries extra attributes.
    anchor = item.find("a")
    if anchor is None or not anchor.get("href"):
        continue
    article_addr = anchor["href"]
    headline = anchor.get_text(strip=True)
    # Only site-relative hrefs need the host prefixed; absolute ones are used as-is.
    if article_addr.startswith("http"):
        article_link = article_addr
    else:
        article_link = 'https://www.reuters.com' + article_addr

    try:
        resp = requests.get(article_link)
    except requests.RequestException:
        # Best effort: skip articles that cannot be downloaded.
        continue

    sauce = BeautifulSoup(resp.text, "lxml")
    dateTag = sauce.find("div", {"class": "ArticleHeader_date"})
    contentTag = sauce.find("div", {"class": "StandardArticleBody_body"})

    date = None
    content = None
    sentences = []

    if isinstance(dateTag, Tag):
        date = dateTag.get_text().partition('/')[0]
    if isinstance(contentTag, Tag):
        content = contentTag.get_text().strip()
        # Collect the paragraphs from the parsed body; get_text() has already
        # dropped the <p> tags, so re-parsing `content` would find none.
        sentences = contentTag.find_all("p")
    # Pause between article requests so the site is not hit in a tight loop.
    time.sleep(3)
    print(date, headline, article_link)

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time

# Open the Reuters search page in a real browser so that its JavaScript runs.
browser = webdriver.Safari()
browser.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
try:
    # 'Id_Of_Element' is a placeholder: replace it with the id of the element to wait for.
    element = WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.ID, 'Id_Of_Element')))
except TimeoutException:
    print("Time out!")

(Note: I've fixed the typo in the import statement for Keys and added necessary imports for the Selenium code.)

英文:

I was trying to do web scraping on Reuters for NLP analysis, and most of it is working, but I am unable to get the code to click the "load more" button to fetch more news articles. Below is the code currently being used:

import csv
import time
import pprint
from datetime import datetime, timedelta
import requests
import nltk
nltk.download('vader_lexicon')
from urllib.request import urlopen
from bs4 import BeautifulSoup
from bs4.element import Tag

comp_name = 'Apple'
url = 'https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all'

# Download the first page of search results and visit every article it lists.
# The search URL contains no "{}" placeholder, so it is requested as-is.
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.find_all("h3", {"class": "search-result-title"}):
    # Read the link from the parsed <a> tag instead of slicing the tag's
    # string form, which breaks as soon as the anchor carries extra attributes.
    anchor = item.find("a")
    if anchor is None or not anchor.get("href"):
        continue
    article_addr = anchor["href"]
    headline = anchor.get_text(strip=True)
    # Only site-relative hrefs need the host prefixed; absolute ones are used as-is.
    if article_addr.startswith("http"):
        article_link = article_addr
    else:
        article_link = 'https://www.reuters.com' + article_addr

    try:
        resp = requests.get(article_link)
    except requests.RequestException:
        # Best effort: skip articles that cannot be downloaded.
        continue

    sauce = BeautifulSoup(resp.text, "lxml")
    dateTag = sauce.find("div", {"class": "ArticleHeader_date"})
    contentTag = sauce.find("div", {"class": "StandardArticleBody_body"})

    date = None
    content = None
    sentences = []

    if isinstance(dateTag, Tag):
        date = dateTag.get_text().partition('/')[0]
    if isinstance(contentTag, Tag):
        content = contentTag.get_text().strip()
        # Collect the paragraphs from the parsed body; get_text() has already
        # dropped the <p> tags, so re-parsing `content` would find none.
        sentences = contentTag.find_all("p")
    # Pause between article requests so the site is not hit in a tight loop.
    time.sleep(3)
    print(date, headline, article_link)

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time

# Open the Reuters search page in a real browser so that its JavaScript runs.
browser = webdriver.Safari()
browser.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
try:
    # 'Id_Of_Element' is a placeholder: replace it with the id of the element to wait for.
    element = WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.ID, 'Id_Of_Element')))
except TimeoutException:
    print("Time out!")

答案1

得分: 3

要点击文本为LOAD MORE RESULTS的元素,您需要使用 WebDriverWait 来等待 element_to_be_clickable(),并且您可以使用以下 定位策略:

  • 代码块:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Locator of the "LOAD MORE RESULTS" control on the search page.
LOAD_MORE = (By.XPATH, "//div[@class='search-result-more-txt']")

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# NOTE(review): `executable_path` is the Selenium 3 way of pointing at chromedriver;
# newer Selenium releases expect a Service object instead - confirm the installed version.
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')

comp_name = 'Apple'
driver.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
try:
    # Keep clicking until the button stops showing up; the wait then times out.
    while True:
        try:
            button = WebDriverWait(driver, 5).until(EC.visibility_of_element_located(LOAD_MORE))
            driver.execute_script("return arguments[0].scrollIntoView(true);", button)
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable(LOAD_MORE)).click()
            print("LOAD MORE RESULTS 按钮已点击")
        except TimeoutException:
            print("没有更多LOAD MORE RESULTS按钮可点击")
            break
finally:
    # Always close the browser, even if an unexpected error interrupts the loop.
    driver.quit()
  • 控制台输出:
LOAD MORE RESULTS 按钮已点击
LOAD MORE RESULTS 按钮已点击
LOAD MORE RESULTS 按钮已点击
.
.
没有更多LOAD MORE RESULTS按钮可点击

参考

您可以在以下链接中找到相关的详细讨论:

英文:

To click the element with text as LOAD MORE RESULTS you need to induce WebDriverWait for the element_to_be_clickable() and you can use the following Locator Strategies:

  • Code Block:

      from selenium import webdriver
      from selenium.common.exceptions import TimeoutException
      from selenium.webdriver.support.ui import WebDriverWait
      from selenium.webdriver.common.by import By
      from selenium.webdriver.support import expected_conditions as EC

      # Locator of the "LOAD MORE RESULTS" control on the search page.
      LOAD_MORE = (By.XPATH, "//div[@class='search-result-more-txt']")

      options = webdriver.ChromeOptions()
      options.add_argument("start-maximized")
      options.add_experimental_option("excludeSwitches", ["enable-automation"])
      options.add_experimental_option('useAutomationExtension', False)
      # NOTE(review): `executable_path` is the Selenium 3 way of pointing at chromedriver;
      # newer Selenium releases expect a Service object instead - confirm the installed version.
      driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')

      comp_name = 'Apple'
      driver.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')
      try:
          # Keep clicking until the button stops showing up; the wait then times out.
          while True:
              try:
                  button = WebDriverWait(driver, 5).until(EC.visibility_of_element_located(LOAD_MORE))
                  driver.execute_script("return arguments[0].scrollIntoView(true);", button)
                  WebDriverWait(driver, 20).until(EC.element_to_be_clickable(LOAD_MORE)).click()
                  print("LOAD MORE RESULTS button clicked")
              except TimeoutException:
                  print("No more LOAD MORE RESULTS button to be clicked")
                  break
      finally:
          # Always close the browser, even if an unexpected error interrupts the loop.
          driver.quit()
    
  • Console Output:

      LOAD MORE RESULTS button clicked
      LOAD MORE RESULTS button clicked
      LOAD MORE RESULTS button clicked
      .
      .
      No more LOAD MORE RESULTS button to be clicked
    

Reference

You can find a relevant detailed discussion in:

答案2

得分: 0

Sure, here is the translated code portion:

要点击 LOAD MORE RESULTS 按钮,请使用 `WebDriverWait()` 和 `element_to_be_clickable()`。

使用 while 循环并检查计数器小于 11,以便点击 10 次。

我已在 Chrome 上进行了测试,因为我没有 Safari 浏览器,但它在 Safari 上也应该可以工作。

```python
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

comp_name = "Apple"
browser = webdriver.Chrome()
browser.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')

# Accept the terms button; carry on if the banner never becomes clickable.
try:
    WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button#_evidon-banner-acceptbutton"))).click()
except TimeoutException:
    pass

# Click "LOAD MORE RESULTS" up to 10 times.
i = 1
while i < 11:
    try:
        element = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-result-more-txt' and text()='LOAD MORE RESULTS']")))
        # Reading this property scrolls the element into view as a side effect.
        element.location_once_scrolled_into_view
        # Click through JavaScript rather than through WebElement.click().
        browser.execute_script("arguments[0].click();", element)
        print(i)
        i = i + 1
    except TimeoutException:
        print("超时!")
        # The button did not become clickable in time; stop, because the
        # counter would otherwise never advance and the loop would not end.
        break
```

英文:

To click on LOAD MORE RESULTS, use WebDriverWait() together with element_to_be_clickable().

Use a while loop and check that the counter is less than 11 (`i < 11`) to click the button 10 times.

I have tested this on Chrome since I don't have the Safari browser; however, it should work there too.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

comp_name = "Apple"
browser = webdriver.Chrome()
browser.get('https://www.reuters.com/search/news?blob=' + comp_name + '&sortBy=date&dateRange=all')

# Accept the terms button; carry on if the banner never becomes clickable.
try:
    WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button#_evidon-banner-acceptbutton"))).click()
except TimeoutException:
    pass

# Click "LOAD MORE RESULTS" up to 10 times.
i = 1
while i < 11:
    try:
        element = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-result-more-txt' and text()='LOAD MORE RESULTS']")))
        # Reading this property scrolls the element into view as a side effect.
        element.location_once_scrolled_into_view
        # Click through JavaScript rather than through WebElement.click().
        browser.execute_script("arguments[0].click();", element)
        print(i)
        i = i + 1
    except TimeoutException:
        print("Time out!")
        # The button did not become clickable in time; stop, because the
        # counter would otherwise never advance and the loop would not end.
        break

huangapple
  • 本文由 发表于 2020年1月7日 01:17:33
  • 转载请务必保留本文链接:https://go.coder-hub.com/59616309.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定