Code returns empty DataFrame, problem in understanding logic

Question
This code is meant to crawl stock data from a website named cafef. The input is the website link and elements from that website's HTML, and the expected output is a table of stock data including Date, Price, and Volume. However, the code doesn't work: it returns an empty DataFrame. I don't understand the second try/except block, so I cannot debug this code. Could someone please explain it to me?
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.webdriver.support.ui import Select

def crawl(stock):
    date = []
    price = []
    volume = []
    close = []
    stock_id = []
    browser = webdriver.Chrome(executable_path="./chromedriver")
    web = browser.get("https://s.cafef.vn/Lich-su-giao-dich-" + stock + "-1.chn")
    sleep(5)
    for count in range(60):
        try:
            date_data = browser.find_elements("Item_DateItem")
            for row in date_data:
                date.append(row.text)
                print(row.text())
            date_data.clear()
            price_data = browser.find_elements_by_class_name("Item_Price1")
            for row in price_data:
                price.append(row.text)
            price_data.clear()
        except:
            break
        try:
            if count == 0:
                next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[21]/a")
            else:
                try:
                    next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[22]/a")
                except:
                    next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[23]/a")
            next_page.click()
            sleep(5)
        except:
            break
    for i in range(int(len(price) / 10)):
        close.append(price[10 * i + 1].replace(",", ""))
        volume.append(price[10 * i + 2].replace(",", ""))
    for i in range(len(date)):
        stock_id.append(stock)
    d = {'Stock': stock_id, 'Date': date, 'Close': close, 'Volume': volume}
    df = pd.DataFrame(data=d)
    df.to_csv(stock + ".csv", index=False)
    return df

print(crawl('ABC'))
I tried to find the XPath element but couldn't find it.
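As an editorial aside on the two loops at the end of crawl() (not part of the question itself): they assume that every table row yields ten Item_Price1 cells, with the close price at offset 1 and the volume at offset 2 of each group of ten. A tiny runnable illustration of that indexing, using made-up cell values rather than real cafef data:

# Made-up flattened cell texts: two rows of ten price cells each.
price = [f"r{r}c{c}" for r in range(2) for c in range(10)]

close, volume = [], []
for i in range(len(price) // 10):
    close.append(price[10 * i + 1])   # offset 1 of each group of ten
    volume.append(price[10 * i + 2])  # offset 2 of each group of ten

print(close)   # ['r0c1', 'r1c1']
print(volume)  # ['r0c2', 'r1c2']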
Answer 1
Score: 0
There are a couple of minor issues; here is the list:
- By needs to be imported from selenium.webdriver.common.by, as the logic makes use of it.
- Item_DateItem and Item_Price1 are class attribute values, so we need to use either By.CLASS_NAME or By.CSS_SELECTOR.
- There is one usage of row.text(). But text is a property, not a method, so it raises an exception; the loop breaks on that exception, resulting in an empty DataFrame. I would suggest logging the exceptions here instead of catching them and silently breaking the loop, to get to the root cause quickly (see the sketch after this list).
- You are using an absolute XPath to find the next-page button. It is better to find a stable selector that does not change frequently. Note that the second try/except you asked about does nothing more than locate and click the next-page link (trying the different pager cells) and break out of the loop when no such link is found.
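A minimal, self-contained sketch of that logging suggestion; scrape_page is a hypothetical stand-in for the real Selenium scraping body, and the raised error mimics the row.text() bug:

import logging
import traceback

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("crawler")

def scrape_page(page):
    # Hypothetical stand-in for the real scraping body; raises on page 2
    # to mimic calling row.text() when text is a string property.
    if page == 2:
        raise TypeError("'str' object is not callable")
    return [f"row-{page}"]

rows = []
for page in range(1, 4):
    try:
        rows.extend(scrape_page(page))
    except Exception:
        # Log the full traceback instead of a bare `except: break`,
        # so the root cause is visible instead of a silently empty DataFrame.
        log.error("scraping failed on page %d:\n%s", page, traceback.format_exc())
        break
print(rows)  # ['row-1'] -- and the log shows exactly why page 2 failed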
So, after making all the changes, the code looks like this:
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By  # By is used in the code but was not imported

def crawl(stock):
    date = []
    price = []
    volume = []
    close = []
    stock_id = []
    browser = webdriver.Chrome(executable_path="./chromedriver")
    web = browser.get("https://s.cafef.vn/Lich-su-giao-dich-" + stock + "-1.chn")
    sleep(5)
    for count in range(60):
        try:
            date_data = browser.find_elements(By.CSS_SELECTOR, ".Item_DateItem")  # Item_DateItem is a css class, so we have to use "By" accordingly
            for row in date_data:
                date.append(row.text)
                print(row.text)  # text is a property
            date_data.clear()
            price_data = browser.find_elements(By.CSS_SELECTOR, ".Item_Price1")  # Item_Price1 is a css class, so we have to use "By" accordingly
            for row in price_data:
                price.append(row.text)
            price_data.clear()
        except:
            break
        try:
            next_page = browser.find_element(By.CSS_SELECTOR, ".CafeF_Paging td:last-child a")  # better to use a stable selector than an absolute XPath, which is more likely to change
            next_page.click()
            sleep(5)
        except:
            break
    for i in range(int(len(price) / 10)):
        close.append(price[10 * i + 1].replace(",", ""))
        volume.append(price[10 * i + 2].replace(",", ""))
    for i in range(len(date)):
        stock_id.append(stock)
    d = {'Stock': stock_id, 'Date': date, 'Close': close, 'Volume': volume}
    df = pd.DataFrame(data=d)
    df.to_csv(stock + ".csv", index=False)
    return df

print(crawl('ABC'))
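One further note, not part of the original answer: the fixed sleep(5) calls can be replaced with an explicit wait, which returns as soon as the elements are present. A minimal sketch, assuming the same .Item_DateItem cells and the browser instance created in crawl() above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_dates(browser, timeout=10):
    # Wait up to `timeout` seconds for the date cells instead of a fixed sleep(5).
    # `browser` is the webdriver instance created in crawl() above.
    return WebDriverWait(browser, timeout).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".Item_DateItem"))
    )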