rvest: Adapt webscraping function to continue if it comes across no data

Question

The following function works well for scraping the Begin: and End: dates for different datasets from this dataset repository. However, not every dataset's metadata includes Begin: and End: which causes the function to stop and throw an error message. Can the function be adapted to skip to the next URL (package_id) rather than stop?

    library(tidyverse)
    library(rvest)
    library(janitor)

    # Function for scraping begin and end dates from different metadata URLs
    scraper <- function(package_id) {
      cat("Scraping", package_id, "\n")
      data <- str_c("https://portal.edirepository.org/nis/metadataviewer?packageid=",
                    package_id) %>%
        read_html() %>%
        html_elements(".subgroup.onehundred_percent") %>%
        pluck(1) %>%
        html_elements(".roweven") %>%
        html_text2()
      # NOTE: both columns pluck element 1 here; pluck(data, 2) is presumably
      # intended for `end`
      tibble(begin = pluck(data, 1),
             end   = pluck(data, 1))
    }

    # Grabs package_id
    good_page <-
      "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=3" %>%
      read_html()

    bad_page <-
      "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=40&rows=4" %>%
      read_html()

    # Works as it should, returns publication_date, package_id, begin, and end
    dat1 <- good_page %>%
      html_table() %>%
      pluck(4) %>%
      clean_names() %>%
      mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
      mutate(date = map(package_id, scraper)) %>%
      unnest(date) %>%
      select(!c(title, creators))
    dat1

    # Breaks because knb-lter-kbs.141.1 does not include begin or end dates
    # Can the function above be amended to skip to the next package_id
    # rather than stop?
    dat2 <- bad_page %>%
      html_table() %>%
      pluck(4) %>%
      clean_names() %>%
      mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
      mutate(date = map(package_id, scraper)) %>%
      unnest(date) %>%
      select(!c(title, creators))

Answer 1

Score: 2


Using the purrr::possibly() function operator, you can define a version of your function that returns NA values when an error occurs:

    library(tidyverse)
    library(rvest)
    library(janitor)

    safe_scraper <- possibly(
      scraper,
      otherwise = tibble(begin = NA_character_, end = NA_character_)
    )

    bad_page %>%
      html_table() %>%
      pluck(4) %>%
      clean_names() %>%
      mutate(
        title = str_squish(str_remove_all(title, "\\n")),
        date = map(package_id, safe_scraper)
      ) %>%
      unnest(date) %>%
      select(!c(title, creators))
    # A tibble: 4 × 4
      publication_date package_id          begin                                                end
                 <int> <chr>               <chr>                                                <chr>
    1             2018 knb-lter-kbs.139.5  2008-01-01                                           2008-01-01
    2             2018 knb-lter-kbs.140.5  NA                                                   NA
    3             2016 knb-lter-kbs.141.1  Information Manager (LTER Network Office) [ email ]  Information Manager (LTER Network Office) [
    4             2018 knb-lter-kbs.162.79 1950-01-01                                           1950-01-01

Judging by row 3, though, it looks like there may be another issue in your workflow: for knb-lter-kbs.141.1 the first .roweven element that gets scraped is contact information, not a date.
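
One way to guard against that is to keep only the scraped strings that actually look like dates before building the tibble. A minimal sketch, not part of the original answer: the function name scraper_dates is hypothetical, and the YYYY-MM-DD pattern is an assumption about how the portal prints its dates.

    # Variant of scraper() that discards non-date strings before plucking.
    # Assumes dates appear as YYYY-MM-DD in the .roweven text (an assumption).
    scraper_dates <- function(package_id) {
      cat("Scraping", package_id, "\n")
      data <- str_c("https://portal.edirepository.org/nis/metadataviewer?packageid=",
                    package_id) %>%
        read_html() %>%
        html_elements(".subgroup.onehundred_percent") %>%
        pluck(1) %>%
        html_elements(".roweven") %>%
        html_text2() %>%
        str_subset("\\d{4}-\\d{2}-\\d{2}")  # keep only date-like entries
      tibble(begin = pluck(data, 1),
             end   = pluck(data, 2))  # second date presumably the end date
    }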

Answer 2

Score: 1


Changed from the HTML approach to XML for more consistent data extraction: the same metadata page can be requested as XML via the contentType parameter, and the begin and end dates then sit in calendarDate nodes.

    library(httr2)
    library(rvest)  # for read_html()/html_table()
    library(xml2)   # for xml_find_all()/xml_text()
    library(tidyverse)
    library(janitor)

    bad_page <-
      "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=40&rows=4" %>%
      read_html()

    scraper <- function(package_id) {
      cat("Scraping", package_id, "\n")
      data <- str_c(
        "https://portal.edirepository.org/nis/metadataviewer?packageid=",
        package_id,
        "&contentType=application/xml"
      ) %>%
        request() %>%
        req_perform() %>%
        resp_body_xml() %>%
        xml_find_all(".//calendarDate") %>%
        xml_text()
      # As in the question, both columns take element 1; pluck(data, 2) is
      # presumably intended for `end`
      tibble(begin = pluck(data, 1),
             end   = pluck(data, 1))
    }

    data <- bad_page %>%
      html_table() %>%
      pluck(4) %>%
      clean_names() %>%
      mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
      mutate(date = map(package_id, possibly(
        scraper, otherwise = tibble(begin = NA, end = NA)
      ))) %>%
      unnest(date, keep_empty = TRUE) %>%
      select(!c(title, creators))

The result:

    # A tibble: 4 × 4
      publication_date package_id          begin      end
                 <int> <chr>               <chr>      <chr>
    1             2018 knb-lter-kbs.139.5  2008-01-01 2008-01-01
    2             2018 knb-lter-kbs.140.5  NA         NA
    3             2016 knb-lter-kbs.141.1  NA         NA
    4             2018 knb-lter-kbs.162.79 1950-01-01 1950-01-01
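
Both answers use possibly(); if you would rather not create a wrapped copy of the function, base R's tryCatch() achieves the same skip-on-error behaviour. A minimal sketch, not from either answer; the name safe_scraper2 is hypothetical:

    # tryCatch() alternative to possibly(): return an all-NA tibble on any
    # error so the failing package_id is kept as a row of NAs after unnest()
    safe_scraper2 <- function(package_id) {
      tryCatch(
        scraper(package_id),
        error = function(e) tibble(begin = NA_character_, end = NA_character_)
      )
    }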
