rvest: Adapt webscraping function to continue if it comes across no data
Question
The following function works well for scraping the Begin: and End: dates for different datasets from this dataset repository. However, not every dataset's metadata includes Begin: and End:, which causes the function to stop and throw an error message. Can the function be adapted to skip to the next URL (package_id) rather than stop?
library(tidyverse)
library(rvest)
library(janitor)
# Function for scraping begin and end dates from different metadata URLs
scraper <- function(package_id) {
  cat("Scraping", package_id, "\n")
  data <- str_c("https://portal.edirepository.org/nis/metadataviewer?packageid=",
                package_id) %>%
    read_html() %>%
    html_elements(".subgroup.onehundred_percent") %>%
    pluck(1) %>%
    html_elements(".roweven") %>%
    html_text2()
  tibble(begin = pluck(data, 1),
         end = pluck(data, 1))
}
# Grabs package_id
good_page <-
"http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=3" %>%
read_html()
bad_page <-
"http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=40&rows=4" %>%
read_html()
# Works as it should, returns publication_date, package_id, begin, and end
dat1 <- good_page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
  mutate(date = map(package_id, scraper)) %>%
  unnest(date) %>%
  select(!c(title, creators))
dat1
# Breaks because knb-lter-kbs.141.1 does not include begin or end dates
# Can the function above be amended to skip to the next package_id rather than stop?
dat2 <- bad_page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
  mutate(date = map(package_id, scraper)) %>%
  unnest(date) %>%
  select(!c(title, creators))
Answer 1
Score: 2
Using the purrr::possibly() function operator, you can define a version of your function that returns NA values when an error occurs:
library(tidyverse)
library(rvest)
library(janitor)
safe_scraper <- possibly(
  scraper,
  otherwise = tibble(begin = NA_character_, end = NA_character_)
)
bad_page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(
    title = str_squish(str_remove_all(title, "\\n")),
    date = map(package_id, safe_scraper)
  ) %>%
  unnest(date) %>%
  select(!c(title, creators))
# A tibble: 4 × 4
publication_date package_id begin end
<int> <chr> <chr> <chr>
1 2018 knb-lter-kbs.139.5 2008-01-01 2008-01-01
2 2018 knb-lter-kbs.140.5 NA NA
3 2016 knb-lter-kbs.141.1 Information Manager (LTER Network Office) [ email ] Information Manager (LTER Network Office) [ …
4 2018 knb-lter-kbs.162.79 1950-01-01 1950-01-01
Judging by row 3, though, it looks like there may be another issue in your workflow.
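That stray contact text appears because the first .subgroup.onehundred_percent block on that page is not the temporal-coverage section. A minimal sketch of one way to guard against it, assuming the begin and end values are the first two ISO-formatted dates in the block (scraper_checked, the date regex, and the NA fallbacks are illustrative assumptions, not part of the original function):
library(tidyverse)
library(rvest)
# Hypothetical variant of scraper(): keep only strings that contain an
# ISO-style date so contact rows are dropped, and return NA when a
# begin or end date is simply missing.
scraper_checked <- function(package_id) {
  cat("Scraping", package_id, "\n")
  data <- str_c("https://portal.edirepository.org/nis/metadataviewer?packageid=",
                package_id) %>%
    read_html() %>%
    html_elements(".subgroup.onehundred_percent") %>%
    pluck(1) %>%
    html_elements(".roweven") %>%
    html_text2() %>%
    str_extract("\\d{4}-\\d{2}-\\d{2}") %>%  # pull out just the date, if any
    discard(is.na)                           # drop rows that had no date
  tibble(begin = pluck(data, 1, .default = NA_character_),
         end   = pluck(data, 2, .default = NA_character_))
}
With a guard like this, the possibly() wrapper still catches genuine request failures, while a page without temporal coverage simply yields NA columns instead of stray text.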
Answer 2
Score: 1
Changed from the HTML approach to XML for more consistent data extraction:
library(httr2)
library(rvest)
library(xml2)
library(tidyverse)
library(janitor)
bad_page <-
"http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=40&rows=4" %>%
read_html()
scraper <- function(package_id) {
  cat("Scraping", package_id, "\n")
  data <- str_c(
    "https://portal.edirepository.org/nis/metadataviewer?packageid=",
    package_id,
    "&contentType=application/xml"
  ) %>%
    request() %>%
    req_perform() %>%
    resp_body_xml() %>%
    xml_find_all(".//calendarDate") %>%
    xml_text()
  tibble(begin = pluck(data, 1),
         end = pluck(data, 1))
}
data <- bad_page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
  mutate(date = map(package_id, possibly(
    scraper, otherwise = tibble(begin = NA, end = NA)
  ))) %>%
  unnest(date, keep_empty = TRUE) %>%
  select(!c(title, creators))
The result:
# A tibble: 4 × 4
publication_date package_id begin end
<int> <chr> <chr> <chr>
1 2018 knb-lter-kbs.139.5 2008-01-01 2008-01-01
2 2018 knb-lter-kbs.140.5 NA NA
3 2016 knb-lter-kbs.141.1 NA NA
4 2018 knb-lter-kbs.162.79 1950-01-01 1950-01-01
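If the NA rows need follow-up, a quick check along these lines lists them (column names taken from the output above; the filter itself is just an illustration):
# Packages whose metadata yielded no begin/end dates (or whose scrape failed)
data %>%
  filter(is.na(begin) | is.na(end)) %>%
  pull(package_id)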