rvest: Error in UseMethod("xml_find_all") : no applicable method for 'xml_find_all' applied to an object of class "list"
Question
The Environmental Data Initiative (EDI) is a repository for datasets from several locations. I would like to scrape the beginning and end dates of each dataset from a single location (see example link here).
- Each dataset for the one location contains a link to a metadata URL that lists the start and end date of the dataset (see example link here).
My code below is attempting to use a for-loop to extract the unique ID for each dataset (i.e., Package Id), which then gets used to create the metadata page URL for each Package Id.
However, my for-loop throws an error as it attempts to scrape the begin date from each of the metadata pages.
- The error:
Error in UseMethod("xml_find_all") : no applicable method for 'xml_find_all' applied to an object of class "list"
How can I adapt my for-loop to extract the begin and end date of each Package Id?
library(rvest)
library(xml2)
library(dplyr)
library(purrr)
url <- "https://portal.edirepository.org/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=150"
webpage <- read_html(url)
# Initialize vectors to store the data
package_ids <- character()
time_periods_begin <- character()
time_periods_end <- character()
# Extract the Package Id
package_ids <- webpage %>%
  html_table() %>%
  .[[4]] %>%
  select(`Package Id ▵▿`) %>%
  rename(PackageId = `Package Id ▵▿`)
# Iterate over each PackageId row
for (i in 1:length(package_ids$PackageId)) {
  # Construct the URL for the "View Full Metadata" page
  package_id_link <- paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", package_ids$PackageId)
  # Navigate to the "View Full Metadata" page
  metadata_page <- map(package_id_link, read_html)
  # Extract the Begin and End (this is where the error lives)
  time_period_begin <- html_nodes(metadata_page, "tr:contains('Begin') td:nth-child(2)") %>%
    html_text() %>%
    trimws()
  time_periods_begin <- c(time_periods_begin, time_period_begin)
  time_period_end <- html_nodes(metadata_page, "tr:contains('End') td:nth-child(2)") %>%
    html_text() %>%
    trimws()
  time_periods_end <- c(time_periods_end, time_period_end)
}
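For reference, the error arises because map() always returns a list, even for a single URL, while html_nodes() expects an xml_document rather than a list. A minimal sketch of a fix inside the loop (an editor's note, not part of the original question) would build the URL for the current row only and parse it directly:

# Build the URL for row i only (note the [i]); read_html() returns an
# xml_document, not a list, so html_nodes() can be applied to it
package_id_link <- paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=",
                          package_ids$PackageId[i])
metadata_page <- read_html(package_id_link)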
The output should look like this:
# Create a data frame with Package Id, Begin, and End
data_frame <- data.frame(PackageId = package_id,
                         Begin = time_periods_begin,
                         End = time_periods_end)
data_frame
PackageId Begin End
1 knb-lter-and.2719.6 1971-06-01 2002-03-11
2 knb-lter-and.2720.8 1958-01-01 1979-01-01
3 knb-lter-and.2721.6 1975-01-01 1995-01-01
Update 1
I can get the PackageID, Begin, and End for a single dataset. In the code above, I can get each dataset's metadata URL. Now I just need to figure out how to extract the PackageID, Begin, and End for each of those 147 metadata URLs.
url <- "https://portal.edirepository.org/nis/metadataviewer?packageid=knb-lter-and.4525.10"
webpage <- read_html(url)
package_id <- html_text(html_nodes(webpage, "td.rowodd + td.roweven")[1])
# Extract the Begin value
time_periods_begin <- html_text(html_nodes(webpage, "td:contains('Begin:') + td")[1])
# Extract the End value
time_periods_end <- html_text(html_nodes(webpage, "td:contains('End:') + td")[1])
data_frame <- data.frame(PackageId = package_id,
                         Begin = time_periods_begin,
                         End = time_periods_end)
data_frame
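One way to extend this single-page extraction to all of the metadata URLs is to wrap it in a helper and iterate over package_ids$PackageId from the code above. A sketch (get_dates is a hypothetical helper name, using the same selectors as the snippet above):

# Hypothetical helper: extract Package Id, Begin, and End from one metadata page
get_dates <- function(pid) {
  page <- read_html(paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", pid))
  data.frame(PackageId = html_text(html_nodes(page, "td.rowodd + td.roweven")[1]),
             Begin = html_text(html_nodes(page, "td:contains('Begin:') + td")[1]),
             End = html_text(html_nodes(page, "td:contains('End:') + td")[1]))
}

# Apply the helper to every Package Id and bind the per-page rows together
data_frame <- do.call(rbind, lapply(package_ids$PackageId, get_dates))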
Answer 1
Score: 1
library(tidyverse)
library(rvest)
library(janitor)

page <-
  "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=150" %>%
  read_html()

scraper <- function(package_id) {
  cat("Scraping", package_id, "\n")
  data <- str_c("https://portal.edirepository.org/nis/metadataviewer?packageid=",
                package_id) %>%
    read_html() %>%
    html_elements(".subgroup.onehundred_percent") %>%
    pluck(1) %>%
    html_elements(".roweven") %>%
    html_text2()
  tibble(begin = pluck(data, 1),
         end = pluck(data, 2))
}

data <- page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
  mutate(date = map(package_id, scraper)) %>%
  unnest(date)
title creators publication_date package_id begin end
<chr> <chr> <int> <chr> <chr> <chr>
1 Invertebrates of the Andrews Experimental Forest: An annotated list of insects and other arthropods, 1971… Andrews… 2014 knb-lter-… 1971… 2002…
2 Vascular plant list on the Andrews Experimental Forest and nearby Research Natural Areas, 1958 to 1979 Andrews… 2014 knb-lter-… 1958… 1979…
3 Bird species list for the Andrews Experimental Forest and Upper McKenzie River Basin, 1975 to 1995 Andrews… 2014 knb-lter-… 1975… 1995…
4 Amphibian and reptile list of the Andrews Experimental Forest, 1975 to 1995 Andrews… 2014 knb-lter-… 1975… 1995…
5 Moss species list of the Andrews Experimental Forest, 1991 Andrews… 2013 knb-lter-… 1991… 1991…
6 Mammal species list of the Andrews Experimental Forest, 1971 to 1976 Anthony… 2014 knb-lter-… 1971… 1976…
7 Ecohydrology and Ecophysiology intensively measured plots in Watershed 1, Andrews Experimental Forest, 20… Andrews… 2016 knb-lter-… 2005… 2011…
8 A Study of Hyporheic Characteristics Along a Longitudinal Profile of Lookout Creek, Oregon, 2003 Andrews… 2013 knb-lter-… 2003… 2003…
9 Annual tree productivity in permanent plots within the H.J. Andrews Experimental Forest Andrews… 2013 knb-lter-… 2000… 2004…
10 Epiphytic macrolichens in relation to forest management and topography in a western Oregon watershed, 199… Andrews… 2014 knb-lter-… 1997… 1999…
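A side note on robustness (an addition, not part of the original answer): if any single request fails, the whole pipeline aborts. Wrapping scraper with purrr's possibly() substitutes a placeholder row instead, so one bad page does not lose the rest:

# Return a placeholder row instead of aborting when a page fails to load
safe_scraper <- possibly(scraper, otherwise = tibble(begin = NA_character_, end = NA_character_))

data <- page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(date = map(package_id, safe_scraper)) %>%
  unnest(date)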
Answer 2
Score: 0
Here is how to scrape the Package ID, Begin Date, and End Date from each metadata file:
library(rvest)
library(dplyr)

# EDI webpage for Andrews LTER datasets
url <- "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false"
webpage <- read_html(url)

# Extract each of the Package Ids
package_ids <- webpage %>%
  html_table() %>%
  .[[4]] %>%
  select(`Package Id ▵▿`) %>%
  rename(PackageId = `Package Id ▵▿`)
zz <- unique(package_ids$PackageId)

# Iterate over the metadata page of each unique Package Id
# (looping over seq_along(zz) rather than the full PackageId column avoids
# NA subscripts if the table ever contains duplicate ids)
for (i in seq_along(zz)) {
  curDat <- zz[i]
  # Construct the URL for the "View Full Metadata" page
  package_id_link <- paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", curDat)
  # Read the "View Full Metadata" page
  webpage <- read_html(package_id_link)
  # Extract Package ID, Begin Date, and End Date
  package_id <- html_text(html_nodes(webpage, "td.rowodd + td.roweven")[1])
  begin_value <- html_text(html_nodes(webpage, "td:contains('Begin:') + td")[1])
  end_value <- html_text(html_nodes(webpage, "td:contains('End:') + td")[1])
  if (i == 1) {
    packageID <- package_id
    time_periods_begin <- begin_value
    time_periods_end <- end_value
  } else {
    packageID <- rbind(packageID, package_id)
    time_periods_begin <- rbind(time_periods_begin, begin_value)
    time_periods_end <- rbind(time_periods_end, end_value)
  }
}

data_frame <- data.frame(cbind(packageID,
                               time_periods_begin,
                               time_periods_end))
colnames(data_frame)[1:3] <- c('PackageId', 'Begin', 'End')
rownames(data_frame) <- seq(1, NROW(data_frame), 1)
data_frame
PackageId Begin End
1 knb-lter-and.2719.6 1971-06-01 2002-03-11
2 knb-lter-and.2720.8 1958-01-01 1979-01-01
3 knb-lter-and.2721.6 1975-01-01 1995-01-01
4 knb-lter-and.2722.6 1975-01-01 1995-01-01
5 knb-lter-and.2725.6 1991-06-01 1991-08-01
6 knb-lter-and.2726.6 1971-01-01 1976-01-01
7 knb-lter-and.4528.10 2005-09-30 2011-05-05
8 knb-lter-and.4541.3 2003-06-14 2003-11-15
9 knb-lter-and.4544.4 2000-06-01 2004-09-30
10 knb-lter-and.4547.5 1997-09-23 1999-09-15
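A design note (an observation, not part of the original answer): growing packageID and the two date vectors with rbind() on every iteration reallocates repeatedly, which gets slow as the result grows. A common alternative is to fill a pre-allocated list and bind once at the end, for example:

# Pre-allocate one slot per Package Id, then bind a single time at the end
results <- vector("list", length(zz))
for (i in seq_along(zz)) {
  webpage <- read_html(paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", zz[i]))
  results[[i]] <- data.frame(PackageId = html_text(html_nodes(webpage, "td.rowodd + td.roweven")[1]),
                             Begin = html_text(html_nodes(webpage, "td:contains('Begin:') + td")[1]),
                             End = html_text(html_nodes(webpage, "td:contains('End:') + td")[1]))
}
data_frame <- do.call(rbind, results)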