英文:
rvest: Error in UseMethod("xml_find_all") : no applicable method for 'xml_find_all' applied to an object of class "list"
问题
以下是您请求的代码部分的中文翻译:
# Load the required libraries
library(rvest)
library(xml2)
library(dplyr)
library(purrr)
# URL of the search-results page
url <- "https://portal.edirepository.org/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=150"
webpage <- read_html(url)
# Initialize vectors to store the data
package_ids <- character()
time_periods_begin <- character()
time_periods_end <- character()
# Extract the Package Id column from the fourth table on the page
# (this replaces the empty character vector assigned to package_ids above)
package_ids <- webpage %>%
  html_table() %>%
  .[[4]] %>%
  select(`Package Id  ▵▿`) %>%
  rename(PackageId = `Package Id  ▵▿`)
# Iterate over each PackageId row
for (i in 1:length(package_ids$PackageId)) {
  
  # Build the URL of the "View Full Metadata" page for the i-th Package Id
  package_id_link <- paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", package_ids$PackageId[i])
  
  # Go to the "View Full Metadata" page
  # NOTE: map() returns a list of parsed pages, not a single document
  metadata_page <- map(package_id_link, read_html)
  
  # Extract the begin and end dates (this is where the error occurs):
  # html_nodes() receives the list produced by map(), which is the
  # object of class "list" named in the reported xml_find_all error
  time_period_begin <- html_nodes(metadata_page, "tr:contains('Begin') td:nth-child(2)") %>%
    html_text() %>%
    trimws()
  
  # Append this page's begin value(s) to the running vector
  time_periods_begin <- c(time_periods_begin, time_period_begin)
  
  # Same extraction for the end date; it receives the same list
  time_period_end <- html_nodes(metadata_page, "tr:contains('End') td:nth-child(2)") %>%
    html_text() %>%
    trimws()
  
  # Append this page's end value(s) to the running vector
  time_periods_end <- c(time_periods_end, time_period_end)
}
希望这对您有所帮助。如果您需要进一步的帮助,请随时告诉我。
英文:
The Environmental Data Initiative (EDI) is a repository for datasets from several locations. I would like to scrape the beginning and end dates of each dataset from a single location (see example link here).
- Each dataset for the one location contains a link to a metadata URL that lists the start and end date of the dataset (see example link here).
 
My code below is attempting to use a for-loop to extract the unique ID for each dataset (i.e., Package Id) which then gets used to create the metadata page URL for each Package Id.
However, my for-loop throws an error as it attempts to scrape the begin date from each of the metadata pages.
- The error: 
Error in UseMethod("xml_find_all") : no applicable
method for 'xml_find_all' applied to an object of class "list" 
How can I adapt my for-loop to extract the begin and end date of each Package Id?
library(rvest)
library(xml2)
library(dplyr)
library(purrr)
# URL of the search-results page
url <- "https://portal.edirepository.org/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=150"
webpage <- read_html(url)
# Initialize vectors to store the data
package_ids <- character()
time_periods_begin <- character()
time_periods_end <- character()
# Extract the Package Id column from the fourth table on the page
# (this replaces the empty character vector assigned to package_ids above)
package_ids <- webpage %>%
  html_table() %>%
  .[[4]] %>%
  select(`Package Id  ▵▿`) %>%
  rename(PackageId = `Package Id  ▵▿`)
# Iterate over each PackageId row
for (i in 1:length(package_ids$PackageId)) {
  
  # Construct the URL for the "View Full Metadata" page
  # NOTE: package_ids$PackageId is not indexed by i here, so paste0()
  # builds one URL per Package Id on every pass of the loop
  package_id_link <- paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", package_ids$PackageId)
  
  # Navigate to the "View Full Metadata" page
  # NOTE: map() returns a list of parsed pages, not a single document
  metadata_page <- map(package_id_link, read_html)
  
  # Extract the Begin and End (this is where the error lives):
  # html_nodes() receives the list produced by map(), which is the
  # object of class "list" named in the reported xml_find_all error
  time_period_begin <- html_nodes(metadata_page, "tr:contains('Begin') td:nth-child(2)") %>%
    html_text() %>%
    trimws()
  
  # Append this pass's begin value(s) to the running vector
  time_periods_begin <- c(time_periods_begin, time_period_begin)
  
  # Same extraction for the end date; it receives the same list
  time_period_end <- html_nodes(metadata_page, "tr:contains('End') td:nth-child(2)") %>%
    html_text() %>%
    trimws()
  
  # Append this pass's end value(s) to the running vector
  time_periods_end <- c(time_periods_end, time_period_end)
}
The output should look like this
# Create a data frame with Package Id, Begin, and End
# NOTE(review): the loop in the question fills `package_ids`, not
# `package_id`; presumably `package_ids$PackageId` is intended here - confirm
data_frame <- data.frame(PackageId = package_id,
                         Begin = time_periods_begin,
                         End = time_periods_end)
# Print the result
data_frame
            PackageId      Begin        End
1 knb-lter-and.2719.6 1971-06-01 2002-03-11
2 knb-lter-and.2720.8 1958-01-01 1979-01-01
3 knb-lter-and.2721.6 1975-01-01 1995-01-01
Update 1
I can get the PackageID, Begin, and End for a single dataset. With the code above, I can get each dataset's metadata URL. Now I just need to figure out how to extract the PackageID, Begin, and End from each of those 147 metadata URLs.
# Metadata page of one dataset, used to work out the selectors
url <- "https://portal.edirepository.org/nis/metadataviewer?packageid=knb-lter-and.4525.10"
webpage <- read_html(url)
# Package Id: text of the first node matched by "td.rowodd + td.roweven"
package_id <- webpage %>%
  html_nodes("td.rowodd + td.roweven") %>%
  .[1] %>%
  html_text()
# Begin: text of the first cell that directly follows a "Begin:" cell
time_periods_begin <- webpage %>%
  html_nodes("td:contains('Begin:') + td") %>%
  .[1] %>%
  html_text()
# End: text of the first cell that directly follows an "End:" cell
time_periods_end <- webpage %>%
  html_nodes("td:contains('End:') + td") %>%
  .[1] %>%
  html_text()
# Combine the three values into a data frame and print it
data_frame <- data.frame(
  PackageId = package_id,
  Begin = time_periods_begin,
  End = time_periods_end
)
data_frame
答案1
得分: 1
library(tidyverse)
library(rvest)
library(janitor)
# Download and parse the EDI search-results page
page <- read_html(
  "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=150"
)
# Scrape the begin and end values from one package's metadata page.
#
# package_id: a single Package Id string; it is appended to the
#   metadataviewer URL.
# Returns a one-row tibble with character columns `begin` and `end`.
#   A value that cannot be found is NA, so the package still gets a row
#   after unnest() instead of being dropped silently.
scraper <- function(package_id) {
  # Progress message, one line per package
  cat("Scraping", package_id, "\n")

  # First ".subgroup.onehundred_percent" element on the page
  # (presumably the temporal-coverage table - confirm against the site)
  first_subgroup <- str_c(
    "https://portal.edirepository.org/nis/metadataviewer?packageid=",
    package_id
  ) %>%
    read_html() %>%
    html_elements(".subgroup.onehundred_percent") %>%
    pluck(1)

  # pluck() yields NULL when the page has no such element; passing NULL
  # on to html_elements() would raise an error
  if (is.null(first_subgroup)) {
    return(tibble(begin = NA_character_, end = NA_character_))
  }

  values <- first_subgroup %>%
    html_elements(".roweven") %>%
    html_text2()

  # Without .default a missing value is NULL and tibble() drops the column
  tibble(
    begin = pluck(values, 1, .default = NA_character_),
    end = pluck(values, 2, .default = NA_character_)
  )
}
# Take the fourth table of the search page, normalise its column names,
# tidy the titles, then scrape the dates for every package id and
# spread them into columns
data <- page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(
    title = str_squish(str_remove_all(title, "\\n")),
    date = map(package_id, scraper)
  ) %>%
  unnest(date)
title                                                                                                      creators publication_date package_id begin end  
<chr>                                                                                                      <chr>               <int> <chr>      <chr> <chr>
1 Invertebrates of the Andrews Experimental Forest: An annotated list of insects and other arthropods, 1971… Andrews…             2014 knb-lter-… 1971… 2002…
2 Vascular plant list on the Andrews Experimental Forest and nearby Research Natural Areas, 1958 to 1979     Andrews…             2014 knb-lter-… 1958… 1979…
3 Bird species list for the Andrews Experimental Forest and Upper McKenzie River Basin, 1975 to 1995         Andrews…             2014 knb-lter-… 1975… 1995…
4 Amphibian and reptile list of the Andrews Experimental Forest, 1975 to 1995                                Andrews…             2014 knb-lter-… 1975… 1995…
5 Moss species list of the Andrews Experimental Forest, 1991                                                 Andrews…             2013 knb-lter-… 1991… 1991…
6 Mammal species list of the Andrews Experimental Forest, 1971 to 1976                                       Anthony…             2014 knb-lter-… 1971… 1976…
7 Ecohydrology and Ecophysiology intensively measured plots in Watershed 1, Andrews Experimental Forest, 20… Andrews…             2016 knb-lter-… 2005… 2011…
8 A Study of Hyporheic Characteristics Along a Longitudinal Profile of Lookout Creek, Oregon, 2003           Andrews…             2013 knb-lter-… 2003… 2003…
9 Annual tree productivity in permanent plots within the H.J. Andrews Experimental Forest                    Andrews…             2013 knb-lter-… 2000… 2004…
10 Epiphytic macrolichens in relation to forest management and topography in a western Oregon watershed, 199… Andrews…             2014 knb-lter-… 1997… 1999…
英文:
library(tidyverse)
library(rvest)
library(janitor)
# Download and parse the EDI search-results page
page <- read_html(
  "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=150"
)
# Scrape the begin and end values from one package's metadata page.
#
# package_id: a single Package Id string; it is appended to the
#   metadataviewer URL.
# Returns a one-row tibble with character columns `begin` and `end`.
#   A value that cannot be found is NA, so the package still gets a row
#   after unnest() instead of being dropped silently.
scraper <- function(package_id) {
  # Progress message, one line per package
  cat("Scraping", package_id, "\n")

  # First ".subgroup.onehundred_percent" element on the page
  # (presumably the temporal-coverage table - confirm against the site)
  first_subgroup <- str_c(
    "https://portal.edirepository.org/nis/metadataviewer?packageid=",
    package_id
  ) %>%
    read_html() %>%
    html_elements(".subgroup.onehundred_percent") %>%
    pluck(1)

  # pluck() yields NULL when the page has no such element; passing NULL
  # on to html_elements() would raise an error
  if (is.null(first_subgroup)) {
    return(tibble(begin = NA_character_, end = NA_character_))
  }

  values <- first_subgroup %>%
    html_elements(".roweven") %>%
    html_text2()

  # Without .default a missing value is NULL and tibble() drops the column
  tibble(
    begin = pluck(values, 1, .default = NA_character_),
    end = pluck(values, 2, .default = NA_character_)
  )
}
# Take the fourth table of the search page, normalise its column names,
# tidy the titles, then scrape the dates for every package id and
# spread them into columns
data <- page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(
    title = str_squish(str_remove_all(title, "\\n")),
    date = map(package_id, scraper)
  ) %>%
  unnest(date)
title                                                                                                      creators publication_date package_id begin end  
<chr>                                                                                                      <chr>               <int> <chr>      <chr> <chr>
1 Invertebrates of the Andrews Experimental Forest: An annotated list of insects and other arthropods, 1971… Andrews…             2014 knb-lter-… 1971… 2002…
2 Vascular plant list on the Andrews Experimental Forest and nearby Research Natural Areas, 1958 to 1979     Andrews…             2014 knb-lter-… 1958… 1979…
3 Bird species list for the Andrews Experimental Forest and Upper McKenzie River Basin, 1975 to 1995         Andrews…             2014 knb-lter-… 1975… 1995…
4 Amphibian and reptile list of the Andrews Experimental Forest, 1975 to 1995                                Andrews…             2014 knb-lter-… 1975… 1995…
5 Moss species list of the Andrews Experimental Forest, 1991                                                 Andrews…             2013 knb-lter-… 1991… 1991…
6 Mammal species list of the Andrews Experimental Forest, 1971 to 1976                                       Anthony…             2014 knb-lter-… 1971… 1976…
7 Ecohydrology and Ecophysiology intensively measured plots in Watershed 1, Andrews Experimental Forest, 20… Andrews…             2016 knb-lter-… 2005… 2011…
8 A Study of Hyporheic Characteristics Along a Longitudinal Profile of Lookout Creek, Oregon, 2003           Andrews…             2013 knb-lter-… 2003… 2003…
9 Annual tree productivity in permanent plots within the H.J. Andrews Experimental Forest                    Andrews…             2013 knb-lter-… 2000… 2004…
10 Epiphytic macrolichens in relation to forest management and topography in a western Oregon watershed, 199… Andrews…             2014 knb-lter-… 1997… 1999…
答案2
得分: 0
以下是如何从每个元数据文件中提取包ID、开始日期和结束日期的代码:
library(rvest)
library(dplyr)
# EDI search-results page for the Andrews LTER datasets
# NOTE(review): unlike the URL in the question, this one carries no
# `start`/`rows` parameters, so presumably only the server's default
# number of results is listed - confirm
url <- "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false"
webpage <- read_html(url)

# Extract the Package Id column from the fourth table on the page
package_ids <- webpage %>%
  html_table() %>%
  .[[4]] %>%
  select(`Package Id  ▵▿`) %>%
  rename(PackageId = `Package Id  ▵▿`)
zz <- unique(package_ids$PackageId)

# Text of the first node matching `css` on `page`, or NA when nothing
# matches, so every package contributes exactly one value per column
first_text <- function(page, css) {
  nodes <- html_nodes(page, css)
  if (length(nodes) == 0) {
    return(NA_character_)
  }
  html_text(nodes[1])
}

# Read the "View Full Metadata" page of one Package Id and return a
# one-row data frame with its Package Id, Begin and End values
scrape_dates <- function(id) {
  metadata_page <- read_html(
    paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", id)
  )
  data.frame(
    PackageId = first_text(metadata_page, "td.rowodd + td.roweven"),
    Begin = first_text(metadata_page, "td:contains('Begin:') + td"),
    End = first_text(metadata_page, "td:contains('End:') + td")
  )
}

# Iterate over the unique ids themselves (not over the row count of
# package_ids, which can exceed length(zz)) and bind the rows once at
# the end instead of growing the result with rbind() inside a loop
data_frame <- bind_rows(lapply(zz, scrape_dates))
data_frame
              PackageId      Begin        End
1   knb-lter-and.2719.6 1971-06-01 2002-03-11
2   knb-lter-and.2720.8 1958-01-01 1979-01-01
3   knb-lter-and.2721.6 1975-01-01 1995-01-01
4   knb-lter-and.2722.6 1975-01-01 1995-01-01
5   knb-lter-and.2725.6 1991-06-01 1991-08-01
6   knb-lter-and.2726.6 1971-01-01 1976-01-01
7  knb-lter-and.4528.10 2005-09-30 2011-05-05
8   knb-lter-and.4541.3 2003-06-14 2003-11-15
9   knb-lter-and.4544.4 2000-06-01 2004-09-30
10  knb-lter-and.4547.5 1997-09-23 1999-09-15
英文:
Here is how to scrape the Package ID, Begin Date, and End Date from each metadata file
library(rvest)
library(dplyr)
# EDI search-results page for the Andrews LTER datasets
# NOTE(review): unlike the URL in the question, this one carries no
# `start`/`rows` parameters, so presumably only the server's default
# number of results is listed - confirm
url <- "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false"
webpage <- read_html(url)

# Extract the Package Id column from the fourth table on the page
package_ids <- webpage %>%
  html_table() %>%
  .[[4]] %>%
  select(`Package Id  ▵▿`) %>%
  rename(PackageId = `Package Id  ▵▿`)
zz <- unique(package_ids$PackageId)

# Text of the first node matching `css` on `page`, or NA when nothing
# matches, so every package contributes exactly one value per column
first_text <- function(page, css) {
  nodes <- html_nodes(page, css)
  if (length(nodes) == 0) {
    return(NA_character_)
  }
  html_text(nodes[1])
}

# Read the "View Full Metadata" page of one Package Id and return a
# one-row data frame with its Package Id, Begin and End values
scrape_dates <- function(id) {
  metadata_page <- read_html(
    paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", id)
  )
  data.frame(
    PackageId = first_text(metadata_page, "td.rowodd + td.roweven"),
    Begin = first_text(metadata_page, "td:contains('Begin:') + td"),
    End = first_text(metadata_page, "td:contains('End:') + td")
  )
}

# Iterate over the unique ids themselves (not over the row count of
# package_ids, which can exceed length(zz)) and bind the rows once at
# the end instead of growing the result with rbind() inside a loop
data_frame <- bind_rows(lapply(zz, scrape_dates))
data_frame
              PackageId      Begin        End
1   knb-lter-and.2719.6 1971-06-01 2002-03-11
2   knb-lter-and.2720.8 1958-01-01 1979-01-01
3   knb-lter-and.2721.6 1975-01-01 1995-01-01
4   knb-lter-and.2722.6 1975-01-01 1995-01-01
5   knb-lter-and.2725.6 1991-06-01 1991-08-01
6   knb-lter-and.2726.6 1971-01-01 1976-01-01
7  knb-lter-and.4528.10 2005-09-30 2011-05-05
8   knb-lter-and.4541.3 2003-06-14 2003-11-15
9   knb-lter-and.4544.4 2000-06-01 2004-09-30
10  knb-lter-and.4547.5 1997-09-23 1999-09-15
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。


评论