rvest: Adapt webscraping function to continue if it comes across no data

Question

The following function works well for scraping the Begin: and End: dates for different datasets from this dataset repository. However, not every dataset's metadata includes Begin: and End:, which causes the function to stop and throw an error. Can the function be adapted to skip to the next URL (package_id) rather than stop?

library(tidyverse)
library(rvest)
library(janitor)

# Function for scraping begin and end dates from different metadata URLs

scraper <- function(package_id) {
  cat("Scraping", package_id, "\n")
  data <- str_c("https://portal.edirepository.org/nis/metadataviewer?packageid=",
                package_id) %>%
    read_html() %>%
    html_elements(".subgroup.onehundred_percent") %>%
    pluck(1) %>%
    html_elements(".roweven") %>%
    html_text2()
  
  # NB: both columns take the first element here; pluck(data, 2) is
  # probably what was intended for end
  tibble(begin = pluck(data, 1), 
         end = pluck(data, 1))
}

# Grabs package_id

good_page <-
  "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=3" %>%
  read_html()

bad_page <-
  "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=40&rows=4" %>%
  read_html()

# Works as it should, returns publication_date, package_id, begin, and end

dat1 <- good_page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
  mutate(date = map(package_id, scraper)) %>%
  unnest(date) %>%
  select(!c(title, creators))
dat1

# Breaks because knb-lter-kbs.141.1 does not include begin or end dates
# Can the function above be amended to skip to the next package_id rather than stop?

dat2 <- bad_page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
  mutate(date = map(package_id, scraper)) %>%
  unnest(date) %>%
  select(!c(title, creators))

Answer 1

Score: 2

Using the purrr::possibly() function operator, you can define a version of your function that returns NA values when an error occurs:

library(tidyverse)
library(rvest)
library(janitor)

safe_scraper <- possibly(
  scraper, 
  otherwise = tibble(begin = NA_character_, end = NA_character_)
)

bad_page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(
    title = str_squish(str_remove_all(title, "\\n")),
    date = map(package_id, safe_scraper)
  ) %>%
  unnest(date) %>%
  select(!c(title, creators))
# A tibble: 4 × 4
  publication_date package_id          begin                                               end                                          
             <int> <chr>               <chr>                                               <chr>                                        
1             2018 knb-lter-kbs.139.5  2008-01-01                                          2008-01-01                                   
2             2018 knb-lter-kbs.140.5  NA                                                  NA                                           
3             2016 knb-lter-kbs.141.1  Information Manager (LTER Network Office) [ email ] Information Manager (LTER Network Office) [ …
4             2018 knb-lter-kbs.162.79 1950-01-01                                          1950-01-01

Judging by row 3, though, it looks like there may be another issue in your workflow: for knb-lter-kbs.141.1 the selector is picking up contact information rather than dates.
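
If you would rather handle the failure inside the function, a minimal base-R sketch with tryCatch() achieves the same skip-on-error behaviour (scraper_safe is an illustrative name, wrapping the scraper from the question):

# Hedged sketch: any error raised while scraping yields an NA row
# instead of aborting the surrounding map()
scraper_safe <- function(package_id) {
  tryCatch(
    scraper(package_id),
    error = function(e) tibble(begin = NA_character_, end = NA_character_)
  )
}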

Answer 2

Score: 1

Changed from the HTML approach to the XML metadata for more consistent data extraction:

library(httr2)
library(rvest)  # read_html()
library(xml2)   # xml_find_all(), xml_text()
library(tidyverse)
library(janitor)

bad_page <-
  "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-kbs)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=40&rows=4" %>%
  read_html()

scraper <- function(package_id) {
  cat("Scraping", package_id, "\n")
  data <- str_c(
    "https://portal.edirepository.org/nis/metadataviewer?packageid=",
    package_id, 
    "&contentType=application/xml"
  ) %>%
    request() %>%
    req_perform() %>%
    resp_body_xml() %>%
    xml_find_all(".//calendarDate") %>%
    xml_text()
  
  tibble(begin = pluck(data, 1),
         end =   pluck(data, 1))
}

data <- bad_page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(across(title, ~ str_squish(str_remove_all(., "\\n")))) %>%
  mutate(date = map(package_id, possibly(
    scraper, otherwise = tibble(begin = NA, end = NA)
  ))) %>%
  unnest(date, keep_empty = TRUE) %>%
  select(!c(title, creators))

The result:

# A tibble: 4 × 4
  publication_date package_id          begin      end       
             <int> <chr>               <chr>      <chr>     
1             2018 knb-lter-kbs.139.5  2008-01-01 2008-01-01
2             2018 knb-lter-kbs.140.5  NA         NA        
3             2016 knb-lter-kbs.141.1  NA         NA        
4             2018 knb-lter-kbs.162.79 1950-01-01 1950-01-01
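
One design note: keep_empty = TRUE matters here. When a package's XML contains no calendarDate node, xml_find_all() returns an empty set, pluck(data, 1) is NULL, and tibble() silently drops NULL columns, so the scraper returns an empty tibble rather than erroring; without keep_empty, unnest() would drop those rows entirely. A minimal sketch with made-up data:

library(tidyverse)

# Hypothetical example: "b" stands in for a package whose scrape found no dates
df <- tibble(
  id   = c("a", "b"),
  date = list(tibble(begin = "2008-01-01", end = "2008-01-01"),
              tibble())  # empty result, as when no calendarDate exists
)

df %>% unnest(date)                     # row "b" disappears
df %>% unnest(date, keep_empty = TRUE)  # row "b" is kept, with NA begin/end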
