How can I fix a 403 Forbidden error when attempting web-scraping in R?

Question

I get a 403 Forbidden error message when trying to access the Congressional Record.

I tried this code:

library(tidyverse)
library(rvest)
library(httr)

# Functions
get_urls_in_table <- function(master_link) {
  with_user_agent <- GET(master_link, add_headers('user-agent' = 'name@xxx.edu'))
  links <- read_html(with_user_agent) %>% html_nodes("td a") %>% html_attr('href') %>% as.data.frame()
  return(links)
}

get_content <- function(x) {
  with_user_agent <- GET(x, add_headers('user-agent' = 'name@xxx.edu'))
  content <- read_html(with_user_agent) %>% html_nodes(".styled") %>% html_text()
  print("getting content")
  Sys.sleep(30) # Here add 30 sec between each scraping attempt to prevent the queries from becoming too frequent
  return(content)
}

# Layer 1
senate_pages <- get_urls_in_table("https://www.congress.gov/congressional-record/108th-congress/browse-by-date")

senate_pages <- senate_pages %>%
  mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
  filter(grepl("senate", real_url))

# Layer 2
senate_articles <- lapply(senate_pages$real_url[1:2], get_urls_in_table)
# Here I only ran the first two issues, but this should work for all if you get rid of [1:2]

senate_articles <- as.data.frame(do.call(rbind, senate_articles)) %>% # Take data frames out of lists
  mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
  filter(grepl("article", real_url))

# Layer 3
senate_content <- lapply(senate_articles$real_url[1:3], get_content)
# Again only running the first few, but should work on a larger scale
senate_content <- as.data.frame(do.call(rbind, senate_content)) # Data clean should follow this

This returns the 403 Forbidden error message. I have tried changing the user agent to different values without any success, including my actual email address (instead of the name@xxx.edu used in the code above), the title of my thesis, my name, and my name followed by a space and my email address. Nothing seems to work.
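
For illustration, here is a minimal sketch of what swapping in a full browser-style User-Agent looks like with httr::user_agent(); the UA string is just an example value, and, as the answer below explains, a header change alone may still not get past the Cloudflare challenge:

library(httr)

# illustrative browser-style User-Agent string, not one confirmed to work here
ua <- user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")

resp <- GET("https://www.congress.gov/congressional-record/108th-congress/browse-by-date", ua)
status_code(resp) # still 403 if the Cloudflare JavaScript challenge is triggered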

Answer 1

Score: 0

That 403 response actually includes a Cloudflare JavaScript-based challenge (screenshot from a Chromote session), and it's quite effective at blocking static scrapers and HTTP request tools like httr(2) and rvest.

But it also detects curl requests with fully configured headers (i.e. a Copy as cURL call from the browser's dev tools, where the user agent, cookies and other headers are cloned from a fully functional browser session).

Chromote, though, once its distinctive user agent is updated, is currently able to get through (with the default user agent it would still trigger that JS challenge). In the example below I only included the Chromote session handling and replaced the httr::GET() calls with Chromote calls. And I wasn't so keen on waiting 30 seconds between requests... BTW, nice reproducible example there, no additional changes were required!

library(tidyverse)
library(rvest)

### update /
library(chromote)
# new session; set userAgent, value grabbed from current Chrome on Windows
b <- ChromoteSession$new()
b$Network$setUserAgentOverride(userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
#> named list()
### / update
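
# Optional sanity check (an illustrative addition, not part of the original answer):
# confirm the override took effect before scraping anything.
ua_check <- b$Runtime$evaluate("navigator.userAgent")$result$value
# ua_check should now be the Chrome string set above rather than the
# distinctive default headless user agent that still triggers the challenge.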

# Functions
get_urls_in_table <- function(master_link) {
  ### update /
  {
    b$Page$navigate(master_link)
    b$Page$loadEventFired()
  }
  with_user_agent <- b$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value
  ### / update
  links <- read_html(with_user_agent) %>% html_nodes("td a") %>% html_attr('href') %>% as.data.frame()
  return(links)
}

get_content <- function(x) {
  ### update /
  {
    b$Page$navigate(x)
    b$Page$loadEventFired()
  }
  with_user_agent <- b$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value
  ### / update
  content <- read_html(with_user_agent) %>% html_nodes(".styled") %>% html_text()
  print("getting content")
  # Sys.sleep(30) # Here add 30 sec between each scraping attempt to prevent the queries from becoming too frequent
  return(content)
}

# Layer 1
senate_pages <- get_urls_in_table("https://www.congress.gov/congressional-record/108th-congress/browse-by-date")

senate_pages <- senate_pages %>%
  mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
  filter(grepl("senate", real_url))

# Layer 2
senate_articles <- lapply(senate_pages$real_url[1:2], get_urls_in_table)
# Here I only ran the first two issues, but this should work for all if you get rid of [1:2]

senate_articles <- as.data.frame(do.call(rbind, senate_articles)) %>% # Take data frames out of lists
  mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
  filter(grepl("article", real_url))

# Layer 3
senate_content <- lapply(senate_articles$real_url[1:3], get_content)
#> [1] "getting content"
#> [1] "getting content"
#> [1] "getting content"
# Again only running the first few, but should work on a larger scale
senate_content <- as.data.frame(do.call(rbind, senate_content)) # Data clean should follow this

### update /
# close Chromote session
b$close()
#> [1] TRUE

Result:

# senate_content:
as_tibble(senate_content)
#> # A tibble: 3 × 1
#>   V1                                                                            
#>   <chr>                                                                         
#> 1 "\n[Pages S12089-S12092]\nFrom the Congressional Record Online through the Go…
#> 2 "\n[Page S12092]\nFrom the Congressional Record Online through the Government…
#> 3 "\n[Pages S12092-S12093]\nFrom the Congressional Record Online through the Go…

# beginning of the 1st article:
cat(substr(senate_content[1, 1], 1, 400))
#> 
#> [Pages S12089-S12092]
#> From the Congressional Record Online through the Government Publishing Office [www.gpo.gov]
#> 
#> 
#> 
#> 
#>                           OFFSHORE OUTSOURCING
#> 
#>   Mr. LIEBERMAN. Mr. President, on December 15, 2004, my office 
#> released a white paper entitled ``Data Dearth in Offshore Outsourcing: 
#> Policymaking Requires Facts.'' This white paper is closely linked to a 
#> previous white paper ent

<sup>Created on 2023-05-22 with reprex v2.0.2</sup>
