How can I fix a 403 Forbidden error when attempting web-scraping in R?
Question
403 Forbidden error message when trying to access Congressional Record
I tried this code:
library(tidyverse)
library(rvest)
library(httr)
# Functions
get_urls_in_table <- function(master_link) {
with_user_agent <- GET(master_link, add_headers('user-agent' = 'name@xxx.edu'))
links <- read_html(with_user_agent) %>% html_nodes("td a") %>% html_attr('href') %>% as.data.frame()
return(links)
}
get_content <- function(x) {
with_user_agent <- GET(x, add_headers('user-agent' = 'name@xxx.edu'))
content <- read_html(with_user_agent) %>% html_nodes(".styled") %>% html_text()
print("getting content")
Sys.sleep(30) # Here add 30 sec between each scraping attempt to prevent the queries from becoming too frequent
return(content)
}
# Layer 1
senate_pages <- get_urls_in_table("https://www.congress.gov/congressional-record/108th-congress/browse-by-date")
senate_pages <- senate_pages %>%
mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
filter(grepl("senate",real_url))
# Layer 2
senate_articles <- lapply(senate_pages$real_url[1:2],get_urls_in_table)
# Here I only ran the first two issues, but this should work for all if you get rid of [1:2]
senate_articles <- as.data.frame(do.call(rbind, senate_articles)) %>% # Take data frames out of lists
mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
filter(grepl("article",real_url))
# Layer 3
senate_content <- lapply(senate_articles$real_url[1:3],get_content)
# Again only running the first few, but should work on a larger scale
senate_content <- as.data.frame(do.call(rbind, senate_content)) # Data clean should follow this
This returns a 403 Forbidden error message. I have tried changing the user agent to different values without any success, including my actual email address (rather than the name@xxx.edu used in the code above), the title of my thesis, my name, and my name followed by a space and my email address. Nothing seems to work.
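For reference, here is a minimal sketch of the header-based approach described above, with a browser-style User-Agent string used purely as an illustration; as noted, congress.gov answers with 403 regardless of which value is sent this way:
library(httr)
# Illustrative only: send a browser-like User-Agent header with httr.
# Per the question, the site returns 403 no matter which User-Agent value
# is supplied, so this alone does not get past the block.
resp <- GET(
  "https://www.congress.gov/congressional-record/108th-congress/browse-by-date",
  add_headers(`User-Agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
)
status_code(resp)  # 403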
Answer 1
Score: 0
That 403 response actually includes a Cloudflare JavaScript-based challenge (screenshot from a Chromote session), and it is quite effective at blocking static scrapers and HTTP request tools like httr(2) and rvest.
But it also detects curl requests with fully configured headers (i.e. a Copy as cURL call from the browser's dev tools, where the user agent, cookies and other headers are cloned from a fully functional browser session).
Chromote, though, once its unique user-agent is updated, is currently able to get through (with the default user-agent it would still trigger that JS challenge). In the example below I only included the Chromote session handling and replaced the httr::GET() calls with Chromote calls. And I wasn't so keen on waiting 30 seconds between requests... BTW, nice reproducible example there, no additional changes were required!
library(tidyverse)
library(rvest)
### update /
library(chromote)
# new session, set userAgent, value grabbed from current Chrome for Windows
b <- ChromoteSession$new()
b$Network$setUserAgentOverride(userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
#> named list()
### / update
# Functions
get_urls_in_table <- function(master_link) {
### update /
{
b$Page$navigate(master_link)
b$Page$loadEventFired()
}
with_user_agent <- b$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value
### / update
links <- read_html(with_user_agent) %>% html_nodes("td a") %>% html_attr('href') %>% as.data.frame()
return(links)
}
get_content <- function(x) {
### update /
{
b$Page$navigate(x)
b$Page$loadEventFired()
}
with_user_agent <- b$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value
### / update
content <- read_html(with_user_agent) %>% html_nodes(".styled") %>% html_text()
print("getting content")
# Sys.sleep(30) # Here add 30 sec between each scraping attempt to prevent the queries from becoming too frequent
return(content)
}
# Layer 1
senate_pages <- get_urls_in_table("https://www.congress.gov/congressional-record/108th-congress/browse-by-date")
senate_pages <- senate_pages %>%
mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
filter(grepl("senate",real_url))
# Layer 2
senate_articles <- lapply(senate_pages$real_url[1:2],get_urls_in_table)
# Here I only ran the first two issues, but this should work for all if you get rid of [1:2]
senate_articles <- as.data.frame(do.call(rbind, senate_articles)) %>% # Take data frames out of lists
mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
filter(grepl("article",real_url))
# Layer 3
senate_content <- lapply(senate_articles$real_url[1:3],get_content)
#> [1] "getting content"
#> [1] "getting content"
#> [1] "getting content"
# Again only running the first few, but should work on a larger scale
senate_content <- as.data.frame(do.call(rbind, senate_content)) # Data clean should follow this
### update /
# close Chromote session
b$close()
#> [1] TRUE
Result:
# senate_content:
as_tibble(senate_content)
#> # A tibble: 3 × 1
#> V1
#> <chr>
#> 1 "\n[Pages S12089-S12092]\nFrom the Congressional Record Online through the Go…
#> 2 "\n[Page S12092]\nFrom the Congressional Record Online through the Government…
#> 3 "\n[Pages S12092-S12093]\nFrom the Congressional Record Online through the Go…
# beginning of the 1st article:
cat(substr(senate_content[1,1],1,400))
#>
#> [Pages S12089-S12092]
#> From the Congressional Record Online through the Government Publishing Office [www.gpo.gov]
#>
#>
#>
#>
#> OFFSHORE OUTSOURCING
#>
#> Mr. LIEBERMAN. Mr. President, on December 15, 2004, my office
#> released a white paper entitled ``Data Dearth in Offshore Outsourcing:
#> Policymaking Requires Facts.'' This white paper is closely linked to a
#> previous white paper ent
Created on 2023-05-22 with reprex v2.0.2
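As a side check, one way to confirm from the Chromote session whether the Cloudflare challenge page was served instead of the requested Record page is to inspect the document title right after a b$Page$navigate() / b$Page$loadEventFired() call (i.e. while the session is still open, before b$close()); the exact title text tested for below is an assumption and may change:
# Hedged sketch: look at the title of the page the session actually landed on.
# Cloudflare challenge pages commonly use a title like "Just a moment...";
# treat that exact string as an assumption rather than a guarantee.
page_title <- b$Runtime$evaluate("document.title")$result$value
if (grepl("Just a moment", page_title, ignore.case = TRUE)) {
  warning("Cloudflare challenge page was served instead of the requested page")
}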
Comments