How can I fix a 403 Forbidden error when attempting web-scraping in R?

Question

I get a 403 Forbidden error message when trying to access the Congressional Record.

I tried this code:

library(tidyverse)
library(rvest)
library(httr)

# Functions
get_urls_in_table <- function(master_link) {
  with_user_agent <- GET(master_link, add_headers('user-agent' = 'name@xxx.edu'))
  links <- read_html(with_user_agent) %>% html_nodes("td a") %>% html_attr('href') %>% as.data.frame()
  return(links)
}

get_content <- function(x) {
  with_user_agent <- GET(x, add_headers('user-agent' = 'name@xxx.edu'))
  content <- read_html(with_user_agent) %>% html_nodes(".styled") %>% html_text()
  print("getting content")
  Sys.sleep(30) # Here add 30 sec between each scraping attempt to prevent the queries from becoming too frequent
  return(content)
}

# Layer 1
senate_pages <- get_urls_in_table("https://www.congress.gov/congressional-record/108th-congress/browse-by-date")

senate_pages <- senate_pages %>%
  mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
  filter(grepl("senate", real_url))

# Layer 2
senate_articles <- lapply(senate_pages$real_url[1:2], get_urls_in_table)
# Here I only ran the first two issues, but this should work for all if you get rid of [1:2]

senate_articles <- as.data.frame(do.call(rbind, senate_articles)) %>% # Take data frames out of lists
  mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
  filter(grepl("article", real_url))

# Layer 3
senate_content <- lapply(senate_articles$real_url[1:3], get_content)
# Again only running the first few, but should work on a larger scale
senate_content <- as.data.frame(do.call(rbind, senate_content)) # Data clean should follow this

This returns the 403 Forbidden error message. I have tried changing the user agent to different values without any success, including my actual email address (instead of the name@xxx.edu used in the code above), the title of my thesis, my name, and my name followed by a space and my email address. Nothing seems to work.
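
For illustration, here is a minimal sketch of what swapping in a full browser-style User-Agent looks like with httr::user_agent(); the UA string is just an example value, and, as the answer below explains, a header change alone may still not get past the Cloudflare challenge:

library(httr)

# illustrative browser-style User-Agent string, not one confirmed to work here
ua <- user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")

resp <- GET("https://www.congress.gov/congressional-record/108th-congress/browse-by-date", ua)
status_code(resp) # still 403 if the Cloudflare JavaScript challenge is triggered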

Answer 1

Score: 0

That 403 response actually includes a Cloudflare JavaScript-based challenge (screenshot from a Chromote session), and it's quite effective at blocking static scrapers and HTTP request tools like httr(2) and rvest.

But it also detects curl requests with fully configured headers (i.e. a Copy as cURL call from the browser's dev tools, where the user agent, cookies and other headers are cloned from a fully functional browser session).

Chromote, though, once its distinctive user agent is updated, is currently able to get through (with the default user agent it would still trigger that JS challenge). In the example below I only included the Chromote session handling and replaced the httr::GET() calls with Chromote calls. And I wasn't so keen on waiting 30 seconds between requests... BTW, nice reproducible example there, no additional changes were required!

library(tidyverse)
library(rvest)

### update /
library(chromote)
# new session; set userAgent, value grabbed from current Chrome on Windows
b <- ChromoteSession$new()
b$Network$setUserAgentOverride(userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
#> named list()
### / update
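
# Optional sanity check (an illustrative addition, not part of the original answer):
# confirm the override took effect before scraping anything.
ua_check <- b$Runtime$evaluate("navigator.userAgent")$result$value
# ua_check should now be the Chrome string set above rather than the
# distinctive default headless user agent that still triggers the challenge.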

# Functions
get_urls_in_table <- function(master_link) {
  ### update /
  {
    b$Page$navigate(master_link)
    b$Page$loadEventFired()
  }
  with_user_agent <- b$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value
  ### / update
  links <- read_html(with_user_agent) %>% html_nodes("td a") %>% html_attr('href') %>% as.data.frame()
  return(links)
}

get_content <- function(x) {
  ### update /
  {
    b$Page$navigate(x)
    b$Page$loadEventFired()
  }
  with_user_agent <- b$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value
  ### / update
  content <- read_html(with_user_agent) %>% html_nodes(".styled") %>% html_text()
  print("getting content")
  # Sys.sleep(30) # Here add 30 sec between each scraping attempt to prevent the queries from becoming too frequent
  return(content)
}

# Layer 1
senate_pages <- get_urls_in_table("https://www.congress.gov/congressional-record/108th-congress/browse-by-date")

senate_pages <- senate_pages %>%
  mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
  filter(grepl("senate", real_url))

# Layer 2
senate_articles <- lapply(senate_pages$real_url[1:2], get_urls_in_table)
# Here I only ran the first two issues, but this should work for all if you get rid of [1:2]

senate_articles <- as.data.frame(do.call(rbind, senate_articles)) %>% # Take data frames out of lists
  mutate(real_url = paste("https://www.congress.gov", ., sep = "")) %>%
  filter(grepl("article", real_url))

# Layer 3
senate_content <- lapply(senate_articles$real_url[1:3], get_content)
#> [1] "getting content"
#> [1] "getting content"
#> [1] "getting content"
# Again only running the first few, but should work on a larger scale
senate_content <- as.data.frame(do.call(rbind, senate_content)) # Data clean should follow this

### update /
# close Chromote session
b$close()
#> [1] TRUE

Result:

# senate_content:
as_tibble(senate_content)
#> # A tibble: 3 × 1
#>   V1                                                                            
#>   <chr>                                                                         
#> 1 "\n[Pages S12089-S12092]\nFrom the Congressional Record Online through the Go…
#> 2 "\n[Page S12092]\nFrom the Congressional Record Online through the Government…
#> 3 "\n[Pages S12092-S12093]\nFrom the Congressional Record Online through the Go…

# beginning of the 1st article:
cat(substr(senate_content[1, 1], 1, 400))
#> 
#> [Pages S12089-S12092]
#> From the Congressional Record Online through the Government Publishing Office [www.gpo.gov]
#> 
#> 
#> 
#> 
#>                           OFFSHORE OUTSOURCING
#> 
#>   Mr. LIEBERMAN. Mr. President, on December 15, 2004, my office 
#> released a white paper entitled ``Data Dearth in Offshore Outsourcing: 
#> Policymaking Requires Facts.'' This white paper is closely linked to a 
#> previous white paper ent

<sup>Created on 2023-05-22 with reprex v2.0.2</sup>
