Downloading images from the web and their attributes in R


Question


I want to download, in R, the images from https://moweek.com.uy/, specifically those under the class "expandedCategory": "VESTIMENTA", "CALZADO" and "ACCESORIOS" (data-category-id="1", data-category-id="2" and data-category-id="3"). For example, go into the subcategories (<div id="expandedCategoryContainer">) under "VESTIMENTA", except "Ver todo", such as "Activewear" (<a data-category-id="57" href="/vestimenta/activewear/1" class="categoryLevelTwoTitle selected">Activewear</a>), click through all of the images, and collect all of the information in a dataset together with the images themselves.

I've been trying to do this, but I am failing. Here is my attempt:

pacman::p_load(tidyverse, rvest, httr)

url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))

category_nodes <- html_nodes(webpage, ".expandedCategory")

category_urls <- lapply(category_nodes, function(node) html_nodes(node, "a") %>%
                          html_attr("href")) %>%
  unlist() %>%
  str_subset("/vestimenta/|/calzado/|/accesorios/")

image_data <- data.frame()
for (url in category_urls) {
  cat_url <- paste0("https://moweek.com.uy", url)
  cat_content <- GET(cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))

  # Extract the category name and the subcategory name from the page title
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_to_title()
  subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
    str_to_title()

  # Extract the image information from the page and store it in a data frame
  image_tags <- html_nodes(cat_page, ".productItem")
  image_info <- data.frame()
  for (tag in image_tags) {
    name <- html_text(html_node(tag, ".productTitle"))
    price <- html_text(html_node(tag, ".priceText"))
    img_url <- html_attr(html_node(tag, "img"), "src")
    image_info <- image_info %>%
      add_row(Category = cat_name,
              Subcategory = subcat_name,
              Name = name,
              Price = price,
              Image_URL = img_url)
  }
  image_data <- image_data %>% bind_rows(image_info)
}

for (i in 1:nrow(image_data)) {
  img <- GET(image_data$Image_URL[i])
  category <- image_data$Category[i]
  subcategory <- image_data$Subcategory[i]
  category_folder <- gsub(" ", "_", category)
  subcategory_folder <- gsub(" ", "_", subcategory)

  if (!dir.exists(category_folder)) {
    dir.create(category_folder)
  }
  if (!dir.exists(paste0(category_folder, "/", subcategory_folder))) {
    dir.create(paste0(category_folder, "/", subcategory_folder))
  }

  file_name <- paste
}
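
One way to narrow down where this fails is to check whether the product selectors actually match anything on a single category page before running the full loop. The following is a minimal diagnostic sketch, not part of the original attempt; it uses the subcategory URL mentioned above, and the class names .productItem, .productTitle and .priceText are the ones from the attempt, which may not exist in the live markup.

# Minimal diagnostic sketch: count how many nodes each selector matches
# on one subcategory page.
library(rvest)
library(httr)

test_url <- "https://moweek.com.uy/vestimenta/activewear/1"
test_page <- read_html(content(GET(test_url), as = "text"))

# A count of 0 means that selector does not match the live markup
# and needs to be replaced (see the answer below).
length(html_nodes(test_page, ".productItem"))
length(html_nodes(test_page, ".productTitle"))
length(html_nodes(test_page, ".priceText"))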

Answer 1

Score: 1


This code seemed to work for me.

library(tidyverse)
library(rvest)
library(httr)

url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))

category_nodes <- html_nodes(webpage, ".expandedCategory")

category_urls <- lapply(category_nodes, function(node) html_nodes(node, "a") %>%
                          html_attr("href")) %>%
  unlist() %>%
  str_subset("/vestimenta/|/calzado/|/accesorios/")

image_data <- tibble()
for (url in category_urls) {
  cat_url <- paste0("https://moweek.com.uy", url)
  # Because we are sending so many requests, some are likely to fail
  cat_content <- RETRY("GET", cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))

  # Extract the category name and the subcategory name from the page title
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_to_title()
  subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
    str_to_title()

  # Extract the image information from the page and store it in a data frame
  image_tags <- html_nodes(cat_page, ".productViewContainer")
  # Initialise columns so bind_rows() works
  image_info <- tibble(
    Category = character(),
    Subcategory = character(),
    Name = character(),
    Price = character(),
    Image_URL = character()
  )
  for (tag in image_tags) {
    name <- html_text(html_node(tag, ".productViewName"))
    price <- html_text(html_node(tag, ".productViewPrice"))
    img_url <- html_attr(html_node(tag, ".productViewTop"), "data-hover-image")
    image_info <- image_info %>%
      add_row(Category = cat_name,
              Subcategory = subcat_name,
              Name = name,
              Price = price,
              Image_URL = img_url)
  }
  image_data <- image_data %>% bind_rows(image_info)
}

# Clean data
image_data <- image_data %>%
  mutate(
    Name = str_trim(Name),
    Price = str_trim(Price)
  )

# Get the file name of each image
file_names <- str_match(image_data$Image_URL, ".+(/.*?$)")[,2] %>%
  str_sub(start = 2L)
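# Equivalently, base R's basename(image_data$Image_URL) returns the same
# file names (the part of each URL after the final "/").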

for (i in seq_len(nrow(image_data))) {
  url <- image_data$Image_URL[i]
  category <- image_data$Category[i]
  subcategory <- image_data$Subcategory[i]
  category_folder <- gsub(" ", "_", category)
  subcategory_folder <- gsub(" ", "_", subcategory)

  if (!dir.exists(category_folder)) {
    dir.create(category_folder)
  }
  if (!dir.exists(paste0(category_folder, "/", subcategory_folder))) {
    dir.create(paste0(category_folder, "/", subcategory_folder))
  }

  # Download and store image in correct directory
  dir_name <- paste0(category_folder, "/", subcategory_folder, "/")
  file_name <- file_names[i]
  download.file(url, paste0(dir_name, file_name))
}
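
If the download loop aborts partway through, a slightly more defensive variant may help. This is a minimal sketch under the same assumptions as the answer above (it reuses image_data, file_names and the same folder naming); it skips broken image URLs instead of stopping, writes in binary mode so files are not corrupted on Windows, and pauses briefly between requests.

# Defensive download loop (sketch): continues past failures and throttles requests.
for (i in seq_len(nrow(image_data))) {
  dir_name <- paste0(gsub(" ", "_", image_data$Category[i]), "/",
                     gsub(" ", "_", image_data$Subcategory[i]), "/")
  dir.create(dir_name, recursive = TRUE, showWarnings = FALSE)
  dest <- paste0(dir_name, file_names[i])
  # tryCatch() keeps the loop going if a single image URL fails;
  # mode = "wb" writes the file in binary mode.
  tryCatch(
    download.file(image_data$Image_URL[i], dest, mode = "wb", quiet = TRUE),
    error = function(e) message("Failed to download: ", image_data$Image_URL[i])
  )
  Sys.sleep(0.5)  # small delay between requests
}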
