Downloading images and their attributes from the web in R.

huangapple go评论103阅读模式
英文:

Downloading images from web and its attributes in R

问题

我要下载此网站 https://moweek.com.uy/ 中的图像,具体来说,那些属于类 = "expandedCategory" 的图像: "VESTIMENTA", "CALZADO", "ACCESORIOS" (data-category-id="1", data-category-id="2" 和 data-category-id="3")。例如,进入"VESTIMENTA"下的子类别(<div id="expandedCategoryContainer">),除了"Ver todo"之外,像"Activewear"一样点击所有图像并获取数据集中的所有信息以及图像。

我尝试过这样做,但没有成功。

英文:

I want to download the images of this site https://moweek.com.uy/ in R, specifically, those under the class = "expandedCategory": "VESTIMENTA", "CALZADO", "ACCESORIOS" (data-category-id="1", data-category-id="2", and data-category-id="3"). For example, go into the sub categories (<div id="expandedCategoryContainer">) under "VESTIMENTA" (except from "Ver todo") like "Activewear" (<a data-category-id="57" href="/vestimenta/activewear/1" class=" categoryLevelTwoTitle selected'>Activewear</a> == $0) clicking in all the images and obtain all of the information in a dataset as well as the images.

I've been trying to do this but I am failing.

  # Scrape product listings from moweek.com.uy and download every product
  # image into a <Category>/<Subcategory>/ folder tree.
  # Packages: tidyverse (dplyr/stringr), rvest (HTML parsing), httr (HTTP).
  pacman::p_load(tidyverse, rvest, httr)

  # Fetch and parse the landing page
  url <- "https://moweek.com.uy/"
  html_content <- GET(url)
  webpage <- read_html(content(html_content, as = "text"))

  # Collect every sub-category link under the expanded top-level categories,
  # keeping only the three categories of interest
  category_nodes <- html_nodes(webpage, ".expandedCategory")
  category_urls <- lapply(category_nodes, function(node) {
    html_nodes(node, "a") %>% html_attr("href")
  }) %>%
    unlist() %>%
    str_subset("/vestimenta/|/calzado/|/accesorios/")

  image_data <- tibble()
  for (url in category_urls) {
    cat_url <- paste0("https://moweek.com.uy", url)
    cat_content <- GET(cat_url)
    cat_page <- read_html(content(cat_content, as = "text"))

    # Category name comes from the page <title>, sub-category from the
    # page heading; trim so folder names get no stray underscores later
    cat_name <- html_text(html_node(cat_page, "title")) %>%
      str_replace("- MoWeek", "") %>%
      str_trim() %>%
      str_to_title()
    subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
      str_to_title()

    # One row per product: name, price, image URL.
    # Columns must be initialised up front — add_row() errors when asked
    # to add named columns to a zero-column data frame (the original bug).
    image_tags <- html_nodes(cat_page, ".productItem")
    image_info <- tibble(
      Category = character(),
      Subcategory = character(),
      Name = character(),
      Price = character(),
      Image_URL = character()
    )
    for (tag in image_tags) {
      name <- html_text(html_node(tag, ".productTitle"))
      price <- html_text(html_node(tag, ".priceText"))
      img_url <- html_attr(html_node(tag, "img"), "src")
      image_info <- image_info %>%
        add_row(Category = cat_name,
                Subcategory = subcat_name,
                Name = name,
                Price = price,
                Image_URL = img_url)
    }
    image_data <- image_data %>% bind_rows(image_info)
  }

  # Download every image into <Category>/<Subcategory>/<file>.
  # seq_len() is safe when image_data has zero rows (1:nrow would give 1:0).
  for (i in seq_len(nrow(image_data))) {
    img_url <- image_data$Image_URL[i]
    category_folder <- gsub(" ", "_", image_data$Category[i])
    subcategory_folder <- gsub(" ", "_", image_data$Subcategory[i])
    dir_name <- file.path(category_folder, subcategory_folder)
    # recursive = TRUE creates both directory levels in one call
    if (!dir.exists(dir_name)) {
      dir.create(dir_name, recursive = TRUE)
    }
    # The original stopped at a truncated `file_name <- paste`; derive the
    # file name from the URL and download in binary mode ("wb") so image
    # bytes are not mangled on Windows.
    file_name <- basename(img_url)
    download.file(img_url, file.path(dir_name, file_name), mode = "wb")
  }

答案1

得分: 1

以下代码对我有效:

  # Scrape product listings from moweek.com.uy and save every product image
  # into a <Category>/<Subcategory>/ folder tree.
  library(tidyverse)
  library(rvest)
  library(httr)

  # Fetch and parse the landing page
  url <- "https://moweek.com.uy/"
  html_content <- GET(url)
  webpage <- read_html(content(html_content, as = "text"))

  # Sub-category links under the expanded categories, restricted to the
  # three top-level categories of interest
  category_nodes <- html_nodes(webpage, ".expandedCategory")
  category_urls <- lapply(category_nodes, function(node) {
    html_nodes(node, "a") %>% html_attr("href")
  }) %>%
    unlist() %>%
    str_subset("/vestimenta/|/calzado/|/accesorios/")

  image_data <- tibble()
  for (url in category_urls) {
    cat_url <- paste0("https://moweek.com.uy", url)
    # Because we send many requests, some may fail; RETRY() re-issues them
    cat_content <- RETRY("GET", cat_url)
    cat_page <- read_html(content(cat_content, as = "text"))
    # Extract the category and sub-category names from the page title/heading
    cat_name <- html_text(html_node(cat_page, "title")) %>%
      str_replace("- MoWeek", "") %>%
      str_to_title()
    subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
      str_to_title()
    # Extract the image information from the page and store it in a tibble
    image_tags <- html_nodes(cat_page, ".productViewContainer")
    # Initialise the columns so add_row()/bind_rows() work on the first pass
    image_info <- tibble(
      Category = character(),
      Subcategory = character(),
      Name = character(),
      Price = character(),
      Image_URL = character()
    )
    for (tag in image_tags) {
      name <- html_text(html_node(tag, ".productViewName"))
      price <- html_text(html_node(tag, ".productViewPrice"))
      img_url <- html_attr(html_node(tag, ".productViewTop"), "data-hover-image")
      image_info <- image_info %>%
        add_row(Category = cat_name,
                Subcategory = subcat_name,
                Name = name,
                Price = price,
                Image_URL = img_url)
    }
    image_data <- image_data %>% bind_rows(image_info)
  }

  # Clean data: strip the surrounding whitespace rvest leaves in the text
  image_data <- image_data %>%
    mutate(
      Name = str_trim(Name),
      Price = str_trim(Price)
    )

  # File name of each image = last path component of its URL
  file_names <- str_match(image_data$Image_URL, ".+(/.*?$)")[, 2] %>%
    str_sub(start = 2L)

  for (i in seq_len(nrow(image_data))) {
    url <- image_data$Image_URL[i]
    category <- image_data$Category[i]
    subcategory <- image_data$Subcategory[i]
    category_folder <- gsub(" ", "_", category)
    subcategory_folder <- gsub(" ", "_", subcategory)
    if (!dir.exists(category_folder)) {
      dir.create(category_folder)
    }
    if (!dir.exists(paste0(category_folder, "/", subcategory_folder))) {
      dir.create(paste0(category_folder, "/", subcategory_folder))
    }
    # Download the image into the correct directory. mode = "wb" is
    # required for binary files: without it download.file() corrupts
    # images on Windows (text-mode newline translation).
    dir_name <- paste0(category_folder, "/", subcategory_folder, "/")
    file_name <- file_names[i]
    download.file(url, paste0(dir_name, file_name), mode = "wb")
  }
英文:

This code seemed to work for me.

  # Scrape product listings from moweek.com.uy and save every product image
  # into a <Category>/<Subcategory>/ folder tree.
  library(tidyverse)
  library(rvest)
  library(httr)

  # Fetch and parse the landing page
  url <- "https://moweek.com.uy/"
  html_content <- GET(url)
  webpage <- read_html(content(html_content, as = "text"))

  # Sub-category links under the expanded categories, restricted to the
  # three top-level categories of interest
  category_nodes <- html_nodes(webpage, ".expandedCategory")
  category_urls <- lapply(category_nodes, function(node) {
    html_nodes(node, "a") %>% html_attr("href")
  }) %>%
    unlist() %>%
    str_subset("/vestimenta/|/calzado/|/accesorios/")

  image_data <- tibble()
  for (url in category_urls) {
    cat_url <- paste0("https://moweek.com.uy", url)
    # Because we are sending so many requests, some are likely to fail;
    # RETRY() re-issues them with backoff
    cat_content <- RETRY("GET", cat_url)
    cat_page <- read_html(content(cat_content, as = "text"))
    # Extract the category and sub-category names from the page title/heading
    cat_name <- html_text(html_node(cat_page, "title")) %>%
      str_replace("- MoWeek", "") %>%
      str_to_title()
    subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
      str_to_title()
    # Extract the image information from the page and store it in a tibble
    image_tags <- html_nodes(cat_page, ".productViewContainer")
    # Initialise columns so add_row()/bind_rows() work on the first pass
    image_info <- tibble(
      Category = character(),
      Subcategory = character(),
      Name = character(),
      Price = character(),
      Image_URL = character()
    )
    for (tag in image_tags) {
      name <- html_text(html_node(tag, ".productViewName"))
      price <- html_text(html_node(tag, ".productViewPrice"))
      img_url <- html_attr(html_node(tag, ".productViewTop"), "data-hover-image")
      image_info <- image_info %>%
        add_row(Category = cat_name,
                Subcategory = subcat_name,
                Name = name,
                Price = price,
                Image_URL = img_url)
    }
    image_data <- image_data %>% bind_rows(image_info)
  }

  # Clean data: strip the surrounding whitespace rvest leaves in the text
  image_data <- image_data %>%
    mutate(
      Name = str_trim(Name),
      Price = str_trim(Price)
    )

  # File name of each image = last path component of its URL
  file_names <- str_match(image_data$Image_URL, ".+(/.*?$)")[, 2] %>%
    str_sub(start = 2L)

  for (i in seq_len(nrow(image_data))) {
    url <- image_data$Image_URL[i]
    category <- image_data$Category[i]
    subcategory <- image_data$Subcategory[i]
    category_folder <- gsub(" ", "_", category)
    subcategory_folder <- gsub(" ", "_", subcategory)
    if (!dir.exists(category_folder)) {
      dir.create(category_folder)
    }
    if (!dir.exists(paste0(category_folder, "/", subcategory_folder))) {
      dir.create(paste0(category_folder, "/", subcategory_folder))
    }
    # Download and store the image in the correct directory. mode = "wb"
    # is required for binary files: without it download.file() corrupts
    # images on Windows (text-mode newline translation).
    dir_name <- paste0(category_folder, "/", subcategory_folder, "/")
    file_name <- file_names[i]
    download.file(url, paste0(dir_name, file_name), mode = "wb")
  }

huangapple
  • 本文由 发表于 2023年4月11日 00:21:49
  • 转载请务必保留本文链接:https://go.coder-hub.com/75978803.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定