Downloading images from the web and their attributes in R

Question


I want to download the images of this site https://moweek.com.uy/ in R, specifically those under the class = "expandedCategory": "VESTIMENTA", "CALZADO", "ACCESORIOS" (data-category-id="1", data-category-id="2", and data-category-id="3"). For example, go into the sub-categories (<div id="expandedCategoryContainer">) under "VESTIMENTA" (except for "Ver todo"), like "Activewear" (<a data-category-id="57" href="/vestimenta/activewear/1" class="categoryLevelTwoTitle selected">Activewear</a>), clicking on all the images and obtaining all of the information in a dataset as well as the images.
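For illustration only (this is not part of the original attempt), a minimal sketch of how those sub-category links might be collected with rvest, assuming the "Ver todo" entries can be filtered out by their visible link text (untested against the live site):

library(rvest)

page <- read_html("https://moweek.com.uy/")

# Anchors inside the expanded category container hold the sub-category links
subcat_nodes <- html_nodes(page, "#expandedCategoryContainer a")

# Drop the "Ver todo" links by their link text and keep the remaining hrefs
keep <- html_text(subcat_nodes, trim = TRUE) != "Ver todo"
subcat_urls <- html_attr(subcat_nodes, "href")[keep]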

I've been trying to do this but I am failing.

pacman::p_load(tidyverse, rvest, httr)

url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))

category_nodes <- html_nodes(webpage, ".expandedCategory")

category_urls <- lapply(category_nodes, function(node) html_nodes(node, "a") %>% 
                          html_attr("href")) %>% 
  unlist() %>% 
  str_subset("/vestimenta/|/calzado/|/accesorios/")

image_data <- data.frame()
for (url in category_urls) {
  cat_url <- paste0("https://moweek.com.uy", url)
  cat_content <- GET(cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))
  
  # Extract the category name and the subcategory name from the page title
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_to_title()
  subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
    str_to_title()
  
  # Extract the image information from the page and store it in a data frame
  image_tags <- html_nodes(cat_page, ".productItem")
  image_info <- data.frame()
  for (tag in image_tags) {
    name <- html_text(html_node(tag, ".productTitle"))
    price <- html_text(html_node(tag, ".priceText"))
    img_url <- html_attr(html_node(tag, "img"), "src")
    image_info <- image_info %>%
      add_row(Category = cat_name,
              Subcategory = subcat_name,
              Name = name,
              Price = price,
              Image_URL = img_url)
  }
  image_data <- image_data %>% bind_rows(image_info)
}

for (i in 1:nrow(image_data)) {
  img <- GET(image_data$Image_URL[i])
  category <- image_data$Category[i]
  subcategory <- image_data$Subcategory[i]
  category_folder <- gsub(" ", "_", category)
  subcategory_folder <- gsub(" ", "_", subcategory)
  
  if (!dir.exists(category_folder)) {
    dir.create(category_folder)
  }
  if (!dir.exists(paste0(category_folder, "/", subcategory_folder))) {
    dir.create(paste0(category_folder, "/", subcategory_folder))
  }
  
  file_name <- paste
}

Answer 1

Score: 1


This code seemed to work for me.

library(tidyverse) 
library(rvest)
library(httr)

url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))

category_nodes <- html_nodes(webpage, ".expandedCategory")

category_urls <- lapply(category_nodes, function(node) html_nodes(node, "a") %>% 
                          html_attr("href")) %>% 
  unlist() %>% 
  str_subset("/vestimenta/|/calzado/|/accesorios/")

image_data <- tibble()
for (url in category_urls) {
  cat_url <- paste0("https://moweek.com.uy", url)
  # Because we are sending so many requests, some are likely to fail
  cat_content <- RETRY("GET", cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))
  
  # Extract the category name and the subcategory name from the page title
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_to_title()
  subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
    str_to_title()
  
  # Extract the image information from the page and store it in a data frame
  image_tags <- html_nodes(cat_page, ".productViewContainer")
  # Initialise columns so bind_rows() works
  image_info <- tibble(
    Category = character(),
    Subcategory = character(),
    Name = character(),
    Price = character(),
    Image_URL = character()
  )
  for (tag in image_tags) {
    name <- html_text(html_node(tag, ".productViewName"))
    price <- html_text(html_node(tag, ".productViewPrice"))
    img_url <- html_attr(html_node(tag, ".productViewTop"), "data-hover-image")
    image_info <- image_info %>%
      add_row(Category = cat_name,
              Subcategory = subcat_name,
              Name = name,
              Price = price,
              Image_URL = img_url)
  }
  image_data <- image_data %>% bind_rows(image_info)
}

# Clean data
image_data <- image_data %>%
  mutate(
    Name = str_trim(Name),
    Price = str_trim(Price)
  )

# Get the file name of each image
file_names <- str_match(image_data$Image_URL, ".+(/.*?$)")[,2] %>%
  str_sub(start = 2L)

for (i in seq_len(nrow(image_data))) {
  url <- image_data$Image_URL[i]
  category <- image_data$Category[i]
  subcategory <- image_data$Subcategory[i]
  category_folder <- gsub(" ", "_", category)
  subcategory_folder <- gsub(" ", "_", subcategory)
  
  if (!dir.exists(category_folder)) {
    dir.create(category_folder)
  }
  if (!dir.exists(paste0(category_folder, "/", subcategory_folder))) {
    dir.create(paste0(category_folder, "/", subcategory_folder))
  }
  
  # Download and store image in correct directory
  dir_name <- paste0(category_folder, "/", subcategory_folder, "/")
  file_name <- file_names[i]
  download.file(url, paste0(dir_name, file_name))
}
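If individual image downloads fail or arrive corrupted (download.file() may need mode = "wb" on Windows for binary files), one possible refinement, not part of the original answer, is to reuse httr::RETRY() together with write_disk() and a short pause between requests:

# Sketch of a more defensive download loop (same folder logic as above);
# write_disk() streams the response body straight to file in binary form
for (i in seq_len(nrow(image_data))) {
  dir_name <- file.path(gsub(" ", "_", image_data$Category[i]),
                        gsub(" ", "_", image_data$Subcategory[i]))
  dir.create(dir_name, recursive = TRUE, showWarnings = FALSE)
  RETRY("GET", image_data$Image_URL[i],
        write_disk(file.path(dir_name, file_names[i]), overwrite = TRUE))
  Sys.sleep(0.5)  # be polite to the server between requests
}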
