英文:
Downloading images from web and its attributes in R
问题
我要下载此网站 https://moweek.com.uy/ 中的图像,具体来说,那些属于类 = "expandedCategory" 的图像: "VESTIMENTA", "CALZADO", "ACCESORIOS" (data-category-id="1", data-category-id="2" 和 data-category-id="3")。例如,进入"VESTIMENTA"下的子类别(<div id="expandedCategoryContainer">),除了"Ver todo"之外,像"Activewear"一样点击所有图像并获取数据集中的所有信息以及图像。
我尝试过这样做,但没有成功。
英文:
I want to download the images of this site https://moweek.com.uy/ in R, specifically, those under the class = "expandedCategory": "VESTIMENTA", "CALZADO", "ACCESORIOS" (data-category-id="1", data-category-id="2", and data-category-id="3"). For example, go into the sub categories (<div id="expandedCategoryContainer">) under "VESTIMENTA" (except from "Ver todo") like "Activewear" (<a data-category-id="57" href="/vestimenta/activewear/1" class=" categoryLevelTwoTitle selected'>Activewear</a> == $0) clicking in all the images and obtain all of the information in a dataset as well as the images.
I've been trying to do this but I am failing.
pacman::p_load(tidyverse, rvest, httr)

# Scrape moweek.com.uy: collect product name/price/image for every
# sub-category under VESTIMENTA / CALZADO / ACCESORIOS, then download
# each image into a Category/Subcategory folder tree.
url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))
category_nodes <- html_nodes(webpage, ".expandedCategory")
category_urls <- lapply(category_nodes, function(node) {
  html_nodes(node, "a") %>% html_attr("href")
}) %>%
  unlist() %>%
  str_subset("/vestimenta/|/calzado/|/accesorios/")

# Collect one tibble per category page and bind once at the end:
# growing a data frame inside a loop copies it on every iteration,
# and add_row() on a zero-column data.frame() errors outright.
page_rows <- vector("list", length(category_urls))
for (i in seq_along(category_urls)) {
  cat_url <- paste0("https://moweek.com.uy", category_urls[[i]])
  # RETRY() instead of GET(): many sequential requests, some will
  # fail transiently.
  cat_content <- RETRY("GET", cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))

  # Category name from the page title, sub-category from the selected
  # level-two link.
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_to_title()
  subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
    str_to_title()

  # The product tiles on this site are ".productViewContainer"
  # (not ".productItem"); html_node() is vectorised over a nodeset,
  # so no inner per-tile loop is needed.  rep() keeps the tibble
  # valid (zero rows) when a page has no products.
  tiles <- html_nodes(cat_page, ".productViewContainer")
  page_rows[[i]] <- tibble(
    Category = rep(cat_name, length(tiles)),
    Subcategory = rep(subcat_name, length(tiles)),
    Name = str_trim(html_text(html_node(tiles, ".productViewName"))),
    Price = str_trim(html_text(html_node(tiles, ".productViewPrice"))),
    Image_URL = html_attr(html_node(tiles, ".productViewTop"),
                          "data-hover-image")
  )
}
image_data <- bind_rows(page_rows)

# Download every image into Category/Subcategory/.
# seq_len() (not 1:nrow()) is safe when image_data has zero rows.
for (i in seq_len(nrow(image_data))) {
  img_url <- image_data$Image_URL[i]
  dir_name <- file.path(
    gsub(" ", "_", image_data$Category[i]),
    gsub(" ", "_", image_data$Subcategory[i])
  )
  # recursive = TRUE creates both directory levels in one call.
  if (!dir.exists(dir_name)) {
    dir.create(dir_name, recursive = TRUE)
  }
  # basename() gives the file-name part of the URL; mode = "wb" is
  # essential so binary image data is not corrupted on Windows.
  download.file(img_url, file.path(dir_name, basename(img_url)),
                mode = "wb")
}
答案1
得分: 1
这段代码对我来说似乎有效:
library(tidyverse)
library(rvest)
library(httr)

# Scrape moweek.com.uy: collect product name/price/image for each
# sub-category link, then download the images into Category/Subcategory
# folders.
url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))
category_nodes <- html_nodes(webpage, ".expandedCategory")
category_urls <- lapply(category_nodes, function(node) {
  html_nodes(node, "a") %>% html_attr("href")
}) %>%
  unlist() %>%
  str_subset("/vestimenta/|/calzado/|/accesorios/")

image_data <- tibble()
for (url in category_urls) {
  cat_url <- paste0("https://moweek.com.uy", url)
  # Because we send many requests, some may fail; RETRY() retries
  # transient failures with back-off.
  cat_content <- RETRY("GET", cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))

  # Extract the category name and the sub-category name from the page.
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_to_title()
  subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
    str_to_title()

  # html_node() is vectorised over a nodeset, so the product fields can
  # be pulled without an inner loop (the per-tile add_row() loop copied
  # the tibble on every iteration).  rep() keeps the tibble valid
  # (zero rows) when a page has no products.
  image_tags <- html_nodes(cat_page, ".productViewContainer")
  image_info <- tibble(
    Category = rep(cat_name, length(image_tags)),
    Subcategory = rep(subcat_name, length(image_tags)),
    Name = html_text(html_node(image_tags, ".productViewName")),
    Price = html_text(html_node(image_tags, ".productViewPrice")),
    Image_URL = html_attr(html_node(image_tags, ".productViewTop"),
                          "data-hover-image")
  )
  image_data <- image_data %>% bind_rows(image_info)
}

# Clean data.
image_data <- image_data %>%
  mutate(
    Name = str_trim(Name),
    Price = str_trim(Price)
  )

for (i in seq_len(nrow(image_data))) {
  img_url <- image_data$Image_URL[i]
  # file.path() builds "Category/Subcategory"; recursive = TRUE creates
  # both directory levels in a single call.
  dir_name <- file.path(
    gsub(" ", "_", image_data$Category[i]),
    gsub(" ", "_", image_data$Subcategory[i])
  )
  if (!dir.exists(dir_name)) {
    dir.create(dir_name, recursive = TRUE)
  }
  # basename() extracts the file name from the URL (simpler and safer
  # than the regex).  mode = "wb" is essential: the default text mode
  # corrupts binary image files on Windows.
  download.file(img_url, file.path(dir_name, basename(img_url)),
                mode = "wb")
}
英文:
This code seemed to work for me.
library(tidyverse)
library(rvest)
library(httr)

# Scrape moweek.com.uy: collect product name/price/image for each
# sub-category link, then download the images into Category/Subcategory
# folders.
url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))
category_nodes <- html_nodes(webpage, ".expandedCategory")
category_urls <- lapply(category_nodes, function(node) {
  html_nodes(node, "a") %>% html_attr("href")
}) %>%
  unlist() %>%
  str_subset("/vestimenta/|/calzado/|/accesorios/")

image_data <- tibble()
for (url in category_urls) {
  cat_url <- paste0("https://moweek.com.uy", url)
  # Because we are sending so many requests, some are likely to fail;
  # RETRY() retries transient failures with back-off.
  cat_content <- RETRY("GET", cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))

  # Extract the category name and the subcategory name from the page.
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_to_title()
  subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
    str_to_title()

  # html_node() is vectorised over a nodeset, so the product fields can
  # be extracted without an inner loop (the per-tile add_row() loop
  # copied the tibble on every iteration).  rep() keeps the tibble
  # valid (zero rows) when a page has no products.
  image_tags <- html_nodes(cat_page, ".productViewContainer")
  image_info <- tibble(
    Category = rep(cat_name, length(image_tags)),
    Subcategory = rep(subcat_name, length(image_tags)),
    Name = html_text(html_node(image_tags, ".productViewName")),
    Price = html_text(html_node(image_tags, ".productViewPrice")),
    Image_URL = html_attr(html_node(image_tags, ".productViewTop"),
                          "data-hover-image")
  )
  image_data <- image_data %>% bind_rows(image_info)
}

# Clean data.
image_data <- image_data %>%
  mutate(
    Name = str_trim(Name),
    Price = str_trim(Price)
  )

for (i in seq_len(nrow(image_data))) {
  img_url <- image_data$Image_URL[i]
  # file.path() builds "Category/Subcategory"; recursive = TRUE creates
  # both directory levels in a single call.
  dir_name <- file.path(
    gsub(" ", "_", image_data$Category[i]),
    gsub(" ", "_", image_data$Subcategory[i])
  )
  if (!dir.exists(dir_name)) {
    dir.create(dir_name, recursive = TRUE)
  }
  # basename() extracts the file name from the URL (simpler and safer
  # than the regex).  mode = "wb" is essential: the default text mode
  # corrupts binary image files on Windows.
  download.file(img_url, file.path(dir_name, basename(img_url)),
                mode = "wb")
}
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论