英文:
Finding mean of variable across each month/year
问题
我有一个类似于这样的数据集:
> dput(df)
structure(list(Date = c("3/23/21", "4/11/22", "6/30/22"), Banana_wasted = c(4L,
2L, 5L), Apple_wasted = c(6L, 0L, 3L), Orange_wasted = c(1L,
4L, 1L), Banana_ordered = c(5L, 7L, 7L), Apple_Ordered = c(9L,
8L, 9L), Orange_ordered = c(5L, 6L, 6L), Banana_eaten = c(5L,
5L, 6L), Apple_eaten = c(7L, 7L, 4L), Orange_eaten = c(8L, 8L,
8L)), class = "data.frame", row names = c(NA, -3L))
我想要计算每个月/年水果浪费的百分比(与订购了多少水果有关)。
应该是:
(Banana_wasted
+Apple_wasted
+Orange_wasted
)/(Banana_ordered
+ Apple_ordered
+ Orange_ordered
)
因此,对于 3/21,应该是:
(4+6+1/5+9+5)*100 = 57.9%
我想要为一年中的每个月都这样做。
英文:
I have a dataset that looks similar to this:
> dput(df)
structure(list(Date = c("3/23/21", "4/11/22", "6/30/22"), Banana_wasted = c(4L,
2L, 5L), Apple_wasted = c(6L, 0L, 3L), Orange_wasted = c(1L,
4L, 1L), Banana_ordered = c(5L, 7L, 7L), Apple_Ordered = c(9L,
8L, 9L), Orange_ordered = c(5L, 6L, 6L), Banana_eaten = c(5L,
5L, 6L), Apple_eaten = c(7L, 7L, 4L), Orange_eaten = c(8L, 8L,
8L)), class = "data.frame", row.names = c(NA, -3L))
I want to find the % of fruit wasted per month/year (in relation to how many fruits were ordered).
it should be:
(Banana_wasted
+Apple_wasted
+Orange_wasted
) / (Banana_ordered
+ Apple_ordered
+ Orange_ordered
)
So, for 3/21, it should be:
(4+6+1/5+9+5)*100 = 57.9%
I would like to do this for every month of the year.
答案1
得分: 2
library(tidyverse)
df %>%
group_by(Date = floor_date(mdy(Date), "month")) %>%
summarise(
wasted = sum(across(contains("wasted"))) / sum(across(contains("ordered"))),
wasted_eaten = sum(across(contains("wasted"))) / sum(across(contains("eaten")))
)
# A tibble: 3 x 3
Date wasted wasted_eaten
<date> <dbl> <dbl>
1 2021-03-01 0.579 0.579
2 2022-04-01 0.286 0.314
3 2022-06-01 0.409 0.523
英文:
library(tidyverse)
df %>%
group_by(Date = floor_date(mdy(Date), "month")) %>%
summarise(
wasted = sum(across(contains("wasted"))) / sum(across(contains("ordered"))),
wasted_eaten = sum(across(contains("wasted"))) / sum(across(contains("eaten")))
)
# A tibble: 3 x 3
Date wasted wasted_eaten
<date> <dbl> <dbl>
1 2021-03-01 0.579 0.579
2 2022-04-01 0.286 0.314
3 2022-06-01 0.409 0.523
答案2
得分: 1
库(dplyr)
库(lubridate)
df %>%
变异(日期 = as.Date(日期, format = "%m/%d/%y"),
浪费百分比 = (香蕉浪费 + 苹果浪费 + 橙子浪费) / (香蕉订购 + 苹果订购 + 橙子订购) * 100) %>%
分组依据(年份 = year(日期), 月份 = month(日期)) %>%
汇总(平均浪费百分比 = mean(浪费百分比))
#> # A tibble: 3 × 3
#> # Groups: year [2]
#> year month avg_pct_wasted
#> <dbl> <dbl> <dbl>
#> 1 2021 3 57.9
#> 2 2022 4 28.6
#> 3 2022 6 40.9
英文:
library(dplyr)
library(lubridate)
df %>%
mutate(Date = as.Date(Date, format = "%m/%d/%y"),
pct_wasted = (Banana_wasted + Apple_wasted + Orange_wasted) / (Banana_ordered + Apple_Ordered + Orange_ordered) * 100) %>%
group_by(year = year(Date), month = month(Date)) %>%
summarize(avg_pct_wasted = mean(pct_wasted))
#> # A tibble: 3 × 3
#> # Groups: year [2]
#> year month avg_pct_wasted
#> <dbl> <dbl> <dbl>
#> 1 2021 3 57.9
#> 2 2022 4 28.6
#> 3 2022 6 40.9
<sup>Created on 2023-02-06 with reprex v2.0.2</sup>
答案3
得分: 0
以下是翻译好的代码部分:
library(dplyr)
library(tidyr)
library(lubridate)
dat %>%
rename(Apple_ordered = Apple_Ordered) %>%
pivot_longer(
Banana_wasted:Orange_eaten,
names_to = c("水果", ".value"),
names_sep = "_"
) %>%
group_by(month = floor_date(mdy(Date), "month")) %>%
summarize(pct_wasted = sum(wasted) / sum(ordered)) %>%
ungroup()
# # 一个数据框: 3 × 2
# 月份 百分比浪费
# <日期> <dbl>
# 1 2021-03-01 0.579
# 2 2022-04-01 0.286
# 3 2022-06-01 0.409
library(scales)
dat %>%
rename(Apple_ordered = Apple_Ordered) %>%
pivot_longer(
Banana_wasted:Orange_eaten,
names_to = c("水果", ".value"),
names_sep = "_"
) %>%
group_by(month = strftime(mdy(Date), "%B %Y")) %>%
summarize(pct_wasted = percent(sum(wasted) / sum(ordered), accuracy = 0.1)) %>%
ungroup()
# # 一个数据框: 3 × 2
# 月份 百分比浪费
# <字符> <字符>
# 1 April 2022 28.6%
# 2 June 2022 40.9%
# 3 March 2021 57.9%
英文:
Pivot longer to get single wasted
and ordered
columns across all fruits; use lubridate::floor_date()
and mdy()
to get months from Date
; group by month; then sum and divide to get your percentages:
library(dplyr)
library(tidyr)
library(lubridate)
dat %>%
rename(Apple_ordered = Apple_Ordered) %>% # for consistent capitalization
pivot_longer(
Banana_wasted:Orange_eaten,
names_to = c("Fruit", ".value"),
names_sep = "_"
) %>%
group_by(month = floor_date(mdy(Date), "month")) %>%
summarize(pct_wasted = sum(wasted) / sum(ordered)) %>%
ungroup()
# # A tibble: 3 × 2
# month pct_wasted
# <date> <dbl>
# 1 2021-03-01 0.579
# 2 2022-04-01 0.286
# 3 2022-06-01 0.409
If you prefer character labels, use strftime()
instead of floor_date()
, and scales::percent()
for the percentages:
library(scales)
dat %>%
rename(Apple_ordered = Apple_Ordered) %>%
pivot_longer(
Banana_wasted:Orange_eaten,
names_to = c("Fruit", ".value"),
names_sep = "_"
) %>%
group_by(month = strftime(mdy(Date), "%B %Y")) %>%
summarize(pct_wasted = percent(sum(wasted) / sum(ordered), accuracy = 0.1)) %>%
ungroup()
# # A tibble: 3 × 2
# month pct_wasted
# <chr> <chr>
# 1 April 2022 28.6%
# 2 June 2022 40.9%
# 3 March 2021 57.9%
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论