英文:
Using group_by in a FOR loop to create lead variables
问题
我需要基于数据集中的多个变量创建一组导向变量。虽然我可以在不使用循环的情况下完成这个任务,但我更愿意使用循环来大大减少我的程序代码行数。
当我在循环之外运行代码时,它返回我所期望的确切结果(见下文)。当我将代码放入FOR循环中时(见下文),代码运行时没有错误,并且导向变量被创建,但就好像忽略了group_by函数。此外,循环代码创建的导向变量的名称与我预期的不同。我想了解以下问题:
- 为什么group_by函数在FOR循环内部不起作用?
- 为什么循环代码创建的字段名称与我预期的不同?
这段代码将提供示例数据,并说明问题:
library(lubridate)
library(sqldf)
library(tidyverse)
library(dplyr)
all_data <- lakers
# 创建日期字段
all_data <- transform(all_data, date = as.Date(as.character(date), "%Y%m%d"))
# 创建YEAR、MONTH和YEARMONTH字段
all_data$CY <- year(all_data$date)
all_data$MONTH <- month(all_data$date)
all_data$MONTH <- formatC(all_data$MONTH, width = 2, format = "d", flag = "0")
all_data$MONTH <- paste0("M", all_data$MONTH)
all_data$CY_M <- paste0(all_data$CY, all_data$MONTH)
# 创建分组字段
all_data$GROUP_ID <- paste0(all_data$game_type, "_", all_data$etype)
# 汇总到GROUP_ID、CY_M级别
all_data_agg <- sqldf("SELECT GROUP_ID, SUM(points) points, CY_M
FROM all_data
GROUP BY GROUP_ID, CY_M")
# 按时间降序排序数据集
all_data_agg <- all_data_agg[order(all_data_agg$GROUP_ID, -xtfrm(all_data_agg$CY_M)),]
# 创建导向变量 - 这段代码运行得很好
all_data_agg <- all_data_agg %>%
group_by(GROUP_ID) %>%
mutate(
points_LEAD1A = lead(points, n = 1, default = NA),
points_LEAD2A = lead(points, n = 2, default = NA),
points_LEAD3A = lead(points, n = 3, default = NA)
)
# 使用FOR循环创建导向变量 - 这段代码忽略了group_by函数
fieldname_list <- c("points")
for (i in fieldname_list) {
all_data_agg <- all_data_agg %>%
group_by(GROUP_ID) %>%
mutate(
!!paste0(as.character(i), "_LEAD1B") := !!lead(all_data_agg[as.character(i)], n = 1, default = NA),
!!paste0(as.character(i), "_LEAD2B") := !!lead(all_data_agg[as.character(i)], n = 2, default = NA),
!!paste0(as.character(i), "_LEAD3B") := !!lead(all_data_agg[as.character(i)], n = 3, default = NA)
)
}
请注意,上述代码是您提供的代码的翻译部分。
英文:
I need to create a set of lead variables based on several variables in my data set. While I can do this without using a loop, I'd much rather use a loop and greatly reduce the lines of code in my program.
When I run the code outside of the loop, it returns the exact result I'm looking for (see below). When I put the code into a FOR loop (see below), the code runs without errors and the lead variables are created, but its as if the group_by function is ignored. In addition, the names of the lead variables created by the loop code contain an extra instance of the field name on which they were based. I'd like to understand:
- Why the group_by function doesn't work inside the FOR loop
- Why the field names created by the loop code are different than I would expect
This code will provide sample data as well as illustrate the issue:
library(lubridate)
library(sqldf)
library(tidyverse)
library(dplyr)
all_data <- lakers
# creating a date field
all_data <- transform(all_data, date = as.Date(as.character(date), "%Y%m%d"))
# creating YEAR, MONTH, and YEARMONTH fields
all_data$CY <- year(all_data$date)
all_data$MONTH <- month(all_data$date)
all_data$MONTH <- formatC(all_data$MONTH, width = 2, format = "d", flag = "0")
all_data$MONTH <- paste0("M",all_data$MONTH)
all_data$CY_M <- paste0(all_data$CY, all_data$MONTH)
# creating a grouping field
all_data$GROUP_ID <- paste0(all_data$game_type,"_", all_data$etype)
# aggregating to the GROUP_ID, CY_M level
all_data_agg <- sqldf("SELECT GROUP_ID, SUM(points) points, CY_M
FROM all_data
GROUP BY GROUP_ID, CY_M")
# sorting dataset in descending order by time
all_data_agg <- all_data_agg[order(all_data_agg$GROUP_ID,-xtfrm(all_data_agg$CY_M)),]
# creating lead variables - THIS CODE RUNS PERFECTLY
all_data_agg <- all_data_agg %>%
group_by(GROUP_ID) %>%
mutate(points_LEAD1A = lead(points, n = 1, default = NA),
points_LEAD2A = lead(points, n = 2, default = NA),
points_LEAD3A = lead(points, n = 3, default = NA)
)
# creating lead variables with FOR loop - THIS CODE IGNORES THE GROUP_BY FUNCTION
fieldname_list <- c("points")
for (i in fieldname_list) {
all_data_agg <- all_data_agg %>%
group_by(GROUP_ID) %>%
mutate(!!paste0(as.character(i),"_LEAD1B") := !!lead(all_data_agg[as.character(i)], n = 1, default = NA),
!!paste0(as.character(i),"_LEAD2B") := !!lead(all_data_agg[as.character(i)], n = 2, default = NA),
!!paste0(as.character(i),"_LEAD3B") := !!lead(all_data_agg[as.character(i)], n = 3, default = NA)
)
}
答案1
得分: 1
以下是您要求的翻译:
使用dplyr::across
的更简单方法,考虑了分组并避免使用for
循环:
library(dplyr)
fieldname_list <- c("points")
all_data_agg %>%
group_by(GROUP_ID) %>%
mutate(across(
all_of(fieldname_list),
list(
LEAD1B = ~ lead(.x, n = 1, default = NA),
LEAD2B = ~ lead(.x, n = 2, default = NA),
LEAD3B = ~ lead(.x, n = 3, default = NA)
)
))
#> # A tibble: 127 × 6
#> # Groups: GROUP_ID [19]
#> GROUP_ID CY_M points points_LEAD1B points_LEAD2B points_LEAD3B
#> <chr> <chr> <int> <int> <int> <int>
#> 1 away_ejection 2009M03 0 NA NA NA
#> 2 away_foul 2009M04 0 0 0 0
#> 3 away_foul 2009M03 0 0 0 0
#> 4 away_foul 2009M02 0 0 0 0
#> 5 away_foul 2009M01 0 0 0 0
#> 6 away_foul 2008M12 0 0 0 NA
#> 7 away_foul 2008M11 0 0 NA NA
#> 8 away_foul 2008M10 0 NA NA NA
#> 9 away_free throw 2009M04 115 300 311 212
#> 10 away_free throw 2009M03 300 311 212 322
#> # ℹ 117 more rows
但如果您坚持使用for
循环,那么在mutate
内部,您必须在左侧使用sym()
,并在右侧使用.data
代词或!!sym()
,如下所示:
for (i in fieldname_list) {
all_data_agg <- all_data_agg %>%
group_by(GROUP_ID) %>%
mutate(
!!sym(paste0(as.character(i), "_LEAD1B")) := lead(.data[[i]], n = 1, default = NA),
!!sym(paste0(as.character(i), "_LEAD2B")) := lead(.data[[i]], n = 2, default = NA),
!!sym(paste0(as.character(i), "_LEAD3B")) := lead(.data[[i]], n = 3, default = NA)
)
}
all_data_agg
#> # A tibble: 127 × 6
#> # Groups: GROUP_ID [19]
#> GROUP_ID CY_M points points_LEAD1B points_LEAD2B points_LEAD3B
#> <chr> <chr> <int> <int> <int> <int>
#> 1 away_ejection 2009M03 0 NA NA NA
#> 2 away_foul 2009M04 0 0 0 0
#> 3 away_foul 2009M03 0 0 0 0
#> 4 away_foul 2009M02 0 0 0 0
#> 5 away_foul 2009M01 0 0 0 0
#> 6 away_foul 2008M12 0 0 0 NA
#> 7 away_foul 2008M11 0 0 NA NA
#> 8 away_foul 2008M10 0 NA NA NA
#> 9 away_free throw 2009M04 115 300 311 212
#> 10 away_free throw 2009M03 300 311 212 322
#> # ℹ 117 more rows
希望这些翻译对您有所帮助。
英文:
An easier approach which takes account of the grouping and avoids a for
loop would be to use dplyr::across
:
library(dplyr)
fieldname_list <- c("points")
all_data_agg %>%
group_by(GROUP_ID) %>%
mutate(across(
all_of(fieldname_list),
list(
LEAD1B = ~ lead(.x, n = 1, default = NA),
LEAD2B = ~ lead(.x, n = 2, default = NA),
LEAD3B = ~ lead(.x, n = 3, default = NA)
)
))
#> # A tibble: 127 × 6
#> # Groups: GROUP_ID [19]
#> GROUP_ID CY_M points points_LEAD1B points_LEAD2B points_LEAD3B
#> <chr> <chr> <int> <int> <int> <int>
#> 1 away_ejection 2009M03 0 NA NA NA
#> 2 away_foul 2009M04 0 0 0 0
#> 3 away_foul 2009M03 0 0 0 0
#> 4 away_foul 2009M02 0 0 0 0
#> 5 away_foul 2009M01 0 0 0 0
#> 6 away_foul 2008M12 0 0 0 NA
#> 7 away_foul 2008M11 0 0 NA NA
#> 8 away_foul 2008M10 0 NA NA NA
#> 9 away_free throw 2009M04 115 300 311 212
#> 10 away_free throw 2009M03 300 311 212 322
#> # ℹ 117 more rows
But if you insist on using a for
loop then inside mutate
you have to use sym()
on the LHS and make e.g. use of the .data
pronoun or !!sym()
too on the RHS:
for (i in fieldname_list) {
all_data_agg <- all_data_agg %>%
group_by(GROUP_ID) %>%
mutate(
!!sym(paste0(as.character(i), "_LEAD1B")) := lead(.data[[i]], n = 1, default = NA),
!!sym(paste0(as.character(i), "_LEAD2B")) := lead(.data[[i]], n = 2, default = NA),
!!sym(paste0(as.character(i), "_LEAD3B")) := lead(.data[[i]], n = 3, default = NA)
)
}
all_data_agg
#> # A tibble: 127 × 6
#> # Groups: GROUP_ID [19]
#> GROUP_ID CY_M points points_LEAD1B points_LEAD2B points_LEAD3B
#> <chr> <chr> <int> <int> <int> <int>
#> 1 away_ejection 2009M03 0 NA NA NA
#> 2 away_foul 2009M04 0 0 0 0
#> 3 away_foul 2009M03 0 0 0 0
#> 4 away_foul 2009M02 0 0 0 0
#> 5 away_foul 2009M01 0 0 0 0
#> 6 away_foul 2008M12 0 0 0 NA
#> 7 away_foul 2008M11 0 0 NA NA
#> 8 away_foul 2008M10 0 NA NA NA
#> 9 away_free throw 2009M04 115 300 311 212
#> 10 away_free throw 2009M03 300 311 212 322
#> # ℹ 117 more rows
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论