英文:
How can I combine a join using group variables with inequality joins in `dplyr::join_by`?
问题
I want do a join_by
using group variables taken from a grouped data frame and inequality joins. The group variables will vary, so I don't want to hard-code these.
In this code, I have hard-coded the group variables (month
and port_a
), but these will vary according to the supplied grouped_df
and I want to extract those from that.
I can extract these using group_vars()
, but I cannot figure out how to combine these with the other elements of the join_by()
.
grouped_df %>%
inner_join(breaks, by = join_by(month, port_a,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(group_vars), port_b, everything())
Reprex
Here is a reprex. It may seem complicated, but most of it is just to set up the data for the code above. I am looking for a dplyr
/Tidyverse approach (not "do it using data.table
"), as this will actually be processed using dbplyr
.
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
# Code to generate data
set.seed(2023)
df <-
expand_grid(month = seq(as.Date("2010-01-01"),
as.Date("2020-01-01"), by = "1 month"),
port_a = 1:5L,
id = 1:100L) %>%
mutate(size = runif( nrow(.), min = 1000, max = 10000),
ret = rnorm(n = nrow(.)))
get_breaks <- function(data, var, n_portfolios) {
probs <- seq(0, 1, length.out = n_portfolios + 1)
data %>%
summarize(port_b = list(1:(n_portfolios + 1)),
breaks = list(quantile({{ var }}, probs)),
.groups = "keep") %>%
unnest(cols = c(port_b, breaks)) %>%
arrange(port_b) %>%
mutate(break_min = breaks, break_max = lead(breaks)) %>%
filter(port_b < n_portfolios + 1) %>%
select(-breaks) %>%
ungroup()
}
grouped_df <-
df %>%
group_by(month, port_a)
group_vars <- group_vars(grouped_df)
group_vars
#> [1] "month" "port_a"
breaks <- get_breaks(grouped_df, size, 5)
# Code of interest
grouped_df %>%
inner_join(breaks, by = join_by(month, port_a,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(group_vars), port_b, everything())
#> # A tibble: 59,895 × 6
#> # Groups: month, port_a [605]
#> month port_a port_b id size ret
#> <date> <int> <int> <int> <dbl> <dbl>
#> 1 2010-01-01 1 3 1 5200. 0.171
#> 2 2010-01-01 1 2 2 4017. -0.828
#> 3 2010-01-01 1 1 3 2465. 0.961
#> 4 2010-01-01 1 2 4 4565. 0.842
#> 5 2010-01-01 1 1 5 1274. 0.345
#> 6 2010-01-01 1 1 6 2088. -0.418
#> 7 2010-01-01 1 2 7 4835. -0.455
#> 8 2010-01-01 1 3 8 6561. -0.539
#> 9 2010-01-01 1 2 9 3369. -0.0590
#> 10 2010-01-01 1 3 10 5287. -0.356
#> # ℹ 59,885 more rows
Created on 2023-05-20 with reprex v2.0.2
英文:
I want do a join_by
using group variables taken from a grouped data frame and inequality joins. The group variables will vary, so I don't want to hard-code these.
In this code, I have hard-coded the group variables (month
and port_a
), but these will vary according to the supplied grouped_df
and I want to extract those from that.
I can extract these using group_vars()
, but I cannot figure out how to combine these with the other elements of the join_by()
.
grouped_df %>%
inner_join(breaks, by = join_by(month, port_a,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(group_vars), port_b, everything())
Reprex
Here is a reprex. It may seem complicated, but most of it is just to set up the data for the code above. I am looking for a dplyr
/Tidyverse approach (not "do it using data.table
"), as this will actually be processed using dbplyr
.
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
# Code to generate data
set.seed(2023)
df <-
expand_grid(month = seq(as.Date("2010-01-01"),
as.Date("2020-01-01"), by = "1 month"),
port_a = 1:5L,
id = 1:100L) %>%
mutate(size = runif( nrow(.), min = 1000, max = 10000),
ret = rnorm(n = nrow(.)))
get_breaks <- function(data, var, n_portfolios) {
probs <- seq(0, 1, length.out = n_portfolios + 1)
data %>%
summarize(port_b = list(1:(n_portfolios + 1)),
breaks = list(quantile({{ var }}, probs)),
.groups = "keep") %>%
unnest(cols = c(port_b, breaks)) %>%
arrange(port_b) %>%
mutate(break_min = breaks, break_max = lead(breaks)) %>%
filter(port_b < n_portfolios + 1) %>%
select(-breaks) %>%
ungroup()
}
grouped_df <-
df %>%
group_by(month, port_a)
group_vars <- group_vars(grouped_df)
group_vars
#> [1] "month" "port_a"
breaks <- get_breaks(grouped_df, size, 5)
# Code of interest
grouped_df %>%
inner_join(breaks, by = join_by(month, port_a,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(group_vars), port_b, everything())
#> # A tibble: 59,895 × 6
#> # Groups: month, port_a [605]
#> month port_a port_b id size ret
#> <date> <int> <int> <int> <dbl> <dbl>
#> 1 2010-01-01 1 3 1 5200. 0.171
#> 2 2010-01-01 1 2 2 4017. -0.828
#> 3 2010-01-01 1 1 3 2465. 0.961
#> 4 2010-01-01 1 2 4 4565. 0.842
#> 5 2010-01-01 1 1 5 1274. 0.345
#> 6 2010-01-01 1 1 6 2088. -0.418
#> 7 2010-01-01 1 2 7 4835. -0.455
#> 8 2010-01-01 1 3 8 6561. -0.539
#> 9 2010-01-01 1 2 9 3369. -0.0590
#> 10 2010-01-01 1 3 10 5287. -0.356
#> # ℹ 59,885 more rows
<sup>Created on 2023-05-20 with reprex v2.0.2</sup>
答案1
得分: 1
你可以使用splice操作符(!!!
):
grouped_df <-
df %>%
group_by(month, port_a)
group_vars <- group_vars(grouped_df)
group_vars
#> [1] "month" "port_a"
breaks <- get_breaks(grouped_df, size, 5)
# Code of interest
grouped_ref <- grouped_df %>%
inner_join(breaks, by = join_by(month, port_a,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(group_vars), port_b, everything())
## Use splice on group_vars:
grouped_test <- grouped_df %>%
inner_join(breaks, by = join_by(!!!group_vars,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(group_vars), port_b, everything())
# Test Objects for Exact Equality:
identical(grouped_ref, grouped_test)
#> [1] TRUE
英文:
You can use splice operator (!!!
) :
grouped_df <-
df %>%
group_by(month, port_a)
group_vars <- group_vars(grouped_df)
group_vars
#> [1] "month" "port_a"
breaks <- get_breaks(grouped_df, size, 5)
# Code of interest
grouped_ref <- grouped_df %>%
inner_join(breaks, by = join_by(month, port_a,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(group_vars), port_b, everything())
## Use splice on group_vars:
grouped_test <- grouped_df %>%
inner_join(breaks, by = join_by(!!!group_vars,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(group_vars), port_b, everything())
# Test Objects for Exact Equality:
identical(grouped_ref, grouped_test)
#> [1] TRUE
答案2
得分: 0
A workaround could be to rename the variables to "hardcoded" column names, use these column names for the join_by
, then rename them back. This could be done as follows (only the lines with c.str.rename
are important):
c.str.rename <- c(join_var_1 = "month", join_var_2 = "port_a")
grouped_df <-
df %>%
group_by(month, port_a) %>%
rename(all_of(c.str.rename))
group_vars <- group_vars(grouped_df)
breaks <- get_breaks(grouped_df, size, 5)
Code of interest
result1 <-
grouped_df %>%
inner_join(breaks, by = join_by(join_var_1, join_var_2,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(names(c.str.rename)), port_b, everything()) %>%
rename(setNames(names(c.str.rename), nm = c.str.rename))
identical(result1, result2) # TRUE (result2 contains your final df)
英文:
A workaround could be to rename the variables to "hardcoded" column names, use these column names for the join_by
, then rename them back. This could be done as follows (only the lines with c.str.rename
are important):
c.str.rename <- c(join_var_1 = "month", join_var_2 = "port_a")
grouped_df <-
df %>%
group_by(month, port_a) %>%
rename(all_of(c.str.rename))
group_vars <- group_vars(grouped_df)
breaks <- get_breaks(grouped_df, size, 5)
# Code of interest
result1 <- grouped_df %>%
inner_join(breaks, by = join_by(join_var_1, join_var_2,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(names(c.str.rename)), port_b, everything()) %>%
rename(setNames(names(c.str.rename), nm = c.str.rename))
identical(result1, result2) # TRUE (result2 contains your final df)
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论