如何在 `dplyr::join_by` 中将使用组变量的连接与不等连接结合起来?

huangapple go评论69阅读模式
英文:

How can I combine a join using group variables with inequality joins in `dplyr::join_by`?

问题

I want do a join_by using group variables taken from a grouped data frame and inequality joins. The group variables will vary, so I don't want to hard-code these.

In this code, I have hard-coded the group variables (month and port_a), but these will vary according to the supplied grouped_df and I want to extract those from that.
I can extract these using group_vars(), but I cannot figure out how to combine these with the other elements of the join_by().

grouped_df %>%
  inner_join(breaks, by = join_by(month, port_a,
                                  size >= break_min,
                                  size < break_max)) %>%
  select(-break_min, -break_max) %>%
  select(all_of(group_vars), port_b, everything())

Reprex

Here is a reprex. It may seem complicated, but most of it is just to set up the data for the code above. I am looking for a dplyr/Tidyverse approach (not "do it using data.table"), as this will actually be processed using dbplyr.

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# Code to generate data
set.seed(2023)

df <- 
  expand_grid(month = seq(as.Date("2010-01-01"),
                          as.Date("2020-01-01"), by = "1 month"),
              port_a = 1:5L, 
              id = 1:100L) %>%
  mutate(size = runif( nrow(.), min = 1000, max = 10000),
         ret = rnorm(n = nrow(.)))

get_breaks <- function(data, var, n_portfolios) {

  probs <- seq(0, 1, length.out = n_portfolios + 1)
  data %>% 
    summarize(port_b = list(1:(n_portfolios + 1)), 
              breaks = list(quantile({{ var }}, probs)),
              .groups = "keep") %>%
    unnest(cols = c(port_b, breaks)) %>%
    arrange(port_b) %>%
    mutate(break_min = breaks, break_max = lead(breaks)) %>%
    filter(port_b < n_portfolios + 1) %>%
    select(-breaks) %>%
    ungroup()
}
  
grouped_df <-
  df %>%
  group_by(month, port_a)

group_vars <- group_vars(grouped_df)
group_vars
#> [1] "month"  "port_a"

breaks <- get_breaks(grouped_df, size, 5) 

# Code of interest
grouped_df %>%
  inner_join(breaks, by = join_by(month, port_a,
                                  size >= break_min,
                                  size < break_max)) %>%
  select(-break_min, -break_max) %>%
  select(all_of(group_vars), port_b, everything())
#> # A tibble: 59,895 × 6
#> # Groups:   month, port_a [605]
#>    month      port_a port_b    id  size     ret
#>    <date>      <int>  <int> <int> <dbl>   <dbl>
#>  1 2010-01-01      1      3     1 5200.  0.171 
#>  2 2010-01-01      1      2     2 4017. -0.828 
#>  3 2010-01-01      1      1     3 2465.  0.961 
#>  4 2010-01-01      1      2     4 4565.  0.842 
#>  5 2010-01-01      1      1     5 1274.  0.345 
#>  6 2010-01-01      1      1     6 2088. -0.418 
#>  7 2010-01-01      1      2     7 4835. -0.455 
#>  8 2010-01-01      1      3     8 6561. -0.539 
#>  9 2010-01-01      1      2     9 3369. -0.0590
#> 10 2010-01-01      1      3    10 5287. -0.356 
#> # ℹ 59,885 more rows

Created on 2023-05-20 with reprex v2.0.2

英文:

I want do a join_by using group variables taken from a grouped data frame and inequality joins. The group variables will vary, so I don't want to hard-code these.

In this code, I have hard-coded the group variables (month and port_a), but these will vary according to the supplied grouped_df and I want to extract those from that.
I can extract these using group_vars(), but I cannot figure out how to combine these with the other elements of the join_by().

grouped_df %&gt;%
inner_join(breaks, by = join_by(month, port_a,
size &gt;= break_min,
size &lt; break_max)) %&gt;%
select(-break_min, -break_max) %&gt;%
select(all_of(group_vars), port_b, everything())

Reprex

Here is a reprex. It may seem complicated, but most of it is just to set up the data for the code above. I am looking for a dplyr/Tidyverse approach (not "do it using data.table"), as this will actually be processed using dbplyr.

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# Code to generate data
set.seed(2023)

df &lt;-
  expand_grid(month = seq(as.Date(&quot;2010-01-01&quot;),
                          as.Date(&quot;2020-01-01&quot;), by = &quot;1 month&quot;),
              port_a = 1:5L, 
              id = 1:100L) %&gt;%
  mutate(size = runif( nrow(.), min = 1000, max = 10000),
         ret = rnorm(n = nrow(.)))

get_breaks &lt;- function(data, var, n_portfolios) {

  probs &lt;- seq(0, 1, length.out = n_portfolios + 1)
  data %&gt;% 
    summarize(port_b = list(1:(n_portfolios + 1)), 
              breaks = list(quantile({{ var }}, probs)),
              .groups = &quot;keep&quot;) %&gt;%
    unnest(cols = c(port_b, breaks)) %&gt;%
    arrange(port_b) %&gt;%
    mutate(break_min = breaks, break_max = lead(breaks)) %&gt;%
    filter(port_b &lt; n_portfolios + 1) %&gt;%
    select(-breaks) %&gt;%
    ungroup()
}
  
grouped_df &lt;-
  df %&gt;%
  group_by(month, port_a)

group_vars &lt;- group_vars(grouped_df)
group_vars
#&gt; [1] &quot;month&quot;  &quot;port_a&quot;

breaks &lt;- get_breaks(grouped_df, size, 5) 

# Code of interest
grouped_df %&gt;%
  inner_join(breaks, by = join_by(month, port_a,
                                  size &gt;= break_min,
                                  size &lt; break_max)) %&gt;%
  select(-break_min, -break_max) %&gt;%
  select(all_of(group_vars), port_b, everything())
#&gt; # A tibble: 59,895 &#215; 6
#&gt; # Groups:   month, port_a [605]
#&gt;    month      port_a port_b    id  size     ret
#&gt;    &lt;date&gt;      &lt;int&gt;  &lt;int&gt; &lt;int&gt; &lt;dbl&gt;   &lt;dbl&gt;
#&gt;  1 2010-01-01      1      3     1 5200.  0.171 
#&gt;  2 2010-01-01      1      2     2 4017. -0.828 
#&gt;  3 2010-01-01      1      1     3 2465.  0.961 
#&gt;  4 2010-01-01      1      2     4 4565.  0.842 
#&gt;  5 2010-01-01      1      1     5 1274.  0.345 
#&gt;  6 2010-01-01      1      1     6 2088. -0.418 
#&gt;  7 2010-01-01      1      2     7 4835. -0.455 
#&gt;  8 2010-01-01      1      3     8 6561. -0.539 
#&gt;  9 2010-01-01      1      2     9 3369. -0.0590
#&gt; 10 2010-01-01      1      3    10 5287. -0.356 
#&gt; # ℹ 59,885 more rows

<sup>Created on 2023-05-20 with reprex v2.0.2</sup>

答案1

得分: 1

你可以使用splice操作符(!!!):

grouped_df <-
  df %>%
  group_by(month, port_a)

group_vars <- group_vars(grouped_df)
group_vars
#> [1] "month"  "port_a"

breaks <- get_breaks(grouped_df, size, 5) 

# Code of interest
grouped_ref <-  grouped_df %>%
  inner_join(breaks, by = join_by(month, port_a,
                                  size >= break_min,
                                  size < break_max)) %>%
  select(-break_min, -break_max) %>%
  select(all_of(group_vars), port_b, everything())

## Use splice on group_vars:
grouped_test <-  grouped_df %>%
  inner_join(breaks, by = join_by(!!!group_vars,
                                  size >= break_min,
                                  size < break_max)) %>%
  select(-break_min, -break_max) %>%
  select(all_of(group_vars), port_b, everything())

# Test Objects for Exact Equality:
identical(grouped_ref, grouped_test)
#> [1] TRUE
英文:

You can use splice operator (!!!) :

grouped_df &lt;-
  df %&gt;%
  group_by(month, port_a)

group_vars &lt;- group_vars(grouped_df)
group_vars
#&gt; [1] &quot;month&quot;  &quot;port_a&quot;

breaks &lt;- get_breaks(grouped_df, size, 5) 

# Code of interest
grouped_ref &lt;-  grouped_df %&gt;%
  inner_join(breaks, by = join_by(month, port_a,
                                  size &gt;= break_min,
                                  size &lt; break_max)) %&gt;%
  select(-break_min, -break_max) %&gt;%
  select(all_of(group_vars), port_b, everything())

## Use splice on group_vars:
grouped_test &lt;-  grouped_df %&gt;%
  inner_join(breaks, by = join_by(!!!group_vars,
                                  size &gt;= break_min,
                                  size &lt; break_max)) %&gt;%
  select(-break_min, -break_max) %&gt;%
  select(all_of(group_vars), port_b, everything())

# Test Objects for Exact Equality:
identical(grouped_ref, grouped_test)
#&gt; [1] TRUE

答案2

得分: 0

A workaround could be to rename the variables to "hardcoded" column names, use these column names for the join_by, then rename them back. This could be done as follows (only the lines with c.str.rename are important):

c.str.rename <- c(join_var_1 = "month", join_var_2 = "port_a")

grouped_df <-
df %>%
group_by(month, port_a) %>%
rename(all_of(c.str.rename))

group_vars <- group_vars(grouped_df)
breaks <- get_breaks(grouped_df, size, 5)

Code of interest

result1 <-
grouped_df %>%
inner_join(breaks, by = join_by(join_var_1, join_var_2,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(names(c.str.rename)), port_b, everything()) %>%
rename(setNames(names(c.str.rename), nm = c.str.rename))

identical(result1, result2) # TRUE (result2 contains your final df)

英文:

A workaround could be to rename the variables to "hardcoded" column names, use these column names for the join_by, then rename them back. This could be done as follows (only the lines with c.str.rename are important):

c.str.rename &lt;- c(join_var_1 = &quot;month&quot;, join_var_2 = &quot;port_a&quot;)
grouped_df &lt;-
df %&gt;%
group_by(month, port_a) %&gt;% 
rename(all_of(c.str.rename))
group_vars &lt;- group_vars(grouped_df)
breaks &lt;- get_breaks(grouped_df, size, 5)
# Code of interest
result1 &lt;- grouped_df %&gt;%
inner_join(breaks, by = join_by(join_var_1, join_var_2, 
size &gt;= break_min,
size &lt; break_max)) %&gt;%
select(-break_min, -break_max) %&gt;%
select(all_of(names(c.str.rename)), port_b, everything()) %&gt;% 
rename(setNames(names(c.str.rename), nm = c.str.rename))
identical(result1, result2) # TRUE (result2 contains your final df)

huangapple
  • 本文由 发表于 2023年5月21日 00:45:26
  • 转载请务必保留本文链接:https://go.coder-hub.com/76296316.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定