如何在 `dplyr::join_by` 中将使用组变量的连接与不等连接结合起来?

huangapple go评论103阅读模式
英文:

How can I combine a join using group variables with inequality joins in `dplyr::join_by`?

问题

I want do a join_by using group variables taken from a grouped data frame and inequality joins. The group variables will vary, so I don't want to hard-code these.

In this code, I have hard-coded the group variables (month and port_a), but these will vary according to the supplied grouped_df and I want to extract those from that.
I can extract these using group_vars(), but I cannot figure out how to combine these with the other elements of the join_by().

  1. grouped_df %>%
  2. inner_join(breaks, by = join_by(month, port_a,
  3. size >= break_min,
  4. size < break_max)) %>%
  5. select(-break_min, -break_max) %>%
  6. select(all_of(group_vars), port_b, everything())

Reprex

Here is a reprex. It may seem complicated, but most of it is just to set up the data for the code above. I am looking for a dplyr/Tidyverse approach (not "do it using data.table"), as this will actually be processed using dbplyr.

  1. library(dplyr, warn.conflicts = FALSE)
  2. library(tidyr)
  3. # Code to generate data
  4. set.seed(2023)
  5. df <-
  6. expand_grid(month = seq(as.Date("2010-01-01"),
  7. as.Date("2020-01-01"), by = "1 month"),
  8. port_a = 1:5L,
  9. id = 1:100L) %>%
  10. mutate(size = runif( nrow(.), min = 1000, max = 10000),
  11. ret = rnorm(n = nrow(.)))
  12. get_breaks <- function(data, var, n_portfolios) {
  13. probs <- seq(0, 1, length.out = n_portfolios + 1)
  14. data %>%
  15. summarize(port_b = list(1:(n_portfolios + 1)),
  16. breaks = list(quantile({{ var }}, probs)),
  17. .groups = "keep") %>%
  18. unnest(cols = c(port_b, breaks)) %>%
  19. arrange(port_b) %>%
  20. mutate(break_min = breaks, break_max = lead(breaks)) %>%
  21. filter(port_b < n_portfolios + 1) %>%
  22. select(-breaks) %>%
  23. ungroup()
  24. }
  25. grouped_df <-
  26. df %>%
  27. group_by(month, port_a)
  28. group_vars <- group_vars(grouped_df)
  29. group_vars
  30. #> [1] "month" "port_a"
  31. breaks <- get_breaks(grouped_df, size, 5)
  32. # Code of interest
  33. grouped_df %>%
  34. inner_join(breaks, by = join_by(month, port_a,
  35. size >= break_min,
  36. size < break_max)) %>%
  37. select(-break_min, -break_max) %>%
  38. select(all_of(group_vars), port_b, everything())
  39. #> # A tibble: 59,895 × 6
  40. #> # Groups: month, port_a [605]
  41. #> month port_a port_b id size ret
  42. #> <date> <int> <int> <int> <dbl> <dbl>
  43. #> 1 2010-01-01 1 3 1 5200. 0.171
  44. #> 2 2010-01-01 1 2 2 4017. -0.828
  45. #> 3 2010-01-01 1 1 3 2465. 0.961
  46. #> 4 2010-01-01 1 2 4 4565. 0.842
  47. #> 5 2010-01-01 1 1 5 1274. 0.345
  48. #> 6 2010-01-01 1 1 6 2088. -0.418
  49. #> 7 2010-01-01 1 2 7 4835. -0.455
  50. #> 8 2010-01-01 1 3 8 6561. -0.539
  51. #> 9 2010-01-01 1 2 9 3369. -0.0590
  52. #> 10 2010-01-01 1 3 10 5287. -0.356
  53. #> # ℹ 59,885 more rows

Created on 2023-05-20 with reprex v2.0.2

英文:

I want do a join_by using group variables taken from a grouped data frame and inequality joins. The group variables will vary, so I don't want to hard-code these.

In this code, I have hard-coded the group variables (month and port_a), but these will vary according to the supplied grouped_df and I want to extract those from that.
I can extract these using group_vars(), but I cannot figure out how to combine these with the other elements of the join_by().

  1. grouped_df %&gt;%
  2. inner_join(breaks, by = join_by(month, port_a,
  3. size &gt;= break_min,
  4. size &lt; break_max)) %&gt;%
  5. select(-break_min, -break_max) %&gt;%
  6. select(all_of(group_vars), port_b, everything())

Reprex

Here is a reprex. It may seem complicated, but most of it is just to set up the data for the code above. I am looking for a dplyr/Tidyverse approach (not "do it using data.table"), as this will actually be processed using dbplyr.

  1. library(dplyr, warn.conflicts = FALSE)
  2. library(tidyr)
  3. # Code to generate data
  4. set.seed(2023)
  5. df &lt;-
  6. expand_grid(month = seq(as.Date(&quot;2010-01-01&quot;),
  7. as.Date(&quot;2020-01-01&quot;), by = &quot;1 month&quot;),
  8. port_a = 1:5L,
  9. id = 1:100L) %&gt;%
  10. mutate(size = runif( nrow(.), min = 1000, max = 10000),
  11. ret = rnorm(n = nrow(.)))
  12. get_breaks &lt;- function(data, var, n_portfolios) {
  13. probs &lt;- seq(0, 1, length.out = n_portfolios + 1)
  14. data %&gt;%
  15. summarize(port_b = list(1:(n_portfolios + 1)),
  16. breaks = list(quantile({{ var }}, probs)),
  17. .groups = &quot;keep&quot;) %&gt;%
  18. unnest(cols = c(port_b, breaks)) %&gt;%
  19. arrange(port_b) %&gt;%
  20. mutate(break_min = breaks, break_max = lead(breaks)) %&gt;%
  21. filter(port_b &lt; n_portfolios + 1) %&gt;%
  22. select(-breaks) %&gt;%
  23. ungroup()
  24. }
  25. grouped_df &lt;-
  26. df %&gt;%
  27. group_by(month, port_a)
  28. group_vars &lt;- group_vars(grouped_df)
  29. group_vars
  30. #&gt; [1] &quot;month&quot; &quot;port_a&quot;
  31. breaks &lt;- get_breaks(grouped_df, size, 5)
  32. # Code of interest
  33. grouped_df %&gt;%
  34. inner_join(breaks, by = join_by(month, port_a,
  35. size &gt;= break_min,
  36. size &lt; break_max)) %&gt;%
  37. select(-break_min, -break_max) %&gt;%
  38. select(all_of(group_vars), port_b, everything())
  39. #&gt; # A tibble: 59,895 &#215; 6
  40. #&gt; # Groups: month, port_a [605]
  41. #&gt; month port_a port_b id size ret
  42. #&gt; &lt;date&gt; &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;dbl&gt; &lt;dbl&gt;
  43. #&gt; 1 2010-01-01 1 3 1 5200. 0.171
  44. #&gt; 2 2010-01-01 1 2 2 4017. -0.828
  45. #&gt; 3 2010-01-01 1 1 3 2465. 0.961
  46. #&gt; 4 2010-01-01 1 2 4 4565. 0.842
  47. #&gt; 5 2010-01-01 1 1 5 1274. 0.345
  48. #&gt; 6 2010-01-01 1 1 6 2088. -0.418
  49. #&gt; 7 2010-01-01 1 2 7 4835. -0.455
  50. #&gt; 8 2010-01-01 1 3 8 6561. -0.539
  51. #&gt; 9 2010-01-01 1 2 9 3369. -0.0590
  52. #&gt; 10 2010-01-01 1 3 10 5287. -0.356
  53. #&gt; # ℹ 59,885 more rows

<sup>Created on 2023-05-20 with reprex v2.0.2</sup>

答案1

得分: 1

你可以使用splice操作符(!!!):

  1. grouped_df <-
  2. df %>%
  3. group_by(month, port_a)
  4. group_vars <- group_vars(grouped_df)
  5. group_vars
  6. #> [1] "month" "port_a"
  7. breaks <- get_breaks(grouped_df, size, 5)
  8. # Code of interest
  9. grouped_ref <- grouped_df %>%
  10. inner_join(breaks, by = join_by(month, port_a,
  11. size >= break_min,
  12. size < break_max)) %>%
  13. select(-break_min, -break_max) %>%
  14. select(all_of(group_vars), port_b, everything())
  15. ## Use splice on group_vars:
  16. grouped_test <- grouped_df %>%
  17. inner_join(breaks, by = join_by(!!!group_vars,
  18. size >= break_min,
  19. size < break_max)) %>%
  20. select(-break_min, -break_max) %>%
  21. select(all_of(group_vars), port_b, everything())
  22. # Test Objects for Exact Equality:
  23. identical(grouped_ref, grouped_test)
  24. #> [1] TRUE
英文:

You can use splice operator (!!!) :

  1. grouped_df &lt;-
  2. df %&gt;%
  3. group_by(month, port_a)
  4. group_vars &lt;- group_vars(grouped_df)
  5. group_vars
  6. #&gt; [1] &quot;month&quot; &quot;port_a&quot;
  7. breaks &lt;- get_breaks(grouped_df, size, 5)
  8. # Code of interest
  9. grouped_ref &lt;- grouped_df %&gt;%
  10. inner_join(breaks, by = join_by(month, port_a,
  11. size &gt;= break_min,
  12. size &lt; break_max)) %&gt;%
  13. select(-break_min, -break_max) %&gt;%
  14. select(all_of(group_vars), port_b, everything())
  15. ## Use splice on group_vars:
  16. grouped_test &lt;- grouped_df %&gt;%
  17. inner_join(breaks, by = join_by(!!!group_vars,
  18. size &gt;= break_min,
  19. size &lt; break_max)) %&gt;%
  20. select(-break_min, -break_max) %&gt;%
  21. select(all_of(group_vars), port_b, everything())
  22. # Test Objects for Exact Equality:
  23. identical(grouped_ref, grouped_test)
  24. #&gt; [1] TRUE

答案2

得分: 0

A workaround could be to rename the variables to "hardcoded" column names, use these column names for the join_by, then rename them back. This could be done as follows (only the lines with c.str.rename are important):

c.str.rename <- c(join_var_1 = "month", join_var_2 = "port_a")

grouped_df <-
df %>%
group_by(month, port_a) %>%
rename(all_of(c.str.rename))

group_vars <- group_vars(grouped_df)
breaks <- get_breaks(grouped_df, size, 5)

Code of interest

result1 <-
grouped_df %>%
inner_join(breaks, by = join_by(join_var_1, join_var_2,
size >= break_min,
size < break_max)) %>%
select(-break_min, -break_max) %>%
select(all_of(names(c.str.rename)), port_b, everything()) %>%
rename(setNames(names(c.str.rename), nm = c.str.rename))

identical(result1, result2) # TRUE (result2 contains your final df)

英文:

A workaround could be to rename the variables to "hardcoded" column names, use these column names for the join_by, then rename them back. This could be done as follows (only the lines with c.str.rename are important):

  1. c.str.rename &lt;- c(join_var_1 = &quot;month&quot;, join_var_2 = &quot;port_a&quot;)
  2. grouped_df &lt;-
  3. df %&gt;%
  4. group_by(month, port_a) %&gt;%
  5. rename(all_of(c.str.rename))
  6. group_vars &lt;- group_vars(grouped_df)
  7. breaks &lt;- get_breaks(grouped_df, size, 5)
  8. # Code of interest
  9. result1 &lt;- grouped_df %&gt;%
  10. inner_join(breaks, by = join_by(join_var_1, join_var_2,
  11. size &gt;= break_min,
  12. size &lt; break_max)) %&gt;%
  13. select(-break_min, -break_max) %&gt;%
  14. select(all_of(names(c.str.rename)), port_b, everything()) %&gt;%
  15. rename(setNames(names(c.str.rename), nm = c.str.rename))
  16. identical(result1, result2) # TRUE (result2 contains your final df)

huangapple
  • 本文由 发表于 2023年5月21日 00:45:26
  • 转载请务必保留本文链接:https://go.coder-hub.com/76296316.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定