按重叠类别进行分组(一个类别应该分别归入另外两个不同的类别)。

huangapple go评论78阅读模式
英文:

Group by an overlapping category (a category should be grouped in 2 different other categories)

问题

我有一个数据集,其中有'gr1'、'gr2'和'both'这些组。基本上,我想按c("gr1", "both")和c("gr2", "both")这两个(相互重叠的)组合来对'gr'列进行分组。

这里我提出了一个使用简单数据框的解决方案,但我想知道是否有一种方法可以进行'复杂'的分组,例如group_by(gr ''using c("gr1", "both") and c("gr2", "both") as groups'')。是否有一种方法可以在dplyr中指定要分组在一起的内容,而不是像下面所示那样手动用rbind拼接?

library(tidyverse)
set.seed(1234)

# Toy data set: `x` is 1..10, `id` is a letter drawn at random (with
# replacement) from A-C for every row, and `gr` tags the rows as
# "gr1" (3 rows), "gr2" (4 rows) or "both" (3 rows), in that order.
df <- data.frame(
  x = 1:10,
  id = sample(LETTERS[1:3], size = 10, replace = TRUE),
  gr = rep(c("gr1", "gr2", "both"), times = c(3, 4, 3))
)

# Per-id sum of x over the rows whose gr is "gr1" or "both"; the result is
# labelled "gr1.both" in the gr.filt column.
sum.gr1 <- df %>%
  filter(gr %in% c("gr1", "both")) %>%
  group_by(id) %>%
  summarize(x.sum = sum(x)) %>%
  mutate(gr.filt = "gr1.both")

# Same aggregation for the overlapping set "gr2" or "both".
sum.gr2 <- df %>%
  filter(gr %in% c("gr2", "both")) %>%
  group_by(id) %>%
  summarize(x.sum = sum(x)) %>%
  mutate(gr.filt = "gr2.both")

# Stack the two summaries. Rows tagged "both" pass both filters, so they
# contribute to each of the two stacked parts.
df.gr <- rbind(sum.gr1, sum.gr2)
df.gr

希望这能满足你的需求。

英文:

I have a dataset where there are groups 'gr1', 'gr2', and 'both'. Basically, I'd like to group the 'gr' column by c("gr1", "both") and c("gr2", "both").

Here I'm proposing a solution with a simple data frame, but I'd like to know if there is a way to make 'complex' grouping such as group_by(gr ''using c("gr1", "both") and c("gr2", "both") as groups'' ). Is there a way to specify what to group together in dplyr instead of doing the rbind as shown below?

library(tidyverse)
set.seed(1234)

# Toy data set: `x` is 1..10, `id` is a letter drawn at random (with
# replacement) from A-C for every row, and `gr` tags the rows as
# "gr1" (3 rows), "gr2" (4 rows) or "both" (3 rows), in that order.
df <- data.frame(
  x = 1:10,
  id = sample(LETTERS[1:3], size = 10, replace = TRUE),
  gr = rep(c("gr1", "gr2", "both"), times = c(3, 4, 3))
)

df
    x id   gr
1   1  B  gr1
2   2  B  gr1
3   3  A  gr1
4   4  C  gr2
5   5  A  gr2
6   6  A  gr2
7   7  B  gr2
8   8  B both
9   9  C both
10 10  B both

# Per-id sum of x over the rows whose gr is "gr1" or "both"; the result is
# labelled "gr1.both" in the gr.filt column.
sum.gr1 <- df %>%
  filter(gr %in% c("gr1", "both")) %>%
  group_by(id) %>%
  summarize(x.sum = sum(x)) %>%
  mutate(gr.filt = "gr1.both")

# Same aggregation for the overlapping set "gr2" or "both".
sum.gr2 <- df %>%
  filter(gr %in% c("gr2", "both")) %>%
  group_by(id) %>%
  summarize(x.sum = sum(x)) %>%
  mutate(gr.filt = "gr2.both")

# Stack the two summaries. Rows tagged "both" pass both filters, so they
# contribute to each of the two stacked parts.
df.gr <- rbind(sum.gr1, sum.gr2)
df.gr

# A tibble: 6 × 3
id    x.sum gr.filt 
<chr> <int> <chr>   
1 A         3 gr1.both
2 B        21 gr1.both
3 C         9 gr1.both
4 A        11 gr2.both
5 B        25 gr2.both
6 C        13 gr2.both

答案1

得分: 3

以下是使用 map_df 的版本:

library(dplyr)
library(purrr)

# For every set of gr labels: keep the matching rows, sum x per id, and tag
# the summary with the labels joined by "." (e.g. "gr1.both"). map_df()
# row-binds the per-set summaries into a single data frame.
# Note: purrr 1.0.0 marks map_df() as superseded; map() followed by
# list_rbind() is the suggested replacement.
map_df(
  list(c("gr1", "both"), c("gr2", "both")),
  function(grs) {
    df %>%
      filter(gr %in% grs) %>%
      group_by(id) %>%
      summarize(x.sum = sum(x)) %>%
      mutate(gr.filt = paste(grs, collapse = "."))
  }
)
 id    x.sum gr.filt 
  <chr> <int> <chr>   
1 A         3 gr1.both
2 B        21 gr1.both
3 C         9 gr1.both
4 A        11 gr2.both
5 B        25 gr2.both
6 C        13 gr2.both
英文:

Update after clarification:

Here is a version using map_df:

library(dplyr)
library(purrr)

# For every set of gr labels (.x): keep the matching rows, sum x per id, and
# tag the summary with the labels joined by "." (e.g. "gr1.both"). map_df()
# row-binds the per-set summaries into a single data frame.
map_df(
  list(c("gr1", "both"), c("gr2", "both")),
  ~ df %>%
    filter(gr %in% .x) %>%
    group_by(id) %>%
    summarize(x.sum = sum(x)) %>%
    mutate(gr.filt = paste(.x, collapse = "."))
)
 id    x.sum gr.filt 
  <chr> <int> <chr>   
1 A         3 gr1.both
2 B        21 gr1.both
3 C         9 gr1.both
4 A        11 gr2.both
5 B        25 gr2.both
6 C        13 gr2.both

答案2

得分: 3

这里我将“non-both”数据与“both”数据的一个副本合并在一起,在该副本中,“both”的每一行都被复制到每一个“non-both”组中。

library(dplyr)
# Build one long table in which every "both" row appears once per non-"both"
# group, then aggregate:
#   1. the rows that already belong to a single group are kept as they are;
#   2. the "both" rows lose their gr column and are crossed with every
#      distinct non-"both" label, i.e. duplicated into each of those groups;
#   3. count(..., wt = x) sums x for each (gr, id) pair, storing the total
#      in x.sum, after suffixing the group label with ".both".
bind_rows(
  filter(df, gr != "both"),
  tidyr::crossing(
    select(filter(df, gr == "both"), -gr),
    gr = unique(df$gr[df$gr != "both"])
  )
) %>%
  count(gr = paste0(gr, ".both"), id, wt = x, name = "x.sum")

结果

         gr id x.sum
1  gr1.both  A     3
2  gr1.both  B    21
3  gr1.both  C     9
4  gr2.both  A    11
5  gr2.both  B    25
6  gr2.both  C    13
英文:

Here I combine the "non-both" data with a version of the "both" data where each row has been copied to each of the "non-both" groups.

library(dplyr)
# Build one long table in which every "both" row appears once per non-"both"
# group, then aggregate:
#   1. the rows that already belong to a single group are kept as they are;
#   2. the "both" rows lose their gr column and are crossed with every
#      distinct non-"both" label, i.e. duplicated into each of those groups;
#   3. count(..., wt = x) sums x for each (gr, id) pair, storing the total
#      in x.sum, after suffixing the group label with ".both".
bind_rows(
  df |> filter(gr != "both"),
  df |> filter(gr == "both") |> select(-gr) |>
    tidyr::crossing(gr = unique(df$gr[df$gr != "both"]))
) |>
  count(gr = paste0(gr, ".both"), id, wt = x, name = "x.sum")

Result

        gr id x.sum
1 gr1.both  A     3
2 gr1.both  B    21
3 gr1.both  C     9
4 gr2.both  A    11
5 gr2.both  B    25
6 gr2.both  C    13

huangapple
  • 本文由 发表于 2023年5月25日 02:37:07
  • 转载请务必保留本文链接:https://go.coder-hub.com/76326508.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定