如何按另一个变量分组,计算每个组内各类别的百分比?

huangapple go评论58阅读模式
英文:

How to calculate percentages within each group defined by a different variable?

问题

以下是代码的翻译部分:

这是虚拟数据集的R代码:

# Build a 10-row dummy data set: a grouping key (`id`), three numeric
# columns (`a`, `b`, `c`) and a categorical `variation` column.
# Note: a vector named `c` does not break later calls to c(), because R
# skips non-function bindings when it resolves a function call.
c <- c(10, 20, 30, 40, 50, 40, 2, 40, 10, 50)
b <- c(40, 2, 40, 10, 50, 10, 20, 30, 40, 50)
a <- c(10, 50, 3, 60, 100, 40, 2, 40, 10, 50)
id <- c("a", "b", "b", "a", "c", "a", "b", "b", "a", "c")
variation <- c("a3", "a3", "b1", "a2", "b1", "a3", "a1", "b1", "a1", "b1")

data <- data.frame(id, a, b, c, variation)

# head() returns only the first 6 rows by default; request 10 so the call
# matches the full listing shown below.
head(data, n = 10)
#    id   a  b  c variation
# 1   a  10 40 10        a3
# 2   b  50  2 20        a3
# 3   b   3 40 30        b1
# 4   a  60 10 40        a2
# 5   c 100 50 50        b1
# 6   a  40 10 40        a3
# 7   b   2 20  2        a1
# 8   b  40 30 40        b1
# 9   a  10 40 10        a1
# 10  c  50 50 50        b1

我可以为每个单独的id进行百分比计算,过滤后如下:

# `%>%` and filter() come from dplyr; attach it first so the snippet runs
# on its own (otherwise the pipe operator is not found).
library(dplyr)

# Keep only the rows whose id is "a".
data_filter <- data %>%
  filter(id == "a")
data_filter
#   id  a  b  c variation
# 1  a 10 40 10        a3
# 2  a 60 10 40        a2
# 3  a 40 10 40        a3
# 4  a 10 40 10        a1

# Share of each `variation` level among the filtered rows
data_filter_percentage <- data_filter %>%
  group_by(variation) %>%  # variable whose levels are tallied
  count() %>%
  ungroup() %>%  # drop the grouping so sum(n) spans every level
  mutate(
    perc = n / sum(n),
    labels = scales::percent(perc)
  ) %>%
  arrange(perc)

head(data_filter_percentage)
# A tibble: 3 x 4
#   variation     n  perc labels
#   <chr>     <int> <dbl> <chr>
# 1 a1            1  0.25 25%
# 2 a2            1  0.25 25%
# 3 a3            2  0.5  50%

然而,我的问题是,是否可以对所有"id"执行上述管道而无需单独过滤?

英文:

This is the R code for the dummy dataset:

# Build a 10-row dummy data set: a grouping key (`id`), three numeric
# columns (`a`, `b`, `c`) and a categorical `variation` column.
# Note: a vector named `c` does not break later calls to c(), because R
# skips non-function bindings when it resolves a function call.
c <- c(10, 20, 30, 40, 50, 40, 2, 40, 10, 50)
b <- c(40, 2, 40, 10, 50, 10, 20, 30, 40, 50)
a <- c(10, 50, 3, 60, 100, 40, 2, 40, 10, 50)
id <- c("a", "b", "b", "a", "c", "a", "b", "b", "a", "c")
variation <- c("a3", "a3", "b1", "a2", "b1", "a3", "a1", "b1", "a1", "b1")

data <- data.frame(id, a, b, c, variation)

# head() returns only the first 6 rows by default; request 10 so the call
# matches the full listing shown below.
head(data, n = 10)
#    id   a  b  c variation
# 1   a  10 40 10        a3
# 2   b  50  2 20        a3
# 3   b   3 40 30        b1
# 4   a  60 10 40        a2
# 5   c 100 50 50        b1
# 6   a  40 10 40        a3
# 7   b   2 20  2        a1
# 8   b  40 30 40        b1
# 9   a  10 40 10        a1
# 10  c  50 50 50        b1

I can calculate the percentages for an individual id after filtering:

# `%>%` and filter() come from dplyr; attach it first so the snippet runs
# on its own (otherwise the pipe operator is not found).
library(dplyr)

# Keep only the rows whose id is "a".
data_filter <- data %>%
  filter(id == "a")
data_filter
#   id  a  b  c variation
# 1  a 10 40 10        a3
# 2  a 60 10 40        a2
# 3  a 40 10 40        a3
# 4  a 10 40 10        a1

# Data transformation: share of each `variation` level among the filtered
# rows.
data_filter_percentage <- data_filter %>%
  group_by(variation) %>%  # variable to be transformed
  count() %>%
  ungroup() %>%  # drop the grouping so sum(n) spans every level
  mutate(
    perc = n / sum(n),
    labels = scales::percent(perc)
  ) %>%
  arrange(perc)

head(data_filter_percentage)
# A tibble: 3 x 4
#   variation     n  perc labels
#   <chr>     <int> <dbl> <chr>
# 1 a1            1  0.25 25%
# 2 a2            1  0.25 25%
# 3 a3            2  0.5  50%

However, my question is: is it possible to run the above pipeline for every "id" at once, without filtering each one individually?

答案1

得分: 1

你可以尝试以下工作流程:

library(dplyr)

# Tally every (id, variation) pair, then express each tally as a share of
# its id's total.
data %>%
  count(id, variation) %>%
  group_by(id) %>%  # sum(n) below is evaluated per id
  mutate(
    perc = n / sum(n),
    labels = scales::percent(perc)
  ) %>%
  ungroup()

更简洁的写法:

# Same computation without persistent grouping: `.by` groups by id for this
# one mutate() call only (needs dplyr >= 1.1), so no ungroup() is required.
data %>%
  count(id, variation) %>%
  mutate(
    perc = n / sum(n),
    labels = scales::percent(perc),
    .by = id
  )

# A tibble: 7 × 5
#   id    variation     n  perc labels
#   <chr> <chr>     <int> <dbl> <chr>
# 1 a     a1            1  0.25 25%
# 2 a     a2            1  0.25 25%
# 3 a     a3            2  0.5  50%
# 4 b     a1            1  0.25 25%
# 5 b     a3            1  0.25 25%
# 6 b     b1            2  0.5  50%
# 7 c     b1            2  1    100%
英文:

You can try the following workflow:

library(dplyr)

# Count each variation within each id, then turn the counts into per-id
# shares. The grouping by id set up by group_by() is still active in
# mutate(), so sum(n) is the total for the current id.
data %>%
  group_by(id) %>%
  count(variation) %>%
  mutate(perc = n / sum(n), labels = scales::percent(perc)) %>%
  ungroup()

Or, more concisely:

# Same computation without persistent grouping: `.by` groups by id for this
# one mutate() call only (needs dplyr >= 1.1), so no ungroup() is required.
data %>%
  count(id, variation) %>%
  mutate(perc = n / sum(n), labels = scales::percent(perc), .by = id)

# A tibble: 7 × 5
#   id    variation     n  perc labels
#   <chr> <chr>     <int> <dbl> <chr>
# 1 a     a1            1  0.25 25%
# 2 a     a2            1  0.25 25%
# 3 a     a3            2  0.5  50%
# 4 b     a1            1  0.25 25%
# 5 b     a3            1  0.25 25%
# 6 b     b1            2  0.5  50%
# 7 c     b1            2  1    100%

huangapple
  • 本文由 发表于 2023年7月18日 16:52:55
  • 转载请务必保留本文链接:https://go.coder-hub.com/76711042.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定