按照定义的间隔对一列进行分组和汇总。

huangapple go评论64阅读模式
英文:

group by and summarise a column into defined breaks

问题

以下是翻译的代码部分:

我有以下数据:

# Example data: one constant group label and 30 draws from a normal
# distribution with mean 50 and standard deviation 25.
df <- data.frame(
  random_str = rep("ok", times = 30),
  value = rnorm(n = 30, mean = 50, sd = 25)
)
df # auto-print the freshly created data frame

我想运行一些摘要统计信息,并可以这样做:

# Largest and smallest `value` within each `random_str` group; further
# statistics can be appended as extra summarise() arguments.
df %>%
  group_by(random_str) %>%
  summarise(
    max = max(value),
    min = min(value)
    # ... and so on
  )

但我还想将“value”列分成区间(例如0-33, 34-66, 67-100)。

我可以手动执行此操作:

# Per-group summary statistics plus hand-written bin counts.
# The bins are half-open and non-overlapping: (-Inf, 34), [34, 67) and
# [67, Inf). Every non-missing value is therefore counted exactly once;
# comparing against neighbouring integers with strict `<` / `>` would
# count a non-integer value such as 33.5 in two bins.
df %>%
  group_by(random_str) %>%
  summarise(
    max = max(value),
    min = min(value),
    # ... and so on
    # ... more stuff
    count1_33 = sum(value < 34),
    count34_66 = sum(value >= 34 & value < 67),
    count67_100 = sum(value >= 67)
  )

但是否有办法自动完成最后这三项汇总?实际上我有数百万行数据,并且希望划分出数百个等距区间。
英文:

I have the following data:

# Example data: one constant group label and 30 draws from a normal
# distribution with mean 50 and standard deviation 25. The wrapping
# parentheses print the data frame after assigning it.
(df <- data.frame(
  random_str = rep("ok", 30),
  value      = rnorm(30, mean = 50, sd = 25)
))

I want to run a few summary statistics, and can do so:

# Largest and smallest `value` within each `random_str` group; further
# statistics can be appended as extra summarise() arguments.
df %>%
  group_by(random_str) %>%
  summarise(
    max = max(value),
    min = min(value)
    # and so on.....
  )

But I also want to split the value column into bins (e.g. 0-33, 34-66, 67-100).

I can do this manually:

# Per-group summary statistics plus hand-written bin counts.
# The bins are half-open and non-overlapping: (-Inf, 34), [34, 67) and
# [67, Inf). Every non-missing value is therefore counted exactly once;
# comparing against neighbouring integers with strict `<` / `>` would
# count a non-integer value such as 33.5 in two bins.
df %>%
  group_by(random_str) %>%
  summarise(
    max = max(value),
    min = min(value),
    # and so on.....
    # .. more stuff
    # ..
    count1_33   = sum(value < 34),
    count34_66  = sum(value >= 34 & value < 67),
    count67_100 = sum(value >= 67)
  )

But is there a way to do these last three summaries automatically? In reality I have millions of rows of data and want hundreds of equally spaced breaks.

答案1

得分: 1

以下是代码的翻译部分:

set.seed(100)
df <- data.frame(
  random_str = rep("ok", 30),
  value      = rnorm(30, mean = 50, sd = 25)
)

library(dplyr)
library(tidyr)

# Per-group summary statistics (one row per `random_str`).
group_stats <- df %>%
  group_by(random_str) %>%
  summarise(max = max(value), min = min(value))

# Per-group counts per interval, one column per interval.
# Break points are 1, 34, 67, ... and are extended until they cover
# max(value). Expressions inside group_by() are evaluated on the ungrouped
# data, so all groups share the same break points. Values <= 1 fall outside
# the first interval and end up in an `NA` range.
bin_counts <- df %>%
  group_by(
    random_str,
    range = cut(
      value,
      breaks = seq(1, by = 33, length.out = ceiling(max(value) / 33) + 1)
    )
  ) %>%
  summarise(n = n(), .groups = "drop") %>%
  # `random_str` stays as the id column so each group keeps its own row.
  pivot_wider(names_from = "range", values_from = "n")

# Combine by key rather than by row position, so the result stays correct
# when there is more than one group.
left_join(group_stats, bin_counts, by = "random_str")

#> # A tibble: 1 x 7
#>   random_str   max   min `(1,34]` `(34,67]` `(67,100]` `(100,133]`
#>   <chr>      <dbl> <dbl>    <int>     <int>      <int>       <int>
#> 1 ok          108.  21.1        5        19          5           1

更新部分的翻译:

set.seed(100)
df <- data.frame(
  random_str = c(rep("ok", 15), rep("not", 15)),
  value      = rnorm(30, mean = 500, sd = 250)
)

library(dplyr)
library(purrr)

# Lower bounds of the intervals; each interval is (lower, lower + 100].
# Values outside (0, 1000] are not counted in any interval.
mseq <- seq(0, 900, 100)
bin_upper <- mseq + 100

# Column labels such as "(0,100]"; they are the same for every group.
bin_labels <- paste0("(", mseq, ",", bin_upper, "]")

# For the rows of one group, count the values falling in each interval
# and return the counts as a named numeric vector.
count_bins <- function(group_rows) {
  values <- group_rows$value
  in_bin <- function(lo, hi) sum(values > lo & values <= hi, na.rm = TRUE)
  set_names(map2_dbl(mseq, bin_upper, in_bin), bin_labels)
}

# One row of interval counts per group, labelled by `random_str`.
per_group_counts <- split(df, df$random_str) %>%
  lapply(count_bins) %>%
  bind_rows(.id = "random_str")

# Per-group summary statistics.
group_stats <- df %>%
  group_by(random_str) %>%
  summarise(max = max(value), min = min(value))

# Attach the summary statistics to the interval counts; the join key is
# the shared `random_str` column.
right_join(group_stats, per_group_counts)
#> Joining, by = "random_str"
#> # A tibble: 2 x 13
#>   random_str   max   min `(0,100]` `(100,200]` `(200,300]` `(300,400]`
#>   <chr>      <dbl> <dbl>     <dbl>       <dbl>       <dbl>       <dbl>
#> 1 not        1078.  211.         0           0           3           3
#> 2 ok          722.  294.         0           0           1           2
#> # ... with 6 more variables: (400,500] <dbl>, (500,600] <dbl>, (600,700] <dbl>,
#> #   (700,800] <dbl>, (800,900] <dbl>, (900,1000] <dbl>

希望这些翻译对你有帮助。

英文:
set.seed(100)
df <- data.frame(
  random_str = rep("ok", 30),
  value      = rnorm(30, mean = 50, sd = 25)
)

library(dplyr)
library(tidyr)

# Per-group summary statistics (one row per `random_str`).
group_stats <- df %>%
  group_by(random_str) %>%
  summarise(max = max(value), min = min(value))

# Per-group counts per interval, one column per interval.
# Break points are 1, 34, 67, ... and are extended until they cover
# max(value). Expressions inside group_by() are evaluated on the ungrouped
# data, so all groups share the same break points. Values <= 1 fall outside
# the first interval and end up in an `NA` range.
bin_counts <- df %>%
  group_by(
    random_str,
    range = cut(
      value,
      breaks = seq(1, by = 33, length.out = ceiling(max(value) / 33) + 1)
    )
  ) %>%
  summarise(n = n(), .groups = "drop") %>%
  # `random_str` stays as the id column so each group keeps its own row.
  pivot_wider(names_from = "range", values_from = "n")

# Combine by key rather than by row position, so the result stays correct
# when there is more than one group.
left_join(group_stats, bin_counts, by = "random_str")

#> # A tibble: 1 x 7
#>   random_str   max   min `(1,34]` `(34,67]` `(67,100]` `(100,133]`
#>   <chr>      <dbl> <dbl>    <int>     <int>      <int>       <int>
#> 1 ok          108.  21.1        5        19          5           1

Update:

I am not sure if we can get `cut` to include empty ranges. For this, I used `purrr::map2_dbl()` to supply the ranges "manually" and set their names (looping over each `random_str` group). Then I `bind_rows()` the resulting data frames (one per group) and finally join the result with the summary stats.

set.seed(100)
df <- data.frame(
  random_str = c(rep("ok", 15), rep("not", 15)),
  value      = rnorm(30, mean = 500, sd = 250)
)

library(dplyr)
library(purrr)

# Lower bounds of the intervals; each interval is (lower, lower + 100].
# Values outside (0, 1000] are not counted in any interval.
mseq <- seq(0, 900, 100)
bin_upper <- mseq + 100

# Column labels such as "(0,100]"; they are the same for every group.
bin_labels <- paste0("(", mseq, ",", bin_upper, "]")

# For the rows of one group, count the values falling in each interval
# and return the counts as a named numeric vector.
count_bins <- function(group_rows) {
  values <- group_rows$value
  in_bin <- function(lo, hi) sum(values > lo & values <= hi, na.rm = TRUE)
  set_names(map2_dbl(mseq, bin_upper, in_bin), bin_labels)
}

# One row of interval counts per group, labelled by `random_str`.
per_group_counts <- split(df, df$random_str) %>%
  lapply(count_bins) %>%
  bind_rows(.id = "random_str")

# Per-group summary statistics.
group_stats <- df %>%
  group_by(random_str) %>%
  summarise(max = max(value), min = min(value))

# Attach the summary statistics to the interval counts; the join key is
# the shared `random_str` column.
right_join(group_stats, per_group_counts)
#> Joining, by = "random_str"
#> # A tibble: 2 x 13
#>   random_str   max   min `(0,100]` `(100,200]` `(200,300]` `(300,400]`
#>   <chr>      <dbl> <dbl>     <dbl>       <dbl>       <dbl>       <dbl>
#> 1 not        1078.  211.         0           0           3           3
#> 2 ok          722.  294.         0           0           1           2
#> # ... with 6 more variables: (400,500] <dbl>, (500,600] <dbl>, (600,700] <dbl>,
#> #   (700,800] <dbl>, (800,900] <dbl>, (900,1000] <dbl>

<sup>Created on 2023-04-07 by the reprex package (v2.0.1)</sup>

huangapple
  • 本文由 发表于 2023年4月7日 00:49:52
  • 转载请务必保留本文链接:https://go.coder-hub.com/75951930.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定