英文:
R code to find a cumulative sum of a vector with a criteria set on another vector of a dataframe
问题
Sure, here is the translated code portion:
p <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
q <- c("a", "a", "a", "b", "b", "c", "c", "c", "c", "c")
r <- c(0, 1, 3, 0, 4, 0, 6, 13, 21, 30)
t <- data.frame(p, q, r)
If you need further assistance with the code or have any questions, please feel free to ask.
英文:
p<-c(1,	2,	3,	4,	5,	6,	7,	8,	9,	10)
q<-c("a",	"a",	"a",	"b",	"b",	"c",	"c",	"c",	"c",	"c")
r<-c(0,	1,	3,	0,	4,	0,	6,	13,	21,	30)
t<-data.frame(p,q,r)
In the above data I am trying to achieve vector r.
Vector r is a cumulative sum of vector p with a lag and a criteria set on vector q.
Please assume that dataframe is already sorted alphabetically using vector q
I have tried creating multiple vector with a lag and then trying to sum. but it isnt ideal.
答案1
得分: 4
最高效的方法是使用 collapse。
首先创建一个 GRP 对象。
然后使用该 GRP 对象执行分组的滞后和累积求和。
collapse vs data.table 基准测试
3 组
mark(e1 = {
  g <- GRP(t, by = "q")
  t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
},
e2 = {
  t[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q][]
}
)
# A tibble: 2 x 13
  expression    min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory     time       gc      
  <bch:expr> <bch:> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>     <list>     <list>  
1 e1          434us  484us     1905.    36.7KB     0      953     0      500ms <dt>   <Rprofmem> <bench_tm> <tibble>
2 e2          511us  548us     1646.    32.5KB     2.27   725     1      440ms <dt>   <Rprofmem> <bench_tm> <tibble>
10^7 行,约 10^6 组
t <- t[sample.int(.N, 10^7, T)]
t[, q := sample.int(10^6, 10^7, T)]
mark(e1 = {
  g <- GRP(t, by = "q")
  t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
},
e2 = {
  t[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q][]
}
)
# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory     time          
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>     <list>        
1 e1          694.9ms  694.9ms    1.44       244MB     0        1     0    694.9ms <dt>   <Rprofmem> <bench_tm [1]>
2 e2            21.3s    21.3s    0.0469      71MB     1.50     1    32      21.3s <dt>   <Rprofmem> <bench_tm [1]>
英文:
The most efficient method would be using collapse.
First create a GRP object.
library(collapse)
library(data.table)
setDT(t)
g <- GRP(t, by = "q")
Then do a grouped lag & cumulative sum using that GRP object.
t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
     p q  r result
 1:  1 a  0      0
 2:  2 a  1      1
 3:  3 a  3      3
 4:  4 b  0      0
 5:  5 b  4      4
 6:  6 c  0      0
 7:  7 c  6      6
 8:  8 c 13     13
 9:  9 c 21     21
10: 10 c 30     30
collapse vs data.table benchmark
3 groups
mark(e1 = {
  g <- GRP(t, by = "q")
  t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
},
e2 = {
  t[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q][]
}
)
# A tibble: 2 x 13
  expression    min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory     time       gc      
  <bch:expr> <bch:> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>     <list>     <list>  
1 e1          434us  484us     1905.    36.7KB     0      953     0      500ms <dt>   <Rprofmem> <bench_tm> <tibble>
2 e2          511us  548us     1646.    32.5KB     2.27   725     1      440ms <dt>   <Rprofmem> <bench_tm> <tibble>
10^7 rows, ~ 10^6 groups
t <- t[sample.int(.N, 10^7, T)]
t[, q := sample.int(10^6, 10^7, T)]
mark(e1 = {
  g <- GRP(t, by = "q")
  t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
},
e2 = {
  t[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q][]
}
)
# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory     time          
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>     <list>        
1 e1          694.9ms  694.9ms    1.44       244MB     0        1     0    694.9ms <dt>   <Rprofmem> <bench_tm [1]>
2 e2            21.3s    21.3s    0.0469      71MB     1.50     1    32      21.3s <dt>   <Rprofmem> <bench_tm [1]>
答案2
得分: 3
1) 这是一个使用 ave 和 cumsum 的基本R解决方案。
ave(p, q, FUN = cumsum) - p
##  [1]  0  1  3  0  4  0  6 13 21 30
2) 这个变种也有效:
ave(p, q, FUN = function(x) c(0, head(cumsum(x), -1)))
3) 相同的思路可以使用 collapse 包来实现:
library(collapse)
fcumsum(p, q) - p
##  [1]  0  1  3  0  4  0  6 13 21 30
英文:
1) This is a base R solution using ave and cumsum.
ave(p, q, FUN = cumsum) - p
##  [1]  0  1  3  0  4  0  6 13 21 30
2) This variation also works:
ave(p, q, FUN = function(x) c(0, head(cumsum(x), -1)))
3) The same idea can be used with the collapse package
library(collapse)
fcumsum(p, q) - p
##  [1]  0  1  3  0  4  0  6 13 21 30
答案3
得分: 2
这将非常快速:
library(dplyr)
t |>
  mutate(result = cumsum(lag(p, default = 0)), .by = q)
#     p q  r result
# 1   1 a  0      0
# 2   2 a  1      1
# 3   3 a  3      3
# 4   4 b  0      0
# 5   5 b  4      4
# 6   6 c  0      0
# 7   7 c  6      6
# 8   8 c 13     13
# 9   9 c 21     21
# 10 10 c 30     30
这将更快:
library(data.table)
setDT(t)[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q]
英文:
This will be quite fast:
library(dplyr)
t |>
  mutate(result = cumsum(lag(p, default = 0)), .by = q)
#     p q  r result
# 1   1 a  0      0
# 2   2 a  1      1
# 3   3 a  3      3
# 4   4 b  0      0
# 5   5 b  4      4
# 6   6 c  0      0
# 7   7 c  6      6
# 8   8 c 13     13
# 9   9 c 21     21
# 10 10 c 30     30
This will be even faster:
library(data.table)
setDT(t)[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q]
答案4
得分: 1
如果我们需要在dplyr内保持良好的性能,可以使用dtplyr。dtplyr是dplyr的data.table前端。它大部分使用data.table来运行代码。并非所有dplyr的动词和操作都可以翻译,但对于像这个示例这样的简单用例,它运行得很好。
library(dtplyr)
t <- lazy_dt(t)
t |>
    group_by(q) |>
    mutate(result = cumsum(lag(p, default = 0))) |>
    as_tibble()
# A tibble: 10 × 4
       p q         r result
   <dbl> <chr> <dbl>  <dbl>
 1     1 a         0      0
 2     2 a         1      1
 3     3 a         3      3
 4     4 b         0      0
 5     5 b         4      4
 6     6 c         0      0
 7     7 c         6      6
 8     8 c        13     13
 9     9 c        21     21
10    10 c        30     30
英文:
If we need good performance while keeping within dplyr, we can use dtplyr. dtplyr is a data.table frontend for dplyr.
It runs most of the code with data.table. Not all dplyr verbs and operations can be translated, but for simple use cases like this one, it works just fine.
library(dtplyr)
t <- lazy_dt(t)
t |> 
    group_by(q) |>
    mutate(result = cumsum(lag(p, default = 0))) |>
    as_tibble()
# A tibble: 10 × 4
       p q         r result
   <dbl> <chr> <dbl>  <dbl>
 1     1 a         0      0
 2     2 a         1      1
 3     3 a         3      3
 4     4 b         0      0
 5     5 b         4      4
 6     6 c         0      0
 7     7 c         6      6
 8     8 c        13     13
 9     9 c        21     21
10    10 c        30     30
</details>
# 答案5
**得分**: 0
使用`data.table`,我们可以尝试以下操作:
```R
setDT(t)[, rr := shift(cumsum(p), fill = 0), q][]
这段代码会生成如下结果:
     p q  r rr
 1:  1 a  0  0
 2:  2 a  1  1
 3:  3 a  3  3
 4:  4 b  0  0
 5:  5 b  4  4
 6:  6 c  0  0
 7:  7 c  6  6
 8:  8 c 13 13
 9:  9 c 21 21
10: 10 c 30 30
请注意,这是给定代码的翻译部分。
英文:
with data.table we can try
> setDT(t)[, rr := shift(cumsum(p), fill = 0), q][]
     p q  r rr
 1:  1 a  0  0
 2:  2 a  1  1
 3:  3 a  3  3
 4:  4 b  0  0
 5:  5 b  4  4
 6:  6 c  0  0
 7:  7 c  6  6
 8:  8 c 13 13
 9:  9 c 21 21
10: 10 c 30 30
答案6
得分: 0
以下是您要翻译的内容:
"如在问题中所述,可以假设向量已经排序,您可以获取在q中发生变化的位置的索引,然后在这些位置减去cumsum。
n <- length(q)
i <- 1 + which(q[-1] != q[-n])
r <- cumsum(p) - p
r <- r - rep(c(0, r[i]), diff(c(1, i, n+1)))
r
# [1]  0  1  3  0  4  0  6 13 21 30
一个可能的问题是,由于cumsum是在整个向量上构建的,所以其结果可能不太准确,与分组拆分时相比。因此,另一种变体是对向量进行子集化。
n <- length(q)
i <- which(q[-1] != q[-n])
unlist(Map(\(i,j) {. <- p[i:j]; cumsum(.) -.}, c(1, i+1), c(i,n)), FALSE, FALSE)
# [1]  0  1  3  0  4  0  6 13 21 30
性能基准
set.seed(42)
q <- rep(letters, sample(1e5:1e6, length(letters), TRUE))
length(q)
#[1] 13535111
p <- sample(as.numeric(1:10), length(q), TRUE)
library(collapse)
library(data.table)
t <- data.frame(p,q)
bench::mark(min_iterations = 7L,
collapse = {g <- GRP(t, by = "q")
 as.data.table(t)[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]$result},
Map = {n <- length(q)
 i <- which(q[-1] != q[-n])
 unlist(Map(\(i,j) {. <- p[i:j]; cumsum(.) -.}, c(1, i+1), c(i,n)), FALSE, FALSE)},
rep = {n <- length(q)
 i <- 1 + which(q[-1] != q[-n])
 r <- cumsum(p) - p
 r - rep(c(0, r[i]), diff(c(1, i, n+1)))},
ave = {ave(p, q, FUN = cumsum) - p} ) # @G. Grothendieck
结果
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 collapse   196.59ms 248.53ms     4.17      518MB     5.36     7     9
2 Map         539.4ms    568ms     1.78      981MB     3.80     7    15
3 rep        442.83ms 481.89ms     2.03      826MB     3.77     7    13
4 ave           1.18s    1.23s     0.801    1006MB     1.37     7    12
在这种情况下,使用collapse与data.table比基本的Map和rep快大约两倍,比ave快5倍。"
英文:
As in the question is stated that it could be assumed that the vectors are already sorted you can take the indices where there is a change in q and subtract this cumsum at those positions.
n <- length(q)
i <- 1 + which(q[-1] != q[-n])
r <- cumsum(p) - p
r <- r - rep(c(0, r[i]), diff(c(1, i, n+1)))
r
# [1]  0  1  3  0  4  0  6 13 21 30
A problem might be, that due to the fact that cumsum is build over the whole vector, its result might not be that accurate as it could be when splitting in groups. So another variant which is subsetting the vector.
n <- length(q)
i <- which(q[-1] != q[-n])
unlist(Map(\(i,j) {. <- p[i:j]; cumsum(.) -.}, c(1, i+1), c(i,n)), FALSE, FALSE)
# [1]  0  1  3  0  4  0  6 13 21 30
Benchmark
set.seed(42)
q <- rep(letters, sample(1e5:1e6, length(letters), TRUE))
length(q)
#[1] 13535111
p <- sample(as.numeric(1:10), length(q), TRUE)
library(collapse)
library(data.table)
t <- data.frame(p,q)
bench::mark(min_iterations = 7L,
collapse = {g <- GRP(t, by = "q")
 as.data.table(t)[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]$result},
Map = {n <- length(q)
 i <- which(q[-1] != q[-n])
 unlist(Map(\(i,j) {. <- p[i:j]; cumsum(.) -.}, c(1, i+1), c(i,n)), FALSE, FALSE)},
rep = {n <- length(q)
 i <- 1 + which(q[-1] != q[-n])
 r <- cumsum(p) - p
 r - rep(c(0, r[i]), diff(c(1, i, n+1)))},
ave = {ave(p, q, FUN = cumsum) - p} ) # @G. Grothendieck
Result
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 collapse   196.59ms 248.53ms     4.17      518MB     5.36     7     9
2 Map         539.4ms    568ms     1.78      981MB     3.80     7    15
3 rep        442.83ms 481.89ms     2.03      826MB     3.77     7    13
4 ave           1.18s    1.23s     0.801    1006MB     1.37     7    12
In this case using collapse with data.table is about two times faster than base Map and rev and 5 times than ave.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。


评论