英文:
R code to find a cumulative sum of a vector with a criteria set on another vector of a dataframe
问题
Sure, here is the translated code portion:
p <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
q <- c("a", "a", "a", "b", "b", "c", "c", "c", "c", "c")
r <- c(0, 1, 3, 0, 4, 0, 6, 13, 21, 30)
t <- data.frame(p, q, r)
If you need further assistance with the code or have any questions, please feel free to ask.
英文:
p<-c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
q<-c("a", "a", "a", "b", "b", "c", "c", "c", "c", "c")
r<-c(0, 1, 3, 0, 4, 0, 6, 13, 21, 30)
t<-data.frame(p,q,r)
In the above data I am trying to achieve vector r.
Vector r is a cumulative sum of vector p with a lag and a criteria set on vector q.
Please assume that dataframe is already sorted alphabetically using vector q
I have tried creating multiple vector with a lag and then trying to sum. but it isnt ideal.
答案1
得分: 4
最高效的方法是使用 collapse
。
首先创建一个 GRP
对象。
然后使用该 GRP
对象执行分组的滞后和累积求和。
collapse
vs data.table
基准测试
3 组
mark(e1 = {
g <- GRP(t, by = "q")
t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
},
e2 = {
t[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q][]
}
)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 e1 434us 484us 1905. 36.7KB 0 953 0 500ms <dt> <Rprofmem> <bench_tm> <tibble>
2 e2 511us 548us 1646. 32.5KB 2.27 725 1 440ms <dt> <Rprofmem> <bench_tm> <tibble>
10^7 行,约 10^6 组
t <- t[sample.int(.N, 10^7, T)]
t[, q := sample.int(10^6, 10^7, T)]
mark(e1 = {
g <- GRP(t, by = "q")
t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
},
e2 = {
t[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q][]
}
)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list>
1 e1 694.9ms 694.9ms 1.44 244MB 0 1 0 694.9ms <dt> <Rprofmem> <bench_tm [1]>
2 e2 21.3s 21.3s 0.0469 71MB 1.50 1 32 21.3s <dt> <Rprofmem> <bench_tm [1]>
英文:
The most efficient method would be using collapse
.
First create a GRP
object.
library(collapse)
library(data.table)
setDT(t)
g <- GRP(t, by = "q")
Then do a grouped lag & cumulative sum using that GRP
object.
t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
p q r result
1: 1 a 0 0
2: 2 a 1 1
3: 3 a 3 3
4: 4 b 0 0
5: 5 b 4 4
6: 6 c 0 0
7: 7 c 6 6
8: 8 c 13 13
9: 9 c 21 21
10: 10 c 30 30
collapse
vs data.table
benchmark
3 groups
mark(e1 = {
g <- GRP(t, by = "q")
t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
},
e2 = {
t[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q][]
}
)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 e1 434us 484us 1905. 36.7KB 0 953 0 500ms <dt> <Rprofmem> <bench_tm> <tibble>
2 e2 511us 548us 1646. 32.5KB 2.27 725 1 440ms <dt> <Rprofmem> <bench_tm> <tibble>
10^7 rows, ~ 10^6 groups
t <- t[sample.int(.N, 10^7, T)]
t[, q := sample.int(10^6, 10^7, T)]
mark(e1 = {
g <- GRP(t, by = "q")
t[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]
},
e2 = {
t[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q][]
}
)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list>
1 e1 694.9ms 694.9ms 1.44 244MB 0 1 0 694.9ms <dt> <Rprofmem> <bench_tm [1]>
2 e2 21.3s 21.3s 0.0469 71MB 1.50 1 32 21.3s <dt> <Rprofmem> <bench_tm [1]>
答案2
得分: 3
1) 这是一个使用 ave
和 cumsum
的基本R解决方案。
ave(p, q, FUN = cumsum) - p
## [1] 0 1 3 0 4 0 6 13 21 30
2) 这个变种也有效:
ave(p, q, FUN = function(x) c(0, head(cumsum(x), -1)))
3) 相同的思路可以使用 collapse
包来实现:
library(collapse)
fcumsum(p, q) - p
## [1] 0 1 3 0 4 0 6 13 21 30
英文:
1) This is a base R solution using ave
and cumsum
.
ave(p, q, FUN = cumsum) - p
## [1] 0 1 3 0 4 0 6 13 21 30
2) This variation also works:
ave(p, q, FUN = function(x) c(0, head(cumsum(x), -1)))
3) The same idea can be used with the collapse package
library(collapse)
fcumsum(p, q) - p
## [1] 0 1 3 0 4 0 6 13 21 30
答案3
得分: 2
这将非常快速:
library(dplyr)
t |>
mutate(result = cumsum(lag(p, default = 0)), .by = q)
# p q r result
# 1 1 a 0 0
# 2 2 a 1 1
# 3 3 a 3 3
# 4 4 b 0 0
# 5 5 b 4 4
# 6 6 c 0 0
# 7 7 c 6 6
# 8 8 c 13 13
# 9 9 c 21 21
# 10 10 c 30 30
这将更快:
library(data.table)
setDT(t)[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q]
英文:
This will be quite fast:
library(dplyr)
t |>
mutate(result = cumsum(lag(p, default = 0)), .by = q)
# p q r result
# 1 1 a 0 0
# 2 2 a 1 1
# 3 3 a 3 3
# 4 4 b 0 0
# 5 5 b 4 4
# 6 6 c 0 0
# 7 7 c 6 6
# 8 8 c 13 13
# 9 9 c 21 21
# 10 10 c 30 30
This will be even faster:
library(data.table)
setDT(t)[, result := cumsum(shift(p, type = "lag", fill = 0)), by = q]
答案4
得分: 1
如果我们需要在dplyr内保持良好的性能,可以使用dtplyr
。dtplyr是dplyr的data.table前端。它大部分使用data.table来运行代码。并非所有dplyr的动词和操作都可以翻译,但对于像这个示例这样的简单用例,它运行得很好。
library(dtplyr)
t <- lazy_dt(t)
t |>
group_by(q) |>
mutate(result = cumsum(lag(p, default = 0))) |>
as_tibble()
# A tibble: 10 × 4
p q r result
<dbl> <chr> <dbl> <dbl>
1 1 a 0 0
2 2 a 1 1
3 3 a 3 3
4 4 b 0 0
5 5 b 4 4
6 6 c 0 0
7 7 c 6 6
8 8 c 13 13
9 9 c 21 21
10 10 c 30 30
英文:
If we need good performance while keeping within dplyr, we can use dtplyr
. dtplyr is a data.table frontend for dplyr.
It runs most of the code with data.table. Not all dplyr verbs and operations can be translated, but for simple use cases like this one, it works just fine.
library(dtplyr)
t <- lazy_dt(t)
t |>
group_by(q) |>
mutate(result = cumsum(lag(p, default = 0))) |>
as_tibble()
# A tibble: 10 × 4
p q r result
<dbl> <chr> <dbl> <dbl>
1 1 a 0 0
2 2 a 1 1
3 3 a 3 3
4 4 b 0 0
5 5 b 4 4
6 6 c 0 0
7 7 c 6 6
8 8 c 13 13
9 9 c 21 21
10 10 c 30 30
</details>
# 答案5
**得分**: 0
使用`data.table`,我们可以尝试以下操作:
```R
setDT(t)[, rr := shift(cumsum(p), fill = 0), q][]
这段代码会生成如下结果:
p q r rr
1: 1 a 0 0
2: 2 a 1 1
3: 3 a 3 3
4: 4 b 0 0
5: 5 b 4 4
6: 6 c 0 0
7: 7 c 6 6
8: 8 c 13 13
9: 9 c 21 21
10: 10 c 30 30
请注意,这是给定代码的翻译部分。
英文:
with data.table
we can try
> setDT(t)[, rr := shift(cumsum(p), fill = 0), q][]
p q r rr
1: 1 a 0 0
2: 2 a 1 1
3: 3 a 3 3
4: 4 b 0 0
5: 5 b 4 4
6: 6 c 0 0
7: 7 c 6 6
8: 8 c 13 13
9: 9 c 21 21
10: 10 c 30 30
答案6
得分: 0
以下是您要翻译的内容:
"如在问题中所述,可以假设向量已经排序,您可以获取在q
中发生变化的位置的索引,然后在这些位置减去cumsum
。
n <- length(q)
i <- 1 + which(q[-1] != q[-n])
r <- cumsum(p) - p
r <- r - rep(c(0, r[i]), diff(c(1, i, n+1)))
r
# [1] 0 1 3 0 4 0 6 13 21 30
一个可能的问题是,由于cumsum
是在整个向量上构建的,所以其结果可能不太准确,与分组拆分时相比。因此,另一种变体是对向量进行子集化。
n <- length(q)
i <- which(q[-1] != q[-n])
unlist(Map(\(i,j) {. <- p[i:j]; cumsum(.) -.}, c(1, i+1), c(i,n)), FALSE, FALSE)
# [1] 0 1 3 0 4 0 6 13 21 30
性能基准
set.seed(42)
q <- rep(letters, sample(1e5:1e6, length(letters), TRUE))
length(q)
#[1] 13535111
p <- sample(as.numeric(1:10), length(q), TRUE)
library(collapse)
library(data.table)
t <- data.frame(p,q)
bench::mark(min_iterations = 7L,
collapse = {g <- GRP(t, by = "q")
as.data.table(t)[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]$result},
Map = {n <- length(q)
i <- which(q[-1] != q[-n])
unlist(Map(\(i,j) {. <- p[i:j]; cumsum(.) -.}, c(1, i+1), c(i,n)), FALSE, FALSE)},
rep = {n <- length(q)
i <- 1 + which(q[-1] != q[-n])
r <- cumsum(p) - p
r - rep(c(0, r[i]), diff(c(1, i, n+1)))},
ave = {ave(p, q, FUN = cumsum) - p} ) # @G. Grothendieck
结果
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 collapse 196.59ms 248.53ms 4.17 518MB 5.36 7 9
2 Map 539.4ms 568ms 1.78 981MB 3.80 7 15
3 rep 442.83ms 481.89ms 2.03 826MB 3.77 7 13
4 ave 1.18s 1.23s 0.801 1006MB 1.37 7 12
在这种情况下,使用collapse与data.table比基本的Map和rep快大约两倍,比ave
快5倍。"
英文:
As in the question is stated that it could be assumed that the vectors are already sorted you can take the indices where there is a change in q
and subtract this cumsum
at those positions.
n <- length(q)
i <- 1 + which(q[-1] != q[-n])
r <- cumsum(p) - p
r <- r - rep(c(0, r[i]), diff(c(1, i, n+1)))
r
# [1] 0 1 3 0 4 0 6 13 21 30
A problem might be, that due to the fact that cumsum
is build over the whole vector, its result might not be that accurate as it could be when splitting in groups. So another variant which is subsetting the vector.
n <- length(q)
i <- which(q[-1] != q[-n])
unlist(Map(\(i,j) {. <- p[i:j]; cumsum(.) -.}, c(1, i+1), c(i,n)), FALSE, FALSE)
# [1] 0 1 3 0 4 0 6 13 21 30
Benchmark
set.seed(42)
q <- rep(letters, sample(1e5:1e6, length(letters), TRUE))
length(q)
#[1] 13535111
p <- sample(as.numeric(1:10), length(q), TRUE)
library(collapse)
library(data.table)
t <- data.frame(p,q)
bench::mark(min_iterations = 7L,
collapse = {g <- GRP(t, by = "q")
as.data.table(t)[, result := fcumsum(flag(p, g = g, fill = 0), g = g)][]$result},
Map = {n <- length(q)
i <- which(q[-1] != q[-n])
unlist(Map(\(i,j) {. <- p[i:j]; cumsum(.) -.}, c(1, i+1), c(i,n)), FALSE, FALSE)},
rep = {n <- length(q)
i <- 1 + which(q[-1] != q[-n])
r <- cumsum(p) - p
r - rep(c(0, r[i]), diff(c(1, i, n+1)))},
ave = {ave(p, q, FUN = cumsum) - p} ) # @G. Grothendieck
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 collapse 196.59ms 248.53ms 4.17 518MB 5.36 7 9
2 Map 539.4ms 568ms 1.78 981MB 3.80 7 15
3 rep 442.83ms 481.89ms 2.03 826MB 3.77 7 13
4 ave 1.18s 1.23s 0.801 1006MB 1.37 7 12
In this case using collapse with data.table is about two times faster than base Map and rev and 5 times than ave
.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论