英文:
Looking for a more efficient way of doing this for-loop in R
问题
我的数据集有超过300万条记录,所以这个循环花费了很长时间。
我想要创建一个 bout_len_tracker 变量,用于统计同一受试者(SUBJECT)连续有多少行满足 VECTORMAGNITUDECOUNTS >= 1853。
# For every row, count how many consecutive rows (ending at that row) have
# VECTORMAGNITUDECOUNTS >= 1853 and share their SUBJECT with the row before.
# The first row of each SUBJECT stays 0: it has no previous row of the same
# SUBJECT to continue from.
# numeric(nrow(.)) also works for a 0-row data frame, where `<- 0` would fail.
sub_study$bout_len_tracker <- numeric(nrow(sub_study))
# seq_len(n)[-1] is empty when there are fewer than two rows, whereas
# 2:nrow(sub_study) would count down (2, 1) and index past the data.
for (i in seq_len(nrow(sub_study))[-1]) {
  # The condition is scalar, so use the short-circuiting && instead of &.
  if (sub_study$VECTORMAGNITUDECOUNTS[i] >= 1853 &&
      sub_study$SUBJECT[i] == sub_study$SUBJECT[i - 1]) {
    sub_study$bout_len_tracker[i] <- sub_study$bout_len_tracker[i - 1] + 1
  }
}
英文:
My data set has over 3 million records so this loop is taking forever.
I want to create a bout_len_tracker variable that counts how many consecutive rows for the same SUBJECT have VECTORMAGNITUDECOUNTS >= 1853.
# For every row, count how many consecutive rows (ending at that row) have
# VECTORMAGNITUDECOUNTS >= 1853 and share their SUBJECT with the row before.
# The first row of each SUBJECT stays 0: it has no previous row of the same
# SUBJECT to continue from.
# numeric(nrow(.)) also works for a 0-row data frame, where `<- 0` would fail.
sub_study$bout_len_tracker <- numeric(nrow(sub_study))
# seq_len(n)[-1] is empty when there are fewer than two rows, whereas
# 2:nrow(sub_study) would count down (2, 1) and index past the data.
for (i in seq_len(nrow(sub_study))[-1]) {
  # The condition is scalar, so use the short-circuiting && instead of &.
  if (sub_study$VECTORMAGNITUDECOUNTS[i] >= 1853 &&
      sub_study$SUBJECT[i] == sub_study$SUBJECT[i - 1]) {
    sub_study$bout_len_tracker[i] <- sub_study$bout_len_tracker[i - 1] + 1
  }
}
答案1
得分: 1
你可以将 `VECTORMAGNITUDECOUNTS >= 1853` 的 `cumsum` 放入 `ave` 中。这里有一个示例(示例数据用列 `n` 和阈值 10 代替):
# The example data has the columns id, t and n; n with a threshold of 10 stands
# in for VECTORMAGNITUDECOUNTS >= 1853, which is not a column of dat.
# Within each id, every row below the threshold opens a new group, so the
# cumulative sum restarts and len is the length of the current consecutive
# run. A plain cumsum per id never resets and only counts the hits seen so far.
# Unlike the loop in the question, the first row of an id counts as 1 when it
# meets the threshold.
dat$len <- with(dat, ave(n >= 10, id, FUN = function(hit) {
  ave(hit, cumsum(!hit), FUN = cumsum)
}))
dat
# id t n len
# 1 1 1 17 1
# 2 1 2 18 2
# 3 1 3 5 0
# 4 2 1 5 0
# 5 2 2 17 1
# 6 2 3 14 2
# 7 3 1 1 0
# 8 3 2 15 1
# 9 3 3 20 2
# 10 4 1 10 1
# 11 4 2 7 0
# 12 4 3 18 1
# 13 5 1 4 0
# 14 5 2 4 0
# 15 5 3 15 1
如果 `n` 中有 `NA` 值,例如:
# Simulate missing data: set a random 20% of the n values to NA.
is.na(dat$n) <- sample.int(nrow(dat), size = nrow(dat) * 0.2)
你可以将其扩展为:
# The example data has the column n (threshold 10), not VECTORMAGNITUDECOUNTS.
# An NA is treated as a miss, so it ends the current run; within each id the
# cumulative sum restarts after every miss and yields the consecutive-run
# length instead of a running total of hits.
dat$len <- with(dat, ave(n >= 10, id, FUN = \(x) {
  hit <- !is.na(x) & x
  ave(hit, cumsum(!hit), FUN = cumsum)
}))
数据:
# Example data frame with 15 rows: ids 1 to 5 with three rows each (t = 1, 2, 3)
# and an integer column n holding the values compared against the threshold.
# NOTE(review): the out.attrs attribute looks like a leftover from
# expand.grid(); it is reproduced verbatim here.
dat <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L), t = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L), n = c(17L, 18L, 5L, 5L, 17L, 14L, 1L,
15L, 20L, 10L, 7L, 18L, 4L, 4L, 15L)), out.attrs = list(dim = c(id = 5L,
t = 3L), dimnames = list(id = c("id=1", "id=2", "id=3", "id=4",
"id=5"), t = c("t=1", "t=2", "t=3"))), row.names = c(NA, -15L
), class = "data.frame")
英文:
You could put the `cumsum` of `VECTORMAGNITUDECOUNTS >= 1853` in `ave`. Here is an example (the sample data uses a column `n` and a threshold of 10 instead):
# n with a threshold of 10 stands in for VECTORMAGNITUDECOUNTS >= 1853.
# Within each id, every row below the threshold opens a new group, so the
# cumulative sum restarts and len is the length of the current consecutive
# run. A plain cumsum per id never resets and only counts the hits seen so far.
# Unlike the loop in the question, the first row of an id counts as 1 when it
# meets the threshold.
dat$len <- with(dat, ave(n >= 10, id, FUN = function(hit) {
  ave(hit, cumsum(!hit), FUN = cumsum)
}))
dat
# id t n len
# 1 1 1 17 1
# 2 1 2 18 2
# 3 1 3 5 0
# 4 2 1 5 0
# 5 2 2 17 1
# 6 2 3 14 2
# 7 3 1 1 0
# 8 3 2 15 1
# 9 3 3 20 2
# 10 4 1 10 1
# 11 4 2 7 0
# 12 4 3 18 1
# 13 5 1 4 0
# 14 5 2 4 0
# 15 5 3 15 1
If there are `NA`s in `n`, for example after
# Simulate missing data: set a random 20% of the n values to NA.
is.na(dat$n) <- sample.int(nrow(dat), size = nrow(dat) * 0.2)
you can expand this to:
# An NA is treated as a miss, so it ends the current run; within each id the
# cumulative sum restarts after every miss and yields the consecutive-run
# length instead of a running total of hits.
dat$len <- with(dat, ave(n >= 10, id, FUN = \(x) {
  hit <- !is.na(x) & x
  ave(hit, cumsum(!hit), FUN = cumsum)
}))
Data:
# Example data frame with 15 rows: ids 1 to 5 with three rows each (t = 1, 2, 3)
# and an integer column n holding the values compared against the threshold.
# NOTE(review): the out.attrs attribute looks like a leftover from
# expand.grid(); it is reproduced verbatim here.
dat <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L), t = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L), n = c(17L, 18L, 5L, 5L, 17L, 14L, 1L,
15L, 20L, 10L, 7L, 18L, 4L, 4L, 15L)), out.attrs = list(dim = c(id = 5L,
t = 3L), dimnames = list(id = c("id=1", "id=2", "id=3", "id=4",
"id=5"), t = c("t=1", "t=2", "t=3"))), row.names = c(NA, -15L
), class = "data.frame")
答案2
得分: 1
使用 data.table 的写法:
library(data.table)
# Per id: rleid() numbers the runs of equal (n >= 10) values and rowid() gives
# the position inside each run, so the count restarts after every miss.
# rowid(n >= 10) alone keeps counting across misses. Multiplying by the
# logical sets rows below the threshold to 0.
setDT(dat)[, tracker := (n >= 10) * rowid(rleid(n >= 10)), by = id][]
如果需要进一步的帮助,请随时提问。
英文:
library(data.table)
# Per id: rleid() numbers the runs of equal (n >= 10) values and rowid() gives
# the position inside each run, so the count restarts after every miss.
# rowid(n >= 10) alone keeps counting across misses (row 12 would show 2).
# Multiplying by the logical sets rows below the threshold to 0.
setDT(dat)[, tracker := (n >= 10) * rowid(rleid(n >= 10)), by = id][]
# id t n tracker
# 1: 1 1 17 1
# 2: 1 2 18 2
# 3: 1 3 5 0
# 4: 2 1 5 0
# 5: 2 2 17 1
# 6: 2 3 14 2
# 7: 3 1 1 0
# 8: 3 2 15 1
# 9: 3 3 20 2
# 10: 4 1 10 1
# 11: 4 2 7 0
# 12: 4 3 18 1
# 13: 5 1 4 0
# 14: 5 2 4 0
# 15: 5 3 15 1
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论