英文:
Efficiently find the first of the last 1's sequence
问题
For test1
and test2
, the expected output is 36 and 29, respectively.
Here is a sub-optimal solution:
temp1 <- cumsum(test1)
which(temp1==max(temp1[duplicated(temp1)&temp1!=max(temp1)]+1))[1]
[1] 36
temp2 <- cumsum(test2)
which(temp2==max(temp2[duplicated(temp2)&temp2!=max(temp2)]+1))[1]
[1] 29
英文:
I have the following vectors with 0s and 1s:
test1 <- c(rep(0,20),rep(1,5),rep(0,10),rep(1,15))
test1
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
^
test2 <- c(rep(0,8),rep(1,4),rep(0,5),rep(1,5),rep(0,6),rep(1,10),rep(0,2))
test2
[1] 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0
^
I need to find the index of first 1 in the last sequence of 1s (indicated by ^ in the above code). I have a solution (below) that doesn't perform well, how could I improve the performance?
For test1
and test2
, the expected output is 36 and 29, respectively.
Here is a sub-optimal solution:
temp1 <- cumsum(test1)
which(temp1==max(temp1[duplicated(temp1)&temp1!=max(temp1)]+1))[1]
[1] 36
temp2 <- cumsum(test2)
which(temp2==max(temp2[duplicated(temp2)&temp2!=max(temp2)]+1))[1]
[1] 29
Note: The length of actual vectors is ~10k.
答案1
得分: 19
data.table
库有一个未导出的函数叫做data.table:::uniqlist(list(x))
。使用三个冒号:::
来访问未导出的函数。该函数确定数据框的列何时更改值,并返回更改的索引。
data.table:::uniqlist(list(test1))
# [1] 1 21 26 36
关于uniqlist
,可以参考@Arun在这里的讨论:https://stackoverflow.com/a/21267854/10276092
然后,我使用了y[length(y)]
来找到向量中的最后一个项目,并使用基础的ifelse()
来检查最后一个索引是否包含1,否则倒数第二个索引必须包含1。
fx <- function(x) {
y <- data.table:::uniqlist(list(x))
ifelse(x[y[length(y)]] == 1, y[length(y)], y[length(y) - 1])
}
英文:
The data.table
library has a non-exported function called data.table:::uniqlist(list(x))
. Use three colons :::
to access non-exported functions. This function determines when columns of a data.frame change value and return indices of the change.
data.table:::uniqlist(list(test1))
# [1] 1 21 26 36
@Arun talks about uniqlist
here: https://stackoverflow.com/a/21267854/10276092
Then I use the y[length(y)]
method of finding the last item in a vector, and base ifelse()
to check if the last index contains a 1, else the second to last index must contain a 1.
fx <- function(x) {
y <- data.table:::uniqlist(list(x))
ifelse(x[y[length(y)]] == 1, y[length(y)], y[length(y) - 1])
}
答案2
得分: 18
使用 rle:
r <- rle(test1)
ix <- max(which(r$values == 1))
sum(r$lengths[1:(ix - 1)]) + 1
# [1] 36
r <- rle(test2)
ix <- max(which(r$values == 1))
sum(r$lengths[1:(ix - 1)]) + 1
# [1] 29
英文:
Using rle:
r <- rle(test1)
ix <- max(which(r$values == 1))
sum(r$lengths[ 1:(ix - 1) ]) + 1
# [1] 36
r <- rle(test2)
ix <- max(which(r$values == 1))
sum(r$lengths[ 1:(ix - 1) ]) + 1
# [1] 29
答案3
得分: 18
另一种方法是使用 which
+ diff
。
idx <- which(test1 == 1)
idx[tail(which(c(0, diff(idx)) != 1), 1)]
#[1] 36
英文:
Another way with which
+ diff
.
idx <- which(test1 == 1)
idx[tail(which(c(0, diff(idx)) != 1), 1)]
#[1] 36
答案4
得分: 16
使用rle运行,然后使用cumsum计算每个运行的结束位置,减去长度并加1以获得起始位置,然后将其减少到仅包含1的运行,最后取最后一个元素。这给出了最后一组1的起始位置,但如果你想要:
- 结束位置,只需省略
-lengths+1
- 最后一组0的运行,将
==1
替换为==0
- 第一组1的运行,将
tail
替换为head
如果没有1,它将返回一个零长度的数字向量。
with(rle(test1), tail((cumsum(lengths) - lengths + 1)[values == 1], 1))
英文:
Run rle and then use cumsum to calculate the end positions of each run and subtract the lengths and add 1 to get the start positions and then reduce that to the runs of 1's only and finally take the last element. This gives the start position of the last run of 1's but if you wanted:
- the end position just omit the
-lengths+1
- the last run of 0's replace the
==1
with==0
- the first run of 1's replace
tail
withhead
If there are no 1's it returns a zero length numeric vector.
with(rle(test1), tail((cumsum(lengths) - lengths + 1)[values == 1], 1))
答案5
得分: 14
以下是您要翻译的内容:
For completeness, here is the benchmark with a vector of size 30001. Feel free to update this if needed.
x <- c(rep(0,14736),rep(1,413),rep(0,830),rep(1,961),rep(0,274),rep(1,12787))
microbenchmark::microbenchmark(rle_zx8754(x),
rle_Grothendieck(x),
which_diff_Maël(x),
uniqlist_Viking(x),
while_Ritchie(x),
#Position_Ritchie(x),
#detect_index_Ritchie(x),
diff_Thomas(x),
#regex_Thomas(x),
#regexpr_Thomas(x),
times = 1000, check='equal')
Unit: microseconds
expr min lq mean median uq
rle_zx8754(x) 339.5 350.45 783.9827 357.45 375.15
rle_Grothendieck(x) 352.7 364.75 616.2324 372.60 391.75
which_diff_Maël(x) 264.2 274.60 404.5521 279.50 292.00
uniqlist_Viking(x) 16.7 22.30 32.1502 25.40 30.65
while_Ritchie(x) 777.6 785.60 1021.0738 801.95 847.15
diff_Thomas(x) 279.4 286.90 500.6373 291.20 306.35
max neval cld
156630.3 1000 cd
11196.5 1000 bc
7263.2 1000 b
3524.9 1000 a
6739.7 1000 d
9435.5 1000 b
functions:
x <- c(rep(0,14736),rep(1,413),rep(0,830),rep(1,961),rep(0,274),rep(1,12787))
rle_zx8754 <- function(x){
r <- rle(x)
ix <- max(which(r$values == 1))
sum(r$lengths[ 1:(ix - 1) ]) + 1
}
which_diff_Maël <- function(x){
idx <- which(x == 1)
idx[tail(which(diff(idx) != 1), 1) + 1]
}
rle_Grothendieck <- function(x){
with(rle(x), tail((cumsum(lengths) - lengths + 1)[values == 1], 1))
}
uniqlist_Viking <- function(x){
y <- data.table:::uniqlist(list(x))
ifelse(x[y[length(y)]] == 1, y[length(y)], y[length(y) - 1])
}
while_Ritchie <- function(x){
l <- length(x)
while (x[l] - x[l - 1] != 1) {
l <- l - 1
}
l
}
Position_Ritchie <- function(x){
Position(isTRUE, diff(x) == 1, right = TRUE) + 1
}
detect_index_Ritchie <- function(x){
purrr::detect_index(diff(x) == 1, isTRUE, .dir = "backward") + 1
}
diff_Thomas <- function(x){
max((2:length(x))[diff(x) == 1])
}
regex_Thomas <- function(x){
nchar(sub("(.*01).*", "\\1", paste0(x, collapse = "")))
}
regexpr_Thomas <- function(x){
attr(regexpr(".*(?<=0)1", paste0(x,collapse = ""), perl = TRUE), "match.length")
}
英文:
For completeness, here is the benchmark with a vector of size 30001. Feel free to update this if needed.
x <- c(rep(0,14736),rep(1,413),rep(0,830),rep(1,961),rep(0,274),rep(1,12787))
microbenchmark::microbenchmark(rle_zx8754(x),
rle_Grothendieck(x),
which_diff_Maël(x),
uniqlist_Viking(x),
while_Ritchie(x),
#Position_Ritchie(x),
#detect_index_Ritchie(x),
diff_Thomas(x),
#regex_Thomas(x),
#regexpr_Thomas(x),
times = 1000, check='equal')
Unit: microseconds
expr min lq mean median uq
rle_zx8754(x) 339.5 350.45 783.9827 357.45 375.15
rle_Grothendieck(x) 352.7 364.75 616.2324 372.60 391.75
which_diff_Maël(x) 264.2 274.60 404.5521 279.50 292.00
uniqlist_Viking(x) 16.7 22.30 32.1502 25.40 30.65
while_Ritchie(x) 777.6 785.60 1021.0738 801.95 847.15
diff_Thomas(x) 279.4 286.90 500.6373 291.20 306.35
max neval cld
156630.3 1000 cd
11196.5 1000 bc
7263.2 1000 b
3524.9 1000 a
6739.7 1000 d
9435.5 1000 b
functions:
x <- c(rep(0,14736),rep(1,413),rep(0,830),rep(1,961),rep(0,274),rep(1,12787))
rle_zx8754 <- function(x){
r <- rle(x)
ix <- max(which(r$values == 1))
sum(r$lengths[ 1:(ix - 1) ]) + 1
}
which_diff_Maël <- function(x){
idx <- which(x == 1)
idx[tail(which(diff(idx) != 1), 1) + 1]
}
rle_Grothendieck <- function(x){
with(rle(x), tail((cumsum(lengths) - lengths + 1)[values == 1], 1))
}
uniqlist_Viking <- function(x){
y <- data.table:::uniqlist(list(x))
ifelse(x[y[length(y)]] == 1, y[length(y)], y[length(y) - 1])
}
while_Ritchie <- function(x){
l <- length(x)
while (x[l] - x[l - 1] != 1) {
l <- l - 1
}
l
}
Position_Ritchie <- function(x){
Position(isTRUE, diff(x) == 1, right = TRUE) + 1
}
detect_index_Ritchie <- function(x){
purrr::detect_index(diff(x) == 1, isTRUE, .dir = "backward") + 1
}
diff_Thomas <- function(x){
max((2:length(x))[diff(x) == 1])
}
regex_Thomas <- function(x){
nchar(sub("(.*01).*", "\\1", paste0(x, collapse = "")))
}
regexpr_Thomas <- function(x){
attr(regexpr(".*(?<=0)1", paste0(x,collapse = ""), perl = TRUE), "match.length")
}
答案6
得分: 12
一个简单的while
循环将是一个(可能非常)快速的方法,其中所寻找的索引位于向量的末尾。
f <- function(x) {
l <- length(x)
while (x[l] - x[l - 1] != 1) {
l <- l - 1
}
l
}
我们也可以使用Position()
或purrr
的等效函数detect_index()
:
Position(isTRUE, diff(test1) == 1, right = TRUE) + 1
或者
purrr::detect_index(diff(test1) == 1, isTRUE, .dir = "backward") + 1
英文:
A simple while
loop will be a (potentially very) fast approach where the sought index is towards the end of the vector.
f <- function(x) {
l <- length(x)
while (x[l] - x[l - 1] != 1) {
l <- l - 1
}
l
}
f(test1)
[1] 36
f(test2)
[1] 29
We could also use Position()
or the purrr
equivalent detect_index()
:
Position(isTRUE, diff(test1) == 1, right = TRUE) + 1
[1] 36
purrr::detect_index(diff(test1) == 1, isTRUE, .dir = "backward") + 1
[1] 36
答案7
得分: 9
-
'regex' approaches
您可以尝试使用正则表达式,像是sub
+ nchar
:
f1 <- function(v) nchar(sub("(.*01).*", "\\1", paste0(v, collapse = "")))
或者regexpr
:
f2 <- function(v) attr(regexpr(".*(?<=0)1", paste0(v, collapse = ""), perl = TRUE), "match.length")
-
'diff' approaches
或者,您可以尝试其他的diff
方法,比如:
f3 <- function(v) tail(which(diff(v) == 1) + 1, 1)
以及
f4 <- function(v) max((2:length(v))[diff(v) == 1])
英文:
I believe you have many ways to do it, and below are some possible approaches:
-
regex
approaches
You can try regex
, like sub
+ nchar
f1 <- function(v) nchar(sub("(.*01).*", "\\1", paste0(v, collapse = "")))
or regexpr
f2 <- function(v) attr(regexpr(".*(?<=0)1", paste0(v,collapse = ""), perl = TRUE), "match.length")
-
diff
approaches
Or, some other diff
options, like
f3 <- function(v) tail(which(diff(v) == 1) + 1, 1)
and
f4 <- function(v) max((2:length(v))[diff(v) == 1])
答案8
得分: 9
另一种方法是使用 rev
和 match
。rev
反转向量,这样 match
就可以用于查找最后一个1序列。
或者使用 Rcpp
编写一个函数,实现相同的功能,但可以从末尾开始迭代。
或者使用 rev
、diff
和 match
。
基准测试显示,Rcpp 函数是最快的,分配的内存最少。其性能取决于能够找到匹配的位置。
英文:
Another way using rev
and match
.
rev
reverses the vector, so that match
, which returns the first hit, can be used to find the last 1 sequence.
f <- \(x) {
. <- rev(x)
i <- match(1, .)
if(is.na(i)) return(NA)
j <- match(0, tail(., -i))
if(is.na(j)) 1
else length(.) - i - j + 2 }
f(test1)
#[1] 36
f(test2)
#[1] 29
f(c(1,1))
#[1] 1
f(c(0,1))
#[1] 2
f(c(1,0))
#[1] 1
f(c(0,0))
#[1] NA
Or write a function using Rcpp
doing the same but can iterate starting from the end.
Rcpp::cppFunction("int f2(NumericVector x) {
auto i = x.end();
while(i != x.begin() && *(--i) != 1.) ;
while(i != x.begin() && *(--i) == 1.) ;
if(*i != 1.) ++i;
return i == x.end() || *i != 1. ? 0 : i - x.begin() + 1;
}")
f2(test1)
#[1] 36
f2(test2)
#[1] 29
f2(c(1,1))
#[1] 1
f2(c(0,1))
#[1] 2
f2(c(1,0))
#[1] 1
f2(c(0,0))
#[1] 0
Or using rev
, diff
and match
.
f3 <- \(x) {
i <- match(-1, diff(rev(x)))
if(is.finite(i)) length(x) - i + 1
else if(x[1] == 1) 1
else NA
}
f3(test1)
#[1] 36
f3(test2)
#[1] 29
f3(c(1,1))
#[1] 1
f3(c(0,1))
#[1] 2
f3(c(1,0))
#[1] 1
f3(c(0,0))
#[1] NA
Benchmark
uniqlist <- function(x) { #M.Viking
y <- data.table:::uniqlist(list(x))
ifelse(x[y[length(y)]] == 1, y[length(y)], y[length(y) - 1]) }
which_diff <- function(x) { #Maël
idx <- which(x == 1)
idx[tail(which(c(0, diff(idx)) != 1), 1)] }
# Dataset from question
x <- rep(c(0,1,0,1,0,1), c(14736,413,830,961,274,12787))
bench::mark(max_iterations = 1e5, f(x), f3(x), which_diff(x),
uniqlist(x), f2(x) )
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 f(x) 199.07µs 251.5µs 3412. 1.21MB 76.3 1341 30
#2 f3(x) 218.05µs 319.61µs 3144. 1.76MB 117. 1079 40
#3 which_diff(x) 155.01µs 177.53µs 5518. 954.17KB 103. 2296 43
#4 uniqlist(x) 17.04µs 17.72µs 55386. 1.36MB 4.04 27442 2
#5 f2(x) 5.61µs 6.13µs 161213. 2.49KB 6.16 78462 3
# Data with many changes between 0 and 1 and hit at end
x <- rep(c(0,1), 1e6)
bench::mark(max_iterations = 1e5, f(x), f3(x), which_diff(x),
uniqlist(x), f2(x) )
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 f(x) 17.97ms 19.86ms 44.6 76.29MB 50.5 23 26
#2 f3(x) 28.77ms 32.78ms 25.6 114.44MB 52.9 14 29
#3 which_diff(x) 14.47ms 16.91ms 52.3 68.67MB 67.8 27 35
#4 uniqlist(x) 2.66ms 3ms 294. 7.63MB 27.8 148 14
#5 f2(x) 1.08µs 1.28µs 701103. 2.49KB 21.0 100000 3
# Data where hit is at beginning
x <- c(0,1,rep(0, 1e6))
bench::mark(max_iterations = 1e5, f(x), f3(x), which_diff(x),
uniqlist(x), f2(x) )
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 f(x) 4.34ms 6.6ms 131. 19.11MB 84.6 71 46
#2 f3(x) 15.1ms 18.73ms 35.9 57.24MB 75.7 18 38
#3 which_diff(x) 1.37ms 1.44ms 529. 7.63MB 93.9 265 47
#4 uniqlist(x) 470.91µs 491.54µs 1994. 1.36MB 0 997 0
#5 f2(x) 364.46µs 375.08µs 2649. 2.49KB 0 1325 0
# Data where hit is at end
x <- c(rep(0, 1e6),1,0)
bench::mark(max_iterations = 1e5, f(x), f3(x), which_diff(x),
uniqlist(x), f2(x) )
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 f(x) 10.53ms 11.33ms 69.8 38.18MB 91.8 35 46
#2 f3(x) 14.19ms 17.18ms 37.6 57.24MB 69.3 19 35
#3 which_diff(x) 1.38ms 1.49ms 512. 7.63MB 77.9 256 39
#4 uniqlist(x) 479.76µs 491.61µs 1997. 1.36MB 0 999 0
#5 f2(x) 1.08µs 1.28µs 683440. 2.49KB 27.3 100000 4
The Rcpp function is the fastest and allocates the lowest amount of memory. Its performance depends where the match could be found.
答案9
得分: 4
也许不是最佳方法,但只是为了易于理解的替代方法
```r
data.frame(var1=c(rep(0,20),rep(1,5),rep(0,10),rep(1,15))) %>%
mutate(new=rleid(var1), row=row_number()) %>%
filter(var1==1 & max(new)==new) %>%
slice_head(n=1) %>%
select(row)
# 输出
row
1 36
<details>
<summary>英文:</summary>
May not be the best but just an alternate for easy understanding
```r
data.frame(var1=c(rep(0,20),rep(1,5),rep(0,10),rep(1,15))) %>%
mutate(new=rleid(var1), row=row_number()) %>%
filter(var1==1 & max(new)==new) %>%
slice_head(n=1) %>%
select(row)
# output
row
1 36
答案10
得分: 4
我们还可以使用data.table
中的rleid
函数:
library(data.table)
i1 <- rleid(test1)
min(which(i1 == max(i1[test1 == 1])))
# [1] 36
i1 <- rleid(test2)
min(which(i1 == max(i1[test2 == 1])))
# [1] 29
英文:
We can also use rleid
from data.table
:
library(data.table)
i1 <- rleid(test1)
min(which(i1 == max(i1[test1 == 1])))
# [1] 36
i1 <- rleid(test2)
min(which(i1 == max(i1[test2 == 1])))
# [1] 29
答案11
得分: 3
Using data.table::rleidv()
rle_seq <- data.table::rleidv(test2)
rle_ones <- rle_seq[test2 != 0]
which_id_last <- rle_ones[length(rle_ones)]
which(rle_seq == which_id_last)[1L]
[1] 30001
英文:
Using data.table::rleidv()
rle_seq <- data.table::rleidv(test2)
rle_ones <- rle_seq[test2 != 0]
which_id_last <- rle_ones[length(rle_ones)]
which(rle_seq == which_id_last)[1L]
[1] 30001
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论