寻找二进制列中的模式 r

huangapple go评论76阅读模式
英文:

looking for patterns in binary columns r

问题

我需要找到并计算在3个或更多连续的零后出现1的ID。

这是我所拥有的示例:

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    0
#   3   0    0   0    0   0   0   0   0    0    1
#   4   1    0   0    1   0   1   0   1    0    1
#   5   0    0   1    0   0   1   1   0    0    1
c1 <- c("ID", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct")
c2 <- c(1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0)
c3 <- c(2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0)
c4 <- c(3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)
c5 <- c(4, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1)
c6 <- c(5, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1)
BD <- data.frame(rbind(c2, c3, c4, c5, c6))
colnames(BD) <- c1

我期望的结果类似于这样:

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    1
#   3   0    0   0    0   0   0   0   0    0    1

有人知道如何做吗?谢谢!

英文:

I need to find and count the ID's that appear with a 1 after 3 or more consecutive zeros.

This is a example of what I have:

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    0
#   3   0    0   0    0   0   0   0   0    0    1
#   4   1    0   0    1   0   1   0   1    0    1
#   5   0    0   1    0   0   1   1   0    0    1

c1&lt;- c(&quot;ID&quot;,&quot;Jan&quot;,&quot;Feb&quot;, &quot;Mar&quot;,&quot;Apr&quot;, &quot;May&quot;,&quot;Jun&quot;, &quot;Jul&quot;, &quot;Aug&quot;, &quot;Sept&quot;, &quot;Oct&quot;)
c2&lt;-  c(1,0,0,0,1,0,0,1,1,1,0)
c3&lt;- c(2,0,0,0,0,0,0,1,0,0,0)
c4&lt;- c(3,0,0,0,0,0,0,0,0,0,1)
c5&lt;- c(4,1,0,0,1,0,1,0,1,0,1)
c6&lt;- c(5,0,0,1,0,0,1,1,0,0,1)
BD&lt;-data.frame(rbind(c2,c3,c4,c5,c6))
colnames(BD)&lt;-c1

The result of what I expect is something like this:

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    1
#   3   0    0   0    0   0   0   0   0    0    1

Anyone know how to do it? Thanks!

答案1

得分: 2

如果您取向量 xrowid(rleid(x)),您将得到每个元素进入每个“run”中的步骤数。您可以检查这是否 >= 3 且元素为 0。如果对于前一个元素(对于移位输出)这是真的,并且元素为 1,则返回 TRUE。然后检查是否对于行中的任何元素都为 TRUE。

library(data.table)

rows <- 
  apply(BD, 1, function(r) any(shift(rowid(rleid(r)) >= 3 & r == 0) & r == 1))

BD[rows,]
#    ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# c2  1   0   0   0   1   0   0   1   1    1   0
# c3  2   0   0   0   0   0   0   1   0    0   0
# c4  3   0   0   0   0   0   0   0   0    0   1

这是特定行的示例(第一行):

rbind(
  rowid_rleid = rowid(rleid(unlist(BD[1,]))),
  original = unlist(BD[1,]))
#             ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# rowid_rleid  1   1   2   3   1   1   2   1   2    3   1
# original     1   0   0   0   1   0   0   1   1    1   0

希望这可以帮助您理解代码的翻译部分。

英文:

If you take the rowid(rleid(x)) of a vector x you get the number of steps into each "run" each element is*. You can check that this is >= 3 and the element is 0. If that is true for the previous element (for the shifted output) and the element is 1, return TRUE. Then check if this is TRUE for any of the elements in the row.

library(data.table)

rows &lt;- 
  apply(BD, 1, function(r) any(shift(rowid(rleid(r)) &gt;= 3 &amp; r == 0) &amp; r == 1))

BD[rows,]
#    ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# c2  1   0   0   0   1   0   0   1   1    1   0
# c3  2   0   0   0   0   0   0   1   0    0   0
# c4  3   0   0   0   0   0   0   0   0    0   1

* Here's an example for a particular row (the first)

rbind(
  rowid_rleid = rowid(rleid(unlist(BD[1,]))),
  original = unlist(BD[1,]))

#             ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# rowid_rleid  1   1   2   3   1   1   2   1   2    3   1
# original     1   0   0   0   1   0   0   1   1    1   0

答案2

得分: 2

你可以折叠成字符串并使用 grep() 搜索模式。

k <- 3

grep(sprintf(paste0("%0", k + 1, "d"), 1), apply(d[-1], 1, paste, collapse=""))
# [1] 2 4 5 6 8

如果不需要后面的 1,你可以使用 rle()

d
#     id Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# c1   1   1   1   0   1   1   0   0   1   1   1   0   0
# c2   2   0   0   0   1   1   1   0   1   1   0   1   0
# c3   3   1   0   0   1   1   0   1   1   1   0   1   0
# c4   4   0   0   0   0   0   1   1   0   0   1   1   0
# c5   5   0   0   0   1   1   1   1   0   0   1   0   1
# c6   6   1   0   0   0   1   0   1   0   0   0   0   1
# c7   7   0   1   0   0   1   0   1   1   1   0   0   1
# c8   8   0   1   1   1   1   1   1   1   0   0   0   1
# c9   9   0   1   0   0   1   1   0   0   1   1   1   0
# c10 10   1   1   0   1   0   1   1   0   0   1   0   1

k <- 3
d$id[sapply(as.data.frame(t(d[-1])), function(x) any(rle(x)$lengths[rle(x)$values == 0] >= k))]
# [1] 2 4 5 6 8

数据:

set.seed(0)
d <- data.frame(id=1:10, 
                  `dimnames<-`(matrix(sample(0:1, 120, r=1), 10), 
                               list(paste0("c", 1:10), month.abb)))
英文:

You could collapse to string and use grep() to search for pattern.

k &lt;- 3

grep(sprintf(paste0(&quot;%0&quot;, k + 1, &quot;d&quot;), 1), apply(d[-1], 1, paste, collapse=&quot;&quot;))
# [1] 2 4 5 6 8

If no following 1 is needed you could use the rle().

d
#     id Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# c1   1   1   1   0   1   1   0   0   1   1   1   0   0
# c2   2   0   0   0   1   1   1   0   1   1   0   1   0
# c3   3   1   0   0   1   1   0   1   1   1   0   1   0
# c4   4   0   0   0   0   0   1   1   0   0   1   1   0
# c5   5   0   0   0   1   1   1   1   0   0   1   0   1
# c6   6   1   0   0   0   1   0   1   0   0   0   0   1
# c7   7   0   1   0   0   1   0   1   1   1   0   0   1
# c8   8   0   1   1   1   1   1   1   1   0   0   0   1
# c9   9   0   1   0   0   1   1   0   0   1   1   1   0
# c10 10   1   1   0   1   0   1   1   0   0   1   0   1

k &lt;- 3
d$id[sapply(as.data.frame(t(d[-1])), function(x) any(rle(x)$lengths[rle(x)$values == 0] &gt;= k))]
# [1] 2 4 5 6 8

Data:

set.seed(0)
d &lt;- data.frame(id=1:10, 
                  `dimnames&lt;-`(matrix(sample(0:1, 120, r=1), 10), 
                               list(paste0(&quot;c&quot;, 1:10), month.abb)))

答案3

得分: 1

你可以将行合并成字符串,然后使用正则表达式匹配 '0001':

library(tidyverse)
rows = BD %>%
  purrr::pmap(function(...) paste0(list(...)[-1], collapse='')) %>%
  stringr::str_detect('0001')
BD[rows,]
英文:

You can consolidate the rows into strings and use a regular expression to match '0001':

library(tidyverse)
rows = BD %&gt;% 
  purrr::pmap(function(...) paste0(list(...)[-1], collapse=&#39;&#39;)) %&gt;% 
  stringr::str_detect(&#39;0001&#39;)
BD[rows,]

答案4

得分: 1

以下是一个基于R语言的解决方案,可以实现此操作:

BDout <- subset(BD, apply(BD[-1], 1, function(x) head(which(x == 1), 1)) > 3)

这将返回一个名为BDout的数据框,其中包含符合条件的数据。

英文:

Here is a base R solution that can make it

BDout &lt;- subset(BD,apply(BD[-1], 1, function(x) head(which(x==1),1))&gt;3)

such that

&gt; BDout
  ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
1  1   0   0   0   1   0   0   1   1    1   0
2  2   0   0   0   0   0   0   1   0    0   0
3  3   0   0   0   0   0   0   0   0    0   1

答案5

得分: 1

使用data.table来融合和筛选符合条件的行的选项。

library(data.table)
setDT(BD)[ID %in%
    melt(BD, id.vars="ID")[, 
        mth := .GRP, variable][
            value==1L, ID[mth[1L]>3L | any(diff(mth) > 3L)], ID]$V1
]

对于具有稀疏数据的大型数据集,这应该更快。

英文:

An option using data.table to melt and filter for rows that match conditions.

library(data.table)
setDT(BD)[ID %in%
    melt(BD, id.vars=&quot;ID&quot;)[, 
        mth := .GRP, variable][
            value==1L, ID[mth[1L]&gt;3L | any(diff(mth) &gt; 3L)], ID]$V1
]

It should be faster for large datasets with sparse data.

huangapple
  • 本文由 发表于 2020年1月3日 21:59:06
  • 转载请务必保留本文链接:https://go.coder-hub.com/59579854.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定