2020年1月3日 21:59:06go评论105阅读模式

英文:

looking for patterns in binary columns r

问题

我需要找到并计算在3个或更多连续的零后出现1的ID。

这是我所拥有的示例：

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    0
#   3   0    0   0    0   0   0   0   0    0    1
#   4   1    0   0    1   0   1   0   1    0    1
#   5   0    0   1    0   0   1   1   0    0    1
c1 <- c("ID", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct")
c2 <- c(1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0)
c3 <- c(2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0)
c4 <- c(3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)
c5 <- c(4, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1)
c6 <- c(5, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1)
BD <- data.frame(rbind(c2, c3, c4, c5, c6))
colnames(BD) <- c1

我期望的结果类似于这样：

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    1
#   3   0    0   0    0   0   0   0   0    0    1

有人知道如何做吗？谢谢！

英文:

I need to find and count the ID's that appear with a 1 after 3 or more consecutive zeros.

This is a example of what I have:

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    0
#   3   0    0   0    0   0   0   0   0    0    1
#   4   1    0   0    1   0   1   0   1    0    1
#   5   0    0   1    0   0   1   1   0    0    1
c1&lt;- c(&quot;ID&quot;,&quot;Jan&quot;,&quot;Feb&quot;, &quot;Mar&quot;,&quot;Apr&quot;, &quot;May&quot;,&quot;Jun&quot;, &quot;Jul&quot;, &quot;Aug&quot;, &quot;Sept&quot;, &quot;Oct&quot;)
c2&lt;-  c(1,0,0,0,1,0,0,1,1,1,0)
c3&lt;- c(2,0,0,0,0,0,0,1,0,0,0)
c4&lt;- c(3,0,0,0,0,0,0,0,0,0,1)
c5&lt;- c(4,1,0,0,1,0,1,0,1,0,1)
c6&lt;- c(5,0,0,1,0,0,1,1,0,0,1)
BD&lt;-data.frame(rbind(c2,c3,c4,c5,c6))
colnames(BD)&lt;-c1

The result of what I expect is something like this:

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    1
#   3   0    0   0    0   0   0   0   0    0    1

Anyone know how to do it? Thanks!

答案1

得分: 2

如果您取向量 x 的 rowid(rleid(x))，您将得到每个元素进入每个“run”中的步骤数。您可以检查这是否 >= 3 且元素为 0。如果对于前一个元素（对于移位输出）这是真的，并且元素为 1，则返回 TRUE。然后检查是否对于行中的任何元素都为 TRUE。

library(data.table)
rows <- 
  apply(BD, 1, function(r) any(shift(rowid(rleid(r)) >= 3 & r == 0) & r == 1))
BD[rows,]
#    ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# c2  1   0   0   0   1   0   0   1   1    1   0
# c3  2   0   0   0   0   0   0   1   0    0   0
# c4  3   0   0   0   0   0   0   0   0    0   1

这是特定行的示例（第一行）：

rbind(
  rowid_rleid = rowid(rleid(unlist(BD[1,]))),
  original = unlist(BD[1,]))
#             ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# rowid_rleid  1   1   2   3   1   1   2   1   2    3   1
# original     1   0   0   0   1   0   0   1   1    1   0

希望这可以帮助您理解代码的翻译部分。

英文:

If you take the rowid(rleid(x)) of a vector x you get the number of steps into each "run" each element is*. You can check that this is >= 3 and the element is 0. If that is true for the previous element (for the shifted output) and the element is 1, return TRUE. Then check if this is TRUE for any of the elements in the row.

library(data.table)
rows &lt;- 
  apply(BD, 1, function(r) any(shift(rowid(rleid(r)) &gt;= 3 &amp; r == 0) &amp; r == 1))
BD[rows,]
#    ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# c2  1   0   0   0   1   0   0   1   1    1   0
# c3  2   0   0   0   0   0   0   1   0    0   0
# c4  3   0   0   0   0   0   0   0   0    0   1

* Here's an example for a particular row (the first)

rbind(
  rowid_rleid = rowid(rleid(unlist(BD[1,]))),
  original = unlist(BD[1,]))
#             ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# rowid_rleid  1   1   2   3   1   1   2   1   2    3   1
# original     1   0   0   0   1   0   0   1   1    1   0

答案2

得分: 2

你可以折叠成字符串并使用 grep() 搜索模式。

k <- 3
grep(sprintf(paste0("%0", k + 1, "d"), 1), apply(d[-1], 1, paste, collapse=""))
# [1] 2 4 5 6 8

如果不需要后面的 1，你可以使用 rle()。

d
#     id Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# c1   1   1   1   0   1   1   0   0   1   1   1   0   0
# c2   2   0   0   0   1   1   1   0   1   1   0   1   0
# c3   3   1   0   0   1   1   0   1   1   1   0   1   0
# c4   4   0   0   0   0   0   1   1   0   0   1   1   0
# c5   5   0   0   0   1   1   1   1   0   0   1   0   1
# c6   6   1   0   0   0   1   0   1   0   0   0   0   1
# c7   7   0   1   0   0   1   0   1   1   1   0   0   1
# c8   8   0   1   1   1   1   1   1   1   0   0   0   1
# c9   9   0   1   0   0   1   1   0   0   1   1   1   0
# c10 10   1   1   0   1   0   1   1   0   0   1   0   1
k <- 3
d$id[sapply(as.data.frame(t(d[-1])), function(x) any(rle(x)$lengths[rle(x)$values == 0] >= k))]
# [1] 2 4 5 6 8

数据：

set.seed(0)
d <- data.frame(id=1:10, 
                  `dimnames<-`(matrix(sample(0:1, 120, r=1), 10), 
                               list(paste0("c", 1:10), month.abb)))

英文:

You could collapse to string and use grep() to search for pattern.

k &lt;- 3
grep(sprintf(paste0(&quot;%0&quot;, k + 1, &quot;d&quot;), 1), apply(d[-1], 1, paste, collapse=&quot;&quot;))
# [1] 2 4 5 6 8

If no following 1 is needed you could use the rle().

d
#     id Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# c1   1   1   1   0   1   1   0   0   1   1   1   0   0
# c2   2   0   0   0   1   1   1   0   1   1   0   1   0
# c3   3   1   0   0   1   1   0   1   1   1   0   1   0
# c4   4   0   0   0   0   0   1   1   0   0   1   1   0
# c5   5   0   0   0   1   1   1   1   0   0   1   0   1
# c6   6   1   0   0   0   1   0   1   0   0   0   0   1
# c7   7   0   1   0   0   1   0   1   1   1   0   0   1
# c8   8   0   1   1   1   1   1   1   1   0   0   0   1
# c9   9   0   1   0   0   1   1   0   0   1   1   1   0
# c10 10   1   1   0   1   0   1   1   0   0   1   0   1
k &lt;- 3
d$id[sapply(as.data.frame(t(d[-1])), function(x) any(rle(x)$lengths[rle(x)$values == 0] &gt;= k))]
# [1] 2 4 5 6 8

Data:

set.seed(0)
d &lt;- data.frame(id=1:10, 
                  `dimnames&lt;-`(matrix(sample(0:1, 120, r=1), 10), 
                               list(paste0(&quot;c&quot;, 1:10), month.abb)))

答案3

得分: 1

你可以将行合并成字符串，然后使用正则表达式匹配 '0001'：

library(tidyverse)
rows = BD %>%
  purrr::pmap(function(...) paste0(list(...)[-1], collapse='')) %>%
  stringr::str_detect('0001')
BD[rows,]

英文:

You can consolidate the rows into strings and use a regular expression to match '0001':

library(tidyverse)
rows = BD %&gt;% 
  purrr::pmap(function(...) paste0(list(...)[-1], collapse=&#39;&#39;)) %&gt;% 
  stringr::str_detect(&#39;0001&#39;)
BD[rows,]

答案4

得分: 1

以下是一个基于R语言的解决方案，可以实现此操作：

BDout <- subset(BD, apply(BD[-1], 1, function(x) head(which(x == 1), 1)) > 3)

这将返回一个名为BDout的数据框，其中包含符合条件的数据。

英文:

Here is a base R solution that can make it

BDout &lt;- subset(BD,apply(BD[-1], 1, function(x) head(which(x==1),1))&gt;3)

such that

&gt; BDout
  ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
1  1   0   0   0   1   0   0   1   1    1   0
2  2   0   0   0   0   0   0   1   0    0   0
3  3   0   0   0   0   0   0   0   0    0   1

答案5

得分: 1

使用data.table来融合和筛选符合条件的行的选项。

library(data.table)
setDT(BD)[ID %in%
    melt(BD, id.vars="ID")[, 
        mth := .GRP, variable][
            value==1L, ID[mth[1L]>3L | any(diff(mth) > 3L)], ID]$V1
]

对于具有稀疏数据的大型数据集，这应该更快。

英文:

An option using data.table to melt and filter for rows that match conditions.

library(data.table)
setDT(BD)[ID %in%
    melt(BD, id.vars=&quot;ID&quot;)[, 
        mth := .GRP, variable][
            value==1L, ID[mth[1L]&gt;3L | any(diff(mth) &gt; 3L)], ID]$V1
]

It should be faster for large datasets with sparse data.

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

寻找二进制列中的模式 r

问题

答案1

答案2

答案3

答案4

答案5

使用字符串为tibble分配多个列类型。

一个在R中的循环函数

Y轴最小值在双轴图表上

使用`table()`与`for`循环

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

发表评论