Update run-length ID but skip NA.

huangapple go评论57阅读模式
英文:

Update run-length ID but skip NA

问题

给定这种类型的数据:

# Example data: `Sequ` contains runs of equal values separated by NA gaps;
# `Q` is NA on exactly the rows where `Sequ` is NA.
df <- data.frame(
  ID = seq_len(10),
  Sequ = rep(c(NA, 44, NA, 33, 5), times = c(1, 2, 2, 3, 2)),
  Q = rep(c(NA, "q1", NA, "q2"), times = c(1, 2, 2, 5))
)

如何以比以下操作更高效的方式来更新 Sequ 的 run-length ID:

library(dplyr)
library(data.table)
# Number the runs on the NA-free rows only, then join the IDs back onto the
# full table (rows removed by the filter find no match and get NA) and drop
# the original `Sequ` column.
df %>%
  left_join(
    df %>%
      filter(!is.na(Sequ)) %>%
      mutate(Sequ_0 = rleid(Sequ))
  ) %>%
  select(-Sequ)

备注:尽管我使用了 data.table 中的 rleid,但我正在寻找 tidyverse 的解决方案。

英文:

Given this type of data:

# Example data: `Sequ` contains runs of equal values separated by NA gaps;
# `Q` is NA on exactly the rows where `Sequ` is NA.
df <- data.frame(
  ID = seq_len(10),
  Sequ = rep(c(NA, 44, NA, 33, 5), times = c(1, 2, 2, 3, 2)),
  Q = rep(c(NA, "q1", NA, "q2"), times = c(1, 2, 2, 5))
)

how can I update the run-length ID of Sequ more efficiently than by doing this:

library(dplyr)
library(data.table)
# Number the runs on the NA-free rows only, then join the IDs back onto the
# full table (rows removed by the filter find no match and get NA) and drop
# the original `Sequ` column.
df %>%
  left_join(
    df %>%
      filter(!is.na(Sequ)) %>%
      mutate(Sequ_0 = rleid(Sequ))
  ) %>%
  select(-Sequ)
   ID    Q Sequ_0
1   1 <NA>     NA
2   2   q1      1
3   3   q1      1
4   4 <NA>     NA
5   5 <NA>     NA
6   6   q2      2
7   7   q2      2
8   8   q2      2
9   9   q2      3
10 10   q2      3

Note: although I'm using rleid from data.table I'm looking for a tidyverse solution.

答案1

得分: 5

    df %>%
      mutate(
        # Blank out the run IDs on rows where `Q` is NA, then re-rank the
        # remaining IDs so they are consecutive again (NA stays NA).
        Sequ_0 = dense_rank(
          if_else(is.na(Q), NA_integer_, consecutive_id(Sequ))
        )
      )

       ID Sequ    Q Sequ_0
    1   1   NA <NA>     NA
    2   2   44   q1      1
    3   3   44   q1      1
    4   4   NA <NA>     NA
    5   5   NA <NA>     NA
    6   6   33   q2      2
    7   7   33   q2      2
    8   8   33   q2      2
    9   9    5   q2      3
    10 10    5   q2      3

    df %>%
      mutate(
        # Set the run IDs to NA wherever `Q` is NA, then re-rank the rest so
        # the surviving IDs are consecutive (NA stays NA).
        Sequ_0 = dense_rank(replace(consecutive_id(Sequ), is.na(Q), NA))
      )

    df %>%
      mutate(
        # Number the runs of the non-missing `Sequ` values and write them
        # into an all-NA integer vector at the non-missing positions.
        # Starting from NA_integer_ (rather than the character column `Q`)
        # keeps the IDs integer, and masking on `Sequ` itself guarantees the
        # replacement always has the right length.
        Sequ_0 = replace(
          rep(NA_integer_, n()),
          !is.na(Sequ),
          consecutive_id(Sequ[!is.na(Sequ)])
        )
      )
英文:
df %>%
  mutate(
    # Blank out the run IDs on rows where `Q` is NA, then re-rank the
    # remaining IDs so they are consecutive again (NA stays NA).
    Sequ_0 = dense_rank(
      if_else(is.na(Q), NA_integer_, consecutive_id(Sequ))
    )
  )

   ID Sequ    Q Sequ_0
1   1   NA <NA>     NA
2   2   44   q1      1
3   3   44   q1      1
4   4   NA <NA>     NA
5   5   NA <NA>     NA
6   6   33   q2      2
7   7   33   q2      2
8   8   33   q2      2
9   9    5   q2      3
10 10    5   q2      3

df %>%
  mutate(
    # Set the run IDs to NA wherever `Q` is NA, then re-rank the rest so
    # the surviving IDs are consecutive (NA stays NA).
    Sequ_0 = dense_rank(replace(consecutive_id(Sequ), is.na(Q), NA))
  )

Also:

df %>%
  mutate(
    # Number the runs of the non-missing `Sequ` values and write them
    # into an all-NA integer vector at the non-missing positions.
    # Starting from NA_integer_ (rather than the character column `Q`)
    # keeps the IDs integer, and masking on `Sequ` itself guarantees the
    # replacement always has the right length.
    Sequ_0 = replace(
      rep(NA_integer_, n()),
      !is.na(Sequ),
      consecutive_id(Sequ[!is.na(Sequ)])
    )
  )

答案2

得分: 4

使用consecutive_id来获取ID,然后排除NA值,将其转换为因子,再转换回数字以获得连续的数字:

library(dplyr) # dplyr_1.1.0 - consecutive_id

df %>%
  mutate(
    # Run IDs of `Sequ`, blanked where `Sequ` is NA; the factor -> numeric
    # round trip then maps the surviving IDs onto 1, 2, 3, ...
    id = if_else(is.na(Sequ), NA, consecutive_id(Sequ)) %>%
      as.factor() %>%
      as.numeric()
  )
#    ID Sequ    Q id
# 1   1   NA <NA> NA
# 2   2   44   q1  1
# 3   3   44   q1  1
# 4   4   NA <NA> NA
# 5   5   NA <NA> NA
# 6   6   33   q2  2
# 7   7   33   q2  2
# 8   8   33   q2  2
# 9   9    5   q2  3
# 10 10    5   q2  3
英文:

Using consecutive_id to get ids, then exclude NAs, convert to factor, then back to numeric to have sequential numbers:

library(dplyr) # dplyr_1.1.0 - consecutive_id

df %>%
  mutate(
    # Run IDs of `Sequ`, blanked where `Sequ` is NA; the factor -> numeric
    # round trip then maps the surviving IDs onto 1, 2, 3, ...
    id = if_else(is.na(Sequ), NA, consecutive_id(Sequ)) %>%
      as.factor() %>%
      as.numeric()
  )
#    ID Sequ    Q id
# 1   1   NA <NA> NA
# 2   2   44   q1  1
# 3   3   44   q1  1
# 4   4   NA <NA> NA
# 5   5   NA <NA> NA
# 6   6   33   q2  2
# 7   7   33   q2  2
# 8   8   33   q2  2
# 9   9    5   q2  3
# 10 10    5   q2  3

答案3

得分: 3

这里是另一种使用 arrange 配合 consecutive_id 的选项:

library(dplyr) #dplyr >= 1.1.0
df %>%
  # Move the rows with a missing `Sequ` to the end. Sorting on is.na(Sequ)
  # rather than on the labels in `Q` keeps the remaining rows in their
  # original order, so the result does not depend on how `Q` happens to sort.
  arrange(is.na(Sequ)) %>%
  mutate(
    # The non-missing rows now come first, so their run IDs start at 1;
    # the trailing NA rows are blanked.
    Sequ_0 = if_else(is.na(Sequ), NA_integer_, consecutive_id(Sequ))
  ) %>%
  # Restore the original row order.
  arrange(ID)

   ID Sequ    Q Sequ_0
1   1   NA <NA>     NA
2   2   44   q1      1
3   3   44   q1      1
4   4   NA <NA>     NA
5   5   NA <NA>     NA
6   6   33   q2      2
7   7   33   q2      2
8   8   33   q2      2
9   9    5   q2      3
10 10    5   q2      3
英文:

Here is another option using arrange with consecutive_id:

library(dplyr) # dplyr >= 1.1.0
df %>%
  # Move the rows with a missing `Sequ` to the end. Sorting on is.na(Sequ)
  # rather than on the labels in `Q` keeps the remaining rows in their
  # original order, so the result does not depend on how `Q` happens to sort.
  arrange(is.na(Sequ)) %>%
  mutate(
    # The non-missing rows now come first, so their run IDs start at 1;
    # the trailing NA rows are blanked.
    Sequ_0 = if_else(is.na(Sequ), NA_integer_, consecutive_id(Sequ))
  ) %>%
  # Restore the original row order.
  arrange(ID)

   ID Sequ    Q Sequ_0
1   1   NA <NA>     NA
2   2   44   q1      1
3   3   44   q1      1
4   4   NA <NA>     NA
5   5   NA <NA>     NA
6   6   33   q2      2
7   7   33   q2      2
8   8   33   q2      2
9   9    5   q2      3
10 10    5   q2      3

答案4

得分: 1

以下是代码的翻译部分:

一个选项是使用 match 配合 unique 来创建 ID,如下所示:

library(tidyverse)
# Label each distinct `Sequ` value by the position of its first appearance
# among the complete rows, then join the labels back onto the full table.
# Caveat: match()/unique() numbers distinct values, not runs, so a value that
# reappears in a later run receives the same ID as its first run.
left_join(
  df,
  df %>%
    drop_na() %>%
    mutate(Sequ_0 = match(Sequ, unique(Sequ)))
)
#> Joining with `by = join_by(ID, Sequ, Q)`
#>    ID Sequ    Q Sequ_0
#> 1   1   NA <NA>     NA
#> 2   2   44   q1      1
#> 3   3   44   q1      1
#> 4   4   NA <NA>     NA
#> 5   5   NA <NA>     NA
#> 6   6   33   q2      2
#> 7   7   33   q2      2
#> 8   8   33   q2      2
#> 9   9    5   q2      3
#> 10 10    5   q2      3

创建于2023年03月09日,使用reprex v2.0.2

英文:

An option could be using match with unique to create the ID like this:

library(tidyverse)
# Label each distinct `Sequ` value by the position of its first appearance
# among the complete rows, then join the labels back onto the full table.
# Caveat: match()/unique() numbers distinct values, not runs, so a value that
# reappears in a later run receives the same ID as its first run.
left_join(
  df,
  df %>%
    drop_na() %>%
    mutate(Sequ_0 = match(Sequ, unique(Sequ)))
)
#> Joining with `by = join_by(ID, Sequ, Q)`
#>    ID Sequ    Q Sequ_0
#> 1   1   NA <NA>     NA
#> 2   2   44   q1      1
#> 3   3   44   q1      1
#> 4   4   NA <NA>     NA
#> 5   5   NA <NA>     NA
#> 6   6   33   q2      2
#> 7   7   33   q2      2
#> 8   8   33   q2      2
#> 9   9    5   q2      3
#> 10 10    5   q2      3

Created on 2023-03-09 with reprex v2.0.2

答案5

得分: 1

在基本R中:

```R
# Run-length encode `Sequ`, renumber the non-NA runs 1, 2, 3, ... in order of
# appearance (NA runs stay NA), and expand back to one value per row.
# The IDs are built with seq_len() on the count of non-NA runs instead of
# seq() on the values: with a single non-NA run of value v, seq(v) would
# expand to 1:v rather than the single ID 1.
transform(
  df,
  seq0 = with(
    rle(Sequ),
    rep(replace(values, !is.na(values), seq_len(sum(!is.na(values)))), lengths)
  )
)

       ID Sequ    Q seq0
    1   1   NA <NA>   NA
    2   2   44   q1    1
    3   3   44   q1    1
    4   4   NA <NA>   NA
    5   5   NA <NA>   NA
    6   6   33   q2    2
    7   7   33   q2    2
    8   8   33   q2    2
    9   9    5   q2    3
    10 10    5   q2    3
```
---
回到基础:

```R
# Run-length encode `Sequ`, overwrite the non-NA run values with 1, 2, 3, ...
# (NA runs stay NA), then expand the runs back to one value per row.
a <- rle(df$Sequ)
a$values <- replace(
  a$values,
  !is.na(a$values),
  seq_len(sum(!is.na(a$values)))
)
cbind(df, sequ_0 = inverse.rle(a))

       ID Sequ    Q sequ_0
    1   1   NA <NA>     NA
    2   2   44   q1      1
    3   3   44   q1      1
    4   4   NA <NA>     NA
    5   5   NA <NA>     NA
    6   6   33   q2      2
    7   7   33   q2      2
    8   8   33   q2      2
    9   9    5   q2      3
    10 10    5   q2      3
```
英文:

in base R:

# Run-length encode `Sequ`, renumber the non-NA runs 1, 2, 3, ... in order of
# appearance (NA runs stay NA), and expand back to one value per row.
# The IDs are built with seq_len() on the count of non-NA runs instead of
# seq() on the values: with a single non-NA run of value v, seq(v) would
# expand to 1:v rather than the single ID 1.
transform(
  df,
  seq0 = with(
    rle(Sequ),
    rep(replace(values, !is.na(values), seq_len(sum(!is.na(values)))), lengths)
  )
)

   ID Sequ    Q seq0
1   1   NA <NA>   NA
2   2   44   q1    1
3   3   44   q1    1
4   4   NA <NA>   NA
5   5   NA <NA>   NA
6   6   33   q2    2
7   7   33   q2    2
8   8   33   q2    2
9   9    5   q2    3
10 10    5   q2    3

Back to basics:

# Run-length encode `Sequ`, overwrite the non-NA run values with 1, 2, 3, ...
# (NA runs stay NA), then expand the runs back to one value per row.
a <- rle(df$Sequ)
a$values <- replace(
  a$values,
  !is.na(a$values),
  seq_len(sum(!is.na(a$values)))
)
cbind(df, sequ_0 = inverse.rle(a))

   ID Sequ    Q sequ_0
1   1   NA <NA>     NA
2   2   44   q1      1
3   3   44   q1      1
4   4   NA <NA>     NA
5   5   NA <NA>     NA
6   6   33   q2      2
7   7   33   q2      2
8   8   33   q2      2
9   9    5   q2      3
10 10    5   q2      3

huangapple
  • 本文由 发表于 2023年3月9日 16:10:37
  • 转载请务必保留本文链接:https://go.coder-hub.com/75681910.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定