Update run-length ID but skip NA.

huangapple go评论90阅读模式
英文:

Update run-length ID but skip NA

问题

给定这种类型的数据:

  1. df <- data.frame(
  2. ID = 1:10,
  3. Sequ = c(NA, 44, 44, NA, NA, 33, 33, 33, 5, 5),
  4. Q = c(NA, "q1", "q1", NA, NA, "q2", "q2", "q2", "q2", "q2")
  5. )

如何以比以下操作更高效的方式来更新 Sequ 的 run-length ID:

  1. library(dplyr)
  2. library(data.table)
  3. left_join(df, df %>%
  4. filter(!is.na(Sequ)) %>%
  5. mutate(Sequ_0 = rleid(Sequ))) %>%
  6. select(-Sequ)

备注:尽管我使用了 data.table 中的 rleid,但我正在寻找 tidyverse 的解决方案。

英文:

Given this type of data:

  1. df <- data.frame(
  2. ID = 1:10,
  3. Sequ = c(NA, 44,44, NA, NA, 33,33,33, 5,5),
  4. Q = c(NA, "q1","q1", NA, NA, "q2","q2","q2", "q2","q2")
  5. )

how can I update the run-length ID of Sequ more efficiently than by doing this:

  1. library(dplyr)
  2. library(data.table)
  3. left_join(df, df %>%
  4. filter(!is.na(Sequ)) %>%
  5. mutate(Sequ_0 = rleid(Sequ))) %>%
  6. select(-Sequ)
  7. ID Q Sequ_0
  8. 1 1 <NA> NA
  9. 2 2 q1 1
  10. 3 3 q1 1
  11. 4 4 <NA> NA
  12. 5 5 <NA> NA
  13. 6 6 q2 2
  14. 7 7 q2 2
  15. 8 8 q2 2
  16. 9 9 q2 3
  17. 10 10 q2 3

Note: although I'm using rleid from data.table I'm looking for a tidyverse solution.

答案1

得分: 5

  1. df %>%
  2. mutate(Sequ_0 = dense_rank(NA^is.na(Q)*consecutive_id(Sequ)))
  3. ID Sequ Q Sequ_0
  4. 1 1 NA <NA> NA
  5. 2 2 44 q1 1
  6. 3 3 44 q1 1
  7. 4 4 NA <NA> NA
  8. 5 5 NA <NA> NA
  9. 6 6 33 q2 2
  10. 7 7 33 q2 2
  11. 8 8 33 q2 2
  12. 9 9 5 q2 3
  13. 10 10 5 q2 3
  14. df %>%
  15. mutate(Sequ_0 = dense_rank(`is.na<-`(consecutive_id(Sequ), is.na(Q))))
  16. df %>%
  17. mutate(Sequ_0 = replace(Q, !is.na(Q), consecutive_id(na.omit(Sequ))))
英文:
  1. df %>%
  2. mutate(Sequ_0 = dense_rank(NA^is.na(Q)*consecutive_id(Sequ)))
  3. ID Sequ Q Sequ_0
  4. 1 1 NA <NA> NA
  5. 2 2 44 q1 1
  6. 3 3 44 q1 1
  7. 4 4 NA <NA> NA
  8. 5 5 NA <NA> NA
  9. 6 6 33 q2 2
  10. 7 7 33 q2 2
  11. 8 8 33 q2 2
  12. 9 9 5 q2 3
  13. 10 10 5 q2 3
  14. df %>%
  15. mutate(Sequ_0 = dense_rank(`is.na<-`(consecutive_id(Sequ), is.na(Q))))

Also:

  1. df %>%
  2. mutate(Sequ_0 = replace(Q, !is.na(Q), consecutive_id(na.omit(Sequ))))

答案2

得分: 4

使用consecutive_id来获取ID,然后排除NA值,将其转换为因子,再转换回数字以获得连续的数字:

  1. library(dplyr) # dplyr_1.1.0 - consecutive_id
  2. df %>%
  3. mutate(id = as.numeric(as.factor(
  4. if_else(is.na(Sequ), NA, consecutive_id(Sequ)))))
  5. # ID Sequ Q id
  6. # 1 1 NA <NA> NA
  7. # 2 2 44 q1 1
  8. # 3 3 44 q1 1
  9. # 4 4 NA <NA> NA
  10. # 5 5 NA <NA> NA
  11. # 6 6 33 q2 2
  12. # 7 7 33 q2 2
  13. # 8 8 33 q2 2
  14. # 9 9 5 q2 3
  15. # 10 10 5 q2 3
英文:

Using consecutive_id to get ids, then exclude NAs, convert to factor, then back to numeric to have sequential numbers:

  1. library(dplyr) # dplyr_1.1.0 - consecutive_id
  2. df %>%
  3. mutate(id = as.numeric(as.factor(
  4. if_else(is.na(Sequ), NA, consecutive_id(Sequ)))))
  5. # ID Sequ Q id
  6. # 1 1 NA <NA> NA
  7. # 2 2 44 q1 1
  8. # 3 3 44 q1 1
  9. # 4 4 NA <NA> NA
  10. # 5 5 NA <NA> NA
  11. # 6 6 33 q2 2
  12. # 7 7 33 q2 2
  13. # 8 8 33 q2 2
  14. # 9 9 5 q2 3
  15. # 10 10 5 q2 3

答案3

得分: 3

这里是另一种使用 arrange 和 consecutive_id 的选项:

  1. library(dplyr) #dplyr >= 1.1.0
  2. df %>%
  3. arrange(Q) %>%
  4. mutate(Sequ_0 = consecutive_id(Sequ),
  5. Sequ_0 = ifelse(is.na(Sequ), NA_real_, Sequ_0)) %>%
  6. arrange(ID)
  7. ID Sequ Q Sequ_0
  8. 1 1 NA <NA> NA
  9. 2 2 44 q1 1
  10. 3 3 44 q1 1
  11. 4 4 NA <NA> NA
  12. 5 5 NA <NA> NA
  13. 6 6 33 q2 2
  14. 7 7 33 q2 2
  15. 8 8 33 q2 2
  16. 9 9 5 q2 3
  17. 10 10 5 q2 3
英文:

Here is another option using arrange with consecutive_id:

  1. library(dplyr) #dplyr >= 1.1.0
  2. df %>%
  3. arrange(Q) %>%
  4. mutate(Sequ_0 = consecutive_id(Sequ),
  5. Sequ_0 = ifelse(is.na(Sequ), NA_real_, Sequ_0)) %>%
  6. arrange(ID)
  7. ID Sequ Q Sequ_0
  8. 1 1 NA <NA> NA
  9. 2 2 44 q1 1
  10. 3 3 44 q1 1
  11. 4 4 NA <NA> NA
  12. 5 5 NA <NA> NA
  13. 6 6 33 q2 2
  14. 7 7 33 q2 2
  15. 8 8 33 q2 2
  16. 9 9 5 q2 3
  17. 10 10 5 q2 3

答案4

得分: 1

以下是代码的翻译部分:

一个选项是使用 match 和 unique 来创建 ID,如下所示:

  1. library(tidyverse)
  2. df %>%
  3. left_join(., df %>%
  4. drop_na() %>%
  5. mutate(Sequ_0 = match(Sequ, unique(Sequ))))
  6. #> Joining with `by = join_by(ID, Sequ, Q)`
  7. #> ID Sequ Q Sequ_0
  8. #> 1 1 NA <NA> NA
  9. #> 2 2 44 q1 1
  10. #> 3 3 44 q1 1
  11. #> 4 4 NA <NA> NA
  12. #> 5 5 NA <NA> NA
  13. #> 6 6 33 q2 2
  14. #> 7 7 33 q2 2
  15. #> 8 8 33 q2 2
  16. #> 9 9 5 q2 3
  17. #> 10 10 5 q2 3

创建于2023年03月09日,使用reprex v2.0.2

英文:

An option could be using match with unique to create the ID like this:

  1. library(tidyverse)
  2. df %>%
  3. left_join(., df %>%
  4. drop_na() %>%
  5. mutate(Sequ_0 = match(Sequ, unique(Sequ))))
  6. #> Joining with `by = join_by(ID, Sequ, Q)`
  7. #> ID Sequ Q Sequ_0
  8. #> 1 1 NA <NA> NA
  9. #> 2 2 44 q1 1
  10. #> 3 3 44 q1 1
  11. #> 4 4 NA <NA> NA
  12. #> 5 5 NA <NA> NA
  13. #> 6 6 33 q2 2
  14. #> 7 7 33 q2 2
  15. #> 8 8 33 q2 2
  16. #> 9 9 5 q2 3
  17. #> 10 10 5 q2 3

<sup>Created on 2023-03-09 with reprex v2.0.2</sup>

答案5

得分: 1

在基础 R 中:

  1. transform(df, seq0 = with(rle(Sequ),
  2. rep(`[<-`(values, !is.na(values), seq(na.omit(values))), lengths)))
  3. ID Sequ Q seq0
  4. 1 1 NA <NA> NA
  5. 2 2 44 q1 1
  6. 3 3 44 q1 1
  7. 4 4 NA <NA> NA
  8. 5 5 NA <NA> NA
  9. 6 6 33 q2 2
  10. 7 7 33 q2 2
  11. 8 8 33 q2 2
  12. 9 9 5 q2 3
  13. 10 10 5 q2 3

---

回到基础:

  1. a <- rle(df$Sequ)
  2. a$values[!is.na(a$values)] <- seq_along(na.omit(a$values))
  3. cbind(df, sequ_0 = inverse.rle(a))
  4. ID Sequ Q sequ_0
  5. 1 1 NA <NA> NA
  6. 2 2 44 q1 1
  7. 3 3 44 q1 1
  8. 4 4 NA <NA> NA
  9. 5 5 NA <NA> NA
  10. 6 6 33 q2 2
  11. 7 7 33 q2 2
  12. 8 8 33 q2 2
  13. 9 9 5 q2 3
  14. 10 10 5 q2 3
英文:

in base R:

  1. transform(df, seq0 = with(rle(Sequ),
  2. rep(`[<-`(values, !is.na(values), seq(na.omit(values))), lengths)))
  3. ID Sequ Q seq0
  4. 1 1 NA <NA> NA
  5. 2 2 44 q1 1
  6. 3 3 44 q1 1
  7. 4 4 NA <NA> NA
  8. 5 5 NA <NA> NA
  9. 6 6 33 q2 2
  10. 7 7 33 q2 2
  11. 8 8 33 q2 2
  12. 9 9 5 q2 3
  13. 10 10 5 q2 3

Back to basics:

  1. a <- rle(df$Sequ)
  2. a$values[!is.na(a$values)] <- seq_along(na.omit(a$values))
  3. cbind(df, sequ_0 = inverse.rle(a))
  4. ID Sequ Q sequ_0
  5. 1 1 NA <NA> NA
  6. 2 2 44 q1 1
  7. 3 3 44 q1 1
  8. 4 4 NA <NA> NA
  9. 5 5 NA <NA> NA
  10. 6 6 33 q2 2
  11. 7 7 33 q2 2
  12. 8 8 33 q2 2
  13. 9 9 5 q2 3
  14. 10 10 5 q2 3

huangapple
  • 本文由 发表于 2023年3月9日 16:10:37
  • 转载请务必保留本文链接:https://go.coder-hub.com/75681910.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定