根据日期间隔创建新列

huangapple go评论98阅读模式
英文:

mutate new column based on date interval

问题

编辑 2023年2月4日

数据:

  1. library(dplyr)
  2. DF <- data.frame(
  3. stringsAsFactors = FALSE,
  4. ID = c(1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L,
  5. 4L, 4L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L),
  6. COLOR = c("BLUE", "RED", "BLUE", "RED",
  7. "RED", "BLUE", "RED", "BLUE", "BLUE", "BLUE",
  8. "BLACK", "GREEN", "GRAY", "GRAY", "RED", "BLUE",
  9. "BLUE", "BLUE", "BLUE", "BLUE"),
  10. COLOR_DATE = c("2001-01-01", "2001-01-01",
  11. "2002-02-02", "2001-01-01", "2002-02-02", "2008-08-08",
  12. "2001-01-01", "2002-02-02", "2009-09-09", "2009-09-09",
  13. "2001-01-01", "2006-06-06", "2001-01-01", "2008-01-01",
  14. "2008-01-01", "2001-01-01", "2002-02-02", "2003-03-03",
  15. "2004-04-04", "2007-07-07")
  16. )

期望的输出:

  1. ID COLOR COLOR_DATE TRUE_COLOR
  2. 1 1 BLUE 2001-01-01 BLUE
  3. 2 2 RED 2001-01-01 MIX
  4. 3 2 BLUE 2002-02-02 MIX
  5. 4 3 RED 2001-01-01 MIX
  6. 5 3 RED 2002-02-02 MIX
  7. 6 3 BLUE 2008-08-08 MIX
  8. 7 4 RED 2001-01-01 BLUE
  9. 8 4 BLUE 2002-02-02 BLUE
  10. 9 4 BLUE 2009-09-09 BLUE
  11. 10 4 BLUE 2009-09-09 BLUE
  12. 11 5 BLUE 2001-01-01 BLUE
  13. 12 5 BLACK 2006-06-06 BLUE
  14. 13 6 GREEN 2001-01-01 <NA>
  15. 14 6 GRAY 2008-01-01 <NA>
  16. 15 6 GRAY 2008-01-01 <NA>
  17. 16 7 RED 2001-01-01 BLUE
  18. 17 7 BLUE 2002-02-02 BLUE
  19. 18 7 BLUE 2003-03-03 BLUE
  20. 19 7 BLUE 2004-04-04 BLUE
  21. 20 7 BLUE 2007-07-07 BLUE

逻辑:

  • 当同一ID中只有RED COLOR时,TRUE_COLOR = RED
  • 当同一ID中只有BLUE COLOR时,TRUE_COLOR = BLUE
  • 当同一ID中同时有REDBLUE COLOR时,TRUE_COLOR = MIX

但是,如果COLOR在过去至少5年内保持不变,那么TRUE_COLOR = REDBLUE(如示例数据中的ID 4和7)。

其他颜色除了REDBLUE都会被忽略。

最后,RED123BLUE234应该被解释为REDBLUE

如何解决?

英文:

Edited 2/4-2023

Data:

  1. library(dplyr)
  2. DF&lt;-data.frame(
  3. stringsAsFactors = FALSE,
  4. ID = c(1L,2L,2L,3L,3L,3L,4L,4L,
  5. 4L,4L,5L,5L,6L,6L,6L,7L,7L,7L,7L,7L),
  6. COLOR = c(&quot;BLUE&quot;,&quot;RED&quot;,&quot;BLUE&quot;,&quot;RED&quot;,
  7. &quot;RED&quot;,&quot;BLUE&quot;,&quot;RED&quot;,&quot;BLUE&quot;,&quot;BLUE&quot;,&quot;BLUE&quot;,&quot;BLUE&quot;,
  8. &quot;BLACK&quot;,&quot;GREEN&quot;,&quot;GRAY&quot;,&quot;GRAY&quot;,&quot;RED&quot;,&quot;BLUE&quot;,&quot;BLUE&quot;,
  9. &quot;BLUE&quot;,&quot;BLUE&quot;),
  10. COLOR_DATE = c(&quot;2001-01-01&quot;,&quot;2001-01-01&quot;,
  11. &quot;2002-02-02&quot;,&quot;2001-01-01&quot;,&quot;2002-02-02&quot;,&quot;2008-08-08&quot;,
  12. &quot;2001-01-01&quot;,&quot;2002-02-02&quot;,&quot;2009-09-09&quot;,&quot;2009-09-09&quot;,
  13. &quot;2001-01-01&quot;,&quot;2006-06-06&quot;,&quot;2001-01-01&quot;,&quot;2008-01-01&quot;,
  14. &quot;2008-01-01&quot;,&quot;2001-01-01&quot;,&quot;2002-02-02&quot;,&quot;2003-03-03&quot;,
  15. &quot;2004-04-04&quot;,&quot;2007-07-07&quot;)
  16. )

Desired output:

  1. ID COLOR COLOR_DATE TRUE_COLOR
  2. 1 1 BLUE 2001-01-01 BLUE
  3. 2 2 RED 2001-01-01 MIX
  4. 3 2 BLUE 2002-02-02 MIX
  5. 4 3 RED 2001-01-01 MIX
  6. 5 3 RED 2002-02-02 MIX
  7. 6 3 BLUE 2008-08-08 MIX
  8. 7 4 RED 2001-01-01 BLUE
  9. 8 4 BLUE 2002-02-02 BLUE
  10. 9 4 BLUE 2009-09-09 BLUE
  11. 10 4 BLUE 2009-09-09 BLUE
  12. 11 5 BLUE 2001-01-01 BLUE
  13. 12 5 BLACK 2006-06-06 BLUE
  14. 13 6 GREEN 2001-01-01 &lt;NA&gt;
  15. 14 6 GRAY 2008-01-01 &lt;NA&gt;
  16. 15 6 GRAY 2008-01-01 &lt;NA&gt;
  17. 16 7 RED 2001-01-01 BLUE
  18. 17 7 BLUE 2002-02-02 BLUE
  19. 18 7 BLUE 2003-03-03 BLUE
  20. 19 7 BLUE 2004-04-04 BLUE
  21. 20 7 BLUE 2007-07-07 BLUE

Logic:

When only RED COLOR in same ID then TRUE_COLOR = RED.
When only BLUE COLOR in same ID then TRUE_COLOR = BLUE.
When both RED and BLUE COLOR in same ID then TRUE_COLOR = MIX.

However,
If the COLOR has been the same for at least the 5 recent years, then TRUE_COLOR = RED or BLUE (as in sample data ID 4 and 7).

Other COLOR than RED or BLUE are ignored.

Finally, RED123, and BLUE234 should be interpreted as RED and BLUE, respectively.

How to solve?

答案1

得分: 1

以下是代码的翻译部分:

  1. library(tidyverse); library(lubridate)
  2. blu_red <- quo(COLOR2 %in% c("BLUE", "RED"))
  3. DF %>%
  4. mutate(COLOR2 = str_extract(COLOR, "BLUE|RED"),
  5. COLOR_DATE = as.Date(COLOR_DATE)) %>%
  6. arrange(COLOR_DATE) %>%
  7. group_by(ID) %>%
  8. mutate(
  9. TRUE_COLOR = case_when(
  10. isTRUE(all(!(!!blu_red))) ~ NA,
  11. isTRUE(n() == 1 & !!blu_red) ~ COLOR2,
  12. isTRUE(n() == 1 & !(!!blu_red)) ~ NA,
  13. isTRUE((last(COLOR_DATE) - COLOR_DATE[last(which(COLOR2 != lag(COLOR2)))]) >= years(5) &
  14. last(COLOR2) %in% c("BLUE", "RED")) ~ last(COLOR2),
  15. isTRUE(all(COLOR2[!!blu_red] == "BLUE")) ~ "BLUE",
  16. isTRUE(all(COLOR2[!!blu_red] == "RED")) ~ "RED",
  17. TRUE ~ "MIX")) %>%
  18. ungroup() %>%
  19. select(- COLOR2) %>%
  20. arrange(ID)

希望这对您有帮助。

英文:
  1. library(tidyverse); library(lubridate)
  2. blu_red &lt;- quo(COLOR2 %in% c(&quot;BLUE&quot;, &quot;RED&quot;))
  3. DF %&gt;%
  4. mutate(COLOR2 = str_extract(COLOR, &quot;BLUE|RED&quot;),
  5. COLOR_DATE = as.Date(COLOR_DATE)) %&gt;%
  6. arrange(COLOR_DATE) %&gt;%
  7. group_by(ID) %&gt;%
  8. mutate(
  9. TRUE_COLOR = case_when(
  10. isTRUE(all(!(!!blu_red))) ~ NA,
  11. isTRUE(n() == 1 &amp; !!blu_red) ~ COLOR2,
  12. isTRUE(n() == 1 &amp; !(!!blu_red)) ~ NA,
  13. isTRUE((last(COLOR_DATE) - COLOR_DATE[last(which(COLOR2 != lag(COLOR2)))]) &gt;= years(5) &amp;
  14. last(COLOR2) %in% c(&quot;BLUE&quot;, &quot;RED&quot;)) ~ last(COLOR2),
  15. isTRUE(all(COLOR2[!!blu_red] == &quot;BLUE&quot;)) ~ &quot;BLUE&quot;,
  16. isTRUE(all(COLOR2[!!blu_red] == &quot;RED&quot;)) ~ &quot;RED&quot;,
  17. TRUE ~ &quot;MIX&quot;)) %&gt;%
  18. ungroup() %&gt;%
  19. select(- COLOR2) %&gt;%
  20. arrange(ID)

答案2

得分: 1

这个解决方案非常好:首先找到每个ID对应的颜色,然后使用dplyr::left_join()将其添加回原始表格。如果没有找到ID的颜色,left_join()会自动用NA填充。

  1. library(tidyverse)
  2. library(lubridate) # 如果tidyverse >= 2.0.0未安装
  3. id_color <- DF %>%
  4. mutate(
  5. red_blue = case_match(
  6. COLOR, # 确定颜色是红色、蓝色还是其他
  7. c("RED", "RED123") ~ "RED",
  8. c("BLUE", "BLUE234") ~ "BLUE",
  9. .default = NA # 忽略所有不是红色/蓝色的颜色
  10. ),
  11. date = ymd(COLOR_DATE) # 将COLOR_DATE转换为日期
  12. ) %>%
  13. drop_na(red_blue) %>% # 删除不是红色/蓝色的颜色
  14. group_by(ID) %>%
  15. # 仅包括日期范围涵盖过去5年的日期
  16. dplyr::filter(date == last(date) | lead(date) >= last(date) - years(5)) %>%
  17. summarise(
  18. TRUE_COLOR = if_else(
  19. length(unique(red_blue)) == 1,
  20. first(red_blue), "MIX"
  21. )
  22. )
  23. left_join(DF, id_color)

上面的代码段是R语言代码,用于处理数据,它首先将每个ID对应的颜色找出,然后将这些颜色添加回原始表格。如果找不到ID的颜色,它会自动用NA填充。

英文:

This solution is quite nice: we first find the color corresponding to each ID and then use dplyr::left_join() to add this back to the original table. If we don't find a color for an ID, then left_join() will automatically fill this in with NA.

  1. library(tidyverse)
  2. library(lubridate) # If tidyverse &gt;= 2.0.0 is not installed
  3. id_color &lt;- DF %&gt;%
  4. mutate(
  5. red_blue = case_match(
  6. COLOR, # Determine if a color is red, blue or neither
  7. c(&quot;RED&quot;, &quot;RED123&quot;) ~ &quot;RED&quot;,
  8. c(&quot;BLUE&quot;, &quot;BLUE234&quot;) ~ &quot;BLUE&quot;,
  9. .default = NA # Ignore all colors that are not red/blue
  10. ),
  11. date = ymd(COLOR_DATE) # Convert the COLOR_DATE to a date
  12. ) %&gt;%
  13. drop_na(red_blue) %&gt;% # Remove colors that are not red/blue
  14. group_by(ID) %&gt;%
  15. # Only include dates such that the range of dates cover the last 5 years
  16. dplyr::filter(date == last(date) | lead(date) &gt;= last(date) - years(5)) %&gt;%
  17. summarise(
  18. TRUE_COLOR = if_else(
  19. length(unique(red_blue)) == 1,
  20. first(red_blue), &quot;MIX&quot;
  21. )
  22. )
  23. left_join(DF, id_color)
  24. #&gt; Joining with `by = join_by(ID)`
  25. #&gt; # A tibble: 20 &#215; 4
  26. #&gt; ID COLOR COLOR_DATE TRUE_COLOR
  27. #&gt; &lt;int&gt; &lt;chr&gt; &lt;chr&gt; &lt;chr&gt;
  28. #&gt; 1 1 BLUE 2001-01-01 BLUE
  29. #&gt; 2 2 RED 2001-01-01 MIX
  30. #&gt; 3 2 BLUE 2002-02-02 MIX
  31. #&gt; 4 3 RED 2001-01-01 MIX
  32. #&gt; 5 3 RED 2002-02-02 MIX
  33. #&gt; 6 3 BLUE 2008-08-08 MIX
  34. #&gt; 7 4 RED 2001-01-01 BLUE
  35. #&gt; 8 4 BLUE 2002-02-02 BLUE
  36. #&gt; 9 4 BLUE 2009-09-09 BLUE
  37. #&gt; 10 4 BLUE 2009-09-09 BLUE
  38. #&gt; 11 5 BLUE 2001-01-01 BLUE
  39. #&gt; 12 5 BLACK 2006-06-06 BLUE
  40. #&gt; 13 6 GREEN 2001-01-01 &lt;NA&gt;
  41. #&gt; 14 6 GRAY 2008-01-01 &lt;NA&gt;
  42. #&gt; 15 6 GRAY 2008-01-01 &lt;NA&gt;
  43. #&gt; 16 7 RED 2001-01-01 BLUE
  44. #&gt; 17 7 BLUE 2002-02-02 BLUE
  45. #&gt; 18 7 BLUE 2003-03-03 BLUE
  46. #&gt; 19 7 BLUE 2004-04-04 BLUE
  47. #&gt; 20 7 BLUE 2007-07-07 BLUE

huangapple
  • 本文由 发表于 2023年3月31日 16:17:36
  • 转载请务必保留本文链接:https://go.coder-hub.com/75896311.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定