根据日期间隔创建新列

huangapple go评论64阅读模式
英文:

mutate new column based on date interval

问题

编辑 2023年2月4日

数据:

library(dplyr)

DF <- data.frame(
  stringsAsFactors = FALSE,
  ID = c(1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L,
         4L, 4L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L),
  COLOR = c("BLUE", "RED", "BLUE", "RED",
            "RED", "BLUE", "RED", "BLUE", "BLUE", "BLUE",
            "BLACK", "GREEN", "GRAY", "GRAY", "RED", "BLUE",
            "BLUE", "BLUE", "BLUE", "BLUE"),
  COLOR_DATE = c("2001-01-01", "2001-01-01",
                  "2002-02-02", "2001-01-01", "2002-02-02", "2008-08-08",
                  "2001-01-01", "2002-02-02", "2009-09-09", "2009-09-09",
                  "2001-01-01", "2006-06-06", "2001-01-01", "2008-01-01",
                  "2008-01-01", "2001-01-01", "2002-02-02", "2003-03-03",
                  "2004-04-04", "2007-07-07")
)

期望的输出:

       ID COLOR COLOR_DATE TRUE_COLOR
1   1  BLUE 2001-01-01       BLUE
2   2   RED 2001-01-01        MIX
3   2  BLUE 2002-02-02        MIX
4   3   RED 2001-01-01        MIX
5   3   RED 2002-02-02        MIX
6   3  BLUE 2008-08-08        MIX
7   4   RED 2001-01-01       BLUE
8   4  BLUE 2002-02-02       BLUE
9   4  BLUE 2009-09-09       BLUE
10  4  BLUE 2009-09-09       BLUE
11  5  BLUE 2001-01-01       BLUE
12  5 BLACK 2006-06-06       BLUE
13  6 GREEN 2001-01-01       <NA>
14  6  GRAY 2008-01-01       <NA>
15  6  GRAY 2008-01-01       <NA>
16  7   RED 2001-01-01       BLUE
17  7  BLUE 2002-02-02       BLUE
18  7  BLUE 2003-03-03       BLUE
19  7  BLUE 2004-04-04       BLUE
20  7  BLUE 2007-07-07       BLUE

逻辑:

  • 当同一ID中只有RED COLOR时,TRUE_COLOR = RED
  • 当同一ID中只有BLUE COLOR时,TRUE_COLOR = BLUE
  • 当同一ID中同时有REDBLUE COLOR时,TRUE_COLOR = MIX

但是,如果COLOR在过去至少5年内保持不变,那么TRUE_COLOR = REDBLUE(如示例数据中的ID 4和7)。

其他颜色除了REDBLUE都会被忽略。

最后,RED123BLUE234应该被解释为REDBLUE

如何解决?

英文:

Edited 2/4-2023

Data:

library(dplyr)

DF&lt;-data.frame(
  stringsAsFactors = FALSE,
                ID = c(1L,2L,2L,3L,3L,3L,4L,4L,
                       4L,4L,5L,5L,6L,6L,6L,7L,7L,7L,7L,7L),
             COLOR = c(&quot;BLUE&quot;,&quot;RED&quot;,&quot;BLUE&quot;,&quot;RED&quot;,
                       &quot;RED&quot;,&quot;BLUE&quot;,&quot;RED&quot;,&quot;BLUE&quot;,&quot;BLUE&quot;,&quot;BLUE&quot;,&quot;BLUE&quot;,
                       &quot;BLACK&quot;,&quot;GREEN&quot;,&quot;GRAY&quot;,&quot;GRAY&quot;,&quot;RED&quot;,&quot;BLUE&quot;,&quot;BLUE&quot;,
                       &quot;BLUE&quot;,&quot;BLUE&quot;),
        COLOR_DATE = c(&quot;2001-01-01&quot;,&quot;2001-01-01&quot;,
                       &quot;2002-02-02&quot;,&quot;2001-01-01&quot;,&quot;2002-02-02&quot;,&quot;2008-08-08&quot;,
                       &quot;2001-01-01&quot;,&quot;2002-02-02&quot;,&quot;2009-09-09&quot;,&quot;2009-09-09&quot;,
                       &quot;2001-01-01&quot;,&quot;2006-06-06&quot;,&quot;2001-01-01&quot;,&quot;2008-01-01&quot;,
                       &quot;2008-01-01&quot;,&quot;2001-01-01&quot;,&quot;2002-02-02&quot;,&quot;2003-03-03&quot;,
                       &quot;2004-04-04&quot;,&quot;2007-07-07&quot;)
)

Desired output:

       ID COLOR COLOR_DATE TRUE_COLOR
1   1  BLUE 2001-01-01       BLUE
2   2   RED 2001-01-01        MIX
3   2  BLUE 2002-02-02        MIX
4   3   RED 2001-01-01        MIX
5   3   RED 2002-02-02        MIX
6   3  BLUE 2008-08-08        MIX
7   4   RED 2001-01-01       BLUE
8   4  BLUE 2002-02-02       BLUE
9   4  BLUE 2009-09-09       BLUE
10  4  BLUE 2009-09-09       BLUE
11  5  BLUE 2001-01-01       BLUE
12  5 BLACK 2006-06-06       BLUE
13  6 GREEN 2001-01-01       &lt;NA&gt;
14  6  GRAY 2008-01-01       &lt;NA&gt;
15  6  GRAY 2008-01-01       &lt;NA&gt;
16  7   RED 2001-01-01       BLUE
17  7  BLUE 2002-02-02       BLUE
18  7  BLUE 2003-03-03       BLUE
19  7  BLUE 2004-04-04       BLUE
20  7  BLUE 2007-07-07       BLUE

Logic:

When only RED COLOR in same ID then TRUE_COLOR = RED.
When only BLUE COLOR in same ID then TRUE_COLOR = BLUE.
When both RED and BLUE COLOR in same ID then TRUE_COLOR = MIX.

However,
If the COLOR has been the same for at least the 5 recent years, then TRUE_COLOR = RED or BLUE (as in sample data ID 4 and 7).

Other COLOR than RED or BLUE are ignored.

Finally, RED123, and BLUE234 should be interpreted as RED and BLUE, respectively.

How to solve?

答案1

得分: 1

以下是代码的翻译部分:

library(tidyverse); library(lubridate)
blu_red <- quo(COLOR2 %in% c("BLUE", "RED"))
DF %>%
  mutate(COLOR2 = str_extract(COLOR, "BLUE|RED"), 
         COLOR_DATE = as.Date(COLOR_DATE)) %>%
  arrange(COLOR_DATE) %>%
  group_by(ID) %>%
  mutate(
    TRUE_COLOR = case_when(
      isTRUE(all(!(!!blu_red))) ~ NA,
      isTRUE(n() == 1 & !!blu_red) ~ COLOR2,
      isTRUE(n() == 1 & !(!!blu_red)) ~ NA,
      isTRUE((last(COLOR_DATE) - COLOR_DATE[last(which(COLOR2 != lag(COLOR2)))]) >= years(5) & 
        last(COLOR2) %in% c("BLUE", "RED")) ~ last(COLOR2),
      isTRUE(all(COLOR2[!!blu_red] == "BLUE")) ~ "BLUE",
      isTRUE(all(COLOR2[!!blu_red] == "RED")) ~ "RED",
      TRUE ~ "MIX")) %>%
  ungroup() %>%
  select(- COLOR2) %>%
  arrange(ID)

希望这对您有帮助。

英文:
library(tidyverse); library(lubridate)
blu_red &lt;- quo(COLOR2 %in% c(&quot;BLUE&quot;, &quot;RED&quot;))
DF %&gt;%
  mutate(COLOR2 = str_extract(COLOR, &quot;BLUE|RED&quot;), 
         COLOR_DATE = as.Date(COLOR_DATE)) %&gt;% 
  arrange(COLOR_DATE) %&gt;% 
  group_by(ID) %&gt;%
  mutate(
    TRUE_COLOR = case_when(
      isTRUE(all(!(!!blu_red))) ~ NA,
      isTRUE(n() == 1 &amp; !!blu_red) ~ COLOR2,
      isTRUE(n() == 1 &amp; !(!!blu_red)) ~ NA,
      isTRUE((last(COLOR_DATE) - COLOR_DATE[last(which(COLOR2 != lag(COLOR2)))]) &gt;= years(5) &amp; 
        last(COLOR2) %in% c(&quot;BLUE&quot;, &quot;RED&quot;)) ~ last(COLOR2),
      isTRUE(all(COLOR2[!!blu_red] == &quot;BLUE&quot;)) ~ &quot;BLUE&quot;,
      isTRUE(all(COLOR2[!!blu_red] == &quot;RED&quot;)) ~ &quot;RED&quot;,
      TRUE ~ &quot;MIX&quot;)) %&gt;% 
  ungroup() %&gt;% 
  select(- COLOR2) %&gt;% 
  arrange(ID)

答案2

得分: 1

这个解决方案非常好:首先找到每个ID对应的颜色,然后使用dplyr::left_join()将其添加回原始表格。如果没有找到ID的颜色,left_join()会自动用NA填充。

library(tidyverse)
library(lubridate) # 如果tidyverse >= 2.0.0未安装

id_color <- DF %>%
  mutate(
    red_blue = case_match(
      COLOR,  # 确定颜色是红色、蓝色还是其他
      c("RED", "RED123") ~ "RED",
      c("BLUE", "BLUE234") ~ "BLUE",
      .default = NA   # 忽略所有不是红色/蓝色的颜色
    ),
    date = ymd(COLOR_DATE)  # 将COLOR_DATE转换为日期
  ) %>%
  drop_na(red_blue) %>%  # 删除不是红色/蓝色的颜色
  group_by(ID) %>%
  # 仅包括日期范围涵盖过去5年的日期
  dplyr::filter(date == last(date) | lead(date) >= last(date) - years(5)) %>%
  summarise(
    TRUE_COLOR = if_else(
      length(unique(red_blue)) == 1,
      first(red_blue), "MIX"
    )
  )

left_join(DF, id_color)

上面的代码段是R语言代码,用于处理数据,它首先将每个ID对应的颜色找出,然后将这些颜色添加回原始表格。如果找不到ID的颜色,它会自动用NA填充。

英文:

This solution is quite nice: we first find the color corresponding to each ID and then use dplyr::left_join() to add this back to the original table. If we don't find a color for an ID, then left_join() will automatically fill this in with NA.

library(tidyverse)
library(lubridate) # If tidyverse &gt;= 2.0.0 is not installed

id_color &lt;- DF %&gt;%
  mutate(
    red_blue = case_match(
      COLOR,  # Determine if a color is red, blue or neither
      c(&quot;RED&quot;, &quot;RED123&quot;) ~ &quot;RED&quot;,
      c(&quot;BLUE&quot;, &quot;BLUE234&quot;) ~ &quot;BLUE&quot;,
      .default = NA   # Ignore all colors that are not red/blue
    ),
    date = ymd(COLOR_DATE)  # Convert the COLOR_DATE to a date
  ) %&gt;%
  drop_na(red_blue) %&gt;%  # Remove colors that are not red/blue
  group_by(ID) %&gt;%
  # Only include dates such that the range of dates cover the last 5 years
  dplyr::filter(date == last(date) | lead(date) &gt;= last(date) - years(5)) %&gt;%
  summarise(
    TRUE_COLOR = if_else(
      length(unique(red_blue)) == 1,
      first(red_blue), &quot;MIX&quot;
    )
  )

left_join(DF, id_color)
#&gt; Joining with `by = join_by(ID)`
#&gt; # A tibble: 20 &#215; 4
#&gt;       ID COLOR COLOR_DATE TRUE_COLOR
#&gt;    &lt;int&gt; &lt;chr&gt; &lt;chr&gt;      &lt;chr&gt;     
#&gt;  1     1 BLUE  2001-01-01 BLUE      
#&gt;  2     2 RED   2001-01-01 MIX       
#&gt;  3     2 BLUE  2002-02-02 MIX       
#&gt;  4     3 RED   2001-01-01 MIX       
#&gt;  5     3 RED   2002-02-02 MIX       
#&gt;  6     3 BLUE  2008-08-08 MIX       
#&gt;  7     4 RED   2001-01-01 BLUE      
#&gt;  8     4 BLUE  2002-02-02 BLUE      
#&gt;  9     4 BLUE  2009-09-09 BLUE      
#&gt; 10     4 BLUE  2009-09-09 BLUE      
#&gt; 11     5 BLUE  2001-01-01 BLUE      
#&gt; 12     5 BLACK 2006-06-06 BLUE      
#&gt; 13     6 GREEN 2001-01-01 &lt;NA&gt;      
#&gt; 14     6 GRAY  2008-01-01 &lt;NA&gt;      
#&gt; 15     6 GRAY  2008-01-01 &lt;NA&gt;      
#&gt; 16     7 RED   2001-01-01 BLUE      
#&gt; 17     7 BLUE  2002-02-02 BLUE      
#&gt; 18     7 BLUE  2003-03-03 BLUE      
#&gt; 19     7 BLUE  2004-04-04 BLUE      
#&gt; 20     7 BLUE  2007-07-07 BLUE

huangapple
  • 本文由 发表于 2023年3月31日 16:17:36
  • 转载请务必保留本文链接:https://go.coder-hub.com/75896311.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定