英文:
mutate new column based on date interval
问题
编辑 2023年2月4日
数据:
library(dplyr)
DF <- data.frame(
stringsAsFactors = FALSE,
ID = c(1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L),
COLOR = c("BLUE", "RED", "BLUE", "RED",
"RED", "BLUE", "RED", "BLUE", "BLUE", "BLUE",
"BLACK", "GREEN", "GRAY", "GRAY", "RED", "BLUE",
"BLUE", "BLUE", "BLUE", "BLUE"),
COLOR_DATE = c("2001-01-01", "2001-01-01",
"2002-02-02", "2001-01-01", "2002-02-02", "2008-08-08",
"2001-01-01", "2002-02-02", "2009-09-09", "2009-09-09",
"2001-01-01", "2006-06-06", "2001-01-01", "2008-01-01",
"2008-01-01", "2001-01-01", "2002-02-02", "2003-03-03",
"2004-04-04", "2007-07-07")
)
期望的输出:
ID COLOR COLOR_DATE TRUE_COLOR
1 1 BLUE 2001-01-01 BLUE
2 2 RED 2001-01-01 MIX
3 2 BLUE 2002-02-02 MIX
4 3 RED 2001-01-01 MIX
5 3 RED 2002-02-02 MIX
6 3 BLUE 2008-08-08 MIX
7 4 RED 2001-01-01 BLUE
8 4 BLUE 2002-02-02 BLUE
9 4 BLUE 2009-09-09 BLUE
10 4 BLUE 2009-09-09 BLUE
11 5 BLUE 2001-01-01 BLUE
12 5 BLACK 2006-06-06 BLUE
13 6 GREEN 2001-01-01 <NA>
14 6 GRAY 2008-01-01 <NA>
15 6 GRAY 2008-01-01 <NA>
16 7 RED 2001-01-01 BLUE
17 7 BLUE 2002-02-02 BLUE
18 7 BLUE 2003-03-03 BLUE
19 7 BLUE 2004-04-04 BLUE
20 7 BLUE 2007-07-07 BLUE
逻辑:
- 当同一ID中只有
RED COLOR
时,TRUE_COLOR = RED
。 - 当同一ID中只有
BLUE COLOR
时,TRUE_COLOR = BLUE
。 - 当同一ID中同时有
RED
和BLUE COLOR
时,TRUE_COLOR = MIX
。
但是,如果COLOR
在过去至少5年内保持不变,那么TRUE_COLOR = RED
或BLUE
(如示例数据中的ID 4和7)。
其他颜色除了RED
或BLUE
都会被忽略。
最后,RED123
和BLUE234
应该被解释为RED
和BLUE
。
如何解决?
英文:
Edited 2/4-2023
Data:
library(dplyr)
DF<-data.frame(
stringsAsFactors = FALSE,
ID = c(1L,2L,2L,3L,3L,3L,4L,4L,
4L,4L,5L,5L,6L,6L,6L,7L,7L,7L,7L,7L),
COLOR = c("BLUE","RED","BLUE","RED",
"RED","BLUE","RED","BLUE","BLUE","BLUE","BLUE",
"BLACK","GREEN","GRAY","GRAY","RED","BLUE","BLUE",
"BLUE","BLUE"),
COLOR_DATE = c("2001-01-01","2001-01-01",
"2002-02-02","2001-01-01","2002-02-02","2008-08-08",
"2001-01-01","2002-02-02","2009-09-09","2009-09-09",
"2001-01-01","2006-06-06","2001-01-01","2008-01-01",
"2008-01-01","2001-01-01","2002-02-02","2003-03-03",
"2004-04-04","2007-07-07")
)
Desired output:
ID COLOR COLOR_DATE TRUE_COLOR
1 1 BLUE 2001-01-01 BLUE
2 2 RED 2001-01-01 MIX
3 2 BLUE 2002-02-02 MIX
4 3 RED 2001-01-01 MIX
5 3 RED 2002-02-02 MIX
6 3 BLUE 2008-08-08 MIX
7 4 RED 2001-01-01 BLUE
8 4 BLUE 2002-02-02 BLUE
9 4 BLUE 2009-09-09 BLUE
10 4 BLUE 2009-09-09 BLUE
11 5 BLUE 2001-01-01 BLUE
12 5 BLACK 2006-06-06 BLUE
13 6 GREEN 2001-01-01 <NA>
14 6 GRAY 2008-01-01 <NA>
15 6 GRAY 2008-01-01 <NA>
16 7 RED 2001-01-01 BLUE
17 7 BLUE 2002-02-02 BLUE
18 7 BLUE 2003-03-03 BLUE
19 7 BLUE 2004-04-04 BLUE
20 7 BLUE 2007-07-07 BLUE
Logic:
When only RED COLOR
in same ID
then TRUE_COLOR = RED
.
When only BLUE COLOR
in same ID
then TRUE_COLOR = BLUE
.
When both RED
and BLUE COLOR
in same ID
then TRUE_COLOR = MIX
.
However,
If the COLOR
has been the same for at least the 5 recent years, then TRUE_COLOR = RED
or BLUE
(as in sample data ID 4 and 7).
Other COLOR
than RED
or BLUE
are ignored.
Finally, RED123
, and BLUE234
should be interpreted as RED
and BLUE
, respectively.
How to solve?
答案1
得分: 1
以下是代码的翻译部分:
library(tidyverse); library(lubridate)
blu_red <- quo(COLOR2 %in% c("BLUE", "RED"))
DF %>%
mutate(COLOR2 = str_extract(COLOR, "BLUE|RED"),
COLOR_DATE = as.Date(COLOR_DATE)) %>%
arrange(COLOR_DATE) %>%
group_by(ID) %>%
mutate(
TRUE_COLOR = case_when(
isTRUE(all(!(!!blu_red))) ~ NA,
isTRUE(n() == 1 & !!blu_red) ~ COLOR2,
isTRUE(n() == 1 & !(!!blu_red)) ~ NA,
isTRUE((last(COLOR_DATE) - COLOR_DATE[last(which(COLOR2 != lag(COLOR2)))]) >= years(5) &
last(COLOR2) %in% c("BLUE", "RED")) ~ last(COLOR2),
isTRUE(all(COLOR2[!!blu_red] == "BLUE")) ~ "BLUE",
isTRUE(all(COLOR2[!!blu_red] == "RED")) ~ "RED",
TRUE ~ "MIX")) %>%
ungroup() %>%
select(- COLOR2) %>%
arrange(ID)
希望这对您有帮助。
英文:
library(tidyverse); library(lubridate)
blu_red <- quo(COLOR2 %in% c("BLUE", "RED"))
DF %>%
mutate(COLOR2 = str_extract(COLOR, "BLUE|RED"),
COLOR_DATE = as.Date(COLOR_DATE)) %>%
arrange(COLOR_DATE) %>%
group_by(ID) %>%
mutate(
TRUE_COLOR = case_when(
isTRUE(all(!(!!blu_red))) ~ NA,
isTRUE(n() == 1 & !!blu_red) ~ COLOR2,
isTRUE(n() == 1 & !(!!blu_red)) ~ NA,
isTRUE((last(COLOR_DATE) - COLOR_DATE[last(which(COLOR2 != lag(COLOR2)))]) >= years(5) &
last(COLOR2) %in% c("BLUE", "RED")) ~ last(COLOR2),
isTRUE(all(COLOR2[!!blu_red] == "BLUE")) ~ "BLUE",
isTRUE(all(COLOR2[!!blu_red] == "RED")) ~ "RED",
TRUE ~ "MIX")) %>%
ungroup() %>%
select(- COLOR2) %>%
arrange(ID)
答案2
得分: 1
这个解决方案非常好:首先找到每个ID对应的颜色,然后使用dplyr::left_join()
将其添加回原始表格。如果没有找到ID的颜色,left_join()
会自动用NA
填充。
library(tidyverse)
library(lubridate) # 如果tidyverse >= 2.0.0未安装
id_color <- DF %>%
mutate(
red_blue = case_match(
COLOR, # 确定颜色是红色、蓝色还是其他
c("RED", "RED123") ~ "RED",
c("BLUE", "BLUE234") ~ "BLUE",
.default = NA # 忽略所有不是红色/蓝色的颜色
),
date = ymd(COLOR_DATE) # 将COLOR_DATE转换为日期
) %>%
drop_na(red_blue) %>% # 删除不是红色/蓝色的颜色
group_by(ID) %>%
# 仅包括日期范围涵盖过去5年的日期
dplyr::filter(date == last(date) | lead(date) >= last(date) - years(5)) %>%
summarise(
TRUE_COLOR = if_else(
length(unique(red_blue)) == 1,
first(red_blue), "MIX"
)
)
left_join(DF, id_color)
上面的代码段是R语言代码,用于处理数据,它首先将每个ID对应的颜色找出,然后将这些颜色添加回原始表格。如果找不到ID的颜色,它会自动用NA
填充。
英文:
This solution is quite nice: we first find the color corresponding to each ID and then use dplyr::left_join()
to add this back to the original table. If we don't find a color for an ID, then left_join()
will automatically fill this in with NA
.
library(tidyverse)
library(lubridate) # If tidyverse >= 2.0.0 is not installed
id_color <- DF %>%
mutate(
red_blue = case_match(
COLOR, # Determine if a color is red, blue or neither
c("RED", "RED123") ~ "RED",
c("BLUE", "BLUE234") ~ "BLUE",
.default = NA # Ignore all colors that are not red/blue
),
date = ymd(COLOR_DATE) # Convert the COLOR_DATE to a date
) %>%
drop_na(red_blue) %>% # Remove colors that are not red/blue
group_by(ID) %>%
# Only include dates such that the range of dates cover the last 5 years
dplyr::filter(date == last(date) | lead(date) >= last(date) - years(5)) %>%
summarise(
TRUE_COLOR = if_else(
length(unique(red_blue)) == 1,
first(red_blue), "MIX"
)
)
left_join(DF, id_color)
#> Joining with `by = join_by(ID)`
#> # A tibble: 20 × 4
#> ID COLOR COLOR_DATE TRUE_COLOR
#> <int> <chr> <chr> <chr>
#> 1 1 BLUE 2001-01-01 BLUE
#> 2 2 RED 2001-01-01 MIX
#> 3 2 BLUE 2002-02-02 MIX
#> 4 3 RED 2001-01-01 MIX
#> 5 3 RED 2002-02-02 MIX
#> 6 3 BLUE 2008-08-08 MIX
#> 7 4 RED 2001-01-01 BLUE
#> 8 4 BLUE 2002-02-02 BLUE
#> 9 4 BLUE 2009-09-09 BLUE
#> 10 4 BLUE 2009-09-09 BLUE
#> 11 5 BLUE 2001-01-01 BLUE
#> 12 5 BLACK 2006-06-06 BLUE
#> 13 6 GREEN 2001-01-01 <NA>
#> 14 6 GRAY 2008-01-01 <NA>
#> 15 6 GRAY 2008-01-01 <NA>
#> 16 7 RED 2001-01-01 BLUE
#> 17 7 BLUE 2002-02-02 BLUE
#> 18 7 BLUE 2003-03-03 BLUE
#> 19 7 BLUE 2004-04-04 BLUE
#> 20 7 BLUE 2007-07-07 BLUE
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论