英文:
Complete time.series using a function based on data.table
问题
我试图学习更多关于data.table
以及正确的使用方式。
可复现的示例
这是我的数据:
set.seed(1)
df <- data.frame(group1 = rep(c("a", "b"), each = 9),
group2 = rep(c("A", "B", "C"), each = 3),
date = as.Date("2023-01-01") + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
x = rnorm(18), y = rnorm(18))
我想要在每个组中添加缺失的日期。每个组都以其自己的最大日期和最小日期开始和结束。每个组由group1
和group2
定义。
任务
使用Tidyverse,我会这样做:
fill <- list(x=0, y=1)
out1 <- df %>%
dplyr::group_by(group1, group2) %>%
tidyr::complete(date = seq.Date(min(date), max(date), "day"), fill = fill) %>%
dplyr::ungroup() %>%
as.data.frame()
使用data.table(我认为)解决方案如下:
library(data.table)
fill <- list(x=0, y=1)
dt <- as.data.table(df)
indx <- dt[, .(date = seq(min(date), max(date), "day")), by = list(group1, group2)]
dt <- dt[indx, on = .(date, group1, group2)]
if (length(fill)) dt[, names(fill) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols=names(fill)]
out2 <- as.data.frame(dt)
我得到相同的结果:
identical(out1, out2)
# TRUE
将任务制作成函数
现在,我想创建一个函数,该函数接受数据框、日期列、多个分组列和多个要填充的列作为输入。
使用Tidyverse,我会这样做:
complete_ts_tv <- function(df, date, ..., fill = list()){
df %>%
dplyr::group_by(...) %>%
tidyr::complete({{date}} := seq.Date(min({{date}}), max({{date}}), "day"), fill = fill) %>%
dplyr::ungroup() %>%
as.data.frame()
}
identical(out1, complete_ts_tv(df, date, group1, group2, fill = fill))
# TRUE
如何使用data.table创建类似的函数?
我尝试过的方法
我找到了一个解决方案,但如果日期列实际上被称为“date”,则无法正常工作。
library(data.table)
set.seed(1)
df <- data.frame(group1 = rep(c("a", "b"), each = 9),
group2 = rep(c("A", "B", "C"), each = 3),
date = as.Date("2023-01-01") + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
x = rnorm(18), y = rnorm(18))
fill <- list(x=0, y=1)
complete_ts_dt <- function(df, date, ..., fill = list()) {
dt <- as.data.table(df)
# 为每个组合创建日期序列
indx <- dt[, .(date = seq(min(get(date)), max(get(date)), "day")), by = c(...)]
setnames(indx, "date", date)
# 在原始数据表上执行联接以填充缺失的日期
dt <- dt[indx, on = c(date, ...)]
# 用填充值替换指定列中的NA值
if (length(fill)) {
dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
}
# 将结果转换回数据框
out <- as.data.frame(dt)
return(out)
}
# 无法正常工作!
complete_ts_dt(df,
date = "date", "group1", "group2", fill = list(x = 0, y = 1))
# Error in get(date): invalid first argument
# 可以正常工作!
complete_ts_dt(df = dplyr::rename(df, DATE = date),
date = "DATE", "group1", "group2", fill = list(x = 0, y = 1))
有什么帮助吗?
我还对如何使我的函数更适合正确使用data.table 感兴趣。
更新:最终,我这样做了。看起来足够符合data.table吗?
generate_dates <- function(date){
seq.Date(min(date), max(date), by = "day")
}
complete_ts_dt <- function(df, date, ..., fill = list()) {
dt <- as.data.table(df)
# 为每个组合创建日期序列
indx <- dt[, lapply(.SD, generate_dates), by = c(...), .SDcols = date]
# 在原始数据表上执行联接以填充缺失的日期
dt <- dt[indx, on = c(date, ...)]
# 用填充值替换指定列中的NA值
if (length(fill)) {
dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
}
# 将结果转换回数据框
out <- as.data.frame(dt)
return(out)
}
现在它可以正常工作了。
英文:
I'm trying to learn more about data.table
and the proper way to use it.
Reproducible example
This is my data:
set.seed(1)
df <- data.frame(group1 = rep(c("a", "b"), each = 9),
group2 = rep(c("A", "B", "C"), each = 3),
date = as.Date("2023-01-01") + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
x = rnorm(18), y = rnorm(18))
I want to add the missing dates in my dataset in each group.
Each group starts and ends at its own max and min date. Each group is defined by group1
and group2
.
Task
With Tidyverse I would do the following:
fill <- list(x=0, y=1)
out1 <- df |>
dplyr::group_by(group1, group2) |>
tidyr::complete(date = seq.Date(min(date), max(date), "day"), fill = fill) |>
dplyr::ungroup() |>
as.data.frame()
With data.table (I think) the solution would be:
library(data.table)
fill <- list(x=0, y=1)
dt <- as.data.table(df)
indx <- dt[, .(date = seq(min(date), max(date), "day")), by = list(group1, group2)]
dt <- dt[indx, on = .(date, group1, group2)]
if (length(fill)) dt[, names(fill) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols=names(fill)]
out2 <- as.data.frame(dt)
I get the same result:
identical(out1, out2)
# TRUE
Make a function out of the task
Now, I want to create a function that takes in input a data.frame, a date column, multiple grouping columns and multiple columns to be filled.
With Tidyverse I would do:
complete_ts_tv <- function(df, date, ..., fill = list()){
df |>
dplyr::group_by(...) |>
tidyr::complete({{date}} := seq.Date(min({{date}}), max({{date}}), "day"), fill = fill) |>
dplyr::ungroup() |>
as.data.frame()
}
identical(out1, complete_ts_tv(df, date, group1, group2, fill = fill))
# TRUE
How do I create a similar function with data.table?
What I tried so far
I found a solution but it doesn't work if the date column is actually called "date".
library(data.table)
set.seed(1)
df <- data.frame(group1 = rep(c("a", "b"), each = 9),
group2 = rep(c("A", "B", "C"), each = 3),
date = as.Date("2023-01-01") + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
x = rnorm(18), y = rnorm(18))
fill <- list(x=0, y=1)
complete_ts_dt <- function(df, date, ..., fill = list()) {
dt <- as.data.table(df)
# Create a sequence of dates for each group combination
indx <- dt[, .(date = seq(min(get(date)), max(get(date)), "day")), by = c(...)]
setnames(indx, "date", date)
# Perform a join on the original data.table to fill in missing dates
dt <- dt[indx, on = c(date, ...)]
# Replace NA values in the specified columns with the fill values
if (length(fill)) {
dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
}
# Convert the result back to a data.frame
out <- as.data.frame(dt)
return(out)
}
# doesnt work!
complete_ts_dt(df,
date = "date", "group1", "group2", fill = list(x = 0, y = 1))
#> Error in get(date): invalid first argument
# works!
complete_ts_dt(df = dplyr::rename(df, DATE = date),
date = "DATE", "group1", "group2", fill = list(x = 0, y = 1))
#> group1 group2 DATE x y
#> 1 a A 2023-01-02 -0.62645381 0.82122120
#> 2 a A 2023-01-03 0.18364332 0.59390132
#> 3 a A 2023-01-04 0.00000000 1.00000000
#> 4 a A 2023-01-05 0.00000000 1.00000000
#> 5 a A 2023-01-06 -0.83562861 0.91897737
#> 6 a B 2023-01-01 1.59528080 0.78213630
#> 7 a B 2023-01-02 0.00000000 1.00000000
#> 8 a B 2023-01-03 0.00000000 1.00000000
#> 9 a B 2023-01-04 0.32950777 0.07456498
#> 10 a B 2023-01-05 -0.82046838 -1.98935170
#> 11 a C 2023-01-03 0.48742905 0.61982575
#> 12 a C 2023-01-04 0.00000000 1.00000000
#> 13 a C 2023-01-05 0.73832471 -0.05612874
#> 14 a C 2023-01-06 0.00000000 1.00000000
#> 15 a C 2023-01-07 0.57578135 -0.15579551
#> 16 b A 2023-01-01 -0.30538839 -1.47075238
#> 17 b A 2023-01-02 0.00000000 1.00000000
#> 18 b A 2023-01-03 1.51178117 -0.47815006
#> 19 b A 2023-01-04 0.38984324 0.41794156
#> 20 b B 2023-01-02 -0.62124058 1.35867955
#> 21 b B 2023-01-03 -2.21469989 -0.10278773
#> 22 b B 2023-01-04 1.12493092 0.38767161
#> 23 b C 2023-01-05 -0.04493361 -0.05380504
#> 24 b C 2023-01-06 0.00000000 1.00000000
#> 25 b C 2023-01-07 -0.01619026 -1.37705956
#> 26 b C 2023-01-08 0.94383621 -0.41499456
Any help?
I would be also interested in understanding how I can make my function more appropriate to a correct use of data.table.
Update: eventually I did like this. Does it look data.table-like enough?
generate_dates <- function(date){
seq.Date(min(date), max(date), by = "day")
}
complete_ts_dt <- function(df, date, ..., fill = list()) {
dt <- as.data.table(df)
# Create a sequence of dates for each group combination
indx <- dt[, lapply(.SD, generate_dates), by = c(...), .SDcols = date]
# Perform a join on the original data.table to fill in missing dates
dt <- dt[indx, on = c(date, ...)]
# Replace NA values in the specified columns with the fill values
if (length(fill)) {
dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
}
# Convert the result back to a data.frame
out <- as.data.frame(dt)
return(out)
}
It works with my examples now.
答案1
得分: 1
Here's the translated code you provided:
如果你正在使用 `data.table`,请使用 `setDT` 将 `df` 转换为 `data.table`,然后保持不变。没有必要将函数输出再转换为 `data.frame`。
连接是一个不错的方法。尽管如此,我不认为需要使用 `lapply` 或一个辅助函数,而且 `NA` 填充可以简化。
complete_ts_dt1 <- function(df, date, ..., fill = list()) {
# 创建一个包含所有日期的 data.table
dt <- setnames(
setDT(df)[
,.(date = seq(min(.SD[[1]]), max(.SD[[1]]), "day")),
c(...), .SDcols = date # 按照 ...
], "date", date
)
# 将原始数据连接到包含所有缺失日期的 data.table
df[dt, on = c(..., date)][
is.na(get(names(fill)[[1]])), names(fill) := fill # NA 填充
]
}
另一种方法:
complete_ts_dt2 <- function(df, date, ..., fill = list()) {
setorderv( # 排序生成的 data.table
rbindlist( # 将原始数据绑定到包含缺失日期的 data.table 上
list(
setDT(df),
df[ # 创建一个包含缺失日期的 data.table
,.(
date = as.Date(
sequence(as.integer(diff(.SD[[1]]) - 1L), .SD[[1]][-.N] + 1L)
)
), c(...), .SDcols = date # 按照 ...
][,names(fill) := fill]
), use.names = FALSE
), c(..., date) # 按照 ... 和日期排序
)
}
data.table 在性能方面表现出色。计时:
microbenchmark::microbenchmark(
complete_ts_dt1 = complete_ts_dt1(df, date = "date", "group1", "group2", fill = list(x = 0, y = 1)),
complete_ts_dt2 = complete_ts_dt2(df, date = "date", "group1", "group2", fill = list(x = 0, y = 1)),
complete_ts_tv = complete_ts_tv(df, date, group1, group2, fill = list(x = 0, y = 1)),
times = 10
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> complete_ts_dt1 2.6638 2.7706 4.18143 2.93130 3.3745 9.7147 10
#> complete_ts_dt2 1.4248 1.5499 2.31117 1.89895 2.0148 6.8276 10
#> complete_ts_tv 20.3337 20.7733 24.21463 24.83460 26.8032 28.2474 10
I've provided the translation of the code you provided, focusing on the code portions and omitting the explanation and additional information. If you have any specific questions or need further assistance with this code, please let me know.
英文:
If you're working with data.table
, use setDT
to convert df
to a data.table
by reference and then leave it that way. No reason to convert the function output back to a data.frame
.
The join is a good approach. Although, I don't see the need for lapply
or a helper function, and the NA
fill can be simplified.
complete_ts_dt1 <- function(df, date, ..., fill = list()) {
# create a data.table with all dates filled in
dt <- setnames(
setDT(df)[
,.(date = seq(min(.SD[[1]]), max(.SD[[1]]), "day")),
c(...), .SDcols = date # group by ...
], "date", date
)
# join the original data to a data.table with all the missing dates filled in
df[dt, on = c(..., date)][
is.na(get(names(fill)[[1]])), names(fill) := fill # NA fill
]
}
An alternative approach:
complete_ts_dt2 <- function(df, date, ..., fill = list()) {
setorderv( # sort the resulting data.table
rbindlist( # bind the original data to a data.table with the missing dates
list(
setDT(df),
df[ # create a data.table with the missing dates
,.(
date = as.Date(
sequence(as.integer(diff(.SD[[1]]) - 1L), .SD[[1]][-.N] + 1L)
)
), c(...), .SDcols = date # group by ...
][,names(fill) := fill]
), use.names = FALSE
), c(..., date) # sort by ... and date
)
}
data.table
shines especially when performance is a concern. Timings:
microbenchmark::microbenchmark(
complete_ts_dt1 = complete_ts_dt1(df, date = "date", "group1", "group2", fill = list(x = 0, y = 1)),
complete_ts_dt2 = complete_ts_dt2(df, date = "date", "group1", "group2", fill = list(x = 0, y = 1)),
complete_ts_tv = complete_ts_tv(df, date, group1, group2, fill = list(x = 0, y = 1)),
times = 10
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> complete_ts_dt1 2.6638 2.7706 4.18143 2.93130 3.3745 9.7147 10
#> complete_ts_dt2 1.4248 1.5499 2.31117 1.89895 2.0148 6.8276 10
#> complete_ts_tv 20.3337 20.7733 24.21463 24.83460 26.8032 28.2474 10
Here's a full reprex to show that all the results are the same:
library(tidyverse)
library(data.table)
set.seed(1)
df <- data.frame(group1 = rep(c("a", "b"), each = 9),
group2 = rep(c("A", "B", "C"), each = 3),
date = as.Date("2023-01-01") + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
x = rnorm(18), y = rnorm(18))
complete_ts_tv <- function(df, date, ..., fill = list()) {
df |>
group_by(...) |>
complete({{date}} := seq.Date(min({{date}}), max({{date}}), "day"), fill = fill) |>
ungroup() |>
as.data.frame()
}
generate_dates <- function(date) seq.Date(min(date), max(date), by = "day")
complete_ts_dt <- function(df, date, ..., fill = list()) {
dt <- as.data.table(df)
indx <- dt[, lapply(.SD, generate_dates), by = c(...), .SDcols = date]
dt <- dt[indx, on = c(date, ...)]
if (length(fill)) {
dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
}
dt
}
complete_ts_dt1 <- function(df, date, ..., fill = list()) {
# create a data.table with all dates filled in
dt <- setnames(
setDT(df)[
,.(date = seq(min(.SD[[1]]), max(.SD[[1]]), "day")),
c(...), .SDcols = date # group by ...
], "date", date
)
# join the original data to a data.table with all the missing dates filled in
df[dt, on = c(..., date)][
is.na(get(names(fill)[[1]])), names(fill) := fill # NA fill
]
}
complete_ts_dt2 <- function(df, date, ..., fill = list()) {
setorderv( # sort the resulting data.table
rbindlist( # bind the original data to a data.table with the missing dates
list(
setDT(df),
df[ # create a data.table with the missing dates
,.(
date = as.Date(
sequence(as.integer(diff(.SD[[1]]) - 1L), .SD[[1]][-.N] + 1L)
)
), c(...), .SDcols = date # group by ...
][,names(fill) := fill]
), use.names = FALSE
), c(..., date) # sort by ... and date
)
}
complete_ts_dt2(df, "date", "group1", "group2", fill = list(x = 0, y = 1))[]
#> group1 group2 date x y
#> 1: a A 2023-01-02 -0.62645381 0.82122120
#> 2: a A 2023-01-03 0.18364332 0.59390132
#> 3: a A 2023-01-04 0.00000000 1.00000000
#> 4: a A 2023-01-05 0.00000000 1.00000000
#> 5: a A 2023-01-06 -0.83562861 0.91897737
#> 6: a B 2023-01-01 1.59528080 0.78213630
#> 7: a B 2023-01-02 0.00000000 1.00000000
#> 8: a B 2023-01-03 0.00000000 1.00000000
#> 9: a B 2023-01-04 0.32950777 0.07456498
#> 10: a B 2023-01-05 -0.82046838 -1.98935170
#> 11: a C 2023-01-03 0.48742905 0.61982575
#> 12: a C 2023-01-04 0.00000000 1.00000000
#> 13: a C 2023-01-05 0.73832471 -0.05612874
#> 14: a C 2023-01-06 0.00000000 1.00000000
#> 15: a C 2023-01-07 0.57578135 -0.15579551
#> 16: b A 2023-01-01 -0.30538839 -1.47075238
#> 17: b A 2023-01-02 0.00000000 1.00000000
#> 18: b A 2023-01-03 1.51178117 -0.47815006
#> 19: b A 2023-01-04 0.38984324 0.41794156
#> 20: b B 2023-01-02 -0.62124058 1.35867955
#> 21: b B 2023-01-03 -2.21469989 -0.10278773
#> 22: b B 2023-01-04 1.12493092 0.38767161
#> 23: b C 2023-01-05 -0.04493361 -0.05380504
#> 24: b C 2023-01-06 0.00000000 1.00000000
#> 25: b C 2023-01-07 -0.01619026 -1.37705956
#> 26: b C 2023-01-08 0.94383621 -0.41499456
#> group1 group2 date x y
res <- list(
setDT(complete_ts_tv(df, date, group1, group2, fill = list(x = 0, y = 1))),
complete_ts_dt(df, "date", "group1", "group2", fill = list(x = 0, y = 1)),
complete_ts_dt1(df, "date", "group1", "group2", fill = list(x = 0, y = 1)),
complete_ts_dt2(df, "date", "group1", "group2", fill = list(x = 0, y = 1))
)
identical(res[-4], res[-1])
#> [1] TRUE
Created on 2023-08-29 with reprex v2.0.2
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论