使用基于data.table的函数完成time.series

huangapple go评论107阅读模式
英文:

Complete time.series using a function based on data.table

问题

我试图学习更多关于data.table以及正确的使用方式。

可复现的示例

这是我的数据:

set.seed(1)
df <- data.frame(group1 = rep(c("a", "b"), each = 9), 
                 group2 = rep(c("A", "B", "C"), each = 3),
                 date   = as.Date("2023-01-01") + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
                 x = rnorm(18), y = rnorm(18))

我想要在每个组中添加缺失的日期。每个组都以其自己的最大日期和最小日期开始和结束。每个组由group1group2定义。

任务

使用Tidyverse,我会这样做:

fill <- list(x=0, y=1)

out1 <- df %>%
  dplyr::group_by(group1, group2) %>%
  tidyr::complete(date = seq.Date(min(date), max(date), "day"), fill = fill) %>%
  dplyr::ungroup() %>%
  as.data.frame()

使用data.table(我认为)解决方案如下:

library(data.table)
fill <- list(x=0, y=1)
dt <- as.data.table(df)
indx <- dt[, .(date = seq(min(date), max(date), "day")), by = list(group1, group2)]
dt <- dt[indx, on = .(date, group1, group2)]
if (length(fill)) dt[, names(fill) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols=names(fill)]
out2 <- as.data.frame(dt)

我得到相同的结果:

identical(out1, out2)
# TRUE

将任务制作成函数

现在,我想创建一个函数,该函数接受数据框、日期列、多个分组列和多个要填充的列作为输入。

使用Tidyverse,我会这样做:

complete_ts_tv <- function(df, date, ..., fill = list()){
  
  df %>%
    dplyr::group_by(...) %>%
    tidyr::complete({{date}} := seq.Date(min({{date}}), max({{date}}), "day"), fill = fill) %>%
    dplyr::ungroup() %>%
    as.data.frame()
  
}

identical(out1, complete_ts_tv(df, date, group1, group2, fill = fill))
# TRUE

如何使用data.table创建类似的函数?

我尝试过的方法

我找到了一个解决方案,但如果日期列实际上被称为“date”,则无法正常工作。

library(data.table)
set.seed(1)
df <- data.frame(group1 = rep(c("a", "b"), each = 9), 
                 group2 = rep(c("A", "B", "C"), each = 3),
                 date   = as.Date("2023-01-01") + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
                 x = rnorm(18), y = rnorm(18))
fill <- list(x=0, y=1)

complete_ts_dt <- function(df, date, ..., fill = list()) {
  
  dt <- as.data.table(df)
  
  # 为每个组合创建日期序列
  indx <- dt[, .(date = seq(min(get(date)), max(get(date)), "day")), by = c(...)]
  
  setnames(indx, "date", date)
  
  # 在原始数据表上执行联接以填充缺失的日期
  dt <- dt[indx, on = c(date, ...)]
  
  # 用填充值替换指定列中的NA值
  if (length(fill)) {
    dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
  }
  
  # 将结果转换回数据框
  out <- as.data.frame(dt)
  
  return(out)
}

# 无法正常工作!
complete_ts_dt(df, 
            date = "date", "group1", "group2", fill = list(x = 0, y = 1))
# Error in get(date): invalid first argument

# 可以正常工作!
complete_ts_dt(df = dplyr::rename(df, DATE = date),
            date = "DATE", "group1", "group2", fill = list(x = 0, y = 1))

有什么帮助吗?
我还对如何使我的函数更适合正确使用data.table 感兴趣。

更新:最终,我这样做了。看起来足够符合data.table吗?

generate_dates <- function(date){

  seq.Date(min(date), max(date), by = "day")

}

complete_ts_dt <- function(df, date, ..., fill = list()) {
  
  dt <- as.data.table(df)

  # 为每个组合创建日期序列
  indx <- dt[, lapply(.SD, generate_dates), by = c(...), .SDcols = date]

  # 在原始数据表上执行联接以填充缺失的日期
  dt <- dt[indx, on = c(date, ...)]

  # 用填充值替换指定列中的NA值
  if (length(fill)) {
    dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
  }

  # 将结果转换回数据框
  out <- as.data.frame(dt)
  
  return(out)
}

现在它可以正常工作了。

英文:

I'm trying to learn more about data.table and the proper way to use it.

Reproducible example

This is my data:

set.seed(1)
df &lt;- data.frame(group1 = rep(c(&quot;a&quot;, &quot;b&quot;), each = 9), 
                 group2 = rep(c(&quot;A&quot;, &quot;B&quot;, &quot;C&quot;), each = 3),
                 date   = as.Date(&quot;2023-01-01&quot;) + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
                 x = rnorm(18), y = rnorm(18))

I want to add the missing dates in my dataset in each group.
Each group starts and ends at its own max and min date. Each group is defined by group1 and group2.

Task

With Tidyverse I would do the following:

fill &lt;- list(x=0, y=1)

out1 &lt;- df |&gt; 
  dplyr::group_by(group1, group2) |&gt; 
  tidyr::complete(date = seq.Date(min(date), max(date), &quot;day&quot;), fill = fill) |&gt; 
  dplyr::ungroup() |&gt; 
  as.data.frame()

With data.table (I think) the solution would be:

library(data.table)
fill &lt;- list(x=0, y=1)
dt &lt;- as.data.table(df)
indx &lt;- dt[, .(date = seq(min(date), max(date), &quot;day&quot;)), by = list(group1, group2)]
dt &lt;- dt[indx, on = .(date, group1, group2)]
if (length(fill)) dt[, names(fill) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols=names(fill)]
out2 &lt;- as.data.frame(dt)

I get the same result:

identical(out1, out2)
# TRUE

Make a function out of the task

Now, I want to create a function that takes in input a data.frame, a date column, multiple grouping columns and multiple columns to be filled.

With Tidyverse I would do:

complete_ts_tv &lt;- function(df, date, ..., fill = list()){
  
  df |&gt; 
    dplyr::group_by(...) |&gt; 
    tidyr::complete({{date}} := seq.Date(min({{date}}), max({{date}}), &quot;day&quot;), fill = fill) |&gt; 
    dplyr::ungroup() |&gt; 
    as.data.frame()
  
}

identical(out1, complete_ts_tv(df, date, group1, group2, fill = fill))
# TRUE

How do I create a similar function with data.table?

What I tried so far

I found a solution but it doesn't work if the date column is actually called "date".

library(data.table)
set.seed(1)
df &lt;- data.frame(group1 = rep(c(&quot;a&quot;, &quot;b&quot;), each = 9), 
                 group2 = rep(c(&quot;A&quot;, &quot;B&quot;, &quot;C&quot;), each = 3),
                 date   = as.Date(&quot;2023-01-01&quot;) + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
                 x = rnorm(18), y = rnorm(18))
fill &lt;- list(x=0, y=1)

complete_ts_dt &lt;- function(df, date, ..., fill = list()) {
  
  dt &lt;- as.data.table(df)
  
  # Create a sequence of dates for each group combination
  indx &lt;- dt[, .(date = seq(min(get(date)), max(get(date)), &quot;day&quot;)), by = c(...)]
  
  setnames(indx, &quot;date&quot;, date)
  
  # Perform a join on the original data.table to fill in missing dates
  dt &lt;- dt[indx, on = c(date, ...)]
  
  # Replace NA values in the specified columns with the fill values
  if (length(fill)) {
    dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
  }
  
  # Convert the result back to a data.frame
  out &lt;- as.data.frame(dt)
  
  return(out)
}

# doesnt work!
complete_ts_dt(df, 
            date = &quot;date&quot;, &quot;group1&quot;, &quot;group2&quot;, fill = list(x = 0, y = 1))
#&gt; Error in get(date): invalid first argument

# works!
complete_ts_dt(df = dplyr::rename(df, DATE = date),
            date = &quot;DATE&quot;, &quot;group1&quot;, &quot;group2&quot;, fill = list(x = 0, y = 1))
#&gt;    group1 group2       DATE           x           y
#&gt; 1       a      A 2023-01-02 -0.62645381  0.82122120
#&gt; 2       a      A 2023-01-03  0.18364332  0.59390132
#&gt; 3       a      A 2023-01-04  0.00000000  1.00000000
#&gt; 4       a      A 2023-01-05  0.00000000  1.00000000
#&gt; 5       a      A 2023-01-06 -0.83562861  0.91897737
#&gt; 6       a      B 2023-01-01  1.59528080  0.78213630
#&gt; 7       a      B 2023-01-02  0.00000000  1.00000000
#&gt; 8       a      B 2023-01-03  0.00000000  1.00000000
#&gt; 9       a      B 2023-01-04  0.32950777  0.07456498
#&gt; 10      a      B 2023-01-05 -0.82046838 -1.98935170
#&gt; 11      a      C 2023-01-03  0.48742905  0.61982575
#&gt; 12      a      C 2023-01-04  0.00000000  1.00000000
#&gt; 13      a      C 2023-01-05  0.73832471 -0.05612874
#&gt; 14      a      C 2023-01-06  0.00000000  1.00000000
#&gt; 15      a      C 2023-01-07  0.57578135 -0.15579551
#&gt; 16      b      A 2023-01-01 -0.30538839 -1.47075238
#&gt; 17      b      A 2023-01-02  0.00000000  1.00000000
#&gt; 18      b      A 2023-01-03  1.51178117 -0.47815006
#&gt; 19      b      A 2023-01-04  0.38984324  0.41794156
#&gt; 20      b      B 2023-01-02 -0.62124058  1.35867955
#&gt; 21      b      B 2023-01-03 -2.21469989 -0.10278773
#&gt; 22      b      B 2023-01-04  1.12493092  0.38767161
#&gt; 23      b      C 2023-01-05 -0.04493361 -0.05380504
#&gt; 24      b      C 2023-01-06  0.00000000  1.00000000
#&gt; 25      b      C 2023-01-07 -0.01619026 -1.37705956
#&gt; 26      b      C 2023-01-08  0.94383621 -0.41499456

Any help?
I would be also interested in understanding how I can make my function more appropriate to a correct use of data.table.


Update: eventually I did like this. Does it look data.table-like enough?

generate_dates &lt;- function(date){

  seq.Date(min(date), max(date), by = &quot;day&quot;)

}

complete_ts_dt &lt;- function(df, date, ..., fill = list()) {
  
  dt &lt;- as.data.table(df)

  # Create a sequence of dates for each group combination
  indx &lt;- dt[, lapply(.SD, generate_dates), by = c(...), .SDcols = date]

  # Perform a join on the original data.table to fill in missing dates
  dt &lt;- dt[indx, on = c(date, ...)]

  # Replace NA values in the specified columns with the fill values
  if (length(fill)) {
    dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
  }

  # Convert the result back to a data.frame
  out &lt;- as.data.frame(dt)
  
  return(out)
}

It works with my examples now.

答案1

得分: 1

Here's the translated code you provided:

如果你正在使用 `data.table`,请使用 `setDT` 将 `df` 转换为 `data.table`,然后保持不变。没有必要将函数输出再转换为 `data.frame`。

连接是一个不错的方法。尽管如此,我不认为需要使用 `lapply` 或一个辅助函数,而且 `NA` 填充可以简化。

complete_ts_dt1 <- function(df, date, ..., fill = list()) {
  # 创建一个包含所有日期的 data.table
  dt <- setnames(
    setDT(df)[
      ,.(date = seq(min(.SD[[1]]), max(.SD[[1]]), "day")),
      c(...), .SDcols = date # 按照 ...
    ], "date", date
  )
  
  # 将原始数据连接到包含所有缺失日期的 data.table
  df[dt, on = c(..., date)][
    is.na(get(names(fill)[[1]])), names(fill) := fill # NA 填充
  ]
}

另一种方法:

complete_ts_dt2 <- function(df, date, ..., fill = list()) {
  setorderv( # 排序生成的 data.table
    rbindlist( # 将原始数据绑定到包含缺失日期的 data.table 上
      list(
        setDT(df),
        df[ # 创建一个包含缺失日期的 data.table
          ,.(
            date = as.Date(
              sequence(as.integer(diff(.SD[[1]]) - 1L), .SD[[1]][-.N] + 1L)
            )
          ), c(...), .SDcols = date # 按照 ...
        ][,names(fill) := fill]
      ), use.names = FALSE
    ), c(..., date) # 按照 ... 和日期排序
  )
}

data.table 在性能方面表现出色。计时:

microbenchmark::microbenchmark(
  complete_ts_dt1 = complete_ts_dt1(df, date = "date", "group1", "group2", fill = list(x = 0, y = 1)),
  complete_ts_dt2 = complete_ts_dt2(df, date = "date", "group1", "group2", fill = list(x = 0, y = 1)),
  complete_ts_tv = complete_ts_tv(df, date, group1, group2, fill = list(x = 0, y = 1)),
  times = 10
)
#> Unit: milliseconds
#>             expr     min      lq     mean   median      uq     max neval
#>  complete_ts_dt1  2.6638  2.7706  4.18143  2.93130  3.3745  9.7147    10
#>  complete_ts_dt2  1.4248  1.5499  2.31117  1.89895  2.0148  6.8276    10
#>   complete_ts_tv 20.3337 20.7733 24.21463 24.83460 26.8032 28.2474    10

I've provided the translation of the code you provided, focusing on the code portions and omitting the explanation and additional information. If you have any specific questions or need further assistance with this code, please let me know.

英文:

If you're working with data.table, use setDT to convert df to a data.table by reference and then leave it that way. No reason to convert the function output back to a data.frame.

The join is a good approach. Although, I don't see the need for lapply or a helper function, and the NA fill can be simplified.

complete_ts_dt1 &lt;- function(df, date, ..., fill = list()) {
# create a data.table with all dates filled in
dt &lt;- setnames(
setDT(df)[
,.(date = seq(min(.SD[[1]]), max(.SD[[1]]), &quot;day&quot;)),
c(...), .SDcols = date # group by ...
], &quot;date&quot;, date
)
# join the original data to a data.table with all the missing dates filled in
df[dt, on = c(..., date)][
is.na(get(names(fill)[[1]])), names(fill) := fill # NA fill
]
}

An alternative approach:

complete_ts_dt2 &lt;- function(df, date, ..., fill = list()) {
setorderv( # sort the resulting data.table
rbindlist( # bind the original data to a data.table with the missing dates
list(
setDT(df),
df[ # create a data.table with the missing dates
,.(
date = as.Date(
sequence(as.integer(diff(.SD[[1]]) - 1L), .SD[[1]][-.N] + 1L)
)
), c(...), .SDcols = date # group by ...
][,names(fill) := fill]
), use.names = FALSE
), c(..., date) # sort by ... and date
)
}

data.table shines especially when performance is a concern. Timings:

microbenchmark::microbenchmark(
complete_ts_dt1 = complete_ts_dt1(df, date = &quot;date&quot;, &quot;group1&quot;, &quot;group2&quot;, fill = list(x = 0, y = 1)),
complete_ts_dt2 = complete_ts_dt2(df, date = &quot;date&quot;, &quot;group1&quot;, &quot;group2&quot;, fill = list(x = 0, y = 1)),
complete_ts_tv = complete_ts_tv(df, date, group1, group2, fill = list(x = 0, y = 1)),
times = 10
)
#&gt; Unit: milliseconds
#&gt;             expr     min      lq     mean   median      uq     max neval
#&gt;  complete_ts_dt1  2.6638  2.7706  4.18143  2.93130  3.3745  9.7147    10
#&gt;  complete_ts_dt2  1.4248  1.5499  2.31117  1.89895  2.0148  6.8276    10
#&gt;   complete_ts_tv 20.3337 20.7733 24.21463 24.83460 26.8032 28.2474    10

Here's a full reprex to show that all the results are the same:

library(tidyverse)
library(data.table)
set.seed(1)
df &lt;- data.frame(group1 = rep(c(&quot;a&quot;, &quot;b&quot;), each = 9), 
group2 = rep(c(&quot;A&quot;, &quot;B&quot;, &quot;C&quot;), each = 3),
date   = as.Date(&quot;2023-01-01&quot;) + c(1,2,5,0,3,4,2,4,6,0,2,3,1,2,3,4,6,7),
x = rnorm(18), y = rnorm(18))
complete_ts_tv &lt;- function(df, date, ..., fill = list()) {
df |&gt; 
group_by(...) |&gt; 
complete({{date}} := seq.Date(min({{date}}), max({{date}}), &quot;day&quot;), fill = fill) |&gt; 
ungroup() |&gt; 
as.data.frame()
}
generate_dates &lt;- function(date) seq.Date(min(date), max(date), by = &quot;day&quot;)
complete_ts_dt &lt;- function(df, date, ..., fill = list()) {
dt &lt;- as.data.table(df)
indx &lt;- dt[, lapply(.SD, generate_dates), by = c(...), .SDcols = date]
dt &lt;- dt[indx, on = c(date, ...)]
if (length(fill)) {
dt[, (names(fill)) := Map(replace, .SD, lapply(.SD, is.na), fill), .SDcols = names(fill)]
}
dt
}
complete_ts_dt1 &lt;- function(df, date, ..., fill = list()) {
# create a data.table with all dates filled in
dt &lt;- setnames(
setDT(df)[
,.(date = seq(min(.SD[[1]]), max(.SD[[1]]), &quot;day&quot;)),
c(...), .SDcols = date # group by ...
], &quot;date&quot;, date
)
# join the original data to a data.table with all the missing dates filled in
df[dt, on = c(..., date)][
is.na(get(names(fill)[[1]])), names(fill) := fill # NA fill
]
}
complete_ts_dt2 &lt;- function(df, date, ..., fill = list()) {
setorderv( # sort the resulting data.table
rbindlist( # bind the original data to a data.table with the missing dates
list(
setDT(df),
df[ # create a data.table with the missing dates
,.(
date = as.Date(
sequence(as.integer(diff(.SD[[1]]) - 1L), .SD[[1]][-.N] + 1L)
)
), c(...), .SDcols = date # group by ...
][,names(fill) := fill]
), use.names = FALSE
), c(..., date) # sort by ... and date
)
}
complete_ts_dt2(df, &quot;date&quot;, &quot;group1&quot;, &quot;group2&quot;, fill = list(x = 0, y = 1))[]
#&gt;     group1 group2       date           x           y
#&gt;  1:      a      A 2023-01-02 -0.62645381  0.82122120
#&gt;  2:      a      A 2023-01-03  0.18364332  0.59390132
#&gt;  3:      a      A 2023-01-04  0.00000000  1.00000000
#&gt;  4:      a      A 2023-01-05  0.00000000  1.00000000
#&gt;  5:      a      A 2023-01-06 -0.83562861  0.91897737
#&gt;  6:      a      B 2023-01-01  1.59528080  0.78213630
#&gt;  7:      a      B 2023-01-02  0.00000000  1.00000000
#&gt;  8:      a      B 2023-01-03  0.00000000  1.00000000
#&gt;  9:      a      B 2023-01-04  0.32950777  0.07456498
#&gt; 10:      a      B 2023-01-05 -0.82046838 -1.98935170
#&gt; 11:      a      C 2023-01-03  0.48742905  0.61982575
#&gt; 12:      a      C 2023-01-04  0.00000000  1.00000000
#&gt; 13:      a      C 2023-01-05  0.73832471 -0.05612874
#&gt; 14:      a      C 2023-01-06  0.00000000  1.00000000
#&gt; 15:      a      C 2023-01-07  0.57578135 -0.15579551
#&gt; 16:      b      A 2023-01-01 -0.30538839 -1.47075238
#&gt; 17:      b      A 2023-01-02  0.00000000  1.00000000
#&gt; 18:      b      A 2023-01-03  1.51178117 -0.47815006
#&gt; 19:      b      A 2023-01-04  0.38984324  0.41794156
#&gt; 20:      b      B 2023-01-02 -0.62124058  1.35867955
#&gt; 21:      b      B 2023-01-03 -2.21469989 -0.10278773
#&gt; 22:      b      B 2023-01-04  1.12493092  0.38767161
#&gt; 23:      b      C 2023-01-05 -0.04493361 -0.05380504
#&gt; 24:      b      C 2023-01-06  0.00000000  1.00000000
#&gt; 25:      b      C 2023-01-07 -0.01619026 -1.37705956
#&gt; 26:      b      C 2023-01-08  0.94383621 -0.41499456
#&gt;     group1 group2       date           x           y
res &lt;- list(
setDT(complete_ts_tv(df, date, group1, group2, fill = list(x = 0, y = 1))),
complete_ts_dt(df, &quot;date&quot;, &quot;group1&quot;, &quot;group2&quot;, fill = list(x = 0, y = 1)),
complete_ts_dt1(df, &quot;date&quot;, &quot;group1&quot;, &quot;group2&quot;, fill = list(x = 0, y = 1)),
complete_ts_dt2(df, &quot;date&quot;, &quot;group1&quot;, &quot;group2&quot;, fill = list(x = 0, y = 1))
)
identical(res[-4], res[-1])
#&gt; [1] TRUE
Created on 2023-08-29 with reprex v2.0.2

huangapple
  • 本文由 发表于 2023年8月5日 02:24:00
  • 转载请务必保留本文链接:https://go.coder-hub.com/76838367.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定