2023年6月22日 16:25:39go评论163阅读模式

英文:

R data.table update join by reference within function

问题

我想在一个函数内对两个表执行更新连接（update join）。以下是一个不使用函数的示例：

# Load data.table, which provides the X[Y, on = ..., col := ...] syntax used below.
library(data.table)
# Xtest has keys c = 1..20; Ytest only has keys c = 1..10 plus a value column d.
Xtest <- data.table(a = rnorm(20), b = rnorm(20), c = 1:20)
Ytest <- data.table(c = 1:10, d = rnorm(10))

# Update join: rows of Xtest whose c matches a row of Ytest receive that row's
# d (written i.d) in the new column newcol; unmatched rows are left as NA.
Xtest[Ytest, on = .(c), newcol := i.d]

# > Xtest[Ytest, on = .(c), newcol := i.d]
# > Xtest
# a           b  c      newcol
# 1: -1.68473343 -0.74498296  1  0.35096663
# 2: -0.98461614  2.15317525  2 -1.33890396
# 3: -1.65427602  1.21183896  3  1.49641480
# 4: -0.65045253 -0.74609860  4 -0.03227097
# 5:  1.49058508  1.20315276  5  1.41580186
# 6: -0.31631871  0.68716871  6 -0.03671959
# 7:  1.35923085 -0.20082238  7 -2.27959124
# 8: -0.75649545  0.24058212  8  0.93770862
# 9:  0.22452260 -0.28212892  9 -0.02500419
# 10:  0.30209786  1.33697797 10  0.67729741
# 11:  0.88748221 -0.54421418 11          NA
# 12:  0.47207422 -0.28159382 12          NA
# 13: -1.17270475  0.83940750 13          NA
# 14: -2.02787820 -0.03672582 14          NA
# 15: -0.22187761  0.59137210 15          NA
# 16:  0.97750330 -0.27030756 16          NA
# 17:  0.22725940  0.54617488 17          NA
# 18:  0.94065525 -0.23482152 18          NA
# 19:  2.12049977  0.69920776 19          NA
# 20:  0.06192823  0.12262739 20          NA

# Xtest[, newcol := NULL]

我尝试将上述代码重写为一个函数，但Ycol参数似乎被隐藏了：

# Attempted wrapper around the update join. substitute() swaps the symbols X,
# Y, joinlist and newcol for the caller's expressions, but `i.Ycol` is one
# symbol of its own (not "i." followed by Ycol), so it is left unchanged and
# evaluation fails with "object 'i.Ycol' not found".
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
  eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol])) 
}

# > myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
#   +   eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol])) 
#   + }
# > myjoinfunction(Xtest, Ytest, list(c), D, d)
# Error in eval(jsub, SDenv, parent.frame()) : object 'i.Ycol' not found
# 8.
# eval(jsub, SDenv, parent.frame())
# 7.
# eval(jsub, SDenv, parent.frame())
# 6.
# `[.data.table`(Xtest, Ytest, on = list(c), `:=`(D, i.Ycol))
# 5.
# Xtest[Ytest, on = list(c), `:=`(D, i.Ycol)]
# 4.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 3.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 2.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 1.
# myjoinfunction(Xtest, Ytest, list(c), D, d)

如何使Ycol参数在函数内可见？

更新的示例：

要更加通用，这里是一个我想要为其编写函数的更新连接的新示例：

# Xtest: 20 rows covering every (c, d) pair for c in a..d and d in 1..5.
Xtest <- data.table(
a = rnorm(20), 
b = rnorm(20), 
c = rep(letters[1:4], rep(5, 4)), 
d = rep(1:5, 4)
)
# Ytest: only the (c, d) pairs for c in a..b, with value columns e and f.
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)), 
d = rep(1:5, 2), 
e = rnorm(10), 
f = rnorm(10)
)

# > Xtest[Ytest, on = .(c, d), `:=`(newcol1 = i.e, newcol2 = i.f) ]
# > Xtest
# a            b c d    newcol1    newcol2
# 1: -2.4939743 -0.200370619 a 1 -1.4934893 -1.0288955
# 2:  1.0188321 -1.182286508 a 2  1.3811712  0.9747131
# 3:  0.5217161 -0.152117649 a 3 -0.4168069  0.1218213
# 4: -0.1584167  0.583640353 a 4  0.4644738  1.7888567
# 5: -0.4271398  0.020067301 a 5  2.5279998  2.0919953
# 6: -1.7692909  0.250129040 b 1 -1.5964246 -1.0884861
# 7: -0.8899915  0.971742055 b 2  0.3011304  1.2629524
# 8: -0.4490363 -1.540005621 b 3 -0.7992208 -0.5155775
# 9: -0.5706488 -1.037077614 b 4  1.0058213  1.9787692
# 10: -0.0922679  1.444487848 b 5 -0.2893311 -0.6095043
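# 11:  0.9924810 -1.144513228 c 1         NA         NA
# 12:  1.2232591  1.503649791 c 2         NA         NA
# 13:  0.8751961  0.892765910 c 3         NA         NA
# 14:  0.9960554  0.499310073 c 4         NA         NA
# 15: -0.6184695  1.867985589 c 5         NA         NA
# 16:  0.6503936  0.422683211 d 1         NA         NA
# 17: -0.6160834 -1.585713893 d 2         NA         NA
# 18:  1.5949931 -0.544704857 d 3         NA         NA
# 19:  0.7232079 -0.006460518 d 4         NA         NA
# 20: -0.2824961  0.119585859 d 5         NA         NA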

<details>
<summary>英文:</summary>

I want to update join two tables within a function. Here is an example without using a function:

# Load data.table, which provides the X[Y, on = ..., col := ...] syntax used below.
library(data.table)
# Xtest has keys c = 1..20; Ytest only has keys c = 1..10 plus a value column d.
Xtest <- data.table(a = rnorm(20), b = rnorm(20), c = 1:20)
Ytest <- data.table(c = 1:10, d = rnorm(10))

# Update join: rows of Xtest whose c matches a row of Ytest receive that row's
# d (written i.d) in the new column newcol; unmatched rows are left as NA.
Xtest[Ytest, on = .(c), newcol := i.d]

> Xtest[Ytest, on = .(c), newcol := i.d]

> Xtest

a b c newcol

1: -1.68473343 -0.74498296 1 0.35096663

2: -0.98461614 2.15317525 2 -1.33890396

3: -1.65427602 1.21183896 3 1.49641480

4: -0.65045253 -0.74609860 4 -0.03227097

5: 1.49058508 1.20315276 5 1.41580186

6: -0.31631871 0.68716871 6 -0.03671959

7: 1.35923085 -0.20082238 7 -2.27959124

8: -0.75649545 0.24058212 8 0.93770862

9: 0.22452260 -0.28212892 9 -0.02500419

10: 0.30209786 1.33697797 10 0.67729741

11: 0.88748221 -0.54421418 11 NA

12: 0.47207422 -0.28159382 12 NA

13: -1.17270475 0.83940750 13 NA

14: -2.02787820 -0.03672582 14 NA

15: -0.22187761 0.59137210 15 NA

16: 0.97750330 -0.27030756 16 NA

17: 0.22725940 0.54617488 17 NA

18: 0.94065525 -0.23482152 18 NA

19: 2.12049977 0.69920776 19 NA

20: 0.06192823 0.12262739 20 NA

Xtest[, newcol := NULL]



I tried to recast the above code into a function, but the Ycol argument seems to be hidden:

# Attempted wrapper around the update join. substitute() swaps the symbols X,
# Y, joinlist and newcol for the caller's expressions, but `i.Ycol` is one
# symbol of its own (not "i." followed by Ycol), so it is left unchanged and
# evaluation fails with "object 'i.Ycol' not found".
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))
}

> myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {

+ eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))

+ }

> myjoinfunction(Xtest, Ytest, list(c), D, d)

Error in eval(jsub, SDenv, parent.frame()) : object 'i.Ycol' not found

8. eval(jsub, SDenv, parent.frame())

7. eval(jsub, SDenv, parent.frame())

6. `[.data.table`(Xtest, Ytest, on = list(c), `:=`(D, i.Ycol))

5. Xtest[Ytest, on = list(c), `:=`(D, i.Ycol)]

4. eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))

3. eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))

2. eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))

1. myjoinfunction(Xtest, Ytest, list(c), D, d)



How can I make the Ycol argument visible within the function?

**UPDATED EXAMPLE**

To be more general, here is a new example of an update join I would like to write a function for:

# Xtest: 20 rows covering every (c, d) pair for c in a..d and d in 1..5.
Xtest <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4)
)
# Ytest: only the (c, d) pairs for c in a..b, with value columns e and f.
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10)
)

> Xtest[Ytest, on = .(c, d), `:=`(newcol1 = i.e, newcol2 = i.f) ]

> Xtest

a b c d newcol1 newcol2

1: -2.4939743 -0.200370619 a 1 -1.4934893 -1.0288955

2: 1.0188321 -1.182286508 a 2 1.3811712 0.9747131

3: 0.5217161 -0.152117649 a 3 -0.4168069 0.1218213

4: -0.1584167 0.583640353 a 4 0.4644738 1.7888567

5: -0.4271398 0.020067301 a 5 2.5279998 2.0919953

6: -1.7692909 0.250129040 b 1 -1.5964246 -1.0884861

7: -0.8899915 0.971742055 b 2 0.3011304 1.2629524

8: -0.4490363 -1.540005621 b 3 -0.7992208 -0.5155775

9: -0.5706488 -1.037077614 b 4 1.0058213 1.9787692

10: -0.0922679 1.444487848 b 5 -0.2893311 -0.6095043

11: 0.9924810 -1.144513228 c 1 NA NA

12: 1.2232591 1.503649791 c 2 NA NA

13: 0.8751961 0.892765910 c 3 NA NA

14: 0.9960554 0.499310073 c 4 NA NA

15: -0.6184695 1.867985589 c 5 NA NA

16: 0.6503936 0.422683211 d 1 NA NA

17: -0.6160834 -1.585713893 d 2 NA NA

18: 1.5949931 -0.544704857 d 3 NA NA

19: 0.7232079 -0.006460518 d 4 NA NA

20: -0.2824961 0.119585859 d 5 NA NA


Using a suggestion from David Arenburg in the comments trips up when joinlist is a list of strings

# Variant that looks the Y column up by name with get(paste0("i.", Ycol)).
# It fails when joinlist is a list of strings such as list("c", "d"): `on`
# has to be an atomic vector of column names, e.g. c("c", "d").
# NOTE(review): `newcol :=` with a bare symbol presumably names the new column
# literally "newcol" instead of using the argument's value -- confirm.
myjoinfunction1 <- function(X, Y, joinlist, newcol, Ycol) X[Y, on = joinlist, newcol:= get(paste0("i.", Ycol))]

> myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")

Error in .parse_on(substitute(on), isnull_inames) :

'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.

5. stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")

4. .parse_on(substitute(on), isnull_inames)

3. `[.data.table`(X, Y, on = joinlist, `:=`(newcol, get(paste0("i.",

Ycol))))

2. X[Y, on = joinlist, `:=`(newcol, get(paste0("i.", Ycol)))]

1. myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")



</details>


# 答案1
**得分**: 2

对于更一般的第二个示例：

```R
# Update join by reference: copy columns of Y into X for the rows that match
# on the join columns.
#
# X, Y      data.tables; X is modified in place.
# joinlist  character vector of join columns, passed to `on`, e.g. c("c", "d").
# cols      character vector of Y columns to copy. Its names are the new
#           column names in X, e.g. c(newcol1 = "e"); an element without a
#           name keeps the Y column's own name.
#
# Returns the value of the data.table `:=` call on X. Stops when `cols` is not
# a non-empty character vector or names a column that Y does not have.
f <- function(X, Y, joinlist, cols) {
  if (!is.character(cols) || length(cols) == 0L || anyNA(cols)) {
    stop("`cols` must be a non-empty character vector of Y column names",
         call. = FALSE)
  }
  absent <- setdiff(cols, names(Y))
  if (length(absent) > 0L) {
    stop("column(s) not found in Y: ", paste(absent, collapse = ", "),
         call. = FALSE)
  }
  # Unnamed or partially named `cols`: fall back to the Y column name so that
  # names(cols) is always a usable left-hand side for `:=`.
  if (is.null(names(cols))) {
    names(cols) <- cols
  } else {
    unnamed <- is.na(names(cols)) | !nzchar(names(cols))
    names(cols)[unnamed] <- cols[unnamed]
  }
  # Inside the join, Y's columns are addressed with the "i." prefix; mget()
  # fetches them by name as a list, one element per new column.
  X[Y, on = joinlist, names(cols) := mget(sprintf("i.%s", cols))]
}

```

用法：

# Reproducible demo data: Xtest2 holds the key pairs (c, d) for letters a..d,
# Ytest2 only covers letters a and b and carries the value columns e and f.
set.seed(1)
Xtest2 <- data.table(
  a = rnorm(20),
  b = rnorm(20),
  c = rep(letters[1:4], each = 5),
  d = rep(1:5, times = 4)
)
Ytest2 <- data.table(
  c = rep(letters[1:2], each = 5),
  d = rep(1:5, times = 2),
  e = rnorm(10),
  f = rnorm(10)
)

# Add newcol1/newcol2 to Xtest2 from Ytest2's e/f, matching rows on c and d.
f(Xtest2, Ytest2, c("c", "d"), c(newcol1 = "e", newcol2 = "f"))

附注：在向data.table中添加列时有一些要注意的特殊情况（参考超出“列槽”的情况；向从磁盘加载的表中添加列）。

英文:

For the more general second example:

# Update join by reference: copy columns of Y into X for the rows that match
# on the join columns.
#
# X, Y      data.tables; X is modified in place.
# joinlist  character vector of join columns, passed to `on`, e.g. c("c", "d").
# cols      character vector of Y columns to copy. Its names are the new
#           column names in X, e.g. c(newcol1 = "e"); an element without a
#           name keeps the Y column's own name.
#
# Returns the value of the data.table `:=` call on X. Stops when `cols` is not
# a non-empty character vector or names a column that Y does not have.
f <- function(X, Y, joinlist, cols) {
  if (!is.character(cols) || length(cols) == 0L || anyNA(cols)) {
    stop("`cols` must be a non-empty character vector of Y column names",
         call. = FALSE)
  }
  absent <- setdiff(cols, names(Y))
  if (length(absent) > 0L) {
    stop("column(s) not found in Y: ", paste(absent, collapse = ", "),
         call. = FALSE)
  }
  # Unnamed or partially named `cols`: fall back to the Y column name so that
  # names(cols) is always a usable left-hand side for `:=`.
  if (is.null(names(cols))) {
    names(cols) <- cols
  } else {
    unnamed <- is.na(names(cols)) | !nzchar(names(cols))
    names(cols)[unnamed] <- cols[unnamed]
  }
  # Inside the join, Y's columns are addressed with the "i." prefix; mget()
  # fetches them by name as a list, one element per new column.
  X[Y, on = joinlist, names(cols) := mget(sprintf("i.%s", cols))]
}

Usage:

# Reproducible demo data: Xtest2 holds the key pairs (c, d) for letters a..d,
# Ytest2 only covers letters a and b and carries the value columns e and f.
set.seed(1)
Xtest2 <- data.table(
  a = rnorm(20),
  b = rnorm(20),
  c = rep(letters[1:4], rep(5, 4)),
  d = rep(1:5, 4)
)
Ytest2 <- data.table(
  c = rep(letters[1:2], rep(5, 2)),
  d = rep(1:5, 2),
  e = rnorm(10),
  f = rnorm(10)
)

# Add newcol1/newcol2 to Xtest2 from Ytest2's e/f, matching rows on c and d.
f(Xtest2, Ytest2, c("c", "d"), c(newcol1 = "e", newcol2 = "f"))

Side note: There are some edge cases to watch out for when adding columns in a data.table (running out of "column slots"; adding to a table loaded from disk)

答案2

得分: 1

这是基于评论的回答。在data.table中，eval和substitute可以应用于数据表的不同位置，而不是将整个数据表包装在eval(substitute(DT))中。此外，当使用字符串而不是表达式作为substitute的参数时，不需要使用eval。

# Update join by reference using unquoted column names (for interactive use).
#
# X, Y      data.tables; X is modified in place.
# joinlist  join column(s): a bare name (id), a string ("id"), or a call
#           wrapping several of them, e.g. list(c, d), .(c, d) or c("c", "d").
# newcol    name(s) of the column(s) to create in X, written the same way.
# Ycol      name(s) of the Y column(s) to copy, written the same way and in
#           the same order as newcol.
#
# These three arguments are never evaluated; only their expressions are
# inspected, so the names need not exist as variables. Stops when newcol and
# Ycol name a different number of columns.
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
  # Turn a captured argument expression into a character vector of names.
  # A call contributes one name per argument; a lone symbol or string
  # contributes itself. This replaces as.character(expr)[-1], which silently
  # dropped a single bare name.
  expr_names <- function(expr) {
    parts <- if (is.call(expr)) as.list(expr)[-1L] else list(expr)
    vapply(
      parts,
      function(p) if (is.character(p)) p else paste(deparse(p), collapse = ""),
      character(1L)
    )
  }

  join_cols <- expr_names(substitute(joinlist))
  new_names <- unname(expr_names(substitute(newcol)))
  src_names <- paste0("i.", expr_names(substitute(Ycol)))

  if (length(join_cols) == 0L) {
    stop("`joinlist` must name at least one join column", call. = FALSE)
  }
  if (length(new_names) != length(src_names)) {
    stop("`newcol` and `Ycol` must name the same number of columns",
         call. = FALSE)
  }

  # `on` receives a plain character vector; (new_names) makes `:=` use the
  # vector's values as column names; mget() fetches Y's "i."-prefixed columns.
  X[Y, on = join_cols, (new_names) := mget(src_names)]
}

用法：

# Reproducible demo tables; besides the (c, d) key pair both carry an integer
# id column that can serve as a single-column join key.
set.seed(42)
Xtest <- data.table(
  a = rnorm(20),
  b = rnorm(20),
  c = rep(letters[1:4], each = 5),
  d = rep(1:5, times = 4),
  id = seq_len(20)
)
Ytest <- data.table(
  c = rep(letters[1:2], each = 5),
  d = rep(1:5, times = 2),
  e = rnorm(10),
  f = rnorm(10),
  id = seq_len(10)
)

# Multiple columns are supported for both joining and updating
myjoinfunction(Xtest, Ytest, list(c, d), list(newcol1, newcol2), list(e, f))

# Single columns have to be wrapped in a list
myjoinfunction(Xtest, Ytest, list(id), list(newcol1), list(e))

通过不将参数表示为字符串，这更适合交互使用而不是编程使用。

欢迎提出改进建议！特别是如果有更优雅的方式来在函数参数中表示单列，或者在函数内部处理列表（我不喜欢在处理列表时必须使用[-1]作为一种解决方法，我认为最终会导致错误）。

英文:

Here is my answer based on the comments. The eval and substitute can be applied at different locations within the data.table, instead of wrapping the entire data.table in eval(substitute(DT)). Also, when using substitute with a string instead of an expression, eval is not needed.

# Update join by reference using unquoted column names (for interactive use).
#
# X, Y      data.tables; X is modified in place.
# joinlist  join column(s): a bare name (id), a string ("id"), or a call
#           wrapping several of them, e.g. list(c, d), .(c, d) or c("c", "d").
# newcol    name(s) of the column(s) to create in X, written the same way.
# Ycol      name(s) of the Y column(s) to copy, written the same way and in
#           the same order as newcol.
#
# These three arguments are never evaluated; only their expressions are
# inspected, so the names need not exist as variables. Stops when newcol and
# Ycol name a different number of columns.
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
  # Turn a captured argument expression into a character vector of names.
  # A call contributes one name per argument; a lone symbol or string
  # contributes itself. This replaces as.character(expr)[-1], which silently
  # dropped a single bare name.
  expr_names <- function(expr) {
    parts <- if (is.call(expr)) as.list(expr)[-1L] else list(expr)
    vapply(
      parts,
      function(p) if (is.character(p)) p else paste(deparse(p), collapse = ""),
      character(1L)
    )
  }

  join_cols <- expr_names(substitute(joinlist))
  new_names <- unname(expr_names(substitute(newcol)))
  src_names <- paste0("i.", expr_names(substitute(Ycol)))

  if (length(join_cols) == 0L) {
    stop("`joinlist` must name at least one join column", call. = FALSE)
  }
  if (length(new_names) != length(src_names)) {
    stop("`newcol` and `Ycol` must name the same number of columns",
         call. = FALSE)
  }

  # `on` receives a plain character vector; (new_names) makes `:=` use the
  # vector's values as column names; mget() fetches Y's "i."-prefixed columns.
  X[Y, on = join_cols, (new_names) := mget(src_names)]
}

Usage:

# Reproducible demo tables; besides the (c, d) key pair both carry an integer
# id column that can serve as a single-column join key.
set.seed(42)
Xtest <- data.table(
  a = rnorm(20),
  b = rnorm(20),
  c = rep(letters[1:4], rep(5, 4)),
  d = rep(1:5, 4),
  id = 1:20
)
Ytest <- data.table(
  c = rep(letters[1:2], rep(5, 2)),
  d = rep(1:5, 2),
  e = rnorm(10),
  f = rnorm(10),
  id = 1:10
)

# multiple columns supported both for joining and updating
myjoinfunction(Xtest, Ytest, list(c, d), list(newcol1, newcol2), list(e, f))

# Single columns have to be expressed in a list
myjoinfunction(Xtest, Ytest, list(id), list(newcol1), list(e))

By not having the arguments as strings, this is better for interactive use than programmatic use.

Suggestions for refinements are welcome! Especially if there is a more elegant way to express single columns in the function arguments or handle lists within the function (I don't like having to use [-1] as a workaround within the function when handling lists, I think it will lead to errors eventually.)

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

问题

> Xtest[Ytest, on = .(c), newcol := i.d]

> Xtest

a b c newcol

1: -1.68473343 -0.74498296 1 0.35096663

2: -0.98461614 2.15317525 2 -1.33890396

3: -1.65427602 1.21183896 3 1.49641480

4: -0.65045253 -0.74609860 4 -0.03227097

5: 1.49058508 1.20315276 5 1.41580186

6: -0.31631871 0.68716871 6 -0.03671959

7: 1.35923085 -0.20082238 7 -2.27959124

8: -0.75649545 0.24058212 8 0.93770862

9: 0.22452260 -0.28212892 9 -0.02500419

10: 0.30209786 1.33697797 10 0.67729741

11: 0.88748221 -0.54421418 11 NA

12: 0.47207422 -0.28159382 12 NA

13: -1.17270475 0.83940750 13 NA

14: -2.02787820 -0.03672582 14 NA

15: -0.22187761 0.59137210 15 NA

16: 0.97750330 -0.27030756 16 NA

17: 0.22725940 0.54617488 17 NA

18: 0.94065525 -0.23482152 18 NA

19: 2.12049977 0.69920776 19 NA

20: 0.06192823 0.12262739 20 NA

Xtest[, newcol := NULL]

> myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {

+ eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))

+ }

> myjoinfunction(Xtest, Ytest, list(c), D, d)

Error in eval(jsub, SDenv, parent.frame()) : object 'i.Ycol' not found

8.

eval(jsub, SDenv, parent.frame())

7.

eval(jsub, SDenv, parent.frame())

6.

[.data.table(Xtest, Ytest, on = list(c), :=(D, i.Ycol))

5.

Xtest[Ytest, on = list(c), :=(D, i.Ycol)]

4.

eval(substitute(X[Y, on = joinlist, :=(newcol, i.Ycol)]))

3.

eval(substitute(X[Y, on = joinlist, :=(newcol, i.Ycol)]))

2.

eval(substitute(X[Y, on = joinlist, :=(newcol, i.Ycol)]))

1.

myjoinfunction(Xtest, Ytest, list(c), D, d)

> Xtest[Ytest, on = .(c, d), :=(newcol1 = i.e, newcol2 = i.f) ]

> Xtest

a b c d newcol1 newcol2

1: -2.4939743 -0.200370619 a 1 -1.4934893 -1.0288955

2: 1.0188321 -1.182286508 a 2 1.3811712 0.9747131

3: 0.5217161 -0.152117649 a 3 -0.4168069 0.1218213

4: -0.1584167 0.583640353 a 4 0.4644738 1.7888567

5: -0.4271398 0.020067301 a 5 2.5279998 2.0919953

6: -1.7692909 0.250129040 b 1 -1.5964246 -1.0884861

7: -0.8899915 0.971742055 b 2 0.3011304 1.2629524

8: -0.4490363 -1.540005621 b 3 -0.7992208 -0.5155775

9: -0.5706488 -1.037077614 b 4 1.0058213 1.9787692

10: -0.0922679 1.444487848 b 5 -0.2893311 -0.6095043

11: 0.9924810 -1.144513228 c 1 NA NA

12: 1.2232591 1.503649791 c 2 NA NA

13: 0.8751961 0.892765910 c 3 NA NA

14: 0.9960554 0.499310073 c 4 NA NA

15: -0.6184695 1.867985589 c 5 NA NA

16: 0.6503936 0.422683211 d 1 NA NA

17: -0.6160834 -1.585713893 d 2 NA NA

18: 1.5949931 -0.544704857 d 3 NA NA

19: 0.7232079 -0.006460518 d 4 NA NA

20: -0.2824961 0.119585859 d 5 NA NA

> myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")

Error in .parse_on(substitute(on), isnull_inames) :

'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.

5.

stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")

4.

.parse_on(substitute(on), isnull_inames)

3.

[.data.table(X, Y, on = joinlist, :=(newcol, get(paste0("i.",

Ycol))))

2.

`[.data.table`(Xtest, Ytest, on = list(c), `:=`(D, i.Ycol))

Xtest[Ytest, on = list(c), `:=`(D, i.Ycol)]

eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))

eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))

eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))

> Xtest[Ytest, on = .(c, d), `:=`(newcol1 = i.e, newcol2 = i.f) ]

`[.data.table`(X, Y, on = joinlist, `:=`(newcol, get(paste0("i.",

X[Y, on = joinlist, `:=`(newcol, get(paste0("i.", Ycol)))]