英文:
R data.table update join by reference within function
问题
我想在一个函数内对两个表执行更新连接(update join)。以下是一个不使用函数的示例:
library(data.table)
Xtest <- data.table(a = rnorm(20), b = rnorm(20), c = 1:20)
Ytest <- data.table(c = 1:10, d = rnorm(10))
Xtest[Ytest, on = .(c), newcol := i.d]
# > Xtest[Ytest, on = .(c), newcol := i.d]
# > Xtest
# a b c newcol
# 1: -1.68473343 -0.74498296 1 0.35096663
# 2: -0.98461614 2.15317525 2 -1.33890396
# 3: -1.65427602 1.21183896 3 1.49641480
# 4: -0.65045253 -0.74609860 4 -0.03227097
# 5: 1.49058508 1.20315276 5 1.41580186
# 6: -0.31631871 0.68716871 6 -0.03671959
# 7: 1.35923085 -0.20082238 7 -2.27959124
# 8: -0.75649545 0.24058212 8 0.93770862
# 9: 0.22452260 -0.28212892 9 -0.02500419
# 10: 0.30209786 1.33697797 10 0.67729741
# 11: 0.88748221 -0.54421418 11 NA
# 12: 0.47207422 -0.28159382 12 NA
# 13: -1.17270475 0.83940750 13 NA
# 14: -2.02787820 -0.03672582 14 NA
# 15: -0.22187761 0.59137210 15 NA
# 16: 0.97750330 -0.27030756 16 NA
# 17: 0.22725940 0.54617488 17 NA
# 18: 0.94065525 -0.23482152 18 NA
# 19: 2.12049977 0.69920776 19 NA
# 20: 0.06192823 0.12262739 20 NA
# Xtest[, newcol := NULL]
我尝试将上述代码重写为一个函数,但Ycol参数似乎被隐藏了:
# Attempted wrapper around the update join `X[Y, on = ..., newcol := i.d]`.
# substitute() swaps the caller's expressions in for the symbols X, Y,
# joinlist and newcol, but `i.Ycol` is a different symbol from `Ycol`, so it
# is left untouched and the call fails with "object 'i.Ycol' not found".
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))
}
# > myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
# + eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))
# + }
# > myjoinfunction(Xtest, Ytest, list(c), D, d)
# Error in eval(jsub, SDenv, parent.frame()) : object 'i.Ycol' not found
# 8.
# eval(jsub, SDenv, parent.frame())
# 7.
# eval(jsub, SDenv, parent.frame())
# 6.
# `[.data.table`(Xtest, Ytest, on = list(c), `:=`(D, i.Ycol))
# 5.
# Xtest[Ytest, on = list(c), `:=`(D, i.Ycol)]
# 4.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 3.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 2.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 1.
# myjoinfunction(Xtest, Ytest, list(c), D, d)
如何使Ycol参数在函数内可见?
更新的示例:
要更加通用,这里是一个我想要为其编写函数的更新连接的新示例:
Xtest <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4)
)
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10)
)
# > Xtest[Ytest, on = .(c, d), `:=`(newcol1 = i.e, newcol2 = i.f) ]
# > Xtest
# a b c d newcol1 newcol2
# 1: -2.4939743 -0.200370619 a 1 -1.4934893 -1.0288955
# 2: 1.0188321 -1.182286508 a 2 1.3811712 0.9747131
# 3: 0.5217161 -0.152117649 a 3 -0.4168069 0.1218213
# 4: -0.1584167 0.583640353 a 4 0.4644738 1.7888567
# 5: -0.4271398 0.020067301 a 5 2.5279998 2.0919953
# 6: -1.7692909 0.250129040 b 1 -1.5964246 -1.0884861
# 7: -0.8899915 0.971742055 b 2 0.3011304 1.2629524
# 8: -0.4490363 -1.540005621 b 3 -0.7992208 -0.5155775
# 9: -0.5706488 -1.037077614 b 4 1.0058213 1.9787692
# 10: -0.0922679 1.444487848 b 5 -0.2893311 -0.6095043
# 11:
<details>
<summary>英文:</summary>
I want to update join two tables within a function. Here is an example without using a function:
library(data.table)
Xtest <- data.table(a = rnorm(20), b = rnorm(20), c = 1:20)
Ytest <- data.table(c = 1:10, d = rnorm(10))
Xtest[Ytest, on = .(c), newcol := i.d]
> Xtest[Ytest, on = .(c), newcol := i.d]
> Xtest
a b c newcol
1: -1.68473343 -0.74498296 1 0.35096663
2: -0.98461614 2.15317525 2 -1.33890396
3: -1.65427602 1.21183896 3 1.49641480
4: -0.65045253 -0.74609860 4 -0.03227097
5: 1.49058508 1.20315276 5 1.41580186
6: -0.31631871 0.68716871 6 -0.03671959
7: 1.35923085 -0.20082238 7 -2.27959124
8: -0.75649545 0.24058212 8 0.93770862
9: 0.22452260 -0.28212892 9 -0.02500419
10: 0.30209786 1.33697797 10 0.67729741
11: 0.88748221 -0.54421418 11 NA
12: 0.47207422 -0.28159382 12 NA
13: -1.17270475 0.83940750 13 NA
14: -2.02787820 -0.03672582 14 NA
15: -0.22187761 0.59137210 15 NA
16: 0.97750330 -0.27030756 16 NA
17: 0.22725940 0.54617488 17 NA
18: 0.94065525 -0.23482152 18 NA
19: 2.12049977 0.69920776 19 NA
20: 0.06192823 0.12262739 20 NA
Xtest[, newcol := NULL]
I tried to recast the above code into a function, but the Ycol argument seems to be hidden:
# Attempted wrapper around the update join `X[Y, on = ..., newcol := i.d]`.
# substitute() swaps the caller's expressions in for the symbols X, Y,
# joinlist and newcol, but `i.Ycol` is a different symbol from `Ycol`, so it
# is left untouched and the call fails with "object 'i.Ycol' not found".
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))
}
> myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
+ eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))
+ }
> myjoinfunction(Xtest, Ytest, list(c), D, d)
Error in eval(jsub, SDenv, parent.frame()) : object 'i.Ycol' not found
8.
eval(jsub, SDenv, parent.frame())
7.
eval(jsub, SDenv, parent.frame())
6.
`[.data.table`(Xtest, Ytest, on = list(c), `:=`(D, i.Ycol))
5.
Xtest[Ytest, on = list(c), `:=`(D, i.Ycol)]
4.
eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
3.
eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
2.
eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
1.
myjoinfunction(Xtest, Ytest, list(c), D, d)
How can I make the Ycol argument visible within the function?
**UPDATED EXAMPLE**
To be more general, here is a new example of an update join I would like to write a function for:
Xtest <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4)
)
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10)
)
> Xtest[Ytest, on = .(c, d), `:=`(newcol1 = i.e, newcol2 = i.f) ]
> Xtest
a b c d newcol1 newcol2
1: -2.4939743 -0.200370619 a 1 -1.4934893 -1.0288955
2: 1.0188321 -1.182286508 a 2 1.3811712 0.9747131
3: 0.5217161 -0.152117649 a 3 -0.4168069 0.1218213
4: -0.1584167 0.583640353 a 4 0.4644738 1.7888567
5: -0.4271398 0.020067301 a 5 2.5279998 2.0919953
6: -1.7692909 0.250129040 b 1 -1.5964246 -1.0884861
7: -0.8899915 0.971742055 b 2 0.3011304 1.2629524
8: -0.4490363 -1.540005621 b 3 -0.7992208 -0.5155775
9: -0.5706488 -1.037077614 b 4 1.0058213 1.9787692
10: -0.0922679 1.444487848 b 5 -0.2893311 -0.6095043
11: 0.9924810 -1.144513228 c 1 NA NA
12: 1.2232591 1.503649791 c 2 NA NA
13: 0.8751961 0.892765910 c 3 NA NA
14: 0.9960554 0.499310073 c 4 NA NA
15: -0.6184695 1.867985589 c 5 NA NA
16: 0.6503936 0.422683211 d 1 NA NA
17: -0.6160834 -1.585713893 d 2 NA NA
18: 1.5949931 -0.544704857 d 3 NA NA
19: 0.7232079 -0.006460518 d 4 NA NA
20: -0.2824961 0.119585859 d 5 NA NA
Using a suggestion from David Arenburg in the comments, the function below trips up when `joinlist` is a list of strings:
# Variant suggested in the comments: fetch the `i.`-prefixed column of Y by
# name with get(). `joinlist` is handed to `on` unchanged; the call shown
# below, which passes list("c", "d"), is rejected by data.table's `on` parsing.
# NOTE(review): `newcol` on the left of `:=` is a bare symbol, so this
# presumably creates a column literally named "newcol" rather than using the
# argument's value; confirm against the data.table documentation.
myjoinfunction1 <- function(X, Y, joinlist, newcol, Ycol) X[Y, on = joinlist, newcol:= get(paste0("i.", Ycol))]
> myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")
Error in .parse_on(substitute(on), isnull_inames) :
'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.
5.
stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
4.
.parse_on(substitute(on), isnull_inames)
3.
`[.data.table`(X, Y, on = joinlist, `:=`(newcol, get(paste0("i.", Ycol))))
2.
X[Y, on = joinlist, `:=`(newcol, get(paste0("i.", Ycol)))]
1.
myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")
</details>
# 答案1
**得分**: 2
对于更通用的第二个示例:
#' Update-join several columns of `Y` into `X` by reference.
#'
#' @param X A data.table that receives the new columns.
#' @param Y A data.table that supplies the values.
#' @param joinlist Join columns, passed straight to `on`
#'   (e.g. `c("c", "d")`).
#' @param cols Named character vector: the names are the columns to create
#'   in `X`, the values are the columns of `Y` to copy
#'   (e.g. `c(newcol1 = "e", newcol2 = "f")`).
#' @return The value of the `:=` call on `X`.
f <- function(X, Y, joinlist, cols) {
  # Names of the columns to create; wrapped in parentheses below so that `:=`
  # uses the vector's contents rather than the symbol itself.
  target_names <- names(cols)
  X[
    Y,
    on = joinlist,
    # mget() fetches the `i.`-prefixed (i.e. Y-side) columns by name.
    (target_names) := mget(sprintf("i.%s", cols))
  ]
}
用法:
set.seed(1)
Xtest2 <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4)
)
Ytest2 <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10)
)
f(Xtest2, Ytest2, c("c", "d"), c(newcol1 = "e", newcol2 = "f"))
附注:在向data.table中添加列时有一些要注意的特殊情况(参考超出“列槽”的情况;向从磁盘加载的表中添加列)。
英文:
For the more general second example:
#' Update-join several columns of `Y` into `X` by reference.
#'
#' @param X A data.table that receives the new columns.
#' @param Y A data.table that supplies the values.
#' @param joinlist Join columns, passed straight to `on`
#'   (e.g. `c("c", "d")`).
#' @param cols Named character vector: the names are the columns to create
#'   in `X`, the values are the columns of `Y` to copy
#'   (e.g. `c(newcol1 = "e", newcol2 = "f")`).
#' @return The value of the `:=` call on `X`.
f <- function(X, Y, joinlist, cols) {
  # Names of the columns to create; wrapped in parentheses below so that `:=`
  # uses the vector's contents rather than the symbol itself.
  target_names <- names(cols)
  X[
    Y,
    on = joinlist,
    # mget() fetches the `i.`-prefixed (i.e. Y-side) columns by name.
    (target_names) := mget(sprintf("i.%s", cols))
  ]
}
Usage:
set.seed(1)
Xtest2 <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4)
)
Ytest2 <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10)
)
f(Xtest2, Ytest2, c("c", "d"), c(newcol1 = "e", newcol2 = "f"))
Side note: There are some edge cases to watch out for when adding columns in a data.table (running out of "column slots"; adding to a table loaded from disk)
答案2
得分: 1
这是基于评论的回答。在 data.table 中,`eval` 和 `substitute` 可以应用于 data.table 调用的不同位置,而不是将整个调用包装在 `eval(substitute(DT))` 中。此外,当使用字符串而不是表达式作为 `substitute` 的参数时,不需要使用 `eval`。
# Update-join wrapper that takes unquoted column names wrapped in list(),
# e.g. myjoinfunction(Xtest, Ytest, list(c, d), list(newcol1, newcol2), list(e, f)).
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
# as.character() on the captured `list(e, f)` call gives c("list", "e", "f");
# [-1] drops the leading "list", and "i." marks the columns as coming from Y.
Ycol <- paste0("i.", as.character(substitute(Ycol))[-1])
X[
Y,
# The unevaluated `list(...)` call is handed to `on` via eval(substitute()).
on = eval(substitute(joinlist)),
# Left-hand side: the new column names as strings; mget() fetches the
# `i.`-prefixed columns by name.
as.character(substitute(newcol))[-1] := mget(Ycol)
]
}
用法:
set.seed(42)
Xtest <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4),
id = 1:20
)
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10),
id = 1:10
)
# 支持多列用于连接和更新
myjoinfunction(Xtest, Ytest, list(c, d), list(newcol1, newcol2), list(e, f));
# 单列必须以列表形式表示
myjoinfunction(Xtest, Ytest, list(id), list(newcol1), list(e));
通过不将参数表示为字符串,这更适合交互使用而不是编程使用。
欢迎提出改进建议!特别是如果有更优雅的方式来在函数参数中表示单列,或者在函数内部处理列表(我不喜欢在处理列表时必须使用 `[-1]` 作为一种解决方法,我认为这最终会导致错误)。
英文:
Here is my answer based on the comments. The `eval` and `substitute` can be applied at different locations within the data.table call, instead of wrapping the entire data.table call in `eval(substitute(DT))`. Also, when using `substitute` with a string instead of an expression, `eval` is not needed.
#' Update-join columns of `Y` into `X` by reference, using unquoted names.
#'
#' @param X A data.table that receives the new columns.
#' @param Y A data.table that supplies the values.
#' @param joinlist Unquoted join columns wrapped in `list()`,
#'   e.g. `list(c, d)`.
#' @param newcol Unquoted names of the columns to create in `X`, wrapped in
#'   `list()`, e.g. `list(newcol1, newcol2)`.
#' @param Ycol Unquoted names of the columns of `Y` to copy, wrapped in
#'   `list()`, e.g. `list(e, f)`.
#' @return The value of the `:=` call on `X`.
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
  # as.character() on the captured `list(e, f)` call gives
  # c("list", "e", "f"); [-1] drops the leading "list", and the "i." prefix
  # marks the columns as coming from Y.
  Ycol <- paste0("i.", as.character(substitute(Ycol))[-1])
  X[
    Y,
    # The unevaluated `list(...)` call is handed to `on` via eval(substitute()).
    on = eval(substitute(joinlist)),
    # Left-hand side: the new column names as strings; mget() fetches the
    # `i.`-prefixed columns by name.
    as.character(substitute(newcol))[-1] := mget(Ycol)
  ]
}
Usage:
set.seed(42)
Xtest <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4),
id = 1:20
)
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10),
id = 1:10
)
# multiple columns supported both for joining and updating
myjoinfunction(Xtest, Ytest, list(c, d), list(newcol1, newcol2), list(e, f));
# Single columns have to be expressed in a list
myjoinfunction(Xtest, Ytest, list(id), list(newcol1), list(e));
By not having the arguments as strings, this is better for interactive use than programmatic use.
Suggestions for refinements are welcome! Especially if there is a more elegant way to express single columns in the function arguments or handle lists within the function (I don't like having to use `[-1]` as a workaround within the function when handling lists; I think it will lead to errors eventually.)
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论