英文:
Use mapply() within attr()
问题
我正在运行下面的函数,但adist()
不是矢量化的,所以我需要使用rowwise()
来运行它。显然,对于大量数据,这样做非常慢。
在我的情况下,我只有current_text
和previous_text
,并且change
是从adist()
生成的,“trafos”
属性被提取。
df <- tibble(current_text = c("A","AB","ABC"),
previous_text = c("","A","AB"),
change = c("II","MI","MMI"))
df <- df %>%
rowwise() %>%
mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))
如何将此作为矢量化函数运行,或者至少作为更快的函数运行?
英文:
I am running the function below, but adist()
is not vectorized, so I need to run it using rowwise()
. Obviously this is very slow with a large amount of data.
In my scenario, I only have current_text
and previous_text
, and change
is generated from adist()
and the "trafos"
attribute is extracted.
df <- tibble(current_text = c("A","AB","ABC"),
previous_text = c("","A","AB"),
change = c("II","MI","MMI"))
df <- df %>%
rowwise() %>%
mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))
How can I run this as a vectorized function, or the very least as a faster function?
答案1
得分: 1
一个可能的解决方案:
df = data.frame(current_text = c("A","AB","ABC"),
previous_text = c("","A","AB"))
df$change = apply(df, 1, function(x) {
attr(adist(x[2], x[1], counts=TRUE), "trafos")
})
英文:
A possible solution:
df = data.frame(current_text = c("A","AB","ABC"),
previous_text = c("","A","AB"))
df$change = apply(df, 1, function(x) {
attr(adist(x[2], x[1], counts=TRUE), "trafos")
})
答案2
得分: 1
使用Vectorize
:
my_function <- function(previous_text,current_text){
attr(adist(previous_text, current_text, counts=TRUE),"trafos")
}
vectorized_function <- Vectorize(my_function)
df <- df %>%
mutate(change_vectorized=vectorized_function(previous_text,current_text))
df
current_text previous_text change change_vectorized
<chr> <chr> <chr> <chr>
1 A "" II II
2 AB A MI MI
3 ABC AB MMI MMI
这是性能测试(注意:独立的adist函数在n=100000时失败):
library(microbenchmark)
library(dplyr)
df = data.frame(current_text = c("A","AB","ABC"),
previous_text = c("","A","AB"))
my_function <- function(previous_text,current_text){
attr(adist(previous_text, current_text, counts=TRUE),"trafos")
}
vectorized_function <- Vectorize(my_function)
rowwise_func<- function(df){
df %>%
rowwise() %>%
mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))
}
vectorized_func<-function(df){
df %>%
mutate(change_vectorized=vectorized_function(previous_text,current_text))
}
apply_func<-function(df){
df$change =apply(df, 1, function(x) {
attr(adist(x[2], x[1], counts=TRUE), "trafos")
})
}
adist_func <- function(df){
df %>%
mutate(change = diag(attr(adist(previous_text, current_text, counts=TRUE),"trafos")))
}
#n=3
microbenchmark(
rowwise_func(df),
vectorized_func(df),
apply_func(df),
adist_func(df)
)
Unit: microseconds
expr min lq mean median uq max neval cld
rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8 100 c
vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95 6113.3 100 b
apply_func(df) 110.4 146.50 232.154 174.70 189.95 4628.8 100 a
adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4 100 b
#n=1000
df_test <- df[sample(1:3,1000,replace=T),]
microbenchmark(
rowwise_func(df_test),
vectorized_func(df_test),
apply_func(df_test),
adist_func(df_test)
)
Unit: milliseconds
expr min lq mean median uq max neval cld
rowwise_func(df_test) 30.0989 31.65590 38.99190 32.62200 41.28515 175.3273 100 b
vectorized_func(df_test) 14.0995 14.73965 19.01849 15.33360 21.55875 48.4703 100 a
apply_func(df_test) 13.9025 14.45740 18.17469 14.89315 18.94695 45.1587 100 a
adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197 100 c
#n=100000
df_test <- df[sample(1:3,100000,replace=T),]
microbenchmark(
rowwise_func(df_test),
vectorized_func(df_test),
apply_func(df_test),
adist_func(df_test)
)
错误信息:
Error in `mutate()`:
ℹ In argument: `change = diag(...)`.
Caused by error:
! cannot allocate vector of size 74.5 Gb
英文:
Using Vectorize
:
my_function <- function(previous_text,current_text){
attr(adist(previous_text, current_text, counts=TRUE),"trafos")
}
vectorized_function <- Vectorize(my_function)
df <- df %>%
mutate(change_vectorized=vectorized_function(previous_text,current_text))
df
current_text previous_text change change_vectorized
<chr> <chr> <chr> <chr>
1 A "" II II
2 AB A MI MI
3 ABC AB MMI MMI
Here is the benchmark (Note: standalone adist fails with n=100000):
library(microbenchmark)
library(dplyr)
df = data.frame(current_text = c("A","AB","ABC"),
previous_text = c("","A","AB"))
my_function <- function(previous_text,current_text){
attr(adist(previous_text, current_text, counts=TRUE),"trafos")
}
vectorized_function <- Vectorize(my_function)
rowwise_func<- function(df){
df %>%
rowwise() %>%
mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))
}
vectorized_func<-function(df){
df %>%
mutate(change_vectorized=vectorized_function(previous_text,current_text))
}
apply_func<-function(df){
df$change =apply(df, 1, function(x) {
attr(adist(x[2], x[1], counts=TRUE), "trafos")
})
}
adist_func <- function(df){
df %>%
mutate(change = diag(attr(adist(previous_text, current_text, counts=TRUE),"trafos")))
}
#n=3
microbenchmark(
rowwise_func(df),
vectorized_func(df),
apply_func(df),
adist_func(df)
)
Unit: microseconds
expr min lq mean median uq max neval cld
rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8 100 c
vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95 6113.3 100 b
apply_func(df) 110.4 146.50 232.154 174.70 189.95 4628.8 100 a
adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4 100 b
#n=1000
df_test <- df[sample(1:3,1000,replace=T),]
microbenchmark(
rowwise_func(df_test),
vectorized_func(df_test),
apply_func(df_test),
adist_func(df_test)
)
Unit: milliseconds
expr min lq mean median uq max neval cld
rowwise_func(df_test) 30.0989 31.65590 38.99190 32.62200 41.28515 175.3273 100 b
vectorized_func(df_test) 14.0995 14.73965 19.01849 15.33360 21.55875 48.4703 100 a
apply_func(df_test) 13.9025 14.45740 18.17469 14.89315 18.94695 45.1587 100 a
adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197 100 c
#n=100000
df_test <- df[sample(1:3,100000,replace=T),]
microbenchmark(
rowwise_func(df_test),
vectorized_func(df_test),
apply_func(df_test),
adist_func(df_test)
)
Unit: seconds
expr min lq mean median uq max neval cld
rowwise_func(df_test) 3.505702 3.991654 4.264263 4.144656 4.541689 5.356368 100 c
vectorized_func(df_test) 1.438014 1.781691 1.981270 1.934037 2.207320 2.605152 100 a
apply_func(df_test) 1.682728 2.027163 2.252507 2.238286 2.486441 3.211242 100 b
Error in `mutate()`:
ℹ In argument: `change = diag(...)`.
Caused by error:
! cannot allocate vector of size 74.5 Gb
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论