在 attr() 中使用 mapply()

huangapple go评论77阅读模式
英文:

Use mapply() within attr()

问题

我正在运行下面的函数,但adist()不是矢量化的,所以我需要使用rowwise()来运行它。显然,对于大量数据,这样做非常慢。

在我的情况下,我只有 current_text 和 previous_text,而 change 是由 adist() 生成、并从其结果中提取 "trafos" 属性得到的。

# Example input: each row pairs a text with the version that preceded it;
# `change` is the column the question wants to (re)compute.
df <- tibble(
  current_text = c("A", "AB", "ABC"),
  previous_text = c("", "A", "AB"),
  change = c("II", "MI", "MMI")
)

# Row-by-row computation: for every row, call adist() on that row's pair and
# keep the "trafos" attribute of the result.
df <- mutate(
  rowwise(df),
  change = attr(adist(previous_text, current_text, counts = TRUE), "trafos")
)

如何将此作为矢量化函数运行,或者至少作为更快的函数运行?

英文:

I am running the function below, but adist() is not vectorized, so I need to run it using rowwise(). Obviously this is very slow with a large amount of data.

In my scenario, I only have current_text and previous_text; change is generated by adist(), from whose result the "trafos" attribute is extracted.

# Example input: each row pairs a text with the version that preceded it;
# `change` is the column the question wants to (re)compute.
df <- tibble(current_text = c("A", "AB", "ABC"),
             previous_text = c("", "A", "AB"),
             change = c("II", "MI", "MMI"))

# Row-by-row computation: for every row, call adist() on that row's pair and
# keep the "trafos" attribute of the result.
df <- df %>%
  rowwise() %>%
  mutate(change = attr(adist(previous_text, current_text, counts = TRUE), "trafos"))

How can I run this as a vectorized function, or at the very least as a faster function?

答案1

得分: 1

一个可能的解决方案:

# Example input with exactly two columns: current_text (1st), previous_text (2nd).
df <- data.frame(
  current_text = c("A", "AB", "ABC"),
  previous_text = c("", "A", "AB")
)

# Walk over the rows with apply(); inside the callback the row is a character
# vector, so row[2] is previous_text and row[1] is current_text.
df$change <- apply(df, 1, function(row) {
  distances <- adist(row[2], row[1], counts = TRUE)
  attr(distances, "trafos")
})
英文:

A possible solution:

# Example input with exactly two columns: current_text (1st), previous_text (2nd).
df <- data.frame(current_text = c("A", "AB", "ABC"),
                 previous_text = c("", "A", "AB"))

# Walk over the rows with apply(); inside the callback the row is a character
# vector, so x[2] is previous_text and x[1] is current_text.
df$change <- apply(df, 1, function(x) {
  attr(adist(x[2], x[1], counts = TRUE), "trafos")
})

答案2

得分: 1

使用Vectorize

# Edit transcript(s) between `previous_text` and `current_text`: the "trafos"
# attribute of the adist() result computed with counts = TRUE.
my_function <- function(previous_text, current_text) {
  distances <- adist(previous_text, current_text, counts = TRUE)
  attr(distances, "trafos")
}

# Element-wise wrapper around my_function(), built with Vectorize().
vectorized_function <- Vectorize(my_function)

# Add the transcript column without rowwise(): the wrapper receives the two
# whole columns and is expected to return one value per row.
df <- mutate(
  df,
  change_vectorized = vectorized_function(previous_text, current_text)
)

df
  current_text previous_text change change_vectorized
  <chr>        <chr>         <chr>  <chr>            
1 A            ""            II     II               
2 AB           A             MI     MI               
3 ABC          AB            MMI    MMI  

这是性能测试(注意:独立的adist函数在n=100000时失败):

library(microbenchmark)
library(dplyr)

# Three-row example input passed to every benchmarked approach.
df <- data.frame(
  current_text = c("A", "AB", "ABC"),
  previous_text = c("", "A", "AB")
)

# Edit transcript(s) between `previous_text` and `current_text`: the "trafos"
# attribute of the adist() result computed with counts = TRUE.
my_function <- function(previous_text, current_text) {
  distances <- adist(previous_text, current_text, counts = TRUE)
  attr(distances, "trafos")
}

# Element-wise wrapper around my_function(), built with Vectorize().
vectorized_function <- Vectorize(my_function)

# Baseline approach: evaluate adist() once per row via rowwise().
# Returns the result of mutate() on the row-wise data (no ungroup() is
# applied), with `change` holding the "trafos" attribute for each
# previous_text/current_text pair.
rowwise_func <- function(df) {
  by_row <- rowwise(df)
  mutate(
    by_row,
    change = attr(adist(previous_text, current_text, counts = TRUE), "trafos")
  )
}

# Vectorize()-based approach: passes both text columns to
# vectorized_function() and stores its result in `change_vectorized`.
# Returns the mutated data frame.
vectorized_func <- function(df) {
  mutate(
    df,
    change_vectorized = vectorized_function(previous_text, current_text)
  )
}

# Row-by-row approach built on base apply().
#
# Args:
#   df: data frame with `previous_text` and `current_text` columns.
# Returns:
#   `df` with an added `change` column: for each row, the "trafos" attribute
#   of adist(previous_text, current_text, counts = TRUE).
apply_func <- function(df) {
  # Select the two text columns by name, in a fixed order, so the positional
  # indices inside the callback do not depend on the column order of `df`.
  text_pairs <- df[, c("previous_text", "current_text"), drop = FALSE]
  transcripts <- apply(text_pairs, 1, function(x) {
    attr(adist(x[1], x[2], counts = TRUE), "trafos")
  })
  # apply() may label its result with the row names; drop them so `change`
  # is a plain character vector.
  df$change <- unname(transcripts)
  # Return the data frame itself: ending on the assignment would return only
  # the assigned vector (invisibly), unlike the other benchmarked functions.
  df
}

# "Standalone adist" approach: hands both whole columns to a single adist()
# call and keeps only the diagonal of the resulting "trafos" matrix.
# NOTE: in the n = 100000 run on this page this function stopped with
# "cannot allocate vector of size 74.5 Gb".
adist_func <- function(df) {
  mutate(
    df,
    change = diag(
      attr(adist(previous_text, current_text, counts = TRUE), "trafos")
    )
  )
}

# n = 3: time the four approaches on the original three-row `df`.

microbenchmark(
  rowwise_func(df),
  vectorized_func(df),
  apply_func(df),
  adist_func(df)
)

Unit: microseconds
                expr    min      lq     mean  median      uq     max neval cld
    rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8   100   c
 vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95  6113.3   100  b 
      apply_func(df)  110.4  146.50  232.154  174.70  189.95  4628.8   100 a  
      adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4   100  b

# n = 1000: build a larger input by resampling the rows of `df` with
# replacement, then time the four approaches on it.
# TRUE is spelled out (T can be reassigned) and the row indices are derived
# from `df` rather than hard-coded as 1:3.
df_test <- df[sample(seq_len(nrow(df)), 1000, replace = TRUE), ]

microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)

Unit: milliseconds
                     expr      min        lq      mean    median        uq      max neval cld
    rowwise_func(df_test)  30.0989  31.65590  38.99190  32.62200  41.28515 175.3273   100  b 
 vectorized_func(df_test)  14.0995  14.73965  19.01849  15.33360  21.55875  48.4703   100 a  
      apply_func(df_test)  13.9025  14.45740  18.17469  14.89315  18.94695  45.1587   100 a  
      adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197   100   c

# n = 100000: same resampling as before, at a size where adist_func() is
# reported on this page to fail with a memory-allocation error.
# TRUE is spelled out (T can be reassigned) and the row indices are derived
# from `df` rather than hard-coded as 1:3.
df_test <- df[sample(seq_len(nrow(df)), 100000, replace = TRUE), ]

microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)

错误信息:

Error in `mutate()`:
ℹ In argument: `change = diag(...)`.
Caused by error:
! cannot allocate vector of size 74.5 Gb
英文:

Using Vectorize:

# Edit transcript(s) between `previous_text` and `current_text`: the "trafos"
# attribute of the adist() result computed with counts = TRUE.
my_function <- function(previous_text, current_text) {
  attr(adist(previous_text, current_text, counts = TRUE), "trafos")
}

# Element-wise wrapper around my_function(), built with Vectorize().
vectorized_function <- Vectorize(my_function)

# Add the transcript column without rowwise().
df <- df %>%
  mutate(change_vectorized = vectorized_function(previous_text, current_text))

df
  current_text previous_text change change_vectorized
  <chr>        <chr>         <chr>  <chr>            
1 A            ""            II     II               
2 AB           A             MI     MI               
3 ABC          AB            MMI    MMI  

Here is the benchmark (Note: standalone adist fails with n=100000):

library(microbenchmark)
library(dplyr)
# Three-row example input passed to every benchmarked approach.
df <- data.frame(current_text = c("A", "AB", "ABC"),
                 previous_text = c("", "A", "AB"))

# Edit transcript(s) between `previous_text` and `current_text`: the "trafos"
# attribute of the adist() result computed with counts = TRUE.
my_function <- function(previous_text, current_text) {
  attr(adist(previous_text, current_text, counts = TRUE), "trafos")
}

# Element-wise wrapper around my_function(), built with Vectorize().
vectorized_function <- Vectorize(my_function)
# Baseline approach: evaluate adist() once per row via rowwise().
# Returns the result of mutate() on the row-wise data (no ungroup() is
# applied), with `change` holding the "trafos" attribute for each
# previous_text/current_text pair.
rowwise_func <- function(df) {
  df %>%
    rowwise() %>%
    mutate(change = attr(adist(previous_text, current_text, counts = TRUE), "trafos"))
}
# Vectorize()-based approach: passes both text columns to
# vectorized_function() and stores its result in `change_vectorized`.
# Returns the mutated data frame.
vectorized_func <- function(df) {
  df %>%
    mutate(change_vectorized = vectorized_function(previous_text, current_text))
}
# Row-by-row approach built on base apply().
#
# Args:
#   df: data frame with `previous_text` and `current_text` columns.
# Returns:
#   `df` with an added `change` column: for each row, the "trafos" attribute
#   of adist(previous_text, current_text, counts = TRUE).
apply_func <- function(df) {
  # Select the two text columns by name, in a fixed order, so the positional
  # indices inside the callback do not depend on the column order of `df`.
  text_pairs <- df[, c("previous_text", "current_text"), drop = FALSE]
  transcripts <- apply(text_pairs, 1, function(x) {
    attr(adist(x[1], x[2], counts = TRUE), "trafos")
  })
  # apply() may label its result with the row names; drop them so `change`
  # is a plain character vector.
  df$change <- unname(transcripts)
  # Return the data frame itself: ending on the assignment would return only
  # the assigned vector (invisibly), unlike the other benchmarked functions.
  df
}
# "Standalone adist" approach: hands both whole columns to a single adist()
# call and keeps only the diagonal of the resulting "trafos" matrix.
# NOTE: in the n = 100000 run on this page this function stopped with
# "cannot allocate vector of size 74.5 Gb".
adist_func <- function(df) {
  df %>%
    mutate(change = diag(attr(adist(previous_text, current_text, counts = TRUE), "trafos")))
}
# n = 3: time the four approaches on the original three-row `df`.
microbenchmark(
  rowwise_func(df),
  vectorized_func(df),
  apply_func(df),
  adist_func(df)
)
Unit: microseconds
expr    min      lq     mean  median      uq     max neval cld
rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8   100   c
vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95  6113.3   100  b 
apply_func(df)  110.4  146.50  232.154  174.70  189.95  4628.8   100 a  
adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4   100  b
# n = 1000: build a larger input by resampling the rows of `df` with
# replacement, then time the four approaches on it.
# TRUE is spelled out (T can be reassigned) and the row indices are derived
# from `df` rather than hard-coded as 1:3.
df_test <- df[sample(seq_len(nrow(df)), 1000, replace = TRUE), ]
microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)
Unit: milliseconds
expr      min        lq      mean    median        uq      max neval cld
rowwise_func(df_test)  30.0989  31.65590  38.99190  32.62200  41.28515 175.3273   100  b 
vectorized_func(df_test)  14.0995  14.73965  19.01849  15.33360  21.55875  48.4703   100 a  
apply_func(df_test)  13.9025  14.45740  18.17469  14.89315  18.94695  45.1587   100 a  
adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197   100   c
# n = 100000: same resampling as before, at a size where adist_func() is
# reported on this page to fail with a memory-allocation error.
# TRUE is spelled out (T can be reassigned) and the row indices are derived
# from `df` rather than hard-coded as 1:3.
df_test <- df[sample(seq_len(nrow(df)), 100000, replace = TRUE), ]
microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)
Unit: seconds
expr      min       lq     mean   median       uq      max neval cld
rowwise_func(df_test) 3.505702 3.991654 4.264263 4.144656 4.541689 5.356368   100   c
vectorized_func(df_test) 1.438014 1.781691 1.981270 1.934037 2.207320 2.605152   100 a  
apply_func(df_test) 1.682728 2.027163 2.252507 2.238286 2.486441 3.211242   100  b 
Error in `mutate()`:
ℹ In argument: `change = diag(...)`.
Caused by error:
! cannot allocate vector of size 74.5 Gb

huangapple
  • 本文由 发表于 2023年5月25日 06:02:26
  • 转载请务必保留本文链接:https://go.coder-hub.com/76327678.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定