2023年5月25日 06:02:26go评论107阅读模式

英文:

Use mapply() within attr()

问题

我正在运行下面的函数，但 adist() 不是矢量化的，所以我需要使用 rowwise() 来运行它。显然，当数据量很大时，这样做非常慢。

在我的场景中，我只有 current_text 和 previous_text；change 列是由 adist() 生成的，取自其 "trafos" 属性。

# Example data: each row pairs the current text with the text that preceded it;
# `change` is the value adist()'s "trafos" attribute is meant to produce.
df <- tibble(current_text = c("A","AB","ABC"),
             previous_text = c("","A","AB"),
             change = c("II","MI","MMI"))
# Row-by-row baseline: rowwise() makes mutate() call adist() once per row and
# keep only the "trafos" attribute of each result.
df <- df %>%
  rowwise() %>%
  mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))

如何将此作为矢量化函数运行，或者至少作为更快的函数运行？

英文:

I am running the function below, but adist() is not vectorized, so I need to run it using rowwise(). Obviously this is very slow with a large amount of data.

In my scenario, I only have current_text and previous_text, and change is generated from adist() and the "trafos" attribute is extracted.

# Example data: each row pairs the current text with the text that preceded it;
# `change` is the value adist()'s "trafos" attribute is meant to produce.
df <- tibble(current_text = c("A", "AB", "ABC"),
             previous_text = c("", "A", "AB"),
             change = c("II", "MI", "MMI"))
# Row-by-row baseline: rowwise() makes mutate() call adist() once per row and
# keep only the "trafos" attribute of each result.
df <- df %>%
  rowwise() %>%
  mutate(change = attr(adist(previous_text, current_text, counts = TRUE), "trafos"))

How can I run this as a vectorized function, or at the very least as a faster function?

答案1

得分: 1

一个可能的解决方案：

# Example data: every row pairs a text with the version that preceded it.
df <- data.frame(current_text = c("A", "AB", "ABC"),
                 previous_text = c("", "A", "AB"))

# apply() turns its input into a character matrix first, so hand it only the
# two text columns and pick them by name: the result then no longer depends on
# the column order or on any other columns the data frame may carry.
df$change <- apply(df[c("current_text", "previous_text")], 1, function(x) {
  # One adist() call per row; keep its "trafos" attribute.
  attr(adist(x[["previous_text"]], x[["current_text"]], counts = TRUE),
       "trafos")
})

英文:

A possible solution:

# Example data: every row pairs a text with the version that preceded it.
df <- data.frame(current_text = c("A", "AB", "ABC"),
                 previous_text = c("", "A", "AB"))

# apply() turns its input into a character matrix first, so hand it only the
# two text columns and pick them by name: the result then no longer depends on
# the column order or on any other columns the data frame may carry.
df$change <- apply(df[c("current_text", "previous_text")], 1, function(x) {
  # One adist() call per row; keep its "trafos" attribute.
  attr(adist(x[["previous_text"]], x[["current_text"]], counts = TRUE),
       "trafos")
})

答案2

得分: 1

使用Vectorize：

# "trafos" attribute of adist() for one previous/current pair of strings.
my_function <- function(previous_text, current_text) {
  attr(adist(previous_text, current_text, counts = TRUE), "trafos")
}

# Element-wise wrapper around my_function(). USE.NAMES = FALSE keeps the
# result from being named after the `previous_text` strings.
vectorized_function <- Vectorize(my_function, USE.NAMES = FALSE)
# Add the per-row result as a new column, then print the data frame.
df <- mutate(
  df,
  change_vectorized = vectorized_function(previous_text, current_text)
)
df
  current_text previous_text change change_vectorized
  <chr>        <chr>         <chr>  <chr>            
1 A            ""            II     II               
2 AB           A             MI     MI               
3 ABC          AB            MMI    MMI

这是性能测试（注意：独立的adist函数在n=100000时失败）：

library(microbenchmark)
library(dplyr)
# Three-row example: each current text and the text it replaced.
df <- data.frame(
  current_text = c("A", "AB", "ABC"),
  previous_text = c("", "A", "AB")
)
# "trafos" attribute of adist() for one previous/current pair of strings.
my_function <- function(previous_text, current_text) {
  attr(adist(previous_text, current_text, counts = TRUE), "trafos")
}

# Element-wise wrapper around my_function(). USE.NAMES = FALSE keeps the
# result from being named after the `previous_text` strings.
vectorized_function <- Vectorize(my_function, USE.NAMES = FALSE)
# Baseline from the question: switch the data frame to row-wise mode so that
# mutate() calls adist() once per row.
rowwise_func <- function(df) {
  by_row <- rowwise(df)
  mutate(
    by_row,
    change = attr(adist(previous_text, current_text, counts = TRUE), "trafos")
  )
}
# Vectorize() approach: adds a `change_vectorized` column computed by
# vectorized_function() from the two text columns.
vectorized_func <- function(df) {
  mutate(
    df,
    change_vectorized = vectorized_function(previous_text, current_text)
  )
}
# apply() approach: one adist() call per row of the two text columns.
#
# Returns `df` with the added `change` column, matching what the other
# benchmark candidates return.
apply_func <- function(df) {
  # Select the columns by name so column order and extra columns do not matter.
  texts <- df[c("current_text", "previous_text")]
  df$change <- apply(texts, 1, function(x) {
    attr(adist(x[["previous_text"]], x[["current_text"]], counts = TRUE),
         "trafos")
  })
  # End on `df`: a trailing assignment would return only the new column.
  df
}
# Single-call approach: adist() receives both whole columns at once and diag()
# then keeps the entries that pair row i of one column with row i of the other.
adist_func <- function(df) {
  mutate(
    df,
    change = diag(
      attr(adist(previous_text, current_text, counts = TRUE), "trafos")
    )
  )
}
# n = 3: time all four approaches on the three-row example data frame.
microbenchmark(
  rowwise_func(df),
  vectorized_func(df),
  apply_func(df),
  adist_func(df)
)
Unit: microseconds
                expr    min      lq     mean  median      uq     max neval cld
    rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8   100   c
 vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95  6113.3   100  b 
      apply_func(df)  110.4  146.50  232.154  174.70  189.95  4628.8   100 a  
      adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4   100  b
# n = 1000: resample the rows of the example data frame with replacement.
df_test <- df[sample(seq_len(nrow(df)), 1000, replace = TRUE), ]
microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)
Unit: milliseconds
                     expr      min        lq      mean    median        uq      max neval cld
    rowwise_func(df_test)  30.0989  31.65590  38.99190  32.62200  41.28515 175.3273   100  b 
 vectorized_func(df_test)  14.0995  14.73965  19.01849  15.33360  21.55875  48.4703   100 a  
      apply_func(df_test)  13.9025  14.45740  18.17469  14.89315  18.94695  45.1587   100 a  
      adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197   100   c
# n = 100000: resample the rows of the example data frame with replacement.
# adist_func() does not complete at this size (see the allocation error below).
df_test <- df[sample(seq_len(nrow(df)), 100000, replace = TRUE), ]
microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)

错误信息：

Error in `mutate()`:
ℹ In argument: `change = diag(...)`.
Caused by error:
! cannot allocate vector of size 74.5 Gb

英文:

Using Vectorize:

# "trafos" attribute of adist() for one previous/current pair of strings.
my_function <- function(previous_text, current_text) {
  attr(adist(previous_text, current_text, counts = TRUE), "trafos")
}

# Element-wise wrapper around my_function(). USE.NAMES = FALSE keeps the
# result from being named after the `previous_text` strings.
vectorized_function <- Vectorize(my_function, USE.NAMES = FALSE)
# Add the per-row result as a new column, then print the data frame.
df <- df %>%
  mutate(change_vectorized = vectorized_function(previous_text, current_text))
df
  current_text previous_text change change_vectorized
  <chr>        <chr>         <chr>  <chr>            
1 A            ""            II     II               
2 AB           A             MI     MI               
3 ABC          AB            MMI    MMI

Here is the benchmark (Note: standalone adist fails with n=100000):

library(microbenchmark)
library(dplyr)
# Three-row example: each current text and the text it replaced.
df <- data.frame(
  current_text = c("A", "AB", "ABC"),
  previous_text = c("", "A", "AB")
)
# "trafos" attribute of adist() for one previous/current pair of strings.
my_function <- function(previous_text, current_text) {
  attr(adist(previous_text, current_text, counts = TRUE), "trafos")
}

# Element-wise wrapper around my_function(). USE.NAMES = FALSE keeps the
# result from being named after the `previous_text` strings.
vectorized_function <- Vectorize(my_function, USE.NAMES = FALSE)
# Baseline from the question: switch the data frame to row-wise mode so that
# mutate() calls adist() once per row.
rowwise_func <- function(df) {
  df %>%
    rowwise() %>%
    mutate(change = attr(adist(previous_text, current_text, counts = TRUE), "trafos"))
}
# Vectorize() approach: adds a `change_vectorized` column computed by
# vectorized_function() from the two text columns.
vectorized_func <- function(df) {
  df %>%
    mutate(change_vectorized = vectorized_function(previous_text, current_text))
}
# apply() approach: one adist() call per row of the two text columns.
#
# Returns `df` with the added `change` column, matching what the other
# benchmark candidates return.
apply_func <- function(df) {
  # Select the columns by name so column order and extra columns do not matter.
  texts <- df[c("current_text", "previous_text")]
  df$change <- apply(texts, 1, function(x) {
    attr(adist(x[["previous_text"]], x[["current_text"]], counts = TRUE),
         "trafos")
  })
  # End on `df`: a trailing assignment would return only the new column.
  df
}
# Single-call approach: adist() receives both whole columns at once and diag()
# then keeps the entries that pair row i of one column with row i of the other.
adist_func <- function(df) {
  df %>%
    mutate(change = diag(attr(adist(previous_text, current_text, counts = TRUE), "trafos")))
}
# n = 3: time all four approaches on the three-row example data frame.
microbenchmark(
  rowwise_func(df),
  vectorized_func(df),
  apply_func(df),
  adist_func(df)
)
Unit: microseconds
expr    min      lq     mean  median      uq     max neval cld
rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8   100   c
vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95  6113.3   100  b 
apply_func(df)  110.4  146.50  232.154  174.70  189.95  4628.8   100 a  
adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4   100  b
# n = 1000: resample the rows of the example data frame with replacement.
df_test <- df[sample(seq_len(nrow(df)), 1000, replace = TRUE), ]
microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)
Unit: milliseconds
expr      min        lq      mean    median        uq      max neval cld
rowwise_func(df_test)  30.0989  31.65590  38.99190  32.62200  41.28515 175.3273   100  b 
vectorized_func(df_test)  14.0995  14.73965  19.01849  15.33360  21.55875  48.4703   100 a  
apply_func(df_test)  13.9025  14.45740  18.17469  14.89315  18.94695  45.1587   100 a  
adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197   100   c
# n = 100000: resample the rows of the example data frame with replacement.
# adist_func() does not complete at this size (see the allocation error below).
df_test <- df[sample(seq_len(nrow(df)), 100000, replace = TRUE), ]
microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)
Unit: seconds
expr      min       lq     mean   median       uq      max neval cld
rowwise_func(df_test) 3.505702 3.991654 4.264263 4.144656 4.541689 5.356368   100   c
vectorized_func(df_test) 1.438014 1.781691 1.981270 1.934037 2.207320 2.605152   100 a  
apply_func(df_test) 1.682728 2.027163 2.252507 2.238286 2.486441 3.211242   100  b 
Error in `mutate()`:
ℹ In argument: `change = diag(...)`.
Caused by error:
! cannot allocate vector of size 74.5 Gb

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

在 attr() 中使用 mapply()

问题

答案1

答案2

Adding legends to 3 plots in ggplot2

将2行合并为一行，添加额外列。

“as.ITime”函数是否有保留百分之一秒的方法？

如何在ggplot中制作基本的R双条形图

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。