在 attr() 中使用 mapply()

huangapple go评论107阅读模式
英文:

Use mapply() within attr()

问题

我正在运行下面的函数,但adist()不是矢量化的,所以我需要使用rowwise()来运行它。显然,对于大量数据,这样做非常慢。

在我的情况下,我只有current_text和previous_text,而change是通过调用adist()并提取其“trafos”属性得到的。

  1. df <- tibble(current_text = c("A","AB","ABC"),
  2. previous_text = c("","A","AB"),
  3. change = c("II","MI","MMI"))
  4. df <- df %>%
  5. rowwise() %>%
  6. mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))

如何将此作为矢量化函数运行,或者至少作为更快的函数运行?

英文:

I am running the function below, but adist() is not vectorized, so I need to run it using rowwise(). Obviously this is very slow with a large amount of data.

In my scenario, I only have current_text and previous_text; change is generated by calling adist() and extracting its "trafos" attribute.

  1. df <- tibble(current_text = c("A","AB","ABC"),
  2. previous_text = c("","A","AB"),
  3. change = c("II","MI","MMI"))
  4. df <- df %>%
  5. rowwise() %>%
  6. mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))

How can I run this as a vectorized function, or at the very least as a faster function?

答案1

得分: 1

一个可能的解决方案:

  1. df = data.frame(current_text = c("A","AB","ABC"),
  2. previous_text = c("","A","AB"))
  3. df$change = apply(df, 1, function(x) {
  4. attr(adist(x[2], x[1], counts=TRUE), "trafos")
  5. })
英文:

A possible solution:

  1. df = data.frame(current_text = c("A","AB","ABC"),
  2. previous_text = c("","A","AB"))
  3. df$change = apply(df, 1, function(x) {
  4. attr(adist(x[2], x[1], counts=TRUE), "trafos")
  5. })

答案2

得分: 1

使用Vectorize

  1. my_function <- function(previous_text,current_text){
  2. attr(adist(previous_text, current_text, counts=TRUE),"trafos")
  3. }
  4. vectorized_function <- Vectorize(my_function)
  5. df <- df %>%
  6. mutate(change_vectorized=vectorized_function(previous_text,current_text))
  7. df
  8. current_text previous_text change change_vectorized
  9. <chr> <chr> <chr> <chr>
  10. 1 A "" II II
  11. 2 AB A MI MI
  12. 3 ABC AB MMI MMI

这是性能测试(注意:独立的adist函数在n=100000时失败):

  1. library(microbenchmark)
  2. library(dplyr)
  3. df = data.frame(current_text = c("A","AB","ABC"),
  4. previous_text = c("","A","AB"))
  5. my_function <- function(previous_text,current_text){
  6. attr(adist(previous_text, current_text, counts=TRUE),"trafos")
  7. }
  8. vectorized_function <- Vectorize(my_function)
  9. rowwise_func<- function(df){
  10. df %>%
  11. rowwise() %>%
  12. mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))
  13. }
  14. vectorized_func<-function(df){
  15. df %>%
  16. mutate(change_vectorized=vectorized_function(previous_text,current_text))
  17. }
  18. apply_func<-function(df){
  19. df$change =apply(df, 1, function(x) {
  20. attr(adist(x[2], x[1], counts=TRUE), "trafos")
  21. })
  22. }
  23. adist_func <- function(df){
  24. df %>%
  25. mutate(change = diag(attr(adist(previous_text, current_text, counts=TRUE),"trafos")))
  26. }
  27. #n=3
  28. microbenchmark(
  29. rowwise_func(df),
  30. vectorized_func(df),
  31. apply_func(df),
  32. adist_func(df)
  33. )
  34. Unit: microseconds
  35. expr min lq mean median uq max neval cld
  36. rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8 100 c
  37. vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95 6113.3 100 b
  38. apply_func(df) 110.4 146.50 232.154 174.70 189.95 4628.8 100 a
  39. adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4 100 b
  40. #n=1000
  41. df_test <- df[sample(1:3,1000,replace=T),]
  42. microbenchmark(
  43. rowwise_func(df_test),
  44. vectorized_func(df_test),
  45. apply_func(df_test),
  46. adist_func(df_test)
  47. )
  48. Unit: milliseconds
  49. expr min lq mean median uq max neval cld
  50. rowwise_func(df_test) 30.0989 31.65590 38.99190 32.62200 41.28515 175.3273 100 b
  51. vectorized_func(df_test) 14.0995 14.73965 19.01849 15.33360 21.55875 48.4703 100 a
  52. apply_func(df_test) 13.9025 14.45740 18.17469 14.89315 18.94695 45.1587 100 a
  53. adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197 100 c
  54. #n=100000
  55. df_test <- df[sample(1:3,100000,replace=T),]
  56. microbenchmark(
  57. rowwise_func(df_test),
  58. vectorized_func(df_test),
  59. apply_func(df_test),
  60. adist_func(df_test)
  61. )

错误信息:

  1. Error in `mutate()`:
  2. In argument: `change = diag(...)`.
  3. Caused by error:
  4. ! cannot allocate vector of size 74.5 Gb
英文:

Using Vectorize:

  1. my_function <- function(previous_text,current_text){
  2. attr(adist(previous_text, current_text, counts=TRUE),"trafos")
  3. }
  4. vectorized_function <- Vectorize(my_function)
  5. df <- df %>%
  6. mutate(change_vectorized=vectorized_function(previous_text,current_text))
  7. df
  8. current_text previous_text change change_vectorized
  9. <chr> <chr> <chr> <chr>
  10. 1 A "" II II
  11. 2 AB A MI MI
  12. 3 ABC AB MMI MMI

Here is the benchmark (Note: standalone adist fails with n=100000):

  1. library(microbenchmark)
  2. library(dplyr)
  3. df = data.frame(current_text = c("A","AB","ABC"),
  4. previous_text = c("","A","AB"))
  5. my_function <- function(previous_text,current_text){
  6. attr(adist(previous_text, current_text, counts=TRUE),"trafos")
  7. }
  8. vectorized_function <- Vectorize(my_function)
  9. rowwise_func<- function(df){
  10. df %>%
  11. rowwise() %>%
  12. mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))
  13. }
  14. vectorized_func<-function(df){
  15. df %>%
  16. mutate(change_vectorized=vectorized_function(previous_text,current_text))
  17. }
  18. apply_func<-function(df){
  19. df$change =apply(df, 1, function(x) {
  20. attr(adist(x[2], x[1], counts=TRUE), "trafos")
  21. })
  22. }
  23. adist_func <- function(df){
  24. df %>%
  25. mutate(change = diag(attr(adist(previous_text, current_text, counts=TRUE),"trafos")))
  26. }
  27. #n=3
  28. microbenchmark(
  29. rowwise_func(df),
  30. vectorized_func(df),
  31. apply_func(df),
  32. adist_func(df)
  33. )
  34. Unit: microseconds
  35. expr min lq mean median uq max neval cld
  36. rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8 100 c
  37. vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95 6113.3 100 b
  38. apply_func(df) 110.4 146.50 232.154 174.70 189.95 4628.8 100 a
  39. adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4 100 b
  40. #n=1000
  41. df_test <- df[sample(1:3,1000,replace=T),]
  42. microbenchmark(
  43. rowwise_func(df_test),
  44. vectorized_func(df_test),
  45. apply_func(df_test),
  46. adist_func(df_test)
  47. )
  48. Unit: milliseconds
  49. expr min lq mean median uq max neval cld
  50. rowwise_func(df_test) 30.0989 31.65590 38.99190 32.62200 41.28515 175.3273 100 b
  51. vectorized_func(df_test) 14.0995 14.73965 19.01849 15.33360 21.55875 48.4703 100 a
  52. apply_func(df_test) 13.9025 14.45740 18.17469 14.89315 18.94695 45.1587 100 a
  53. adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197 100 c
  54. #n=100000
  55. df_test <- df[sample(1:3,100000,replace=T),]
  56. microbenchmark(
  57. rowwise_func(df_test),
  58. vectorized_func(df_test),
  59. apply_func(df_test),
  60. adist_func(df_test)
  61. )
  62. Unit: seconds
  63. expr min lq mean median uq max neval cld
  64. rowwise_func(df_test) 3.505702 3.991654 4.264263 4.144656 4.541689 5.356368 100 c
  65. vectorized_func(df_test) 1.438014 1.781691 1.981270 1.934037 2.207320 2.605152 100 a
  66. apply_func(df_test) 1.682728 2.027163 2.252507 2.238286 2.486441 3.211242 100 b
  67. Error in `mutate()`:
  68. In argument: `change = diag(...)`.
  69. Caused by error:
  70. ! cannot allocate vector of size 74.5 Gb

huangapple
  • 本文由 发表于 2023年5月25日 06:02:26
  • 转载请务必保留本文链接:https://go.coder-hub.com/76327678.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定