英文:
Calculating the actual difference and percentage difference for multiple paired variables simultaneously
问题
以下是您要翻译的内容:
我有以下示例数据框,并希望一次计算多个配对变量(“10”和“20”对应于测试年份)的实际和百分比差异:
样本数据:
Group| A_10  |    A_20 |  B_10 |  B_20 
0       20          21        20        23
1       30          10        19        11
2       10          53        30        34
1       22          32        25        20
2       34          40        32        30
0       30          50        NA        40
0       39          40        19        20
1       40          NA        20        20
2       50          10        20        10
0       34          23        30        10
这是当前的工作代码:
library(dplyr)
# 假设数据框命名为'df'并具有以下结构:
# 'var1_1','var1_2',...代表第一组变量
# 'var2_1','var2_2',...代表第二组变量
# 定义要计算差异的变量对
variable_pairs <- list(
  c("A_10", "A_20"),
  c("B_10", "B_20")) # 我还有另外20个配对变量    
# 计算每个变量对的实际差异和百分比差异
df6 <- df %>%
  mutate(
    across(
      all_of(unlist(variable_pairs)),
      ~ .x - get(variable_pairs[[cur_column()]][2]),
      .names = "{.col}_actual_diff"
    ),
    across(
      all_of(unlist(variable_pairs)),
      ~ (.x - get(variable_pairs[[cur_column()]][2])) / get(variable_pairs[[cur_column()]][2]) * 100,
      .names = "{.col}_percentage_diff"
    )
  )
不幸的是,我在某个地方出错了或者过于复杂。上述代码会出现以下错误:
错误 in `mutate()`:
ℹ In argument: `across(...)`.
Caused by error in `across()`:
! Can't compute column `vo2mlkg_12_actual_diff`.
Caused by error in `get()`:
! invalid first argument
Run `rlang::last_trace()` to see where the error occurred.
有人能提出修复或更简单的解决方案吗?
附加说明:
长数据:
Group| variable | phase | Value |
0       A           10        20     
1       B           20        19     
2       C           20        30     
1       D           10        25     
2       E           20        32    
0       F           10        NA     
0       G           20        19     
1       H           10        20     
2       I           10        20     
0       J           20        30
感谢@Maël的解决方案:
```R
library(dplyr)
library(tidyr)
library(magrittr)
df2 <- df[,-2] 
df2 %<>% ...
英文:
I have the following example data frame and would like to calculate the actual and percentage differences across multiple paired variables ("10" and "20" correspond to year tested) at once:
sample data:
Group| A_10  |    A_20 |  B_10 |  B_20 
0       20          21        20        23
1       30          10        19        11
2       10          53        30        34
1       22          32        25        20
2       34          40        32        30
0       30          50        NA        40
0       39          40        19        20
1       40          NA        20        20
2       50          10        20        10
0       34          23        30        10
This is the current working code:
library(dplyr)
# Assuming data frame is named 'df' and has the following structure:
# 'var1_1', 'var1_2', ... represent the first set of variables
# 'var2_1', 'var2_2', ... represent the second set of variables
# Define the pairs of variables for which you want to calculate the differences
variable_pairs <- list(
  c("A_10", "A_20"),
  c("B_10", "B_20")) # I have another 20 paired variabels    
# Calculate the actual and percentage differences for each variable pair
df6 <- df %>%
  mutate(
    across(
      all_of(unlist(variable_pairs)),
      ~ .x - get(variable_pairs[[cur_column()]][2]),
      .names = "{.col}_actual_diff"
    ),
    across(
      all_of(unlist(variable_pairs)),
      ~ (.x - get(variable_pairs[[cur_column()]][2])) / get(variable_pairs[[cur_column()]][2]) * 100,
      .names = "{.col}_percentage_diff"
    )
  )
Unfortunately I am going wrong somewhere or overcomplicating things. The above code give this error: Error in `mutate()`:
ℹ In argument: `across(...)`.
Caused by error in `across()`:
! Can't compute column `vo2mlkg_12_actual_diff`.
Caused by error in `get()`:
! invalid first argument
Run `rlang::last_trace()` to see where the error occurred.
Can anyone suggest a fix or a simpler solution.
addendum:
long data
Group| variable  | phase |  Value | 
0       A           10        20        
1       B           20        19        
2       C           20        30        
1       D           10        25        
2       E           20        32       
0       F           10        NA        
0       G           20        19        
1       H           10        20        
2       I           10        20        
0       J           20        30        
Solution thanks to @Maël:
library(dplyr)
library(tidyr)
library(magrittr)
df2 <- df[,-2] 
df2 %<>%
  pivot_longer(-group, names_sep = "_", names_to = c("set", ".value")) %>%
  {colnames(.) <- c("group", "set", "pre", "post"); .} %>%
  mutate(
    diff = post - pre,
    diff_perc = ((post - pre) / pre) * 100
  )%>%
  group_by(group, set) %>%
  summarize(
    mean_diff = mean(diff, na.rm = TRUE),
    mean_diff_perc = mean(diff_perc, na.rm = TRUE)
  ) %>%
  pivot_wider(names_from = set, values_from = c(mean_diff, mean_diff_perc))
答案1
得分: 1
你可以使用多个 across 函数来计算差异:
library(dplyr)
df %>%
  mutate(across(matches("_post$"), .names = "{gsub('post','', .col)}diff") - across(matches("_pre$")),
         (across(matches("_post$"), .names = "{gsub('post','', .col)}perc_diff") - across(matches("_pre$"))) / across(matches("_post$")))
或者,可能更简单的方法是,首先对数据进行透视,然后计算差异:
library(tidyr)
df %>%
  pivot_longer(-Group, names_sep = "_", names_to = c("set", ".value")) %>%
  mutate(diff = post - pre,
         diff_perc = (post - pre) / post)
英文:
You can use multiple across:
library(dplyr)
df %>% 
  mutate(across(matches("_post$"), .names = "{gsub('post','', .col)}diff") - across(matches("_pre$")),
         (across(matches("_post$"), .names = "{gsub('post','', .col)}perc_diff") - across(matches("_pre$"))) / across(matches("_post$"))) %>% 
# # A tibble: 10 × 9
#    Group A_pre A_post B_pre B_post A_diff B_diff A_perc_diff B_perc_diff
#    <int> <int>  <int> <int>  <int>  <int>  <int>       <dbl>       <dbl>
#  1     0    20     21    20     23      1      3      0.0476      0.130 
#  2     1    30     10    19     11    -20     -8     -2          -0.727 
#  3     2    10     53    30     34     43      4      0.811       0.118 
#  4     1    22     32    25     20     10     -5      0.312      -0.25  
#  5     2    34     40    32     30      6     -2      0.15       -0.0667
#  6     0    30     50    NA     40     20     NA      0.4        NA     
#  7     0    39     40    19     20      1      1      0.025       0.05  
#  8     1    40     NA    20     20     NA      0     NA           0     
#  9     2    50     10    20     10    -40    -10     -4          -1     
# 10     0    34     23    30     10    -11    -20     -0.478      -2     
Or, probably simpler, you can pivot your data first, and then compute the differences:
library(tidyr)
df %>% 
  pivot_longer(-Group, names_sep = "_", names_to = c("set", ".value")) %>% 
  mutate(diff = post - pre,
         diff_perc = (post - pre) / post)
# # A tibble: 20 × 6
#    Group set     pre  post  diff diff_perc
#    <int> <chr> <int> <int> <int>     <dbl>
#  1     0 A        20    21     1    0.0476
#  2     0 B        20    23     3    0.130 
#  3     1 A        30    10   -20   -2     
#  4     1 B        19    11    -8   -0.727 
#  5     2 A        10    53    43    0.811 
#  6     2 B        30    34     4    0.118 
#  7     1 A        22    32    10    0.312 
#  8     1 B        25    20    -5   -0.25  
#  9     2 A        34    40     6    0.15  
# 10     2 B        32    30    -2   -0.0667
# 11     0 A        30    50    20    0.4   
# 12     0 B        NA    40    NA   NA     
# 13     0 A        39    40     1    0.025 
# 14     0 B        19    20     1    0.05  
# 15     1 A        40    NA    NA   NA     
# 16     1 B        20    20     0    0     
# 17     2 A        50    10   -40   -4     
# 18     2 B        20    10   -10   -1     
# 19     0 A        34    23   -11   -0.478 
# 20     0 B        30    10   -20   -2     
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。


评论