英文:
How to create new column with name of column that contains maximum value using dplyr in R?
问题
以下是代码部分的翻译:
我有这样一个数据框:
dat <- data.frame(var1 = rnorm(10), var2 = rnorm(10), var3 = rnorm(10), var4 = rnorm(10))
> dat
var1 var2 var3 var4
1 -1.3784414 1.06816022 1.46578217 -0.4141153
2 -0.3272332 -0.69470574 0.02220395 -0.5502878
3 0.2559891 -0.06964848 -0.34745180 0.6399705
4 0.6029044 1.23680560 -0.72392358 -0.1990832
5 1.3097174 -0.58028595 -0.01487186 -0.8765290
6 -1.2356668 0.41330063 -1.00375989 -1.1974204
7 -0.4126320 3.83320678 -1.42059022 -0.6747575
8 1.7339653 0.58610348 0.40200428 1.4582103
9 1.2994859 1.65355306 0.75985071 0.6455882
10 -0.2353356 2.04468739 -0.11521602 0.3251901
目标是创建一个新的列,该列包含每行中在var2、var3和var4列中包含最大值的列的名称。
使用以下命令不会得到正确的输出:
library(dplyr)
dat %>%
rowwise() %>%
mutate(var.max = colnames(.)[which.max(c_across(var2:var4))])
# A tibble: 10 x 5
# Rowwise:
var1 var2 var3 var4 var.max
<dbl> <dbl> <dbl> <dbl> <chr>
1 -1.38 1.07 1.47 -0.414 var2
2 -0.327 -0.695 0.0222 -0.550 var2
3 0.256 -0.0696 -0.347 0.640 var3
4 0.603 1.24 -0.724 -0.199 var1
5 1.31 -0.580 -0.0149 -0.877 var2
6 -1.24 0.413 -1.00 -1.20 var1
7 -0.413 3.83 -1.42 -0.675 var1
8 1.73 0.586 0.402 1.46 var3
9 1.30 1.65 0.760 0.646 var1
10 -0.235 2.04 -0.115 0.325 var1
但是,如果从数据中排除列var1,它可以正常工作:
dat %>%
select(-var1) %>%
rowwise() %>%
mutate(var.max = colnames(.)[which.max(c_across(var2:var4))])
# A tibble: 10 x 4
# Rowwise:
var2 var3 var4 var.max
<dbl> <dbl> <dbl> <chr>
1 1.07 1.47 -0.414 var3
2 -0.695 0.0222 -0.550 var3
3 -0.0696 -0.347 0.640 var4
4 1.24 -0.724 -0.199 var2
5 -0.580 -0.0149 -0.877 var3
6 0.413 -1.00 -1.20 var2
7 3.83 -1.42 -0.675 var2
8 0.586 0.402 1.46 var4
9 1.65 0.760 0.646 var2
10 2.04 -0.115 0.325 var2
就像当var1在最后位置时一样:
dat %>%
select(var2, var3, var4, var1) %>%
rowwise() %>%
mutate(var.max = colnames(.)[which.max(c_across(var2:var4))])
# A tibble: 10 x 5
# Rowwise:
var2 var3 var4 var1 var.max
<dbl> <dbl> <dbl> <dbl> <chr>
1 1.07 1.47 -0.414 -1.38 var3
2 -0.695 0.0222 -0.550 -0.327 var3
3 -0.0696 -0.347 0.640 0.256 var4
4 1.24 -0.724 -0.199 0.603 var2
5 -0.580 -0.0149 -0.877 1.31 var3
6 0.413 -1.00 -1.20 -1.24 var2
7 3.83 -1.42 -0.675 -0.413 var2
8 0.586 0.402 1.46 1.73 var4
9 1.65 0.760 0.646 1.30 var2
10 2.04 -0.115 0.325 -0.235 var2
我在这里漏掉了什么?
英文:
I have such a data frame:
dat <- data.frame(var1 = rnorm(10), var2 = rnorm(10), var3 = rnorm(10), var4 = rnorm(10))
> dat
var1 var2 var3 var4
1 -1.3784414 1.06816022 1.46578217 -0.4141153
2 -0.3272332 -0.69470574 0.02220395 -0.5502878
3 0.2559891 -0.06964848 -0.34745180 0.6399705
4 0.6029044 1.23680560 -0.72392358 -0.1990832
5 1.3097174 -0.58028595 -0.01487186 -0.8765290
6 -1.2356668 0.41330063 -1.00375989 -1.1974204
7 -0.4126320 3.83320678 -1.42059022 -0.6747575
8 1.7339653 0.58610348 0.40200428 1.4582103
9 1.2994859 1.65355306 0.75985071 0.6455882
10 -0.2353356 2.04468739 -0.11521602 0.3251901
The aim is to create a new column with the name of the column that contains the maximum value in each row within columns var2, var3 and var4.
Using the following command does not result in the correct output:
library(dplyr)
dat %>%
rowwise() %>%
mutate(var.max = colnames(.)[which.max(c_across(var2:var4))])
# A tibble: 10 x 5
# Rowwise:
var1 var2 var3 var4 var.max
<dbl> <dbl> <dbl> <dbl> <chr>
1 -1.38 1.07 1.47 -0.414 var2
2 -0.327 -0.695 0.0222 -0.550 var2
3 0.256 -0.0696 -0.347 0.640 var3
4 0.603 1.24 -0.724 -0.199 var1
5 1.31 -0.580 -0.0149 -0.877 var2
6 -1.24 0.413 -1.00 -1.20 var1
7 -0.413 3.83 -1.42 -0.675 var1
8 1.73 0.586 0.402 1.46 var3
9 1.30 1.65 0.760 0.646 var1
10 -0.235 2.04 -0.115 0.325 var1
But if the column var1 is excluded from the data it works:
dat %>%
select(-var1) %>%
rowwise() %>%
mutate(var.max = colnames(.)[which.max(c_across(var2:var4))])
# A tibble: 10 x 4
# Rowwise:
var2 var3 var4 var.max
<dbl> <dbl> <dbl> <chr>
1 1.07 1.47 -0.414 var3
2 -0.695 0.0222 -0.550 var3
3 -0.0696 -0.347 0.640 var4
4 1.24 -0.724 -0.199 var2
5 -0.580 -0.0149 -0.877 var3
6 0.413 -1.00 -1.20 var2
7 3.83 -1.42 -0.675 var2
8 0.586 0.402 1.46 var4
9 1.65 0.760 0.646 var2
10 2.04 -0.115 0.325 var2
.. just like when var1 is at the last position:
dat %>%
select(var2, var3, var4, var1) %>%
rowwise() %>%
mutate(var.max = colnames(.)[which.max(c_across(var2:var4))])
# A tibble: 10 x 5
# Rowwise:
var2 var3 var4 var1 var.max
<dbl> <dbl> <dbl> <dbl> <chr>
1 1.07 1.47 -0.414 -1.38 var3
2 -0.695 0.0222 -0.550 -0.327 var3
3 -0.0696 -0.347 0.640 0.256 var4
4 1.24 -0.724 -0.199 0.603 var2
5 -0.580 -0.0149 -0.877 1.31 var3
6 0.413 -1.00 -1.20 -1.24 var2
7 3.83 -1.42 -0.675 -0.413 var2
8 0.586 0.402 1.46 1.73 var4
9 1.65 0.760 0.646 1.30 var2
10 2.04 -0.115 0.325 -0.235 var2
What am I missing here?
答案1
得分: 3
按照你原来的思路,由于只是跳过了第一列,只需在 which.max() 的结果上加 1,即:
# For each row, find which of var2..var4 holds the largest value and store
# that column's name in `max_col`.
library(dplyr)
dat %>%
rowwise() %>%
# which.max() returns a position within c_across(var2:var4) (1 = var2), but
# names(dat) also contains var1 in front, so the index is shifted by 1.
mutate(max_col = names(dat)[which.max(c_across(var2:var4)) + 1])
如果你想指定要考虑的列,可以这样做:
# Columns to compare; they may sit anywhere in `dat`, in any order.
my_cols <- c("var2", "var3", "var4")
dat %>%
  rowwise() %>%
  # c_across(all_of(my_cols)) returns the row's values in the order of
  # `my_cols`, so the position from which.max() indexes `my_cols` directly.
  # No offset derived from the column layout of `dat` is needed (the previous
  # `ncol(dat) - length(my_cols)` offset was only right when the chosen
  # columns were the last columns of `dat`).
  # all_of() raises an error if a requested column is missing from `dat`.
  mutate(max_col = my_cols[which.max(c_across(all_of(my_cols)))])
英文:
To continue your logic and since you are only removing the first column, just add 1 to which.max()
, i.e.
# For each row, find which of var2..var4 holds the largest value and store
# that column's name in `max_col`.
library(dplyr)
dat %>%
rowwise() %>%
# which.max() returns a position within c_across(var2:var4) (1 = var2), but
# names(dat) also contains var1 in front, so the index is shifted by 1.
mutate(max_col = names(dat)[which.max(c_across(var2:var4)) + 1])
# A tibble: 10 × 5
# Rowwise:
var1 var2 var3 var4 max_col
<dbl> <dbl> <dbl> <dbl> <chr>
1 -1.09 0.768 0.251 -2.67 var2
2 -0.822 -1.37 0.901 1.83 var4
3 0.0280 -0.00555 -0.0709 0.729 var4
4 1.45 -0.132 -2.47 1.45 var4
5 0.506 -1.31 -2.75 -0.264 var4
6 -0.00538 1.31 -0.368 0.00679 var2
7 -0.166 -0.976 -1.42 1.50 var4
8 -0.377 -0.101 0.135 0.784 var4
9 0.535 0.438 0.0597 0.924 var4
10 0.281 -0.481 -0.00177 -0.601 var3
If you want to do it by specifying which columns to consider then,
# Columns to compare; they may sit anywhere in `dat`, in any order.
my_cols <- c("var2", "var3", "var4")
dat %>%
  rowwise() %>%
  # c_across(all_of(my_cols)) returns the row's values in the order of
  # `my_cols`, so the position from which.max() indexes `my_cols` directly.
  # No offset derived from the column layout of `dat` is needed (the previous
  # `ncol(dat) - length(my_cols)` offset was only right when the chosen
  # columns were the last columns of `dat`).
  # all_of() raises an error if a requested column is missing from `dat`.
  mutate(max_col = my_cols[which.max(c_across(all_of(my_cols)))])
答案2
得分: 1
如果想避免手动加上被排除的列数(上例中为 +1),可以使用 across() 或 pick() 编写一个自定义函数 max_col_name():
library(dplyr)
max_col_name <- function(...) {
  # Return the name of the selected column holding the largest value.
  # `...` is a tidyselect column specification (e.g. var2:var4); call this
  # inside a dplyr verb such as mutate() on a rowwise data frame.
  # With dplyr >= 1.1, `pick()` can be used instead of `across()`.
  selected <- across(c(...))
  top_position <- which.max(selected)
  names(selected)[top_position]
}
dat %>%
rowwise() %>%
mutate(max_col = max_col_name(var2:var4))
数据来源于 OP:
set.seed(123)
dat <- data.frame(var1 = rnorm(10), var2 = rnorm(10), var3 = rnorm(10), var4 = rnorm(10))
创建于 2023-02-23,使用 reprex 包 (v2.0.1)
英文:
If you want to avoid adding the number of columns that are left out (+1 in the case above), you can write a custom function max_col_name() using across() or pick():
library(dplyr)
max_col_name <- function(...) {
  # Return the name of the selected column holding the largest value.
  # `...` is a tidyselect column specification (e.g. var2:var4); call this
  # inside a dplyr verb such as mutate() on a rowwise data frame.
  # With dplyr >= 1.1, `pick()` can be used instead of `across()`.
  selected <- across(c(...))
  top_position <- which.max(selected)
  names(selected)[top_position]
}
dat %>%
rowwise() %>%
mutate(max_col = max_col_name(var2:var4))
#> # A tibble: 10 x 5
#> # Rowwise:
#> var1 var2 var3 var4 max_col
#> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 -0.560 1.22 -1.07 0.426 var2
#> 2 -0.230 0.360 -0.218 -0.295 var2
#> 3 1.56 0.401 -1.03 0.895 var4
#> 4 0.0705 0.111 -0.729 0.878 var4
#> 5 0.129 -0.556 -0.625 0.822 var4
#> 6 1.72 1.79 -1.69 0.689 var2
#> 7 0.461 0.498 0.838 0.554 var3
#> 8 -1.27 -1.97 0.153 -0.0619 var3
#> 9 -0.687 0.701 -1.14 -0.306 var2
#> 10 -0.446 -0.473 1.25 -0.380 var3
Data from the OP
set.seed(123)
dat <- data.frame(var1 = rnorm(10), var2 = rnorm(10), var3 = rnorm(10), var4 = rnorm(10))
<sup>Created on 2023-02-23 by the reprex package (v2.0.1)</sup>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论