英文:
function for reshaping data efficiently
问题
I need to reshape a dataframe that looks like:
holdc
country_n popClass_n org P50M P50L P50U P95M P95L P95U P99M P99L P99U P99_9M P99_9L P99_9U
1 Austria Adults ADG 0.0445 0.0392 0.0512 0.0914 0.0753 0.134 0.134 0.1010 0.230 0.152 0.115 0.258
2 Belgium Adults ADG 0.0484 0.0398 0.0572 0.0944 0.0784 0.242 0.125 0.0994 0.338 0.171 0.131 0.690
3 Czechia Adults ADG 0.0429 0.0352 0.0520 0.0901 0.0721 0.145 0.125 0.0948 0.296 0.183 0.121 0.434
4 Denmark Adults ADG 0.0496 0.0415 0.0736 0.0885 0.0763 0.242 0.115 0.0984 0.363 0.159 0.125 0.564
5 Finland Adults ADG 0.0422 0.0376 0.0486 0.0864 0.0746 0.106 0.117 0.1020 0.203 0.333 0.133 0.758
6 France Adults ADG 0.0467 0.0383 0.0593 0.0893 0.0730 0.230 0.122 0.0986 0.353 0.179 0.129 0.578
into a new one that shall look like:
Concatenation P50 P95 P99 P99_9
1 ADG_Adults_Austria_CI_50 0.0392 0.0914 0.134 0.152
2 ADG_Austria_Adults_CI2_5 0.0445 0.0753 0.101 0.115
3 ADG_Austria_Adults_CI97_5 0.0512 0.1340 0.230 0.258
and so forth for all country_n
, popClass_n
, and org
. Each row of the initial dataframe needs to split into three rows in the new dataframe: one row for the 'M' values, one row for the 'U' values, and one row for the 'L' values.
Here's a more concise solution in R:
library(dplyr)
library(tidyr)
# Define a function to reshape the dataframe
reshape_df <- function(df) {
df <- df %>%
pivot_longer(cols = starts_with("P50"), names_to = "Concatenation", values_to = "P50") %>%
mutate(Concatenation = paste(org, popClass_n, country_n, "CI_50", sep = "_")) %>%
select(Concatenation, P50)
df_U <- df %>%
pivot_longer(cols = starts_with("P95"), names_to = "Concatenation", values_to = "P95") %>%
mutate(Concatenation = paste(org, popClass_n, country_n, "CI_97_5", sep = "_")) %>%
select(Concatenation, P95)
df_L <- df %>%
pivot_longer(cols = starts_with("P99"), names_to = "Concatenation", values_to = "P99") %>%
mutate(Concatenation = paste(org, popClass_n, country_n, "CI_2_5", sep = "_")) %>%
select(Concatenation, P99)
df_99_9 <- df %>%
pivot_longer(cols = starts_with("P99_9"), names_to = "Concatenation", values_to = "P99_9") %>%
mutate(Concatenation = paste(org, popClass_n, country_n, "CI_0_1", sep = "_")) %>%
select(Concatenation, P99_9)
final_df <- bind_rows(df_U, df_L, df_99_9)
return(final_df)
}
# Call the function with your dataframe 'holdc'
result_df <- reshape_df(holdc)
This code defines a function reshape_df
that takes your original dataframe as input and reshapes it into the desired format using the pivot_longer
function from the tidyr
package. It creates separate dataframes for 'U', 'L', and '99.9' values and then combines them using bind_rows
.
英文:
I need to reshape a dataframe that looks like:
holdc
country_n popClass_n org P50M P50L P50U P95M P95L P95U P99M P99L P99U P99_9M P99_9L P99_9U
1 Austria Adults ADG 0.0445 0.0392 0.0512 0.0914 0.0753 0.134 0.134 0.1010 0.230 0.152 0.115 0.258
2 Belgium Adults ADG 0.0484 0.0398 0.0572 0.0944 0.0784 0.242 0.125 0.0994 0.338 0.171 0.131 0.690
3 Czechia Adults ADG 0.0429 0.0352 0.0520 0.0901 0.0721 0.145 0.125 0.0948 0.296 0.183 0.121 0.434
4 Denmark Adults ADG 0.0496 0.0415 0.0736 0.0885 0.0763 0.242 0.115 0.0984 0.363 0.159 0.125 0.564
5 Finland Adults ADG 0.0422 0.0376 0.0486 0.0864 0.0746 0.106 0.117 0.1020 0.203 0.333 0.133 0.758
6 France Adults ADG 0.0467 0.0383 0.0593 0.0893 0.0730 0.230 0.122 0.0986 0.353 0.179 0.129 0.578
into a new one that shall look like:
Concatenation P50 P95 P99 P99_9
1 ADG_Adults_Austria_CI_50 0.0392 0.0914 0.134 0.152
2 ADG_Austria_Adults_CI2_5 0.0445 0.0753 0.101 0.115
3 ADG_Austria_Adults_CI97_5 0.0512 0.1340 0.230 0.258
and so forth for all country_n
, popClass_n
, and org
.
Each row of the initial dataframe needs to split into three rows in the new dataframe: one row for the 'M' values, one row for the 'U' values and one row for the 'L' values.
I have written a function in base R that does it, but was wondering if you can suggest a more concise and fast solution, since I have to apply it to a a very large dataset.
My code so far is:
dfM <- f.arrange_perc(holdc,'CI_50')
dfL <- f.arrange_perc(holdc,'CI_2_5')
dfU <- f.arrange_perc(holdc,'CI_97_5')
df <- rbind(dfM, dfL,dfU)
f.arrange_perc <- function(holdc,sCI){
df <- data.frame(matrix("", ncol = 5, nrow = 3*nrow(holdc)))
names(df) <- c('Concatenation','P50','P95','P99','P99_9')
df$Concatenation <- paste(holdc$org,holdc$popClass_n,holdc$country_n,
sCI, sep='_')
if(sCI=='CI_50'){
df$P50 <- holdc$P50M
df$P95 <- holdc$P95M
df$P99 <- holdc$P99M
df$P99_9 <- holdc$P99_9M
} else if (sCI=='CI_2_5'){
df$P50 <- holdc$P50L
df$P95 <- holdc$P95L
df$P99 <- holdc$P99L
df$P99_9 <- holdc$P99_9L
} else if (sCI=='CI_97_5'){
df$P50 <- holdc$P50U
df$P95 <- holdc$P95U
df$P99 <- holdc$P99U
df$P99_9 <- holdc$P99_9U
}
return(df)
}
thanks for any hint
答案1
得分: 1
一行代码:
tidyr::pivot_longer(df, cols = -c("country_n", "popClass_n", "org"), names_to = c(".value", "value"), names_pattern = "(P\\d+)([A-Z]+)")
尽管我理解你想要创建一个连接列的愿望,但最好不要这样做,如果不是因为 整洁数据原则,那就是因为这样做会使以后使用数据变得更容易。
更新:带有连接的答案:
pivot_longer(df, cols = -c("country_n", "popClass_n", "org"), names_to = c(".value", "value"), names_pattern = "(P[\\d_]+)([A-Z]+)") %>%
mutate(Concatenation = paste(org, popClass_n, country_n, sep = "_"), .before = 1, .keep = "unused")
输出:
# A tibble: 18 × 6
Concatenation value P50 P95 P99 P99_9
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 ADG_Adults_Austria M 0.0445 0.0914 0.134 0.152
2 ADG_Adults_Austria L 0.0392 0.0753 0.101 0.115
3 ADG_Adults_Austria U 0.0512 0.134 0.23 0.258
4 ADG_Adults_Belgium M 0.0484 0.0944 0.125 0.171
5 ADG_Adults_Belgium L 0.0398 0.0784 0.0994 0.131
6 ADG_Adults_Belgium U 0.0572 0.242 0.338 0.69
7 ADG_Adults_Czechia M 0.0429 0.0901 0.125 0.183
8 ADG_Adults_Czechia L 0.0352 0.0721 0.0948 0.121
9 ADG_Adults_Czechia U 0.052 0.145 0.296 0.434
10 ADG_Adults_Denmark M 0.0496 0.0885 0.115 0.159
11 ADG_Adults_Denmark L 0.0415 0.0763 0.0984 0.125
12 ADG_Adults_Denmark U 0.0736 0.242 0.363 0.564
13 ADG_Adults_Finland M 0.0422 0.0864 0.117 0.333
14 ADG_Adults_Finland L 0.0376 0.0746 0.102 0.133
15 ADG_Adults_Finland U 0.0486 0.106 0.203 0.758
16 ADG_Adults_France M 0.0467 0.0893 0.122 0.179
17 ADG_Adults_France L 0.0383 0.073 0.0986 0.129
18 ADG_Adults_France U 0.0593 0.23 0.353 0.578
英文:
A one-liner:
tidyr::pivot_longer(df, cols = -c("country_n", "popClass_n", "org"), names_to = c(".value", "value"), names_pattern = "(P\\d+)([A-Z]+)")
As much as I can understand the desire to want to create a concatenation column, you're much better off not doing that, if not for tidy data principles, then simply because it makes it much easier to use the data later.
Update: answer for new data, with concatenation:
pivot_longer(df, cols = -c("country_n", "popClass_n", "org"), names_to = c(".value", "value"), names_pattern = "(P[\\d_]+)([A-Z]+)") |>
mutate(Concatenation = paste(org, popClass_n, country_n, sep = "_"), .before = 1, .keep = "unused")
Output:
# A tibble: 18 × 6
Concatenation value P50 P95 P99 P99_9
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 ADG_Adults_Austria M 0.0445 0.0914 0.134 0.152
2 ADG_Adults_Austria L 0.0392 0.0753 0.101 0.115
3 ADG_Adults_Austria U 0.0512 0.134 0.23 0.258
4 ADG_Adults_Belgium M 0.0484 0.0944 0.125 0.171
5 ADG_Adults_Belgium L 0.0398 0.0784 0.0994 0.131
6 ADG_Adults_Belgium U 0.0572 0.242 0.338 0.69
7 ADG_Adults_Czechia M 0.0429 0.0901 0.125 0.183
8 ADG_Adults_Czechia L 0.0352 0.0721 0.0948 0.121
9 ADG_Adults_Czechia U 0.052 0.145 0.296 0.434
10 ADG_Adults_Denmark M 0.0496 0.0885 0.115 0.159
11 ADG_Adults_Denmark L 0.0415 0.0763 0.0984 0.125
12 ADG_Adults_Denmark U 0.0736 0.242 0.363 0.564
13 ADG_Adults_Finland M 0.0422 0.0864 0.117 0.333
14 ADG_Adults_Finland L 0.0376 0.0746 0.102 0.133
15 ADG_Adults_Finland U 0.0486 0.106 0.203 0.758
16 ADG_Adults_France M 0.0467 0.0893 0.122 0.179
17 ADG_Adults_France L 0.0383 0.073 0.0986 0.129
18 ADG_Adults_France U 0.0593 0.23 0.353 0.578
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论