2023年8月10日 19:24:32go评论119阅读模式

英文:

function for reshaping data efficiently

问题

I need to reshape a dataframe that looks like:

holdc

  country_n popClass_n org   P50M   P50L   P50U   P95M   P95L  P95U  P99M   P99L  P99U P99_9M P99_9L P99_9U
1   Austria     Adults ADG 0.0445 0.0392 0.0512 0.0914 0.0753 0.134 0.134 0.1010 0.230  0.152  0.115  0.258
2   Belgium     Adults ADG 0.0484 0.0398 0.0572 0.0944 0.0784 0.242 0.125 0.0994 0.338  0.171  0.131  0.690
3   Czechia     Adults ADG 0.0429 0.0352 0.0520 0.0901 0.0721 0.145 0.125 0.0948 0.296  0.183  0.121  0.434
4   Denmark     Adults ADG 0.0496 0.0415 0.0736 0.0885 0.0763 0.242 0.115 0.0984 0.363  0.159  0.125  0.564
5   Finland     Adults ADG 0.0422 0.0376 0.0486 0.0864 0.0746 0.106 0.117 0.1020 0.203  0.333  0.133  0.758
6    France     Adults ADG 0.0467 0.0383 0.0593 0.0893 0.0730 0.230 0.122 0.0986 0.353  0.179  0.129  0.578

into a new one that shall look like:

Concatenation    P50    P95   P99 P99_9
1  ADG_Adults_Austria_CI_50 0.0392 0.0914 0.134 0.152
2  ADG_Austria_Adults_CI2_5 0.0445 0.0753 0.101 0.115
3 ADG_Austria_Adults_CI97_5 0.0512 0.1340 0.230 0.258

Here's a more concise solution in R:

library(dplyr)
library(tidyr)

# Define a function to reshape the dataframe
reshape_df <- function(df) {
  df <- df %>%
    pivot_longer(cols = starts_with("P50"), names_to = "Concatenation", values_to = "P50") %>%
    mutate(Concatenation = paste(org, popClass_n, country_n, "CI_50", sep = "_")) %>%
    select(Concatenation, P50)
  
  df_U <- df %>%
    pivot_longer(cols = starts_with("P95"), names_to = "Concatenation", values_to = "P95") %>%
    mutate(Concatenation = paste(org, popClass_n, country_n, "CI_97_5", sep = "_")) %>%
    select(Concatenation, P95)
  
  df_L <- df %>%
    pivot_longer(cols = starts_with("P99"), names_to = "Concatenation", values_to = "P99") %>%
    mutate(Concatenation = paste(org, popClass_n, country_n, "CI_2_5", sep = "_")) %>%
    select(Concatenation, P99)
  
  df_99_9 <- df %>%
    pivot_longer(cols = starts_with("P99_9"), names_to = "Concatenation", values_to = "P99_9") %>%
    mutate(Concatenation = paste(org, popClass_n, country_n, "CI_0_1", sep = "_")) %>%
    select(Concatenation, P99_9)
  
  final_df <- bind_rows(df_U, df_L, df_99_9)
  
  return(final_df)
}

# Call the function with your dataframe 'holdc'
result_df <- reshape_df(holdc)

This code defines a function reshape_df that takes your original dataframe as input and reshapes it into the desired format using the pivot_longer function from the tidyr package. It creates separate dataframes for 'U', 'L', and '99.9' values and then combines them using bind_rows.

英文:

I need to reshape a dataframe that looks like:

holdc

  country_n popClass_n org   P50M   P50L   P50U   P95M   P95L  P95U  P99M   P99L  P99U P99_9M P99_9L P99_9U
1   Austria     Adults ADG 0.0445 0.0392 0.0512 0.0914 0.0753 0.134 0.134 0.1010 0.230  0.152  0.115  0.258
2   Belgium     Adults ADG 0.0484 0.0398 0.0572 0.0944 0.0784 0.242 0.125 0.0994 0.338  0.171  0.131  0.690
3   Czechia     Adults ADG 0.0429 0.0352 0.0520 0.0901 0.0721 0.145 0.125 0.0948 0.296  0.183  0.121  0.434
4   Denmark     Adults ADG 0.0496 0.0415 0.0736 0.0885 0.0763 0.242 0.115 0.0984 0.363  0.159  0.125  0.564
5   Finland     Adults ADG 0.0422 0.0376 0.0486 0.0864 0.0746 0.106 0.117 0.1020 0.203  0.333  0.133  0.758
6    France     Adults ADG 0.0467 0.0383 0.0593 0.0893 0.0730 0.230 0.122 0.0986 0.353  0.179  0.129  0.578

into a new one that shall look like:

Concatenation    P50    P95   P99 P99_9
1  ADG_Adults_Austria_CI_50 0.0392 0.0914 0.134 0.152
2  ADG_Austria_Adults_CI2_5 0.0445 0.0753 0.101 0.115
3 ADG_Austria_Adults_CI97_5 0.0512 0.1340 0.230 0.258

and so forth for all country_n, popClass_n, and org.
Each row of the initial dataframe needs to split into three rows in the new dataframe: one row for the 'M' values, one row for the 'U' values and one row for the 'L' values.
I have written a function in base R that does it, but was wondering if you can suggest a more concise and fast solution, since I have to apply it to a a very large dataset.
My code so far is:

dfM &lt;- f.arrange_perc(holdc,&#39;CI_50&#39;)
dfL &lt;- f.arrange_perc(holdc,&#39;CI_2_5&#39;)
dfU &lt;- f.arrange_perc(holdc,&#39;CI_97_5&#39;)

df &lt;- rbind(dfM, dfL,dfU)

f.arrange_perc &lt;- function(holdc,sCI){
  df &lt;- data.frame(matrix(&quot;&quot;, ncol = 5, nrow = 3*nrow(holdc))) 
  names(df) &lt;- c(&#39;Concatenation&#39;,&#39;P50&#39;,&#39;P95&#39;,&#39;P99&#39;,&#39;P99_9&#39;)
  
  df$Concatenation &lt;- paste(holdc$org,holdc$popClass_n,holdc$country_n, 
                            sCI, sep=&#39;_&#39;)
  if(sCI==&#39;CI_50&#39;){
    df$P50 &lt;- holdc$P50M
    df$P95 &lt;- holdc$P95M
    df$P99 &lt;- holdc$P99M
    df$P99_9 &lt;- holdc$P99_9M
  } else if (sCI==&#39;CI_2_5&#39;){
    df$P50 &lt;- holdc$P50L
    df$P95 &lt;- holdc$P95L
    df$P99 &lt;- holdc$P99L
    df$P99_9 &lt;- holdc$P99_9L
    
  } else if (sCI==&#39;CI_97_5&#39;){
    df$P50 &lt;- holdc$P50U
    df$P95 &lt;- holdc$P95U
    df$P99 &lt;- holdc$P99U
    df$P99_9 &lt;- holdc$P99_9U
    
  }
  return(df)
}

thanks for any hint

答案1

得分: 1

一行代码：

tidyr::pivot_longer(df, cols = -c("country_n", "popClass_n", "org"), names_to = c(".value", "value"), names_pattern = "(P\\d+)([A-Z]+)")

尽管我理解你想要创建一个连接列的愿望，但最好不要这样做，如果不是因为整洁数据原则，那就是因为这样做会使以后使用数据变得更容易。

更新：带有连接的答案：

pivot_longer(df, cols = -c("country_n", "popClass_n", "org"), names_to = c(".value", "value"), names_pattern = "(P[\\d_]+)([A-Z]+)") %>%
   mutate(Concatenation = paste(org, popClass_n, country_n, sep = "_"), .before = 1, .keep = "unused")

输出：

# A tibble: 18 × 6
   Concatenation      value    P50    P95    P99 P99_9
   <chr>              <chr>  <dbl>  <dbl>  <dbl> <dbl>
 1 ADG_Adults_Austria M     0.0445 0.0914 0.134  0.152
 2 ADG_Adults_Austria L     0.0392 0.0753 0.101  0.115
 3 ADG_Adults_Austria U     0.0512 0.134  0.23   0.258
 4 ADG_Adults_Belgium M     0.0484 0.0944 0.125  0.171
 5 ADG_Adults_Belgium L     0.0398 0.0784 0.0994 0.131
 6 ADG_Adults_Belgium U     0.0572 0.242  0.338  0.69 
 7 ADG_Adults_Czechia M     0.0429 0.0901 0.125  0.183
 8 ADG_Adults_Czechia L     0.0352 0.0721 0.0948 0.121
 9 ADG_Adults_Czechia U     0.052  0.145  0.296  0.434
10 ADG_Adults_Denmark M     0.0496 0.0885 0.115  0.159
11 ADG_Adults_Denmark L     0.0415 0.0763 0.0984 0.125
12 ADG_Adults_Denmark U     0.0736 0.242  0.363  0.564
13 ADG_Adults_Finland M     0.0422 0.0864 0.117  0.333
14 ADG_Adults_Finland L     0.0376 0.0746 0.102  0.133
15 ADG_Adults_Finland U     0.0486 0.106  0.203  0.758
16 ADG_Adults_France  M     0.0467 0.0893 0.122  0.179
17 ADG_Adults_France  L     0.0383 0.073  0.0986 0.129
18 ADG_Adults_France  U     0.0593 0.23   0.353  0.578

英文:

A one-liner:

tidyr::pivot_longer(df, cols = -c(&quot;country_n&quot;, &quot;popClass_n&quot;, &quot;org&quot;), names_to = c(&quot;.value&quot;, &quot;value&quot;), names_pattern = &quot;(P\\d+)([A-Z]+)&quot;)

As much as I can understand the desire to want to create a concatenation column, you're much better off not doing that, if not for tidy data principles, then simply because it makes it much easier to use the data later.

Update: answer for new data, with concatenation:

pivot_longer(df, cols = -c(&quot;country_n&quot;, &quot;popClass_n&quot;, &quot;org&quot;), names_to = c(&quot;.value&quot;, &quot;value&quot;), names_pattern = &quot;(P[\\d_]+)([A-Z]+)&quot;) |&gt;
   mutate(Concatenation = paste(org, popClass_n, country_n, sep = &quot;_&quot;), .before = 1, .keep = &quot;unused&quot;)

Output:

# A tibble: 18 &#215; 6
   Concatenation      value    P50    P95    P99 P99_9
   &lt;chr&gt;              &lt;chr&gt;  &lt;dbl&gt;  &lt;dbl&gt;  &lt;dbl&gt; &lt;dbl&gt;
 1 ADG_Adults_Austria M     0.0445 0.0914 0.134  0.152
 2 ADG_Adults_Austria L     0.0392 0.0753 0.101  0.115
 3 ADG_Adults_Austria U     0.0512 0.134  0.23   0.258
 4 ADG_Adults_Belgium M     0.0484 0.0944 0.125  0.171
 5 ADG_Adults_Belgium L     0.0398 0.0784 0.0994 0.131
 6 ADG_Adults_Belgium U     0.0572 0.242  0.338  0.69 
 7 ADG_Adults_Czechia M     0.0429 0.0901 0.125  0.183
 8 ADG_Adults_Czechia L     0.0352 0.0721 0.0948 0.121
 9 ADG_Adults_Czechia U     0.052  0.145  0.296  0.434
10 ADG_Adults_Denmark M     0.0496 0.0885 0.115  0.159
11 ADG_Adults_Denmark L     0.0415 0.0763 0.0984 0.125
12 ADG_Adults_Denmark U     0.0736 0.242  0.363  0.564
13 ADG_Adults_Finland M     0.0422 0.0864 0.117  0.333
14 ADG_Adults_Finland L     0.0376 0.0746 0.102  0.133
15 ADG_Adults_Finland U     0.0486 0.106  0.203  0.758
16 ADG_Adults_France  M     0.0467 0.0893 0.122  0.179
17 ADG_Adults_France  L     0.0383 0.073  0.0986 0.129
18 ADG_Adults_France  U     0.0593 0.23   0.353  0.578

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

函数用于高效地重塑数据

问题

答案1

替换Pandas数据框中的一列值，如果另一列包含特定字符串。

echarts4r在Quarto中无法渲染。

在R中运行GDC_prepare时出现的问题

如何使我在R中制作的图表看起来像我在Python中有的那样？

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论