我如何在R中整理这些数据?

huangapple go评论63阅读模式
英文:

How I can sort these data out in R?

问题

这是我的数据:

data <- data.frame(
  ID = 1:6,
  Course1A = c("A1", "A1", "A1", "A1", "A1", "A1"),
  Time1A = c(1, 1, 1, 1, 1, 1),
  Score1A = c(20, 17, 20, 16, 20, 11),
  Course2B = c("B1", "B1", "B1", "B1", "B1", "B1"),
  Time2B = c(1, 1, 1, 1, 1, 1),
  Score2B = c(12, 11, 15, 15, 10, 15),
  Course3C = c("C1", "C1", "C1", "C1", "C1", "C1"),
  TimeC3 = c(1, 1, 1, 1, 1, 1),
  ScoreC3 = c(10, 12, 12, 10, 10, 10),
  Course4D = c("D1", "D1", "D1", "D1", "D1", "D1"),
  TimeD4 = c(1, 1, 1, 1, 1, 1),
  ScoreD4 = c(20, 20, 18, 20, 17, 20),
  Course5E = c("E1", "E1", "E1", "E1", "E1", "E1"),
  TimeE5 = c(2, 2, 2, 2, 2, 2),
  Score5E = c(12, 12, 12, 12, 11, 11),
  Course6F = c("F1", "F1", "F1", "F1", "F1", "F1"),
  TimeF6 = c(2, 2, 2, 2, 2, 2),
  ScoreF6 = c(10, 10, 10, 10, 10, 9),
  Course7G = c("G1", "G1", "G1", "G1", "G1", "G1"),
  TimeG7 = c(3, 3, 3, 3, 3, 3),
  Score7G = c(12, 11, 6, 4, 12, 9),
  Course8H = c("H1", "H1", "H1", "H1", "H1", "H1"),
  TimeH8 = c(3, 3, 3, 3, 3, 3),
  Score8H = c(12, 12, 12, 12, 10, 10),
  Gender = c("F", "F", "F", "F", "F", "F"),
  Race = c("A", "B", "C", "C", "C", "C"),
  Health = c("Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
  stringsAsFactors = FALSE
)

我想获取ID、Course、Time、Gender、Race、Health和Score的列。

我已经使用了以下代码,但得到了错误的表格:

```R
reshaped_data <- data %>% 
  pivot_longer(
    cols = -c(ID, Gender, Race, Health),
    names_to = c(".value", "Group"),
    names_pattern = "([A-Za-z]+)([0-9]+[A-Z])"
  )
英文:

Here is my data:

data &lt;- data.frame(
ID = 1:6,
Course1A = c(&quot;A1&quot;, &quot;A1&quot;, &quot;A1&quot;, &quot;A1&quot;, &quot;A1&quot;, &quot;A1&quot;),
Time1A = c(1, 1, 1, 1, 1, 1),
Score1A = c(20, 17, 20, 16, 20, 11),
Course2B = c(&quot;B1&quot;, &quot;B1&quot;, &quot;B1&quot;, &quot;B1&quot;, &quot;B1&quot;, &quot;B1&quot;),
Time2B = c(1, 1, 1, 1, 1, 1),
Score2B = c(12, 11, 15, 15, 10, 15),
Course3C = c(&quot;C1&quot;, &quot;C1&quot;, &quot;C1&quot;, &quot;C1&quot;, &quot;C1&quot;, &quot;C1&quot;),
TimeC3 = c(1, 1, 1, 1, 1, 1),
ScoreC3 = c(10, 12, 12, 10, 10, 10),
Course4D = c(&quot;D1&quot;, &quot;D1&quot;, &quot;D1&quot;, &quot;D1&quot;, &quot;D1&quot;, &quot;D1&quot;),
TimeD4 = c(1, 1, 1, 1, 1, 1),
ScoreD4 = c(20, 20, 18, 20, 17, 20),
Course5E = c(&quot;E1&quot;, &quot;E1&quot;, &quot;E1&quot;, &quot;E1&quot;, &quot;E1&quot;, &quot;E1&quot;),
TimeE5 = c(2, 2, 2, 2, 2, 2),
Score5E = c(12, 12, 12, 12, 11, 11),
Course6F = c(&quot;F1&quot;, &quot;F1&quot;, &quot;F1&quot;, &quot;F1&quot;, &quot;F1&quot;, &quot;F1&quot;),
TimeF6 = c(2, 2, 2, 2, 2, 2),
ScoreF6 = c(10, 10, 10, 10, 10, 9),
Course7G = c(&quot;G1&quot;, &quot;G1&quot;, &quot;G1&quot;, &quot;G1&quot;, &quot;G1&quot;, &quot;G1&quot;),
TimeG7 = c(3, 3, 3, 3, 3, 3),
Score7G = c(12, 11, 6, 4, 12, 9),
Course8H = c(&quot;H1&quot;, &quot;H1&quot;, &quot;H1&quot;, &quot;H1&quot;, &quot;H1&quot;, &quot;H1&quot;),
TimeH8 = c(3, 3, 3, 3, 3, 3),
Score8H = c(12, 12, 12, 12, 10, 10),
Gender = c(&quot;F&quot;, &quot;F&quot;, &quot;F&quot;, &quot;F&quot;, &quot;F&quot;, &quot;F&quot;),
Race = c(&quot;A&quot;, &quot;B&quot;, &quot;C&quot;, &quot;C&quot;, &quot;C&quot;, &quot;C&quot;),
Health = c(&quot;Yes&quot;, &quot;Yes&quot;, &quot;Yes&quot;, &quot;Yes&quot;, &quot;Yes&quot;, &quot;Yes&quot;),
stringsAsFactors = FALSE

I want to get columns for ID, Course, Time, Gender, Race and health and Score.

I have used the following codes, but I get a wrong table

reshaped_data &lt;- data %&gt;% 
pivot_longer(
cols = -c(ID, Gender, Race, Health),
names_to = c(&quot;.value&quot;, &quot;Group&quot;),
names_pattern = &quot;([A-Za-z]+)([0-9]+[A-Z])&quot;
)

答案1

得分: 2

我认为,如果你使变量的后缀相同,可能会更容易理解。比如,如果你希望3C和C3得到相同的值,那么reshape就会更容易实现。

library(dplyr)
library(tidyr)
library(stringr)
data &lt;- data.frame(
  ID = 1:6,
  Course1A = c("A1", "A1", "A1", "A1", "A1", "A1"),
  Time1A = c(1, 1, 1, 1, 1, 1),
  Score1A = c(20, 17, 20, 16, 20, 11),
  Course2B = c("B1", "B1", "B1", "B1", "B1", "B1"),
  Time2B = c(1, 1, 1, 1, 1, 1),
  Score2B = c(12, 11, 15, 15, 10, 15),
  Course3C = c("C1", "C1", "C1", "C1", "C1", "C1"),
  TimeC3 = c(1, 1, 1, 1, 1, 1),
  ScoreC3 = c(10, 12, 12, 10, 10, 10),
  Course4D = c("D1", "D1", "D1", "D1", "D1", "D1"),
  TimeD4 = c(1, 1, 1, 1, 1, 1),
  ScoreD4 = c(20, 20, 18, 20, 17, 20),
  Course5E = c("E1", "E1", "E1", "E1", "E1", "E1"),
  TimeE5 = c(2, 2, 2, 2, 2, 2),
  Score5E = c(12, 12, 12, 12, 11, 11),
  Course6F = c("F1", "F1", "F1", "F1", "F1", "F1"),
  TimeF6 = c(2, 2, 2, 2, 2, 2),
  ScoreF6 = c(10, 10, 10, 10, 10, 9),
  Course7G = c("G1", "G1", "G1", "G1", "G1", "G1"),
  TimeG7 = c(3, 3, 3, 3, 3, 3),
  Score7G = c(12, 11, 6, 4, 12, 9),
  Course8H = c("H1", "H1", "H1", "H1", "H1", "H1"),
  TimeH8 = c(3, 3, 3, 3, 3, 3),
  Score8H = c(12, 12, 12, 12, 10, 10),
  Gender = c("F", "F", "F", "F", "F", "F"),
  Race = c("A", "B", "C", "C", "C", "C"),
  Health = c("Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
  stringsAsFactors = FALSE)

为此,我首先会找到你想要的变量,并保存词干(即,Time,Course,Score)和后缀(即,两个字符的结尾)。

sfx &lt;- gsub("(Time|Course|Score)(.*)", "\\2", grep("Time|Score|Course", names(data), value=TRUE))
stem &lt;- gsub("(Time|Course|Score)(.*)", "\\1", grep("Time|Score|Course", names(data), value=TRUE))

接下来,你可以将两个后缀字符拆分开,然后排序它们并粘贴在一起。

sfx &lt;- str_split(sfx, "", simplify=TRUE)
sfx &lt;- apply(sfx, 1, function(x)paste(sort(x), collapse=""))

然后,你可以将新理性化的后缀粘贴到词干上,并重新命名相关列。

names(data)[grep("Time|Score|Course", names(data))] &lt;- paste(stem, sfx, sep="")

然后,reshape就能按预期工作了。

reshaped_data &lt;- data %&gt;% 
  pivot_longer(
    cols = -c(ID, Gender, Race, Health),
    names_to = c(".value", "Group"),
    names_pattern = "(Course|Time|Score)(.{2})$"
  )
reshaped_data
#> # A tibble: 48 × 8
#>       ID Gender Race  Health Group Course  Time Score
#>    <int> <chr>  <chr> <chr>  <chr> <chr>   <dbl> <dbl>
#>  1     1 F      A     Yes    1A    A1          1    20
#>  2     1 F      A     Yes    2B    B1          1    12
#>  3     1 F      A     Yes    3C    C1          1    10
#>  4     1 F      A     Yes    4D    D1          1    20
#>  5     1 F      A     Yes    5E    E1          2    12
#>  6     1 F      A     Yes    6F    F1          2    10
#>  7     1 F      A     Yes    7G    G1          3    12
#>  8     1 F      A     Yes    8H    H1          3    12
#>  9     2 F      B     Yes    1A    A1          1    17
#> 10     2 F      B     Yes    2B    B1          1    11
#> # … with 38 more rows
英文:

I think it might be easiest if you rationalized the names for the variables to make the suffixes the same. For example, you want to make it so that 3C and C3 get the same value, then the reshape will work easily.

library(dplyr)
library(tidyr)
library(stringr)
data &lt;- data.frame(
  ID = 1:6,
  Course1A = c(&quot;A1&quot;, &quot;A1&quot;, &quot;A1&quot;, &quot;A1&quot;, &quot;A1&quot;, &quot;A1&quot;),
  Time1A = c(1, 1, 1, 1, 1, 1),
  Score1A = c(20, 17, 20, 16, 20, 11),
  Course2B = c(&quot;B1&quot;, &quot;B1&quot;, &quot;B1&quot;, &quot;B1&quot;, &quot;B1&quot;, &quot;B1&quot;),
  Time2B = c(1, 1, 1, 1, 1, 1),
  Score2B = c(12, 11, 15, 15, 10, 15),
  Course3C = c(&quot;C1&quot;, &quot;C1&quot;, &quot;C1&quot;, &quot;C1&quot;, &quot;C1&quot;, &quot;C1&quot;),
  TimeC3 = c(1, 1, 1, 1, 1, 1),
  ScoreC3 = c(10, 12, 12, 10, 10, 10),
  Course4D = c(&quot;D1&quot;, &quot;D1&quot;, &quot;D1&quot;, &quot;D1&quot;, &quot;D1&quot;, &quot;D1&quot;),
  TimeD4 = c(1, 1, 1, 1, 1, 1),
  ScoreD4 = c(20, 20, 18, 20, 17, 20),
  Course5E = c(&quot;E1&quot;, &quot;E1&quot;, &quot;E1&quot;, &quot;E1&quot;, &quot;E1&quot;, &quot;E1&quot;),
  TimeE5 = c(2, 2, 2, 2, 2, 2),
  Score5E = c(12, 12, 12, 12, 11, 11),
  Course6F = c(&quot;F1&quot;, &quot;F1&quot;, &quot;F1&quot;, &quot;F1&quot;, &quot;F1&quot;, &quot;F1&quot;),
  TimeF6 = c(2, 2, 2, 2, 2, 2),
  ScoreF6 = c(10, 10, 10, 10, 10, 9),
  Course7G = c(&quot;G1&quot;, &quot;G1&quot;, &quot;G1&quot;, &quot;G1&quot;, &quot;G1&quot;, &quot;G1&quot;),
  TimeG7 = c(3, 3, 3, 3, 3, 3),
  Score7G = c(12, 11, 6, 4, 12, 9),
  Course8H = c(&quot;H1&quot;, &quot;H1&quot;, &quot;H1&quot;, &quot;H1&quot;, &quot;H1&quot;, &quot;H1&quot;),
  TimeH8 = c(3, 3, 3, 3, 3, 3),
  Score8H = c(12, 12, 12, 12, 10, 10),
  Gender = c(&quot;F&quot;, &quot;F&quot;, &quot;F&quot;, &quot;F&quot;, &quot;F&quot;, &quot;F&quot;),
  Race = c(&quot;A&quot;, &quot;B&quot;, &quot;C&quot;, &quot;C&quot;, &quot;C&quot;, &quot;C&quot;),
  Health = c(&quot;Yes&quot;, &quot;Yes&quot;, &quot;Yes&quot;, &quot;Yes&quot;, &quot;Yes&quot;, &quot;Yes&quot;),
  stringsAsFactors = FALSE)

To do this, I would first find the variables you want and save the stem (i.e., Time, Course, Score) and the suffix (i.e., the two-character ending).

sfx &lt;- gsub(&quot;(Time|Course|Score)(.*)&quot;, &quot;\\2&quot;, grep(&quot;Time|Score|Course&quot;, names(data), value=TRUE))
stem &lt;- gsub(&quot;(Time|Course|Score)(.*)&quot;, &quot;\\1&quot;, grep(&quot;Time|Score|Course&quot;, names(data), value=TRUE))

Next, you can split the two suffix characters apart, then sort them and paste them back together.

sfx &lt;- str_split(sfx, &quot;&quot;, simplify=TRUE)
sfx &lt;- apply(sfx, 1, function(x)paste(sort(x), collapse=&quot;&quot;))

Then, you can paste the newly rationalized suffixes on to the stems and rename the relevant columns:

names(data)[grep(&quot;Time|Score|Course&quot;, names(data))] &lt;- paste(stem, sfx, sep=&quot;&quot;)

Then, the reshape works as expected.

reshaped_data &lt;- data %&gt;% 
  pivot_longer(
    cols = -c(ID, Gender, Race, Health),
    names_to = c(&quot;.value&quot;, &quot;Group&quot;),
    names_pattern = &quot;(Course|Time|Score)(.{2})$&quot;
  )
reshaped_data
#&gt; # A tibble: 48 &#215; 8
#&gt;       ID Gender Race  Health Group Course  Time Score
#&gt;    &lt;int&gt; &lt;chr&gt;  &lt;chr&gt; &lt;chr&gt;  &lt;chr&gt; &lt;chr&gt;  &lt;dbl&gt; &lt;dbl&gt;
#&gt;  1     1 F      A     Yes    1A    A1         1    20
#&gt;  2     1 F      A     Yes    2B    B1         1    12
#&gt;  3     1 F      A     Yes    3C    C1         1    10
#&gt;  4     1 F      A     Yes    4D    D1         1    20
#&gt;  5     1 F      A     Yes    5E    E1         2    12
#&gt;  6     1 F      A     Yes    6F    F1         2    10
#&gt;  7     1 F      A     Yes    7G    G1         3    12
#&gt;  8     1 F      A     Yes    8H    H1         3    12
#&gt;  9     2 F      B     Yes    1A    A1         1    17
#&gt; 10     2 F      B     Yes    2B    B1         1    11
#&gt; # … with 38 more rows

<sup>Created on 2023-03-31 with reprex v2.0.2</sup>

huangapple
  • 本文由 发表于 2023年3月31日 22:01:28
  • 转载请务必保留本文链接:https://go.coder-hub.com/75899419.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定