英文:
How I can sort these data out in R?
问题
这是我的数据:
data <- data.frame(
ID = 1:6,
Course1A = c("A1", "A1", "A1", "A1", "A1", "A1"),
Time1A = c(1, 1, 1, 1, 1, 1),
Score1A = c(20, 17, 20, 16, 20, 11),
Course2B = c("B1", "B1", "B1", "B1", "B1", "B1"),
Time2B = c(1, 1, 1, 1, 1, 1),
Score2B = c(12, 11, 15, 15, 10, 15),
Course3C = c("C1", "C1", "C1", "C1", "C1", "C1"),
TimeC3 = c(1, 1, 1, 1, 1, 1),
ScoreC3 = c(10, 12, 12, 10, 10, 10),
Course4D = c("D1", "D1", "D1", "D1", "D1", "D1"),
TimeD4 = c(1, 1, 1, 1, 1, 1),
ScoreD4 = c(20, 20, 18, 20, 17, 20),
Course5E = c("E1", "E1", "E1", "E1", "E1", "E1"),
TimeE5 = c(2, 2, 2, 2, 2, 2),
Score5E = c(12, 12, 12, 12, 11, 11),
Course6F = c("F1", "F1", "F1", "F1", "F1", "F1"),
TimeF6 = c(2, 2, 2, 2, 2, 2),
ScoreF6 = c(10, 10, 10, 10, 10, 9),
Course7G = c("G1", "G1", "G1", "G1", "G1", "G1"),
TimeG7 = c(3, 3, 3, 3, 3, 3),
Score7G = c(12, 11, 6, 4, 12, 9),
Course8H = c("H1", "H1", "H1", "H1", "H1", "H1"),
TimeH8 = c(3, 3, 3, 3, 3, 3),
Score8H = c(12, 12, 12, 12, 10, 10),
Gender = c("F", "F", "F", "F", "F", "F"),
Race = c("A", "B", "C", "C", "C", "C"),
Health = c("Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
stringsAsFactors = FALSE
)
我想获取ID、Course、Time、Gender、Race、Health和Score的列。
我已经使用了以下代码,但得到了错误的表格:
```R
reshaped_data <- data %>%
pivot_longer(
cols = -c(ID, Gender, Race, Health),
names_to = c(".value", "Group"),
names_pattern = "([A-Za-z]+)([0-9]+[A-Z])"
)
英文:
Here is my data:
data <- data.frame(
ID = 1:6,
Course1A = c("A1", "A1", "A1", "A1", "A1", "A1"),
Time1A = c(1, 1, 1, 1, 1, 1),
Score1A = c(20, 17, 20, 16, 20, 11),
Course2B = c("B1", "B1", "B1", "B1", "B1", "B1"),
Time2B = c(1, 1, 1, 1, 1, 1),
Score2B = c(12, 11, 15, 15, 10, 15),
Course3C = c("C1", "C1", "C1", "C1", "C1", "C1"),
TimeC3 = c(1, 1, 1, 1, 1, 1),
ScoreC3 = c(10, 12, 12, 10, 10, 10),
Course4D = c("D1", "D1", "D1", "D1", "D1", "D1"),
TimeD4 = c(1, 1, 1, 1, 1, 1),
ScoreD4 = c(20, 20, 18, 20, 17, 20),
Course5E = c("E1", "E1", "E1", "E1", "E1", "E1"),
TimeE5 = c(2, 2, 2, 2, 2, 2),
Score5E = c(12, 12, 12, 12, 11, 11),
Course6F = c("F1", "F1", "F1", "F1", "F1", "F1"),
TimeF6 = c(2, 2, 2, 2, 2, 2),
ScoreF6 = c(10, 10, 10, 10, 10, 9),
Course7G = c("G1", "G1", "G1", "G1", "G1", "G1"),
TimeG7 = c(3, 3, 3, 3, 3, 3),
Score7G = c(12, 11, 6, 4, 12, 9),
Course8H = c("H1", "H1", "H1", "H1", "H1", "H1"),
TimeH8 = c(3, 3, 3, 3, 3, 3),
Score8H = c(12, 12, 12, 12, 10, 10),
Gender = c("F", "F", "F", "F", "F", "F"),
Race = c("A", "B", "C", "C", "C", "C"),
Health = c("Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
stringsAsFactors = FALSE
I want to get columns for ID, Course, Time, Gender, Race and health and Score.
I have used the following codes, but I get a wrong table
reshaped_data <- data %>%
pivot_longer(
cols = -c(ID, Gender, Race, Health),
names_to = c(".value", "Group"),
names_pattern = "([A-Za-z]+)([0-9]+[A-Z])"
)
答案1
得分: 2
我认为,如果你使变量的后缀相同,可能会更容易理解。比如,如果你希望3C和C3得到相同的值,那么reshape就会更容易实现。
library(dplyr)
library(tidyr)
library(stringr)
data <- data.frame(
ID = 1:6,
Course1A = c("A1", "A1", "A1", "A1", "A1", "A1"),
Time1A = c(1, 1, 1, 1, 1, 1),
Score1A = c(20, 17, 20, 16, 20, 11),
Course2B = c("B1", "B1", "B1", "B1", "B1", "B1"),
Time2B = c(1, 1, 1, 1, 1, 1),
Score2B = c(12, 11, 15, 15, 10, 15),
Course3C = c("C1", "C1", "C1", "C1", "C1", "C1"),
TimeC3 = c(1, 1, 1, 1, 1, 1),
ScoreC3 = c(10, 12, 12, 10, 10, 10),
Course4D = c("D1", "D1", "D1", "D1", "D1", "D1"),
TimeD4 = c(1, 1, 1, 1, 1, 1),
ScoreD4 = c(20, 20, 18, 20, 17, 20),
Course5E = c("E1", "E1", "E1", "E1", "E1", "E1"),
TimeE5 = c(2, 2, 2, 2, 2, 2),
Score5E = c(12, 12, 12, 12, 11, 11),
Course6F = c("F1", "F1", "F1", "F1", "F1", "F1"),
TimeF6 = c(2, 2, 2, 2, 2, 2),
ScoreF6 = c(10, 10, 10, 10, 10, 9),
Course7G = c("G1", "G1", "G1", "G1", "G1", "G1"),
TimeG7 = c(3, 3, 3, 3, 3, 3),
Score7G = c(12, 11, 6, 4, 12, 9),
Course8H = c("H1", "H1", "H1", "H1", "H1", "H1"),
TimeH8 = c(3, 3, 3, 3, 3, 3),
Score8H = c(12, 12, 12, 12, 10, 10),
Gender = c("F", "F", "F", "F", "F", "F"),
Race = c("A", "B", "C", "C", "C", "C"),
Health = c("Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
stringsAsFactors = FALSE)
为此,我首先会找到你想要的变量,并保存词干(即,Time,Course,Score)和后缀(即,两个字符的结尾)。
sfx <- gsub("(Time|Course|Score)(.*)", "\\2", grep("Time|Score|Course", names(data), value=TRUE))
stem <- gsub("(Time|Course|Score)(.*)", "\\1", grep("Time|Score|Course", names(data), value=TRUE))
接下来,你可以将两个后缀字符拆分开,然后排序它们并粘贴在一起。
sfx <- str_split(sfx, "", simplify=TRUE)
sfx <- apply(sfx, 1, function(x)paste(sort(x), collapse=""))
然后,你可以将新理性化的后缀粘贴到词干上,并重新命名相关列。
names(data)[grep("Time|Score|Course", names(data))] <- paste(stem, sfx, sep="")
然后,reshape就能按预期工作了。
reshaped_data <- data %>%
pivot_longer(
cols = -c(ID, Gender, Race, Health),
names_to = c(".value", "Group"),
names_pattern = "(Course|Time|Score)(.{2})$"
)
reshaped_data
#> # A tibble: 48 × 8
#> ID Gender Race Health Group Course Time Score
#> <int> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 1 F A Yes 1A A1 1 20
#> 2 1 F A Yes 2B B1 1 12
#> 3 1 F A Yes 3C C1 1 10
#> 4 1 F A Yes 4D D1 1 20
#> 5 1 F A Yes 5E E1 2 12
#> 6 1 F A Yes 6F F1 2 10
#> 7 1 F A Yes 7G G1 3 12
#> 8 1 F A Yes 8H H1 3 12
#> 9 2 F B Yes 1A A1 1 17
#> 10 2 F B Yes 2B B1 1 11
#> # … with 38 more rows
英文:
I think it might be easiest if you rationalized the names for the variables to make the suffixes the same. For example, you want to make it so that 3C and C3 get the same value, then the reshape will work easily.
library(dplyr)
library(tidyr)
library(stringr)
data <- data.frame(
ID = 1:6,
Course1A = c("A1", "A1", "A1", "A1", "A1", "A1"),
Time1A = c(1, 1, 1, 1, 1, 1),
Score1A = c(20, 17, 20, 16, 20, 11),
Course2B = c("B1", "B1", "B1", "B1", "B1", "B1"),
Time2B = c(1, 1, 1, 1, 1, 1),
Score2B = c(12, 11, 15, 15, 10, 15),
Course3C = c("C1", "C1", "C1", "C1", "C1", "C1"),
TimeC3 = c(1, 1, 1, 1, 1, 1),
ScoreC3 = c(10, 12, 12, 10, 10, 10),
Course4D = c("D1", "D1", "D1", "D1", "D1", "D1"),
TimeD4 = c(1, 1, 1, 1, 1, 1),
ScoreD4 = c(20, 20, 18, 20, 17, 20),
Course5E = c("E1", "E1", "E1", "E1", "E1", "E1"),
TimeE5 = c(2, 2, 2, 2, 2, 2),
Score5E = c(12, 12, 12, 12, 11, 11),
Course6F = c("F1", "F1", "F1", "F1", "F1", "F1"),
TimeF6 = c(2, 2, 2, 2, 2, 2),
ScoreF6 = c(10, 10, 10, 10, 10, 9),
Course7G = c("G1", "G1", "G1", "G1", "G1", "G1"),
TimeG7 = c(3, 3, 3, 3, 3, 3),
Score7G = c(12, 11, 6, 4, 12, 9),
Course8H = c("H1", "H1", "H1", "H1", "H1", "H1"),
TimeH8 = c(3, 3, 3, 3, 3, 3),
Score8H = c(12, 12, 12, 12, 10, 10),
Gender = c("F", "F", "F", "F", "F", "F"),
Race = c("A", "B", "C", "C", "C", "C"),
Health = c("Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
stringsAsFactors = FALSE)
To do this, I would first find the variables you want and save the stem (i.e., Time, Course, Score) and the suffix (i.e., the two-character ending).
sfx <- gsub("(Time|Course|Score)(.*)", "\\2", grep("Time|Score|Course", names(data), value=TRUE))
stem <- gsub("(Time|Course|Score)(.*)", "\\1", grep("Time|Score|Course", names(data), value=TRUE))
Next, you can split the two suffix characters apart, then sort them and paste them back together.
sfx <- str_split(sfx, "", simplify=TRUE)
sfx <- apply(sfx, 1, function(x)paste(sort(x), collapse=""))
Then, you can paste the newly rationalized suffixes on to the stems and rename the relevant columns:
names(data)[grep("Time|Score|Course", names(data))] <- paste(stem, sfx, sep="")
Then, the reshape works as expected.
reshaped_data <- data %>%
pivot_longer(
cols = -c(ID, Gender, Race, Health),
names_to = c(".value", "Group"),
names_pattern = "(Course|Time|Score)(.{2})$"
)
reshaped_data
#> # A tibble: 48 × 8
#> ID Gender Race Health Group Course Time Score
#> <int> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 1 F A Yes 1A A1 1 20
#> 2 1 F A Yes 2B B1 1 12
#> 3 1 F A Yes 3C C1 1 10
#> 4 1 F A Yes 4D D1 1 20
#> 5 1 F A Yes 5E E1 2 12
#> 6 1 F A Yes 6F F1 2 10
#> 7 1 F A Yes 7G G1 3 12
#> 8 1 F A Yes 8H H1 3 12
#> 9 2 F B Yes 1A A1 1 17
#> 10 2 F B Yes 2B B1 1 11
#> # … with 38 more rows
<sup>Created on 2023-03-31 with reprex v2.0.2</sup>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论