英文:
Subset dataframe based on a specific number of unique values in a column in R
问题
我有一个数据框:
# Example data: 15 rows spread over five CIDs with 4, 3, 3, 2 and 3 rows each.
# Note: the literal 091 is parsed as the number 91 (the leading zero is not
# kept).
group_eg <- data.frame(
  CID = rep(c(091, 101, 102, 103, 104), times = c(4, 3, 3, 2, 3)),
  PID_A = rep(c(1091, 1101, 1102, 1103, 1104), times = c(4, 3, 3, 2, 3)),
  PID_B = rep(c(2091, 2101, 2102, 2103, 2104), times = c(4, 3, 3, 2, 3)),
  # The label "eg3" occurs twice (rows 3 and 4).
  text = paste0("eg", c(1, 2, 3, 3, 4:14))
)
我想将这个数据框拆分成一个由若干较小数据框组成的列表。每个较小的数据框只应包含属于两个唯一 CID 的行,并按 CID 升序依次分组。如果唯一 CID 的数量不能被 2 整除,则最后一个数据框可以只包含属于 1 个 CID 的行。请注意,每个唯一 CID 的观测行数各不相同,所以我不知道该如何下手。
以下是示例输出:
# Expected piece 1: all rows of CIDs 91 (written 091) and 101.
output1 <- data.frame(
  CID = rep(c(091, 101), times = c(4, 3)),
  PID_A = rep(c(1091, 1101), times = c(4, 3)),
  PID_B = rep(c(2091, 2101), times = c(4, 3)),
  text = paste0("eg", c(1:3, 3:6))
)
# Expected piece 2: all rows of CIDs 102 and 103.
output2 <- data.frame(
  CID = rep(c(102, 103), times = c(3, 2)),
  PID_A = rep(c(1102, 1103), times = c(3, 2)),
  PID_B = rep(c(2102, 2103), times = c(3, 2)),
  text = paste0("eg", 7:11)
)
# Expected piece 3: the single remaining CID, 104.
output3 <- data.frame(
  CID = rep(104, times = 3),
  PID_A = rep(1104, times = 3),
  PID_B = rep(2104, times = 3),
  text = paste0("eg", 12:14)
)
有人知道如何做到这一点吗?谢谢!
英文:
I have a dataframe:
# Example data: 15 rows spread over five CIDs with 4, 3, 3, 2 and 3 rows each.
# Note: the literal 091 is parsed as the number 91 (the leading zero is not
# kept).
group_eg <- data.frame(
  CID = rep(c(091, 101, 102, 103, 104), times = c(4, 3, 3, 2, 3)),
  PID_A = rep(c(1091, 1101, 1102, 1103, 1104), times = c(4, 3, 3, 2, 3)),
  PID_B = rep(c(2091, 2101, 2102, 2103, 2104), times = c(4, 3, 3, 2, 3)),
  # The label "eg3" occurs twice (rows 3 and 4).
  text = paste0("eg", c(1, 2, 3, 3, 4:14))
)
I want to divide this dataframe into a list of smaller dataframes. Each of the smaller dataframes should contain only the rows that belong to 2 unique CIDs, taken in ascending order of CID. If the number of unique CIDs is not divisible by 2, the last dataframe can contain the rows that belong to just 1 CID. Note that each unique CID has a different number of observations, so I'm a bit stuck.
Here are the example outputs:
# Expected piece 1: all rows of CIDs 91 (written 091) and 101.
output1 <- data.frame(
  CID = rep(c(091, 101), times = c(4, 3)),
  PID_A = rep(c(1091, 1101), times = c(4, 3)),
  PID_B = rep(c(2091, 2101), times = c(4, 3)),
  text = paste0("eg", c(1:3, 3:6))
)
# Expected piece 2: all rows of CIDs 102 and 103.
output2 <- data.frame(
  CID = rep(c(102, 103), times = c(3, 2)),
  PID_A = rep(c(1102, 1103), times = c(3, 2)),
  PID_B = rep(c(2102, 2103), times = c(3, 2)),
  text = paste0("eg", 7:11)
)
# Expected piece 3: the single remaining CID, 104.
output3 <- data.frame(
  CID = rep(104, times = 3),
  PID_A = rep(1104, times = 3),
  PID_B = rep(2104, times = 3),
  text = paste0("eg", 12:14)
)
Does anyone know how to do this? Thank you!
答案1
得分: 3
使用 dplyr::consecutive_id 和整数除法 %/%,你可以这样做:
# Spell out warn.conflicts instead of relying on partial argument matching.
library(dplyr, warn.conflicts = FALSE)

# consecutive_id() increases by 1 every time CID changes from one row to the
# next, so this assumes the rows are already ordered by CID. With ids 1, 2, 3,
# ..., (id + 1) %/% 2 yields 1, 1, 2, 2, 3, ..., i.e. two CIDs per piece; an
# odd number of CIDs leaves the last piece with a single CID.
group_eg |>
  group_by(group = (consecutive_id(CID) + 1) %/% 2) |>
  group_split()
#> <list_of<
#> tbl_df<
#> CID : double
#> PID_A: double
#> PID_B: double
#> text : character
#> group: double
#> >
#> >[3]>
#> [[1]]
#> # A tibble: 7 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 91 1091 2091 eg1 1
#> 2 91 1091 2091 eg2 1
#> 3 91 1091 2091 eg3 1
#> 4 91 1091 2091 eg3 1
#> 5 101 1101 2101 eg4 1
#> 6 101 1101 2101 eg5 1
#> 7 101 1101 2101 eg6 1
#>
#> [[2]]
#> # A tibble: 5 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 102 1102 2102 eg7 2
#> 2 102 1102 2102 eg8 2
#> 3 102 1102 2102 eg9 2
#> 4 103 1103 2103 eg10 2
#> 5 103 1103 2103 eg11 2
#>
#> [[3]]
#> # A tibble: 3 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 104 1104 2104 eg12 3
#> 2 104 1104 2104 eg13 3
#> 3 104 1104 2104 eg14 3
<details>
<summary>英文:</summary>
Using `dplyr::consecutive_id` and integer division `%/%` you could do:
``` r
# Spell out warn.conflicts instead of relying on partial argument matching.
library(dplyr, warn.conflicts = FALSE)

# consecutive_id() increases by 1 every time CID changes from one row to the
# next, so this assumes the rows are already ordered by CID. With ids 1, 2, 3,
# ..., (id + 1) %/% 2 yields 1, 1, 2, 2, 3, ..., i.e. two CIDs per piece; an
# odd number of CIDs leaves the last piece with a single CID.
group_eg |>
  group_by(group = (consecutive_id(CID) + 1) %/% 2) |>
  group_split()
#> <list_of<
#> tbl_df<
#> CID : double
#> PID_A: double
#> PID_B: double
#> text : character
#> group: double
#> >
#> >[3]>
#> [[1]]
#> # A tibble: 7 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 91 1091 2091 eg1 1
#> 2 91 1091 2091 eg2 1
#> 3 91 1091 2091 eg3 1
#> 4 91 1091 2091 eg3 1
#> 5 101 1101 2101 eg4 1
#> 6 101 1101 2101 eg5 1
#> 7 101 1101 2101 eg6 1
#>
#> [[2]]
#> # A tibble: 5 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 102 1102 2102 eg7 2
#> 2 102 1102 2102 eg8 2
#> 3 102 1102 2102 eg9 2
#> 4 103 1103 2103 eg10 2
#> 5 103 1103 2103 eg11 2
#>
#> [[3]]
#> # A tibble: 3 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 104 1104 2104 eg12 3
#> 2 104 1104 2104 eg13 3
#> 3 104 1104 2104 eg14 3
```
</details>
答案2
得分: 3
借助 data.table::rleid 函数,您可以用 split 对数据框进行拆分,得到一个包含三个数据框的列表。
library(data.table)

# rleid() numbers each run of identical consecutive CID values 1, 2, 3, ...;
# ceiling(run / 2) therefore puts every two consecutive runs into one piece.
# The labels are turned into a factor with explicit levels because split()
# otherwise orders character labels alphabetically ("output_10" before
# "output_2"), which would scramble the list once there are 10 or more pieces.
group_eg_split <- local({
  piece <- ceiling(rleid(group_eg$CID) / 2)
  split(
    group_eg,
    factor(paste0("output_", piece), levels = paste0("output_", unique(piece)))
  )
})
group_eg_split
$output_1
CID PID_A PID_B text
1 91 1091 2091 eg1
2 91 1091 2091 eg2
3 91 1091 2091 eg3
4 91 1091 2091 eg3
5 101 1101 2101 eg4
6 101 1101 2101 eg5
7 101 1101 2101 eg6
$output_2
CID PID_A PID_B text
8 102 1102 2102 eg7
9 102 1102 2102 eg8
10 102 1102 2102 eg9
11 103 1103 2103 eg10
12 103 1103 2103 eg11
$output_3
CID PID_A PID_B text
13 104 1104 2104 eg12
14 104 1104 2104 eg13
15 104 1104 2104 eg14
要将列表元素分配给单独的对象,使用list2env。执行此操作后,将在您的环境中生成三个名称分别为"output_1"到"output_3"的对象。
# Copies every element of the list into the global environment under its
# list name (output_1, output_2, ...).
list2env(x = group_eg_split, envir = globalenv())
英文:
With the help of data.table::rleid, you can use split, which gives a list of three dataframes.
library(data.table)

# rleid() numbers each run of identical consecutive CID values 1, 2, 3, ...;
# ceiling(run / 2) therefore puts every two consecutive runs into one piece.
# The labels are turned into a factor with explicit levels because split()
# otherwise orders character labels alphabetically ("output_10" before
# "output_2"), which would scramble the list once there are 10 or more pieces.
group_eg_split <- local({
  piece <- ceiling(rleid(group_eg$CID) / 2)
  split(
    group_eg,
    factor(paste0("output_", piece), levels = paste0("output_", unique(piece)))
  )
})
group_eg_split
$output_1
CID PID_A PID_B text
1 91 1091 2091 eg1
2 91 1091 2091 eg2
3 91 1091 2091 eg3
4 91 1091 2091 eg3
5 101 1101 2101 eg4
6 101 1101 2101 eg5
7 101 1101 2101 eg6
$output_2
CID PID_A PID_B text
8 102 1102 2102 eg7
9 102 1102 2102 eg8
10 102 1102 2102 eg9
11 103 1103 2103 eg10
12 103 1103 2103 eg11
$output_3
CID PID_A PID_B text
13 104 1104 2104 eg12
14 104 1104 2104 eg13
15 104 1104 2104 eg14
To assign the list elements to individual objects, use list2env. After this, three objects named "output_1" to "output_3" will be created in your environment.
# Copies every element of the list into the global environment under its
# list name (output_1, output_2, ...).
list2env(x = group_eg_split, envir = globalenv())
答案3
得分: 2
使用基础 R(base R):
# Distinct CIDs in ascending order. sort() makes the pairing follow CID order
# even when the rows of group_eg are not already sorted by CID.
CID_uniq <- as.character(sort(unique(group_eg$CID)))
# Named lookup from CID (as text) to piece number: positions 1-2 give 1,
# positions 3-4 give 2, and so on.
hash <- ceiling(setNames(seq_along(CID_uniq), CID_uniq) / 2)
# Look up each row's piece number by its CID and split on it.
list_of_dataframes <-
  split(group_eg,
    f = hash[as.character(group_eg$CID)]
  )
## > str(list_of_dataframes)
## List of 3
## $ 1:'data.frame': 7 obs. of 4 variables:
## ..$ CID : num [1:7] 91 91 91 91 101 101 101
## ..$ PID_A: num [1:7] 1091 1091 1091 1091 1101 ...
## ..$ PID_B: num [1:7] 2091 2091 2091 2091 2101 ...
## ..$ text : chr [1:7] "eg1" "eg2" "eg3" "eg3" ...
## $ 2:'data.frame': 5 obs. of 4 variables:
## ..$ CID : num [1:5] 102 102 102 103 103
## ..$ PID_A: num [1:5] 1102 1102 1102 1103 1103
## ..$ PID_B: num [1:5] 2102 2102 2102 2103 2103
## ..$ text : chr [1:5] "eg7" "eg8" "eg9" "eg10" ...
## $ 3:'data.frame': 3 obs. of 4 variables:
## ..$ CID : num [1:3] 104 104 104
## ..$ PID_A: num [1:3] 1104 1104 1104
## ..$ PID_B: num [1:3] 2104 2104 2104
## ..$ text : chr [1:3] "eg12" "eg13" "eg14"
英文:
with base R:
# Distinct CIDs in ascending order. sort() makes the pairing follow CID order
# even when the rows of group_eg are not already sorted by CID.
CID_uniq <- as.character(sort(unique(group_eg$CID)))
# Named lookup from CID (as text) to piece number: positions 1-2 give 1,
# positions 3-4 give 2, and so on.
hash <- ceiling(setNames(seq_along(CID_uniq), CID_uniq) / 2)
# Look up each row's piece number by its CID and split on it.
list_of_dataframes <-
  split(group_eg,
    f = hash[as.character(group_eg$CID)]
  )
## > str(list_of_dataframes)
## List of 3
## $ 1:'data.frame': 7 obs. of 4 variables:
## ..$ CID : num [1:7] 91 91 91 91 101 101 101
## ..$ PID_A: num [1:7] 1091 1091 1091 1091 1101 ...
## ..$ PID_B: num [1:7] 2091 2091 2091 2091 2101 ...
## ..$ text : chr [1:7] "eg1" "eg2" "eg3" "eg3" ...
## $ 2:'data.frame': 5 obs. of 4 variables:
## ..$ CID : num [1:5] 102 102 102 103 103
## ..$ PID_A: num [1:5] 1102 1102 1102 1103 1103
## ..$ PID_B: num [1:5] 2102 2102 2102 2103 2103
## ..$ text : chr [1:5] "eg7" "eg8" "eg9" "eg10" ...
## $ 3:'data.frame': 3 obs. of 4 variables:
## ..$ CID : num [1:3] 104 104 104
## ..$ PID_A: num [1:3] 1104 1104 1104
## ..$ PID_B: num [1:3] 2104 2104 2104
## ..$ text : chr [1:3] "eg12" "eg13" "eg14"
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。


评论