英文:
Subset dataframe based on a specific number of unique values in a column in R
问题
我有一个数据框:
# Example data: five CIDs (91, 101, 102, 103, 104) with 4, 3, 3, 2 and 3 rows.
# The numeric literal 091 is parsed by R as 91, so it is written as 91 here.
group_eg <- data.frame(
  CID = rep(c(91, 101, 102, 103, 104), times = c(4, 3, 3, 2, 3)),
  PID_A = rep(c(1091, 1101, 1102, 1103, 1104), times = c(4, 3, 3, 2, 3)),
  PID_B = rep(c(2091, 2101, 2102, 2103, 2104), times = c(4, 3, 3, 2, 3)),
  # "eg3" appears twice on purpose, exactly as in the original data.
  text = c(
    "eg1", "eg2", "eg3", "eg3", "eg4", "eg5", "eg6", "eg7", "eg8", "eg9",
    "eg10", "eg11", "eg12", "eg13", "eg14"
  )
)
我想把这个数据框拆分成一个由若干较小数据框组成的列表。按 CID 升序排列,每个较小的数据框只包含属于 2 个唯一 CID 的行。如果 CID 的个数不能被 2 整除,则最后一个数据框可以只包含属于 1 个 CID 的行。请注意,每个唯一 CID 对应的观测数量并不相同,所以我有点卡住了。
以下是示例输出:
# Expected result 1: rows of the first two CIDs (91 with 4 rows, 101 with 3).
output1 <- data.frame(
  CID = rep(c(91, 101), times = c(4, 3)),
  PID_A = rep(c(1091, 1101), times = c(4, 3)),
  PID_B = rep(c(2091, 2101), times = c(4, 3)),
  text = c("eg1", "eg2", "eg3", "eg3", "eg4", "eg5", "eg6")
)
# Expected result 2: rows of the next two CIDs (102 with 3 rows, 103 with 2).
output2 <- data.frame(
  CID = rep(c(102, 103), times = c(3, 2)),
  PID_A = rep(c(1102, 1103), times = c(3, 2)),
  PID_B = rep(c(2102, 2103), times = c(3, 2)),
  text = c("eg7", "eg8", "eg9", "eg10", "eg11")
)
# Expected result 3: the single left-over CID (104 with 3 rows).
output3 <- data.frame(
  CID = rep(104, times = 3),
  PID_A = rep(1104, times = 3),
  PID_B = rep(2104, times = 3),
  text = c("eg12", "eg13", "eg14")
)
有人知道如何做到这一点吗?谢谢!
英文:
I have a dataframe:
# Example data: five CIDs (91, 101, 102, 103, 104) with 4, 3, 3, 2 and 3 rows.
# The numeric literal 091 is parsed by R as 91, so it is written as 91 here.
group_eg <- data.frame(
  CID = rep(c(91, 101, 102, 103, 104), times = c(4, 3, 3, 2, 3)),
  PID_A = rep(c(1091, 1101, 1102, 1103, 1104), times = c(4, 3, 3, 2, 3)),
  PID_B = rep(c(2091, 2101, 2102, 2103, 2104), times = c(4, 3, 3, 2, 3)),
  # "eg3" appears twice on purpose, exactly as in the original data.
  text = c(
    "eg1", "eg2", "eg3", "eg3", "eg4", "eg5", "eg6", "eg7", "eg8", "eg9",
    "eg10", "eg11", "eg12", "eg13", "eg14"
  )
)
I want to divide this dataframe into a list of smaller dataframes. Each of the smaller dataframes should only contain rows that belong to 2 unique CIDs, taken in ascending order. If the number of CIDs is not divisible by 2, the last dataframe can contain the rows that belong to just 1 CID. Notice that there is a different number of observations for each unique CID, so I'm a bit stuck.
Here are the example outputs:
# Expected result 1: rows of the first two CIDs (91 with 4 rows, 101 with 3).
output1 <- data.frame(
  CID = rep(c(91, 101), times = c(4, 3)),
  PID_A = rep(c(1091, 1101), times = c(4, 3)),
  PID_B = rep(c(2091, 2101), times = c(4, 3)),
  text = c("eg1", "eg2", "eg3", "eg3", "eg4", "eg5", "eg6")
)
# Expected result 2: rows of the next two CIDs (102 with 3 rows, 103 with 2).
output2 <- data.frame(
  CID = rep(c(102, 103), times = c(3, 2)),
  PID_A = rep(c(1102, 1103), times = c(3, 2)),
  PID_B = rep(c(2102, 2103), times = c(3, 2)),
  text = c("eg7", "eg8", "eg9", "eg10", "eg11")
)
# Expected result 3: the single left-over CID (104 with 3 rows).
output3 <- data.frame(
  CID = rep(104, times = 3),
  PID_A = rep(1104, times = 3),
  PID_B = rep(2104, times = 3),
  text = c("eg12", "eg13", "eg14")
)
Does anyone know how to do this? Thank you!
答案1
得分: 3
使用 `dplyr::consecutive_id` 和整数除法 `%/%`,你可以这样做:
# Spell out warn.conflicts instead of relying on partial argument matching.
library(dplyr, warn.conflicts = FALSE)

# Split group_eg into a list of tibbles holding two CIDs each, in ascending
# CID order (the last tibble holds one CID when the count is odd).
# consecutive_id() numbers *runs* of equal values, so sort by CID first to
# guarantee every CID forms exactly one run; (id + 1) %/% 2 then maps run ids
# 1, 2 -> 1; 3, 4 -> 2; 5, 6 -> 3; ...
group_eg |>
  arrange(CID) |>
  group_by(group = (consecutive_id(CID) + 1) %/% 2) |>
  group_split()
#> <list_of<
#> tbl_df<
#> CID : double
#> PID_A: double
#> PID_B: double
#> text : character
#> group: double
#> >
#> >[3]>
#> [[1]]
#> # A tibble: 7 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 91 1091 2091 eg1 1
#> 2 91 1091 2091 eg2 1
#> 3 91 1091 2091 eg3 1
#> 4 91 1091 2091 eg3 1
#> 5 101 1101 2101 eg4 1
#> 6 101 1101 2101 eg5 1
#> 7 101 1101 2101 eg6 1
#>
#> [[2]]
#> # A tibble: 5 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 102 1102 2102 eg7 2
#> 2 102 1102 2102 eg8 2
#> 3 102 1102 2102 eg9 2
#> 4 103 1103 2103 eg10 2
#> 5 103 1103 2103 eg11 2
#>
#> [[3]]
#> # A tibble: 3 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 104 1104 2104 eg12 3
#> 2 104 1104 2104 eg13 3
#> 3 104 1104 2104 eg14 3
<details>
<summary>英文:</summary>
Using `dplyr::consecutive_id` and integer division `%/%` you could do:
``` r
# Spell out warn.conflicts instead of relying on partial argument matching.
library(dplyr, warn.conflicts = FALSE)

# Split group_eg into a list of tibbles holding two CIDs each, in ascending
# CID order (the last tibble holds one CID when the count is odd).
# consecutive_id() numbers *runs* of equal values, so sort by CID first to
# guarantee every CID forms exactly one run; (id + 1) %/% 2 then maps run ids
# 1, 2 -> 1; 3, 4 -> 2; 5, 6 -> 3; ...
group_eg |>
  arrange(CID) |>
  group_by(group = (consecutive_id(CID) + 1) %/% 2) |>
  group_split()
#> <list_of<
#> tbl_df<
#> CID : double
#> PID_A: double
#> PID_B: double
#> text : character
#> group: double
#> >
#> >[3]>
#> [[1]]
#> # A tibble: 7 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 91 1091 2091 eg1 1
#> 2 91 1091 2091 eg2 1
#> 3 91 1091 2091 eg3 1
#> 4 91 1091 2091 eg3 1
#> 5 101 1101 2101 eg4 1
#> 6 101 1101 2101 eg5 1
#> 7 101 1101 2101 eg6 1
#>
#> [[2]]
#> # A tibble: 5 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 102 1102 2102 eg7 2
#> 2 102 1102 2102 eg8 2
#> 3 102 1102 2102 eg9 2
#> 4 103 1103 2103 eg10 2
#> 5 103 1103 2103 eg11 2
#>
#> [[3]]
#> # A tibble: 3 × 5
#> CID PID_A PID_B text group
#> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 104 1104 2104 eg12 3
#> 2 104 1104 2104 eg13 3
#> 3 104 1104 2104 eg14 3
```
</details>
答案2
得分: 3
借助 `data.table::rleid` 函数,您可以执行 `split` 操作,得到一个包含三个数据框的列表。
library(data.table)

# Split group_eg into a named list ("output_1", "output_2", ...) with two
# consecutive CID runs per element.
# NOTE: rleid() numbers runs of equal values, so this assumes group_eg is
# already sorted by CID (as in the example data).
group_eg_split <- local({
  # Run ids 1, 2 -> 1; 3, 4 -> 2; 5 -> 3; ...
  pair_id <- ceiling(rleid(group_eg$CID) / 2)
  # Explicit factor levels keep the list in numeric order; splitting on the
  # plain character vector would sort "output_10" before "output_2".
  split(
    group_eg,
    factor(
      paste0("output_", pair_id),
      levels = paste0("output_", unique(pair_id))
    )
  )
})
group_eg_split
$output_1
CID PID_A PID_B text
1 91 1091 2091 eg1
2 91 1091 2091 eg2
3 91 1091 2091 eg3
4 91 1091 2091 eg3
5 101 1101 2101 eg4
6 101 1101 2101 eg5
7 101 1101 2101 eg6
$output_2
CID PID_A PID_B text
8 102 1102 2102 eg7
9 102 1102 2102 eg8
10 102 1102 2102 eg9
11 103 1103 2103 eg10
12 103 1103 2103 eg11
$output_3
CID PID_A PID_B text
13 104 1104 2104 eg12
14 104 1104 2104 eg13
15 104 1104 2104 eg14
要将列表元素分配给单独的对象,请使用 `list2env`。执行此操作后,您的环境中将生成名称为 "output_1" 到 "output_3" 的三个对象。
# Copy every element of the list into the global environment, using the list
# names (output_1, output_2, output_3) as the object names.
list2env(x = group_eg_split, envir = globalenv())
英文:
With the help of `data.table::rleid`, you can do a `split`, which gives a list of three dataframes.
library(data.table)

# Split group_eg into a named list ("output_1", "output_2", ...) with two
# consecutive CID runs per element.
# NOTE: rleid() numbers runs of equal values, so this assumes group_eg is
# already sorted by CID (as in the example data).
group_eg_split <- local({
  # Run ids 1, 2 -> 1; 3, 4 -> 2; 5 -> 3; ...
  pair_id <- ceiling(rleid(group_eg$CID) / 2)
  # Explicit factor levels keep the list in numeric order; splitting on the
  # plain character vector would sort "output_10" before "output_2".
  split(
    group_eg,
    factor(
      paste0("output_", pair_id),
      levels = paste0("output_", unique(pair_id))
    )
  )
})
group_eg_split
$output_1
CID PID_A PID_B text
1 91 1091 2091 eg1
2 91 1091 2091 eg2
3 91 1091 2091 eg3
4 91 1091 2091 eg3
5 101 1101 2101 eg4
6 101 1101 2101 eg5
7 101 1101 2101 eg6
$output_2
CID PID_A PID_B text
8 102 1102 2102 eg7
9 102 1102 2102 eg8
10 102 1102 2102 eg9
11 103 1103 2103 eg10
12 103 1103 2103 eg11
$output_3
CID PID_A PID_B text
13 104 1104 2104 eg12
14 104 1104 2104 eg13
15 104 1104 2104 eg14
To assign the list elements to individual objects, use `list2env`. After this, three objects named "output_1" to "output_3" will be created in your environment.
# Copy every element of the list into the global environment, using the list
# names (output_1, output_2, output_3) as the object names.
list2env(x = group_eg_split, envir = globalenv())
答案3
得分: 2
使用 base R:
# Unique CIDs in ascending numeric order. Sort before as.character(): sorting
# the strings instead would order them lexically ("101" before "91").
CID_uniq <- as.character(sort(unique(group_eg$CID)))
# Named lookup from CID (as a string) to its pair number:
# positions 1, 2 -> 1; 3, 4 -> 2; 5 -> 3; ...
hash <- ceiling(setNames(seq_along(CID_uniq), CID_uniq) / 2)
# Look up the pair number of every row and split the data frame on it.
list_of_dataframes <-
  split(group_eg,
    f = hash[as.character(group_eg$CID)]
  )
## > str(list_of_dataframes)
## List of 3
## $ 1:'data.frame': 7 obs. of 4 variables:
## ..$ CID : num [1:7] 91 91 91 91 101 101 101
## ..$ PID_A: num [1:7] 1091 1091 1091 1091 1101 ...
## ..$ PID_B: num [1:7] 2091 2091 2091 2091 2101 ...
## ..$ text : chr [1:7] "eg1" "eg2" "eg3" "eg3" ...
## $ 2:'data.frame': 5 obs. of 4 variables:
## ..$ CID : num [1:5] 102 102 102 103 103
## ..$ PID_A: num [1:5] 1102 1102 1102 1103 1103
## ..$ PID_B: num [1:5] 2102 2102 2102 2103 2103
## ..$ text : chr [1:5] "eg7" "eg8" "eg9" "eg10" ...
## $ 3:'data.frame': 3 obs. of 4 variables:
## ..$ CID : num [1:3] 104 104 104
## ..$ PID_A: num [1:3] 1104 1104 1104
## ..$ PID_B: num [1:3] 2104 2104 2104
## ..$ text : chr [1:3] "eg12" "eg13" "eg14"
英文:
with base R:
# Unique CIDs in ascending numeric order. Sort before as.character(): sorting
# the strings instead would order them lexically ("101" before "91").
CID_uniq <- as.character(sort(unique(group_eg$CID)))
# Named lookup from CID (as a string) to its pair number:
# positions 1, 2 -> 1; 3, 4 -> 2; 5 -> 3; ...
hash <- ceiling(setNames(seq_along(CID_uniq), CID_uniq) / 2)
# Look up the pair number of every row and split the data frame on it.
list_of_dataframes <-
  split(group_eg,
    f = hash[as.character(group_eg$CID)]
  )
## > str(list_of_dataframes)
## List of 3
## $ 1:'data.frame': 7 obs. of 4 variables:
## ..$ CID : num [1:7] 91 91 91 91 101 101 101
## ..$ PID_A: num [1:7] 1091 1091 1091 1091 1101 ...
## ..$ PID_B: num [1:7] 2091 2091 2091 2091 2101 ...
## ..$ text : chr [1:7] "eg1" "eg2" "eg3" "eg3" ...
## $ 2:'data.frame': 5 obs. of 4 variables:
## ..$ CID : num [1:5] 102 102 102 103 103
## ..$ PID_A: num [1:5] 1102 1102 1102 1103 1103
## ..$ PID_B: num [1:5] 2102 2102 2102 2103 2103
## ..$ text : chr [1:5] "eg7" "eg8" "eg9" "eg10" ...
## $ 3:'data.frame': 3 obs. of 4 variables:
## ..$ CID : num [1:3] 104 104 104
## ..$ PID_A: num [1:3] 1104 1104 1104
## ..$ PID_B: num [1:3] 2104 2104 2104
## ..$ text : chr [1:3] "eg12" "eg13" "eg14"
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论