英文:
How to keep only the first set of duplicates when a column contains multiple duplicates
问题
`clin.info$Sample.ID` 存在重复值。如果某个 ID 有多对重复值，我只想保留第一对。
# Asker's original attempt, kept verbatim because the traceback below refers to it.
# Count how often each Sample.ID occurs; the result has columns Var1 (the ID) and Freq.
n_occur <- data.frame(table(clin.info$Sample.ID))
# Keep only the IDs that occur more than twice.
multiple.duplicates <- n_occur[n_occur$Freq > 2,]
# NOTE(review): `%in%` returns one logical per element of Var1, while `if` needs
# exactly one TRUE/FALSE. A zero-length Var1 gives logical(0), which produces
# "argument is of length zero"; more than one element is also invalid here.
if(multiple.duplicates$Var1 %in% clin.info$Sample.ID){
# NOTE(review): distinct() keeps unique rows only, so fully identical rows
# collapse to a single row per Sample.ID rather than to the first pair, and the
# pipeline touches every Sample.ID, not just those in multiple.duplicates.
clin.info <- clin.info %>%
group_by(Sample.ID) %>%
distinct
}
错误回溯:
Error in if (multiple.duplicates$Var1 %in% clin.info$Sample.ID) { :
argument is of length zero
数据:
> dput(clin.info)
structure(list(Sample.ID = c("TCGA.B2.3924.01", "TCGA.B2.3924.01",
"TCGA.B2.3924.01", "TCGA.B2.3924.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01",
"TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01",
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01",
"TCGA.A3.3387.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4698.01",
"TCGA.B0.4710.01", "TCGA.B0.4710.01"), age = c("73", "73", "73",
"73", "74", "74", "74", "74", "74", "74", "62", "62", "72", "72",
"49", "49", "75", "75", "75", "75")), row.names = c(67L, 68L,
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L), class = "data.frame")
> dput(multiple.duplicates)
structure(list(Var1 = structure(6:7, levels = c("TCGA.A3.3357.01",
"TCGA.A3.3367.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4710.01",
"TCGA.B2.3924.01", "TCGA.B2.5635.01"), class = "factor"), Freq = c(4L,
6L)), row.names = 6:7, class = "data.frame")
期望输出:
根据 `multiple.duplicates`，有两个 `Sample.ID` 值存在不止一对重复值。
因此，对于这两个 `Sample.ID`，只保留 `clin.info` 中的第一组重复值。
英文:
The `clin.info$Sample.ID` column has duplicates. If an ID has more than one pair of duplicates, I want to keep only the first pair.
# Asker's original attempt, kept verbatim because the traceback below refers to it.
# Count how often each Sample.ID occurs; the result has columns Var1 (the ID) and Freq.
n_occur <- data.frame(table(clin.info$Sample.ID))
# Keep only the IDs that occur more than twice.
multiple.duplicates <- n_occur[n_occur$Freq > 2,]
# NOTE(review): `%in%` returns one logical per element of Var1, while `if` needs
# exactly one TRUE/FALSE. A zero-length Var1 gives logical(0), which produces
# "argument is of length zero"; more than one element is also invalid here.
if(multiple.duplicates$Var1 %in% clin.info$Sample.ID){
# NOTE(review): distinct() keeps unique rows only, so fully identical rows
# collapse to a single row per Sample.ID rather than to the first pair, and the
# pipeline touches every Sample.ID, not just those in multiple.duplicates.
clin.info <- clin.info %>%
group_by(Sample.ID) %>%
distinct
}
Traceback:
Error in if (multiple.duplicates$Var1 %in% clin.info$Sample.ID) { :
argument is of length zero
Data:
> dput(clin.info)
structure(list(Sample.ID = c("TCGA.B2.3924.01", "TCGA.B2.3924.01",
"TCGA.B2.3924.01", "TCGA.B2.3924.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01",
"TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01",
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01",
"TCGA.A3.3387.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4698.01",
"TCGA.B0.4710.01", "TCGA.B0.4710.01"), age = c("73", "73", "73",
"73", "74", "74", "74", "74", "74", "74", "62", "62", "72", "72",
"49", "49", "75", "75", "75", "75")), row.names = c(67L, 68L,
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L), class = "data.frame")
> dput(multiple.duplicates)
structure(list(Var1 = structure(6:7, levels = c("TCGA.A3.3357.01",
"TCGA.A3.3367.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4710.01",
"TCGA.B2.3924.01", "TCGA.B2.5635.01"), class = "factor"), Freq = c(4L,
6L)), row.names = 6:7, class = "data.frame")
Expected output:
Based on `multiple.duplicates`, there are two `Sample.ID` values with more than one pair of duplicates.
Hence, for these two `Sample.ID` values, keep only the first set of duplicates in `clin.info`.
答案1
得分: 2
# Keep the first two rows (n = 2) of each Sample.ID group.
dplyr::slice_head(clin.info, n = 2, by = Sample.ID)
#> Sample.ID age
#> 1 TCGA.B2.3924.01 73
#> 2 TCGA.B2.3924.01 73
#> 3 TCGA.B2.5635.01 74
#> 4 TCGA.B2.5635.01 74
#> 5 TCGA.A3.3357.01 62
#> 6 TCGA.A3.3357.01 62
#> 7 TCGA.A3.3367.01 72
#> 8 TCGA.A3.3367.01 72
#> 9 TCGA.A3.3387.01 49
#> 10 TCGA.A3.3387.01 49
#> 11 TCGA.B0.4698.01 75
#> 12 TCGA.B0.4698.01 75
#> 13 TCGA.B0.4710.01 75
#> 14 TCGA.B0.4710.01 75
英文:
# Keep the first two rows (n = 2) of each Sample.ID group. IDs that occur
# exactly twice are returned unchanged; the 4- and 6-row IDs are cut to a pair.
# NOTE(review): the per-call `by` argument needs dplyr >= 1.1.0 — confirm the
# installed version.
dplyr::slice_head(clin.info, n = 2, by = Sample.ID)
#> Sample.ID age
#> 1 TCGA.B2.3924.01 73
#> 2 TCGA.B2.3924.01 73
#> 3 TCGA.B2.5635.01 74
#> 4 TCGA.B2.5635.01 74
#> 5 TCGA.A3.3357.01 62
#> 6 TCGA.A3.3357.01 62
#> 7 TCGA.A3.3367.01 72
#> 8 TCGA.A3.3367.01 72
#> 9 TCGA.A3.3387.01 49
#> 10 TCGA.A3.3387.01 49
#> 11 TCGA.B0.4698.01 75
#> 12 TCGA.B0.4698.01 75
#> 13 TCGA.B0.4710.01 75
#> 14 TCGA.B0.4710.01 75
<sup>Created on 2023-05-28 with reprex v2.0.2</sup>
Input data:
# Example input: a 20-row data.frame with two character columns, Sample.ID and
# age. Two IDs occur 4 and 6 times; the remaining five occur twice each.
# Row names are non-sequential (67-76 followed by 1-10).
clin.info <-
structure(list(Sample.ID = c("TCGA.B2.3924.01", "TCGA.B2.3924.01",
"TCGA.B2.3924.01", "TCGA.B2.3924.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01",
"TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01",
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01",
"TCGA.A3.3387.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4698.01",
"TCGA.B0.4710.01", "TCGA.B0.4710.01"), age = c("73", "73", "73",
"73", "74", "74", "74", "74", "74", "74", "62", "62", "72", "72",
"49", "49", "75", "75", "75", "75")), row.names = c(67L, 68L,
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L), class = "data.frame")
答案2
得分: 0
我认为你可以使用以下代码:
# Cap every Sample.ID at its first two rows (the "first pair"), keeping the
# original row order.
#
# The earlier version piped the over-duplicated groups through distinct(),
# which collapses fully identical rows to ONE row per ID instead of a pair, and
# it left `result` undefined whenever no ID occurred more than twice.

# IDs that occur more than twice, trimmed to their first two rows.
dedup <- clin.info %>%
  group_by(Sample.ID) %>%
  filter(n() > 2, row_number() <= 2) %>%
  ungroup()

# Keep rows whose position within their Sample.ID group is 1 or 2. IDs that
# occur at most twice pass through unchanged, so no conditional is needed and
# `result` is always assigned.
result <- clin.info %>%
  group_by(Sample.ID) %>%
  filter(row_number() <= 2) %>%
  ungroup() %>%
  as.data.frame()  # return a plain data.frame like the input, not a tibble
英文:
I think you can use the code below:
# Cap every Sample.ID at its first two rows (the "first pair"), keeping the
# original row order.
#
# The earlier version piped the over-duplicated groups through distinct(),
# which collapses fully identical rows to ONE row per ID instead of a pair, and
# it left `result` undefined whenever no ID occurred more than twice.

# IDs that occur more than twice, trimmed to their first two rows.
dedup <- clin.info %>%
  group_by(Sample.ID) %>%
  filter(n() > 2, row_number() <= 2) %>%
  ungroup()

# Keep rows whose position within their Sample.ID group is 1 or 2. IDs that
# occur at most twice pass through unchanged, so no conditional is needed and
# `result` is always assigned.
result <- clin.info %>%
  group_by(Sample.ID) %>%
  filter(row_number() <= 2) %>%
  ungroup() %>%
  as.data.frame()  # return a plain data.frame like the input, not a tibble
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论