2023年5月28日 22:17:15go评论110阅读模式

英文:

How to keep only the first set of duplicates if there are multiple duplicates in a column

问题

clin.info$Sample.ID存在重复值。如果有多对重复值，我只想保留第一对。

n_occur <- data.frame(table(clin.info$Sample.ID))
multiple.duplicates <- n_occur[n_occur$Freq > 2,]
if(multiple.duplicates$Var1 %in% clin.info$Sample.ID){
  clin.info <- clin.info %>%
    group_by(Sample.ID) %>%
    distinct
}

错误回溯：

Error in if (multiple.duplicates$Var1 %in% clin.info$Sample.ID) { : 
  argument is of length zero

数据：

> dput(clin.info)
structure(list(Sample.ID = c("TCGA.B2.3924.01", "TCGA.B2.3924.01", 
"TCGA.B2.3924.01", "TCGA.B2.3924.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3387.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4698.01", 
"TCGA.B0.4710.01", "TCGA.B0.4710.01"), age = c("73", "73", "73", 
"73", "74", "74", "74", "74", "74", "74", "62", "62", "72", "72", 
"49", "49", "75", "75", "75", "75")), row.names = c(67L, 68L, 
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L), class = "data.frame")
> dput(multiple.duplicates)
structure(list(Var1 = structure(6:7, levels = c("TCGA.A3.3357.01", 
"TCGA.A3.3367.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4710.01", 
"TCGA.B2.3924.01", "TCGA.B2.5635.01"), class = "factor"), Freq = c(4L, 
6L)), row.names = 6:7, class = "data.frame")

期望输出：

根据multiple.duplicates，有两个Sample.ID值有多于一个重复值。

因此，对于这两个Sample.ID，只保留在clin.info中的第一组重复值。

英文:

The clin.info$Sample.ID column has duplicates. If there is more than one pair of duplicates, I want to keep only the first pair.

n_occur <- data.frame(table(clin.info$Sample.ID))
multiple.duplicates <- n_occur[n_occur$Freq > 2,]
if(multiple.duplicates$Var1 %in% clin.info$Sample.ID){
  clin.info <- clin.info %>%
    group_by(Sample.ID) %>%
    distinct
}

Traceback:

Error in if (multiple.duplicates$Var1 %in% clin.info$Sample.ID) { : 
  argument is of length zero

Data:

> dput(clin.info)
structure(list(Sample.ID = c("TCGA.B2.3924.01", "TCGA.B2.3924.01", 
"TCGA.B2.3924.01", "TCGA.B2.3924.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3387.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4698.01", 
"TCGA.B0.4710.01", "TCGA.B0.4710.01"), age = c("73", "73", "73", 
"73", "74", "74", "74", "74", "74", "74", "62", "62", "72", "72", 
"49", "49", "75", "75", "75", "75")), row.names = c(67L, 68L, 
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L), class = "data.frame")
> dput(multiple.duplicates)
structure(list(Var1 = structure(6:7, levels = c("TCGA.A3.3357.01", 
"TCGA.A3.3367.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4710.01", 
"TCGA.B2.3924.01", "TCGA.B2.5635.01"), class = "factor"), Freq = c(4L, 
6L)), row.names = 6:7, class = "data.frame")

Expected output:

Based on multiple.duplicates, there are two Sample.ID values with more than one duplicate.

Hence, for these two Sample.ID values, keep only the first set of duplicates in clin.info.

答案1

得分: 2

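dplyr::slice_head(clin.info, n = 2, by = Sample.ID)
#>          Sample.ID age
#> 1  TCGA.B2.3924.01  73
#> 2  TCGA.B2.3924.01  73
#> 3  TCGA.B2.5635.01  74
#> 4  TCGA.B2.5635.01  74
#> 5  TCGA.A3.3357.01  62
#> 6  TCGA.A3.3357.01  62
#> 7  TCGA.A3.3367.01  72
#> 8  TCGA.A3.3367.01  72
#> 9  TCGA.A3.3387.01  49
#> 10 TCGA.A3.3387.01  49
#> 11 TCGA.B0.4698.01  75
#> 12 TCGA.B0.4698.01  75
#> 13 TCGA.B0.4710.01  75
#> 14 TCGA.B0.4710.01  75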

英文:

dplyr::slice_head(clin.info, n = 2, by = Sample.ID)
#>          Sample.ID age
#> 1  TCGA.B2.3924.01  73
#> 2  TCGA.B2.3924.01  73
#> 3  TCGA.B2.5635.01  74
#> 4  TCGA.B2.5635.01  74
#> 5  TCGA.A3.3357.01  62
#> 6  TCGA.A3.3357.01  62
#> 7  TCGA.A3.3367.01  72
#> 8  TCGA.A3.3367.01  72
#> 9  TCGA.A3.3387.01  49
#> 10 TCGA.A3.3387.01  49
#> 11 TCGA.B0.4698.01  75
#> 12 TCGA.B0.4698.01  75
#> 13 TCGA.B0.4710.01  75
#> 14 TCGA.B0.4710.01  75

<sup>Created on 2023-05-28 with reprex v2.0.2</sup>

Input data:

clin.info <-
structure(list(Sample.ID = c("TCGA.B2.3924.01", "TCGA.B2.3924.01", 
"TCGA.B2.3924.01", "TCGA.B2.3924.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3387.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4698.01", 
"TCGA.B0.4710.01", "TCGA.B0.4710.01"), age = c("73", "73", "73", 
"73", "74", "74", "74", "74", "74", "74", "62", "62", "72", "72", 
"49", "49", "75", "75", "75", "75")), row.names = c(67L, 68L, 
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L), class = "data.frame")

答案2

得分: 0

我认为你可以使用以下代码：

dedup <- clin.info %>%
  group_by(Sample.ID) %>%
  filter(n() > 2) %>%
  distinct() %>% ungroup()
if (dim(dedup)[1] > 0) {
    result <- clin.info %>%
    filter(!(Sample.ID %in% dedup$Sample.ID)) %>%
    bind_rows(dedup)
}

英文:

I think you can use the code below:

dedup <- clin.info %>%
  group_by(Sample.ID) %>%
  filter(n() > 2) %>%
  distinct() %>% ungroup()
if (dim(dedup)[1] > 0) {
    result <- clin.info %>%
    filter(!(Sample.ID %in% dedup$Sample.ID)) %>%
    bind_rows(dedup)
}

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

如何仅保留列中的第一组重复项，如果有多个重复项。

问题

答案1

答案2

如何从城市中获取国家的名称并将其保存在一个列中，使用 R 语言？

如何在 quarto 块中打印多个 gt 表格？

Fastest way to check if values in list are in dataframe using R

在生成文档目录之前添加执行摘要，并将其编译成微软Word格式。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。