删除行,如果特定列具有与子字符串向量匹配的值(字符串)。

huangapple go评论106阅读模式
英文:

Remove rows if a specific column has values (strings) that match a vector of substrings

问题

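The redundant.gs vector contains substrings that might match with the gs_name column in the all.gene.sets dataframe. I want to remove all such matches.
My code below only removes rows that match the first substring ANDERSON_BLOOD_CN54GP140_ADJUVANTED but not the second or subsequent substrings.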

  1. redundant.gs <- c("ANDERSON_BLOOD_CN54GP140_ADJUVANTED", "BUCASAS_PBMC_FLUARIX_FLUVIRIN")
  2. gene.sets <- all.gene.sets[!(all.gene.sets$gs_name %like% redundant.gs),]

数据:

  1. all.gene.sets <- tibble::tribble(
  2. ~gs_cat, ~gs_subcat, ~gs_name,
  3. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  4. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  5. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  6. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  7. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  8. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  9. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  10. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  11. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  12. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  13. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  14. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  15. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  16. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  17. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  18. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  19. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  20. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  21. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  22. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  23. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  24. )
英文:

The redundant.gs vector contains substrings that might match with the gs_name column in the all.gene.sets dataframe. I want to remove all such matches.
My code below only removes rows that match the first substring ANDERSON_BLOOD_CN54GP140_ADJUVANTED but not the second or subsequent substrings.

  1. redundant.gs <- c("ANDERSON_BLOOD_CN54GP140_ADJUVANTED", "BUCASAS_PBMC_FLUARIX_FLUVIRIN")
  2. gene.sets <- all.gene.sets[!(all.gene.sets$gs_name %like% redundant.gs),]

Data:

  1. all.gene.sets <- tibble::tribble(
  2. ~gs_cat, ~gs_subcat, ~gs_name,
  3. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  4. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  5. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  6. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  7. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  8. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  9. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  10. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  11. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  12. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  13. "C7", "VAX", "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  14. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  15. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  16. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  17. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  18. "C7", "VAX", "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  19. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  20. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  21. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  22. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  23. "C7", "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  24. )

答案1

得分: 2

将 redundant.gs 合并成一个以 "|" 分隔的字符串,然后将其用作 grepl() 中的模式:

  1. keep.rows <- !grepl(
  2. paste(redundant.gs, collapse = "|"),
  3. all.gene.sets$gs_name
  4. )
  5. all.gene.sets[keep.rows, ]
  1. gs_cat gs_subcat gs_name
  2. 17 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  3. 18 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  4. 19 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  5. 20 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  6. 21 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
英文:

Collapse redundant.gs into a single string separated by "|", then use it as the pattern in grepl():

  1. keep.rows <- !grepl(
  2. paste(redundant.gs, collapse = "|"),
  3. all.gene.sets$gs_name
  4. )
  5. all.gene.sets[keep.rows, ]
  1. gs_cat gs_subcat gs_name
  2. 17 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  3. 18 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  4. 19 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  5. 20 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  6. 21 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN

答案2

得分: 1

翻译后的内容如下:

使用 str_detect

  1. library(dplyr)
  2. library(stringr)
  3. all.gene.sets %>%
  4. filter(str_detect(gs_name, str_c(redundant.gs, collapse = "|"), negate = TRUE))

输出

  1. # A tibble: 5 × 3
  2. gs_cat gs_subcat gs_name
  3. <chr> <chr> <chr>
  4. 1 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  5. 2 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  6. 3 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  7. 4 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  8. 5 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
英文:

With str_detect

  1. library(dplyr)
  2. library(stringr)
  3. all.gene.sets %>%
  4. filter(str_detect(gs_name, str_c(redundant.gs, collapse = "|"), negate = TRUE))

-output

  1. # A tibble: 5 × 3
  2. gs_cat gs_subcat gs_name
  3. <chr> <chr> <chr>
  4. 1 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  5. 2 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  6. 3 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  7. 4 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  8. 5 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
  9. </details>

huangapple
  • 本文由 发表于 2023年3月23日 09:32:49
  • 转载请务必保留本文链接:https://go.coder-hub.com/75818576.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定