英文:
Remove rows if a specific column has values (strings) that match a vector of substrings
问题
The redundant.gs vector contains substrings that might match with the gs_name column in the all.gene.sets dataframe. I want to remove all such matches.
# Substrings identifying the gene sets that should be removed.
redundant.gs <- c("ANDERSON_BLOOD_CN54GP140_ADJUVANTED", "BUCASAS_PBMC_FLUARIX_FLUVIRIN")
# NOTE: this is the attempt being asked about and it does not do what is
# wanted: only rows matching the FIRST element of redundant.gs are removed.
# `%like%` is presumably data.table's operator (confirm); it is given a
# length-2 pattern vector here, and the elements after the first are ignored.
gene.sets <- all.gene.sets[!(all.gene.sets$gs_name %like% redundant.gs),]
数据:
# Example data: 21 rows, all in category "C7". The first 16 rows belong to the
# "VAX" sub-category and the last 5 to "IMMUNESIGDB". The columns are built
# with rep() so that each distinct gene-set name is written out only once.
all.gene.sets <- tibble::tibble(
  gs_cat = rep("C7", times = 21L),
  gs_subcat = rep(c("VAX", "IMMUNESIGDB"), times = c(16L, 5L)),
  gs_name = rep(
    c(
      "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
      "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
      "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
      "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN"
    ),
    times = c(5L, 6L, 5L, 5L)
  )
)
英文:
The redundant.gs vector contains substrings that might match with the gs_name column in the all.gene.sets dataframe. I want to remove all such matches.
My code below only removes rows that match the first substring ANDERSON_BLOOD_CN54GP140_ADJUVANTED but not the second or subsequent substrings.
# Substrings identifying the gene sets that should be removed.
redundant.gs <- c("ANDERSON_BLOOD_CN54GP140_ADJUVANTED", "BUCASAS_PBMC_FLUARIX_FLUVIRIN")
# NOTE: this is the attempt being asked about and it does not do what is
# wanted: only rows matching the FIRST element of redundant.gs are removed.
# `%like%` is presumably data.table's operator (confirm); it is given a
# length-2 pattern vector here, and the elements after the first are ignored.
gene.sets <- all.gene.sets[!(all.gene.sets$gs_name %like% redundant.gs),]
Data:
# Example data: 21 rows, all in category "C7". The first 16 rows belong to the
# "VAX" sub-category and the last 5 to "IMMUNESIGDB". The columns are built
# with rep() so that each distinct gene-set name is written out only once.
all.gene.sets <- tibble::tibble(
  gs_cat = rep("C7", times = 21L),
  gs_subcat = rep(c("VAX", "IMMUNESIGDB"), times = c(16L, 5L)),
  gs_name = rep(
    c(
      "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
      "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
      "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
      "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN"
    ),
    times = c(5L, 6L, 5L, 5L)
  )
)
答案1
得分: 2
把redundant.gs合并成一个以"|"分隔的字符串,然后用作grepl()中的模式:
# Keep only the rows whose gs_name contains none of the substrings in
# redundant.gs. The substrings are collapsed into a single "|"-separated
# pattern. Each one is wrapped in \Q...\E so that, with perl = TRUE, it is
# matched literally even if it contains regex metacharacters such as "." or "+".
keep.rows <- if (length(redundant.gs) == 0) {
  # An empty vector would collapse to the pattern "", which matches every
  # string and would therefore drop every row; keep all rows instead.
  rep(TRUE, nrow(all.gene.sets))
} else {
  !grepl(
    paste0("\\Q", redundant.gs, "\\E", collapse = "|"),
    all.gene.sets$gs_name,
    perl = TRUE
  )
}
all.gene.sets[keep.rows, ]
   gs_cat   gs_subcat                                            gs_name
17     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
18     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
19     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
20     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
21     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
英文:
Collapse redundant.gs into a single string separated by "|", then use as a pattern in grepl():
# Keep only the rows whose gs_name contains none of the substrings in
# redundant.gs. The substrings are collapsed into a single "|"-separated
# pattern. Each one is wrapped in \Q...\E so that, with perl = TRUE, it is
# matched literally even if it contains regex metacharacters such as "." or "+".
keep.rows <- if (length(redundant.gs) == 0) {
  # An empty vector would collapse to the pattern "", which matches every
  # string and would therefore drop every row; keep all rows instead.
  rep(TRUE, nrow(all.gene.sets))
} else {
  !grepl(
    paste0("\\Q", redundant.gs, "\\E", collapse = "|"),
    all.gene.sets$gs_name,
    perl = TRUE
  )
}
all.gene.sets[keep.rows, ]
   gs_cat   gs_subcat                                            gs_name
17     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
18     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
19     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
20     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
21     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
答案2
得分: 1
使用 str_detect:
library(dplyr)
library(stringr)
# Drop every row whose gs_name contains any of the substrings in redundant.gs.
# The substrings are collapsed into a single "|"-separated pattern; each one is
# wrapped in \Q...\E so that it is matched literally rather than as a regex.
all.gene.sets %>%
  filter(
    if (length(redundant.gs) == 0) {
      # Nothing to remove. An empty vector would collapse to the pattern "",
      # which str_detect() does not accept as a valid pattern.
      TRUE
    } else {
      !str_detect(gs_name, str_c("\\Q", redundant.gs, "\\E", collapse = "|"))
    }
  )
输出
# A tibble: 5 × 3
  gs_cat gs_subcat   gs_name                                           
  <chr>  <chr>       <chr>                                             
1 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
2 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
3 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
4 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
5 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
英文:
With str_detect
library(dplyr)
library(stringr)
# Drop every row whose gs_name contains any of the substrings in redundant.gs.
# The substrings are collapsed into a single "|"-separated pattern; each one is
# wrapped in \Q...\E so that it is matched literally rather than as a regex.
all.gene.sets %>%
  filter(
    if (length(redundant.gs) == 0) {
      # Nothing to remove. An empty vector would collapse to the pattern "",
      # which str_detect() does not accept as a valid pattern.
      TRUE
    } else {
      !str_detect(gs_name, str_c("\\Q", redundant.gs, "\\E", collapse = "|"))
    }
  )
Output:
# A tibble: 5 × 3
  gs_cat gs_subcat   gs_name                                           
  <chr>  <chr>       <chr>                                             
1 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
2 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
3 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
4 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
5 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
</details>
				通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。


评论