英文:
Remove rows if a specific column has values (strings) that match a vector of substrings
问题
The `redundant.gs` vector contains substrings that might match the `gs_name` column in the `all.gene.sets` data frame. I want to remove all such matches.
# Substrings that mark a gene set as redundant.
redundant.gs <- c("ANDERSON_BLOOD_CN54GP140_ADJUVANTED", "BUCASAS_PBMC_FLUARIX_FLUVIRIN")
# Attempt to keep only the rows whose gs_name matches none of the substrings.
# NOTE(review): `%like%` is presumably data.table's operator (confirm). The
# observed result is that only the first element of redundant.gs is applied,
# so rows matching the second substring are not removed.
gene.sets <- all.gene.sets[!(all.gene.sets$gs_name %like% redundant.gs),]
数据:
# Example data: 21 rows, all in category "C7". The 16 "VAX" rows cover three
# distinct gene-set names; the 5 "IMMUNESIGDB" rows share a single name.
all.gene.sets <- tibble::tibble(
  gs_cat = "C7",
  gs_subcat = rep(c("VAX", "IMMUNESIGDB"), times = c(16L, 5L)),
  gs_name = rep(
    c(
      "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
      "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
      "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
      "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN"
    ),
    times = c(5L, 6L, 5L, 5L)
  )
)
英文:
The `redundant.gs` vector contains substrings that might match the `gs_name` column in the `all.gene.sets` data frame. I want to remove all such matches.
My code below only removes rows that match the first substring, `ANDERSON_BLOOD_CN54GP140_ADJUVANTED`, but not the second or subsequent substrings.
# Substrings that mark a gene set as redundant.
redundant.gs <- c("ANDERSON_BLOOD_CN54GP140_ADJUVANTED", "BUCASAS_PBMC_FLUARIX_FLUVIRIN")
# Attempt to keep only the rows whose gs_name matches none of the substrings.
# NOTE(review): `%like%` is presumably data.table's operator (confirm). The
# observed result is that only the first element of redundant.gs is applied,
# so rows matching the second substring are not removed.
gene.sets <- all.gene.sets[!(all.gene.sets$gs_name %like% redundant.gs),]
Data:
# Example data: 21 rows, all in category "C7". The 16 "VAX" rows cover three
# distinct gene-set names; the 5 "IMMUNESIGDB" rows share a single name.
all.gene.sets <- tibble::tibble(
  gs_cat = "C7",
  gs_subcat = rep(c("VAX", "IMMUNESIGDB"), times = c(16L, 5L)),
  gs_name = rep(
    c(
      "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
      "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
      "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
      "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN"
    ),
    times = c(5L, 6L, 5L, 5L)
  )
)
答案1
得分: 2
把 `redundant.gs` 合并成一个以 `"|"` 分隔的字符串,然后用作 `grepl()` 中的模式:
# Join the substrings into one alternation regex ("a|b|..."), then keep the
# rows whose gs_name does not match it.
# NOTE: the substrings are interpreted as regular expressions, and an empty
# redundant.gs collapses to "", which matches every row.
redundant.pattern <- paste(redundant.gs, collapse = "|")
keep.rows <- !grepl(redundant.pattern, all.gene.sets$gs_name)
all.gene.sets[keep.rows, ]
gs_cat gs_subcat gs_name
17 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
18 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
19 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
20 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
21 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
英文:
Collapse `redundant.gs` into a single string separated by `"|"`, then use it as the pattern in `grepl()`:
# Join the substrings into one alternation regex ("a|b|..."), then keep the
# rows whose gs_name does not match it.
# NOTE: the substrings are interpreted as regular expressions, and an empty
# redundant.gs collapses to "", which matches every row.
redundant.pattern <- paste(redundant.gs, collapse = "|")
keep.rows <- !grepl(redundant.pattern, all.gene.sets$gs_name)
all.gene.sets[keep.rows, ]
gs_cat gs_subcat gs_name
17 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
18 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
19 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
20 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
21 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
答案2
得分: 1
使用 `str_detect`:
library(dplyr)
library(stringr)
# Join the substrings into one alternation regex, then keep the rows whose
# gs_name does not match it.
redundant.pattern <- str_c(redundant.gs, collapse = "|")
all.gene.sets %>%
  filter(!str_detect(gs_name, redundant.pattern))
输出
# A tibble: 5 × 3
gs_cat gs_subcat gs_name
<chr> <chr> <chr>
1 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
2 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
3 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
4 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
5 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
英文:
With str_detect
library(dplyr)
library(stringr)
# Join the substrings into one alternation regex, then keep the rows whose
# gs_name does not match it.
redundant.pattern <- str_c(redundant.gs, collapse = "|")
all.gene.sets %>%
  filter(!str_detect(gs_name, redundant.pattern))
-output
# A tibble: 5 × 3
gs_cat gs_subcat gs_name
<chr> <chr> <chr>
1 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
2 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
3 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
4 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
5 C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
</details>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论