删除行,如果特定列具有与子字符串向量匹配的值(字符串)。

huangapple go评论79阅读模式
英文:

Remove rows if a specific column has values (strings) that match a vector of substrings

问题

redundant.gs 向量包含可能与 all.gene.sets 数据框中 gs_name 列相匹配的子字符串。我想删除所有匹配的行。
我下面的代码只删除了与第一个子字符串 ANDERSON_BLOOD_CN54GP140_ADJUVANTED 匹配的行,而没有删除与第二个及之后的子字符串匹配的行。

redundant.gs <- c("ANDERSON_BLOOD_CN54GP140_ADJUVANTED", "BUCASAS_PBMC_FLUARIX_FLUVIRIN")
gene.sets <- all.gene.sets[!(all.gene.sets$gs_name %like% redundant.gs),]

数据:

all.gene.sets <- tibble::tribble(
  ~gs_cat, ~gs_subcat,    ~gs_name,
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
)
英文:

The redundant.gs vector contains substrings that might match with the gs_name column in the all.gene.sets dataframe. I want to remove all such matches.
My code below only removes rows that match the first substring ANDERSON_BLOOD_CN54GP140_ADJUVANTED but not the second or subsequent substrings.

redundant.gs <- c("ANDERSON_BLOOD_CN54GP140_ADJUVANTED", "BUCASAS_PBMC_FLUARIX_FLUVIRIN")
gene.sets <- all.gene.sets[!(all.gene.sets$gs_name %like% redundant.gs),]

Data:

all.gene.sets <- tibble::tribble(
  ~gs_cat, ~gs_subcat,    ~gs_name,
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_DN",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "ANDERSON_BLOOD_CN54GP140_ADJUVANTED_WITH_GLA_AF_AGE_18_45YO_1DY_UP",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "VAX",         "BUCASAS_PBMC_FLUARIX_FLUVIRIN_CAUCASIAN_MALE_AGE_18_40YO_HIGH_RESPONDERS_1DY_3DY_POSITIVE_PREDICTIVE_OF_TITER",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
  "C7",    "IMMUNESIGDB", "GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN",
)

答案1

得分: 2

将 redundant.gs 合并成一个以 "|" 分隔的字符串,然后将其用作 grepl() 中的模式:

keep.rows <- !grepl(
  paste(redundant.gs, collapse = "|"),
  all.gene.sets$gs_name
)

all.gene.sets[keep.rows, ]
   gs_cat   gs_subcat                                            gs_name
17     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
18     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
19     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
20     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
21     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
英文:

Collapse redundant.gs into a single string separated by "|", then use it as the pattern in grepl():

keep.rows <- !grepl(
  paste(redundant.gs, collapse = "|"),
  all.gene.sets$gs_name
)

all.gene.sets[keep.rows, ]
   gs_cat   gs_subcat                                            gs_name
17     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
18     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
19     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
20     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
21     C7 IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN

答案2

得分: 1

使用 str_detect:

library(dplyr)
library(stringr)
all.gene.sets %>%
  filter(str_detect(gs_name, str_c(redundant.gs, collapse = "|"), negate = TRUE))

输出

# 一个表格: 5 × 3
  gs_cat gs_subcat   gs_name                                           
  <chr>  <chr>       <chr>                                             
1 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
2 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
3 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
4 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
5 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
英文:

With str_detect

library(dplyr)
library(stringr)
all.gene.sets %>%
  filter(str_detect(gs_name, str_c(redundant.gs, collapse = "|"), negate = TRUE))

Output:

# A tibble: 5 × 3
  gs_cat gs_subcat   gs_name                                           
  <chr>  <chr>       <chr>                                             
1 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
2 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
3 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
4 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN
5 C7     IMMUNESIGDB GSE21063_3H_VS_16H_ANTI_IGM_STIM_NFATC1_KOBCELL_DN


</details>



huangapple
  • 本文由 发表于 2023年3月23日 09:32:49
  • 转载请务必保留本文链接:https://go.coder-hub.com/75818576.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定