Iterate over each row to obtain matches between row values and the rownames of another dataframe df2, then subset df2

huangapple go评论72阅读模式
英文:

Iterate over each row to obtain matches between row values and the rownames of another dataframe df2, then subset df2

问题

以下是您要翻译的内容:

"For each row of hallmark.df, if the rownames(all.deg) match any of the values of the given row, retrieve the corresponding rows of all.deg."
"I also want to keep only the rows of the hallmark.df dataframe where there are more than 25 non-NA columns and less than 500 columns."

"subset.df <- by(hallmark.df, seq_len(nrow(hallmark.df)), function(row) ifelse(all.deg[rownames(all.deg) %in% hallmark.df,]))"

"Input:"

"Expected output:"

"HALLMARK_TNFA_SIGNALING_VIA_NFKB"

"相关问题: https://stackoverflow.com/questions/76707644/how-to-subset-a-dataframe-based-on-matches-to-another-dataframe"

请告诉我,您是否需要进一步的帮助或有其他问题。

英文:

For each row of hallmark.df, if the rownames(all.deg) match any of the values of the given row, retrieve the corresponding rows of all.deg.
I also want to keep only the rows of the hallmark.df dataframe that have more than 25 and fewer than 500 non-NA columns.

subset.df <- by(hallmark.df, seq_len(nrow(hallmark.df)), function(row)
  ifelse(all.deg[rownames(all.deg) %in% hallmark.df,]))

Input:

all.deg

> dput(all.deg[1:5,1:5])
structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156, 
12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069, 
12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948, 
11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924, 
13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624, 
12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853, 
10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
    c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01", 
    "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01", 
    "TCGA.2Z.A9J7.01")))

> dput(hallmark.df[1:5,1:5])
structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"
), V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"), V5 = c("ATF3", 
"GBE1", "IDI1", "KIF11", "JAG2"), V6 = c("NFKBIA", "PFKL", "FDFT1", 
"KIF23", "NOTCH1"), V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", 
"DLL1")), row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA", 
"HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE", 
"HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")

Expected output:

HALLMARK_TNFA_SIGNALING_VIA_NFKB

TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01 TCGA.2Z.A9J7.01
JUNB 16.0169585624867 13.563719599839 11.3657998135948 13.7513874043739 12.3013113392588
CD74 13.8584047354367 12.2201616898331 10.2967924742924 12.1390647972695 10.0122721528039

HALLMARK_WNT_BETA_CATENIN_SIGNALING

TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01 TCGA.2Z.A9J7.01
ALDOA 14.3983080662428 13.6166993468069 11.8253392160132 13.2403954818698 12.4867673484914
JUNB 16.0169585624867 13.563719599839 11.3657998135948 13.7513874043739 12.3013113392588
df.list <- list(HALLMARK_TNFA_SIGNALING_VIA_NFKB, HALLMARK_WNT_BETA_CATENIN_SIGNALING)

Related question: https://stackoverflow.com/questions/76707644/how-to-subset-a-dataframe-based-on-matches-to-another-dataframe

答案1

得分: 1

这可能不是解决您问题的“最佳”方式,但这是一个潜在的解决方案:

library(tidyverse)
# Example inputs.
# `all.deg`: numeric matrix whose row names are the keys matched below (gene
# symbols in this example) and whose column names are sample IDs.
all.deg <- structure(
  c(
    16.0169585624867, 14.3983080662428, 12.7844219145156, 12.6674945373237,
    13.8584047354367, 13.563719599839, 13.6166993468069, 12.9748157402651,
    12.7386065050292, 12.2201616898331, 11.3657998135948, 11.8253392160132,
    12.1132082166185, 11.5123143882139, 10.2967924742924, 13.7513874043739,
    13.2403954818698, 12.4196432226432, 12.4676109090624, 12.1390647972695,
    12.3013113392588, 12.4867673484914, 11.3693921877853, 10.6359730348998,
    10.0122721528039
  ),
  dim = c(5L, 5L),
  dimnames = list(
    c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"),
    c(
      "TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01",
      "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01"
    )
  )
)

# `hallmark.df`: one gene set per ROW (named by its row name); the columns
# hold the members of that set.
hallmark.df <- structure(
  list(
    V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"),
    V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"),
    V5 = c("ATF3", "GBE1", "IDI1", "KIF11", "JAG2"),
    V6 = c("NFKBIA", "PFKL", "FDFT1", "KIF23", "NOTCH1"),
    V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", "DLL1")
  ),
  row.names = c(
    "HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA",
    "HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE",
    "HALLMARK_WNT_BETA_CATENIN_SIGNALING"
  ),
  class = "data.frame"
)

# For every row of `hallmark.df`, keep the rows of `all.deg` whose row names
# occur among that row's values. Iterating over rows (not columns) keeps the
# result names aligned with `rownames(hallmark.df)` even when the data frame
# is not square.
output <- lapply(seq_len(nrow(hallmark.df)), function(i) {
  genes <- as.character(unlist(hallmark.df[i, ], use.names = FALSE))
  # drop = FALSE: a single match must stay a 1-row matrix, otherwise nrow()
  # below would return NULL.
  all.deg[rownames(all.deg) %in% genes, , drop = FALSE]
})
names(output) <- rownames(hallmark.df)

# Remove the gene sets without any match (zero-row matrices)
clean_list <- output[vapply(output, nrow, integer(1)) > 0L]

# Export each matrix to the global environment under its gene-set name
list2env(clean_list, envir = .GlobalEnv)
#> <environment: R_GlobalEnv>

# `pattern` is a regular expression, so anchor it instead of using a glob
ls(pattern = "^HALLMARK_")
#> [1] "HALLMARK_HYPOXIA"                    "HALLMARK_TNFA_SIGNALING_VIA_NFKB"
#> [3] "HALLMARK_WNT_BETA_CATENIN_SIGNALING"

HALLMARK_TNFA_SIGNALING_VIA_NFKB
#>       TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB         16.01696        13.56372        11.36580        13.75139
#> ALDOA        14.39831        13.61670        11.82534        13.24040
#>       TCGA.2Z.A9J7.01
#> JUNB         12.30131
#> ALDOA        12.48677
HALLMARK_HYPOXIA
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB        16.01696        13.56372         11.3658        13.75139
#>      TCGA.2Z.A9J7.01
#> JUNB        12.30131
HALLMARK_WNT_BETA_CATENIN_SIGNALING
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74         13.8584        12.22016        10.29679        12.13906
#>      TCGA.2Z.A9J7.01
#> CD74        10.01227

创建于2023年7月18日,使用reprex v2.0.2


编辑 1

要保留hallmark.df数据框中具有超过25个非NA列但少于500个非NA列的行,可以使用以下代码:

# Keep only the rows of `hallmark.df` with more than 25 and fewer than 500
# non-NA columns. Both bounds must hold, hence `&`: with `|` every row would
# pass, because any count is either >= 25 or <= 500.
hallmark.df %>%
  filter(rowSums(!is.na(.)) > 25 & rowSums(!is.na(.)) < 500)
英文:

This might not be the 'best' way to solve your problem, but here is one potential solution:

library(tidyverse)
# Example inputs.
# `all.deg`: numeric matrix whose row names are the keys matched below (gene
# symbols in this example) and whose column names are sample IDs.
all.deg <- structure(
  c(
    16.0169585624867, 14.3983080662428, 12.7844219145156, 12.6674945373237,
    13.8584047354367, 13.563719599839, 13.6166993468069, 12.9748157402651,
    12.7386065050292, 12.2201616898331, 11.3657998135948, 11.8253392160132,
    12.1132082166185, 11.5123143882139, 10.2967924742924, 13.7513874043739,
    13.2403954818698, 12.4196432226432, 12.4676109090624, 12.1390647972695,
    12.3013113392588, 12.4867673484914, 11.3693921877853, 10.6359730348998,
    10.0122721528039
  ),
  dim = c(5L, 5L),
  dimnames = list(
    c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"),
    c(
      "TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01",
      "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01"
    )
  )
)

# `hallmark.df`: one gene set per ROW (named by its row name); the columns
# hold the members of that set.
hallmark.df <- structure(
  list(
    V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"),
    V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"),
    V5 = c("ATF3", "GBE1", "IDI1", "KIF11", "JAG2"),
    V6 = c("NFKBIA", "PFKL", "FDFT1", "KIF23", "NOTCH1"),
    V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", "DLL1")
  ),
  row.names = c(
    "HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA",
    "HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE",
    "HALLMARK_WNT_BETA_CATENIN_SIGNALING"
  ),
  class = "data.frame"
)

# For every row of `hallmark.df`, keep the rows of `all.deg` whose row names
# occur among that row's values. Iterating over rows (not columns) keeps the
# result names aligned with `rownames(hallmark.df)` even when the data frame
# is not square.
output <- lapply(seq_len(nrow(hallmark.df)), function(i) {
  genes <- as.character(unlist(hallmark.df[i, ], use.names = FALSE))
  # drop = FALSE: a single match must stay a 1-row matrix, otherwise nrow()
  # below would return NULL.
  all.deg[rownames(all.deg) %in% genes, , drop = FALSE]
})
names(output) <- rownames(hallmark.df)

# Remove the gene sets without any match (zero-row matrices)
clean_list <- output[vapply(output, nrow, integer(1)) > 0L]

# Export each matrix to the global environment under its gene-set name
list2env(clean_list, envir = .GlobalEnv)
#> <environment: R_GlobalEnv>

# `pattern` is a regular expression, so anchor it instead of using a glob
ls(pattern = "^HALLMARK_")
#> [1] "HALLMARK_HYPOXIA"                    "HALLMARK_TNFA_SIGNALING_VIA_NFKB"
#> [3] "HALLMARK_WNT_BETA_CATENIN_SIGNALING"

HALLMARK_TNFA_SIGNALING_VIA_NFKB
#>       TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB         16.01696        13.56372        11.36580        13.75139
#> ALDOA        14.39831        13.61670        11.82534        13.24040
#>       TCGA.2Z.A9J7.01
#> JUNB         12.30131
#> ALDOA        12.48677
HALLMARK_HYPOXIA
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB        16.01696        13.56372         11.3658        13.75139
#>      TCGA.2Z.A9J7.01
#> JUNB        12.30131
HALLMARK_WNT_BETA_CATENIN_SIGNALING
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74         13.8584        12.22016        10.29679        12.13906
#>      TCGA.2Z.A9J7.01
#> CD74        10.01227

<sup>Created on 2023-07-18 with reprex v2.0.2</sup>


Edit 1

To keep rows of the hallmark.df dataframe where there are more than 25 non-NA columns and less than 500 non-NA columns you can use:

# Keep only the rows of `hallmark.df` with more than 25 and fewer than 500
# non-NA columns. Both bounds must hold, hence `&`: with `|` every row would
# pass, because any count is either >= 25 or <= 500.
hallmark.df %>%
  filter(rowSums(!is.na(.)) > 25 & rowSums(!is.na(.)) < 500)

答案2

得分: 1

以下是您提供的内容的中文翻译:

我们可以将行名称转换为列,然后通过 {tidyverse} 很容易实现:

# Example inputs: `all.deg` is a numeric matrix keyed by its row names;
# `hallmark.df` holds one gene set per row, named by its row name.
all.deg <- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156, 
                       12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069, 
                       12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948, 
                       11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924, 
                       13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624, 
                       12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853, 
                       10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
                         c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01", 
                                                                        "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01", 
                                                                        "TCGA.2Z.A9J7.01")))
hallmark.df <- structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"), 
                              V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"), 
                              V5 = c("ATF3", "GBE1", "IDI1", "KIF11", "JAG2"),
                              V6 = c("NFKBIA", "PFKL", "FDFT1", "KIF23", "NOTCH1"),
                              V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", "DLL1")), 
                         row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA", 
                                       "HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE", 
                                       "HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")

library(tidyverse)

# Reshape the gene sets to long form (one row per gene-set/value pair), join
# each value to the matching row of `all.deg`, then split the result back into
# one data frame per gene set. Gene sets without any match drop out at the
# inner join.
hallmark.df |> 
  as_tibble(rownames = "row") |> 
  pivot_longer(-row) |> 
  inner_join(
    all.deg |> 
      as_tibble(rownames = "value"),
    by = "value"
  ) |> 
  split(~row) |> 
  map(
    # Drop the bookkeeping columns and restore the matched key as row name
    \(gene_set) gene_set |> 
      select(-row, -name) |> 
      column_to_rownames("value")
  )
#> $HALLMARK_HYPOXIA
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB        16.01696        13.56372         11.3658        13.75139
#>      TCGA.2Z.A9J7.01
#> JUNB        12.30131
#> 
#> $HALLMARK_TNFA_SIGNALING_VIA_NFKB
#>       TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB         16.01696        13.56372        11.36580        13.75139
#> ALDOA        14.39831        13.61670        11.82534        13.24040
#>       TCGA.2Z.A9J7.01
#> JUNB         12.30131
#> ALDOA        12.48677
#> 
#> $HALLMARK_WNT_BETA_CATENIN_SIGNALING
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74         13.8584        12.22016        10.29679        12.13906
#>      TCGA.2Z.A9J7.01
#> CD74        10.01227

创建于2023年7月18日,使用reprex v2.0.2

英文:

We can turn the row names into a column, after which this is straightforward with {tidyverse}:

# Example inputs: `all.deg` is a numeric matrix keyed by its row names;
# `hallmark.df` holds one gene set per row, named by its row name.
all.deg <- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156, 
                       12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069, 
                       12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948, 
                       11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924, 
                       13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624, 
                       12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853, 
                       10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
                         c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01", 
                                                                        "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01", 
                                                                        "TCGA.2Z.A9J7.01")))
hallmark.df <- structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"), 
                              V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"), 
                              V5 = c("ATF3", "GBE1", "IDI1", "KIF11", "JAG2"),
                              V6 = c("NFKBIA", "PFKL", "FDFT1", "KIF23", "NOTCH1"),
                              V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", "DLL1")), 
                         row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA", 
                                       "HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE", 
                                       "HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")

library(tidyverse)

# Reshape the gene sets to long form (one row per gene-set/value pair), join
# each value to the matching row of `all.deg`, then split the result back into
# one data frame per gene set. Gene sets without any match drop out at the
# inner join.
hallmark.df |> 
  as_tibble(rownames = "row") |> 
  pivot_longer(-row) |> 
  inner_join(
    all.deg |> 
      as_tibble(rownames = "value"),
    by = "value"
  ) |> 
  split(~row) |> 
  map(
    # Drop the bookkeeping columns and restore the matched key as row name
    \(gene_set) gene_set |> 
      select(-row, -name) |> 
      column_to_rownames("value")
  )
#> $HALLMARK_HYPOXIA
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB        16.01696        13.56372         11.3658        13.75139
#>      TCGA.2Z.A9J7.01
#> JUNB        12.30131
#> 
#> $HALLMARK_TNFA_SIGNALING_VIA_NFKB
#>       TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB         16.01696        13.56372        11.36580        13.75139
#> ALDOA        14.39831        13.61670        11.82534        13.24040
#>       TCGA.2Z.A9J7.01
#> JUNB         12.30131
#> ALDOA        12.48677
#> 
#> $HALLMARK_WNT_BETA_CATENIN_SIGNALING
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74         13.8584        12.22016        10.29679        12.13906
#>      TCGA.2Z.A9J7.01
#> CD74        10.01227

<sup>Created on 2023-07-18 with reprex v2.0.2</sup>

huangapple
  • 本文由 发表于 2023年7月18日 07:10:18
  • 转载请务必保留本文链接:https://go.coder-hub.com/76708608.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定