Iterate over each row to obtain matches between row values and the rownames of another dataframe df2, then subset df2

huangapple go评论96阅读模式
英文:

Iterate over each row to obtain matches between row values and the rownames of another dataframe df2, then subset df2

问题

以下是您要翻译的内容:

"For each row of hallmark.df, if the rownames(all.deg) match any of the values of the given row, retrieve the corresponding rows of all.deg."
"I also want to keep only the rows of the hallmark.df dataframe where there are more than 25 non-NA columns and less than 500 columns."

"subset.df <- by(hallmark.df, seq_len(nrow(hallmark.df)), function(row) ifelse(all.deg[rownames(all.deg) %in% hallmark.df,]))"

"Input:"

"Expected output:"

"HALLMARK_TNFA_SIGNALING_VIA_NFKB"

"相关问题: https://stackoverflow.com/questions/76707644/how-to-subset-a-dataframe-based-on-matches-to-another-dataframe"

请告诉我,您是否需要进一步的帮助或有其他问题。

英文:

For each row of hallmark.df, if the rownames(all.deg) match any of the values of the given row, retrieve the corresponding rows of all.deg.
I also want to keep only the rows of the hallmark.df dataframe that have more than 25 and fewer than 500 non-NA columns.

  1. subset.df &lt;- by(hallmark.df, seq_len(nrow(hallmark.df)), function(row)
  2. ifelse(all.deg[rownames(all.deg) %in% hallmark.df,]))

Input:

all.deg

  1. &gt; dput(all.deg[1:5,1:5])
  2. structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
  3. 12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
  4. 12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
  5. 11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
  6. 13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
  7. 12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
  8. 10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
  9. c(&quot;JUNB&quot;, &quot;ALDOA&quot;, &quot;HLA.A&quot;, &quot;THBD&quot;, &quot;CD74&quot;), c(&quot;TCGA.2K.A9WE.01&quot;,
  10. &quot;TCGA.2Z.A9J1.01&quot;, &quot;TCGA.2Z.A9J3.01&quot;, &quot;TCGA.2Z.A9J6.01&quot;,
  11. &quot;TCGA.2Z.A9J7.01&quot;)))
  12. &gt; dput(hallmark.df[1:5,1:5])
  13. structure(list(V3 = c(&quot;JUNB&quot;, &quot;PGK1&quot;, &quot;FDPS&quot;, &quot;ARHGEF2&quot;, &quot;CD74&quot;
  14. ), V4 = c(&quot;CXCL2&quot;, &quot;PDK1&quot;, &quot;CYP51A1&quot;, &quot;CLASP1&quot;, &quot;CTNNB1&quot;), V5 = c(&quot;ATF3&quot;,
  15. &quot;GBE1&quot;, &quot;IDI1&quot;, &quot;KIF11&quot;, &quot;JAG2&quot;), V6 = c(&quot;NFKBIA&quot;, &quot;PFKL&quot;, &quot;FDFT1&quot;,
  16. &quot;KIF23&quot;, &quot;NOTCH1&quot;), V7 = c(&quot;ALDOA&quot;, &quot;JUNB&quot;, &quot;DHCR7&quot;, &quot;ALS2&quot;,
  17. &quot;DLL1&quot;)), row.names = c(&quot;HALLMARK_TNFA_SIGNALING_VIA_NFKB&quot;, &quot;HALLMARK_HYPOXIA&quot;,
  18. &quot;HALLMARK_CHOLESTEROL_HOMEOSTASIS&quot;, &quot;HALLMARK_MITOTIC_SPINDLE&quot;,
  19. &quot;HALLMARK_WNT_BETA_CATENIN_SIGNALING&quot;), class = &quot;data.frame&quot;)

Expected output:

HALLMARK_TNFA_SIGNALING_VIA_NFKB

TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01 TCGA.2Z.A9J7.01
JUNB 16.0169585624867 13.563719599839 11.3657998135948 13.7513874043739 12.3013113392588
CD74 13.8584047354367 12.2201616898331 10.2967924742924 12.1390647972695 10.0122721528039

HALLMARK_WNT_BETA_CATENIN_SIGNALING

TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01 TCGA.2Z.A9J7.01
ALDOA 14.3983080662428 13.6166993468069 11.8253392160132 13.2403954818698 12.4867673484914
JUNB 16.0169585624867 13.563719599839 11.3657998135948 13.7513874043739 12.3013113392588
  1. df.list &lt;- list(HALLMARK_TNFA_SIGNALING_VIA_NFKB, HALLMARK_WNT_BETA_CATENIN_SIGNALING)

Related question: https://stackoverflow.com/questions/76707644/how-to-subset-a-dataframe-based-on-matches-to-another-dataframe

答案1

得分: 1

这可能不是解决您问题的“最佳”方式,但这是一个潜在的解决方案:

  1. library(tidyverse)
  2. all.deg <- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
  3. 12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
  4. 12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
  5. 11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
  6. 13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
  7. 12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
  8. 10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
  9. c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01",
  10. "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01",
  11. "TCGA.2Z.A9J7.01")))
  12. hallmark.df <- structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"
  13. ), V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"), V5 = c("ATF3",
  14. "GBE1", "IDI1", "KIF11", "JAG2"), V6 = c("NFKBIA", "PFKL", "FDFT1",
  15. "KIF23", "NOTCH1"), V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2",
  16. "DLL1")), row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA",
  17. "HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE",
  18. "HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")
  19. output <- list()
  20. for (i in seq_along(hallmark.df)) {
  21. output[[rownames(hallmark.df)[i]]] <- all.deg[rownames(all.deg) %in% hallmark.df[,i],]
  22. }
  23. # 从列表中移除空数据框
  24. clean_list <- output[which(lapply(output, nrow) != 0)]
  25. # 将数据框导出到全局环境
  26. list2env(clean_list, envir = .GlobalEnv)
  27. #> <environment: R_GlobalEnv>
  28. ls(pattern = "HALLMARK*")
  29. #> [1] "HALLMARK_TNFA_SIGNALING_VIA_NFKB" "HALLMARK_WNT_BETA_CATENIN_SIGNALING"
  30. HALLMARK_TNFA_SIGNALING_VIA_NFKB
  31. #> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  32. #> JUNB 16.01696 13.56372 11.36580 13.75139
  33. #> CD74 13.85840 12.22016 10.29679 12.13906
  34. #> TCGA.2Z.A9J7.01
  35. #> JUNB 12.30131
  36. #> CD74 10.01227
  37. HALLMARK_WNT_BETA_CATENIN_SIGNALING
  38. #> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  39. #> JUNB 16.01696 13.56372 11.36580 13.75139
  40. #> ALDOA 14.39831 13.61670 11.82534 13.24040
  41. #> TCGA.2Z.A9J7.01
  42. #> JUNB 12.30131
  43. #> ALDOA 12.48677

创建于2023年7月18日,使用reprex v2.0.2


编辑 1

要保留hallmark.df数据框中具有超过25个非NA列但少于500个非NA列的行,可以使用以下代码:

  1. hallmark %>%
  2. filter(rowSums(!is.na(.)) >= 25 | rowSums(!is.na(.)) <= 500)
英文:

This might not be the 'best' way to solve your problem, but here is one potential solution:

  1. library(tidyverse)
  2. all.deg &lt;- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
  3. 12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
  4. 12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
  5. 11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
  6. 13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
  7. 12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
  8. 10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
  9. c(&quot;JUNB&quot;, &quot;ALDOA&quot;, &quot;HLA.A&quot;, &quot;THBD&quot;, &quot;CD74&quot;), c(&quot;TCGA.2K.A9WE.01&quot;,
  10. &quot;TCGA.2Z.A9J1.01&quot;, &quot;TCGA.2Z.A9J3.01&quot;, &quot;TCGA.2Z.A9J6.01&quot;,
  11. &quot;TCGA.2Z.A9J7.01&quot;)))
  12. hallmark.df &lt;- structure(list(V3 = c(&quot;JUNB&quot;, &quot;PGK1&quot;, &quot;FDPS&quot;, &quot;ARHGEF2&quot;, &quot;CD74&quot;
  13. ), V4 = c(&quot;CXCL2&quot;, &quot;PDK1&quot;, &quot;CYP51A1&quot;, &quot;CLASP1&quot;, &quot;CTNNB1&quot;), V5 = c(&quot;ATF3&quot;,
  14. &quot;GBE1&quot;, &quot;IDI1&quot;, &quot;KIF11&quot;, &quot;JAG2&quot;), V6 = c(&quot;NFKBIA&quot;, &quot;PFKL&quot;, &quot;FDFT1&quot;,
  15. &quot;KIF23&quot;, &quot;NOTCH1&quot;), V7 = c(&quot;ALDOA&quot;, &quot;JUNB&quot;, &quot;DHCR7&quot;, &quot;ALS2&quot;,
  16. &quot;DLL1&quot;)), row.names = c(&quot;HALLMARK_TNFA_SIGNALING_VIA_NFKB&quot;, &quot;HALLMARK_HYPOXIA&quot;,
  17. &quot;HALLMARK_CHOLESTEROL_HOMEOSTASIS&quot;, &quot;HALLMARK_MITOTIC_SPINDLE&quot;,
  18. &quot;HALLMARK_WNT_BETA_CATENIN_SIGNALING&quot;), class = &quot;data.frame&quot;)
  19. output &lt;- list()
  20. for (i in seq_along(hallmark.df)) {
  21. output[[rownames(hallmark.df)[i]]] &lt;- all.deg[rownames(all.deg) %in% hallmark.df[,i],]
  22. }
  23. # Remove empty dataframes from the list
  24. clean_list &lt;- output[which(lapply(output, nrow) != 0)]
  25. # export the dataframes to the global env
  26. list2env(clean_list, envir = .GlobalEnv)
  27. #&gt; &lt;environment: R_GlobalEnv&gt;
  28. ls(pattern = &quot;HALLMARK*&quot;)
  29. #&gt; [1] &quot;HALLMARK_TNFA_SIGNALING_VIA_NFKB&quot; &quot;HALLMARK_WNT_BETA_CATENIN_SIGNALING&quot;
  30. HALLMARK_TNFA_SIGNALING_VIA_NFKB
  31. #&gt; TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  32. #&gt; JUNB 16.01696 13.56372 11.36580 13.75139
  33. #&gt; CD74 13.85840 12.22016 10.29679 12.13906
  34. #&gt; TCGA.2Z.A9J7.01
  35. #&gt; JUNB 12.30131
  36. #&gt; CD74 10.01227
  37. HALLMARK_WNT_BETA_CATENIN_SIGNALING
  38. #&gt; TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  39. #&gt; JUNB 16.01696 13.56372 11.36580 13.75139
  40. #&gt; ALDOA 14.39831 13.61670 11.82534 13.24040
  41. #&gt; TCGA.2Z.A9J7.01
  42. #&gt; JUNB 12.30131
  43. #&gt; ALDOA 12.48677

<sup>Created on 2023-07-18 with reprex v2.0.2</sup>


Edit 1

To keep rows of the hallmark.df dataframe where there are more than 25 non-NA columns and less than 500 non-NA columns you can use:

  1. hallmark %&gt;%
  2. filter(rowSums(!is.na(.)) &gt;= 25 | rowSums(!is.na(.)) &lt;= 500)

答案2

得分: 1

以下是您提供的内容的中文翻译:

我们可以将行名称转换为列,然后通过 {tidyverse} 很容易实现:

  1. all.deg &lt;- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
  2. 12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
  3. 12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
  4. 11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
  5. 13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
  6. 12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
  7. 10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
  8. c(&quot;JUNB&quot;, &quot;ALDOA&quot;, &quot;HLA.A&quot;, &quot;THBD&quot;, &quot;CD74&quot;), c(&quot;TCGA.2K.A9WE.01&quot;,
  9. &quot;TCGA.2Z.A9J1.01&quot;, &quot;TCGA.2Z.A9J3.01&quot;, &quot;TCGA.2Z.A9J6.01&quot;,
  10. &quot;TCGA.2Z.A9J7.01&quot;)))
  11. hallmark.df &lt;- structure(list(V3 = c(&quot;JUNB&quot;, &quot;PGK1&quot;, &quot;FDPS&quot;, &quot;ARHGEF2&quot;, &quot;CD74&quot;),
  12. V4 = c(&quot;CXCL2&quot;, &quot;PDK1&quot;, &quot;CYP51A1&quot;, &quot;CLASP1&quot;, &quot;CTNNB1&quot;),
  13. V5 = c(&quot;ATF3&quot;, &quot;GBE1&quot;, &quot;IDI1&quot;, &quot;KIF11&quot;, &quot;JAG2&quot;),
  14. V6 = c(&quot;NFKBIA&quot;, &quot;PFKL&quot;, &quot;FDFT1&quot;, &quot;KIF23&quot;, &quot;NOTCH1&quot;),
  15. V7 = c(&quot;ALDOA&quot;, &quot;JUNB&quot;, &quot;DHCR7&quot;, &quot;ALS2&quot;, &quot;DLL1&quot;)),
  16. row.names = c(&quot;HALLMARK_TNFA_SIGNALING_VIA_NFKB&quot;, &quot;HALLMARK_HYPOXIA&quot;,
  17. &quot;HALLMARK_CHOLESTEROL_HOMEOSTASIS&quot;, &quot;HALLMARK_MITOTIC_SPINDLE&quot;,
  18. &quot;HALLMARK_WNT_BETA_CATENIN_SIGNALING&quot;), class = &quot;data.frame&quot;)
  19. library(tidyverse)
  20. hallmark.df |&gt;
  21. as_tibble(rownames = &quot;row&quot;) |&gt;
  22. pivot_longer(-row) |&gt;
  23. inner_join(
  24. all.deg |&gt;
  25. as_tibble(rownames = &quot;value&quot;),
  26. by = &quot;value&quot;
  27. ) |&gt;
  28. split(~row) |&gt;
  29. map(
  30. ~ . |&gt;
  31. select(-row, -name) |&gt;
  32. column_to_rownames(&quot;value&quot;)
  33. )
  34. #&gt; $HALLMARK_HYPOXIA
  35. #&gt; TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  36. #&gt; JUNB 16.01696 13.56372 11.3658 13.75139
  37. #&gt; TCGA.2Z.A9J7.01
  38. #&gt; JUNB 12.30131
  39. #&gt;
  40. #&gt; $HALLMARK_TNFA_SIGNALING_VIA_NFKB
  41. #&gt; TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  42. #&gt; JUNB 16.01696 13.56372 11.36580 13.75139
  43. #&gt; ALDOA 14.39831 13.61670 11.82534 13.24040
  44. #&gt; TCGA.2Z.A9J7.01
  45. #&gt; JUNB 12.30131
  46. #&gt; ALDOA 12.48677
  47. #&gt;
  48. #&gt; $HALLMARK_WNT_BETA_CATENIN_SIGNALING
  49. #&gt; TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  50. #&gt; CD74 13.8584 12.22016 10.29679 12.13906
  51. #&gt; TCGA.2Z.A9J7.01
  52. #&gt; CD74 10.01227

创建于2023年7月18日,使用reprex v2.0.2

英文:

We can transform row names as a column, then it is easily done by {tidyverse}:

  1. all.deg &lt;- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
  2. 12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
  3. 12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
  4. 11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
  5. 13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
  6. 12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
  7. 10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
  8. c(&quot;JUNB&quot;, &quot;ALDOA&quot;, &quot;HLA.A&quot;, &quot;THBD&quot;, &quot;CD74&quot;), c(&quot;TCGA.2K.A9WE.01&quot;,
  9. &quot;TCGA.2Z.A9J1.01&quot;, &quot;TCGA.2Z.A9J3.01&quot;, &quot;TCGA.2Z.A9J6.01&quot;,
  10. &quot;TCGA.2Z.A9J7.01&quot;)))
  11. hallmark.df &lt;- structure(list(V3 = c(&quot;JUNB&quot;, &quot;PGK1&quot;, &quot;FDPS&quot;, &quot;ARHGEF2&quot;, &quot;CD74&quot;),
  12. V4 = c(&quot;CXCL2&quot;, &quot;PDK1&quot;, &quot;CYP51A1&quot;, &quot;CLASP1&quot;, &quot;CTNNB1&quot;),
  13. V5 = c(&quot;ATF3&quot;, &quot;GBE1&quot;, &quot;IDI1&quot;, &quot;KIF11&quot;, &quot;JAG2&quot;),
  14. V6 = c(&quot;NFKBIA&quot;, &quot;PFKL&quot;, &quot;FDFT1&quot;, &quot;KIF23&quot;, &quot;NOTCH1&quot;),
  15. V7 = c(&quot;ALDOA&quot;, &quot;JUNB&quot;, &quot;DHCR7&quot;, &quot;ALS2&quot;, &quot;DLL1&quot;)),
  16. row.names = c(&quot;HALLMARK_TNFA_SIGNALING_VIA_NFKB&quot;, &quot;HALLMARK_HYPOXIA&quot;,
  17. &quot;HALLMARK_CHOLESTEROL_HOMEOSTASIS&quot;, &quot;HALLMARK_MITOTIC_SPINDLE&quot;,
  18. &quot;HALLMARK_WNT_BETA_CATENIN_SIGNALING&quot;), class = &quot;data.frame&quot;)
  19. library(tidyverse)
  20. hallmark.df |&gt;
  21. as_tibble(rownames = &quot;row&quot;) |&gt;
  22. pivot_longer(-row) |&gt;
  23. inner_join(
  24. all.deg |&gt;
  25. as_tibble(rownames = &quot;value&quot;),
  26. by = &quot;value&quot;
  27. ) |&gt;
  28. split(~row) |&gt;
  29. map(
  30. ~ . |&gt;
  31. select(-row, -name) |&gt;
  32. column_to_rownames(&quot;value&quot;)
  33. )
  34. #&gt; $HALLMARK_HYPOXIA
  35. #&gt; TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  36. #&gt; JUNB 16.01696 13.56372 11.3658 13.75139
  37. #&gt; TCGA.2Z.A9J7.01
  38. #&gt; JUNB 12.30131
  39. #&gt;
  40. #&gt; $HALLMARK_TNFA_SIGNALING_VIA_NFKB
  41. #&gt; TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  42. #&gt; JUNB 16.01696 13.56372 11.36580 13.75139
  43. #&gt; ALDOA 14.39831 13.61670 11.82534 13.24040
  44. #&gt; TCGA.2Z.A9J7.01
  45. #&gt; JUNB 12.30131
  46. #&gt; ALDOA 12.48677
  47. #&gt;
  48. #&gt; $HALLMARK_WNT_BETA_CATENIN_SIGNALING
  49. #&gt; TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
  50. #&gt; CD74 13.8584 12.22016 10.29679 12.13906
  51. #&gt; TCGA.2Z.A9J7.01
  52. #&gt; CD74 10.01227

<sup>Created on 2023-07-18 with reprex v2.0.2</sup>

huangapple
  • 本文由 发表于 2023年7月18日 07:10:18
  • 转载请务必保留本文链接:https://go.coder-hub.com/76708608.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定