英文:
Iterate over each row to obtain matches between row values and the rownames of another dataframe df2, then subset df2
问题
对于 hallmark.df 的每一行,如果 rownames(all.deg) 与该行中的任意值匹配,则提取 all.deg 中对应的行。
我还希望只保留 hallmark.df 数据框中非 NA 列数多于 25 且少于 500 的行。
subset.df <- by(hallmark.df, seq_len(nrow(hallmark.df)), function(row) ifelse(all.deg[rownames(all.deg) %in% hallmark.df,]))
输入:
预期输出:
HALLMARK_TNFA_SIGNALING_VIA_NFKB
英文:
For each row of hallmark.df
, if the rownames(all.deg)
match any of the values of the given row, retrieve the corresponding rows of all.deg
.
I also want to keep only the rows of the hallmark.df
dataframe where there are more than 25 non-NA columns and less than 500 columns
subset.df <- by(hallmark.df, seq_len(nrow(hallmark.df)), function(row)
ifelse(all.deg[rownames(all.deg) %in% hallmark.df,]))
Input:
all.deg
> dput(all.deg[1:5,1:5])
structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01",
"TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01",
"TCGA.2Z.A9J7.01")))
> dput(hallmark.df[1:5,1:5])
structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"
), V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"), V5 = c("ATF3",
"GBE1", "IDI1", "KIF11", "JAG2"), V6 = c("NFKBIA", "PFKL", "FDFT1",
"KIF23", "NOTCH1"), V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2",
"DLL1")), row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA",
"HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE",
"HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")
Expected output:
HALLMARK_TNFA_SIGNALING_VIA_NFKB
TCGA.2K.A9WE.01 | TCGA.2Z.A9J1.01 | TCGA.2Z.A9J3.01 | TCGA.2Z.A9J6.01 | TCGA.2Z.A9J7.01 | |
---|---|---|---|---|---|
JUNB | 16.0169585624867 | 13.563719599839 | 11.3657998135948 | 13.7513874043739 | 12.3013113392588 |
CD74 | 13.8584047354367 | 12.2201616898331 | 10.2967924742924 | 12.1390647972695 | 10.0122721528039 |
HALLMARK_WNT_BETA_CATENIN_SIGNALING
TCGA.2K.A9WE.01 | TCGA.2Z.A9J1.01 | TCGA.2Z.A9J3.01 | TCGA.2Z.A9J6.01 | TCGA.2Z.A9J7.01 | |
---|---|---|---|---|---|
ALDOA | 14.3983080662428 | 13.6166993468069 | 11.8253392160132 | 13.2403954818698 | 12.4867673484914 |
JUNB | 16.0169585624867 | 13.563719599839 | 11.3657998135948 | 13.7513874043739 | 12.3013113392588 |
df.list <- list(HALLMARK_TNFA_SIGNALING_VIA_NFKB, HALLMARK_WNT_BETA_CATENIN_SIGNALING)
Related question: https://stackoverflow.com/questions/76707644/how-to-subset-a-dataframe-based-on-matches-to-another-dataframe
答案1
得分: 1
这可能不是解决您问题的“最佳”方式,但这是一个潜在的解决方案:
library(tidyverse)
all.deg <- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01",
"TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01",
"TCGA.2Z.A9J7.01")))
hallmark.df <- structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"
), V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"), V5 = c("ATF3",
"GBE1", "IDI1", "KIF11", "JAG2"), V6 = c("NFKBIA", "PFKL", "FDFT1",
"KIF23", "NOTCH1"), V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2",
"DLL1")), row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA",
"HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE",
"HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")
# Build one expression sub-matrix per gene set, i.e. per ROW of hallmark.df.
# Iterating over rows (not columns) keeps every list name aligned with the
# gene set whose genes were actually matched.
output <- lapply(seq_len(nrow(hallmark.df)), function(i) {
  # Flatten row i into a plain vector of gene symbols.
  genes <- unlist(hallmark.df[i, ], use.names = FALSE)
  # drop = FALSE keeps a single-gene match as a 1-row matrix instead of
  # collapsing it to a bare vector (for which nrow() would be NULL).
  all.deg[rownames(all.deg) %in% genes, , drop = FALSE]
})
names(output) <- rownames(hallmark.df)
# Remove gene sets that matched no row of all.deg
clean_list <- output[vapply(output, nrow, integer(1)) > 0L]
# Export the remaining matrices to the global env, one object per gene set
list2env(clean_list, envir = .GlobalEnv)
#> <environment: R_GlobalEnv>
ls(pattern = "HALLMARK*")
#> [1] "HALLMARK_HYPOXIA" "HALLMARK_TNFA_SIGNALING_VIA_NFKB"
#> [3] "HALLMARK_WNT_BETA_CATENIN_SIGNALING"
HALLMARK_TNFA_SIGNALING_VIA_NFKB
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB 16.01696 13.56372 11.36580 13.75139
#> ALDOA 14.39831 13.61670 11.82534 13.24040
#> TCGA.2Z.A9J7.01
#> JUNB 12.30131
#> ALDOA 12.48677
HALLMARK_HYPOXIA
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB 16.01696 13.56372 11.3658 13.75139
#> TCGA.2Z.A9J7.01
#> JUNB 12.30131
HALLMARK_WNT_BETA_CATENIN_SIGNALING
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74 13.8584 12.22016 10.29679 12.13906
#> TCGA.2Z.A9J7.01
#> CD74 10.01227
创建于2023年7月18日,使用reprex v2.0.2
编辑 1
要保留hallmark.df数据框中具有超过25个非NA列但少于500个非NA列的行,可以使用以下代码:
# Number of non-NA entries in each row of hallmark.df
n_non_na <- rowSums(!is.na(hallmark.df))
# Both bounds must hold at the same time (&), and both are strict:
# more than 25 and fewer than 500 non-NA entries.
# drop = FALSE keeps the result a data frame; base subsetting keeps row names.
hallmark.df[n_non_na > 25 & n_non_na < 500, , drop = FALSE]
英文:
This might not be the 'best' way to solve your problem, but here is one potential solution:
library(tidyverse)
all.deg <- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01",
"TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01",
"TCGA.2Z.A9J7.01")))
hallmark.df <- structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"
), V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"), V5 = c("ATF3",
"GBE1", "IDI1", "KIF11", "JAG2"), V6 = c("NFKBIA", "PFKL", "FDFT1",
"KIF23", "NOTCH1"), V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2",
"DLL1")), row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA",
"HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE",
"HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")
# Build one expression sub-matrix per gene set, i.e. per ROW of hallmark.df.
# Iterating over rows (not columns) keeps every list name aligned with the
# gene set whose genes were actually matched.
output <- lapply(seq_len(nrow(hallmark.df)), function(i) {
  # Flatten row i into a plain vector of gene symbols.
  genes <- unlist(hallmark.df[i, ], use.names = FALSE)
  # drop = FALSE keeps a single-gene match as a 1-row matrix instead of
  # collapsing it to a bare vector (for which nrow() would be NULL).
  all.deg[rownames(all.deg) %in% genes, , drop = FALSE]
})
names(output) <- rownames(hallmark.df)
# Remove gene sets that matched no row of all.deg
clean_list <- output[vapply(output, nrow, integer(1)) > 0L]
# Export the remaining matrices to the global env, one object per gene set
list2env(clean_list, envir = .GlobalEnv)
#> <environment: R_GlobalEnv>
ls(pattern = "HALLMARK*")
#> [1] "HALLMARK_HYPOXIA" "HALLMARK_TNFA_SIGNALING_VIA_NFKB"
#> [3] "HALLMARK_WNT_BETA_CATENIN_SIGNALING"
HALLMARK_TNFA_SIGNALING_VIA_NFKB
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB 16.01696 13.56372 11.36580 13.75139
#> ALDOA 14.39831 13.61670 11.82534 13.24040
#> TCGA.2Z.A9J7.01
#> JUNB 12.30131
#> ALDOA 12.48677
HALLMARK_HYPOXIA
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB 16.01696 13.56372 11.3658 13.75139
#> TCGA.2Z.A9J7.01
#> JUNB 12.30131
HALLMARK_WNT_BETA_CATENIN_SIGNALING
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74 13.8584 12.22016 10.29679 12.13906
#> TCGA.2Z.A9J7.01
#> CD74 10.01227
<sup>Created on 2023-07-18 with reprex v2.0.2</sup>
Edit 1
To keep rows of the hallmark.df dataframe where there are more than 25 non-NA columns and less than 500 non-NA columns you can use:
# Number of non-NA entries in each row of hallmark.df
n_non_na <- rowSums(!is.na(hallmark.df))
# Both bounds must hold at the same time (&), and both are strict:
# more than 25 and fewer than 500 non-NA entries.
# drop = FALSE keeps the result a data frame; base subsetting keeps row names.
hallmark.df[n_non_na > 25 & n_non_na < 500, , drop = FALSE]
答案2
得分: 1
我们可以将行名转换为一列,然后借助 {tidyverse} 就能轻松实现:
all.deg <- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01",
"TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01",
"TCGA.2Z.A9J7.01")))
hallmark.df <- structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"),
V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"),
V5 = c("ATF3", "GBE1", "IDI1", "KIF11", "JAG2"),
V6 = c("NFKBIA", "PFKL", "FDFT1", "KIF23", "NOTCH1"),
V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", "DLL1")),
row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA",
"HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE",
"HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")
library(tidyverse)
# Reshape hallmark.df to one row per (gene set, gene), attach the expression
# values of every gene that is present in all.deg, then split the result back
# into one data frame per gene set.
hallmark.df |>
  as_tibble(rownames = "row") |>
  pivot_longer(-row) |>
  inner_join(as_tibble(all.deg, rownames = "value"), by = "value") |>
  split(~row) |>
  map(\(gene_set) {
    # Drop the helper columns and turn the gene symbol back into row names.
    gene_set |>
      select(-row, -name) |>
      column_to_rownames("value")
  })
#> $HALLMARK_HYPOXIA
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB 16.01696 13.56372 11.3658 13.75139
#> TCGA.2Z.A9J7.01
#> JUNB 12.30131
#>
#> $HALLMARK_TNFA_SIGNALING_VIA_NFKB
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB 16.01696 13.56372 11.36580 13.75139
#> ALDOA 14.39831 13.61670 11.82534 13.24040
#> TCGA.2Z.A9J7.01
#> JUNB 12.30131
#> ALDOA 12.48677
#>
#> $HALLMARK_WNT_BETA_CATENIN_SIGNALING
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74 13.8584 12.22016 10.29679 12.13906
#> TCGA.2Z.A9J7.01
#> CD74 10.01227
创建于2023年7月18日,使用reprex v2.0.2
英文:
We can transform row names as a column, then it is easily done by {tidyverse}:
all.deg <- structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156,
12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069,
12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948,
11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924,
13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624,
12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853,
10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01",
"TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01",
"TCGA.2Z.A9J7.01")))
hallmark.df <- structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"),
V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"),
V5 = c("ATF3", "GBE1", "IDI1", "KIF11", "JAG2"),
V6 = c("NFKBIA", "PFKL", "FDFT1", "KIF23", "NOTCH1"),
V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", "DLL1")),
row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA",
"HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE",
"HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")
library(tidyverse)
# Reshape hallmark.df to one row per (gene set, gene), attach the expression
# values of every gene that is present in all.deg, then split the result back
# into one data frame per gene set.
hallmark.df |>
  as_tibble(rownames = "row") |>
  pivot_longer(-row) |>
  inner_join(as_tibble(all.deg, rownames = "value"), by = "value") |>
  split(~row) |>
  map(\(gene_set) {
    # Drop the helper columns and turn the gene symbol back into row names.
    gene_set |>
      select(-row, -name) |>
      column_to_rownames("value")
  })
#> $HALLMARK_HYPOXIA
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB 16.01696 13.56372 11.3658 13.75139
#> TCGA.2Z.A9J7.01
#> JUNB 12.30131
#>
#> $HALLMARK_TNFA_SIGNALING_VIA_NFKB
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB 16.01696 13.56372 11.36580 13.75139
#> ALDOA 14.39831 13.61670 11.82534 13.24040
#> TCGA.2Z.A9J7.01
#> JUNB 12.30131
#> ALDOA 12.48677
#>
#> $HALLMARK_WNT_BETA_CATENIN_SIGNALING
#> TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74 13.8584 12.22016 10.29679 12.13906
#> TCGA.2Z.A9J7.01
#> CD74 10.01227
<sup>Created on 2023-07-18 with reprex v2.0.2</sup>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论