英文:
Error in read.metharray(basenames = files, extended = extended, verbose = verbose, :
问题
我只会提供代码的中文翻译,不提供其他内容。以下是您的代码的中文翻译:
# 我想保留仅在第一个下划线之前具有重复子字符串的文件。
# 例如,`abc_xyz_123.idat` 和 `abc_xyz_456.idat` 是一对,而 `abc_xyz_123.idat` 和 `abc_mno_123.idat` 不是。这些对是随机排序的。
# `base` 变量包含没有 `_*.idat` 的重复值,而 `sample.sheet` 中的对应 `Sample.ID` 必须与 `clin.info` 中的 `Sample.ID` 匹配。
# 然后将匹配的 `clin.info` 用作 [`missMethyl`][1] 的输入,该输入需要一个基名列,告诉我们要读取的 idat 文件位于何处。`Basename` 列包含 `filename.substr`。
library(missMethyl)
matched.filenames <- sample.sheet$File.Name[match(clin.info$Sample.ID, sample.sheet$Sample.ID)]
filename.substr <- stringr::str_extract(matched.filenames, "[^_]*")
clin.info$Basename <- file.path(paste0(getwd(), "/idat"), filename.substr)
sample.sheet$Basename <- clin.info$Basename[match(clin.info$Sample.ID, sample.sheet$Sample.ID)]
sample.sheet <- sample.sheet[!(is.na(sample.sheet$Basename)) | !(duplicated(sample.sheet$Basename))] # 如果Basename为NA或不重复,则删除行
clin.info <- clin.info[clin.info$Sample.ID %in% sample.sheet$Sample.ID,]
# 运行 missMethyl
rgSet <- read.metharray.exp(targets=clin.info)
# 异常回溯
# 出现以下错误:
# Error in read.metharray(basenames = files, extended = extended, verbose = verbose, :
# The following specified files do not exist:
# [此错误消息后面跟着Basename中的所有值的名称]
# 示例
# 打印 sample.sheet 和 clin.info 的内容
dput(sample.sheet)
dput(clin.info)
英文:
I want to keep only files with duplicated substrings before the first underscore.
For example, abc_xyz_123.idat
and abc_xyz_456.idat
are pairs, whereas abc_xyz_123.idat
and abc_mno_123.idat
are not. The pairs are in random order.
The base
variable contains the duplicated values without the _*.idat
and the corresponding Sample.ID
in sample.sheet
must match the Sample.ID
in clin.info
.
The matching clin.info
is then used as an input for missMethyl
, which requires a basename column that tells us where the idat files to be read are located. The Basename
column contains the filename.substr
.
library(missMethyl)
matched.filenames <- sample.sheet$File.Name[match(clin.info$Sample.ID, sample.sheet$Sample.ID)]
filename.substr <- stringr::str_extract(matched.filenames, "[^_]*")
clin.info$Basename <- file.path(paste0(getwd(), "/idat"), filename.substr)
sample.sheet$Basename <- clin.info$Basename[match(clin.info$Sample.ID, sample.sheet$Sample.ID)]
sample.sheet <- sample.sheet[!(is.na(sample.sheet$Basename)) || !(duplicated(sample.sheet$Basename))] # Delete row if Basename is NA or not duplicated
clin.info <- clin.info[clin.info$Sample.ID %in% sample.sheet$Sample.ID,]
Run missMethyl:
rgSet <- read.metharray.exp(targets=clin.info)
Traceback:
Error in read.metharray(basenames = files, extended = extended, verbose = verbose, :
The following specified files do not exist:
[This error message is followed by the names of all the values in Basename]
Example:
> dput(sample.sheet)
structure(list(File.Name = c("e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Red.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Grn.idat", "e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Grn.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Red.idat", "baed8332-9bc2-46d1-954e-275786b88a94_noid_Grn.idat"
), Sample.ID = c("TCGA.BQ.7059.01A", "TCGA.UZ.A9PM.01A", "TCGA.BQ.7059.01A",
"TCGA.UZ.A9PM.01A", "TCGA.B9.5156.01A")), class = "data.frame", row.names = c(1L,
4L, 5L, 6L, 7L))
> dput(clin.info)
structure(list(subtype = c("2a", "1a", "1b", "1a", "1a", "1b",
"2b", "2b", "2a", "1a", "2b", "1b", "1c", "2b", "2a", "2b", "1b",
"2b", "2b", "1c", "1c", "2b", "1c", "2c", "1a", "1a", "2b", "1c",
"1a", "1c", "2b", "2a", "2c", "2a", "1a", "1a", "1b", "1a", "1b",
"1a", "2a", "1a", "1c", "1a", "1c", "2b", "1a", "2a", "1c", "2a",
"1a", "2b", "2b", "2c", "2b", "2c", "2b", "2a", "2b", "2a", "2b",
"2a", "2a", "2a", "2b", "2b", "2b", "2c", "2c", "2b", "2b", "2b",
"2a", "2b", "1c", "2b", "2a", "2b", "2b", "2a", "2b", "2b", "1a",
"1b", "1c", "1a", "1a", "2a", "1c", "1a", "1a", "1c", "2a", "2b",
"2a", "1c", "2a", "1a", "1a", "1c", "1a", "2c", "1a", "1b", "2c",
"2a", "2c", "2b", "1a", "2c", "1a", "1a", "1c", "2b", "1a", "1c",
"1b", "1c", "2c", "1c", "2b", "1a", "2c", "1c", "2a", "1c", "1c",
"2a", "2b", "2a", "1c", "1b", "1c", "2a", "1a", "1a", "2b", "2c",
"2a", "1a", "2b", "1c", "2a", "1a", "2c", "2c", "2a", "1b", "1b",
"1b", "2b", "1a", "2b", "2b", "1a", "2c", "2c", "2b", "2a", "1c",
"2b", "2a", "1b", "2b", "1b", "1a", "2b", "1a", "2b", "1a", "1c",
"1a", "1b", "1c", "1c", "1a", "1b", "2a", "2a", "2b", "1c", "1c",
"1c", "1c", "1c", "1c", "1c", "1b", "2b", "2a", "2a", "1c", "1a",
"1a", "1a", "1a", "2b", "1c", "2a"), Sample.ID = c("TCGA.2K.A9WE.01A",
"TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A", "TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A",
"TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A", "TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A",
"TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A", "TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A",
"TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A", "TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A",
"TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A", "TCGA.A4.7583.01A", "TCGA.A4.7584.01A",
"TCGA.A4.7585.01A", "TCGA.A4.7734.01A", "TCGA.A4.7915.01A", "TCGA.A4.7996.01A",
"TCGA.A4.7997.01A", "TCGA.A4.8098.01A", "TCGA.A4.8311.01A", "TCGA.A4.8517.01A",
"TCGA.A4.8630.01A", "TCGA.A4.A57E.01A", "TCGA.AL.7173.01A", "TCGA.AL.A5DJ.01A",
"TCGA.B1.5398.01A", "TCGA.B1.7332.01A", "TCGA.B1.A47M.01A", "TCGA.B1.A47N.01A",
"TCGA.B1.A47O.01A", "TCGA.B1.A654.01A", "TCGA.B1.A657.01A", "TCGA.B3.8121.01A",
"TCGA.B3.A6W5.01A", "TCGA.B9.5155.01A", "TCGA.B9.5156.01A", "TCGA.B9.7268.01A",
"TCGA.B9.A44B.01A", "TCGA.B9.A5W7.01A", "TCGA.B9.A5W8.01A", "TCGA.B9.A5W9.01A",
"TCGA.B9.A69E.01A", "TCGA.B9.A8YH.01A", "TCGA.B9.A8YI.01A", "TCGA.BQ.5875.01A",
"TCGA.BQ.5877.01A", "TCGA.BQ.5878.01A", "TCGA.BQ.5879.01A", "TCGA.BQ.5880.01A",
"TCGA.BQ.5881.01A", "TCGA.BQ.5882.01A", "TCGA.BQ.5883.01A", "TCGA.BQ.5885.01A",
"TCGA.BQ.5887.01A", "TCGA.BQ.5888.01A", "TCGA.BQ.5889.01A", "TCGA.BQ.5890.01A",
"TCGA.BQ.5891.01A", "TCGA.BQ.5892.01A", "TCGA.BQ.5893.01A", "TCGA.BQ.5894.01A",
"TCGA.BQ.7044.01A", "TCGA.BQ.7046.01A", "TCGA.BQ.7048.01A", "TCGA.BQ.7049.01A",
"TCGA.BQ.7050.01A", "TCGA.BQ.7051.01A", "TCGA.BQ.7053.01A", "TCGA.BQ.7055.01A",
"TCGA.BQ.7056.01A", "TCGA.BQ.7058.01A", "TCGA.BQ.7059.01A", "TCGA.BQ.7060.01A",
"TCGA.BQ.7061.01A", "TCGA.BQ.7062.01A", "TCGA.DW.5560.01A", "TCGA.DW.5561.01A",
"TCGA.DW.7834.01A", "TCGA.DW.7837.01A", "TCGA.DW.7838.01A", "TCGA.DW.7839.01A",
"TCGA.DW.7840.01A", "TCGA.DW.7841.01A", "TCGA.DW.7842.01A", "TCGA.DW.7963.01B",
"TCGA.DZ.6131.01A", "TCGA.DZ.6132.01A", "TCGA.DZ.6133.01A", "TCGA.DZ.6134.01A",
"TCGA.DZ.6135.01A", "TCGA.EV.5901.01A", "TCGA.EV.5902.01A", "TCGA.EV.5903.01A",
"TCGA.F9.A4JJ.01A", "TCGA.F9.A7Q0.01A", "TCGA.F9.A7VF.01A", "TCGA.F9.A8NY.01A",
"TCGA.F9.A97G.01A", "TCGA.G7.6789.01A", "TCGA.G7.6790.01A", "TCGA.G7.6792.01A",
"TCGA.G7.6793.01A", "TCGA.G7.6795.01A", "TCGA.G7.6796.01A", "TCGA.G7.6797.01A",
"TCGA.G7.7501.01A", "TCGA.G7.7502.01A", "TCGA.G7.A4TM.01A", "TCGA.G7.A8LB.01A",
"TCGA.G7.A8LC.01A", "TCGA.G7.A8LD.01A", "TCGA.G7.A8LE.01A", "TCGA.GL.6846.01A",
"TCGA.GL.7773.01A", "TCGA.GL.7966.01A", "TCGA.GL.8500.01A", "TCGA.GL.A4EM.01A",
"TCGA.GL.A59R.01A", "TCGA.GL.A9DC.01A", "TCGA.GL.A9DD.01A", "TCGA.GL.A9DE.01A",
"TCGA.HE.A5NF.01A", "TCGA.HE.A5NH.01A", "TCGA.HE.A5NI.01A", "TCGA.HE.A5NJ.01A",
"TCGA.HE.A5NK.01A", "TCGA.HE.A5NL.01A", "TCGA.IA.A40U.01A", "TCGA.IA.A40X.01A",
"TCGA.IA.A40Y.01A", "TCGA.IA.A83V.01A", "TCGA.IA.A83W.01A", "TCGA.IZ.8195.01A",
"TCGA.IZ.8196.01A", "TCGA.IZ.A6M8.01A", "TCGA.J7.6720.01A", "TCGA.J7.8537.01A",
"TCGA.J7.A8I2.01A", "TCGA.KV.A6GD.01A", "TCGA.KV.A6GE.01A", "TCGA.KV.A74V.01A",
"TCGA.MH.A55Z.01A", "TCGA.MH.A560.01A", "TCGA.MH.A562.01A", "TCGA.MH.A855.01A",
"TCGA.P4.A5E6.01A", "TCGA.P4.A5E7.01A", "TCGA.P4.A5E8.01A", "TCGA.P4.A5EA.01A",
"TCGA.P4.A5EB.01A", "TCGA.P4.A5ED.01A", "TCGA.P4.AAVL.01A", "TCGA.P4.AAVM.01A",
"TCGA.PJ.A5Z8.01A", "TCGA.PJ.A5Z9.01A", "TCGA.Q2.A5QZ.01A", "TCGA.SX.A71R.01A",
"TCGA.SX.A71U.01A", "TCGA.SX.A7SM.01A", "TCGA.SX.A7SN.01A", "TCGA.SX.A7SO.01A",
"TCGA.SX.A7SP.01A", "TCGA.SX.A7SQ.01A", "TCGA.SX.A7SR.01A", "TCGA.SX.A7SS.01A",
"TCGA.UN.AAZ9.01A", "TCGA.UZ.A9PJ.01A", "TCGA.UZ.A9PK.01A", "TCGA.UZ.A9PL.01A",
"TCGA.UZ.A9PM.01A", "TCGA.UZ.A9PN.01A", "TCGA.UZ.A9PO.01A", "TCGA.UZ.A9PP.01A",
"TCGA.UZ.A9PR.01A", "TCGA.UZ.A9PS.01A", "TCGA.UZ.A9PU.01A", "TCGA.UZ.A9PV.01A",
"TCGA.UZ.A9PX.01A", "TCGA.UZ.A9PZ.01A", "TCGA.UZ.A9Q0.01A", "TCGA.UZ.A9Q1.01A",
"TCGA.V9.A7HT.01A", "TCGA.WN.A9G9.01A", "TCGA.Y8.A894.01A", "TCGA.Y8.A895.01A",
"TCGA.Y8.A896.01A", "TCGA.Y8.A897.01A", "TCGA.Y8.A8RY.01A", "TCGA.Y8.A8RZ.01A",
"TCGA.Y8.A8S0.01A", "TCGA.Y8.A8S1.01A")), class = "data.frame", row.names = c(NA,
-199L))
答案1
得分: 1
对于你的原问题的回答:以下是一种快速提取那些在第一个“_”之前是重复的前缀的方法:
# 加载必要的包。
library(dplyr)
library(stringr)
# ...
# 加载数据,与`sample.sheet`一样。
# ...
# 组装一个包含重复文件的列。
sample.sheet %>%
# 添加一个列,其中包含“_”之前的所有内容(前缀)。
mutate(file_prefix = str_extract(File.Name, "^[^_]*")) %>%
# 计算每个前缀的出现次数。
group_by(file_prefix) %>%
summarize(n = n()) %>%
ungroup() %>%
# 仅包括具有多次出现的前缀:重复项。
filter(n > 1) %>%
select(file_prefix)
对于类似你的示例的 sample.sheet
:
sample.sheet <- structure(
list(
File.Name = c(
"e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Red.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Grn.idat",
"e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Grn.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Red.idat",
"baed8332-9bc2-46d1-954e-275786b88a94_noid_Grn.idat"
),
Sample.ID = c(
"TCGA.BQ.7059.01A",
"TCGA.UZ.A9PM.01A",
"TCGA.BQ.7059.01A",
"TCGA.UZ.A9PM.01A",
"TCGA.B9.5156.01A"
)
),
class = "data.frame",
row.names = c(1L, 4L, 5L, 6L, 7L)
)
这个工作流程应该产生以下输出:
# A tibble: 2 × 1
file_prefix
<chr>
1 447ef862-535e-48b5-893d-588203a49eac
2 e079a20c-6cdc-4b74-afdf-3fee58f0b574
英文:
In answer to your original question: here is a quick way to extract those prefixes, before the first _
, that are duplicates:
# Load the necessary packages.
library(dplyr)
library(stringr)
# ...
# Load the data, as with `sample.sheet`.
# ...
# Assemble a column of duplicate files.
sample.sheet %>%
# Add a column with everything (the prefix) before the "_" (the suffix).
mutate(file_prefix = str_extract(File.Name, "^[^_]*")) %>%
# Count the occurrences of each prefix.
group_by(file_prefix) %>%
summarize(n = n()) %>%
ungroup() %>%
# Include only those prefixes with multiple occurrences: duplicates.
filter(n > 1) %>%
select(file_prefix)
Given a sample.sheet
like your example
sample.sheet <- structure(
list(
File.Name = c(
"e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Red.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Grn.idat",
"e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Grn.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Red.idat",
"baed8332-9bc2-46d1-954e-275786b88a94_noid_Grn.idat"
),
Sample.ID = c(
"TCGA.BQ.7059.01A",
"TCGA.UZ.A9PM.01A",
"TCGA.BQ.7059.01A",
"TCGA.UZ.A9PM.01A",
"TCGA.B9.5156.01A"
)
),
class = "data.frame",
row.names = c(1L, 4L, 5L, 6L, 7L)
)
this workflow should yield the following output:
# A tibble: 2 × 1
file_prefix
<chr>
1 447ef862-535e-48b5-893d-588203a49eac
2 e079a20c-6cdc-4b74-afdf-3fee58f0b574
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论