英文:
Saving CSV files as tab delimited files while keeping the row names
问题
以下是您提供的代码的翻译部分:
我有一个文件夹,里面装满了CSV文件。每个文件的行名称是基因,列是样本。我需要它们以TSV的形式,所以我将它们每个都保存为TSV文件,并保存到一个新的路径。这是代码:
# Convert every tpm*.csv under folder_path into a tab-separated .txt file
# written to new_TSV_files_path.
folder_path <- "/原始/文件夹/路径"
csv_files <- list.files(path=folder_path, pattern="^tpm.*\\.csv$", full.names=TRUE)
new_TSV_files_path <- "/新/文件夹/路径/"
lapply(csv_files , function(file) {
# No row.names argument is passed here, so the gene identifiers are read as an
# ordinary first data column (the asker reports it shows up named "X").
data <- read.csv(file, stringsAsFactors=FALSE, fill=TRUE)
filename <- basename(file)
# Strip the extension; note the "." is unescaped, so it matches any character.
filename <- sub(".csv$", "", filename)
new_file_path <- file.path(new_TSV_files_path , paste0(filename, ".txt"))
# row.names=FALSE leaves the data frame's row names out of the written file.
write.table(data,
file=new_file_path,
sep="\t",
row.names=FALSE,
quote=FALSE)
})
问题是,当我将它们保存为TSV并查看时,第一列是名为“X”的基因,而行名称只是数字。我尝试了很多种方法来保持基因作为行名称,但没有成功。以下是我尝试的一个示例代码:
# Attempted fix: re-read each converted file, move its first column into the
# row names, and overwrite the file in place.
new_TSV_files_path <- "/tsv/文件/路径/"
# NOTE(review): TSV_files_path is not defined in this snippet (only
# new_TSV_files_path is) -- confirm which variable was intended.
# NOTE(review): the conversion code earlier in the question writes ".txt"
# files, so this ".tsv" pattern may match nothing -- confirm.
file_list <- list.files(path=TSV_files_path , pattern="^tpm.*\\.tsv$", full.names=TRUE)
for (file_path in file_list) {
# fread/fwrite are presumably data.table's; data.table=FALSE asks fread for a
# plain data.frame so that row names can be assigned below.
dt <- fread(file_path, data.table=FALSE)
# Use the first column's values as row names, then drop that column.
rownames(dt) <- dt[, 1]
dt <- dt[, -1]
# Save the data back to the same tsv file
fwrite(dt, file_path, sep="\t", row.names=TRUE)
}
我需要将基因作为行名称,并删除X列(它是第一列),否则我使用的网络工具将无法工作。有什么解决方法?为什么在从CSV转换为TSV后,行名称变成了一个单独的列?
请注意,我已经将代码中的HTML实体(如<
和"
)转换为正常的文本,以便更容易理解。如果您需要进一步的帮助,请告诉我。
英文:
I have a folder that is full of CSV files. Each file has genes as row names and samples as columns. I need them as TSV, so I saved each one of them as TSV and into a new path. This is the code:
# Convert every tpm*.csv under folder_path into a tab-separated .txt file
# written to new_TSV_files_path.
folder_path <- "/original/folder/path"
csv_files <- list.files(path=folder_path, pattern="^tpm.*\\.csv$", full.names=TRUE)
new_TSV_files_path<- "/new/folder/path/"
lapply(csv_files , function(file) {
# No row.names argument is passed here, so the gene identifiers are read as an
# ordinary first data column (the asker reports it shows up named "X").
data <- read.csv(file, stringsAsFactors=FALSE, fill=TRUE)
filename <- basename(file)
# Strip the extension; note the "." is unescaped, so it matches any character.
filename <- sub(".csv$", "", filename)
new_file_path <- file.path(new_TSV_files_path , paste0(filename, ".txt"))
# row.names=FALSE leaves the data frame's row names out of the written file.
write.table(data,
file=new_file_path,
sep="\t",
row.names=FALSE,
quote=FALSE)
})
The problem is that after I save them as TSV and view them, the first column contains the genes and is named "X", and the row names are just numbers. I tried many ways to keep the genes as the row names, but none of them worked. Here is one example of the code I tried:
# Attempted fix: re-read each converted file, move its first column into the
# row names, and overwrite the file in place.
new_TSV_files_path <- "/tsv/files/path/"
# NOTE(review): TSV_files_path is not defined in this snippet (only
# new_TSV_files_path is) -- confirm which variable was intended.
# NOTE(review): the conversion code earlier in the question writes ".txt"
# files, so this ".tsv" pattern may match nothing -- confirm.
file_list <- list.files(path=TSV_files_path , pattern="^tpm.*\\.tsv$", full.names=TRUE)
for (file_path in file_list) {
# fread/fwrite are presumably data.table's; data.table=FALSE asks fread for a
# plain data.frame so that row names can be assigned below.
dt <- fread(file_path, data.table=FALSE)
# Use the first column's values as row names, then drop that column.
rownames(dt) <- dt[, 1]
dt <- dt[, -1]
# Save the data back to the same tsv file
fwrite(dt, file_path, sep="\t", row.names=TRUE)
}
I need the genes as row names, and I also need to delete the X column (it is the first column), or else the web tool I'm using won't work. What can be done? Why do the row names become a separate column after the conversion from CSV to TSV?
EDIT - a few lines from one of the CSVs
c("\"\",\"Riaz_Nivolumab_2017-p001-ar-8813\",\"Riaz_Nivolumab_2017-p002-ar-8815\",\"Riaz_Nivolumab_2017-p017-ar-8890\",\"Riaz_Nivolumab_2017-p026-ar-8920\",\"Riaz_Nivolumab_2017-p034-ar-8929\",\"Riaz_Nivolumab_2017-p036-ar-8898\",\"Riaz_Nivolumab_2017-p037-ar-8900\",\"Riaz_Nivolumab_2017-p038-ar-8895\",\"Riaz_Nivolumab_2017-p039-ar-8819\",\"Riaz_Nivolumab_2017-p046-ar-8904\",\"Riaz_Nivolumab_2017-p047-ar-8836\",\"Riaz_Nivolumab_2017-p048-ar-8897\",\"Riaz_Nivolumab_2017-p049-ar-8824\",\"Riaz_Nivolumab_2017-p052-ar-8839\",\"Riaz_Nivolumab_2017-p065-ar-8857\",\"Riaz_Nivolumab_2017-p067-ar-8840\",\"Riaz_Nivolumab_2017-p078-ar-8864\",\"Riaz_Nivolumab_2017-p079-ar-8849\",\"Riaz_Nivolumab_2017-p082-ar-8822\",\"Riaz_Nivolumab_2017-p085-ar-8829\",\"Riaz_Nivolumab_2017-p090-ar-8866\",\"Riaz_Nivolumab_2017-p092-ar-8867\",\"Riaz_Nivolumab_2017-p098-ar-8853\",\"Riaz_Nivolumab_2017-p101-ar-8834\",\"Riaz_Nivolumab_2017-p103-ar-8872\",\"Riaz_Nivolumab_2017-p106-ar-8926\"",
"\"A1BG\",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157,-1.23665217147917,-2.49624508855533,0.727023994594072,0.328722516597936,-0.108114351387649,-0.0836309292648272,-0.325627003998877,4.41269552131938,-0.229120400945704,3.7248031834164,-0.504990077186003,-2.28542936373545,-1.1031748886154,-0.0826211449622227,1.44674204021314,1.3682306825028,0.544742458853122,-2.21733388578132,0.231763591509675,0.0236879027615116,1.79591432017568,-0.638847949374409",
"\"A1CF\",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,5.49732418197924,-0.543733010666938,4.6852457132221,-0.543733010666938,0.117380356180772,-0.543733010666938,-0.543733010666938,1.53929625176185,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.48894973908573,-0.543733010666938,-0.543733010666938,-0.543733010666938",
"\"A2M\",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739613,2.01411421717692,1.19837532976838,-0.745540508622708,0.356022989838996,-2.37641175531056,0.517622991895147,2.67515559703126,1.74369521852923,1.19154956592325,2.00377301045288,-0.130083699768657,-0.354493861811036,-3.736442059777,1.59429773965362,1.2081229778692,-2.95554013952561,-1.60581260283625,1.03219514758484,0.925657747621587,-2.02343920511453,-0.246282066712528,0.612339103457181",
"\"A2ML1\",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635555151,-0.363502635555151,-0.313253413224058,3.64328505792818,-0.363502635555151,-0.363502635555151,-0.363502635555151,0.0272101525164028,-0.363502635555151,-0.107062171894006,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.212720613261966,-0.363502635555151,-0.363502635555151,3.46425626517263,-0.363502635555151,-0.197854673973831,-0.363502635555151,0.174257390070118,-0.363502635555151"
)
答案1
得分: 1
基本上,您可能想在`read.csv`中设置`row.names=1`,以指示哪一列包含行名称。
# Convert every CSV in `folder_path` to a tab-separated .txt file in `new_tsv`,
# keeping the first CSV column (the gene identifiers) as row names.
# The listing is restricted to *.csv so stray files in the folder are not
# passed to read.csv().
csv_files <- list.files(folder_path, pattern = "\\.csv$", full.names = TRUE)
lapply(csv_files, \(x) {
  # Same base name as the input, with the .csv suffix swapped for .txt.
  out_file <- file.path(new_tsv, sub("\\.csv$", ".txt", basename(x)))
  # row.names = 1: take the row names from column 1 instead of loading that
  # column as data.
  read.csv(x, row.names = 1) |>
    # write.table() writes row names by default, so the genes are preserved.
    write.table(file = out_file, sep = "\t", quote = FALSE)
}) |> invisible()  # hide the list of NULLs that lapply() would otherwise print
`invisible`避免在控制台中显示`NULL`。
*测试:*
read.table(sprintf('%s/%s', new_tsv, 'tpm_01.txt'))
# sample1 sample2 sample3
# gene1 14 9 14
# gene2 8 14 17
# gene3 11 9 5
# gene4 12 16 14
# gene5 11 9 13
数据:
# Build reproducible toy input: three 5-gene x 3-sample CSV files
# (tpm_01.csv .. tpm_03.csv) in <tempdir>/old, plus a <tempdir>/new directory
# that receives the converted output.
tdir <- tempdir()
folder_path <- file.path(tdir, "old")
new_tsv <- file.path(tdir, "new")
dir.create(folder_path)
dir.create(new_tsv)
gene_ids <- paste0("gene", 1:5)
sample_ids <- paste0("sample", 1:3)
set.seed(42)  # fixed seed so the Poisson counts are reproducible
for (file_no in seq_len(3)) {
  counts <- matrix(rpois(15, 10), nrow = 5, ncol = 3,
                   dimnames = list(gene_ids, sample_ids))
  # write.csv() emits the row names (genes) as the first, unnamed column.
  write.csv(counts, sprintf("%s/tpm_%02d.csv", folder_path, file_no))
}
英文:
Essentially, you may want to set `row.names=1` in `read.csv` to indicate which column contains the row names.
# Convert every CSV in `folder_path` to a tab-separated .txt file in `new_tsv`,
# keeping the first CSV column (the gene identifiers) as row names.
# The listing is restricted to *.csv so stray files in the folder are not
# passed to read.csv().
csv_files <- list.files(folder_path, pattern = "\\.csv$", full.names = TRUE)
lapply(csv_files, \(x) {
  # Same base name as the input, with the .csv suffix swapped for .txt.
  out_file <- file.path(new_tsv, sub("\\.csv$", ".txt", basename(x)))
  # row.names = 1: take the row names from column 1 instead of loading that
  # column as data.
  read.csv(x, row.names = 1) |>
    # write.table() writes row names by default, so the genes are preserved.
    write.table(file = out_file, sep = "\t", quote = FALSE)
}) |> invisible()  # hide the list of NULLs that lapply() would otherwise print
The `invisible()` call avoids cluttering the console with `NULL`s.
Test:
read.table(sprintf('%s/%s', new_tsv, 'tpm_01.txt'))
# sample1 sample2 sample3
# gene1 14 9 14
# gene2 8 14 17
# gene3 11 9 5
# gene4 12 16 14
# gene5 11 9 13
Data:
# Build reproducible toy input: three 5-gene x 3-sample CSV files
# (tpm_01.csv .. tpm_03.csv) in <tempdir>/old, plus a <tempdir>/new directory
# that receives the converted output.
tdir <- tempdir()
folder_path <- file.path(tdir, "old")
new_tsv <- file.path(tdir, "new")
dir.create(folder_path)
dir.create(new_tsv)
gene_ids <- paste0("gene", 1:5)
sample_ids <- paste0("sample", 1:3)
set.seed(42)  # fixed seed so the Poisson counts are reproducible
for (file_no in seq_len(3)) {
  counts <- matrix(rpois(15, 10), nrow = 5, ncol = 3,
                   dimnames = list(gene_ids, sample_ids))
  # write.csv() emits the row names (genes) as the first, unnamed column.
  write.csv(counts, sprintf("%s/tpm_%02d.csv", folder_path, file_no))
}
答案2
得分: 0
检查所提供的示例结构并准备 reprex:
# 示例文件内容,截断行,行名在名为 "" 的第一列中:
stringr::str_view(stringr::str_trunc(sample_lines, 80))
#> [1] │ "" ,"Riaz_Nivolumab_2017-p001-ar-8813","Riaz_Nivolumab_2017-p002-ar-8815","Ria...
#> [2] │ "A1BG",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157...
#> [3] │ "A1CF",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.5437330106...
#> [4] │ "A2M",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739...
#> [5] │ "A2ML1",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635...
writeLines(sample_lines, "tpm_sample.csv")
将原始代码修改为导入行名(`read.csv(..., row.names = 1)`),并且在写入时不要丢弃它们(从 `write.table()` 中删除 `row.names=FALSE`):
# Convert each tpm*.csv in `folder_path` to a tab-separated .txt file in
# `new_TSV_files_path`, keeping the gene identifiers as row names.
folder_path <- "./"
csv_files <- list.files(path = folder_path, pattern = "^tpm.*\\.csv$", full.names = TRUE)
new_TSV_files_path <- "./"
lapply(csv_files, function(file) {
  # Read the row names from the 1st column instead of loading it as data.
  # NOTE: read.csv()'s default check.names = TRUE rewrites column names into
  # syntactic R names (e.g. "-" in the sample names becomes "."); pass
  # check.names = FALSE if the original sample names must be kept.
  data <- read.csv(file, row.names = 1, stringsAsFactors = FALSE, fill = TRUE)
  # Swap the extension; the dot is escaped so only a literal ".csv" suffix
  # is replaced.
  filename <- sub("\\.csv$", ".txt", basename(file))
  new_file_path <- file.path(new_TSV_files_path, filename)
  # Do NOT disable row names with row.names = FALSE: write.table() writes
  # them by default, which is what keeps the genes in the output.
  write.table(data, file = new_file_path, sep = "\t", quote = FALSE)
})
#> [[1]]
#> NULL
检查生成的文件:
tsv_files <- list.files(path=new_TSV_files_path, pattern="^tpm.*\\.txt$", full.names=TRUE)
# 只要期望 read.table / write.table 兼容的行名,我们就都准备好了
# 写入的 tsv 内容,截断行:
stringr::str_view(stringr::str_trunc(readLines(tsv_files[1]), 80))
#> [1] │ Riaz_Nivolumab_2017.p001.ar.8813{\t}Riaz_Nivolumab_2017.p002.ar.8815{\t}Riaz_Nivolu...
#> [2] │ A1BG{\t}1.0008835087938{\t}-1.91493588495907{\t}-0.620734484990514{\t}-1.75775209550157{\t}-...
#> [3] │ A1CF{\t}-0.543733010666938{\t}-0.537513815032929{\t}-0.481855746353474{\t}-0.543733010666...
#> [4] │ A2M{\t}0.0979791979801679{\t}-0.446028724661623{\t}-0.465953763246008{\t}-2.0848724473961...
#> [5] │ A2ML1{\t}-0.363502635555151{\t}-0.363502635555151{\t}-0.298573188895895{\t}-0.36350263555...
# 使用 read.table() 重新导入,检查第一列和 rownames 属性:
tsv_in <- read.table(tsv_files[1])
tsv_in[,1:2]
#> Riaz_Nivolumab_2017.p001.ar.8813 Riaz_Nivolumab_2017.p002.ar.8815
#> A1BG 1.0008835 -1.9149359
#> A1CF -0.5437330 -0.5375138
#> A2M 0.0979792 -0.4460287
#> A2ML1 -0.3635026 -0.3635026
rownames(tsv_in)
#> [1] "A1BG" "A1CF" "A2M" "A2ML1"
# fread 处理行名方式不同,还会引发相关警告,
# 行名最终进入列 "V1" 是预期行为
dt_in <- data.table::fread(tsv_files[1])
#> Warning in data.table::fread(tsv_files[1]): Detected 26 column names but the
#> data has 27 columns (i.e. invalid file). Added 1 extra default column name for
#> the first column which is guessed to be row names or an index. Use setnames()
#> afterwards if this guess is not correct, or fix the file write command that
#> created the file to create a valid file.
dt_in[,1:3]
#> V1 Riaz_Nivolumab_2017.p001.ar.8813 Riaz_Nivolumab_2017.p002.ar.8815
#> 1: A1BG 1.0008835 -1.9149359
#> 2: A1CF -0.5437330 -0.5375138
#> 3: A2M 0.0979792 -0.4460287
#> 4: A2ML1 -0.3635026 -0.3635026
rownames(dt_in)
#> [1] "1" "2" "3" "4"
如果问题仍然存在,那么它与所提到的 Web 工具及其对文件格式的要求有关。
示例文件行:
sample_lines <- c(
"\"\"","Riaz_Nivolumab_2017-p001-ar-8813","Riaz_Nivolumab_2017-p002-ar-8815","Riaz_Nivolumab_2017-p017-ar-8890","Riaz_Nivolumab_2017-p026-ar-8920","Riaz_Nivolumab_2017-p034-ar-8929","Riaz_Nivolumab_2017-p036-ar-8898","Riaz_Nivolumab_2017-p037-ar-8900","Riaz_Nivolumab_2017-p038-ar-8895","R
<details>
<summary>英文:</summary>
Check provided sample structure and prepare reprex:
# sample file content, truncated lines; row names are in the first column, named "":
stringr::str_view(stringr::str_trunc(sample_lines, 80))
#> [1] │ "","Riaz_Nivolumab_2017-p001-ar-8813","Riaz_Nivolumab_2017-p002-ar-8815","Ria...
#> [2] │ "A1BG",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157...
#> [3] │ "A1CF",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.5437330106...
#> [4] │ "A2M",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739...
#> [5] │ "A2ML1",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635...
writeLines(sample_lines, "tpm_sample.csv")
Modify original code to import row names (`read.csv(..., row.names = 1)`) and NOT to drop those on writing (remove `row.names=FALSE` from `write.table()` ):
``` r
# Convert each tpm*.csv in `folder_path` to a tab-separated .txt file in
# `new_TSV_files_path`, keeping the gene identifiers as row names.
folder_path <- "./"
csv_files <- list.files(path = folder_path, pattern = "^tpm.*\\.csv$", full.names = TRUE)
new_TSV_files_path <- "./"
lapply(csv_files, function(file) {
  # Read the row names from the 1st column instead of loading it as data.
  # NOTE: read.csv()'s default check.names = TRUE rewrites column names into
  # syntactic R names (e.g. "-" in the sample names becomes "."); pass
  # check.names = FALSE if the original sample names must be kept.
  data <- read.csv(file, row.names = 1, stringsAsFactors = FALSE, fill = TRUE)
  # Swap the extension; the dot is escaped so only a literal ".csv" suffix
  # is replaced.
  filename <- sub("\\.csv$", ".txt", basename(file))
  new_file_path <- file.path(new_TSV_files_path, filename)
  # Do NOT disable row names with row.names = FALSE: write.table() writes
  # them by default, which is what keeps the genes in the output.
  write.table(data, file = new_file_path, sep = "\t", quote = FALSE)
})
#> [[1]]
#> NULL
Check resulting files:
tsv_files <- list.files(path=new_TSV_files_path, pattern="^tpm.*\\.txt$", full.names=TRUE)
# as long as read.table / write.table compatible row names are expected, we are all set
# written tsv content, truncated lines:
stringr::str_view(stringr::str_trunc(readLines(tsv_files[1]), 80))
#> [1] │ Riaz_Nivolumab_2017.p001.ar.8813{\t}Riaz_Nivolumab_2017.p002.ar.8815{\t}Riaz_Nivolu...
#> [2] │ A1BG{\t}1.0008835087938{\t}-1.91493588495907{\t}-0.620734484990514{\t}-1.75775209550157{\t}-...
#> [3] │ A1CF{\t}-0.543733010666938{\t}-0.537513815032929{\t}-0.481855746353474{\t}-0.543733010666...
#> [4] │ A2M{\t}0.0979791979801679{\t}-0.446028724661623{\t}-0.465953763246008{\t}-2.0848724473961...
#> [5] │ A2ML1{\t}-0.363502635555151{\t}-0.363502635555151{\t}-0.298573188895895{\t}-0.36350263555...
# re-import with read.table(), check first columns and rownames attribute:
tsv_in <- read.table(tsv_files[1])
tsv_in[,1:2]
#> Riaz_Nivolumab_2017.p001.ar.8813 Riaz_Nivolumab_2017.p002.ar.8815
#> A1BG 1.0008835 -1.9149359
#> A1CF -0.5437330 -0.5375138
#> A2M 0.0979792 -0.4460287
#> A2ML1 -0.3635026 -0.3635026
rownames(tsv_in)
#> [1] "A1BG" "A1CF" "A2M" "A2ML1"
# fread handles row names differently and also throws a relevant warning;
# row names ending up in column "V1" is expected behaviour
dt_in <- data.table::fread(tsv_files[1])
#> Warning in data.table::fread(tsv_files[1]): Detected 26 column names but the
#> data has 27 columns (i.e. invalid file). Added 1 extra default column name for
#> the first column which is guessed to be row names or an index. Use setnames()
#> afterwards if this guess is not correct, or fix the file write command that
#> created the file to create a valid file.
dt_in[,1:3]
#> V1 Riaz_Nivolumab_2017.p001.ar.8813 Riaz_Nivolumab_2017.p002.ar.8815
#> 1: A1BG 1.0008835 -1.9149359
#> 2: A1CF -0.5437330 -0.5375138
#> 3: A2M 0.0979792 -0.4460287
#> 4: A2ML1 -0.3635026 -0.3635026
rownames(dt_in)
#> [1] "1" "2" "3" "4"
If the issue remains, it has to do with the mentioned web tool and its format expectations.
Sample file lines:
sample_lines <- c("\"\",\"Riaz_Nivolumab_2017-p001-ar-8813\",\"Riaz_Nivolumab_2017-p002-ar-8815\",\"Riaz_Nivolumab_2017-p017-ar-8890\",\"Riaz_Nivolumab_2017-p026-ar-8920\",\"Riaz_Nivolumab_2017-p034-ar-8929\",\"Riaz_Nivolumab_2017-p036-ar-8898\",\"Riaz_Nivolumab_2017-p037-ar-8900\",\"Riaz_Nivolumab_2017-p038-ar-8895\",\"Riaz_Nivolumab_2017-p039-ar-8819\",\"Riaz_Nivolumab_2017-p046-ar-8904\",\"Riaz_Nivolumab_2017-p047-ar-8836\",\"Riaz_Nivolumab_2017-p048-ar-8897\",\"Riaz_Nivolumab_2017-p049-ar-8824\",\"Riaz_Nivolumab_2017-p052-ar-8839\",\"Riaz_Nivolumab_2017-p065-ar-8857\",\"Riaz_Nivolumab_2017-p067-ar-8840\",\"Riaz_Nivolumab_2017-p078-ar-8864\",\"Riaz_Nivolumab_2017-p079-ar-8849\",\"Riaz_Nivolumab_2017-p082-ar-8822\",\"Riaz_Nivolumab_2017-p085-ar-8829\",\"Riaz_Nivolumab_2017-p090-ar-8866\",\"Riaz_Nivolumab_2017-p092-ar-8867\",\"Riaz_Nivolumab_2017-p098-ar-8853\",\"Riaz_Nivolumab_2017-p101-ar-8834\",\"Riaz_Nivolumab_2017-p103-ar-8872\",\"Riaz_Nivolumab_2017-p106-ar-8926\"",
"\"A1BG\",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157,-1.23665217147917,-2.49624508855533,0.727023994594072,0.328722516597936,-0.108114351387649,-0.0836309292648272,-0.325627003998877,4.41269552131938,-0.229120400945704,3.7248031834164,-0.504990077186003,-2.28542936373545,-1.1031748886154,-0.0826211449622227,1.44674204021314,1.3682306825028,0.544742458853122,-2.21733388578132,0.231763591509675,0.0236879027615116,1.79591432017568,-0.638847949374409",
"\"A1CF\",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,5.49732418197924,-0.543733010666938,4.6852457132221,-0.543733010666938,0.117380356180772,-0.543733010666938,-0.543733010666938,1.53929625176185,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.48894973908573,-0.543733010666938,-0.543733010666938,-0.543733010666938",
"\"A2M\",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739613,2.01411421717692,1.19837532976838,-0.745540508622708,0.356022989838996,-2.37641175531056,0.517622991895147,2.67515559703126,1.74369521852923,1.19154956592325,2.00377301045288,-0.130083699768657,-0.354493861811036,-3.736442059777,1.59429773965362,1.2081229778692,-2.95554013952561,-1.60581260283625,1.03219514758484,0.925657747621587,-2.02343920511453,-0.246282066712528,0.612339103457181",
"\"A2ML1\",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635555151,-0.363502635555151,-0.313253413224058,3.64328505792818,-0.363502635555151,-0.363502635555151,-0.363502635555151,0.0272101525164028,-0.363502635555151,-0.107062171894006,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.212720613261966,-0.363502635555151,-0.363502635555151,3.46425626517263,-0.363502635555151,-0.197854673973831,-0.363502635555151,0.174257390070118,-0.363502635555151"
)
<sup>Created on 2023-06-26 with reprex v2.0.2</sup>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论