英文:
How do I calculate semi-colon separated values within a cell?
问题
我可以帮您解释如何执行这个操作。首先,您需要将HH
列和NLNL
列中的值提取出来并进行计算。您可以使用R或类似的数据处理工具来完成这个任务。以下是大致的步骤:
-
将每个单元格中的值拆分成两部分,使用分号分割。在R中,您可以使用
strsplit
函数来做到这一点。 -
将拆分后的值转换为数字类型。
-
计算
HH
列中的两个数字相除的结果。 -
计算
NLNL
列中的数字。 -
用
NLNL
中的值减去HH
中的值。 -
将结果放入一个新的列,比如
diff
。
以下是一个可能的R代码示例:
# 假设您的数据框叫做dt
dt$HH <- as.character(dt$HH) # 将HH列转换为字符类型
dt$NLNL <- as.character(dt$NLNL) # 将NLNL列转换为字符类型
# 分割HH列和NLNL列中的值
dt$HH <- sapply(strsplit(dt$HH, ";"), function(x) as.numeric(x[1])/as.numeric(x[2]))
dt$NLNL <- sapply(strsplit(dt$NLNL, ";"), function(x) as.numeric(x[1]))
# 计算diff列
dt$diff <- dt$NLNL - dt$HH
这段代码会将您想要的计算结果放入diff
列中。希望这对您有所帮助!
英文:
I have a table that looks like this:
> head(dt)
variant_id transcript_id HH HNL NLNL
1: chr10_60842447_A_G_b38 chr10_60871326_60871443 32968;685 1440;20 337;1
2: chr10_60846892_G_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
3: chr10_60847284_C_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
4: chr10_60849980_T_C_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
5: chr10_60850566_A_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
6: chr10_60852394_C_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
What I would like to do is take values in column HH
and divide the numbers before the semi-colon by the numbers after the semi-colon. For example, for the first row, I would like to do 32968/685
(which would be 48.13
). Then, I would like to do the same for the values in column NLNL
(so for the first row that would be 337
), and then I would like to subtract the value found from column HH
from the value in column NLNL
, so 337-48.13 = 228.87
. I would then like to take that value in place it into a new column called diff
for all rows.
How would I go about doing this? I can pretty easily figure out how to divide the values of one column with another and put the result in a new column, but I don't know how to extract semi-colon separated values from within a cell and manipulate them.
答案1
得分: 3
以下是您提供的代码的中文翻译部分:
一种选项是使用strsplit
按;
拆分,转换为numeric
,然后将第一个元素除以第二个元素:
dt[, new := unlist(lapply(strsplit(HH, ";"),
function(x) as.numeric(x[1])/as.numeric(x[2])))]
或者另一种选项是将列作为data.table
读入,然后进行除法运算:
dt[, new := fread(text = .SD[["HH"]], sep=";")[, V1/V2]]
对于多列,指定列在.SDcols
中,循环遍历列并执行相同的操作:
dt[, paste0("new", 1:3) := lapply(.SD, function(x)
fread(text = x, sep=";")[, V1/V2]), .SDcols = HH:NLNL]
dt
或者使用tidyverse
的选项:
library(dplyr)
library(tidyr)
dt %>%
mutate(rn = row_number()) %>%
separate_rows(HH, HNL, NLNL, convert = TRUE) %>%
group_by(rn, variant_id, transcript_id) %>%
summarise_at(vars(HH:NLNL), ~ first(.)/last(.)) %>%
ungroup %>%
select(-rn)
也可以以紧凑的方式完成:
library(purrr)
library(dplyr)
dt %>%
mutate(across(HH:NLNL, ~ fread(text = ., sep=";") %>%
reduce(`/`)))
数据
dt <- structure(list(variant_id = c("chr10_60842447_A_G_b38",
"chr10_60846892_G_A_b38",
"chr10_60847284_C_T_b38", "chr10_60849980_T_C_b38", "chr10_60850566_A_T_b38",
"chr10_60852394_C_A_b38"), transcript_id = c("chr10_60871326_60871443",
"chr10_60871326_60871443", "chr10_60871326_60871443", "chr10_60871326_60871443",
"chr10_60871326_60871443", "chr10_60871326_60871443"), HH = c("32968;685",
"33157;690", "33157;690", "33157;690", "33157;690", "33157;690"
), HNL = c("1440;20", "1251;15", "1251;15", "1251;15", "1251;15",
"1251;15"), NLNL = c("337;1", "337;1", "337;1", "337;1", "337;1",
"337;1")), class = c("data.table", "data.frame"), row.names = c(NA,
-6L))
请注意,这只是代码的翻译部分,没有其他内容。如果您需要进一步的解释或有其他问题,请随时提出。
英文:
One option is to split
by ;
with strsplit
, convert to numeric
and divide the first element by the second
dt[, new := unlist(lapply(strsplit(HH, ";"),
function(x) as.numeric(x[1])/as.numeric(x[2])))]
Or another option is reading the column as a data.table
with fread
and then do the division
dt[, new := fread(text = .SD[["HH"]], sep=";")[, V1/V2]]
For multiple columns, specify the columns in .SDcols
, loop over the columns and do the same thing
dt[, paste0("new", 1:3) := lapply(.SD, function(x)
fread(text = x, sep=";")[, V1/V2]), .SDcols = HH:NLNL]
dt
# variant_id transcript_id HH HNL NLNL new1 new2 new3
#1: chr10_60842447_A_G_b38 chr10_60871326_60871443 32968;685 1440;20 337;1 48.12847 72.0 337
#2: chr10_60846892_G_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337
#3: chr10_60847284_C_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337
#4: chr10_60849980_T_C_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337
#5: chr10_60850566_A_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337
#6: chr10_60852394_C_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337
Or an option with tidyverse
library(dplyr)
library(tidyr)
dt %>%
mutate(rn = row_number()) %>%
separate_rows(HH, HNL, NLNL, convert = TRUE) %>%
group_by(rn, variant_id, transcript_id) %>%
summarise_at(vars(HH:NLNL), ~ first(.)/last(.)) %>%
ungroup %>%
select(-rn)
# A tibble: 6 x 5
# variant_id transcript_id HH HNL NLNL
# <chr> <chr> <dbl> <dbl> <dbl>
#1 chr10_60842447_A_G_b38 chr10_60871326_60871443 48.1 72 337
#2 chr10_60846892_G_A_b38 chr10_60871326_60871443 48.1 83.4 337
#3 chr10_60847284_C_T_b38 chr10_60871326_60871443 48.1 83.4 337
#4 chr10_60849980_T_C_b38 chr10_60871326_60871443 48.1 83.4 337
#5 chr10_60850566_A_T_b38 chr10_60871326_60871443 48.1 83.4 337
#6 chr10_60852394_C_A_b38 chr10_60871326_60871443 48.1 83.4 337
It can be also done in a compact way with
library(purrr)
library(dplyr)# v 0.8.99.9000
dt %>%
mutate(across(HH:NLNL, ~ fread(text = ., sep=";") %>%
reduce(`/`)))
# variant_id transcript_id HH HNL NLNL
#1 chr10_60842447_A_G_b38 chr10_60871326_60871443 48.12847 72.0 337
#2 chr10_60846892_G_A_b38 chr10_60871326_60871443 48.05362 83.4 337
#3 chr10_60847284_C_T_b38 chr10_60871326_60871443 48.05362 83.4 337
#4 chr10_60849980_T_C_b38 chr10_60871326_60871443 48.05362 83.4 337
#5 chr10_60850566_A_T_b38 chr10_60871326_60871443 48.05362 83.4 337
#6 chr10_60852394_C_A_b38 chr10_60871326_60871443 48.05362 83.4 337
###data
dt <- structure(list(variant_id = c("chr10_60842447_A_G_b38",
"chr10_60846892_G_A_b38",
"chr10_60847284_C_T_b38", "chr10_60849980_T_C_b38", "chr10_60850566_A_T_b38",
"chr10_60852394_C_A_b38"), transcript_id = c("chr10_60871326_60871443",
"chr10_60871326_60871443", "chr10_60871326_60871443", "chr10_60871326_60871443",
"chr10_60871326_60871443", "chr10_60871326_60871443"), HH = c("32968;685",
"33157;690", "33157;690", "33157;690", "33157;690", "33157;690"
), HNL = c("1440;20", "1251;15", "1251;15", "1251;15", "1251;15",
"1251;15"), NLNL = c("337;1", "337;1", "337;1", "337;1", "337;1",
"337;1")), class = c("data.table", "data.frame"), row.names = c(NA,
-6L))
答案2
得分: 2
一种涉及 dplyr
和 gsubfn
的选项可能是:
df %>%
mutate_at(vars(HH, HNL, NLNL), ~ gsubfn("([0-9]+);([0-9]+)", function(x, y) as.numeric(x)/as.numeric(y), .))
variant_id transcript_id HH HNL NLNL
1 chr10_60842447_A_G_b38 chr10_60871326_60871443 48.1284671532847 72.0 337.0
2 chr10_60846892_G_A_b38 chr10_60871326_60871443 48.0536231884058 83.4 337.0
3 chr10_60847284_C_T_b38 chr10_60871326_60871443 48.0536231884058 83.4 337.0
4 chr10_60849980_T_C_b38 chr10_60871326_60871443 48.0536231884058 83.4 337.0
5 chr10_60850566_A_T_b38 chr10_60871326_60871443 48.0536231884058 83.4 337.0
6 chr10_60852394_C_A_b38 chr10_60871326_60871443 48.0536231884058 83.4 33.0
注意:上述代码是将 HH
、HNL
和 NLNL
列中的数据按指定的正则表达式模式进行替换。
英文:
One option involving dplyr
and gsubfn
could be:
df %>%
mutate_at(vars(HH, HNL, NLNL), ~ gsubfn("([0-9]+);([0-9]+)", function(x, y) as.numeric(x)/as.numeric(y), .))
variant_id transcript_id HH HNL NLNL
1 chr10_60842447_A_G_b38 chr10_60871326_60871443 48.1284671532847 72 337
2 chr10_60846892_G_A_b38 chr10_60871326_60871443 48.0536231884058 83.4 337
3 chr10_60847284_C_T_b38 chr10_60871326_60871443 48.0536231884058 83.4 337
4 chr10_60849980_T_C_b38 chr10_60871326_60871443 48.0536231884058 83.4 337
5 chr10_60850566_A_T_b38 chr10_60871326_60871443 48.0536231884058 83.4 337
6 chr10_60852394_C_A_b38 chr10_60871326_60871443 48.0536231884058 83.4 33
答案3
得分: 2
这里是另一种选择。我们将数据拆分在嵌套单元格中,然后映射出数值。
library(tidyverse)
dt %>%
mutate_at(vars(HH:NLNL), list(~str_split(., ";") )) %>%
mutate_at(vars(HH:NLNL), list(~map_dbl(., ~as.numeric(.x) %>% {.[[1]]/.[[2]]})))
编辑:要创建新变量,只需在列表中命名它们:
dt %>%
mutate_at(vars(HH:NLNL), list(new = ~str_split(., ";") )) %>%
mutate_at(vars(HH_new:NLNL_new), list(~map_dbl(., ~as.numeric(.x) %>% {.[[1]]/.[[2]]})))
希望这对你有所帮助。
英文:
Here is another option. We split the data in a nested cell, then map out the values.
library(tidyverse)
dt %>%
mutate_at(vars(HH:NLNL), list(~str_split(., ";") )) %>%
mutate_at(vars(HH:NLNL), list(~map_dbl(., ~as.numeric(.x) %>% {.[[1]]/.[[2]]})))
#> variant_id transcript_id HH HNL NLNL
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 chr10_60842447_A_G_b38 chr10_60871326_60871443 48.1 72 337
#> 2 chr10_60846892_G_A_b38 chr10_60871326_60871443 48.1 83.4 337
#> 3 chr10_60847284_C_T_b38 chr10_60871326_60871443 48.1 83.4 337
#> 4 chr10_60849980_T_C_b38 chr10_60871326_60871443 48.1 83.4 337
#> 5 chr10_60850566_A_T_b38 chr10_60871326_60871443 48.1 83.4 337
#> 6 chr10_60852394_C_A_b38 chr10_60871326_60871443 48.1 83.4 337
EDIT: All you have to do to have new variables is to name them in the list:
dt %>%
mutate_at(vars(HH:NLNL), list(new = ~str_split(., ";") )) %>%
mutate_at(vars(HH_new:NLNL_new), list(~map_dbl(., ~as.numeric(.x) %>% {.[[1]]/.[[2]]})))
#> variant_id transcript_id HH HNL NLNL HH_new HNL_new NLNL_new
#> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 chr10_60842447_… chr10_60871326_60… 32968… 1440… 337;1 48.1 72 337
#> 2 chr10_60846892_… chr10_60871326_60… 33157… 1251… 337;1 48.1 83.4 337
#> 3 chr10_60847284_… chr10_60871326_60… 33157… 1251… 337;1 48.1 83.4 337
#> 4 chr10_60849980_… chr10_60871326_60… 33157… 1251… 337;1 48.1 83.4 337
#> 5 chr10_60850566_… chr10_60871326_60… 33157… 1251… 337;1 48.1 83.4 337
#> 6 chr10_60852394_… chr10_60871326_60… 33157… 1251… 337;1 48.1 83.4 337
答案4
得分: 1
这是一个使用基本的R解决方案的示例,使用strsplit
处理单元格中的分号分隔值:
df <- cbind(df, `colnames<-`(sapply(c("HH", "HNL", "NLNL"),
function(v) sapply(strsplit(df[,v], split = ";"),
function(x) Reduce("/", as.numeric(x)))), c("HHnew", "HNLnew", "NLNLnew")))
df$diff <- with(df, NLNLnew - HHnew)
如此得到:
> df
variant_id transcript_id HH HNL NLNL HHnew HNLnew NLNLnew diff
1 chr10_60842447_A_G_b38 chr10_60871326_60871443 32968;685 1440;20 337;1 48.12847 72.0 337 288.8715
2 chr10_60846892_G_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
3 chr10_60847284_C_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
4 chr10_60849980_T_C_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
5 chr10_60850566_A_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
6 chr10_60852394_C_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
数据
df <- structure(list(variant_id = c("chr10_60842447_A_G_b38", "chr10_60846892_G_A_b38",
"chr10_60847284_C_T_b38", "chr10_60849980_T_C_b38", "chr10_60850566_A_T_b38",
"chr10_60852394_C_A_b38"), transcript_id = c("chr10_60871326_60871443",
"chr10_60871326_60871443", "chr10_60871326_60871443", "chr10_60871326_60871443",
"chr10_60871326_60871443", "chr10_60871326_60871443"), HH = c("32968;685",
"33157;690", "33157;690", "33157;690", "33157;690", "33157;690"
), HNL = c("1440;20", "1251;15", "1251;15", "1251;15", "1251;15",
"1251;15"), NLNL = c("337;1", "337;1", "337;1", "337;1", "337;1",
"337;1")), class = "data.frame", row.names = c(NA, -6L))
英文:
Here is a base R solution using strsplit
to deal with semicolon separated values in a cell:
df <- cbind(df,`colnames<-`(sapply(c("HH","HNL","NLNL"),
function(v) sapply(strsplit(df[,v],split = ";"),
function(x) Reduce("/",as.numeric(x)))),c("HHnew","HNLnew","NLNLnew")))
df$diff <- with(df,NLNLnew - HHnew)
such that
> df
variant_id transcript_id HH HNL NLNL HHnew HNLnew NLNLnew diff
1 chr10_60842447_A_G_b38 chr10_60871326_60871443 32968;685 1440;20 337;1 48.12847 72.0 337 288.8715
2 chr10_60846892_G_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
3 chr10_60847284_C_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
4 chr10_60849980_T_C_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
5 chr10_60850566_A_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
6 chr10_60852394_C_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4 337 288.9464
DATA
df <- structure(list(variant_id = c("chr10_60842447_A_G_b38", "chr10_60846892_G_A_b38",
"chr10_60847284_C_T_b38", "chr10_60849980_T_C_b38", "chr10_60850566_A_T_b38",
"chr10_60852394_C_A_b38"), transcript_id = c("chr10_60871326_60871443",
"chr10_60871326_60871443", "chr10_60871326_60871443", "chr10_60871326_60871443",
"chr10_60871326_60871443", "chr10_60871326_60871443"), HH = c("32968;685",
"33157;690", "33157;690", "33157;690", "33157;690", "33157;690"
), HNL = c("1440;20", "1251;15", "1251;15", "1251;15", "1251;15",
"1251;15"), NLNL = c("337;1", "337;1", "337;1", "337;1", "337;1",
"337;1")), class = "data.frame", row.names = c(NA, -6L))
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论