英文:
Padding or filling a dataframe in R if I know the range
问题
我正在寻找类似于 [bedtools subtract](https://bedtools.readthedocs.io/en/latest/content/tools/subtract.html) 的东西,但是使用数据框。
例如,假设我有一个如下的范围数据框:
```
起始 结束 值
0 100 P
```
我还有另一个已排序的数据框:
起始 结束 值
10 25 A
50 63 B
是否有一种方法可以填充它,使其变成这样:
起始 结束 值
0 9 P1
10 25 A
26 49 P2
50 63 B
64 100 P3
P1,P2 和 P3 是填充第二个数据框的标签,以便覆盖值的整个范围。
我尝试过使用 dplyr 的 `lag()` 函数并手动添加填充值,但由于范围会随基因组特征的长度(包括起始和结束坐标)而变化,我希望这种范围填充能够自动完成。
谢谢!
例如,这是数据的一个小子集:
# Reference range that must end up fully covered; "P" is the padding label.
data_range <- data.frame(start = 0, end = 100, value = "P")
# Features that already occupy parts of the reference range.
tofill_range <- data.frame(
  start = c(15, 51, 70),
  end = c(39, 62, 79),
  value = c("A", "B", "C")
)
<details>
<summary>英文:</summary>
I'm looking for something similar to [bedtools subtract](https://bedtools.readthedocs.io/en/latest/content/tools/subtract.html) but with dataframes.
For example, say I have the range as a dataframe here:
Start End Value
0 100 P
And I have another dataframe, which is sorted:
Start End Value
10 25 A
50 63 B
Would there be a way to fill this like so:
Start End Value
0 9 P1
10 25 A
26 49 P2
50 63 B
64 100 P3
P1, P2 and P3 are labels filled in to pad the second dataframe so that the entire range is covered.
I tried using dplyr's `lag()` function and adding the padding values manually, but since the range can change with the length of the genomic feature (including the start and end coordinates), I want this range filling to be automatic.
Thank you!
For example, this is a small subset of the data:
```R
# The single range to be covered, carrying the padding label "P".
data_range <- data.frame(start = 0, end = 100, value = "P")
# The labelled sub-ranges around which padding rows are wanted.
tofill_range <- data.frame(
  start = c(15, 51, 70),
  end = c(39, 62, 79),
  value = c("A", "B", "C")
)
答案1
得分: 2
以下是使用'dplyr'计算数据框范围的一种方法。对于您的第二个示例,我重新命名了列。我们可以做一些额外的工作,使其适用于任何列名。
library(dplyr)

#' Pad a set of labelled ranges so that a reference range is fully covered
#'
#' @param df1 One-row data frame with columns `开始` (start), `结束` (end) and
#'   `价值` (label); the label is used as the prefix for padding rows.
#' @param df2 Data frame with the same three columns holding the ranges that
#'   are already occupied. The boundary-pairing below assumes these ranges lie
#'   strictly inside `df1` and do not touch or overlap each other.
#' @return A data frame with columns `开始`, `结束`, `价值`, sorted by `开始`,
#'   containing the rows of `df2` plus padding rows labelled `<prefix>1`,
#'   `<prefix>2`, ...
calc_range <- function(df1, df2) {
  # Prefix for padding rows comes from the reference range, not a literal "P".
  pad_label <- as.character(df1$价值[1])

  # Boundaries of the gaps next to every occupied range: one past its end and
  # one before its start. A temporary name is required because transmute()
  # evaluates sequentially; writing `开始 = 结束 + 1` first would overwrite the
  # start column before `结束 = 开始 - 1` reads it.
  df3 <- df2 %>%
    transmute(gap_start = 结束 + 1,
              结束 = 开始 - 1) %>%
    rename(开始 = gap_start)

  all_df <- bind_rows(df1, df2, df3)

  all_df %>%
    select(!价值) %>%
    unlist() %>%
    sort() %>%
    # Consecutive sorted boundaries pair up into (start, end) rows.
    matrix(ncol = 2, byrow = TRUE) %>%
    data.frame() %>%
    rename(开始 = X1, 结束 = X2) %>%
    # Join against df2 only: rows without a match are padding. This keeps a
    # feature that happens to be named like the prefix from being relabelled.
    left_join(df2, by = c("开始", "结束")) %>%
    arrange(开始) %>%
    mutate(价值 = ifelse(is.na(价值),
                         paste0(pad_label, cumsum(is.na(价值))),
                         as.character(价值)))
}
# Test 1: two features inside the 0-100 reference range (tibble input).
dfa <- tribble(
  ~开始, ~结束, ~价值,
  0, 100, "P"
)
dfb <- tribble(
  ~开始, ~结束, ~价值,
  10, 25, "A",
  50, 63, "B"
)
calc_range(dfa, dfb)
#> 开始 结束 价值
#> 1 0 9 P1
#> 2 10 25 A
#> 3 26 49 P2
#> 4 50 63 B
#> 5 64 100 P3
# Test 2: three features, plain data.frame input. `data.frame` is the real
# constructor; the translated name `数据框` does not exist in R.
数据范围 <- data.frame(开始 = 0, 结束 = 100, 价值 = "P")
填充范围 <- data.frame(开始 = c(15, 51, 70),
                       结束 = c(39, 62, 79),
                       价值 = c("A", "B", "C"))
calc_range(数据范围, 填充范围)
#> 开始 结束 价值
#> 1 0 14 P1
#> 2 15 39 A
#> 3 40 50 P2
#> 4 51 62 B
#> 5 63 69 P3
#> 6 70 79 C
#> 7 80 100 P4
英文:
Here is one way to calculate the ranges of a data.frame using only 'dplyr'. For your second example I renamed the columns. With some more work it could be made to work with any column names.
library(dplyr)
#' Pad a set of labelled ranges so that a reference range is fully covered
#'
#' @param df1 One-row data frame with columns `Start`, `End` and `Value`; its
#'   `Value` is used as the prefix for padding rows.
#' @param df2 Data frame with the same three columns holding the ranges that
#'   are already occupied. The boundary-pairing below assumes these ranges lie
#'   strictly inside `df1` and do not touch or overlap each other.
#' @return A data frame with columns `Start`, `End`, `Value`, sorted by
#'   `Start`, containing the rows of `df2` plus padding rows labelled
#'   `<prefix>1`, `<prefix>2`, ...
calc_range <- function(df1, df2) {
  # Prefix for padding rows comes from the reference range, not a literal "P".
  pad_label <- as.character(df1$Value[1])

  # Boundaries of the gaps next to every occupied range: one past its end and
  # one before its start. The lower-case temporary `start` is deliberate:
  # transmute() evaluates sequentially, so assigning `Start` first would
  # overwrite it before `End = Start - 1` reads the original value.
  df3 <- df2 %>%
    transmute(start = End + 1,
              End = Start - 1) %>%
    rename(Start = start)

  start_df <- bind_rows(df1, df2, df3)

  start_df %>%
    select(!Value) %>%
    unlist() %>%
    sort() %>%
    # Consecutive sorted boundaries pair up into (Start, End) rows.
    matrix(ncol = 2, byrow = TRUE) %>%
    data.frame() %>%
    rename(Start = X1, End = X2) %>%
    # Join against df2 only: rows without a match are padding. This keeps a
    # feature that happens to be named like the prefix from being relabelled.
    left_join(df2, by = c("Start", "End")) %>%
    arrange(Start) %>%
    mutate(Value = ifelse(is.na(Value),
                          paste0(pad_label, cumsum(is.na(Value))),
                          as.character(Value)))
}
# Test 1: tibble inputs, two features inside the 0-100 range.
dfa <- tribble(~Start, ~End, ~Value,
               0, 100, "P")
dfb <- tribble(
  ~Start, ~End, ~Value,
  10, 25, "A",
  50, 63, "B"
)
calc_range(dfa, dfb)
#> Start End Value
#> 1 0 9 P1
#> 2 10 25 A
#> 3 26 49 P2
#> 4 50 63 B
#> 5 64 100 P3
# Test 2: data.frame inputs, three features inside the 0-100 range.
data_range <- data.frame(Start = 0, End = 100, Value = "P")
tofill_range <- data.frame(
  Start = c(15, 51, 70),
  End = c(39, 62, 79),
  Value = c("A", "B", "C")
)
calc_range(data_range, tofill_range)
#> Start End Value
#> 1 0 14 P1
#> 2 15 39 A
#> 3 40 50 P2
#> 4 51 62 B
#> 5 63 69 P3
#> 6 70 79 C
#> 7 80 100 P4
<sup>Created on 2023-02-23 with reprex v2.0.2</sup>
答案2
得分: 2
使用 dplyr(`consecutive_id` 需要 v1.1.0 或更高版本),并通过 `between` 找出缺失的范围:
library(dplyr)
# For every integer position in the reference range, count how many rows of
# `tofill_range` cover it; 0 marks a position that belongs to a gap.
ranges <- rowSums(apply(tofill_range[, 1:2], 1, function(x)
  between(seq(data_range$start, data_range$end), x[1], x[2])))

# `grp` numbers each run of equal coverage counts, so every gap and every
# covered stretch gets its own id; `val` is the position itself.
# (The closing `]` after `data_range[, 2` was missing, which made the
# statement unparseable.)
as_tibble(cbind(ranges, grp = consecutive_id(ranges),
                val = seq(data_range[, 1], data_range[, 2]))) %>%
  group_by(grp) %>%
  # Keep only the gap positions, then collapse each gap to one row.
  filter(ranges == 0) %>%
  summarize(start = first(val),
            end = last(val),
            value = paste0(data_range$value, cur_group_id())) %>%
  select(-grp) %>%
  bind_rows(., tofill_range) %>%
  arrange(start)
# A tibble: 7 × 3
start end value
<dbl> <dbl> <chr>
1 0 14 P1
2 15 39 A
3 40 50 P2
4 51 62 B
5 63 69 P3
6 70 79 C
7 80 100 P4
如果您需要进一步的解释或有任何其他问题,请随时提出。
英文:
Using dplyr (>= v1.1.0 for `consecutive_id`), get the missing ranges with `between`:
library(dplyr)
# Coverage count per integer position of the reference range: each row of
# `tofill_range` contributes 1 to the positions it spans, so 0 means "gap".
ranges <- rowSums(
  apply(
    tofill_range[, 1:2],
    1,
    function(bounds) {
      between(seq(data_range$start, data_range$end), bounds[1], bounds[2])
    }
  )
)

# Tag each run of equal coverage with an id, keep the uncovered runs and
# collapse every one of them to a single labelled row, then add the features
# back and order everything by start.
as_tibble(
  cbind(
    ranges,
    grp = consecutive_id(ranges),
    val = seq(data_range[, 1], data_range[, 2])
  )
) %>%
  filter(ranges == 0) %>%
  group_by(grp) %>%
  summarise(
    start = first(val),
    end = last(val),
    value = paste0(data_range$value, cur_group_id())
  ) %>%
  select(start, end, value) %>%
  bind_rows(tofill_range) %>%
  arrange(start)
# A tibble: 7 × 3
start end value
<dbl> <dbl> <chr>
1 0 14 P1
2 15 39 A
3 40 50 P2
4 51 62 B
5 63 69 P3
6 70 79 C
7 80 100 P4
答案3
得分: 2
在基本的 R 中:
``` r
#' Pad labelled ranges so that each reference range is fully covered (base R)
#'
#' Columns are used by position: 1 = start, 2 = end, 3 = label.
#'
#' @param df1 Data frame of reference range(s); the label is the prefix for
#'   padding rows.
#' @param df2 Data frame of occupied ranges. They may touch, overlap, or sit
#'   on the edge of a reference range; no empty or negative padding row is
#'   produced in those cases.
#' @return A data frame named like the first three columns of `df2`, holding
#'   the rows of `df2` plus padding rows labelled `<prefix>1`, `<prefix>2`,
#'   ..., sorted by start then end.
all_ranges <- function(df1, df2) {
  cols <- names(df2)[1:3]
  feats <- df2[order(df2[[1]], df2[[2]]), 1:3, drop = FALSE]
  # Furthest end reached so far, so a nested or overlapping feature cannot
  # re-open a gap that an earlier, longer feature already covers.
  reach <- cummax(feats[[2]])

  gaps <- lapply(seq_len(nrow(df1)), function(i) {
    lo <- df1[[1]][i]
    hi <- df1[[2]][i]
    # Candidate gaps: before the first feature, between features, after the
    # last one -- clipped to the reference range.
    from <- pmax(c(lo, reach + 1), lo)
    to <- pmin(c(feats[[1]] - 1, hi), hi)
    keep <- from <= to
    list(
      from = from[keep],
      to = to[keep],
      label = rep(as.character(df1[[3]][i]), sum(keep))
    )
  })
  gap_from <- as.numeric(unlist(lapply(gaps, `[[`, "from")))
  gap_to <- as.numeric(unlist(lapply(gaps, `[[`, "to")))
  gap_label <- as.character(unlist(lapply(gaps, `[[`, "label")))

  # Number the padding rows in positional order.
  ord <- order(gap_from)
  pad <- data.frame(
    gap_from[ord],
    gap_to[ord],
    paste0(gap_label[ord], seq_along(ord))
  )
  names(pad) <- cols

  out <- rbind(feats, pad)
  out <- out[order(out[[1]], out[[2]]), , drop = FALSE]
  rownames(out) <- NULL
  out
}
# Reference range and the features to pad around.
data_range <- data.frame(start = 0, end = 100, value = "P")
tofill_range <- data.frame(
  start = c(15, 51, 70),
  end = c(39, 62, 79),
  value = c("A", "B", "C")
)
all_ranges(data_range, tofill_range)
#> start end value
#> 1 0 14 P1
#> 2 15 39 A
#> 3 40 50 P2
#> 4 51 62 B
#> 5 63 69 P3
#> 6 70 79 C
#> 7 80 100 P4
创建于 2023-02-23,使用 reprex v2.0.2
<details>
<summary>英文:</summary>
In base R:
``` r
#' Pad labelled ranges so that each reference range is fully covered (base R)
#'
#' Columns are used by position: 1 = start, 2 = end, 3 = label.
#'
#' @param df1 Data frame of reference range(s); the label is the prefix for
#'   padding rows.
#' @param df2 Data frame of occupied ranges. They may touch, overlap, or sit
#'   on the edge of a reference range; no empty or negative padding row is
#'   produced in those cases.
#' @return A data frame named like the first three columns of `df2`, holding
#'   the rows of `df2` plus padding rows labelled `<prefix>1`, `<prefix>2`,
#'   ..., sorted by start then end.
all_ranges <- function(df1, df2) {
  cols <- names(df2)[1:3]
  feats <- df2[order(df2[[1]], df2[[2]]), 1:3, drop = FALSE]
  # Furthest end reached so far, so a nested or overlapping feature cannot
  # re-open a gap that an earlier, longer feature already covers.
  reach <- cummax(feats[[2]])

  gaps <- lapply(seq_len(nrow(df1)), function(i) {
    lo <- df1[[1]][i]
    hi <- df1[[2]][i]
    # Candidate gaps: before the first feature, between features, after the
    # last one -- clipped to the reference range.
    from <- pmax(c(lo, reach + 1), lo)
    to <- pmin(c(feats[[1]] - 1, hi), hi)
    keep <- from <= to
    list(
      from = from[keep],
      to = to[keep],
      label = rep(as.character(df1[[3]][i]), sum(keep))
    )
  })
  gap_from <- as.numeric(unlist(lapply(gaps, `[[`, "from")))
  gap_to <- as.numeric(unlist(lapply(gaps, `[[`, "to")))
  gap_label <- as.character(unlist(lapply(gaps, `[[`, "label")))

  # Number the padding rows in positional order.
  ord <- order(gap_from)
  pad <- data.frame(
    gap_from[ord],
    gap_to[ord],
    paste0(gap_label[ord], seq_along(ord))
  )
  names(pad) <- cols

  out <- rbind(feats, pad)
  out <- out[order(out[[1]], out[[2]]), , drop = FALSE]
  rownames(out) <- NULL
  out
}
# Example input: one reference range and three features inside it.
data_range <- data.frame(start = 0, end = 100, value = "P")
tofill_range <- data.frame(
  start = c(15, 51, 70),
  end = c(39, 62, 79),
  value = c("A", "B", "C")
)
all_ranges(data_range, tofill_range)
#> start end value
#> 1 0 14 P1
#> 2 15 39 A
#> 3 40 50 P2
#> 4 51 62 B
#> 5 63 69 P3
#> 6 70 79 C
#> 7 80 100 P4
<sup>Created on 2023-02-23 with reprex v2.0.2</sup>
答案4
得分: 0
一个非常适合这个任务的包是“IRanges”:
library(IRanges)
# Reference interval "P" and the features already placed inside it.
r1 <- IRanges(start = 0, end = 100, names = "P")
r2 <- IRanges(
  start = c(10, 50),
  end = c(25, 63),
  names = c("A", "B")
)
# Find the gaps: the parts of r1 not covered by r2, named P1, P2, ...
dif <- setdiff(r1, r2)
names(dif) <- sprintf("%s%d", names(r1), seq_len(length(dif)))
# Merge features and gaps, then sort.
ans <- sort(c(r2, dif))
as.data.frame(ans)
# start end width names
#1 0 9 10 P1
#2 10 25 16 A
#3 26 49 24 P2
#4 50 63 14 B
#5 64 100 37 P3
英文:
A very suitable package for this task is "IRanges":
library(IRanges)
# The full interval to cover, and the features that occupy part of it.
r1 <- IRanges(start = 0, end = 100, names = "P")
r2 <- IRanges(
  start = c(10, 50),
  end = c(25, 63),
  names = c("A", "B")
)
# find gaps: what is left of r1 after removing r2, named after r1 plus an index
dif <- setdiff(r1, r2)
names(dif) <- sprintf("%s%d", names(r1), seq_len(length(dif)))
# merge and sort
ans <- sort(c(r2, dif))
as.data.frame(ans)
# start end width names
#1 0 9 10 P1
#2 10 25 16 A
#3 26 49 24 P2
#4 50 63 14 B
#5 64 100 37 P3
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论