英文:
Find overlapping ranges between two data frames after grouping in R
问题
我有两个类似这样的大型数据框:
# Example input: one row per genomic range, keyed by chromosome.
# tibble() is called through its namespace so the snippet also runs in a
# session where tibble/tidyverse has not been attached.
df1 <- tibble::tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start = c(100, 200, 300, 100, 200, 300),
  end = c(150, 250, 350, 120, 220, 320)
)
# Second set of ranges; its coordinates live in start2/end2.
df2 <- tibble::tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start2 = c(100, 50, 280, 100, 10, 200),
  end2 = c(125, 100, 320, 115, 15, 350)
)
df1
#> # A tibble: 6 × 3
#> chrom start end
#> <dbl> <dbl> <dbl>
#> 1 1 100 150
#> 2 1 200 250
#> 3 1 300 350
#> 4 2 100 120
#> 5 2 200 220
#> 6 2 300 320
df2
#> # A tibble: 6 × 3
#> chrom start2 end2
#> <dbl> <dbl> <dbl>
#> 1 1 100 125
#> 2 1 50 100
#> 3 1 280 320
#> 4 2 100 115
#> 5 2 10 15
#> 6 2 200 350
我想要找到df2的范围[start2-end2]与df1的范围[start-end]重叠的部分。理想的输出可能类似于以下内容,但不一定需要。主要是我想要重叠范围的坐标。
#> # A tibble: 6 × 8
#> chrom start end start2 end2 overlap overlap_start overlap_end
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#> 1 1 100 150 100 125 yes 100 125
#> 2 1 200 250 50 100 no NA NA
#> 3 1 300 350 280 320 yes 300 320
#> 4 2 100 120 100 115 yes 100 115
#> 5 2 200 220 10 15 no NA NA
#> 6 2 300 320 200 350 yes 200,220 300,320
请注意,在最后一行中,范围200-350已与df1中的两个范围[200-220,300-320]重叠。
英文:
I have two large data frames that look like this:
# Example input: one row per genomic range, keyed by chromosome.
# tibble() is called through its namespace so the snippet also runs in a
# session where tibble/tidyverse has not been attached.
df1 <- tibble::tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start = c(100, 200, 300, 100, 200, 300),
  end = c(150, 250, 350, 120, 220, 320)
)
# Second set of ranges; its coordinates live in start2/end2.
df2 <- tibble::tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start2 = c(100, 50, 280, 100, 10, 200),
  end2 = c(125, 100, 320, 115, 15, 350)
)
df1
#> # A tibble: 6 × 3
#> chrom start end
#> <dbl> <dbl> <dbl>
#> 1 1 100 150
#> 2 1 200 250
#> 3 1 300 350
#> 4 2 100 120
#> 5 2 200 220
#> 6 2 300 320
df2
#> # A tibble: 6 × 3
#> chrom start2 end2
#> <dbl> <dbl> <dbl>
#> 1 1 100 125
#> 2 1 50 100
#> 3 1 280 320
#> 4 2 100 115
#> 5 2 10 15
#> 6 2 200 350
<sup>Created on 2023-01-09 with reprex v2.0.2</sup>
I want to find which range[start2-end2] of df2 overlaps with the range[start-end] of df1.
An ideal output would be something like this, but it's not necessary. Mostly I want the coordinates of the overlapping ranges.
#> # A tibble: 6 × 8
#> chrom start end start2 end2 overlap overlap_start overlap_end
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#> 1 1 100 150 100 125 yes 100 125
#> 2 1 200 250 50 100 no <NA> <NA>
#> 3 1 300 350 280 320 yes 300 320
#> 4 2 100 120 100 115 yes 100 115
#> 5 2 200 220 10 15 no <NA> <NA>
#> 6 2 300 320 200 350 yes 200,220 300,320
<sup>Created on 2023-01-09 with reprex v2.0.2</sup>
Note that on the last line, the df2 range 200-350 overlaps two ranges from df1 (200-220 and 300-320).
答案1
得分: 2
I believe you are looking for something like this?
我相信你在寻找类似这样的内容:
I see no need to summarize here, so you'll get two results for the df2-range 200-350.
我认为这里没有必要做汇总(summarise),因此对于df2的范围200-350,你会得到两行结果。
library(data.table)

# Convert df1 and df2 to data.table in place.
setDT(df1); setDT(df2)

# Non-equi join: for every df2 range (i), look up the df1 ranges (x) on the
# same chromosome with start < end2 and end > start2. The inequalities are
# strict, so ranges that merely touch at one coordinate (df2 50-100 vs
# df1 100-150) are not matched. The x./i. prefixes select the df1/df2 columns
# explicitly. nomatch = NA keeps df2 ranges that overlap nothing and fills the
# df1 columns with NA.
ans <- df1[df2,
           .(chrom,
             start = x.start, end = x.end,
             start2 = i.start2, end2 = i.end2),
           on = .(chrom, start < end2, end > start2),
           nomatch = NA]

# The overlap runs from the larger of the two starts to the smaller of the two
# ends. pmax()/pmin() are vectorised base functions and propagate NA for the
# unmatched rows, so no matrix conversion or extra package is needed.
ans[, overlap_start := pmax(start, start2)]
ans[, overlap_end := pmin(end, end2)]
#    chrom start end start2 end2 overlap_start overlap_end
# 1:     1   100 150    100  125           100         125
# 2:     1    NA  NA     50  100            NA          NA
# 3:     1   300 350    280  320           300         320
# 4:     2   100 120    100  115           100         115
# 5:     2    NA  NA     10   15            NA          NA
# 6:     2   200 220    200  350           200         220
# 7:     2   300 320    200  350           300         320
以上是翻译好的代码部分。
英文:
I believe you are looking for something like this?
I see no need to summarise here, so you'll get two results for the df2-range 200-350.
library(data.table)

# Convert df1 and df2 to data.table in place.
setDT(df1); setDT(df2)

# Non-equi join: for every df2 range (i), look up the df1 ranges (x) on the
# same chromosome with start < end2 and end > start2. The inequalities are
# strict, so ranges that merely touch at one coordinate (df2 50-100 vs
# df1 100-150) are not matched. The x./i. prefixes select the df1/df2 columns
# explicitly. nomatch = NA keeps df2 ranges that overlap nothing and fills the
# df1 columns with NA.
ans <- df1[df2,
           .(chrom,
             start = x.start, end = x.end,
             start2 = i.start2, end2 = i.end2),
           on = .(chrom, start < end2, end > start2),
           nomatch = NA]

# The overlap runs from the larger of the two starts to the smaller of the two
# ends. pmax()/pmin() are vectorised base functions and propagate NA for the
# unmatched rows, so no matrix conversion or extra package is needed.
ans[, overlap_start := pmax(start, start2)]
ans[, overlap_end := pmin(end, end2)]
#    chrom start end start2 end2 overlap_start overlap_end
# 1:     1   100 150    100  125           100         125
# 2:     1    NA  NA     50  100            NA          NA
# 3:     1   300 350    280  320           300         320
# 4:     2   100 120    100  115           100         115
# 5:     2    NA  NA     10   15            NA          NA
# 6:     2   200 220    200  350           200         220
# 7:     2   300 320    200  350           300         320
答案2
得分: 2
我的建议是使用Bioconductor包GenomicRanges
,它可以使用最优的数据结构来查找区间重叠。
library(GenomicRanges)

# tibble() is called through its namespace so the example does not depend on
# tibble/tidyverse having been attached earlier in the session.
df1 <- tibble::tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start = c(100, 200, 300, 100, 200, 300),
  end = c(150, 250, 350, 120, 220, 320)
)
df2 <- tibble::tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start2 = c(100, 50, 280, 100, 10, 200),
  end2 = c(125, 100, 320, 115, 15, 350)
)

# Build one GRanges object per table (df2 keeps its coordinates in
# start2/end2) and collect every overlapping (df1 range, df2 range) pair.
overlaps <- findOverlapPairs(
  makeGRangesFromDataFrame(df1),
  makeGRangesFromDataFrame(df2, start.field = "start2", end.field = "end2")
)
overlaps
#> Pairs object with 6 pairs and 0 metadata columns:
#>           first    second
#>       <GRanges> <GRanges>
#>   [1] 1:100-150  1:50-100
#>   [2] 1:100-150 1:100-125
#>   [3] 1:300-350 1:280-320
#>   [4] 2:100-120 2:100-115
#>   [5] 2:200-220 2:200-350
#>   [6] 2:300-320 2:200-350

# Turn both halves of each pair into data frames and bind them side by side.
# The native pipe is `|>`; a single `|` here would be the element-wise OR
# operator and the snippet would fail.
mapply(as.data.frame,
       list(S4Vectors::first(overlaps),
            S4Vectors::second(overlaps)),
       SIMPLIFY = FALSE) |>
  do.call(what = `cbind`)
#>   seqnames start end width strand seqnames start end width strand
#> 1        1   100 150    51      *        1    50 100    51      *
#> 2        1   100 150    51      *        1   100 125    26      *
#> 3        1   300 350    51      *        1   280 320    41      *
#> 4        2   100 120    21      *        2   100 115    16      *
#> 5        2   200 220    21      *        2   200 350   151      *
#> 6        2   300 320    21      *        2   200 350   151      *
英文:
My advice is to use the Bioconductor package GenomicRanges
, which can use optimal data structures for finding interval overlaps.
library(GenomicRanges)

# tibble() is called through its namespace so the example does not depend on
# tibble/tidyverse having been attached earlier in the session.
df1 <- tibble::tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start = c(100, 200, 300, 100, 200, 300),
  end = c(150, 250, 350, 120, 220, 320)
)
df2 <- tibble::tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start2 = c(100, 50, 280, 100, 10, 200),
  end2 = c(125, 100, 320, 115, 15, 350)
)

# Build one GRanges object per table (df2 keeps its coordinates in
# start2/end2) and collect every overlapping (df1 range, df2 range) pair.
overlaps <- findOverlapPairs(
  makeGRangesFromDataFrame(df1),
  makeGRangesFromDataFrame(df2, start.field = "start2", end.field = "end2")
)
overlaps
#> Pairs object with 6 pairs and 0 metadata columns:
#>           first    second
#>       <GRanges> <GRanges>
#>   [1] 1:100-150  1:50-100
#>   [2] 1:100-150 1:100-125
#>   [3] 1:300-350 1:280-320
#>   [4] 2:100-120 2:100-115
#>   [5] 2:200-220 2:200-350
#>   [6] 2:300-320 2:200-350

# Turn both halves of each pair into data frames and bind them side by side.
mapply(as.data.frame,
       list(S4Vectors::first(overlaps),
            S4Vectors::second(overlaps)),
       SIMPLIFY = FALSE) |>
  do.call(what = `cbind`)
#>   seqnames start end width strand seqnames start end width strand
#> 1        1   100 150    51      *        1    50 100    51      *
#> 2        1   100 150    51      *        1   100 125    26      *
#> 3        1   300 350    51      *        1   280 320    41      *
#> 4        2   100 120    21      *        2   100 115    16      *
#> 5        2   200 220    21      *        2   200 350   151      *
#> 6        2   300 320    21      *        2   200 350   151      *
答案3
得分: 0
以下是翻译好的部分:
一个更长的“整洁风格”(tidy-style)版本:
# Tidyverse pipeline: pair the df1 and df2 ranges per chromosome, flag which
# pairs overlap, and collect the overlap coordinates per df2 range.
library(dplyr)
df1 |>
# Pair every df1 range with every df2 range on the same chromosome.
left_join(df2, by = 'chrom') |>
# Evaluate the mutate() below one row at a time, so start:end expands a
# single range.
rowwise() |>
mutate(range1 = list(start:end),
range2 = list(start2:end2),
# Positions shared by the two ranges (empty when they do not overlap).
# NOTE(review): this column shares its name with the intersect() function.
intersect = list(intersect(start:end, start2:end2)),
# sign(length(.)) is 0 or 1, which indexes 'no' or 'yes'.
overlap = c('no', 'yes')[1 + sign(length(intersect))],
# Lowest / highest shared position, or NA when nothing is shared.
overlap_start = ifelse(length(intersect), min(intersect), NA),
overlap_end = ifelse(length(intersect), max(intersect), NA),
) |>
# One group per df2 range.
# NOTE(review): chrom is not part of the key, so identical start2/end2 on
# different chromosomes would share a group - confirm this is intended.
group_by(paste(start2, end2)) |>
# Keep every paired row, but replace the overlap coordinates with the
# comma-separated non-NA values of the whole group.
summarise(across(chrom : end2),
overlap,
across(starts_with('overlap_'),
~ paste(na.omit(.x), collapse = ','))
) |>
ungroup() |>
# Drop the helper grouping column.
select(chrom:overlap_end)
# A tibble: 18 x 8
chrom start end start2 end2 overlap overlap_start overlap_end
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 2 100 120 10 15 no "" ""
2 2 200 220 10 15 no "" ""
3 2 300 320 10 15 no "" ""
4 2 100 120 100 115 yes "100" "115"
5 2 200 220 100 115 no "100" "115"
6 2 300 320 100 115 no "100" "115"
7 1 100 150 100 125 yes "100" "125"
8 1 200 250 100 125 no "100" "125"
9 1 300 350 100 125 no "100" "125"
10 2 100 120 200 350 no "200,300" "220,320"
# ...
要获取数值向量而不是多个重叠的逗号分隔字符串,请使用以下代码片段进行总结:
## ...
across(starts_with('overlap_'),
~ list(c(na.omit(.x)))
)
英文:
A lengthier "tidy-style" version:
# Tidyverse pipeline: pair the df1 and df2 ranges per chromosome, flag which
# pairs overlap, and collect the overlap coordinates per df2 range.
library(dplyr)
df1 |>
# Pair every df1 range with every df2 range on the same chromosome.
left_join(df2, by = 'chrom') |>
# Evaluate the mutate() below one row at a time, so start:end expands a
# single range.
rowwise() |>
mutate(range1 = list(start:end),
range2 = list(start2:end2),
# Positions shared by the two ranges (empty when they do not overlap).
# NOTE(review): this column shares its name with the intersect() function.
intersect = list(intersect(start:end, start2:end2)),
# sign(length(.)) is 0 or 1, which indexes 'no' or 'yes'.
overlap = c('no', 'yes')[1 + sign(length(intersect))],
# Lowest / highest shared position, or NA when nothing is shared.
overlap_start = ifelse(length(intersect), min(intersect), NA),
overlap_end = ifelse(length(intersect), max(intersect), NA),
) |>
# One group per df2 range.
# NOTE(review): chrom is not part of the key, so identical start2/end2 on
# different chromosomes would share a group - confirm this is intended.
group_by(paste(start2, end2)) |>
# Keep every paired row, but replace the overlap coordinates with the
# comma-separated non-NA values of the whole group.
summarise(across(chrom : end2),
overlap,
across(starts_with('overlap_'),
~ paste(na.omit(.x), collapse = ','))
) |>
ungroup() |>
# Drop the helper grouping column.
select(chrom:overlap_end)
# A tibble: 18 x 8
chrom start end start2 end2 overlap overlap_start overlap_end
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 2 100 120 10 15 no "" ""
2 2 200 220 10 15 no "" ""
3 2 300 320 10 15 no "" ""
4 2 100 120 100 115 yes "100" "115"
5 2 200 220 100 115 no "100" "115"
6 2 300 320 100 115 no "100" "115"
7 1 100 150 100 125 yes "100" "125"
8 1 200 250 100 125 no "100" "125"
9 1 300 350 100 125 no "100" "125"
10 2 100 120 200 350 no "200,300" "220,320"
# ...
To obtain numeric vectors instead of comma-separated strings for multiple overlaps, summarise with the following fragment instead:
## ...
across(starts_with('overlap_'),
~ list(c(na.omit(.x)))
)
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论