Union and intersection of intervals
Asked Answered
C

5

9

I have a group of intervals for different ids. For example:

df <- data.frame(id=c(rep("a",4),rep("b",2),rep("c",3)), start=c(100,250,400,600,150,610,275,600,700), end=c(200,300,550,650,275,640,325,675,725))

The intervals of each id do not overlap but the intervals of the different ids may overlap. Here is a picture:

plot(range(df[,c(2,3)]),c(1,nrow(df)),type="n",xlab="",ylab="",yaxt="n")
for ( ii in 1:nrow(df) ) lines(c(df[ii,2],df[ii,3]),rep(nrow(df)-ii+1,2),col=as.numeric(df$id[ii]),lwd=2)
legend("bottomleft",lwd=2,col=seq_along(levels(df$id)),legend=levels(df$id))

intervals What I'm looking for is for two functions: 1. A function which will take the union of these intervals. For the example above, it will return this data.frame:

union.df <- data.frame(id=rep("a,b,c",4), start=c(100,400,600,700), end=c(325,550,675,725))
  1. A function which will intersect these intervals, only keeping a range if all the ids overlap for that range. For the example above, it will return this data.frame:

intersection.df <- data.frame(id="a,b,c", start=610, end=640)

Cuirassier answered 6/5, 2015 at 14:30 Comment(3)
try ?intersect and ?unionStung
intersect and union won't work - they work on discrete sets, not intervals.Ladyship
Could you clarify how you get "the union of these intervals and their intersection", and how this plays with your ids? The intersection of all intervals will be empty, given that you have multiple non-overlapping intervals already within one individual. Similarly, I don't understand where the union comes from.Ladyship
B
3

This is a bit awkward, but the idea is that you unroll the data into a series of opening and closing events. Then you track how many intervals are open at a time. This assume each group doesn't have any overlapping intervals.

df <- data.frame(id=c(rep("a",4),rep("b",2),rep("c",3)), start=c(100,250,400,600,150,610,275,600,700), end=c(200,300,550,650,275,640,325,675,725))


sets<-function(start, end, group, overlap=length(unique(group))) {
    dd<-rbind(data.frame(pos=start, event=1), data.frame(pos=end, event=-1))
    dd<-aggregate(event~pos, dd, sum)
    dd<-dd[order(dd$pos),]
    dd$open <- cumsum(dd$event)
    r<-rle(dd$open>=overlap)
    ex<-cumsum(r$lengths-1 + rep(1, length(r$lengths))) 
    sx<-ex-r$lengths+1
    cbind(dd$pos[sx[r$values]],dd$pos[ex[r$values]+1])

} 

#union
with(df, sets(start, end, id,1))
#     [,1] [,2]
# [1,]  100  325
# [2,]  400  550
# [3,]  600  675
# [4,]  700  725

#overlap
with(df, sets(start, end, id,3))
#      [,1] [,2]
# [1,]  610  640
Brannen answered 6/5, 2015 at 15:4 Comment(0)
S
6

The intervals package solves the union part of the question:

require(intervals)
idf <- Intervals(df[,2:3])
as.data.frame(interval_union(idf))

And for the intersect part, depending on how the intervals are defined:

idl <- lapply(unique(df$id),function(x){var <- as(Intervals(df[df$id==x,2:3]),"Intervals_full");closed(var)[,1]<- FALSE;return(var)})
idt <- idl[[1]]
for(i in idl)idt <- interval_intersection(idt,i)
res <- as.data.frame(idt) 
res
   V1  V2
1 610 640
Stenopetalous answered 6/5, 2015 at 15:11 Comment(1)
Just edited the answer to cope with 2. part. One could go with the default closed intervals and delete rows in the result that have identical entries (would be 275 275 in this case). All depends on if the intervals are open or closed.Stenopetalous
B
3

This is a bit awkward, but the idea is that you unroll the data into a series of opening and closing events. Then you track how many intervals are open at a time. This assume each group doesn't have any overlapping intervals.

df <- data.frame(id=c(rep("a",4),rep("b",2),rep("c",3)), start=c(100,250,400,600,150,610,275,600,700), end=c(200,300,550,650,275,640,325,675,725))


sets<-function(start, end, group, overlap=length(unique(group))) {
    dd<-rbind(data.frame(pos=start, event=1), data.frame(pos=end, event=-1))
    dd<-aggregate(event~pos, dd, sum)
    dd<-dd[order(dd$pos),]
    dd$open <- cumsum(dd$event)
    r<-rle(dd$open>=overlap)
    ex<-cumsum(r$lengths-1 + rep(1, length(r$lengths))) 
    sx<-ex-r$lengths+1
    cbind(dd$pos[sx[r$values]],dd$pos[ex[r$values]+1])

} 

#union
with(df, sets(start, end, id,1))
#     [,1] [,2]
# [1,]  100  325
# [2,]  400  550
# [3,]  600  675
# [4,]  700  725

#overlap
with(df, sets(start, end, id,3))
#      [,1] [,2]
# [1,]  610  640
Brannen answered 6/5, 2015 at 15:4 Comment(0)
E
2

For the intersection, I would start by counting the number of intervals you're in at each range (the beginning of the range is labeled with ord.dirs$x in this code and the number of intervals in the range is ord.dirs$z):

dirs <- data.frame(x=c(df$start, df$end), y=rep(c(1, -1), each=nrow(df)))
ord.dirs <- dirs[order(dirs$x),]
ord.dirs$z <- cumsum(ord.dirs$y)
ord.dirs <- ord.dirs[!duplicated(ord.dirs$x, fromLast=T),]
ord.dirs
#      x  y z
# 1  100  1 1
# 5  150  1 2
# 10 200 -1 1
# 2  250  1 2
# 14 275 -1 2
# 11 300 -1 1
# 16 325 -1 0
# 3  400  1 1
# 12 550 -1 0
# 8  600  1 2
# 6  610  1 3
# 15 640 -1 2
# 13 650 -1 1
# 17 675 -1 0
# 9  700  1 1
# 18 725 -1 0

Now you just need to grab the ranges where you have the correct number of intervals (3 in this case):

pos.all <- which(ord.dirs$z == length(unique(df$id)))
data.frame(start=ord.dirs$x[pos.all], end=ord.dirs$x[pos.all+1])
#   start end
# 1   610 640

You can similarly use ord.dirs to grab the union of the sets:

zero.pos <- which(ord.dirs$z == 0)
data.frame(start=c(ord.dirs$x[1], ord.dirs$x[head(zero.pos, -1)+1]),
           end=ord.dirs$x[zero.pos])
#   start end
# 1   100 325
# 2   400 550
# 3   600 675
# 4   700 725
Euraeurasia answered 6/5, 2015 at 14:53 Comment(0)
M
2

The GenomicRanges package provide some intersect and overlap funtions:

library(GenomicRanges)
source("http://bioconductor.org/biocLite.R")
biocLite("Gviz")    
library(Gviz)

make a Grange object with equal seqnames (this is important)

df <- data.frame(id=c(rep("a",4),rep("b",2),rep("c",3)),     start=c(100,250,400,600,150,610,275,600,700), end=c(200,300,550,650,275,640,325,675,725))
gr <- GRanges(seqnames = rep(1,nrow(df)),IRanges(start = df$start,end =      df$end))

Now you can plot the ranges with the Gviz package, as well.

d0 <- GenomeAxisTrack()
d1 <- AnnotationTrack(gr,group = df$id,fill=df$id)
plotTracks(c(d0,d1))

The union is done via reduce where intervals are collapsed

as.data.frame(reduce(gr))[,2:3]

the intersect is done via findoverlaps. Afterwards, filterd by ranges which overlaps 3 ranges.

OL <- as.data.frame(findOverlaps(gr,type="within"))
table(OL[,1])

df[as.numeric(names(which(table(OL[,1])==3))),]
Malformation answered 6/5, 2015 at 15:59 Comment(0)
K
1

Using ivs with iv_groups() for the self union and a reduce()d iv_set_intersect() (in ivs 0.2.0, or iv_intersect() in 0.1.0) for the intersection:

library(ivs)
library(dplyr, warn.conflicts = FALSE)
library(purrr)
library(tidyr)

df <- tibble(
  id=c(rep("a",4),rep("b",2),rep("c",3)), 
  start=c(100,250,400,600,150,610,275,600,700), 
  end=c(200,300,550,650,275,640,325,675,725)
)

df <- df %>%
  mutate(range = iv(start, end), .keep = "unused")

df
#> # A tibble: 9 × 2
#>   id         range
#>   <chr>  <iv<dbl>>
#> 1 a     [100, 200)
#> 2 a     [250, 300)
#> 3 a     [400, 550)
#> 4 a     [600, 650)
#> 5 b     [150, 275)
#> 6 b     [610, 640)
#> 7 c     [275, 325)
#> 8 c     [600, 675)
#> 9 c     [700, 725)

# Union:
# Merge all overlapping ranges
df %>% 
  reframe(range = iv_groups(range))
#> # A tibble: 4 × 1
#>        range
#>    <iv<dbl>>
#> 1 [100, 325)
#> 2 [400, 550)
#> 3 [600, 675)
#> 4 [700, 725)

# Intersection:
# Chop the ranges by `id`
df_chopped <- df %>%
  chop(range)

# Reduce an interval set intersection operation over each pair of interval vectors
df_chopped %>%
  reframe(range = reduce(range, iv_set_intersect))
#> # A tibble: 1 × 1
#>        range
#>    <iv<dbl>>
#> 1 [610, 640)
Kick answered 6/3, 2023 at 21:9 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.