Here's a simplified and faster version of the fun3
method from the accepted answer (rep.int
is faster than rep
, 1:rows
is faster than sequence(rows)
, (rows-1):1
was slightly faster than rev(z[-rows])
, and use.names=F
is not needed):
rows=5
z=1:rows
cbind(unlist(lapply(z[-1],function(x)x:rows)),rep(z[-rows],(rows-1):1))
Or if you don't save the sequence in a variable, then the code becomes a bit slower but easier to understand:
cbind(unlist(lapply(2:rows,function(x)x:rows)),rep(1:(rows-1),(rows-1):1))
1:n
is faster than sequence(n)
and seq.int(n)
:
> n=1e6;microbenchmark(sequence(n),1:n,seq(n),seq.int(n))
Unit: nanoseconds
expr min lq mean median uq max neval
sequence(n) 420147 1397684.5 1295462.89 1428241.5 1466248.0 2016984 100
1:n 129 160.0 561.22 242.5 617.0 4028 100
seq(n) 3596 4045.5 7538.04 5081.5 11194.5 23921 100
seq.int(n) 235 289.5 1001.95 502.5 1737.0 5265 100
rep.int
is faster than rep
:
> z1=1:100;z2=100:1;microbenchmark(times=1000,rep.int(z1,z2),rep(z1,z2))
Unit: microseconds
expr min lq mean median uq max neval
rep.int(z1, z2) 28.058 28.351 28.76671 28.490 28.9725 69.061 1000
rep(z1, z2) 29.553 29.849 30.29978 29.963 30.4945 85.321 1000
This is an easy but slow way to generate the indexes of the lower triangle:
combn(rows,2)
This procedural method is also slow:
o=matrix(,(rows^2-rows)/2,2)
n=1;for(i in 1:(rows-1))for(j in(i+1):rows){o[n,]=c(j,i);n=n+1}
o
But the Rcpp version of the procedural method is much faster:
Rcpp::cppFunction('NumericMatrix pairij_cpp(int n){
NumericMatrix out(n*(n-1)/2,2);
int row=0;
for(int i=1;i<=n-1;i++)for(int j=i+1;j<=n;j++){out(row,0)=j;out(row++,1)=i;}
return out;
}')
This shows the median time in ms for each number of rows:
10 100 1000
0.01682 0.2700 24.32 fun1
0.01623 0.2380 20.01 fun2
0.02830 0.1717 5.69 fun3
0.01456 0.1440 9.75 fun4
0.01506 0.1440 5.31 fun3_simplified
0.01436 0.1622 7.35 fun3_simplified_no_variable
0.05219 2.9143 296.11 comb
0.05249 4.9847 520.86 procedural_matrix
0.06642 6.5620 682.25 procedural_vector
0.00360 0.0461 2.80 pairij_cpp(rows)
Benchmark code:
Rcpp::cppFunction('NumericMatrix pairij_cpp(int n){
NumericMatrix out(n*(n-1)/2,2);
int row=0;
for(int i=1;i<=n-1;i++)for(int j=i+1;j<=n;j++){out(row,0)=j;out(row++,1)=i;}
return out;
}')
size=c(10,100,1000)
r=sapply(size,function(rows){
m=matrix(,rows,rows)
b=microbenchmark(times=100,
fun1={which(lower.tri(m)==T,arr.ind=T)},
fun2={which(lower.tri(m),arr.ind=T)},
fun3={z=sequence(rows);cbind(row=unlist(lapply(2:rows,function(x)x:rows),use.names=F),col=rep(z[-length(z)],times=rev(tail(z,-1))-1))},
fun4={row=rev(abs(sequence(seq.int(rows-1))-rows)+1);col=rep.int(seq.int(rows-1),rev(seq.int(rows-1)));cbind(row,col)},
fun3_simplified={z=1:rows;cbind(unlist(lapply(z[-1],function(x)x:rows)),rep.int(z[-rows],(rows-1):1))},
fun3_simplified_no_variable={cbind(unlist(lapply(2:rows,function(x)x:rows)),rep.int(1:(rows-1),(rows-1):1))},
comb={c=t(combn(rows,2));c[,2:1]},
procedural_matrix={o=matrix(,(rows^2-rows)/2,2);n=1;for(i in 1:(rows-1))for(j in(i+1):rows){o[n,]=c(j,i);n=n+1};o},
procedural_vector={o=integer(rows^2-rows);n=1;for(i in 1:(rows-1))for(j in(i+1):rows){o[n]=j;o[n+1]=i;n=n+2};matrix(o,,2,T)},
pairij_cpp(rows)
)
a=aggregate(b$time,list(b$expr),median)
setNames(a[,2]/1e6,a[,1])
})
r2=apply(r,2,function(x)formatC(x,max(0,3-ceiling(log10(min(x,na.rm=T)))),format="f"))
r3=apply(rbind(size,r2),2,function(x)formatC(x,max(nchar(x)),format="s"))
writeLines(apply(cbind(r3,c("",rownames(r))),1,paste,collapse=" "))
rows <- 1e+05
thenlapply
requires lots of memory to build the list, i.e. of orderO(n^2)
. Is there a way to rewritefun3()
so it can be called iteratively in a loop to return subsets of the index pairs, lets say 10K pairs a time? – Trammel