Below is a function, rbinvec2
that efficiently samples uniformly with replacement from the set of valid vectors of length m
. It is able to generate 100K valid samples of a vector of length 101 in less than 1 second.
When I have time, I plan on updating the answer to include a derivation of the underlying PMF.
The function
library(RcppAlgos) # for `permuteGeneral`
library(Rfast) # for `colShuffle`
rbinvec2 <- function(n, m) {
# handle small `m` values with `RcppAlgos`
if (m == 1) {
return(sample(0:1, n, 1))
} else if (m == 2) {
return(matrix(c(0:1, 1:0), 2, 2)[sample(1:2, n, 1),])
} else if (m == 3) {
return(permuteGeneral(0:1, 3, TRUE, NULL, 2, 7)[sample(6, n, 1),])
}
odd <- m%%2 == 1
kk <- (m - sqrt(m^2%%8))/2 # maximum number of doubles
k <- 0:kk
mk <- m - k
mk1 <- (mk + 1)%/%2
mk <- mk%/%2
m1 <- m - 1
m2 <- m%/%2
# calculate the exact (log) PMF of `k`, including the special cases when `m`
# is odd
p <- cbind(lchoose(mk1, k%/%2) + lchoose(mk, (k + 1)%/%2), -Inf)
i <- seq(1, kk + 1, 2)
if (odd) { # special cases
p[-i, 1] <- p[-i, 1] + log(2)
p[i, 2] <- lchoose(mk1[i], -1:(kk%/%2 - 1)) + lchoose(mk[i], 1:(kk%/%2 + 1))
}
# sample the PMF
k <- array(c(rmultinom(1, n, exp(p - max(p[1,])))), dim(p))
# k[-i,] <- rowSums(k[-i,])
s <- rowSums(k)
b <- vector("list", kk + 1) # initialize the list of compositions
# special case: no doubles
if (s[1]) b[[1]] <- rep(1, m*s[1])
s[1] <- 0L
if (odd) { # odd `m`
for (i in which(s != 0L)) {
b[[i]] <- if (i%%2L) {
# even number of doubles
j1 <- (m - i)/2 # 1 partition length
j0 <- j1 + 1 # 0 partition length
i2 <- (i - 1)/2
d0 <- j0 - i2
d1 <- j1 - i2
c(
if (k[i, 1]) { # balanced doubles--1 more 0 than 1
rbind(
# 0 composition
c(colShuffle(matrix(rep.int(1:2, c(d0, i2)), j0, k[i, 1]))),
# 1 composition
c(rbind(
colShuffle(matrix(rep.int(1:2, c(d1, i2)), j1, k[i, 1])),
0L
))
)
} else integer(0)
,
if (k[i, 2]) { # unbalanced doubles--1 more 1 than 0
rbind(
# 0 composition
c(colShuffle(matrix(rep.int(1:2, c(d0 + 1, i2 - 1)), j0, k[i, 2]))),
# 1 composition
c(rbind(
colShuffle(matrix(rep.int(1:2, c(d1 - 1, i2 + 1)), j1, k[i, 2])),
0L
))
)
} else integer(0)
)
} else {
# odd number of doubles
i2 <- i/2 - 1 # min number of (0,0)
j <- m2 - i2 # even/odd composition lengths
idx <- sample(1:2, s[i], 1) # extra 0, or extra 1?
bb <- cbind(rep.int(1:2, c(j - i2, i2)), # extra 0
rep.int(1:2, c(j - i2 - 1, i2 + 1))) # extra 1
c(
rbind(
c(colShuffle(matrix(bb[,idx], j))), # 0 compositions
c(colShuffle(matrix(bb[,3 - idx], j))) # 1 compositions
)
)
}
}
} else { # even `m`
for (i in which(s != 0L)) {
i2 <- (i - 1)%/%2 # number of (0,0)
j <- m2 - i2 # 0 composition length
b[[i]] <- c(
if (i%%2L) {
# even number of doubles--equivalent partition for 0 and 1
matrix(
colShuffle(matrix(rep.int(1:2, c(j - i2, i2)), j, 2*s[i])),
2, j*s[i], 1
)
} else { # odd number of doubles--1 more double 1 than double 0
rbind(
# 0-composition
c(colShuffle(matrix(rep.int(1:2, c(j - i2, i2)), j, s[i]))),
# 1 composition
c(rbind(
colShuffle(
matrix(rep.int(1:2, c(j - i2 - 2, i2 + 1)), j - 1, s[i])
), 0L
))
)
}
)
}
}
abs(
matrix(
# convert the compositions to 0's and 1's
rep.int(rep(0:1, length.out = sum(lengths(b))), unlist(b)), n, m, 1
# shuffle the n samples and perform a full 0 <-> 1 swap in random vectors
)[sample(n),] - sample(0:1, n, 1)
)
}
Example usage
rbinvec2(10, 8) # 10 samples of vector of length 8
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
#> [1,] 0 1 0 1 0 1 0 1
#> [2,] 0 1 0 1 0 0 1 1
#> [3,] 0 1 0 0 1 0 1 1
#> [4,] 0 0 1 1 0 1 1 0
#> [5,] 0 0 1 0 1 0 1 1
#> [6,] 1 0 1 0 1 1 0 0
#> [7,] 1 0 1 0 1 0 0 1
#> [8,] 1 0 1 0 1 0 1 0
#> [9,] 1 0 1 0 1 0 1 0
#> [10,] 1 0 0 1 0 0 1 1
rbinvec2(10, 9)# 10 samples of vector of length 9
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#> [1,] 0 0 1 1 0 0 1 0 1
#> [2,] 0 1 1 0 1 1 0 0 1
#> [3,] 1 0 0 1 0 1 1 0 0
#> [4,] 1 0 0 1 0 1 0 1 0
#> [5,] 1 1 0 0 1 1 0 0 1
#> [6,] 1 1 0 1 0 0 1 0 0
#> [7,] 0 1 0 1 0 0 1 0 1
#> [8,] 1 0 1 0 0 1 0 0 1
#> [9,] 1 1 0 0 1 0 1 1 0
#> [10,] 0 0 1 0 0 1 1 0 1
Timing
system.time(rbinvec2(1e6, 10)) # 1M samples of vector of length 10
#> user system elapsed
#> 1.52 0.12 1.75
system.time(rbinvec2(1e5, 101)) # 100K samples of vector of length 101
#> user system elapsed
#> 0.69 0.09 0.82
Verify correct behavior
library(data.table) # for checks
fcheck1 <- function(m) {
# returns the number of valid vectors of length `m`
# generate all balanced permutations
x <- if (m%%2) {
rbind(
permuteGeneral(0:1, m, TRUE, c((m + 1)%/%2, m%/%2)),
permuteGeneral(0:1, m, TRUE, c(m%/%2, (m + 1)%/%2))
)
} else {
permuteGeneral(0:1, m, TRUE, c(m%/%2, m%/%2))
}
# count the permutations with no more than two sequential repeated values
sum(!rowSums(x[,-1:-2] == x[,c(-1, -m)] & x[,c(-1, -m)] == x[,-m:(1 - m)]))
}
fcheck2 <- function(n, m) {
# return the unique samples along with their counts
setorder(as.data.table(rbinvec2(n, m))[,.(.N), eval(paste0("V", 1:m))])[]
}
fcheck1(6) # the number of valid vectors of length 6
#> [1] 14
fcheck2(1e6, 6)
#> V1 V2 V3 V4 V5 V6 N
#> <int> <int> <int> <int> <int> <int> <int>
#> 1: 0 0 1 0 1 1 71195
#> 2: 0 0 1 1 0 1 71249
#> 3: 0 1 0 0 1 1 71237
#> 4: 0 1 0 1 0 1 71588
#> 5: 0 1 0 1 1 0 71469
#> 6: 0 1 1 0 0 1 71482
#> 7: 0 1 1 0 1 0 71242
#> 8: 1 0 0 1 0 1 71368
#> 9: 1 0 0 1 1 0 71638
#> 10: 1 0 1 0 0 1 71606
#> 11: 1 0 1 0 1 0 71645
#> 12: 1 0 1 1 0 0 71867
#> 13: 1 1 0 0 1 0 71107
#> 14: 1 1 0 1 0 0 71307
fcheck1(7)
#> [1] 36 # the number of valid vectors of length 7
fcheck2(1e6, 7)
#> V1 V2 V3 V4 V5 V6 V7 N
#> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1: 0 0 1 0 0 1 1 27612
#> 2: 0 0 1 0 1 0 1 27728
#> 3: 0 0 1 0 1 1 0 27451
#> 4: 0 0 1 1 0 0 1 27707
#> 5: 0 0 1 1 0 1 0 27689
#> 6: 0 0 1 1 0 1 1 27460
#> 7: 0 1 0 0 1 0 1 27891
#> 8: 0 1 0 0 1 1 0 27562
#> 9: 0 1 0 1 0 0 1 27805
#> 10: 0 1 0 1 0 1 0 27947
#> 11: 0 1 0 1 0 1 1 27761
#> 12: 0 1 0 1 1 0 0 27893
#> 13: 0 1 0 1 1 0 1 28044
#> 14: 0 1 1 0 0 1 0 27899
#> 15: 0 1 1 0 0 1 1 27756
#> 16: 0 1 1 0 1 0 0 27477
#> 17: 0 1 1 0 1 0 1 27769
#> 18: 0 1 1 0 1 1 0 27756
#> 19: 1 0 0 1 0 0 1 27955
#> 20: 1 0 0 1 0 1 0 27726
#> 21: 1 0 0 1 0 1 1 27931
#> 22: 1 0 0 1 1 0 0 27896
#> 23: 1 0 0 1 1 0 1 27652
#> 24: 1 0 1 0 0 1 0 27720
#> 25: 1 0 1 0 0 1 1 28005
#> 26: 1 0 1 0 1 0 0 28072
#> 27: 1 0 1 0 1 0 1 27802
#> 28: 1 0 1 0 1 1 0 27579
#> 29: 1 0 1 1 0 0 1 27593
#> 30: 1 0 1 1 0 1 0 27846
#> 31: 1 1 0 0 1 0 0 27745
#> 32: 1 1 0 0 1 0 1 27760
#> 33: 1 1 0 0 1 1 0 27840
#> 34: 1 1 0 1 0 0 1 27704
#> 35: 1 1 0 1 0 1 0 27884
#> 36: 1 1 0 1 1 0 0 28083
#> V1 V2 V3 V4 V5 V6 V7 N
The distribution of each unique vector is consistent with uniform sampling on the full set of valid vectors for both cases.
n_L
andn_R
are not much bigger than 20 or 30, since draws that match yourmax_consec
criteria will become vanishingly unlikely for longer sequences. If you want to make larger sequences, this approach will stop making sense. – Briefsinitial_seq <- append(initial_seq, side)
is an anti-pattern called "growing an object in a loop". Every iteration through yourwhile
loop,initial_seq
gets longer, and this is an expensive operation because it may run out of space wherever it is stored on your disk and need to be copied somewhere else with enough room. It's much more efficient to "pre-allocate" a vector by creating it already at it's final length and then filling in values as you go. Read more in The R Inferno – Dharna