Add a progress bar to boot function in R
Asked Answered
S

6

6

I am trying to add a progress bar to a bootstrap function in R. I tried to make the example function as simple as possible (hence i'm using mean in this example).

library(boot)
v1 <- rnorm(1000)
rep_count = 1

m.boot <- function(data, indices) {
  d <- data[indices]
  setWinProgressBar(pb, rep_count)
  rep_count <- rep_count + 1
  Sys.sleep(0.01)
  mean(d, na.rm = T) 
  }

tot_rep <- 200
pb <- winProgressBar(title = "Bootstrap in progress", label = "",
                     min = 0, max = tot_rep, initial = 0, width = 300)
b <- boot(v1, m.boot, R = tot_rep)
close(pb)

The bootstrap functions properly, but the problem is that the value of rep_count does not increase in the loop and the progress bar stays frozen during the process.

If I check the value of rep_count after the bootstrap is complete, it is still 1.

What am i doing wrong? maybe the boot function does not simply insert the m.boot function in a loop and so the variables in it are not increased?

Thank you.

Siret answered 7/6, 2016 at 8:13 Comment(1)
The package pbapply is an easy way to show a progress bar for any task of applying a function using the apply family. github.com/psolymos/pbapply . If you can use your m.boot inside some form of apply, this is would be really simple.Hypognathous
G
2

The pbapply package was designed to work with vectorized functions. There are 2 ways to achieve that in the context of this question: (1) write a wrapper as was suggested, which will not produce the same object of class 'boot'; (2) alternatively, the line lapply(seq_len(RR), fn) can be written as pblapply(seq_len(RR), fn). Option 2 can happen either by locally copying/updating the boot function as shown in the example below, or asking the package maintainer, Brian Ripley, if he would consider adding a progress bar directly or through pbapply as dependency.

My solution (changes indicated by comments):

library(boot)
library(pbapply)
boot2 <- function (data, statistic, R, sim = "ordinary", stype = c("i", 
    "f", "w"), strata = rep(1, n), L = NULL, m = 0, weights = NULL, 
    ran.gen = function(d, p) d, mle = NULL, simple = FALSE, ..., 
    parallel = c("no", "multicore", "snow"), ncpus = getOption("boot.ncpus", 
        1L), cl = NULL) 
{
call <- match.call()
stype <- match.arg(stype)
if (missing(parallel)) 
    parallel <- getOption("boot.parallel", "no")
parallel <- match.arg(parallel)
have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) {
    if (parallel == "multicore") 
        have_mc <- .Platform$OS.type != "windows"
    else if (parallel == "snow") 
        have_snow <- TRUE
    if (!have_mc && !have_snow) 
        ncpus <- 1L
    loadNamespace("parallel")
}
if (simple && (sim != "ordinary" || stype != "i" || sum(m))) {
    warning("'simple=TRUE' is only valid for 'sim=\"ordinary\", stype=\"i\", n=0', so ignored")
    simple <- FALSE
}
if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) 
    runif(1)
seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
n <- NROW(data)
if ((n == 0) || is.null(n)) 
    stop("no data in call to 'boot'")
temp.str <- strata
strata <- tapply(seq_len(n), as.numeric(strata))
t0 <- if (sim != "parametric") {
    if ((sim == "antithetic") && is.null(L)) 
        L <- empinf(data = data, statistic = statistic, stype = stype, 
            strata = strata, ...)
    if (sim != "ordinary") 
        m <- 0
    else if (any(m < 0)) 
        stop("negative value of 'm' supplied")
    if ((length(m) != 1L) && (length(m) != length(table(strata)))) 
        stop("length of 'm' incompatible with 'strata'")
    if ((sim == "ordinary") || (sim == "balanced")) {
        if (isMatrix(weights) && (nrow(weights) != length(R))) 
            stop("dimensions of 'R' and 'weights' do not match")
    }
    else weights <- NULL
    if (!is.null(weights)) 
        weights <- t(apply(matrix(weights, n, length(R), 
            byrow = TRUE), 2L, normalize, strata))
    if (!simple) 
        i <- index.array(n, R, sim, strata, m, L, weights)
    original <- if (stype == "f") 
        rep(1, n)
    else if (stype == "w") {
        ns <- tabulate(strata)[strata]
        1/ns
    }
    else seq_len(n)
    t0 <- if (sum(m) > 0L) 
        statistic(data, original, rep(1, sum(m)), ...)
    else statistic(data, original, ...)
    rm(original)
    t0
}
else statistic(data, ...)
pred.i <- NULL
fn <- if (sim == "parametric") {
    ran.gen
    data
    mle
    function(r) {
        dd <- ran.gen(data, mle)
        statistic(dd, ...)
    }
}
else {
    if (!simple && ncol(i) > n) {
        pred.i <- as.matrix(i[, (n + 1L):ncol(i)])
        i <- i[, seq_len(n)]
    }
    if (stype %in% c("f", "w")) {
        f <- freq.array(i)
        rm(i)
        if (stype == "w") 
            f <- f/ns
        if (sum(m) == 0L) 
            function(r) statistic(data, f[r, ], ...)
        else function(r) statistic(data, f[r, ], pred.i[r, 
            ], ...)
    }
    else if (sum(m) > 0L) 
        function(r) statistic(data, i[r, ], pred.i[r, ], 
            ...)
    else if (simple) 
        function(r) statistic(data, index.array(n, 1, sim, 
            strata, m, L, weights), ...)
    else function(r) statistic(data, i[r, ], ...)
}
RR <- sum(R)
res <- if (ncpus > 1L && (have_mc || have_snow)) {
    if (have_mc) {
        parallel::mclapply(seq_len(RR), fn, mc.cores = ncpus)
    }
    else if (have_snow) {
        list(...)
        if (is.null(cl)) {
            cl <- parallel::makePSOCKcluster(rep("localhost", 
              ncpus))
            if (RNGkind()[1L] == "L'Ecuyer-CMRG") 
              parallel::clusterSetRNGStream(cl)
            res <- pblapply(seq_len(RR), fn, cl=cl)
            parallel::stopCluster(cl)
            res
        }
        else pblapply(seq_len(RR), fn, cl=cl)
    }
}
else pblapply(seq_len(RR), fn) #### changed !!!
t.star <- matrix(, RR, length(t0))
for (r in seq_len(RR)) t.star[r, ] <- res[[r]]
if (is.null(weights)) 
    weights <- 1/tabulate(strata)[strata]
boot.return(sim, t0, t.star, temp.str, R, data, statistic, 
    stype, call, seed, L, m, pred.i, weights, ran.gen, mle)
}
## Functions not exported by boot
isMatrix <- boot:::isMatrix
index.array <- boot:::index.array
boot.return <- boot:::boot.return
## Now the example
m.boot <- function(data, indices) {
  d <- data[indices]
  mean(d, na.rm = T) 
}
tot_rep <- 200
v1 <- rnorm(1000)
b <- boot2(v1, m.boot, R = tot_rep)
Graycegrayheaded answered 7/6, 2016 at 17:20 Comment(1)
This is a nice solution, but i don't think that the maintainer of the package is interested in adding any progress bar natively, as this will, inevitably, slow down the performance of the boot function. Nevertheless, this may be an elegant solution using a copy of the boot function, as you did!Siret
K
3

You could use the package progress as below:

library(boot)
library(progress)

v1 <- rnorm(1000)

#add progress bar as parameter to function
m.boot <- function(data, indices, prog) {
  
  #display progress with each run of the function
  prog$tick()
  
  d <- data[indices]
  Sys.sleep(0.01)
  mean(d, na.rm = T) 
  
}

tot_rep <- 200

#initialize progress bar object
pb <- progress_bar$new(total = tot_rep + 1) 

#perform bootstrap
boot(data = v1, statistic = m.boot, R = tot_rep, prog = pb)

I haven't quite figured out yet why it's necessary to set the number of iterations for progress_bar to be +1 the total bootstrap replicates (parameter R), but this is what was necessary in my own code, otherwise it throws an error. It seems like the bootstrap function is run one more time than you specify in parameter R, so if the progress bar is set to only run R times, it thinks the job is finished before it really is.

Kimmie answered 20/7, 2020 at 22:9 Comment(1)
The reason why one has to set total = R+1 is probably because boot() calculates R + 1 statistics - one for the actual data (t0) and R statistics t for the R bootstrap samples. Hence effectively, boot() is called R+1 times.Calcareous
C
2

The increased rep_count is a local variable and lost after each function call. In the next iteration the function gets rep_count from the global environment again, i.e., its value is 1.

You can use <<-:

rep_count <<- rep_count + 1

This assigns to the rep_count first found on the search path outside the function. Of course, using <<- is usually not recommended because side effects of functions should be avoided, but here you have a legitimate use case. However, you should probably wrap the whole thing in a function to avoid a side effect on the global environment.

There might be better solutions ...

Centigram answered 7/6, 2016 at 8:27 Comment(4)
I think this is the more correct way, from a programmer's point of view. But due to my scarce ability in programming, i think i will stick with the solution provided below. Thank you very much!Siret
You don't need advanced programming skills to change this one line of code.Centigram
You are right, I just like using someone's else functions, as i think that they are programmed better than mine. For example, being able to use the pbapply package adds the ability to change style and type of the progress bar just in seconds. I am trying to find a way to use pbapply with bootstrap. But maybe someone will find the solution before I do!Siret
If you want to use an apply type function, you can't use boot. You'd have to write your own bootstrapping (which wouldn't be hard and possibly be educational).Centigram
G
2

The pbapply package was designed to work with vectorized functions. There are 2 ways to achieve that in the context of this question: (1) write a wrapper as was suggested, which will not produce the same object of class 'boot'; (2) alternatively, the line lapply(seq_len(RR), fn) can be written as pblapply(seq_len(RR), fn). Option 2 can happen either by locally copying/updating the boot function as shown in the example below, or asking the package maintainer, Brian Ripley, if he would consider adding a progress bar directly or through pbapply as dependency.

My solution (changes indicated by comments):

library(boot)
library(pbapply)
boot2 <- function (data, statistic, R, sim = "ordinary", stype = c("i", 
    "f", "w"), strata = rep(1, n), L = NULL, m = 0, weights = NULL, 
    ran.gen = function(d, p) d, mle = NULL, simple = FALSE, ..., 
    parallel = c("no", "multicore", "snow"), ncpus = getOption("boot.ncpus", 
        1L), cl = NULL) 
{
call <- match.call()
stype <- match.arg(stype)
if (missing(parallel)) 
    parallel <- getOption("boot.parallel", "no")
parallel <- match.arg(parallel)
have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) {
    if (parallel == "multicore") 
        have_mc <- .Platform$OS.type != "windows"
    else if (parallel == "snow") 
        have_snow <- TRUE
    if (!have_mc && !have_snow) 
        ncpus <- 1L
    loadNamespace("parallel")
}
if (simple && (sim != "ordinary" || stype != "i" || sum(m))) {
    warning("'simple=TRUE' is only valid for 'sim=\"ordinary\", stype=\"i\", n=0', so ignored")
    simple <- FALSE
}
if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) 
    runif(1)
seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
n <- NROW(data)
if ((n == 0) || is.null(n)) 
    stop("no data in call to 'boot'")
temp.str <- strata
strata <- tapply(seq_len(n), as.numeric(strata))
t0 <- if (sim != "parametric") {
    if ((sim == "antithetic") && is.null(L)) 
        L <- empinf(data = data, statistic = statistic, stype = stype, 
            strata = strata, ...)
    if (sim != "ordinary") 
        m <- 0
    else if (any(m < 0)) 
        stop("negative value of 'm' supplied")
    if ((length(m) != 1L) && (length(m) != length(table(strata)))) 
        stop("length of 'm' incompatible with 'strata'")
    if ((sim == "ordinary") || (sim == "balanced")) {
        if (isMatrix(weights) && (nrow(weights) != length(R))) 
            stop("dimensions of 'R' and 'weights' do not match")
    }
    else weights <- NULL
    if (!is.null(weights)) 
        weights <- t(apply(matrix(weights, n, length(R), 
            byrow = TRUE), 2L, normalize, strata))
    if (!simple) 
        i <- index.array(n, R, sim, strata, m, L, weights)
    original <- if (stype == "f") 
        rep(1, n)
    else if (stype == "w") {
        ns <- tabulate(strata)[strata]
        1/ns
    }
    else seq_len(n)
    t0 <- if (sum(m) > 0L) 
        statistic(data, original, rep(1, sum(m)), ...)
    else statistic(data, original, ...)
    rm(original)
    t0
}
else statistic(data, ...)
pred.i <- NULL
fn <- if (sim == "parametric") {
    ran.gen
    data
    mle
    function(r) {
        dd <- ran.gen(data, mle)
        statistic(dd, ...)
    }
}
else {
    if (!simple && ncol(i) > n) {
        pred.i <- as.matrix(i[, (n + 1L):ncol(i)])
        i <- i[, seq_len(n)]
    }
    if (stype %in% c("f", "w")) {
        f <- freq.array(i)
        rm(i)
        if (stype == "w") 
            f <- f/ns
        if (sum(m) == 0L) 
            function(r) statistic(data, f[r, ], ...)
        else function(r) statistic(data, f[r, ], pred.i[r, 
            ], ...)
    }
    else if (sum(m) > 0L) 
        function(r) statistic(data, i[r, ], pred.i[r, ], 
            ...)
    else if (simple) 
        function(r) statistic(data, index.array(n, 1, sim, 
            strata, m, L, weights), ...)
    else function(r) statistic(data, i[r, ], ...)
}
RR <- sum(R)
res <- if (ncpus > 1L && (have_mc || have_snow)) {
    if (have_mc) {
        parallel::mclapply(seq_len(RR), fn, mc.cores = ncpus)
    }
    else if (have_snow) {
        list(...)
        if (is.null(cl)) {
            cl <- parallel::makePSOCKcluster(rep("localhost", 
              ncpus))
            if (RNGkind()[1L] == "L'Ecuyer-CMRG") 
              parallel::clusterSetRNGStream(cl)
            res <- pblapply(seq_len(RR), fn, cl=cl)
            parallel::stopCluster(cl)
            res
        }
        else pblapply(seq_len(RR), fn, cl=cl)
    }
}
else pblapply(seq_len(RR), fn) #### changed !!!
t.star <- matrix(, RR, length(t0))
for (r in seq_len(RR)) t.star[r, ] <- res[[r]]
if (is.null(weights)) 
    weights <- 1/tabulate(strata)[strata]
boot.return(sim, t0, t.star, temp.str, R, data, statistic, 
    stype, call, seed, L, m, pred.i, weights, ran.gen, mle)
}
## Functions not exported by boot
isMatrix <- boot:::isMatrix
index.array <- boot:::index.array
boot.return <- boot:::boot.return
## Now the example
m.boot <- function(data, indices) {
  d <- data[indices]
  mean(d, na.rm = T) 
}
tot_rep <- 200
v1 <- rnorm(1000)
b <- boot2(v1, m.boot, R = tot_rep)
Graycegrayheaded answered 7/6, 2016 at 17:20 Comment(1)
This is a nice solution, but i don't think that the maintainer of the package is interested in adding any progress bar natively, as this will, inevitably, slow down the performance of the boot function. Nevertheless, this may be an elegant solution using a copy of the boot function, as you did!Siret
S
1

I think i found a possible solution. This merges the answer of @Roland with the convenience of the pbapply package, using its functions startpb(), closepb(), etc..

library(boot)
library(pbapply)

v1 <- rnorm(1000)
rep_count = 1
tot_rep = 200

m.boot <- function(data, indices) {
  d <- data[indices]
  setpb(pb, rep_count)
  rep_count <<- rep_count + 1
  Sys.sleep(0.01)                #Just to slow down the process
  mean(d, na.rm = T) 
}

pb <- startpb(min = 0, max = tot_rep)
b <- boot(v1, m.boot, R = tot_rep)
closepb(pb)
rep_count = 1

As previously suggested, wrapping everything in a function avoids messing with the rep_count variable.

Siret answered 7/6, 2016 at 9:39 Comment(0)
G
1

The progress bar from the package dplyr works well:

library(dplyr)
library(boot)

v1 <- rnorm(1000)

m.boot <- function(data, indices) {
  d <- data[indices]
  p$tick()$print()  # update progress bar
  Sys.sleep(0.01)
  mean(d, na.rm = T) 
}

tot_rep <- 200
p <- progress_estimated(tot_rep+1)  # init progress bar
b <- boot(v1, m.boot, R = tot_rep)
Gussiegussman answered 22/3, 2018 at 23:13 Comment(0)
H
0

You can use the package pbapply

library(boot)
library(pbapply)
v1 <- rnorm(1000)
rep_count = 1

# your m.boot function ....
m.boot <- function(data, indices) {
                                   d <- data[indices]
                                   mean(d, na.rm = T) 
                                   }

# ... wraped in `bootfunc`
bootfunc <- function(x) { boot(x, m.boot, R = 200) }

# apply function to v1 , returning progress bar
pblapply(v1, bootfunc)

# > b <- pblapply(v1, bootfunc)
# >   |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% Elapsed time: 02s
Hypognathous answered 7/6, 2016 at 8:28 Comment(3)
I have a problem. This function runs the bootstrap function multiple times, obtaining a b object that is not a single bootstrap object, but a vector of 1000 bootstrap objects. I think that the pbapply does not work well with this function maybe.Siret
Indeed @fzara, I'm thinking about this and I'll come back with a work around this issue.Hypognathous
Thank you very much! In the meanwhile i found a workaround, hope it is useful for you too.Siret

© 2022 - 2024 — McMap. All rights reserved.