An option in base, derive from the answers of @Montgomery-Clift and @AdamO, replacing NA
's with latest non-NA
value could be:
y <- c(NA, 2, 2, NA, NA, 3, NA, 4, NA, NA)
i <- c(TRUE, !is.na(y[-1]))
y[i][cumsum(i)]
# [1] NA 2 2 2 2 3 3 4 4 4
When only a few NA
exist they could be overwritten with the values of the latest non-NA value instead of creating a new vector.
fillNaR <- function(y) {
i <- which(is.na(y[-1]))
j <- which(diff(c(-1L,i)) > 1)
k <- diff(c(j, length(i) + 1))
i <- rep(i[j], k)
`[<-`(y, i + sequence(k), y[i])
}
fillNaR(y)
# [1] NA 2 2 2 2 3 3 4 4 4
When speed is important a loop propagating the last non-NA value in a loop could be written using RCPP. To be flexible on the input type this can be done using a template.
Rcpp::sourceCpp(code=r"(
#include <Rcpp.h>
using namespace Rcpp;
template <int RTYPE>
Vector<RTYPE> FNA(const Vector<RTYPE> y) {
auto x = clone(y); //or overwrite original
LogicalVector isNA = is_na(x);
size_t i = 0;
while(isNA[i] && i < x.size()) ++i;
for(++i; i < x.size(); ++i) if(isNA[i]) x[i] = x[i-1];
return x;
}
// [[Rcpp::export]]
RObject fillNaC(RObject x) {
RCPP_RETURN_VECTOR(FNA, x);
}
)")
fillNaC(y)
# [1] NA 2 2 2 2 3 3 4 4 4
Those functions can be used inside lapply
to apply them on all columns of a data.frame
.
DF[] <- lapply(DF, fillNaC)
Other answers using Rcpp, specialized on a data type, look like the following but are updating also the input vector.
y <- c(NA, 2, 2, NA, NA, 3, NA, 4, NA, NA)
Rcpp::cppFunction("NumericVector fillNaCN(NumericVector x) {
for(auto i = x.begin()+1; i < x.end(); ++i) if(*i != *i) *i = *(i-1);
return x;
}")
fillNaCN(y)
# [1] NA 2 2 2 2 3 3 4 4 4
y
# [1] NA 2 2 2 2 3 3 4 4 4
Benchmark
fillNaR <- function(y) {
i <- which(is.na(y[-1]))
j <- which(diff(c(-1L,i)) > 1)
k <- diff(c(j, length(i) + 1))
i <- rep(i[j], k)
`[<-`(y, i + sequence(k), y[i])
}
Rcpp::sourceCpp(code=r"(
#include <Rcpp.h>
using namespace Rcpp;
template <int RTYPE>
Vector<RTYPE> FNA(const Vector<RTYPE> y) {
auto x = clone(y); //or overwrite original
LogicalVector isNA = is_na(x);
size_t i = 0;
while(isNA[i] && i < x.size()) ++i;
for(++i; i < x.size(); ++i) if(isNA[i]) x[i] = x[i-1];
return x;
}
// [[Rcpp::export]]
RObject fillNaC(RObject x) {
RCPP_RETURN_VECTOR(FNA, x);
}
)")
repeat.before <- function(x) { # @Ruben
ind = which(!is.na(x))
if(is.na(x[1])) ind = c(1,ind)
rep(x[ind], times = diff(c(ind, length(x) + 1) ))
}
RB2 <- function(x) {
ind = which(c(TRUE, !is.na(x[-1])))
rep(x[ind], diff(c(ind, length(x) + 1)))
}
MC <- function(y) { # @Montgomery Clift
z <- !is.na(y)
z <- z | !cumsum(z)
y[z][cumsum(z)]
}
MC2 <- function(y) {
z <- c(TRUE, !is.na(y[-1]))
y[z][cumsum(z)]
}
fill.NAs <- function(x) { # @Valentas
is_na <- is.na(x)
x[Reduce(function(i,j) if (is_na[j]) i else j, seq_len(length(x)), accumulate=T)]}
M <- alist(
fillNaR = fillNaR(y),
fillNaC = fillNaC(y),
repeat.before = repeat.before(y),
RB2 = RB2(y),
MC = MC(y),
MC2 = MC2(y),
fill.NAs = fill.NAs(y),
tidyr = tidyr::fill(data.frame(y), y)$y,
zoo = zoo::na.locf(y, na.rm=FALSE),
data.table = data.table::nafill(y, type = "locf"),
data.table2 = with(data.table::data.table(y)[, y := y[1], .(cumsum(!is.na(y)))], y),
imputeTS = imputeTS::na_locf(y, na_remaining = "keep"),
runner = runner::fill_run(y, FALSE),
vctrs = vctrs::vec_fill_missing(y, direction = "down"),
ave = ave(y, cumsum(!is.na(y)), FUN = \(x) x[1])
)
Result
n <- 1e5
set.seed(42); y <- rnorm(n); is.na(y) <- sample(seq_along(y), n/100)
bench::mark(exprs = M) #1% NA
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
# 1 fillNaR 399.82µs 1.02ms 459. 3.56MB 31.9 230 16
# 2 fillNaC 672.85µs 883.74µs 976. 1.15MB 22.0 488 11
# 3 repeat.before 1.28ms 2.8ms 290. 7.57MB 58.0 145 29
# 4 RB2 1.93ms 3.66ms 229. 9.86MB 57.7 115 29
# 5 MC 1.01ms 1.98ms 289. 5.33MB 37.9 145 19
# 6 MC2 884.6µs 1.96ms 393. 6.09MB 53.5 198 27
# 7 fill.NAs 89.37ms 93.1ms 10.1 4.58MB 13.5 6 8
# 8 tidyr 8.42ms 11.3ms 86.3 1.55MB 5.89 44 3
# 9 zoo 1.83ms 3.19ms 216. 7.96MB 31.9 108 16
#10 data.table 73.91µs 259.71µs 2420. 797.38KB 36.0 1210 18
#11 data.table2 54.54ms 58.71ms 16.9 3.47MB 3.75 9 2
#12 imputeTS 623.69µs 1.07ms 494. 2.69MB 30.0 247 15
#13 runner 1.36ms 1.58ms 586. 783.79KB 10.0 293 5
#14 vctrs 149.98µs 317.14µs 1725. 1.53MB 54.0 863 27
#15 ave 137.87ms 149.25ms 6.53 14.77MB 8.17 4 5
set.seed(42); y <- rnorm(n); is.na(y) <- sample(seq_along(y), n/2)
bench::mark(exprs = M) #50% NA
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
# 1 fillNaR 2.15ms 3.13ms 217. 7.92MB 59.7 109 30
# 2 fillNaC 949.22µs 1.09ms 728. 1.15MB 28.0 364 14
# 3 repeat.before 1.36ms 1.89ms 287. 4.77MB 49.6 185 32
# 4 RB2 1.64ms 2.44ms 347. 7.06MB 39.9 174 20
# 5 MC 1.48ms 1.92ms 443. 4.77MB 34.0 222 17
# 6 MC2 1.09ms 1.72ms 479. 5.53MB 45.9 240 23
# 7 fill.NAs 93.17ms 104.28ms 9.58 4.58MB 9.58 5 5
# 8 tidyr 7.09ms 10.07ms 96.7 1.55MB 3.95 49 2
# 9 zoo 1.62ms 2.28ms 344. 5.53MB 29.8 173 15
#10 data.table 389.69µs 484.81µs 1225. 797.38KB 14.0 613 7
#11 data.table2 27.46ms 29.32ms 33.4 3.1MB 3.93 17 2
#12 imputeTS 1.71ms 2.1ms 413. 3.44MB 25.9 207 13
#13 runner 1.62ms 1.75ms 535. 783.79KB 7.98 268 4
#14 vctrs 144.92µs 293.44µs 2045. 1.53MB 48.0 1023 24
#15 ave 66.38ms 71.61ms 14.0 10.78MB 10.5 8 6
Depending on how many NA's are filled up either data.table::nafill
or vctrs::vec_fill_missing
are the fastest.
roll=TRUE
indata.table
. – Chair