Replace NA with previous value with limit on number of consecutive NA
Asked Answered
E

5

9

I would like to replace up to n consecutive NA values in a vector with the latest non-NA value.

For example, if:

a <- c(1,NA,NA,NA,NA,NA,2,NA,1,NA,NA,NA)
n <- 2

I would like to obtain:

c(1,1,1,NA,NA,NA,2,2,1,1,1,NA)

(n is the maximum number of consecutive NA values that can be replaced by a given element.)

I know na.locf() function, but I don't know how to set the limit n. Is it possible to do it?

Eyla answered 4/4, 2017 at 12:58 Comment(1)
Some additional alternatives here: Fill NA in a time series only to a limited numberHaro
L
10

Here's an option using na.locf and rle

library(zoo)
# Run-length encode the NA pattern before filling, so the position of every
# element inside its NA run is still known afterwards
r <- rle(is.na(a))
# na.rm = FALSE keeps leading NAs in place; without it they are dropped and
# "a" no longer has the same length as the mask built from "r"
a <- na.locf(a, na.rm = FALSE)
# Turn filled values back into NA where they sit more than n steps into a run
# of NAs
is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
a
# [1]  1  1  1 NA NA NA  2  2  1  1  1 NA

So here I first computed the run lengths of elements in a (including the NA entries), then replaced all NA's using na.locf and finally turned those elements back to NA's where the run lengths were greater than n and the elements were NA.

Lattimore answered 4/4, 2017 at 13:8 Comment(1)
This answers this question, but note that it doesn't work if the first value in a is NA. In that case, you need to have a <- na.locf(a, na.rm = FALSE) instead of a <- na.locf(a).Anthologize
U
3

As another idea, we can find the last indices of "a" without NAs:

# Position of every element of "a"
is = seq_along(a)
# NA positions contribute 0 and non-NA positions contribute their own index,
# so the running maximum is the index of the most recent non-NA value
# (it stays 0 for as long as only NAs have been seen)
i = cummax((!is.na(a)) * is)
i
# [1] 1 1 1 1 1 1 7 7 9 9 9 9

Replace the last non-NA index with the current index if last non-NA is more than "n" steps away:

# An element keeps its own index when the last non-NA value is more than n
# steps back, or when there is no earlier non-NA value at all (i == 0, i.e.
# leading NAs); a 0 left in "i" would be dropped by a[i] and shorten the result
wh = i == 0 | (is - i) > n
i[wh] = is[wh]
i
# [1]  1  1  1  4  5  6  7  7  9  9  9 12

And subset "a":

a[i]
# [1]  1  1  1 NA NA NA  2  2  1  1  1 NA
Unsettle answered 4/4, 2017 at 15:17 Comment(1)
@989 : I might be missing something, but shouldn't the result be c(1, 1, 1, 1, 1, NA, 2, 2, 1, 1, 1, NA) in that case? I get this result after a[i] from above.Unsettle
B
2

You could do this using split and replace in base R

# Carry each non-NA value forward over at most n consecutive NAs.
#   a: vector that may contain NAs
#   n: maximum number of consecutive NAs filled after each non-NA value
# Returns a vector of the same length as a; leading NAs and NAs beyond the
# first n of a run are left as NA.
f <- function(a, n) {
  # nothing to fill; also avoids unlist(list()) returning NULL
  if (length(a) == 0L) {
    return(a)
  }
  # every non-NA value opens a new group made of itself plus the NAs that
  # follow it (leading NAs form group 0, whose first element is itself NA)
  l <- split(a, cumsum(!is.na(a)))
  fill_head <- function(r) {
    # overwrite the head value and up to n following NAs, never past the end
    r[seq_len(min(length(r), n + 1))] <- r[1]
    r
  }
  unlist(lapply(l, fill_head), use.names = FALSE)
}

f(a, n = 2)
#[1]  1  1  1 NA NA NA  2  2  1  1  1 NA

f(a, n = 3)
#[1]  1  1  1  1 NA NA  2  2  1  1  1  1

Benchmarking (randomly generated vector of size 7467)

library(microbenchmark)
library(dplyr)
library(zoo)
# Fixed seed so the generated test vector is reproducible
set.seed(123)
# 1000 chunks, each made of two values sampled from 1:10 followed by a run of
# 1 to 10 NAs, flattened into a single vector
a <- unlist(replicate(1000, c(sample(10, 2), rep(NA, sample.int(10, 1)))))
length(a)
# [1] 7467
# Fill limit passed to every benchmarked function
n <- 3
# split/lapply approach: carry each non-NA value forward over at most n
# consecutive NAs.
#   a: vector that may contain NAs
#   n: maximum number of consecutive NAs filled after each non-NA value
# Returns a vector of the same length as a.
f_989 <- function(a, n) {
  # nothing to fill; also avoids unlist(list()) returning NULL
  if (length(a) == 0L) {
    return(a)
  }
  # every non-NA value opens a new group made of itself plus the NAs that
  # follow it (leading NAs form group 0, whose first element is itself NA)
  l <- split(a, cumsum(!is.na(a)))
  fill_head <- function(r) {
    # overwrite the head value and up to n following NAs, never past the end
    r[seq_len(min(length(r), n + 1))] <- r[1]
    r
  }
  unlist(lapply(l, fill_head), use.names = FALSE)
}
# dplyr/zoo approach: group each non-NA value with the NAs that follow it and
# fill only the first n + 1 rows of every group.
#   a: vector that may contain NAs
#   n: maximum number of consecutive NAs filled after each non-NA value
# Returns the filled vector (column "res" of the intermediate data frame).
f_zx8754 <- function(a, n) {
  data.frame(a) %>%
    mutate(gr = cumsum(!is.na(a))) %>%
    group_by(gr) %>%
    # na.rm = FALSE keeps the filled vector as long as the group; the default
    # drops leading NAs, which empties an all-NA first group and makes the
    # if_else() branches differ in length
    mutate(res = if_else(row_number() <= n + 1,
                         na.locf(a, na.rm = FALSE), a)) %>%
    .$res
}
# rle/na.locf approach: fill every NA, then restore the NAs that lie beyond
# the limit.
#   a: vector that may contain NAs
#   n: maximum number of consecutive NAs filled after each non-NA value
# Returns a vector of the same length as a.
f_docendo_discimus <- function(a, n){
    # record the NA runs before they are filled
    r <- rle(is.na(a))
    # na.rm = FALSE keeps leading NAs, so "a" keeps its length and stays
    # aligned with the mask built from "r"
    a <- na.locf(a, na.rm = FALSE)
    # re-blank positions that are more than n steps into a run of NAs
    is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
    a
}
# ave approach: every non-NA value opens a group, and the first n + 1 positions
# of each group are overwritten with the group's first value.
#   a: vector that may contain NAs
#   n: maximum number of consecutive NAs filled after each non-NA value
f_akrun <- function(a, n) {
  run_id <- cumsum(!is.na(a))
  fill_head <- function(x) {
    # clamp the target positions to the group length so short groups are not
    # extended
    head_idx <- pmin(length(x), seq(n + 1))
    x[head_idx] <- x[1]
    x
  }
  ave(a, run_id, FUN = fill_head)
}

# Index approach: for every position find the index of the last non-NA value
# and read from it, unless that value is more than n steps back.
#   a: vector that may contain NAs
#   n: maximum number of consecutive NAs filled after each non-NA value
# Returns a vector of the same length as a.
f_alexis_laz <- function(a, n) {
  idx <- seq_along(a)
  # NA positions contribute 0, so the running maximum is the index of the most
  # recent non-NA value (0 while only NAs have been seen)
  last_seen <- cummax((!is.na(a)) * idx)
  # fall back to the element's own index when the last non-NA value is too far
  # away or does not exist; a 0 subscript would be dropped and shorten the
  # result
  keep_own <- last_seen == 0 | (idx - last_seen) > n
  last_seen[keep_own] <- idx[keep_own]
  a[last_seen]
}
# Result of the first implementation, used as the reference for the checks
r <- f_989(a,n)
# Compare every other implementation against the reference result
identical(r, f_zx8754(a,n))
# [1] TRUE
identical(r, f_docendo_discimus(a,n))
# [1] TRUE
identical(r, f_akrun(a,n))
# [1] TRUE
identical(r, f_alexis_laz(a,n))
# [1] TRUE
# Time the five implementations on the same input
res <- microbenchmark("f1"=f_989(a,n), "f2"=f_zx8754(a,n), 
"f3"=f_docendo_discimus(a,n), "f4"=f_akrun(a,n), "f5"=f_alexis_laz(a,n))

# Print the timing table ordered by mean run time
print(res, order="mean")

# Unit: microseconds
 # expr        min         lq       mean      median          uq        max neval
   # f5    129.804    137.014    161.106    141.6715    151.7375   1031.511   100
   # f3   1249.351   1354.215   1471.478   1392.9750   1482.2140   2553.086   100
   # f1   4736.895   5093.852   5630.367   5345.3450   6069.9260   8848.513   100
   # f4  22165.601  23936.866  24660.990  24485.6725  24883.6440  29453.177   100
   # f2 205854.339 215582.174 221524.448 218643.9540 224211.0435 261512.922   100
Balneology answered 4/4, 2017 at 13:54 Comment(0)
E
1

We can use a base R approach by creating a grouping variable with cumsum and diff, then using the grouping variable in ave we replace the NA values based on the condition given by 'n'

# NOTE(review): a new group starts only where an NA is followed by a non-NA
# value, so consecutive non-NA values share one group. Within a group, NAs in
# the first n + 1 positions are replaced by the group's first value, so NAs
# that follow several non-NA values are counted from the start of the group
# rather than from the last non-NA value.
ave(a, cumsum(c(TRUE, diff(is.na(a)) < 0)), 
      FUN = function(x) replace(x, is.na(x) & seq_along(x) <= n + 1, x[1]))
#[1]  1  1  1 NA NA NA  2  2  1  1  1 NA

Or more compact option

# Every non-NA value opens a new group; the first n + 1 positions of each group
# (clamped to the group length by pmin) are overwritten with its first value
ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
#[1]  1  1  1 NA NA NA  2  2  1  1  1 NA
Estradiol answered 4/4, 2017 at 13:13 Comment(2)
I don't seem to get the correct result when I use a = c(1,1,1,NA,NA,NA,2,NA,1,NA,NA,NA) with your first approach (I get [1] 1 1 1 NA NA NA 2 2 1 1 1 NA)Lattimore
@docendodiscimus I think it was based on the grouping variable. My initial thought is that the OP's example will be having a single non-NA element before each block of NAsEstradiol
B
0

Using dplyr::group_by and zoo::na.locf:

library(dplyr)
library(zoo)

data.frame(a) %>% 
  # every non-NA value opens a new group that also holds the NAs following it
  mutate(gr = cumsum(!is.na(a))) %>% 
  group_by(gr) %>% 
  # fill only the first n + 1 rows of each group; na.rm = FALSE keeps the
  # filled vector as long as the group even when the group is all NA
  # (leading NAs), which the default would drop
  mutate(res = if_else(row_number() <= n + 1,
                       na.locf(a, na.rm = FALSE), a)) %>% 
  .$res

# [1]  1  1  1 NA NA NA  2  2  1  1  1 NA
Betook answered 4/4, 2017 at 13:12 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.