Count Trailing and Leading NA for each vector

Asked 15/9, 2019 at 15:46 Answered 16/9, 2019 at 1:44

I have some vectors in the following format

    v1 <- c(NA,NA,NA,10,10,10,10)
    v2 <- c(NA,NA, 3, 3, 3,NA,NA)
    v3 <- c( 5, 5, NA,NA,NA,NA,NA)

For each vector I want to calculate how many leading NAs and trailing NAs.

    For v1, LeadNA = 3, TrailNA = 0
    For v2, LeadNA = 2, TrailNA = 2
    For v3, LeadNA = 0, TrailNA = 5

Dendrology answered 15/9, 2019 at 15:46 Comment(0)

1) Cumsum - An option would be to create a logical vector with cumsum on the presence of non-NA elements and get the sum (base R - No packages used)

f1 <- function(vec, trail = FALSE) {
  if(trail) {
     vec <- rev(vec)
    }
    sum(!cumsum(!is.na(vec)))
 }

f1(v1)
#[1] 3
f1(v1, TRUE)
#[1] 0

sapply(mget(paste0("v", 1:3)), f1)
#  v1 v2 v3 
# 3  2  0 
sapply(mget(paste0("v", 1:3)), f1, TRUE)  
#  v1 v2 v3 
#  0  2  5

2 rle - Another base R option is rle (No packages are used)

with(rle(is.na(v2)), lengths[values & seq_along(values) %in% c(1, length(values))])

Misfile answered 15/9, 2019 at 15:51 Comment(0)

Wrapper over which.max:

leading.nas <- function(x) {
  if (length(x) == 0) {
   0L 
  }
  else {
    which.min(!is.na(x)) - 1
  }
}

Bevis answered 15/9, 2019 at 16:17 Comment(1)

Could also use which.min – Envious 15/9, 2019 at 16:48

This turns out to be similar to @Bulat's solution

count_nas <- function(x) {
  nas <- is.na(x)

  if (sum(nas) == length(x)) {
    warning('all elements were NA')
    return(c(start_na = NA_integer_, end_na = NA_integer_))
  }

  c(start_na = which.min(nas) - 1,
    end_na = which.min(rev(nas)) - 1)
}

count_nas(v1)
#start_na   end_na 
#       3        0 

sapply(list(v1,v2,v3), count_nas)
#         [,1] [,2] [,3]
#start_na    3    2    0
#end_na      0    2    5

As far as performance, this is the fastest method with @akrun's methods being in the ballpark.

v_test3 <- sample(10000)
v_test3[c(1:3, 9998:10000)] <- NA_integer_

Unit: microseconds
             expr     min       lq      mean   median       uq     max neval
     akrun_cumsum   175.7   182.15   193.580   186.55   200.80   354.7   100
        akrun_rle   168.6   199.25   210.635   209.25   221.00   289.3   100
    g_grothen_zoo  1848.5  1904.45  2008.994  1941.40  2001.35  4799.6   100
 g_grothen_reduce 12467.3 12888.10 14174.157 13445.15 15054.35 28241.6   100
        www_rleid  5357.2  5439.40  5741.471  5517.15  5947.15  8470.4   100
   bulat_and_cole    63.5    66.45    73.681    71.25    75.75    96.9   100

Code for reproducibility:

library(microbenchmark)
library(zoo)
library(data.table)

v_test3 <- sample(10000)
v_test3[c(1:3, 9998:10000)] <- NA_integer_

count_nas <- function(x) {
  nas <- is.na(x)

  if (sum(nas) == length(x)) {
    warning('all elements were NA')
    return(c(start_na = NA_integer_, end_na = NA_integer_))
  }

  c(start_na = which.min(nas) - 1,
    end_na = which.min(rev(nas)) - 1)
}

countNA <- function(x) {
  len <- function(fromLast = FALSE) length(na.locf(x, fromLast = fromLast))
  if (all(is.na(x))) c(left = NA, right = NA)
  else length(x) - c(left = len(), right = len(TRUE))
}

f1 <- function(vec, trail = FALSE) {
  if(trail) {
    vec <- rev(vec)
  }
  sum(!cumsum(!is.na(vec)))
}

count_fun <- function(x){
  y <- rleid(x)
  z <- split(x, y)[c(1, length(unique(y)))]
  ans <- sapply(z, function(x) sum(is.na(x)))
  return(unname(ans))
}

countNA2 <- function(x) {
  f <- function(x) sum(Reduce(all, is.na(x), acc = TRUE))
  if (all(is.na(x))) c(left = NA, right = NA)
  else c(left = f(x), right = f(rev(x)))
}

microbenchmark(
  akrun_cumsum = {
    f1(v_test3, TRUE)
    f1(v_test3, FALSE)
  }
  , 
  akrun_rle = {
    with(rle(is.na(v_test3)), lengths[values & seq_along(values) %in% c(1, length(values))])
  }
  ,
  g_grothen_zoo = {
    countNA(v_test3)
  }
  ,
  g_grothen_reduce = {
    countNA2(v_test3)
  }
  ,
  www_rleid = {
    count_fun(v_test3)
  }
  ,
  bulat_and_cole = {
    count_nas(v_test3)
  }
)

Spohr answered 16/9, 2019 at 1:44 Comment(0)

1) na.locf Remove the leading NAs using na.locf and determine the difference in length between the original and reduced vector. Do the same for the trailing NAs. It is not clear what should be returned if the input vector is empty or all NAs so we return NA for both left and right.

library(zoo)

countNA <- function(x) {
  len <- function(fromLast = FALSE) length(na.locf(x, fromLast = fromLast))
  if (all(is.na(x))) c(left = NA, right = NA)
  else length(x) - c(left = len(), right = len(TRUE))
}

countNA(v1)
##  left right 
##     3     0 

countNA(v2)
##  left right 
##     2     2 

countNA(v3)
##  left right 
##     0     5

It would also be possible to use na.fill to perform this calculation.

2) Reduce A second approach is to use Reduce. It gives the same answer. No packages are used.

countNA2 <- function(x) {
  f <- function(x) sum(Reduce(all, is.na(x), acc = TRUE))
  if (all(is.na(x))) c(left = NA, right = NA)
  else c(left = f(x), right = f(rev(x)))
}

Envious answered 15/9, 2019 at 16:6 Comment(0)

A function returns two numbers. The first is the count of leading NA. The second is the count of trailing NA. It requires the rleid function from the data.table package.

library(data.table)

count_fun <- function(x){
  y <- rleid(x)
  z <- split(x, y)[c(1, length(unique(y)))]
  ans <- sapply(z, function(x) sum(is.na(x)))
  return(unname(ans))
}

count_fun(v1)
# [1] 3 0

count_fun(v2)
# [1] 2 2

count_fun(v3)
# [1] 0 5

Crawler answered 15/9, 2019 at 17:0 Comment(0)

Recommended topics

Hot tags