Another way, quite similar to the ones from @tpetzoldt and @tmfmnk, which also removes the NA:
# Example data; positions 4 and 5 hold two consecutive NA.
input <- c(1, 2, 1, NA, NA, 3, 2, NA, 1, 5, 6, NA, 2, 2)
# Logical mask of the NA separators.
. <- is.na(input)
# The running count of NA seen so far serves as group id; only the non-NA
# positions are kept, so groups that would be empty never show up.
keep <- !.
split(input[keep], cumsum(.)[keep])
#$`0`
#[1] 1 2 1
#
#$`2`
#[1] 3 2
#
#$`3`
#[1] 1 5 6
#
#$`4`
#[1] 2 2
Or the other way round
# TRUE where a value is present; its negation marks the NA separators.
i <- !is.na(input)
# Running number of NA seen so far, used as the group id.
na_count <- cumsum(!i)
split(input[i], na_count[i])
or even
# TRUE at the NA separators.
i <- is.na(input)
# Integer positions of the values that are kept.
j <- which(!i)
# Running number of NA seen so far, used as the group id.
groups <- cumsum(i)
split(input[j], groups[j])
If the empty groups produced by consecutive NA should be kept instead of dropped, just convert the grouping vector to a factor:
# As a factor the group ids keep all their levels after subsetting, so split()
# also returns the groups that contain no values.
grp <- factor(cumsum(.))
split(input[!.], grp[!.])
#$`0`
#[1] 1 2 1
#
#$`1`
#numeric(0)
#
#$`2`
#[1] 3 2
#
#$`3`
#[1] 1 5 6
#
#$`4`
#[1] 2 2
Or another way, not using split.
# TRUE at the NA separators, and their integer positions.
i <- is.na(input)
j <- which(i)
# Each chunk starts at position 1 or right after an NA ...
starts <- c(1, j + 1)
# ... and its length is the distance to the next NA (or to the end of input).
lens <- c(j - 1, length(i)) - c(0, j)
Map(
  function(from, length.out) {
    input[seq(from = from, by = 1, length.out = length.out)]
  },
  from = starts,
  length.out = lens
)
#[[1]]
#[1] 1 2 1
#
#[[2]]
#numeric(0)
#
#[[3]]
#[1] 3 2
#
#[[4]]
#[1] 1 5 6
#
#[[5]]
#[1] 2 2
Or using Rcpp:
# Compile a C++ helper with Rcpp::cppFunction and expose it to R as splitByNa().
# The C++ code starts with one empty group, opens a new empty group at every NA
# and appends each non-NA value to the most recent group. Consecutive NA (as
# well as a leading or trailing NA) therefore give empty numeric vectors in the
# returned list.
Rcpp::cppFunction("
Rcpp::List splitByNa(const Rcpp::NumericVector& x) {
std::vector< std::vector<double> > res;
res.push_back(std::vector<double>());
for(auto const& y : x) {
if(NumericVector::is_na(y)) res.push_back(std::vector<double>());
else res.back().push_back(y);
}
return wrap( res );
}")
# Split the example vector at its NA positions.
splitByNa(input)
#[[1]]
#[1] 1 2 1
#
#[[2]]
#numeric(0)
#
#[[3]]
#[1] 3 2
#
#[[4]]
#[1] 1 5 6
#
#[[5]]
#[1] 2 2
Benchmark
# Benchmark of all approaches on a random vector of length 1e5 drawn from
# 1:9 and NA.
set.seed(42)
n <- 1e5
input <- sample(c(1:9, NA), n, TRUE)
library(tidyverse) #for TarJae
# check = FALSE: the approaches do not return identical objects (e.g. ChrisR
# returns strings, GKi4 and GKi5 keep the empty groups).
bench::mark(check = FALSE,
  tmfmnk = split(na.omit(input), cumsum(is.na(input))[!is.na(input)]),
  tpetzoldt = {tmp <- cumsum(is.na(input))
    lapply(split(input, tmp), na.omit)},
  TarJae = {tibble(input) %>%
    group_by(id = cumsum(is.na(input))) %>%
    na.omit %>%
    group_split() %>%
    map(.,~(.x %>%select(-id))) %>%
    map(.,~(.x %>%pull))},
  ChrisR = strsplit(paste(input, collapse = " "), " NA "), #Returns String
  Thomas = split(na.omit(input), findInterval(seq_along(input)[!is.na(input)], which(is.na(input)))),
  GKi1 = {. <- is.na(input); split(input[!.], cumsum(.)[!.])},
  GKi2 = {i <- !is.na(input); split(input[i], cumsum(!i)[i])},
  # Uses its own mask `i`; it must not rely on the `.` left behind by GKi1.
  GKi3 = {i <- is.na(input); j <- which(!i); split(input[j], cumsum(i)[j])},
  GKi4 = {i <- is.na(input)
    j <- which(i)
    Map(\(...) input[seq(...)], from = c(1, 1+j), by = 1,
        length.out = c(j-1, length(i)) - c(0, j))},
  GKi5 = splitByNa(input)
)
express…¹ min median itr/s…² mem_al…³ gc/se…⁴ n_itr n_gc total…⁵ result
<bch:exp> <bch:t> <bch:t> <dbl> <bch:by> <dbl> <int> <dbl> <bch:t> <list>
1 tmfmnk 4.86ms 5.29ms 8.77e+1 7.95MB 24.9 67 19 764.3ms <NULL>
2 tpetzoldt 37.32ms 38.24ms 2.44e+1 4.4MB 5.63 13 3 532.5ms <NULL>
3 TarJae 10.88s 10.88s 9.19e-2 109.69MB 3.68 1 40 10.9s <NULL>
4 ChrisR 13.72ms 13.94ms 6.75e+1 1.8MB 1.99 34 1 503.6ms <NULL>
5 Thomas 5.46ms 5.74ms 1.55e+2 8.71MB 25.8 78 13 503.5ms <NULL>
6 GKi1 4.67ms 4.92ms 1.77e+2 6.63MB 27.8 89 14 502.7ms <NULL>
7 GKi2 4.68ms 4.92ms 1.79e+2 6.63MB 29.8 90 15 504.2ms <NULL>
8 GKi3 4.33ms 4.54ms 1.20e+2 5.52MB 15.9 60 8 501.9ms <NULL>
9 GKi4 56.37ms 61.99ms 1.64e+1 1.88MB 5.47 9 3 548.5ms <NULL>
10 GKi5 2.41ms 2.72ms 3.20e+2 1.26MB 9.99 160 5 500.3ms <NULL>
The Rcpp version is the fastest, has the lowest memory consumption and is able to handle consecutive NA.
na.omit
:lapply(split(input, tmp), \(x) na.omit(x) |> magrittr::set_attributes(NULL) )
. Or adjust the anonymous function to base R. – Debbradebby