Check for 5 consecutive TRUE values from the back of a vector
Asked Answered
F

5

13

I have the following data:

x <- c(F, T, T, T, F, T, T, T, T, T)
names(x) <- letters[1:10]
y <- c(T, F, T, T, T, F, T, T, T, T)
names(y) <- letters[1:10]
z <- c(T, T, F, T, T, T, T, T, F, F)
names(z) <- letters[1:10]
a <- c(T, T, T, T, T, F, T, F, T, T, T, T, T)
names(a) <- letters[1:13]

I want to create a function which can subset the first 5 consecutive T values, but from the back. For example, if I pass x object through that function, I should get the following output:

#    f    g    h    i    j 
# TRUE TRUE TRUE TRUE TRUE

Or if I pass y through it, I should just get an NA. Because there are no first 5 T values from the back.

z has first 5 consecutive T values in the middle and, hence, those should be returned.

#    d    e    f    g    h 
# TRUE TRUE TRUE TRUE TRUE

In a, there are two sets of 5 consecutive values, in the beginning and in the end. Since, the first group from the back would be the one at the end and hence those values should be returned.

#    i    j    k    l    m 
# TRUE TRUE TRUE TRUE TRUE

How can I make this function?

Fostoria answered 9/8, 2023 at 14:24 Comment(2)
Starting point: rev(x)[rev(x)][1:5]Rhizopod
oops, this comment-answer doesn't handle the consecutive partCrosier
E
7

With a basic for loop:

foo <- function(x) {
  true_in_a_row <- 0L
  found         <- FALSE
  for (i in length(x):1L) {
    if (x[i]) true_in_a_row <- true_in_a_row + 1L else true_in_a_row <- 0L
    if (true_in_a_row == 5L) {
      found <- TRUE
      break
    }
  }
  if (found) x[i:(i+4L)] else NA
}

foo(x)
#    f    g    h    i    j 
# TRUE TRUE TRUE TRUE TRUE 
foo(y)
# [1] NA
foo(z)
#    d    e    f    g    h 
# TRUE TRUE TRUE TRUE TRUE 
foo(a)
#    i    j    k    l    m 
# TRUE TRUE TRUE TRUE TRUE 

Benchmark

set.seed(42)
x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE)
bench::mark(foo(x), last5(x), f_zoo(x), f_gregexpr(x), f_rle(x), f_embed(x))[1:4]
# # A tibble: 6 × 4
#   expression         min   median   `itr/sec`
#   <bch:expr>    <bch:tm> <bch:tm>       <dbl>
# 1 foo(x)           1.9µs    6.2µs 152792.    
# 2 last5(x)         107ms 149.53ms      5.35  
# 3 f_zoo(x)        14.39s   14.39s      0.0695
# 4 f_gregexpr(x) 259.58ms 283.42ms      3.53  
# 5 f_rle(x)         1.94s    1.94s      0.514 
# 6 f_embed(x)    187.22ms 201.41ms      5.04  

# With sparser TRUEs:
x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE, prob = c(0.05, 0.95))
bench::mark(foo(x), last5(x), f_zoo(x), f_gregexpr(x), f_rle(x), f_embed(x))[1:4]
# 1 foo(x)         33.12ms  33.36ms    29.0  
# 2 last5(x)       13.11ms   25.5ms    37.9  
# 3 f_zoo(x)         5.14s    5.14s     0.194
# 4 f_gregexpr(x)  75.98ms  76.72ms    12.6  
# 5 f_rle(x)      208.37ms 221.82ms     4.58 
# 6 f_embed(x)     69.01ms  80.64ms    11.9 
Erudition answered 9/8, 2023 at 14:42 Comment(0)
A
9

Here's a solution using rle to calculate runs of values

last5 <- function(x) {
  with(rle(x), {
    group <- tail(which(lengths>=5 & values), 1)
    if (length(group)<1) return(NA)
    start <- ifelse(group>1, sum(lengths[1:(group-1)]),0) + (lengths[group]-5)+1
    x[start:(start+4)]
  })  
}

which gives the following output

last5(x)
#    f    g    h    i    j 
# TRUE TRUE TRUE TRUE TRUE 
last5(y)
# [1] NA
last5(z)
#    d    e    f    g    h 
# TRUE TRUE TRUE TRUE TRUE 
last5(a)
#    i    j    k    l    m 
# TRUE TRUE TRUE TRUE TRUE 

The idea is that it finds all runs with more than 5 TRUE values, then takes the last group (if there is one) and takes the last 5 values from that group)

Apteral answered 9/8, 2023 at 14:43 Comment(1)
"The answer is ALWAYS rle() " -- me.Welltimed
E
7

With a basic for loop:

foo <- function(x) {
  true_in_a_row <- 0L
  found         <- FALSE
  for (i in length(x):1L) {
    if (x[i]) true_in_a_row <- true_in_a_row + 1L else true_in_a_row <- 0L
    if (true_in_a_row == 5L) {
      found <- TRUE
      break
    }
  }
  if (found) x[i:(i+4L)] else NA
}

foo(x)
#    f    g    h    i    j 
# TRUE TRUE TRUE TRUE TRUE 
foo(y)
# [1] NA
foo(z)
#    d    e    f    g    h 
# TRUE TRUE TRUE TRUE TRUE 
foo(a)
#    i    j    k    l    m 
# TRUE TRUE TRUE TRUE TRUE 

Benchmark

set.seed(42)
x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE)
bench::mark(foo(x), last5(x), f_zoo(x), f_gregexpr(x), f_rle(x), f_embed(x))[1:4]
# # A tibble: 6 × 4
#   expression         min   median   `itr/sec`
#   <bch:expr>    <bch:tm> <bch:tm>       <dbl>
# 1 foo(x)           1.9µs    6.2µs 152792.    
# 2 last5(x)         107ms 149.53ms      5.35  
# 3 f_zoo(x)        14.39s   14.39s      0.0695
# 4 f_gregexpr(x) 259.58ms 283.42ms      3.53  
# 5 f_rle(x)         1.94s    1.94s      0.514 
# 6 f_embed(x)    187.22ms 201.41ms      5.04  

# With sparser TRUEs:
x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE, prob = c(0.05, 0.95))
bench::mark(foo(x), last5(x), f_zoo(x), f_gregexpr(x), f_rle(x), f_embed(x))[1:4]
# 1 foo(x)         33.12ms  33.36ms    29.0  
# 2 last5(x)       13.11ms   25.5ms    37.9  
# 3 f_zoo(x)         5.14s    5.14s     0.194
# 4 f_gregexpr(x)  75.98ms  76.72ms    12.6  
# 5 f_rle(x)      208.37ms 221.82ms     4.58 
# 6 f_embed(x)     69.01ms  80.64ms    11.9 
Erudition answered 9/8, 2023 at 14:42 Comment(0)
G
4

This is an opportunity for showcasing stats::filter, one of my favorite R functions. The following solution is unlikely to be significantly more efficient than the simple for loop (that could be easily achieved by implementing the for loop with Rcpp). However, the approach allows extending the problem to finding all sequences in an efficient way.

f_filter <- function(x) {
  x <- rev(x)
  y <- stats::filter(x, rep(1, 5), sides = 1)
  i <- which(y == 5)[1]
  if (is.finite(i)) x[i:(i-4)] else NA
}

f_filter(x)
#   f    g    h    i    j 
#TRUE TRUE TRUE TRUE TRUE 
f_filter(y)
#[1] NA
f_filter(z)
#   d    e    f    g    h 
#TRUE TRUE TRUE TRUE TRUE 
f_filter(a)
#   i    j    k    l    m 
#TRUE TRUE TRUE TRUE TRUE 

set.seed(42)
x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE)
bench::mark(foo(x), f_filter(x))[1:4]
#  expression       min   median `itr/sec`
#  <bch:expr>  <bch:tm> <bch:tm>     <dbl>
#1 foo(x)         1.9µs    2.1µs  444340. 
#2 f_filter(x)   19.1ms   19.8ms      49.9

x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE, prob = c(0.05, 0.95))
bench::mark(foo(x), f_filter(x))[1:4]
#  expression       min   median `itr/sec`
#  <bch:expr>  <bch:tm> <bch:tm>     <dbl>
#1 foo(x)        42.6ms   43.1ms      23.1
#2 f_filter(x)   18.1ms   18.9ms      52.9
Grandioso answered 10/8, 2023 at 12:22 Comment(1)
Really interesting application of stats::filter, +1!Reina
R
3

Actually you have many options to implement it, and below are just three of them:

  1. gregexpr
f <- function(v) {
    idx <- tail(gregexpr("1{5}", paste0(+v, collapse = ""))[[1]], 1)
    if (idx <= 0) NA else v[idx + (0:4)]
}
  1. rle
f <- function(v) {
    r <- tail(Filter(
        \(x) sum(x) == 5,
        split(v, with(rle(v), rep(seq_along(lengths), lengths)))
    ), 1)
    if (length(r)) r[[1]] else NA
}
  1. embed
f <- function(v) {
    idx <- which(rowSums(embed(v, 5)) == 5)
    if (length(idx)) v[max(idx) + (0:4)] else NA
}

Output

> f(x)
   f    g    h    i    j
TRUE TRUE TRUE TRUE TRUE

> f(y)
[1] NA

> f(z)
   d    e    f    g    h
TRUE TRUE TRUE TRUE TRUE

> f(a)
   i    j    k    l    m
TRUE TRUE TRUE TRUE TRUE
Reina answered 9/8, 2023 at 14:41 Comment(3)
Somehow, diving into regex to do a simple boolean inspection seems like the long way around.Welltimed
@CarlWitthoft you are right, that's true that regexp plays a trick but far less efficient.Reina
@CarlWitthoft I added a version with rleReina
B
3

For each subsequence of 5 check if it is all TRUE returning a logical vector and apply which to get the positions, wx. If it is non-empty return the 5 elements that end in the largest entry in wx and other return NA.

library(zoo)

f <- function(zz) {
  wx <- which(rollapplyr(zz, 5, all, fill = FALSE))
  if (length(wx) > 0) zz[seq(to = max(wx), length = 5)] else NA
}

# tests

f(x)
##    f    g    h    i    j 
## TRUE TRUE TRUE TRUE TRUE 

f(y)
## [1] NA

f(z)
##    d    e    f    g    h 
## TRUE TRUE TRUE TRUE TRUE 

f(a)
##    i    j    k    l    m 
## TRUE TRUE TRUE TRUE TRUE 
Brierwood answered 9/8, 2023 at 14:45 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.