Calculate timestamps for data with known frequency and missing data
I have data as follows, where lines of type "S" contain a timestamp, and I need to assign timestamps to the "D" lines.

   type  timestamp               count
   <chr> <dttm>                  <int>
 1 $     NA                         NA
 2 D     NA                        229
 3 M     NA                         NA
 4 D     NA                        230
 5 D     NA                        231
 6 D     NA                        232
 7 D     NA                        233
 8 D     NA                        234
 9 D     NA                        235
10 D     NA                        236
11 D     NA                        237
12 D     NA                        238
13 D     NA                        239
14 S     2024-01-24 16:11:11.000    NA
15 D     NA                        241
16 D     NA                        242
17 D     NA                        243
18 D     NA                        126
19 D     NA                        127
20 S     2024-01-24 16:13:29.000    NA
21 D     NA                        128

"Count" is a 1 byte iterator that goes from 0-255 and repeats. Missing counts indicate missing data lines. Data lines are sent at 16Hz, so each count iteration represents 1/16 sec. I'm trying to assign the correct timestamps using the counts of the D lines to get the nearest S line timestamp and calculate the timestamp by the difference in count between the current D line and the D line immediately following an S line. Typically the S lines are every second, but I picked this subset to show some of the issues with the data, mainly the gap of 2:18 at line 17.

I've figured out a way that works, but it is incredibly slow (4 ms/row; I need to process ~1M lines of data per day for files that span many days). The real data are in files with lines in several formats (ick), and the times and counts in this example are parsed out of that. This is beginning to sound like an Advent of Code problem, but sadly, this system is real.

If you'd like to check out my slow solution or see more complete data, it's in this file in the repo: https://github.com/blongworth/mlabtools/blob/main/R/time_alignment.R The data above are simplified, so the method in the repo won't work on the reprex data without modification. There are tests, but not yet a set for what the result of this reprex should look like.

Any ideas for how to do this efficiently? I'll likely have to go to data.table eventually, but as long as I have a start on more efficient logic, I think I can get there.

Here's dput output for the test df above (assign it to df to run the code below):

structure(list(type = c("$", "D", "M", "D", "D", "D", "D", "D", 
"D", "D", "D", "D", "D", "S", "D", "D", "D", "D", "D", "S", "D"
), timestamp = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 1706130671, NA, NA, NA, NA, NA, 1706130809, NA
), tzone = "America/New_York", class = c("POSIXct", "POSIXt")), 
    count = c(NA, 229L, NA, 230L, 231L, 232L, 233L, 234L, 235L, 
    236L, 237L, 238L, 239L, NA, 241L, 242L, 243L, 126L, 127L, 
    NA, 128L)), row.names = c(NA, -21L), class = c("tbl_df", 
"tbl", "data.frame"))

Here's the example data with the expected output:

   type  timestamp               count
   <chr> <dttm>                  <int>
 1 $     NA                         NA
 2 D     2024-01-24 16:11:10.250   229
 3 M     NA                         NA
 4 D     2024-01-24 16:11:10.312   230
 5 D     2024-01-24 16:11:10.375   231
 6 D     2024-01-24 16:11:10.437   232
 7 D     2024-01-24 16:11:10.500   233
 8 D     2024-01-24 16:11:10.562   234
 9 D     2024-01-24 16:11:10.625   235
10 D     2024-01-24 16:11:10.687   236
11 D     2024-01-24 16:11:10.750   237
12 D     2024-01-24 16:11:10.812   238
13 D     2024-01-24 16:11:10.875   239
14 S     2024-01-24 16:11:11.000    NA
15 D     2024-01-24 16:11:11.000   241
16 D     2024-01-24 16:11:11.062   242
17 D     2024-01-24 16:11:11.125   243
18 D     2024-01-24 16:13:28.875   126
19 D     2024-01-24 16:13:28.937   127
20 S     2024-01-24 16:13:29.000    NA
21 D     2024-01-24 16:13:29.000   128
Suber answered 26/1 at 12:49

Comments:
You appear to have lines of type $, D, M, S and possibly others. If the tick interval is exactly 1/16 s then you can predict what count to expect at any given timestamp, or vice versa. You run into trouble if there is more than one solution between a pair of timestamps. According to your description, the byte counter will have overflowed to zero something like 10 times between line 14 and line 20. There is no possibility of giving the intervening values unambiguous timestamps. You need more frequent timestamps or a longer counter. – Monograph
I need to make assumptions to cover most of the missing data cases. In this case, I am assuming "D" lines in rows 2-17 correspond to the "S" line in row 14, and "D" lines in rows 18-21 correspond to the "S" line in row 20. The real solution is to fix the data source, but I can't do that for old data I've been given. – Suber
Please post the expected result for the example data frame. – Christogram
Added; that should make what I'm after clearer. Thanks. – Suber
Your slow solution doesn't actually run on the example dataframe (Column row_num doesn't exist.) – Donniedonnish
@mark, the data here are a reprex dataframe that provides only the fields needed to figure out an algorithm. I'll make that clear in the question. – Suber
@Suber can you create a realistic reproducible example? – Donniedonnish
I ask because, otherwise, your example code is a bit useless. – Donniedonnish

Here's a shot that goes through some timestamp-gymnastics.

library(dplyr)
# library(tidyr) # fill
df |>
  mutate(count2 = count, nexttime = timestamp, prevtime = timestamp) |>
  tidyr::fill(count2, .direction = "updown") |>
  mutate(
    count2 = count2 + 256*cumsum(c(FALSE, diff(count2) < 0)),
    nextind = if_else(is.na(timestamp), count2[NA], count2),
    prevind = nextind
  ) |>
  tidyr::fill(prevtime, prevind, .direction = "down") |>
  tidyr::fill(nexttime, nextind, .direction = "up") |>
  mutate(
    newtimestamp = case_when(
      !is.na(timestamp) ~ timestamp,
      is.na(prevtime) | abs(count2 - nextind) < abs(count2 - prevind) ~
        nexttime + (count2 - nextind)/16,
      TRUE ~
        prevtime + (count2 - prevind)/16
    )
  ) |>
  select(names(df), newtimestamp)
# # A tibble: 21 × 4
#    type  timestamp               count newtimestamp           
#    <chr> <dttm>                  <int> <dttm>                 
#  1 $     NA                         NA 2024-01-24 16:11:10.250
#  2 D     NA                        229 2024-01-24 16:11:10.250
#  3 M     NA                         NA 2024-01-24 16:11:10.312
#  4 D     NA                        230 2024-01-24 16:11:10.312
#  5 D     NA                        231 2024-01-24 16:11:10.375
#  6 D     NA                        232 2024-01-24 16:11:10.437
#  7 D     NA                        233 2024-01-24 16:11:10.500
#  8 D     NA                        234 2024-01-24 16:11:10.562
#  9 D     NA                        235 2024-01-24 16:11:10.625
# 10 D     NA                        236 2024-01-24 16:11:10.687
# 11 D     NA                        237 2024-01-24 16:11:10.750
# 12 D     NA                        238 2024-01-24 16:11:10.812
# 13 D     NA                        239 2024-01-24 16:11:10.875
# 14 S     2024-01-24 16:11:11.000    NA 2024-01-24 16:11:11.000
# 15 D     NA                        241 2024-01-24 16:11:11.000
# 16 D     NA                        242 2024-01-24 16:11:11.062
# 17 D     NA                        243 2024-01-24 16:11:11.125
# 18 D     NA                        126 2024-01-24 16:13:28.875
# 19 D     NA                        127 2024-01-24 16:13:28.937
# 20 S     2024-01-24 16:13:29.000    NA 2024-01-24 16:13:29.000
# 21 D     NA                        128 2024-01-24 16:13:29.000

Notes:

  • count2 is count with its NAs filled and the 0-255 rollovers unwrapped into a monotonically increasing index (see the small illustration after these notes).
  • nexttime/prevtime carry each timestamp forward and backward until the next non-NA timestamp; the case_when chooses which one to use.
  • nextind/prevind hold the unwrapped count at those anchor rows; subtracting one of them from count2 gives the offset in 1/16ths of a second.
  • The case_when is where most of the logic lives: keep the original timestamp if present; otherwise add (count2 - nextind)/16 seconds to nexttime, or (count2 - prevind)/16 seconds to prevtime, whichever anchor is closer in count (falling back to nexttime when there is no previous timestamp).
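As a standalone illustration of the wraparound handling, the cumsum line turns the rolling 0-255 counter into a monotone index:

count2 <- c(254L, 255L, 0L, 1L, 2L)
count2 + 256 * cumsum(c(FALSE, diff(count2) < 0))
# [1] 254 255 256 257 258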

A data.table solution looks fairly similar. Using R 4.3 or newer, we can use |> _[] piping:

library(data.table)
out <- as.data.table(df) |>
  _[, count2 := nafill(nafill(count, type = "nocb"), type = "locf") ] |>
  _[, count2 := count2 + 256*cumsum(c(FALSE, diff(count2) < 0)) ] |>
  _[, nextind := fifelse(is.na(timestamp), count2[NA], count2) ] |>
  _[, prevind := nextind ] |>
  _[, c("prevtime", "prevind") := lapply(.SD, nafill, type = "locf"), .SDcols = c("timestamp", "prevind")] |>
  _[, c("nexttime", "nextind") := lapply(.SD, nafill, type = "nocb"), .SDcols = c("timestamp", "nextind")] |>
  _[, newtimestamp := fcase(
    !is.na(timestamp), timestamp,
    is.na(prevtime) | abs(count2 - nextind) < abs(count2 - prevind), nexttime + (count2 - nextind)/16,
    rep(TRUE, .N), prevtime + (count2 - prevind)/16) ] |>
  _[, .SD, .SDcols = c(names(df), "newtimestamp")]

If on R before 4.3, we can use data.table's ][-piping.

DT <- as.data.table(df) # setDT(df) is canonical, avoiding that here for side-effect
DT[, count2 := nafill(nafill(count, type = "nocb"), type = "locf")
   ][, count2 := count2 + 256*cumsum(c(FALSE, diff(count2) < 0))
   ][, nextind := fifelse(is.na(timestamp), count2[NA], count2)
   ][, prevind := nextind
   ][, c("prevtime", "prevind") := lapply(.SD, nafill, type = "locf"), .SDcols = c("timestamp", "prevind")
   ][, c("nexttime", "nextind") := lapply(.SD, nafill, type = "nocb"), .SDcols = c("timestamp", "nextind")
   ][, newtimestamp := fcase(
     !is.na(timestamp), timestamp,
     is.na(prevtime) | abs(count2 - nextind) < abs(count2 - prevind), nexttime + (count2 - nextind)/16,
     rep(TRUE, .N), prevtime + (count2 - prevind)/16)
   ][, .SD, .SDcols = c(names(df), "newtimestamp")]

I prefer tidyr::fill's .direction = "updown" over the nested nafill calls; it reduces the nesting and is easier to read in pipes like this.
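For reference, .direction = "updown" fills up first and then down, which matches the nested nafill(nafill(x, type = "nocb"), type = "locf") used above:

x <- c(NA, 1L, NA, 3L, NA)
tidyr::fill(data.frame(x), x, .direction = "updown")$x
# [1] 1 1 3 3 3
data.table::nafill(data.table::nafill(x, type = "nocb"), type = "locf")
# [1] 1 1 3 3 3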

Ablebodied answered 30/1 at 1:37

Comments:
On my fairly ordinary laptop this chews through 100M rows of mock data in 12 seconds, which is roughly 30,000x faster than the 4 ms/row quoted in the OP (if my sums are right). It's also faster than a data.table/collapse solution I'd been working on, which ran in 20 s. – Confectionary
@Tobo, I think you should keep your data.table+collapse answer undeleted. Even if it takes slightly longer, we are not assured at this point that (1) my answer is correct with the real data, and (2) they may be on a package diet where tidyverse doesn't fit. – Ablebodied
I'd love to see a data.table solution. I've been meaning to learn it in general. – Suber
Thanks! Still working through the dplyr solution with a larger, messier dataset, but it's looking great so far! The two edge cases I've seen so far (both out of scope of the original post) are matching "D" lines with timestamps across group boundaries (short subsets of data from the field are sent via cell every 4 h) and an issue with duplicate timestamps (rare). Grouping by subset/send before adding timestamps solves the first case. I think the second is caused by missing the first "D" line after an "S" line, which offsets the time of that "D" line back by 1/16 s. – Suber
I ended up using this solution, but modified it to use the previous timestamp unless there was a gap in count, in order to take care of the problem of duplicate timestamps when timestamping line n from the previous timestamp and line n+1 from the next timestamp produces the same time. This wasn't seen in the example data, so better to accept this as is. Thank you all again for your help! – Suber
