Below is a vectorized function that uses collapse::fmode
. It will process your "real" dataset in less than half a second and can handle any data type. It wasn't clear if NA
should be included in the count, hence the incl.na
argument.
rowsames <- function(df, incl.na = TRUE) {
u <- unlist(df, 0, 0)
m <- match(u, unique(u), NA_integer_, if (incl.na) NULL else NA)
rowSums(`dim<-`(fmode(m, sequence(rep.int(nrow(df), ncol(df))), na.rm = TRUE,
use.g.names = FALSE) == m, dim(df)), na.rm = TRUE)
}
Applying it on the example data.frame
:
set.seed(1548243204)
df = data.frame(statement_1 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_2 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_3 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_4 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_5 = sample(c(NA, 1:5), 1000, replace = TRUE))
rowsames(df)[1:10]
#> [1] 2 3 3 2 3 2 2 2 3 2
rowsames(df, FALSE)[1:10]
#> [1] 2 3 3 2 3 2 2 2 3 1
Note NA
is the most frequent value in the 10th row, so rowsames
returns 2
for that row if incl.na
is set to TRUE
and 1
otherwise.
df[1:10,]
#> statement_1 statement_2 statement_3 statement_4 statement_5
#> 1 5 5 4 NA 2
#> 2 4 3 5 5 5
#> 3 5 5 NA 5 NA
#> 4 4 5 2 1 5
#> 5 3 4 4 4 3
#> 6 3 NA 3 1 1
#> 7 NA 4 NA 2 4
#> 8 NA 1 4 3 3
#> 9 4 2 4 3 4
#> 10 2 NA NA 4 5
my real data has 30 cols and 200.000 rows
df <- as.data.frame(matrix(sample(c(NA, 1:5), 6e6, 1), 2e5, 30, 0,
list(NULL, paste0("statement_", 1:30))))
system.time(rowsames(df))
#> user system elapsed
#> 0.41 0.03 0.43
Additional Benchmarking
Extending @RonakShah's benchmarking:
set.seed(3452)
df = data.frame(statement_1 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_2 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_3 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_4 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_5 = sample(c(NA, 1:5), 1e5, replace = TRUE))
fun_dplyr <- function(df) {
columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
df %>%
rowwise() %>%
mutate(identical_count = case_when(
# all NA, then NA
all(is.na(c_across(all_of(columns)))) ~ NA_real_,
# else, count the number of identical values
TRUE ~ max(table(c_across(all_of(columns)))))) %>%
ungroup()
}
fun_apply <- function(df) {
apply(as.matrix(df), MARGIN=1, FUN=\(x) max(tabulate(x)))
}
fun_dapply <- function(df) {
collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
}
Timings:
microbenchmark::microbenchmark(
# fun_dplyr = fun_dplyr(df), # too slow
fun_apply = fun_apply(df),
fun_dapply = fun_dapply(df),
rowsames = rowsames(df)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> fun_apply 298.4000 327.0602 355.26551 352.78755 379.27705 433.5157 100
#> fun_dapply 235.7283 261.1802 300.46456 287.33650 335.46850 442.8762 100
#> rowsames 34.4906 36.1296 37.61214 37.16805 38.68065 47.0696 100