You can split
the seqence along a by a which is categorized (factor
) by b.
split(seq_along(a), factor(a, b))
If needed the result can be unnamed.
unname(split(seq_along(a), factor(a, b)))
Or reducing to those which are in b.
. <- which(a %in% b)
split(., factor(a[.], b))
Or a way closer to the method from @Ritchie Sacramento and @ThomasIsCoding and using fastmatch
.
library(fastmatch)
. <- which(a %fin% b)
split(., factor(fmatch(a[.], b), 1:length(b)))
Or using collapse::rsplit
.
library(collapse)
rsplit(NULL, a)[as.character(b)] #Returns NA where others return numeric(0)
Or some combinations of collapse
and fastmatch
.
library(fastmatch)
Library(collapse)
. <- fmatch(a, b)
attr(., "levels") <- b
class(.) <- "factor" #WARNING Please don't use that
gsplit(NULL, .)[-length(b)-1]
i <- which(a %fin% b)
. <- fmatch(a[i], b)
attr(., "levels") <- b
class(.) <- "factor" #WARNING Please don't use that
gsplit(i, .)
. <- fmatch(a, b)
i <- which(!is.na(.))
attr(., "levels") <- b
class(.) <- "factor" #WARNING Please don't use that
gsplit(i, .[i])
Or write it in C++ using rcpp.
Rcpp::cppFunction("
Rcpp::List getIdx(const Rcpp::IntegerVector& a,
const Rcpp::IntegerVector& b) {
std::unordered_map<int, std::vector<int> > m;
for(auto const& i : b) m[i].clear();
for(int i=0; i<=a.size(); ++i) {
auto j = m.find(a[i]);
if(j != m.end()) j->second.push_back(i+1);
}
std::vector< std::vector<int> > res;
for(auto const& i : b) res.push_back(std::move(m[i]));
return wrap( res );
}")
getIdx(a, b)
Benchmark
library(collapse) #For Mael and rsplit, gsplit
library(fastmatch) #For fmatch
library(data.table) #For jblood94
methods = alist(original = sapply(b, function(x) which(a %in% x)),
Thomas = unname(by(seq_along(a), a, list)[as.character(b)]), #Not equal the original -> check = FALSE
"==" = lapply(b, function(x) which(a == x)),
Christian = {res = rep_len(list(integer(0)), length(b))
comm = intersect(b, a)
res[match(comm, b)] = lapply(comm, function(x) which(a==x))
res},
Mael1 = lapply(b, function(x) a %==% x),
Mael2 = lapply(b, function(x) whichv(a, x)),
Ritchie = { m <- b[match(a, b)]
idx <- !is.na(m)
unname(split(which(idx), factor(match(m[idx], b), levels = seq.int(length(b)))))},
jblood94 = {out <- rep(list(integer(0)), length(b))
dt <- data.table(i = (i <- which(a %fin% b)), a = a[i])[, .(i = .(i)), a]
out[fmatch(dt[[1]], b)] <- dt[[2]]
out},
split = split(seq_along(a), factor(a, b)), #Returns named list
splitu = unname(split(seq_along(a), factor(a, b))),
splitIn = {. <- which(a %in% b); split(., factor(a[.], b))}, #Returns named list
fmatch = {. <- which(a %fin% b) #Returns named list
split(., factor(fmatch(a[.], b), 1:length(b))) },
rcpp = getIdx(a, b),
rsplit = rsplit(NULL, a)[as.character(b)],
fmCol1 = {. <- fmatch(a, b); attr(., "levels") <- b
class(.) <- "factor"; gsplit(NULL, .)[-length(b)-1]},
fmCol2 = {i <- which(a %fin% b); . <- fmatch(a[i], b); attr(., "levels") <- b
class(.) <- "factor"; gsplit(i, .)},
fmCol3 = {. <- fmatch(a, b); i <- which(!is.na(.)); attr(., "levels") <- b
class(.) <- "factor"; gsplit(i, .[i])}
)
Results
set.seed(1)
a <- sample(1:100, 200, replace=TRUE)
b <- sample(1:100, 40)
bench::mark(check = FALSE, exprs = methods)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total…¹
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>
1 original 110.13µs 123.87µs 7619. 3.97MB 22.4 3405 10 447ms
2 Thomas 940.95µs 993.35µs 1000. 256.16KB 36.2 442 16 442ms
3 == 66.82µs 74.11µs 12243. 117.27KB 29.0 5482 13 448ms
4 Christian 64.22µs 70.38µs 13075. 83.47KB 30.2 5636 13 431ms
5 Mael1 39.7µs 42.81µs 22750. 41.07KB 32.4 9130 13 401ms
6 Mael2 42.09µs 45.83µs 21051. 41.12KB 31.4 9384 14 446ms
7 Ritchie 18.22µs 20.33µs 47043. 49.07KB 28.2 9994 6 212ms
8 jblood94 348.23µs 372.25µs 2613. 1.96MB 23.5 1221 11 467ms
9 split 14.74µs 16.53µs 58574. 8.68KB 17.6 9997 3 171ms
10 splitu 15.2µs 17.08µs 56286. 8.68KB 22.5 9996 4 178ms
11 splitIn 15.17µs 16.95µs 56171. 10.23KB 28.1 9995 5 178ms
12 fmatch 15.49µs 17.33µs 54217. 9KB 21.7 9996 4 184ms
13 rcpp 6.14µs 6.75µs 140790. 2.85KB 14.1 9999 1 71ms
14 rsplit 22.46µs 25.09µs 37062. 9.84KB 29.7 9992 8 270ms
15 fmCol1 13.62µs 15.44µs 61478. 42.27KB 30.8 9995 5 163ms
16 fmCol2 12.64µs 14.2µs 68176. 4.46KB 27.3 9996 4 147ms
17 fmCol3 15.31µs 17.02µs 55035. 15.62KB 38.6 9993 7 182ms
set.seed(42)
a <- sample(1:10000, 20000, replace=TRUE)
b <- sample(1:10000, 4000)
bench::mark(check = FALSE, exprs = methods)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total…¹
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>
1 original 698.85ms 698.85ms 1.43 1.19GB 35.8 1 25 699ms
2 Thomas 115.5ms 118.76ms 8.45 2.6MB 42.2 5 25 592ms
3 == 137.85ms 165.14ms 5.73 610.78MB 72.6 3 38 524ms
4 Christian 118.31ms 123.18ms 8.13 527.1MB 89.5 5 55 615ms
5 Mael1 45.27ms 47.62ms 18.4 305.42MB 116. 10 63 544ms
6 Mael2 43.84ms 45.76ms 21.9 305.42MB 137. 11 69 503ms
7 Ritchie 2.71ms 2.77ms 352. 1.21MB 7.96 177 4 503ms
8 jblood94 4.7ms 6.02ms 163. 649.87KB 29.8 82 15 503ms
9 split 4.01ms 4.1ms 238. 752.3KB 3.99 119 2 501ms
10 splitu 4.02ms 4.18ms 229. 752.3KB 3.98 115 2 503ms
11 splitIn 2.68ms 2.8ms 346. 863.79KB 5.97 174 3 502ms
12 fmatch 2.36ms 2.51ms 383. 769.34KB 5.98 192 3 501ms
13 rcpp 875.88µs 896.47µs 930. 33.79KB 8.00 465 4 500ms
14 rsplit 1.97ms 2.06ms 464. 902.81KB 12.0 232 6 500ms
15 fmCol1 854.57µs 893.17µs 1035. 672.34KB 16.0 518 8 500ms
16 fmCol2 410.06µs 425.88µs 2156. 407.27KB 22.0 1078 11 500ms
17 fmCol3 365.01µs 381.88µs 2285. 454KB 26.0 1143 13 500ms
set.seed(1) #Taken from jblood94, Test only a selection of methods
a <- sample(1:1e6, 2e6, replace = TRUE)
b <- sample(1:1e6, 4e5)
bench::mark(check = FALSE, exprs = methods[-1:-6], min_iterations = 7)
expression min median `itr/sec` mem_alloc gc/se…¹ n_itr n_gc total_…²
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 Ritchie 593.69ms 789.89ms 1.29 127.36MB 2.20 7 12 5.45s
2 jblood94 500.65ms 616.8ms 1.47 57.11MB 4.84 7 23 4.75s
3 split 1.15s 1.28s 0.767 79.14MB 0.658 7 6 9.12s
4 splitu 1.25s 1.39s 0.724 75.14MB 0.517 7 5 9.67s
5 splitIn 650.54ms 884.24ms 1.15 86.79MB 0.987 7 6 6.08s
6 fmatch 598.77ms 626.69ms 1.55 76.69MB 1.11 7 5 4.5s
7 rcpp 382.63ms 389.04ms 2.48 3.05MB 0.355 7 1 2.82s
8 rsplit 618.36ms 723.24ms 1.39 84.64MB 1.59 7 8 5.05s
9 fmCol1 187.54ms 235.62ms 3.91 65.61MB 3.35 7 6 1.79s
10 fmCol2 64.61ms 67.62ms 13.7 39.68MB 5.87 7 3 511.42ms
11 fmCol3 58.71ms 64.26ms 14.4 44.26MB 5.41 8 3 554.08ms
a
andb
are unique? (sample
withoutreplace = TRUE
will give unique values.) – Ostrawas.list(a[(which(a %in% b))])
? – Dextran