Fast way to return matching elements as list

Asked 17/10, 2022 at 12:28 Answered 19/10, 2022 at 14:57

I have two vectors:

set.seed(1)

a <- sample(1:100,200, replace=T)

b <- sample(1:100,40, replace=F)

I want to find the the positions of elements in a that match b:

sapply(b, function(x) which(a %in% x))

this would do the job but it takes very long

Is there a way to store the results in a list that is actually fast?

The desired output would look like this:

sapply(b, function(x) which(a %in% x))
[[1]]
integer(0)

[[2]]
integer(0)

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
integer(0)

[[7]]
integer(0)

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

Herophilus answered 17/10, 2022 at 12:28 Comment(3)

The real values in a and b are unique? (sample without replace = TRUE will give unique values.) – Ostraw 17/10, 2022 at 12:40

What about as.list(a[(which(a %in% b))])? – Dextran 17/10, 2022 at 12:42

@Maël OP also seems to want to have the mismatched values in the list – Autonomous 17/10, 2022 at 12:49

We can play a trick with by and index the list by as.character(b) (BUT, I would say this option is just fun but not fast), e.g.,

> unname(by(seq_along(a), a, list)[as.character(b)])
[[1]]
NULL

[[2]]
NULL

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
NULL

[[7]]
NULL

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

[[10]]
[1] 191

[[11]]
[1]  55 118

[[12]]
[1] 192

[[13]]
[1]  40  64 110 165

[[14]]
[1]  20  22 122 175

[[15]]
[1] 137 189

[[16]]
[1] 134

[[17]]
[1] 128

[[18]]
[1]  17  81 184

[[19]]
NULL

[[20]]
[1] 188 194

[[21]]
[1]  98 180

[[22]]
[1]  62 145

[[23]]
[1] 33

[[24]]
NULL

[[25]]
[1] 47

[[26]]
NULL

[[27]]
[1]  29 114 159

[[28]]
[1]  18  26 171

[[29]]
[1]  28  69 186 200

[[30]]
[1] 42

[[31]]
[1]  79 158 190

[[32]]
[1]  5 38 58 82

[[33]]
[1]  35  74 121

[[34]]
[1] 150

[[35]]
[1]  34  36 139

[[36]]
[1]  70 100 117 195

[[37]]
NULL

[[38]]
[1]  32  46 102

[[39]]
[1]  89 133

[[40]]
[1] 127 129 160

Teleology answered 17/10, 2022 at 14:7 Comment(2)

@Herophilus Be aware that this option is fun but not that fast (as you expected).... – Teleology 17/10, 2022 at 14:13

@Penitential I don't think my answer is fast – Teleology 17/10, 2022 at 14:14

You can split the seqence along a by a which is categorized (factor) by b.

split(seq_along(a), factor(a, b))

If needed the result can be unnamed.

unname(split(seq_along(a), factor(a, b)))

Or reducing to those which are in b.

. <- which(a %in% b)
split(., factor(a[.], b))

Or a way closer to the method from @Ritchie Sacramento and @ThomasIsCoding and using fastmatch.

library(fastmatch)
. <- which(a %fin% b)
split(., factor(fmatch(a[.], b), 1:length(b)))

Or using collapse::rsplit.

library(collapse)
rsplit(NULL, a)[as.character(b)]  #Returns NA where others return numeric(0)

Or some combinations of collapse and fastmatch.

library(fastmatch)
Library(collapse)

. <- fmatch(a, b)
attr(., "levels") <- b
class(.) <- "factor"  #WARNING Please don't use that
gsplit(NULL, .)[-length(b)-1]

i <- which(a %fin% b)
. <- fmatch(a[i], b)
attr(., "levels") <- b
class(.) <- "factor"  #WARNING Please don't use that
gsplit(i, .)

. <- fmatch(a, b)
i <- which(!is.na(.))
attr(., "levels") <- b
class(.) <- "factor"  #WARNING Please don't use that
gsplit(i, .[i])

Or write it in C++ using rcpp.

Rcpp::cppFunction("
Rcpp::List getIdx(const Rcpp::IntegerVector& a,
                  const Rcpp::IntegerVector& b) {
  std::unordered_map<int, std::vector<int> > m;
  for(auto const& i : b) m[i].clear();
  for(int i=0; i<=a.size(); ++i) {
    auto j = m.find(a[i]);
    if(j != m.end()) j->second.push_back(i+1);
  }
  std::vector< std::vector<int> > res;
  for(auto const& i : b) res.push_back(std::move(m[i]));
  return wrap( res );
}")
getIdx(a, b)

Benchmark

library(collapse) #For Mael and rsplit, gsplit
library(fastmatch) #For fmatch
library(data.table) #For jblood94

methods = alist(original = sapply(b, function(x) which(a %in% x)),
 Thomas = unname(by(seq_along(a), a, list)[as.character(b)]), #Not equal the original -> check = FALSE
 "==" = lapply(b, function(x) which(a == x)),
 Christian = {res  = rep_len(list(integer(0)), length(b))
                comm = intersect(b, a)
                res[match(comm, b)] = lapply(comm, function(x) which(a==x))
                res},
 Mael1 = lapply(b, function(x) a %==% x),
 Mael2 = lapply(b, function(x) whichv(a, x)),
 Ritchie = { m <- b[match(a, b)]
    idx <- !is.na(m)
    unname(split(which(idx), factor(match(m[idx], b), levels = seq.int(length(b)))))},
 jblood94 = {out <- rep(list(integer(0)), length(b))
   dt <- data.table(i = (i <- which(a %fin% b)), a = a[i])[, .(i = .(i)), a]
   out[fmatch(dt[[1]], b)] <- dt[[2]]
   out},
 split = split(seq_along(a), factor(a, b)), #Returns named list
 splitu = unname(split(seq_along(a), factor(a, b))),
 splitIn = {. <- which(a %in% b); split(., factor(a[.], b))},  #Returns named list
 fmatch = {. <- which(a %fin% b)   #Returns named list
   split(., factor(fmatch(a[.], b), 1:length(b))) },
 rcpp = getIdx(a, b),
 rsplit = rsplit(NULL, a)[as.character(b)],
 fmCol1 = {. <- fmatch(a, b); attr(., "levels") <- b
    class(.) <- "factor"; gsplit(NULL, .)[-length(b)-1]},
 fmCol2 = {i <- which(a %fin% b); . <- fmatch(a[i], b); attr(., "levels") <- b
    class(.) <- "factor"; gsplit(i, .)},
 fmCol3 = {. <- fmatch(a, b); i <- which(!is.na(.)); attr(., "levels") <- b
    class(.) <- "factor"; gsplit(i, .[i])}
)

Results

set.seed(1)
a <- sample(1:100, 200, replace=TRUE)
b <- sample(1:100, 40)
bench::mark(check = FALSE, exprs = methods)

   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total…¹
   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl> <bch:t>
 1 original   110.13µs 123.87µs     7619.    3.97MB     22.4  3405    10   447ms
 2 Thomas     940.95µs 993.35µs     1000.  256.16KB     36.2   442    16   442ms
 3 ==          66.82µs  74.11µs    12243.  117.27KB     29.0  5482    13   448ms
 4 Christian   64.22µs  70.38µs    13075.   83.47KB     30.2  5636    13   431ms
 5 Mael1        39.7µs  42.81µs    22750.   41.07KB     32.4  9130    13   401ms
 6 Mael2       42.09µs  45.83µs    21051.   41.12KB     31.4  9384    14   446ms
 7 Ritchie     18.22µs  20.33µs    47043.   49.07KB     28.2  9994     6   212ms
 8 jblood94   348.23µs 372.25µs     2613.    1.96MB     23.5  1221    11   467ms
 9 split       14.74µs  16.53µs    58574.    8.68KB     17.6  9997     3   171ms
10 splitu       15.2µs  17.08µs    56286.    8.68KB     22.5  9996     4   178ms
11 splitIn     15.17µs  16.95µs    56171.   10.23KB     28.1  9995     5   178ms
12 fmatch      15.49µs  17.33µs    54217.       9KB     21.7  9996     4   184ms
13 rcpp         6.14µs   6.75µs   140790.    2.85KB     14.1  9999     1    71ms
14 rsplit      22.46µs  25.09µs    37062.    9.84KB     29.7  9992     8   270ms
15 fmCol1      13.62µs  15.44µs    61478.   42.27KB     30.8  9995     5   163ms
16 fmCol2      12.64µs   14.2µs    68176.    4.46KB     27.3  9996     4   147ms
17 fmCol3      15.31µs  17.02µs    55035.   15.62KB     38.6  9993     7   182ms

set.seed(42)
a <- sample(1:10000, 20000, replace=TRUE)
b <- sample(1:10000, 4000)
bench::mark(check = FALSE, exprs = methods)

   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total…¹
   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl> <bch:t>
 1 original   698.85ms 698.85ms      1.43    1.19GB    35.8      1    25   699ms
 2 Thomas      115.5ms 118.76ms      8.45     2.6MB    42.2      5    25   592ms
 3 ==         137.85ms 165.14ms      5.73  610.78MB    72.6      3    38   524ms
 4 Christian  118.31ms 123.18ms      8.13   527.1MB    89.5      5    55   615ms
 5 Mael1       45.27ms  47.62ms     18.4   305.42MB   116.      10    63   544ms
 6 Mael2       43.84ms  45.76ms     21.9   305.42MB   137.      11    69   503ms
 7 Ritchie      2.71ms   2.77ms    352.      1.21MB     7.96   177     4   503ms
 8 jblood94      4.7ms   6.02ms    163.    649.87KB    29.8     82    15   503ms
 9 split        4.01ms    4.1ms    238.     752.3KB     3.99   119     2   501ms
10 splitu       4.02ms   4.18ms    229.     752.3KB     3.98   115     2   503ms
11 splitIn      2.68ms    2.8ms    346.    863.79KB     5.97   174     3   502ms
12 fmatch       2.36ms   2.51ms    383.    769.34KB     5.98   192     3   501ms
13 rcpp       875.88µs 896.47µs    930.     33.79KB     8.00   465     4   500ms
14 rsplit       1.97ms   2.06ms    464.    902.81KB    12.0    232     6   500ms
15 fmCol1     854.57µs 893.17µs   1035.    672.34KB    16.0    518     8   500ms
16 fmCol2     410.06µs 425.88µs   2156.    407.27KB    22.0   1078    11   500ms
17 fmCol3     365.01µs 381.88µs   2285.       454KB    26.0   1143    13   500ms

set.seed(1) #Taken from jblood94, Test only a selection of methods
a <- sample(1:1e6, 2e6, replace = TRUE)
b <- sample(1:1e6, 4e5)
bench::mark(check = FALSE, exprs = methods[-1:-6], min_iterations = 7)

   expression      min   median `itr/sec` mem_alloc gc/se…¹ n_itr  n_gc total_…²
   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>   <dbl> <int> <dbl> <bch:tm>
 1 Ritchie    593.69ms 789.89ms     1.29   127.36MB   2.20      7    12    5.45s
 2 jblood94   500.65ms  616.8ms     1.47    57.11MB   4.84      7    23    4.75s
 3 split         1.15s    1.28s     0.767   79.14MB   0.658     7     6    9.12s
 4 splitu        1.25s    1.39s     0.724   75.14MB   0.517     7     5    9.67s
 5 splitIn    650.54ms 884.24ms     1.15    86.79MB   0.987     7     6    6.08s
 6 fmatch     598.77ms 626.69ms     1.55    76.69MB   1.11      7     5     4.5s
 7 rcpp       382.63ms 389.04ms     2.48     3.05MB   0.355     7     1    2.82s
 8 rsplit     618.36ms 723.24ms     1.39    84.64MB   1.59      7     8    5.05s
 9 fmCol1     187.54ms 235.62ms     3.91    65.61MB   3.35      7     6    1.79s
10 fmCol2      64.61ms  67.62ms    13.7     39.68MB   5.87      7     3 511.42ms
11 fmCol3      58.71ms  64.26ms    14.4     44.26MB   5.41      8     3 554.08ms

Ostraw answered 17/10, 2022 at 12:50 Comment(5)

yes but I would need a bigger speed improvement because the vectors are really large – Herophilus 17/10, 2022 at 14:7

I add a version written in c++ which can be used in R with rcpp. This one is with the given data about 15 times faster than the original. – Ostraw 17/10, 2022 at 19:46

the split + factor looks super neat, love it! +1! – Teleology 18/10, 2022 at 11:58

@GKi, thanks for checking. I checked again, and now it's the same--maybe from starting a new session? I'll update my answer. – Apologetic 20/10, 2022 at 20:40

@Apologetic Thanks for the comment, what forced me to look again on it and improve the speed of the C++ version. – Ostraw 21/10, 2022 at 19:24

Here is another way to solve your problem. I also added a benchmark of the solutions proposed by @GKi, @ThomasIsCoding, and mine.

res  = rep_len(list(integer(0)), length(b))
comm = intersect(b, a)                            # do not flip the order of elements
res[match(comm, b)] = lapply(comm, function(x) which(a==x))

[[1]]
integer(0)

[[2]]
integer(0)

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
integer(0)

[[7]]
integer(0)

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

...

Benchmark

f1_kamgang = function() {
  res  = rep_len(list(integer(0)), length(b))
  comm = intersect(b, a)                               # do not flip the order of elements
  res[match(comm, b)] = lapply(comm, function(x) which(a==x))
  res
}

f2_Thomas = function() {
  unname(by(seq_along(a), a, list)[as.character(b)])
}

f3_GKi = function() {
  sapply(b, function(x) which(a == x))     
}

f4_original = function() {
  sapply(b, function(x) which(a %in% x))     
}



microbenchmark::microbenchmark(
  f1_kamgang(),
  f2_Thomas(),
  f3_GKi(),
  f4_original(),
  times=5L
)

Unit: microseconds
          expr    min     lq    mean median     uq    max neval cld
  f1_kamgang()  191.2  194.8  209.70  214.3  216.5  231.7     5 a  
   f2_Thomas() 2105.4 2130.0 2240.76 2267.9 2308.8 2391.7     5   c
      f3_GKi()  223.5  231.0  242.72  242.6  253.9  262.6     5 a  
 f4_original()  365.5  366.3  393.10  376.9  399.6  457.2     5  b

Villenage answered 17/10, 2022 at 13:5 Comment(2)

Thanks, sorry I updated the question to clarify some things. this is close but I would actually like to group all matching entries into the same position in the list. – Herophilus 17/10, 2022 at 13:21

Try this and let me know if it's fast enough – Villenage 17/10, 2022 at 14:31

Here is a vectorized approach that is efficient:

f <- function(x, y) {
  m <- y[match(x, y)]
  idx <- !is.na(m)
  unname(split(which(idx), factor(match(m[idx], b), levels = seq.int(length(y)))))
}

f(a, b)

[[1]]
integer(0)

[[2]]
integer(0)

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
integer(0)

[[7]]
integer(0)

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

[[10]]
[1] 191

...

Hanley answered 17/10, 2022 at 15:15 Comment(0)

You can use %==%, a faster version of which from collapse (3x faster). Another collapse function that might come in handy is whichv. Also lapply is faster than sapply:

library(collapse)
lapply(b, function(x) a %==% x)
lapply(b, function(x) whichv(a, x))

identical(sapply(b, function(x) a %==% x),
          sapply(b, function(x) which(a %in% x)))
#[1] TRUE

benchmark

bench::mark(
  by = unname(by(seq_along(a), a, list)[as.character(b)]),
  original = sapply(b, function(x) which(a %in% x)),
  "==" = sapply(b, function(x) which(a == x)),
  collapse_s = sapply(b, function(x) collapse::whichv(a, x)),
  collapse_l = lapply(b, function(x) collapse::whichv(a, x)),
  "%==%" = lapply(b, function(x) a %==% x),
  time_unit = "ms",
  check = FALSE
)

# expression    min median `itr/sec` mem_a…¹ gc/se…² n_itr  n_gc total…³ result
# by         1.56   1.74        545.  27.1KB    15.6   244     7    448. <NULL>
# original   0.189  0.221      3770. 140.9KB    11.4  1659     5    440. <NULL>
# ==         0.103  0.118      7657.  74.8KB    13.8  3319     6    433. <NULL>
# collapse_s 0.0838 0.0917     9884.  39.8KB    15.8  4389     7    444. <NULL>
# collapse_l 0.0722 0.0794    11379.  38.9KB    13.5  4217     5    371. <NULL>
# %==%       0.0591 0.0648    13852.  38.8KB    15.5  6239     7    450. <NULL>

Penitential answered 17/10, 2022 at 12:35 Comment(0)

It's worth mentioning that for very large vectors, the fmatch solution provided by GKi is faster than Rcpp. We can squeeze a little more performance by using data.table instead of split:

library(fastmatch)
library(data.table)

set.seed(1)
a <- sample(1:1e6, 2e6, replace = TRUE)
b <- sample(1:1e6, 4e5)

Rcpp::cppFunction("
Rcpp::List getIdx(const Rcpp::IntegerVector& a,
                  const Rcpp::IntegerVector& b) {
  std::map<int, std::vector<int> > m;
  for(auto const& i : b) m[i].clear();
  for(int i=0; i<=a.size(); ++i) {
    auto j = m.find(a[i]);
    if(j != m.end()) j->second.push_back(i+1);
  }
  std::vector< std::vector<int> > res;
  for(auto const& i : b) res.push_back(m[i]);
  return wrap( res );
}")

f1 <- function(a, b) {
  out <- rep(list(integer(0)), length(b))
  dt <- data.table(i = (i <- which(a %fin% b)), a = a[i])[, .(i = .(i)), a]
  out[fmatch(dt[[1]], b)] <- dt[[2]]
  out
} 
f2 <- function(a, b) unname(split(seq_along(a), factor(a, b)))
f3 <- function(a, b) unname(split(. <- which(a %fin% b), factor(fmatch(a[.], b), 1:length(b))))

microbenchmark::microbenchmark(
  data.table = f1(a, b),
  splitu = f2(a, b),
  fmatch = f3(a, b),
  rcpp = getIdx(a, b),
  times = 10,
  check = "identical"
)

#> Unit: milliseconds
#>        expr       min        lq      mean    median        uq       max neval
#>  data.table  514.9984  537.0271  666.7136  627.8260  801.4822  917.1895    10
#>      splitu 1149.5673 1304.1898 1534.1828 1433.2475 1857.5032 2001.8947    10
#>      fmatch  539.6848  648.0697  708.2637  725.9943  773.4836  853.1337    10
#>        rcpp 1179.0837 1201.9983 1247.6945 1209.8060 1233.9712 1545.2003    10

Apologetic answered 19/10, 2022 at 14:57 Comment(0)

If you are happy with NA instead of elements with lengths zero, you can coerce the vector as list and replace.

replace(as.list(a), !a %in% b, NA)

Autonomous answered 17/10, 2022 at 12:36 Comment(0)

We can play a trick with by and index the list by as.character(b) (BUT, I would say this option is just fun but not fast), e.g.,

> unname(by(seq_along(a), a, list)[as.character(b)])
[[1]]
NULL

[[2]]
NULL

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
NULL

[[7]]
NULL

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

[[10]]
[1] 191

[[11]]
[1]  55 118

[[12]]
[1] 192

[[13]]
[1]  40  64 110 165

[[14]]
[1]  20  22 122 175

[[15]]
[1] 137 189

[[16]]
[1] 134

[[17]]
[1] 128

[[18]]
[1]  17  81 184

[[19]]
NULL

[[20]]
[1] 188 194

[[21]]
[1]  98 180

[[22]]
[1]  62 145

[[23]]
[1] 33

[[24]]
NULL

[[25]]
[1] 47

[[26]]
NULL

[[27]]
[1]  29 114 159

[[28]]
[1]  18  26 171

[[29]]
[1]  28  69 186 200

[[30]]
[1] 42

[[31]]
[1]  79 158 190

[[32]]
[1]  5 38 58 82

[[33]]
[1]  35  74 121

[[34]]
[1] 150

[[35]]
[1]  34  36 139

[[36]]
[1]  70 100 117 195

[[37]]
NULL

[[38]]
[1]  32  46 102

[[39]]
[1]  89 133

[[40]]
[1] 127 129 160

Teleology answered 17/10, 2022 at 14:7 Comment(2)

@Herophilus Be aware that this option is fun but not that fast (as you expected).... – Teleology 17/10, 2022 at 14:13

@Penitential I don't think my answer is fast – Teleology 17/10, 2022 at 14:14

Hot tags

Godot Unity Godot Help Programming Godot 4.X GUI GDScript 3D 2D Physics CSharp Godot 3.X VR XR Projects C++

Benchmark

Recommended topics

Hot tags