Is there an efficient way to extract a slice of a 3d array?
Asked Answered
B

4

12

Consider this:

# Build a 3 x 4 x 5 array; values 1:60 fill column-major (first index fastest).
a = array(1:60, c(3,4,5))

# Extract (*,2,2)^th elements
a[cbind(1:3,2,2)]

# ^^ returns a vector of length 3, c(16, 17, 18)
# Note this is asymptotically inefficient -- cbind(1:k,2,2) is a kx3 matrix
# So it's not making use of slicing

# Extract (1, y_i, z_i)^th elements
# y and z are random index vectors into the 2nd and 3rd dimensions.
N = 100000
y = sample(1:4, N, replace = TRUE)
z = sample(1:5, N, replace = TRUE)
a[cbind(1, y, z)]

# ^^ returns a vector of length N

How do I efficiently extract the (*, y_i, z_i) elements, with the result as a Nx3 matrix (i.e. 2d array)?

Note this question is similar, but the only answer proceeds elementwise and so will be slow.

Browse answered 19/8, 2024 at 9:29 Comment(1)
I once had (approximately) this same need in VBA - perhaps it would be interesting for future readers of your question. (after reviewing my question, I was looking for a generalized solution to take an N dimensional array, specify an index in one of the dimensions, and get the resulting N-1 dimensional slice)Kriskrischer
S
8

A way might be to calculate the linear index of the array using its dim.

# Linear (vector) indexing: the 0-based offset of element (., y_i, z_i) in
# as.vector(a) is (y-1)*dim(a)[1] + (z-1)*dim(a)[1]*dim(a)[2]; outer() adds
# each row number 1..nrow(a), giving an N x nrow(a) matrix of positions.
outer((y-1)*dim(a)[1] + (z-1)*prod(dim(a)[1:2]), seq_len(nrow(a)), \(x, y) a[x+y])
# Same offsets, built one column at a time (lapply over the row index, with the
# offset vector passed as the extra argument), then cbind into an N x 3 matrix.
do.call(cbind, lapply(seq_len(nrow(a)), \(x, y) a[x+y], (y-1)*dim(a)[1] + (z-1)*prod(dim(a)[1:2])))

Or reducing the dim and selecting whole columns or whole rows.

# Flatten a to a dim(a)[1] x prod(dim(a)[-1]) matrix via `dim<-` (no data
# reordering -- the first dimension stays as rows), select the column for each
# (y_i, z_i) pair (column index y + (z-1)*dim(a)[2]), then transpose to N x 3.
t(`dim<-`(a, c(dim(a)[1], prod(dim(a)[-1])))[,y + (z - 1)*dim(a)[2]])

# Same idea, but transpose the flattened matrix first and select rows.
t(`dim<-`(a, c(dim(a)[1], prod(dim(a)[-1]))))[y + (z - 1)*dim(a)[2],]


Benchmark:

# Candidate implementations, stored unevaluated with alist() so the same
# expressions can be benchmarked again with different inputs further below.
m <- alist(
  # rep() the row index, one big matrix-index lookup, reshape to N x nrow(a)
  matrix_indexing = `dim<-`(a[cbind(rep(seq_len(nrow(a)), each = N), y, z)], c(N, nrow(a))),
  # one matrix-index lookup per row of the 1st dimension
  sapply_indexing = sapply(seq_len(nrow(a)), \(i) a[cbind(i, y, z)]),
  # slice a[i,,] to a 2d matrix first, then matrix-index it with (y, z) pairs
  sapply_indexing2 = {yz <- cbind(y, z); sapply(1:dim(a)[1], \(i) a[i,,][yz])},
  # linear indexing: precomputed (y, z) offsets plus the row number
  linearIndex = outer((y-1)*dim(a)[1] + (z-1)*prod(dim(a)[1:2]), seq_len(nrow(a)), \(x, y) a[x+y]),
  linearIndex2 = do.call(cbind, lapply(seq_len(nrow(a)), \(x, y) a[x+y], (y-1)*dim(a)[1] + (z-1)*prod(dim(a)[1:2]))),
  # split into a list of 1st-dimension slices, pick by (y, z), rbind
  asplit = do.call(rbind, asplit(a, -1)[cbind(y, z)]),
  # permute so dimension 1 comes last, flatten to 2d, then plain row selection
  aperm = `dim<-`(aperm(a, c(2, 3, 1)), c(prod(dim(a)[-1]), dim(a)[1]))[y + (z - 1)*dim(a)[2],],
  # flatten to 2d with `dim<-`, column-select + transpose (and vice versa)
  dimT = t(`dim<-`(a, c(dim(a)[1], prod(dim(a)[-1])))[,y + (z - 1)*dim(a)[2]]),
  tDim = t(`dim<-`(a, c(dim(a)[1], prod(dim(a)[-1]))))[y + (z - 1)*dim(a)[2],]
)
# Scenario 1: small array (3 x 4 x 5), many samples (N = 100000).
a = array(1:60, c(3,4,5))
N = 100000
y = sample(1:4, N, replace = TRUE)
z = sample(1:5, N, replace = TRUE)
bench::mark(exprs = m)
#  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 matrix_in…  9.46ms  9.59ms     102.     6.87MB     26.2    39    10    382.4ms
#2 sapply_in…  3.76ms  3.81ms     261.     8.02MB     92.2    85    30    325.4ms
#3 sapply_in…  2.89ms  2.96ms     334.     5.38MB     72.5   115    25    344.7ms
#4 linearInd…   4.7ms  4.76ms     208.     9.57MB    132.     55    35    264.3ms
#5 linearInd…   3.1ms  3.14ms     316.     7.25MB    115.     96    35    303.3ms
#6 asplit      40.8ms  40.8ms      24.5    3.08MB    221.      1     9     40.8ms
#7 aperm        1.3ms  1.31ms     748.     2.29MB     48.5   324    21    433.3ms
#8 dimT        2.03ms  2.06ms     478.     3.44MB     55.8   197    23    412.5ms
#9 tDim         1.3ms  1.31ms     748.     2.29MB     48.2   326    21    435.9ms
# Scenario 2: larger array (30 x 40 x 50), fewer samples (N = 1000).
a = array(1:60000, c(30, 40, 50))
N = 1000
y = sample(1:40, N, replace = TRUE)
z = sample(1:50, N, replace = TRUE)
bench::mark(exprs = m)
#  expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#  <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
#1 matrix_indexing   952.7µs 960.17µs     1030.     703KB    14.7    492     7
#2 sapply_indexing   530.8µs 540.62µs     1813.     825KB    28.3    832    13
#3 sapply_indexing2  896.2µs 919.12µs     1077.     729KB    14.9    506     7
#4 linearIndex       426.7µs 435.92µs     2263.     836KB    39.5   1032    18
#5 linearIndex2      326.9µs 334.29µs     2945.     606KB    33.1   1244    14
#6 asplit              6.6ms   6.79ms      145.     637KB     8.53    68     4
#7 aperm             318.1µs 323.11µs     2900.     363KB    19.3   1355     9
#8 dimT              155.9µs 163.63µs     6061.     481KB    55.8   2714    25
#9 tDim              198.1µs 208.65µs     4721.     598KB    53.7   2108    24
Spancel answered 19/8, 2024 at 12:27 Comment(0)
P
4

Adding a couple more options to @GKi's benchmark:

# Build the (y, z) index matrix once, then matrix-index each 2d slice a[i,,].
yz <- cbind(y, z)
sapply(1:dim(a)[1], \(i) a[i,,][yz])

and

# Move dimension 1 to the end with aperm(), collapse dims 2:3 into rows, then a
# single row selection (row index y + (z-1)*dim(a)[2]) gives the N x 3 result.
`dim<-`(aperm(a, c(2, 3, 1)), c(prod(dim(a)[-1]), dim(a)[1]))[y + (z - 1)*dim(a)[2],]

Benchmark

# @GKi's benchmark set extended with sapply_indexing2 and aperm; expressions
# are held unevaluated by alist() so they can be re-run on different inputs.
m <- alist(
  matrix_indexing = `dim<-`(a[cbind(rep(seq_len(nrow(a)), each = N), y, z)], c(N, nrow(a))),
  sapply_indexing = sapply(seq_len(nrow(a)), \(i) a[cbind(i, y, z)]),
  # new: slice a[i,,] first, then matrix-index with the (y, z) pairs
  sapply_indexing2 = {yz <- cbind(y, z); sapply(1:dim(a)[1], \(i) a[i,,][yz])},
  linearIndex = outer((y-1)*dim(a)[1] + (z-1)*prod(dim(a)[1:2]), seq_len(nrow(a)), \(x, y) a[x+y]),
  linearIndex2 = do.call(cbind, lapply(seq_len(nrow(a)), \(x, y) a[x+y], (y-1)*dim(a)[1] + (z-1)*prod(dim(a)[1:2]))),
  asplit = do.call(rbind, asplit(a, -1)[cbind(y, z)]),
  # new: permute dim 1 to the end, flatten to 2d, then plain row selection
  aperm = `dim<-`(aperm(a, c(2, 3, 1)), c(prod(dim(a)[-1]), dim(a)[1]))[y + (z - 1)*dim(a)[2],]
)

# Small array, many samples.
a = array(1:60, c(3,4,5))
N = 100000
y = sample(1:4, N, replace = TRUE)
z = sample(1:5, N, replace = TRUE)
bench::mark(exprs = m)
#> # A tibble: 7 × 6
#>   expression            min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 matrix_indexing    3.83ms    4.7ms     210.     6.87MB     71.1
#> 2 sapply_indexing    2.32ms   3.73ms     271.    11.96MB     91.5
#> 3 sapply_indexing2    1.9ms    2.8ms     360.     5.41MB     78.6
#> 4 linearIndex        4.24ms   5.21ms     192.     9.57MB    123. 
#> 5 linearIndex2       1.89ms   3.16ms     322.     7.28MB    103. 
#> 6 asplit            26.15ms  34.32ms      31.3    3.08MB    135. 
#> 7 aperm             699.7µs   1.07ms     976.     2.29MB     62.1

Larger array, fewer samples:

# Larger array, fewer samples.
a = array(1:60000, c(30, 40, 50))
N = 1000
y = sample(1:40, N, replace = TRUE)
z = sample(1:50, N, replace = TRUE)
bench::mark(exprs = m)
#> # A tibble: 7 × 6
#>   expression            min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 matrix_indexing   312.3µs  443.2µs     2223.     703KB     44.3
#> 2 sapply_indexing     305µs 495.25µs     2048.     825KB     42.6
#> 3 sapply_indexing2  493.5µs  651.8µs     1623.     729KB     30.4
#> 4 linearIndex       325.1µs  466.3µs     2246.     836KB     50.6
#> 5 linearIndex2        194µs  341.5µs     3060.     606KB     48.8
#> 6 asplit             4.57ms   4.73ms      204.     637KB     12.6
#> 7 aperm             199.7µs  277.9µs     3642.     363KB     24.8
Papen answered 19/8, 2024 at 13:16 Comment(2)
For the general case, shouldn't your aperm version take a 3-element vector as an input to the aperm() function itself? Not that I'd expect any difference in speedWehrmacht
Yes, I believe aperm() would be the way to go in the general case of (*, y_i, z_i), (y_i, *, z_i), or (y_i, z_i, *). @GKi's dimT or tDim seem to be best for the OP's specific case of (*, y_i, z_i) (depending on N and the dimensions of a).Papen
D
3

You can repeat each element of the 1st indexing vector N times, and bind it with the 2nd and 3rd indexing vectors. Then reshape the extracted slice into the proper dimensions.

# rep(1:3, each = N) pairs every 1st-dimension index with every (y_i, z_i); the
# extracted 3N-vector is reshaped to N x 3 (one column per 1st-dimension slice).
`dim<-`(a[cbind(rep(1:3, each = N), y, z)], c(N, 3))

#           [,1] [,2] [,3]
#      [1,]   40   41   42
#      [2,]   49   50   51
#      [3,]   34   35   36
#      [4,]    7    8    9
#      [5,]   46   47   48
#      [6,]   16   17   18
#      [7,]   22   23   24
#      [8,]   25   26   27
#      [9,]   49   50   51
#     [10,]   16   17   18
#  [ reached getOption("max.print") -- omitted 99990 rows ]

You can also use sapply

# One matrix-index extraction per 1st-dimension index; sapply column-binds the
# three length-N vectors into an N x 3 matrix.
sapply(1:3, \(i) a[cbind(i, y, z)])

Benchmark

I increased the dimensions of a to 30x40x50, and set N=1000. It's surprising to me that sapply is faster than matrix indexing.

a = array(1:60000, c(30, 40, 50))
N = 1000
y = sample(1:40, N, replace = TRUE)
z = sample(1:50, N, replace = TRUE)

# check = "identical" verifies both expressions produce the same matrix;
# unit = "relative" reports each timing as a multiple of the fastest one.
microbenchmark::microbenchmark(
  matrix_indexing = `dim<-`(a[cbind(rep(seq_len(nrow(a)), each = N), y, z)], c(N, nrow(a))),
  sapply_indexing = sapply(seq_len(nrow(a)), \(i) a[cbind(i, y, z)]),
  times = 100L,
  check = "identical",
  unit = "relative"
)

# Unit: relative
#             expr      min       lq     mean   median       uq       max neval cld
#  matrix_indexing 1.937193 1.930684 1.621237 1.908087 1.927041 0.1649859   100  a 
#  sapply_indexing 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000   100   b

If I reduce N to 100, the result reverses.

N <- 100

# Unit: relative
#             expr      min       lq    mean   median       uq      max neval cld
#  matrix_indexing 1.000000 1.000000 1.00000 1.000000 1.000000 1.000000   100  a 
#  sapply_indexing 1.515337 1.519891 1.54679 1.516215 1.517535 1.756281   100   b
Duax answered 19/8, 2024 at 10:41 Comment(4)
due to the very small i?Lamina
surprising that sapply performs so excellently here, the star of today!Cameliacamella
it seems the speed depends on the distribution of dimensions, interesting!Cameliacamella
The values in ‘data’ are taken to be those in the array with the leftmost subscript moving fastest <?array> may be at play here.Gravois
C
3

For handy coding, you will probably be interested in asplit

do.call(rbind, asplit(a, -1)[cbind(y, z)])

but it is definitely slower than the `dim<-` approach by @Darren Tsai

Benchmark

# check = "identical" guards that both approaches return the same matrix;
# unit = "relative" reports timings as multiples of the fastest expression.
microbenchmark(
  darren = `dim<-`(a[cbind(rep(1:3, each = N), y, z)], c(N, 3)),
  tic = do.call(rbind, asplit(a, -1)[cbind(y, z)]),
  check = "identical",
  unit = "relative"
)

shows

Unit: relative
   expr      min       lq     mean   median       uq      max neval
 darren 1.000000  1.00000  1.00000  1.00000  1.00000  1.00000   100
    tic 9.000844 11.42021 16.63028 13.42301 19.16755 25.68256   100
Cameliacamella answered 19/8, 2024 at 10:58 Comment(0)

© 2022 - 2025 — McMap. All rights reserved.