#const-generics

no-std array-bin-ops

Efficient array binary operations

6 releases

0.1.6 Nov 25, 2022
0.1.5 Jan 22, 2022

#726 in Math

Download history 14/week @ 2023-12-08 10/week @ 2023-12-15 10/week @ 2023-12-22 7/week @ 2024-01-05 22/week @ 2024-01-12 99/week @ 2024-01-19 46/week @ 2024-01-26 14/week @ 2024-02-02 10/week @ 2024-02-09 67/week @ 2024-02-16 128/week @ 2024-02-23 76/week @ 2024-03-01 42/week @ 2024-03-08 21/week @ 2024-03-15 18/week @ 2024-03-22

162 downloads per month
Used in iter_num_tools

Custom license

14KB
245 lines

array_bin_ops

An example implementation of Array Element-Wise Binary Operations in Rust.

It tries to produce efficient code where possible while avoiding any memory safety issues. Current benchmarks show it to be faster than any safe code currently available (using std only).

Example ASM

Given the following Rust code:

/// Combines two 32-element `i64` arrays with `+` and returns a 32-element `i64` array.
///
/// `lhs` is wrapped in `Array` so that the `+` operator is applied to the whole
/// array at once (presumably the crate's element-wise `Add` implementation —
/// confirm against the crate's `Array` type, which is not shown here).
pub fn add_i64x32(lhs: [i64; 32], rhs: [i64; 32]) -> [i64; 32] {
    Array(lhs) + rhs
}

It compiles to the following asm, which performs 16 i64x2 add operations in a fully unrolled loop to avoid branching.

# add_i64x32 -- x86-64, Intel syntax.
# Adds two 256-byte buffers as sixteen 128-bit chunks (two 64-bit lanes each,
# via paddq) and writes the 256-byte result through rdi.
#
# In:    rdi = destination buffer (256 bytes)
#        rsi = first source buffer (256 bytes)
#        rdx = second source buffer (256 bytes)
#        (presumably the hidden return slot, `lhs` and `rhs` of the Rust
#        function under the System V AMD64 ABI -- confirm for your target)
# Out:   rax = rdi
# Clobb: xmm0-xmm15, 72 bytes of stack
# Note:  the movaps/movdqa accesses to [rsp + 16*k] need rsp to be 16-byte
#        aligned after the `sub rsp, 72`.
add_i64x32:
 sub     rsp, 72
 mov     rax, rdi
 # Load chunks 0..14 of the first source. Chunks 12 and 14 are parked on the
 # stack at [rsp] and [rsp + 48]; they are used later as memory operands.
 movdqu  xmm1, xmmword ptr [rsi]
 movdqu  xmm3, xmmword ptr [rsi + 16]
 movdqu  xmm5, xmmword ptr [rsi + 32]
 movdqu  xmm7, xmmword ptr [rsi + 48]
 movdqu  xmm15, xmmword ptr [rsi + 64]
 movdqu  xmm8, xmmword ptr [rsi + 80]
 movdqu  xmm9, xmmword ptr [rsi + 96]
 movdqu  xmm10, xmmword ptr [rsi + 112]
 movdqu  xmm14, xmmword ptr [rsi + 128]
 movdqu  xmm13, xmmword ptr [rsi + 144]
 movdqu  xmm12, xmmword ptr [rsi + 160]
 movdqu  xmm11, xmmword ptr [rsi + 176]
 movups  xmm0, xmmword ptr [rsi + 192]
 movaps  xmmword ptr [rsp], xmm0
 movdqu  xmm2, xmmword ptr [rsi + 208]
 movups  xmm0, xmmword ptr [rsi + 224]
 movaps  xmmword ptr [rsp + 48], xmm0
 # Sums of chunks 0 and 1 are spilled to [rsp + 32] and [rsp + 16] so that
 # xmm0 can be reused.
 movdqu  xmm0, xmmword ptr [rdx]
 paddq   xmm0, xmm1
 movdqa  xmmword ptr [rsp + 32], xmm0
 movdqu  xmm0, xmmword ptr [rdx + 16]
 paddq   xmm0, xmm3
 movdqa  xmmword ptr [rsp + 16], xmm0
 # Chunks 2..14: load the second-source chunk, add the matching first-source
 # chunk (from a register, or from the stack for chunks 12 and 14).
 movdqu  xmm4, xmmword ptr [rdx + 32]
 paddq   xmm4, xmm5
 movdqu  xmm6, xmmword ptr [rdx + 48]
 paddq   xmm6, xmm7
 movdqu  xmm1, xmmword ptr [rdx + 64]
 paddq   xmm1, xmm15
 movdqu  xmm15, xmmword ptr [rdx + 80]
 paddq   xmm15, xmm8
 movdqu  xmm8, xmmword ptr [rdx + 96]
 paddq   xmm8, xmm9
 movdqu  xmm9, xmmword ptr [rdx + 112]
 paddq   xmm9, xmm10
 movdqu  xmm10, xmmword ptr [rdx + 128]
 paddq   xmm10, xmm14
 movdqu  xmm14, xmmword ptr [rdx + 144]
 paddq   xmm14, xmm13
 movdqu  xmm13, xmmword ptr [rdx + 160]
 paddq   xmm13, xmm12
 movdqu  xmm12, xmmword ptr [rdx + 176]
 paddq   xmm12, xmm11
 movdqu  xmm3, xmmword ptr [rdx + 192]
 paddq   xmm3, xmmword ptr [rsp]
 movdqu  xmm7, xmmword ptr [rdx + 208]
 paddq   xmm7, xmm2
 movdqu  xmm5, xmmword ptr [rdx + 224]
 paddq   xmm5, xmmword ptr [rsp + 48]
 # Chunk 15: both operands are loaded only now.
 movdqu  xmm11, xmmword ptr [rsi + 240]
 movdqu  xmm0, xmmword ptr [rdx + 240]
 paddq   xmm0, xmm11
 # Store all sixteen sums to the destination; chunks 0 and 1 are reloaded
 # from their stack slots first.
 movaps  xmm2, xmmword ptr [rsp + 32]
 movups  xmmword ptr [rdi], xmm2
 movaps  xmm2, xmmword ptr [rsp + 16]
 movups  xmmword ptr [rdi + 16], xmm2
 movdqu  xmmword ptr [rdi + 32], xmm4
 movdqu  xmmword ptr [rdi + 48], xmm6
 movdqu  xmmword ptr [rdi + 64], xmm1
 movdqu  xmmword ptr [rdi + 80], xmm15
 movdqu  xmmword ptr [rdi + 96], xmm8
 movdqu  xmmword ptr [rdi + 112], xmm9
 movdqu  xmmword ptr [rdi + 128], xmm10
 movdqu  xmmword ptr [rdi + 144], xmm14
 movdqu  xmmword ptr [rdi + 160], xmm13
 movdqu  xmmword ptr [rdi + 176], xmm12
 movdqu  xmmword ptr [rdi + 192], xmm3
 movdqu  xmmword ptr [rdi + 208], xmm7
 movdqu  xmmword ptr [rdi + 224], xmm5
 movdqu  xmmword ptr [rdi + 240], xmm0
 add     rsp, 72
 ret

No runtime deps