James Van Buskirk
2022-05-22 09:05:58 UTC
Just for fun I thought I would try various strategies for permuting
an array of single-precision floating point numbers using AVX2.
bitrev1.asm just does vpunpckl/hdq/qdq or its lane-crossing
synthesis with vperm2i128 and has a hard limit of 24 clocks to
bit-reverse a 64-element array because all of these operations
use pipeline 5.
D:\gfortran\james\bitrev>type bitrev1.asm
format MS64 COFF
section '.text' code readable writeable executable
public bitrev1
bitrev1:
sub rsp, 56
vmovdqu [rsp+32], xmm6
vmovdqu [rsp+16], xmm7
vmovdqu [rsp], xmm8
vmovdqu ymm0, [rcx]
vmovdqu ymm1, [rcx+32]
vmovdqu ymm2, [rcx+64]
vmovdqu ymm3, [rcx+96]
vmovdqu ymm4, [rcx+128]
vmovdqu ymm5, [rcx+160]
vmovdqu ymm6, [rcx+192]
vmovdqu ymm8, [rcx+224]
vperm2i128 ymm7, ymm0, ymm1, 32
vperm2i128 ymm0, ymm0, ymm1, 49
vperm2i128 ymm1, ymm2, ymm3, 32
vperm2i128 ymm2, ymm2, ymm3, 49
vperm2i128 ymm3, ymm4, ymm5, 32
vperm2i128 ymm4, ymm4, ymm5, 49
vperm2i128 ymm5, ymm6, ymm8, 32
vperm2i128 ymm6, ymm6, ymm8, 49
vpunpckldq ymm8, ymm7, ymm3
vpunpckhdq ymm7, ymm7, ymm3
vpunpckldq ymm3, ymm0, ymm4
vpunpckhdq ymm0, ymm0, ymm4
vpunpckldq ymm4, ymm1, ymm5
vpunpckhdq ymm1, ymm1, ymm5
vpunpckldq ymm5, ymm2, ymm6
vpunpckhdq ymm6, ymm2, ymm6
vpunpcklqdq ymm2, ymm8, ymm4
vpunpckhqdq ymm8, ymm8, ymm4
vpunpcklqdq ymm4, ymm7, ymm1
vpunpckhqdq ymm7, ymm7, ymm1
vpunpcklqdq ymm1, ymm3, ymm5
vpunpckhqdq ymm3, ymm3, ymm5
vpunpcklqdq ymm5, ymm0, ymm6
vpunpckhqdq ymm0, ymm0, ymm6
vmovdqu [rcx], ymm2
vmovdqu [rcx+32], ymm1
vmovdqu [rcx+64], ymm4
vmovdqu [rcx+96], ymm5
vmovdqu [rcx+128], ymm8
vmovdqu [rcx+160], ymm3
vmovdqu [rcx+192], ymm7
vmovdqu [rcx+224], ymm0
epilog:
vmovdqu xmm6, [rsp+32]
vmovdqu xmm7, [rsp+16]
vmovdqu xmm8, [rsp]
add rsp, 56
ret
D:\gfortran\james\bitrev>fasm bitrev1.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
1 passes, 357 bytes.
bitrev2.asm is the ugliest version, using permutations up front
and in back to shift the data into position and then vpblendd
to shift data between registers. It has the potential to be fastest
because only 14 operations require pipeline 5.
D:\gfortran\james\bitrev>type bitrev2.asm
format MS64 COFF
section '.text' code readable writeable executable
public bitrev2
bitrev2:
sub rsp, 56
vmovdqu [rsp+32], xmm6
vmovdqu [rsp+16], xmm7
vmovdqu [rsp], xmm8
vmovdqu ymm0, [rcx]
vmovdqu ymm4, [rcx+32]
vmovdqu ymm2, [rcx+64]
vmovdqu ymm6, [rcx+96]
vmovdqu ymm1, [rcx+128]
vmovdqu ymm5, [rcx+160]
vmovdqu ymm3, [rcx+192]
vmovdqu ymm8, [rcx+224]
vmovdqu ymm7, yword [perm0]
vpermd ymm1, ymm7, ymm1
vpermq ymm2, ymm2, 177
vmovdqu ymm7, yword [perm1]
vpermd ymm3, ymm7, ymm3
vpermq ymm4, ymm4, 78
vmovdqu ymm7, yword [perm0+16]
vpermd ymm5, ymm7, ymm5
vpermq ymm6, ymm6, 27
vmovdqu ymm7, yword [perm1+16]
vpermd ymm8, ymm7, ymm8
vpblendd ymm7, ymm0, ymm4, 240
vpblendd ymm0, ymm0, ymm4, 15
vpblendd ymm4, ymm2, ymm6, 240
vpblendd ymm2, ymm2, ymm6, 15
vpblendd ymm6, ymm1, ymm5, 240
vpblendd ymm1, ymm1, ymm5, 15
vpblendd ymm5, ymm3, ymm8, 240
vpblendd ymm3, ymm3, ymm8, 15
vpblendd ymm8, ymm7, ymm4, 204
vpblendd ymm7, ymm7, ymm4, 51
vpblendd ymm4, ymm0, ymm2, 204
vpblendd ymm0, ymm0, ymm2, 51
vpblendd ymm2, ymm6, ymm5, 204
vpblendd ymm6, ymm6, ymm5, 51
vpblendd ymm5, ymm1, ymm3, 204
vpblendd ymm1, ymm1, ymm3, 51
vpblendd ymm3, ymm8, ymm2, 170
vpblendd ymm8, ymm8, ymm2, 85
vpblendd ymm2, ymm7, ymm1, 170
vpblendd ymm7, ymm7, ymm1, 85
vpblendd ymm1, ymm4, ymm5, 170
vpblendd ymm4, ymm4, ymm5, 85
vpblendd ymm5, ymm0, ymm6, 170
vpblendd ymm0, ymm0, ymm6, 85
vmovdqu ymm6, yword [perm1+8]
vpermd ymm8, ymm6, ymm8
vmovdqu ymm6, yword [perm2]
vpermd ymm2, ymm6, ymm2
vmovdqu ymm6, yword [perm3]
vpermd ymm7, ymm6, ymm7
vpermq ymm1, ymm1, 78
vmovdqu ymm6, yword [perm1+24]
vpermd ymm4, ymm6, ymm4
vmovdqu ymm6, yword [perm2+16]
vpermd ymm5, ymm6, ymm5
vmovdqu ymm6, yword [perm3+16]
vpermd ymm0, ymm6, ymm0
vmovdqu [rcx], ymm3
vmovdqu [rcx+32], ymm1
vmovdqu [rcx+64], ymm2
vmovdqu [rcx+96], ymm5
vmovdqu [rcx+128], ymm8
vmovdqu [rcx+160], ymm4
vmovdqu [rcx+192], ymm7
vmovdqu [rcx+224], ymm0
.epilog:
vmovdqu xmm6, [rsp+32]
vmovdqu xmm7, [rsp+16]
vmovdqu xmm8, [rsp]
add rsp, 56
ret
section '.data' data readable writeable align 32
align 32
perm0 dd 1,0,7,6,5,4,3,2,1,0,7,6
perm1 dd 7,6,1,0,3,2,5,4,7,6,1,0,3,2
perm2 dd 2,7,0,5,6,3,4,1,2,7,0,5
perm3 dd 3,6,1,4,7,2,5,0,3,6,1,4
D:\gfortran\james\bitrev>fasm bitrev2.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
3 passes, 901 bytes.
bitrev3.asm uses vpgatherdd, which is a really slow instruction.
It was the easiest to write, though.
D:\gfortran\james\bitrev>type bitrev3.asm
format MS64 COFF
section '.text' code readable writeable executable
public bitrev3
bitrev3:
sub rsp, 72
vmovdqu [rsp+48], xmm6
vmovdqu [rsp+32], xmm7
vmovdqu [rsp+16], xmm8
vmovdqu [rsp], xmm9
vmovdqu ymm6, yword [table]
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm0, [rcx+ymm6], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm1, [rcx+ymm6+16], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm2, [rcx+ymm6+8], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm3, [rcx+ymm6+24], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm4, [rcx+ymm6+4], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm5, [rcx+ymm6+20], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm8, [rcx+ymm6+12], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm9, [rcx+ymm6+28], ymm7
vmovdqu [rcx], ymm0
vmovdqu [rcx+32], ymm1
vmovdqu [rcx+64], ymm2
vmovdqu [rcx+96], ymm3
vmovdqu [rcx+128], ymm4
vmovdqu [rcx+160], ymm5
vmovdqu [rcx+192], ymm8
vmovdqu [rcx+224], ymm9
epilog:
vmovdqu xmm6, [rsp+48]
vmovdqu xmm7, [rsp+32]
vmovdqu xmm8, [rsp+16]
vmovdqu xmm9, [rsp]
add rsp, 72
ret
section '.data' data readable writeable align 32
align 32
table dd 0,128,64,192,32,160,96,224
D:\gfortran\james\bitrev>fasm bitrev3.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
3 passes, 401 bytes.
bitrev4.asm just reads addresses to swap from a lookup table. It
seems to have a hard limit of 56 clocks because my Haswell appears
to have only only store pipeline.
D:\gfortran\james\bitrev>type bitrev4.asm
format MS64 COFF
section '.text' code readable writeable executable
public bitrev4
bitrev4:
push rbx
lea r8, [table+56]
mov r9, -56
.outer:
mov rax, qword [r8+r9]
.inner:
movzx edx, ah
movzx r11d, al
mov r10d, [rcx+4*rdx]
mov ebx, [rcx+4*r11]
mov [rcx+4*r11], r10d
mov [rcx+4*rdx], ebx
shr rax, 16
jnz .inner
add r9, 8
jnz .outer
pop rbx
ret
section '.data' data readable writeable align 32
align 32
table db 1,32,2,16,3,48,4,8,5,40,6,24,7,56,9,36,10,20,11,52,13
db 44,14,28,15,60,17,34,19,50,21,42,22,26,23,58,25,38,27
db 54,29,46,31,62,35,49,37,41,39,57,43,53,47,61,55,59
D:\gfortran\james\bitrev>fasm bitrev4.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
3 passes, 279 bytes.
A timing function is required.
D:\gfortran\james\bitrev>type rdtscp.asm
format MS64 COFF
section '.text' code readable writeable executable
public _rdtscp
_rdtscp:
rdtscp
shl rdx, 32
or rax, rdx
ret
D:\gfortran\james\bitrev>fasm rdtscp.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
1 passes, 111 bytes.
I used the free gfortran compiler to test the procedures. It first tests
to make sure that the subroutine does its bit-reversal correctly: for
each subroutine the sum of the absolute values of the differences
between actual and theoretical outputs is shown to be zero. Then
it goes into a loop where it times the subroutine for 1000 repititions
and stores the time in an array. It prints out the results for 23 such
timings and moves on to testing the next subroutine.
To make a long story short these tests seemed to show that
bitrev1 took about 27 clocks, bitrev2 about 23, bitrev3 about
93, and bitrev4 about 72 clocks.
Oh well, maybe I should have gone out and played in the snow
instead today.
D:\gfortran\james\bitrev>type timer.f90
module funcs
use ISO_FORTRAN_ENV
implicit none
interface
subroutine bitrev1(x) bind(C,name='bitrev1')
import
implicit none
real(REAL32), intent(inout) :: x(0:63)
end subroutine bitrev1
subroutine bitrev2(x) bind(C,name='bitrev2')
import
implicit none
real(REAL32), intent(inout) :: x(0:63)
end subroutine bitrev2
subroutine bitrev3(x) bind(C,name='bitrev3')
import
implicit none
real(REAL32), intent(inout) :: x(0:63)
end subroutine bitrev3
subroutine bitrev4(x) bind(C,name='bitrev4')
import
implicit none
real(REAL32), intent(inout) :: x(0:63)
end subroutine bitrev4
function rdtscp() bind(C,name='_rdtscp')
import
implicit none
integer(INT64) rdtscp
end function rdtscp
end interface
contains
function br64(x)
integer br64
integer, intent(in) :: x
integer t
t = x
br64 = ((((ibits(t,0,1)*2+ibits(t,1,1))*2+ibits(t,2,1))*2+ &
ibits(t,3,1))*2+ibits(t,4,1))*2+ibits(t,5,1)
end function br64
end module funcs
program start
use funcs
implicit none
real(REAL32) x(0:63)
integer i
real(REAL32) check
integer(INT64) timestamps(24)
type has_fun
procedure(bitrev1), pointer, nopass :: fun
end type has_fun
type(has_fun) lots(4)
integer j
integer k
lots(1)%fun => bitrev1
lots(2)%fun => bitrev2
lots(3)%fun => bitrev3
lots(4)%fun => bitrev4
do j = 1, 4
x = [(i,i=0,63)]
call lots(j)%fun(x)
check = 0
do i = 0, 63
check = check+abs(br64(i)-x(i))
end do
write(*,'(*(g0))') 'bitrev',j,': check = ',check
do i = 1, 1000
call lots(j)%fun(x)
check = rdtscp()
end do
timestamps(1) = rdtscp()
do i = 2, size(timestamps)
do k = 1, 1000
call lots(j)%fun(x)
end do
timestamps(i) = rdtscp()
end do
write(*,'(g0)') timestamps(2:)-timestamps(:size(timestamps)-1)
end do
end program start
D:\gfortran\james\bitrev>gfortran -O3 timer.f90 bitrev1.obj bitrev2.obj bitr
obj bitrev4.obj rdtscp.obj -otimer
D:\gfortran\james\bitrev>timer
bitrev1: check = 0.00000000
27324
27007
27185
27027
27010
27025
27067
27006
27067
27025
27067
27025
27067
27006
27067
27025
27276
27027
27013
27025
27067
27006
27067
bitrev2: check = 0.00000000
23300
23126
23215
23232
23209
23232
23206
23232
23194
23221
23194
23232
23221
23235
23194
23232
23197
23232
23194
23221
23194
23235
23194
bitrev3: check = 0.00000000
93190
93139
93116
93124
93119
93124
93116
93113
93127
93113
93127
93113
93116
93124
93116
93124
93116
93116
93124
93113
93113
93124
93113
bitrev4: check = 0.00000000
72801
72772
72062
72101
72086
72074
72086
72074
72086
72074
72086
72074
72086
72074
72086
72074
72086
72074
72086
72074
108450
74655
74803
an array of single-precision floating point numbers using AVX2.
bitrev1.asm just does vpunpckl/hdq/qdq or its lane-crossing
synthesis with vperm2i128 and has a hard limit of 24 clocks to
bit-reverse a 64-element array because all of these operations
use pipeline 5.
D:\gfortran\james\bitrev>type bitrev1.asm
format MS64 COFF
section '.text' code readable writeable executable
public bitrev1
bitrev1:
sub rsp, 56
vmovdqu [rsp+32], xmm6
vmovdqu [rsp+16], xmm7
vmovdqu [rsp], xmm8
vmovdqu ymm0, [rcx]
vmovdqu ymm1, [rcx+32]
vmovdqu ymm2, [rcx+64]
vmovdqu ymm3, [rcx+96]
vmovdqu ymm4, [rcx+128]
vmovdqu ymm5, [rcx+160]
vmovdqu ymm6, [rcx+192]
vmovdqu ymm8, [rcx+224]
vperm2i128 ymm7, ymm0, ymm1, 32
vperm2i128 ymm0, ymm0, ymm1, 49
vperm2i128 ymm1, ymm2, ymm3, 32
vperm2i128 ymm2, ymm2, ymm3, 49
vperm2i128 ymm3, ymm4, ymm5, 32
vperm2i128 ymm4, ymm4, ymm5, 49
vperm2i128 ymm5, ymm6, ymm8, 32
vperm2i128 ymm6, ymm6, ymm8, 49
vpunpckldq ymm8, ymm7, ymm3
vpunpckhdq ymm7, ymm7, ymm3
vpunpckldq ymm3, ymm0, ymm4
vpunpckhdq ymm0, ymm0, ymm4
vpunpckldq ymm4, ymm1, ymm5
vpunpckhdq ymm1, ymm1, ymm5
vpunpckldq ymm5, ymm2, ymm6
vpunpckhdq ymm6, ymm2, ymm6
vpunpcklqdq ymm2, ymm8, ymm4
vpunpckhqdq ymm8, ymm8, ymm4
vpunpcklqdq ymm4, ymm7, ymm1
vpunpckhqdq ymm7, ymm7, ymm1
vpunpcklqdq ymm1, ymm3, ymm5
vpunpckhqdq ymm3, ymm3, ymm5
vpunpcklqdq ymm5, ymm0, ymm6
vpunpckhqdq ymm0, ymm0, ymm6
vmovdqu [rcx], ymm2
vmovdqu [rcx+32], ymm1
vmovdqu [rcx+64], ymm4
vmovdqu [rcx+96], ymm5
vmovdqu [rcx+128], ymm8
vmovdqu [rcx+160], ymm3
vmovdqu [rcx+192], ymm7
vmovdqu [rcx+224], ymm0
epilog:
vmovdqu xmm6, [rsp+32]
vmovdqu xmm7, [rsp+16]
vmovdqu xmm8, [rsp]
add rsp, 56
ret
D:\gfortran\james\bitrev>fasm bitrev1.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
1 passes, 357 bytes.
bitrev2.asm is the ugliest version, using permutations up front
and in back to shift the data into position and then vpblendd
to shift data between registers. It has the potential to be fastest
because only 14 operations require pipeline 5.
D:\gfortran\james\bitrev>type bitrev2.asm
format MS64 COFF
section '.text' code readable writeable executable
public bitrev2
bitrev2:
sub rsp, 56
vmovdqu [rsp+32], xmm6
vmovdqu [rsp+16], xmm7
vmovdqu [rsp], xmm8
vmovdqu ymm0, [rcx]
vmovdqu ymm4, [rcx+32]
vmovdqu ymm2, [rcx+64]
vmovdqu ymm6, [rcx+96]
vmovdqu ymm1, [rcx+128]
vmovdqu ymm5, [rcx+160]
vmovdqu ymm3, [rcx+192]
vmovdqu ymm8, [rcx+224]
vmovdqu ymm7, yword [perm0]
vpermd ymm1, ymm7, ymm1
vpermq ymm2, ymm2, 177
vmovdqu ymm7, yword [perm1]
vpermd ymm3, ymm7, ymm3
vpermq ymm4, ymm4, 78
vmovdqu ymm7, yword [perm0+16]
vpermd ymm5, ymm7, ymm5
vpermq ymm6, ymm6, 27
vmovdqu ymm7, yword [perm1+16]
vpermd ymm8, ymm7, ymm8
vpblendd ymm7, ymm0, ymm4, 240
vpblendd ymm0, ymm0, ymm4, 15
vpblendd ymm4, ymm2, ymm6, 240
vpblendd ymm2, ymm2, ymm6, 15
vpblendd ymm6, ymm1, ymm5, 240
vpblendd ymm1, ymm1, ymm5, 15
vpblendd ymm5, ymm3, ymm8, 240
vpblendd ymm3, ymm3, ymm8, 15
vpblendd ymm8, ymm7, ymm4, 204
vpblendd ymm7, ymm7, ymm4, 51
vpblendd ymm4, ymm0, ymm2, 204
vpblendd ymm0, ymm0, ymm2, 51
vpblendd ymm2, ymm6, ymm5, 204
vpblendd ymm6, ymm6, ymm5, 51
vpblendd ymm5, ymm1, ymm3, 204
vpblendd ymm1, ymm1, ymm3, 51
vpblendd ymm3, ymm8, ymm2, 170
vpblendd ymm8, ymm8, ymm2, 85
vpblendd ymm2, ymm7, ymm1, 170
vpblendd ymm7, ymm7, ymm1, 85
vpblendd ymm1, ymm4, ymm5, 170
vpblendd ymm4, ymm4, ymm5, 85
vpblendd ymm5, ymm0, ymm6, 170
vpblendd ymm0, ymm0, ymm6, 85
vmovdqu ymm6, yword [perm1+8]
vpermd ymm8, ymm6, ymm8
vmovdqu ymm6, yword [perm2]
vpermd ymm2, ymm6, ymm2
vmovdqu ymm6, yword [perm3]
vpermd ymm7, ymm6, ymm7
vpermq ymm1, ymm1, 78
vmovdqu ymm6, yword [perm1+24]
vpermd ymm4, ymm6, ymm4
vmovdqu ymm6, yword [perm2+16]
vpermd ymm5, ymm6, ymm5
vmovdqu ymm6, yword [perm3+16]
vpermd ymm0, ymm6, ymm0
vmovdqu [rcx], ymm3
vmovdqu [rcx+32], ymm1
vmovdqu [rcx+64], ymm2
vmovdqu [rcx+96], ymm5
vmovdqu [rcx+128], ymm8
vmovdqu [rcx+160], ymm4
vmovdqu [rcx+192], ymm7
vmovdqu [rcx+224], ymm0
.epilog:
vmovdqu xmm6, [rsp+32]
vmovdqu xmm7, [rsp+16]
vmovdqu xmm8, [rsp]
add rsp, 56
ret
section '.data' data readable writeable align 32
align 32
perm0 dd 1,0,7,6,5,4,3,2,1,0,7,6
perm1 dd 7,6,1,0,3,2,5,4,7,6,1,0,3,2
perm2 dd 2,7,0,5,6,3,4,1,2,7,0,5
perm3 dd 3,6,1,4,7,2,5,0,3,6,1,4
D:\gfortran\james\bitrev>fasm bitrev2.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
3 passes, 901 bytes.
bitrev3.asm uses vpgatherdd, which is a really slow instruction.
It was the easiest to write, though.
D:\gfortran\james\bitrev>type bitrev3.asm
format MS64 COFF
section '.text' code readable writeable executable
public bitrev3
bitrev3:
sub rsp, 72
vmovdqu [rsp+48], xmm6
vmovdqu [rsp+32], xmm7
vmovdqu [rsp+16], xmm8
vmovdqu [rsp], xmm9
vmovdqu ymm6, yword [table]
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm0, [rcx+ymm6], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm1, [rcx+ymm6+16], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm2, [rcx+ymm6+8], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm3, [rcx+ymm6+24], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm4, [rcx+ymm6+4], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm5, [rcx+ymm6+20], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm8, [rcx+ymm6+12], ymm7
vpcmpeqd ymm7, ymm7, ymm7
vpgatherdd ymm9, [rcx+ymm6+28], ymm7
vmovdqu [rcx], ymm0
vmovdqu [rcx+32], ymm1
vmovdqu [rcx+64], ymm2
vmovdqu [rcx+96], ymm3
vmovdqu [rcx+128], ymm4
vmovdqu [rcx+160], ymm5
vmovdqu [rcx+192], ymm8
vmovdqu [rcx+224], ymm9
epilog:
vmovdqu xmm6, [rsp+48]
vmovdqu xmm7, [rsp+32]
vmovdqu xmm8, [rsp+16]
vmovdqu xmm9, [rsp]
add rsp, 72
ret
section '.data' data readable writeable align 32
align 32
table dd 0,128,64,192,32,160,96,224
D:\gfortran\james\bitrev>fasm bitrev3.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
3 passes, 401 bytes.
bitrev4.asm just reads addresses to swap from a lookup table. It
seems to have a hard limit of 56 clocks because my Haswell appears
to have only only store pipeline.
D:\gfortran\james\bitrev>type bitrev4.asm
format MS64 COFF
section '.text' code readable writeable executable
public bitrev4
bitrev4:
push rbx
lea r8, [table+56]
mov r9, -56
.outer:
mov rax, qword [r8+r9]
.inner:
movzx edx, ah
movzx r11d, al
mov r10d, [rcx+4*rdx]
mov ebx, [rcx+4*r11]
mov [rcx+4*r11], r10d
mov [rcx+4*rdx], ebx
shr rax, 16
jnz .inner
add r9, 8
jnz .outer
pop rbx
ret
section '.data' data readable writeable align 32
align 32
table db 1,32,2,16,3,48,4,8,5,40,6,24,7,56,9,36,10,20,11,52,13
db 44,14,28,15,60,17,34,19,50,21,42,22,26,23,58,25,38,27
db 54,29,46,31,62,35,49,37,41,39,57,43,53,47,61,55,59
D:\gfortran\james\bitrev>fasm bitrev4.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
3 passes, 279 bytes.
A timing function is required.
D:\gfortran\james\bitrev>type rdtscp.asm
format MS64 COFF
section '.text' code readable writeable executable
public _rdtscp
_rdtscp:
rdtscp
shl rdx, 32
or rax, rdx
ret
D:\gfortran\james\bitrev>fasm rdtscp.asm
flat assembler version 1.71.49 (1048576 kilobytes memory)
1 passes, 111 bytes.
I used the free gfortran compiler to test the procedures. It first tests
to make sure that the subroutine does its bit-reversal correctly: for
each subroutine the sum of the absolute values of the differences
between actual and theoretical outputs is shown to be zero. Then
it goes into a loop where it times the subroutine for 1000 repititions
and stores the time in an array. It prints out the results for 23 such
timings and moves on to testing the next subroutine.
To make a long story short these tests seemed to show that
bitrev1 took about 27 clocks, bitrev2 about 23, bitrev3 about
93, and bitrev4 about 72 clocks.
Oh well, maybe I should have gone out and played in the snow
instead today.
D:\gfortran\james\bitrev>type timer.f90
module funcs
use ISO_FORTRAN_ENV
implicit none
interface
subroutine bitrev1(x) bind(C,name='bitrev1')
import
implicit none
real(REAL32), intent(inout) :: x(0:63)
end subroutine bitrev1
subroutine bitrev2(x) bind(C,name='bitrev2')
import
implicit none
real(REAL32), intent(inout) :: x(0:63)
end subroutine bitrev2
subroutine bitrev3(x) bind(C,name='bitrev3')
import
implicit none
real(REAL32), intent(inout) :: x(0:63)
end subroutine bitrev3
subroutine bitrev4(x) bind(C,name='bitrev4')
import
implicit none
real(REAL32), intent(inout) :: x(0:63)
end subroutine bitrev4
function rdtscp() bind(C,name='_rdtscp')
import
implicit none
integer(INT64) rdtscp
end function rdtscp
end interface
contains
function br64(x)
integer br64
integer, intent(in) :: x
integer t
t = x
br64 = ((((ibits(t,0,1)*2+ibits(t,1,1))*2+ibits(t,2,1))*2+ &
ibits(t,3,1))*2+ibits(t,4,1))*2+ibits(t,5,1)
end function br64
end module funcs
program start
use funcs
implicit none
real(REAL32) x(0:63)
integer i
real(REAL32) check
integer(INT64) timestamps(24)
type has_fun
procedure(bitrev1), pointer, nopass :: fun
end type has_fun
type(has_fun) lots(4)
integer j
integer k
lots(1)%fun => bitrev1
lots(2)%fun => bitrev2
lots(3)%fun => bitrev3
lots(4)%fun => bitrev4
do j = 1, 4
x = [(i,i=0,63)]
call lots(j)%fun(x)
check = 0
do i = 0, 63
check = check+abs(br64(i)-x(i))
end do
write(*,'(*(g0))') 'bitrev',j,': check = ',check
do i = 1, 1000
call lots(j)%fun(x)
check = rdtscp()
end do
timestamps(1) = rdtscp()
do i = 2, size(timestamps)
do k = 1, 1000
call lots(j)%fun(x)
end do
timestamps(i) = rdtscp()
end do
write(*,'(g0)') timestamps(2:)-timestamps(:size(timestamps)-1)
end do
end program start
D:\gfortran\james\bitrev>gfortran -O3 timer.f90 bitrev1.obj bitrev2.obj bitr
obj bitrev4.obj rdtscp.obj -otimer
D:\gfortran\james\bitrev>timer
bitrev1: check = 0.00000000
27324
27007
27185
27027
27010
27025
27067
27006
27067
27025
27067
27025
27067
27006
27067
27025
27276
27027
27013
27025
27067
27006
27067
bitrev2: check = 0.00000000
23300
23126
23215
23232
23209
23232
23206
23232
23194
23221
23194
23232
23221
23235
23194
23232
23197
23232
23194
23221
23194
23235
23194
bitrev3: check = 0.00000000
93190
93139
93116
93124
93119
93124
93116
93113
93127
93113
93127
93113
93116
93124
93116
93124
93116
93116
93124
93113
93113
93124
93113
bitrev4: check = 0.00000000
72801
72772
72062
72101
72086
72074
72086
72074
72086
72074
72086
72074
72086
72074
72086
72074
72086
72074
72086
72074
108450
74655
74803