Post by s***@crayne.orgHello all,
http://en.wikipedia.org/wiki/Duff%27s_device
I am now trying to apply Duff's device loop unrolling in my MMX code. I
was wondering whether anyone here have an example of Duff's device
equivalent assembly code?
Thank you very much.
cheok
in pseudo-assembly code that looks like gas (GNU as):
.data
.align 4
jump_table: .long 0f, 1f, 2f, ...
.global unrolled
unrolled:
/*
* THIS IS LEFT AS AN EXERCISE :)
* divide a value at this point and use the remainder as
* a jump table offset. Move the code offset into a register.
* jmpl *%reg
* Note: handle the remainder first, then the quotient.
*/
0:
/* code */
2:
/* code*/
1:
/* code */
/*
* cmpl and jump back to 0b if there are more iterations to perform.
*/
ret
-----
Also here's some working code I wrote. It improved the performance of
some C code generated by gcc that I found was using the majority of the
time when I profiled.
.include "asm_macros.s"
.data
.align 4
mark_used_table: .long 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f
.text
/*
* This is used when debugging.
*/
fmt_reg: .string "reg 0x%x\n"
/*
* This assumes that num bits is > 0.
* mark_used (bitmap, numbits, bitoffset)
*/
.align 4
mark_used:
savereg %edx
savereg %esi
/*
* 12 is bitmap
* 16 is num bits
* 20 is bit offset
*/
movl 16(%esp),%eax #num bits
movl $8,%ecx #8 bits per loop cycle
xor %edx,%edx #0 %edx
div %ecx
/*
* At this point %eax is the quotient and %edx is the remainder.
*/
#dumpreg %edx
movl 12(%esp),%eax #bitmap
movl mark_used_table(,%edx,4),%ecx #jump table offset
movl 20(%esp),%edx #first bit/bit offset
movl 16(%esp),%esi #num bits
.if 0
dumpreg %eax
dumpreg %ecx
dumpreg %edx
dumpreg %esi
.endif
/*
* At this point the state should be:
* %eax is the bitmap
* %ecx is the jump table address
* %edx is used as the bit offset
* %esi is the number of bits remaining
*/
#dumpreg %eax
#dumpreg %edx
#dumpreg %esi
/*
* Jump to the address from the jump table.
*/
jmpl *%ecx
0:
bts %edx,(%eax) #set the bit %edx in (%eax)
jc bit_set #jump to bit_set if the bit is already set
addl $1,%edx #advance to the next bit
subl $1,%esi #subtract from the total bits remaining
7:
bts %edx,(%eax)
jc bit_set
addl $1,%edx
subl $1,%esi
6:
bts %edx,(%eax)
jc bit_set
addl $1,%edx
subl $1,%esi
5:
bts %edx,(%eax)
jc bit_set
addl $1,%edx
subl $1,%esi
4:
bts %edx,(%eax)
jc bit_set
addl $1,%edx
subl $1,%esi
3:
bts %edx,(%eax)
jc bit_set
addl $1,%edx
subl $1,%esi
2:
bts %edx,(%eax)
jc bit_set
addl $1,%edx
subl $1,%esi
1:
bts %edx,(%eax)
jc bit_set
addl $1,%edx
subl $1,%esi
cmpl $8,%edx
jl bit_less_than_8
subl $8,%edx #subtract 8 for the next byte offset
addl $1,%eax #advance the byte
bit_less_than_8:
cmpl $0,%esi
jg 0b
restorereg %esi
restorereg %edx
ret
bit_set:
/*
* The bit is already set, and it shouldn't be.
* The fact that it's set indicates a flaw in the allocator.
*/
pushl %edx
pushl %eax
pushl $1f
call printf
call abort
1: .string "ERROR: bit is already set. byte is %p bit is %d\n"
Have fun :)
-George