Byte Swap com SSE4.1
Para finalizar esta série de optimizações inúteis, trago-vos a versão final desta função:
BITS 32

; Number of 16-byte vectors processed per main-loop iteration
; (each vector holds two 64-bit words).
%define UNROLL_COUNT (4)

section .data
align 16
; pshufb control mask: reverses the byte order of each of the two
; qwords held in an xmm register (i.e. a 64-bit bswap on both lanes).
shuffle: dd 0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b

section .text
global sse41_bswap64

;-----------------------------------------------------------------------
; void sse41_bswap64(uint64_t *buffer, size_t length)
; ABI:   cdecl, 32-bit; args on the stack
; In:    [esp+4] = buffer -- MUST be 16-byte aligned (movntdqa/movntdq)
;        [esp+8] = length, in 64-bit words
; Out:   every 64-bit word in buffer has its bytes reversed, in place
; Clobb: eax, ecx, edx, xmm0-xmm3, xmm7, flags (ebp saved/restored)
; Needs: SSE4.1 (movntdqa) and SSSE3 (pshufb)
;-----------------------------------------------------------------------
sse41_bswap64:
        push    ebp
        mov     edx, [esp+8]            ; edx = buffer (assumed aligned 16)
        mov     ecx, [esp+12]           ; ecx = length in #words
        test    ecx, ecx
        jz      near _end               ; length == 0: nothing to do
        and     ecx, -(UNROLL_COUNT*2)  ; round down to a multiple of the
                                        ; 8 words handled per iteration
        jz      _finalize               ; fewer than 8 words: scalar tail only
        movdqa  xmm7, [shuffle]         ; mask is 16-byte aligned
align 16
_loop:
        sub     ecx, UNROLL_COUNT*2     ; flags set here feed the jnz below
        ; movntdqa (SSE4.1): non-temporal loads -- the data will not be
        ; touched again before the function returns, so keep it from
        ; polluting the cache hierarchy.
        movntdqa xmm0, [edx + 00]
        movntdqa xmm1, [edx + 16]
        movntdqa xmm2, [edx + 32]
        movntdqa xmm3, [edx + 48]
        pshufb  xmm0, xmm7              ; bswap both qwords of each vector
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7
        ; non-temporal stores, for the same reason as the loads
        movntdq [edx + 00], xmm0
        movntdq [edx + 16], xmm1
        movntdq [edx + 32], xmm2
        movntdq [edx + 48], xmm3
        lea     edx, [edx + 16*UNROLL_COUNT] ; lea preserves flags
        jnz     _loop                   ; branches on the sub at loop top
_finalize:
        sfence                          ; order the non-temporal stores
        mov     ebp, [esp+12]           ; BUGFIX: reload the LENGTH argument
                                        ; (was [esp+8] = buffer pointer, so
                                        ; the tail count was always 0 for an
                                        ; aligned buffer and leftover words
                                        ; were never swapped)
        and     ebp, (UNROLL_COUNT*2)-1 ; ebp = length mod 8 = leftover words
        jz      _end
_endloop:
        ; Scalar 64-bit bswap: byte-swap each 32-bit half, then store the
        ; halves exchanged.
        mov     eax, [edx]
        bswap   eax
        mov     ecx, [edx+4]
        bswap   ecx
        mov     [edx], ecx
        mov     [edx+4], eax
        sub     ebp, 1
        lea     edx, [edx+8]
        jnz     _endloop
_end:
        pop     ebp
        ret
A única diferença nesta é que agora não apenas os stores, mas também os loads são não temporais, evitando a poluição da cache com dados que sabemos que não vão ser usados. A performance desta última versão é a melhor do grupo, como demonstrado:
[dcoder@localhost bswap]$ ./bswap ref done sse2 done ssse3 done SSE4.1 done Ref : 1804875336 cycles SSE2 : 1822536180 cycles SSSE3: 1012685742 cycles SSE41: 998087949 cycles Speedup: 44.700449%