Mais um blog inútil.

Abril 29, 2009

Byte Swap com SSE4.1

Filed under: Assembly,Coding,Useless — dongs @ 18:03

Para finalizar esta série de optimizações inúteis, trago-vos a versão final desta função:

BITS 32

; Number of 16-byte XMM registers handled per main-loop iteration.
; Each register holds two 64-bit elements, so one iteration covers
; UNROLL_COUNT*2 elements.
%define UNROLL_COUNT (4)

section .data
align 16
; PSHUFB control mask, written out byte by byte in memory order.
; Destination byte i receives source byte mask[i], so each 8-byte half
; of the register is reversed independently (a 64-bit byte swap per lane).
shuffle:
  db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 ; low  qword reversed
  db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 ; high qword reversed

section .text
global sse41_bswap64

; The main loop below is written out by hand for exactly four XMM
; registers; refuse to assemble if UNROLL_COUNT is changed without
; updating the loop body.
%if UNROLL_COUNT != 4
  %error "sse41_bswap64: main loop is hand-unrolled for UNROLL_COUNT == 4"
%endif

;-----------------------------------------------------------------------
; sse41_bswap64 -- reverse the byte order of every 64-bit element of a
;                  buffer, in place.
;
; Syntax: NASM, 32-bit code.
; In:     [esp+4] on entry = buffer pointer (assumed 16-byte aligned;
;                            MOVNTDQA/MOVNTDQ fault on unaligned addresses)
;         [esp+8] on entry = number of 64-bit elements in the buffer
;         (both arguments are read from the stack; after the `push ebp`
;          below they sit at [esp+8] and [esp+12])
; Out:    nothing meaningful is returned; eax is clobbered.
; Clobb:  eax, ecx, edx, xmm0-xmm3, xmm7, flags. ebp is saved/restored.
;
; Strategy: full groups of UNROLL_COUNT*2 elements (64 bytes) go through
; the PSHUFB loop using non-temporal loads and stores; the remaining
; (count mod UNROLL_COUNT*2) elements are handled one at a time with
; scalar BSWAP.
;-----------------------------------------------------------------------
sse41_bswap64:
  push ebp
  mov edx, [esp+8]                 ; edx = buffer (cursor)
  mov ecx, [esp+12]                ; ecx = element count
  test   ecx, ecx
  jz  near   .done                 ; nothing to do for an empty buffer

  and    ecx, -(UNROLL_COUNT*2)    ; round count down to a whole number
                                   ; of unrolled iterations
  jz     .tail                     ; fewer than one full group: tail only

  movdqu xmm7, [shuffle]           ; xmm7 = per-qword byte-reversal mask
align 16
.loop:
  sub ecx, UNROLL_COUNT*2          ; sets ZF for the jnz at the bottom;
                                   ; nothing in between modifies flags

  ; non-temporal loads (SSE4.1 MOVNTDQA)
  movntdqa xmm0, [edx + 00]
  movntdqa xmm1, [edx + 16]
  movntdqa xmm2, [edx + 32]
  movntdqa xmm3, [edx + 48]

  ; original author's timing note on each pshufb: "p5, 1l 1t"
  ; (presumably: execution port 5, latency 1, throughput 1)
  pshufb xmm0, xmm7
  pshufb xmm1, xmm7
  pshufb xmm2, xmm7
  pshufb xmm3, xmm7

  ; non-temporal stores: the data will not be touched again before
  ; the function returns
  movntdq [edx + 00], xmm0
  movntdq [edx + 16], xmm1
  movntdq [edx + 32], xmm2
  movntdq [edx + 48], xmm3

  lea edx, [edx + 16*UNROLL_COUNT] ; advance cursor; lea preserves flags
  jnz .loop                        ; ZF still reflects the sub above

.tail:
  sfence                           ; order the non-temporal stores
  ; Reload the ORIGINAL element count. After `push ebp` the count lives
  ; at [esp+12]; [esp+8] is the buffer pointer (reading it here would
  ; make the tail count depend on the address instead of the length).
  mov  ebp, [esp+12]
  and  ebp, (UNROLL_COUNT*2)-1     ; ebp = count mod (UNROLL_COUNT*2)
  jz .done

.tail_loop:
  ; Swap one 64-bit element: byte-swap each 32-bit half and exchange
  ; the halves.
  mov eax, [edx]
  bswap eax
  mov ecx, [edx+4]
  bswap ecx
  mov [edx], ecx
  mov [edx+4], eax
  sub ebp, 1                       ; sets ZF for the jnz below
  lea edx, [edx+8]                 ; next element; flags untouched
  jnz .tail_loop
.done:
  pop ebp
  ret

A única diferença nesta versão é que agora não apenas os stores, mas também os loads são não temporais, evitando a poluição da cache com dados que sabemos que não vão ser usados. A performance desta última versão é a melhor do grupo, como demonstrado:

[dcoder@localhost bswap]$ ./bswap 
ref done
sse2 done
ssse3 done
SSE4.1 done
Ref  : 1804875336 cycles
SSE2 : 1822536180 cycles
SSSE3: 1012685742 cycles
SSE41: 998087949 cycles
Speedup: 44.700449%

Comentar

widgeon
widgeon
widgeon
widgeon