here it is Paul
/'==================================================='/
' using fast SSE to calculate the normalized (unit-length) form of a vector
'
' by Emil halim
'
/'==================================================='/
' Four-component vector of singles (16 bytes): the input format of the SSE loop.
type Vector4
    x as single
    y as single
    z as single
    w as single
end type

' Three-component vector of singles (12 bytes): the output format of the SSE loop.
type Vector3
    x as single
    y as single
    z as single
end type
' Shared loop index used by every stage of the benchmark.
dim i as integer

' Source vectors: elements 0..199 receive random x/y/z in [0, 10) and w = 0.
' Element 200 is declared but left at its default value.
dim in_vec(0 to 200) as Vector4
for i = 0 to 199
    with in_vec(i)
        .x = rnd * 10.0
        .y = rnd * 10.0
        .z = rnd * 10.0
        .w = 0.0
    end with
next i

' Destination of the SSE version (packed 12-byte vectors).
dim out_vec(0 to 200) as Vector3

' Cycles1 = time-stamp delta of the FreeBasic loop, Cycles = delta of the SSE
' loop, save = time-stamp value captured at the start of a measurement.
dim Cycles1 as integer, Cycles as integer, save as integer
'
' normalizing vector
'====================
'
' FreeBasic code
' Reference implementation: normalizes in_vec(0..199) in place with plain
' FreeBasic arithmetic and records the elapsed time-stamp count in Cycles1.
' Only x, y and z are read and scaled; w is not touched by this loop.
' There is no guard for a zero-length vector (the division would be by zero).
asm rdtsc ' read the time-stamp counter; only the low 32 bits (EAX) are kept
asm mov [save],Eax
for i = 0 to 199
' reciprocal of the vector length: 1 / sqrt(x*x + y*y + z*z)
dim as single Rec_len = 1.0 / sqr((in_vec(i).x*in_vec(i).x) + (in_vec(i).y*in_vec(i).y) + (in_vec(i).z*in_vec(i).z))
in_vec(i).x *= Rec_len
in_vec(i).y *= Rec_len
in_vec(i).z *= Rec_len
next
asm rdtsc ' read the time-stamp counter again
asm SUB Eax, [save] ' elapsed count = end - start (low 32 bits only)
asm mov [Cycles1],eax
' SSE asm code
' The FreeBasic pass normalized in_vec in place, so give the SSE pass fresh
' random input of the same form (x/y/z in [0, 10), w = 0).
for i = 0 to 199
    with in_vec(i)
        .x = rnd * 10.0
        .y = rnd * 10.0
        .z = rnd * 10.0
        .w = 0.0
    end with
next i

' Addresses of the first element of each array, held as integers so the asm
' block can load them into esi/edi.
dim in_addr as integer = cast(integer, @in_vec(0).x)
dim out_addr as integer = cast(integer, @out_vec(0).x)
asm
    ' SSE normalization of in_vec(0..199) into out_vec(0..199), two vectors
    ' per loop iteration, timed with rdtsc. The elapsed count goes to Cycles.
    '
    ' Inputs : [in_addr]  = address of in_vec(0)  (16-byte Vector4 elements)
    '          [out_addr] = address of out_vec(0) (12-byte Vector3 elements)
    ' Outputs: out_vec(0..199), [Cycles]; [save] is used as scratch
    ' Clobbers: eax, edx (rdtsc), ecx, esi, edi, xmm0-xmm5, flags
    '
    ' The reciprocal length comes from rsqrtps, which is an approximation,
    ' so the results are close to, but not exactly, unit length.
    rdtsc                               ' start of measurement (low 32 bits in eax)
    mov     [save], eax
    mov     esi, [in_addr]              ' esi -> current pair of source vectors
    mov     edi, [out_addr]             ' edi -> current pair of destination vectors
    mov     ecx, 200                    ' vectors to process (indices 0..199); was 199,
    shr     ecx, 1                      ' which gave 99 iterations and skipped 198/199
.lab:
    movups  xmm0, [esi]                 ' vector A = (x, y, z, w)
    movups  xmm3, [esi+16]              ' vector B
    'prefetchnta [esi+3*16]
    movaps  xmm2, xmm0                  ' keep unscaled copies for the final multiply
    movaps  xmm5, xmm3
    mulps   xmm0, xmm0                  ' (x*x, y*y, z*z, w*w)
    mulps   xmm3, xmm3
    movaps  xmm1, xmm0
    movaps  xmm4, xmm3
    shufps  xmm0, xmm1, 0b01001110      ' swap halves: (z*z, w*w, x*x, y*y)
    shufps  xmm3, xmm4, 0b01001110
    addps   xmm0, xmm1                  ' lanes 0,2 = x*x+z*z ; lanes 1,3 = y*y+w*w
    addps   xmm3, xmm4
    movaps  xmm1, xmm0
    movaps  xmm4, xmm3
    shufps  xmm1, xmm1, 0b00010001      ' swap neighbouring lanes: (l1, l0, l1, l0)
    shufps  xmm4, xmm4, 0b00010001
    addps   xmm0, xmm1                  ' every lane = x*x+y*y+z*z+w*w (w is 0 on input)
    addps   xmm3, xmm4
    rsqrtps xmm0, xmm0                  ' approximate 1/length in every lane
    rsqrtps xmm3, xmm3
    mulps   xmm2, xmm0                  ' scale the saved copies
    mulps   xmm5, xmm3
    ' Both stores write 16 bytes into 12-byte elements. The 4th lane of A
    ' lands on B.x and is overwritten by the second store. The 4th lane of B
    ' spills 4 bytes into the next element, which the next iteration
    ' overwrites. On the last iteration it falls into out_vec(200), which
    ' exists because the array is declared 0 to 200.
    movups  [edi], xmm2
    movups  [edi+12], xmm5
    'prefetchnta [edi+3*12]
    add     esi, 4*4*2                  ' advance past two Vector4 (2 * 16 bytes)
    add     edi, 3*4*2                  ' advance past two Vector3 (2 * 12 bytes)
    dec     ecx
    jnz     .lab
    rdtsc                               ' end of measurement
    sub     eax, [save]                 ' elapsed count (low 32 bits only)
    mov     [Cycles], eax
end asm
'test the results
' Show one sample: the (un-normalized) input vector, its SSE-normalized output,
' and the length of that output.
i = 10
print in_vec(i).x , in_vec(i).y , in_vec(i).z , in_vec(i).w
print out_vec(i).x , out_vec(i).y , out_vec(i).z
dim as single v_len = sqr((out_vec(i).x*out_vec(i).x) + (out_vec(i).y*out_vec(i).y) + (out_vec(i).z*out_vec(i).z))
print v_len ' should be approximately 1.0 (rsqrtps is only an approximation)
print "SSE time = " ; Cycles
print "FreeBasic time = " ; Cycles1
' Keep the console open until a key is pressed. This replaces an empty
' Do/Loop that spun at full CPU load and could never exit on its own.
sleep