I have modified my previous code so that you can see the advantage of using SSE asm code in floating-point calculations.
So there are two versions of the code: one in original FreeBasic and the other in SSE asm.
Please test it and report back your results.
On my system
==========
FreeBasic time = 18884 Cycles
SSE time = 7280 Cycles
Here is the code:
/'==================================================='/
' using fast SSE to calculate the normalized (unit-length) form of a vector
'
' by Emil halim
'
/'==================================================='/
' Four consecutive single-precision components (x, y, z, w), declared
' in that order, so one Vector4 occupies 16 bytes.
type Vector4
    x as single
    y as single
    z as single
    w as single
end type
' Three consecutive single-precision components (x, y, z), declared
' in that order, so one Vector3 occupies 12 bytes.
type Vector3
    x as single
    y as single
    z as single
end type
' Loop index used throughout the script.
dim i as integer

' Input vectors. The array has 201 elements (0..200) but only
' elements 0..199 are filled and processed.
dim in_vec(0 to 200) as Vector4
for i = 0 to 199
    with in_vec(i)
        .x = rnd * 10.0
        .y = rnd * 10.0
        .z = rnd * 10.0
        .w = 0.0        ' w stays zero so it adds nothing to the squared length
    end with
next i

' Destination for the normalized vectors written by the SSE version.
dim out_vec(0 to 200) as Vector3

' Cycles1 = FreeBasic timing, Cycles = SSE timing,
' save = time-stamp value captured at the start of a measurement.
dim Cycles1 as integer
dim Cycles as integer
dim save as integer
'
' normalizing vector
'====================
'
' FreeBasic code
' Reference implementation: normalize in_vec(0..199) in place with plain
' FreeBasic arithmetic and measure the elapsed time-stamp-counter ticks.
' Only the value left in EAX by rdtsc (the low 32 bits of the counter) is
' stored, so the measurement is a 32-bit difference.
asm rdtsc ' read the time-stamp counter: start of measurement
asm mov [save],Eax
for i = 0 to 199
' Rec_len = 1 / length of the vector (w is not part of the length here)
dim as single Rec_len = 1.0 / sqr((in_vec(i).x*in_vec(i).x) + (in_vec(i).y*in_vec(i).y) + (in_vec(i).z*in_vec(i).z))
' scale each component by the reciprocal length -> unit vector
in_vec(i).x *= Rec_len
in_vec(i).y *= Rec_len
in_vec(i).z *= Rec_len
next
asm rdtsc ' read the time-stamp counter: end of measurement
asm SUB Eax, [save]
asm mov [Cycles1],eax
' SSE asm code
' The FreeBasic pass normalized in_vec in place, so load fresh random
' vectors before the SSE pass.
for i = 0 to 199
    with in_vec(i)
        .x = rnd * 10.0
        .y = rnd * 10.0
        .z = rnd * 10.0
        .w = 0.0
    end with
next i

' Addresses of the first input and output elements, kept as integers so
' the asm block can load them straight into registers.
dim in_addr as integer = cast(integer, @in_vec(0).x)
dim out_addr as integer = cast(integer, @out_vec(0).x)
' SSE pass: normalize in_vec(0..199) into out_vec(0..199) and store the
' elapsed time-stamp-counter ticks (low 32 bits only) in Cycles.
'
' Register roles inside the loop:
'   esi  -> current input Vector4 (16-byte stride)
'   edi  -> current output Vector3 (12-byte stride)
'   ecx  =  vectors still to process
'   xmm2 =  copy of the input vector, xmm0/xmm1 = squared-length reduction
'
' Lane order in the comments below is low lane first: (x, y, z, w).
asm
    rdtsc                           ' start of measurement
    mov [save], eax
    mov esi, [in_addr]
    mov edi, [out_addr]
    mov ecx, 200                    ' elements 0..199, the same count as the FreeBasic loop
.lab:
    movups xmm0, [esi]              ' xmm0 = (x, y, z, w)
    movaps xmm2, xmm0               ' keep the original components for the final scale
    mulps xmm0, xmm0                ' xmm0 = (xx, yy, zz, ww)
    movaps xmm1, xmm0
    shufps xmm0, xmm1, 0b01001110   ' xmm0 = (zz, ww, xx, yy): halves swapped
    addps xmm0, xmm1                ' xmm0 = (xx+zz, yy+ww, xx+zz, yy+ww)
    movaps xmm1, xmm0
    shufps xmm1, xmm1, 0b00010001   ' xmm1 = (yy+ww, xx+zz, yy+ww, xx+zz)
    addps xmm0, xmm1                ' every lane = xx+yy+zz+ww (squared length, since w = 0)
    rsqrtps xmm0, xmm0              ' approximate 1/sqrt(squared length) in every lane
    mulps xmm2, xmm0                ' scale the original vector to (approximately) unit length
    ' The store writes 16 bytes but edi only advances by 12, so the fourth
    ' lane is overwritten by the next iteration's x. The final store spills
    ' into out_vec(200).x, which exists because out_vec is declared 0 to 200.
    movups [edi], xmm2
    add esi, 4*4                    ' next Vector4
    add edi, 3*4                    ' next Vector3
    dec ecx
    jnz .lab
    rdtsc                           ' end of measurement
    sub eax, [save]
    mov [Cycles], eax
end asm
' Test the results: show one sample input vector, its SSE-normalized
' output, and the length of that output, then the two timings.
i = 10
print in_vec(i).x , in_vec(i).y , in_vec(i).z , in_vec(i).w
print out_vec(i).x , out_vec(i).y , out_vec(i).z
dim as single v_len = sqr((out_vec(i).x*out_vec(i).x) + (out_vec(i).y*out_vec(i).y) + (out_vec(i).z*out_vec(i).z))
print v_len ' should be very close to 1.0 (rsqrtps is an approximation, so not exact)
print "SSE time = " ; Cycles
print "FreeBasic time = " ; Cycles1
' Keep the console open until a key is pressed. This replaces an empty
' Do/Loop, which never exited and kept a CPU core fully busy.
sleep