Another version (using MMX):
/*
 * fade - scale every 8-bit channel of 32-bit pixels by intense/256.
 *
 * Each byte b of every source pixel is replaced by (b * intense) >> 8 and the
 * resulting pixel is stored at the same index in dst.
 *
 *   dst     - destination pixels, room for at least isize entries
 *   src     - source pixels, at least isize entries; dst == src (in place)
 *             works because each pixel is loaded before it is stored
 *   intense - fade factor; clamped to 0..256 (0 = all zero, 256 = unchanged)
 *   isize   - number of pixels; values <= 0 do nothing, odd counts are handled
 *
 * MSVC-style 32-bit inline assembly, requires MMX. Pixels are processed two
 * at a time with a single-pixel tail for odd counts. Uses eax, ecx, edx, esi,
 * edi, mm0, mm1, mm3 and mm4; emms is issued before returning.
 */
void fade(uint32 *dst, uint32 *src, int intense, int isize)
{
    if (isize <= 0)
        return;                 // the pair loop below must never start on empty input

    // Keep channel * intense inside an unsigned 16-bit lane (255 * 256 = 65280).
    if (intense < 0)
        intense = 0;
    else if (intense > 256)
        intense = 256;

    _asm {
        mov         edi, dst
        mov         esi, src

        pxor        mm3, mm3                // mm3 = 0, zero source for byte -> word unpacking
        mov         eax, intense
        movd        mm1, eax
        punpcklwd   mm1, mm1                // low dword = intense:intense
        punpckldq   mm1, mm1                // mm1 = intense in all four 16-bit lanes

        mov         ecx, isize
        mov         edx, ecx
        and         edx, 1                  // edx = 1 if one pixel is left after the pairs
        shr         ecx, 1                  // ecx = number of pixel pairs
        shl         ecx, 3                  // ecx = bytes covered by the pairs (8 per pair)
        add         edi, ecx                // point both buffers just past the paired region
        add         esi, ecx
        neg         ecx                     // ecx = negative byte offset, counts up to 0
        jz          ftail                   // fewer than two pixels: no pair loop

floop:
        movq        mm0, [esi+ecx]          // load two source pixels
        movq        mm4, mm0
        punpcklbw   mm0, mm3                // first pixel (low dword)   -> 16:16:16:16
        punpckhbw   mm4, mm3                // second pixel (high dword) -> 16:16:16:16
        pmullw      mm0, mm1                // channel * intense
        pmullw      mm4, mm1
        psrlw       mm0, 8                  // >> 8
        psrlw       mm4, 8
        packuswb    mm0, mm4                // low dword = first pixel, high dword = second
        movq        [edi+ecx], mm0          // store two pixels
        add         ecx, 8                  // advance two pixels
        jnz         floop

ftail:
        test        edx, edx
        jz          fdone
        // Offset is 0 here, so esi/edi address the single leftover pixel.
        movd        mm0, dword ptr [esi]
        punpcklbw   mm0, mm3
        pmullw      mm0, mm1
        psrlw       mm0, 8
        packuswb    mm0, mm0
        movd        dword ptr [edi], mm0

fdone:
        emms                                // leave the FPU usable for the caller
    }
}
(Not quite optimal; the source is 10 years old.)