Thank you for your input.  I would feel more confident if you tested
my code, though.

Then post a pointer to your code, sources preferred.


Simple enough to cut and paste here, but this is an untested excerpt
that assumes the arrays are a multiple of 16 floats long and aligned on
16-byte boundaries:

void equalSSE2(int mn, float *aax, float *bbx, int prefetchN)
int NN = 16, MN = mn/NN, MN4 = NN*MN, N = mn-MN4;
push eax
push ecx
push edx
push esi
mov ecx, prefetchN
imul ecx, 64
mov eax, aax
mov edx, bbx
mov esi, MN
test esi, esi
jle $L1
align 16;
movaps xmm0, [edx]
movaps xmm1, [edx+16]
movaps xmm2, [edx+32]
movaps xmm3, [edx+48]


movntpd [eax], xmm0
movntpd [eax+16], xmm1
movntpd [eax+32], xmm2
movntpd [eax+48], xmm3

add edx, 64
add eax, 64

dec esi
jnz $L2
pop esi
pop edx
pop ecx
pop eax
//code for array elements MN4 to mn-1 would go here