Stuff like this is all over the place, and it actually runs slower than simply compiling with the -O3 flag:
Code: Select all
// Calculate the reciprocal: 1 / f
//
// With IRRLICHT_FAST_MATH defined, the result is an approximation: the SSE
// rcpss estimate refined by one Newton-Raphson step. Otherwise it is the
// plain division 1.f / f.
// NOTE(review): the `__asm` statements look like MSVC-style inline assembly;
// confirm toolchain/target support before enabling the fast-math path.
REALINLINE f32 reciprocal( const f32 f )
{
#if defined (IRRLICHT_FAST_MATH)
// SSE Newton-Raphson reciprocal estimate, accurate to 23 significant
// bits of the mantissa (accuracy figure as stated by the original author)
// One Newton-Raphson iteration:
// f(i+1) = 2 * rcpss(f) - f * rcpss(f) * rcpss(f)
f32 rec;
__asm rcpss xmm0, f // xmm0 = rcpss(f)
__asm movss xmm1, f // xmm1 = f
__asm mulss xmm1, xmm0 // xmm1 = f * rcpss(f)
__asm mulss xmm1, xmm0 // xmm1 = f * rcpss(f) * rcpss(f)
__asm addss xmm0, xmm0 // xmm0 = 2 * rcpss(f)
__asm subss xmm0, xmm1 // xmm0 = 2 * rcpss(f)
// - f * rcpss(f) * rcpss(f)
__asm movss rec, xmm0 // store the refined estimate in rec
return rec;
// Disabled alternative, kept for reference (unreachable after the return):
//! Do not divide by 0 (FPU exception);
// instead set f to a large-magnitude value to get a return value near zero.
// -1000000000000.f is used; the minus sign keeps the value negative.
// NOTE(review): presumably because most tests here (plane.normal dot
// anything) check on <= 0.f -- confirm against callers.
//u32 x = (-(AIR(f) != 0 ) >> 31 ) & ( IR(f) ^ 0xd368d4a5 ) ^ 0xd368d4a5;
//return 1.f / FR ( x );
#else // no fast math
return 1.f / f;
#endif
}
Secondly, you're wasting and clogging up SSE registers, and the compiler can't optimize loops involving rsqrt, sqrt and rcp, because an instruction that operates on 4 floats is being used on just 1. Worse, it is done in inline assembly, so the compiler can't optimize it at all!
Thirdly, you're using unaligned loads and stores, so that's slower than proper SSE.
Please follow my SIMD thread for possible fixes (reverting scalar rsqrt, etc. to normal math library calls)