[OBSOLETE] SIMD IRRLICHT VECTORS!
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
I read the entire article; it's all very interesting. However, note the section that defines the compiler flags:
The author did not take advantage of every available optimization flag; therefore, while GCC may be a superior compiler, it is not as far superior as the article would lead you to believe. Also, the buffer security check (/GS) is enabled by default, which would most definitely increase the complexity of the underlying ASM in exchange for increased security (it can be disabled with /GS-).
Take that article with a grain of salt, the author in my opinion did a great job explaining the ASM that was generated but did not explore all the possible compiler flags to truly make the tests between the different compilers fair and equal. Too much assumption is placed around the single (/O2) computational flag.
Code: Select all
CC command line:
gcc -O2 -msse test.c -S -o test.asm
MSVC command line:
cl /O2 /arch:SSE /c /FA test.c
ICC’s command line:
icc -O2 -msse test.c -S -o test.asm
Take that article with a grain of salt, the author in my opinion did a great job explaining the ASM that was generated but did not explore all the possible compiler flags to truly make the tests between the different compilers fair and equal. Too much assumption is placed around the single (/O2) computational flag.
Dream Big Or Go Home.
Help Me Help You.
Help Me Help You.
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
to be fair if you're writing SSE and your code needs it that badly... you may want to use the O3 flag first XD
P.S. found a bug in the vector class which caused it to completely not work! AND I FOUND A WAY TO be able to use new and delete on SIMD classes requiring 16byte alignment
P.S. found a bug in the vector class which caused it to completely not work! AND I FOUND A WAY TO be able to use new and delete on SIMD classes requiring 16byte alignment
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
I fixed up irrMath.h which was using some nasty inline assembly
Code: Select all
// Copyright (C) 2002-2012 Nikolaus Gebhardt
// This file is part of the "Irrlicht Engine".
// For conditions of distribution and use, see copyright notice in irrlicht.h
#ifndef __IRR_MATH_H_INCLUDED__
#define __IRR_MATH_H_INCLUDED__
#include "IrrCompileConfig.h"
#include "irrTypes.h"
#include <math.h>
#include <float.h>
#include <stdlib.h> // for abs() etc.
#include <limits.h> // For INT_MAX / UINT_MAX
#if defined(_IRR_SOLARIS_PLATFORM_) || defined(__BORLANDC__) || defined (__BCPLUSPLUS__) || defined (_WIN32_WCE)
#define sqrtf(X) (irr::f32)sqrt((irr::f64)(X))
#define sinf(X) (irr::f32)sin((irr::f64)(X))
#define cosf(X) (irr::f32)cos((irr::f64)(X))
#define asinf(X) (irr::f32)asin((irr::f64)(X))
#define acosf(X) (irr::f32)acos((irr::f64)(X))
#define atan2f(X,Y) (irr::f32)atan2((irr::f64)(X),(irr::f64)(Y))
#define ceilf(X) (irr::f32)ceil((irr::f64)(X))
#define floorf(X) (irr::f32)floor((irr::f64)(X))
#define powf(X,Y) (irr::f32)pow((irr::f64)(X),(irr::f64)(Y))
#define fmodf(X,Y) (irr::f32)fmod((irr::f64)(X),(irr::f64)(Y))
#define fabsf(X) (irr::f32)fabs((irr::f64)(X))
#define logf(X) (irr::f32)log((irr::f64)(X))
#endif
#ifndef FLT_MAX
#define FLT_MAX 3.402823466E+38F
#endif
#ifndef FLT_MIN
#define FLT_MIN 1.17549435e-38F
#endif
namespace irr
{
namespace core
{
//! Rounding error constant often used when comparing f32 values.
const s32 ROUNDING_ERROR_S32 = 0;
#ifdef __IRR_HAS_S64
const s64 ROUNDING_ERROR_S64 = 0;
#endif
const f32 ROUNDING_ERROR_f32 = 0.000001f;
const f64 ROUNDING_ERROR_f64 = 0.00000001;
#ifdef PI // make sure we don't collide with a define
#undef PI
#endif
//! Constant for PI.
const f32 PI = 3.14159265359f;
//! Constant for reciprocal of PI.
const f32 RECIPROCAL_PI = 1.0f/PI;
//! Constant for half of PI.
const f32 HALF_PI = PI/2.0f;
#ifdef PI64 // make sure we don't collide with a define
#undef PI64
#endif
//! Constant for 64bit PI.
const f64 PI64 = 3.1415926535897932384626433832795028841971693993751;
//! Constant for 64bit reciprocal of PI.
const f64 RECIPROCAL_PI64 = 1.0/PI64;
//! 32bit Constant for converting from degrees to radians
const f32 DEGTORAD = PI / 180.0f;
//! 32bit constant for converting from radians to degrees (formally known as GRAD_PI)
const f32 RADTODEG = 180.0f / PI;
//! 64bit constant for converting from degrees to radians (formally known as GRAD_PI2)
const f64 DEGTORAD64 = PI64 / 180.0;
//! 64bit constant for converting from radians to degrees
const f64 RADTODEG64 = 180.0 / PI64;
//! Utility function to convert a radian value to degrees
/** Provided as it can be clearer to write radToDeg(X) than RADTODEG * X
\param radians Angle in radians.
\return The same angle expressed in degrees. */
inline f32 radToDeg(f32 radians)
{
	return radians * RADTODEG;
}
//! Utility function to convert a radian value to degrees (64bit overload)
/** Provided as it can be clearer to write radToDeg(X) than RADTODEG64 * X
\param radians Angle in radians.
\return The same angle expressed in degrees. */
inline f64 radToDeg(f64 radians)
{
	return radians * RADTODEG64;
}
//! Utility function to convert a degrees value to radians
/** Provided as it can be clearer to write degToRad(X) than DEGTORAD * X
\param degrees Angle in degrees.
\return The same angle expressed in radians. */
inline f32 degToRad(f32 degrees)
{
	return degrees * DEGTORAD;
}
//! Utility function to convert a degrees value to radians (64bit overload)
/** Provided as it can be clearer to write degToRad(X) than DEGTORAD64 * X
\param degrees Angle in degrees.
\return The same angle expressed in radians. */
inline f64 degToRad(f64 degrees)
{
	return degrees * DEGTORAD64;
}
//! returns minimum of two values. Own implementation to get rid of the STL (VS6 problems)
/** On a tie the second argument is returned, matching std::min semantics. */
template<class T>
inline const T& min_(const T& a, const T& b)
{
	if (a < b)
		return a;
	return b;
}
//! returns minimum of three values. Own implementation to get rid of the STL (VS6 problems)
template<class T>
inline const T& min_(const T& a, const T& b, const T& c)
{
	// min of the first pair, then against the third.
	return min_(min_(a, b), c);
}
//! returns maximum of two values. Own implementation to get rid of the STL (VS6 problems)
/** On a tie the first argument is returned, matching std::max semantics. */
template<class T>
inline const T& max_(const T& a, const T& b)
{
	if (a < b)
		return b;
	return a;
}
//! returns maximum of three values. Own implementation to get rid of the STL (VS6 problems)
template<class T>
inline const T& max_(const T& a, const T& b, const T& c)
{
	// max of the first pair, then against the third.
	return max_(max_(a, b), c);
}
//! returns absolute value. Own implementation to get rid of STL (VS6 problems)
/** \param a Value to take the absolute value of; T must support unary minus
and comparison against T(0). Returns by value. */
template<class T>
inline T abs_(const T& a)
{
	if (a < (T)0)
		return -a;
	return a;
}
//! returns linear interpolation of a and b with ratio t
//! \return: a if t==0, b if t==1, and the linear interpolation else
template<class T>
inline T lerp(const T& a, const T& b, const f32 t)
{
return (T)(a*(1.f-t)) + (b*t);
}
//! clamps a value between low and high
/** \param value Value to clamp. \param low Lower bound. \param high Upper bound.
\return value limited to [low, high]; behavior assumes low <= high. */
template <class T>
inline const T clamp (const T& value, const T& low, const T& high)
{
	// Raise to the lower bound first, then cap at the upper bound.
	const T& floored = max_(value, low);
	return min_(floored, high);
}
//! swaps the content of the passed parameters
// Note: We use the same trick as boost and use two template arguments to
// avoid ambiguity when swapping objects of an Irrlicht type that has not
// it's own swap overload. Otherwise we get conflicts with some compilers
// in combination with stl.
template <class T1, class T2>
inline void swap(T1& a, T2& b)
{
	T1 tmp = a; // copy of the first value before it is overwritten
	a = b;
	b = tmp;
}
//! returns if a equals b, taking possible rounding errors into account
// Equivalent to |a - b| <= tolerance, written without fabs.
// NOTE(review): any NaN input makes both half-checks fail, so NaN never "equals" anything.
inline bool equals(const f64 a, const f64 b, const f64 tolerance = ROUNDING_ERROR_f64)
{
return (a + tolerance >= b) && (a - tolerance <= b);
}
//! returns if a equals b, taking possible rounding errors into account
// 32bit overload of the tolerance comparison above.
inline bool equals(const f32 a, const f32 b, const f32 tolerance = ROUNDING_ERROR_f32)
{
return (a + tolerance >= b) && (a - tolerance <= b);
}
// Helper union to view the bit pattern of a 32bit float as a signed integer
// (used by equalsByUlp below).
// NOTE(review): reading 'i' after writing 'f' is type-punning through a union -
// well-defined in C and supported by the compilers Irrlicht targets, but
// formally undefined behavior in ISO C++.
union FloatIntUnion32
{
FloatIntUnion32(float f1 = 0.0f) : f(f1) {}
// Portable sign-extraction
bool sign() const { return (i >> 31) != 0; }
irr::s32 i;
irr::f32 f;
};
//! We compare the difference in ULP's (spacing between floating-point numbers, aka ULP=1 means there exists no float between).
//\result true when numbers have a ULP <= maxUlpDiff AND have the same sign.
// NOTE(review): NaN inputs are not handled specially here; two NaNs with close
// bit patterns would compare "equal" - callers are expected to pass real numbers.
inline bool equalsByUlp(f32 a, f32 b, int maxUlpDiff)
{
// Based on the ideas and code from Bruce Dawson on
// http://www.altdevblogaday.com/2012/02/22/comparing-floating-point-numbers-2012-edition/
// When floats are interpreted as integers the two nearest possible float numbers differ just
// by one integer number. Also works the other way round, an integer of 1 interpreted as float
// is for example the smallest possible float number.
FloatIntUnion32 fa(a);
FloatIntUnion32 fb(b);
// Different signs, we could maybe get difference to 0, but so close to 0 using epsilons is better.
if ( fa.sign() != fb.sign() )
{
// Check for equality to make sure +0==-0
if (fa.i == fb.i)
return true;
return false;
}
// Find the difference in ULPs.
// NOTE(review): fa.i - fb.i can overflow s32 when the values differ hugely;
// harmless for the small maxUlpDiff values this is meant for, but not strictly safe.
int ulpsDiff = abs_(fa.i- fb.i);
if (ulpsDiff <= maxUlpDiff)
return true;
return false;
}
#if 0
//! returns if a equals b, not using any rounding tolerance
inline bool equals(const s32 a, const s32 b)
{
return (a == b);
}
//! returns if a equals b, not using any rounding tolerance
inline bool equals(const u32 a, const u32 b)
{
return (a == b);
}
#endif
//! returns if a equals b, taking an explicit rounding tolerance into account
// Integer variant: true when |a - b| <= tolerance (assuming no overflow of a +/- tolerance).
inline bool equals(const s32 a, const s32 b, const s32 tolerance = ROUNDING_ERROR_S32)
{
return (a + tolerance >= b) && (a - tolerance <= b);
}
//! returns if a equals b, taking an explicit rounding tolerance into account
// NOTE(review): mixes unsigned operands with a signed tolerance - the tolerance is
// converted to u32, so the arithmetic wraps near 0 and UINT_MAX; with the default
// tolerance of 0 this reduces to a == b, which is the common use.
inline bool equals(const u32 a, const u32 b, const s32 tolerance = ROUNDING_ERROR_S32)
{
return (a + tolerance >= b) && (a - tolerance <= b);
}
#ifdef __IRR_HAS_S64
//! returns if a equals b, taking an explicit rounding tolerance into account
// 64bit integer variant of the tolerance comparison.
inline bool equals(const s64 a, const s64 b, const s64 tolerance = ROUNDING_ERROR_S64)
{
return (a + tolerance >= b) && (a - tolerance <= b);
}
#endif
//! returns if a equals zero, taking rounding errors into account
inline bool iszero(const f64 a, const f64 tolerance = ROUNDING_ERROR_f64)
{
return fabs(a) <= tolerance;
}
//! returns if a equals zero, taking rounding errors into account
inline bool iszero(const f32 a, const f32 tolerance = ROUNDING_ERROR_f32)
{
return fabsf(a) <= tolerance;
}
//! returns if a equals not zero, taking rounding errors into account
inline bool isnotzero(const f32 a, const f32 tolerance = ROUNDING_ERROR_f32)
{
return fabsf(a) > tolerance;
}
//! returns if a equals zero, taking rounding errors into account
/** \param a Value to test. \param tolerance Maximum |a| still counted as zero. */
inline bool iszero(const s32 a, const s32 tolerance = 0)
{
	// BUGFIX: the old code masked with 0x7ffffff (only 27 bits - a typo for
	// 0x7fffffff), and even the intended mask is wrong for negative two's
	// complement integers (e.g. -2147483647 would mask to 1 and wrongly pass).
	// Compare the absolute value instead, as later Irrlicht versions do.
	// Note: a == INT_MIN would overflow on negation; tolerance-based callers
	// never pass that value here.
	return (a < 0 ? -a : a) <= tolerance;
}
//! returns if a equals zero, taking rounding errors into account
/** Unsigned variant - no magnitude handling needed, a plain bound check suffices. */
inline bool iszero(const u32 a, const u32 tolerance = 0)
{
	return tolerance >= a;
}
#ifdef __IRR_HAS_S64
//! returns if a equals zero, taking rounding errors into account
// 64bit signed variant; delegates magnitude extraction to abs_.
inline bool iszero(const s64 a, const s64 tolerance = 0)
{
return abs_(a) <= tolerance;
}
#endif
// Branch-free minimum of two s32 values.
// The mask is all-ones when a < b, all-zeros otherwise.
// NOTE(review): relies on arithmetic right shift of a negative value
// (implementation-defined in C++, but universal on Irrlicht's targets) and
// on a - b not overflowing; for extreme operands the result is wrong.
inline s32 s32_min(s32 a, s32 b)
{
const s32 mask = (a - b) >> 31;
return (a & mask) | (b & ~mask);
}
// Branch-free maximum of two s32 values; same mask trick and caveats as s32_min.
inline s32 s32_max(s32 a, s32 b)
{
const s32 mask = (a - b) >> 31;
return (b & mask) | (a & ~mask);
}
// Branch-free clamp built from the two helpers above.
inline s32 s32_clamp (s32 value, s32 low, s32 high)
{
return s32_min(s32_max(value,low), high);
}
/*
float IEEE-754 bit represenation
0 0x00000000
1.0 0x3f800000
0.5 0x3f000000
3 0x40400000
+inf 0x7f800000
-inf 0xff800000
+NaN 0x7fc00000 or 0x7ff00000
in general: number = (sign ? -1:1) * 2^(exponent) * 1.(mantissa bits)
*/
typedef union { u32 u; s32 s; f32 f; } inttofloat;
#define F32_AS_S32(f) (*((s32 *) &(f)))
#define F32_AS_U32(f) (*((u32 *) &(f)))
#define F32_AS_U32_POINTER(f) ( ((u32 *) &(f)))
#define F32_VALUE_0 0x00000000
#define F32_VALUE_1 0x3f800000
#define F32_SIGN_BIT 0x80000000U
#define F32_EXPON_MANTISSA 0x7FFFFFFFU
//! code is taken from IceFPU
//! Integer representation of a floating-point value.
// NOTE(review): the fast-math macro forms reinterpret through a reference cast,
// which violates strict aliasing; the union versions below are the safe path.
#ifdef IRRLICHT_FAST_MATH
#define IR(x) ((u32&)(x))
#else
inline u32 IR(f32 x) {inttofloat tmp; tmp.f=x; return tmp.u;}
#endif
//! Absolute integer representation of a floating-point value
// Clearing the top bit of the IEEE-754 pattern clears the sign (|x| as bits).
#define AIR(x) (IR(x)&0x7fffffff)
//! Floating-point representation of an integer value.
#ifdef IRRLICHT_FAST_MATH
#define FR(x) ((f32&)(x))
#else
inline f32 FR(u32 x) {inttofloat tmp; tmp.u=x; return tmp.f;}
inline f32 FR(s32 x) {inttofloat tmp; tmp.s=x; return tmp.f;}
#endif
//! integer representation of 1.0
#define IEEE_1_0 0x3f800000
//! integer representation of 255.0
#define IEEE_255_0 0x437f0000
#ifdef IRRLICHT_FAST_MATH
#define F32_LOWER_0(f) (F32_AS_U32(f) > F32_SIGN_BIT)
#define F32_LOWER_EQUAL_0(f) (F32_AS_S32(f) <= F32_VALUE_0)
#define F32_GREATER_0(f) (F32_AS_S32(f) > F32_VALUE_0)
#define F32_GREATER_EQUAL_0(f) (F32_AS_U32(f) <= F32_SIGN_BIT)
#define F32_EQUAL_1(f) (F32_AS_U32(f) == F32_VALUE_1)
#define F32_EQUAL_0(f) ( (F32_AS_U32(f) & F32_EXPON_MANTISSA ) == F32_VALUE_0)
// only same sign
#define F32_A_GREATER_B(a,b) (F32_AS_S32((a)) > F32_AS_S32((b)))
#else
#define F32_LOWER_0(n) ((n) < 0.0f)
#define F32_LOWER_EQUAL_0(n) ((n) <= 0.0f)
#define F32_GREATER_0(n) ((n) > 0.0f)
#define F32_GREATER_EQUAL_0(n) ((n) >= 0.0f)
#define F32_EQUAL_1(n) ((n) == 1.0f)
#define F32_EQUAL_0(n) ((n) == 0.0f)
#define F32_A_GREATER_B(a,b) ((a) > (b))
#endif
#ifndef REALINLINE
#ifdef _MSC_VER
#define REALINLINE __forceinline
#else
#define REALINLINE inline
#endif
#endif
#if defined(__BORLANDC__) || defined (__BCPLUSPLUS__)
// 8-bit bools in borland builder
//! conditional set based on mask and arithmetic shift
// Returns a when condition is a positive c8, b otherwise.
REALINLINE u32 if_c_a_else_b ( const c8 condition, const u32 a, const u32 b )
{
return ( ( -condition >> 7 ) & ( a ^ b ) ) ^ b;
}
//! conditional set based on mask and arithmetic shift
REALINLINE u32 if_c_a_else_0 ( const c8 condition, const u32 a )
{
return ( -condition >> 31 ) & a;
}
#else
//! conditional set based on mask and arithmetic shift
// Branch-free select: -condition >> 31 is all-ones for condition > 0, zero otherwise.
// NOTE(review): a NEGATIVE condition also selects b (its negation is positive,
// so the mask is zero) - callers must pass 0/positive flags only. Also relies
// on arithmetic right shift of negatives (implementation-defined in ISO C++).
REALINLINE u32 if_c_a_else_b ( const s32 condition, const u32 a, const u32 b )
{
return ( ( -condition >> 31 ) & ( a ^ b ) ) ^ b;
}
//! conditional set based on mask and arithmetic shift
// 16bit variant of the select above; same caveats.
REALINLINE u16 if_c_a_else_b ( const s16 condition, const u16 a, const u16 b )
{
return ( ( -condition >> 15 ) & ( a ^ b ) ) ^ b;
}
//! conditional set based on mask and arithmetic shift
// Returns a when condition > 0, zero otherwise.
REALINLINE u32 if_c_a_else_0 ( const s32 condition, const u32 a )
{
return ( -condition >> 31 ) & a;
}
#endif
/*
if (condition) state |= m; else state &= ~m;
*/
// Branch-free conditional bit set/clear; same positive-condition/arithmetic-shift
// caveats as if_c_a_else_b above.
REALINLINE void setbit_cond ( u32 &state, s32 condition, u32 mask )
{
// 0, or any postive to mask
//s32 conmask = -condition >> 31;
state ^= ( ( -condition >> 31 ) ^ state ) & mask;
}
//! Rounds to the nearest whole number, halves rounding up (round-half-up).
inline f32 round_( f32 x )
{
	const f32 biased = x + 0.5f;
	return floorf(biased);
}
// Clears pending FPU exception flags.
// NOTE(review): the unconditional 'return' below makes everything after it dead
// code even with IRRLICHT_FAST_MATH, and without the define the function is an
// empty no-op - the clearing paths are deliberately disabled (see "pending"
// state of the fast-math code elsewhere in this header).
REALINLINE void clearFPUException ()
{
#ifdef IRRLICHT_FAST_MATH
return;
#ifdef feclearexcept
feclearexcept(FE_ALL_EXCEPT);
#elif defined(_MSC_VER)
__asm fnclex;
#elif defined(__GNUC__) && defined(__x86__)
__asm__ __volatile__ ("fclex \n\t");
#else
//# warn clearFPUException not supported.
#endif
#endif
}
// calculate: sqrt ( x )
REALINLINE f32 squareroot(const f32 f)
{
return sqrtf(f);
}
// calculate: sqrt ( x )
// 64bit overload.
REALINLINE f64 squareroot(const f64 f)
{
return sqrt(f);
}
// calculate: sqrt ( x )
// Integer overload: computed in f32 and truncated back to s32.
// NOTE(review): f32 has only 24 mantissa bits, so very large s32 inputs lose
// precision before the root is taken.
REALINLINE s32 squareroot(const s32 f)
{
return static_cast<s32>(squareroot(static_cast<f32>(f)));
}
#ifdef __IRR_HAS_S64
// calculate: sqrt ( x )
// 64bit integer overload, routed through f64.
REALINLINE s64 squareroot(const s64 f)
{
return static_cast<s64>(squareroot(static_cast<f64>(f)));
}
#endif
// calculate: 1 / sqrt ( x )
REALINLINE f64 reciprocal_squareroot(const f64 x)
{
#if defined ( IRRLICHT_FAST_MATH )
// Currently identical to the precise path; the SSE2 variant is parked below.
double result = 1.0 / sqrt(x);
//! pending perf test
//_mm_store_sd(&result,_mm_div_sd(_mm_set_pd(0.0,1.0),_mm_sqrt_sd(_mm_load_sd(&x))));
return result;
#else // no fast math
return 1.0 / sqrt(x);
#endif
}
// calculate: 1 / sqrtf ( x )
// NOTE(review): the fast path uses _mm_rsqrt_ps, a hardware approximation with
// roughly 12 bits of precision and no Newton-Raphson refinement step - results
// differ visibly from the precise 1/sqrtf path.
REALINLINE f32 reciprocal_squareroot(const f32 f)
{
#if defined ( IRRLICHT_FAST_MATH )
float result;
_mm_store_ss(&result,_mm_rsqrt_ps(_mm_load_ss(&f)));
return result;
#else // no fast math
return 1.f / sqrtf(f);
#endif
}
// calculate: 1 / sqrtf( x )
// Integer overload, routed through the f32 version (and its fast-math caveats).
REALINLINE s32 reciprocal_squareroot(const s32 x)
{
return static_cast<s32>(reciprocal_squareroot(static_cast<f32>(x)));
}
// calculate: 1 / x
// NOTE(review): the fast path uses _mm_rcp_ps, a ~12bit hardware approximation;
// it also returns a huge finite value (not +inf) for f == 0.
REALINLINE f32 reciprocal( const f32 f )
{
#if defined (IRRLICHT_FAST_MATH)
float result;
_mm_store_ss(&result,_mm_rcp_ps(_mm_load_ss(&f)));
return result;
#else // no fast math
return 1.f / f;
#endif
}
// calculate: 1 / x
// 64bit overload - always the exact division.
REALINLINE f64 reciprocal ( const f64 f )
{
return 1.0 / f;
}
// calculate: 1 / x, low precision allowed
// Kept for API compatibility; simply forwards to reciprocal().
REALINLINE f32 reciprocal_approxim ( const f32 f )
{
//what was here before was not faster
return reciprocal(f);
}
// Converts a float to the s32 floor of its value.
// NOTE(review): the fast-math x87 path computes fistp(x - 0.5), but FISTP uses
// the FPU's CURRENT rounding mode (normally round-to-nearest-even), so this is
// NOT an exact floor - e.g. exact integers and .5 offsets can misround. This
// matches the thread's observation that IRR_FAST_MATH "didn't really work".
// The GCC asm constraints ("t"/"f" with a popped stack) also look suspect - TODO confirm.
REALINLINE s32 floor32(f32 x)
{
#ifdef IRRLICHT_FAST_MATH
const f32 h = 0.5f;
s32 t;
#if defined(_MSC_VER)
__asm
{
fld x
fsub h
fistp t
}
#elif defined(__GNUC__)
__asm__ __volatile__ (
"fsub %2 \n\t"
"fistpl %0"
: "=m" (t)
: "t" (x), "f" (h)
: "st"
);
#else
//# warn IRRLICHT_FAST_MATH not supported.
return (s32) floorf ( x );
#endif
return t;
#else // no fast math
return (s32) floorf ( x );
#endif
}
// Converts a float to the s32 ceiling of its value.
// NOTE(review): same caveat as floor32 - the fast path computes fistp(x + 0.5)
// under the FPU's current rounding mode (round-to-nearest-even), which is not
// an exact ceiling for integers and .5 offsets.
REALINLINE s32 ceil32 ( f32 x )
{
#ifdef IRRLICHT_FAST_MATH
const f32 h = 0.5f;
s32 t;
#if defined(_MSC_VER)
__asm
{
fld x
fadd h
fistp t
}
#elif defined(__GNUC__)
__asm__ __volatile__ (
"fadd %2 \n\t"
"fistpl %0 \n\t"
: "=m"(t)
: "t"(x), "f"(h)
: "st"
);
#else
//# warn IRRLICHT_FAST_MATH not supported.
return (s32) ceilf ( x );
#endif
return t;
#else // not fast math
return (s32) ceilf ( x );
#endif
}
// Rounds a float to the nearest s32.
// NOTE(review): the fast path's bare FISTP rounds halves to EVEN (banker's
// rounding under the default FPU mode), while the fallback round_() rounds
// halves UP - the two branches disagree on exact .5 inputs.
REALINLINE s32 round32(f32 x)
{
#if defined(IRRLICHT_FAST_MATH)
s32 t;
#if defined(_MSC_VER)
__asm
{
fld x
fistp t
}
#elif defined(__GNUC__)
__asm__ __volatile__ (
"fistpl %0 \n\t"
: "=m"(t)
: "t"(x)
: "st"
);
#else
//# warn IRRLICHT_FAST_MATH not supported.
return (s32) round_(x);
#endif
return t;
#else // no fast math
return (s32) round_(x);
#endif
}
//! Maximum of three 32bit floats.
inline f32 f32_max3(const f32 a, const f32 b, const f32 c)
{
	// Max of the first pair, then against the third.
	const f32 ab = (a > b) ? a : b;
	return (ab > c) ? ab : c;
}
//! Minimum of three 32bit floats.
inline f32 f32_min3(const f32 a, const f32 b, const f32 c)
{
	// Min of the first pair, then against the third.
	const f32 ab = (a < b) ? a : b;
	return (ab < c) ? ab : c;
}
//! Fractional part of x (x minus its floor); always in [0,1) for finite x.
inline f32 fract ( f32 x )
{
	const f32 whole = floorf(x);
	return x - whole;
}
} // end namespace core
} // end namespace irr
#ifndef IRRLICHT_FAST_MATH
using irr::core::IR;
using irr::core::FR;
#endif
#endif
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
okay, we managed to compile the whole thing in MSVC and in SSE2 as well, also figured out a way to get SSE3 in MSVC
it turns out that IRR_FAST_MATH didnt really work in the first place, so we dont use it for now (until we fix it - in stock irrlicht)
finally implemented the replacement new operators etc.
got the swizzle in (but only for 4d vectors, 8d would require 16million overloaded functions XD)
implemented more irrlicht functions of vector3df (implemented all from irrMath.h)
P.S. I didnt update the listing cause I hit the character limit, I'll get soren to host the files when we're done
it turns out that IRR_FAST_MATH didnt really work in the first place, so we dont use it for now (until we fix it - in stock irrlicht)
finally implemented the replacement new operators etc.
got the swizzle in (but only for 4d vectors, 8d would require 16million overloaded functions XD)
implemented more irrlicht functions of vector3df (implemented all from irrMath.h)
P.S. I didnt update the listing cause I hit the character limit, I'll get soren to host the files when we're done
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
So what do we do to add this speed-up?
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
probably adding the new headers from the listings in the first post... however I started doing 32bit integer vectors and didnt get the chance to finish them... so I'll let everyone know when I'll have a new version thats compilable and works in 32 and 16bit integer mode
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
Okay thanks, would be good to give it a try. I'm getting frame stuttering when the irrlicht camera flies a path and goes from a less complex part of the scene to a more complex part, or that's how it looks.
I was wondering if that was because Irrlicht was starting to do the math for all those objects to be rendered, so am hoping that maybe this will improve it.
I was wondering if that was because Irrlicht was starting to do the math for all those objects to be rendered, so am hoping that maybe this will improve it.
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
THE vectorSIMD.h file containing our beautiful vector classes (part 1, doesn't fit in one post):
Code: Select all
// Copyright (C) 2014 Mateusz 'DevSH' Kielan
// This file is part of the "Irrlicht Engine".
// Contributed from "Build a World"
// For conditions of distribution and use, see copyright notice in irrlicht.h
#ifndef __IRR_VECTOR_SIMD_H_INCLUDED__
#define __IRR_VECTOR_SIMD_H_INCLUDED__
#include "IrrCompileConfig.h"
#ifdef __IRR_COMPILE_WITH_X86_SIMD_
#ifndef __IRR_COMPILE_WITH_SSE2
#error "Either give up on SIMD vectors, check your compiler settings for the -m*sse* flag, or upgrade your CPU"
#endif // __IRR_COMPILE_WITH_SSE2
#include "irrMath.h"
#include <stdint.h>
#include "SColor.h"
namespace irr
{
namespace core
{
class vectorSIMDf;
template <class T>
class vectorSIMD_32;
template <class T>
class vectorSIMD_16;
//a class for bitwise shizz
//a class for bitwise shizz
//! 128bit wide boolean/mask vector. 'components' only selects the semantic
//! element count (2/4/8); the storage is always 16 bytes, kept 16-byte aligned.
template <int components> class vectorSIMDBool
{
public:
	//! Default constructor - all bits cleared (all lanes false).
	inline vectorSIMDBool() {_mm_store_ps((float*)value,_mm_setzero_ps());}
	//! These constructors will bytewise cast the reg into the value
	inline vectorSIMDBool(const __m128 &reg) {_mm_store_ps((float*)value,reg);}
	inline vectorSIMDBool(const __m128d &reg) {_mm_store_pd((double*)value,reg);}
	inline vectorSIMDBool(const __m128i &reg) {_mm_store_si128((__m128i*)value,reg);}
	//! Copy constructor - bytewise copy of the 16 byte mask.
	inline vectorSIMDBool(const vectorSIMDBool& other) {_mm_store_ps((float*)value,_mm_load_ps((float*)other.value));}
	//! reads 16 bytes from an array of uint8_t
	inline vectorSIMDBool(uint8_t* const &array) {_mm_store_ps((float*)value,_mm_loadu_ps((float*)array));}
	//! same as above, BUT WILL CRASH IF ARRAY NOT 16 BYTE ALIGNED
	// (the ALIGNED parameter is a tag only - it is never read)
	inline vectorSIMDBool(uint8_t* const &array, bool ALIGNED) {_mm_store_ps((float*)value,_mm_load_ps((float*)array));}
	//! Constructor with the same value for all elements
	explicit vectorSIMDBool(const bool &n) {_mm_store_si128((__m128i*)value,n ? _mm_set_epi64x(-0x1ll,-0x1ll):_mm_setzero_si128());}
	//! Bitwise complement / AND / OR / XOR of the full 128bit mask.
	inline vectorSIMDBool operator~() const { return _mm_xor_si128(getAsRegister(),_mm_set_epi64x(-0x1ll,-0x1ll)); }
	inline vectorSIMDBool operator&(const vectorSIMDBool &other) const { return _mm_and_si128(getAsRegister(),other.getAsRegister()); }
	inline vectorSIMDBool operator|(const vectorSIMDBool &other) const { return _mm_or_si128(getAsRegister(),other.getAsRegister()); }
	inline vectorSIMDBool operator^(const vectorSIMDBool &other) const { return _mm_xor_si128(getAsRegister(),other.getAsRegister()); }
	/*
	NO BITSHIFTING SUPPORT
	*/
	//! Logical ops: these generic bodies are placeholders returning an all-false
	//! vector; the real per-width behavior lives in the specializations below.
	inline vectorSIMDBool<components> operator!() const { return vectorSIMDBool<components>(); }
	inline vectorSIMDBool<components> operator&&(const vectorSIMDBool<components> &other) const { return vectorSIMDBool<components>(); }
	inline vectorSIMDBool<components> operator||(const vectorSIMDBool<components> &other) const { return vectorSIMDBool<components>(); }
	//! like GLSL, returns true if any bit of value is set
	inline bool any(void) const
	{
		return ((uint64_t*)value)[0]|((uint64_t*)value)[1];
	}
	//! like GLSL, returns true if all bits of value are set
	inline bool allBits(void) const
	{
		return (((uint64_t*)value)[0]&((uint64_t*)value)[1])==0xffffffffffffffffull;
	}
	//! like GLSL, returns true if all components non zero
	//! (generic body is a placeholder; see the per-width specializations below)
	inline bool all(void) const
	{
		return 0;
	}
	//! in case you want to do your own SSE
	// BUGFIX: the original was missing 'return', so every operator built on
	// this function read an uninitialized register.
	inline __m128i getAsRegister() const {return _mm_load_si128((__m128i*)value);}
#ifdef _IRR_WINDOWS_
	__declspec(align(SIMD_ALIGNMENT)) uint8_t value[16];
};
#else
	uint8_t value[16];
} __attribute__ ((__aligned__(SIMD_ALIGNMENT)));
#endif
//! partial specialization for variable width vectors
// 2-wide (64bit lanes): logical AND of the two halves.
template <>
inline bool vectorSIMDBool<2>::all(void) const
{
return (((uint64_t*)value)[0]&&((uint64_t*)value)[1]);
}
// 4-wide (32bit lanes): all four dwords must be non-zero.
template <>
inline bool vectorSIMDBool<4>::all(void) const
{
return ((uint32_t*)value)[0]&&((uint32_t*)value)[1]&&((uint32_t*)value)[2]&&((uint32_t*)value)[3];
}
// 8-wide (16bit lanes): turn each lane into a non-zero mask, then horizontally
// AND the lanes down; the final _mm_store_ss writes 4 bytes, exactly covering
// both uint16_t slots of tmpStorage.
template <>
inline bool vectorSIMDBool<8>::all(void) const
{
__m128i xmm0 = _mm_xor_si128(_mm_cmpeq_epi16(getAsRegister(),_mm_setzero_si128()),_mm_set_epi16(-1,-1,-1,-1,-1,-1,-1,-1));
xmm0 = _mm_and_si128(xmm0,_mm_shuffle_epi32(xmm0,_MM_SHUFFLE(0,1,2,3))); // (0&&6,1&&7, 2&&4,3&&5, ...)
xmm0 = _mm_and_si128(xmm0,_mm_shufflelo_epi16(xmm0,_MM_SHUFFLE(1,0,3,2))); // (0&&2&&4&&6, 1&&3&&5&&7, ... )
uint16_t tmpStorage[2];
_mm_store_ss((float*)tmpStorage,_mm_castsi128_ps(xmm0));
return tmpStorage[0]&tmpStorage[1];
}/*
template <>
inline bool vectorSIMDBool<16>::all(void) const
{
__m128i xmm0 = _mm_xor_si128(_mm_cmpeq_epi8(getAsRegister(),_mm_setzero_si128()),_mm_set_epi8(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
xmm0 = _mm_and_si128(xmm0,_mm_shuffle_epi32(xmm0,_MM_SHUFFLE(0,1,2,3))); // (0&&12,1&&13,2&&14,3&&15, 4&&8,5&&9,6&&10,7&&11, ...)
xmm0 = _mm_and_si128(xmm0,_mm_shufflelo_epi16(xmm0,_MM_SHUFFLE(1,0,3,2))); // (0&&4&&8&&12,1&&5&&9&&13,2&&6&&10&&14,3&&7&&11&&15, ...)
xmm0 = _mm_and_si128(xmm0,_mm_slli_si128(xmm0,2)); // (even &&,odd &&, ...)
_mm_store_si128((__m128i*)tmpStorage,xmm0);
return tmpStorage[0]&&tmpStorage[1];
}
//! following do ANDs (not bitwise ANDs)
template <>
inline vectorSIMDBool<2> vectorSIMDBool<2>::operator&&(const vectorSIMDBool<2> &other) const
{
}
template <>
inline vectorSIMDBool<2> vectorSIMDBool<2>::operator||(const vectorSIMDBool<2> &other) const
{
}
template <>
inline vectorSIMDBool<2> vectorSIMDBool<2>::operator!() const
{
}*/
// 4-wide logical AND: each lane becomes all-ones when both inputs are non-zero.
// NOTE(review): the float compares treat lanes as f32 bit patterns, so a lane
// holding 0x80000000 (negative zero) compares EQUAL to zero and counts as
// false despite having bits set - TODO confirm this is acceptable for masks.
template <>
inline vectorSIMDBool<4> vectorSIMDBool<4>::operator&&(const vectorSIMDBool<4> &other) const
{
__m128 xmm0 = _mm_and_ps(_mm_cmpneq_ps(_mm_castsi128_ps(other.getAsRegister()),_mm_setzero_ps()),_mm_cmpneq_ps(_mm_castsi128_ps(getAsRegister()),_mm_setzero_ps()));
return vectorSIMDBool<4>(xmm0);
}
// 4-wide logical OR: bitwise-OR first, then normalize each lane to all-ones/zero.
template <>
inline vectorSIMDBool<4> vectorSIMDBool<4>::operator||(const vectorSIMDBool<4> &other) const
{
__m128i xmm0 = _mm_or_si128(other.getAsRegister(),getAsRegister());
return vectorSIMDBool<4>(_mm_cmpneq_ps(_mm_castsi128_ps(xmm0),_mm_setzero_ps()));
}
// 4-wide logical NOT: lanes equal to zero become all-ones.
template <>
inline vectorSIMDBool<4> vectorSIMDBool<4>::operator!() const
{
return vectorSIMDBool<4>(_mm_cmpeq_ps(_mm_castsi128_ps(getAsRegister()),_mm_setzero_ps()));
}
// 8-wide logical AND via integer compares: andnot(other==0, !(this==0)).
template <>
inline vectorSIMDBool<8> vectorSIMDBool<8>::operator&&(const vectorSIMDBool<8> &other) const
{
__m128i xmm0 = _mm_andnot_si128(_mm_cmpeq_epi16(other.getAsRegister(),_mm_setzero_si128()),_mm_xor_si128(_mm_cmpeq_epi16(getAsRegister(),_mm_setzero_si128()),_mm_set_epi16(-1,-1,-1,-1,-1,-1,-1,-1)));
return vectorSIMDBool<8>(xmm0);
}
// 8-wide logical OR: bitwise-OR, then invert the ==0 mask to normalize lanes.
template <>
inline vectorSIMDBool<8> vectorSIMDBool<8>::operator||(const vectorSIMDBool<8> &other) const
{
__m128i xmm0 = _mm_or_si128(other.getAsRegister(),getAsRegister());
return vectorSIMDBool<8>(_mm_xor_si128(_mm_cmpeq_epi16(xmm0,_mm_setzero_si128()),_mm_set_epi16(-1,-1,-1,-1,-1,-1,-1,-1)));
}
// 8-wide logical NOT: 16bit lanes equal to zero become all-ones.
template <>
inline vectorSIMDBool<8> vectorSIMDBool<8>::operator!() const
{
return vectorSIMDBool<8>(_mm_cmpeq_epi16(getAsRegister(),_mm_setzero_si128()));
}/*
template <>
inline vectorSIMDBool<16> vectorSIMDBool<16>::operator&&(const vectorSIMDBool<16> &other) const
{
}
template <>
inline vectorSIMDBool<16> vectorSIMDBool<16>::operator||(const vectorSIMDBool<16> &other) const
{
}
template <>
inline vectorSIMDBool<16> vectorSIMDBool<16>::operator!() const
{
}*/
//! Typedef for N-bit wide boolean vectors
//typedef vectorSIMDBool<16> vector16db_SIMD;
typedef vectorSIMDBool<8> vector8db_SIMD;
typedef vectorSIMDBool<4> vector4db_SIMD;
//typedef vectorSIMDBool<2> vector2db_SIMD;
Code: Select all
#include "SIMDswizzle.h"
#ifdef _IRR_WINDOWS_
__declspec(align(SIMD_ALIGNMENT)) class vectorSIMDf : public SIMD_32bitSwizzleAble<vectorSIMDf,__m128>
#else
class vectorSIMDf : public SIMD_32bitSwizzleAble<vectorSIMDf,__m128>
#endif
{
public:
//! Default constructor (null vector).
inline vectorSIMDf() {_mm_store_ps(pointer,_mm_setzero_ps());}
//! Constructor with four different values, FASTEST IF the values are constant literals
//yes this is correct usage with _mm_set_**(), due to little endianness the thing gets set in "reverse" order
inline explicit vectorSIMDf(const float &nx, const float &ny, const float &nz, const float &nw) {_mm_store_ps(pointer,_mm_set_ps(nw,nz,ny,nx));}
//! 3d constructor
inline explicit vectorSIMDf(const float &nx, const float &ny, const float &nz) {_mm_store_ps(pointer,_mm_set_ps(0.f,nz,ny,nx));}
//! 2d constructor
inline explicit vectorSIMDf(const float &nx, const float &ny) {_mm_store_ps(pointer,_mm_set_ps(0.f,0.f,ny,nx));}
//! Fast Constructor from floats, they come in normal order [0]=X,[1]=Y, etc.
inline vectorSIMDf(float* const &array) {_mm_store_ps(pointer,_mm_loadu_ps(array));}
//! Fastest Constructor from floats, they come in normal order [0]=X,[1]=Y, etc.
//! Address has to be aligned to 16bytes OR WILL CRASH
inline vectorSIMDf(float* const &array, bool ALIGNED) {_mm_store_ps(pointer,_mm_load_ps(array));}
//! Fastest and most natural constructor
inline vectorSIMDf(const __m128 ®) {_mm_store_ps(pointer,reg);}
//! Constructor with the same value for all elements
inline explicit vectorSIMDf(const float &n) {_mm_store_ps(pointer,_mm_load_ps1(&n));}
//! Copy constructor
inline vectorSIMDf(const vectorSIMDf& other) {_mm_store_ps(pointer,other.getAsRegister());}
static inline void* operator new(size_t size) throw(std::bad_alloc)
{
void *memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
memoryallocatedaligned = _aligned_malloc(size,SIMD_ALIGNMENT);
#else
posix_memalign((void**)&memoryallocatedaligned,SIMD_ALIGNMENT,size);
#endif
return memoryallocatedaligned;
}
static inline void operator delete(void* ptr)
{
#ifdef _IRR_WINDOWS_
_aligned_free(ptr);
#else
free(ptr);
#endif
}
static inline void* operator new[](size_t size) throw(std::bad_alloc)
{
void *memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
memoryallocatedaligned = _aligned_malloc(size,SIMD_ALIGNMENT);
#else
posix_memalign((void**)&memoryallocatedaligned,SIMD_ALIGNMENT,size);
#endif
return memoryallocatedaligned;
}
static inline void operator delete[](void* ptr) throw()
{
#ifdef _IRR_WINDOWS_
_aligned_free(ptr);
#else
free(ptr);
#endif
}
static inline void* operator new(std::size_t size,void* p) throw(std::bad_alloc)
{
return p;
}
static inline void operator delete(void* p,void* t) throw() {}
static inline void* operator new[](std::size_t size,void* p) throw(std::bad_alloc)
{
return p;
}
static inline void operator delete[](void* p,void* t) throw() {}
/*
inline vectorSIMDf(const vectorSIMDu32& other);
inline vectorSIMDf(const vectorSIMDi32& other);
inline vectorSIMDf(const vectorSIMDu16& other);
inline vectorSIMDf(const vectorSIMDi16& other);
*/
inline vectorSIMDf& operator=(const vectorSIMDf& other) { _mm_store_ps(pointer,other.getAsRegister()); return *this; }
//! bitwise ops
inline vectorSIMDf operator&(const vectorSIMDf& other) {return _mm_and_ps(getAsRegister(),other.getAsRegister());}
inline vectorSIMDf operator|(const vectorSIMDf& other) {return _mm_or_ps(getAsRegister(),other.getAsRegister());}
inline vectorSIMDf operator^(const vectorSIMDf& other) {return _mm_xor_ps(getAsRegister(),other.getAsRegister());}
//! in case you want to do your own SSE
inline __m128 getAsRegister() const {return _mm_load_ps(pointer);}
// operators against vectors (component-wise)
//! Negation: flips the IEEE754 sign bit of every component (also turns +0 into -0).
inline vectorSIMDf operator-() const { return _mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000u)),getAsRegister()); }
inline vectorSIMDf operator+(const vectorSIMDf& other) const { return _mm_add_ps(other.getAsRegister(),getAsRegister()); }
inline vectorSIMDf& operator+=(const vectorSIMDf& other) { _mm_store_ps(pointer,_mm_add_ps(other.getAsRegister(),getAsRegister())); return *this; }
inline vectorSIMDf operator-(const vectorSIMDf& other) const { return _mm_sub_ps(getAsRegister(),other.getAsRegister()); }
inline vectorSIMDf& operator-=(const vectorSIMDf& other) { _mm_store_ps(pointer,_mm_sub_ps(getAsRegister(),other.getAsRegister())); return *this; }
inline vectorSIMDf operator*(const vectorSIMDf& other) const { return _mm_mul_ps(getAsRegister(),other.getAsRegister()); }
inline vectorSIMDf& operator*=(const vectorSIMDf& other) { _mm_store_ps(pointer,_mm_mul_ps(getAsRegister(),other.getAsRegister())); return *this; }
#ifdef IRRLICHT_FAST_MATH
//! Fast-math division uses _mm_rcp_ps: an ~12-bit reciprocal APPROXIMATION,
//! noticeably less accurate than true division. Use preciseDivision() when accuracy matters.
inline vectorSIMDf operator/(const vectorSIMDf& other) const { return _mm_mul_ps(getAsRegister(),_mm_rcp_ps(other.getAsRegister())); }
inline vectorSIMDf& operator/=(const vectorSIMDf& other) { _mm_store_ps(pointer,_mm_mul_ps(getAsRegister(),_mm_rcp_ps(other.getAsRegister()))); return *this; }
#else
inline vectorSIMDf operator/(const vectorSIMDf& other) const { return preciseDivision(other); }
inline vectorSIMDf& operator/=(const vectorSIMDf& other) { (*this) = preciseDivision(other); return *this; }
#endif
//! Full-precision component-wise division, available regardless of IRRLICHT_FAST_MATH.
inline vectorSIMDf preciseDivision(const vectorSIMDf& other) const { return _mm_div_ps(getAsRegister(),other.getAsRegister()); }
//operators against scalars — the scalar is broadcast into all four lanes first.
inline vectorSIMDf operator+(const float &val) const { return (*this)+vectorSIMDf(val); }
inline vectorSIMDf& operator+=(const float &val) { return ( (*this) += vectorSIMDf(val) ); }
inline vectorSIMDf operator-(const float &val) const { return (*this)-vectorSIMDf(val); }
inline vectorSIMDf& operator-=(const float &val) { return ( (*this) -= vectorSIMDf(val) ); }
inline vectorSIMDf operator*(const float &val) const { return (*this)*vectorSIMDf(val); }
inline vectorSIMDf& operator*=(const float &val) { return ( (*this) *= vectorSIMDf(val) ); }
#ifdef IRRLICHT_FAST_MATH
//! Approximate scalar division via _mm_rcp_ps — same accuracy caveat as the vector form.
inline vectorSIMDf operator/(const float &v) const { return vectorSIMDf(_mm_mul_ps(_mm_rcp_ps(_mm_load_ps1(&v)),getAsRegister())); }
inline vectorSIMDf& operator/=(const float &v) { _mm_store_ps(pointer,_mm_mul_ps(_mm_rcp_ps(_mm_load_ps1(&v)),getAsRegister())); return *this; }
#else
inline vectorSIMDf operator/(const float &v) const { return vectorSIMDf(_mm_div_ps(getAsRegister(),_mm_load_ps1(&v))); }
inline vectorSIMDf& operator/=(const float &v) { _mm_store_ps(pointer,_mm_div_ps(getAsRegister(),_mm_load_ps1(&v))); return *this; }
#endif
//! I AM BREAKING IRRLICHT'S COMPARISON OPERATORS:
//! comparisons are component-wise and return a 4-lane mask (all-ones / all-zeros
//! per lane), not a single bool like the classic vector3df operators.
inline vector4db_SIMD operator<=(const vectorSIMDf& other) const
{
return _mm_cmple_ps(getAsRegister(),other.getAsRegister());
}
inline vector4db_SIMD operator>=(const vectorSIMDf& other) const
{
return _mm_cmpge_ps(getAsRegister(),other.getAsRegister());
}
inline vector4db_SIMD operator<(const vectorSIMDf& other) const
{
return _mm_cmplt_ps(getAsRegister(),other.getAsRegister());
}
inline vector4db_SIMD operator>(const vectorSIMDf& other) const
{
return _mm_cmpgt_ps(getAsRegister(),other.getAsRegister());
}
//! only the method that returns bool confirms if two vectors are exactly the same
//! NOTE(review): unlike the ordering operators above, ==/!= return vectorSIMDf
//! (the mask bits reinterpreted as floats) — likely an inconsistency, but changing
//! the return type would break callers; flagged for a future API pass.
inline vectorSIMDf operator==(const vectorSIMDf& other) const
{
return _mm_cmpeq_ps(getAsRegister(),other.getAsRegister());
}
inline vectorSIMDf operator!=(const vectorSIMDf& other) const
{
return _mm_cmpneq_ps(getAsRegister(),other.getAsRegister());
}
// functions
//! zeroes out out of range components (useful before performing a dot product so it doesnt get polluted with random values)
//! WARNING IT DOES COST CYCLES
inline void makeSafe2D(void) {_mm_store_ps(pointer,_mm_and_ps(_mm_load_ps(pointer),_mm_castsi128_ps(_mm_set_epi32(0,0,-1,-1))));}
inline void makeSafe3D(void) {_mm_store_ps(pointer,_mm_and_ps(_mm_load_ps(pointer),_mm_castsi128_ps(_mm_set_epi32(0,-1,-1,-1))));}
//! slightly faster than memcpy'ing into the pointers (unaligned source load)
inline vectorSIMDf& set(float* const &array) {_mm_store_ps(pointer,_mm_loadu_ps(array)); return *this;}
//! FASTEST WAY TO SET VALUES, Address has to be aligned to 16bytes OR WILL CRASH
//! BUGFIX: the original fell off the end of this non-void function (undefined
//! behaviour); it must return *this like every other set() overload.
inline vectorSIMDf& set(float* const &array, bool ALIGNED) {_mm_store_ps(pointer,_mm_load_ps(array)); return *this;}
//! normal set() like vector3df's, but for different dimensional vectors;
//! unused trailing components are set to 0.
inline vectorSIMDf& set(const float &nx, const float &ny, const float &nz, const float &nw) {_mm_store_ps(pointer,_mm_set_ps(nw,nz,ny,nx)); return *this;}
inline vectorSIMDf& set(const float &nx, const float &ny, const float &nz) {_mm_store_ps(pointer,_mm_set_ps(0.f,nz,ny,nx)); return *this;}
inline vectorSIMDf& set(const float &nx, const float &ny) {_mm_store_ps(pointer,_mm_set_ps(0.f,0.f,ny,nx)); return *this;}
inline vectorSIMDf& set(const vectorSIMDf& p) {_mm_store_ps(pointer,p.getAsRegister()); return *this;}
//! convert from vectorNdf types of irrlicht - it will read a few values past the range of the allocated memory but _mm_loadu_ps shouldnt have that kind of protection
//! NOTE(review): the over-read past &p.X is technically out-of-bounds — confirm it is acceptable on all target platforms.
inline vectorSIMDf& set(const vector3df &p) {_mm_store_ps(pointer,_mm_loadu_ps(&p.X)); makeSafe3D(); return *this;}
inline vectorSIMDf& set(const vector2df &p) {_mm_store_ps(pointer,_mm_loadu_ps(&p.X)); makeSafe2D(); return *this;}
//! going directly from vectorSIMD to irrlicht types is safe cause vectorSIMDf is wider
//! NOTE(review): these return a NON-const reference aliasing this object's storage
//! from a const method — writes through the result mutate a nominally-const vector.
inline vector2df& getAsVector2df(void) const
{
return *((vector2df*)pointer);
}
inline vector3df& getAsVector3df(void) const
{
return *((vector3df*)pointer);
}
//! Get length of the vector.
//! Sums squares of ALL FOUR components — call makeSafe3D()/makeSafe2D() first
//! if the unused lanes may hold garbage.
//! NOTE(review): if neither __IRR_COMPILE_WITH_SSE3 nor __IRR_COMPILE_WITH_SSE2
//! is defined the function has no return statement — presumably at least SSE2
//! is always enabled for this header; confirm against the build config.
inline float getLengthAsFloat() const
{
__m128 xmm0 = getAsRegister();
float result;
#ifdef __IRR_COMPILE_WITH_SSE3
xmm0 = _mm_mul_ps(xmm0,xmm0);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
xmm0 = _mm_sqrt_ps(_mm_hadd_ps(xmm0,xmm0));
_mm_store_ss(&result,xmm0);
return result;
#elif defined(__IRR_COMPILE_WITH_SSE2)
// SSE2 horizontal add via two shuffles: after both adds every lane holds the sum
xmm0 = _mm_mul_ps(xmm0,xmm0);
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
xmm0 = _mm_sqrt_ps(xmm0);
_mm_store_ss(&result,xmm0);
return result;
#endif
}
//! Useful when you have to divide a vector by another vector's length (so you dont convert/store to a scalar)
//! all components are filled with length
//! if you need something else, you can get the register and shuffle
inline vectorSIMDf getLength() const
{
__m128 xmm0 = getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE3
xmm0 = _mm_mul_ps(xmm0,xmm0);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
return _mm_sqrt_ps(_mm_hadd_ps(xmm0,xmm0));
#elif defined(__IRR_COMPILE_WITH_SSE2)
xmm0 = _mm_mul_ps(xmm0,xmm0);
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
return _mm_sqrt_ps(xmm0);
#endif
}
//! Component-wise square root of the vector.
inline vectorSIMDf getSquareRoot() const
{
__m128 selfReg = getAsRegister();
return _mm_sqrt_ps(selfReg);
}
//! Component-wise approximate reciprocal square root (hardware ~12-bit estimate).
inline vectorSIMDf getReciprocalSQRT() const
{
__m128 selfReg = getAsRegister();
return _mm_rsqrt_ps(selfReg);
}
//! Get the dot product with another vector.
//! Uses ALL FOUR components — zero the unused lanes (makeSafe3D) beforehand
//! if this logically holds a 3D vector.
inline float dotProductAsFloat(const vectorSIMDf& other) const
{
float result;
__m128 xmm0 = getAsRegister();
__m128 xmm1 = other.getAsRegister();/*
#ifdef __IRR_COMPILE_WITH_SSE4_1
xmm0 = _mm_dp_ps(xmm0,xmm1,);
#error "Implementation in >=SSE4.1 not ready yet"
#elif __IRR_COMPILE_WITH_SSE3*/
#ifdef __IRR_COMPILE_WITH_SSE3
xmm0 = _mm_mul_ps(xmm0,xmm1);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
_mm_store_ss(&result,xmm0);
return result;
#elif defined(__IRR_COMPILE_WITH_SSE2)
xmm0 = _mm_mul_ps(xmm0,xmm1);
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
_mm_store_ss(&result,xmm0);
return result;
#endif
}
//! Dot product with the result broadcast into all four lanes
//! (handy for dividing a vector by a dot product without a scalar round-trip).
inline vectorSIMDf dotProduct(const vectorSIMDf& other) const
{
__m128 xmm0 = getAsRegister();
__m128 xmm1 = other.getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE3
xmm0 = _mm_mul_ps(xmm0,xmm1);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
return _mm_hadd_ps(xmm0,xmm0);
#elif defined(__IRR_COMPILE_WITH_SSE2)
xmm0 = _mm_mul_ps(xmm0,xmm1);
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
return _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
#endif
}
Last edited by devsh on Fri May 01, 2015 12:23 pm, edited 5 times in total.
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
(part 2 — the file did not fit in a single post):
Code: Select all
//! Get squared length of the vector.
/** This is useful because it is much faster than getLength()
(no square root). Uses all four components — see dotProduct() caveats.
\return Squared length of the vector. **/
inline float getLengthSQAsFloat() const
{
float result;
_mm_store_ss(&result,dotProduct(*this).getAsRegister());
return result;
}
//! Useful when you have to divide a vector by another vector's length (so you dont convert/store to a scalar)
//! Squared length broadcast into all four lanes.
inline vectorSIMDf getLengthSQ() const
{
return dotProduct(*this);
}
//! Get distance from another point.
/** Here, the vector is interpreted as point in 3 dimensional space. **/
inline float getDistanceFromAsFloat(const vectorSIMDf& other) const
{
float result;
_mm_store_ss(&result,((*this)-other).getLength().getAsRegister());
return result;
}
//! Distance from another point, broadcast into all four lanes.
inline vectorSIMDf getDistanceFrom(const vectorSIMDf& other) const
{
return ((*this)-other).getLength();
}
//! Returns squared distance from another point.
/** Here, the vector is interpreted as point in 3 dimensional space.
Cheaper than getDistanceFrom* — avoids the square root. **/
inline float getDistanceFromSQAsFloat(const vectorSIMDf& other) const
{
float result;
_mm_store_ss(&result,((*this)-other).getLengthSQ().getAsRegister());
return result;
}
inline vectorSIMDf getDistanceFromSQ(const vectorSIMDf& other) const
{
return ((*this)-other).getLengthSQ();
}
//! Calculates the cross product with another vector.
/** \param p Vector to multiply with.
\return Crossproduct of this vector with p (W lane comes out as 0). **/
inline vectorSIMDf crossProduct(const vectorSIMDf& p) const
{
__m128 xmm0 = getAsRegister();
__m128 xmm1 = p.getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE2 //! SSE2 implementation is faster than previous SSE3 implementation
// classic shuffle formulation: (a.yzx*b.zxy) - (a.zxy*b.yzx)
__m128 backslash = _mm_mul_ps(FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,2,1)),FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,1,0,2)));
__m128 forwardslash = _mm_mul_ps(FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,1,0,2)),FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,0,2,1)));
return _mm_sub_ps(backslash,forwardslash); //returns 0 in the last component :D
#endif
}
//! Normalizes the vector.
/** In case of the 0 vector the result is still 0, otherwise
the length of the vector will be 1.
NOTE(review): despite the mutator-style name this is a const method that
RETURNS the normalized copy; *this is not modified. Under IRRLICHT_FAST_MATH
it uses the ~12-bit _mm_rsqrt_ps approximation.
\return The normalized vector. **/
inline vectorSIMDf normalize() const
{
__m128 xmm0 = getAsRegister();
__m128 xmm1 = getLengthSQ().getAsRegister();// the unnecessary load/store and variable construction will get optimized out with inline
#ifdef IRRLICHT_FAST_MATH
return _mm_mul_ps(xmm0,_mm_rsqrt_ps(xmm1));
#else
return _mm_div_ps(xmm0,_mm_sqrt_ps(xmm1));
#endif
}
//! Sets the length of the vector to a new value
//! (scales the normalized direction by newlength; modifies *this).
inline vectorSIMDf& setLengthAsFloat(float newlength)
{
(*this) = normalize()*newlength;
return (*this);
}
//! Inverts the vector.
//! Flips the sign bit of every component in place (component-wise negation).
inline vectorSIMDf& invert()
{
_mm_store_ps(pointer,_mm_xor_ps(_mm_castsi128_ps(_mm_set_epi32(0x80000000u,0x80000000u,0x80000000u,0x80000000u)),getAsRegister()));
return *this;
}
//! Returns component-wise absolute value of the argument
//! (clears the IEEE754 sign bit of every lane; does not touch *this).
inline vectorSIMDf abs(const vectorSIMDf& a) const
{
const __m128 signMask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000u));
return _mm_andnot_ps(signMask,a.getAsRegister());
}
//! Returns component-wise absolute value of this vector.
inline vectorSIMDf getAbsoluteValue() const
{
return abs(*this);
}
Code: Select all
//! Rotates the vector by a specified number of RADIANS around the Y axis and the specified center.
/** \param radians Number of RADIANS to rotate around the Y axis.
\param center The center of the rotation. */
inline void rotateXZByRAD(const float &radians, const vectorSIMDf& center)
{
__m128 xmm1 = center.getAsRegister();
__m128 xmm0 = _mm_sub_ps(getAsRegister(),xmm1); // translate so the center is at the origin
float cs = cosf(radians);
float sn = sinf(radians);
__m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos, junk, Z*cos, junk)
__m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,1,2))); // now contains (Z*sin, junk, X*sin, junk)
xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0,0x80000000u))); // invert the Z*sin
xmm0 = _mm_add_ps(_mm_add_ps(xmm2,xmm3),xmm1); // gives us ((X*cs - Z*sn), junk, (Z*cs + X*sn), junk), translated back
_mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,-1,0,-1),(char*)pointer);// only overwrites the X,Z elements of our vector
}
//! Same rotation about the origin (no center translation).
inline void rotateXZByRAD(const float &radians)
{
__m128 xmm0 = getAsRegister();
float cs = cosf(radians);
float sn = sinf(radians);
__m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos, junk, Z*cos, junk)
__m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,1,2))); // now contains (Z*sin, junk, X*sin, junk)
xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0,0x80000000u))); // invert the Z*sin
xmm0 = _mm_add_ps(xmm2,xmm3); // gives us ((X*cs - Z*sn), junk, (Z*cs + X*sn), junk)
_mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,-1,0,-1),(char*)pointer);// only overwrites the X,Z elements of our vector
}
//! Rotates the vector by a specified number of RADIANS around the Z axis and the specified center.
/** \param radians: Number of RADIANS to rotate around the Z axis.
\param center: The center of the rotation. */
inline void rotateXYByRAD(const float &radians, const vectorSIMDf& center)
{
__m128 xmm1 = center.getAsRegister();
__m128 xmm0 = _mm_sub_ps(getAsRegister(),xmm1); // translate so the center is at the origin
float cs = cosf(radians);
float sn = sinf(radians);
__m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos,Y*cos,...,...)
__m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,2,0,1))); // now contains (Y*sin,X*sin,...)
xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0,0x80000000u))); // invert the Y*sin
xmm0 = _mm_add_ps(_mm_add_ps(xmm2,xmm3),xmm1); // gives us ((X*cs - Y*sn), (Y*cs + X*sn),...,...), translated back
_mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,0,-1,-1),(char*)pointer);// only overwrites the X,Y elements of our vector
}
//! Same rotation about the origin (no center translation).
inline void rotateXYByRAD(const float &radians)
{
__m128 xmm0 = getAsRegister();
float cs = cosf(radians);
float sn = sinf(radians);
__m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos,Y*cos,...,...)
__m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,2,0,1))); // now contains (Y*sin,X*sin,...)
xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0,0x80000000u))); // invert the Y*sin
xmm0 = _mm_add_ps(xmm2,xmm3); // gives us ((X*cs - Y*sn), (Y*cs + X*sn),...,...)
_mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,0,-1,-1),(char*)pointer);// only overwrites the X,Y elements of our vector
}
//! Rotates the vector by a specified number of RADIANS around the X axis and the specified center.
/** \param radians: Number of RADIANS to rotate around the X axis.
\param center: The center of the rotation. */
inline void rotateYZByRAD(const float &radians, const vectorSIMDf& center)
{
__m128 xmm1 = center.getAsRegister();
__m128 xmm0 = _mm_sub_ps(getAsRegister(),xmm1); // translate so the center is at the origin
float cs = cosf(radians);
float sn = sinf(radians);
__m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (...,Y*cos,Z*cos,...)
__m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,1,2,0))); // now contains (...,Z*sin,Y*sin,...)
xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0x80000000u,0))); // invert the Z*sin
xmm0 = _mm_add_ps(_mm_add_ps(xmm2,xmm3),xmm1); // gives us (...,(Y*cs - Z*sn), (Z*cs + Y*sn),...), translated back
_mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,-1,-1,0),(char*)pointer);// only overwrites the Y,Z elements of our vector
}
//! Same rotation about the origin (no center translation).
inline void rotateYZByRAD(const float &radians)
{
__m128 xmm0 = getAsRegister();
float cs = cosf(radians);
float sn = sinf(radians);
__m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (...,Y*cos,Z*cos,...)
__m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,1,2,0))); // now contains (...,Z*sin,Y*sin,...)
xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0x80000000u,0))); // invert the Z*sin
xmm0 = _mm_add_ps(xmm2,xmm3); // gives us (...,(Y*cs - Z*sn), (Z*cs + Y*sn),...)
_mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,-1,-1,0),(char*)pointer);// only overwrites the Y,Z elements of our vector
}
//! Get the rotations that would make a (0,0,1) direction vector point in the same direction as this direction vector.
/* Thanks to Arras on the Irrlicht forums for this method. This utility method is very useful for
orienting scene nodes towards specific targets. For example, if this vector represents the difference
between two scene nodes, then applying the result of getHorizontalAngle() to one scene node will point
it at the other one.
Example code:
// Where target and seeker are of type ISceneNode*
const vector3df toTarget(target->getAbsolutePosition() - seeker->getAbsolutePosition());
const vector3df requiredRotation = toTarget.getHorizontalAngle();
seeker->setRotation(requiredRotation);
\return A rotation vector containing the X (pitch) and Y (raw) rotations (in degrees) that when applied to a
+Z (e.g. 0, 0, 1) direction vector would make it point in the same direction as this vector. The Z (roll) rotation
is always 0, since two Euler rotations are sufficient to point in any given direction. *
inline vectorSIMDf getHorizontalAngle3D() const
{
vectorSIMDf angle;
const float tmp = atan2f(x,z);
angle.y = tmp;
__m128 xmm0 = ((*this)*(*this)).getAsRegister();
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,1,2)));
float z1;
_mm_store_ss(&z1,_mm_sqrt_ss(xmm0));
angle.x = atan2f(z1, y) - core::PI*0.5f;
return angle;
}
//! Get the spherical coordinate angles, can we do 4-sphere coordinates
/** This returns Euler radians for the point represented by
this vector.
*
inline vectorSIMDf getSphericalCoordinates3D() const
{
vectorSIMDf angle = *this;
angle.makeSafe3D();
angle = angle.getLength();
if (angle.w) //doesnt matter which component
{
if (X!=0)
{
angle.Y = atan2f(Z,X);
}
else if (Z<0)
angle.Y=180;
angle.X = (T)(acos(Y * core::reciprocal_squareroot(length)) * RADTODEG64);
}
else
return vectorSIMDf(0.f);
}
//! Builds a direction vector from (this) rotation vector.
/** This vector is assumed to be a rotation vector composed of 3 Euler angle rotations, in degrees.
The implementation performs the same calculations as using a matrix to do the rotation.
\param[in] forwards The direction representing "forwards" which will be rotated by this vector.
If you do not provide a direction, then the +Z axis (0, 0, 1) will be assumed to be forwards.
\return A direction vector calculated by rotating the forwards direction by the 3 Euler angles
(in degrees) represented by this vector. *
inline vectorSIMDf rotationToDirection3D() const
{
const float cr = cosf( x );
const float sr = sinf( x );
const float cp = cosf( y );
const float sp = sinf( y );
const float cy = cosf( z );
const float sy = sinf( z );
const float crsp = cr*sp;
return vectorSIMDf(( crsp*cy+sr*sy ), ( crsp*sy-sr*cy ), ( cr*cp ),0);
}
inline vectorSIMDf rotationToDirection3D(const vectorSIMDf &forwards = vectorSIMDf(0, 0, 1, 0)) const
{
const float cr = cosf( x );
const float sr = sinf( x );
const float cp = cosf( y );
const float sp = sinf( y );
const float cy = cosf( z );
const float sy = sinf( z );
const float crsp = cr*sp;
const float srsp = sr*sp;
const f64 pseudoMatrix[] = {
( cp*cy ), ( cp*sy ), ( -sp ),
( srsp*cy-cr*sy ), ( srsp*sy+cr*cy ), ( sr*cp ),
( crsp*cy+sr*sy ), ( crsp*sy-sr*cy ), ( cr*cp )};
return vector3d<T>(
(T)(forwards.X * pseudoMatrix[0] +
forwards.Y * pseudoMatrix[3] +
forwards.Z * pseudoMatrix[6]),
(T)(forwards.X * pseudoMatrix[1] +
forwards.Y * pseudoMatrix[4] +
forwards.Z * pseudoMatrix[7]),
(T)(forwards.X * pseudoMatrix[2] +
forwards.Y * pseudoMatrix[5] +
forwards.Z * pseudoMatrix[8]));
}*/
//! Converts a packed 32-bit SColor into a float vector with components in [0,1].
//! Widens the four bytes to 32-bit ints, divides by 255, then shuffles from
//! the in-memory byte order into (r,g,b,a) lane order
//! — assumes little-endian A8R8G8B8 storage; TODO confirm on big-endian targets.
static inline vectorSIMDf fromSColor(const irr::video::SColor &col)
{
vectorSIMDf retVal;
__m128i xmm0 = _mm_castps_si128(_mm_load_ss((float*)&col.color));
xmm0 = _mm_unpacklo_epi8(xmm0,_mm_setzero_si128()); // bytes -> 16-bit lanes
xmm0 = _mm_unpacklo_epi16(xmm0,_mm_setzero_si128()); // 16-bit -> 32-bit lanes
__m128 xmm1 = _mm_div_ps(_mm_cvtepi32_ps(xmm0),_mm_set_ps(255.f,255.f,255.f,255.f));
xmm1 = FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,0,1,2));
_mm_store_ps(retVal.pointer,xmm1);
return retVal;
}
//! The 16 bytes of storage, viewable under several aliases:
//! position (X,Y,Z,W), GLSL-style lowercase, color (r,g,b,a),
//! texture coordinates (s,t,p,q), or a raw float[4] for intrinsics.
//! NOTE(review): reading a union member other than the last one written is
//! technically UB in ISO C++, but is relied upon here as a compiler extension.
union
{
struct{
float X; float Y; float Z; float W;
};
struct{
float x; float y; float z; float w;
};
struct{
float r; float g; float b; float a;
};
struct{
float s; float t; float p; float q;
};
float pointer[4];
};
#ifdef _IRR_WINDOWS_
};
#else
} __attribute__ ((__aligned__(SIMD_ALIGNMENT)));
#endif
//! Component-wise radians -> degrees conversion.
static inline vectorSIMDf radToDeg(const vectorSIMDf& radians)
{
return radians*vectorSIMDf(RADTODEG);
}
//! Component-wise degrees -> radians conversion.
static inline vectorSIMDf degToRad(const vectorSIMDf& degrees)
{
return degrees*vectorSIMDf(DEGTORAD);
}
//! GLSL-style linear blend: a when t==0, b when t==1 (t may differ per lane).
static inline vectorSIMDf mix(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& t)
{
return a+(b-a)*t;
}
//! HLSL-style alias of mix().
static inline vectorSIMDf lerp(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& t)
{
return mix(a,b,t);
}
//! Component-wise maximum of two vectors (specialization for the SIMD type).
template<>
inline vectorSIMDf max_(const vectorSIMDf& a, const vectorSIMDf& b)
{
__m128 lhs = a.getAsRegister();
__m128 rhs = b.getAsRegister();
return _mm_max_ps(lhs,rhs);
}
//! Component-wise minimum of two vectors (specialization for the SIMD type).
template<>
inline vectorSIMDf min_(const vectorSIMDf& a, const vectorSIMDf& b)
{
__m128 lhs = a.getAsRegister();
__m128 rhs = b.getAsRegister();
return _mm_min_ps(lhs,rhs);
}
//! Component-wise clamp of value into [low, high].
inline vectorSIMDf clamp(const vectorSIMDf& value, const vectorSIMDf& low, const vectorSIMDf& high)
{
vectorSIMDf lowerBounded = max_(value,low);
return min_(lowerBounded,high);
}
//! Component-wise floor (largest integer not greater than each component).
//! BUGFIX: the original used _mm_cvtps_epi32, which obeys the MXCSR rounding
//! mode (round-to-nearest-even by default), so floor(0.7) gave 1 and
//! floor(-1.2) gave -1. We now truncate toward zero with _mm_cvttps_epi32 and
//! subtract 1 wherever truncation rounded UP (i.e. for negative non-integers).
//! Components with |x| >= 2^23 are already exact integers in float and are
//! left untouched (they would also overflow the int32 conversion).
inline vectorSIMDf floor(const vectorSIMDf& a)
{
vectorSIMDf b = a;
vector4db_SIMD notTooLargeToFloor = b.getAbsoluteValue()<vectorSIMDf(float(0x800000)); //cutoff point for flooring
__m128 original = b.getAsRegister();
__m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(original)); // round toward zero
__m128 floored = _mm_sub_ps(truncated,_mm_and_ps(_mm_cmpgt_ps(truncated,original),_mm_set1_ps(1.f)));
_mm_maskmoveu_si128(_mm_castps_si128(floored),notTooLargeToFloor.getAsRegister(),(char*)b.pointer);
return b;
}
//! GLSL-style fractional part: a - floor(a), per component.
inline vectorSIMDf fract(const vectorSIMDf& a)
{
vectorSIMDf wholePart = floor(a);
return a-wholePart;
}
//! Component-wise square root.
inline vectorSIMDf sqrt(const vectorSIMDf& a)
{
__m128 reg = a.getAsRegister();
return _mm_sqrt_ps(reg);
}
//! Component-wise approximate reciprocal square root (hardware estimate).
inline vectorSIMDf inversesqrt(const vectorSIMDf& a)
{
__m128 reg = a.getAsRegister();
return _mm_rsqrt_ps(reg);
}
//! Component-wise approximate reciprocal (hardware estimate, ~12-bit).
inline vectorSIMDf reciprocal(const vectorSIMDf& a)
{
__m128 reg = a.getAsRegister();
return _mm_rcp_ps(reg);
}
//! Typedef for a f32 n-dimensional vector.
//! All three names alias the same 4-lane type; for the 2d/3d aliases the unused
//! trailing lanes should be zeroed (makeSafe2D/makeSafe3D) before dot/length ops.
typedef vectorSIMDf vector4df_SIMD;
typedef vectorSIMDf vector3df_SIMD;
typedef vectorSIMDf vector2df_SIMD;
template <class T>
class vectorSIMD_32 : public SIMD_32bitSwizzleAble<vectorSIMD_32<T>,__m128i>
{
public:
//! Default constructor (null vector — all four lanes zeroed).
inline vectorSIMD_32() {_mm_store_si128((__m128i*)pointer,_mm_setzero_si128());}
//! Construct from an unaligned array of at least four T's.
inline vectorSIMD_32(T* const &array) {_mm_store_si128((__m128i*)pointer,_mm_loadu_si128((__m128i*)array));}
//! Construct from a 16-byte-aligned array (ALIGNED is a tag parameter; misaligned input WILL CRASH).
inline vectorSIMD_32(T* const &array, bool ALIGNED) {_mm_store_si128((__m128i*)pointer,_mm_load_si128((__m128i*)array));}
//! Fastest and most natural constructor
//! BUGFIX: the parameter was forum-mangled to "®" (the HTML entity &reg; for
//! "&reg"), which does not compile; restored to a const reference named reg.
inline vectorSIMD_32(const __m128i &reg) {_mm_store_si128((__m128i*)pointer,reg);}
//! Constructor with the same value for all elements
//! (broadcasts the 32-bit pattern of n via a float broadcast load — assumes sizeof(T)==4).
inline explicit vectorSIMD_32(const T &n) {_mm_store_si128((__m128i*)pointer,_mm_castps_si128(_mm_load_ps1((float*)&n)));}
//! Copy constructor
inline vectorSIMD_32(const vectorSIMD_32<T>& other) {_mm_store_si128((__m128i*)pointer,other.getAsRegister());}
//! Aligned allocation so that heap-allocated vectors satisfy SIMD_ALIGNMENT
//! (plain malloc/new only guarantee 8/16-byte alignment on some platforms).
//! BUGFIX: the original ignored allocation failure and could return NULL from
//! a non-nothrow operator new, which violates the allocation contract — callers
//! would dereference NULL instead of catching std::bad_alloc. Failure now throws.
static inline void* operator new(size_t size) throw(std::bad_alloc)
{
void *memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
memoryallocatedaligned = _aligned_malloc(size,SIMD_ALIGNMENT);
#else
if (posix_memalign((void**)&memoryallocatedaligned,SIMD_ALIGNMENT,size)!=0)
memoryallocatedaligned = 0; // posix_memalign leaves the pointer unspecified on failure
#endif
if (!memoryallocatedaligned)
throw std::bad_alloc();
return memoryallocatedaligned;
}
//! Matching deallocation — must pair with the allocator used above.
static inline void operator delete(void* ptr)
{
#ifdef _IRR_WINDOWS_
_aligned_free(ptr);
#else
free(ptr);
#endif
}
//! Aligned array allocation — same contract and bugfix as the scalar form.
static inline void* operator new[](size_t size) throw(std::bad_alloc)
{
void *memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
memoryallocatedaligned = _aligned_malloc(size,SIMD_ALIGNMENT);
#else
if (posix_memalign((void**)&memoryallocatedaligned,SIMD_ALIGNMENT,size)!=0)
memoryallocatedaligned = 0;
#endif
if (!memoryallocatedaligned)
throw std::bad_alloc();
return memoryallocatedaligned;
}
static inline void operator delete[](void* ptr) throw()
{
#ifdef _IRR_WINDOWS_
_aligned_free(ptr);
#else
free(ptr);
#endif
}
//! Placement-new overloads: restore construct-at-address semantics hidden by
//! the class-level aligned operator new above. They return the supplied pointer
//! unchanged; the matching deletes are no-ops (invoked only if a ctor throws).
static inline void* operator new(std::size_t size,void* p) throw(std::bad_alloc)
{
return p;
}
static inline void operator delete(void* p,void* t) throw() {}
static inline void* operator new[](std::size_t size,void* p) throw(std::bad_alloc)
{
return p;
}
static inline void operator delete[](void* p,void* t) throw() {}
/*
inline vectorSIMDf(const vectorSIMDu32& other);
inline vectorSIMDf(const vectorSIMDi32& other);
inline vectorSIMDf(const vectorSIMDu16& other);
inline vectorSIMDf(const vectorSIMDi16& other);
**/
//! Copy assignment: stores the other vector's register over this object's storage.
inline vectorSIMD_32<T>& operator=(const vectorSIMD_32<T>& other) { _mm_store_si128((__m128i*)pointer,other.getAsRegister()); return *this; }
//! bitwise ops — integer AND/OR/XOR on the four 32-bit lanes.
//! NOTE(review): non-const members although they do not modify *this (mirrors vectorSIMDf).
inline vectorSIMD_32<T> operator&(const vectorSIMD_32<T>& other) {return _mm_and_si128(getAsRegister(),other.getAsRegister());}
inline vectorSIMD_32<T> operator|(const vectorSIMD_32<T>& other) {return _mm_or_si128(getAsRegister(),other.getAsRegister());}
inline vectorSIMD_32<T> operator^(const vectorSIMD_32<T>& other) {return _mm_xor_si128(getAsRegister(),other.getAsRegister());}
//! in case you want to do your own SSE — aligned load of the four lanes.
inline __m128i getAsRegister() const {return _mm_load_si128((__m128i*)pointer);}
/*
// operators against vectors
inline vectorSIMD_32<T> operator-() const { return _mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000u)),getAsRegister()); }
inline vectorSIMD_32<T> operator+(const vectorSIMD_32<T>& other) const { return _mm_add_ps(other.getAsRegister(),getAsRegister()); }
inline vectorSIMD_32<T>& operator+=(const vectorSIMD_32<T>& other) { _mm_store_ps(pointer,_mm_add_ps(other.getAsRegister(),getAsRegister())); return *this; }
inline vectorSIMD_32<T> operator-(const vectorSIMD_32<T>& other) const { return _mm_sub_ps(getAsRegister(),other.getAsRegister()); }
inline vectorSIMD_32<T>& operator-=(const vectorSIMD_32<T>& other) { _mm_store_ps(pointer,_mm_sub_ps(getAsRegister(),other.getAsRegister())); return *this; }
inline vectorSIMDf operator*(const vectorSIMDf& other) const { return _mm_mul_ps(getAsRegister(),other.getAsRegister()); }
inline vectorSIMD_32<T> operator*(const vectorSIMD_32<T>& other) const { return _mm_mul_ps(getAsRegister(),other.getAsRegister()); }
inline vectorSIMD_32<T>& operator*=(const vectorSIMD_32<T>& other) { _mm_store_ps(pointer,_mm_mul_ps(getAsRegister(),other.getAsRegister())); return *this; }
inline vectorSIMDf operator/(const vectorSIMDf& other) const { return preciseDivision(other); }
inline vectorSIMD_32<T> operator/(const vectorSIMD_32<T>& other) const { return preciseDivision(other); }
inline vectorSIMD_32<T>& operator/=(const vectorSIMD_32<T>& other) { (*this) = preciseDivision(other); return *this; }
/*
//operators against scalars
inline vectorSIMDf operator+(const float &val) const { return (*this)+vectorSIMDf(val); }
inline vectorSIMDf& operator+=(const float &val) { return ( (*this) += vectorSIMDf(val) ); }
inline vectorSIMDf operator-(const float &val) const { return (*this)-vectorSIMDf(val); }
inline vectorSIMDf& operator-=(const float &val) { return ( (*this) -= vectorSIMDf(val) ); }
inline vectorSIMDf operator*(const float &val) const { return (*this)*vectorSIMDf(val); }
inline vectorSIMDf& operator*=(const float &val) { return ( (*this) *= vectorSIMDf(val) ); }
#ifdef IRRLICHT_FAST_MATH
inline vectorSIMDf operator/(const float &v) const { return vectorSIMDf(_mm_mul_ps(_mm_rcp_ps(_mm_load_ps1(&v)),getAsRegister())); }
inline vectorSIMDf& operator/=(const float &v) { _mm_store_ps(pointer,_mm_mul_ps(_mm_rcp_ps(_mm_load_ps1(&v)),getAsRegister())); return *this; }
#else
inline vectorSIMDf operator/(const float &v) const { return vectorSIMDf(_mm_div_ps(getAsRegister(),_mm_load_ps1(&v))); }
inline vectorSIMDf& operator/=(const float &v) { _mm_store_ps(pointer,_mm_div_ps(getAsRegister(),_mm_load_ps1(&v))); return *this; }
#endif
//! I AM BREAKING IRRLICHT'S COMPARISON OPERATORS
inline vector4db_SIMD operator<=(const vectorSIMDf& other) const
{
return _mm_cmple_ps(getAsRegister(),other.getAsRegister());
}
inline vector4db_SIMD operator>=(const vectorSIMDf& other) const
{
return _mm_cmpge_ps(getAsRegister(),other.getAsRegister());
}
inline vector4db_SIMD operator<(const vectorSIMDf& other) const
{
return _mm_cmplt_ps(getAsRegister(),other.getAsRegister());
}
inline vector4db_SIMD operator>(const vectorSIMDf& other) const
{
return _mm_cmpgt_ps(getAsRegister(),other.getAsRegister());
}
//! only the method that returns bool confirms if two vectors are exactly the same
inline vectorSIMDf operator==(const vectorSIMDf& other) const
{
return _mm_cmpeq_ps(getAsRegister(),other.getAsRegister());
}
inline vectorSIMDf operator!=(const vectorSIMDf& other) const
{
return _mm_cmpneq_ps(getAsRegister(),other.getAsRegister());
}
// functions
//! zeroes out out of range components (useful before performing a dot product so it doesnt get polluted with random values)
//! WARNING IT DOES COST CYCLES
inline void makeSafe2D(void) {_mm_store_ps(pointer,_mm_and_ps(_mm_load_ps(pointer),_mm_castsi128_ps(_mm_set_epi32(0,0,-1,-1))));}
inline void makeSafe3D(void) {_mm_store_ps(pointer,_mm_and_ps(_mm_load_ps(pointer),_mm_castsi128_ps(_mm_set_epi32(0,-1,-1,-1))));}
//! slightly faster than memcpy'ing into the pointers
inline vectorSIMDf& set(float* const &array) {_mm_store_ps(pointer,_mm_loadu_ps(array)); return *this;}
//! FASTEST WAY TO SET VALUES, Address has to be aligned to 16bytes OR WILL CRASH
inline vectorSIMDf& set(float* const &array, bool ALIGNED) {_mm_store_ps(pointer,_mm_load_ps(array));}
//! normal set() like vector3df's, but for different dimensional vectors
inline vectorSIMDf& set(const float &nx, const float &ny, const float &nz, const float &nw) {_mm_store_ps(pointer,_mm_set_ps(nw,nz,ny,nx)); return *this;}
inline vectorSIMDf& set(const float &nx, const float &ny, const float &nz) {_mm_store_ps(pointer,_mm_set_ps(0.f,nz,ny,nx)); return *this;}
inline vectorSIMDf& set(const float &nx, const float &ny) {_mm_store_ps(pointer,_mm_set_ps(0.f,0.f,ny,nx)); return *this;}
inline vectorSIMDf& set(const vectorSIMDf& p) {_mm_store_ps(pointer,p.getAsRegister()); return *this;}
//! convert from vectorNdf types of irrlicht - it will read a few values past the range of the allocated memory but _mm_loadu_ps shouldnt have that kind of protection
inline vectorSIMDf& set(const vector3df &p) {_mm_store_ps(pointer,_mm_loadu_ps(&p.X)); makeSafe3D(); return *this;}
inline vectorSIMDf& set(const vector2df &p) {_mm_store_ps(pointer,_mm_loadu_ps(&p.X)); makeSafe2D(); return *this;}
//! going directly from vectorSIMD to irrlicht types is safe cause vectorSIMDf is wider
inline vector2df& getAsVector2df(void) const
{
return *((vector2df*)pointer);
}
inline vector3df& getAsVector3df(void) const
{
return *((vector3df*)pointer);
}
//! Get length of the vector.
inline float getLengthAsFloat() const
{
__m128 xmm0 = getAsRegister();
float result;/*
#ifdef __IRR_COMPILE_WITH_SSE4_1
xmm0 = _mm_dp_ps(xmm0,xmm0,);
xmm0 = _mm_sqrt_ps(xmm0);
#error "Implementation in >=SSE4.1 not ready yet"
#elif __IRR_COMPILE_WITH_SSE3*/ /*
#ifdef __IRR_COMPILE_WITH_SSE3
xmm0 = _mm_mul_ps(xmm0,xmm0);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
xmm0 = _mm_sqrt_ps(_mm_hadd_ps(xmm0,xmm0));
_mm_store_ss(&result,xmm0);
return result;
#elif defined(__IRR_COMPILE_WITH_SSE2)
xmm0 = _mm_mul_ps(xmm0,xmm0);
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
xmm0 = _mm_sqrt_ps(xmm0);
_mm_store_ss(&result,xmm0);
return result;
#endif
}
//! Useful when you have to divide a vector by another vector's length (so you dont convert/store to a scalar)
//! all components are filled with length
//! if you need something else, you can get the register and shuffle
inline vectorSIMDf getLength() const
{
__m128 xmm0 = getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE3
xmm0 = _mm_mul_ps(xmm0,xmm0);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
return _mm_sqrt_ps(_mm_hadd_ps(xmm0,xmm0));
#elif defined(__IRR_COMPILE_WITH_SSE2)
xmm0 = _mm_mul_ps(xmm0,xmm0);
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
return _mm_sqrt_ps(xmm0);
#endif
}
//! Get the dot product with another vector.
inline float dotProductAsInt(const vectorSIMDf& other) const
{
float result;
__m128 xmm0 = getAsRegister();
__m128 xmm1 = other.getAsRegister();/*
#ifdef __IRR_COMPILE_WITH_SSE4_1
xmm0 = _mm_dp_ps(xmm0,xmm1,);
#error "Implementation in >=SSE4.1 not ready yet"
#elif __IRR_COMPILE_WITH_SSE3*/ /*
#ifdef __IRR_COMPILE_WITH_SSE3
xmm0 = _mm_mul_ps(xmm0,xmm1);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
_mm_store_ss(&result,xmm0);
return result;
#elif defined(__IRR_COMPILE_WITH_SSE2)
xmm0 = _mm_mul_ps(xmm0,xmm1);
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
_mm_store_ss(&result,xmm0);
return result;
#endif
}
inline vectorSIMDf dotProduct(const vectorSIMDf& other) const
{
__m128 xmm0 = getAsRegister();
__m128 xmm1 = other.getAsRegister();/*
#ifdef __IRR_COMPILE_WITH_SSE4_1
xmm0 = _mm_dp_ps(xmm0,xmm1,);
#error "Implementation in >=SSE4.1 not ready yet"
#elif __IRR_COMPILE_WITH_SSE3*/ /*
#ifdef __IRR_COMPILE_WITH_SSE3
xmm0 = _mm_mul_ps(xmm0,xmm1);
xmm0 = _mm_hadd_ps(xmm0,xmm0);
return _mm_hadd_ps(xmm0,xmm0);
#elif defined(__IRR_COMPILE_WITH_SSE2)
xmm0 = _mm_mul_ps(xmm0,xmm1);
xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
return _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
#endif
}
//! Get squared length of the vector.
/** This is useful because it is much faster than getLength().
\return Squared length of the vector. *
inline float getLengthSQAsFloat() const
{
float result;
_mm_store_ss(&result,dotProduct(*this).getAsRegister());
return result;
}
//! Useful when you have to divide a vector by another vector's length (so you dont convert/store to a scalar)
inline vectorSIMDf getLengthSQ() const
{
return dotProduct(*this);
}
//! Get distance from another point.
/** Here, the vector is interpreted as point in 3 dimensional space. *
inline float getDistanceFromAsFloat(const vectorSIMDf& other) const
{
float result;
_mm_store_ss(&result,((*this)-other).getLength().getAsRegister());
return result;
}
inline vectorSIMDf getDistanceFrom(const vectorSIMDf& other) const
{
return ((*this)-other).getLength();
}
//! Returns squared distance from another point.
/** Here, the vector is interpreted as point in 3 dimensional space. *
inline uint32_t getDistanceFromSQAsFloat(const vectorSIMDf& other) const
{
float result;
_mm_store_ss(&result,((*this)-other).getLengthSQ().getAsRegister());
return result;
}
inline uint32_t getDistanceFromSQ(const vectorSIMDf& other) const
{
return ((*this)-other).getLengthSQ();
}
//! Calculates the cross product with another vector.
/** \param p Vector to multiply with.
\return Crossproduct of this vector with p. *
inline vectorSIMDf crossProduct(const vectorSIMDf& p) const
{
__m128 xmm0 = getAsRegister();
__m128 xmm1 = p.getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE2 //! SSE2 implementation is faster than previous SSE3 implementation
__m128 backslash = _mm_mul_ps(FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,2,1)),FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,1,0,2)));
__m128 forwardslash = _mm_mul_ps(FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,1,0,2)),FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,0,2,1)));
return _mm_sub_ps(backslash,forwardslash); //returns 0 in the last component :D
#endif
}
//! Inverts the vector.
inline vectorSIMDf& invert()
{
_mm_store_ps(pointer,_mm_xor_ps(_mm_castsi128_ps(_mm_set_epi32(0x80000000u,0x80000000u,0x80000000u,0x80000000u)),getAsRegister()));
return *this;
}
//! Returns component-wise absolute value of a
inline vectorSIMDf abs(const vectorSIMDf& a) const
{
return _mm_and_ps(a.getAsRegister(),_mm_castsi128_ps(_mm_set_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)));
}
//! Returns component-wise absolute value of itself
inline vectorSIMDf getAbsoluteValue() const
{
return abs(*this);
}
*/
//! Overlapping views of the same 16 bytes of component storage; every member aliases.
//! NOTE(review): anonymous structs inside a union are a widespread compiler extension,
//! not standard C++ — fine on MSVC/GCC/Clang, verify for any other target.
#ifdef _IRR_WINDOWS_
// MSVC spelling of the alignment request (the GCC/Clang spelling follows the class body below).
__declspec(align(SIMD_ALIGNMENT)) union
#else
union
#endif
{
// classic Irrlicht-style upper-case component names
struct{
T X; T Y; T Z; T W;
};
// GLSL-style position names
struct{
T x; T y; T z; T w;
};
// color-component view
struct{
T r; T g; T b; T a;
};
// texture-coordinate view
struct{
T s; T t; T p; T q;
};
// raw array view, handed to _mm_store_ps/_mm_load-style intrinsics
T pointer[4];
};
#ifdef _IRR_WINDOWS_
};
#else
} __attribute__ ((__aligned__(SIMD_ALIGNMENT)));
#endif
/*
class vectorSIMDi32 : public vectorSIMD_32<int32_t>
{
//! Constructor with four different values, FASTEST IF the values are constant literals
//yes this is correct usage with _mm_set_**(), due to little endianness the thing gets set in "reverse" order
inline explicit vectorSIMDi32(const int32_t &nx, const int32_t &ny, const int32_t &nz, const int32_t &nw) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(nw,nz,ny,nx));}
//! 3d constructor
inline explicit vectorSIMDi32(const int32_t &nx, const int32_t &ny, const int32_t &nz) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(0,nz,ny,nx));}
//! 2d constructor
inline explicit vectorSIMDi32(const int32_t &nx, const int32_t &ny) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(0,0,ny,nx));}
};
class vectorSIMDu32 : public vectorSIMD_32<uint32_t>
{
//! Constructor with four different values, FASTEST IF the values are constant literals
//yes this is correct usage with _mm_set_**(), due to little endianness the thing gets set in "reverse" order
inline explicit vectorSIMDu32(const uint32_t &nx, const uint32_t &ny, const uint32_t &nz, const uint32_t &nw) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32((const int32_t&)nw,(const int32_t&)nz,(const int32_t&)ny,(const int32_t&)nx));}
//! 3d constructor
inline explicit vectorSIMDu32(const uint32_t &nx, const uint32_t &ny, const uint32_t &nz) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(0,(const int32_t&)nz,(const int32_t&)ny,(const int32_t&)nx));}
//! 2d constructor
inline explicit vectorSIMDu32(const uint32_t &nx, const uint32_t &ny) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(0,0,(const int32_t&)ny,(const int32_t&)nx));}
};
/*
inline vectorSIMDi32 mix(const vectorSIMDi32& a, const vectorSIMDi32& b, const vectorSIMDf& t)
{
return a+(b-a)*t;
}
inline vectorSIMDi32 lerp(const vectorSIMDi32& a, const vectorSIMDi32& b, const vectorSIMDf& t)
{
return mix(a,b,t);
}
template<>
inline vectorSIMDi32 max_(const vectorSIMDi32& a, const vectorSIMDi32& b)
{
return _mm_max_ps(a.getAsRegister(),b.getAsRegister());
}
template<>
inline vectorSIMDi32 min_(const vectorSIMDi32& a, const vectorSIMDi32& b)
{
return _mm_min_ps(a.getAsRegister(),b.getAsRegister());
}
inline vectorSIMDi32 clamp(const vectorSIMDi32& value, const vectorSIMDi32& low, const vectorSIMDi32& high)
{
return min_(max_(value,low),high);
}
//! Typedef for an integer 3d vector.
typedef vectorSIMDu32 vector4du32_SIMD;
typedef vectorSIMDu32 vector3du32_SIMD;
typedef vectorSIMDu32 vector2du32_SIMD;
typedef vectorSIMDi32 vector4di32_SIMD;
typedef vectorSIMDi32 vector3di32_SIMD;
typedef vectorSIMDi32 vector2di32_SIMD;
typedef vectorSIMDu16 vector8du16_SIMD;
typedef vectorSIMDu16 vector7du16_SIMD;
typedef vectorSIMDu16 vector6du16_SIMD;
typedef vectorSIMDu16 vector5du16_SIMD;
typedef vectorSIMDu16 vector4du16_SIMD;
typedef vectorSIMDu16 vector3du16_SIMD;
typedef vectorSIMDu16 vector2du16_SIMD;
typedef vectorSIMDi16 vector8di16_SIMD;
typedef vectorSIMDi16 vector7di16_SIMD;
typedef vectorSIMDi16 vector6di16_SIMD;
typedef vectorSIMDi16 vector5di16_SIMD;
typedef vectorSIMDi16 vector4di16_SIMD;
typedef vectorSIMDi16 vector3di16_SIMD;
typedef vectorSIMDi16 vector2di16_SIMD;*/
} // end namespace core
} // end namespace irr
#endif
#endif
Last edited by devsh on Fri May 01, 2015 12:15 pm, edited 4 times in total.
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
Support file SIMDswizzle.h for the vectorSIMDf:
(sorry forum completely freaked out when I used the code frame)
(sorry forum completely freaked out when I used the code frame)
#ifndef _SIMD_SWIZZLE_H_
#define _SIMD_SWIZZLE_H_
//! CRTP mix-in providing all 256 GLSL-style 4-component swizzles (e.g. v.xyzw(), v.wwww()).
/** T is the concrete vector class (must provide getAsRegister() returning X),
X is the underlying SIMD register type (__m128 or __m128i). shuffleFunc is only
declared here; an explicit specialization per (T,X) pair supplies the actual
shuffle instruction. Letter-to-lane mapping: x=0, y=1, z=2, w=3; _MM_SHUFFLE
takes the selectors in (w,z,y,x) order, i.e. reversed relative to the name.
FIX(review): yxxx() used _MM_SHUFFLE(0,0,0,0) (which is xxxx) — corrected to
_MM_SHUFFLE(0,0,0,1). */
template <class T, class X>
class SIMD_32bitSwizzleAble
{
template<int mask>
inline X shuffleFunc(X reg) const;
public:
inline T xxxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,0,0)>(((T*)this)->getAsRegister());}
inline T xxxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,0,0)>(((T*)this)->getAsRegister());}
inline T xxxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,0,0)>(((T*)this)->getAsRegister());}
inline T xxxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,0,0)>(((T*)this)->getAsRegister());}
inline T xxyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,0,0)>(((T*)this)->getAsRegister());}
inline T xxyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,0,0)>(((T*)this)->getAsRegister());}
inline T xxyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,0,0)>(((T*)this)->getAsRegister());}
inline T xxyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,0,0)>(((T*)this)->getAsRegister());}
inline T xxzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,0,0)>(((T*)this)->getAsRegister());}
inline T xxzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,0,0)>(((T*)this)->getAsRegister());}
inline T xxzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,0,0)>(((T*)this)->getAsRegister());}
inline T xxzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,0,0)>(((T*)this)->getAsRegister());}
inline T xxwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,0,0)>(((T*)this)->getAsRegister());}
inline T xxwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,0,0)>(((T*)this)->getAsRegister());}
inline T xxwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,0,0)>(((T*)this)->getAsRegister());}
inline T xxww() const {return shuffleFunc<_MM_SHUFFLE(3,3,0,0)>(((T*)this)->getAsRegister());}
inline T xyxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,1,0)>(((T*)this)->getAsRegister());}
inline T xyxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,1,0)>(((T*)this)->getAsRegister());}
inline T xyxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,1,0)>(((T*)this)->getAsRegister());}
inline T xyxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,1,0)>(((T*)this)->getAsRegister());}
inline T xyyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,1,0)>(((T*)this)->getAsRegister());}
inline T xyyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,1,0)>(((T*)this)->getAsRegister());}
inline T xyyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,1,0)>(((T*)this)->getAsRegister());}
inline T xyyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,1,0)>(((T*)this)->getAsRegister());}
inline T xyzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,1,0)>(((T*)this)->getAsRegister());}
inline T xyzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,1,0)>(((T*)this)->getAsRegister());}
inline T xyzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,1,0)>(((T*)this)->getAsRegister());}
inline T xyzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,1,0)>(((T*)this)->getAsRegister());}
inline T xywx() const {return shuffleFunc<_MM_SHUFFLE(0,3,1,0)>(((T*)this)->getAsRegister());}
inline T xywy() const {return shuffleFunc<_MM_SHUFFLE(1,3,1,0)>(((T*)this)->getAsRegister());}
inline T xywz() const {return shuffleFunc<_MM_SHUFFLE(2,3,1,0)>(((T*)this)->getAsRegister());}
inline T xyww() const {return shuffleFunc<_MM_SHUFFLE(3,3,1,0)>(((T*)this)->getAsRegister());}
inline T xzxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,2,0)>(((T*)this)->getAsRegister());}
inline T xzxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,2,0)>(((T*)this)->getAsRegister());}
inline T xzxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,2,0)>(((T*)this)->getAsRegister());}
inline T xzxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,2,0)>(((T*)this)->getAsRegister());}
inline T xzyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,2,0)>(((T*)this)->getAsRegister());}
inline T xzyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,2,0)>(((T*)this)->getAsRegister());}
inline T xzyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,2,0)>(((T*)this)->getAsRegister());}
inline T xzyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,2,0)>(((T*)this)->getAsRegister());}
inline T xzzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,2,0)>(((T*)this)->getAsRegister());}
inline T xzzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,2,0)>(((T*)this)->getAsRegister());}
inline T xzzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,2,0)>(((T*)this)->getAsRegister());}
inline T xzzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,2,0)>(((T*)this)->getAsRegister());}
inline T xzwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,2,0)>(((T*)this)->getAsRegister());}
inline T xzwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,2,0)>(((T*)this)->getAsRegister());}
inline T xzwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,2,0)>(((T*)this)->getAsRegister());}
inline T xzww() const {return shuffleFunc<_MM_SHUFFLE(3,3,2,0)>(((T*)this)->getAsRegister());}
inline T xwxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,3,0)>(((T*)this)->getAsRegister());}
inline T xwxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,3,0)>(((T*)this)->getAsRegister());}
inline T xwxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,3,0)>(((T*)this)->getAsRegister());}
inline T xwxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,3,0)>(((T*)this)->getAsRegister());}
inline T xwyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,3,0)>(((T*)this)->getAsRegister());}
inline T xwyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,3,0)>(((T*)this)->getAsRegister());}
inline T xwyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,3,0)>(((T*)this)->getAsRegister());}
inline T xwyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,3,0)>(((T*)this)->getAsRegister());}
inline T xwzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,3,0)>(((T*)this)->getAsRegister());}
inline T xwzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,3,0)>(((T*)this)->getAsRegister());}
inline T xwzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,3,0)>(((T*)this)->getAsRegister());}
inline T xwzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,3,0)>(((T*)this)->getAsRegister());}
inline T xwwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,3,0)>(((T*)this)->getAsRegister());}
inline T xwwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,3,0)>(((T*)this)->getAsRegister());}
inline T xwwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,3,0)>(((T*)this)->getAsRegister());}
inline T xwww() const {return shuffleFunc<_MM_SHUFFLE(3,3,3,0)>(((T*)this)->getAsRegister());}
// FIX(review): was _MM_SHUFFLE(0,0,0,0), which is xxxx, not yxxx.
inline T yxxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,0,1)>(((T*)this)->getAsRegister());}
inline T yxxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,0,1)>(((T*)this)->getAsRegister());}
inline T yxxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,0,1)>(((T*)this)->getAsRegister());}
inline T yxxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,0,1)>(((T*)this)->getAsRegister());}
inline T yxyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,0,1)>(((T*)this)->getAsRegister());}
inline T yxyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,0,1)>(((T*)this)->getAsRegister());}
inline T yxyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,0,1)>(((T*)this)->getAsRegister());}
inline T yxyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,0,1)>(((T*)this)->getAsRegister());}
inline T yxzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,0,1)>(((T*)this)->getAsRegister());}
inline T yxzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,0,1)>(((T*)this)->getAsRegister());}
inline T yxzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,0,1)>(((T*)this)->getAsRegister());}
inline T yxzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,0,1)>(((T*)this)->getAsRegister());}
inline T yxwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,0,1)>(((T*)this)->getAsRegister());}
inline T yxwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,0,1)>(((T*)this)->getAsRegister());}
inline T yxwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,0,1)>(((T*)this)->getAsRegister());}
inline T yxww() const {return shuffleFunc<_MM_SHUFFLE(3,3,0,1)>(((T*)this)->getAsRegister());}
inline T yyxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,1,1)>(((T*)this)->getAsRegister());}
inline T yyxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,1,1)>(((T*)this)->getAsRegister());}
inline T yyxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,1,1)>(((T*)this)->getAsRegister());}
inline T yyxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,1,1)>(((T*)this)->getAsRegister());}
inline T yyyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,1,1)>(((T*)this)->getAsRegister());}
inline T yyyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,1,1)>(((T*)this)->getAsRegister());}
inline T yyyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,1,1)>(((T*)this)->getAsRegister());}
inline T yyyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,1,1)>(((T*)this)->getAsRegister());}
inline T yyzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,1,1)>(((T*)this)->getAsRegister());}
inline T yyzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,1,1)>(((T*)this)->getAsRegister());}
inline T yyzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,1,1)>(((T*)this)->getAsRegister());}
inline T yyzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,1,1)>(((T*)this)->getAsRegister());}
inline T yywx() const {return shuffleFunc<_MM_SHUFFLE(0,3,1,1)>(((T*)this)->getAsRegister());}
inline T yywy() const {return shuffleFunc<_MM_SHUFFLE(1,3,1,1)>(((T*)this)->getAsRegister());}
inline T yywz() const {return shuffleFunc<_MM_SHUFFLE(2,3,1,1)>(((T*)this)->getAsRegister());}
inline T yyww() const {return shuffleFunc<_MM_SHUFFLE(3,3,1,1)>(((T*)this)->getAsRegister());}
inline T yzxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,2,1)>(((T*)this)->getAsRegister());}
inline T yzxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,2,1)>(((T*)this)->getAsRegister());}
inline T yzxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,2,1)>(((T*)this)->getAsRegister());}
inline T yzxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,2,1)>(((T*)this)->getAsRegister());}
inline T yzyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,2,1)>(((T*)this)->getAsRegister());}
inline T yzyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,2,1)>(((T*)this)->getAsRegister());}
inline T yzyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,2,1)>(((T*)this)->getAsRegister());}
inline T yzyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,2,1)>(((T*)this)->getAsRegister());}
inline T yzzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,2,1)>(((T*)this)->getAsRegister());}
inline T yzzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,2,1)>(((T*)this)->getAsRegister());}
inline T yzzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,2,1)>(((T*)this)->getAsRegister());}
inline T yzzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,2,1)>(((T*)this)->getAsRegister());}
inline T yzwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,2,1)>(((T*)this)->getAsRegister());}
inline T yzwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,2,1)>(((T*)this)->getAsRegister());}
inline T yzwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,2,1)>(((T*)this)->getAsRegister());}
inline T yzww() const {return shuffleFunc<_MM_SHUFFLE(3,3,2,1)>(((T*)this)->getAsRegister());}
inline T ywxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,3,1)>(((T*)this)->getAsRegister());}
inline T ywxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,3,1)>(((T*)this)->getAsRegister());}
inline T ywxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,3,1)>(((T*)this)->getAsRegister());}
inline T ywxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,3,1)>(((T*)this)->getAsRegister());}
inline T ywyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,3,1)>(((T*)this)->getAsRegister());}
inline T ywyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,3,1)>(((T*)this)->getAsRegister());}
inline T ywyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,3,1)>(((T*)this)->getAsRegister());}
inline T ywyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,3,1)>(((T*)this)->getAsRegister());}
inline T ywzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,3,1)>(((T*)this)->getAsRegister());}
inline T ywzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,3,1)>(((T*)this)->getAsRegister());}
inline T ywzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,3,1)>(((T*)this)->getAsRegister());}
inline T ywzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,3,1)>(((T*)this)->getAsRegister());}
inline T ywwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,3,1)>(((T*)this)->getAsRegister());}
inline T ywwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,3,1)>(((T*)this)->getAsRegister());}
inline T ywwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,3,1)>(((T*)this)->getAsRegister());}
inline T ywww() const {return shuffleFunc<_MM_SHUFFLE(3,3,3,1)>(((T*)this)->getAsRegister());}
inline T zxxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,0,2)>(((T*)this)->getAsRegister());}
inline T zxxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,0,2)>(((T*)this)->getAsRegister());}
inline T zxxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,0,2)>(((T*)this)->getAsRegister());}
inline T zxxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,0,2)>(((T*)this)->getAsRegister());}
inline T zxyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,0,2)>(((T*)this)->getAsRegister());}
inline T zxyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,0,2)>(((T*)this)->getAsRegister());}
inline T zxyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,0,2)>(((T*)this)->getAsRegister());}
inline T zxyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,0,2)>(((T*)this)->getAsRegister());}
inline T zxzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,0,2)>(((T*)this)->getAsRegister());}
inline T zxzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,0,2)>(((T*)this)->getAsRegister());}
inline T zxzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,0,2)>(((T*)this)->getAsRegister());}
inline T zxzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,0,2)>(((T*)this)->getAsRegister());}
inline T zxwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,0,2)>(((T*)this)->getAsRegister());}
inline T zxwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,0,2)>(((T*)this)->getAsRegister());}
inline T zxwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,0,2)>(((T*)this)->getAsRegister());}
inline T zxww() const {return shuffleFunc<_MM_SHUFFLE(3,3,0,2)>(((T*)this)->getAsRegister());}
inline T zyxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,1,2)>(((T*)this)->getAsRegister());}
inline T zyxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,1,2)>(((T*)this)->getAsRegister());}
inline T zyxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,1,2)>(((T*)this)->getAsRegister());}
inline T zyxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,1,2)>(((T*)this)->getAsRegister());}
inline T zyyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,1,2)>(((T*)this)->getAsRegister());}
inline T zyyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,1,2)>(((T*)this)->getAsRegister());}
inline T zyyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,1,2)>(((T*)this)->getAsRegister());}
inline T zyyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,1,2)>(((T*)this)->getAsRegister());}
inline T zyzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,1,2)>(((T*)this)->getAsRegister());}
inline T zyzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,1,2)>(((T*)this)->getAsRegister());}
inline T zyzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,1,2)>(((T*)this)->getAsRegister());}
inline T zyzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,1,2)>(((T*)this)->getAsRegister());}
inline T zywx() const {return shuffleFunc<_MM_SHUFFLE(0,3,1,2)>(((T*)this)->getAsRegister());}
inline T zywy() const {return shuffleFunc<_MM_SHUFFLE(1,3,1,2)>(((T*)this)->getAsRegister());}
inline T zywz() const {return shuffleFunc<_MM_SHUFFLE(2,3,1,2)>(((T*)this)->getAsRegister());}
inline T zyww() const {return shuffleFunc<_MM_SHUFFLE(3,3,1,2)>(((T*)this)->getAsRegister());}
inline T zzxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,2,2)>(((T*)this)->getAsRegister());}
inline T zzxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,2,2)>(((T*)this)->getAsRegister());}
inline T zzxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,2,2)>(((T*)this)->getAsRegister());}
inline T zzxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,2,2)>(((T*)this)->getAsRegister());}
inline T zzyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,2,2)>(((T*)this)->getAsRegister());}
inline T zzyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,2,2)>(((T*)this)->getAsRegister());}
inline T zzyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,2,2)>(((T*)this)->getAsRegister());}
inline T zzyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,2,2)>(((T*)this)->getAsRegister());}
inline T zzzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,2,2)>(((T*)this)->getAsRegister());}
inline T zzzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,2,2)>(((T*)this)->getAsRegister());}
inline T zzzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,2,2)>(((T*)this)->getAsRegister());}
inline T zzzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,2,2)>(((T*)this)->getAsRegister());}
inline T zzwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,2,2)>(((T*)this)->getAsRegister());}
inline T zzwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,2,2)>(((T*)this)->getAsRegister());}
inline T zzwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,2,2)>(((T*)this)->getAsRegister());}
inline T zzww() const {return shuffleFunc<_MM_SHUFFLE(3,3,2,2)>(((T*)this)->getAsRegister());}
inline T zwxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,3,2)>(((T*)this)->getAsRegister());}
inline T zwxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,3,2)>(((T*)this)->getAsRegister());}
inline T zwxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,3,2)>(((T*)this)->getAsRegister());}
inline T zwxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,3,2)>(((T*)this)->getAsRegister());}
inline T zwyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,3,2)>(((T*)this)->getAsRegister());}
inline T zwyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,3,2)>(((T*)this)->getAsRegister());}
inline T zwyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,3,2)>(((T*)this)->getAsRegister());}
inline T zwyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,3,2)>(((T*)this)->getAsRegister());}
inline T zwzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,3,2)>(((T*)this)->getAsRegister());}
inline T zwzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,3,2)>(((T*)this)->getAsRegister());}
inline T zwzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,3,2)>(((T*)this)->getAsRegister());}
inline T zwzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,3,2)>(((T*)this)->getAsRegister());}
inline T zwwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,3,2)>(((T*)this)->getAsRegister());}
inline T zwwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,3,2)>(((T*)this)->getAsRegister());}
inline T zwwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,3,2)>(((T*)this)->getAsRegister());}
inline T zwww() const {return shuffleFunc<_MM_SHUFFLE(3,3,3,2)>(((T*)this)->getAsRegister());}
inline T wxxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,0,3)>(((T*)this)->getAsRegister());}
inline T wxxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,0,3)>(((T*)this)->getAsRegister());}
inline T wxxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,0,3)>(((T*)this)->getAsRegister());}
inline T wxxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,0,3)>(((T*)this)->getAsRegister());}
inline T wxyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,0,3)>(((T*)this)->getAsRegister());}
inline T wxyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,0,3)>(((T*)this)->getAsRegister());}
inline T wxyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,0,3)>(((T*)this)->getAsRegister());}
inline T wxyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,0,3)>(((T*)this)->getAsRegister());}
inline T wxzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,0,3)>(((T*)this)->getAsRegister());}
inline T wxzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,0,3)>(((T*)this)->getAsRegister());}
inline T wxzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,0,3)>(((T*)this)->getAsRegister());}
inline T wxzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,0,3)>(((T*)this)->getAsRegister());}
inline T wxwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,0,3)>(((T*)this)->getAsRegister());}
inline T wxwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,0,3)>(((T*)this)->getAsRegister());}
inline T wxwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,0,3)>(((T*)this)->getAsRegister());}
inline T wxww() const {return shuffleFunc<_MM_SHUFFLE(3,3,0,3)>(((T*)this)->getAsRegister());}
inline T wyxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,1,3)>(((T*)this)->getAsRegister());}
inline T wyxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,1,3)>(((T*)this)->getAsRegister());}
inline T wyxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,1,3)>(((T*)this)->getAsRegister());}
inline T wyxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,1,3)>(((T*)this)->getAsRegister());}
inline T wyyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,1,3)>(((T*)this)->getAsRegister());}
inline T wyyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,1,3)>(((T*)this)->getAsRegister());}
inline T wyyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,1,3)>(((T*)this)->getAsRegister());}
inline T wyyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,1,3)>(((T*)this)->getAsRegister());}
inline T wyzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,1,3)>(((T*)this)->getAsRegister());}
inline T wyzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,1,3)>(((T*)this)->getAsRegister());}
inline T wyzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,1,3)>(((T*)this)->getAsRegister());}
inline T wyzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,1,3)>(((T*)this)->getAsRegister());}
inline T wywx() const {return shuffleFunc<_MM_SHUFFLE(0,3,1,3)>(((T*)this)->getAsRegister());}
inline T wywy() const {return shuffleFunc<_MM_SHUFFLE(1,3,1,3)>(((T*)this)->getAsRegister());}
inline T wywz() const {return shuffleFunc<_MM_SHUFFLE(2,3,1,3)>(((T*)this)->getAsRegister());}
inline T wyww() const {return shuffleFunc<_MM_SHUFFLE(3,3,1,3)>(((T*)this)->getAsRegister());}
inline T wzxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,2,3)>(((T*)this)->getAsRegister());}
inline T wzxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,2,3)>(((T*)this)->getAsRegister());}
inline T wzxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,2,3)>(((T*)this)->getAsRegister());}
inline T wzxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,2,3)>(((T*)this)->getAsRegister());}
inline T wzyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,2,3)>(((T*)this)->getAsRegister());}
inline T wzyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,2,3)>(((T*)this)->getAsRegister());}
inline T wzyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,2,3)>(((T*)this)->getAsRegister());}
inline T wzyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,2,3)>(((T*)this)->getAsRegister());}
inline T wzzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,2,3)>(((T*)this)->getAsRegister());}
inline T wzzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,2,3)>(((T*)this)->getAsRegister());}
inline T wzzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,2,3)>(((T*)this)->getAsRegister());}
inline T wzzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,2,3)>(((T*)this)->getAsRegister());}
inline T wzwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,2,3)>(((T*)this)->getAsRegister());}
inline T wzwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,2,3)>(((T*)this)->getAsRegister());}
inline T wzwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,2,3)>(((T*)this)->getAsRegister());}
inline T wzww() const {return shuffleFunc<_MM_SHUFFLE(3,3,2,3)>(((T*)this)->getAsRegister());}
inline T wwxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,3,3)>(((T*)this)->getAsRegister());}
inline T wwxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,3,3)>(((T*)this)->getAsRegister());}
inline T wwxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,3,3)>(((T*)this)->getAsRegister());}
inline T wwxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,3,3)>(((T*)this)->getAsRegister());}
inline T wwyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,3,3)>(((T*)this)->getAsRegister());}
inline T wwyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,3,3)>(((T*)this)->getAsRegister());}
inline T wwyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,3,3)>(((T*)this)->getAsRegister());}
inline T wwyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,3,3)>(((T*)this)->getAsRegister());}
inline T wwzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,3,3)>(((T*)this)->getAsRegister());}
inline T wwzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,3,3)>(((T*)this)->getAsRegister());}
inline T wwzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,3,3)>(((T*)this)->getAsRegister());}
inline T wwzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,3,3)>(((T*)this)->getAsRegister());}
inline T wwwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,3,3)>(((T*)this)->getAsRegister());}
inline T wwwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,3,3)>(((T*)this)->getAsRegister());}
inline T wwwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,3,3)>(((T*)this)->getAsRegister());}
inline T wwww() const {return shuffleFunc<_MM_SHUFFLE(3,3,3,3)>(((T*)this)->getAsRegister());}
};
// Shuffle a float register through the integer domain: pshufd can shuffle from a
// single source register, whereas shufps needs the same register twice.
// NOTE(review): crossing the int/float execution domains may cost an extra cycle
// of bypass latency on some microarchitectures - confirm this is a win on targets we care about.
#define FAST_FLOAT_SHUFFLE(X,Y) _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(X),Y))
// shuffleFunc specialization for the float vector: one pshufd per swizzle call.
template <>
template <int mask>
inline __m128 SIMD_32bitSwizzleAble<vectorSIMDf,__m128>::shuffleFunc(__m128 reg) const
{
return FAST_FLOAT_SHUFFLE(reg,mask);
}
// shuffleFunc specialization for signed 32bit integer vectors (plain pshufd).
template <>
template <int mask>
inline __m128i SIMD_32bitSwizzleAble<vectorSIMD_32<int32_t>,__m128i>::shuffleFunc(__m128i reg) const
{
return _mm_shuffle_epi32(reg,mask);
}
// shuffleFunc specialization for unsigned 32bit integer vectors (identical codegen
// to the signed case - pshufd does not care about signedness).
template <>
template <int mask>
inline __m128i SIMD_32bitSwizzleAble<vectorSIMD_32<uint32_t>,__m128i>::shuffleFunc(__m128i reg) const
{
return _mm_shuffle_epi32(reg,mask);
}
#endif
Last edited by devsh on Fri May 01, 2015 12:34 pm, edited 4 times in total.
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
matrixSIMD4.h :
Code: Select all
// Copyright (C) 2002-2012 Nikolaus Gebhardt
// This file is part of the "Irrlicht Engine".
// For conditions of distribution and use, see copyright notice in irrlicht.h
// Bug fix: the guard used to be __IRR_MATRIX_H_INCLUDED__, which is the include
// guard of irrlicht's own matrix4.h - defining it here made the subsequent
// #include "matrix4.h" expand to nothing (or made this header a no-op, depending
// on include order). Use a name unique to this file instead.
#ifndef __IRR_MATRIX_SIMD4_H_INCLUDED__
#define __IRR_MATRIX_SIMD4_H_INCLUDED__
#define __IRR_COMPILE_WITH_X86_SIMD_
#ifdef __IRR_COMPILE_WITH_X86_SIMD_
#include "matrix4.h"
#include "vectorSIMD.h"
namespace irr
{
namespace core
{
//! 4x4 matrix. Mostly used as transformation matrix for 3d calculations.
/** Translations in the 4th column, this is laid out in memory in the completely opposite way to irrlicht matrix4. */
class matrixSIMD4
{
public:
//! Default constructor
/** \param constructor Choose the initialization style */
matrixSIMD4( matrix4::eConstructor constructor = matrix4::EM4CONST_IDENTITY );
//! Copy constructor
/** \param other Other matrix to copy from
\param constructor Choose the initialization style */
matrixSIMD4(const matrixSIMD4& other, matrix4::eConstructor constructor = matrix4::EM4CONST_COPY);
//! init from 4 row vectors
inline matrixSIMD4(const vectorSIMDf& row0,const vectorSIMDf& row1,const vectorSIMDf& row2,const vectorSIMDf& row3)
{
rows[0] = row0;
rows[1] = row1;
rows[2] = row2;
rows[3] = row3;
}
//! init from 16 floats
inline matrixSIMD4( const float& x0,const float& y0,const float& z0,const float& w0,
const float& x1,const float& y1,const float& z1,const float& w1,
const float& x2,const float& y2,const float& z2,const float& w2,
const float& x3,const float& y3,const float& z3,const float& w3)
{
rows[0] = _mm_set_ps(w0,z0,y0,x0);
rows[1] = _mm_set_ps(w1,z1,y1,x1);
rows[2] = _mm_set_ps(w2,z2,y2,x2);
rows[3] = _mm_set_ps(w3,z3,y3,x3);
}
//! init from 1 float
explicit matrixSIMD4( const float& scalar)
{
rows[0] = _mm_set1_ps(scalar);
rows[1] = _mm_set1_ps(scalar);
rows[2] = _mm_set1_ps(scalar);
rows[3] = _mm_set1_ps(scalar);
}
//! init from 1 float
explicit matrixSIMD4( const matrix4& retardedIrrlichtMatrix)
{
__m128 xmm0 = _mm_loadu_ps(retardedIrrlichtMatrix.pointer);
__m128 xmm1 = _mm_loadu_ps(retardedIrrlichtMatrix.pointer+4);
__m128 xmm2 = _mm_loadu_ps(retardedIrrlichtMatrix.pointer+8);
__m128 xmm3 = _mm_loadu_ps(retardedIrrlichtMatrix.pointer+12);
_MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3);
rows[0] = xmm0;
rows[1] = xmm1;
rows[2] = xmm2;
rows[3] = xmm3;
}
inline matrix4 getAsRetardedIrrlichtMatrix()
{
__m128 xmm0 = rows[0].getAsRegister();
__m128 xmm1 = rows[1].getAsRegister();
__m128 xmm2 = rows[2].getAsRegister();
__m128 xmm3 = rows[3].getAsRegister();
_MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3)
#ifdef _IRR_WINDOWS_
__declspec(align(16)) matrix4 outRIMatrix;
#else
matrix4 outRIMatrix __attribute__ ((__aligned__(16)));
#endif
_mm_store_ps(outRIMatrix.pointer,xmm0);
_mm_store_ps(outRIMatrix.pointer+1,xmm1);
_mm_store_ps(outRIMatrix.pointer+2,xmm2);
_mm_store_ps(outRIMatrix.pointer+3,xmm3);
return outRIMatrix;
}
//! Simple operator for directly accessing every element of the matrix.
inline float& operator()(const s32 &row, const s32 &col)
{
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return rows[row].pointer[col];
}
//! Simple operator for directly accessing every element of the matrix.
inline const float& operator()(const s32 &row, const s32 &col) const { return rows[row].pointer[col]; }
//! Simple operator for linearly accessing every element of the matrix.
inline float& operator[](u32 index)
{
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return ((float*)rows[0].pointer)[index];
}
//! Simple operator for linearly accessing every element of the matrix.
inline const float& operator[](u32 index) const { return ((float*)rows[0].pointer)[index]; }
//! Sets this matrix equal to the other matrix.
matrixSIMD4& operator=(const matrixSIMD4 &other);
//! Sets all elements of this matrix to the value.
matrixSIMD4& operator=(const float& scalar);
//! Returns pointer to internal array
inline const float* pointer() const { return rows[0].pointer; }
inline float* pointer()
{
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return rows[0].pointer;
}
//! Returns true if other matrix is equal to this matrix.
inline bool operator==(const matrixSIMD4 &other) const;
//! Returns true if other matrix is not equal to this matrix.
inline bool operator!=(const matrixSIMD4 &other) const;
//! Add another matrix.
matrixSIMD4 operator+(const matrixSIMD4& other) const;
//! Add another matrix.
matrixSIMD4& operator+=(const matrixSIMD4& other);
//! Subtract another matrix.
matrixSIMD4 operator-(const matrixSIMD4& other) const;
//! Subtract another matrix.
matrixSIMD4& operator-=(const matrixSIMD4& other);
//! set this matrix to the product of two matrices
/** Calculate b*a */
inline matrixSIMD4& setbyproduct(const matrixSIMD4& other_a,const matrixSIMD4& other_b );
//! Set this matrix to the product of two matrices
/** Calculate b*a, no optimization used,
use it if you know you never have a identity matrix */
matrixSIMD4& setbyproduct_nocheck(const matrixSIMD4& other_a,const matrixSIMD4& other_b );
//! Multiply by another matrix.
/** Calculate other*this */
matrixSIMD4 operator*(const matrixSIMD4& other) const;
//! Multiply by another matrix.
/** Calculate and return other*this */
matrixSIMD4& operator*=(const matrixSIMD4& other);
//! Multiply by scalar.
matrixSIMD4 operator*(const float& scalar) const;
//! Multiply by scalar.
matrixSIMD4& operator*=(const float& scalar);
//! Set matrix to identity.
inline matrixSIMD4& makeIdentity()
{
rows[0] = _mm_set_ps(0,0,0,1);
rows[1] = _mm_set_ps(0,0,1,0);
rows[2] = _mm_set_ps(0,1,0,0);
rows[3] = _mm_set_ps(1,0,0,0);
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=true;
#endif
return *this;
}
//! Returns true if the matrix is the identity matrix
bool isIdentity() const;
//! Returns true if the matrix is orthogonal
inline bool isOrthogonal() const;
//! Set the translation of the current matrix. Will erase any previous values.
matrixSIMD4& setTranslation( const vectorSIMDf& translation );
//! Gets the current translation
vectorSIMDf getTranslation() const;
//! Set the inverse translation of the current matrix. Will erase any previous values.
matrixSIMD4& setInverseTranslation( const vectorSIMDf& translation );
//! Set Scale
matrixSIMD4& setScale( const vectorSIMDf& scale );
//! Set Scale
matrixSIMD4& setScale( const float scale ) { return setScale(_mm_set1_ps(scale)); }
//! Get Scale
core::vectorSIMDf getScale() const;
//! Translate a vector by the inverse of the translation part of this matrix.
void inverseTranslateVect( vector3df& vect ) const;
/*
//! Rotate a vector by the inverse of the rotation part of this matrix.
void inverseRotateVect( vector3df& vect ) const;
//! Rotate a vector by the rotation part of this matrix.
void rotateVect( vector3df& vect ) const;
//! An alternate transform vector method, writing into a second vector
void rotateVect(core::vector3df& out, const core::vector3df& in) const;
//! An alternate transform vector method, writing into an array of 3 floats
void rotateVect(float *out,const core::vector3df &in) const;
*/
//! Transforms the vector by this matrix
void transformVect( vector3df& vect) const;
//! Transforms input vector by this matrix and stores result in output vector
void transformVect( vector3df& out, const vector3df& in ) const;
//! An alternate transform vector method, writing into an array of 4 floats
void transformVect(float *out,const core::vector3df &in) const;
//! An alternate transform vector method, reading from and writing to an array of 3 floats
void transformVec3(float *out, const float * in) const;
//! Translate a vector by the translation part of this matrix.
void translateVect( vector3df& vect ) const;
//! Creates a new matrix as interpolated matrix from two other ones.
/** \param b: other matrix to interpolate with
\param time: Must be a value between 0 and 1. */
matrixSIMD4 interpolate(const core::matrixSIMD4& b, float factor) const;
//! Gets transposed matrix
matrixSIMD4 getTransposed() const;
//! Gets transposed matrix
inline void getTransposed( matrixSIMD4& dest ) const;
private:
//! Matrix data, stored in row-major order
vectorSIMDf rows[4];
#if defined ( USE_MATRIX_TEST )
//! Flag is this matrix is identity matrix
mutable u32 definitelyIdentityMatrix;
#endif
#if defined ( USE_MATRIX_TEST_DEBUG )
u32 id;
mutable u32 calls;
#endif
};
Code: Select all
// Default constructor
// Default constructor.
// Bug fix: eConstructor and the EM4CONST_* enumerators are members of matrix4,
// not matrixSIMD4 - unqualified they do not resolve in this scope (and the
// parameter type must also match the in-class declaration, matrix4::eConstructor).
inline matrixSIMD4::matrixSIMD4( matrix4::eConstructor constructor )
#if defined ( USE_MATRIX_TEST )
	: definitelyIdentityMatrix(BIT_UNTESTED)
#endif
#if defined ( USE_MATRIX_TEST_DEBUG )
	,id ( MTest.ID++), calls ( 0 )
#endif
{
	switch ( constructor )
	{
		case matrix4::EM4CONST_NOTHING:
		case matrix4::EM4CONST_COPY:
			break;
		case matrix4::EM4CONST_IDENTITY:
		case matrix4::EM4CONST_INVERSE:
		default:
			// INVERSE of nothing is meaningless here, fall back to identity
			makeIdentity();
			break;
	}
}
// Copy constructor
// Copy constructor with an initialization-style selector.
// Bug fix: eConstructor/EM4CONST_* belong to matrix4 and must be qualified,
// matching the in-class declaration.
inline matrixSIMD4::matrixSIMD4( const matrixSIMD4& other, matrix4::eConstructor constructor)
#if defined ( USE_MATRIX_TEST )
	: definitelyIdentityMatrix(BIT_UNTESTED)
#endif
#if defined ( USE_MATRIX_TEST_DEBUG )
	,id ( MTest.ID++), calls ( 0 )
#endif
{
	switch ( constructor )
	{
		case matrix4::EM4CONST_IDENTITY:
			makeIdentity();
			break;
		case matrix4::EM4CONST_NOTHING:
			break;
		case matrix4::EM4CONST_COPY:
			*this = other;
			break;
		case matrix4::EM4CONST_TRANSPOSED:
			other.getTransposed(*this);
			break;
		case matrix4::EM4CONST_INVERSE:
			// zero the matrix when no inverse exists, matching irrlicht's matrix4
			if (!other.getInverse(*this))
				*this = 0.f;
			break;
		case matrix4::EM4CONST_INVERSE_TRANSPOSED:
			if (!other.getInverseTransposed(*this))
				*this = 0.f;
			else
				*this = getTransposed();
			break;
	}
}
//! Add another matrix.
//! Component-wise sum of two matrices; neither operand is modified.
// Fix: EM4CONST_NOTHING is a matrix4 enumerator and needs qualification.
inline matrixSIMD4 matrixSIMD4::operator+(const matrixSIMD4& other) const
{
	matrixSIMD4 temp ( matrix4::EM4CONST_NOTHING ); // skip identity init, all rows overwritten below
	temp.rows[0] = rows[0]+other.rows[0];
	temp.rows[1] = rows[1]+other.rows[1];
	temp.rows[2] = rows[2]+other.rows[2];
	temp.rows[3] = rows[3]+other.rows[3];
	return temp;
}
//! Add another matrix.
//! In-place component-wise addition of another matrix; returns *this for chaining.
inline matrixSIMD4& matrixSIMD4::operator+=(const matrixSIMD4& other)
{
	for (int r = 0; r < 4; ++r)
		rows[r] += other.rows[r];
	return *this;
}
//! Subtract another matrix.
//! Component-wise difference of two matrices; neither operand is modified.
// Fix: EM4CONST_NOTHING is a matrix4 enumerator and needs qualification.
inline matrixSIMD4 matrixSIMD4::operator-(const matrixSIMD4& other) const
{
	matrixSIMD4 temp ( matrix4::EM4CONST_NOTHING ); // skip identity init, all rows overwritten below
	temp.rows[0] = rows[0]-other.rows[0];
	temp.rows[1] = rows[1]-other.rows[1];
	temp.rows[2] = rows[2]-other.rows[2];
	temp.rows[3] = rows[3]-other.rows[3];
	return temp;
}
//! Subtract another matrix.
//! In-place component-wise subtraction of another matrix; returns *this for chaining.
// Bug fix: this operator used `+=` on every row, so a-=b actually computed a+b.
inline matrixSIMD4& matrixSIMD4::operator-=(const matrixSIMD4& other)
{
	rows[0] -= other.rows[0];
	rows[1] -= other.rows[1];
	rows[2] -= other.rows[2];
	rows[3] -= other.rows[3];
	return *this;
}
//! Multiply by scalar.
//! Uniform scalar multiply of all 16 elements; operand is not modified.
// Fix: EM4CONST_NOTHING is a matrix4 enumerator and needs qualification.
inline matrixSIMD4 matrixSIMD4::operator*(const float& scalar) const
{
	matrixSIMD4 temp ( matrix4::EM4CONST_NOTHING ); // skip identity init, all rows overwritten below
	temp.rows[0] = rows[0]*scalar;
	temp.rows[1] = rows[1]*scalar;
	temp.rows[2] = rows[2]*scalar;
	temp.rows[3] = rows[3]*scalar;
	return temp;
}
//! Multiply by scalar.
//! In-place uniform scalar multiply of all 16 elements; returns *this for chaining.
inline matrixSIMD4& matrixSIMD4::operator*=(const float& scalar)
{
	for (int r = 0; r < 4; ++r)
		rows[r] *= scalar;
	return *this;
}
//! Multiply by another matrix.
inline matrixSIMD4& matrixSIMD4::operator*=(const matrixSIMD4& other)
{
#if defined ( USE_MATRIX_TEST )
// do checks on your own in order to avoid copy creation
if ( !other.isIdentity() )
{
if ( this->isIdentity() )
{
return (*this = other);
}
else
{
matrixSIMD4 temp ( *this );
return setbyproduct_nocheck( temp, other );
}
}
return *this;
#else
matrixSIMD4 temp ( *this );
return setbyproduct_nocheck( temp, other );
#endif
}
//! multiply by another matrix
// set this matrix to the product of two other matrices
// goal is to reduce stack use and copy
//! multiply by another matrix
// set this matrix to the product of two other matrices
// goal is to reduce stack use and copy
// Strategy: transpose B once so its columns sit in registers, then each result row
// is three SSE3 hadds over row_of_A * column_of_B products (a 4-wide dot product).
// Safe to call with other_a or other_b aliasing *this only because both operands'
// rows are pulled into registers before rows[] is overwritten... except rows[0] of
// other_a/other_b read after rows[0] is written - callers pass a temp copy (see operator*=).
inline matrixSIMD4& matrixSIMD4::setbyproduct_nocheck(const matrixSIMD4& other_a,const matrixSIMD4& other_b ) //A*B
{
// xmm4-7 will now become columuns of B
__m128 xmm4 = other_b.rows[0].getAsRegister();
__m128 xmm5 = other_b.rows[1].getAsRegister();
__m128 xmm6 = other_b.rows[2].getAsRegister();
__m128 xmm7 = other_b.rows[3].getAsRegister();
_MM_TRANSPOSE4_PS(xmm4,xmm5,xmm6,xmm7)
__m128 xmm0 = other_a.rows[0].getAsRegister();
__m128 xmm1 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm4),_mm_mul_ps(xmm0,xmm5)); //(x_l,x_u,y_l,y_u)
__m128 xmm2 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm6),_mm_mul_ps(xmm0,xmm7)); //(z_l,z_u,w_l,w_u)
rows[0] = _mm_hadd_ps(xmm1,xmm2); //(x,y,z,w)
xmm0 = other_a.rows[1].getAsRegister();
xmm1 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm4),_mm_mul_ps(xmm0,xmm5)); //(x_l,x_u,y_l,y_u)
xmm2 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm6),_mm_mul_ps(xmm0,xmm7)); //(z_l,z_u,w_l,w_u)
rows[1] = _mm_hadd_ps(xmm1,xmm2); //(x,y,z,w)
xmm0 = other_a.rows[2].getAsRegister();
xmm1 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm4),_mm_mul_ps(xmm0,xmm5)); //(x_l,x_u,y_l,y_u)
xmm2 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm6),_mm_mul_ps(xmm0,xmm7)); //(z_l,z_u,w_l,w_u)
rows[2] = _mm_hadd_ps(xmm1,xmm2); //(x,y,z,w)
xmm0 = other_a.rows[3].getAsRegister();
xmm1 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm4),_mm_mul_ps(xmm0,xmm5)); //(x_l,x_u,y_l,y_u)
xmm2 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm6),_mm_mul_ps(xmm0,xmm7)); //(z_l,z_u,w_l,w_u)
rows[3] = _mm_hadd_ps(xmm1,xmm2); //(x,y,z,w)
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
//! multiply by another matrix
// set this matrix to the product of two other matrices
// goal is to reduce stack use and copy
//! multiply by another matrix
// set this matrix to the product of two other matrices
// goal is to reduce stack use and copy
// With USE_MATRIX_TEST this short-circuits identity operands; otherwise it is a
// plain forward to setbyproduct_nocheck.
inline matrixSIMD4& matrixSIMD4::setbyproduct(const matrixSIMD4& other_a, const matrixSIMD4& other_b )
{
#if defined ( USE_MATRIX_TEST )
if ( other_a.isIdentity () )
return (*this = other_b);
else
if ( other_b.isIdentity () )
return (*this = other_a);
else
return setbyproduct_nocheck(other_a,other_b);
#else
return setbyproduct_nocheck(other_a,other_b);
#endif
}
//! multiply by another matrix
//! multiply by another matrix
// Returns (*this) composed with m2 via setbyproduct_nocheck; operands untouched.
// Fix: EM4CONST_NOTHING is a matrix4 enumerator and needs qualification.
inline matrixSIMD4 matrixSIMD4::operator*(const matrixSIMD4& m2) const
{
#if defined ( USE_MATRIX_TEST )
	// Testing purpose..
	if ( this->isIdentity() )
		return m2;
	if ( m2.isIdentity() )
		return *this;
	definitelyIdentityMatrix=false;
#endif
	matrixSIMD4 m3 ( matrix4::EM4CONST_NOTHING ); // skip identity init, fully written below
	return m3.setbyproduct_nocheck(*this,m2);
}
// Translation lives in the 4th column (the .w lane of every row); gather all four
// w components into one register with two unpacks and a movehl.
inline vectorSIMDf matrixSIMD4::getTranslation() const
{
__m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(),rows[1].getAsRegister()); // (0z,1z,0w,1w)
__m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(),rows[3].getAsRegister()); // (2z,3z,2w,3w)
__m128 xmm2 = _mm_movehl_ps(xmm1,xmm0);// (0w,1w,2w,3w)
return xmm2;
}
// Same as getTranslation() but row 3 is replaced by zero, so the returned vector
// is (t.x, t.y, t.z, 0) - handy when the 4th row is the usual (0,0,0,1).
inline vectorSIMDf matrixSIMD4::getTranslation3D() const
{
__m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(),rows[1].getAsRegister()); // (0z,1z,0w,1w)
__m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(),_mm_setzero_ps()); // (2z,0,2w,0)
__m128 xmm2 = _mm_movehl_ps(xmm1,xmm0);// (0w,1w,2w,0)
return xmm2;
}
// Overwrite the whole 4th column (including row 3's w!) with the given vector.
// Pass a vector with W==1 if you want a conventional affine matrix.
inline matrixSIMD4& matrixSIMD4::setTranslation( const vectorSIMDf& translation )
{
rows[0].W = translation.X;
rows[1].W = translation.Y;
rows[2].W = translation.Z;
rows[3].W = translation.W;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Overwrite only the xyz part of the 4th column; rows[3].W is left untouched.
inline matrixSIMD4& matrixSIMD4::setTranslation3D( const vectorSIMDf& translation )
{
rows[0].W = translation.X;
rows[1].W = translation.Y;
rows[2].W = translation.Z;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Store the negated translation (all four components) - see setTranslation.
inline matrixSIMD4& matrixSIMD4::setInverseTranslation( const vectorSIMDf& translation )
{
return setTranslation(-translation);
}
// Store the negated xyz translation; rows[3].W untouched - see setTranslation3D.
inline matrixSIMD4& matrixSIMD4::setInverseTranslation3D( const vectorSIMDf& translation )
{
return setTranslation3D(-translation);
}
//! Write scale.X/Y/Z/W onto the matrix diagonal, leaving all off-diagonal
//! elements untouched (one masked store per row).
// Bug fix: `_m128i` (single leading underscore) is not a type; must be `__m128i`.
// NOTE(review): _mm_maskmoveu_si128 is a non-temporal store - it bypasses the
// cache, so reading the matrix right after may be slow; verify this is intended.
inline matrixSIMD4& matrixSIMD4::setScale( const vectorSIMDf& scale )
{
	//__m128i xmm0 = _mm_castps_si128(_mm_mul_ps(scale.getAsRegister(),_mm_rsqrt_ps(getScaleSQ().getAsRegister())));
	__m128i xmm0 = _mm_castps_si128(scale.getAsRegister());
	// mask selects one 32bit lane per row: rows[i][i] = scale[i]
	_mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,0,0,-1),(char*)rows);
	_mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,0,-1,0),(char*)(rows+1));
	_mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,-1,0,0),(char*)(rows+2));
	_mm_maskmoveu_si128(xmm0,_mm_set_epi32(-1,0,0,0),(char*)(rows+3));
#if defined ( USE_MATRIX_TEST )
	definitelyIdentityMatrix=false;
#endif
	return *this;
}
//! Write scale.X/Y/Z onto the top-left 3x3 diagonal; rows[3] untouched.
// Bug fixes: missing `(` after the function name (did not parse), and `_m128i`
// typo (must be `__m128i`).
inline matrixSIMD4& matrixSIMD4::setScale3D( const vectorSIMDf& scale )
{
	//__m128i xmm0 = _mm_castps_si128(_mm_mul_ps(scale.getAsRegister(),_mm_rsqrt_ps(getScaleSQ().getAsRegister())));
	__m128i xmm0 = _mm_castps_si128(scale.getAsRegister());
	// mask selects one 32bit lane per row: rows[i][i] = scale[i], i = 0..2
	_mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,0,0,-1),(char*)rows);
	_mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,0,-1,0),(char*)(rows+1));
	_mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,-1,0,0),(char*)(rows+2));
#if defined ( USE_MATRIX_TEST )
	definitelyIdentityMatrix=false;
#endif
	return *this;
}
//! Returns the squared scales of the matrix (sum of squares of each of the
//! first three columns' xyz parts, 4th component is 0).
/**
Note that this returns the absolute (positive) values unless only scale is set.
Unfortunately it does not appear to be possible to extract any original negative
values. The best that we could do would be to arbitrarily make one scale
negative if one or three of them were negative.
FIXME - return the original values.
*/
inline vectorSIMDf matrixSIMD4::getScaleSQ() const
{
#ifdef __IRR_COMPILE_WITH_SSE3
	// Bug fix below: `_mm_unpacklo_ps(xmm4.xmm5)` used a member-access dot where a
	// comma belongs - it could never compile.
	__m128 xmm4 = rows[0].getAsRegister();
	__m128 xmm5 = rows[1].getAsRegister();
	__m128 xmm6 = rows[2].getAsRegister();
	__m128 xmm7 = _mm_setzero_ps(); // g==0 garbage lane
	__m128 xmm0 = _mm_unpacklo_ps(xmm4,xmm5); // (0x,1x,0y,1y)
	__m128 xmm1 = _mm_unpacklo_ps(xmm6,xmm7); // (2x,g,2y,g)
	__m128 xmm2 = _mm_unpackhi_ps(xmm4,xmm5); // (0z,1z,0w,1w)
	__m128 xmm3 = _mm_unpackhi_ps(xmm6,xmm7); // (2z,g,2w,g)
	// NOTE: operand order puts the column components in an unusual lane order,
	// but that is irrelevant - we only ever sum the squared lanes per register.
	xmm4 = _mm_movelh_ps(xmm1,xmm0); // (2x,g,0x,1x) - column 0
	xmm5 = _mm_movehl_ps(xmm1,xmm0); // (0y,1y,2y,g) - column 1
	xmm6 = _mm_movelh_ps(xmm3,xmm2); // (2z,g,0z,1z) - column 2
	// See http://www.robertblum.com/articles/2005/02/14/decomposing-matrices
	// We have to do the full calculation.
	xmm0 = _mm_mul_ps(xmm4,xmm4);// column 0 squared
	xmm1 = _mm_mul_ps(xmm5,xmm5);// column 1 squared
	xmm2 = _mm_mul_ps(xmm6,xmm6);// column 2 squared
	// two rounds of horizontal adds collapse each register's 4 lanes into one sum
	xmm4 = _mm_hadd_ps(xmm0,xmm1);
	xmm5 = _mm_hadd_ps(xmm2,xmm7);
	xmm6 = _mm_hadd_ps(xmm4,xmm5); // (|col0|^2,|col1|^2,|col2|^2,0)
	return xmm6;
#elif defined(__IRR_COMPILE_WITH_SSE2)
#error "SSE2 version not implemented yet"
#endif
}
// Per-column scale magnitudes: componentwise square root of getScaleSQ().
// Always non-negative - see the FIXME above getScaleSQ about lost signs.
inline vectorSIMDf matrixSIMD4::getScale() const
{
#ifdef __IRR_COMPILE_WITH_SSE3
return getScaleSQ().getSquareRoot();
#elif defined(__IRR_COMPILE_WITH_SSE2)
#error "SSE2 version not implemented yet"
#endif
}
/*
check identity with epsilon
solve floating range problems..
*/
// NOTE(review): despite the comment above, the comparison below is exact
// (vector != compares bitwise-equal floats), there is no epsilon tolerance.
inline bool matrixSIMD4::isIdentity() const
{
#if defined ( USE_MATRIX_TEST )
if (definitelyIdentityMatrix)
return true;
#endif
// OR together the per-lane "differs from identity row" masks of all four rows
vector4db_SIMD tmp = (rows[0]!=vectorSIMDf(1.f,0.f,0.f,0.f))|(rows[1]!=vectorSIMDf(0.f,1.f,0.f,0.f))|(rows[2]!=vectorSIMDf(0.f,0.f,1.f,0.f))|(rows[3]!=vectorSIMDf(0.f,0.f,0.f,1.f));
if (tmp.any())
return false;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=true;
#endif
return true;
}
/* Check orthogonality of matrix. */
// Tests M * M^T == I exactly; true only for orthonormal matrices (unit-length,
// mutually perpendicular rows) and subject to exact float comparison in isIdentity().
inline bool matrixSIMD4::isOrthogonal() const
{
//all of the column vectors have to be orthogonal to each other
return ((*this)*(*this).getTransposed()).isIdentity();
}
Last edited by devsh on Fri May 01, 2015 12:41 pm, edited 2 times in total.
Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO
part2 of matrixSIMD4.h :
Code: Select all
/*
inline void matrixSIMD4::rotateVect( vector3df& vect ) const
{
vector3df tmp = vect;
vect.X = tmp.X*M[0] + tmp.Y*M[4] + tmp.Z*M[8];
vect.Y = tmp.X*M[1] + tmp.Y*M[5] + tmp.Z*M[9];
vect.Z = tmp.X*M[2] + tmp.Y*M[6] + tmp.Z*M[10];
}
//! An alternate transform vector method, writing into a second vector
inline void matrixSIMD4::rotateVect(core::vector3df& out, const core::vector3df& in) const
{
out.X = in.X*M[0] + in.Y*M[4] + in.Z*M[8];
out.Y = in.X*M[1] + in.Y*M[5] + in.Z*M[9];
out.Z = in.X*M[2] + in.Y*M[6] + in.Z*M[10];
}
//! An alternate transform vector method, writing into an array of 3 floats
inline void matrixSIMD4::rotateVect(float *out, const core::vector3df& in) const
{
out[0] = in.X*M[0] + in.Y*M[4] + in.Z*M[8];
out[1] = in.X*M[1] + in.Y*M[5] + in.Z*M[9];
out[2] = in.X*M[2] + in.Y*M[6] + in.Z*M[10];
}
inline void matrixSIMD4::inverseRotateVect( vector3df& vect ) const
{
vector3df tmp = vect;
vect.X = tmp.X*M[0] + tmp.Y*M[1] + tmp.Z*M[2];
vect.Y = tmp.X*M[4] + tmp.Y*M[5] + tmp.Z*M[6];
vect.Z = tmp.X*M[8] + tmp.Y*M[9] + tmp.Z*M[10];
}
*/
// In-place transform: vect = M * vect (full 4-component multiply).
inline void matrixSIMD4::transformVect( vectorSIMDf& vect) const
{
transformVect(vect,vect);
}
// out = M * in; forwards to the float* overload, writing through out's storage.
inline void matrixSIMD4::transformVect( vectorSIMDf& out, const vectorSIMDf& in) const
{
transformVect(out.pointer,in);
}
// Writes the 4 components of M * in to out via an aligned store.
// NOTE(review): _mm_store_ps requires `out` to be 16-byte aligned - confirm all
// callers guarantee this (the vectorSIMDf overload presumably does).
inline void matrixSIMD4::transformVect(float *out, const vectorSIMDf &in) const
{
__m128 xmm4 = in.getAsRegister();
__m128 xmm0 = _mm_mul_ps(rows[0].getAsRegister(),xmm4);
__m128 xmm1 = _mm_mul_ps(rows[1].getAsRegister(),xmm4);
__m128 xmm2 = _mm_mul_ps(rows[2].getAsRegister(),xmm4);
__m128 xmm3 = _mm_mul_ps(rows[3].getAsRegister(),xmm4);
// three hadds reduce the four row-products to the four dot products (x,y,z,w)
xmm4 = _mm_hadd_ps(xmm2,xmm3);
xmm2 = _mm_hadd_ps(xmm0,xmm1);
_mm_store_ps(out,_mm_hadd_ps(xmm2,xmm4));
}
/*
//! Transforms a plane by this matrix
inline void matrixSIMD4::transformPlane( core::plane3d<f32> &plane) const
{
core::plane3df temp;
transformPlane(plane,temp);
plane = temp;
}
//! Transforms a plane by this matrix
inline void matrixSIMD4::transformPlane( const core::plane3d<f32> &in, core::plane3d<f32> &out) const
{
matrixSIMD4 transposedInverse(*this, EM4CONST_INVERSE);
out.Normal.X = in.Normal.X*transposedInverse[0] + in.Normal.Y*transposedInverse[1] + in.Normal.Z*transposedInverse[2] + in.D*transposedInverse[3];
out.Normal.Y = in.Normal.X*transposedInverse[4] + in.Normal.Y*transposedInverse[5] + in.Normal.Z*transposedInverse[6] + in.D*transposedInverse[7];
out.Normal.Z = in.Normal.X*transposedInverse[8] + in.Normal.Y*transposedInverse[9] + in.Normal.Z*transposedInverse[10] + in.D*transposedInverse[11];
out.D = in.Normal.X*transposedInverse[12] + in.Normal.Y*transposedInverse[13] + in.Normal.Z*transposedInverse[14] + in.D*transposedInverse[15];
}
//! Transforms a axis aligned bounding box
inline void matrixSIMD4::transformBox(core::aabbox3d<f32>& box) const
{
#if defined ( USE_MATRIX_TEST )
if (isIdentity())
return;
#endif
transformVect(box.MinEdge);
transformVect(box.MaxEdge);
box.repair();
}
//! Transforms a axis aligned bounding box more accurately than transformBox()
inline void matrixSIMD4::transformBoxEx(core::aabbox3d<f32>& box) const
{
#if defined ( USE_MATRIX_TEST )
if (isIdentity())
return;
#endif
const f32 Amin[3] = {box.MinEdge.X, box.MinEdge.Y, box.MinEdge.Z};
const f32 Amax[3] = {box.MaxEdge.X, box.MaxEdge.Y, box.MaxEdge.Z};
f32 Bmin[3];
f32 Bmax[3];
Bmin[0] = Bmax[0] = M[12];
Bmin[1] = Bmax[1] = M[13];
Bmin[2] = Bmax[2] = M[14];
const matrixSIMD4 &m = *this;
for (u32 i = 0; i < 3; ++i)
{
for (u32 j = 0; j < 3; ++j)
{
const f32 a = m(j,i) * Amin[j];
const f32 b = m(j,i) * Amax[j];
if (a < b)
{
Bmin[i] += a;
Bmax[i] += b;
}
else
{
Bmin[i] += b;
Bmax[i] += a;
}
}
}
box.MinEdge.X = Bmin[0];
box.MinEdge.Y = Bmin[1];
box.MinEdge.Z = Bmin[2];
box.MaxEdge.X = Bmax[0];
box.MaxEdge.Y = Bmax[1];
box.MaxEdge.Z = Bmax[2];
}
*/
// Subtract the matrix's xyz translation (4th component zeroed) from vect;
// same w-column gather as getTranslation3D().
inline void matrixSIMD4::inverseTranslateVect( vectorSIMDf& vect ) const
{
__m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(),rows[1].getAsRegister()); // (0z,1z,0w,1w)
__m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(),_mm_setzero_ps()); // (2z,0,2w,0)
__m128 xmm2 = _mm_movehl_ps(xmm1,xmm0);// (0w,1w,2w,0)
vect -= xmm2;
}
inline void matrixSIMD4::translateVect( vector3df& vect ) const
{
__m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(),rows[1].getAsRegister()); // (0z,1z,0w,1w)
__m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(),_mm_setzero_ps()); // (2z,3z,2w,3w)
__m128 xmm2 = _mm_movehl_ps(xmm1,xmm0);// (0w,1w,2w,3w)
vect += xmm2;
}
inline bool matrixSIMD4::getInverse(matrixSIMD4& out) const
{
/// Calculates the inverse of this Matrix
/// The inverse is calculated using Cramers rule.
/// If no inverse exists then 'false' is returned.
/// Fast path: if the last row is (0,0,0,1) only a 3x3 cofactor inverse plus a
/// translation solve is needed; otherwise the full 4x4 adjugate is computed.
#if defined ( USE_MATRIX_TEST )
if ( this->isIdentity() )
{
out=*this;
return true;
}
#endif
vector4db_SIMD isReasonable = (rows[3]==vectorSIMDf(0.f,0.f,0.f,1.f));
vectorSIMDf determinant4;
if (isReasonable.all())
{
// last row is 0,0,0,1 like in a sane 4x4 matrix used in games
vectorSIMDf tmpA = rows[1].zxxw()*rows[2].yzyw();// (m(1, 2) * m(2, 1)
vectorSIMDf tmpB = rows[1].yzyw()*rows[2].zxxw();// (m(1, 1) * m(2, 2))
__m128 tmpC = tmpA-tmpB; //1st column of out matrix
__m128 preDeterminant = rows[0]*tmpC;
preDeterminant = _mm_hadd_ps(preDeterminant,preDeterminant); // (x+y,z+w,..)
determinant4 = _mm_hadd_ps(preDeterminant,preDeterminant); //
// NOTE(review): this compares the raw bit pattern of the determinant to 0 -
// it only catches +0.0 (not -0.0), and `==0.f` against a uint32 is suspicious;
// confirm the intent (probably wants a near-zero epsilon test).
if (((uint32_t*)determinant4.pointer)[0]==0.f)
return false;
tmpA = rows[0].zxyw()*rows[2].yzxw();
tmpB = rows[0].yzxw()*rows[2].zxyw();
__m128 tmpD = tmpA-tmpB; // 2nd column of out matrix
tmpA = rows[0].yzxw()*rows[1].zxyw();
tmpB = rows[0].zxyw()*rows[1].yzxw();
__m128 tmpE = tmpA-tmpB; // 3rd column of out matrix
__m128 xmm0 = tmpC;
__m128 xmm1 = tmpD;
__m128 xmm2 = tmpE;
__m128 xmm3 = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3)
// solve the inverse translation column: -(cofactor rows) dot translation
__m128 xmm4 = getTranslation3D().getAsRegister();
xmm0 = _mm_mul_ps(xmm0,xmm4); //out(0,3)
xmm1 = _mm_mul_ps(xmm1,xmm4); //out(1,3)
xmm2 = _mm_or_ps(_mm_mul_ps(xmm2,xmm4),_mm_castsi128_ps(_mm_set_epi32(0,-1,0,-1))); //out(2,3)
xmm0 = _mm_hsub_ps(xmm0,xmm1); // C.x-D.x,E.x,C.y-D.y,E.y
xmm1 = _mm_hsub_ps(xmm2,preDeterminant); // C.z-D.z,E.z,x+y-z-w,x+Y-z-w
xmm2 = _mm_hsub_ps(xmm0,xmm1); // C.x-D.x-E.x,C.y-D.y-E.y,C.z-D.z-E.z,0
/*
out(0, 3) = m(0, 3) * tmpC.x +
m(1, 3) * -tmpD.x +
m(2, 3) * -tmpE.x;
out(1, 3) = m(0, 3) * tmpC.y +
m(1, 3) * -tmpD.y +
m(2, 3) * -tmpE.y;
out(2, 3) = m(0, 3) * -tmpC.z +
m(1, 3) * -tmpD.z;
m(2, 3) * tmpE.z;
*/
_MM_TRANSPOSE4_PS(tmpC,tmpD,tmpE,xmm2)
out.rows[0] = tmpC;
out.rows[1] = tmpD;
out.rows[2] = tmpE;
out.rows[3] = xmm2;
tmpA = xmm1;
out[15] = -tmpA.w;
}
else
{
/**
out(0, 0) = m(1, 1) * (m(2, 2) * m(3, 3) - m(2, 3) * m(3, 2)) + m(1, 2) * (m(2, 3) * m(3, 1) - m(2, 1) * m(3, 3)) + m(1, 3) * (m(2, 1) * m(3, 2) - m(2, 2) * m(3, 1)));
out(1, 0) = m(1, 2) * (m(2, 0) * m(3, 3) - m(2, 3) * m(3, 0)) + m(1, 3) * (m(2, 2) * m(3, 0) - m(2, 0) * m(3, 2)) + m(1, 0) * (m(2, 3) * m(3, 2) - m(2, 2) * m(3, 3)));
out(2, 0) = m(1, 3) * (m(2, 0) * m(3, 1) - m(2, 1) * m(3, 0)) + m(1, 0) * (m(2, 1) * m(3, 3) - m(2, 3) * m(3, 1)) + m(1, 1) * (m(2, 3) * m(3, 0) - m(2, 0) * m(3, 3)));
out(3, 0) = m(1, 0) * (m(2, 2) * m(3, 1) - m(2, 1) * m(3, 2)) + m(1, 1) * (m(2, 0) * m(3, 2) - m(2, 2) * m(3, 0)) + m(1, 2) * (m(2, 1) * m(3, 0) - m(2, 0) * m(3, 1)));
out(0, 1) = (m(2, 1) * (m(0, 2) * m(3, 3) - m(0, 3) * m(3, 2)) + m(2, 2) * (m(0, 3) * m(3, 1) - m(0, 1) * m(3, 3)) + m(2, 3) * (m(0, 1) * m(3, 2) - m(0, 2) * m(3, 1)));
out(1, 1) = (m(2, 2) * (m(0, 0) * m(3, 3) - m(0, 3) * m(3, 0)) + m(2, 3) * (m(0, 2) * m(3, 0) - m(0, 0) * m(3, 2)) + m(2, 0) * (m(0, 3) * m(3, 2) - m(0, 2) * m(3, 3)));
out(2, 1) = (m(2, 3) * (m(0, 0) * m(3, 1) - m(0, 1) * m(3, 0)) + m(2, 0) * (m(0, 1) * m(3, 3) - m(0, 3) * m(3, 1)) + m(2, 1) * (m(0, 3) * m(3, 0) - m(0, 0) * m(3, 3)));
out(3, 1) = (m(2, 0) * (m(0, 2) * m(3, 1) - m(0, 1) * m(3, 2)) + m(2, 1) * (m(0, 0) * m(3, 2) - m(0, 2) * m(3, 0)) + m(2, 2) * (m(0, 1) * m(3, 0) - m(0, 0) * m(3, 1)));
out(0, 2) = (m(3, 1) * (m(0, 2) * m(1, 3) - m(0, 3) * m(1, 2)) + m(3, 2) * (m(0, 3) * m(1, 1) - m(0, 1) * m(1, 3)) + m(3, 3) * (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)));
out(1, 2) = (m(3, 2) * (m(0, 0) * m(1, 3) - m(0, 3) * m(1, 0)) + m(3, 3) * (m(0, 2) * m(1, 0) - m(0, 0) * m(1, 2)) + m(3, 0) * (m(0, 3) * m(1, 2) - m(0, 2) * m(1, 3)));
out(2, 2) = (m(3, 3) * (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0)) + m(3, 0) * (m(0, 1) * m(1, 3) - m(0, 3) * m(1, 1)) + m(3, 1) * (m(0, 3) * m(1, 0) - m(0, 0) * m(1, 3)));
out(3, 2) = (m(3, 0) * (m(0, 2) * m(1, 1) - m(0, 1) * m(1, 2)) + m(3, 1) * (m(0, 0) * m(1, 2) - m(0, 2) * m(1, 0)) + m(3, 2) * (m(0, 1) * m(1, 0) - m(0, 0) * m(1, 1)));
out(0, 3) = (m(0, 1) * (m(1, 3) * m(2, 2) - m(1, 2) * m(2, 3)) + m(0, 2) * (m(1, 1) * m(2, 3) - m(1, 3) * m(2, 1)) + m(0, 3) * (m(1, 2) * m(2, 1) - m(1, 1) * m(2, 2)));
out(1, 3) = (m(0, 2) * (m(1, 3) * m(2, 0) - m(1, 0) * m(2, 3)) + m(0, 3) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) + m(0, 0) * (m(1, 2) * m(2, 3) - m(1, 3) * m(2, 2)));
out(2, 3) = (m(0, 3) * (m(1, 1) * m(2, 0) - m(1, 0) * m(2, 1)) + m(0, 0) * (m(1, 3) * m(2, 1) - m(1, 1) * m(2, 3)) + m(0, 1) * (m(1, 0) * m(2, 3) - m(1, 3) * m(2, 0)));
out(3, 3) = (m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1)) + m(0, 1) * (m(1, 2) * m(2, 0) - m(1, 0) * m(2, 2)) + m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0)));
**/
// General path: build the adjugate column-by-column from swizzled 2x2 minors,
// following the scalar formulas in the comment block above.
vectorSIMDf tmpA = rows[2].zxxz();
vectorSIMDf tmpB = rows[3].wwyy();
vectorSIMDf tmpC = rows[2].wwyy();
vectorSIMDf tmpD = rows[3].zxxz();
vectorSIMDf tmpE = rows[2].wzyx();
vectorSIMDf tmpF = rows[3].yxwz();
vectorSIMDf tmpG = rows[2].yxwz();
vectorSIMDf tmpH = rows[3].wzyx();
vectorSIMDf tmpI = rows[2].ywwy();
vectorSIMDf tmpJ = rows[3].zzxx();
vectorSIMDf tmpK = rows[2].zzxx();
vectorSIMDf tmpL = rows[3].ywwy();
__m128 xmm0 = (rows[1].yzwx()*(tmpA*tmpB-tmpC*tmpD)+rows[1].zwxy()*(tmpE*tmpF-tmpG*tmpH)+rows[1].wxyz()*(tmpI*tmpJ-tmpK*tmpL)).getAsRegister();
determinant4 = rows[0].dotProduct(xmm0);
// NOTE(review): same bit-pattern-vs-0.f comparison caveat as in the fast path.
if (((uint32_t*)determinant4.pointer)[0]==0.f)
return false;
vectorSIMDf tmpM = rows[0].zxxz();
vectorSIMDf tmpN = rows[0].wwyy();
vectorSIMDf tmpO = rows[0].wzyx();
vectorSIMDf tmpP = rows[0].yxwz();
vectorSIMDf tmpQ = rows[0].ywwy();
vectorSIMDf tmpR = rows[0].zzxx();
__m128 xmm1 = (rows[2].yzwx()*(tmpM*tmpB-tmpN*tmpD)+rows[2].zwxy()*(tmpO*tmpF-tmpP*tmpH)+rows[2].wxyz()*(tmpQ*tmpJ-tmpR*tmpL)).getAsRegister();
vectorSIMDf tmpS = rows[1].wwyy();
vectorSIMDf tmpT = rows[1].zxxz();
vectorSIMDf tmpU = rows[1].yxwz();
vectorSIMDf tmpV = rows[1].wzyx();
vectorSIMDf tmpW = rows[1].zzxx();
vectorSIMDf tmpX = rows[1].ywwy();
__m128 xmm2 = (rows[3].yzwx()*(tmpM*tmpS-tmpN*tmpT)+rows[3].zwxy()*(tmpO*tmpU-tmpP*tmpV)+rows[3].wxyz()*(tmpQ*tmpW-tmpR*tmpX)).getAsRegister();
__m128 xmm3 = (rows[0].yzwx()*(tmpS*tmpA-tmpT*tmpC)+rows[0].zwxy()*(tmpU*tmpE-tmpV*tmpG)+rows[0].wxyz()*(tmpW*tmpI-tmpX*tmpK)).getAsRegister();
_MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3)
out.rows[0] = xmm0;
out.rows[1] = xmm1;
out.rows[2] = xmm2;
out.rows[3] = xmm3;
}
// NOTE(review): _mm_rcp_ps is an ~12-bit-accurate approximate reciprocal; the
// resulting inverse is noticeably imprecise. Consider _mm_div_ps or one
// Newton-Raphson refinement step if accuracy matters.
__m128 xmm0 = _mm_rcp_ps(determinant4.getAsRegister());
out.rows[0] *= xmm0;
out.rows[1] *= xmm0;
out.rows[2] *= xmm0;
out.rows[3] *= xmm0;
#if defined ( USE_MATRIX_TEST )
out.definitelyIdentityMatrix = false;
#endif
return true;
}
/*
//! Inverts a primitive matrix which only contains a translation and a rotation
//! \param out: where result matrix is written to.
inline bool matrixSIMD4::getInversePrimitive ( matrixSIMD4& out ) const
{
out.M[0 ] = M[0];
out.M[1 ] = M[4];
out.M[2 ] = M[8];
out.M[3 ] = 0;
out.M[4 ] = M[1];
out.M[5 ] = M[5];
out.M[6 ] = M[9];
out.M[7 ] = 0;
out.M[8 ] = M[2];
out.M[9 ] = M[6];
out.M[10] = M[10];
out.M[11] = 0;
out.M[12] = -(M[12]*M[0] + M[13]*M[1] + M[14]*M[2]);
out.M[13] = -(M[12]*M[4] + M[13]*M[5] + M[14]*M[6]);
out.M[14] = -(M[12]*M[8] + M[13]*M[9] + M[14]*M[10]);
out.M[15] = 1;
#if defined ( USE_MATRIX_TEST )
out.definitelyIdentityMatrix = definitelyIdentityMatrix;
#endif
return true;
}
//!
inline bool matrixSIMD4::makeInverse()
{
#if defined ( USE_MATRIX_TEST )
if (definitelyIdentityMatrix)
return true;
#endif
matrixSIMD4 temp ( EM4CONST_NOTHING );
if (getInverse(temp))
{
*this = temp;
return true;
}
return false;
}
*/
//! Copy-assign: stores each of the other matrix's rows through an aligned store.
//! Safe under self-assignment (rows are loaded into registers before storing).
inline matrixSIMD4& matrixSIMD4::operator=(const matrixSIMD4 &other)
{
	for (int r = 0; r < 4; ++r)
		_mm_store_ps(rows[r].pointer, other.rows[r].getAsRegister());
#if defined ( USE_MATRIX_TEST )
	definitelyIdentityMatrix=other.definitelyIdentityMatrix;
#endif
	return *this;
}
//! Scalar assignment: sets every element of all four rows to `scalar`.
inline matrixSIMD4& matrixSIMD4::operator=(const float& scalar)
{
	// Broadcast the scalar into all four lanes once, then reuse it for each row.
	const __m128 splat = _mm_load_ps1(&scalar);
	for (int i = 0; i < 4; ++i)
		_mm_store_ps(rows[i].pointer, splat);
#if defined ( USE_MATRIX_TEST )
	// A constant-filled matrix is (in general) not the identity.
	definitelyIdentityMatrix = false;
#endif
	return *this;
}
//! Exact (bitwise float) equality test against `other`.
//! Implemented as the negation of operator!= so the element-wise
//! comparison logic lives in exactly one place.
inline bool matrixSIMD4::operator==(const matrixSIMD4 &other) const
{
#if defined ( USE_MATRIX_TEST )
	// Fast path: two matrices both flagged as identity are trivially equal.
	if (definitelyIdentityMatrix && other.definitelyIdentityMatrix)
		return true;
#endif
	const bool anyDifference = ((*this) != other);
	return !anyDifference;
}
//! Exact (bitwise float) inequality test against `other`.
//! Each rows[i]!=other.rows[i] yields a per-lane SIMD comparison mask; the
//! masks are combined with bitwise | (deliberately non-short-circuiting, the
//! SIMD compares are cheap) and .any() reports whether any lane differed.
inline bool matrixSIMD4::operator!=(const matrixSIMD4 &other) const
{
return ((rows[0]!=other.rows[0])|(rows[1]!=other.rows[1])|(rows[2]!=other.rows[2])|(rows[3]!=other.rows[3])).any();
}
/*
// Builds a right-handed perspective projection matrix based on a field of view
inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveFovRH(
f32 fieldOfViewRadians, f32 aspectRatio, f32 zNear, f32 zFar)
{
const f32 h = reciprocal(tan(fieldOfViewRadians*0.5));
_IRR_DEBUG_BREAK_IF(aspectRatio==0.f); //divide by zero
const float w = h / aspectRatio;
_IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
M[0] = w;
M[1] = 0;
M[2] = 0;
M[3] = 0;
M[4] = 0;
M[5] = h;
M[6] = 0;
M[7] = 0;
M[8] = 0;
M[9] = 0;
M[10] = (zFar/(zNear-zFar)); // DirectX version
// M[10] = ((zFar+zNear)/(zNear-zFar)); // OpenGL version
M[11] = -1;
M[12] = 0;
M[13] = 0;
M[14] = (zNear*zFar/(zNear-zFar)); // DirectX version
// M[14] = (2.0f*zNear*zFar/(zNear-zFar)); // OpenGL version
M[15] = 0;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a left-handed perspective projection matrix based on a field of view
inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveFovLH(
f32 fieldOfViewRadians, f32 aspectRatio, f32 zNear, f32 zFar)
{
const f32 h = reciprocal(tan(fieldOfViewRadians*0.5));
_IRR_DEBUG_BREAK_IF(aspectRatio==0.f); //divide by zero
const float w = (h / aspectRatio);
_IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
M[0] = w;
M[1] = 0;
M[2] = 0;
M[3] = 0;
M[4] = 0;
M[5] = h;
M[6] = 0;
M[7] = 0;
M[8] = 0;
M[9] = 0;
M[10] = (zFar/(zFar-zNear));
M[11] = 1;
M[12] = 0;
M[13] = 0;
M[14] = (-zNear*zFar/(zFar-zNear));
M[15] = 0;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a left-handed perspective projection matrix based on a field of view, with far plane culling at infinity
inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveFovInfinityLH(
f32 fieldOfViewRadians, f32 aspectRatio, f32 zNear, f32 epsilon)
{
const f32 h = reciprocal(tan(fieldOfViewRadians*0.5));
_IRR_DEBUG_BREAK_IF(aspectRatio==0.f); //divide by zero
const float w = h / aspectRatio;
M[0] = w;
M[1] = 0;
M[2] = 0;
M[3] = 0;
M[4] = 0;
M[5] = h;
M[6] = 0;
M[7] = 0;
M[8] = 0;
M[9] = 0;
M[10] = (1.f-epsilon);
M[11] = 1;
M[12] = 0;
M[13] = 0;
M[14] = (zNear*(epsilon-1.f));
M[15] = 0;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a left-handed orthogonal projection matrix.
inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixOrthoLH(
f32 widthOfViewVolume, f32 heightOfViewVolume, f32 zNear, f32 zFar)
{
_IRR_DEBUG_BREAK_IF(widthOfViewVolume==0.f); //divide by zero
_IRR_DEBUG_BREAK_IF(heightOfViewVolume==0.f); //divide by zero
_IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
M[0] = (2/widthOfViewVolume);
M[1] = 0;
M[2] = 0;
M[3] = 0;
M[4] = 0;
M[5] = (2/heightOfViewVolume);
M[6] = 0;
M[7] = 0;
M[8] = 0;
M[9] = 0;
M[10] = (1/(zFar-zNear));
M[11] = 0;
M[12] = 0;
M[13] = 0;
M[14] = (zNear/(zNear-zFar));
M[15] = 1;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a right-handed orthogonal projection matrix.
inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixOrthoRH(
f32 widthOfViewVolume, f32 heightOfViewVolume, f32 zNear, f32 zFar)
{
_IRR_DEBUG_BREAK_IF(widthOfViewVolume==0.f); //divide by zero
_IRR_DEBUG_BREAK_IF(heightOfViewVolume==0.f); //divide by zero
_IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
M[0] = (2/widthOfViewVolume);
M[1] = 0;
M[2] = 0;
M[3] = 0;
M[4] = 0;
M[5] = (2/heightOfViewVolume);
M[6] = 0;
M[7] = 0;
M[8] = 0;
M[9] = 0;
M[10] = (1/(zNear-zFar));
M[11] = 0;
M[12] = 0;
M[13] = 0;
M[14] = (zNear/(zNear-zFar));
M[15] = 1;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a right-handed perspective projection matrix.
inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveRH(
f32 widthOfViewVolume, f32 heightOfViewVolume, f32 zNear, f32 zFar)
{
_IRR_DEBUG_BREAK_IF(widthOfViewVolume==0.f); //divide by zero
_IRR_DEBUG_BREAK_IF(heightOfViewVolume==0.f); //divide by zero
_IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
M[0] = (2*zNear/widthOfViewVolume);
M[1] = 0;
M[2] = 0;
M[3] = 0;
M[4] = 0;
M[5] = (2*zNear/heightOfViewVolume);
M[6] = 0;
M[7] = 0;
M[8] = 0;
M[9] = 0;
M[10] = (zFar/(zNear-zFar));
M[11] = -1;
M[12] = 0;
M[13] = 0;
M[14] = (zNear*zFar/(zNear-zFar));
M[15] = 0;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a left-handed perspective projection matrix.
inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveLH(
f32 widthOfViewVolume, f32 heightOfViewVolume, f32 zNear, f32 zFar)
{
_IRR_DEBUG_BREAK_IF(widthOfViewVolume==0.f); //divide by zero
_IRR_DEBUG_BREAK_IF(heightOfViewVolume==0.f); //divide by zero
_IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
M[0] = (2*zNear/widthOfViewVolume);
M[1] = 0;
M[2] = 0;
M[3] = 0;
M[4] = 0;
M[5] = (2*zNear/heightOfViewVolume);
M[6] = 0;
M[7] = 0;
M[8] = 0;
M[9] = 0;
M[10] = (zFar/(zFar-zNear));
M[11] = 1;
M[12] = 0;
M[13] = 0;
M[14] = (zNear*zFar/(zNear-zFar));
M[15] = 0;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a matrix that flattens geometry into a plane.
inline matrixSIMD4& matrixSIMD4::buildShadowMatrix(const core::vector3df& light, core::plane3df plane, f32 point)
{
plane.Normal.normalize();
const f32 d = plane.Normal.dotProduct(light);
M[ 0] = (-plane.Normal.X * light.X + d);
M[ 1] = (-plane.Normal.X * light.Y);
M[ 2] = (-plane.Normal.X * light.Z);
M[ 3] = (-plane.Normal.X * point);
M[ 4] = (-plane.Normal.Y * light.X);
M[ 5] = (-plane.Normal.Y * light.Y + d);
M[ 6] = (-plane.Normal.Y * light.Z);
M[ 7] = (-plane.Normal.Y * point);
M[ 8] = (-plane.Normal.Z * light.X);
M[ 9] = (-plane.Normal.Z * light.Y);
M[10] = (-plane.Normal.Z * light.Z + d);
M[11] = (-plane.Normal.Z * point);
M[12] = (-plane.D * light.X);
M[13] = (-plane.D * light.Y);
M[14] = (-plane.D * light.Z);
M[15] = (-plane.D * point + d);
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a left-handed look-at matrix.
inline matrixSIMD4& matrixSIMD4::buildCameraLookAtMatrixLH(
const vector3df& position,
const vector3df& target,
const vector3df& upVector)
{
vector3df zaxis = target - position;
zaxis.normalize();
vector3df xaxis = upVector.crossProduct(zaxis);
xaxis.normalize();
vector3df yaxis = zaxis.crossProduct(xaxis);
M[0] = xaxis.X;
M[1] = yaxis.X;
M[2] = zaxis.X;
M[3] = 0;
M[4] = xaxis.Y;
M[5] = yaxis.Y;
M[6] = zaxis.Y;
M[7] = 0;
M[8] = xaxis.Z;
M[9] = yaxis.Z;
M[10] =zaxis.Z;
M[11] = 0;
M[12] = -xaxis.dotProduct(position);
M[13] = -yaxis.dotProduct(position);
M[14] = -zaxis.dotProduct(position);
M[15] = 1;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// Builds a right-handed look-at matrix.
inline matrixSIMD4& matrixSIMD4::buildCameraLookAtMatrixRH(
const vector3df& position,
const vector3df& target,
const vector3df& upVector)
{
vector3df zaxis = position - target;
zaxis.normalize();
vector3df xaxis = upVector.crossProduct(zaxis);
xaxis.normalize();
vector3df yaxis = zaxis.crossProduct(xaxis);
M[0] = xaxis.X;
M[1] = yaxis.X;
M[2] = zaxis.X;
M[3] = 0;
M[4] = xaxis.Y;
M[5] = yaxis.Y;
M[6] = zaxis.Y;
M[7] = 0;
M[8] = xaxis.Z;
M[9] = yaxis.Z;
M[10] = zaxis.Z;
M[11] = 0;
M[12] = -xaxis.dotProduct(position);
M[13] = -yaxis.dotProduct(position);
M[14] = -zaxis.dotProduct(position);
M[15] = 1;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
*/
// creates a new matrix as interpolated matrix from this and the passed one.
//! Creates a new matrix linearly interpolated between this matrix and `b`,
//! row by row, via vectorSIMDf::mix (factor==0 yields *this, factor==1 yields b).
inline matrixSIMD4 matrixSIMD4::interpolate(const matrixSIMD4& b, const float &factor) const
{
	matrixSIMD4 result ( EM4CONST_NOTHING );
	for (int i = 0; i < 4; ++i)
		result.rows[i] = vectorSIMDf::mix(rows[i], b.rows[i], factor);
	return result;
}
// returns transposed matrix
//! Returns the transposed matrix by value; delegates to the out-parameter
//! overload so the SSE transpose is implemented only once.
inline matrixSIMD4 matrixSIMD4::getTransposed() const
{
	matrixSIMD4 result ( EM4CONST_NOTHING );
	getTransposed(result);
	return result;
}
// returns transposed matrix
//! Writes the transpose of this matrix into `o`; *this is left untouched.
inline void matrixSIMD4::getTransposed( matrixSIMD4& o ) const
{
	__m128 xmm0 = rows[0].getAsRegister();
	__m128 xmm1 = rows[1].getAsRegister();
	__m128 xmm2 = rows[2].getAsRegister();
	__m128 xmm3 = rows[3].getAsRegister();
	// In-register 4x4 transpose (SSE shuffle macro, no trailing semicolon by design).
	_MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3)
	// Store through each row's own float pointer, consistent with operator=,
	// instead of type-punning the row array with a (float*) cast.
	_mm_store_ps(o.rows[0].pointer,xmm0);
	_mm_store_ps(o.rows[1].pointer,xmm1);
	_mm_store_ps(o.rows[2].pointer,xmm2);
	_mm_store_ps(o.rows[3].pointer,xmm3);
#if defined ( USE_MATRIX_TEST )
	// Transposing an identity matrix yields the identity again, so the flag carries over.
	o.definitelyIdentityMatrix=definitelyIdentityMatrix;
#endif
}
/*
// used to scale <-1,-1><1,1> to viewport
inline matrixSIMD4& matrixSIMD4::buildNDCToDCMatrix( const core::rect<s32>& viewport, f32 zScale)
{
const f32 scaleX = (viewport.getWidth() - 0.75f ) * 0.5f;
const f32 scaleY = -(viewport.getHeight() - 0.75f ) * 0.5f;
const f32 dx = -0.5f + ( (viewport.UpperLeftCorner.X + viewport.LowerRightCorner.X ) * 0.5f );
const f32 dy = -0.5f + ( (viewport.UpperLeftCorner.Y + viewport.LowerRightCorner.Y ) * 0.5f );
makeIdentity();
M[12] = dx;
M[13] = dy;
return setScale(core::vectorSIMDf(scaleX, scaleY, zScale));
}
//! Builds a matrix that rotates from one vector to another
/** \param from: vector to rotate from
\param to: vector to rotate to
http://www.euclideanspace.com/maths/geometry/rotations/conversions/angleToMatrix/index.htm
*
inline matrixSIMD4& matrixSIMD4::buildRotateFromTo(const core::vector3df& from, const core::vector3df& to)
{
// unit vectors
core::vector3df f(from);
core::vector3df t(to);
f.normalize();
t.normalize();
// axis multiplication by sin
core::vector3df vs(t.crossProduct(f));
// axis of rotation
core::vector3df v(vs);
v.normalize();
// cosine of the angle
float ca = f.dotProduct(t);
core::vector3df vt(v * (1 - ca));
M[0] = vt.X * v.X + ca;
M[5] = vt.Y * v.Y + ca;
M[10] = vt.Z * v.Z + ca;
vt.X *= v.Y;
vt.Z *= v.X;
vt.Y *= v.Z;
M[1] = vt.X - vs.Z;
M[2] = vt.Z + vs.Y;
M[3] = 0;
M[4] = vt.X + vs.Z;
M[6] = vt.Y - vs.X;
M[7] = 0;
M[8] = vt.Z - vs.Y;
M[9] = vt.Y + vs.X;
M[11] = 0;
M[12] = 0;
M[13] = 0;
M[14] = 0;
M[15] = 1;
return *this;
}
//! Builds a matrix which rotates a source vector to a look vector over an arbitrary axis
/** \param camPos: viewer position in world coord
\param center: object position in world-coord, rotation pivot
\param translation: object final translation from center
\param axis: axis to rotate about
\param from: source vector to rotate from
*
inline void matrixSIMD4::buildAxisAlignedBillboard(
const core::vector3df& camPos,
const core::vector3df& center,
const core::vector3df& translation,
const core::vector3df& axis,
const core::vector3df& from)
{
// axis of rotation
core::vector3df up = axis;
up.normalize();
const core::vector3df forward = (camPos - center).normalize();
const core::vector3df right = up.crossProduct(forward).normalize();
// correct look vector
const core::vector3df look = right.crossProduct(up);
// rotate from to
// axis multiplication by sin
const core::vector3df vs = look.crossProduct(from);
// cosine of the angle
const f32 ca = from.dotProduct(look);
core::vector3df vt(up * (1.f - ca));
M[0] = (vt.X * up.X + ca);
M[5] = (vt.Y * up.Y + ca);
M[10] =(vt.Z * up.Z + ca);
vt.X *= up.Y;
vt.Z *= up.X;
vt.Y *= up.Z;
M[1] = (vt.X - vs.Z);
M[2] = (vt.Z + vs.Y);
M[3] = 0;
M[4] = (vt.X + vs.Z);
M[6] = (vt.Y - vs.X);
M[7] = 0;
M[8] = (vt.Z - vs.Y);
M[9] = (vt.Y + vs.X);
M[11] = 0;
setRotationCenter(center, translation);
}
//! Builds a combined matrix which translate to a center before rotation and translate afterwards
inline void matrixSIMD4::setRotationCenter(const core::vector3df& center, const core::vector3df& translation)
{
M[12] = -M[0]*center.X - M[4]*center.Y - M[8]*center.Z + (center.X - translation.X );
M[13] = -M[1]*center.X - M[5]*center.Y - M[9]*center.Z + (center.Y - translation.Y );
M[14] = -M[2]*center.X - M[6]*center.Y - M[10]*center.Z + (center.Z - translation.Z );
M[15] = 1.0;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
}
inline matrixSIMD4& matrixSIMD4::buildTextureTransform( f32 rotateRad,
const core::vector2df &rotatecenter,
const core::vector2df &translate,
const core::vector2df &scale)
{
const f32 c = cosf(rotateRad);
const f32 s = sinf(rotateRad);
M[0] = (c * scale.X);
M[1] = (s * scale.Y);
M[2] = 0;
M[3] = 0;
M[4] = (-s * scale.X);
M[5] = (c * scale.Y);
M[6] = 0;
M[7] = 0;
M[8] = (c * scale.X * rotatecenter.X + -s * rotatecenter.Y + translate.X);
M[9] = (s * scale.Y * rotatecenter.X + c * rotatecenter.Y + translate.Y);
M[10] = 1;
M[11] = 0;
M[12] = 0;
M[13] = 0;
M[14] = 0;
M[15] = 1;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// rotate about z axis, center ( 0.5, 0.5 )
inline matrixSIMD4& matrixSIMD4::setTextureRotationCenter( f32 rotateRad )
{
const f32 c = cosf(rotateRad);
const f32 s = sinf(rotateRad);
M[0] = c;
M[1] = s;
M[4] = -s;
M[5] = c;
M[8] = (0.5f * ( s - c) + 0.5f);
M[9] = (-0.5f * ( s + c) + 0.5f);
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix = definitelyIdentityMatrix && (rotateRad==0.0f);
#endif
return *this;
}
inline matrixSIMD4& matrixSIMD4::setTextureTranslate ( f32 x, f32 y )
{
M[8] = x;
M[9] = y;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix = definitelyIdentityMatrix && (x==0.0f) && (y==0.0f);
#endif
return *this;
}
inline matrixSIMD4& matrixSIMD4::setTextureTranslateTransposed ( f32 x, f32 y )
{
M[2] = x;
M[6] = y;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix = definitelyIdentityMatrix && (x==0.0f) && (y==0.0f) ;
#endif
return *this;
}
inline matrixSIMD4& matrixSIMD4::setTextureScale ( f32 sx, f32 sy )
{
M[0] = sx;
M[5] = sy;
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix = definitelyIdentityMatrix && (sx==1.0f) && (sy==1.0f);
#endif
return *this;
}
inline matrixSIMD4& matrixSIMD4::setTextureScaleCenter( f32 sx, f32 sy )
{
M[0] = sx;
M[5] = sy;
M[8] = (0.5f - 0.5f * sx);
M[9] = (0.5f - 0.5f * sy);
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix = definitelyIdentityMatrix && (sx==1.0f) && (sy==1.0f);
#endif
return *this;
}
// sets all matrix data members at once
inline matrixSIMD4& matrixSIMD4::setM(const float* data)
{
memcpy(M,data, 16*sizeof(float));
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
// sets if the matrix is definitely identity matrix
inline void matrixSIMD4::setDefinitelyIdentityMatrix( bool isDefinitelyIdentityMatrix)
{
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix = isDefinitelyIdentityMatrix;
#endif
}
// gets if the matrix is definitely identity matrix
inline bool matrixSIMD4::getDefinitelyIdentityMatrix() const
{
#if defined ( USE_MATRIX_TEST )
return definitelyIdentityMatrix;
#else
return false;
#endif
}
//! Compare two matrices using the equal method
inline bool matrixSIMD4::equals(const core::matrixSIMD4& other, const float tolerance) const
{
#if defined ( USE_MATRIX_TEST )
if (definitelyIdentityMatrix && other.definitelyIdentityMatrix)
return true;
#endif
for (s32 i = 0; i < 16; ++i)
if (!core::equals(M[i],other.M[i], tolerance))
return false;
return true;
}
// Multiply by scalar.
inline matrixSIMD4 operator*(const float scalar, const matrixSIMD4& mat)
{
return mat*scalar;
}*/
//! global const identity matrix
IRRLICHT_API extern const matrixSIMD4 IdentityMatrix;
} // end namespace core
} // end namespace irr
#endif
#endif