https://github.com/buildaworldnet/Irrli ... er/include
EVERY CLASS IN THIS THREAD HAS AT LEAST 1 BUG!
For now:
-- the vectorized 4d float class (with the exception of some functions to do with rotation)
-- the vectorized boolean class, implemented only for 4- and 2-component-wide vectors
-- a fixed-up irrMath.h
-- an aligned-memory irrAllocator
-- aligned malloc/free wrappers
Please test!!!
If you have suggestions for functions, such as reflect(), feel free to contribute (see the sketch just below).
***ALSO FEEL FREE TO CONTRIBUTE SOURCE CODE***
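To get the ball rolling on reflect(), here is a minimal SSE2-only sketch working on raw __m128 registers; the function name and signature are just an assumption, the real vectorSIMDf class in the repository may expose this differently. The formula is reflect(I, N) = I - 2 * dot(N, I) * N.
Code:
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2

// Hypothetical free function; the real vectorSIMDf API may differ.
inline __m128 reflect_sse2(__m128 I, __m128 N)
{
    // dot(N, I) replicated into all four lanes (SSE2-only horizontal add)
    __m128 prod = _mm_mul_ps(N, I);
    __m128 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); // (y, x, w, z)
    __m128 sums = _mm_add_ps(prod, shuf);                              // (x+y, x+y, z+w, z+w)
    shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2));        // (z+w, z+w, x+y, x+y)
    __m128 dot  = _mm_add_ps(sums, shuf);                              // dot in every lane
    // I - 2 * dot(N, I) * N
    return _mm_sub_ps(I, _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(2.f), dot), N));
}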
UPDATE 5/09/2014:
-- added support for adapting to the SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 feature sets (only in some places; still need to figure out where this happens)
-- added initialization of 2d and 3d vectors as 4d SIMDs with zeros in the tail
-- added bitwise operations on the 4df SIMD vector
-- added "makeSafeNd()" functions for zeroing out the last components
-- added GLSL-style member names (lower-case x,y,z,w and r,g,b,a and s,t,p,q) (a layout sketch for these and makeSafeNd() follows after this list)
-- removed SSSE3 requirement in favour of SSE3
-- implemented one of the rotate functions with SSE2 (no SSE3 needed)
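For anyone wondering what the GLSL-style member names and makeSafeNd() are about, here is a rough layout sketch; the struct name, the anonymous-struct-in-union trick and the exact function name are illustrative only, the real class is in vectorSIMD.h.
Code:
#include <emmintrin.h> // SSE2

// Layout sketch only -- relies on the common anonymous-struct-in-union
// compiler extension; the real vectorSIMDf may differ.
struct vec4_sketch
{
    union
    {
        struct { float x, y, z, w; }; // position-style names
        struct { float r, g, b, a; }; // colour-style names
        struct { float s, t, p, q; }; // texture-coordinate-style names
        float v[4];
    };

    // "makeSafe3d" idea: zero the 4th component so a 3d vector stored in a
    // 4-wide register cannot leak garbage into dot products, lengths, etc.
    void makeSafe3d()
    {
        const __m128 mask = _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1));
        _mm_storeu_ps(v, _mm_and_ps(_mm_loadu_ps(v), mask));
    }
};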
UPDATE 5/09/2014:
-- implemented all of the rotate functions (with and without center); only one function left to implement
-- removed inline constructors
-- caught a fatal bug (not returning the value of _mm_load_ps() in getAsRegister()) (illustrated after this list)
-- added overloaded new and delete operators (so you can use new/new[] and delete/delete[] without worrying about 16-byte alignment) // currently commented out
-- fixed up irrMath.h to no longer use inline SSE assembly
-- implemented all functions of irrMath for floats for vectorSIMDf (except for floor and fract; look at target B)
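To illustrate the getAsRegister() bug mentioned above: the load was performed but its result was never returned. The signature below is a guess, see vectorSIMD.h for the real member function.
Code:
#include <xmmintrin.h> // SSE

// Illustration only -- the real getAsRegister() is a member of vectorSIMDf.
inline __m128 getAsRegister(const float* components) // must be 16-byte aligned
{
    // the fatal version was effectively:
    //     _mm_load_ps(components);   // loaded, but the result was discarded
    // the fix is simply to return the loaded register:
    return _mm_load_ps(components);
}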
UPDATE 30/04/2014:
-- implemented the floor and fract functions for vectorSIMDf (an SSE2 sketch follows after this note)
-- added a few more functions to vectorSIMDf
-- implemented a matrixSIMDf class with basic functionality (without the matrix-building functions)
NOTE: THE MATRIX CLASS IS NOT A DROP-IN REPLACEMENT OR, IN FACT, ANY SORT OF COMPATIBLE REPLACEMENT FOR IRRLICHT'S CMatrix4<>/matrix4.
THE ORDER OF THE ELEMENTS OF THE MATRIX IN MEMORY IS COMPLETELY DIFFERENT AND THE MATRICES MULTIPLY THE PROPER WAY AROUND.
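For reference, floor() and fract() can be done with nothing above SSE2 roughly like this; this is only a sketch of the technique, not necessarily the exact code in vectorSIMD.h (with SSE4.1 you would just use _mm_floor_ps).
Code:
#include <emmintrin.h> // SSE2

// SSE2-only floor()/fract() sketch (valid for inputs that fit in a 32-bit int).
inline __m128 floor_ps_sse2(__m128 x)
{
    __m128 trunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(x)); // round toward zero
    // where truncation rounded up (negative, non-integral input), subtract 1
    __m128 roundedUp = _mm_cmpgt_ps(trunc, x);           // all-ones mask per lane
    return _mm_sub_ps(trunc, _mm_and_ps(roundedUp, _mm_set1_ps(1.f)));
}

inline __m128 fract_ps_sse2(__m128 x)
{
    return _mm_sub_ps(x, floor_ps_sse2(x)); // GLSL: fract(x) = x - floor(x)
}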
Targets:
A) add runtime or compile-time (but strict) swizzle support for 8- and 16-component vectors
B) implement all the GLSL functions with 4df vectors (a few starter sketches follow after this list)
C) implement some spherical-coordinate functions in vectorSIMDf
E) implement 32- and 16-bit vectors of signed and unsigned integers
F) make conversion functions between all of the types
G) implement more of the fancier functions from GLSL, such as pow(), exp2() etc. (see http://gruntthepeon.free.fr/ssemath/)
H) implement all the basic types of irrlicht in SIMD vectors
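As a starting point for target B, a few of the simpler GLSL functions map almost directly onto SSE intrinsics; the free-function names and signatures below are only a sketch, the real implementations would be members of vectorSIMDf.
Code:
#include <xmmintrin.h> // SSE

// Hypothetical free functions on raw __m128; names follow GLSL.
inline __m128 clamp_ps(__m128 x, __m128 lo, __m128 hi)
{
    return _mm_min_ps(_mm_max_ps(x, lo), hi);              // clamp(x, lo, hi)
}

inline __m128 mix_ps(__m128 a, __m128 b, __m128 t)
{
    return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), t)); // mix(a,b,t) = a + (b-a)*t
}

inline __m128 step_ps(__m128 edge, __m128 x)
{
    // step(edge, x) = 0 where x < edge, 1 elsewhere
    return _mm_and_ps(_mm_cmpge_ps(x, edge), _mm_set1_ps(1.f));
}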
All code provided is under the Irrlicht license (please attribute me and BaW).
As I progress, I'll add more code and update the listings.
For the fixed-up irrMath.h, follow the link: http://irrlicht.sourceforge.net/forum/v ... 02#p289502
Minor Changes:
Code:
Index: irrlicht/source/Irrlicht/Irrlicht.cpp
===================================================================
namespace core
{
- const matrix4 IdentityMatrix(matrix4::EM4CONST_IDENTITY);
+ const matrix4 IdentityMatrix(matrix4::EM4CONST_IDENTITY);
+#ifdef __IRR_COMPILE_WITH_X86_SIMD_
+ //const matrixSIMD4 IdentityMatrix(matrix4::EM4CONST_IDENTITY);
+#endif
irr::core::stringc LOCALE_DECIMAL_POINTS(".");
}
Index: irrlicht/include/irrlicht.h
===================================================================
@@ -158,7 +158,8 @@
#include "Keycodes.h"
#include "line2d.h"
#include "line3d.h"
-#include "matrix4.h"
+#include "matrix4.h"
+#include "matrixSIMD4.h"
#include "plane3d.h"
#include "position2d.h"
#include "quaternion.h"
@@ -183,7 +184,8 @@
#include "SViewFrustum.h"
#include "triangle3d.h"
#include "vector2d.h"
-#include "vector3d.h"
+#include "vector3d.h"
+#include "vectorSIMD.h"
Index: irrlicht/include/IrrCompileConfig.h
===================================================================
@@ -13,6 +13,40 @@
// it undefined
//#define IRRLICHT_VERSION_SVN -alpha
#define IRRLICHT_SDK_VERSION "1.8.1-baw"
+
+#define __IRR_COMPILE_WITH_X86_SIMD_
+
+#ifdef __IRR_COMPILE_WITH_X86_SIMD_
+#define __IRR_COMPILE_WITH_SSE2
+#define __IRR_COMPILE_WITH_SSE3
+
+#include <immintrin.h>
+
+#ifdef __SSE2__
+#define __IRR_COMPILE_WITH_SSE2
+#endif
+
+#ifdef __SSE3__
+#define __IRR_COMPILE_WITH_SSE3
+#endif
+
+#ifdef __SSE4_1__
+#define __IRR_COMPILE_WITH_SSE4_1
+#endif
+
+#ifdef __AVX__
+#define __IRR_COMPILE_WITH_AVX
+#endif
+
+
+
+#ifdef __IRR_COMPILE_WITH_AVX
+#define SIMD_ALIGNMENT 32
+#else
+#define SIMD_ALIGNMENT 16
+#endif // __IRR_COMPILE_WITH_AVX
+
+#endif
#include <stdio.h> // TODO: Although included elsewhere this is required at least for mingw
@@ -673,7 +707,7 @@
precision will be lower but speed higher. currently X86 only
*/
#if !defined(_IRR_OSX_PLATFORM_) && !defined(_IRR_SOLARIS_PLATFORM_)
- //#define IRRLICHT_FAST_MATH
+// #define IRRLICHT_FAST_MATH
#ifdef NO_IRRLICHT_FAST_MATH
#undef IRRLICHT_FAST_MATH
#endif
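The point of the __IRR_COMPILE_WITH_* macros above is to let the vector code pick the best instruction the build actually allows. A hypothetical dot product, for example, might branch like this (illustrative only; vectorSIMD.h may choose its paths differently).
Code:
#include "IrrCompileConfig.h" // defines the macros and pulls in <immintrin.h>

// Hypothetical dot product choosing the widest available instruction set.
inline __m128 dot4_ps(__m128 a, __m128 b)
{
#if defined(__IRR_COMPILE_WITH_SSE4_1)
    return _mm_dp_ps(a, b, 0xFF);                 // one dot-product instruction
#elif defined(__IRR_COMPILE_WITH_SSE3)
    __m128 p = _mm_mul_ps(a, b);
    p = _mm_hadd_ps(p, p);                        // (x+y, z+w, x+y, z+w)
    return _mm_hadd_ps(p, p);                     // full dot in every lane
#else // plain SSE2 fallback: shuffle + add
    __m128 p = _mm_mul_ps(a, b);
    __m128 s = _mm_add_ps(p, _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_add_ps(s, _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 0, 3, 2)));
#endif
}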
irrAllocator.h
Code:
// Copyright (C) 2002-2012 Nikolaus Gebhardt
// This file is part of the "Irrlicht Engine" and the "irrXML" project.
// For conditions of distribution and use, see copyright notice in irrlicht.h and irrXML.h
#ifndef __IRR_ALLOCATOR_H_INCLUDED__
#define __IRR_ALLOCATOR_H_INCLUDED__
#include "irrTypes.h"
#include <new>
// necessary for older compilers
#include <memory.h>
namespace irr
{
namespace core
{
#ifdef DEBUG_CLIENTBLOCK
#undef DEBUG_CLIENTBLOCK
#define DEBUG_CLIENTBLOCK new
#endif
//! Very simple allocator implementation, containers using it can be used across dll boundaries
#ifdef __AVX__
template <typename T, std::size_t Alignment=32>
#else
template <typename T, std::size_t Alignment=16>
#endif
class irrAllocator
{
public:

    //! Destructor
    virtual ~irrAllocator() {}

    //! Allocate memory for an array of objects
    T* allocate(size_t cnt)
    {
        return (T*)internal_new(cnt*sizeof(T));
    }

    //! Deallocate memory for an array of objects
    void deallocate(T* ptr)
    {
        internal_delete(ptr);
    }

    //! Construct an element
    void construct(T* ptr, const T&e)
    {
        new ((void*)ptr) T(e);
    }

    //! Destruct an element
    void destruct(T* ptr)
    {
        ptr->~T();
    }

protected:

#ifdef __IRR_COMPILE_WITH_X86_SIMD_
    virtual void* internal_new(size_t cnt)
    {
        void* memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
        memoryallocatedaligned = _aligned_malloc(cnt,Alignment);
#else
        posix_memalign((void**)&memoryallocatedaligned,Alignment,cnt);
#endif
        return memoryallocatedaligned;
    }

    virtual void internal_delete(void* ptr)
    {
#ifdef _IRR_WINDOWS_
        _aligned_free(ptr);
#else
        free(ptr);
#endif
    }
#else
    virtual void* internal_new(size_t cnt)
    {
        return operator new(cnt);
    }

    virtual void internal_delete(void* ptr)
    {
        operator delete(ptr);
    }
#endif
};
//! Fast allocator, only to be used in containers inside the same memory heap.
/** Containers using it are NOT able to be used across dll boundaries. Use this
when using it in an internal class or function, or when compiled into a static lib */
#ifdef __AVX__
template <typename T, std::size_t Alignment=32>
#else
template <typename T, std::size_t Alignment=16>
#endif
class irrAllocatorFast
{
public:

#ifdef __IRR_COMPILE_WITH_X86_SIMD_
    //! Allocate memory for an array of objects
    T* allocate(size_t cnt)
    {
        cnt *= sizeof(T);
        T* memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
        memoryallocatedaligned = (T*)_aligned_malloc(cnt,Alignment);
#else
        posix_memalign((void**)&memoryallocatedaligned,Alignment,cnt);
#endif
        return memoryallocatedaligned;
    }

    //! Deallocate memory for an array of objects
    void deallocate(T* ptr)
    {
#ifdef _IRR_WINDOWS_
        _aligned_free(ptr);
#else
        free(ptr);
#endif
    }
#else
    //! Allocate memory for an array of objects
    T* allocate(size_t cnt)
    {
        return (T*)operator new(cnt*sizeof(T));
    }

    //! Deallocate memory for an array of objects
    void deallocate(T* ptr)
    {
        operator delete(ptr);
    }
#endif // __IRR_COMPILE_WITH_X86_SIMD_

    //! Construct an element
    void construct(T* ptr, const T&e)
    {
        new ((void*)ptr) T(e);
    }

    //! Destruct an element
    void destruct(T* ptr)
    {
        ptr->~T();
    }
};
#ifdef DEBUG_CLIENTBLOCK
#undef DEBUG_CLIENTBLOCK
#define DEBUG_CLIENTBLOCK new( _CLIENT_BLOCK, __FILE__, __LINE__)
#endif
//! defines an allocation strategy
enum eAllocStrategy
{
    ALLOC_STRATEGY_SAFE   = 0,
    ALLOC_STRATEGY_DOUBLE = 1,
    ALLOC_STRATEGY_SQRT   = 2
};
} // end namespace core
} // end namespace irr
#endif
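Typical use of the allocator above: the returned block satisfies the template Alignment, so it is safe to hit it with aligned loads such as _mm_load_ps(). The function below is just a usage example, not part of the header.
Code:
#include "irrAllocator.h"

// Not part of the header -- just shows the intended call sequence.
void allocatorExample()
{
    irr::core::irrAllocator<float> alloc;

    float* data = alloc.allocate(8);       // 8 floats in one aligned block
    for (irr::u32 i=0; i<8; ++i)
        alloc.construct(data+i, 0.f);      // placement-construct each element

    // data is SIMD_ALIGNMENT-aligned, so _mm_load_ps(data) is safe here

    for (irr::u32 i=0; i<8; ++i)
        alloc.destruct(data+i);
    alloc.deallocate(data);
}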
Aligned malloc wrappers
Code:
// aligned cross-platform malloc
inline void* FW_malloc_align(size_t inNumBytes, size_t alignment)
{
    void* memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
    memoryallocatedaligned = _aligned_malloc(inNumBytes,alignment);
#else
    posix_memalign((void**)&memoryallocatedaligned,alignment,inNumBytes);
#endif
    return memoryallocatedaligned;
}

// aligned cross-platform free
inline void FW_free_align(void* alignedMemoryBlock)
{
#ifdef _IRR_WINDOWS_
    _aligned_free(alignedMemoryBlock);
#else
    free(alignedMemoryBlock);
#endif
}
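And a quick sanity-check example for the wrappers (not part of the patch, just showing the intended pairing of FW_malloc_align() with FW_free_align()):
Code:
#include <assert.h>
#include <stdint.h>

// Usage example only.
void alignedMallocExample()
{
    void* block = FW_malloc_align(64*sizeof(float), 16);
    assert(((uintptr_t)block & 15) == 0);  // 16-byte aligned as requested
    // ... fill with aligned SSE stores, hand to vectorSIMDf code, etc. ...
    FW_free_align(block);                  // always release through the matching wrapper
}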
irrMath.h changes:
Code:
Index: irrlicht/include/irrMath.h
===================================================================
@@ -120,7 +120,7 @@
//! returns minimum of two values. Own implementation to get rid of the STL (VS6 problems)
template<class T>
- inline const T& min_(const T& a, const T& b)
+ inline T min_(const T& a, const T& b)
{
return a < b ? a : b;
}
@@ -134,7 +134,7 @@
//! returns maximum of two values. Own implementation to get rid of the STL (VS6 problems)
template<class T>
- inline const T& max_(const T& a, const T& b)
+ inline T max_(const T& a, const T& b)
{
return a < b ? b : a;
}
@@ -453,18 +453,6 @@
REALINLINE void clearFPUException ()
{
-#ifdef IRRLICHT_FAST_MATH
- return;
-#ifdef feclearexcept
- feclearexcept(FE_ALL_EXCEPT);
-#elif defined(_MSC_VER)
- __asm fnclex;
-#elif defined(__GNUC__) && defined(__x86__)
- __asm__ __volatile__ ("fclex \n\t");
-#else
-# warn clearFPUException not supported.
-#endif
-#endif
}
// calculate: sqrt ( x )
@@ -496,30 +484,23 @@
// calculate: 1 / sqrt ( x )
REALINLINE f64 reciprocal_squareroot(const f64 x)
{
+#if defined ( IRRLICHT_FAST_MATH )
+ double result = 1.0 / sqrt(x);
+ //! pending perf test
+ //_mm_store_sd(&result,_mm_div_sd(_mm_set_pd(0.0,1.0),_mm_sqrt_sd(_mm_load_sd(&x))));
+ return result;
+#else // no fast math
return 1.0 / sqrt(x);
+#endif
}
// calculate: 1 / sqrtf ( x )
REALINLINE f32 reciprocal_squareroot(const f32 f)
{
-#if defined ( IRRLICHT_FAST_MATH )
- #if defined(_MSC_VER)
- // SSE reciprocal square root estimate, accurate to 12 significant
- // bits of the mantissa
- f32 recsqrt;
- __asm rsqrtss xmm0, f // xmm0 = rsqrtss(f)
- __asm movss recsqrt, xmm0 // return xmm0
- return recsqrt;
-
-/*
- // comes from Nvidia
- u32 tmp = (u32(IEEE_1_0 << 1) + IEEE_1_0 - *(u32*)&x) >> 1;
- f32 y = *(f32*)&tmp;
- return y * (1.47f - 0.47f * x * y * y);
-*/
- #else
- return 1.f / sqrtf(f);
- #endif
+#if defined ( IRRLICHT_FAST_MATH ) && defined ( __IRR_COMPILE_WITH_SSE2 )
+ float result;
+ _mm_store_ss(&result,_mm_rsqrt_ps(_mm_load_ss(&f)));
+ return result;
#else // no fast math
return 1.f / sqrtf(f);
#endif
@@ -534,31 +515,10 @@
// calculate: 1 / x
REALINLINE f32 reciprocal( const f32 f )
{
-#if defined (IRRLICHT_FAST_MATH)
-
- // SSE Newton-Raphson reciprocal estimate, accurate to 23 significant
- // bi ts of the mantissa
- // One Newtown-Raphson Iteration:
- // f(i+1) = 2 * rcpss(f) - f * rcpss(f) * rcpss(f)
- f32 rec;
- __asm rcpss xmm0, f // xmm0 = rcpss(f)
- __asm movss xmm1, f // xmm1 = f
- __asm mulss xmm1, xmm0 // xmm1 = f * rcpss(f)
- __asm mulss xmm1, xmm0 // xmm2 = f * rcpss(f) * rcpss(f)
- __asm addss xmm0, xmm0 // xmm0 = 2 * rcpss(f)
- __asm subss xmm0, xmm1 // xmm0 = 2 * rcpss(f)
- // - f * rcpss(f) * rcpss(f)
- __asm movss rec, xmm0 // return xmm0
- return rec;
-
-
- //! i do not divide through 0.. (fpu expection)
- // instead set f to a high value to get a return value near zero..
- // -1000000000000.f.. is use minus to stay negative..
- // must test's here (plane.normal dot anything ) checks on <= 0.f
- //u32 x = (-(AIR(f) != 0 ) >> 31 ) & ( IR(f) ^ 0xd368d4a5 ) ^ 0xd368d4a5;
- //return 1.f / FR ( x );
-
+#if defined (IRRLICHT_FAST_MATH) && defined ( __IRR_COMPILE_WITH_SSE2 )
+ float result;
+ _mm_store_ss(&result,_mm_rcp_ps(_mm_load_ss(&f)));
+ return result;
#else // no fast math
return 1.f / f;
#endif
@@ -573,106 +533,21 @@
// calculate: 1 / x, low precision allowed
REALINLINE f32 reciprocal_approxim ( const f32 f )
- {
-#if defined( IRRLICHT_FAST_MATH)
+ {
+ //what was here before was not faster
+ return reciprocal(f);
+ }
- // SSE Newton-Raphson reciprocal estimate, accurate to 23 significant
- // bi ts of the mantissa
- // One Newtown-Raphson Iteration:
- // f(i+1) = 2 * rcpss(f) - f * rcpss(f) * rcpss(f)
- f32 rec;
- __asm rcpss xmm0, f // xmm0 = rcpss(f)
- __asm movss xmm1, f // xmm1 = f
- __asm mulss xmm1, xmm0 // xmm1 = f * rcpss(f)
- __asm mulss xmm1, xmm0 // xmm2 = f * rcpss(f) * rcpss(f)
- __asm addss xmm0, xmm0 // xmm0 = 2 * rcpss(f)
- __asm subss xmm0, xmm1 // xmm0 = 2 * rcpss(f)
- // - f * rcpss(f) * rcpss(f)
- __asm movss rec, xmm0 // return xmm0
- return rec;
-
-/*
- // SSE reciprocal estimate, accurate to 12 significant bits of
- f32 rec;
- __asm rcpss xmm0, f // xmm0 = rcpss(f)
- __asm movss rec , xmm0 // return xmm0
- return rec;
-*/
-/*
- register u32 x = 0x7F000000 - IR ( p );
- const f32 r = FR ( x );
- return r * (2.0f - p * r);
-*/
-#else // no fast math
- return 1.f / f;
-#endif
- }
-
-
REALINLINE s32 floor32(f32 x)
{
-#ifdef IRRLICHT_FAST_MATH
- const f32 h = 0.5f;
-
- s32 t;
-
-#if defined(_MSC_VER)
- __asm
- {
- fld x
- fsub h
- fistp t
- }
-#elif defined(__GNUC__)
- __asm__ __volatile__ (
- "fsub %2 \n\t"
- "fistpl %0"
- : "=m" (t)
- : "t" (x), "f" (h)
- : "st"
- );
-#else
-# warn IRRLICHT_FAST_MATH not supported.
return (s32) floorf ( x );
-#endif
- return t;
-#else // no fast math
- return (s32) floorf ( x );
-#endif
}
REALINLINE s32 ceil32 ( f32 x )
{
-#ifdef IRRLICHT_FAST_MATH
- const f32 h = 0.5f;
-
- s32 t;
-
-#if defined(_MSC_VER)
- __asm
- {
- fld x
- fadd h
- fistp t
- }
-#elif defined(__GNUC__)
- __asm__ __volatile__ (
- "fadd %2 \n\t"
- "fistpl %0 \n\t"
- : "=m"(t)
- : "t"(x), "f"(h)
- : "st"
- );
-#else
-# warn IRRLICHT_FAST_MATH not supported.
return (s32) ceilf ( x );
-#endif
- return t;
-#else // not fast math
- return (s32) ceilf ( x );
-#endif
}
@@ -679,30 +554,7 @@
REALINLINE s32 round32(f32 x)
{
-#if defined(IRRLICHT_FAST_MATH)
- s32 t;
-
-#if defined(_MSC_VER)
- __asm
- {
- fld x
- fistp t
- }
-#elif defined(__GNUC__)
- __asm__ __volatile__ (
- "fistpl %0 \n\t"
- : "=m"(t)
- : "t"(x)
- : "st"
- );
-#else
-# warn IRRLICHT_FAST_MATH not supported.
return (s32) round_(x);
-#endif
- return t;
-#else // no fast math
- return (s32) round_(x);
-#endif
}
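One caveat about the irrMath.h changes above: the removed inline assembly for reciprocal() performed one Newton-Raphson refinement on top of rcpss (roughly 23 mantissa bits), while a bare _mm_rcp_ps() is only good for about 12 bits. If that precision ever turns out to matter, the refinement step can be added back with intrinsics along these lines (untested sketch):
Code:
#include <xmmintrin.h> // SSE

// Optional sketch: one Newton-Raphson step on top of the hardware estimate,
// i.e. r1 = r0 * (2 - f*r0), matching the comment in the removed assembly.
inline float reciprocal_refined(const float f)
{
    __m128 x = _mm_load_ss(&f);
    __m128 r = _mm_rcp_ss(x);                                         // ~12-bit estimate
    r = _mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.f), _mm_mul_ss(x, r))); // ~23 bits after one step
    float result;
    _mm_store_ss(&result, r);
    return result;
}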
LIST OF NEW FILES:
include/SIMDswizzle.h : http://irrlicht.sourceforge.net/forum/v ... 03#p293603
include/matrixSIMD4.h : http://irrlicht.sourceforge.net/forum/v ... 04#p293604
include/vectorSIMD.h : http://irrlicht.sourceforge.net/forum/v ... 00#p293600