fast dsp multiply matrix 4x4 on ios and android

Discuss about anything related to the Irrlicht Engine, or read announcements about any significant features or usage changes.
Post Reply
feelthat
Posts: 194
Joined: Sat Feb 02, 2013 5:27 am

fast dsp multiply matrix 4x4 on ios and android

Post by feelthat »

//in matrix4.h
#if (TARGET_OS_IPHONE == 1)
#include <Accelerate/Accelerate.h>

#elif defined(ANDROID_NDK)
#include <arm_neon.h>

#endif


//! Set this matrix to the product of two other matrices, skipping identity checks.
// Writes the product straight into this matrix's 16-element storage M so that
// no temporary matrix is needed (the goal is to reduce stack use and copies).
//
// Treating each group of four consecutive elements as one column, every
// output element is
//     M[4*c + r] = sum over k of other_a.M[4*k + r] * other_b.M[4*c + k]
//
// \param other_a Left operand (read as m1 below).
// \param other_b Right operand (read as m2 below).
// \return Reference to this matrix, now holding the product.
//
// Aliasing: other_b may be *this on the NEON and scalar paths, because each
// column of other_b is read completely before the matching column of M is
// written. other_a must NOT be *this on the scalar path, since its elements
// are reread after M has been partially overwritten.
// NOTE(review): both SIMD paths hand T pointers to single-precision float
// APIs, so they presumably only build when T is float -- confirm no other
// instantiation is compiled on these platforms.
template <class T>
inline CMatrix4<T>& CMatrix4<T>::setbyproduct_nocheck(const CMatrix4<T>& other_a,const CMatrix4<T>& other_b )
{
    const T *m1 = other_a.M;
    const T *m2 = other_b.M;

#if (TARGET_OS_IPHONE == 1)
    // vDSP works on row-major operands, so the operands are passed swapped
    // (m2 first) to obtain the same element layout as the other two paths.
    // NOTE(review): confirm vDSP_mmul tolerates its output overlapping an
    // input before calling this with other_a or other_b being *this.
    vDSP_mmul( m2, 1, m1, 1, M, 1, 4, 4, 4 );

#elif defined(ANDROID_NDK)
    // Load all four columns of the left operand up front; they are reused
    // for every output column.
    const float32x4_t lhs0 = vld1q_f32(&m1[0]);
    const float32x4_t lhs1 = vld1q_f32(&m1[4]);
    const float32x4_t lhs2 = vld1q_f32(&m1[8]);
    const float32x4_t lhs3 = vld1q_f32(&m1[12]);

    for (int col = 0; col < 4; ++col)
    {
        // One column of the right operand supplies the four scalar weights.
        const float32x4_t rhs = vld1q_f32(&m2[4 * col]);

        const float32x4_t term0 = vmulq_f32(lhs0, vdupq_n_f32(vgetq_lane_f32(rhs, 0)));
        const float32x4_t term1 = vmulq_f32(lhs1, vdupq_n_f32(vgetq_lane_f32(rhs, 1)));
        const float32x4_t term2 = vmulq_f32(lhs2, vdupq_n_f32(vgetq_lane_f32(rhs, 2)));
        const float32x4_t term3 = vmulq_f32(lhs3, vdupq_n_f32(vgetq_lane_f32(rhs, 3)));

        // Pairwise summation: (term0 + term1) + (term2 + term3).
        vst1q_f32(&M[4 * col], vaddq_f32(vaddq_f32(term0, term1), vaddq_f32(term2, term3)));
    }

#else
    // Portable scalar fallback, one output column per outer iteration.
    for (int col = 0; col < 4; ++col)
    {
        const int base = 4 * col;

        // Copy this column of the right operand before storing anything, so
        // the result stays correct when other_b is this very matrix.
        const T b0 = m2[base];
        const T b1 = m2[base + 1];
        const T b2 = m2[base + 2];
        const T b3 = m2[base + 3];

        for (int row = 0; row < 4; ++row)
        {
            M[base + row] = m1[row]*b0 + m1[4 + row]*b1 + m1[8 + row]*b2 + m1[12 + row]*b3;
        }
    }
#endif

#if defined ( USE_MATRIX_TEST )
    // The product was computed without any identity test, so clear the flag.
    definitelyIdentityMatrix=false;
#endif
    return *this;
}
Last edited by feelthat on Sun Feb 15, 2015 11:30 am, edited 2 times in total.
feelthat
Posts: 194
Joined: Sat Feb 02, 2013 5:27 am

Re: fast dsp multiply matrix on ios and android

Post by feelthat »

//! Multiply this matrix by another one and return the product as a new matrix.
// Treating each group of four consecutive elements as one column, every
// element of the result is
//     result[4*c + r] = sum over k of M[4*k + r] * m2[4*c + k]
//
// \param m2 Right operand.
// \return The product, built in a fresh matrix; neither operand is modified.
//
// NOTE(review): both SIMD paths hand T pointers to single-precision float
// APIs, so they presumably only build when T is float -- confirm no other
// instantiation is compiled on these platforms.
template <class T>
inline CMatrix4<T> CMatrix4<T>::operator*(const CMatrix4<T>& m2) const
{
#if defined ( USE_MATRIX_TEST )
    // Testing purpose: multiplying by an identity matrix returns the other
    // operand unchanged.
    if ( this->isIdentity() )
        return m2;
    if ( m2.isIdentity() )
        return *this;
#endif

    // Every element is overwritten below, so the result is constructed with
    // EM4CONST_NOTHING.
    CMatrix4<T> m3 ( EM4CONST_NOTHING );

    const T* m1 = M;

#if (TARGET_OS_IPHONE == 1)
    // vDSP works on row-major operands, so the operands are passed swapped
    // (m2 first) to obtain the same element layout as the other two paths.
    vDSP_mmul( m2.M, 1, m1, 1, m3.M, 1, 4, 4, 4 );

#elif defined(ANDROID_NDK)
    // Load all four columns of this matrix up front; they are reused for
    // every output column.
    const float32x4_t lhs0 = vld1q_f32(&m1[0]);
    const float32x4_t lhs1 = vld1q_f32(&m1[4]);
    const float32x4_t lhs2 = vld1q_f32(&m1[8]);
    const float32x4_t lhs3 = vld1q_f32(&m1[12]);

    for (int col = 0; col < 4; ++col)
    {
        // One column of the right operand supplies the four scalar weights.
        const float32x4_t rhs = vld1q_f32(&m2.M[4 * col]);

        const float32x4_t term0 = vmulq_f32(lhs0, vdupq_n_f32(vgetq_lane_f32(rhs, 0)));
        const float32x4_t term1 = vmulq_f32(lhs1, vdupq_n_f32(vgetq_lane_f32(rhs, 1)));
        const float32x4_t term2 = vmulq_f32(lhs2, vdupq_n_f32(vgetq_lane_f32(rhs, 2)));
        const float32x4_t term3 = vmulq_f32(lhs3, vdupq_n_f32(vgetq_lane_f32(rhs, 3)));

        // Pairwise summation: (term0 + term1) + (term2 + term3).
        vst1q_f32(&m3.M[4 * col], vaddq_f32(vaddq_f32(term0, term1), vaddq_f32(term2, term3)));
    }

#else
    // Portable scalar fallback, one output column per outer iteration.
    for (int col = 0; col < 4; ++col)
    {
        const int base = 4 * col;
        for (int row = 0; row < 4; ++row)
        {
            m3[base + row] = m1[row]*m2[base] + m1[4 + row]*m2[base + 1] + m1[8 + row]*m2[base + 2] + m1[12 + row]*m2[base + 3];
        }
    }
#endif

    return m3;
}
feelthat
Posts: 194
Joined: Sat Feb 02, 2013 5:27 am

Re: fast dsp multiply matrix on ios and android

Post by feelthat »

Android.mk

Add these three options to both LOCAL_CFLAGS and LOCAL_CPPFLAGS:

-DHAVE_NEON=1
-mfpu=neon
-mfloat-abi=softfp
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: fast dsp multiply matrix 4x4 on ios and android

Post by devsh »

I need to finish and release my SIMD vectors and matrices, then you could do the non x86 code?
feelthat
Posts: 194
Joined: Sat Feb 02, 2013 5:27 am

Re: fast dsp multiply matrix 4x4 on ios and android

Post by feelthat »

ARM is non-x86.
Look:

#elif defined(ANDROID_NDK)
#include <arm_neon.h>
devsh wrote:I need to finish and release my SIMD vectors and matrices, then you could do the non x86 code?
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: fast dsp multiply matrix 4x4 on ios and android

Post by devsh »

look at the comma, what does that imply about the structure of the sentence?
devsh wrote:I need to finish and release my SIMD vectors and matrices, then you could do the non x86 code?
Post Reply