//in matrix4.h
#if (TARGET_OS_IPHONE == 1) // defined via <TargetConditionals.h> on Apple platforms
#include <Accelerate/Accelerate.h>
#elif defined(ANDROID_NDK)
#include <arm_neon.h>
#endif
//! Multiply by another matrix.
// Sets this matrix to the product of two other matrices; the point of
// this variant is to avoid a temporary copy and extra stack traffic.
template <class T>
inline CMatrix4<T>& CMatrix4<T>::setbyproduct_nocheck(const CMatrix4<T>& other_a,const CMatrix4<T>& other_b )
{
const T *m1 = other_a.M;
const T *m2 = other_b.M;
#if (TARGET_OS_IPHONE == 1)
// vDSP_mmul computes a row-major product C = A*B, so the operands are
// passed as (m2, m1) to match the scalar path below.
vDSP_mmul( m2, 1, m1, 1, M, 1, 4, 4, 4 );
#elif defined(ANDROID_NDK)
// NEON path: only valid when T is a 32-bit float.
float32x4_t x0,x1,x2,x3; // the four 4-float rows of m1
float32x4_t y0,y1,y2,y3; // the four 4-float rows of m2
x0 = vld1q_f32(&m1[0]);
x1 = vld1q_f32(&m1[4]);
x2 = vld1q_f32(&m1[8]);
x3 = vld1q_f32(&m1[12]);
y0 = vld1q_f32(&m2[0]);
y1 = vld1q_f32(&m2[4]);
y2 = vld1q_f32(&m2[8]);
y3 = vld1q_f32(&m2[12]);
// Each result row is a linear combination of m1's rows, weighted by the
// corresponding row of m2.
const float32x4_t A0 = vmulq_f32(x0, vdupq_n_f32(vgetq_lane_f32(y0,0)));
const float32x4_t B0 = vmulq_f32(x1, vdupq_n_f32(vgetq_lane_f32(y0,1)));
const float32x4_t C0 = vmulq_f32(x2, vdupq_n_f32(vgetq_lane_f32(y0,2)));
const float32x4_t D0 = vmulq_f32(x3, vdupq_n_f32(vgetq_lane_f32(y0,3)));
const float32x4_t _R0 = vaddq_f32( vaddq_f32(A0 , B0), vaddq_f32(C0, D0) );
const float32x4_t A1 = vmulq_f32(x0, vdupq_n_f32(vgetq_lane_f32(y1,0)));
const float32x4_t B1 = vmulq_f32(x1, vdupq_n_f32(vgetq_lane_f32(y1,1)));
const float32x4_t C1 = vmulq_f32(x2, vdupq_n_f32(vgetq_lane_f32(y1,2)));
const float32x4_t D1 = vmulq_f32(x3, vdupq_n_f32(vgetq_lane_f32(y1,3)));
const float32x4_t _R1 = vaddq_f32( vaddq_f32(A1 , B1), vaddq_f32(C1 , D1) );
const float32x4_t A2 = vmulq_f32(x0, vdupq_n_f32(vgetq_lane_f32(y2,0)));
const float32x4_t B2 = vmulq_f32(x1, vdupq_n_f32(vgetq_lane_f32(y2,1)));
const float32x4_t C2 = vmulq_f32(x2, vdupq_n_f32(vgetq_lane_f32(y2,2)));
const float32x4_t D2 = vmulq_f32(x3, vdupq_n_f32(vgetq_lane_f32(y2,3)));
const float32x4_t _R2 = vaddq_f32( vaddq_f32(A2 , B2), vaddq_f32(C2 , D2) );
const float32x4_t A3 = vmulq_f32(x0, vdupq_n_f32(vgetq_lane_f32(y3,0)));
const float32x4_t B3 = vmulq_f32(x1, vdupq_n_f32(vgetq_lane_f32(y3,1)));
const float32x4_t C3 = vmulq_f32(x2, vdupq_n_f32(vgetq_lane_f32(y3,2)));
const float32x4_t D3 = vmulq_f32(x3, vdupq_n_f32(vgetq_lane_f32(y3,3)));
const float32x4_t _R3 = vaddq_f32( vaddq_f32(A3 , B3), vaddq_f32(C3, D3) );
vst1q_f32(&M[0], _R0);
vst1q_f32(&M[4], _R1);
vst1q_f32(&M[8], _R2);
vst1q_f32(&M[12], _R3);
#else
M[0] = m1[0]*m2[0] + m1[4]*m2[1] + m1[8]*m2[2] + m1[12]*m2[3];
M[1] = m1[1]*m2[0] + m1[5]*m2[1] + m1[9]*m2[2] + m1[13]*m2[3];
M[2] = m1[2]*m2[0] + m1[6]*m2[1] + m1[10]*m2[2] + m1[14]*m2[3];
M[3] = m1[3]*m2[0] + m1[7]*m2[1] + m1[11]*m2[2] + m1[15]*m2[3];
M[4] = m1[0]*m2[4] + m1[4]*m2[5] + m1[8]*m2[6] + m1[12]*m2[7];
M[5] = m1[1]*m2[4] + m1[5]*m2[5] + m1[9]*m2[6] + m1[13]*m2[7];
M[6] = m1[2]*m2[4] + m1[6]*m2[5] + m1[10]*m2[6] + m1[14]*m2[7];
M[7] = m1[3]*m2[4] + m1[7]*m2[5] + m1[11]*m2[6] + m1[15]*m2[7];
M[8] = m1[0]*m2[8] + m1[4]*m2[9] + m1[8]*m2[10] + m1[12]*m2[11];
M[9] = m1[1]*m2[8] + m1[5]*m2[9] + m1[9]*m2[10] + m1[13]*m2[11];
M[10] = m1[2]*m2[8] + m1[6]*m2[9] + m1[10]*m2[10] + m1[14]*m2[11];
M[11] = m1[3]*m2[8] + m1[7]*m2[9] + m1[11]*m2[10] + m1[15]*m2[11];
M[12] = m1[0]*m2[12] + m1[4]*m2[13] + m1[8]*m2[14] + m1[12]*m2[15];
M[13] = m1[1]*m2[12] + m1[5]*m2[13] + m1[9]*m2[14] + m1[13]*m2[15];
M[14] = m1[2]*m2[12] + m1[6]*m2[13] + m1[10]*m2[14] + m1[14]*m2[15];
M[15] = m1[3]*m2[12] + m1[7]*m2[13] + m1[11]*m2[14] + m1[15]*m2[15];
#endif
#if defined ( USE_MATRIX_TEST )
definitelyIdentityMatrix=false;
#endif
return *this;
}
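One further (untested) idea: each result row could also be written with NEON multiply-accumulate intrinsics, folding the separate vmulq/vaddq steps together. A minimal sketch for the first row only, reusing x0..x3 and y0 from the function above (vmulq_n_f32 and vmlaq_n_f32 are standard arm_neon.h intrinsics):
// sketch only (untested): result row 0 via multiply-accumulate
float32x4_t r0 = vmulq_n_f32(x0, vgetq_lane_f32(y0, 0)); // x0 * y0[0]
r0 = vmlaq_n_f32(r0, x1, vgetq_lane_f32(y0, 1)); // += x1 * y0[1]
r0 = vmlaq_n_f32(r0, x2, vgetq_lane_f32(y0, 2)); // += x2 * y0[2]
r0 = vmlaq_n_f32(r0, x3, vgetq_lane_f32(y0, 3)); // += x3 * y0[3]
vst1q_f32(&M[0], r0);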
fast dsp multiply matrix 4x4 on ios and android
Re: fast dsp multiply matrix on ios and android
//! multiply by another matrix
template <class T>
inline CMatrix4<T> CMatrix4<T>::operator*(const CMatrix4<T>& m2) const
{
#if defined ( USE_MATRIX_TEST )
// Testing purpose..
if ( this->isIdentity() )
return m2;
if ( m2.isIdentity() )
return *this;
#endif
CMatrix4<T> m3 ( EM4CONST_NOTHING );
const T* m1 = M;
#if (TARGET_OS_IPHONE == 1)
// As above: vDSP_mmul is row-major, so the operands are passed as (m2.M, m1).
vDSP_mmul( m2.M, 1, m1, 1, m3.M, 1, 4, 4, 4 );
#elif defined(ANDROID_NDK)
// NEON path: only valid when T is a 32-bit float.
float32x4_t x0,x1,x2,x3; // the four 4-float rows of m1
float32x4_t y0,y1,y2,y3; // the four 4-float rows of m2
x0 = vld1q_f32(&m1[0]);
x1 = vld1q_f32(&m1[4]);
x2 = vld1q_f32(&m1[8]);
x3 = vld1q_f32(&m1[12]);
y0 = vld1q_f32(&m2.M[0]);
y1 = vld1q_f32(&m2.M[4]);
y2 = vld1q_f32(&m2.M[8]);
y3 = vld1q_f32(&m2.M[12]);
const float32x4_t A0 = vmulq_f32(x0, vdupq_n_f32(vgetq_lane_f32(y0,0)));
const float32x4_t B0 = vmulq_f32(x1, vdupq_n_f32(vgetq_lane_f32(y0,1)));
const float32x4_t C0 = vmulq_f32(x2, vdupq_n_f32(vgetq_lane_f32(y0,2)));
const float32x4_t D0 = vmulq_f32(x3, vdupq_n_f32(vgetq_lane_f32(y0,3)));
const float32x4_t _R0 = vaddq_f32( vaddq_f32(A0 , B0), vaddq_f32(C0, D0) );
const float32x4_t A1 = vmulq_f32(x0, vdupq_n_f32(vgetq_lane_f32(y1,0)));
const float32x4_t B1 = vmulq_f32(x1, vdupq_n_f32(vgetq_lane_f32(y1,1)));
const float32x4_t C1 = vmulq_f32(x2, vdupq_n_f32(vgetq_lane_f32(y1,2)));
const float32x4_t D1 = vmulq_f32(x3, vdupq_n_f32(vgetq_lane_f32(y1,3)));
const float32x4_t _R1 = vaddq_f32( vaddq_f32(A1 , B1), vaddq_f32(C1 , D1) );
const float32x4_t A2 = vmulq_f32(x0, vdupq_n_f32(vgetq_lane_f32(y2,0)));
const float32x4_t B2 = vmulq_f32(x1, vdupq_n_f32(vgetq_lane_f32(y2,1)));
const float32x4_t C2 = vmulq_f32(x2, vdupq_n_f32(vgetq_lane_f32(y2,2)));
const float32x4_t D2 = vmulq_f32(x3, vdupq_n_f32(vgetq_lane_f32(y2,3)));
const float32x4_t _R2 = vaddq_f32( vaddq_f32(A2 , B2), vaddq_f32(C2 , D2) );
const float32x4_t A3 = vmulq_f32(x0, vdupq_n_f32(vgetq_lane_f32(y3,0)));
const float32x4_t B3 = vmulq_f32(x1, vdupq_n_f32(vgetq_lane_f32(y3,1)));
const float32x4_t C3 = vmulq_f32(x2, vdupq_n_f32(vgetq_lane_f32(y3,2)));
const float32x4_t D3 = vmulq_f32(x3, vdupq_n_f32(vgetq_lane_f32(y3,3)));
const float32x4_t _R3 = vaddq_f32( vaddq_f32(A3 , B3), vaddq_f32(C3, D3) );
vst1q_f32(&m3.M[0], _R0);
vst1q_f32(&m3.M[4], _R1);
vst1q_f32(&m3.M[8], _R2);
vst1q_f32(&m3.M[12],_R3);
#else
m3[0] = m1[0]*m2[0] + m1[4]*m2[1] + m1[8]*m2[2] + m1[12]*m2[3];
m3[1] = m1[1]*m2[0] + m1[5]*m2[1] + m1[9]*m2[2] + m1[13]*m2[3];
m3[2] = m1[2]*m2[0] + m1[6]*m2[1] + m1[10]*m2[2] + m1[14]*m2[3];
m3[3] = m1[3]*m2[0] + m1[7]*m2[1] + m1[11]*m2[2] + m1[15]*m2[3];
m3[4] = m1[0]*m2[4] + m1[4]*m2[5] + m1[8]*m2[6] + m1[12]*m2[7];
m3[5] = m1[1]*m2[4] + m1[5]*m2[5] + m1[9]*m2[6] + m1[13]*m2[7];
m3[6] = m1[2]*m2[4] + m1[6]*m2[5] + m1[10]*m2[6] + m1[14]*m2[7];
m3[7] = m1[3]*m2[4] + m1[7]*m2[5] + m1[11]*m2[6] + m1[15]*m2[7];
m3[8] = m1[0]*m2[8] + m1[4]*m2[9] + m1[8]*m2[10] + m1[12]*m2[11];
m3[9] = m1[1]*m2[8] + m1[5]*m2[9] + m1[9]*m2[10] + m1[13]*m2[11];
m3[10] = m1[2]*m2[8] + m1[6]*m2[9] + m1[10]*m2[10] + m1[14]*m2[11];
m3[11] = m1[3]*m2[8] + m1[7]*m2[9] + m1[11]*m2[10] + m1[15]*m2[11];
m3[12] = m1[0]*m2[12] + m1[4]*m2[13] + m1[8]*m2[14] + m1[12]*m2[15];
m3[13] = m1[1]*m2[12] + m1[5]*m2[13] + m1[9]*m2[14] + m1[13]*m2[15];
m3[14] = m1[2]*m2[12] + m1[6]*m2[13] + m1[10]*m2[14] + m1[14]*m2[15];
m3[15] = m1[3]*m2[12] + m1[7]*m2[13] + m1[11]*m2[14] + m1[15]*m2[15];
#endif
return m3;
}
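For completeness, here is how the accelerated operator* would be used; setTranslation and setRotationDegrees are ordinary irr::core::matrix4 API, but the snippet itself is illustrative rather than part of the patch:
// illustrative usage: composition goes through the vDSP/NEON/scalar paths above
irr::core::matrix4 view, model;
view.setTranslation(irr::core::vector3df(0.f, 0.f, -10.f));
model.setRotationDegrees(irr::core::vector3df(0.f, 45.f, 0.f));
irr::core::matrix4 modelView = view * model;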
Re: fast dsp multiply matrix on ios and android
Android.mk
Add these three options to LOCAL_CFLAGS and LOCAL_CPPFLAGS:
-DHAVE_NEON=1
-mfpu=neon
-mfloat-abi=softfp
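For example, a minimal Android.mk fragment (the += placement is the usual NDK convention; the rest of the file is unchanged):
LOCAL_CFLAGS   += -DHAVE_NEON=1 -mfpu=neon -mfloat-abi=softfp
LOCAL_CPPFLAGS += -DHAVE_NEON=1 -mfpu=neon -mfloat-abi=softfp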
Re: fast dsp multiply matrix 4x4 on ios and android
I need to finish and release my SIMD vectors and matrices; then you could do the non-x86 code?
Re: fast dsp multiply matrix 4x4 on ios and android
ARM is non-x86~~~
look -->
#elif defined(ANDROID_NDK)
#include <arm_neon.h>
devsh wrote: I need to finish and release my SIMD vectors and matrices; then you could do the non-x86 code?
Re: fast dsp multiply matrix 4x4 on ios and android
Look at the comma: what does that imply about the structure of the sentence?
devsh wrote: I need to finish and release my SIMD vectors and matrices; then you could do the non-x86 code?