Here's the test code for matrix multiplication (it loads an identity matrix from a text file so the data can't be optimized away, and every computation depends on the result of the previous computation, so there's no room for the optimizer to remove code — though it can still optimize loops etc.):
Code: Select all
#undef NDEBUG
#include <assert.h>
#include <emmintrin.h>
#include <chrono>
#include <iostream>
#include <fstream>
// Prints the elapsed time between the time points t1 and t2.
// NOTE(review): relies on `t1` and `t2` already being in scope at the expansion
// site, and prints no separator after `name` — callers embed a trailing "\t"
// in the name string themselves.
#define STOP_TIMER(name) std::cout << "RUNTIME of " << name << \
std::chrono::duration_cast<std::chrono::microseconds>( \
t2-t1 \
).count() << " microseconds " << std::endl;
// High-resolution timer alias used by the benchmark loops below.
typedef std::chrono::high_resolution_clock Clock;
// A 4x4 matrix stored as a flat array of 16 floats (the multiply routines
// below index it column-major). Only naturally aligned (alignof == 4,
// asserted in main), so it is NOT suitable for aligned SSE loads.
struct Matrix{
float m[16];
};
// The same 4x4 matrix expressed as four __m128 vectors, which guarantees the
// 16-byte alignment required by _mm_load_ps/_mm_store_ps in mmul_sse
// (alignof == 16, asserted in main).
struct SMatrix{
__m128 m[4];
};
//naive matrix multiplication (the compiler is amazing to optimize this ugly code)
void mmul(const float *a, const float *b, float *r)
{
for (int i=0; i<16; i+=4)
for (int j=0; j<4; j++)
r[i+j] = b[i]*a[j] + b[i+1]*a[j+4] + b[i+2]*a[j+8] + b[i+3]*a[j+12];
}
//fastest SSE matrix mul I found on the web (there were other implementations, all were slower)
void mmul_sse(const float * a, const float * b, float * r)
{
volatile __m128 a_line, b_line, r_line;
for (int i=0; i<16; i+=4) {
a_line = _mm_load_ps(a);
b_line = _mm_set1_ps(b[i]);
r_line = _mm_mul_ps(a_line, b_line);
for (int j=1; j<4; j++) {
a_line = _mm_load_ps(&a[j*4]);
b_line = _mm_set1_ps(b[i+j]);
r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
}
_mm_store_ps(&r[i], r_line); // r[i] = r_line
}
}
/** Irrlicht's matrix multiplication */
void irr_mul(const float * m1, const float * m2, float * M){
M[0] = m1[0]*m2[0] + m1[4]*m2[1] + m1[8]*m2[2] + m1[12]*m2[3];
M[1] = m1[1]*m2[0] + m1[5]*m2[1] + m1[9]*m2[2] + m1[13]*m2[3];
M[2] = m1[2]*m2[0] + m1[6]*m2[1] + m1[10]*m2[2] + m1[14]*m2[3];
M[3] = m1[3]*m2[0] + m1[7]*m2[1] + m1[11]*m2[2] + m1[15]*m2[3];
M[4] = m1[0]*m2[4] + m1[4]*m2[5] + m1[8]*m2[6] + m1[12]*m2[7];
M[5] = m1[1]*m2[4] + m1[5]*m2[5] + m1[9]*m2[6] + m1[13]*m2[7];
M[6] = m1[2]*m2[4] + m1[6]*m2[5] + m1[10]*m2[6] + m1[14]*m2[7];
M[7] = m1[3]*m2[4] + m1[7]*m2[5] + m1[11]*m2[6] + m1[15]*m2[7];
M[8] = m1[0]*m2[8] + m1[4]*m2[9] + m1[8]*m2[10] + m1[12]*m2[11];
M[9] = m1[1]*m2[8] + m1[5]*m2[9] + m1[9]*m2[10] + m1[13]*m2[11];
M[10] = m1[2]*m2[8] + m1[6]*m2[9] + m1[10]*m2[10] + m1[14]*m2[11];
M[11] = m1[3]*m2[8] + m1[7]*m2[9] + m1[11]*m2[10] + m1[15]*m2[11];
M[12] = m1[0]*m2[12] + m1[4]*m2[13] + m1[8]*m2[14] + m1[12]*m2[15];
M[13] = m1[1]*m2[12] + m1[5]*m2[13] + m1[9]*m2[14] + m1[13]*m2[15];
M[14] = m1[2]*m2[12] + m1[6]*m2[13] + m1[10]*m2[14] + m1[14]*m2[15];
M[15] = m1[3]*m2[12] + m1[7]*m2[13] + m1[11]*m2[14] + m1[15]*m2[15];
}
// Prints a 16-float matrix to stdout as four rows of four values, preceded by
// a blank line (used in main to eyeball that the products are identities).
// FIX: removed the unused local `int i = 0;` and collapsed the four nearly
// identical statements into a loop; the output is byte-identical.
void printMatrix(float * a){
    std::cout << std::endl;
    for (int row = 0; row < 4; ++row){
        std::cout << a[4*row]   << " " << a[4*row+1] << " "
                  << a[4*row+2] << " " << a[4*row+3] << " " << std::endl;
    }
}
// Reads up to `maxCount` whitespace-separated floats from identityMatrix.txt
// into `a` (echoing each value to stdout). On open failure it prints a
// message and leaves `a` untouched.
// FIX: the original loop executed `a[i++] = b` for every value in the file
// with no bound, overflowing the 16-float destination whenever the file
// contains extra numbers. The new `maxCount` parameter defaults to 16, so
// existing callers are unaffected.
void loadMatrix(float * a, int maxCount = 16){
    std::ifstream myFile("identityMatrix.txt");
    if (!myFile.is_open()){
        std::cout << "failed to open file\n";
        return ;
    }
    int i = 0;
    float b;
    std::cout << "entering\n" << std::endl;
    while (i < maxCount && myFile >> b){
        a[i++] = b;
        std::cout << b << " ";
    }
    myFile.close();
}
// Benchmark driver: loads identities from disk (so nothing can be
// constant-folded), then times `times` iterations of each of the three
// multiply implementations. Each call feeds the next one, so the compiler
// cannot strip the work.
int main(){
    Matrix a,b,c;
    SMatrix x,y,z;
    int times = 10000000; //number of multiplication loops
    loadMatrix(a.m);
    loadMatrix(b.m);
    loadMatrix(c.m);
    loadMatrix(reinterpret_cast<float *>(x.m));
    loadMatrix(reinterpret_cast<float *>(y.m));
    loadMatrix(reinterpret_cast<float *>(z.m));
    auto t1 = Clock::now();
    auto t2 = Clock::now();
    // FIX: `alignof(expression)` is a GCC extension — standard C++ only
    // accepts a type-id — so the alignment sanity checks query the types.
    assert( alignof(Matrix) == 4 );   // plain float storage
    assert( alignof(SMatrix) == 16 ); // required by _mm_load_ps/_mm_store_ps
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        // output of each multiply is an input of the next (prevents stripping)
        mmul_sse( reinterpret_cast<float *>(x.m),
                  reinterpret_cast<float *>(y.m),
                  reinterpret_cast<float *>(z.m));
        mmul_sse( reinterpret_cast<float *>(z.m),
                  reinterpret_cast<float *>(y.m),
                  reinterpret_cast<float *>(x.m));
        mmul_sse( reinterpret_cast<float *>(x.m),
                  reinterpret_cast<float *>(z.m),
                  reinterpret_cast<float *>(y.m));
    }
    t2 = Clock::now();
    STOP_TIMER("SSE matrix mul \t");
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        mmul(a.m,b.m,c.m); //each time we generate input for successive operation (prevent code stripping)
        mmul(c.m,b.m,a.m);
        mmul(a.m,c.m,b.m);
    }
    t2 = Clock::now();
    STOP_TIMER("regularMatrix mul\t");
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        irr_mul(a.m,b.m,c.m);
        irr_mul(c.m,b.m,a.m);
        irr_mul(a.m,c.m,b.m);
    }
    t2 = Clock::now();
    STOP_TIMER("irrMatrix mul\t");
    //check if we get identities (if not, matrix multiplication is wrong)
    printMatrix(a.m);
    printMatrix(b.m);
    printMatrix(c.m);
    printMatrix(reinterpret_cast<float *>(x.m));
    printMatrix(reinterpret_cast<float *>(y.m));
    printMatrix(reinterpret_cast<float *>(z.m));
    return 0;
}
Here are the results on GCC 4.9.X (-msse2 always on):
-O optimization level
-O1 optimization level
RUNTIME of SSE matrix mul 2276000 microseconds
RUNTIME of regularMatrix mul 2252000 microseconds
RUNTIME of irrMatrix mul 2061000 microseconds
RUNTIME of SSE matrix mul 2277000 microseconds
RUNTIME of regularMatrix mul 2332000 microseconds
RUNTIME of irrMatrix mul 2078000 microseconds
-O2 optimization level
-O3 optimization level
RUNTIME of SSE matrix mul 2044000 microseconds
RUNTIME of regularMatrix mul 2252000 microseconds
RUNTIME of irrMatrix mul 2078000 microseconds
-O4 optimization level
RUNTIME of SSE matrix mul 1786000 microseconds
RUNTIME of regularMatrix mul 2154000 microseconds
RUNTIME of irrMatrix mul 2293000 microseconds
-Os optimization level
RUNTIME of SSE matrix mul 1795000 microseconds
RUNTIME of regularMatrix mul 2090000 microseconds
RUNTIME of irrMatrix mul 2064000 microseconds
As you can see, SSE gives a small improvement only at optimization levels that are too aggressive for a whole application, while at optimization levels that are useful for a whole application it tends to be slightly slower.
RUNTIME of SSE matrix mul 2134000 microseconds
RUNTIME of regularMatrix mul 2273000 microseconds
RUNTIME of irrMatrix mul 2099000 microseconds
There are 2 problems:
1) Compiler and CPU optimizations are amazing. If you try to implement a dot product in SSE, you will find it is faster in SSE1 (5 instructions) than in SSE4 (1 instruction), because the compiler can organize register dependencies better across many small instructions, reducing pipeline latencies, and the CPU in turn can pipeline simple instructions better.
2) Matrix multiplication is heavily based on the dot product, which is a memory-bandwidth-limited operation.
3) Matrix multiplication is usually the heaviest operation when doing mass math .. in any engine.
In general, using SIMD (even with the help of intrinsics) adds a lot of complexity; in most cases the gain is very small or nonexistent.
The only place where SIMD shines is for quaternions where I can get 7/8 x speed up when creating a rotation around an axis (3/4 speed up for creating 4 different quaternions at once, and 2x speed up because if you compute Cos yourself you know you can get Sin almost for free but you are sacrificing some precision here ).
The test is already too artificial (only 3 matrices => fits entirely in L1 cache and cache lines) in a real scenario we will be more memory limited than this test.
Anyway I'm curious to see results on more modern CPUs (mine is rather old). It would be nice to see also someone test with AVX (unluckily I don't have that instruction set)
IdentityMatrix.txt:
Code: Select all
1.0 0.0 0.0 0.0
0.0 1.0 0.0 0.0
0.0 0.0 1.0 0.0
0.0 0.0 0.0 1.0
SIMD instructions also consume more power (important for laptops)