Here's the test code for matrix multiplication (it loads an identity matrix from a text file so the data can't be optimized away, and every computation depends on the result of the previous computation, so there's no room for the optimizer to remove code — though it can still optimize loops etc.):
Code: Select all
#undef NDEBUG
#include <assert.h>
#include <emmintrin.h>
#include <chrono>
#include <iostream>
#include <fstream>
// Prints the elapsed time between the time points t1 and t2.
// NOTE(review): relies on `t1` and `t2` already being in scope at the expansion
// site, and prints no separator after `name` — callers embed a trailing "\t"
// in the name string themselves.
#define STOP_TIMER(name) std::cout << "RUNTIME of " << name << \
std::chrono::duration_cast<std::chrono::microseconds>( \
t2-t1 \
).count() << " microseconds " << std::endl;
// High-resolution timer alias used by the benchmark loops below.
typedef std::chrono::high_resolution_clock Clock;
// A 4x4 matrix stored as a flat array of 16 floats (the multiply routines
// below index it column-major). Only naturally aligned (alignof == 4,
// asserted in main), so it is NOT suitable for aligned SSE loads.
struct Matrix{
float m[16];
};
// The same 4x4 matrix expressed as four __m128 vectors, which guarantees the
// 16-byte alignment required by _mm_load_ps/_mm_store_ps in mmul_sse
// (alignof == 16, asserted in main).
struct SMatrix{
__m128 m[4];
};
//naive matrix multiplication (the compiler is amazing to optimize this ugly code)
void mmul(const float *a, const float *b, float *r)
{
for (int i=0; i<16; i+=4)
for (int j=0; j<4; j++)
r[i+j] = b[i]*a[j] + b[i+1]*a[j+4] + b[i+2]*a[j+8] + b[i+3]*a[j+12];
}
//fastest SSE matrix mul I found on the web (there were other implementations, all were slower)
void mmul_sse(const float * a, const float * b, float * r)
{
volatile __m128 a_line, b_line, r_line;
for (int i=0; i<16; i+=4) {
a_line = _mm_load_ps(a);
b_line = _mm_set1_ps(b[i]);
r_line = _mm_mul_ps(a_line, b_line);
for (int j=1; j<4; j++) {
a_line = _mm_load_ps(&a[j*4]);
b_line = _mm_set1_ps(b[i+j]);
r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
}
_mm_store_ps(&r[i], r_line); // r[i] = r_line
}
}
/** Irrlicht's matrix multiplication */
void irr_mul(const float * m1, const float * m2, float * M){
M[0] = m1[0]*m2[0] + m1[4]*m2[1] + m1[8]*m2[2] + m1[12]*m2[3];
M[1] = m1[1]*m2[0] + m1[5]*m2[1] + m1[9]*m2[2] + m1[13]*m2[3];
M[2] = m1[2]*m2[0] + m1[6]*m2[1] + m1[10]*m2[2] + m1[14]*m2[3];
M[3] = m1[3]*m2[0] + m1[7]*m2[1] + m1[11]*m2[2] + m1[15]*m2[3];
M[4] = m1[0]*m2[4] + m1[4]*m2[5] + m1[8]*m2[6] + m1[12]*m2[7];
M[5] = m1[1]*m2[4] + m1[5]*m2[5] + m1[9]*m2[6] + m1[13]*m2[7];
M[6] = m1[2]*m2[4] + m1[6]*m2[5] + m1[10]*m2[6] + m1[14]*m2[7];
M[7] = m1[3]*m2[4] + m1[7]*m2[5] + m1[11]*m2[6] + m1[15]*m2[7];
M[8] = m1[0]*m2[8] + m1[4]*m2[9] + m1[8]*m2[10] + m1[12]*m2[11];
M[9] = m1[1]*m2[8] + m1[5]*m2[9] + m1[9]*m2[10] + m1[13]*m2[11];
M[10] = m1[2]*m2[8] + m1[6]*m2[9] + m1[10]*m2[10] + m1[14]*m2[11];
M[11] = m1[3]*m2[8] + m1[7]*m2[9] + m1[11]*m2[10] + m1[15]*m2[11];
M[12] = m1[0]*m2[12] + m1[4]*m2[13] + m1[8]*m2[14] + m1[12]*m2[15];
M[13] = m1[1]*m2[12] + m1[5]*m2[13] + m1[9]*m2[14] + m1[13]*m2[15];
M[14] = m1[2]*m2[12] + m1[6]*m2[13] + m1[10]*m2[14] + m1[14]*m2[15];
M[15] = m1[3]*m2[12] + m1[7]*m2[13] + m1[11]*m2[14] + m1[15]*m2[15];
}
// Prints a 16-float matrix to stdout as four rows of four values, preceded by
// a blank line (used in main to eyeball that the products are identities).
// FIX: removed the unused local `int i = 0;` and collapsed the four nearly
// identical statements into a loop; the output is byte-identical.
void printMatrix(float * a){
    std::cout << std::endl;
    for (int row = 0; row < 4; ++row){
        std::cout << a[4*row]   << " " << a[4*row+1] << " "
                  << a[4*row+2] << " " << a[4*row+3] << " " << std::endl;
    }
}
// Reads up to `maxCount` whitespace-separated floats from identityMatrix.txt
// into `a` (echoing each value to stdout). On open failure it prints a
// message and leaves `a` untouched.
// FIX: the original loop executed `a[i++] = b` for every value in the file
// with no bound, overflowing the 16-float destination whenever the file
// contains extra numbers. The new `maxCount` parameter defaults to 16, so
// existing callers are unaffected.
void loadMatrix(float * a, int maxCount = 16){
    std::ifstream myFile("identityMatrix.txt");
    if (!myFile.is_open()){
        std::cout << "failed to open file\n";
        return ;
    }
    int i = 0;
    float b;
    std::cout << "entering\n" << std::endl;
    while (i < maxCount && myFile >> b){
        a[i++] = b;
        std::cout << b << " ";
    }
    myFile.close();
}
// Benchmark driver: loads identities from disk (so nothing can be
// constant-folded), then times `times` iterations of each of the three
// multiply implementations. Each call feeds the next one, so the compiler
// cannot strip the work.
int main(){
    Matrix a,b,c;
    SMatrix x,y,z;
    int times = 10000000; //number of multiplication loops
    loadMatrix(a.m);
    loadMatrix(b.m);
    loadMatrix(c.m);
    loadMatrix(reinterpret_cast<float *>(x.m));
    loadMatrix(reinterpret_cast<float *>(y.m));
    loadMatrix(reinterpret_cast<float *>(z.m));
    auto t1 = Clock::now();
    auto t2 = Clock::now();
    // FIX: `alignof(expression)` is a GCC extension — standard C++ only
    // accepts a type-id — so the alignment sanity checks query the types.
    assert( alignof(Matrix) == 4 );   // plain float storage
    assert( alignof(SMatrix) == 16 ); // required by _mm_load_ps/_mm_store_ps
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        // output of each multiply is an input of the next (prevents stripping)
        mmul_sse( reinterpret_cast<float *>(x.m),
                  reinterpret_cast<float *>(y.m),
                  reinterpret_cast<float *>(z.m));
        mmul_sse( reinterpret_cast<float *>(z.m),
                  reinterpret_cast<float *>(y.m),
                  reinterpret_cast<float *>(x.m));
        mmul_sse( reinterpret_cast<float *>(x.m),
                  reinterpret_cast<float *>(z.m),
                  reinterpret_cast<float *>(y.m));
    }
    t2 = Clock::now();
    STOP_TIMER("SSE matrix mul \t");
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        mmul(a.m,b.m,c.m); //each time we generate input for successive operation (prevent code stripping)
        mmul(c.m,b.m,a.m);
        mmul(a.m,c.m,b.m);
    }
    t2 = Clock::now();
    STOP_TIMER("regularMatrix mul\t");
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        irr_mul(a.m,b.m,c.m);
        irr_mul(c.m,b.m,a.m);
        irr_mul(a.m,c.m,b.m);
    }
    t2 = Clock::now();
    STOP_TIMER("irrMatrix mul\t");
    //check if we get identities (if not, matrix multiplication is wrong)
    printMatrix(a.m);
    printMatrix(b.m);
    printMatrix(c.m);
    printMatrix(reinterpret_cast<float *>(x.m));
    printMatrix(reinterpret_cast<float *>(y.m));
    printMatrix(reinterpret_cast<float *>(z.m));
    return 0;
}
Here are the results on GCC 4.9.X (-msse2 always on):
-O optimization level
-O1 optimization level
RUNTIME of SSE matrix mul 2276000 microseconds
RUNTIME of regularMatrix mul 2252000 microseconds
RUNTIME of irrMatrix mul 2061000 microseconds
RUNTIME of SSE matrix mul 2277000 microseconds
RUNTIME of regularMatrix mul 2332000 microseconds
RUNTIME of irrMatrix mul 2078000 microseconds
-O2 optimization level
-O3 optimization level
RUNTIME of SSE matrix mul 2044000 microseconds
RUNTIME of regularMatrix mul 2252000 microseconds
RUNTIME of irrMatrix mul 2078000 microseconds
-O4 optimization level
RUNTIME of SSE matrix mul 1786000 microseconds
RUNTIME of regularMatrix mul 2154000 microseconds
RUNTIME of irrMatrix mul 2293000 microseconds
-Os optimization level
RUNTIME of SSE matrix mul 1795000 microseconds
RUNTIME of regularMatrix mul 2090000 microseconds
RUNTIME of irrMatrix mul 2064000 microseconds
As you can see, SSE gives a small improvement only at optimization levels that are too aggressive for a whole application, while at optimization levels that are useful for a whole application it tends to be slightly slower.
RUNTIME of SSE matrix mul 2134000 microseconds
RUNTIME of regularMatrix mul 2273000 microseconds
RUNTIME of irrMatrix mul 2099000 microseconds
There are 2 problems:
1) Compiler and CPU optimizations are amazing. If you try to implement a dot product in SSE, you will find it is faster in SSE1 (5 instructions) than in SSE4 (1 instruction), because the compiler can organize register dependencies better across many small instructions, reducing pipeline latencies, and the CPU in turn can pipeline simple instructions better.
2) Matrix multiplication is heavily based on the dot product, which is a memory-bandwidth-limited operation.
3) Matrix multiplication is usually the heaviest operation when doing mass math .. in any engine.
In general, using SIMD (even with the help of intrinsics) adds a lot of complexity; in most cases the gain is very small or nonexistent.
The only place where SIMD shines is for quaternions where I can get 7/8 x speed up when creating a rotation around an axis (3/4 speed up for creating 4 different quaternions at once, and 2x speed up because if you compute Cos yourself you know you can get Sin almost for free but you are sacrificing some precision here ).
The test is already too artificial (only 3 matrices => fits entirely in L1 cache and cache lines) in a real scenario we will be more memory limited than this test.
Anyway I'm curious to see results on more modern CPUs (mine is rather old). It would be nice to see also someone test with AVX (unluckily I don't have that instruction set)
IdentityMatrix.txt:
Code: Select all
1.0 0.0 0.0 0.0
0.0 1.0 0.0 0.0
0.0 0.0 1.0 0.0
0.0 0.0 0.0 1.0
SIMD instructions also consume more power (important for laptops)