About SIMD math (with profiling code)

Discuss about anything related to the Irrlicht Engine, or read announcements about any significant features or usage changes.
REDDemon
Developer
Posts: 1044
Joined: Tue Aug 31, 2010 8:06 pm
Location: Genova (Italy)

About SIMD math (with profiling code)

Post by REDDemon »

I was trying to think if integrating SIMD math in my thesis project (for matrix and vectors and few other stuff).

Here's the test code for matrix multiplication (it loads an identity matrix from a text file so it can't be optimized away, and every computation depends on the result of the previous one, so there's no room for the optimizer to remove code — though it can still optimize loops etc.):

Code: Select all

#undef NDEBUG
#include <assert.h>
#include <emmintrin.h>
#include <chrono>
#include <iostream>
#include <fstream>
 
 
// Prints the elapsed time between the two Clock::now() samples t1 and t2.
// NOTE(review): this macro silently relies on variables named t1 and t2
// existing at every expansion site (see main below).
#define STOP_TIMER(name)  std::cout << "RUNTIME of " << name << \
    std::chrono::duration_cast<std::chrono::microseconds>( \
            t2-t1 \
    ).count() << " microseconds " << std::endl;
 
 
// High-resolution clock used for all timing measurements below.
typedef std::chrono::high_resolution_clock Clock;
 
// Plain 4x4 float matrix (flat array). Alignment is only alignof(float),
// i.e. 4 bytes — NOT guaranteed to satisfy the 16-byte alignment that
// _mm_load_ps/_mm_store_ps require (the asserts in main check this).
struct Matrix{
    float m[16];
};
 
// SSE-friendly 4x4 matrix: the __m128 members force 16-byte alignment,
// so the rows are safe to use with aligned SSE loads/stores.
struct SMatrix{
    __m128 m[4];
};
 
//naive matrix multiplication (the compiler is amazing to optimize this ugly code)
// Naive scalar 4x4 matrix multiplication: element (row, col) of r is the
// dot product of row 'row' of b with column 'col' of a (both matrices are
// flat arrays of 16 floats). The compiler vectorizes this surprisingly well.
void mmul(const float *a, const float *b, float *r)
{
  for (int row = 0; row < 4; ++row)
  {
    const float *brow = &b[4 * row];
    for (int col = 0; col < 4; ++col)
    {
      // Accumulate in the same left-to-right order as the original
      // expression so the floating-point result is bit-identical.
      float acc = brow[0] * a[col];
      for (int k = 1; k < 4; ++k)
        acc += brow[k] * a[col + 4 * k];
      r[4 * row + col] = acc;
    }
  }
}
 
//fastest SSE matrix mul I found on the web (there were other implementations, all were slower)
// SSE 4x4 matrix multiplication.
//
// Preconditions: a, b and r must be 16-byte aligned (aligned load/store
// intrinsics are used) and r must not alias a — each output row is stored
// before the next rows of a are re-read.
//
// BUG FIX(review): the original declared the SSE registers as
// `volatile __m128`, which forces every intermediate value through memory
// and forbids the compiler from keeping them in XMM registers — exactly
// the pessimization that made this "SSE" version benchmark slower than
// scalar code. The registers are now ordinary locals.
void mmul_sse(const float * a, const float * b, float * r)
{
  for (int i = 0; i < 16; i += 4) {
    // row i of r = sum over j of b[i+j] * (row j of a)
    __m128 r_line = _mm_mul_ps(_mm_load_ps(a), _mm_set1_ps(b[i]));
    for (int j = 1; j < 4; j++) {
      __m128 a_line = _mm_load_ps(&a[j*4]);   // row j of a
      __m128 b_line = _mm_set1_ps(b[i+j]);    // broadcast b[i+j]
      r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
    }
    _mm_store_ps(&r[i], r_line);     // r[i] = r_line
  }
}
 
/** Irrlicht's matrix multiplication */
void irr_mul(const float * m1, const float * m2, float * M){
 
    M[0] = m1[0]*m2[0] + m1[4]*m2[1] + m1[8]*m2[2] + m1[12]*m2[3];
    M[1] = m1[1]*m2[0] + m1[5]*m2[1] + m1[9]*m2[2] + m1[13]*m2[3];
    M[2] = m1[2]*m2[0] + m1[6]*m2[1] + m1[10]*m2[2] + m1[14]*m2[3];
    M[3] = m1[3]*m2[0] + m1[7]*m2[1] + m1[11]*m2[2] + m1[15]*m2[3];
    M[4] = m1[0]*m2[4] + m1[4]*m2[5] + m1[8]*m2[6] + m1[12]*m2[7];
    M[5] = m1[1]*m2[4] + m1[5]*m2[5] + m1[9]*m2[6] + m1[13]*m2[7];
    M[6] = m1[2]*m2[4] + m1[6]*m2[5] + m1[10]*m2[6] + m1[14]*m2[7];
    M[7] = m1[3]*m2[4] + m1[7]*m2[5] + m1[11]*m2[6] + m1[15]*m2[7];
    M[8] = m1[0]*m2[8] + m1[4]*m2[9] + m1[8]*m2[10] + m1[12]*m2[11];
    M[9] = m1[1]*m2[8] + m1[5]*m2[9] + m1[9]*m2[10] + m1[13]*m2[11];
    M[10] = m1[2]*m2[8] + m1[6]*m2[9] + m1[10]*m2[10] + m1[14]*m2[11];
    M[11] = m1[3]*m2[8] + m1[7]*m2[9] + m1[11]*m2[10] + m1[15]*m2[11];
    M[12] = m1[0]*m2[12] + m1[4]*m2[13] + m1[8]*m2[14] + m1[12]*m2[15];
    M[13] = m1[1]*m2[12] + m1[5]*m2[13] + m1[9]*m2[14] + m1[13]*m2[15];
    M[14] = m1[2]*m2[12] + m1[6]*m2[13] + m1[10]*m2[14] + m1[14]*m2[15];
    M[15] = m1[3]*m2[12] + m1[7]*m2[13] + m1[11]*m2[14] + m1[15]*m2[15];
}
 
// Prints a 4x4 matrix (flat float[16]) to stdout: a leading blank line,
// then four rows of space-separated values, one row per line.
// FIX(review): removed the unused local `int i = 0;` and replaced the four
// hand-written statements with loops — the emitted output is identical.
void printMatrix(float * a){
    std::cout << std::endl;
    for (int row = 0; row < 4; ++row){
        for (int col = 0; col < 4; ++col)
            std::cout << a[4*row + col] << " ";
        std::cout << std::endl;
    }
}
 
// Fills a float[16] matrix from "identityMatrix.txt" (whitespace-separated
// floats), echoing each value to stdout. On open failure it prints a
// message and leaves the destination untouched.
// FIX(review): the read loop is now bounded to 16 values — the original
// `a[i++] = b` would write past the caller's float[16] if the file
// contained extra numbers.
void loadMatrix(float * a){
 
    std::ifstream myFile("identityMatrix.txt");  
 
    if (!myFile.is_open()){  
        std::cout << "failed to open file\n";
        return ;
    }
 
    int i=0;
    float b;
    std::cout<<"entering\n"<<std::endl;
    while(i < 16 && myFile >> b){
        a[i++]=b;
        std::cout<<b<<" ";
    }
 
    myFile.close();
}
 
int main(){
    Matrix a,b,c;
    SMatrix x,y,z;
    int times = 10000000; //number of multiplication loops
 
    loadMatrix(a.m);
    loadMatrix(b.m);
    loadMatrix(c.m);
 
    loadMatrix(reinterpret_cast<float *>(x.m));
    loadMatrix(reinterpret_cast<float *>(y.m));
    loadMatrix(reinterpret_cast<float *>(z.m));
 
 
    auto t1 = Clock::now();
    auto t2 = Clock::now();
 
    assert( alignof(a)==alignof(b));
    assert( alignof(a)== 4);
    assert( alignof(x)==alignof(y));
    assert( alignof(x)== 16);
 
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        mmul_sse(   reinterpret_cast<float *>(x.m),
                    reinterpret_cast<float *>(y.m),
                    reinterpret_cast<float *>(z.m));
        mmul_sse(   reinterpret_cast<float *>(z.m),
                    reinterpret_cast<float *>(y.m),
                    reinterpret_cast<float *>(x.m));
        mmul_sse(   reinterpret_cast<float *>(x.m),
                    reinterpret_cast<float *>(z.m),
                    reinterpret_cast<float *>(y.m));
 
    }
    t2 = Clock::now();
 
    STOP_TIMER("SSE matrix mul   \t");
 
 
 
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        mmul(a.m,b.m,c.m); //each time we generate input for successive operation (prevent code stripping)
        mmul(c.m,b.m,a.m);
        mmul(a.m,c.m,b.m);
    }
    t2 = Clock::now();
 
 
    STOP_TIMER("regularMatrix mul\t");
 
    t1 = Clock::now();
    for(int i=0; i<times; i++){
        irr_mul(a.m,b.m,c.m);
        irr_mul(c.m,b.m,a.m);
        irr_mul(a.m,c.m,b.m);
    }
    t2 = Clock::now();
 
 
    STOP_TIMER("irrMatrix mul\t");
 
   //check if we get identities (if not, matrix multiplication is wrong)
    printMatrix(a.m);
    printMatrix(b.m);
    printMatrix(c.m);
    printMatrix(reinterpret_cast<float *>(x.m));
    printMatrix(reinterpret_cast<float *>(y.m));
    printMatrix(reinterpret_cast<float *>(z.m));
 
    return 0;
}

Here are the results on GCC 4.9.X (-msse2 always on):

-O optimization level
RUNTIME of SSE matrix mul 2276000 microseconds
RUNTIME of regularMatrix mul 2252000 microseconds
RUNTIME of irrMatrix mul 2061000 microseconds
-O1 optimization level
RUNTIME of SSE matrix mul 2277000 microseconds
RUNTIME of regularMatrix mul 2332000 microseconds
RUNTIME of irrMatrix mul 2078000 microseconds

-O2 optimization level
RUNTIME of SSE matrix mul 2044000 microseconds
RUNTIME of regularMatrix mul 2252000 microseconds
RUNTIME of irrMatrix mul 2078000 microseconds
-O3 optimization level
RUNTIME of SSE matrix mul 1786000 microseconds
RUNTIME of regularMatrix mul 2154000 microseconds
RUNTIME of irrMatrix mul 2293000 microseconds
-O4 optimization level
RUNTIME of SSE matrix mul 1795000 microseconds
RUNTIME of regularMatrix mul 2090000 microseconds
RUNTIME of irrMatrix mul 2064000 microseconds
-Os optimization level
RUNTIME of SSE matrix mul 2134000 microseconds
RUNTIME of regularMatrix mul 2273000 microseconds
RUNTIME of irrMatrix mul 2099000 microseconds
As you see, SSE gives a little improvement only at optimization levels that are too aggressive for a whole application, while at optimization levels that are useful for a whole application it tends to be slightly slower.

There are 3 problems:
1) Compiler and CPU optimizations are amazing. If you try to implement a dot product in SSE you'll find it is faster in SSE1 (5 instructions) than in SSE4 (1 instruction), because the compiler can organize register dependencies better across many small instructions, reducing pipeline latencies, and the CPU in turn can pipeline simple instructions better.

2) Matrix multiplication is heavily based on the dot product, which is a memory-bandwidth-limited operation.

3) Matrix multiplication is usually the heaviest operation when doing mass math in any engine.

In general using SIMD (even with the help of intrinsics) adds a lot of complexity. in most cases the gain is very little or null.
The only place where SIMD shines is for quaternions where I can get 7/8 x speed up when creating a rotation around an axis (3/4 speed up for creating 4 different quaternions at once, and 2x speed up because if you compute Cos yourself you know you can get Sin almost for free but you are sacrificing some precision here ).

The test is already too artificial (only 3 matrices => fits entirely in L1 cache and cache lines) in a real scenario we will be more memory limited than this test.

Anyway I'm curious to see results on more modern CPUs (mine is rather old). It would be nice to see also someone test with AVX (unluckily I don't have that instruction set)

IdentityMatrix.txt:

Code: Select all

 
1.0 0.0 0.0 0.0
0.0 1.0 0.0 0.0
0.0 0.0 1.0 0.0
0.0 0.0 0.0 1.0
 
edit:
SIMD instructions also consume more power (important for laptops)
Junior Irrlicht Developer.
Real value in social networks is not about "increasing" number of followers, but about getting in touch with Amazing people.
- by Me
hendu
Posts: 2600
Joined: Sat Dec 18, 2010 12:53 pm

Re: About SIMD math (with profiling code)

Post by hendu »

As you see, SSE gives little improvement only with optimization levels that are too aggressive for a whole application optimization level, while for optimization level that are usefull for whole application it tends to be slightly slower.
I disagree, there is no such thing as "too aggressive optimization". I build everything with the highest possible, if something breaks it's 99% a fault in your code.
thanhle
Posts: 325
Joined: Wed Jun 12, 2013 8:09 am

Re: About SIMD math (with profiling code)

Post by thanhle »

Is SIMD better than everything else out there?
I think it's better to make everything work first; optimization comes later. Otherwise you will never accomplish what you want to do.

Regards
thanh
hendu
Posts: 2600
Joined: Sat Dec 18, 2010 12:53 pm

Re: About SIMD math (with profiling code)

Post by hendu »

-Os

RUNTIME of SSE matrix mul 1931963 microseconds
RUNTIME of regularMatrix mul 1616349 microseconds
RUNTIME of irrMatrix mul 773213 microseconds

-O3

RUNTIME of SSE matrix mul 1903855 microseconds
RUNTIME of regularMatrix mul 804209 microseconds
RUNTIME of irrMatrix mul 764512 microseconds

-O3 with properly annotated code (marked pointers restrict const)

RUNTIME of SSE matrix mul 1909559 microseconds
RUNTIME of regularMatrix mul 606689 microseconds
RUNTIME of irrMatrix mul 813468 microseconds


Conclusion: compiler > manual SSE code ;)
REDDemon
Developer
Posts: 1044
Joined: Tue Aug 31, 2010 8:06 pm
Location: Genova (Italy)

Re: About SIMD math (with profiling code)

Post by REDDemon »

hendu wrote:
As you see, SSE gives little improvement only with optimization levels that are too aggressive for a whole application optimization level, while for optimization level that are usefull for whole application it tends to be slightly slower.
I disagree, there is no such thing as "too aggressive optimization". I build everything with the highest possible, if something breaks it's 99% a fault in your code.
I think you are referring to something different. GCC use massive optimizations under O3 abusing(well in reality who rely on undefined behaviour is the real abuser :D ) what is called "undefined behaviour" (like using a un-initialized variable), if you rely on undefined behaviour and compile with O3 you are going to break your code and I agree with that.

Using O3 not always produce faster code than O2 (especially because sometimes O3 unroll "too much" and bloat the code), but increase compile time for sure.

What I wanted to show is that messing with SSE manually is just lost time because CPUs and Compilers will do a better job.
Junior Irrlicht Developer.
Real value in social networks is not about "increasing" number of followers, but about getting in touch with Amazing people.
- by Me
hendu
Posts: 2600
Joined: Sat Dec 18, 2010 12:53 pm

Re: About SIMD math (with profiling code)

Post by hendu »

If O3 produces measurably worse code, you report that as a compiler bug. ;)
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: About SIMD math (with profiling code)

Post by devsh »

I can do vector transformation in 7 instructions with my SIMD matrix SSE3 class (4 muls, 3 hadds, 1 assign), as opposed to irrlicht's 32 (4x 7 for dot plus 1 for assign)

For matrix matrix mul I can get it in 8 (for transpose) + 4*4 instructions (4x mul and 3 hadds) + 4 assigns which gives us 28 ops for matrix multiplication as opposed to 64 muls, 48 adds, and 16 assignments
REDDemon
Developer
Posts: 1044
Joined: Tue Aug 31, 2010 8:06 pm
Location: Genova (Italy)

Re: About SIMD math (with profiling code)

Post by REDDemon »

hendu wrote:If O3 produces measurably worse code, you report that as a compiler bug. ;)
:roll:

...

devsh you beat me :) my matrix multiplication was 37 instructions. but timings can you beat your compiler in vectorizing irrlicht's matrix multiplication ? :)
Junior Irrlicht Developer.
Real value in social networks is not about "increasing" number of followers, but about getting in touch with Amazing people.
- by Me
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: About SIMD math (with profiling code)

Post by devsh »

yes I can, but now I'm working on heavily optimizing the matrix inverse op... because irrlicht never checks if its a sane transformation (with 0,0,0,1 in the last row) which greatly reduces the equation for the determinant and inverse (by a factor of 4, since each element is a sum of 3 multiplications of scalar and a term in brackets which is the difference of two products).

Plus the determinant of the matrix is the dot product of the Nth row of the original matrix and the Nth column of the inverse scaled up by the determinant.
Whoever wrote the normal non-SSE matrix4 class didn't notice that, and unless it is optimized by a clever compiler, there is redundancy in calculating the inverse matrix elements and the determinant.

And choice of storing the matrix elements column first or row first has great impact of speed of matrix vector multiplication, especially with SSE3 where you have the _mm_hadd_ps instruction it is better to store matrices by rows instead of by columns as irrlicht currently does (which would force you to shuffle to get a vector of just X component to use for the column addition version of matrix vector multiplication, hence adding 4 instructions)

This is funking hilarious since irrlicht calls its matrix class row-major XD
hendu
Posts: 2600
Joined: Sat Dec 18, 2010 12:53 pm

Re: About SIMD math (with profiling code)

Post by hendu »

0,0,0,1 in the last row would mean no translation, which is essential for the usual camera matrix? Or did you mean from the other direction, the w component.
CuteAlien
Admin
Posts: 9734
Joined: Mon Mar 06, 2006 2:25 pm
Location: Tübingen, Germany
Contact:

Re: About SIMD math (with profiling code)

Post by CuteAlien »

@devsh: Matrix is really row-major order. But I also got confused at first when I just checked. Because the operators taking row and column are like (row,col) instead of (col, row) as one would rather expect (who starts with y before x?). Unfortunately nothing I can change now without breaking user-code in unexpected ways (thought I can bet this also causes it's share of bugs...).

edit: Argh - I get it now. This was meant not as at(x,y) but as replacement for [y][x]. That's why it has this order of parameters probably.
IRC: #irrlicht on irc.libera.chat
Code snippet repository: https://github.com/mzeilfelder/irr-playground-micha
Free racer made with Irrlicht: http://www.irrgheist.com/hcraftsource.htm
mongoose7
Posts: 1227
Joined: Wed Apr 06, 2011 12:13 pm

Re: About SIMD math (with profiling code)

Post by mongoose7 »

In order to pass matrices to the GPU, they are stored as the transpose. Everything works fine except that multiplication must reverse the order of the matrices (AT*BT = (BA)T).
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: About SIMD math (with profiling code)

Post by devsh »

0,0,0,1 in the last row would mean no translation, which is essential for the usual camera matrix? Or did you mean from the other direction, the w component.
Okay lets clear one thing up once for all... it doesn't matter how the matrix is stored in memory, THE FOLLOWING HOLDS!!!

ALWAYS matrix vec multiplication is equivalent to these two:

Code: Select all

M*v = Sum over i {v.getComponent(i)*M.getColumn(i)}
(M*v).getComponent(i) = v.dotProduct(M.getRow(i))
So if GPU, etc uses a 4 column vector with .w compoent set to 1 to emulate translation.... the last ROW must be (0,0,0,1) and the last COLUMN must be (transXYZ,1)

END OF ABSOLUTE MATH TRUTH
In order to pass matrices to the GPU, they are stored as the transpose. Everything works fine except that multiplication must reverse the order of the matrices (AT*BT = (BA)T).
Nah OpenGL has functions for passing matrices both ways (so you can store your matrices however you want)
@devsh: Matrix is really row-major order. But I also got confused at first when I just checked. Because the operators taking row and column are like (row,col) instead of (col, row) as one would rather expect (who starts with y before x?). Unfortunately nothing I can change now without breaking user-code in unexpected ways (thought I can bet this also causes it's share of bugs...).

edit: Argh - I get it now. This was meant not as at(x,y) but as replacement for [y][x]. That's why it has this order of parameters probably.
Why do you think I wrote a new class??? Obviously I don't want to break my own code using matrix4!!!

The (a,b) operator is correctly implemented, so a is the row, b is the column. This is the correct mathematical way, and its the same in my SIMD class.
HOWEVER the order of floats in the matrix, M[0],M[1],M[2],M[3] are not the first row, they are the first column!
CuteAlien
Admin
Posts: 9734
Joined: Mon Mar 06, 2006 2:25 pm
Location: Tübingen, Germany
Contact:

Re: About SIMD math (with profiling code)

Post by CuteAlien »

devsh wrote:HOWEVER the order of floats in the matrix, M[0],M[1],M[2],M[3] are not the first row, they are the first column!
No they are not. They are the first row.
IRC: #irrlicht on irc.libera.chat
Code snippet repository: https://github.com/mzeilfelder/irr-playground-micha
Free racer made with Irrlicht: http://www.irrgheist.com/hcraftsource.htm
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: About SIMD math (with profiling code)

Post by devsh »

Code: Select all

    // Returns the three consecutive floats M[12..14] as the translation
    // vector. NOTE(review): whether these consecutive memory slots form a
    // row or a column of the 4x4 matrix depends entirely on the storage
    // convention (row-major vs column-major) — the point under dispute in
    // this thread; confirm against the full CMatrix4 definition.
    template <class T>
    inline vector3d<T> CMatrix4<T>::getTranslation() const
    {
        return vector3d<T>(M[12], M[13], M[14]);
    }
-----------BEGIN ABSOLUTE TRUTH-------------

Translation in an abstract mathematical matrix used to manipulate a vector (positionXYZ,1) is in the LAST COLUMN!!!

-----------------------------------------------------------------

Above we can see CONSECUTIVE MEMORY POSITIONS RETURNED AS A TRANSLATION VECTOR

HENCE M[12],M[13],M[14],M[15] is a COLUMN not a ROW
Post Reply