[HLSL]A Complete Shader Instancing Example

Post those lines of code you feel like sharing or find what you require for your project here; or simply use them as tutorials.
vinjn
Posts: 27
Joined: Tue May 20, 2008 4:45 am

[HLSL]A Complete Shader Instancing Example

Post by vinjn »

The example's idea is from the following post.
Thanks slavik262.
http://irrlicht.sourceforge.net/phpBB2/ ... 03984e737e

Question:
The clipping issue is not solved in this example; do you have any suggestions?

The standalone source file.

Code: Select all

 
#include <irrlicht.h>
 
#pragma comment(lib,"irrlicht.lib")
 
#pragma warning( disable: 4244 )
 
using namespace irr;
 
using namespace core;
using namespace scene;
using namespace video;
using namespace io;
using namespace gui;
 
// Engine handles shared by the whole file. All start out null and are
// assigned by initIrrlicht() once the device has been created.
IrrlichtDevice* device = 0;
scene::ISceneManager* smgr = 0;
video::IVideoDriver* driver = 0;
gui::IGUIEnvironment* env = 0;
// Assigned in initIrrlicht() but not referenced anywhere else in this listing.
scene::ISceneCollisionManager* coll = 0;
scene::IMeshManipulator* mm = 0;
video::IGPUProgrammingServices* gpu = 0;
 
// Creates the device and fills in the handles above; defined after main().
bool initIrrlicht(s32 w, s32 h, bool opengl = true, s32 bpp=32, bool fullscreen=false, bool stentil = false);
 
// Number of mesh copies drawn per draw call. The shader declares an array of
// the same size (its own NUM_BATCH_INSTANCES define), so the two must agree.
const int NUM_BATCH_INSTANCES = 60; 
 
// World matrices of the batch currently being drawn; filled in main() and
// uploaded to the vertex shader by InstancingShaderCB.
core::matrix4 instanceWorldArray[NUM_BATCH_INSTANCES];
// One empty scene node per instance; only its transformation is used.
core::array<ISceneNode*> totalNodeArray;
// Projection * view of the current frame, computed in main() and uploaded
// to the vertex shader by InstancingShaderCB.
core::matrix4 viewProjection;
 
struct InstancingShaderCB : public video::IShaderConstantSetCallBack
{
        void OnSetConstants(video::IMaterialRendererServices* services,
                s32 userData)
        {
                services->setVertexShaderConstant("instanceWorldArray", (f32*)instanceWorldArray, 16*NUM_BATCH_INSTANCES);
                services->setVertexShaderConstant("viewProjection", viewProjection.pointer(), 16);      
        }
};
 
s32 addHighLevelShader(const c8* shader_file, const c8* vs_entry, const c8* ps_entry,
                                           IShaderConstantSetCallBack* callback, 
                                           E_MATERIAL_TYPE baseMateria = video::EMT_SOLID)
{
 
        const c8* ps_file = ps_entry ? shader_file : NULL;
        return  gpu->addHighLevelShaderMaterialFromFiles(
                shader_file, vs_entry, video::EVST_VS_2_0,
                ps_file, ps_entry, video::EPST_PS_2_0, 
                callback, baseMateria);
}
 
int main()
{
        if (!initIrrlicht(800,800,false))
                return -1;
 
        device->setWindowCaption(L"instancing");
 
        //mesh generation
        IMesh* aMesh = smgr->getGeometryCreator()->createSphereMesh(4,4,4);     
        IMesh* bMesh = mm->createMeshWith2TCoords(aMesh);
        IMeshBuffer* bBuffer = bMesh->getMeshBuffer(0);
 
        //create dupBuffer with bBuffer repeated NUM_BATCH_INSTANCES times
        SMeshBufferLightMap dupBuffer;
        for (int k=0;k<NUM_BATCH_INSTANCES;k++)
        {
                S3DVertex2TCoords* verts = (S3DVertex2TCoords*)bBuffer->getVertices();
                for (u32 i=0; i<bBuffer->getVertexCount(); ++i)
                {
                        verts[i].TCoords2.X = k;//assign the index of instance that each vertex belongs to
                }
                dupBuffer.append(bBuffer);
        }
 
        aMesh->drop();
        bMesh->drop();
 
        //save transformation in one EmptySceneNode which doesn't render itself
        f32 scale = 18;
        for (u32 i=0;i<10;i++)
                for (u32 j=0;j<10;j++)
                        for (u32 k=0;k<10;k++)
        {
                ISceneNode* empty = smgr->addEmptySceneNode();
                empty->setPosition(vector3df(i*scale,j*scale,k*scale));
                empty->setScale(vector3df(1+rand()%2));
                empty->setRotation(vector3df(rand()%255,rand()%255,rand()%255));
                totalNodeArray.push_back(empty);
        }
 
        smgr->addCameraSceneNodeFPS();
 
        SMaterial mtrl;
        //shader
        s32 mtrlShader = addHighLevelShader("../../media/instancing.hlsl", "vs_main", NULL,
                new InstancingShaderCB);
        mtrl.Lighting = false;
        mtrl.setTexture(0, driver->getTexture("../../media/fire.bmp"));
        mtrl.MaterialType = (video::E_MATERIAL_TYPE)mtrlShader;
 
        while(device->run())
        {
                if (device->isWindowActive())
                {
                        driver->beginScene(true, true, SColor(255,122,122,122));
 
                        viewProjection = driver->getTransform(video::ETS_PROJECTION);
                        viewProjection *= driver->getTransform(video::ETS_VIEW);
                        
                        smgr->drawAll();                        
                        {
                                driver->setMaterial(mtrl);
                                int nRemainingBoxes = totalNodeArray.size();
                                int node_idx = 0;
                                while( nRemainingBoxes > 0)
                                {
                                        int nRenderBoxes = core::min_( nRemainingBoxes, NUM_BATCH_INSTANCES );
 
                                        nRemainingBoxes -= nRenderBoxes;
                                        for (int i=0;i<nRenderBoxes;i++)
                                        {
                                                instanceWorldArray[i] = totalNodeArray[node_idx++]->getAbsoluteTransformation();
                                        }
                                        driver->drawMeshBuffer(&dupBuffer);
                                }                               
                        }
                        env->drawAll();
 
                        driver->endScene();
 
                        u32 fps = driver->getFPS();
                        u32 nPolygons = driver->getPrimitiveCountDrawn();
                        wchar_t info[256];
                        swprintf(info, L"fps: %d, poly: %d", fps, nPolygons);
                        device->setWindowCaption(info);
                }
        }
        
        device->drop();
 
        return 0;
}
 
 
bool initIrrlicht(s32 w, s32 h, bool opengl, s32 bpp, bool fullscreen, bool stentil)
{
        SIrrlichtCreationParameters param;
        param.WindowSize = core::dimension2d<u32>(w, h);
        param.AntiAlias = true;
        param.Fullscreen = fullscreen;
        param.Bits = bpp;
        param.Stencilbuffer = stentil;
        param.DriverType = opengl ? EDT_OPENGL : EDT_DIRECT3D9;
 
        device = createDeviceEx(param);
 
        if (device)
        {
                driver = device->getVideoDriver();
                env = device->getGUIEnvironment();
                //driver->setTextureCreationFlag(ETCF_ALWAYS_32_BIT,true);
                smgr = device->getSceneManager();
                coll = smgr->getSceneCollisionManager();
                mm = smgr->getMeshManipulator();
                gpu = driver->getGPUProgrammingServices();
 
                //setCursorVisible(false);
                return true;
        }
        else
                return false;   
}
 

The shader is directly taken from slavik262's post.

Code: Select all

 
// Combined projection * view matrix, set by the application once per draw.
float4x4 viewProjection;
// Size of the per-batch world-matrix array. The application uploads
// 16*NUM_BATCH_INSTANCES floats into it, so this has to equal the constant
// of the same name on the C++ side.
#define NUM_BATCH_INSTANCES 60
// World matrix of every instance in the batch currently being drawn.
float4x4 instanceWorldArray[NUM_BATCH_INSTANCES];
 
// Per-vertex input of vs_main.
struct VertexInput 
{ 
   float3 position: POSITION; 
   // Declared but not read by vs_main.
   float3 normal : NORMAL; 
   float2 uv : TEXCOORD0; 
   // uv2.x holds the index of the instance this vertex belongs to; the
   // application writes it into TCoords2.X when it duplicates the mesh.
   float2 uv2 : TEXCOORD1; 
   float4 color : COLOR0; 
}; 
 
// Output of vs_main: transformed position plus colour and first texture
// coordinate copied from the input.
struct VertexOutput 
{ 
   float4 screenPos : POSITION; 
   float4 color : COLOR0; 
   float2 uv : TEXCOORD0; 
}; 
 
// Vertex shader entry point for shader instancing.
// Looks up the world matrix of the instance the vertex belongs to (index
// stored in uv2.x), transforms the position by it and then by the shared
// view-projection matrix, and passes colour and uv through unchanged.
VertexOutput vs_main(VertexInput IN) 
{ 
   VertexOutput OUT = (VertexOutput)0;
   // The index is stored as a float in the vertex data; convert explicitly.
   int index = (int)IN.uv2.x;
   // Two vector-matrix products give the same result as multiplying by the
   // combined world*viewProjection matrix, without building a full 4x4
   // matrix product for every vertex.
   float4 worldPos = mul(float4(IN.position,1), instanceWorldArray[index]);
   OUT.screenPos = mul(worldPos, viewProjection);
   OUT.color = IN.color;
   OUT.uv = IN.uv;
 
   return OUT; 
} 
 
[/img]


And the screen capture

admin-edit: Removed screenshot link as it didn't show, but instead brought up a dialog requesting a password.
RaverGames
Posts: 17
Joined: Sat Sep 25, 2010 8:54 am

Post by RaverGames »

Very helpful thanks ;)
slavik262
Posts: 753
Joined: Sun Nov 22, 2009 9:25 pm
Location: Wisconsin, USA

Post by slavik262 »

Very cool.

Seeing this made me realize that I had never posted my code. :oops:

I'll be sure to do that soon.
vinjn
Posts: 27
Joined: Tue May 20, 2008 4:45 am

Post by vinjn »

slavik262 wrote:Very cool.

Seeing this made me realize that I had never posted my code. :oops:

I'll be sure to do that soon.
You know what, it took me a few days to translate your English post into C++, and I learned a lot in the process.
Thanks again.
slavik262
Posts: 753
Joined: Sun Nov 22, 2009 9:25 pm
Location: Wisconsin, USA

Post by slavik262 »

To answer your question about clipping:

Clipping becomes difficult since you can really only clip each batch of 60 meshes. Since they're all technically one mesh, they all share a single draw call; this is the big advantage of shader instancing. It also means, though, that there's no way to not draw any of the 60. You can give them a scale of 0 to make them invisible, but the vertex information will still be processed by the GPU.
evandromillian
Posts: 40
Joined: Wed Apr 01, 2009 11:45 am
Location: São Paulo - Brazil

Post by evandromillian »

Is it possible to upload the frustum to the GPU and test each vertex against it? We would lose some performance in scenes like the ones above, but with an RTS army (where we can't ensure that all instances will be on screen) this could be a performance win.
Next generation for Irrlicht!!!!!
archmager
Posts: 3
Joined: Tue Nov 09, 2010 7:54 am

Post by archmager »

I tried this code with a mesh (size: 100k) and rendered it 100 times, but it is slower than the default method. I don't know the reason.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Post by devsh »

the mesh is bigger than vertex cache... plus try this instancing method in GLSL with geometry shader... you wouldn't need to use 2 texture coordinates to code for instances, just pass the instances for different objects
slavik262
Posts: 753
Joined: Sun Nov 22, 2009 9:25 pm
Location: Wisconsin, USA

Post by slavik262 »

devsh wrote:the mesh is bigger than vertex cache... plus try this instancing method in GLSL with geometry shader... you wouldn't need to use 2 texture coordinates to code for instances, just pass the instances for different objects
Yes, but then you need geometry shader support. :roll:
The only reason I'm using two texture coordinates is because Irrlicht lacks a flexible vertex format. Other than that, this is a pretty standard way to do it. See the DirectX SDK. My method is described as "shader instancing," and your geometry shader method is described as "hardware instancing."
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Post by devsh »

hardware instance == superior due to memory and transfer load

P.S. I'm converting this example to GLSL and adding a geometry shader instanced version
slavik262
Posts: 753
Joined: Sun Nov 22, 2009 9:25 pm
Location: Wisconsin, USA

Post by slavik262 »

devsh wrote:hardware instance == superior due to memory and transfer load
Hardware instance = only possible on newer cards. I know you love everything GLSL and geometry shader, but you must admit that there are times when a lower tech solution is advantageous.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Post by devsh »

EDIT: There was double culling, now I get 130 fps looking at it and 500 looking away, compared to 150fps without culling

ALTHOUGH it's only culling against frustum BOX not the FRUSTUM PLANES. as this type of culling brings the fps to 30 and 46

EDIT 2: After making it not transform the bbox by the AbsoluteTransformation each frame I have made it run at 145fps and 700 fps making it only 5 fps slower

EDIT 3: With 41 instances and GLSL shader translation of your method (not hardware instance) I have got 540 fps pretty close to 700 which is me rendering an empty screen (with culling). without culling I get 565 fps constant. I chose 41 instances to compare with hardware instance which can only do 41 triangles (however software and hardware instance can be combined to give a lot more). 82 instances give 540 fps too and so does 123 bring little change

OLD:
I added clipping/culling without instancing enabled and found that without culling I get 85fps and with culling I get 50fps looking at it and 94fps looking away. Culling is certainly NOT the way to go here, maybe per batch bounding box cull would be better?
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Post by devsh »

Geometry Shaders suck :(

Speed decrease.... anyway here is my code for box culling to make the thing run faster

Code: Select all

#include <irrlicht.h>

#pragma comment(lib,"irrlicht.lib")

#pragma warning( disable: 4244 )

using namespace irr;

using namespace core;
using namespace scene;
using namespace video;
using namespace io;
using namespace gui;

// Engine handles shared by the whole file. All start out null and are
// assigned by initIrrlicht() once the device has been created.
IrrlichtDevice* device = 0;
scene::ISceneManager* smgr = 0;
video::IVideoDriver* driver = 0;
gui::IGUIEnvironment* env = 0;
// Assigned in initIrrlicht() but not referenced anywhere else in this listing.
scene::ISceneCollisionManager* coll = 0;
scene::IMeshManipulator* mm = 0;
video::IGPUProgrammingServices* gpu = 0;

// Creates the device and fills in the handles above; defined after main().
bool initIrrlicht(s32 w, s32 h, bool opengl = true, s32 bpp=32, bool fullscreen=false, bool stentil = false);

// Number of mesh copies drawn per draw call. The GLSL vertex shader declares
// an array of the same size (its own NUM_BATCH_INSTANCES define).
const int NUM_BATCH_INSTANCES = 128;

// World matrices of the batch currently being drawn; filled in main() and
// uploaded to the vertex shader by InstancingShaderCB.
core::matrix4 instanceWorldArray[NUM_BATCH_INSTANCES];
// One empty scene node per instance; only its transformation is used.
core::array<ISceneNode*> totalNodeArray;
// Bounding box used for culling, stored at the same index as the node in
// totalNodeArray. Filled once in main() while the nodes are created.
core::array<core::aabbox3df> totalNodeBBoxArray;
// Projection * view of the current frame, computed in main() and uploaded
// to the vertex shader by InstancingShaderCB.
core::matrix4 viewProjection;

// Shader-constant callback of the instancing material. Every time the
// engine asks for constants it uploads the world matrices of the current
// batch followed by the shared view-projection matrix.
struct InstancingShaderCB : public video::IShaderConstantSetCallBack
{
   void OnSetConstants(video::IMaterialRendererServices* services,
      s32 /*userData*/)
   {
      // The matrix array is handed over as one flat run of floats,
      // 16 per matrix.
      const s32 worldFloatCount = 16*NUM_BATCH_INSTANCES;
      f32* const worldFloats = reinterpret_cast<f32*>(instanceWorldArray);

      services->setVertexShaderConstant("instanceWorldArray", worldFloats, worldFloatCount);
      services->setVertexShaderConstant("viewProjection", viewProjection.pointer(), 16);
   }
};

// Coarse visibility test for one instance.
//   tbox  bounding box of the instance
//   cam   camera to test against; may be null
// Returns true when tbox does not intersect the bounding box of the
// camera's view frustum. This tests against the frustum's box, not its
// planes, so an instance outside the frustum can still be reported visible.
// With a null camera nothing is culled (returns false).
//
// The box is taken by const reference: this runs once per node per frame
// and does not need its own copy.
bool isCulled(const core::aabbox3d<f32>& tbox, const scene::ICameraSceneNode* cam)
{
	if (!cam)
		return false;

	const core::aabbox3d<f32>& frustumBox = cam->getViewFrustum()->getBoundingBox();
	return !tbox.intersectsWithBox(frustumBox);
}

int main()
{
   if (!initIrrlicht(800,800))
      return -1;

   device->setWindowCaption(L"instancing");

   //mesh generation
   IMesh* aMesh = smgr->getGeometryCreator()->createSphereMesh(4,4,4);
   IMesh* bMesh = mm->createMeshWith2TCoords(aMesh);
   IMeshBuffer* bBuffer = bMesh->getMeshBuffer(0);

   //create dupBuffer with bBuffer repeated NUM_BATCH_INSTANCES times
   SMeshBufferLightMap dupBuffer;
   for (int k=0;k<NUM_BATCH_INSTANCES;k++)
   {
      S3DVertex2TCoords* verts = (S3DVertex2TCoords*)bBuffer->getVertices();
      for (u32 i=0; i<bBuffer->getVertexCount(); i++)
      {
         verts[i].TCoords2.X = k;//assign the index of instance that each vertex belongs to
      }
      dupBuffer.append(verts,bBuffer->getVertexCount(),bBuffer->getIndices(),bBuffer->getIndexCount());
      dupBuffer.setHardwareMappingHint(scene::EHM_STATIC);
   }

   //aMesh->drop();
   //bMesh->drop();

   //save transformation in one EmptySceneNode which doesn't render itself
   f32 scale = 18.f;
   for (u32 i=0;i<20;i++)
      for (u32 j=0;j<10;j++)
         for (u32 k=0;k<20;k++)
   {
      ISceneNode* empty = smgr->addEmptySceneNode();
      empty->setPosition(vector3df(i*scale,j*scale,k*scale));
      empty->setScale(vector3df(1+rand()%2));
      empty->setRotation(vector3df(rand()%360,rand()%360,rand()%360));
      totalNodeArray.push_back(empty);
      empty->updateAbsolutePosition();
      empty->getTransformedBoundingBox();
      totalNodeBBoxArray.push_back(empty->getTransformedBoundingBox());
   }

   scene::ICameraSceneNode* cam = smgr->addCameraSceneNodeFPS();

   SMaterial mtrl;
   //shader
   InstancingShaderCB* callback = new InstancingShaderCB();
   s32 mtrlShader = gpu->addHighLevelShaderMaterialFromFiles("../../media/instancing.vert", "", video::EVST_VS_2_0,
                            "../../media/instancing.frag", "", video::EPST_PS_2_0, /*"../../media/instancing.geom", "",
                            video::EGST_GS_4_0, scene::EPT_TRIANGLES,scene::EPT_TRIANGLES,24,*/ callback, video::EMT_SOLID);
   callback->drop();
   mtrl.Lighting = false;
   mtrl.setTexture(0, driver->getTexture("../../media/fire.bmp"));
   mtrl.MaterialType = (video::E_MATERIAL_TYPE)mtrlShader;
   device->getCursorControl()->setVisible(false);

   s32 lastFPS = -1;
   while(device->run())
   {
      if (device->isWindowActive())
      {
         driver->beginScene(true, true, SColor(255,122,122,122));

         smgr->drawAll();

         viewProjection = driver->getTransform(video::ETS_PROJECTION);
         viewProjection *= driver->getTransform(video::ETS_VIEW);
         {
            driver->setMaterial(mtrl);
            int nRemainingBoxes = totalNodeArray.size();
            int node_idx = 0;
            while( nRemainingBoxes > 0)
            {
               int nRenderBoxes = core::min_( nRemainingBoxes, NUM_BATCH_INSTANCES );

               nRemainingBoxes -= nRenderBoxes;
               for (int i=0;i<nRenderBoxes&&node_idx<totalNodeArray.size(); )
               {
                  if (!isCulled(totalNodeBBoxArray[node_idx],cam))
                  {
                    instanceWorldArray[i] = totalNodeArray[node_idx]->getAbsoluteTransformation();
                    i++;
                  }
                  else
                    nRemainingBoxes--;
                  node_idx++;
               }
               driver->drawMeshBuffer(&dupBuffer);
            }
         }
         env->drawAll();

         driver->endScene();

         int fps = driver->getFPS();

         if (lastFPS != fps)
         {
            core::stringw str = L"fps: ";
            str += fps;
            str += ", poly: ";
            str += driver->getPrimitiveCountDrawn();

            device->setWindowCaption(str.c_str());
            lastFPS = fps;
         }
      }
   }

   device->drop();

   return 0;
}


// Creates the Irrlicht device and caches the engine sub-systems in the
// file-level globals (driver, env, smgr, coll, mm, gpu).
//   w, h        window size in pixels
//   opengl      true selects EDT_OPENGL, false selects EDT_DIRECT3D9
//   bpp         colour depth passed as SIrrlichtCreationParameters::Bits
//   fullscreen  passed as SIrrlichtCreationParameters::Fullscreen
//   stentil     passed as SIrrlichtCreationParameters::Stencilbuffer
// Returns true when the device was created; on failure returns false and
// leaves every global except `device` untouched.
bool initIrrlicht(s32 w, s32 h, bool opengl, s32 bpp, bool fullscreen, bool stentil)
{
   SIrrlichtCreationParameters creation;
   creation.DriverType = opengl ? EDT_OPENGL : EDT_DIRECT3D9;
   creation.WindowSize = core::dimension2d<u32>(w, h);
   creation.Bits = bpp;
   creation.Fullscreen = fullscreen;
   creation.Stencilbuffer = stentil;
   creation.AntiAlias = true;

   device = createDeviceEx(creation);
   if (!device)
      return false;

   driver = device->getVideoDriver();
   env = device->getGUIEnvironment();
   smgr = device->getSceneManager();
   coll = smgr->getSceneCollisionManager();
   mm = smgr->getMeshManipulator();
   gpu = driver->getGPUProgrammingServices();
   return true;
}
and my glsl translation of shaders

Code: Select all

// Vertex shader for shader instancing (GLSL translation of the HLSL version).
// Combined projection * view matrix, set by the application.
uniform mat4 viewProjection;
// Must equal the application's batch size: it uploads that many matrices.
#define NUM_BATCH_INSTANCES 128
// World matrix of every instance in the batch currently being drawn.
uniform mat4 instanceWorldArray[NUM_BATCH_INSTANCES];

void main()
{
   // The second texture coordinate's x holds the instance index of the vertex.
   int index = int(gl_MultiTexCoord1.x);
   // Two matrix-vector products give the same result as multiplying by the
   // combined viewProjection*world matrix, without building a full 4x4
   // matrix product for every vertex.
   vec4 worldPos = instanceWorldArray[index]*gl_Vertex;
   gl_Position = viewProjection*worldPos;
   gl_TexCoord[0] = gl_MultiTexCoord0;
}

Code: Select all

// Fragment shader: outputs the texel of tex0 at the interpolated first
// texture coordinate, with no lighting or vertex colour applied.
uniform sampler2D tex0;

void main( void )
{
	vec4 texel = texture2D(tex0,gl_TexCoord[0].xy);
	gl_FragColor = texel;
}
kevinsbro
Posts: 51
Joined: Fri Nov 05, 2010 8:18 pm

Re: [HLSL]A Complete Shader Instancing Example

Post by kevinsbro »

So when trying to use this shader and expand on it, I noticed that the normal data also needs to be transformed.
How would I do this in the shader? I'm not very good with the math yet :(
REDDemon
Developer
Posts: 1044
Joined: Tue Aug 31, 2010 8:06 pm
Location: Genova (Italy)

Re: [HLSL]A Complete Shader Instancing Example

Post by REDDemon »

Is it only me, or does opening this page bring up a password prompt?
Junior Irrlicht Developer.
Real value in social networks is not about "increasing" number of followers, but about getting in touch with Amazing people.
- by Me
Post Reply