I’ve just updated my prior article on virtual function overhead with corrected timing numbers — I hadn’t noticed that my CPU cycle counts were only 32 bits wide, so timings of more than 2 seconds would wrap back around to zero.
If you want to run this test on your own hardware, I’ve put my code below the jump. You’ll have to build your own CFastTimer class, but it should be pretty clear what it does — it simply reads the CPU clock-cycle counter and computes the difference between two readings.
file 1: class definitions header
// Four-float vector whose accessors are all virtual, so every call made
// through a pointer or reference is dispatched through the vtable.
// The member functions are only declared here; they are defined out of line.
class TestVector4_Virtual
{
    float x;
    float y;
    float z;
    float w;

public:
    virtual float GetX() const;
    virtual float SetX( float in );
    virtual float GetY() const;
    virtual float SetY( float in );
    virtual float GetZ() const;
    virtual float SetZ( float in );
    virtual float GetW() const;
    virtual float SetW( float in );
};

// Same data and accessors, but as ordinary member functions marked
// __declspec(noinline) so each call stays a real (direct) function call.
// The member functions are only declared here; they are defined out of line.
class TestVector4_Direct
{
    float x;
    float y;
    float z;
    float w;

public:
    __declspec(noinline) float GetX() const;
    __declspec(noinline) float SetX( float in );
    __declspec(noinline) float GetY() const;
    __declspec(noinline) float SetY( float in );
    __declspec(noinline) float GetZ() const;
    __declspec(noinline) float SetZ( float in );
    __declspec(noinline) float GetW() const;
    __declspec(noinline) float SetW( float in );
};

// Same data and accessors again, declared inline and defined in this header
// so the compiler is free to expand them at the call site.
class TestVector4_Inline
{
    float x;
    float y;
    float z;
    float w;

public:
    inline float GetX() const;
    inline float SetX( float in );
    inline float GetY() const;
    inline float SetY( float in );
    inline float GetZ() const;
    inline float SetZ( float in );
    inline float GetW() const;
    inline float SetW( float in );
};

// Returns the x component.
inline float TestVector4_Inline::GetX() const
{
    return x;
}

// Stores `in` as the x component and returns the stored value.
inline float TestVector4_Inline::SetX( float in )
{
    x = in;
    return x;
}

/* and so on for GetY, Z, W... */
file 2: class definitions cpp
These functions are defined here to prevent the compiler from inlining them when they’re used.
float TestVector4_Virtual::GetX() const { return x; } float TestVector4_Virtual::SetX( float in ) { return x = in; } /* and so on for y,z,w... */ float TestVector4_Direct::GetX() const { return x; } float TestVector4_Direct::SetX( float in ) { return x = in; } /* and so on for y,z,w... */ |
file 3: test loop
#define ARRAY_SIZE 1024 #define TEST_ITERATIONS 10000 template <class T> void InitWithRandom( T *ptr, int num ) { while( num > 0 ) { ptr->SetX( RandomFloat(-1024.f, 1024.0f) ); ptr->SetY( RandomFloat(-1024.f, 1024.0f) ); ptr->SetZ( RandomFloat(-1024.f, 1024.0f) ); ptr->SetW( RandomFloat(-1024.f, 1024.0f) ); ++ptr; --num; } } template <class T> void SumTest( T * RESTRICT in1, T * RESTRICT in2, T * RESTRICT out, const int num ) { for ( int i = 0; i < num ; ++i ) { out[i].SetX( in1[i].GetX() + in2[i].GetX() ); out[i].SetY( in1[i].GetY() + in2[i].GetY() ); out[i].SetZ( in1[i].GetZ() + in2[i].GetZ() ); out[i].SetW( in1[i].GetW() + in2[i].GetW() ); } } template <class T> float TestTimings( ) { // set up input and output and preheat the cache T A[ ARRAY_SIZE ]; T B[ ARRAY_SIZE ]; T C[ ARRAY_SIZE ]; InitWithRandom( A , ARRAY_SIZE ); InitWithRandom( B , ARRAY_SIZE ); InitWithRandom( C , ARRAY_SIZE ); uint64 retval = 0; CFastTimer t1; int dontOptimizeThisLoopToNothing = 0; for ( int i = 0 ; i < N_ITERS ; ++i ) { t1.Start(); SumTest( A, B, C, ARRAY_SIZE ); t1.End(); dontOptimizeThisLoopToNothing += i; retval += t1.GetClockCycleDelta(); } // force compiler to actually use the data so it doesn't vanish the loop above float ac = 0; for ( int i = 0 ; i < ARRAY_SIZE ; ++i ) { ac += C[i].GetX(); ac += C[i].GetY(); ac += C[i].GetZ(); ac += C[i].GetW(); } printf( "%f %d\n", ac, dontOptimizeThisLoopToNothing ); // just ignore these return CyclesToMilliseconds(retval) ; } void RunTest() { // get timings for each type float tVirt, tDirect, tInline; tVirt = TestTimings< TestVector4_Virtual >(); tDirect = TestTimings< TestVector4_Direct >(); tInline = TestTimings< TestVector4_Inline >(); printf( "\n%d iterations over %d vectors\n", TEST_ITERATIONS , ARRAY_SIZE ); printf( "virtual: %.3f ms\n", tVirt ); printf( "direct: %.3f ms\n", tDirect ); printf( "inline: %.3f ms\n", tInline ); } |
Assembly output
And, just in case you’re curious, here’s the assembly the compiler generates for the different versions of SumTest:
Direct Function
; Compiler listing for SumTest<TestVector4_Direct>: every accessor is reached
; with a direct `bl` to a fixed target, twelve calls per loop iteration.
; On entry r3/r4/r5/r6 appear to hold in1/in2/out/num (matching the C++
; parameter order) -- confirm against the platform calling convention.
; Begin code for function: ??$SumTest@VTestVector4_Direct@@@@YAXPIAVTestVector4_Direct@@00H@Z
; 58 : {
; Prologue: copy the link register to r12, call the register-save helper
; (by its name presumably saves r26-r31 -- confirm), spill fr31 and open a
; 90h-byte stack frame.
mflr r12
bl __savegprlr_26
stfd fr31,-40h(r1)
stwu r1,-90h(r1)
.endprolog
$M89780:
; 59 : for ( int i = 0; i < num ; ++i )
; Skip the loop entirely when num <= 0.
cmpwi cr6,r6,0
ble cr6,$LN1@SumTest@2
; r31 walks the in2 array; r27 = in1 - in2 and r26 = out - in2 are constant
; byte offsets, so one moving pointer addresses all three arrays.
; r28 is the remaining-iteration counter.
mr r31,r4
subf r27,r4,r3
subf r26,r4,r5
mr r28,r6
$LL3@SumTest@2:
; 60 : {
; 61 : out[i].SetX( in1[i].GetX() + in2[i].GetX() );
; r30 = &in1[i], r29 = &out[i]; r3 carries the object pointer into each call.
add r30,r27,r31
add r29,r26,r31
mr r3,r30
bl ?GetX@TestVector4_Direct@@QBAMXZ
mr r3,r31
; The first getter's result (fr1) is parked in fr31 across the second call.
fmr fr31,fr1
bl ?GetX@TestVector4_Direct@@QBAMXZ
mr r3,r29
; Sum of the two getter results becomes the setter's float argument.
fadds fr1,fr31,fr1
bl ?SetX@TestVector4_Direct@@QAAMM@Z
; 62 : out[i].SetY( in1[i].GetY() + in2[i].GetY() );
mr r3,r30
bl ?GetY@TestVector4_Direct@@QBAMXZ
mr r3,r31
fmr fr31,fr1
bl ?GetY@TestVector4_Direct@@QBAMXZ
mr r3,r29
fadds fr1,fr31,fr1
bl ?SetY@TestVector4_Direct@@QAAMM@Z
; 63 : out[i].SetZ( in1[i].GetZ() + in2[i].GetZ() );
mr r3,r30
bl ?GetZ@TestVector4_Direct@@QBAMXZ
mr r3,r31
fmr fr31,fr1
bl ?GetZ@TestVector4_Direct@@QBAMXZ
mr r3,r29
fadds fr1,fr31,fr1
bl ?SetZ@TestVector4_Direct@@QAAMM@Z
; 64 : out[i].SetW( in1[i].GetW() + in2[i].GetW() );
mr r3,r30
bl ?GetW@TestVector4_Direct@@QBAMXZ
mr r3,r31
fmr fr31,fr1
bl ?GetW@TestVector4_Direct@@QBAMXZ
mr r3,r29
fadds fr1,fr31,fr1
bl ?SetW@TestVector4_Direct@@QAAMM@Z
; Decrement the counter (setting the condition flags), advance the in2
; pointer by 16 bytes (four floats) and loop while the counter is non-zero.
addic. r28,r28,-1 ; 0FFFFh
addi r31,r31,16 ; 10h
bne $LL3@SumTest@2
$LN1@SumTest@2:
; 65 : }
; 66 : }
; Epilogue: release the 90h-byte frame, reload fr31 and leave through the
; register-restore helper.
addi r1,r1,144 ; 90h
lfd fr31,-40h(r1)
b __restgprlr_26
$M89781:
; End code for function: ??$SumTest@VTestVector4_Direct@@@@YAXPIAVTestVector4_Direct@@00H@Z |
Virtual Function
; Compiler listing for SumTest<TestVector4_Virtual>: every accessor call is an
; indirect call. The pattern is: load the first word of the object (the
; vtable pointer), load a function address from a fixed offset in that
; table, move it to the count register (mtctr) and branch-and-link through
; it (bctrl).
; On entry r3/r4/r5/r6 appear to hold in1/in2/out/num (matching the C++
; parameter order) -- confirm against the platform calling convention.
??$SumTest@VTestVector4_Virtual@@@@YAXPIAVTestVector4_Virtual@@00H@Z PROC NEAR ; SumTest<TestVector4_Virtual>, COMDAT
; Begin code for function: ??$SumTest@VTestVector4_Virtual@@@@YAXPIAVTestVector4_Virtual@@00H@Z
; 58 : {
; Prologue: copy the link register to r12, call the register-save helper
; (by its name presumably saves r25-r31 -- confirm), spill fr31 and open a
; 0A0h-byte stack frame.
mflr r12
bl __savegprlr_25
stfd fr31,-48h(r1)
stwu r1,-0A0h(r1)
.endprolog
$M89754:
; 59 : for ( int i = 0; i < num ; ++i )
; Skip the loop entirely when num <= 0.
cmpwi cr6,r6,0
ble cr6,$LN1@SumTest
; r31 walks the in2 array; r30 = in1 - in2 and r29 = out - in2 are constant
; byte offsets. r26 is the remaining-iteration counter.
mr r31,r4
subf r30,r4,r3
subf r29,r4,r5
mr r26,r6
$LL3@SumTest:
; 60 : {
; 61 : out[i].SetX( in1[i].GetX() + in2[i].GetX() );
; r28 = &out[i], r27 = &in1[i]. r25 receives out[i]'s first word (its vtable
; pointer) for the setter call. Table offsets used in this statement:
; 0 for the getter, 4 for the setter.
lwz r11,0(r31)
add r28,r29,r31
lwzx r25,r29,r31
add r27,r30,r31
mr r3,r31
lwz r10,0(r11)
mtctr r10
bctrl
lwzx r9,r30,r31
mr r3,r27
; The first getter's result (fr1) is parked in fr31 across the second call.
fmr fr31,fr1
lwz r8,0(r9)
mtctr r8
bctrl
lwz r7,4(r25)
mr r3,r28
fadds fr1,fr31,fr1
mtctr r7
bctrl
; 62 : out[i].SetY( in1[i].GetY() + in2[i].GetY() );
; The vtable pointers are loaded again after each call; table offsets here
; are 8 (getter) and 0Ch (setter).
lwz r6,0(r31)
mr r3,r31
lwz r5,8(r6)
lwzx r25,r29,r31
mtctr r5
bctrl
lwzx r4,r30,r31
mr r3,r27
fmr fr31,fr1
lwz r11,8(r4)
mtctr r11
bctrl
lwz r10,0Ch(r25)
mr r3,r28
fadds fr1,fr31,fr1
mtctr r10
bctrl
; 63 : out[i].SetZ( in1[i].GetZ() + in2[i].GetZ() );
; Table offsets here are 10h (getter) and 14h (setter).
lwz r9,0(r31)
mr r3,r31
lwz r8,10h(r9)
lwzx r25,r29,r31
mtctr r8
bctrl
lwzx r7,r30,r31
mr r3,r27
fmr fr31,fr1
lwz r6,10h(r7)
mtctr r6
bctrl
lwz r5,14h(r25)
mr r3,r28
fadds fr1,fr31,fr1
mtctr r5
bctrl
; 64 : out[i].SetW( in1[i].GetW() + in2[i].GetW() );
; Table offsets here are 18h (getter) and 1Ch (setter).
lwz r4,0(r31)
mr r3,r31
lwz r11,18h(r4)
lwzx r25,r29,r31
mtctr r11
bctrl
lwzx r10,r30,r31
fmr fr31,fr1
mr r3,r27
lwz r9,18h(r10)
mtctr r9
bctrl
lwz r8,1Ch(r25)
fadds fr1,fr31,fr1
mr r3,r28
mtctr r8
bctrl
; Decrement the counter (setting the condition flags) and advance the in2
; pointer by 20 bytes per object -- four floats plus the 4-byte word that is
; loaded from offset 0 above.
addic. r26,r26,-1 ; 0FFFFh
addi r31,r31,20 ; 14h
bne $LL3@SumTest
$LN1@SumTest:
; 65 : }
; 66 : }
; Epilogue: release the 0A0h-byte frame, reload fr31 and leave through the
; register-restore helper.
addi r1,r1,160 ; 0A0h
lfd fr31,-48h(r1)
b __restgprlr_25
$M89755:
; End code for function: ??$SumTest@VTestVector4_Virtual@@@@YAXPIAVTestVector4_Virtual@@00H@Z |
Inlined Function
(notice the use of software pipelining to reduce hazards)
; Compiler listing for SumTest<TestVector4_Inline>: the accessors have been
; expanded in place, so the function contains no calls in its loops -- only
; float loads (lfs/lfsx), adds (fadds) and stores (stfs/stfsx).
; It has two loops: a main loop that processes four vectors (64 bytes of each
; array) per iteration, and a remainder loop that processes one vector
; (16 bytes) per iteration.
; On entry r3/r4/r5/r6 appear to hold in1/in2/out/num (matching the C++
; parameter order) -- confirm against the platform calling convention.
; Begin code for function: ??$SumTest@VTestVector4_Inline@@@@YAXPIAVTestVector4_Inline@@00H@Z
; 58 : {
; Prologue: save non-volatile registers; note there is no stwu here, i.e. no
; new stack frame is opened in this listing.
mflr r12
bl __savegprlr_29
stfd fr29,-38h(r1)
stfd fr30,-30h(r1)
stfd fr31,-28h(r1)
.endprolog
$M89879:
; 59 : for ( int i = 0; i < num ; ++i )
; r7 = number of elements already processed (0 so far). With fewer than four
; elements, go straight to the remainder loop.
li r7,0
cmpwi cr6,r6,4
blt cr6,$LC33@SumTest@3
; r8 = ((num - 4) >> 2) + 1 = number of four-vector iterations.
; r9, r10 and r11 are biased cursors into in1, out and in2 respectively.
addi r11,r6,-4 ; 0FFFCh
addi r9,r3,16 ; 10h
srwi r11,r11,2
addi r10,r5,8
addi r8,r11,1
addi r11,r4,4
; 64 : out[i].SetW( in1[i].GetW() + in2[i].GetW() );
; r31 = in1 - in2, r30 = out - in2, r29 = in1 - out: constant offsets used by
; the indexed loads/stores. r7 = r8 * 4 = elements the main loop will cover.
subf r31,r4,r3
subf r30,r4,r5
subf r29,r5,r3
slwi r7,r8,2
$LL34@SumTest@3:
; Main loop body: sixteen load/load/add groups (4 vectors x 4 components)
; interleaved with one another, with all sixteen stores issued at the end.
lfs fr0,-4(r11)
addic. r8,r8,-1 ; 0FFFFh
lfs fr13,-10h(r9)
lfsx fr12,r31,r11
fadds fr11,fr0,fr13
lfs fr10,0(r11)
fadds fr8,fr12,fr10
lfsx fr9,r10,r29
lfs fr7,4(r11)
lfs fr6,8(r11)
fadds fr5,fr9,fr7
lfs fr4,-4(r9)
lfs fr3,0Ch(r11)
fadds fr2,fr6,fr4
lfs fr1,0(r9)
lfs fr0,10h(r11)
fadds fr13,fr3,fr1
lfs fr12,4(r9)
lfs fr10,14h(r11)
fadds fr9,fr0,fr12
lfs fr7,8(r9)
lfs fr6,18h(r11)
fadds fr4,fr10,fr7
lfs fr3,0Ch(r9)
lfs fr1,1Ch(r11)
fadds fr0,fr6,fr3
lfs fr12,10h(r9)
lfs fr10,20h(r11)
fadds fr7,fr1,fr12
lfs fr6,14h(r9)
lfs fr3,24h(r11)
fadds fr1,fr10,fr6
lfs fr12,18h(r9)
fadds fr6,fr3,fr12
lfs fr3,1Ch(r9)
lfs fr10,28h(r11)
fadds fr10,fr10,fr3
lfs fr3,20h(r9)
lfs fr12,2Ch(r11)
fadds fr12,fr12,fr3
lfs fr31,30h(r11)
lfs fr3,24h(r9)
fadds fr3,fr31,fr3
lfs fr30,34h(r11)
lfs fr31,28h(r9)
fadds fr31,fr30,fr31
lfs fr29,38h(r11)
lfs fr30,2Ch(r9)
; Each cursor advances by 64 bytes (four 16-byte vectors) per iteration.
addi r9,r9,64 ; 40h
fadds fr30,fr29,fr30
stfs fr11,-8(r10)
stfsx fr8,r30,r11
addi r11,r11,64 ; 40h
stfs fr5,0(r10)
stfs fr2,4(r10)
stfs fr13,8(r10)
stfs fr9,0Ch(r10)
stfs fr4,10h(r10)
stfs fr0,14h(r10)
stfs fr7,18h(r10)
stfs fr1,1Ch(r10)
stfs fr6,20h(r10)
stfs fr10,24h(r10)
stfs fr12,28h(r10)
stfs fr3,2Ch(r10)
stfs fr31,30h(r10)
stfs fr30,34h(r10)
addi r10,r10,64 ; 40h
; Branches on the condition flags set by the addic. near the top of the loop.
bne $LL34@SumTest@3
$LC33@SumTest@3:
; 59 : for ( int i = 0; i < num ; ++i )
; Remainder: nothing to do if the processed count r7 has reached num.
cmpw cr6,r7,r6
bge cr6,$LN32@SumTest@3
; r11 = r7 * 16 = byte offset of the first unprocessed vector; the cursors and
; constant offsets are rebuilt from it, and r8 = num - r7 = vectors left.
slwi r11,r7,4
subf r31,r4,r3
add r8,r11,r4
add r10,r11,r5
add r9,r11,r3
addi r11,r8,4
subf r4,r4,r5
addi r10,r10,8
subf r5,r5,r3
subf r8,r7,r6
$LC3@SumTest@3:
; Remainder loop body: one vector per iteration -- four load/load/add groups
; followed by four stores; cursors advance by 16 bytes.
; 60 : {
; 61 : out[i].SetX( in1[i].GetX() + in2[i].GetX() );
lfs fr0,-4(r11)
addic. r8,r8,-1 ; 0FFFFh
lfs fr13,0(r9)
; 62 : out[i].SetY( in1[i].GetY() + in2[i].GetY() );
lfsx fr12,r31,r11
fadds fr11,fr0,fr13
lfs fr10,0(r11)
; 63 : out[i].SetZ( in1[i].GetZ() + in2[i].GetZ() );
lfsx fr9,r10,r5
fadds fr8,fr12,fr10
lfs fr7,4(r11)
; 64 : out[i].SetW( in1[i].GetW() + in2[i].GetW() );
lfs fr6,8(r11)
fadds fr5,fr9,fr7
lfs fr4,0Ch(r9)
addi r9,r9,16 ; 10h
fadds fr3,fr6,fr4
stfs fr11,-8(r10)
stfsx fr8,r4,r11
addi r11,r11,16 ; 10h
stfs fr5,0(r10)
stfs fr3,4(r10)
addi r10,r10,16 ; 10h
bne $LC3@SumTest@3
$LN32@SumTest@3:
; 65 : }
; 66 : }
; Epilogue: reload the saved floating-point registers and leave through the
; register-restore helper.
lfd fr29,-38h(r1)
lfd fr30,-30h(r1)
lfd fr31,-28h(r1)
b __restgprlr_29
$M89880:
; End code for function: ??$SumTest@VTestVector4_Inline@@@@YAXPIAVTestVector4_Inline@@00H@Z |
Virtual functions – an experiment | .mischief.mayhem.soap. says:
[...] article by Elan Ruskin, when he measures the overhead of virtual functions. There’s also a follow-up with test code, make sure to check it out as well. I’ve decided to extend the test [...]
June 14, 2009, 5:25 am
mos says:
I discovered this blog through a friend’s shared item in Google Reader months ago. I’ve just now remembered it existed, and thought I’d drop in to say that I’ve really enjoyed the posts so far, and wouldn’t mind more!
August 5, 2009, 8:40 am