Code For Testing Virtual Function Speed

I’ve just updated my prior article on virtual function overhead with corrected timing numbers. I hadn’t noticed that my CPU cycle counts were only 32 bits wide, so timings of more than 2 seconds would wrap back around to zero.

If you want to run this test on your own hardware, I’ve put my code below the jump. You’ll have to build your own CFastTimer class, but it should be pretty clear what it does — it simply reads out of the CPU clock-cycle counter and computes a difference.

file 1: class definitions header

// Benchmark subject: a four-float vector whose getters and setters are all
// declared virtual. The accessors are only declared here; their definitions
// are expected in a separate translation unit.
class TestVector4_Virtual
{
public:
	// Accessors are kept in X, Y, Z, W order with each getter followed by
	// its setter; every setter takes the new value and returns a float.
	virtual float GetX() const;
	virtual float SetX( float in );
	virtual float GetY() const;
	virtual float SetY( float in );
	virtual float GetZ() const;
	virtual float SetZ( float in );
	virtual float GetW() const;
	virtual float SetW( float in );
private:
	// Component storage, one float per axis.
	float x;
	float y;
	float z;
	float w;
};
 
// Benchmark subject: a four-float vector with ordinary (non-virtual) member
// accessors. Each accessor is marked __declspec(noinline), the
// Microsoft-specific specifier that tells the compiler not to inline the
// function, so every use remains a real function call.
class TestVector4_Direct
{
public:
	// Getters are const and return a float; setters take the new value
	// and also return a float.
	__declspec(noinline) float GetX() const;
	__declspec(noinline) float SetX( float in );
	__declspec(noinline) float GetY() const;
	__declspec(noinline) float SetY( float in );
	__declspec(noinline) float GetZ() const;
	__declspec(noinline) float SetZ( float in );
	__declspec(noinline) float GetW() const;
	__declspec(noinline) float SetW( float in );
private:
	// Component storage, one float per axis.
	float x,y,z,w;
};
 
// Benchmark subject: a four-float vector whose accessors are declared
// `inline` and defined in the header, making their bodies visible to every
// caller.
class TestVector4_Inline
{
public:
	// Getters are const and return a float; setters take the new value
	// and also return a float.
	inline float GetX() const;
	inline float SetX( float in );
	inline float GetY() const;
	inline float SetY( float in );
	inline float GetZ() const;
	inline float SetZ( float in );
	inline float GetW() const;
	inline float SetW( float in );
private:
	// Component storage, one float per axis.
	float x;
	float y;
	float z;
	float w;
};

// Returns the current x component.
inline float TestVector4_Inline::GetX() const
{
	return this->x;
}

// Stores `in` as the x component and returns the value now held in x.
inline float TestVector4_Inline::SetX( float in )
{
	this->x = in;
	return this->x;
}
 
/* and so on for GetY, Z, W... */

file 2: class definitions cpp

These functions are defined here to prevent the compiler from inlining them when they’re used.

float TestVector4_Virtual::GetX() const
{
	return x;
}
float TestVector4_Virtual::SetX( float in )
{
	return x = in;
}
/* and so on for y,z,w... */
 
float TestVector4_Direct::GetX() const
{
	return x;
}
float TestVector4_Direct::SetX( float in )
{
	return x = in;
}
/* and so on for y,z,w... */

file 3: test loop

// Number of vectors in each of the input/output arrays built by TestTimings.
#define ARRAY_SIZE 1024
// Iteration count that RunTest prints in its summary line.
#define TEST_ITERATIONS 10000
 
// Fills `num` consecutive elements starting at `ptr` with random components.
// For each element the X, Y, Z and W setters are called in that order, each
// with a fresh RandomFloat(-1024.f, 1024.0f) value. Does nothing when `num`
// is zero or negative.
template <class T>
void InitWithRandom( T *ptr, int num )
{
	for ( int idx = 0 ; idx < num ; ++idx )
	{
		T &element = ptr[idx];
		element.SetX( RandomFloat(-1024.f, 1024.0f) );
		element.SetY( RandomFloat(-1024.f, 1024.0f) );
		element.SetZ( RandomFloat(-1024.f, 1024.0f) );
		element.SetW( RandomFloat(-1024.f, 1024.0f) );
	}
}
 
// The measured kernel: component-wise sum of two vector arrays.
// For every index i in [0, num), reads each component of in1[i] and in2[i]
// through T's getters and writes their sum into out[i] through T's setters,
// i.e. eight getter calls and four setter calls per element.
//
// in1, in2 : input arrays of at least `num` elements
// out      : output array of at least `num` elements
// num      : number of elements to process; nothing happens when num <= 0
//
// All three pointers are qualified with the RESTRICT macro (defined outside
// this listing; presumably expands to the compiler's restrict keyword, which
// would mean the arrays must not overlap -- confirm against its definition).
template <class T>
void SumTest( T * RESTRICT in1, T * RESTRICT in2, T * RESTRICT out, const int num )
{
	for ( int i = 0; i < num ; ++i )
	{
		out[i].SetX( in1[i].GetX() + in2[i].GetX() );
		out[i].SetY( in1[i].GetY() + in2[i].GetY() );
		out[i].SetZ( in1[i].GetZ() + in2[i].GetZ() );
		out[i].SetW( in1[i].GetW() + in2[i].GetW() );
	}
}
 
template <class T>
float TestTimings( )
{
	// set up input and output and preheat the cache
	T A[ ARRAY_SIZE ];
	T B[ ARRAY_SIZE ];
	T C[ ARRAY_SIZE ];
 
	InitWithRandom( A , ARRAY_SIZE );
	InitWithRandom( B , ARRAY_SIZE );
	InitWithRandom( C , ARRAY_SIZE );
 
	uint64 retval = 0;
	CFastTimer t1;
	int dontOptimizeThisLoopToNothing = 0;
	for ( int i = 0 ; i < N_ITERS ; ++i )
	{
		t1.Start();
		SumTest( A, B, C, ARRAY_SIZE );
		t1.End();
		dontOptimizeThisLoopToNothing  += i;
		retval += t1.GetClockCycleDelta();
	}
	// force compiler to actually use the data so it doesn't vanish the loop above
	float ac = 0;
	for ( int i = 0 ; i < ARRAY_SIZE ; ++i )
	{
		ac += C[i].GetX();
		ac += C[i].GetY();
		ac += C[i].GetZ();
		ac += C[i].GetW();
	}
	printf( "%f %dn", ac, dontOptimizeThisLoopToNothing  ); // just ignore these
	return CyclesToMilliseconds(retval) ;
}
 
void RunTest()
{
	// get timings for each type
	float tVirt, tDirect, tInline;
	tVirt = TestTimings< TestVector4_Virtual >();
	tDirect = TestTimings< TestVector4_Direct >();
	tInline = TestTimings< TestVector4_Inline >();
 
	printf( "n%d iterations over %d vectorsn", TEST_ITERATIONS , ARRAY_SIZE );
	printf( "virtual: %.3f msn", tVirt );
	printf( "direct: %.3f msn", tDirect );
	printf( "inline: %.3f msn", tInline );
}

Assembly output

And, just in case you’re curious, here’s the assembly the compiler generates for the different versions of SumTest:

Direct Function

; Compiler listing for SumTest<TestVector4_Direct>.
; Every accessor is reached with a direct `bl` to a fixed symbol: two Get
; calls and one Set call per component, twelve calls per loop iteration.
; Begin code for function: ??$SumTest@VTestVector4_Direct@@@@YAXPIAVTestVector4_Direct@@00H@Z
 
; 58   : {
 
	mflr         r12
	bl           __savegprlr_26
	stfd         fr31,-40h(r1)
	stwu         r1,-90h(r1)
.endprolog
$M89780:
 
; 59   : 	for ( int i = 0; i < num ; ++i )
 
; Skip the loop entirely when r6 <= 0. Otherwise r31 walks the array passed
; in r4, r27/r26 hold the byte distances from it to the arrays passed in
; r3/r5, and r28 is the remaining-iteration counter.
	cmpwi        cr6,r6,0
	ble          cr6,$LN1@SumTest@2
	mr           r31,r4
	subf         r27,r4,r3
	subf         r26,r4,r5
	mr           r28,r6
$LL3@SumTest@2:
 
; 60   : 	{
; 61   : 		out[i].SetX( in1[i].GetX() + in2[i].GetX() );
 
; r30/r29 = current element of the r3/r5 arrays. The first Get result is
; parked in fr31 across the second call, then fadds forms the sum in fr1
; for the Set call.
	add          r30,r27,r31
	add          r29,r26,r31
	mr           r3,r30
	bl           ?GetX@TestVector4_Direct@@QBAMXZ
	mr           r3,r31
	fmr          fr31,fr1
	bl           ?GetX@TestVector4_Direct@@QBAMXZ
	mr           r3,r29
	fadds        fr1,fr31,fr1
	bl           ?SetX@TestVector4_Direct@@QAAMM@Z
 
; 62   : 		out[i].SetY( in1[i].GetY() + in2[i].GetY() );
 
	mr           r3,r30
	bl           ?GetY@TestVector4_Direct@@QBAMXZ
	mr           r3,r31
	fmr          fr31,fr1
	bl           ?GetY@TestVector4_Direct@@QBAMXZ
	mr           r3,r29
	fadds        fr1,fr31,fr1
	bl           ?SetY@TestVector4_Direct@@QAAMM@Z
 
; 63   : 		out[i].SetZ( in1[i].GetZ() + in2[i].GetZ() );
 
	mr           r3,r30
	bl           ?GetZ@TestVector4_Direct@@QBAMXZ
	mr           r3,r31
	fmr          fr31,fr1
	bl           ?GetZ@TestVector4_Direct@@QBAMXZ
	mr           r3,r29
	fadds        fr1,fr31,fr1
	bl           ?SetZ@TestVector4_Direct@@QAAMM@Z
 
; 64   : 		out[i].SetW( in1[i].GetW() + in2[i].GetW() );
 
	mr           r3,r30
	bl           ?GetW@TestVector4_Direct@@QBAMXZ
	mr           r3,r31
	fmr          fr31,fr1
	bl           ?GetW@TestVector4_Direct@@QBAMXZ
	mr           r3,r29
	fadds        fr1,fr31,fr1
	bl           ?SetW@TestVector4_Direct@@QAAMM@Z
; Loop bookkeeping: decrement the counter, advance the element pointer by
; 16 bytes, and repeat while the counter is non-zero.
	addic.       r28,r28,-1		; 0FFFFh
	addi         r31,r31,16		; 10h
	bne          $LL3@SumTest@2
$LN1@SumTest@2:
 
; 65   : 	}
; 66   : }
 
	addi         r1,r1,144		; 90h
	lfd          fr31,-40h(r1)
	b            __restgprlr_26
$M89781:
; End code for function: ??$SumTest@VTestVector4_Direct@@@@YAXPIAVTestVector4_Direct@@00H@Z

Virtual Function

; Compiler listing for SumTest<TestVector4_Virtual>.
; Every accessor call is indirect: the word at offset 0 of the object is
; loaded (presumably the vtable pointer), a function address is loaded from
; a fixed offset within that table, moved to CTR with mtctr, and called
; with bctrl. Twelve such indirect calls are made per loop iteration.
??$SumTest@VTestVector4_Virtual@@@@YAXPIAVTestVector4_Virtual@@00H@Z PROC NEAR ; SumTest<TestVector4_Virtual>, COMDAT
 
; Begin code for function: ??$SumTest@VTestVector4_Virtual@@@@YAXPIAVTestVector4_Virtual@@00H@Z
 
; 58   : {
 
	mflr         r12
	bl           __savegprlr_25
	stfd         fr31,-48h(r1)
	stwu         r1,-0A0h(r1)
.endprolog
$M89754:
 
; 59   : 	for ( int i = 0; i < num ; ++i )
 
; Skip the loop entirely when r6 <= 0. Otherwise r31 walks the array passed
; in r4, r30/r29 hold the byte distances from it to the arrays passed in
; r3/r5, and r26 is the remaining-iteration counter.
	cmpwi        cr6,r6,0
	ble          cr6,$LN1@SumTest
	mr           r31,r4
	subf         r30,r4,r3
	subf         r29,r4,r5
	mr           r26,r6
$LL3@SumTest:
 
; 60   : 	{
; 61   : 		out[i].SetX( in1[i].GetX() + in2[i].GetX() );
 
; r27/r28 = current element of the r3/r5 arrays. Table offsets 0 and 4 are
; used here for the Get and Set calls respectively.
	lwz          r11,0(r31)
	add          r28,r29,r31
	lwzx         r25,r29,r31
	add          r27,r30,r31
	mr           r3,r31
	lwz          r10,0(r11)
	mtctr        r10
	bctrl
	lwzx         r9,r30,r31
	mr           r3,r27
	fmr          fr31,fr1
	lwz          r8,0(r9)
	mtctr        r8
	bctrl
	lwz          r7,4(r25)
	mr           r3,r28
	fadds        fr1,fr31,fr1
	mtctr        r7
	bctrl
 
; 62   : 		out[i].SetY( in1[i].GetY() + in2[i].GetY() );
 
; The object's first word is reloaded before each call; table offsets 8 and
; 0Ch are used for this component.
	lwz          r6,0(r31)
	mr           r3,r31
	lwz          r5,8(r6)
	lwzx         r25,r29,r31
	mtctr        r5
	bctrl
	lwzx         r4,r30,r31
	mr           r3,r27
	fmr          fr31,fr1
	lwz          r11,8(r4)
	mtctr        r11
	bctrl
	lwz          r10,0Ch(r25)
	mr           r3,r28
	fadds        fr1,fr31,fr1
	mtctr        r10
	bctrl
 
; 63   : 		out[i].SetZ( in1[i].GetZ() + in2[i].GetZ() );
 
	lwz          r9,0(r31)
	mr           r3,r31
	lwz          r8,10h(r9)
	lwzx         r25,r29,r31
	mtctr        r8
	bctrl
	lwzx         r7,r30,r31
	mr           r3,r27
	fmr          fr31,fr1
	lwz          r6,10h(r7)
	mtctr        r6
	bctrl
	lwz          r5,14h(r25)
	mr           r3,r28
	fadds        fr1,fr31,fr1
	mtctr        r5
	bctrl
 
; 64   : 		out[i].SetW( in1[i].GetW() + in2[i].GetW() );
 
	lwz          r4,0(r31)
	mr           r3,r31
	lwz          r11,18h(r4)
	lwzx         r25,r29,r31
	mtctr        r11
	bctrl
	lwzx         r10,r30,r31
	fmr          fr31,fr1
	mr           r3,r27
	lwz          r9,18h(r10)
	mtctr        r9
	bctrl
	lwz          r8,1Ch(r25)
	fadds        fr1,fr31,fr1
	mr           r3,r28
	mtctr        r8
	bctrl
; Loop bookkeeping: decrement the counter, advance the element pointer by
; 20 bytes (four more than in the direct-call listing), and repeat while the
; counter is non-zero.
	addic.       r26,r26,-1		; 0FFFFh
	addi         r31,r31,20		; 14h
	bne          $LL3@SumTest
$LN1@SumTest:
 
; 65   : 	}
; 66   : }
 
	addi         r1,r1,160		; 0A0h
	lfd          fr31,-48h(r1)
	b            __restgprlr_25
$M89755:
; End code for function: ??$SumTest@VTestVector4_Virtual@@@@YAXPIAVTestVector4_Virtual@@00H@Z

Inlined Function

(notice the use of software pipelining to reduce hazards)

; Compiler listing for SumTest<TestVector4_Inline>.
; Neither loop contains a call (no bl/bctrl): the accessors have been
; reduced to plain lfs loads, fadds adds and stfs stores. The code has two
; loops: $LL34 processes four elements per pass (pointers advance 64 bytes)
; and $LC3 processes any remaining elements one at a time (16 bytes each).
; Begin code for function: ??$SumTest@VTestVector4_Inline@@@@YAXPIAVTestVector4_Inline@@00H@Z
 
; 58   : {
 
	mflr         r12
	bl           __savegprlr_29
	stfd         fr29,-38h(r1)
	stfd         fr30,-30h(r1)
	stfd         fr31,-28h(r1)
.endprolog
$M89879:
 
; 59   : 	for ( int i = 0; i < num ; ++i )
 
; r7 = number of elements already handled (starts at 0). With fewer than four
; elements the four-at-a-time loop is skipped. Otherwise r8 = ((r6 - 4) >> 2) + 1
; is its pass count, and r7 is set to r8 * 4 for the remainder loop.
	li           r7,0
	cmpwi        cr6,r6,4
	blt          cr6,$LC33@SumTest@3
	addi         r11,r6,-4		; 0FFFCh
	addi         r9,r3,16		; 10h
	srwi         r11,r11,2
	addi         r10,r5,8
	addi         r8,r11,1
	addi         r11,r4,4
 
; 64   : 		out[i].SetW( in1[i].GetW() + in2[i].GetW() );
 
	subf         r31,r4,r3
	subf         r30,r4,r5
	subf         r29,r5,r3
	slwi         r7,r8,2
; Four-at-a-time loop: sixteen loads from each input, sixteen adds, then
; sixteen stores, with loads and adds interleaved ahead of the stores.
$LL34@SumTest@3:
	lfs          fr0,-4(r11)
	addic.       r8,r8,-1		; 0FFFFh
	lfs          fr13,-10h(r9)
	lfsx         fr12,r31,r11
	fadds        fr11,fr0,fr13
	lfs          fr10,0(r11)
	fadds        fr8,fr12,fr10
	lfsx         fr9,r10,r29
	lfs          fr7,4(r11)
	lfs          fr6,8(r11)
	fadds        fr5,fr9,fr7
	lfs          fr4,-4(r9)
	lfs          fr3,0Ch(r11)
	fadds        fr2,fr6,fr4
	lfs          fr1,0(r9)
	lfs          fr0,10h(r11)
	fadds        fr13,fr3,fr1
	lfs          fr12,4(r9)
	lfs          fr10,14h(r11)
	fadds        fr9,fr0,fr12
	lfs          fr7,8(r9)
	lfs          fr6,18h(r11)
	fadds        fr4,fr10,fr7
	lfs          fr3,0Ch(r9)
	lfs          fr1,1Ch(r11)
	fadds        fr0,fr6,fr3
	lfs          fr12,10h(r9)
	lfs          fr10,20h(r11)
	fadds        fr7,fr1,fr12
	lfs          fr6,14h(r9)
	lfs          fr3,24h(r11)
	fadds        fr1,fr10,fr6
	lfs          fr12,18h(r9)
	fadds        fr6,fr3,fr12
	lfs          fr3,1Ch(r9)
	lfs          fr10,28h(r11)
	fadds        fr10,fr10,fr3
	lfs          fr3,20h(r9)
	lfs          fr12,2Ch(r11)
	fadds        fr12,fr12,fr3
	lfs          fr31,30h(r11)
	lfs          fr3,24h(r9)
	fadds        fr3,fr31,fr3
	lfs          fr30,34h(r11)
	lfs          fr31,28h(r9)
	fadds        fr31,fr30,fr31
	lfs          fr29,38h(r11)
	lfs          fr30,2Ch(r9)
	addi         r9,r9,64		; 40h
	fadds        fr30,fr29,fr30
	stfs         fr11,-8(r10)
	stfsx        fr8,r30,r11
	addi         r11,r11,64		; 40h
	stfs         fr5,0(r10)
	stfs         fr2,4(r10)
	stfs         fr13,8(r10)
	stfs         fr9,0Ch(r10)
	stfs         fr4,10h(r10)
	stfs         fr0,14h(r10)
	stfs         fr7,18h(r10)
	stfs         fr1,1Ch(r10)
	stfs         fr6,20h(r10)
	stfs         fr10,24h(r10)
	stfs         fr12,28h(r10)
	stfs         fr3,2Ch(r10)
	stfs         fr31,30h(r10)
	stfs         fr30,34h(r10)
	addi         r10,r10,64		; 40h
	bne          $LL34@SumTest@3
$LC33@SumTest@3:
 
; 59   : 	for ( int i = 0; i < num ; ++i )
 
; Remainder: nothing to do when r7 >= r6. Otherwise the three pointers are
; rebuilt at byte offset r7 * 16 and r8 = r6 - r7 elements are left.
	cmpw         cr6,r7,r6
	bge          cr6,$LN32@SumTest@3
	slwi         r11,r7,4
	subf         r31,r4,r3
	add          r8,r11,r4
	add          r10,r11,r5
	add          r9,r11,r3
	addi         r11,r8,4
	subf         r4,r4,r5
	addi         r10,r10,8
	subf         r5,r5,r3
	subf         r8,r7,r6
$LC3@SumTest@3:
 
; 60   : 	{
; 61   : 		out[i].SetX( in1[i].GetX() + in2[i].GetX() );
 
	lfs          fr0,-4(r11)
	addic.       r8,r8,-1		; 0FFFFh
	lfs          fr13,0(r9)
 
; 62   : 		out[i].SetY( in1[i].GetY() + in2[i].GetY() );
 
	lfsx         fr12,r31,r11
	fadds        fr11,fr0,fr13
	lfs          fr10,0(r11)
 
; 63   : 		out[i].SetZ( in1[i].GetZ() + in2[i].GetZ() );
 
	lfsx         fr9,r10,r5
	fadds        fr8,fr12,fr10
	lfs          fr7,4(r11)
 
; 64   : 		out[i].SetW( in1[i].GetW() + in2[i].GetW() );
 
	lfs          fr6,8(r11)
	fadds        fr5,fr9,fr7
	lfs          fr4,0Ch(r9)
	addi         r9,r9,16		; 10h
	fadds        fr3,fr6,fr4
	stfs         fr11,-8(r10)
	stfsx        fr8,r4,r11
	addi         r11,r11,16		; 10h
	stfs         fr5,0(r10)
	stfs         fr3,4(r10)
	addi         r10,r10,16		; 10h
	bne          $LC3@SumTest@3
$LN32@SumTest@3:
 
; 65   : 	}
; 66   : }
 
	lfd          fr29,-38h(r1)
	lfd          fr30,-30h(r1)
	lfd          fr31,-28h(r1)
	b            __restgprlr_29
$M89880:
; End code for function: ??$SumTest@VTestVector4_Inline@@@@YAXPIAVTestVector4_Inline@@00H@Z

2 Comments

  1. […] article by Elan Ruskin, when he measures the overhead of virtual functions. There’s also a follow-up with test code, make sure to check it out as well. I’ve decided to extend the test […]

  2. mos says:

    I discovered this blog through a friend’s shared item in Google Reader months ago. I’ve just now remembered it existed, and thought I’d drop in to say that I’ve really enjoyed the posts so far, and wouldn’t mind more! 🙂

Leave a Reply