flipcode - Faster Float-To-Int Conversion

/*

Editor's note:  
COTD Entry:  Faster Float-To-Int Conversion by JCAB [jcab@roningames.com]

   Warning: this COTD is seriously MSVC-only. It might apply to other compilers (in fact, I'm certain it does apply to, at least, other older compilers). But the implementation doesn't.

   Warning: some assembly required if you want to understand how this is done.

   Recently, two people have asked around me about the float-to-int conversion routines being very slow. This is a problem endemic to the way the Microsoft C library implements a little function called _ftol().

   So, I've been trying out alternative implementations of this routine, and I've come up with several. I've been testing them on a 700 MHZ PC using this programs:

*/

// This first one checks the alternative implementation for correctness.
// It needs the _ftol() function used (see below) rewritten as
// "int FTOL(float)" and changed accordingly:

volatile int k;

int main(int argc, char* argv[])
{
 char s[4096];

 enum { NUMTESTS = 10000000 };
 static float rndbuf[NUMTESTS];

 int i;
 for (i = 0; i < NUMTESTS; ++i) {
  int intg1 = rand();
  int intg2 = rand();
  int intg3 = rand();
  int frac = rand();
  rndbuf[i] = float(intg1) - float(intg2) + float(intg3) + float(frac) / RAND_MAX;
 }

 for (i = 0; i < NUMTESTS; ++i) {
  k = rndbuf[i];
  if (k != FIST(rndbuf[i])) {
   sprintf(s, "Bad: %f (%08x) -> %d != %d\n", rndbuf[i], *(int*)&rndbuf[i], int(rndbuf[i]), FIST(rndbuf[i]));
   OutputDebugString(s);
  }
 }

 return 0;
}



// This program checks for performance:

volatile int k;

int main(int argc, char* argv[])
{
 char s[4096];

 enum { NUMTESTS = 10000000 };
 static float rndbuf[NUMTESTS];

 int i;
 for (i = 0; i < NUMTESTS; ++i) {
  int intg1 = rand();
  int intg2 = rand();
  int intg3 = rand();
  int frac = rand();
  rndbuf[i] = float(intg1) - float(intg2) + float(intg3) + float(frac) / RAND_MAX;
 }

 DWORD time = GetTickCount();

 int j;
 for (j = 0; j < 10; ++j) {
  for (i = 0; i < NUMTESTS; ++i) {
   k = rndbuf[i];
  }
 }

 time = GetTickCount() - time;

 sprintf(s, "Time = %d ms\n", time);
 OutputDebugString(s);

 return 0;
}


   The different functions I checked were:

1- The compiler's own.

2- Faster version that uses some global memory and requires rounding mode to be on:


extern "C" __declspec(naked) void __cdecl _ftol()
{
	const static int zpfp[2] = { 0xBEFFFFFF, 0x3EFFFFFF };

	__asm {
		SUB	ESP,4
		FST	DWORD PTR [ESP]
		MOV	EAX,DWORD PTR [ESP]
		SHR	EAX,29
		AND	EAX,4
		FADD	DWORD PTR [zpfp+EAX]
		FISTP	DWORD PTR [ESP]
		POP	EAX
		RET
	}
}


3- Slower version that uses no global memory and requires rounding mode to be on:


extern "C" __declspec(naked) void __cdecl _ftol()
{
	__asm {
		SUB	ESP,4
		FST	DWORD PTR [ESP]
		MOV	EAX,DWORD PTR [ESP]
		AND	EAX,0x80000000
		XOR	EAX,0xBEFFFFFF
		MOV	DWORD PTR [ESP],EAX
		FADD	DWORD PTR [ESP]
		FISTP	DWORD PTR [ESP]
		POP	EAX
		RET
	}
}


4- Version independent of the rounding mode, but which only works correctly on floats:


extern "C" __declspec(naked) void __cdecl _ftol()
{
	__asm {
		SUB	ESP,4
		FSTP	DWORD PTR [ESP]
		POP	EAX
		MOV	EDX,EAX
		MOV	ECX,EAX
		AND	EAX,0x007FFFFF
		OR	EAX,0x00800000
		AND	ECX,0x7F800000
		JZ	zero
		SHR	ECX,23
		SUB	ECX,0x96
		JC	negexp
		SHL	EAX,CL
		JMP	shifted
negexp:	NEG	ECX
		SHR	EAX,CL
shifted:	AND	EDX,0x80000000
		JNZ	negative
		RET
negative:	NEG	EAX
		RET
zero:		SUB	EAX,EAX
		RET
	}
}


5- Version independent of the rounding mode, but which works correctly on doubles:


extern "C" __declspec(naked) void __cdecl _ftol()
{
	__asm {
		PUSH	EDX
		PUSH	ECX
		SUB	ESP,8
		FSTP	QWORD PTR [ESP]
		POP	EAX
		POP	EDX
		MOV	ECX,EDX
		AND	EDX,0x000FFFFF
		OR	EDX,0x00100000
		SHL	EDX,11
		SHR	EAX,21
		OR	EAX,EDX
		MOV	EDX,ECX
		AND	ECX,0x7FF00000
		JZ	zero
		SHR	ECX,20
		SUB	ECX,0x41E
		JC	negexp
		SHL	EAX,CL
		JMP	shifted
negexp:	NEG	ECX
		CMP	ECX,32
		JAE	zero
		SHR	EAX,CL
shifted:	AND	EDX,0x80000000
		POP	ECX
		POP	EDX
		JNZ	negative
		RET
negative:	NEG	EAX
		RET
zero:		SUB	EAX,EAX
		POP	ECX
		POP	EDX
		RET
	}
}


6- Compiling with -QIfist, which doesn't return the correct result (it rounds, not chops). But it can be made to return the correct result if the rounding mode is changed at program startup and never modified afterwards.

   Note that I haven't made any efforts at optimizing the different routines. Specifically, might be possible to convert some jumps into CMOV instructions (PPro and up only). Note also that, in order for this replacement routines to work, they must be accessed before the one in the C library. The safest way to do it is to make sure it is in the same CPP file as the main() or WinMain() function. The timing results were:


1- Time = 10805 ms
2- Time =  3916 ms
3- Time =  4227 ms
4- Time =  4707 ms
5- Time =  7531 ms
6- Time =  2343 ms


   The different implementations have different trade-offs that make them all potentially viable for different developers.

   I'd like to see those functions optimized. If you do, please, send me a copy.