//============================================================================]
// The following two structs each contain a static member function template
// (doOperation) performing an assignment operation between two values (= or +=).
// T_lhs and T_rhs can be float, const float, double, a Complex class or whatever.
//============================================================================]
/**
 * Assignment policy: plain assignment.
 *
 * Used as the T_operator argument of an unroller; doOperation is applied
 * once per element pair.
 */
struct AssignOpAssign
{
    /**
     * Performs `lhs = rhs`.
     *
     * @tparam T_lhs  Type of the destination; must be assignable from T_rhs.
     * @tparam T_rhs  Type of the source; may be deduced as a const type.
     * @param  lhs    Destination value, modified in place.
     * @param  rhs    Source value.
     */
    template <typename T_lhs, typename T_rhs>
    inline static void doOperation(T_lhs & lhs, T_rhs & rhs) { lhs = rhs; }
};
/**
 * Assignment policy: compound addition.
 *
 * Used as the T_operator argument of an unroller; doOperation is applied
 * once per element pair.
 */
struct AssignOpAdd
{
    /**
     * Performs `lhs += rhs`.
     *
     * @tparam T_lhs  Type of the destination; must support `+=` with T_rhs.
     * @tparam T_rhs  Type of the addend; may be deduced as a const type.
     * @param  lhs    Destination value, modified in place.
     * @param  rhs    Value added to lhs.
     */
    template <typename T_lhs, typename T_rhs>
    inline static void doOperation(T_lhs & lhs, T_rhs & rhs) { lhs += rhs; }
};
//============================================================================]
// The AssignOpLoopUnroller class contains the code performing the loop unrolling
// using the template recursion principle.
// The principle is simple: a template with an integer parameter contains a static
// function exec. Loop<N>::exec calls Loop<N-1>::exec.
// A specialisation for N == 0 is used to stop the recursion: Loop<0>::exec() does nothing.
//============================================================================]
/**
 * Compile-time loop unroller for element-wise assignment-like operations.
 *
 * @tparam T_lhs       Element type of the destination array.
 * @tparam T_rhs       Element type of the source array (typically const-qualified).
 * @tparam T_operator  Policy type exposing a static
 *                     `doOperation(lhs_element, rhs_element)` function.
 *
 * Usage: `AssignOpLoopUnroller<L, R, Op>::Loop<N>::exec(dst, src)` applies
 * Op::doOperation to elements N-1, N-2, ..., 0 of the two arrays.
 * From a context where L, R or Op are themselves template parameters, the
 * member template must be named with the `template` keyword:
 * `AssignOpLoopUnroller<L, R, Op>::template Loop<N>::exec(dst, src)`.
 */
template <typename T_lhs, typename T_rhs, typename T_operator>
struct AssignOpLoopUnroller
{
    /**
     * Recursive step: handles element N-1, then delegates to Loop<N-1>.
     *
     * T_dummy is never used by callers. It exists only so that the N == 0
     * terminator below can be written as a *partial* specialization: an
     * explicit (full) specialization of a member template is not allowed
     * while the enclosing class template remains unspecialized, whereas a
     * partial specialization at class scope is.
     *
     * N is expected to be >= 0; a negative N would never reach the terminator.
     */
    template <int N, typename T_dummy = void>
    struct Loop
    {
        inline static void exec(T_lhs * aLArray, T_rhs * aRArray)
        {
            T_operator::doOperation(aLArray[N - 1], aRArray[N - 1]);
            Loop<N - 1>::exec(aLArray, aRArray);
        }
    };

    /** Recursion terminator: zero elements left, nothing to do. */
    template <typename T_dummy>
    struct Loop<0, T_dummy>
    {
        static inline void exec(T_lhs *, T_rhs *) { }
    };
};
/**
* A toy vector class illustrating the use of this "unrolling metaprogram"
* Of course, other kinds of unrollers would have to be developed for other kinds
* of operations (like a ConstantAssignOpLoopUnroller).
*/
template <typename T, int N
struct Vector
{
T & operator [] (int i) { return m_aValues[i]; }
const T & operator [] (int i) const { return m_aValues[i]; }
Vector & operator += (const Vector & rhs)
{
AssignOpLoopUnroller<T, const T, AssignOpAdd ::Loop<N::exec(m_aValues, rhs.m_aValues);
return *this;
}
Vector & operator = (const Vector & rhs)
{
AssignOpLoopUnroller<T, const T, AssignOpAssign ::Loop<N::exec(m_aValues, rhs.m_aValues);
return *this;
}
const Vector operator + (const Vector & rhs) const { return Vector(*this) += rhs; }
protected:
T m_aValues[N];
};
//============================================================================]
// Some code using this class.
// Amazing, it works! :)
//============================================================================]
#include <iostream>
using namespace std;
// Four-component single-precision vector used by the demo code below.
typedef Vector<float, 4> Vect4;
/**
 * Adds rhs to lhs in place via Vect4::operator +=.
 * Kept as a free function so the code generated for the operator can be
 * inspected on its own.
 */
void foo(Vect4 & lhs, Vect4 & rhs)
{
    lhs += rhs;
}
/**
 * Copies rhs into lhs via Vect4::operator =.
 * Kept as a free function so the code generated for the operator can be
 * inspected on its own.
 */
void bar(Vect4 & lhs, Vect4 & rhs)
{
    lhs = rhs;
}
/**
 * Demo driver: fills two Vect4 values, runs foo (+=) and bar (=) on them,
 * and prints the resulting vect1.
 *
 * With the values below, vect2 becomes {42, 42, 42, 42} after foo and is then
 * copied into vect1 by bar, so the expected output is "vect1 = 42 42 42 42".
 *
 * @return 0 on completion (standard C++ requires main to return int).
 */
int main()
{
    Vect4 vect1;
    Vect4 vect2;

    vect1[0] = 1;  vect1[1] = 7;  vect1[2] = 3;  vect1[3] = 11;
    vect2[0] = 41; vect2[1] = 35; vect2[2] = 39; vect2[3] = 31;

    foo(vect2, vect1);  // vect2 += vect1
    bar(vect1, vect2);  // vect1 = vect2

    std::cout << "vect1 =";
    for (int i = 0; i < 4; ++i)
        std::cout << " " << vect1[i];
    std::cout << std::endl;

    return 0;
}
//============================================================================]
// Samples from the assembly listing generated by MSVC++ :
//============================================================================]
/*
?foo@@YIXAAU?$Vector@M$03@@0@Z PROC NEAR ; foo, COMDAT
; 92 : lhs += rhs;
fld DWORD PTR [edx+12]
fadd DWORD PTR [ecx+12]
fstp DWORD PTR [ecx+12]
fld DWORD PTR [edx+8]
fadd DWORD PTR [ecx+8]
fstp DWORD PTR [ecx+8]
fld DWORD PTR [edx+4]
fadd DWORD PTR [ecx+4]
fstp DWORD PTR [ecx+4]
fld DWORD PTR [ecx]
fadd DWORD PTR [edx]
fstp DWORD PTR [ecx]
; 93 : }
ret 0
?foo@@YIXAAU?$Vector@M$03@@0@Z ENDP ; foo
PUBLIC ?bar@@YIXAAU?$Vector@M$03@@0@Z ; bar
; COMDAT ?bar@@YIXAAU?$Vector@M$03@@0@Z
_TEXT SEGMENT
?bar@@YIXAAU?$Vector@M$03@@0@Z PROC NEAR ; bar, COMDAT
; 97 : lhs = rhs;
mov eax, DWORD PTR [edx+12]
mov DWORD PTR [ecx+12], eax
mov eax, DWORD PTR [edx+8]
mov DWORD PTR [ecx+8], eax
mov eax, DWORD PTR [edx+4]
mov DWORD PTR [ecx+4], eax
mov edx, DWORD PTR [edx]
mov DWORD PTR [ecx], edx
; 98 : }
ret 0
?bar@@YIXAAU?$Vector@M$03@@0@Z ENDP ; bar
*/
/*
The assembly code shows that as expected, the Vect4 operations are inlined and
the loops are unrolled!
The VCPP compiler's inlining depth will limit this unrolling. By default,
inline_depth == 8.
You can increase this value up to 255.
*/