Code: Select all
#include <iostream>
#include <iomanip>
#include <stdint.h>
// Reads the CPU cycle counter (RDTSC)
inline uint64_t rdtsc() {
uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
}
// Prevent the compiler from inlining the functions for more accurate measurement
__attribute__((noinline)) int test_for(int iterations) {
volatile int sum = 0; // volatile forces the additions to actually happen
for (int i = 0; i < iterations; ++i) {
sum += 1; // add 1 (the repeated operation)
}
return sum;
}
__attribute__((noinline)) int test_labels(int iterations) {
volatile int sum = 0;
unsigned int counter = iterations; // use unsigned so that the shift is logical
// Jump table: index 0 ? loop body, index 1 ? exit
static const void* jt[] = { &&loop_body, &&exit_label };
// Initial jump to the body (always taken because iterations > 0)
goto loop_body;
loop_body:
sum += 1;
--counter;
// Compute the index WITHOUT comparisons:
// (counter - 1) >> 31 gives 0 if counter >= 1, and 1 if counter == 0
// This works because when counter is 0, counter-1 is UINT_MAX (all bits 1),
// and shifting right by 31 bits (logical) yields 1.
int index = (counter - 1) >> 31;
goto *jt[index];
exit_label:
return sum;
}
int main() {
system("color 0A");
const int ITERATIONS = 20000000; // 10 million repetitions
const int REPEAT = 15; // number of measurements (we take the minimum)
uint64_t best_for = ~0ULL, best_labels = ~0ULL;
// Warm-up (to stabilize caches and branch predictor)
test_for(ITERATIONS);
test_labels(ITERATIONS);
for (int r = 0; r < REPEAT; ++r) {
uint64_t start, end;
start = rdtsc();
volatile int res_for = test_for(ITERATIONS);
end = rdtsc();
uint64_t cycles_for = end - start;
if (cycles_for < best_for) best_for = cycles_for;
start = rdtsc();
volatile int res_labels = test_labels(ITERATIONS);
end = rdtsc();
uint64_t cycles_labels = end - start;
if (cycles_labels < best_labels) best_labels = cycles_labels;
// Show results to ensure they are not optimized away
std::cout << "Run " << r << ": for sum = " << res_for
<< ", labels sum = " << res_labels << std::endl;
}
double per_for = static_cast<double>(best_for) / ITERATIONS;
double per_labels = static_cast<double>(best_labels) / ITERATIONS;
std::cout << std::fixed << std::setprecision(3);
std::cout << "\n--- Results (lowest of " << REPEAT << " runs) ---\n";
std::cout << "For loop: " << best_for << " total cycles, "
<< per_for << " cycles/iteration\n";
std::cout << "Labels as values: " << best_labels << " total cycles, "
<< per_labels << " cycles/iteration\n";
if (per_labels < per_for)
std::cout << "\nLabels as values is faster by "
<< (per_for - per_labels) << " cycles/iter.\n";
else
std::cout << "\nFor loop is faster by "
<< (per_labels - per_for) << " cycles/iter.\n";
system("PAUSE");
return 0;
}
It might just be because of all the branchless stuff, and not just labels as values.