Code: Select all
#include <iostream>
#include <iomanip>
#include <stdint.h>
// Reads the CPU cycle counter (RDTSC)
inline uint64_t rdtsc() {
uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
}
// Prevent the compiler from inlining the functions for more accurate measurement
__attribute__((noinline)) int test_for(int iterations) {
volatile int sum = 0; // volatile forces the additions to actually happen
for (int i = 0; i < iterations; ++i) {
sum += 1; // add 1 (the repeated operation)
}
return sum;
}
__attribute__((noinline)) int test_labels(int iterations) {
volatile int sum = 0;
unsigned int counter = iterations; // use unsigned so that the shift is logical
// Jump table: index 0 ? loop body, index 1 ? exit
static const void* jt[] = { &&loop_body, &&exit_label };
// Initial jump to the body (always taken because iterations > 0)
goto loop_body;
loop_body:
sum += 1;
--counter;
// Compute the index WITHOUT comparisons:
// (counter - 1) >> 31 gives 0 if counter >= 1, and 1 if counter == 0
// This works because when counter is 0, counter-1 is UINT_MAX (all bits 1),
// and shifting right by 31 bits (logical) yields 1.
int index = (counter - 1) >> 31;
goto *jt[index];
exit_label:
return sum;
}
int main() {
system("color 0A");
const int ITERATIONS = 20000000; // 10 million repetitions
const int REPEAT = 15; // number of measurements (we take the minimum)
uint64_t best_for = ~0ULL, best_labels = ~0ULL;
// Warm-up (to stabilize caches and branch predictor)
test_for(ITERATIONS);
test_labels(ITERATIONS);
for (int r = 0; r < REPEAT; ++r) {
uint64_t start, end;
start = rdtsc();
volatile int res_for = test_for(ITERATIONS);
end = rdtsc();
uint64_t cycles_for = end - start;
if (cycles_for < best_for) best_for = cycles_for;
start = rdtsc();
volatile int res_labels = test_labels(ITERATIONS);
end = rdtsc();
uint64_t cycles_labels = end - start;
if (cycles_labels < best_labels) best_labels = cycles_labels;
// Show results to ensure they are not optimized away
std::cout << "Run " << r << ": for sum = " << res_for
<< ", labels sum = " << res_labels << std::endl;
}
double per_for = static_cast<double>(best_for) / ITERATIONS;
double per_labels = static_cast<double>(best_labels) / ITERATIONS;
std::cout << std::fixed << std::setprecision(3);
std::cout << "\n--- Results (lowest of " << REPEAT << " runs) ---\n";
std::cout << "For loop: " << best_for << " total cycles, "
<< per_for << " cycles/iteration\n";
std::cout << "Labels as values: " << best_labels << " total cycles, "
<< per_labels << " cycles/iteration\n";
if (per_labels < per_for)
std::cout << "\nLabels as values is faster by "
<< (per_for - per_labels) << " cycles/iter.\n";
else
std::cout << "\nFor loop is faster by "
<< (per_labels - per_for) << " cycles/iter.\n";
system("PAUSE");
return 0;
}
It might just be because of all the branchless stuff, and not just labels as values.
Edit: Incredible, after seeing labels as values win countless times a few days ago, now it turns out that the "for" loop wins hands down again and again, almost 2.5-3.0 times faster, incredible, although it's cold now, I don't know if that has anything to do with it, but I need more proof.
Maybe when it's hot labels as values become higher, but not faster, but rather the conditions become very slow?
Maybe labels as values is more stable at different temperatures, I have seen that it maintains very little variety of results, but the "for" loop is very unstable, sometimes it is very, very fast, and other times it becomes slower than labels as values, even when it is stable, it continues to show very inconstant numbers compared to labels as values, which keeps its first 3 digits constant almost all the time.
By the way, I have seen that in Windows XP the program executes fewer tasks than in Linux.
However, I know that Windows XP uses much less electrical energy than Windows 10.
Speed, useful life, electrical consumption, contexts... benchmarks can lead to false information if it is not reviewed carefully in the context that occurs.