/*
 * 22 Aug 2017: my friend Georgiy Vorobiev (http://gv-itblog.pro/) sent me a link:
 *   https://habrahabr.ru/company/infopulse/blog/336110/
 * We decided to run some experiments/benchmarks to see how good the Quake rsqrt() is nowadays
 * compared with several other ways to do it in user space, for an application running on the CPU.
 */
/*
 * Example of launching this benchmark
 *
 * >> g++ -O3 -Ofast -msse2 main.cpp -o test
 * >> ./test
 * Benchmark for sqrt. Based on 50000000 experiments
 * ===========================================
 * need time for FastInvSqrt: 0.121188 seconds
 * simple test: FastInvSqrt(4.0)=0.499154
 *
 * need time for stdInvSqrt: 0.196041 seconds
 * simple test: stdInvSqrt(4.0)=0.500000
 *
 * need time for stdInvSqrtPowerBased: 0.198133 seconds
 * simple test: stdInvSqrtPowerBased(4.0)=0.500000
 *
 * need time for stdInvSqrtImpl_1: 2.157104 seconds
 * simple test: InvSqrtImpl_1(4.0)=0.500000
 *
 * need time for stdInvSqrtImpl_2: 3.030861 seconds
 * simple test: InvSqrtImpl_2(4.0)=0.499848
 *
 * need time for stdInvSqrtImpl_3: 0.064819 seconds
 * simple test: InvSqrtImpl_3(4.0)=0.499878
 *
 * ===========================================
 */

#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/*
 * Number of repeated experiments (loop iterations per benchmarked function).
 */
const size_t experiments = 50 * 1000 * 1000;

/*
 * Emulator of a random number: a cheap, deterministic input generator.
 *
 * Returns 2*i + 1 (computed in double, then narrowed to float), so every
 * benchmarked function sees the same strictly positive input sequence.
 */
float get_pseudo_random_number(size_t i)
{
    return (float)(2.0 * i + 1.0);
}

/*
 * Test implementation from https://habrahabr.ru/company/infopulse/blog/336110/
 *
 * Approximates 1/sqrt(x): the bit pattern of x is turned into an initial
 * guess with the 0x5f3759df constant, then refined by one Newton-Raphson step.
 * The sample run above printed 0.499154 for x = 4.0.
 *
 * The float <-> integer reinterpretation goes through memcpy rather than
 * pointer casts: dereferencing a float through an int pointer violates the
 * strict-aliasing rule, which optimizing builds (-O3) are allowed to exploit.
 *
 * Only meaningful for positive finite x.
 */
float FastInvSqrt(float x)
{
    const float xhalf = 0.5f * x;
    int32_t bits;

    memcpy(&bits, &x, sizeof bits);
    bits = 0x5f3759df - (bits >> 1);
    memcpy(&x, &bits, sizeof x);

    return x * (1.5f - (xhalf * x * x));
}

/*
 * Naive way to implement: reciprocal of the library square root.
 */
float stdInvSqrt(float x)
{
    return (float)(1.0 / sqrt(x));
}

/*
 * Naive way to implement, but via the library power function.
 */
float stdInvSqrtPowerBased(float x)
{
    return (float)pow(x, -0.5);
}

/*
 * The usual way anybody could implement their own pow():
 * x^(-1/2) = exp(-0.5 * log(x)).
 */
float InvSqrtImpl_1(float x)
{
    return (float)exp(-0.5f * log(x));
}

/*
 * The usual way anybody could implement their own approximate sqrt():
 * Newton (Babylonian) iteration y <- (x/y + y) / 2, starting from y = x and
 * stopping once |y*y - x| drops below 1e-3 * x.
 *
 * Non-positive inputs are handled up front. Without the guards the loop never
 * terminates for x < 0 (the tolerance 1e-3 * x is negative, so the condition
 * always holds) and x == 0 yields NaN through 0/0 on the first iteration.
 *
 * Returns NaN for x < 0, x itself for x == 0, otherwise the approximation.
 */
float approx_sqrt(float x)
{
    if (x < 0.0f) {
        return NAN;
    }
    if (x == 0.0f) {
        return x;
    }

    float y = x;
    while (fabsf(y * y - x) >= 1e-3f * x) {
        y = (x / y + y) / 2;
    }
    return y;
}

/*
 * Implementation based on our own approximate square root.
 */
float InvSqrtImpl_2(float x)
{
    return 1.0f / approx_sqrt(x);
}

/*
 * Let's use the x86 SSE `rsqrtss` instruction and leverage the hardware.
 * https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
 *
 * The result is a hardware approximation (the sample run above printed
 * 0.499878 for x = 4.0).
 *
 * Operand layout is [asmSymbolicName] "constraint" (c_variable):
 *   - y is an OUTPUT ("=x": written by the asm, lives in an SSE register).
 *     It must not be listed among the inputs; otherwise the compiler is free
 *     to assume the asm never writes it and to return an indeterminate value.
 *   - x is an INPUT ("xm": SSE register or memory operand).
 * The compiler picks the registers itself, so no clobber list is needed.
 */
float InvSqrtImpl_3(float x)
{
    float y;

    __asm__("rsqrtss %[x], %[y]"
            : [y] "=x"(y)
            : [x] "xm"(x));

    return y;
}

/*
 * Times `experiments` calls of `fn` and prints the elapsed CPU time followed
 * by the sanity check fn(4.0).
 *
 * This is a macro rather than a helper taking a function pointer on purpose:
 * every candidate is still called directly, so the optimizer can inline it
 * into the timing loop instead of measuring an indirect call.
 *
 * The volatile local keeps the otherwise unused results from being optimized
 * away together with the calls that produce them.
 *
 * timing_label: string literal printed in the "need time for ..." line.
 * fn:           name of the function under test; also printed (stringized)
 *               in the "simple test" line.
 */
#define RUN_BENCHMARK(timing_label, fn)                                          \
    do {                                                                         \
        const clock_t bench_t0 = clock();                                        \
        for (size_t bench_i = 0; bench_i < experiments; ++bench_i) {             \
            volatile float bench_res = fn(get_pseudo_random_number(bench_i));    \
        }                                                                        \
        printf("need time for " timing_label ": %lf seconds\n",                  \
               (double)(clock() - bench_t0) / CLOCKS_PER_SEC);                   \
        printf("simple test: " #fn "(4.0)=%lf\n\n", fn(4.0f));                   \
    } while (0)

/*
 * Runs every inverse-square-root variant through the same timing loop and
 * prints the measurements. Command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    printf("Benchmark for sqrt. Based on %u experiments\n", (unsigned int)experiments);
    printf("===========================================\n");

    RUN_BENCHMARK("FastInvSqrt", FastInvSqrt);
    RUN_BENCHMARK("stdInvSqrt", stdInvSqrt);
    RUN_BENCHMARK("stdInvSqrtPowerBased", stdInvSqrtPowerBased);
    RUN_BENCHMARK("stdInvSqrtImpl_1", InvSqrtImpl_1);
    RUN_BENCHMARK("stdInvSqrtImpl_2", InvSqrtImpl_2);
    RUN_BENCHMARK("stdInvSqrtImpl_3", InvSqrtImpl_3);

    printf("===========================================\n");
    return 0;
}
P.S. qmake project file (.pro) for building the benchmark:

QT += core
QT -= gui
CONFIG += c++11
TARGET = sqrt_benchmark
CONFIG += console
CONFIG -= app_bundle
TEMPLATE = app
SOURCES += main.cpp