#include <sys/time.h>
#include <time.h>

#include <cmath>
#include <cstdlib>
#include <iostream>

// Floating-point type used throughout the benchmark.  Double precision is the
// default; defining _SP_ at compile time selects single precision instead.
#if !defined(_SP_)
typedef double Real;
#else
typedef float Real;
#endif /* !_SP_ */

// Human-readable name of the compiler that built this translation unit,
// selected from the vendor-specific predefined macros.
//
// The order of the tests matters: a compiler that also defines another
// vendor's macros for compatibility has to be tested before that vendor.
// The LLVM-based Intel compilers (icx/icpx) define __clang__, and Clang and
// the classic Intel compiler define __GNUC__.
#if defined(__INTEL_LLVM_COMPILER)
    const char* compiler = "Intel oneAPI ICX/ICPX";
#elif defined(__clang__)
    const char* compiler = "Clang/LLVM";
#elif defined(__ICC) || defined(__INTEL_COMPILER)
    const char* compiler = "Intel ICC/ICPC";
#elif defined(__GNUC__) || defined(__GNUG__)
    const char* compiler = "GNU GCC/G++";
#elif defined(__HP_cc) || defined(__HP_aCC)
    const char* compiler = "Hewlett-Packard C/aC++";
#elif defined(__IBMC__) || defined(__IBMCPP__)
    const char* compiler = "IBM XL C/C++";
#elif defined(_MSC_VER)
    const char* compiler = "Microsoft Visual Studio";
#elif defined(__PGI)
    const char* compiler = "Portland Group PGCC/PGCPP";
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
    const char* compiler = "Oracle Solaris Studio";
#else
    const char* compiler = "Unknown";
#endif


// Multiply-add a*b + c, spelled as a plain expression (not a library call) so
// that the compiler is free to map it onto whatever instruction it chooses.
// Every argument is parenthesized so that expression arguments such as
// __FMA(x + 1, y, z) expand with the intended precedence.
#define __FMA(a,b,c) ((a)*(b)+(c))
// Number of iterations of each timed loop in _peak_perf (2^24).
#define Nloop (1<<24)
// Number of timed calls of _peak_perf performed by main.
#define Nsamples 1000

// Multiplier coefficients of the benchmark kernel, one per accumulator lane,
// each array aligned to a 64-byte boundary.  main() fills both arrays before
// any timing starts; _peak_perf() reads c0 in its first loop and c1 in its
// second loop.
__attribute__((aligned(64))) Real c0[16];
__attribute__((aligned(64))) Real c1[16];


// timer
// Returns a timestamp in seconds for measuring elapsed intervals.
//
// The reading is taken from CLOCK_MONOTONIC, which is not affected by
// adjustments of the system date, so the difference of two return values is a
// valid duration even if the wall clock is stepped in between.  Only
// differences are meaningful; the absolute value has no defined origin.
// Should clock_gettime() report failure, the wall clock (gettimeofday) is
// used instead.
double get_wtime(void) {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1.0e-9;
    }

    // Fallback: microsecond-resolution wall clock.
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return static_cast<double>(tv.tv_sec) + static_cast<double>(tv.tv_usec) * 1.0e-6;
}


// Benchmark kernel: executes 64*Nloop + 62 floating-point operations on
// register-sized working sets and returns a value that depends on all of them.
//
// Two sets of 16 accumulators (q0, q1) are initialized to u0.  The first loop
// updates q0[k] = c0[k]*q1[k] + q0[k] Nloop times, the second loop updates
// q1[k] = c1[k]*q0[k] + q1[k] Nloop times.  Within one iteration the 16
// updates are independent of each other.  Afterwards both sets are summed
// pairwise down to one value each, and the sum of the two is returned.
//
// u0:      initial value of every accumulator.
// returns: sum over all 32 accumulators after the two loops.  Callers are
//          expected to use the value so that the computation is not discarded.
//
// The statements are written out one per lane on purpose; keep this shape when
// editing, since the measured performance depends on the generated code.
inline Real _peak_perf(const Real u0)
{
    __attribute__((aligned(64))) Real q0[16] = {u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0};
    __attribute__((aligned(64))) Real q1[16] = {u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0,u0};

    // 16 multiply-adds (2 flop each) per iteration
    for (size_t i = 0; i < Nloop; ++i)
    {
        q0[0] = __FMA(c0[0], q1[0], q0[0]);
        q0[1] = __FMA(c0[1], q1[1], q0[1]);
        q0[2] = __FMA(c0[2], q1[2], q0[2]);
        q0[3] = __FMA(c0[3], q1[3], q0[3]);
        q0[4] = __FMA(c0[4], q1[4], q0[4]);
        q0[5] = __FMA(c0[5], q1[5], q0[5]);
        q0[6] = __FMA(c0[6], q1[6], q0[6]);
        q0[7] = __FMA(c0[7], q1[7], q0[7]);
        q0[8] = __FMA(c0[8], q1[8], q0[8]);
        q0[9] = __FMA(c0[9], q1[9], q0[9]);
        q0[10] = __FMA(c0[10], q1[10], q0[10]);
        q0[11] = __FMA(c0[11], q1[11], q0[11]);
        q0[12] = __FMA(c0[12], q1[12], q0[12]);
        q0[13] = __FMA(c0[13], q1[13], q0[13]);
        q0[14] = __FMA(c0[14], q1[14], q0[14]);
        q0[15] = __FMA(c0[15], q1[15], q0[15]);
    } // 32 * Nloop flop

    // Whenever possible, split a complex loop body into multiple loops with
    // simpler bodies; this helps the compiler to optimize the code.  A single
    // fused loop could have been written here, but the resulting executable
    // performs worse.  (The fused loop would also compute a different result,
    // which does not matter for this benchmark.)
    for (size_t i = 0; i < Nloop; ++i)
    {
        q1[0] = __FMA(c1[0], q0[0], q1[0]);
        q1[1] = __FMA(c1[1], q0[1], q1[1]);
        q1[2] = __FMA(c1[2], q0[2], q1[2]);
        q1[3] = __FMA(c1[3], q0[3], q1[3]);
        q1[4] = __FMA(c1[4], q0[4], q1[4]);
        q1[5] = __FMA(c1[5], q0[5], q1[5]);
        q1[6] = __FMA(c1[6], q0[6], q1[6]);
        q1[7] = __FMA(c1[7], q0[7], q1[7]);
        q1[8] = __FMA(c1[8], q0[8], q1[8]);
        q1[9] = __FMA(c1[9], q0[9], q1[9]);
        q1[10] = __FMA(c1[10], q0[10], q1[10]);
        q1[11] = __FMA(c1[11], q0[11], q1[11]);
        q1[12] = __FMA(c1[12], q0[12], q1[12]);
        q1[13] = __FMA(c1[13], q0[13], q1[13]);
        q1[14] = __FMA(c1[14], q0[14], q1[14]);
        q1[15] = __FMA(c1[15], q0[15], q1[15]);
    } // 32 * Nloop flop

    // Pairwise reduction of both accumulator sets: 31 multiply-adds = 62 flop.
    // Every lane of q0 and q1 must feed the returned value exactly once, so
    // that no lane of the loops above is dead work.

    // 16 -> 8 lanes: fold the upper half onto the lower half
    q0[0] = __FMA(1.0f, q0[8], q0[0]);
    q0[1] = __FMA(1.0f, q0[9], q0[1]);
    q0[2] = __FMA(1.0f, q0[10], q0[2]);
    q0[3] = __FMA(1.0f, q0[11], q0[3]);
    q0[4] = __FMA(1.0f, q0[12], q0[4]);
    q0[5] = __FMA(1.0f, q0[13], q0[5]);
    q0[6] = __FMA(1.0f, q0[14], q0[6]);
    q0[7] = __FMA(1.0f, q0[15], q0[7]);

    q1[0] = __FMA(1.0f, q1[8], q1[0]);
    q1[1] = __FMA(1.0f, q1[9], q1[1]);
    q1[2] = __FMA(1.0f, q1[10], q1[2]);
    q1[3] = __FMA(1.0f, q1[11], q1[3]);
    q1[4] = __FMA(1.0f, q1[12], q1[4]);
    q1[5] = __FMA(1.0f, q1[13], q1[5]);
    q1[6] = __FMA(1.0f, q1[14], q1[6]);
    q1[7] = __FMA(1.0f, q1[15], q1[7]);

    // 8 -> 4 lanes
    q0[0] = __FMA(1.0f, q0[4], q0[0]);
    q0[1] = __FMA(1.0f, q0[5], q0[1]);
    q0[2] = __FMA(1.0f, q0[6], q0[2]);
    q0[3] = __FMA(1.0f, q0[7], q0[3]);

    q1[0] = __FMA(1.0f, q1[4], q1[0]);
    q1[1] = __FMA(1.0f, q1[5], q1[1]);
    q1[2] = __FMA(1.0f, q1[6], q1[2]);
    q1[3] = __FMA(1.0f, q1[7], q1[3]);

    // 4 -> 2 lanes
    q0[0] = __FMA(1.0f, q0[2], q0[0]);
    q0[1] = __FMA(1.0f, q0[3], q0[1]);

    q1[0] = __FMA(1.0f, q1[2], q1[0]);
    q1[1] = __FMA(1.0f, q1[3], q1[1]);

    // 2 -> 1 lane, then combine the two sets
    q0[0] = __FMA(1.0f, q0[1], q0[0]);
    q1[0] = __FMA(1.0f, q1[1], q1[0]);
    return __FMA(1.0f, q1[0], q0[0]);
}


// Benchmark driver.
//
// Fills the coefficient arrays c0/c1, performs a few untimed warm-up calls of
// _peak_perf(), then times Nsamples calls one by one and prints the slowest,
// fastest and average throughput in Gflop/s together with their percentage of
// a hard-coded nominal peak.  Command-line arguments are ignored.  Always
// returns 0.
int main(int argc, char* argv[])
{
    // The command line is not used.
    (void)argc;
    (void)argv;

    // Initialize the coefficients with pseudo-random values at run time to
    // prevent the compiler from optimizing away the loops of the performance
    // code.  The seed is fixed, so every run uses the same coefficients.
    srand48(1);
    for (int i = 0; i < 16; ++i)
    {
        c0[i] = (drand48() + 1.0)*1.0e-4;
        c1[i] = -(drand48() + 1.0)*1.0e-4;
    }

    // initial value
    const Real u0 = 1.0e-5;

    // Untimed warm-up calls.  Their results are accumulated into the printed
    // value as well, so these calls cannot be dropped as unused.
    const int nWarmup = 5;
    double ret  = 0.0;
    for (int i = 0; i < nWarmup; ++i)
        ret += _peak_perf(u0);

    double tsum = 0.0;
    double tmin = HUGE_VAL; // larger than any measured time
    double tmax = 0.0;
    for (int i = 0; i < Nsamples; ++i)
    {
        const double t0 = get_wtime();
        ret += _peak_perf(u0);
        const double t1 = get_wtime();

        const double teval = t1-t0;
        tmin = (teval<tmin) ? teval : tmin;
        tmax = (teval>tmax) ? teval : tmax;
        tsum += teval;
    }

    // report
    const double evalflop = (64.0*Nloop + 62.0); // flop per _peak_perf call
    const double sampflop = evalflop*Nsamples;

    // Nominal peak in Gflop/s:
    //   clock [GHz] * cores * flop per cycle and SIMD lane * SIMD lanes.
    // The numbers are hard-coded (3.5 GHz, 1 core, 2 FMA units with 2 flop
    // each, 256-bit AVX registers); adjust them to the machine being measured.
    const double clockGHz      = 3.5;
    const int    nCores        = 1;
    const int    flopPerCycle  = 2 * 2;
    const double simdWidthBits = 256.0;
    const double peakFlop  = clockGHz * nCores * flopPerCycle * simdWidthBits/(sizeof(Real)*8.0);

    // The slowest sample gives the minimum performance and vice versa.
    const double minperf = evalflop/tmax * 1.0e-9;
    const double maxperf = evalflop/tmin * 1.0e-9;
    const double avgperf = sampflop/tsum * 1.0e-9; // Gflop/s
    std::cout << "Compiler:                            " << compiler << std::endl;
    // ret holds the results of the warm-up calls and of the timed calls.
    std::cout << "Return value:                        " << ret/(Nsamples + nWarmup) << std::endl;
    std::cout << "Precision:                           " << sizeof(Real) << " byte" << std::endl;
    std::cout << "Number of samples:                   " << Nsamples << std::endl;
    std::cout << "Nominal floating point performance:  " << peakFlop << " Gflop/s" << std::endl;
    std::cout << "Measured floating point performance: ";
    std::cout << "(min:" << minperf << " | max:" << maxperf << " | avg:" << avgperf << ") Gflop/s" << std::endl;
    std::cout << "Measured avg peak performance:       ";
    std::cout << "(min:" << minperf/peakFlop*100.0 << " | max:" << maxperf/peakFlop*100.0 << " | avg:" << avgperf/peakFlop*100.0 << ") %" << std::endl;

    return 0;
}
