#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>
#include <inttypes.h>

#include <e-hal.h>
#include <e-loader.h>

#include "simpleMath.h"

// Prototypes of the barrier helpers defined at the end of this file.
// They are declared `static inline`: a plain `inline` declaration (without
// `static` or `extern`) only yields an inline definition under C99 rules, so
// no out-of-line symbol is emitted and calls that are not inlined (e.g. at
// -O0) fail to link. Later declarations/definitions without a storage class
// inherit the internal linkage established here.
static inline void sync_workers(void);
static inline void release_workers(void);

// Core-local offset of the one-byte barrier flag. It is never dereferenced on
// the host; it is only cast to off_t and passed to e_read()/e_write().
uint8_t volatile *const barrier   = (void *) 0x3FFF ;    /**< If barrier == 1 the core busy waits till the host resets it to 0. */

int retval = EXIT_SUCCESS;   // exit status returned by main()
e_platform_t platform;       // filled by e_get_platform_info()
e_epiphany_t dev;            // handle of the currently opened workgroup
uint8_t devRows, devCols;    // copies of platform.rows / platform.cols used by the barrier helpers

struct timespec start, end;  // timestamps bracketing each measurement
uint64_t delta_us = 0;       // duration of the most recent measurement in microseconds
uint64_t delta_us_tmp = 0;   // saved host simpleMath1 duration, used for the host/device ratio

// NOTE(review): identifiers starting with an underscore followed by an
// uppercase letter are reserved for the implementation; consider renaming
// these file-wide (they are referenced from main()).
#define _Num            1024
uint32_t _NumElements   = _Num;   // elements processed per core
uint32_t _NumLoop       = 8; //128;//8192;   // repetitions of every measurement


int main(int argc, char **argv) {
  fprintf(stdout, "#; Starting host application ...\n");

  // create storage
  float volatile (*const a)[_NumElements*16] = calloc(_NumElements*16, sizeof(float));
  float volatile (*const b)[_NumElements*16] = calloc(_NumElements*16, sizeof(float));
  float volatile (*const c)[_NumElements*16] = calloc(_NumElements*16, sizeof(float));

  // init array randomly
  srand((unsigned int)time(NULL));
  for(uint32_t i=0; i<_NumElements*16; i++)
  {
    (*a)[i] = (float)rand()/(float)(RAND_MAX);
    (*b)[i] = (float)rand()/(float)(RAND_MAX);
    (*c)[i] = (float)rand()/(float)(RAND_MAX);
  }

////////////////////////////////////////////////////////////////

  // measure host time for executing simpleMath
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  for(uint32_t n=0; n < _NumLoop; n++)
  {
    // the Epiphany has 16 cores
    for( uint32_t i=0; i< 16; i++)
    {
      for( uint32_t j=0; j< _NumElements; j++)
      {
        simpleMath1(&((*a)[i*16+j]), &((*b)[i*16+j]), &((*c)[i*16+j]) );
      }
    }
  }
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  printf("HOST         simpleMath1 :: number of elements: %"PRIu32"\t loops: %"PRIu32"\t measured time: %"PRIu64"us \n", _NumElements*16, _NumLoop, (uint64_t)(delta_us) );
  fflush(stdout);

  delta_us_tmp = delta_us;

////////////////////////////////////////////////////////////////
  
  //// init epiphany
  e_init(NULL);
  e_reset_system();
  e_get_platform_info(&platform);
  devRows = platform.rows;
  devCols = platform.cols;
  e_open(&dev, 0, 0, platform.rows, platform.cols);
  e_load_group("e_worker1.elf", &dev, 0, 0, platform.rows, platform.cols, E_FALSE);
  e_start_group(&dev);

  // measure epiphany time
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  for(uint32_t n=0; n < _NumLoop; n++)
  {
    // check if all cores reached the barrier
    sync_workers();
    // release workers
    release_workers();
  }
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  printf("DEV worker1  simpleMath1 :: number of elements: %"PRIu32"\t loops: %"PRIu32"\t measured time: %"PRIu64"us \n", _NumElements*16, _NumLoop, (uint64_t)(delta_us) );
  fflush(stdout);

////////////////////////////////////////////////////////////////

  printf("\t\t\t\t\t\t\t\t\t\t/ %f\n", (float)delta_us_tmp / (float)delta_us);

////////////////////////////////////////////////////////////////

  // measure host time for executing simpleMath
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  for(uint32_t n=0; n < _NumLoop; n++)
  {
    // the Epiphany has 16 cores
    for( uint32_t i=0; i< 16; i++)
    {
      for( uint32_t j=0; j< _NumElements; j++)
      {
        simpleMath2(&((*a)[i*16+j]), &((*b)[i*16+j]), &((*c)[i*16+j]) );
      }
    }
  }
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  printf("HOST         simpleMath2 :: number of elements: %"PRIu32"\t loops: %"PRIu32"\t measured time: %"PRIu64"us \n", _NumElements*16, _NumLoop, (uint64_t)(delta_us) );
  fflush(stdout);
  
////////////////////////////////////////////////////////////////
  // measure host time for executing simpleMath
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  for(uint32_t n=0; n < _NumLoop; n++)
  {
    // the Epiphany has 16 cores
    for( uint32_t i=0; i< 16; i++)
    {
      for( uint32_t j=0; j< _NumElements; j++)
      {
        (*c)[i*16+j] = simpleMath3((*a)[i*16+j], (*b)[i*16+j], (*c)[i*16+j] );
      }
    }
  }
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  printf("HOST         simpleMath3 :: number of elements: %"PRIu32"\t loops: %"PRIu32"\t measured time: %"PRIu64"us \n", _NumElements*16, _NumLoop, (uint64_t)(delta_us) );
  fflush(stdout);


////////////////////////////////////////////////////////////////
  
  //// init epiphany
  e_close(&dev);
  e_init(NULL);
  e_reset_system();
  e_get_platform_info(&platform);
  devRows = platform.rows;
  devCols = platform.cols;
  e_open(&dev, 0, 0, platform.rows, platform.cols);
  e_load_group("e_worker2.elf", &dev, 0, 0, platform.rows, platform.cols, E_FALSE);
  e_start_group(&dev);

  // measure epiphany time
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  for(uint32_t n=0; n < _NumLoop; n++)
  {
    // check if all cores reached the barrier
    sync_workers();
    // release workers
    release_workers();
  }
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  printf("DEV worker2  simpleMath2 :: number of elements: %"PRIu32"\t loops: %"PRIu32"\t measured time: %"PRIu64"us \n", _NumElements*16, _NumLoop, (uint64_t)(delta_us) );
  fflush(stdout);
  
////////////////////////////////////////////////////////////////

  //// init epiphany
  e_close(&dev);
  e_init(NULL);
  e_reset_system();
  e_get_platform_info(&platform);
  devRows = platform.rows;
  devCols = platform.cols;
  e_open(&dev, 0, 0, platform.rows, platform.cols);
  e_load_group("e_worker3.elf", &dev, 0, 0, platform.rows, platform.cols, E_FALSE);
  e_start_group(&dev);

  // measure epiphany time
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  for(uint32_t n=0; n < _NumLoop; n++)
  {
    // check if all cores reached the barrier
    sync_workers();
    // release workers
    release_workers();
  }
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  printf("DEV worker3  simpleMath3 :: number of elements: %"PRIu32"\t loops: %"PRIu32"\t measured time: %"PRIu64"us \n", _NumElements*16, _NumLoop, (uint64_t)(delta_us) );
  fflush(stdout);

////////////////////////////////////////////////////////////////

  //// init epiphany
  e_close(&dev);
  e_init(NULL);
  e_reset_system();
  e_get_platform_info(&platform);
  devRows = platform.rows;
  devCols = platform.cols;
  e_open(&dev, 0, 0, platform.rows, platform.cols);
  e_load_group("e_worker4.elf", &dev, 0, 0, platform.rows, platform.cols, E_FALSE);
  e_start_group(&dev);
/*
  off_t a_ptr[devRows*devCols];
  off_t b_ptr[devRows*devCols];
  off_t c_ptr[devRows*devCols];

  float a_input[_NumElements];
  float b_input[_NumElements];
  float c_input[_NumElements];
  for(uint32_t i=0; i<_NumElements; i++)
  {
    a_input[i] = 2;
    b_input[i] = 3;
    c_input[i] = 7;
  }

  sync_workers();
  for(uint32_t i=0; i<devRows ; i++) {
    for(uint32_t j=0; j<devCols; j++) {
      e_read(&dev,i,j,(off_t)0x4000,&(a_ptr[i*devCols+j]),sizeof(off_t));
      e_read(&dev,i,j,(off_t)0x4010,&(b_ptr[i*devCols+j]),sizeof(off_t));
      e_read(&dev,i,j,(off_t)0x4020,&(c_ptr[i*devCols+j]),sizeof(off_t));
      
      e_write(&dev,i,j,a_ptr[i*devCols+j],a_input, sizeof(a_input)); 
      e_write(&dev,i,j,b_ptr[i*devCols+j],b_input, sizeof(b_input)); 
      e_write(&dev,i,j,c_ptr[i*devCols+j],c_input, sizeof(c_input)); 

//      printf("a_ptr %x\n", (uint64_t)a_ptr[i*devCols+j]);
//      printf("b_ptr %x\n", (uint64_t)b_ptr[i*devCols+j]);
//      printf("c_ptr %x\n", (uint64_t)c_ptr[i*devCols+j]);
    }
  }
*/
  release_workers();
  
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  for(uint32_t n=0; n < _NumLoop; n++)
  {
    // check if all cores reached the barrier
    sync_workers();
    // release workers
    release_workers();
  }
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  printf("DEV worker4  simpleMath3 :: number of elements: %"PRIu32"\t loops: %"PRIu32"\t measured time: %"PRIu64"us \n", _NumElements*16, _NumLoop, (uint64_t)(delta_us) );
  fflush(stdout);

/*
  sync_workers();

  for(uint32_t i=0; i<devRows ; i++) {
    for(uint32_t j=0; j<devCols; j++) {
      e_read(&dev,i,j,a_ptr[i*devCols+j],a_input,sizeof(a_input));
      e_read(&dev,i,j,b_ptr[i*devCols+j],b_input,sizeof(b_input));
      e_read(&dev,i,j,c_ptr[i*devCols+j],c_input,sizeof(c_input));

//      for(uint32_t n=0; n<_Num; n++) {
//        printf("%4d :: %f\t\t%f\t\t%f\n",n,a_input[n],b_input[n],c_input[n]);
//      }
    }
  }
*/
////////////////////////////////////////////////////////////////
  //// init epiphany
  e_close(&dev);
  e_init(NULL);
  e_reset_system();
  e_get_platform_info(&platform);
  devRows = platform.rows;
  devCols = platform.cols;
  e_open(&dev, 0, 0, platform.rows, platform.cols);
  e_load_group("e_worker4o.elf", &dev, 0, 0, platform.rows, platform.cols, E_FALSE);
  e_start_group(&dev);

  release_workers();
  
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  for(uint32_t n=0; n < _NumLoop; n++)
  {
    // check if all cores reached the barrier
    sync_workers();
    // release workers
    release_workers();
  }
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  printf("DEV worker4o simpleMath3 :: number of elements: %"PRIu32"\t loops: %"PRIu32"\t measured time: %"PRIu64"us \n", _NumElements*16, _NumLoop, (uint64_t)(delta_us) );
  fflush(stdout);

////////////////////////////////////////////////////////////////

  printf("\n#; Cleanup and finish application!\n");
  e_close(&dev);
  free(a);
  free(b);
  free(c);

  return retval;
}

//////////////////////////////// helper 

inline void sync_workers()    
{
  uint16_t active  = devRows*devCols;
  uint8_t  val;
  uint8_t  waiting[devRows][devCols];
  //init
  for(  uint16_t i=0; i< devRows; i++) {
    for(uint16_t j=0; j< devCols; j++) { waiting[i][j] = 0; }
  }
  uint64_t limit = 400000;
  while(active)
  {
    if(!(limit--)) {printf("ERROR: something went wront - time exceeded\n"); exit(1);}

    for(  uint16_t i=0; i< devRows; i++)
    {   
      for(uint16_t j=0; j< devCols; j++)
      {   
        if(waiting[i][j] == 0)
        {
          // prevents reading from already waiting cores
          e_read(&dev,i,j,(off_t)barrier,&val,sizeof(val));
          if (val == 1)
          {
            active--; 
            waiting[i][j] = 1;
          }
        }
      }   
    }   
  }
}

inline void release_workers()
{
  uint8_t val;
  // continue cores
  for(uint8_t i=0; i<devRows; i++) 
  {
    for(uint8_t j=0; j<devCols; j++) 
    {   
      val = 0;
      e_write(&dev,i,j,(off_t)barrier,&val, sizeof(uint8_t)); 
    }   
  }
}
