/*
 * main.c
 */
#include <ti/sysbios/BIOS.h>
#include <ti/sysbios/knl/Task.h>
#include "Profile.h"

#include"arm_neon.h"

#define height 320 /* number of image rows */
#define width 576  /* number of image columns */
#define L 256      /* NOTE(review): not referenced by the code visible here; presumably the number of gray levels - confirm */

/*
 * Copy image row `r` of `src` into `dst`, adding one zero sample of padding
 * on each side (so `dst` holds width + 2 entries).  A row index outside
 * [0, height) produces an all-zero row, which supplies the zero border above
 * and below the image.
 */
static void smooth_load_padded_row(int dst[width + 2], int src[height][width], int r)
{
  int j;

  dst[0] = 0;
  dst[width + 1] = 0;
  for (j = 0; j < width; j++)
  {
    dst[j + 1] = (r >= 0 && r < height) ? src[r][j] : 0;
  }
}

/*
 * Scalar 3x3 smoothing filter.
 *
 * For every pixel, the 3x3 neighbourhood (zero-padded at the image border) is
 * multiplied element-wise with `ker` (no kernel flip), summed, and the sum is
 * divided by 9 using integer division.  The divisor is fixed at 9 regardless
 * of the kernel contents.
 *
 * c_input   source image, height x width
 * c_output  destination image, height x width; may be the same array as
 *           c_input (exact in-place use), other partial overlaps are not
 *           supported
 * ker       3x3 weights
 *
 * Always returns 0.
 *
 * Only three padded rows are buffered at a time instead of a full padded copy
 * of the image, which keeps the automatic storage of this function to a few
 * kilobytes rather than hundreds of kilobytes.
 */
int smooth( int c_input[height][width],int c_output[height][width],int ker[3][3])
{
  int rows[3][width + 2];
  int *above = rows[0];
  int *here = rows[1];
  int *below = rows[2];
  int *recycled;
  int i, j, res;

  smooth_load_padded_row(above, c_input, -1);
  smooth_load_padded_row(here, c_input, 0);

  for (i = 0; i < height; i++)
  {
    /* Fetch the next input row before output row i is written, so the
     * original samples are still available when input and output alias. */
    smooth_load_padded_row(below, c_input, i + 1);

    for (j = 0; j < width; j++)
    {
      res  = above[j] * ker[0][0] + above[j + 1] * ker[0][1] + above[j + 2] * ker[0][2];
      res += here[j]  * ker[1][0] + here[j + 1]  * ker[1][1] + here[j + 2]  * ker[1][2];
      res += below[j] * ker[2][0] + below[j + 1] * ker[2][1] + below[j + 2] * ker[2][2];
      c_output[i][j] = res / 9;
    }

    /* Slide the three-row window down by one row. */
    recycled = above;
    above = here;
    here = below;
    below = recycled;
  }
  return 0;
}

int smoothening_neon( int16_t n_input[height][width],int16_t kernal[3][3],int16_t n_output[height][width])
{
  int i,j;
  int16x8_t NeonK1,NeonK2,NeonK3,NeonK4,NeonK5,NeonK6,NeonK7,NeonK8,NeonK9;
  int16x8_t NeonB1,NeonB2,NeonB3,NeonB4,NeonB5,NeonB6,NeonB7,NeonB8,NeonB9;
  int16x8_t result;
  int a=height+2;
  int b=width+2;
  int16_t* ptr[a][b];
  int16_t vbuff[a][b];
  for(i=0;i<height+2;i++)
  {
    for(j=0;j<width+2;j++)
    {
      vbuff[i][j]=0;
    }
  }
  for(i=0;i<height;i++)
  {

    for(j=0;j<width;j++)
    {
      vbuff[i+1][j+1]=n_input[i][j];

    }

  }


  NeonK1=vdupq_n_s16(kernal[0][0]);
  NeonK2=vdupq_n_s16(kernal[0][1]);
  NeonK3=vdupq_n_s16(kernal[0][2]);
  NeonK4=vdupq_n_s16(kernal[1][0]);
  NeonK5=vdupq_n_s16(kernal[1][1]);
  NeonK6=vdupq_n_s16(kernal[1][2]);
  NeonK7=vdupq_n_s16(kernal[2][0]);
  NeonK8=vdupq_n_s16(kernal[2][1]);
  NeonK9=vdupq_n_s16(kernal[2][2]);

  for(i=0;i<height;i++)
  {
    j=0;
    while(j<width)

    {
      int16_t zero1=0;
      result=vdupq_n_s16(zero1);

      ptr[i][j+0]=&vbuff[i][j+0];
      ptr[i][j+1]=&vbuff[i][j+1] ;
      ptr[i][j+2]=&vbuff[i][j+2];
      ptr[i][j+3]=&vbuff[i+1][j+0];
      ptr[i][j+4]=&vbuff[i+1][j+1] ;
      ptr[i][j+5]=&vbuff[i+1][j+2] ;
      ptr[i][j+6]=&vbuff[i+2][j+0];
      ptr[i][j+7]=&vbuff[i+2][j+1];
      ptr[i][j+8]=&vbuff[i+2][j+2];

      NeonB1=vld1q_s16(ptr[i][j+0]);

      NeonB2=vld1q_s16(ptr[i][j+1]);
      NeonB3=vld1q_s16(ptr[i][j+2]);
      //	NeonB2=vextq_s16(NeonB1,NeonB3,2);
      //  NeonB2=vextq_s16(NeonB3,NeonB2,1);
      NeonB4=vld1q_s16(ptr[i][j+3]);
      NeonB5=vld1q_s16(ptr[i][j+4]);
      NeonB6=vld1q_s16(ptr[i][j+5]);
      //	NeonB5=vextq_s16(NeonB4,NeonB6,2);
      //  NeonB5=vextq_s16(NeonB6,NeonB5,1);
      NeonB7=vld1q_s16(ptr[i][j+6]);


      NeonB8=vld1q_s16(ptr[i][j+7]);
      NeonB9=vld1q_s16(ptr[i][j+8]);
      // NeonB8=vextq_s16(NeonB7,NeonB9,2);
      //NeonB8=vextq_s16(NeonB9,NeonB8,1);

      result=vmulq_s16 (NeonB1,NeonK1);
      result=vmlaq_s16(result,NeonB2,NeonK2);
      result=vmlaq_s16(result,NeonB3,NeonK3);
      result=vmlaq_s16(result,NeonB4,NeonK4);
      result=vmlaq_s16(result,NeonB5,NeonK5);
      result=vmlaq_s16(result,NeonB6,NeonK6);
      result=vmlaq_s16(result,NeonB7,NeonK7);
      result=vmlaq_s16(result,NeonB8,NeonK8);
      result=vmlaq_s16(result,NeonB9,NeonK9);
      result=result/9;
      vst1q_s16 (&n_output[i][j],result);
      j=j+8;


    }

  }

  return 0;
}

/*
 * Benchmark task: fills the test images with an increasing ramp, runs the
 * scalar smoothing filter inside the Smooth_c profiling region, and then
 * opens and closes the Smooth_neon profiling region.
 */
void taskMain(void)
{
	/*
	 * The image buffers have static storage duration: together they take
	 * roughly 2 MB (assuming 32-bit int and 16-bit sint16), which is far too
	 * much to place on a task stack as automatic variables.
	 */
	static int c_input[height][width];
	static int c_output[height][width];
	static sint16 n_input[height][width];
	static sint16 n_output[height][width];
	int ker[3][3]={
			{1,1,1},
			{1,1,1},
			{1,1,1}};
	int result =0;
	int val=1, i,j;

	init_Profile();

	/* Ramp 1, 2, 3, ... in row-major order.  The ramp exceeds 32767, so the
	 * copy stored in n_input is narrowed on assignment (presumably sint16 is
	 * a 16-bit signed type - confirm). */
	for(i=0;i<height;i++)
	{
		for(j=0;j<width;j++)
		{
			c_input[i][j]=val;
			n_input[i][j]=val;
			val=val+1;
		}
	}

	Start_Profile(Smooth_c);
	result =smooth( c_input,c_output,ker);
	End_Profile(Smooth_c);
	(void)result; /* smooth() always returns 0; nothing to act on */

	Start_Profile(Smooth_neon);
	/* NOTE(review): the NEON call is disabled, so this region measures only
	 * the profiling overhead.  `ker` is int[3][3] while smoothening_neon()
	 * takes int16_t[3][3], so enabling the call needs an int16_t kernel. */
	//smoothening_neon(n_input,ker,n_output);
	End_Profile(Smooth_neon);
	reset_Profile();
}



/*
 * Program entry point: hands control to the SYS/BIOS kernel via BIOS_start().
 * The trailing return only matters if BIOS_start() ever returns
 * (per the SYS/BIOS documentation it is not expected to).
 */
int main(void)
{
	BIOS_start();

	return 0;
}
