TMS320C6678: Matrix multiply using dsplib

Pratheek Suresh

Intellectual 675 points

Part Number: TMS320C6678

Hi,

I'm trying to multiply matrix of order of 2k x 2k using sp matrix multiply routine .

Both the matrix are in ddr and both l1 and l2 are cached fully.

Single core performance is very slow to meet my real time requirement.

I want to parallelise the code to multi core using openmp .

I understand the single core performance hit is due to non aligned read, which is inherent problem of matrix multiply.

Is there a way to achieve better performance. I'm looking at performance close to the bench mark given for dsplib matrix multiply routine.

over 8 years ago

0 Yordan Kovachev over 8 years ago

TI__Guru**** 161600 points

Hi,

I'm looping the dsp experts. Their feedback will be posted directly here.

Best Regards,
Yordan

0 ran35366 over 8 years ago in reply to Yordan Kovachev

TI__Genius 12805 points

Hi Pratheek

This is very interesting problem. Obviously 16M bytes data cannot fit inside the L2 and DDR has issues with not-consecutive data so here is what I suggest:

1. Use EDMA to load data into L2. Use B index to read sequentially from DDR and write with stride in L2.

2. Divide the matrix into multiple parts, each part should fit inside L1D. L1D is 32Kbytes or 8K sp values, so the breaking the matrix into parts of 64 by 64 (or similar partition). Read and write the data to L2, and use EDMA to move data from L2 into DDR and back

I suggest that as a first step try to do it for Matrix of 128 by 128 in L2 memory and if this works for you move to EDMA scheme

In the past I developed code that does Cholesky decomposition for large matrix (I tested 128x128) in my case the data is in L2 but the partition is done to fit into L1D cache and store intermediate results. The function that I use processes size at least 10x10 and I put the an example how it is done with source code that includes debug code in this posting. The real code obviously does not have all the printf that consumes so many cycles and some of the operations are not needed (there are there for debug purposes only). In addition I attach a document with benchmark that was measured (again, for Cholesky). Notice that size must be at least 10 (I had similar functions for smaller matrix)

Note this is my personal function, not TI, so there is no explicit or implicit guarantee that it works, give the right results, or provide any support

/cfs-file/__key/communityserver-discussions-components-files/791/0363.Test-Bench-Report-of-the-Single-Precision-Floating-point-Implementation-of-Cholesky-Decomposition-on-Key.doc

void complexCholeskyLarger_10(int N, FLOAT *in, FLOAT *out, FLOAT *OneOverDiag )

{

   FLOAT x0, x1, xx0,xx1 ;
    int k,l,m    ;

    FLOAT   *p1, *p2   ;

    FLOAT   y0,y1;
    FLOAT *p_in   ,*p_in1, *p_out1 , *p_out;
    int diffP   ;
    double a01 , b01 , c01, d01, e01, f01, g01, h01;
    float   a0 ;

/////   In Step 6 I will try to strean-line the pointers
////    The in pointer can be restrict
////    The output pointer can not be since it is re-read after written
////    We may separate the

    if (N < 10)
    {
       printf( " N must be at least 10 \n") ;
       return ;
    }

//////// Processing The first row/column

      p_in = in   ;
      p_out = out   ;
      printf("p_in %x p_out %x \n", p_in, p_out)   ;

       a01 =    _amemd8(in) ; //
       a0 = _lof(a01)   ; // for debug
      in = in + 2 ;
      y0 = _rsqrsp(a0)   ; // first NR guess,
      y1 = y0 * (3 - a0 * y0 * y0) * 0.5   ; // first iteration, 16 bits accuracy
      x0 = y1 * (3 - a0 * y1 * y1) * 0.5   ;   ; //   Second iterations

      printf(" a0 and x0 %f %f \n", a0,x0) ;
      b01 = _ftod(x0,x0) ;

    *out++ = a0 * x0   ;
    *out++ = 0.0   ;

      printf(" first out %f \n", a0 * x0 ) ;

      *OneOverDiag++ = x0   ;
      _nassert( N >= 10 ); /* The smallest matrix is 10x10            */

      diffP = 4 ;

///   The next block calculates all the numbers that are in the first column

      for (k=1; k< N; k++)
      {
       a01 =    _amemd8(in) ;
       printf(" a01   %f %f \n", _lof(a01), _hif(a01) ) ;

       c01 = _dmpysp(a01,b01) ;
       _amemd8(out)   = c01    ;
       in = in + diffP   ;
       out = out + diffP ;
       diffP = diffP + 2 ;
        printf("p_in %x p_out %x \n", in, out)   ;
        printf("out %f %f \n", _lof(c01), _hif(c01) ) ;
   }

///////// Processing of second Row/column

///Bring back the pointers in and out

     in = p_in + 4   ;//points at diagonal of second row
     out = p_out + 2 ;


    a01 = _amemd8(out) ;
    out = out + 2 ;
    b01 = _dmpysp(a01, a01) ;
    xx0 =   _hif(b01)+ _lof(b01)   ;

    printf("xx0 %f p_in %x \n",xx0, p_in ) ;

    xx1 =    (float) *in ; // reading only the real value
    x0 =    (xx1 - xx0)   ;
    printf("xx1 %f x0 %f \n",xx1,x0) ;


      y0 = _rsqrsp(x0)   ; // first NR guess,
      y1 = y0 * (3.0 - x0 * y0 * y0) * 0.5   ; // first iteration, 16 bits accuracy
      x1 = y1 * (3.0 - x0 * y1 * y1) * 0.5   ;   ; //   Second iterations

      d01 = _ftod(x1,x1) ;
      *OneOverDiag++ = x1 ;


       *out++ = x0 * x1 ;
       printf( "out %f \n", x0*x1) ;
       *out++ = 0.0    ; // save the diagonal

//     At this point, out points to the first element of the third row (position 6) and
//     in points at the real part of the diagonal of row 2 (position 4)
///////////////////
///   The next block calculates all the numbers that are in the second column
///   a01 has the value of first element of the second row
       diffP = 4   ;
       in = in + 4   ;
       for (l = 2; l < N; l++)
       {

           printf("in %x diffP %d \n", in, diffP) ;

           h01 =    _amemd8(out) ; // read the output input
           out = out + 2 ;         // points at the second element of the row
           printf(" h01 %f %f \n",_lof(h01), _hif(h01) ) ;

           e01 = _complex_conjugate_mpysp(h01, a01)   ;

           printf(" e01 %f %f \n",_lof(e01), _hif(e01) ) ;

           f01 = _amemd8(in)   ; // reading the input
           g01 = _dsubsp(f01, e01) ;

           printf(" f01 %f %f \n",_lof(f01), _hif(f01) ) ;
           printf(" g01 %f %f \n",_lof(g01), _hif(g01) ) ;

            c01 = _dmpysp(g01,d01) ;

           printf(" c01 %f %f \n",_lof(c01), _hif(c01) ) ;
            _amemd8(out)   = c01    ;
          printf("difP = %d \n", diffP) ;
            out = out + diffP   ;
            diffP = diffP +2   ;
            in = in + diffP    ;
       }

/////    Starting of the generic loop for row k
//// Starting with row 2, the pointer p1 starts out+6, and it will
///   incremented by 6, 8, 10 etc
////   So diffP1 is 6, p1 = out+ 6 and after each loop p1 = p1 + diffP1
///    and diffP1 = diffP1 + 2
////   p_in points at in + 10, and then 18, 28,40
////   so p_in is increamented 2 more than p1

       in =p_in ;
       out = p_out ;

      p1 = out + 6   ;
      p_in = in + 10 ;

      diffP = 6   ;
     printf(" in %x out %x \n", in, out) ;
   for (k= 2; k<N; k++)
     {

        p_in = in + k * (k+3)   ;
////    Step 1 - calculate the sum
       printf(" p1 %x , p_in %x \n", p1, p_in ) ;
       p_out1 = p1 ;
       xx1 = 0.0   ;
       for (l=0; l<k; l++)
       {
               a01 =    _amemd8(p_out1) ;
               printf("a01 %f %f \n", _lof(a01), _hif(a01) ) ;
            b01 = _dmpysp(a01, a01) ;
            xx0 =   _hif(b01)+ _lof(b01)   ;
            p_out1 = p_out1 + 2     ;
            xx1 = xx1 + xx0 ;
            printf("sumD %f \n", xx1) ;
       }
///   p_out1 points to the last element in row k (diagonal)
///   p1 points at the first eleemnt
///   p_in points to diagonal in the input

//      h01 = _amemd8(p_in) ;
      xx0 = *p_in ;
      x0 =    (xx0 - xx1)   ;
      printf("xx1 %f xx0 %f x0 %f \n",xx1,xx0, x0) ;

      y0 = _rsqrsp(x0)   ; // first NR guess,
      y1 = y0 * (3.0 - x0 * y0 * y0) * 0.5   ; // first iteration, 16 bits accuracy
      x1 = y1 * (3.0 - x0 * y1 * y1) * 0.5   ;   ; //   Second iterations

//    x1 is 1/sqrt

      d01 = _ftod(x1,x1) ;

      *OneOverDiag++ = x1 ;
       printf("x1 = %f \n", x1) ;

/// Now we have the 1/sqrt, we ca;culate all othe relements of row k, starting with
/// The diagonal
/// p_out1 points at the diagonal (output) - for row 2 it is 10, for row 3 it is 18
/// For row 2 it increament by 6, 8, etc
/// For row 3 it increament by 8, 10, etc.
/// p1 points at the first element of row k
/// Note that the bottle neck may not be the multipliers, but registers or load,
/// So we may change the addresing back to multiplicatiopns

//       m = k * (k+1)   ;
//       n = k * (k+3) ; //diagonal location
//       p1 = &out[m]   ;

       *p_out1++ = x0 * x1 ;
       *p_out1++ = 0.0 ;
       printf("diagonal output %f \n", x0 * x1 ) ;
///    Next step is to build all the values that are below the diagonal
////   These elements
////   The diagonal is in location k*(k+3)
////   The next one is in location k*(k+3) + 2 * (k + 1)
////   The next one is in address k*(k+3) + 2*(k+ 3) + 2 * (k+1)
////   And in general. in row k+u, the address of the new element is k*(k+3) + sum (2*r) where r is from k+1 to u

///    Now to the address of the values. The value in column k and row l (k < l)
///    has the following equation (A input matrix, L and U are the outputs)
///    A(k+m,k) = sum (u from o to k-1) L(k,u) * L(k+m,u) conjugated + D1 * L(k+m,k)
///    Thus
////   L(k+m,k) = ( A(k+m,k) - sum) * x1

       if (k+1 == N) return ;

//    Calculate all values below the diagonal (for L)

       b01 = _ftod(0.0, 0.0) ;
       for (l = k+1; l < N; l++)
       {

           p1 = out + k*(k+1)   ; //   leave addressing as it is
           p2 = out + l * (l+1) ;
           p_in1 = in + k*2 + (l+1) * l ;
           p_out1 = out + k*2 + (l+1) * l ;


           for (m=0; m<k;m++)
           {
                printf("p1 %x p2 %x \n", p1, p2 ) ;
               h01 =    _amemd8(p1) ; // read the output input
               a01 =    _amemd8(p2) ;

               p1 = p1 + 2   ;
               p2 = p2 + 2   ;

                   printf("h01 and a 01 %f %f   %f %f \n", _lof(h01), _hif(h01), _lof(a01), _hif(a01) ) ;

               e01 = _complex_conjugate_mpysp(h01, a01)   ; //   or the other way around
               b01 = _daddsp(e01, b01) ; // accumulate image and real
           }




////////////////////////////////////////////////////////////

           printf(" b01 %f %f \n",_lof(b01), _hif(b01) ) ;

           f01 = _amemd8(p_in1)   ; // reading the input
           g01 = _dsubsp(f01, b01) ;
           printf(" f01 %f %f \n",_lof(f01), _hif(f01) ) ;
           printf(" g01 %f %f \n",_lof(g01), _hif(g01) ) ;

            c01 = _dmpysp(g01,d01) ;
           printf(" c01 %f %f \n",_lof(c01), _hif(c01) ) ;

           _amemd8(p_out1)   = c01    ;
           p1 = p1 + 2   ;

///////   What is the address of the output ?????

       }

   }

   return ;
}

Processors

Processors forum

TMS320C6678: Matrix multiply using dsplib