Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/generic/zgemmkernel_2x2.c

kusano 2b45e8
#include "common.h"
kusano 2b45e8
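/********************************
kusano 2b45e8
  ADD1 a*c
kusano 2b45e8
  ADD2 b*c
kusano 2b45e8
  ADD3 a*d
kusano 2b45e8
  ADD4 b*d
kusano 2b45e8
  The four partial products of one complex multiply (a + b*i) * (c + d*i).
  In the kernel below, a and b appear to be consecutive values read from
  ptrba and c and d consecutive values read from ptrbb (presumably the
  interleaved real and imaginary parts; confirm against the packing
  routines). The sign with which each product is accumulated is chosen
  by the NN/NR/RN/RR... preprocessor variants.
*********************************/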
kusano 2b45e8
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
kusano 2b45e8
#ifdef	TRMMKERNEL
kusano 2b45e8
		, BLASLONG offset
kusano 2b45e8
#endif
kusano 2b45e8
		)
kusano 2b45e8
{
kusano 2b45e8
   BLASLONG i,j,k;
kusano 2b45e8
   FLOAT *C0,*C1,*ptrba,*ptrbb;
kusano 2b45e8
   FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
kusano 2b45e8
   for (j=0; j
kusano 2b45e8
     {
kusano 2b45e8
        C0 = C;
kusano 2b45e8
        C1 = C0+2*ldc;
kusano 2b45e8
        ptrba = ba;
kusano 2b45e8
        for (i=0; i
kusano 2b45e8
          {
kusano 2b45e8
             ptrbb = bb;
kusano 2b45e8
             res0 = 0;
kusano 2b45e8
             res1 = 0;
kusano 2b45e8
             res2 = 0;
kusano 2b45e8
             res3 = 0;
kusano 2b45e8
             res4 = 0;
kusano 2b45e8
             res5 = 0;
kusano 2b45e8
             res6 = 0;
kusano 2b45e8
             res7 = 0;
kusano 2b45e8
             for (k=0; k
kusano 2b45e8
               {
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3+load5*load1;
kusano 2b45e8
                  res2 = res2-load5*load3;
kusano 2b45e8
                  res3 = res3+load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*0+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5+load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*0+3];
kusano 2b45e8
                  res4 = res4-load2*load7;
kusano 2b45e8
                  res5 = res5+load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7+load5*load6;
kusano 2b45e8
                  res6 = res6-load5*load7;
kusano 2b45e8
                  res7 = res7+load4*load7;
kusano 2b45e8
                  load8 = ptrba[4*1+0];
kusano 2b45e8
                  load9 = ptrbb[4*1+0];
kusano 2b45e8
                  res0 = res0+load8*load9;
kusano 2b45e8
                  load10 = ptrba[4*1+1];
kusano 2b45e8
                  res1 = res1+load10*load9;
kusano 2b45e8
                  load11 = ptrbb[4*1+1];
kusano 2b45e8
                  res0 = res0-load10*load11;
kusano 2b45e8
                  res1 = res1+load8*load11;
kusano 2b45e8
                  load12 = ptrba[4*1+2];
kusano 2b45e8
                  res2 = res2+load12*load9;
kusano 2b45e8
                  load13 = ptrba[4*1+3];
kusano 2b45e8
                  res3 = res3+load13*load9;
kusano 2b45e8
                  res2 = res2-load13*load11;
kusano 2b45e8
                  res3 = res3+load12*load11;
kusano 2b45e8
                  load14 = ptrbb[4*1+2];
kusano 2b45e8
                  res4 = res4+load8*load14;
kusano 2b45e8
                  res5 = res5+load10*load14;
kusano 2b45e8
                  load15 = ptrbb[4*1+3];
kusano 2b45e8
                  res4 = res4-load10*load15;
kusano 2b45e8
                  res5 = res5+load8*load15;
kusano 2b45e8
                  res6 = res6+load12*load14;
kusano 2b45e8
                  res7 = res7+load13*load14;
kusano 2b45e8
                  res6 = res6-load13*load15;
kusano 2b45e8
                  res7 = res7+load12*load15;
kusano 2b45e8
                  load0 = ptrba[4*2+0];
kusano 2b45e8
                  load1 = ptrbb[4*2+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*2+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*2+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*2+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*2+3];
kusano 2b45e8
                  res3 = res3+load5*load1;
kusano 2b45e8
                  res2 = res2-load5*load3;
kusano 2b45e8
                  res3 = res3+load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*2+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5+load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*2+3];
kusano 2b45e8
                  res4 = res4-load2*load7;
kusano 2b45e8
                  res5 = res5+load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7+load5*load6;
kusano 2b45e8
                  res6 = res6-load5*load7;
kusano 2b45e8
                  res7 = res7+load4*load7;
kusano 2b45e8
                  load8 = ptrba[4*3+0];
kusano 2b45e8
                  load9 = ptrbb[4*3+0];
kusano 2b45e8
                  res0 = res0+load8*load9;
kusano 2b45e8
                  load10 = ptrba[4*3+1];
kusano 2b45e8
                  res1 = res1+load10*load9;
kusano 2b45e8
                  load11 = ptrbb[4*3+1];
kusano 2b45e8
                  res0 = res0-load10*load11;
kusano 2b45e8
                  res1 = res1+load8*load11;
kusano 2b45e8
                  load12 = ptrba[4*3+2];
kusano 2b45e8
                  res2 = res2+load12*load9;
kusano 2b45e8
                  load13 = ptrba[4*3+3];
kusano 2b45e8
                  res3 = res3+load13*load9;
kusano 2b45e8
                  res2 = res2-load13*load11;
kusano 2b45e8
                  res3 = res3+load12*load11;
kusano 2b45e8
                  load14 = ptrbb[4*3+2];
kusano 2b45e8
                  res4 = res4+load8*load14;
kusano 2b45e8
                  res5 = res5+load10*load14;
kusano 2b45e8
                  load15 = ptrbb[4*3+3];
kusano 2b45e8
                  res4 = res4-load10*load15;
kusano 2b45e8
                  res5 = res5+load8*load15;
kusano 2b45e8
                  res6 = res6+load12*load14;
kusano 2b45e8
                  res7 = res7+load13*load14;
kusano 2b45e8
                  res6 = res6-load13*load15;
kusano 2b45e8
                  res7 = res7+load12*load15;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3+load5*load1;
kusano 2b45e8
                  res2 = res2+load5*load3;
kusano 2b45e8
                  res3 = res3-load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*0+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5+load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*0+3];
kusano 2b45e8
                  res4 = res4+load2*load7;
kusano 2b45e8
                  res5 = res5-load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7+load5*load6;
kusano 2b45e8
                  res6 = res6+load5*load7;
kusano 2b45e8
                  res7 = res7-load4*load7;
kusano 2b45e8
                  load8 = ptrba[4*1+0];
kusano 2b45e8
                  load9 = ptrbb[4*1+0];
kusano 2b45e8
                  res0 = res0+load8*load9;
kusano 2b45e8
                  load10 = ptrba[4*1+1];
kusano 2b45e8
                  res1 = res1+load10*load9;
kusano 2b45e8
                  load11 = ptrbb[4*1+1];
kusano 2b45e8
                  res0 = res0+load10*load11;
kusano 2b45e8
                  res1 = res1-load8*load11;
kusano 2b45e8
                  load12 = ptrba[4*1+2];
kusano 2b45e8
                  res2 = res2+load12*load9;
kusano 2b45e8
                  load13 = ptrba[4*1+3];
kusano 2b45e8
                  res3 = res3+load13*load9;
kusano 2b45e8
                  res2 = res2+load13*load11;
kusano 2b45e8
                  res3 = res3-load12*load11;
kusano 2b45e8
                  load14 = ptrbb[4*1+2];
kusano 2b45e8
                  res4 = res4+load8*load14;
kusano 2b45e8
                  res5 = res5+load10*load14;
kusano 2b45e8
                  load15 = ptrbb[4*1+3];
kusano 2b45e8
                  res4 = res4+load10*load15;
kusano 2b45e8
                  res5 = res5-load8*load15;
kusano 2b45e8
                  res6 = res6+load12*load14;
kusano 2b45e8
                  res7 = res7+load13*load14;
kusano 2b45e8
                  res6 = res6+load13*load15;
kusano 2b45e8
                  res7 = res7-load12*load15;
kusano 2b45e8
                  load0 = ptrba[4*2+0];
kusano 2b45e8
                  load1 = ptrbb[4*2+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*2+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*2+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*2+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*2+3];
kusano 2b45e8
                  res3 = res3+load5*load1;
kusano 2b45e8
                  res2 = res2+load5*load3;
kusano 2b45e8
                  res3 = res3-load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*2+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5+load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*2+3];
kusano 2b45e8
                  res4 = res4+load2*load7;
kusano 2b45e8
                  res5 = res5-load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7+load5*load6;
kusano 2b45e8
                  res6 = res6+load5*load7;
kusano 2b45e8
                  res7 = res7-load4*load7;
kusano 2b45e8
                  load8 = ptrba[4*3+0];
kusano 2b45e8
                  load9 = ptrbb[4*3+0];
kusano 2b45e8
                  res0 = res0+load8*load9;
kusano 2b45e8
                  load10 = ptrba[4*3+1];
kusano 2b45e8
                  res1 = res1+load10*load9;
kusano 2b45e8
                  load11 = ptrbb[4*3+1];
kusano 2b45e8
                  res0 = res0+load10*load11;
kusano 2b45e8
                  res1 = res1-load8*load11;
kusano 2b45e8
                  load12 = ptrba[4*3+2];
kusano 2b45e8
                  res2 = res2+load12*load9;
kusano 2b45e8
                  load13 = ptrba[4*3+3];
kusano 2b45e8
                  res3 = res3+load13*load9;
kusano 2b45e8
                  res2 = res2+load13*load11;
kusano 2b45e8
                  res3 = res3-load12*load11;
kusano 2b45e8
                  load14 = ptrbb[4*3+2];
kusano 2b45e8
                  res4 = res4+load8*load14;
kusano 2b45e8
                  res5 = res5+load10*load14;
kusano 2b45e8
                  load15 = ptrbb[4*3+3];
kusano 2b45e8
                  res4 = res4+load10*load15;
kusano 2b45e8
                  res5 = res5-load8*load15;
kusano 2b45e8
                  res6 = res6+load12*load14;
kusano 2b45e8
                  res7 = res7+load13*load14;
kusano 2b45e8
                  res6 = res6+load13*load15;
kusano 2b45e8
                  res7 = res7-load12*load15;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3-load5*load1;
kusano 2b45e8
                  res2 = res2+load5*load3;
kusano 2b45e8
                  res3 = res3+load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*0+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5-load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*0+3];
kusano 2b45e8
                  res4 = res4+load2*load7;
kusano 2b45e8
                  res5 = res5+load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7-load5*load6;
kusano 2b45e8
                  res6 = res6+load5*load7;
kusano 2b45e8
                  res7 = res7+load4*load7;
kusano 2b45e8
                  load8 = ptrba[4*1+0];
kusano 2b45e8
                  load9 = ptrbb[4*1+0];
kusano 2b45e8
                  res0 = res0+load8*load9;
kusano 2b45e8
                  load10 = ptrba[4*1+1];
kusano 2b45e8
                  res1 = res1-load10*load9;
kusano 2b45e8
                  load11 = ptrbb[4*1+1];
kusano 2b45e8
                  res0 = res0+load10*load11;
kusano 2b45e8
                  res1 = res1+load8*load11;
kusano 2b45e8
                  load12 = ptrba[4*1+2];
kusano 2b45e8
                  res2 = res2+load12*load9;
kusano 2b45e8
                  load13 = ptrba[4*1+3];
kusano 2b45e8
                  res3 = res3-load13*load9;
kusano 2b45e8
                  res2 = res2+load13*load11;
kusano 2b45e8
                  res3 = res3+load12*load11;
kusano 2b45e8
                  load14 = ptrbb[4*1+2];
kusano 2b45e8
                  res4 = res4+load8*load14;
kusano 2b45e8
                  res5 = res5-load10*load14;
kusano 2b45e8
                  load15 = ptrbb[4*1+3];
kusano 2b45e8
                  res4 = res4+load10*load15;
kusano 2b45e8
                  res5 = res5+load8*load15;
kusano 2b45e8
                  res6 = res6+load12*load14;
kusano 2b45e8
                  res7 = res7-load13*load14;
kusano 2b45e8
                  res6 = res6+load13*load15;
kusano 2b45e8
                  res7 = res7+load12*load15;
kusano 2b45e8
                  load0 = ptrba[4*2+0];
kusano 2b45e8
                  load1 = ptrbb[4*2+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*2+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*2+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*2+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*2+3];
kusano 2b45e8
                  res3 = res3-load5*load1;
kusano 2b45e8
                  res2 = res2+load5*load3;
kusano 2b45e8
                  res3 = res3+load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*2+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5-load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*2+3];
kusano 2b45e8
                  res4 = res4+load2*load7;
kusano 2b45e8
                  res5 = res5+load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7-load5*load6;
kusano 2b45e8
                  res6 = res6+load5*load7;
kusano 2b45e8
                  res7 = res7+load4*load7;
kusano 2b45e8
                  load8 = ptrba[4*3+0];
kusano 2b45e8
                  load9 = ptrbb[4*3+0];
kusano 2b45e8
                  res0 = res0+load8*load9;
kusano 2b45e8
                  load10 = ptrba[4*3+1];
kusano 2b45e8
                  res1 = res1-load10*load9;
kusano 2b45e8
                  load11 = ptrbb[4*3+1];
kusano 2b45e8
                  res0 = res0+load10*load11;
kusano 2b45e8
                  res1 = res1+load8*load11;
kusano 2b45e8
                  load12 = ptrba[4*3+2];
kusano 2b45e8
                  res2 = res2+load12*load9;
kusano 2b45e8
                  load13 = ptrba[4*3+3];
kusano 2b45e8
                  res3 = res3-load13*load9;
kusano 2b45e8
                  res2 = res2+load13*load11;
kusano 2b45e8
                  res3 = res3+load12*load11;
kusano 2b45e8
                  load14 = ptrbb[4*3+2];
kusano 2b45e8
                  res4 = res4+load8*load14;
kusano 2b45e8
                  res5 = res5-load10*load14;
kusano 2b45e8
                  load15 = ptrbb[4*3+3];
kusano 2b45e8
                  res4 = res4+load10*load15;
kusano 2b45e8
                  res5 = res5+load8*load15;
kusano 2b45e8
                  res6 = res6+load12*load14;
kusano 2b45e8
                  res7 = res7-load13*load14;
kusano 2b45e8
                  res6 = res6+load13*load15;
kusano 2b45e8
                  res7 = res7+load12*load15;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3-load5*load1;
kusano 2b45e8
                  res2 = res2-load5*load3;
kusano 2b45e8
                  res3 = res3-load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*0+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5-load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*0+3];
kusano 2b45e8
                  res4 = res4-load2*load7;
kusano 2b45e8
                  res5 = res5-load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7-load5*load6;
kusano 2b45e8
                  res6 = res6-load5*load7;
kusano 2b45e8
                  res7 = res7-load4*load7;
kusano 2b45e8
                  load8 = ptrba[4*1+0];
kusano 2b45e8
                  load9 = ptrbb[4*1+0];
kusano 2b45e8
                  res0 = res0+load8*load9;
kusano 2b45e8
                  load10 = ptrba[4*1+1];
kusano 2b45e8
                  res1 = res1-load10*load9;
kusano 2b45e8
                  load11 = ptrbb[4*1+1];
kusano 2b45e8
                  res0 = res0-load10*load11;
kusano 2b45e8
                  res1 = res1-load8*load11;
kusano 2b45e8
                  load12 = ptrba[4*1+2];
kusano 2b45e8
                  res2 = res2+load12*load9;
kusano 2b45e8
                  load13 = ptrba[4*1+3];
kusano 2b45e8
                  res3 = res3-load13*load9;
kusano 2b45e8
                  res2 = res2-load13*load11;
kusano 2b45e8
                  res3 = res3-load12*load11;
kusano 2b45e8
                  load14 = ptrbb[4*1+2];
kusano 2b45e8
                  res4 = res4+load8*load14;
kusano 2b45e8
                  res5 = res5-load10*load14;
kusano 2b45e8
                  load15 = ptrbb[4*1+3];
kusano 2b45e8
                  res4 = res4-load10*load15;
kusano 2b45e8
                  res5 = res5-load8*load15;
kusano 2b45e8
                  res6 = res6+load12*load14;
kusano 2b45e8
                  res7 = res7-load13*load14;
kusano 2b45e8
                  res6 = res6-load13*load15;
kusano 2b45e8
                  res7 = res7-load12*load15;
kusano 2b45e8
                  load0 = ptrba[4*2+0];
kusano 2b45e8
                  load1 = ptrbb[4*2+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*2+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*2+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*2+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*2+3];
kusano 2b45e8
                  res3 = res3-load5*load1;
kusano 2b45e8
                  res2 = res2-load5*load3;
kusano 2b45e8
                  res3 = res3-load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*2+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5-load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*2+3];
kusano 2b45e8
                  res4 = res4-load2*load7;
kusano 2b45e8
                  res5 = res5-load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7-load5*load6;
kusano 2b45e8
                  res6 = res6-load5*load7;
kusano 2b45e8
                  res7 = res7-load4*load7;
kusano 2b45e8
                  load8 = ptrba[4*3+0];
kusano 2b45e8
                  load9 = ptrbb[4*3+0];
kusano 2b45e8
                  res0 = res0+load8*load9;
kusano 2b45e8
                  load10 = ptrba[4*3+1];
kusano 2b45e8
                  res1 = res1-load10*load9;
kusano 2b45e8
                  load11 = ptrbb[4*3+1];
kusano 2b45e8
                  res0 = res0-load10*load11;
kusano 2b45e8
                  res1 = res1-load8*load11;
kusano 2b45e8
                  load12 = ptrba[4*3+2];
kusano 2b45e8
                  res2 = res2+load12*load9;
kusano 2b45e8
                  load13 = ptrba[4*3+3];
kusano 2b45e8
                  res3 = res3-load13*load9;
kusano 2b45e8
                  res2 = res2-load13*load11;
kusano 2b45e8
                  res3 = res3-load12*load11;
kusano 2b45e8
                  load14 = ptrbb[4*3+2];
kusano 2b45e8
                  res4 = res4+load8*load14;
kusano 2b45e8
                  res5 = res5-load10*load14;
kusano 2b45e8
                  load15 = ptrbb[4*3+3];
kusano 2b45e8
                  res4 = res4-load10*load15;
kusano 2b45e8
                  res5 = res5-load8*load15;
kusano 2b45e8
                  res6 = res6+load12*load14;
kusano 2b45e8
                  res7 = res7-load13*load14;
kusano 2b45e8
                  res6 = res6-load13*load15;
kusano 2b45e8
                  res7 = res7-load12*load15;
kusano 2b45e8
#endif
kusano 2b45e8
                  ptrba = ptrba+16;
kusano 2b45e8
                  ptrbb = ptrbb+16;
kusano 2b45e8
               }
kusano 2b45e8
             for (k=0; k<(bk&3); k+=1) 
kusano 2b45e8
               {
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3+load5*load1;
kusano 2b45e8
                  res2 = res2-load5*load3;
kusano 2b45e8
                  res3 = res3+load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*0+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5+load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*0+3];
kusano 2b45e8
                  res4 = res4-load2*load7;
kusano 2b45e8
                  res5 = res5+load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7+load5*load6;
kusano 2b45e8
                  res6 = res6-load5*load7;
kusano 2b45e8
                  res7 = res7+load4*load7;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3+load5*load1;
kusano 2b45e8
                  res2 = res2+load5*load3;
kusano 2b45e8
                  res3 = res3-load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*0+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5+load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*0+3];
kusano 2b45e8
                  res4 = res4+load2*load7;
kusano 2b45e8
                  res5 = res5-load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7+load5*load6;
kusano 2b45e8
                  res6 = res6+load5*load7;
kusano 2b45e8
                  res7 = res7-load4*load7;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3-load5*load1;
kusano 2b45e8
                  res2 = res2+load5*load3;
kusano 2b45e8
                  res3 = res3+load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*0+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5-load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*0+3];
kusano 2b45e8
                  res4 = res4+load2*load7;
kusano 2b45e8
                  res5 = res5+load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7-load5*load6;
kusano 2b45e8
                  res6 = res6+load5*load7;
kusano 2b45e8
                  res7 = res7+load4*load7;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3-load5*load1;
kusano 2b45e8
                  res2 = res2-load5*load3;
kusano 2b45e8
                  res3 = res3-load4*load3;
kusano 2b45e8
                  load6 = ptrbb[4*0+2];
kusano 2b45e8
                  res4 = res4+load0*load6;
kusano 2b45e8
                  res5 = res5-load2*load6;
kusano 2b45e8
                  load7 = ptrbb[4*0+3];
kusano 2b45e8
                  res4 = res4-load2*load7;
kusano 2b45e8
                  res5 = res5-load0*load7;
kusano 2b45e8
                  res6 = res6+load4*load6;
kusano 2b45e8
                  res7 = res7-load5*load6;
kusano 2b45e8
                  res6 = res6-load5*load7;
kusano 2b45e8
                  res7 = res7-load4*load7;
kusano 2b45e8
#endif
kusano 2b45e8
                  ptrba = ptrba+4;
kusano 2b45e8
                  ptrbb = ptrbb+4;
kusano 2b45e8
               }
kusano 2b45e8
             load0 = res0*alphar;
kusano 2b45e8
             C0[0] = C0[0]+load0;
kusano 2b45e8
             load1 = res1*alphar;
kusano 2b45e8
             C0[1] = C0[1]+load1;
kusano 2b45e8
             load0 = res1*alphai;
kusano 2b45e8
             C0[0] = C0[0]-load0;
kusano 2b45e8
             load1 = res0*alphai;
kusano 2b45e8
             C0[1] = C0[1]+load1;
kusano 2b45e8
             load2 = res2*alphar;
kusano 2b45e8
             C0[2] = C0[2]+load2;
kusano 2b45e8
             load3 = res3*alphar;
kusano 2b45e8
             C0[3] = C0[3]+load3;
kusano 2b45e8
             load2 = res3*alphai;
kusano 2b45e8
             C0[2] = C0[2]-load2;
kusano 2b45e8
             load3 = res2*alphai;
kusano 2b45e8
             C0[3] = C0[3]+load3;
kusano 2b45e8
             load4 = res4*alphar;
kusano 2b45e8
             C1[0] = C1[0]+load4;
kusano 2b45e8
             load5 = res5*alphar;
kusano 2b45e8
             C1[1] = C1[1]+load5;
kusano 2b45e8
             load4 = res5*alphai;
kusano 2b45e8
             C1[0] = C1[0]-load4;
kusano 2b45e8
             load5 = res4*alphai;
kusano 2b45e8
             C1[1] = C1[1]+load5;
kusano 2b45e8
             load6 = res6*alphar;
kusano 2b45e8
             C1[2] = C1[2]+load6;
kusano 2b45e8
             load7 = res7*alphar;
kusano 2b45e8
             C1[3] = C1[3]+load7;
kusano 2b45e8
             load6 = res7*alphai;
kusano 2b45e8
             C1[2] = C1[2]-load6;
kusano 2b45e8
             load7 = res6*alphai;
kusano 2b45e8
             C1[3] = C1[3]+load7;
kusano 2b45e8
             C0 = C0+4;
kusano 2b45e8
             C1 = C1+4;
kusano 2b45e8
          }
kusano 2b45e8
        for (i=0; i<(bm&1); i+=1) 
kusano 2b45e8
          {
kusano 2b45e8
             ptrbb = bb;
kusano 2b45e8
             res0 = 0;
kusano 2b45e8
             res1 = 0;
kusano 2b45e8
             res2 = 0;
kusano 2b45e8
             res3 = 0;
kusano 2b45e8
             for (k=0; k
kusano 2b45e8
               {
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
kusano 2b45e8
                  load0 = ptrba[2*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[2*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrbb[4*0+2];
kusano 2b45e8
                  res2 = res2+load0*load4;
kusano 2b45e8
                  res3 = res3+load2*load4;
kusano 2b45e8
                  load5 = ptrbb[4*0+3];
kusano 2b45e8
                  res2 = res2-load2*load5;
kusano 2b45e8
                  res3 = res3+load0*load5;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
kusano 2b45e8
				  load0 = ptrba[2*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[2*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrbb[4*0+2];
kusano 2b45e8
                  res2 = res2+load0*load4;
kusano 2b45e8
                  res3 = res3+load2*load4;
kusano 2b45e8
                  load5 = ptrbb[4*0+3];
kusano 2b45e8
                  res2 = res2+load2*load5;
kusano 2b45e8
                  res3 = res3-load0*load5;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
kusano 2b45e8
                  load0 = ptrba[2*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[2*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrbb[4*0+2];
kusano 2b45e8
                  res2 = res2+load0*load4;
kusano 2b45e8
                  res3 = res3-load2*load4;
kusano 2b45e8
                  load5 = ptrbb[4*0+3];
kusano 2b45e8
                  res2 = res2+load2*load5;
kusano 2b45e8
                  res3 = res3+load0*load5;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
kusano 2b45e8
                  load0 = ptrba[2*0+0];
kusano 2b45e8
                  load1 = ptrbb[4*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[2*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[4*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrbb[4*0+2];
kusano 2b45e8
                  res2 = res2+load0*load4;
kusano 2b45e8
                  res3 = res3-load2*load4;
kusano 2b45e8
                  load5 = ptrbb[4*0+3];
kusano 2b45e8
                  res2 = res2-load2*load5;
kusano 2b45e8
                  res3 = res3-load0*load5;
kusano 2b45e8
#endif
kusano 2b45e8
                  ptrba = ptrba+2;
kusano 2b45e8
                  ptrbb = ptrbb+4;
kusano 2b45e8
               }
kusano 2b45e8
             load0 = res0*alphar;
kusano 2b45e8
             C0[0] = C0[0]+load0;
kusano 2b45e8
             load1 = res1*alphar;
kusano 2b45e8
             C0[1] = C0[1]+load1;
kusano 2b45e8
             load0 = res1*alphai;
kusano 2b45e8
             C0[0] = C0[0]-load0;
kusano 2b45e8
             load1 = res0*alphai;
kusano 2b45e8
             C0[1] = C0[1]+load1;
kusano 2b45e8
             load2 = res2*alphar;
kusano 2b45e8
             C1[0] = C1[0]+load2;
kusano 2b45e8
             load3 = res3*alphar;
kusano 2b45e8
             C1[1] = C1[1]+load3;
kusano 2b45e8
             load2 = res3*alphai;
kusano 2b45e8
             C1[0] = C1[0]-load2;
kusano 2b45e8
             load3 = res2*alphai;
kusano 2b45e8
             C1[1] = C1[1]+load3;
kusano 2b45e8
             C0 = C0+2;
kusano 2b45e8
             C1 = C1+2;
kusano 2b45e8
          }
kusano 2b45e8
        k = (bk<<2);
kusano 2b45e8
        bb = bb+k;
kusano 2b45e8
        i = (ldc<<2);
kusano 2b45e8
        C = C+i;
kusano 2b45e8
     }
kusano 2b45e8
   for (j=0; j<(bn&1); j+=1) 
kusano 2b45e8
     {
kusano 2b45e8
        C0 = C;
kusano 2b45e8
        ptrba = ba;
kusano 2b45e8
        for (i=0; i
kusano 2b45e8
          {
kusano 2b45e8
             ptrbb = bb;
kusano 2b45e8
             res0 = 0;
kusano 2b45e8
             res1 = 0;
kusano 2b45e8
             res2 = 0;
kusano 2b45e8
             res3 = 0;
kusano 2b45e8
             for (k=0; k
kusano 2b45e8
               {
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[2*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[2*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3+load5*load1;
kusano 2b45e8
                  res2 = res2-load5*load3;
kusano 2b45e8
                  res3 = res3+load4*load3;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[2*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[2*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3+load5*load1;
kusano 2b45e8
                  res2 = res2+load5*load3;
kusano 2b45e8
                  res3 = res3-load4*load3;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[2*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[2*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3-load5*load1;
kusano 2b45e8
                  res2 = res2+load5*load3;
kusano 2b45e8
                  res3 = res3+load4*load3;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
kusano 2b45e8
                  load0 = ptrba[4*0+0];
kusano 2b45e8
                  load1 = ptrbb[2*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[4*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[2*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
                  load4 = ptrba[4*0+2];
kusano 2b45e8
                  res2 = res2+load4*load1;
kusano 2b45e8
                  load5 = ptrba[4*0+3];
kusano 2b45e8
                  res3 = res3-load5*load1;
kusano 2b45e8
                  res2 = res2-load5*load3;
kusano 2b45e8
                  res3 = res3-load4*load3;
kusano 2b45e8
#endif
kusano 2b45e8
                  ptrba = ptrba+4;
kusano 2b45e8
                  ptrbb = ptrbb+2;
kusano 2b45e8
               }
kusano 2b45e8
             load0 = res0*alphar;
kusano 2b45e8
             C0[0] = C0[0]+load0;
kusano 2b45e8
             load1 = res1*alphar;
kusano 2b45e8
             C0[1] = C0[1]+load1;
kusano 2b45e8
             load0 = res1*alphai;
kusano 2b45e8
             C0[0] = C0[0]-load0;
kusano 2b45e8
             load1 = res0*alphai;
kusano 2b45e8
             C0[1] = C0[1]+load1;
kusano 2b45e8
             load2 = res2*alphar;
kusano 2b45e8
             C0[2] = C0[2]+load2;
kusano 2b45e8
             load3 = res3*alphar;
kusano 2b45e8
             C0[3] = C0[3]+load3;
kusano 2b45e8
             load2 = res3*alphai;
kusano 2b45e8
             C0[2] = C0[2]-load2;
kusano 2b45e8
             load3 = res2*alphai;
kusano 2b45e8
             C0[3] = C0[3]+load3;
kusano 2b45e8
             C0 = C0+4;
kusano 2b45e8
          }
kusano 2b45e8
        for (i=0; i<(bm&1); i+=1) 
kusano 2b45e8
          {
kusano 2b45e8
             ptrbb = bb;
kusano 2b45e8
             res0 = 0;
kusano 2b45e8
             res1 = 0;
kusano 2b45e8
             for (k=0; k
kusano 2b45e8
               {
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
kusano 2b45e8
                  load0 = ptrba[2*0+0];
kusano 2b45e8
                  load1 = ptrbb[2*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[2*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[2*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
kusano 2b45e8
                  load0 = ptrba[2*0+0];
kusano 2b45e8
                  load1 = ptrbb[2*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[2*0+1];
kusano 2b45e8
                  res1 = res1+load2*load1;
kusano 2b45e8
                  load3 = ptrbb[2*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
kusano 2b45e8
				  load0 = ptrba[2*0+0];
kusano 2b45e8
                  load1 = ptrbb[2*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[2*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[2*0+1];
kusano 2b45e8
                  res0 = res0+load2*load3;
kusano 2b45e8
                  res1 = res1+load0*load3;
kusano 2b45e8
#endif
kusano 2b45e8
#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
kusano 2b45e8
                  load0 = ptrba[2*0+0];
kusano 2b45e8
                  load1 = ptrbb[2*0+0];
kusano 2b45e8
                  res0 = res0+load0*load1;
kusano 2b45e8
                  load2 = ptrba[2*0+1];
kusano 2b45e8
                  res1 = res1-load2*load1;
kusano 2b45e8
                  load3 = ptrbb[2*0+1];
kusano 2b45e8
                  res0 = res0-load2*load3;
kusano 2b45e8
                  res1 = res1-load0*load3;
kusano 2b45e8
#endif
kusano 2b45e8
                  ptrba = ptrba+2;
kusano 2b45e8
                  ptrbb = ptrbb+2;
kusano 2b45e8
               }
kusano 2b45e8
             load0 = res0*alphar;
kusano 2b45e8
             C0[0] = C0[0]+load0;
kusano 2b45e8
             load1 = res1*alphar;
kusano 2b45e8
             C0[1] = C0[1]+load1;
kusano 2b45e8
             load0 = res1*alphai;
kusano 2b45e8
             C0[0] = C0[0]-load0;
kusano 2b45e8
             load1 = res0*alphai;
kusano 2b45e8
             C0[1] = C0[1]+load1;
kusano 2b45e8
             C0 = C0+2;
kusano 2b45e8
          }
kusano 2b45e8
        k = (bk<<1);
kusano 2b45e8
        bb = bb+k;
kusano 2b45e8
        i = (ldc<<1);
kusano 2b45e8
        C = C+i;
kusano 2b45e8
     }
kusano 2b45e8
   return 0;
kusano 2b45e8
}