|
kusano |
2b45e8 |
#include "common.h"
|
|
kusano |
2b45e8 |
/*
 * Generic 2x2-blocked multiply kernel.
 *
 * C is walked two columns at a time (then one trailing column when bn is
 * odd) and, inside each column group, two rows at a time (then one trailing
 * row when bm is odd).  For every tile the kernel accumulates products of
 * values read from ba and bb over a range of k, multiplies the sums by alpha
 * and STORES them into C.  The previous contents of C are never read.
 *
 * bm, bn  - number of rows / columns of C to produce.
 * bk      - depth; bb is advanced by bk*2 after each column pair and by bk
 *           after the trailing single column.
 * alpha   - scale factor applied to every accumulated sum.
 * ba      - read 2 values per k step for a row pair, 1 value per k step for
 *           the trailing row.
 * bb      - read 2 values per k step for a column pair, 1 value per k step
 *           for the trailing column.
 * C, ldc  - output; the second column of a pair starts at C + ldc.
 * offset  - (TRMMKERNEL only) seeds `off`, which selects where the k range
 *           of each tile starts and how long it is.  Presumably this is the
 *           position of the tile relative to the diagonal of the triangular
 *           operand - confirm against the caller.
 *
 * Always returns 0.
 *
 * NOTE(review): `off` is only assigned when TRMMKERNEL is defined, yet it is
 * read in every LEFT/TRANSA configuration below.  The file appears to be
 * meant for builds with TRMMKERNEL defined - confirm.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
          ,BLASLONG offset
#endif
          )
{
    BLASLONG i,j,k;
    FLOAT *C0,*C1,*ptrba,*ptrbb;
    FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
    BLASLONG off, temp;

#if defined(TRMMKERNEL) && !defined(LEFT)
    off = -offset;
#endif

    /* ---- pairs of columns of C ---- */
    for (j=0; j<bn/2; j+=1)
    {
        C0 = C;
        C1 = C0+ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        ptrba = ba;

        /* 2x2 tiles */
        for (i=0; i<bm/2; i+=1)
        {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            ptrbb = bb;
#else
            /* skip the first `off` k steps of both operands */
            ptrba += off*2;
            ptrbb = bb + off*2;
#endif
            res0 = 0;
            res1 = 0;
            res2 = 0;
            res3 = 0;

            /* temp = number of k steps accumulated for this tile */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
            temp = bk-off;
#elif defined(LEFT)
            temp = off+2;
#else
            temp = off+2;
#endif

            /* main k loop, unrolled four times */
            for (k=0; k<temp/4; k+=1)
            {
                load0 = ptrba[2*0+0];
                load1 = ptrbb[2*0+0];
                res0 = res0+load0*load1;
                load2 = ptrba[2*0+1];
                res1 = res1+load2*load1;
                load3 = ptrbb[2*0+1];
                res2 = res2+load0*load3;
                res3 = res3+load2*load3;

                load4 = ptrba[2*1+0];
                load5 = ptrbb[2*1+0];
                res0 = res0+load4*load5;
                load6 = ptrba[2*1+1];
                res1 = res1+load6*load5;
                load7 = ptrbb[2*1+1];
                res2 = res2+load4*load7;
                res3 = res3+load6*load7;

                load0 = ptrba[2*2+0];
                load1 = ptrbb[2*2+0];
                res0 = res0+load0*load1;
                load2 = ptrba[2*2+1];
                res1 = res1+load2*load1;
                load3 = ptrbb[2*2+1];
                res2 = res2+load0*load3;
                res3 = res3+load2*load3;

                load4 = ptrba[2*3+0];
                load5 = ptrbb[2*3+0];
                res0 = res0+load4*load5;
                load6 = ptrba[2*3+1];
                res1 = res1+load6*load5;
                load7 = ptrbb[2*3+1];
                res2 = res2+load4*load7;
                res3 = res3+load6*load7;

                ptrba = ptrba+8;
                ptrbb = ptrbb+8;
            }

            /* remaining 0..3 k steps */
            for (k=0; k<(temp&3); k+=1)
            {
                load0 = ptrba[2*0+0];
                load1 = ptrbb[2*0+0];
                res0 = res0+load0*load1;
                load2 = ptrba[2*0+1];
                res1 = res1+load2*load1;
                load3 = ptrbb[2*0+1];
                res2 = res2+load0*load3;
                res3 = res3+load2*load3;
                ptrba = ptrba+2;
                ptrbb = ptrbb+2;
            }

            /* scale and store the tile (C is overwritten, not accumulated) */
            C0[0] = res0*alpha;
            C0[1] = res1*alpha;
            C1[0] = res2*alpha;
            C1[1] = res3*alpha;

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            /* skip the k steps not consumed above, so that ptrba has
               advanced by bk*2 in total for this row pair */
            temp = bk - off;
#ifdef LEFT
            temp -= 2;
#else
            temp -= 2;
#endif
            ptrba += temp*2;
            ptrbb += temp*2;
#endif
#ifdef LEFT
            off += 2;
#endif
            C0 = C0+2;
            C1 = C1+2;
        }

        /* trailing single row: 1x2 tile */
        for (i=0; i<(bm&1); i+=1)
        {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            ptrbb = bb;
#else
            ptrba += off;
            ptrbb = bb+off*2;
#endif
            res0 = 0;
            res1 = 0;

            /* the added constant differs by side: +1 with LEFT, +2 without */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
            temp = bk-off;
#elif defined(LEFT)
            temp = off+1;
#else
            temp = off+2;
#endif
            for (k=0; k<temp; k+=1)
            {
                load0 = ptrba[0+0];
                load1 = ptrbb[2*0+0];
                res0 = res0+load0*load1;
                load2 = ptrbb[2*0+1];
                res1 = res1+load0*load2;
                ptrba = ptrba+1;
                ptrbb = ptrbb+2;
            }

            C0[0] = res0*alpha;
            C1[0] = res1*alpha;

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            temp = bk-off;
#ifdef LEFT
            temp -= 1;
#else
            temp -= 2;
#endif
            ptrba += temp;
            ptrbb += temp*2;
#endif
#ifdef LEFT
            off += 1;
#endif
            C0 = C0+1;
            C1 = C1+1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 2;
#endif
        /* advance to the next column pair */
        bb += bk*2;
        C += ldc*2;
    }

    /* ---- trailing single column of C (bn odd) ---- */
    for (j=0; j<(bn&1); j+=1)
    {
        C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        ptrba = ba;

        /* 2x1 tiles */
        for (i=0; i<bm/2; i+=1)
        {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            ptrbb = bb;
#else
            ptrba += off*2;
            ptrbb = bb + off;
#endif
            res0 = 0;
            res1 = 0;

            /* the added constant differs by side: +2 with LEFT, +1 without */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
            temp = bk-off;
#elif defined(LEFT)
            temp = off+2;
#else
            temp = off+1;
#endif
            for (k=0; k<temp; k+=1)
            {
                load0 = ptrba[2*0+0];
                load1 = ptrbb[0+0];
                res0 = res0+load0*load1;
                load2 = ptrba[2*0+1];
                res1 = res1+load2*load1;
                ptrba = ptrba+2;
                ptrbb = ptrbb+1;
            }

            C0[0] = res0*alpha;
            C0[1] = res1*alpha;

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            temp = bk - off;
#ifdef LEFT
            temp -= 2;
#else
            temp -= 1;
#endif
            ptrba += temp*2;
            ptrbb += temp;
#endif
#ifdef LEFT
            off += 2;
#endif
            C0 = C0+2;
        }

        /* trailing single row: 1x1 tile */
        for (i=0; i<(bm&1); i+=1)
        {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            ptrbb = bb;
#else
            ptrba += off;
            ptrbb = bb+off;
#endif
            res0 = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
            temp = bk-off;
#elif defined(LEFT)
            temp = off + 1;
#else
            temp = off + 1;
#endif
            for (k=0; k<temp; k+=1)
            {
                load0 = ptrba[0+0];
                load1 = ptrbb[0+0];
                res0 = res0+load0*load1;
                ptrba = ptrba+1;
                ptrbb = ptrbb+1;
            }

            C0[0] = res0*alpha;

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            temp = bk-off;
#ifdef LEFT
            temp -= 1;
#else
            temp -= 1;
#endif
            ptrba += temp;
            ptrbb += temp;
#endif
#ifdef LEFT
            off += 1;
#endif
            C0 = C0+1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 1;
#endif
        /* advance past the single column */
        bb += bk;
        C += ldc;
    }

    return 0;
}
|