|
kusano |
2b45e8 |
#include "common.h"
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
// This code was generated by auto-tuning for the Loongson-3A platform.
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
//#define prefetch(x) __builtin_prefetch(x)
|
|
kusano |
2b45e8 |
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
|
kusano |
2b45e8 |
// Prefetch hint implemented as inline asm: emits an `ld` whose destination is
// register $0 and whose source is the memory operand x. x is passed with an
// "m" constraint, so it must be an lvalue (e.g. an array element); no C code
// consumes the loaded value.
// NOTE(review): relies on $0 being the MIPS zero register and on the target
// tolerating loads from addresses past the end of the arrays - confirm.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
|
kusano |
2b45e8 |
// Branch hint: evaluates to 1 when cond is non-zero and 0 otherwise, while
// telling the compiler that the non-zero outcome is the expected one.
#define likely(cond) (__builtin_expect((cond) != 0, 1))
|
|
kusano |
2b45e8 |
// Branch hint: evaluates to 1 when cond is non-zero and 0 otherwise, while
// telling the compiler that the zero outcome is the expected one.
#define unlikely(cond) (__builtin_expect((cond) != 0, 0))
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
// One accumulate step for contiguous Y without scaling:
//   Y[i] += A(i, j) * X[k], then advance i by one.
// Captures Y, A, X, LDA, i, j and k from the scope where it is expanded.
#define spec_loop_alpha1 \
    do { \
        Y[i] += A[j * LDA + i] * X[k]; \
        ++i; \
    } while (0)
|
|
kusano |
2b45e8 |
// One accumulate step for contiguous Y with scaling:
//   Y[i] += (ALPHA * A(i, j)) * X[k], then advance i by one.
// Captures Y, A, X, ALPHA, LDA, i, j and k from the scope where it is expanded.
#define spec_loop \
    do { \
        Y[i] += ALPHA * A[j * LDA + i] * X[k]; \
        ++i; \
    } while (0)
|
|
kusano |
2b45e8 |
// One accumulate step for strided Y without scaling:
//   Y[h] += A(i, j) * X[k], then advance i by one and h by INCY.
// Captures Y, A, X, LDA, INCY, h, i, j and k from the scope where it is expanded.
#define norm_loop_alpha1 \
    do { \
        Y[h] += A[j * LDA + i] * X[k]; \
        ++i; \
        h += INCY; \
    } while (0)
|
|
kusano |
2b45e8 |
// One accumulate step for strided Y with scaling:
//   Y[h] += (ALPHA * A(i, j)) * X[k], then advance i by one and h by INCY.
// Captures Y, A, X, ALPHA, LDA, INCY, h, i, j and k from the scope where it
// is expanded.
#define norm_loop \
    do { \
        Y[h] += ALPHA * A[j * LDA + i] * X[k]; \
        ++i; \
        h += INCY; \
    } while (0)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
// GEMV kernel, non-transposed case: performs Y += ALPHA * A * X.
//
// Element (i, j) of A is read from A[LDA * j + i] for i in [0, M) and
// j in [0, N). Column j is paired with X[j * INCX] and row i updates
// Y[i * INCY]. Indexing always starts at X[0] and Y[0]; negative increments
// get no start-offset adjustment here. UNUSED and BUFFER are never
// referenced. The function always returns 0.
//
// Four copies of the same loop nest exist so that the multiply by ALPHA is
// skipped when ALPHA == 1 and the separate strided index h is skipped when
// INCY == 1. Rows are handled four at a time, with one prefetch into the
// current column of A and one into Y issued per group of four, followed by a
// one-at-a-time tail for the remaining M % 4 rows.
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
    // A zero scale factor contributes nothing, so Y is left untouched.
    if (ALPHA == 0) {
        return 0;
    }

    const BLASLONG prefetch_dist = 30;          // look-ahead distance, in elements
    const BLASLONG unroll = 4;                  // step-macro expansions per group
    const BLASLONG m_unrolled = M - M % unroll; // rows covered by full groups

    // The step macros read and update i, j, k and h by name; keep these names.
    BLASLONG j;
    BLASLONG k;

    if (INCY == 1) {
        if (ALPHA == 1) {
            // Contiguous Y, no scaling.
            for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
                BLASLONG i = 0;

                while (likely(i < m_unrolled)) {
                    prefetch(A[LDA * j + i + prefetch_dist]);
                    prefetch(Y[i + prefetch_dist]);
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                }
                while (likely(i < M)) {
                    spec_loop_alpha1;
                }
            }
        } else {
            // Contiguous Y, scaled by ALPHA.
            for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
                BLASLONG i = 0;

                while (likely(i < m_unrolled)) {
                    prefetch(A[LDA * j + i + prefetch_dist]);
                    prefetch(Y[i + prefetch_dist]);
                    spec_loop;
                    spec_loop;
                    spec_loop;
                    spec_loop;
                }
                while (likely(i < M)) {
                    spec_loop;
                }
            }
        }
    } else {
        if (ALPHA == 1) {
            // Strided Y, no scaling.
            for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
                BLASLONG i = 0;
                BLASLONG h = 0;

                while (likely(i < m_unrolled)) {
                    prefetch(A[LDA * j + i + prefetch_dist]);
                    prefetch(Y[h + prefetch_dist]);
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                }
                while (likely(i < M)) {
                    norm_loop_alpha1;
                }
            }
        } else {
            // Strided Y, scaled by ALPHA.
            for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
                BLASLONG i = 0;
                BLASLONG h = 0;

                while (likely(i < m_unrolled)) {
                    prefetch(A[LDA * j + i + prefetch_dist]);
                    prefetch(Y[h + prefetch_dist]);
                    norm_loop;
                    norm_loop;
                    norm_loop;
                    norm_loop;
                }
                while (likely(i < M)) {
                    norm_loop;
                }
            }
        }
    }

    return 0;
}
|