|
kusano |
2b45e8 |
#include "common.h"
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
//typedef int BLASLONG;
|
|
kusano |
2b45e8 |
//typedef double FLOAT;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * prefetch(x): touch the memory operand x ahead of its use.
 *
 * On MIPS builds this emits a doubleword load ("ld") whose destination is
 * register $0, so the loaded value is discarded.
 * NOTE(review): presumably the target cores treat a load to $0 as a
 * non-faulting prefetch hint -- confirm against the CPU manual, because the
 * kernel issues it for addresses beyond the data it actually processes.
 *
 * On any other target the compiler's prefetch builtin is used instead, so the
 * file still builds there.
 */
#if defined(__mips__)
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
#else
#define prefetch(x) __builtin_prefetch(&(x))
#endif

/*
 * Branch-prediction hints. Both evaluate to !!(x) (0 or 1); they only tell
 * the compiler which outcome to expect.
 */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * Pick the inner-loop bodies used by the kernel below.
 *
 * The numeric suffix encodes the two compile-time switches:
 *   _0  neither CONJ nor XCONJ defined
 *   _1  CONJ only
 *   _2  XCONJ only
 *   _3  both CONJ and XCONJ
 *
 * Exactly one branch is taken, so each of the four generic names is defined
 * exactly once.
 */
#if !defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_0
#define spec_loop        spec_loop_0
#define norm_loop_alpha1 norm_loop_alpha1_0
#define norm_loop        norm_loop_0
#elif defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_1
#define spec_loop        spec_loop_1
#define norm_loop_alpha1 norm_loop_alpha1_1
#define norm_loop        norm_loop_1
#elif !defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_2
#define spec_loop        spec_loop_2
#define norm_loop_alpha1 norm_loop_alpha1_2
#define norm_loop        norm_loop_2
#else /* defined(CONJ) && defined(XCONJ) */
#define spec_loop_alpha1 spec_loop_alpha1_3
#define spec_loop        spec_loop_3
#define norm_loop_alpha1 norm_loop_alpha1_3
#define norm_loop        norm_loop_3
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * Unit-stride, alpha == 1 loop bodies.
 *
 * Each macro handles one complex element stored as an adjacent (re, im) pair:
 * the element of A at A[jj + ii], the element of X at X[k], and the element
 * of Y at Y[ii]. The four partial products are accumulated into Y one at a
 * time, then ii advances by 2 (one complex element).
 *
 * They expand in a scope that provides A, X, Y, jj, k and ii.
 * Variants differ only in the signs of the products:
 *   _0: Y += A * X
 *   _1: Y += conj(A) * X
 *   _2: Y += A * conj(X)
 *   _3: Y += conj(A) * conj(X)
 */
#define spec_loop_alpha1_0 do { \
	Y[ii]     += A[jj + ii]     * X[k]; \
	Y[ii + 1] += A[jj + ii + 1] * X[k]; \
	Y[ii + 1] += A[jj + ii]     * X[k + 1]; \
	Y[ii]     -= A[jj + ii + 1] * X[k + 1]; \
	ii += 2; \
} while(0)

#define spec_loop_alpha1_1 do { \
	Y[ii]     += A[jj + ii]     * X[k]; \
	Y[ii + 1] -= A[jj + ii + 1] * X[k]; \
	Y[ii + 1] += A[jj + ii]     * X[k + 1]; \
	Y[ii]     += A[jj + ii + 1] * X[k + 1]; \
	ii += 2; \
} while(0)

#define spec_loop_alpha1_2 do { \
	Y[ii]     += A[jj + ii]     * X[k]; \
	Y[ii + 1] += A[jj + ii + 1] * X[k]; \
	Y[ii + 1] -= A[jj + ii]     * X[k + 1]; \
	Y[ii]     += A[jj + ii + 1] * X[k + 1]; \
	ii += 2; \
} while(0)

#define spec_loop_alpha1_3 do { \
	Y[ii]     += A[jj + ii]     * X[k]; \
	Y[ii + 1] -= A[jj + ii + 1] * X[k]; \
	Y[ii + 1] -= A[jj + ii]     * X[k + 1]; \
	Y[ii]     -= A[jj + ii + 1] * X[k + 1]; \
	ii += 2; \
} while(0)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * Unit-stride, general-alpha loop bodies.
 *
 * Each macro forms the complex product of one element of A (at A[jj + ii])
 * with one element of X (at X[k]) in (rTmp, iTmp), multiplies it by
 * (rALPHA, iALPHA) and adds the result to the element of Y at Y[ii]. ii then
 * advances by 2 (one complex element).
 *
 * They expand in a scope that provides A, X, Y, jj, k, ii, rALPHA, iALPHA and
 * the scratch variables rTmp and iTmp.
 * Variants differ only in how (rTmp, iTmp) is formed:
 *   _0: A * X
 *   _1: conj(A) * X
 *   _2: A * conj(X)
 *   _3: conj(A) * conj(X)
 */
#define spec_loop_0 do { \
	rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; \
	iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; \
	Y[ii] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
} while(0)

#define spec_loop_1 do { \
	rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; \
	iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; \
	Y[ii] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
} while(0)

#define spec_loop_2 do { \
	rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; \
	iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; \
	Y[ii] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
} while(0)

#define spec_loop_3 do { \
	rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; \
	iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; \
	Y[ii] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
} while(0)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * Strided-Y, alpha == 1 loop bodies.
 *
 * Same arithmetic as the unit-stride alpha == 1 bodies, but Y is addressed
 * through its own index iii, which advances by INCY * 2 per complex element,
 * while ii (the index into the column of A) advances by 2.
 *
 * They expand in a scope that provides A, X, Y, jj, k, ii, iii and INCY.
 * Variants differ only in the signs of the products:
 *   _0: Y += A * X
 *   _1: Y += conj(A) * X
 *   _2: Y += A * conj(X)
 *   _3: Y += conj(A) * conj(X)
 */
#define norm_loop_alpha1_0 do { \
	Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; \
	Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; \
	ii += 2; \
	iii += INCY * 2; \
} while(0)

#define norm_loop_alpha1_1 do { \
	Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; \
	Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; \
	ii += 2; \
	iii += INCY * 2; \
} while(0)

#define norm_loop_alpha1_2 do { \
	Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; \
	Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; \
	ii += 2; \
	iii += INCY * 2; \
} while(0)

#define norm_loop_alpha1_3 do { \
	Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; \
	Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; \
	ii += 2; \
	iii += INCY * 2; \
} while(0)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * Strided-Y, general-alpha loop bodies.
 *
 * Same arithmetic as the unit-stride general-alpha bodies: the product of one
 * element of A with one element of X is formed in (rTmp, iTmp), scaled by
 * (rALPHA, iALPHA) and added to Y. Y is addressed through iii, which advances
 * by INCY * 2 per complex element, while ii advances by 2.
 *
 * They expand in a scope that provides A, X, Y, jj, k, ii, iii, INCY, rALPHA,
 * iALPHA and the scratch variables rTmp and iTmp.
 * Variants differ only in how (rTmp, iTmp) is formed:
 *   _0: A * X
 *   _1: conj(A) * X
 *   _2: A * conj(X)
 *   _3: conj(A) * conj(X)
 */
#define norm_loop_0 do { \
	rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; \
	iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; \
	Y[iii] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
	iii += INCY * 2; \
} while(0)

#define norm_loop_1 do { \
	rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; \
	iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; \
	Y[iii] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
	iii += INCY * 2; \
} while(0)

#define norm_loop_2 do { \
	rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; \
	iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; \
	Y[iii] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
	iii += INCY * 2; \
} while(0)

#define norm_loop_3 do { \
	rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; \
	iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; \
	Y[iii] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
	iii += INCY * 2; \
} while(0)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * Complex matrix-vector kernel: for each of the N columns of A, multiplies
 * the column by the matching element of X, scales by alpha and accumulates
 * the result into Y. Conjugation of A and/or X is selected at compile time
 * through the spec_loop / norm_loop macro family.
 *
 * All arrays hold complex values as adjacent (re, im) FLOAT pairs.
 *
 *   M       number of complex elements processed per column
 *   N       number of columns
 *   UNUSED  not referenced
 *   rALPHA  real part of alpha
 *   iALPHA  imaginary part of alpha
 *   A       matrix data; consecutive columns are LDA complex elements apart
 *   LDA     column stride of A, in complex elements
 *   X       input vector; consecutive elements are INCX complex elements apart
 *   INCX    stride of X, in complex elements
 *   Y       vector that is updated in place
 *   INCY    stride of Y, in complex elements (1 selects the unit-stride path)
 *   BUFFER  not referenced
 *
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * Quick return only when alpha is exactly zero. The previous test,
	 * !rALPHA && iALPHA, returned early for a purely imaginary non-zero
	 * alpha and left Y without its update.
	 */
	if(rALPHA == 0 && iALPHA == 0)
		return 0;

	BLASLONG fahead = 60;                 /* prefetch distance, in FLOAT elements */
	BLASLONG spec_unroll = 2;             /* complex elements per unrolled iteration */
	BLASLONG tMQ = M - M % spec_unroll;   /* largest multiple of spec_unroll <= M */
	BLASLONG j = 0, k = 0, jj = 0;        /* column number, offset into X, offset into A */

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: the scaling step is skipped. */
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;

				/* Unrolled part: two complex elements per iteration. */
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}

				/* Remainder when M is not a multiple of spec_unroll. */
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;

				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}

				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: each product is scaled by (rALPHA, iALPHA). */
		FLOAT rTmp, iTmp;

		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;

				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}

				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;

				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}

				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}

	return 0;
}
|