/*
 * Source: thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/zgemv_t_loongson3a.c
 * Blame annotation for every line: kusano 2b45e8
 */
#include "common.h" 
/* (blame: kusano 2b45e8) */
/*
 * prefetch(x): emits a MIPS "ld" from the memory operand x into register $0.
 * $0 is the hard-wired zero register, so the loaded value is discarded and
 * the instruction only serves to pull the addressed line towards the cache.
 * __volatile__ keeps the compiler from deleting the otherwise dead load.
 */
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))

/*
 * Branch-prediction hints.  The condition is normalised to 0/1 with !! and
 * passed to __builtin_expect together with the outcome the caller expects.
 */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

/*
 * Bind the generic kernel names (spec_loop_alpha1, spec_loop,
 * norm_loop_alpha1, norm_loop) to one of the four numbered variants,
 * depending on which conjugation flags are defined at compile time:
 *
 *   _0 : neither CONJ nor XCONJ   (A * x)
 *   _1 : CONJ only                (conj(A) * x)
 *   _2 : XCONJ only               (A * conj(x))
 *   _3 : CONJ and XCONJ           (conj(A) * conj(x))
 *
 * Exactly one branch is ever active.
 */
#if !defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1	spec_loop_alpha1_0
#define spec_loop			spec_loop_0
#define norm_loop_alpha1	norm_loop_alpha1_0
#define norm_loop			norm_loop_0
#elif defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1	spec_loop_alpha1_1
#define spec_loop			spec_loop_1
#define norm_loop_alpha1	norm_loop_alpha1_1
#define norm_loop			norm_loop_1
#elif !defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1	spec_loop_alpha1_2
#define spec_loop			spec_loop_2
#define norm_loop_alpha1	norm_loop_alpha1_2
#define norm_loop			norm_loop_2
#else /* defined(CONJ) && defined(XCONJ) */
#define spec_loop_alpha1	spec_loop_alpha1_3
#define spec_loop			spec_loop_3
#define norm_loop_alpha1	norm_loop_alpha1_3
#define norm_loop			norm_loop_3
#endif

/*
 * Per-element complex multiply-accumulate kernels.
 *
 * Every macro expands inside a scope that must provide:
 *   A, X, Y        arrays of interleaved (real, imaginary) values
 *   jj             offset of the current column inside A
 *   ii             offset of the current element inside that column
 *   k              offset of the output element inside Y
 * The norm_* variants additionally need iii (offset inside X) and INCX
 * (stride of X, in complex elements).  The variants without "alpha1" also
 * need rALPHA, iALPHA and the scratch variables rTmp, iTmp.
 *
 * One expansion consumes one complex element of A and X, accumulates into
 * Y[k] (real) and Y[k + 1] (imaginary), and then advances ii by 2 (and iii
 * by INCX * 2 in the norm_* variants).
 *
 * Numeric suffix: _0 = A*x, _1 = conj(A)*x, _2 = A*conj(x),
 * _3 = conj(A)*conj(x).
 */

/* X indexed with ii (unit stride), no alpha scaling. */
#define spec_loop_alpha1_0 do { \
	Y[k] += A[jj + ii] * X[ii]; \
	Y[k + 1] += A[jj + ii + 1] * X[ii]; \
	Y[k + 1] += A[jj + ii] * X[ii + 1]; \
	Y[k] -= A[jj + ii + 1] * X[ii + 1]; \
	ii += 2; \
} while(0)
#define spec_loop_alpha1_1 do { \
	Y[k] += A[jj + ii] * X[ii]; \
	Y[k + 1] -= A[jj + ii + 1] * X[ii]; \
	Y[k + 1] += A[jj + ii] * X[ii + 1]; \
	Y[k] += A[jj + ii + 1] * X[ii + 1]; \
	ii += 2; \
} while(0)
#define spec_loop_alpha1_2 do { \
	Y[k] += A[jj + ii] * X[ii]; \
	Y[k + 1] += A[jj + ii + 1] * X[ii]; \
	Y[k + 1] -= A[jj + ii] * X[ii + 1]; \
	Y[k] += A[jj + ii + 1] * X[ii + 1]; \
	ii += 2; \
} while(0)
#define spec_loop_alpha1_3 do { \
	Y[k] += A[jj + ii] * X[ii]; \
	Y[k + 1] -= A[jj + ii + 1] * X[ii]; \
	Y[k + 1] -= A[jj + ii] * X[ii + 1]; \
	Y[k] -= A[jj + ii + 1] * X[ii + 1]; \
	ii += 2; \
} while(0)

/* X indexed with ii (unit stride); product is scaled by (rALPHA, iALPHA). */
#define spec_loop_0 do { \
	rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; \
	iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; \
	Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
} while(0)
#define spec_loop_1 do { \
	rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; \
	iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; \
	Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
} while(0)
#define spec_loop_2 do { \
	rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; \
	iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; \
	Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
} while(0)
#define spec_loop_3 do { \
	rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; \
	iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; \
	Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
} while(0)

/* X indexed with iii (stride INCX), no alpha scaling. */
#define norm_loop_alpha1_0 do { \
	Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; \
	Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; \
	ii += 2; \
	iii += INCX * 2; \
} while(0)
#define norm_loop_alpha1_1 do { \
	Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; \
	Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; \
	ii += 2; \
	iii += INCX * 2; \
} while(0)
#define norm_loop_alpha1_2 do { \
	Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; \
	Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; \
	ii += 2; \
	iii += INCX * 2; \
} while(0)
#define norm_loop_alpha1_3 do { \
	Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; \
	Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; \
	ii += 2; \
	iii += INCX * 2; \
} while(0)

/* X indexed with iii (stride INCX); product is scaled by (rALPHA, iALPHA). */
#define norm_loop_0 do { \
	rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; \
	iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; \
	Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
	iii += INCX * 2; \
} while(0)
#define norm_loop_1 do { \
	rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; \
	iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; \
	Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
	iii += INCX * 2; \
} while(0)
#define norm_loop_2 do { \
	rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; \
	iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; \
	Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
	iii += INCX * 2; \
} while(0)
#define norm_loop_3 do { \
	rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; \
	iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; \
	Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
	Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
	ii += 2; \
	iii += INCX * 2; \
} while(0)

/*
 * Complex transposed matrix-vector multiply-accumulate kernel.
 *
 * For every column j in [0, N) this accumulates into one complex element of Y
 *
 *     Y[j] += alpha * sum over i in [0, M) of op(A[i, j]) * op(X[i])
 *
 * where alpha = rALPHA + i*iALPHA and op() is identity or complex conjugation
 * as selected at compile time through the spec_loop / norm_loop macros.
 *
 * Storage, as indexed by the code below (all arrays hold interleaved
 * real/imaginary FLOAT pairs):
 *   A     column j starts at A[2 * LDA * j]; M complex elements are read
 *         contiguously from there.
 *   X     element i is at X[2 * INCX * i].
 *   Y     element j is at Y[2 * INCY * j]; it is only accumulated into.
 *
 * UNUSED and BUFFER are not referenced.  The function always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * alpha == 0 adds nothing to Y, so skip all work.  Both components must
	 * be zero: a purely imaginary alpha (rALPHA == 0, iALPHA != 0) is a
	 * valid scale factor and must still be applied.
	 */
	if(rALPHA == 0 && iALPHA == 0)
		return 0;

	BLASLONG fahead = 30;		/* prefetch distance, in FLOAT elements */
	BLASLONG spec_unroll = 2;	/* complex elements handled per unrolled iteration */
	BLASLONG tMQ = M - M % spec_unroll;	/* largest multiple of spec_unroll not above M */
	BLASLONG j = 0, k = 0, jj = 0;	/* column index, offset into Y, offset into A */

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: use the kernels that skip the scaling step. */
		if(INCX == 1) {
			/* Unit-stride X: X is indexed with the same offset ii as A. */
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				/* Unrolled body: two macro expansions per iteration. */
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				/* Remainder when M is not a multiple of spec_unroll. */
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			/* Strided X: iii tracks the offset into X separately. */
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: each product is scaled before accumulation. */
		FLOAT rTmp, iTmp;	/* scratch for the unscaled product, set by the macros */
		if(INCX == 1) {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}
	return 0;
}