Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/gemv_t_loongson3a.c

kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
// Auto-tuned code for the Loongson-3A platform.
kusano 2b45e8
kusano 2b45e8
//#define prefetch(x) __builtin_prefetch(x)
kusano 2b45e8
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
kusano 2b45e8
/*
 * Issue a doubleword load ("ld") of the memory operand x into register $0,
 * so the loaded value is not kept.  x must be an lvalue because it is handed
 * to the asm as an "m" (memory) operand.
 * NOTE(review): presumably relies on the target treating a load into $0 as a
 * harmless prefetch hint, even for addresses past the end of the data --
 * confirm against the Loongson-3A manual.
 */
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
kusano 2b45e8
/* Branch hint: evaluates to 1 if x is non-zero, else 0; tells the compiler to expect 1. */
#define likely(x) __builtin_expect((x) != 0, 1)
kusano 2b45e8
/* Branch hint: evaluates to 1 if x is non-zero, else 0; tells the compiler to expect 0. */
#define unlikely(x) __builtin_expect((x) != 0, 0)
kusano 2b45e8
kusano 2b45e8
/*
 * One unit-stride step without scaling: adds A[LDA * j + i] * X[i] onto Y[k],
 * then advances i.  Uses Y, k, A, LDA, j, i and X from the expansion site.
 */
#define spec_loop_alpha1 \
	do { \
		Y[k] += A[LDA * j + i] * X[i]; \
		i += 1; \
	} while (0)
kusano 2b45e8
/*
 * One unit-stride step with scaling: adds ALPHA * A[LDA * j + i] * X[i] onto
 * Y[k], then advances i.  Uses Y, k, ALPHA, A, LDA, j, i and X from the
 * expansion site.
 */
#define spec_loop \
	do { \
		Y[k] += ALPHA * A[LDA * j + i] * X[i]; \
		i += 1; \
	} while (0)
kusano 2b45e8
/*
 * One strided step without scaling: adds A[LDA * j + i] * X[h] onto Y[k],
 * then advances i by one and h by INCX.  Uses Y, k, A, LDA, j, i, X, h and
 * INCX from the expansion site.
 */
#define norm_loop_alpha1 \
	do { \
		Y[k] += A[LDA * j + i] * X[h]; \
		i += 1; \
		h += INCX; \
	} while (0)
kusano 2b45e8
/*
 * One strided step with scaling: adds ALPHA * A[LDA * j + i] * X[h] onto
 * Y[k], then advances i by one and h by INCX.  Uses Y, k, ALPHA, A, LDA, j,
 * i, X, h and INCX from the expansion site.
 */
#define norm_loop \
	do { \
		Y[k] += ALPHA * A[LDA * j + i] * X[h]; \
		i += 1; \
		h += INCX; \
	} while (0)
kusano 2b45e8
kusano 2b45e8
/*
 * Column-wise accumulate of A against X into Y.
 *
 * For every column index c in [0, N) and every row index i in [0, M), the
 * term ALPHA * A[LDA * c + i] * X[i * INCX] is added onto Y[c * INCY].  Each
 * term is added to Y directly, one at a time, in increasing i.
 *
 * Nothing is written when ALPHA is zero.  INCX and INCY are used exactly as
 * passed (no sign normalisation is applied).  UNUSED and BUFFER are not
 * referenced.  Always returns 0.
 *
 * The loop nest is written out four times so that ALPHA == 1 skips the
 * scaling multiply and INCX == 1 indexes X with the row index itself.  The
 * row loop is unrolled three times; each unrolled pass first issues one
 * prefetch for A and one for X, `ahead` elements past the current position.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
	const BLASLONG ahead = 30;	/* prefetch distance, in elements */
	const BLASLONG unroll = 3;	/* terms handled per unrolled pass */
	BLASLONG col, yi;

	if (ALPHA == 0) {
		return 0;
	}

	/* Largest multiple of `unroll` that fits in M; the rest is the tail. */
	const BLASLONG m_main = M - M % unroll;

	if (ALPHA == 1 && INCX == 1) {
		/* No scaling, contiguous X. */
		for (col = 0, yi = 0; likely(col < N); col++, yi += INCY) {
			const BLASLONG base = LDA * col;
			BLASLONG i;

			for (i = 0; likely(i < m_main); i += unroll) {
				prefetch(A[base + i + ahead]);
				prefetch(X[i + ahead]);
				Y[yi] += A[base + i] * X[i];
				Y[yi] += A[base + i + 1] * X[i + 1];
				Y[yi] += A[base + i + 2] * X[i + 2];
			}
			for (; likely(i < M); i++) {
				Y[yi] += A[base + i] * X[i];
			}
		}
	} else if (ALPHA == 1) {
		/* No scaling, strided X. */
		for (col = 0, yi = 0; likely(col < N); col++, yi += INCY) {
			const BLASLONG base = LDA * col;
			BLASLONG i, xi = 0;

			for (i = 0; likely(i < m_main); i += unroll) {
				prefetch(A[base + i + ahead]);
				prefetch(X[xi + ahead]);
				Y[yi] += A[base + i] * X[xi];
				xi += INCX;
				Y[yi] += A[base + i + 1] * X[xi];
				xi += INCX;
				Y[yi] += A[base + i + 2] * X[xi];
				xi += INCX;
			}
			for (; likely(i < M); i++, xi += INCX) {
				Y[yi] += A[base + i] * X[xi];
			}
		}
	} else if (INCX == 1) {
		/* Scaled by ALPHA, contiguous X. */
		for (col = 0, yi = 0; likely(col < N); col++, yi += INCY) {
			const BLASLONG base = LDA * col;
			BLASLONG i;

			for (i = 0; likely(i < m_main); i += unroll) {
				prefetch(A[base + i + ahead]);
				prefetch(X[i + ahead]);
				Y[yi] += ALPHA * A[base + i] * X[i];
				Y[yi] += ALPHA * A[base + i + 1] * X[i + 1];
				Y[yi] += ALPHA * A[base + i + 2] * X[i + 2];
			}
			for (; likely(i < M); i++) {
				Y[yi] += ALPHA * A[base + i] * X[i];
			}
		}
	} else {
		/* Scaled by ALPHA, strided X. */
		for (col = 0, yi = 0; likely(col < N); col++, yi += INCY) {
			const BLASLONG base = LDA * col;
			BLASLONG i, xi = 0;

			for (i = 0; likely(i < m_main); i += unroll) {
				prefetch(A[base + i + ahead]);
				prefetch(X[xi + ahead]);
				Y[yi] += ALPHA * A[base + i] * X[xi];
				xi += INCX;
				Y[yi] += ALPHA * A[base + i + 1] * X[xi];
				xi += INCX;
				Y[yi] += ALPHA * A[base + i + 2] * X[xi];
				xi += INCX;
			}
			for (; likely(i < M); i++, xi += INCX) {
				Y[yi] += ALPHA * A[base + i] * X[xi];
			}
		}
	}

	return 0;
}