|
kusano |
2b45e8 |
/*********************************************************************/
|
|
kusano |
2b45e8 |
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
kusano |
2b45e8 |
/* All rights reserved. */
|
|
kusano |
2b45e8 |
/* */
|
|
kusano |
2b45e8 |
/* Redistribution and use in source and binary forms, with or */
|
|
kusano |
2b45e8 |
/* without modification, are permitted provided that the following */
|
|
kusano |
2b45e8 |
/* conditions are met: */
|
|
kusano |
2b45e8 |
/* */
|
|
kusano |
2b45e8 |
/* 1. Redistributions of source code must retain the above */
|
|
kusano |
2b45e8 |
/* copyright notice, this list of conditions and the following */
|
|
kusano |
2b45e8 |
/* disclaimer. */
|
|
kusano |
2b45e8 |
/* */
|
|
kusano |
2b45e8 |
/* 2. Redistributions in binary form must reproduce the above */
|
|
kusano |
2b45e8 |
/* copyright notice, this list of conditions and the following */
|
|
kusano |
2b45e8 |
/* disclaimer in the documentation and/or other materials */
|
|
kusano |
2b45e8 |
/* provided with the distribution. */
|
|
kusano |
2b45e8 |
/* */
|
|
kusano |
2b45e8 |
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
kusano |
2b45e8 |
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
kusano |
2b45e8 |
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
kusano |
2b45e8 |
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
kusano |
2b45e8 |
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
kusano |
2b45e8 |
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
kusano |
2b45e8 |
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
kusano |
2b45e8 |
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
kusano |
2b45e8 |
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
kusano |
2b45e8 |
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
kusano |
2b45e8 |
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
kusano |
2b45e8 |
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
kusano |
2b45e8 |
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
kusano |
2b45e8 |
/* POSSIBILITY OF SUCH DAMAGE. */
|
|
kusano |
2b45e8 |
/* */
|
|
kusano |
2b45e8 |
/* The views and conclusions contained in the software and */
|
|
kusano |
2b45e8 |
/* documentation are those of the authors and should not be */
|
|
kusano |
2b45e8 |
/* interpreted as representing official policies, either expressed */
|
|
kusano |
2b45e8 |
/* or implied, of The University of Texas at Austin. */
|
|
kusano |
2b45e8 |
/*********************************************************************/
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common.h"
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/* Forward declaration; a definition is only provided in the x86/x86_64
   section of this file. */
int get_L2_size(void);

/* Fallback initial values for the global GEMM parameters defined below.
   They are used when the corresponding upper-case macro (SGEMM_P,
   GEMM_OFFSET_A, ...) does not supply a non-zero compile-time constant. */
#define DEFAULT_GEMM_P 128
#define DEFAULT_GEMM_Q 128
#define DEFAULT_GEMM_R 128
#define DEFAULT_GEMM_OFFSET_A 0
#define DEFAULT_GEMM_OFFSET_B 0
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/* Global Parameter */
|
|
kusano |
2b45e8 |
#if GEMM_OFFSET_A == gemm_offset_a
|
|
kusano |
2b45e8 |
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if GEMM_OFFSET_B == gemm_offset_b
|
|
kusano |
2b45e8 |
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if SGEMM_P == sgemm_p
|
|
kusano |
2b45e8 |
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG sgemm_p = SGEMM_P;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if DGEMM_P == dgemm_p
|
|
kusano |
2b45e8 |
BLASLONG dgemm_p = DEFAULT_GEMM_P;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG dgemm_p = DGEMM_P;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if CGEMM_P == cgemm_p
|
|
kusano |
2b45e8 |
BLASLONG cgemm_p = DEFAULT_GEMM_P;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG cgemm_p = CGEMM_P;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if ZGEMM_P == zgemm_p
|
|
kusano |
2b45e8 |
BLASLONG zgemm_p = DEFAULT_GEMM_P;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG zgemm_p = ZGEMM_P;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if SGEMM_Q == sgemm_q
|
|
kusano |
2b45e8 |
BLASLONG sgemm_q = DEFAULT_GEMM_Q;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG sgemm_q = SGEMM_Q;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if DGEMM_Q == dgemm_q
|
|
kusano |
2b45e8 |
BLASLONG dgemm_q = DEFAULT_GEMM_Q;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG dgemm_q = DGEMM_Q;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if CGEMM_Q == cgemm_q
|
|
kusano |
2b45e8 |
BLASLONG cgemm_q = DEFAULT_GEMM_Q;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG cgemm_q = CGEMM_Q;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if ZGEMM_Q == zgemm_q
|
|
kusano |
2b45e8 |
BLASLONG zgemm_q = DEFAULT_GEMM_Q;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG zgemm_q = ZGEMM_Q;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if SGEMM_R == sgemm_r
|
|
kusano |
2b45e8 |
BLASLONG sgemm_r = DEFAULT_GEMM_R;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG sgemm_r = SGEMM_R;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if DGEMM_R == dgemm_r
|
|
kusano |
2b45e8 |
BLASLONG dgemm_r = DEFAULT_GEMM_R;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG dgemm_r = DGEMM_R;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if CGEMM_R == cgemm_r
|
|
kusano |
2b45e8 |
BLASLONG cgemm_r = DEFAULT_GEMM_R;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG cgemm_r = CGEMM_R;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#if ZGEMM_R == zgemm_r
|
|
kusano |
2b45e8 |
BLASLONG zgemm_r = DEFAULT_GEMM_R;
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
BLASLONG zgemm_r = ZGEMM_R;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * P/Q/R globals for the q* and x* routines; only compiled when EXPRECISION
 * or QUAD_PRECISION is defined.
 *
 * Inside a #if expression every identifier that is not a macro evaluates
 * to 0, so `QGEMM_P == qgemm_p` is true when QGEMM_P is undefined or expands
 * to a plain identifier such as the variable name itself.  In that case the
 * variable starts at the DEFAULT_GEMM_* value; otherwise it takes the
 * macro's value.
 */
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
#if QGEMM_P == qgemm_p
BLASLONG qgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG qgemm_p = QGEMM_P;
#endif
#if XGEMM_P == xgemm_p
BLASLONG xgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG xgemm_p = XGEMM_P;
#endif
#if QGEMM_Q == qgemm_q
BLASLONG qgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG qgemm_q = QGEMM_Q;
#endif
#if XGEMM_Q == xgemm_q
BLASLONG xgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG xgemm_q = XGEMM_Q;
#endif
#if QGEMM_R == qgemm_r
BLASLONG qgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG qgemm_r = QGEMM_R;
#endif
#if XGEMM_R == xgemm_r
BLASLONG xgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG xgemm_r = XGEMM_R;
#endif
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(ARCH_X86) || defined(ARCH_X86_64)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * Query the L2 cache size through CPUID.
 *
 * For the CPU families listed in the first branch the value is taken from
 * bits 31..16 of ECX of extended leaf 0x80000006.  For all other x86 builds
 * the one-byte cache descriptors of leaf 2 are scanned and mapped through a
 * fixed table.
 *
 * Returns the size (the vendor manuals give these values in KB), or 0 when
 * leaf 2 reports no descriptor known to the table.
 * NOTE(review): cpuid() and BITMASK() come from common.h; BITMASK(r, s, m)
 * is assumed to be (r >> s) & m -- confirm.
 */
int get_L2_size(void){

  int eax, ebx, ecx, edx;

#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \
    defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
    defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC)

  cpuid(0x80000006, &eax, &ebx, &ecx, &edx);

  return BITMASK(ecx, 16, 0xffff);

#else

  int info[15];
  int i;

  cpuid(2, &eax, &ebx, &ecx, &edx);

  /* A register whose bit 31 is set is reserved and carries no valid
     descriptors; clear it so that its bytes become the null descriptor
     (0x00), which matches no table entry. */
  if ((unsigned int)eax & 0x80000000U) eax = 0;
  if ((unsigned int)ebx & 0x80000000U) ebx = 0;
  if ((unsigned int)ecx & 0x80000000U) ecx = 0;
  if ((unsigned int)edx & 0x80000000U) edx = 0;

  /* The low byte of EAX is not a descriptor, hence only three bytes are
     taken from EAX and four from each of the other registers. */
  info[ 0] = BITMASK(eax,  8, 0xff);
  info[ 1] = BITMASK(eax, 16, 0xff);
  info[ 2] = BITMASK(eax, 24, 0xff);

  info[ 3] = BITMASK(ebx,  0, 0xff);
  info[ 4] = BITMASK(ebx,  8, 0xff);
  info[ 5] = BITMASK(ebx, 16, 0xff);
  info[ 6] = BITMASK(ebx, 24, 0xff);

  info[ 7] = BITMASK(ecx,  0, 0xff);
  info[ 8] = BITMASK(ecx,  8, 0xff);
  info[ 9] = BITMASK(ecx, 16, 0xff);
  info[10] = BITMASK(ecx, 24, 0xff);

  info[11] = BITMASK(edx,  0, 0xff);
  info[12] = BITMASK(edx,  8, 0xff);
  info[13] = BITMASK(edx, 16, 0xff);
  info[14] = BITMASK(edx, 24, 0xff);

  /* The first descriptor found in the table decides the result. */
  for (i = 0; i < 15; i++){

    switch (info[i]){
    case 0x3b :
    case 0x41 :
    case 0x79 :
      return  128;

    case 0x3c :
    case 0x42 :
    case 0x7a :
    case 0x7e :
    case 0x82 :
      return  256;

    case 0x43 :
    case 0x7b :
    case 0x7f :
    case 0x83 :
    case 0x86 :
      return  512;

    case 0x44 :
    case 0x78 :
    case 0x7c :
    case 0x84 :
    case 0x87 :
      return 1024;

    case 0x45 :
    case 0x7d :
    case 0x85 :
      return 2048;

    case 0x49 :
      return 4096;

    default :
      break;
    }
  }

  /* No known L2 descriptor was reported. */
  return 0;
#endif
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
void blas_set_parameter(void){
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
char *p;
|
|
kusano |
2b45e8 |
int factor;
|
|
kusano |
2b45e8 |
int size = get_L2_size();
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
|
|
kusano |
2b45e8 |
size >>= 7;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(CORE_BANIAS) && (HAVE_HIT > 1)
|
|
kusano |
2b45e8 |
sgemm_p = 64 / HAVE_HIT * size;
|
|
kusano |
2b45e8 |
dgemm_p = 32 / HAVE_HIT * size;
|
|
kusano |
2b45e8 |
cgemm_p = 32 / HAVE_HIT * size;
|
|
kusano |
2b45e8 |
zgemm_p = 16 / HAVE_HIT * size;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 16 / HAVE_HIT * size;
|
|
kusano |
2b45e8 |
xgemm_p = 8 / HAVE_HIT * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 8 / HAVE_HIT * size;
|
|
kusano |
2b45e8 |
xgemm_p = 4 / HAVE_HIT * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
sgemm_p = 64 * size;
|
|
kusano |
2b45e8 |
dgemm_p = 32 * size;
|
|
kusano |
2b45e8 |
cgemm_p = 32 * size;
|
|
kusano |
2b45e8 |
zgemm_p = 16 * size;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 16 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 8 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 8 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 4 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(CORE_NORTHWOOD)
|
|
kusano |
2b45e8 |
size >>= 7;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#ifdef ALLOC_HUGETLB
|
|
kusano |
2b45e8 |
sgemm_p = 128 * size;
|
|
kusano |
2b45e8 |
dgemm_p = 64 * size;
|
|
kusano |
2b45e8 |
cgemm_p = 64 * size;
|
|
kusano |
2b45e8 |
zgemm_p = 32 * size;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 32 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 16 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 16 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 8 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
sgemm_p = 96 * size;
|
|
kusano |
2b45e8 |
dgemm_p = 48 * size;
|
|
kusano |
2b45e8 |
cgemm_p = 48 * size;
|
|
kusano |
2b45e8 |
zgemm_p = 24 * size;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 24 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 12 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 12 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 6 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(CORE_CORE2)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
size >>= 9;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = 92 * size;
|
|
kusano |
2b45e8 |
dgemm_p = 46 * size;
|
|
kusano |
2b45e8 |
cgemm_p = 46 * size;
|
|
kusano |
2b45e8 |
zgemm_p = 23 * size;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 23 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 11 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 11 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 5 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(PENRYN)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
size >>= 9;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = 1024;
|
|
kusano |
2b45e8 |
dgemm_p = 512;
|
|
kusano |
2b45e8 |
cgemm_p = 512;
|
|
kusano |
2b45e8 |
zgemm_p = 256;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 256;
|
|
kusano |
2b45e8 |
xgemm_p = 128;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 21 * size + 4;
|
|
kusano |
2b45e8 |
xgemm_p = 10 * size + 2;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(DUNNINGTON)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
size >>= 9;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = 384;
|
|
kusano |
2b45e8 |
dgemm_p = 384;
|
|
kusano |
2b45e8 |
cgemm_p = 384;
|
|
kusano |
2b45e8 |
zgemm_p = 384;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 384;
|
|
kusano |
2b45e8 |
xgemm_p = 384;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 21 * size + 4;
|
|
kusano |
2b45e8 |
xgemm_p = 10 * size + 2;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(NEHALEM)
|
|
kusano |
2b45e8 |
sgemm_p = 1024;
|
|
kusano |
2b45e8 |
dgemm_p = 512;
|
|
kusano |
2b45e8 |
cgemm_p = 512;
|
|
kusano |
2b45e8 |
zgemm_p = 256;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 256;
|
|
kusano |
2b45e8 |
xgemm_p = 128;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(CORE_PRESCOTT) || defined(GENERIC)
|
|
kusano |
2b45e8 |
size >>= 6;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
if (size > 16) size = 16;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = 56 * size;
|
|
kusano |
2b45e8 |
dgemm_p = 28 * size;
|
|
kusano |
2b45e8 |
cgemm_p = 28 * size;
|
|
kusano |
2b45e8 |
zgemm_p = 14 * size;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 14 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 7 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 7 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 3 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(CORE_OPTERON)
|
|
kusano |
2b45e8 |
sgemm_p = 224 + 14 * (size >> 5);
|
|
kusano |
2b45e8 |
dgemm_p = 112 + 14 * (size >> 6);
|
|
kusano |
2b45e8 |
cgemm_p = 116 + 14 * (size >> 6);
|
|
kusano |
2b45e8 |
zgemm_p = 58 + 14 * (size >> 7);
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 58 + 14 * (size >> 7);
|
|
kusano |
2b45e8 |
xgemm_p = 29 + 14 * (size >> 8);
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 29 + 14 * (size >> 8);
|
|
kusano |
2b45e8 |
xgemm_p = 15 + 14 * (size >> 9);
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(ATOM)
|
|
kusano |
2b45e8 |
size >>= 8;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = 256;
|
|
kusano |
2b45e8 |
dgemm_p = 128;
|
|
kusano |
2b45e8 |
cgemm_p = 128;
|
|
kusano |
2b45e8 |
zgemm_p = 64;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 64;
|
|
kusano |
2b45e8 |
xgemm_p = 32;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 32;
|
|
kusano |
2b45e8 |
xgemm_p = 16;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(CORE_BARCELONA)
|
|
kusano |
2b45e8 |
size >>= 8;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = 232 * size;
|
|
kusano |
2b45e8 |
dgemm_p = 116 * size;
|
|
kusano |
2b45e8 |
cgemm_p = 116 * size;
|
|
kusano |
2b45e8 |
zgemm_p = 58 * size;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 58 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 26 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 26 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 13 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
p = getenv("GOTO_BLOCK_FACTOR");
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
if (p) {
|
|
kusano |
2b45e8 |
factor = atoi(p);
|
|
kusano |
2b45e8 |
if (factor < 10) factor = 10;
|
|
kusano |
2b45e8 |
if (factor > 200) factor = 200;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
|
|
kusano |
2b45e8 |
dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
|
|
kusano |
2b45e8 |
cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
|
|
kusano |
2b45e8 |
zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L;
|
|
kusano |
2b45e8 |
xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
if (sgemm_p == 0) sgemm_p = 64;
|
|
kusano |
2b45e8 |
if (dgemm_p == 0) dgemm_p = 64;
|
|
kusano |
2b45e8 |
if (cgemm_p == 0) cgemm_p = 64;
|
|
kusano |
2b45e8 |
if (zgemm_p == 0) zgemm_p = 64;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
if (qgemm_p == 0) qgemm_p = 64;
|
|
kusano |
2b45e8 |
if (xgemm_p == 0) xgemm_p = 64;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
if (qgemm_p == 0) qgemm_p = 64;
|
|
kusano |
2b45e8 |
if (xgemm_p == 0) xgemm_p = 64;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = (sgemm_p + SGEMM_UNROLL_M - 1) & ~(SGEMM_UNROLL_M - 1);
|
|
kusano |
2b45e8 |
dgemm_p = (dgemm_p + DGEMM_UNROLL_M - 1) & ~(DGEMM_UNROLL_M - 1);
|
|
kusano |
2b45e8 |
cgemm_p = (cgemm_p + CGEMM_UNROLL_M - 1) & ~(CGEMM_UNROLL_M - 1);
|
|
kusano |
2b45e8 |
zgemm_p = (zgemm_p + ZGEMM_UNROLL_M - 1) & ~(ZGEMM_UNROLL_M - 1);
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = (qgemm_p + QGEMM_UNROLL_M - 1) & ~(QGEMM_UNROLL_M - 1);
|
|
kusano |
2b45e8 |
xgemm_p = (xgemm_p + XGEMM_UNROLL_M - 1) & ~(XGEMM_UNROLL_M - 1);
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
|
kusano |
2b45e8 |
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
|
kusano |
2b45e8 |
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
|
kusano |
2b45e8 |
zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
|
|
kusano |
2b45e8 |
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
|
|
kusano |
2b45e8 |
qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
|
|
kusano |
2b45e8 |
xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if 0
|
|
kusano |
2b45e8 |
fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R);
|
|
kusano |
2b45e8 |
fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R);
|
|
kusano |
2b45e8 |
fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R);
|
|
kusano |
2b45e8 |
fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R);
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
return;
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if 0
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * Debug helper: print the APIC id and core count of the current CPU to
 * stderr.  Always returns 0.
 *
 * The CPUID queries are only compiled for CORE_PRESCOTT / CORE_OPTERON
 * builds.  All locals are initialised so that other builds print defined
 * values (APICID = 0, one core) instead of reading indeterminate ones.
 */
int get_current_cpu_info(void){

  int nlprocs   = 0;
  int ncores    = 0;
  int cmplegacy = 0;
  int htt       = 0;
  int apicid    = 0;

#if defined(CORE_PRESCOTT) || defined(CORE_OPTERON)
  int eax, ebx, ecx, edx;

  cpuid(1, &eax, &ebx, &ecx, &edx);
  nlprocs = BITMASK(ebx, 16, 0xff);
  apicid  = BITMASK(ebx, 24, 0xff);
  htt     = BITMASK(edx, 28, 0x01);
#endif

#if defined(CORE_PRESCOTT)
  cpuid(4, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(eax, 26, 0x3f);

  if (htt == 0)  nlprocs = 0;
#endif

#if defined(CORE_OPTERON)
  cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(ecx,  0, 0xff);

  cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
  cmplegacy = BITMASK(ecx,  1, 0x01);

  if (htt == 0) {
    nlprocs = 0;
    ncores = 0;
    cmplegacy = 0;
  }
#endif

  /* The value read above is incremented before printing, i.e. it is
     treated as "number of cores minus one". */
  ncores ++;

  /* nlprocs, cmplegacy and htt are gathered but not reported. */
  (void)nlprocs;
  (void)cmplegacy;
  (void)htt;

  fprintf(stderr, "APICID = %d  Number of core = %d\n", apicid, ncores);

  return 0;
}
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(ARCH_IA64)
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/*
 * IA64: read the CPUID register selected by regnum and return its value.
 * Uses inline assembly unless __ECC is defined, in which case the
 * __getIndReg() intrinsic is used instead.
 */
static inline BLASULONG cpuid(BLASULONG regnum){
  BLASULONG value;

#ifndef __ECC
  asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum));
#else
  value = __getIndReg(_IA64_REG_INDR_CPUID, regnum);
#endif

  return value;
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if 1
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
void blas_set_parameter(void){
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
BLASULONG cpuid3, size;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
cpuid3 = cpuid(3);
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
size = BITMASK(cpuid3, 16, 0xff);
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = 192 * (size + 1);
|
|
kusano |
2b45e8 |
dgemm_p = 96 * (size + 1);
|
|
kusano |
2b45e8 |
cgemm_p = 96 * (size + 1);
|
|
kusano |
2b45e8 |
zgemm_p = 48 * (size + 1);
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 64 * (size + 1);
|
|
kusano |
2b45e8 |
xgemm_p = 32 * (size + 1);
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
#ifdef QUAD_PRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 32 * (size + 1);
|
|
kusano |
2b45e8 |
xgemm_p = 16 * (size + 1);
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
|
kusano |
2b45e8 |
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
|
kusano |
2b45e8 |
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
|
kusano |
2b45e8 |
zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
|
|
kusano |
2b45e8 |
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
|
|
kusano |
2b45e8 |
qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
|
|
kusano |
2b45e8 |
xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
return;
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#else
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size"
|
|
kusano |
2b45e8 |
#define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
void blas_set_parameter(void){
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
BLASULONG cpuid3;
|
|
kusano |
2b45e8 |
int size = 0;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if 1
|
|
kusano |
2b45e8 |
char buffer[128];
|
|
kusano |
2b45e8 |
FILE *infile;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) {
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
fgets(buffer, sizeof(buffer), infile);
|
|
kusano |
2b45e8 |
fclose(infile);
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
size = atoi(buffer) / 1536;
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
if (size <= 0) {
|
|
kusano |
2b45e8 |
if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
while(fgets(buffer, sizeof(buffer), infile) != NULL) {
|
|
kusano |
2b45e8 |
if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
fgets(buffer, sizeof(buffer), infile);
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
fclose(infile);
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
*strstr(buffer, "bytes") = (char)NULL;
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
size = atoi(strchr(buffer, ':') + 1) / 1572864;
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
/* The last resort */
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
if (size <= 0) {
|
|
kusano |
2b45e8 |
cpuid3 = cpuid(3);
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
size = BITMASK(cpuid3, 16, 0xff) + 1;
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_p = 320 * size;
|
|
kusano |
2b45e8 |
dgemm_p = 160 * size;
|
|
kusano |
2b45e8 |
cgemm_p = 160 * size;
|
|
kusano |
2b45e8 |
zgemm_p = 80 * size;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_p = 80 * size;
|
|
kusano |
2b45e8 |
xgemm_p = 40 * size;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
|
kusano |
2b45e8 |
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
|
kusano |
2b45e8 |
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
|
kusano |
2b45e8 |
zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
|
|
kusano |
2b45e8 |
#ifdef EXPRECISION
|
|
kusano |
2b45e8 |
qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
|
|
kusano |
2b45e8 |
xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
return;
|
|
kusano |
2b45e8 |
}
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#endif
|
|
kusano |
2b45e8 |
|
|
kusano |
2b45e8 |
#if defined(ARCH_MIPS64)
|
|
kusano |
2b45e8 |
/*
 * MIPS64: set dgemm_r for Loongson 3A / 3B builds.
 *
 * Without SMP the "few threads" value is always used.  With SMP the value
 * depends on blas_num_threads.  For any other MIPS64 target the function
 * does nothing.
 */
void blas_set_parameter(void){
#if defined(LOONGSON3A)
#ifdef SMP
  if(blas_num_threads == 1){
#endif
    /* single thread (also the non-SMP build) */
    dgemm_r = 1024;
#ifdef SMP
  }else{
    /* multi thread */
    dgemm_r = 200;
  }
#endif
#endif

#if defined(LOONGSON3B)
#ifdef SMP
  if(blas_num_threads == 1 || blas_num_threads == 2){
#endif
    /* one or two threads (also the non-SMP build) */
    dgemm_r = 640;
#ifdef SMP
  }else{
    /* more than two threads */
    dgemm_r = 160;
  }
#endif
#endif

}
|
|
kusano |
2b45e8 |
#endif
|