/* Blob Blame Raw */
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#include <stdio.h>
#include <string.h>
#include "common.h"

#ifdef BUILD_KERNEL
#include "kernelTS.h"
#endif

#undef DEBUG

/* Forward declaration: the table below stores a pointer to this function,
   which is defined at the end of the file. */
static void init_parameter(void);

/*
 * TABLE_NAME is the per-architecture dispatch table: a positional
 * initializer for gotoblas_t that lists tuning constants followed by the
 * kernel entry points for each precision.
 *
 * NOTE(review): the initializer is purely positional, so the order of the
 * entries (and of every #if/#else arm) must match the member order of
 * gotoblas_t exactly.  That struct is declared outside this file (presumably
 * via common.h) -- confirm against its definition before reordering, adding
 * or removing anything here.
 *
 * The "TS" suffix on the kernel names is presumably a token-pasting macro
 * supplied by kernelTS.h -- confirm.
 *
 * Each "0, 0, 0" triple is presumably the <x>gemm_p / <x>gemm_q / <x>gemm_r
 * blocking fields, which init_parameter() fills in at run time.
 */
gotoblas_t TABLE_NAME = {
  /* Global tuning constants (presumably dtb_entries, then offsetA, offsetB,
     align -- init_parameter() reads TABLE_NAME.offsetA and TABLE_NAME.align). */
  DTB_DEFAULT_ENTRIES ,

  GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,

  /* ---- s* kernels (single precision real, by BLAS naming) ---- */
  0, 0, 0,
  SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N),
  /* Flag: 1 when the build defines HAVE_EXCLUSIVE_CACHE, 0 otherwise. */
#ifdef HAVE_EXCLUSIVE_CACHE
  1,
#else
  0,
#endif

  samax_kTS,  samin_kTS,  smax_kTS,  smin_kTS,
  isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
  snrm2_kTS,  sasum_kTS,  scopy_kTS, sdot_kTS,
  dsdot_kTS,
  srot_kTS,   saxpy_kTS,  sscal_kTS, sswap_kTS,
  sgemv_nTS,  sgemv_tTS, sger_kTS,
  ssymv_LTS, ssymv_UTS,

  sgemm_kernelTS, sgemm_betaTS, 
  /* When UNROLL_M == UNROLL_N the "o*" copy routines are listed in the "i*"
     slots as well; otherwise dedicated "i*" routines are used.  The same
     pattern repeats for every copy-routine group in this table. */
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
  sgemm_incopyTS, sgemm_itcopyTS, 
#else
  sgemm_oncopyTS, sgemm_otcopyTS,
#endif
  sgemm_oncopyTS, sgemm_otcopyTS,
  strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS,
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
  strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS,
  strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS,
#else
  strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS,
  strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS,
#endif
  strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS,
  strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS,
  strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS,
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
  strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS,
  strmm_ilnucopyTS, strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS,
#else
  strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
  strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
#endif
  strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
  strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
  ssymm_iutcopyTS, ssymm_iltcopyTS, 
#else
  ssymm_outcopyTS, ssymm_oltcopyTS,
#endif
  ssymm_outcopyTS, ssymm_oltcopyTS,

  /* LAPACK helper slots; NULL when the build defines NO_LAPACK. */
#ifndef NO_LAPACK
  sneg_tcopyTS, slaswp_ncopyTS,
#else
  NULL,NULL,
#endif

  /* ---- d* kernels (double precision real, by BLAS naming) ---- */
  0, 0, 0,
  DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),

  damax_kTS,  damin_kTS,  dmax_kTS,  dmin_kTS,
  idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
  dnrm2_kTS,  dasum_kTS,  dcopy_kTS, ddot_kTS,
  drot_kTS,   daxpy_kTS,  dscal_kTS, dswap_kTS,
  dgemv_nTS,  dgemv_tTS,  dger_kTS,
  dsymv_LTS,  dsymv_UTS,

  dgemm_kernelTS, dgemm_betaTS, 
#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
  dgemm_incopyTS, dgemm_itcopyTS, 
#else
  dgemm_oncopyTS, dgemm_otcopyTS,
#endif
  dgemm_oncopyTS, dgemm_otcopyTS,
  dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS,
#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
  dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS,
  dtrsm_ilnucopyTS, dtrsm_ilnncopyTS, dtrsm_iltucopyTS, dtrsm_iltncopyTS,
#else
  dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS,
  dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS,
#endif
  dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS,
  dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS,
  dtrmm_kernel_RNTS, dtrmm_kernel_RTTS, dtrmm_kernel_LNTS, dtrmm_kernel_LTTS,
#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
  dtrmm_iunucopyTS, dtrmm_iunncopyTS, dtrmm_iutucopyTS, dtrmm_iutncopyTS,
  dtrmm_ilnucopyTS, dtrmm_ilnncopyTS, dtrmm_iltucopyTS, dtrmm_iltncopyTS,
#else
  dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
  dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
#endif
  dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
  dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
  dsymm_iutcopyTS, dsymm_iltcopyTS, 
#else
  dsymm_outcopyTS, dsymm_oltcopyTS,
#endif
  dsymm_outcopyTS, dsymm_oltcopyTS,

#ifndef NO_LAPACK
  dneg_tcopyTS, dlaswp_ncopyTS,
#else
  NULL, NULL,
#endif

  /* ---- q* kernels: present only when EXPRECISION is defined ---- */
#ifdef EXPRECISION

  0, 0, 0,
  QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),

  qamax_kTS,  qamin_kTS,  qmax_kTS,  qmin_kTS,
  iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
  qnrm2_kTS,  qasum_kTS,  qcopy_kTS, qdot_kTS,
  qrot_kTS,   qaxpy_kTS,  qscal_kTS, qswap_kTS,
  qgemv_nTS,  qgemv_tTS,  qger_kTS,
  qsymv_LTS,  qsymv_UTS,

  qgemm_kernelTS, qgemm_betaTS, 
#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
  qgemm_incopyTS, qgemm_itcopyTS, 
#else
  qgemm_oncopyTS, qgemm_otcopyTS,
#endif
  qgemm_oncopyTS, qgemm_otcopyTS,
  qtrsm_kernel_LNTS, qtrsm_kernel_LTTS, qtrsm_kernel_RNTS, qtrsm_kernel_RTTS,
#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
  qtrsm_iunucopyTS, qtrsm_iunncopyTS, qtrsm_iutucopyTS, qtrsm_iutncopyTS,
  qtrsm_ilnucopyTS, qtrsm_ilnncopyTS, qtrsm_iltucopyTS, qtrsm_iltncopyTS,
#else
  qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS,
  qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS,
#endif
  qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS,
  qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS,
  qtrmm_kernel_RNTS, qtrmm_kernel_RTTS, qtrmm_kernel_LNTS, qtrmm_kernel_LTTS,
#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
  qtrmm_iunucopyTS, qtrmm_iunncopyTS, qtrmm_iutucopyTS, qtrmm_iutncopyTS,
  qtrmm_ilnucopyTS, qtrmm_ilnncopyTS, qtrmm_iltucopyTS, qtrmm_iltncopyTS,
#else
  qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
  qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
#endif
  qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
  qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
  qsymm_iutcopyTS, qsymm_iltcopyTS, 
#else
  qsymm_outcopyTS, qsymm_oltcopyTS,
#endif
  qsymm_outcopyTS, qsymm_oltcopyTS,

#ifndef NO_LAPACK
  qneg_tcopyTS, qlaswp_ncopyTS,
#else
  NULL, NULL,
#endif

#endif

  /* ---- c* kernels (single precision complex, by BLAS naming) ---- */
  0, 0, 0,
  CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N),

  camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
  cnrm2_kTS, casum_kTS, ccopy_kTS,
  cdotu_kTS, cdotc_kTS, csrot_kTS,
  caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, 

  cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, 
  cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS, 
  cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS, 
  csymv_LTS, csymv_UTS,
  chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS,
  
  cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS,
  cgemm_betaTS,

#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  cgemm_incopyTS, cgemm_itcopyTS,
#else
  cgemm_oncopyTS, cgemm_otcopyTS,
#endif
  cgemm_oncopyTS, cgemm_otcopyTS,
  
  ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS,
  ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS,
  
#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  ctrsm_iunucopyTS,  ctrsm_iunncopyTS,  ctrsm_iutucopyTS,  ctrsm_iutncopyTS,
  ctrsm_ilnucopyTS,  ctrsm_ilnncopyTS,  ctrsm_iltucopyTS,  ctrsm_iltncopyTS,
#else
  ctrsm_ounucopyTS,  ctrsm_ounncopyTS,  ctrsm_outucopyTS,  ctrsm_outncopyTS,
  ctrsm_olnucopyTS,  ctrsm_olnncopyTS,  ctrsm_oltucopyTS,  ctrsm_oltncopyTS,
#endif
  ctrsm_ounucopyTS,  ctrsm_ounncopyTS,  ctrsm_outucopyTS,  ctrsm_outncopyTS,
  ctrsm_olnucopyTS,  ctrsm_olnncopyTS,  ctrsm_oltucopyTS,  ctrsm_oltncopyTS,
  
  ctrmm_kernel_RNTS,  ctrmm_kernel_RTTS,  ctrmm_kernel_RRTS,  ctrmm_kernel_RCTS,
  ctrmm_kernel_LNTS,  ctrmm_kernel_LTTS,  ctrmm_kernel_LRTS,  ctrmm_kernel_LCTS,
  
#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  ctrmm_iunucopyTS,  ctrmm_iunncopyTS,  ctrmm_iutucopyTS,  ctrmm_iutncopyTS,
  ctrmm_ilnucopyTS,  ctrmm_ilnncopyTS,  ctrmm_iltucopyTS,  ctrmm_iltncopyTS,
#else
  ctrmm_ounucopyTS,  ctrmm_ounncopyTS,  ctrmm_outucopyTS,  ctrmm_outncopyTS,
  ctrmm_olnucopyTS,  ctrmm_olnncopyTS,  ctrmm_oltucopyTS,  ctrmm_oltncopyTS,
#endif
  ctrmm_ounucopyTS,  ctrmm_ounncopyTS,  ctrmm_outucopyTS,  ctrmm_outncopyTS,
  ctrmm_olnucopyTS,  ctrmm_olnncopyTS,  ctrmm_oltucopyTS,  ctrmm_oltncopyTS,
  
#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  csymm_iutcopyTS,  csymm_iltcopyTS,
#else
  csymm_outcopyTS,  csymm_oltcopyTS,
#endif
  csymm_outcopyTS,  csymm_oltcopyTS,
#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  chemm_iutcopyTS,  chemm_iltcopyTS,
#else
  chemm_outcopyTS,  chemm_oltcopyTS,
#endif
  chemm_outcopyTS,  chemm_oltcopyTS,
  
  /* "3m" GEMM variant: kernel plus its own copy routines
     (b / r / i suffixes), listed unconditionally. */
  cgemm3m_kernelTS,
  
  cgemm3m_incopybTS,  cgemm3m_incopyrTS,
  cgemm3m_incopyiTS,  cgemm3m_itcopybTS,
  cgemm3m_itcopyrTS,  cgemm3m_itcopyiTS,
  cgemm3m_oncopybTS,  cgemm3m_oncopyrTS,
  cgemm3m_oncopyiTS,  cgemm3m_otcopybTS,
  cgemm3m_otcopyrTS,  cgemm3m_otcopyiTS,
  
  csymm3m_iucopybTS,  csymm3m_ilcopybTS,
  csymm3m_iucopyrTS,  csymm3m_ilcopyrTS,
  csymm3m_iucopyiTS,  csymm3m_ilcopyiTS,
  csymm3m_oucopybTS,  csymm3m_olcopybTS,
  csymm3m_oucopyrTS,  csymm3m_olcopyrTS,
  csymm3m_oucopyiTS,  csymm3m_olcopyiTS,

  chemm3m_iucopybTS,  chemm3m_ilcopybTS,
  chemm3m_iucopyrTS,  chemm3m_ilcopyrTS,
  chemm3m_iucopyiTS,  chemm3m_ilcopyiTS, 

  chemm3m_oucopybTS,  chemm3m_olcopybTS,
  chemm3m_oucopyrTS,  chemm3m_olcopyrTS,
  chemm3m_oucopyiTS,  chemm3m_olcopyiTS,

#ifndef NO_LAPACK
  cneg_tcopyTS, claswp_ncopyTS,
#else
  NULL, NULL,
#endif

  /* ---- z* kernels (double precision complex, by BLAS naming) ---- */
  0, 0, 0,
  ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N),

  zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
  znrm2_kTS, zasum_kTS, zcopy_kTS,
  zdotu_kTS, zdotc_kTS, zdrot_kTS,
  zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, 

  zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS, 
  zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS, 
  zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS, 
  zsymv_LTS, zsymv_UTS,
  zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS,

  zgemm_kernel_nTS, zgemm_kernel_lTS, zgemm_kernel_rTS, zgemm_kernel_bTS,
  zgemm_betaTS,

#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  zgemm_incopyTS, zgemm_itcopyTS,
#else
  zgemm_oncopyTS, zgemm_otcopyTS,
#endif
  zgemm_oncopyTS, zgemm_otcopyTS,
  
  ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS,
  ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS,
  
#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  ztrsm_iunucopyTS,  ztrsm_iunncopyTS,  ztrsm_iutucopyTS,  ztrsm_iutncopyTS,
  ztrsm_ilnucopyTS,  ztrsm_ilnncopyTS,  ztrsm_iltucopyTS,  ztrsm_iltncopyTS,
#else
  ztrsm_ounucopyTS,  ztrsm_ounncopyTS,  ztrsm_outucopyTS,  ztrsm_outncopyTS,
  ztrsm_olnucopyTS,  ztrsm_olnncopyTS,  ztrsm_oltucopyTS,  ztrsm_oltncopyTS,
#endif
  ztrsm_ounucopyTS,  ztrsm_ounncopyTS,  ztrsm_outucopyTS,  ztrsm_outncopyTS,
  ztrsm_olnucopyTS,  ztrsm_olnncopyTS,  ztrsm_oltucopyTS,  ztrsm_oltncopyTS,
  
  ztrmm_kernel_RNTS,  ztrmm_kernel_RTTS,  ztrmm_kernel_RRTS,  ztrmm_kernel_RCTS,
  ztrmm_kernel_LNTS,  ztrmm_kernel_LTTS,  ztrmm_kernel_LRTS,  ztrmm_kernel_LCTS,
  
#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  ztrmm_iunucopyTS,  ztrmm_iunncopyTS,  ztrmm_iutucopyTS,  ztrmm_iutncopyTS,
  ztrmm_ilnucopyTS,  ztrmm_ilnncopyTS,  ztrmm_iltucopyTS,  ztrmm_iltncopyTS,
#else
  ztrmm_ounucopyTS,  ztrmm_ounncopyTS,  ztrmm_outucopyTS,  ztrmm_outncopyTS,
  ztrmm_olnucopyTS,  ztrmm_olnncopyTS,  ztrmm_oltucopyTS,  ztrmm_oltncopyTS,
#endif
  ztrmm_ounucopyTS,  ztrmm_ounncopyTS,  ztrmm_outucopyTS,  ztrmm_outncopyTS,
  ztrmm_olnucopyTS,  ztrmm_olnncopyTS,  ztrmm_oltucopyTS,  ztrmm_oltncopyTS,
  
#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  zsymm_iutcopyTS,  zsymm_iltcopyTS,
#else
  zsymm_outcopyTS,  zsymm_oltcopyTS,
#endif
  zsymm_outcopyTS,  zsymm_oltcopyTS,
#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  zhemm_iutcopyTS,  zhemm_iltcopyTS,
#else
  zhemm_outcopyTS,  zhemm_oltcopyTS,
#endif
  zhemm_outcopyTS,  zhemm_oltcopyTS,
  
  zgemm3m_kernelTS,
  
  zgemm3m_incopybTS,  zgemm3m_incopyrTS,
  zgemm3m_incopyiTS,  zgemm3m_itcopybTS,
  zgemm3m_itcopyrTS,  zgemm3m_itcopyiTS,
  zgemm3m_oncopybTS,  zgemm3m_oncopyrTS,
  zgemm3m_oncopyiTS,  zgemm3m_otcopybTS,
  zgemm3m_otcopyrTS,  zgemm3m_otcopyiTS,
  
  zsymm3m_iucopybTS,  zsymm3m_ilcopybTS,
  zsymm3m_iucopyrTS,  zsymm3m_ilcopyrTS,
  zsymm3m_iucopyiTS,  zsymm3m_ilcopyiTS,
  zsymm3m_oucopybTS,  zsymm3m_olcopybTS,
  zsymm3m_oucopyrTS,  zsymm3m_olcopyrTS,
  zsymm3m_oucopyiTS,  zsymm3m_olcopyiTS,

  zhemm3m_iucopybTS,  zhemm3m_ilcopybTS,
  zhemm3m_iucopyrTS,  zhemm3m_ilcopyrTS,
  zhemm3m_iucopyiTS,  zhemm3m_ilcopyiTS, 

  zhemm3m_oucopybTS,  zhemm3m_olcopybTS,
  zhemm3m_oucopyrTS,  zhemm3m_olcopyrTS,
  zhemm3m_oucopyiTS,  zhemm3m_olcopyiTS,

#ifndef NO_LAPACK
  zneg_tcopyTS, zlaswp_ncopyTS,
#else
  NULL, NULL,
#endif

  /* ---- x* kernels: present only when EXPRECISION is defined ---- */
#ifdef EXPRECISION

  0, 0, 0,
  XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),

  xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
  xnrm2_kTS, xasum_kTS, xcopy_kTS,
  xdotu_kTS, xdotc_kTS, xqrot_kTS,
  xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, 

  xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS, 
  xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS, 
  xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS, 
  xsymv_LTS, xsymv_UTS,
  xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS,

  xgemm_kernel_nTS, xgemm_kernel_lTS, xgemm_kernel_rTS, xgemm_kernel_bTS,
  xgemm_betaTS,

#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  xgemm_incopyTS, xgemm_itcopyTS,
#else
  xgemm_oncopyTS, xgemm_otcopyTS,
#endif
  xgemm_oncopyTS, xgemm_otcopyTS,
  
  xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS,
  xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS,
  
#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  xtrsm_iunucopyTS,  xtrsm_iunncopyTS,  xtrsm_iutucopyTS,  xtrsm_iutncopyTS,
  xtrsm_ilnucopyTS,  xtrsm_ilnncopyTS,  xtrsm_iltucopyTS,  xtrsm_iltncopyTS,
#else
  xtrsm_ounucopyTS,  xtrsm_ounncopyTS,  xtrsm_outucopyTS,  xtrsm_outncopyTS,
  xtrsm_olnucopyTS,  xtrsm_olnncopyTS,  xtrsm_oltucopyTS,  xtrsm_oltncopyTS,
#endif
  xtrsm_ounucopyTS,  xtrsm_ounncopyTS,  xtrsm_outucopyTS,  xtrsm_outncopyTS,
  xtrsm_olnucopyTS,  xtrsm_olnncopyTS,  xtrsm_oltucopyTS,  xtrsm_oltncopyTS,
  
  xtrmm_kernel_RNTS,  xtrmm_kernel_RTTS,  xtrmm_kernel_RRTS,  xtrmm_kernel_RCTS,
  xtrmm_kernel_LNTS,  xtrmm_kernel_LTTS,  xtrmm_kernel_LRTS,  xtrmm_kernel_LCTS,
  
#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  xtrmm_iunucopyTS,  xtrmm_iunncopyTS,  xtrmm_iutucopyTS,  xtrmm_iutncopyTS,
  xtrmm_ilnucopyTS,  xtrmm_ilnncopyTS,  xtrmm_iltucopyTS,  xtrmm_iltncopyTS,
#else
  xtrmm_ounucopyTS,  xtrmm_ounncopyTS,  xtrmm_outucopyTS,  xtrmm_outncopyTS,
  xtrmm_olnucopyTS,  xtrmm_olnncopyTS,  xtrmm_oltucopyTS,  xtrmm_oltncopyTS,
#endif
  xtrmm_ounucopyTS,  xtrmm_ounncopyTS,  xtrmm_outucopyTS,  xtrmm_outncopyTS,
  xtrmm_olnucopyTS,  xtrmm_olnncopyTS,  xtrmm_oltucopyTS,  xtrmm_oltncopyTS,
  
#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  xsymm_iutcopyTS,  xsymm_iltcopyTS,
#else
  xsymm_outcopyTS,  xsymm_oltcopyTS,
#endif
  xsymm_outcopyTS,  xsymm_oltcopyTS,
#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  xhemm_iutcopyTS,  xhemm_iltcopyTS,
#else
  xhemm_outcopyTS,  xhemm_oltcopyTS,
#endif
  xhemm_outcopyTS,  xhemm_oltcopyTS,
  
  xgemm3m_kernelTS,
  
  xgemm3m_incopybTS,  xgemm3m_incopyrTS,
  xgemm3m_incopyiTS,  xgemm3m_itcopybTS,
  xgemm3m_itcopyrTS,  xgemm3m_itcopyiTS,
  xgemm3m_oncopybTS,  xgemm3m_oncopyrTS,
  xgemm3m_oncopyiTS,  xgemm3m_otcopybTS,
  xgemm3m_otcopyrTS,  xgemm3m_otcopyiTS,
  
  xsymm3m_iucopybTS,  xsymm3m_ilcopybTS,
  xsymm3m_iucopyrTS,  xsymm3m_ilcopyrTS,
  xsymm3m_iucopyiTS,  xsymm3m_ilcopyiTS,
  xsymm3m_oucopybTS,  xsymm3m_olcopybTS,
  xsymm3m_oucopyrTS,  xsymm3m_olcopyrTS,
  xsymm3m_oucopyiTS,  xsymm3m_olcopyiTS,

  xhemm3m_iucopybTS,  xhemm3m_ilcopybTS,
  xhemm3m_iucopyrTS,  xhemm3m_ilcopyrTS,
  xhemm3m_iucopyiTS,  xhemm3m_ilcopyiTS, 

  xhemm3m_oucopybTS,  xhemm3m_olcopybTS,
  xhemm3m_oucopyrTS,  xhemm3m_olcopyrTS,
  xhemm3m_oucopyiTS,  xhemm3m_olcopyiTS,

#ifndef NO_LAPACK
  xneg_tcopyTS, xlaswp_ncopyTS,
#else
  NULL, NULL,
#endif

#endif

  /* Run-time initializer that fills in the p/q/r blocking fields above. */
  init_parameter,

  SNUMOPT, DNUMOPT, QNUMOPT,

};

#ifdef ARCH_X86
/*
 * Legacy L2 cache-size probe based on the CPUID leaf 2 cache descriptors.
 *
 * Executes CPUID leaf 2, splits the four result registers into one-byte
 * descriptors (the low byte of EAX is not a descriptor and is skipped) and
 * returns the size associated with the first descriptor found in the table
 * below.  The values are cache sizes in KB.
 *
 * Returns 0 when no known L2 descriptor is reported.
 */
static int get_l2_size_old(void){
  int i, eax, ebx, ecx, edx;
  int info[15];

  cpuid(2, &eax, &ebx, &ecx, &edx);

  /* Per the Intel SDM, a register whose bit 31 is set carries no valid
     descriptors; clear it so its bytes read as the null descriptor 0x00. */
  if ((unsigned int)eax & 0x80000000u) eax = 0;
  if ((unsigned int)ebx & 0x80000000u) ebx = 0;
  if ((unsigned int)ecx & 0x80000000u) ecx = 0;
  if ((unsigned int)edx & 0x80000000u) edx = 0;

  /* EAX bits 7:0 are the leaf-2 iteration count, not a descriptor. */
  info[ 0] = BITMASK(eax,  8, 0xff);
  info[ 1] = BITMASK(eax, 16, 0xff);
  info[ 2] = BITMASK(eax, 24, 0xff);

  info[ 3] = BITMASK(ebx,  0, 0xff);
  info[ 4] = BITMASK(ebx,  8, 0xff);
  info[ 5] = BITMASK(ebx, 16, 0xff);
  info[ 6] = BITMASK(ebx, 24, 0xff);

  info[ 7] = BITMASK(ecx,  0, 0xff);
  info[ 8] = BITMASK(ecx,  8, 0xff);
  info[ 9] = BITMASK(ecx, 16, 0xff);
  info[10] = BITMASK(ecx, 24, 0xff);

  info[11] = BITMASK(edx,  0, 0xff);
  info[12] = BITMASK(edx,  8, 0xff);
  info[13] = BITMASK(edx, 16, 0xff);
  info[14] = BITMASK(edx, 24, 0xff);

  for (i = 0; i < 15; i++){

    switch (info[i]){

      /* This table is from http://www.sandpile.org/ia32/cpuid.htm */

    case 0x1a :
      return 96;

    case 0x39 :
    case 0x3b :
    case 0x41 :
    case 0x79 :
    case 0x81 :
      return 128;

    case 0x3a :
      return 192;

    case 0x21 :
    case 0x3c :
    case 0x42 :
    case 0x7a :
    case 0x7e :
    case 0x82 :
      return 256;

    case 0x3d :
      return 384;

    case 0x3e :
    case 0x43 :
    case 0x7b :
    case 0x7f :
    case 0x83 :
    case 0x86 :
      return 512;

    case 0x44 :
    case 0x78 :
    case 0x7c :
    case 0x84 :
    case 0x87 :
      return 1024;

    case 0x45 :
    case 0x7d :
    case 0x85 :
      return 2048;

    case 0x48 :
      /* 3 MB L2 = 3072 KB (was mistyped as 3184). */
      return 3072;

    case 0x49 :
      return 4096;

    case 0x4e :
      return 6144;

    default :
      /* Not an L2 descriptor we know about; keep scanning. */
      break;
    }
  }
  return 0;
}
#endif

/*
 * Query the L2 cache size through extended CPUID leaf 0x80000006.
 *
 * The value returned is the 16-bit field taken from ECX starting at bit 16
 * (the CPUID specification defines that field as the L2 size in KB;
 * BITMASK is presumably shift-then-mask -- it is defined outside this file).
 *
 * On ARCH_X86 builds a non-positive result falls back to the descriptor
 * based probe get_l2_size_old(); other builds return the field unchanged.
 */
static __inline__ int get_l2_size(void){

  int eax, ebx, ecx, edx;
  int size_kb;

  cpuid(0x80000006, &eax, &ebx, &ecx, &edx);

  size_kb = BITMASK(ecx, 16, 0xffff);

#ifdef ARCH_X86
  /* Extended leaf reported nothing usable: use the legacy leaf-2 lookup. */
  if (size_kb <= 0) size_kb = get_l2_size_old();
#endif

  return size_kb;
}

/*
 * Query the L3 cache size through extended CPUID leaf 0x80000006.
 *
 * Takes the 14-bit field of EDX starting at bit 18 and multiplies it by 512
 * (the CPUID specification expresses that field in 512 KB units, so the
 * result is presumably in KB -- BITMASK itself is defined outside this file).
 */
static __inline__ int get_l3_size(void){

  int eax, ebx, ecx, edx;
  int units;

  cpuid(0x80000006, &eax, &ebx, &ecx, &edx);

  units = BITMASK(edx, 18, 0x3fff);

  return units * 512;
}


/*
 * Fill in the GEMM blocking parameters (p, q, r) of TABLE_NAME.
 *
 *   q : taken from the compile-time <X>GEMM_DEFAULT_Q constants.
 *   p : chosen per target core (selected by the CORE_xxx / core-name macros);
 *       either scaled from the detected L2 size or taken from
 *       <X>GEMM_DEFAULT_P, then rounded up to a multiple of
 *       <X>GEMM_DEFAULT_UNROLL_M.
 *   r : derived from what is left of BUFFER_SIZE after the p x q panel.
 *
 * NOTE(review): if none of the core macros below is defined, the p fields
 * are left at whatever the static initializer gave them -- confirm the build
 * always defines exactly one.
 */
static void init_parameter(void) {

  /* L2 size as reported by get_l2_size() (KB, going by the lookup table in
     get_l2_size_old()). */
  int l2 = get_l2_size();

  TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
  TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
  TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
  TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
#ifdef EXPRECISION
  TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
  TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
#endif

  /* In the L2-scaled variants below, (l2 >> 7) is the L2 size in units of
     128 and (l2 >> 9) in units of 512. */

#if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH)

#ifdef DEBUG
  fprintf(stderr, "Katmai, Coppermine, Banias\n");
#endif

  TABLE_NAME.sgemm_p =  64 * (l2 >> 7);
  TABLE_NAME.dgemm_p =  32 * (l2 >> 7);
  TABLE_NAME.cgemm_p =  32 * (l2 >> 7);
  TABLE_NAME.zgemm_p =  16 * (l2 >> 7);
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p =  16 * (l2 >> 7);
  TABLE_NAME.xgemm_p =   8 * (l2 >> 7);
#endif
#endif

#ifdef CORE_NORTHWOOD

#ifdef DEBUG
  fprintf(stderr, "Northwood\n");
#endif

  TABLE_NAME.sgemm_p =  96 * (l2 >> 7);
  TABLE_NAME.dgemm_p =  48 * (l2 >> 7);
  TABLE_NAME.cgemm_p =  48 * (l2 >> 7);
  TABLE_NAME.zgemm_p =  24 * (l2 >> 7);
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p =  24 * (l2 >> 7);
  TABLE_NAME.xgemm_p =  12 * (l2 >> 7);
#endif
#endif

#ifdef ATOM

#ifdef DEBUG
  fprintf(stderr, "Atom\n");
#endif

  TABLE_NAME.sgemm_p = 256;
  TABLE_NAME.dgemm_p = 128;
  TABLE_NAME.cgemm_p = 128;
  TABLE_NAME.zgemm_p =  64;
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p =  64;
  TABLE_NAME.xgemm_p =  32;
#endif
#endif

#ifdef CORE_PRESCOTT

#ifdef DEBUG
  fprintf(stderr, "Prescott\n");
#endif

  TABLE_NAME.sgemm_p =  56 * (l2 >> 7);
  TABLE_NAME.dgemm_p =  28 * (l2 >> 7);
  TABLE_NAME.cgemm_p =  28 * (l2 >> 7);
  TABLE_NAME.zgemm_p =  14 * (l2 >> 7);
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p =  14 * (l2 >> 7);
  TABLE_NAME.xgemm_p =   7 * (l2 >> 7);
#endif
#endif

#ifdef CORE2

#ifdef DEBUG
  fprintf(stderr, "Core2\n");
#endif

  TABLE_NAME.sgemm_p =  92 * (l2 >> 9);
  TABLE_NAME.dgemm_p =  46 * (l2 >> 9);
  TABLE_NAME.cgemm_p =  46 * (l2 >> 9);
  TABLE_NAME.zgemm_p =  23 * (l2 >> 9);
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p =  92 * (l2 >> 9);
  TABLE_NAME.xgemm_p =  46 * (l2 >> 9);
#endif
#endif

#ifdef PENRYN

#ifdef DEBUG
  fprintf(stderr, "Penryn\n");
#endif

  TABLE_NAME.sgemm_p =  42 * (l2 >> 9) + 8;
  TABLE_NAME.dgemm_p =  42 * (l2 >> 9) + 8;
  TABLE_NAME.cgemm_p =  21 * (l2 >> 9) + 4;
  TABLE_NAME.zgemm_p =  21 * (l2 >> 9) + 4;
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p =  42 * (l2 >> 9) + 8;
  TABLE_NAME.xgemm_p =  21 * (l2 >> 9) + 4;
#endif
#endif

#ifdef NEHALEM

#ifdef DEBUG
  fprintf(stderr, "Nehalem\n");
#endif

  TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif

#ifdef OPTERON

#ifdef DEBUG
  fprintf(stderr, "Opteron\n");
#endif

  TABLE_NAME.sgemm_p = 224 +  56 * (l2 >> 7);
  TABLE_NAME.dgemm_p = 112 +  28 * (l2 >> 7);
  TABLE_NAME.cgemm_p = 112 +  28 * (l2 >> 7);
  TABLE_NAME.zgemm_p =  56 +  14 * (l2 >> 7);
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p =  56 +  14 * (l2 >> 7);
  TABLE_NAME.xgemm_p =  28 +   7 * (l2 >> 7);
#endif
#endif

#ifdef BARCELONA

#ifdef DEBUG
  fprintf(stderr, "Barcelona\n");
#endif

  TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif

#ifdef NANO

#ifdef DEBUG
  fprintf(stderr, "NANO\n");
#endif

  TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif


  /* Round every p up to a multiple of its UNROLL_M.  Divide/multiply is used
     instead of "& ~(UNROLL_M - 1)" so the result is also correct when the
     unroll factor is not a power of two; for powers of two both forms give
     the same value. */
  TABLE_NAME.sgemm_p = ((TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1) / (SGEMM_DEFAULT_UNROLL_M)) * (SGEMM_DEFAULT_UNROLL_M);
  TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1) / (DGEMM_DEFAULT_UNROLL_M)) * (DGEMM_DEFAULT_UNROLL_M);
  TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1) / (CGEMM_DEFAULT_UNROLL_M)) * (CGEMM_DEFAULT_UNROLL_M);
  TABLE_NAME.zgemm_p = ((TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1) / (ZGEMM_DEFAULT_UNROLL_M)) * (ZGEMM_DEFAULT_UNROLL_M);
  /* Every other q/x statement in this function is guarded by EXPRECISION;
     the rounding used to be guarded by QUAD_PRECISION only, so it was
     skipped in EXPRECISION builds.  QUAD_PRECISION is still accepted so
     configurations that compiled this code before keep doing so. */
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
  TABLE_NAME.qgemm_p = ((TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1) / (QGEMM_DEFAULT_UNROLL_M)) * (QGEMM_DEFAULT_UNROLL_M);
  TABLE_NAME.xgemm_p = ((TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1) / (XGEMM_DEFAULT_UNROLL_M)) * (XGEMM_DEFAULT_UNROLL_M);
#endif

#ifdef DEBUG
  fprintf(stderr, "L2 = %8d DGEMM_P  .. %d\n", l2, TABLE_NAME.dgemm_p);
#endif

  /*
   * r = round_down_16( (BUFFER_SIZE - A_panel_bytes) / (q * elem_bytes) - 15 )
   *
   * where A_panel_bytes = (p * q * elem_bytes + offsetA + align) & ~align,
   * i.e. the p x q panel plus offsetA, rounded with the `align` mask.
   * elem_bytes is the literal 4 / 8 / 16 / 8 / 16 / 32 used for the
   * s / d / q / c / z / x variants respectively.
   */
  TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
                          ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q *  4 + TABLE_NAME.offsetA
                            + TABLE_NAME.align) & ~TABLE_NAME.align)
                          ) / (TABLE_NAME.sgemm_q *  4) - 15) & ~15);

  TABLE_NAME.dgemm_r = (((BUFFER_SIZE -
                          ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q *  8 + TABLE_NAME.offsetA
                            + TABLE_NAME.align) & ~TABLE_NAME.align)
                          ) / (TABLE_NAME.dgemm_q *  8) - 15) & ~15);

#ifdef EXPRECISION
  TABLE_NAME.qgemm_r = (((BUFFER_SIZE -
                          ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA
                            + TABLE_NAME.align) & ~TABLE_NAME.align)
                          ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15);
#endif

  TABLE_NAME.cgemm_r = (((BUFFER_SIZE -
                          ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q *  8 + TABLE_NAME.offsetA
                            + TABLE_NAME.align) & ~TABLE_NAME.align)
                          ) / (TABLE_NAME.cgemm_q *  8) - 15) & ~15);

  TABLE_NAME.zgemm_r = (((BUFFER_SIZE -
                          ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA
                            + TABLE_NAME.align) & ~TABLE_NAME.align)
                          ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15);

#ifdef EXPRECISION
  TABLE_NAME.xgemm_r = (((BUFFER_SIZE -
                          ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA
                            + TABLE_NAME.align) & ~TABLE_NAME.align)
                          ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15);
#endif

}