/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/* This implementation is completely wrong. I'll rewrite this */

#ifndef SYMCOPY_H
kusano 2b45e8
#define SYMCOPY_H
kusano 2b45e8
kusano 2b45e8
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
kusano 2b45e8
kusano 2b45e8
/*
 * SYMCOPY_L - expand a lower-stored symmetric matrix into a dense square one.
 *
 *   m   : order of the matrix; nothing is written when m <= 0.
 *   a   : source; element (i, j) lives at a[i + j * lda].  Only entries
 *         with i >= j are read.
 *   lda : distance, in FLOATs, between consecutive columns of a.
 *   b   : destination of m * m FLOATs; element (i, j) is written to
 *         b[i + j * m].
 *
 * On return b(i, j) == b(j, i) == a(max(i, j), min(i, j)) for all
 * 0 <= i, j < m.  a and b are expected not to overlap.
 *
 * Columns are handled two at a time in 2x2 blocks.  All addresses are
 * derived from indices, so no pointer is ever formed for a column that
 * does not exist (relevant when m is odd).
 */
static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;
  FLOAT a11, a12;
  FLOAT a21, a22;

  for (js = 0; js + 1 < m; js += 2){
    const FLOAT *src0 = a + js * lda;   /* column js     of a */
    const FLOAT *src1 = src0 + lda;     /* column js + 1 of a */
    FLOAT *dst0 = b + js * m;           /* column js     of b */
    FLOAT *dst1 = dst0 + m;             /* column js + 1 of b */

    /* 2x2 diagonal block: a(js, js + 1) is not read, a(js + 1, js) is
       stored on both sides of the diagonal. */
    a11 = src0[js];
    a21 = src0[js + 1];
    a22 = src1[js + 1];

    dst0[js]     = a11;
    dst0[js + 1] = a21;
    dst1[js]     = a21;
    dst1[js + 1] = a22;

    /* Full 2x2 blocks below the diagonal: stored as-is in columns
       js, js + 1 and transposed in columns is, is + 1. */
    for (is = js + 2; is + 1 < m; is += 2){
      FLOAT *mir0 = b + is * m + js;    /* b(js, is)     */
      FLOAT *mir1 = mir0 + m;           /* b(js, is + 1) */

      a11 = src0[is];
      a21 = src0[is + 1];
      a12 = src1[is];
      a22 = src1[is + 1];

      dst0[is]     = a11;
      dst0[is + 1] = a21;
      dst1[is]     = a12;
      dst1[is + 1] = a22;

      mir0[0] = a11;
      mir0[1] = a12;
      mir1[0] = a21;
      mir1[1] = a22;
    }

    /* m odd: a single row (is == m - 1) remains below the 2x2 blocks. */
    if (is < m){
      FLOAT *mir0 = b + is * m + js;    /* b(js, is) */

      a11 = src0[is];
      a12 = src1[is];

      dst0[is] = a11;
      dst1[is] = a12;

      mir0[0] = a11;
      mir0[1] = a12;
    }
  }

  /* m odd: the last column contributes only its diagonal entry. */
  if (js < m){
    b[js * m + js] = a[js * lda + js];
  }
}

/*
 * SYMCOPY_U - expand an upper-stored symmetric matrix into a dense square one.
 *
 *   m   : order of the matrix; nothing is written when m <= 0.
 *   a   : source; element (i, j) lives at a[i + j * lda].  Only entries
 *         with i <= j are read.
 *   lda : distance, in FLOATs, between consecutive columns of a.
 *   b   : destination of m * m FLOATs; element (i, j) is written to
 *         b[i + j * m].
 *
 * On return b(i, j) == b(j, i) == a(min(i, j), max(i, j)) for all
 * 0 <= i, j < m.  a and b are expected not to overlap.
 *
 * Columns are handled two at a time in 2x2 blocks; a trailing single
 * column (m odd) is handled separately so that no pointer to a
 * non-existent second column is formed.
 */
static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;
  FLOAT a11, a12;
  FLOAT a21, a22;

  for (js = 0; js + 1 < m; js += 2){
    const FLOAT *src0 = a + js * lda;   /* column js     of a */
    const FLOAT *src1 = src0 + lda;     /* column js + 1 of a */
    FLOAT *dst0 = b + js * m;           /* column js     of b */
    FLOAT *dst1 = dst0 + m;             /* column js + 1 of b */

    /* Full 2x2 blocks above the diagonal: stored as-is in columns
       js, js + 1 and transposed in columns is, is + 1. */
    for (is = 0; is < js; is += 2){
      FLOAT *mir0 = b + is * m + js;    /* b(js, is)     */
      FLOAT *mir1 = mir0 + m;           /* b(js, is + 1) */

      a11 = src0[is];
      a21 = src0[is + 1];
      a12 = src1[is];
      a22 = src1[is + 1];

      dst0[is]     = a11;
      dst0[is + 1] = a21;
      dst1[is]     = a12;
      dst1[is + 1] = a22;

      mir0[0] = a11;
      mir0[1] = a12;
      mir1[0] = a21;
      mir1[1] = a22;
    }

    /* 2x2 diagonal block: a(js + 1, js) is not read, a(js, js + 1) is
       stored on both sides of the diagonal. */
    a11 = src0[js];
    a12 = src1[js];
    a22 = src1[js + 1];

    dst0[js]     = a11;
    dst0[js + 1] = a12;
    dst1[js]     = a12;
    dst1[js + 1] = a22;
  }

  /* m odd: the last column is processed on its own. */
  if (js < m){
    const FLOAT *src0 = a + js * lda;   /* column js of a */
    FLOAT *dst0 = b + js * m;           /* column js of b */

    for (is = 0; is < js; is += 2){
      a11 = src0[is];
      a21 = src0[is + 1];

      dst0[is]     = a11;
      dst0[is + 1] = a21;

      b[is * m + js]       = a11;       /* b(js, is)     */
      b[(is + 1) * m + js] = a21;       /* b(js, is + 1) */
    }

    dst0[js] = src0[js];
  }
}

/*
 * ZSYMCOPY_L - two-component (complex) counterpart of SYMCOPY_L.
 *
 * Every matrix element occupies two consecutive FLOATs.
 *
 *   m   : order of the matrix; nothing is written when m <= 0.
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)].  Only
 *         entries with i >= j are read.
 *   lda : distance between consecutive columns of a, in elements
 *         (pairs of FLOATs).
 *   b   : destination of 2 * m * m FLOATs; element (i, j) starts at
 *         b[2 * (i + j * m)].
 *
 * On return b(i, j) == b(j, i) == a(max(i, j), min(i, j)), both
 * components copied unchanged.  a and b are expected not to overlap.
 *
 * Addresses are derived from indices; lda is not modified and no pointer
 * is formed for a column that does not exist.
 */
static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;
  BLASLONG astride = 2 * lda;   /* column stride of a in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b in FLOATs */
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  for (js = 0; js + 1 < m; js += 2){
    const FLOAT *acol0 = a + js * astride;   /* column js     of a */
    const FLOAT *acol1 = acol0 + astride;    /* column js + 1 of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js     of b */
    FLOAT *bcol1 = bcol0 + bstride;          /* column js + 1 of b */
    const FLOAT *s0, *s1;
    FLOAT *d0, *d1, *t0, *t1;

    /* 2x2 diagonal block: a(js, js + 1) is not read. */
    s0 = acol0 + 2 * js;
    s1 = acol1 + 2 * js;
    d0 = bcol0 + 2 * js;
    d1 = bcol1 + 2 * js;

    a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];
    a12 = s1[2]; a22 = s1[3];

    d0[0] = a11; d0[1] = a21; d0[2] = a31; d0[3] = a41;
    d1[0] = a31; d1[1] = a41; d1[2] = a12; d1[3] = a22;

    /* Full 2x2 blocks below the diagonal, stored directly (d0, d1)
       and transposed (t0, t1). */
    for (is = js + 2; is + 1 < m; is += 2){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is)     */
      t1 = t0 + bstride;                     /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];
      a12 = s1[0]; a22 = s1[1]; a32 = s1[2]; a42 = s1[3];

      d0[0] = a11; d0[1] = a21; d0[2] = a31; d0[3] = a41;
      d1[0] = a12; d1[1] = a22; d1[2] = a32; d1[3] = a42;

      t0[0] = a11; t0[1] = a21; t0[2] = a12; t0[3] = a22;
      t1[0] = a31; t1[1] = a41; t1[2] = a32; t1[3] = a42;
    }

    /* m odd: a single row (is == m - 1) remains below the 2x2 blocks. */
    if (is < m){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is) */

      a11 = s0[0]; a21 = s0[1];
      a12 = s1[0]; a22 = s1[1];

      d0[0] = a11; d0[1] = a21;
      d1[0] = a12; d1[1] = a22;

      t0[0] = a11; t0[1] = a21; t0[2] = a12; t0[3] = a22;
    }
  }

  /* m odd: the last column contributes only its diagonal element. */
  if (js < m){
    const FLOAT *s0 = a + js * astride + 2 * js;
    FLOAT *d0 = b + js * bstride + 2 * js;

    d0[0] = s0[0];
    d0[1] = s0[1];
  }
}

/*
 * ZSYMCOPY_U - two-component (complex) counterpart of SYMCOPY_U.
 *
 * Every matrix element occupies two consecutive FLOATs.
 *
 *   m   : order of the matrix; nothing is written when m <= 0.
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)].  Only
 *         entries with i <= j are read.
 *   lda : distance between consecutive columns of a, in elements
 *         (pairs of FLOATs).
 *   b   : destination of 2 * m * m FLOATs; element (i, j) starts at
 *         b[2 * (i + j * m)].
 *
 * On return b(i, j) == b(j, i) == a(min(i, j), max(i, j)), both
 * components copied unchanged.  a and b are expected not to overlap.
 *
 * Addresses are derived from indices; lda is not modified and the
 * trailing single column (m odd) is handled separately.
 */
static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;
  BLASLONG astride = 2 * lda;   /* column stride of a in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b in FLOATs */
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  for (js = 0; js + 1 < m; js += 2){
    const FLOAT *acol0 = a + js * astride;   /* column js     of a */
    const FLOAT *acol1 = acol0 + astride;    /* column js + 1 of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js     of b */
    FLOAT *bcol1 = bcol0 + bstride;          /* column js + 1 of b */
    const FLOAT *s0, *s1;
    FLOAT *d0, *d1, *t0, *t1;

    /* Full 2x2 blocks above the diagonal, stored directly (d0, d1)
       and transposed (t0, t1). */
    for (is = 0; is < js; is += 2){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is)     */
      t1 = t0 + bstride;                     /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];
      a12 = s1[0]; a22 = s1[1]; a32 = s1[2]; a42 = s1[3];

      d0[0] = a11; d0[1] = a21; d0[2] = a31; d0[3] = a41;
      d1[0] = a12; d1[1] = a22; d1[2] = a32; d1[3] = a42;

      t0[0] = a11; t0[1] = a21; t0[2] = a12; t0[3] = a22;
      t1[0] = a31; t1[1] = a41; t1[2] = a32; t1[3] = a42;
    }

    /* 2x2 diagonal block: a(js + 1, js) is not read. */
    s0 = acol0 + 2 * js;
    s1 = acol1 + 2 * js;
    d0 = bcol0 + 2 * js;
    d1 = bcol1 + 2 * js;

    a11 = s0[0]; a21 = s0[1];
    a12 = s1[0]; a22 = s1[1]; a32 = s1[2]; a42 = s1[3];

    d0[0] = a11; d0[1] = a21; d0[2] = a12; d0[3] = a22;
    d1[0] = a12; d1[1] = a22; d1[2] = a32; d1[3] = a42;
  }

  /* m odd: the last column is processed on its own. */
  if (js < m){
    const FLOAT *acol0 = a + js * astride;   /* column js of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js of b */

    for (is = 0; is < js; is += 2){
      const FLOAT *s0 = acol0 + 2 * is;
      FLOAT *d0 = bcol0 + 2 * is;
      FLOAT *t0 = b + is * bstride + 2 * js; /* b(js, is)     */
      FLOAT *t1 = t0 + bstride;              /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];

      d0[0] = a11; d0[1] = a21; d0[2] = a31; d0[3] = a41;

      t0[0] = a11; t0[1] = a21;
      t1[0] = a31; t1[1] = a41;
    }

    bcol0[2 * js]     = acol0[2 * js];
    bcol0[2 * js + 1] = acol0[2 * js + 1];
  }
}

/*
 * ZHEMCOPY_L - expand a lower-stored Hermitian matrix into a dense one.
 *
 * Every matrix element occupies two consecutive FLOATs; the second one is
 * the component that gets negated/zeroed below (the imaginary part).
 *
 *   m   : order of the matrix; nothing is written when m <= 0.
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)].  Only
 *         entries with i >= j are read, and of the diagonal entries only
 *         the first component.
 *   lda : distance between consecutive columns of a, in elements.
 *   b   : destination of 2 * m * m FLOATs; element (i, j) starts at
 *         b[2 * (i + j * m)].
 *
 * On return, for i > j: b(i, j) = a(i, j) and b(j, i) = a(i, j) with the
 * second component negated; b(i, i) = (first component of a(i, i), 0).
 * a and b are expected not to overlap.
 *
 * Addresses are derived from indices; lda is not modified and no pointer
 * is formed for a column that does not exist.
 */
static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;
  BLASLONG astride = 2 * lda;   /* column stride of a in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b in FLOATs */
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  for (js = 0; js + 1 < m; js += 2){
    const FLOAT *acol0 = a + js * astride;   /* column js     of a */
    const FLOAT *acol1 = acol0 + astride;    /* column js + 1 of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js     of b */
    FLOAT *bcol1 = bcol0 + bstride;          /* column js + 1 of b */
    const FLOAT *s0, *s1;
    FLOAT *d0, *d1, *t0, *t1;

    /* 2x2 diagonal block: diagonal second components are forced to
       zero, a(js + 1, js) is mirrored with its second component negated. */
    s0 = acol0 + 2 * js;
    s1 = acol1 + 2 * js;
    d0 = bcol0 + 2 * js;
    d1 = bcol1 + 2 * js;

    a11 = s0[0];
    a31 = s0[2]; a41 = s0[3];
    a12 = s1[2];

    d0[0] = a11; d0[1] = 0.;   d0[2] = a31; d0[3] = a41;
    d1[0] = a31; d1[1] = -a41; d1[2] = a12; d1[3] = 0.;

    /* Full 2x2 blocks below the diagonal: direct copy into (d0, d1),
       transposed copy with negated second components into (t0, t1). */
    for (is = js + 2; is + 1 < m; is += 2){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is)     */
      t1 = t0 + bstride;                     /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];
      a12 = s1[0]; a22 = s1[1]; a32 = s1[2]; a42 = s1[3];

      d0[0] = a11; d0[1] = a21; d0[2] = a31; d0[3] = a41;
      d1[0] = a12; d1[1] = a22; d1[2] = a32; d1[3] = a42;

      t0[0] = a11; t0[1] = -a21; t0[2] = a12; t0[3] = -a22;
      t1[0] = a31; t1[1] = -a41; t1[2] = a32; t1[3] = -a42;
    }

    /* m odd: a single row (is == m - 1) remains below the 2x2 blocks. */
    if (is < m){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is) */

      a11 = s0[0]; a21 = s0[1];
      a12 = s1[0]; a22 = s1[1];

      d0[0] = a11; d0[1] = a21;
      d1[0] = a12; d1[1] = a22;

      t0[0] = a11; t0[1] = -a21; t0[2] = a12; t0[3] = -a22;
    }
  }

  /* m odd: the last column contributes only its diagonal element. */
  if (js < m){
    FLOAT *d0 = b + js * bstride + 2 * js;

    d0[0] = a[js * astride + 2 * js];
    d0[1] = 0.;
  }
}

/*
 * ZHEMCOPY_U - expand an upper-stored Hermitian matrix into a dense one.
 *
 * Every matrix element occupies two consecutive FLOATs; the second one is
 * the component that gets negated/zeroed below (the imaginary part).
 *
 *   m   : order of the matrix; nothing is written when m <= 0.
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)].  Only
 *         entries with i <= j are read, and of the diagonal entries only
 *         the first component.
 *   lda : distance between consecutive columns of a, in elements.
 *   b   : destination of 2 * m * m FLOATs; element (i, j) starts at
 *         b[2 * (i + j * m)].
 *
 * On return, for i < j: b(i, j) = a(i, j) and b(j, i) = a(i, j) with the
 * second component negated; b(i, i) = (first component of a(i, i), 0).
 * a and b are expected not to overlap.
 *
 * Addresses are derived from indices; lda is not modified and the
 * trailing single column (m odd) is handled separately.
 */
static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;
  BLASLONG astride = 2 * lda;   /* column stride of a in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b in FLOATs */
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  for (js = 0; js + 1 < m; js += 2){
    const FLOAT *acol0 = a + js * astride;   /* column js     of a */
    const FLOAT *acol1 = acol0 + astride;    /* column js + 1 of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js     of b */
    FLOAT *bcol1 = bcol0 + bstride;          /* column js + 1 of b */
    const FLOAT *s0, *s1;
    FLOAT *d0, *d1, *t0, *t1;

    /* Full 2x2 blocks above the diagonal: direct copy into (d0, d1),
       transposed copy with negated second components into (t0, t1). */
    for (is = 0; is < js; is += 2){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is)     */
      t1 = t0 + bstride;                     /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];
      a12 = s1[0]; a22 = s1[1]; a32 = s1[2]; a42 = s1[3];

      d0[0] = a11; d0[1] = a21; d0[2] = a31; d0[3] = a41;
      d1[0] = a12; d1[1] = a22; d1[2] = a32; d1[3] = a42;

      t0[0] = a11; t0[1] = -a21; t0[2] = a12; t0[3] = -a22;
      t1[0] = a31; t1[1] = -a41; t1[2] = a32; t1[3] = -a42;
    }

    /* 2x2 diagonal block: diagonal second components are forced to
       zero, a(js, js + 1) is mirrored with its second component negated. */
    s0 = acol0 + 2 * js;
    s1 = acol1 + 2 * js;
    d0 = bcol0 + 2 * js;
    d1 = bcol1 + 2 * js;

    a11 = s0[0];
    a12 = s1[0]; a22 = s1[1];
    a32 = s1[2];

    d0[0] = a11; d0[1] = 0.;  d0[2] = a12; d0[3] = -a22;
    d1[0] = a12; d1[1] = a22; d1[2] = a32; d1[3] = 0.;
  }

  /* m odd: the last column is processed on its own. */
  if (js < m){
    const FLOAT *acol0 = a + js * astride;   /* column js of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js of b */

    for (is = 0; is < js; is += 2){
      const FLOAT *s0 = acol0 + 2 * is;
      FLOAT *d0 = bcol0 + 2 * is;
      FLOAT *t0 = b + is * bstride + 2 * js; /* b(js, is)     */
      FLOAT *t1 = t0 + bstride;              /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];

      d0[0] = a11; d0[1] = a21; d0[2] = a31; d0[3] = a41;

      t0[0] = a11; t0[1] = -a21;
      t1[0] = a31; t1[1] = -a41;
    }

    bcol0[2 * js]     = acol0[2 * js];
    bcol0[2 * js + 1] = 0.;
  }
}

/*
 * ZHEMCOPY_M - like ZHEMCOPY_L, but with the sign pattern swapped: the
 * directly copied (lower) part has its second component negated and the
 * mirrored (upper) part is copied unchanged.
 *
 * Every matrix element occupies two consecutive FLOATs; the second one is
 * the component that gets negated/zeroed below (the imaginary part).
 *
 *   m   : order of the matrix; nothing is written when m <= 0.
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)].  Only
 *         entries with i >= j are read, and of the diagonal entries only
 *         the first component.
 *   lda : distance between consecutive columns of a, in elements.
 *   b   : destination of 2 * m * m FLOATs; element (i, j) starts at
 *         b[2 * (i + j * m)].
 *
 * On return, for i > j: b(j, i) = a(i, j) and b(i, j) = a(i, j) with the
 * second component negated; b(i, i) = (first component of a(i, i), 0).
 * a and b are expected not to overlap.
 *
 * Addresses are derived from indices; lda is not modified and no pointer
 * is formed for a column that does not exist.
 */
static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;
  BLASLONG astride = 2 * lda;   /* column stride of a in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b in FLOATs */
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  for (js = 0; js + 1 < m; js += 2){
    const FLOAT *acol0 = a + js * astride;   /* column js     of a */
    const FLOAT *acol1 = acol0 + astride;    /* column js + 1 of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js     of b */
    FLOAT *bcol1 = bcol0 + bstride;          /* column js + 1 of b */
    const FLOAT *s0, *s1;
    FLOAT *d0, *d1, *t0, *t1;

    /* 2x2 diagonal block: diagonal second components are forced to
       zero; a(js + 1, js) is negated in place and mirrored unchanged. */
    s0 = acol0 + 2 * js;
    s1 = acol1 + 2 * js;
    d0 = bcol0 + 2 * js;
    d1 = bcol1 + 2 * js;

    a11 = s0[0];
    a31 = s0[2]; a41 = s0[3];
    a12 = s1[2];

    d0[0] = a11; d0[1] = 0.;  d0[2] = a31; d0[3] = -a41;
    d1[0] = a31; d1[1] = a41; d1[2] = a12; d1[3] = 0.;

    /* Full 2x2 blocks below the diagonal: copy with negated second
       components into (d0, d1), plain transposed copy into (t0, t1). */
    for (is = js + 2; is + 1 < m; is += 2){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is)     */
      t1 = t0 + bstride;                     /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];
      a12 = s1[0]; a22 = s1[1]; a32 = s1[2]; a42 = s1[3];

      d0[0] = a11; d0[1] = -a21; d0[2] = a31; d0[3] = -a41;
      d1[0] = a12; d1[1] = -a22; d1[2] = a32; d1[3] = -a42;

      t0[0] = a11; t0[1] = a21; t0[2] = a12; t0[3] = a22;
      t1[0] = a31; t1[1] = a41; t1[2] = a32; t1[3] = a42;
    }

    /* m odd: a single row (is == m - 1) remains below the 2x2 blocks. */
    if (is < m){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is) */

      a11 = s0[0]; a21 = s0[1];
      a12 = s1[0]; a22 = s1[1];

      d0[0] = a11; d0[1] = -a21;
      d1[0] = a12; d1[1] = -a22;

      t0[0] = a11; t0[1] = a21; t0[2] = a12; t0[3] = a22;
    }
  }

  /* m odd: the last column contributes only its diagonal element. */
  if (js < m){
    FLOAT *d0 = b + js * bstride + 2 * js;

    d0[0] = a[js * astride + 2 * js];
    d0[1] = 0.;
  }
}

/*
 * ZHEMCOPY_V - like ZHEMCOPY_U, but with the sign pattern swapped: the
 * directly copied (upper) part has its second component negated and the
 * mirrored (lower) part is copied unchanged.
 *
 * Every matrix element occupies two consecutive FLOATs; the second one is
 * the component that gets negated/zeroed below (the imaginary part).
 *
 *   m   : order of the matrix; nothing is written when m <= 0.
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)].  Only
 *         entries with i <= j are read, and of the diagonal entries only
 *         the first component.
 *   lda : distance between consecutive columns of a, in elements.
 *   b   : destination of 2 * m * m FLOATs; element (i, j) starts at
 *         b[2 * (i + j * m)].
 *
 * On return, for i < j: b(j, i) = a(i, j) and b(i, j) = a(i, j) with the
 * second component negated; b(i, i) = (first component of a(i, i), 0).
 * a and b are expected not to overlap.
 *
 * Addresses are derived from indices; lda is not modified and the
 * trailing single column (m odd) is handled separately.
 */
static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;
  BLASLONG astride = 2 * lda;   /* column stride of a in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b in FLOATs */
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  for (js = 0; js + 1 < m; js += 2){
    const FLOAT *acol0 = a + js * astride;   /* column js     of a */
    const FLOAT *acol1 = acol0 + astride;    /* column js + 1 of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js     of b */
    FLOAT *bcol1 = bcol0 + bstride;          /* column js + 1 of b */
    const FLOAT *s0, *s1;
    FLOAT *d0, *d1, *t0, *t1;

    /* Full 2x2 blocks above the diagonal: copy with negated second
       components into (d0, d1), plain transposed copy into (t0, t1). */
    for (is = 0; is < js; is += 2){
      s0 = acol0 + 2 * is;
      s1 = acol1 + 2 * is;
      d0 = bcol0 + 2 * is;
      d1 = bcol1 + 2 * is;
      t0 = b + is * bstride + 2 * js;        /* b(js, is)     */
      t1 = t0 + bstride;                     /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];
      a12 = s1[0]; a22 = s1[1]; a32 = s1[2]; a42 = s1[3];

      d0[0] = a11; d0[1] = -a21; d0[2] = a31; d0[3] = -a41;
      d1[0] = a12; d1[1] = -a22; d1[2] = a32; d1[3] = -a42;

      t0[0] = a11; t0[1] = a21; t0[2] = a12; t0[3] = a22;
      t1[0] = a31; t1[1] = a41; t1[2] = a32; t1[3] = a42;
    }

    /* 2x2 diagonal block: diagonal second components are forced to
       zero; a(js, js + 1) is negated in place and mirrored unchanged. */
    s0 = acol0 + 2 * js;
    s1 = acol1 + 2 * js;
    d0 = bcol0 + 2 * js;
    d1 = bcol1 + 2 * js;

    a11 = s0[0];
    a12 = s1[0]; a22 = s1[1];
    a32 = s1[2];

    d0[0] = a11; d0[1] = 0.;   d0[2] = a12; d0[3] = a22;
    d1[0] = a12; d1[1] = -a22; d1[2] = a32; d1[3] = 0.;
  }

  /* m odd: the last column is processed on its own. */
  if (js < m){
    const FLOAT *acol0 = a + js * astride;   /* column js of a */
    FLOAT *bcol0 = b + js * bstride;         /* column js of b */

    for (is = 0; is < js; is += 2){
      const FLOAT *s0 = acol0 + 2 * is;
      FLOAT *d0 = bcol0 + 2 * is;
      FLOAT *t0 = b + is * bstride + 2 * js; /* b(js, is)     */
      FLOAT *t1 = t0 + bstride;              /* b(js, is + 1) */

      a11 = s0[0]; a21 = s0[1]; a31 = s0[2]; a41 = s0[3];

      d0[0] = a11; d0[1] = -a21; d0[2] = a31; d0[3] = -a41;

      t0[0] = a11; t0[1] = a21;
      t1[0] = a31; t1[1] = a41;
    }

    bcol0[2 * js]     = acol0[2 * js];
    bcol0[2 * js + 1] = 0.;
  }
}

/*
 * TRMCOPY_NL
 *
 * Reads the lower triangle (diagonal included) of the column-major m x m
 * block stored in a with leading dimension lda, and writes a full m x m
 * block to b with leading dimension m.  Every entry read from position
 * (i, j), i >= j, is stored to b at (i, j) and at the mirrored position
 * (j, i); entries of a above the diagonal are never read.
 *
 *   m   - order of the block; nothing is written when m <= 0
 *   a   - source, element (i, j) at a[i + j * lda]
 *   lda - distance in FLOATs between consecutive columns of a
 *   b   - destination, element (i, j) at b[i + j * m]; all m * m entries
 *         are written
 */
static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG js, is;

  /* Columns are processed in pairs, so js is always even. */
  for (js = 0; js < m; js += 2){
    const FLOAT *col0 = a + js * lda + js;   /* A(js, js) */
    FLOAT       *out0 = b + js * m   + js;   /* B(js, js) */

    if (m - js == 1){
      /* Odd m: the final column holds nothing but its diagonal entry. */
      out0[0] = col0[0];
    } else {
      const FLOAT *col1 = col0 + lda;        /* A(js, js + 1) */
      FLOAT       *out1 = out0 + m;          /* B(js, js + 1) */
      FLOAT a11, a21;
      FLOAT a12, a22;

      /* 2 x 2 diagonal block: A(js, js + 1) is not read, A(js + 1, js)
         is written to both off-diagonal positions instead. */
      a11 = col0[0];
      a21 = col0[1];
      a22 = col1[1];

      out0[0] = a11;
      out0[1] = a21;
      out1[0] = a21;
      out1[1] = a22;

      /* Full 2 x 2 tiles below the diagonal block. */
      for (is = js + 2; is + 1 < m; is += 2){
	BLASLONG k    = is - js;             /* row offset inside the column */
	FLOAT   *mir0 = out0 + k * m;        /* B(js, is)     */
	FLOAT   *mir1 = mir0 + m;            /* B(js, is + 1) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a12 = col1[k];
	a22 = col1[k + 1];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out1[k]     = a12;
	out1[k + 1] = a22;

	/* Transposed tile on the other side of the diagonal. */
	mir0[0] = a11;
	mir0[1] = a12;
	mir1[0] = a21;
	mir1[1] = a22;
      }

      /* One row is left over when the number of rows below the diagonal
         block is odd; at that point is == m - 1. */
      if (is < m){
	BLASLONG k    = is - js;
	FLOAT   *mir0 = out0 + k * m;        /* B(js, is) */

	a11 = col0[k];
	a12 = col1[k];

	out0[k] = a11;
	out1[k] = a12;

	mir0[0] = a11;
	mir0[1] = a12;
      }
    }
  }
}
kusano 2b45e8
kusano 2b45e8
/*
 * TRMCOPY_TL
 *
 * Reads the lower triangle (diagonal included) of the column-major m x m
 * block stored in a with leading dimension lda, and writes a full m x m
 * block to b with leading dimension m.  Every entry read from position
 * (i, j), i >= j, is stored to b at (i, j) and at the mirrored position
 * (j, i); entries of a above the diagonal are never read.
 *
 *   m   - order of the block; nothing is written when m <= 0
 *   a   - source, element (i, j) at a[i + j * lda]
 *   lda - distance in FLOATs between consecutive columns of a
 *   b   - destination, element (i, j) at b[i + j * m]; all m * m entries
 *         are written
 */
static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG js, is;

  /* Columns are processed in pairs, so js is always even. */
  for (js = 0; js < m; js += 2){
    const FLOAT *col0 = a + js * lda + js;   /* A(js, js) */
    FLOAT       *out0 = b + js * m   + js;   /* B(js, js) */

    if (m - js == 1){
      /* Odd m: the final column holds nothing but its diagonal entry. */
      out0[0] = col0[0];
    } else {
      const FLOAT *col1 = col0 + lda;        /* A(js, js + 1) */
      FLOAT       *out1 = out0 + m;          /* B(js, js + 1) */
      FLOAT a11, a21;
      FLOAT a12, a22;

      /* 2 x 2 diagonal block: A(js, js + 1) is not read, A(js + 1, js)
         is written to both off-diagonal positions instead. */
      a11 = col0[0];
      a21 = col0[1];
      a22 = col1[1];

      out0[0] = a11;
      out0[1] = a21;
      out1[0] = a21;
      out1[1] = a22;

      /* Full 2 x 2 tiles below the diagonal block. */
      for (is = js + 2; is + 1 < m; is += 2){
	BLASLONG k    = is - js;             /* row offset inside the column */
	FLOAT   *mir0 = out0 + k * m;        /* B(js, is)     */
	FLOAT   *mir1 = mir0 + m;            /* B(js, is + 1) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a12 = col1[k];
	a22 = col1[k + 1];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out1[k]     = a12;
	out1[k + 1] = a22;

	/* Transposed tile on the other side of the diagonal. */
	mir0[0] = a11;
	mir0[1] = a12;
	mir1[0] = a21;
	mir1[1] = a22;
      }

      /* One row is left over when the number of rows below the diagonal
         block is odd; at that point is == m - 1. */
      if (is < m){
	BLASLONG k    = is - js;
	FLOAT   *mir0 = out0 + k * m;        /* B(js, is) */

	a11 = col0[k];
	a12 = col1[k];

	out0[k] = a11;
	out1[k] = a12;

	mir0[0] = a11;
	mir0[1] = a12;
      }
    }
  }
}
kusano 2b45e8
kusano 2b45e8
/*
 * TRMCOPY_NU
 *
 * Reads the upper triangle (diagonal included) of the column-major m x m
 * block stored in a with leading dimension lda, and writes a full m x m
 * block to b with leading dimension m.  Every entry read from position
 * (i, j), i <= j, is stored to b at (i, j) and at the mirrored position
 * (j, i); entries of a below the diagonal are never read.
 *
 *   m   - order of the block; nothing is written when m <= 0
 *   a   - source, element (i, j) at a[i + j * lda]
 *   lda - distance in FLOATs between consecutive columns of a
 *   b   - destination, element (i, j) at b[i + j * m]; all m * m entries
 *         are written
 */
static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG js, is;

  /* Columns are processed in pairs, so js is always even. */
  for (js = 0; js < m; js += 2){
    const FLOAT *col0 = a + js * lda;        /* A(0, js)  */
    FLOAT       *out0 = b + js * m;          /* B(0, js)  */
    FLOAT       *row0 = b + js;              /* B(js, 0)  */

    if (m - js == 1){
      FLOAT a11, a21;

      /* Odd m: a single trailing column.  Rows above the diagonal are
         taken two at a time and mirrored into row js of b. */
      for (is = 0; is < js; is += 2){
	FLOAT *mir0 = row0 + is * m;         /* B(js, is)     */
	FLOAT *mir1 = mir0 + m;              /* B(js, is + 1) */

	a11 = col0[is];
	a21 = col0[is + 1];

	out0[is]     = a11;
	out0[is + 1] = a21;

	mir0[0] = a11;
	mir1[0] = a21;
      }

      /* Diagonal entry. */
      out0[js] = col0[js];
    } else {
      const FLOAT *col1 = col0 + lda;        /* A(0, js + 1) */
      FLOAT       *out1 = out0 + m;          /* B(0, js + 1) */
      FLOAT a11, a21;
      FLOAT a12, a22;

      /* Full 2 x 2 tiles above the diagonal block. */
      for (is = 0; is < js; is += 2){
	FLOAT *mir0 = row0 + is * m;         /* B(js, is)     */
	FLOAT *mir1 = mir0 + m;              /* B(js, is + 1) */

	a11 = col0[is];
	a21 = col0[is + 1];
	a12 = col1[is];
	a22 = col1[is + 1];

	out0[is]     = a11;
	out0[is + 1] = a21;
	out1[is]     = a12;
	out1[is + 1] = a22;

	/* Transposed tile on the other side of the diagonal. */
	mir0[0] = a11;
	mir0[1] = a12;
	mir1[0] = a21;
	mir1[1] = a22;
      }

      /* 2 x 2 diagonal block: A(js + 1, js) is not read, A(js, js + 1)
         is written to both off-diagonal positions instead. */
      a11 = col0[js];
      a12 = col1[js];
      a22 = col1[js + 1];

      out0[js]     = a11;
      out0[js + 1] = a12;
      out1[js]     = a12;
      out1[js + 1] = a22;
    }
  }
}
kusano 2b45e8
kusano 2b45e8
/*
 * TRMCOPY_TU
 *
 * Reads the upper triangle (diagonal included) of the column-major m x m
 * block stored in a with leading dimension lda, and writes a full m x m
 * block to b with leading dimension m.  Every entry read from position
 * (i, j), i <= j, is stored to b at (i, j) and at the mirrored position
 * (j, i); entries of a below the diagonal are never read.
 *
 *   m   - order of the block; nothing is written when m <= 0
 *   a   - source, element (i, j) at a[i + j * lda]
 *   lda - distance in FLOATs between consecutive columns of a
 *   b   - destination, element (i, j) at b[i + j * m]; all m * m entries
 *         are written
 */
static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG js, is;

  /* Columns are processed in pairs, so js is always even. */
  for (js = 0; js < m; js += 2){
    const FLOAT *col0 = a + js * lda;        /* A(0, js)  */
    FLOAT       *out0 = b + js * m;          /* B(0, js)  */
    FLOAT       *row0 = b + js;              /* B(js, 0)  */

    if (m - js == 1){
      FLOAT a11, a21;

      /* Odd m: a single trailing column.  Rows above the diagonal are
         taken two at a time and mirrored into row js of b. */
      for (is = 0; is < js; is += 2){
	FLOAT *mir0 = row0 + is * m;         /* B(js, is)     */
	FLOAT *mir1 = mir0 + m;              /* B(js, is + 1) */

	a11 = col0[is];
	a21 = col0[is + 1];

	out0[is]     = a11;
	out0[is + 1] = a21;

	mir0[0] = a11;
	mir1[0] = a21;
      }

      /* Diagonal entry. */
      out0[js] = col0[js];
    } else {
      const FLOAT *col1 = col0 + lda;        /* A(0, js + 1) */
      FLOAT       *out1 = out0 + m;          /* B(0, js + 1) */
      FLOAT a11, a21;
      FLOAT a12, a22;

      /* Full 2 x 2 tiles above the diagonal block. */
      for (is = 0; is < js; is += 2){
	FLOAT *mir0 = row0 + is * m;         /* B(js, is)     */
	FLOAT *mir1 = mir0 + m;              /* B(js, is + 1) */

	a11 = col0[is];
	a21 = col0[is + 1];
	a12 = col1[is];
	a22 = col1[is + 1];

	out0[is]     = a11;
	out0[is + 1] = a21;
	out1[is]     = a12;
	out1[is + 1] = a22;

	/* Transposed tile on the other side of the diagonal. */
	mir0[0] = a11;
	mir0[1] = a12;
	mir1[0] = a21;
	mir1[1] = a22;
      }

      /* 2 x 2 diagonal block: A(js + 1, js) is not read, A(js, js + 1)
         is written to both off-diagonal positions instead. */
      a11 = col0[js];
      a12 = col1[js];
      a22 = col1[js + 1];

      out0[js]     = a11;
      out0[js + 1] = a12;
      out1[js]     = a12;
      out1[js + 1] = a22;
    }
  }
}
kusano 2b45e8
kusano 2b45e8
/*
 * ZTRMCOPY_NL
 *
 * Two-FLOATs-per-element variant of the lower-triangle expansion.  Each
 * element occupies two consecutive FLOATs; element (i, j) of the source
 * starts at a[2 * (i + j * lda)] and element (i, j) of the destination
 * starts at b[2 * (i + j * m)].
 *
 * Every element read from position (i, j), i >= j, is copied unchanged
 * (both FLOATs, no sign change) to b at (i, j) and at the mirrored position
 * (j, i).  Elements of a above the diagonal are never read.
 *
 *   m   - order of the block in elements; nothing is written when m <= 0
 *   a   - source block
 *   lda - distance in elements between consecutive columns of a
 *   b   - destination block; all 2 * m * m FLOATs are written
 */
static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG js, is;
  BLASLONG astride = 2 * lda;   /* column stride of a, in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b, in FLOATs */

  /* Columns are processed in pairs, so js is always even. */
  for (js = 0; js < m; js += 2){
    const FLOAT *col0 = a + js * astride + 2 * js;   /* A(js, js) */
    FLOAT       *out0 = b + js * bstride + 2 * js;   /* B(js, js) */

    if (m - js == 1){
      /* Odd m: the final column holds nothing but its diagonal element. */
      out0[0] = col0[0];
      out0[1] = col0[1];
    } else {
      const FLOAT *col1 = col0 + astride;            /* A(js, js + 1) */
      FLOAT       *out1 = out0 + bstride;            /* B(js, js + 1) */
      /* aNc: N-th FLOAT of the current two-element strip in column c. */
      FLOAT a11, a21, a31, a41;
      FLOAT a12, a22, a32, a42;

      /* 2 x 2 diagonal block: A(js, js + 1) is not read, A(js + 1, js)
         is written to both off-diagonal positions instead. */
      a11 = col0[0];
      a21 = col0[1];
      a31 = col0[2];
      a41 = col0[3];

      a12 = col1[2];
      a22 = col1[3];

      out0[0] = a11;
      out0[1] = a21;
      out0[2] = a31;
      out0[3] = a41;

      out1[0] = a31;
      out1[1] = a41;
      out1[2] = a12;
      out1[3] = a22;

      /* Full 2 x 2 tiles below the diagonal block. */
      for (is = js + 2; is + 1 < m; is += 2){
	BLASLONG k    = 2 * (is - js);               /* FLOAT offset in column */
	FLOAT   *mir0 = out0 + (is - js) * bstride;  /* B(js, is)     */
	FLOAT   *mir1 = mir0 + bstride;              /* B(js, is + 1) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a31 = col0[k + 2];
	a41 = col0[k + 3];

	a12 = col1[k];
	a22 = col1[k + 1];
	a32 = col1[k + 2];
	a42 = col1[k + 3];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out0[k + 2] = a31;
	out0[k + 3] = a41;

	out1[k]     = a12;
	out1[k + 1] = a22;
	out1[k + 2] = a32;
	out1[k + 3] = a42;

	/* Transposed tile on the other side of the diagonal. */
	mir0[0] = a11;
	mir0[1] = a21;
	mir0[2] = a12;
	mir0[3] = a22;

	mir1[0] = a31;
	mir1[1] = a41;
	mir1[2] = a32;
	mir1[3] = a42;
      }

      /* One row is left over when m is odd; at that point is == m - 1. */
      if (is < m){
	BLASLONG k    = 2 * (is - js);
	FLOAT   *mir0 = out0 + (is - js) * bstride;  /* B(js, is) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a12 = col1[k];
	a22 = col1[k + 1];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out1[k]     = a12;
	out1[k + 1] = a22;

	mir0[0] = a11;
	mir0[1] = a21;
	mir0[2] = a12;
	mir0[3] = a22;
      }
    }
  }
}
kusano 2b45e8
kusano 2b45e8
/*
 * ZTRMCOPY_TL
 *
 * Two-FLOATs-per-element variant of the lower-triangle expansion.  Each
 * element occupies two consecutive FLOATs; element (i, j) of the source
 * starts at a[2 * (i + j * lda)] and element (i, j) of the destination
 * starts at b[2 * (i + j * m)].
 *
 * Every element read from position (i, j), i >= j, is copied unchanged
 * (both FLOATs, no sign change) to b at (i, j) and at the mirrored position
 * (j, i).  Elements of a above the diagonal are never read.
 *
 *   m   - order of the block in elements; nothing is written when m <= 0
 *   a   - source block
 *   lda - distance in elements between consecutive columns of a
 *   b   - destination block; all 2 * m * m FLOATs are written
 */
static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG js, is;
  BLASLONG astride = 2 * lda;   /* column stride of a, in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b, in FLOATs */

  /* Columns are processed in pairs, so js is always even. */
  for (js = 0; js < m; js += 2){
    const FLOAT *col0 = a + js * astride + 2 * js;   /* A(js, js) */
    FLOAT       *out0 = b + js * bstride + 2 * js;   /* B(js, js) */

    if (m - js == 1){
      /* Odd m: the final column holds nothing but its diagonal element. */
      out0[0] = col0[0];
      out0[1] = col0[1];
    } else {
      const FLOAT *col1 = col0 + astride;            /* A(js, js + 1) */
      FLOAT       *out1 = out0 + bstride;            /* B(js, js + 1) */
      /* aNc: N-th FLOAT of the current two-element strip in column c. */
      FLOAT a11, a21, a31, a41;
      FLOAT a12, a22, a32, a42;

      /* 2 x 2 diagonal block: A(js, js + 1) is not read, A(js + 1, js)
         is written to both off-diagonal positions instead. */
      a11 = col0[0];
      a21 = col0[1];
      a31 = col0[2];
      a41 = col0[3];

      a12 = col1[2];
      a22 = col1[3];

      out0[0] = a11;
      out0[1] = a21;
      out0[2] = a31;
      out0[3] = a41;

      out1[0] = a31;
      out1[1] = a41;
      out1[2] = a12;
      out1[3] = a22;

      /* Full 2 x 2 tiles below the diagonal block. */
      for (is = js + 2; is + 1 < m; is += 2){
	BLASLONG k    = 2 * (is - js);               /* FLOAT offset in column */
	FLOAT   *mir0 = out0 + (is - js) * bstride;  /* B(js, is)     */
	FLOAT   *mir1 = mir0 + bstride;              /* B(js, is + 1) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a31 = col0[k + 2];
	a41 = col0[k + 3];

	a12 = col1[k];
	a22 = col1[k + 1];
	a32 = col1[k + 2];
	a42 = col1[k + 3];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out0[k + 2] = a31;
	out0[k + 3] = a41;

	out1[k]     = a12;
	out1[k + 1] = a22;
	out1[k + 2] = a32;
	out1[k + 3] = a42;

	/* Transposed tile on the other side of the diagonal. */
	mir0[0] = a11;
	mir0[1] = a21;
	mir0[2] = a12;
	mir0[3] = a22;

	mir1[0] = a31;
	mir1[1] = a41;
	mir1[2] = a32;
	mir1[3] = a42;
      }

      /* One row is left over when m is odd; at that point is == m - 1. */
      if (is < m){
	BLASLONG k    = 2 * (is - js);
	FLOAT   *mir0 = out0 + (is - js) * bstride;  /* B(js, is) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a12 = col1[k];
	a22 = col1[k + 1];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out1[k]     = a12;
	out1[k + 1] = a22;

	mir0[0] = a11;
	mir0[1] = a21;
	mir0[2] = a12;
	mir0[3] = a22;
      }
    }
  }
}
kusano 2b45e8
kusano 2b45e8
/*
 * ZTRMCOPY_NU
 *
 * Two-FLOATs-per-element variant of the upper-triangle expansion.  Each
 * element occupies two consecutive FLOATs; element (i, j) of the source
 * starts at a[2 * (i + j * lda)] and element (i, j) of the destination
 * starts at b[2 * (i + j * m)].
 *
 * Every element read from position (i, j), i <= j, is copied unchanged
 * (both FLOATs, no sign change) to b at (i, j) and at the mirrored position
 * (j, i).  Elements of a below the diagonal are never read.
 *
 *   m   - order of the block in elements; nothing is written when m <= 0
 *   a   - source block
 *   lda - distance in elements between consecutive columns of a
 *   b   - destination block; all 2 * m * m FLOATs are written
 */
static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG js, is;
  BLASLONG astride = 2 * lda;   /* column stride of a, in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b, in FLOATs */

  /* Columns are processed in pairs, so js is always even. */
  for (js = 0; js < m; js += 2){
    const FLOAT *col0 = a + js * astride;    /* A(0, js) */
    FLOAT       *out0 = b + js * bstride;    /* B(0, js) */
    FLOAT       *row0 = b + 2 * js;          /* B(js, 0) */
    BLASLONG     d    = 2 * js;              /* FLOAT offset of the diagonal */

    if (m - js == 1){
      FLOAT a11, a21, a31, a41;

      /* Odd m: a single trailing column.  Rows above the diagonal are
         taken two at a time and mirrored into row js of b. */
      for (is = 0; is < js; is += 2){
	BLASLONG k    = 2 * is;
	FLOAT   *mir0 = row0 + is * bstride; /* B(js, is)     */
	FLOAT   *mir1 = mir0 + bstride;      /* B(js, is + 1) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a31 = col0[k + 2];
	a41 = col0[k + 3];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out0[k + 2] = a31;
	out0[k + 3] = a41;

	mir0[0] = a11;
	mir0[1] = a21;
	mir1[0] = a31;
	mir1[1] = a41;
      }

      /* Diagonal element. */
      a11 = col0[d];
      a21 = col0[d + 1];
      out0[d]     = a11;
      out0[d + 1] = a21;
    } else {
      const FLOAT *col1 = col0 + astride;    /* A(0, js + 1) */
      FLOAT       *out1 = out0 + bstride;    /* B(0, js + 1) */
      /* aNc: N-th FLOAT of the current two-element strip in column c. */
      FLOAT a11, a21, a31, a41;
      FLOAT a12, a22, a32, a42;

      /* Full 2 x 2 tiles above the diagonal block. */
      for (is = 0; is < js; is += 2){
	BLASLONG k    = 2 * is;
	FLOAT   *mir0 = row0 + is * bstride; /* B(js, is)     */
	FLOAT   *mir1 = mir0 + bstride;      /* B(js, is + 1) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a31 = col0[k + 2];
	a41 = col0[k + 3];

	a12 = col1[k];
	a22 = col1[k + 1];
	a32 = col1[k + 2];
	a42 = col1[k + 3];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out0[k + 2] = a31;
	out0[k + 3] = a41;

	out1[k]     = a12;
	out1[k + 1] = a22;
	out1[k + 2] = a32;
	out1[k + 3] = a42;

	/* Transposed tile on the other side of the diagonal. */
	mir0[0] = a11;
	mir0[1] = a21;
	mir0[2] = a12;
	mir0[3] = a22;

	mir1[0] = a31;
	mir1[1] = a41;
	mir1[2] = a32;
	mir1[3] = a42;
      }

      /* 2 x 2 diagonal block: A(js + 1, js) is not read, A(js, js + 1)
         is written to both off-diagonal positions instead. */
      a11 = col0[d];
      a21 = col0[d + 1];

      a12 = col1[d];
      a22 = col1[d + 1];
      a32 = col1[d + 2];
      a42 = col1[d + 3];

      out0[d]     = a11;
      out0[d + 1] = a21;
      out0[d + 2] = a12;
      out0[d + 3] = a22;

      out1[d]     = a12;
      out1[d + 1] = a22;
      out1[d + 2] = a32;
      out1[d + 3] = a42;
    }
  }
}
kusano 2b45e8
kusano 2b45e8
/*
 * ZTRMCOPY_TU
 *
 * Two-FLOATs-per-element variant of the upper-triangle expansion.  Each
 * element occupies two consecutive FLOATs; element (i, j) of the source
 * starts at a[2 * (i + j * lda)] and element (i, j) of the destination
 * starts at b[2 * (i + j * m)].
 *
 * Every element read from position (i, j), i <= j, is copied unchanged
 * (both FLOATs, no sign change) to b at (i, j) and at the mirrored position
 * (j, i).  Elements of a below the diagonal are never read.
 *
 *   m   - order of the block in elements; nothing is written when m <= 0
 *   a   - source block
 *   lda - distance in elements between consecutive columns of a
 *   b   - destination block; all 2 * m * m FLOATs are written
 */
static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG js, is;
  BLASLONG astride = 2 * lda;   /* column stride of a, in FLOATs */
  BLASLONG bstride = 2 * m;     /* column stride of b, in FLOATs */

  /* Columns are processed in pairs, so js is always even. */
  for (js = 0; js < m; js += 2){
    const FLOAT *col0 = a + js * astride;    /* A(0, js) */
    FLOAT       *out0 = b + js * bstride;    /* B(0, js) */
    FLOAT       *row0 = b + 2 * js;          /* B(js, 0) */
    BLASLONG     d    = 2 * js;              /* FLOAT offset of the diagonal */

    if (m - js == 1){
      FLOAT a11, a21, a31, a41;

      /* Odd m: a single trailing column.  Rows above the diagonal are
         taken two at a time and mirrored into row js of b. */
      for (is = 0; is < js; is += 2){
	BLASLONG k    = 2 * is;
	FLOAT   *mir0 = row0 + is * bstride; /* B(js, is)     */
	FLOAT   *mir1 = mir0 + bstride;      /* B(js, is + 1) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a31 = col0[k + 2];
	a41 = col0[k + 3];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out0[k + 2] = a31;
	out0[k + 3] = a41;

	mir0[0] = a11;
	mir0[1] = a21;
	mir1[0] = a31;
	mir1[1] = a41;
      }

      /* Diagonal element. */
      a11 = col0[d];
      a21 = col0[d + 1];
      out0[d]     = a11;
      out0[d + 1] = a21;
    } else {
      const FLOAT *col1 = col0 + astride;    /* A(0, js + 1) */
      FLOAT       *out1 = out0 + bstride;    /* B(0, js + 1) */
      /* aNc: N-th FLOAT of the current two-element strip in column c. */
      FLOAT a11, a21, a31, a41;
      FLOAT a12, a22, a32, a42;

      /* Full 2 x 2 tiles above the diagonal block. */
      for (is = 0; is < js; is += 2){
	BLASLONG k    = 2 * is;
	FLOAT   *mir0 = row0 + is * bstride; /* B(js, is)     */
	FLOAT   *mir1 = mir0 + bstride;      /* B(js, is + 1) */

	a11 = col0[k];
	a21 = col0[k + 1];
	a31 = col0[k + 2];
	a41 = col0[k + 3];

	a12 = col1[k];
	a22 = col1[k + 1];
	a32 = col1[k + 2];
	a42 = col1[k + 3];

	out0[k]     = a11;
	out0[k + 1] = a21;
	out0[k + 2] = a31;
	out0[k + 3] = a41;

	out1[k]     = a12;
	out1[k + 1] = a22;
	out1[k + 2] = a32;
	out1[k + 3] = a42;

	/* Transposed tile on the other side of the diagonal. */
	mir0[0] = a11;
	mir0[1] = a21;
	mir0[2] = a12;
	mir0[3] = a22;

	mir1[0] = a31;
	mir1[1] = a41;
	mir1[2] = a32;
	mir1[3] = a42;
      }

      /* 2 x 2 diagonal block: A(js + 1, js) is not read, A(js, js + 1)
         is written to both off-diagonal positions instead. */
      a11 = col0[d];
      a21 = col0[d + 1];

      a12 = col1[d];
      a22 = col1[d + 1];
      a32 = col1[d + 2];
      a42 = col1[d + 3];

      out0[d]     = a11;
      out0[d + 1] = a21;
      out0[d + 2] = a12;
      out0[d + 3] = a22;

      out1[d]     = a12;
      out1[d + 1] = a22;
      out1[d + 2] = a32;
      out1[d + 3] = a42;
    }
  }
}
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8