Blob Blame Raw
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/* This implementation is completely wrong. I'll rewrite this */

#ifndef SYMCOPY_H
#define SYMCOPY_H

#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)

static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a12;
  FLOAT a21, a22;

  b1 = b;
  b2 = b;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda + 2;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 1 * m;
    b1 += 2 * m + 2;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 1 * m;
    b2 += 2 * m + 2;

    if (m - js >= 2){

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      
      a22 = *(aa2 + 1);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb2 + 0) = a21;
      *(bb2 + 1) = a22;
      aa1 += 2;
      aa2 += 2;
      bb1 += 2;
      bb2 += 2;
      
      cc1 += 2 * m;
      cc2 += 2 * m;

      is = ((m - js - 2) >> 1);

      while (is > 0){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	
	aa1 += 2;
	aa2 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
	*(cc2 + 0) = a21;
	*(cc2 + 1) = a22;

	bb1 += 2;
	bb2 += 2;

	cc1 += 2 * m;
	cc2 += 2 * m;

	is --;
      }

      is = ((m - js - 2) & 1);

      if (is == 1){
	a11 = *(aa1 + 0);
	a12 = *(aa2 + 0);
	
	*(bb1 + 0) = a11;
	*(bb2 + 0) = a12;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
      }
    }
    
    if (m - js == 1){
      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
    }

  }
}

static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a12;
  FLOAT a21, a22;

  b1 = b;
  b2 = b;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 1 * m;
    b1 += 2 * m;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 1 * m;
    b2 += 2;

    if (m - js >= 2){

      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);

	aa1 += 2;
	aa2 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	
	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
	*(cc2 + 0) = a21;
	*(cc2 + 1) = a22;
	
	bb1 += 2;
	bb2 += 2;
	
	cc1 += 2 * m;
	cc2 += 2 * m;
      }

      a11 = *(aa1 + 0);
      
      a12 = *(aa2 + 0);
      a22 = *(aa2 + 1);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a12;
      *(bb2 + 0) = a12;
      *(bb2 + 1) = a22;
    }
    
    if (m - js == 1){
      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	aa1 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(cc1 + 0) = a11;
	*(cc2 + 0) = a21;
	bb1 += 2;
	
	cc1 += 2 * m;
	cc2 += 2 * m;
      }

      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
    }
  }
}


static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda + 4;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m + 4;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4 * m + 4;

    if (m - js >= 2){

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      a31 = *(aa1 + 2);
      a41 = *(aa1 + 3);
  
      a12 = *(aa2 + 2);
      a22 = *(aa2 + 3);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb1 + 2) = a31;
      *(bb1 + 3) = a41;

      *(bb2 + 0) = a31;
      *(bb2 + 1) = a41;
      *(bb2 + 2) = a12;
      *(bb2 + 3) = a22;

      aa1 += 4;
      aa2 += 4;
      bb1 += 4;
      bb2 += 4;
      
      cc1 += 4 * m;
      cc2 += 4 * m;

      is = ((m - js - 2) >> 1);

      while (is > 0){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);
	
	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = a42;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = a42;

	bb1 += 4;
	bb2 += 4;

	cc1 += 4 * m;
	cc2 += 4 * m;

	is --;
      }

      if (m & 1){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;
      }
    }
    
    if (m - js == 1){
      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
    }

  }
}

static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4;

    if (m - js >= 2){

      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);

	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = a42;
	
	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = a42;
	
	bb1 += 4;
	bb2 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      
      a12 = *(aa2 + 0);
      a22 = *(aa2 + 1);
      a32 = *(aa2 + 2);
      a42 = *(aa2 + 3);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb1 + 2) = a12;
      *(bb1 + 3) = a22;

      *(bb2 + 0) = a12;
      *(bb2 + 1) = a22;
      *(bb2 + 2) = a32;
      *(bb2 + 3) = a42;
    }
    
    if (m - js == 1){
      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);
	aa1 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	bb1 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
    }
  }
}

static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda + 4;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m + 4;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4 * m + 4;

    if (m - js >= 2){

      a11 = *(aa1 + 0);
      a31 = *(aa1 + 2);
      a41 = *(aa1 + 3);
  
      a12 = *(aa2 + 2);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = 0.;
      *(bb1 + 2) = a31;
      *(bb1 + 3) = a41;

      *(bb2 + 0) = a31;
      *(bb2 + 1) = -a41;
      *(bb2 + 2) = a12;
      *(bb2 + 3) = 0.;

      aa1 += 4;
      aa2 += 4;
      bb1 += 4;
      bb2 += 4;
      
      cc1 += 4 * m;
      cc2 += 4 * m;

      is = ((m - js - 2) >> 1);

      while (is > 0){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);
	
	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = a42;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = -a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = -a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = -a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = -a42;

	bb1 += 4;
	bb2 += 4;

	cc1 += 4 * m;
	cc2 += 4 * m;

	is --;
      }

      if (m & 1){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = -a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = -a22;
      }
    }
    
    if (m - js == 1){
      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = 0.;
    }

  }
}

static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4;

    if (m - js >= 2){

      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);

	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = a42;
	
	*(cc1 + 0) = a11;
	*(cc1 + 1) = -a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = -a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = -a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = -a42;
	
	bb1 += 4;
	bb2 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      
      a12 = *(aa2 + 0);
      a22 = *(aa2 + 1);
      a32 = *(aa2 + 2);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = 0.;
      *(bb1 + 2) = a12;
      *(bb1 + 3) = -a22;

      *(bb2 + 0) = a12;
      *(bb2 + 1) = a22;
      *(bb2 + 2) = a32;
      *(bb2 + 3) = 0.;
    }
    
    if (m - js == 1){
      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);
	aa1 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = -a21;
	*(cc2 + 0) = a31;
	*(cc2 + 1) = -a41;
	bb1 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = 0.;
    }
  }
}


static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda + 4;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m + 4;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4 * m + 4;

    if (m - js >= 2){

      a11 = *(aa1 + 0);
      a31 = *(aa1 + 2);
      a41 = *(aa1 + 3);
  
      a12 = *(aa2 + 2);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = 0.;
      *(bb1 + 2) = a31;
      *(bb1 + 3) = -a41;

      *(bb2 + 0) = a31;
      *(bb2 + 1) = a41;
      *(bb2 + 2) = a12;
      *(bb2 + 3) = 0.;

      aa1 += 4;
      aa2 += 4;
      bb1 += 4;
      bb2 += 4;
      
      cc1 += 4 * m;
      cc2 += 4 * m;

      is = ((m - js - 2) >> 1);

      while (is > 0){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);
	
	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = -a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = -a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = -a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = -a42;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = a42;

	bb1 += 4;
	bb2 += 4;

	cc1 += 4 * m;
	cc2 += 4 * m;

	is --;
      }

      if (m & 1){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = -a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = -a22;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;
      }
    }
    
    if (m - js == 1){
      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = 0.;
    }

  }
}

static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4;

    if (m - js >= 2){

      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);

	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = -a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = -a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = -a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = -a42;
	
	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = a42;
	
	bb1 += 4;
	bb2 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      
      a12 = *(aa2 + 0);
      a22 = *(aa2 + 1);
      a32 = *(aa2 + 2);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = 0.;
      *(bb1 + 2) = a12;
      *(bb1 + 3) = a22;

      *(bb2 + 0) = a12;
      *(bb2 + 1) = -a22;
      *(bb2 + 2) = a32;
      *(bb2 + 3) = 0.;
    }
    
    if (m - js == 1){
      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);
	aa1 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = -a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = -a41;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	bb1 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = 0.;
    }
  }
}


static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a12;
  FLOAT a21, a22;

  b1 = b;
  b2 = b;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda + 2;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 1 * m;
    b1 += 2 * m + 2;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 1 * m;
    b2 += 2 * m + 2;

    if (m - js >= 2){

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      
      a22 = *(aa2 + 1);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb2 + 0) = a21;
      *(bb2 + 1) = a22;
      aa1 += 2;
      aa2 += 2;
      bb1 += 2;
      bb2 += 2;
      
      cc1 += 2 * m;
      cc2 += 2 * m;

      is = ((m - js - 2) >> 1);

      while (is > 0){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	
	aa1 += 2;
	aa2 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
	*(cc2 + 0) = a21;
	*(cc2 + 1) = a22;

	bb1 += 2;
	bb2 += 2;

	cc1 += 2 * m;
	cc2 += 2 * m;

	is --;
      }

      is = ((m - js - 2) & 1);

      if (is == 1){
	a11 = *(aa1 + 0);
	a12 = *(aa2 + 0);
	
	*(bb1 + 0) = a11;
	*(bb2 + 0) = a12;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
      }
    }
    
    if (m - js == 1){
      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
    }

  }
}

static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a12;
  FLOAT a21, a22;

  b1 = b;
  b2 = b;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda + 2;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 1 * m;
    b1 += 2 * m + 2;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 1 * m;
    b2 += 2 * m + 2;

    if (m - js >= 2){

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      
      a22 = *(aa2 + 1);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb2 + 0) = a21;
      *(bb2 + 1) = a22;
      aa1 += 2;
      aa2 += 2;
      bb1 += 2;
      bb2 += 2;
      
      cc1 += 2 * m;
      cc2 += 2 * m;

      is = ((m - js - 2) >> 1);

      while (is > 0){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	
	aa1 += 2;
	aa2 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
	*(cc2 + 0) = a21;
	*(cc2 + 1) = a22;

	bb1 += 2;
	bb2 += 2;

	cc1 += 2 * m;
	cc2 += 2 * m;

	is --;
      }

      is = ((m - js - 2) & 1);

      if (is == 1){
	a11 = *(aa1 + 0);
	a12 = *(aa2 + 0);
	
	*(bb1 + 0) = a11;
	*(bb2 + 0) = a12;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
      }
    }
    
    if (m - js == 1){
      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
    }

  }
}

static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a12;
  FLOAT a21, a22;

  b1 = b;
  b2 = b;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 1 * m;
    b1 += 2 * m;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 1 * m;
    b2 += 2;

    if (m - js >= 2){

      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);

	aa1 += 2;
	aa2 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	
	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
	*(cc2 + 0) = a21;
	*(cc2 + 1) = a22;
	
	bb1 += 2;
	bb2 += 2;
	
	cc1 += 2 * m;
	cc2 += 2 * m;
      }

      a11 = *(aa1 + 0);
      
      a12 = *(aa2 + 0);
      a22 = *(aa2 + 1);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a12;
      *(bb2 + 0) = a12;
      *(bb2 + 1) = a22;
    }
    
    if (m - js == 1){
      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	aa1 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(cc1 + 0) = a11;
	*(cc2 + 0) = a21;
	bb1 += 2;
	
	cc1 += 2 * m;
	cc2 += 2 * m;
      }

      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
    }
  }
}

static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a12;
  FLOAT a21, a22;

  b1 = b;
  b2 = b;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 1 * m;
    b1 += 2 * m;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 1 * m;
    b2 += 2;

    if (m - js >= 2){

      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);

	aa1 += 2;
	aa2 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	
	*(cc1 + 0) = a11;
	*(cc1 + 1) = a12;
	*(cc2 + 0) = a21;
	*(cc2 + 1) = a22;
	
	bb1 += 2;
	bb2 += 2;
	
	cc1 += 2 * m;
	cc2 += 2 * m;
      }

      a11 = *(aa1 + 0);
      
      a12 = *(aa2 + 0);
      a22 = *(aa2 + 1);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a12;
      *(bb2 + 0) = a12;
      *(bb2 + 1) = a22;
    }
    
    if (m - js == 1){
      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	aa1 += 2;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(cc1 + 0) = a11;
	*(cc2 + 0) = a21;
	bb1 += 2;
	
	cc1 += 2 * m;
	cc2 += 2 * m;
      }

      a11 = *(aa1 + 0);
      *(bb1 + 0) = a11;
    }
  }
}

static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda + 4;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m + 4;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4 * m + 4;

    if (m - js >= 2){

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      a31 = *(aa1 + 2);
      a41 = *(aa1 + 3);
  
      a12 = *(aa2 + 2);
      a22 = *(aa2 + 3);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb1 + 2) = a31;
      *(bb1 + 3) = a41;

      *(bb2 + 0) = a31;
      *(bb2 + 1) = a41;
      *(bb2 + 2) = a12;
      *(bb2 + 3) = a22;

      aa1 += 4;
      aa2 += 4;
      bb1 += 4;
      bb2 += 4;
      
      cc1 += 4 * m;
      cc2 += 4 * m;

      is = ((m - js - 2) >> 1);

      while (is > 0){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);
	
	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = a42;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = a42;

	bb1 += 4;
	bb2 += 4;

	cc1 += 4 * m;
	cc2 += 4 * m;

	is --;
      }

      if (m & 1){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;
      }
    }
    
    if (m - js == 1){
      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
    }

  }
}

static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda + 4;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m + 4;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4 * m + 4;

    if (m - js >= 2){

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      a31 = *(aa1 + 2);
      a41 = *(aa1 + 3);
  
      a12 = *(aa2 + 2);
      a22 = *(aa2 + 3);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb1 + 2) = a31;
      *(bb1 + 3) = a41;

      *(bb2 + 0) = a31;
      *(bb2 + 1) = a41;
      *(bb2 + 2) = a12;
      *(bb2 + 3) = a22;

      aa1 += 4;
      aa2 += 4;
      bb1 += 4;
      bb2 += 4;
      
      cc1 += 4 * m;
      cc2 += 4 * m;

      is = ((m - js - 2) >> 1);

      while (is > 0){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);
	
	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = a42;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = a42;

	bb1 += 4;
	bb2 += 4;

	cc1 += 4 * m;
	cc2 += 4 * m;

	is --;
      }

      if (m & 1){
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;
      }
    }
    
    if (m - js == 1){
      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
    }

  }
}

static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4;

    if (m - js >= 2){

      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);

	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = a42;
	
	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = a42;
	
	bb1 += 4;
	bb2 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      
      a12 = *(aa2 + 0);
      a22 = *(aa2 + 1);
      a32 = *(aa2 + 2);
      a42 = *(aa2 + 3);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb1 + 2) = a12;
      *(bb1 + 3) = a22;

      *(bb2 + 0) = a12;
      *(bb2 + 1) = a22;
      *(bb2 + 2) = a32;
      *(bb2 + 3) = a42;
    }
    
    if (m - js == 1){
      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);
	aa1 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	bb1 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
    }
  }
}

static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG is, js;

  FLOAT *aa1, *aa2;
  FLOAT *b1, *b2;
  FLOAT *bb1, *bb2;
  FLOAT *cc1, *cc2;
  FLOAT a11, a21, a31, a41;
  FLOAT a12, a22, a32, a42;

  b1 = b;
  b2 = b;

  lda *= 2;

  for (js = 0; js < m; js += 2){

    aa1 = a + 0 * lda;
    aa2 = a + 1 * lda;
    a  += 2 * lda;
    
    bb1 = b1 + 0 * m;
    bb2 = b1 + 2 * m;
    b1 += 4 * m;
	  
    cc1 = b2 + 0 * m;
    cc2 = b2 + 2 * m;
    b2 += 4;

    if (m - js >= 2){

      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);

	a12 = *(aa2 + 0);
	a22 = *(aa2 + 1);
	a32 = *(aa2 + 2);
	a42 = *(aa2 + 3);

	aa1 += 4;
	aa2 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(bb2 + 0) = a12;
	*(bb2 + 1) = a22;
	*(bb2 + 2) = a32;
	*(bb2 + 3) = a42;
	
	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc1 + 2) = a12;
	*(cc1 + 3) = a22;

	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	*(cc2 + 2) = a32;
	*(cc2 + 3) = a42;
	
	bb1 += 4;
	bb2 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      
      a12 = *(aa2 + 0);
      a22 = *(aa2 + 1);
      a32 = *(aa2 + 2);
      a42 = *(aa2 + 3);
      
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
      *(bb1 + 2) = a12;
      *(bb1 + 3) = a22;

      *(bb2 + 0) = a12;
      *(bb2 + 1) = a22;
      *(bb2 + 2) = a32;
      *(bb2 + 3) = a42;
    }
    
    if (m - js == 1){
      for (is = 0; is < js; is += 2){
      
	a11 = *(aa1 + 0);
	a21 = *(aa1 + 1);
	a31 = *(aa1 + 2);
	a41 = *(aa1 + 3);
	aa1 += 4;
	
	*(bb1 + 0) = a11;
	*(bb1 + 1) = a21;
	*(bb1 + 2) = a31;
	*(bb1 + 3) = a41;

	*(cc1 + 0) = a11;
	*(cc1 + 1) = a21;
	*(cc2 + 0) = a31;
	*(cc2 + 1) = a41;
	bb1 += 4;
	
	cc1 += 4 * m;
	cc2 += 4 * m;
      }

      a11 = *(aa1 + 0);
      a21 = *(aa1 + 1);
      *(bb1 + 0) = a11;
      *(bb1 + 1) = a21;
    }
  }
}

#endif
#endif