kusano 7d535a
kusano 7d535a
/*! @file scolumn_bmod.c
kusano 7d535a
 *  \brief performs numeric block updates
kusano 7d535a
 *
kusano 7d535a
 * 
kusano 7d535a
 * -- SuperLU routine (version 3.0) --
kusano 7d535a
 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
kusano 7d535a
 * and Lawrence Berkeley National Lab.
kusano 7d535a
 * October 15, 2003
kusano 7d535a
 *
kusano 7d535a
 * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
kusano 7d535a
 *
kusano 7d535a
 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
kusano 7d535a
 * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
kusano 7d535a
 * 
kusano 7d535a
 *  Permission is hereby granted to use or copy this program for any
kusano 7d535a
 *  purpose, provided the above notices are retained on all copies.
kusano 7d535a
 *  Permission to modify the code and to distribute modified code is
kusano 7d535a
 *  granted, provided the above notices are retained, and a notice that
kusano 7d535a
 *  the code was modified is included with the above copyright notice.
kusano 7d535a
 * 
kusano 7d535a
*/
kusano 7d535a
kusano 7d535a
#include <stdio.h></stdio.h>
kusano 7d535a
#include <stdlib.h></stdlib.h>
kusano 7d535a
#include "slu_sdefs.h"
kusano 7d535a
kusano 7d535a
/* 
kusano 7d535a
 * Function prototypes 
kusano 7d535a
 */
kusano 7d535a
void susolve(int, int, float*, float*);
kusano 7d535a
void slsolve(int, int, float*, float*);
kusano 7d535a
void smatvec(int, int, int, float*, float*, float*);
kusano 7d535a
kusano 7d535a
kusano 7d535a
kusano 7d535a
/*! \brief 
kusano 7d535a
 *
kusano 7d535a
 * 
kusano 7d535a
 * Purpose:
kusano 7d535a
 * ========
kusano 7d535a
 * Performs numeric block updates (sup-col) in topological order.
kusano 7d535a
 * It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
kusano 7d535a
 * Special processing on the supernodal portion of L\U[*,j]
kusano 7d535a
 * Return value:   0 - successful return
kusano 7d535a
 *               > 0 - number of bytes allocated when run out of space
kusano 7d535a
 * 
kusano 7d535a
 */
kusano 7d535a
int
kusano 7d535a
scolumn_bmod (
kusano 7d535a
	     const int  jcol,	  /* in */
kusano 7d535a
	     const int  nseg,	  /* in */
kusano 7d535a
	     float     *dense,	  /* in */
kusano 7d535a
	     float     *tempv,	  /* working array */
kusano 7d535a
	     int        *segrep,  /* in */
kusano 7d535a
	     int        *repfnz,  /* in */
kusano 7d535a
	     int        fpanelc,  /* in -- first column in the current panel */
kusano 7d535a
	     GlobalLU_t *Glu,     /* modified */
kusano 7d535a
	     SuperLUStat_t *stat  /* output */
kusano 7d535a
	     )
kusano 7d535a
{
kusano 7d535a
kusano 7d535a
#ifdef _CRAY
kusano 7d535a
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
kusano 7d535a
         ftcs2 = _cptofcd("N", strlen("N")),
kusano 7d535a
         ftcs3 = _cptofcd("U", strlen("U"));
kusano 7d535a
#endif
kusano 7d535a
    int         incx = 1, incy = 1;
kusano 7d535a
    float      alpha, beta;
kusano 7d535a
    
kusano 7d535a
    /* krep = representative of current k-th supernode
kusano 7d535a
     * fsupc = first supernodal column
kusano 7d535a
     * nsupc = no of columns in supernode
kusano 7d535a
     * nsupr = no of rows in supernode (used as leading dimension)
kusano 7d535a
     * luptr = location of supernodal LU-block in storage
kusano 7d535a
     * kfnz = first nonz in the k-th supernodal segment
kusano 7d535a
     * no_zeros = no of leading zeros in a supernodal U-segment
kusano 7d535a
     */
kusano 7d535a
    float       ukj, ukj1, ukj2;
kusano 7d535a
    int          luptr, luptr1, luptr2;
kusano 7d535a
    int          fsupc, nsupc, nsupr, segsze;
kusano 7d535a
    int          nrow;	  /* No of rows in the matrix of matrix-vector */
kusano 7d535a
    int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
kusano 7d535a
    register int lptr, kfnz, isub, irow, i;
kusano 7d535a
    register int no_zeros, new_next; 
kusano 7d535a
    int          ufirst, nextlu;
kusano 7d535a
    int          fst_col; /* First column within small LU update */
kusano 7d535a
    int          d_fsupc; /* Distance between the first column of the current
kusano 7d535a
			     panel and the first column of the current snode. */
kusano 7d535a
    int          *xsup, *supno;
kusano 7d535a
    int          *lsub, *xlsub;
kusano 7d535a
    float       *lusup;
kusano 7d535a
    int          *xlusup;
kusano 7d535a
    int          nzlumax;
kusano 7d535a
    float       *tempv1;
kusano 7d535a
    float      zero = 0.0;
kusano 7d535a
    float      one = 1.0;
kusano 7d535a
    float      none = -1.0;
kusano 7d535a
    int          mem_error;
kusano 7d535a
    flops_t      *ops = stat->ops;
kusano 7d535a
kusano 7d535a
    xsup    = Glu->xsup;
kusano 7d535a
    supno   = Glu->supno;
kusano 7d535a
    lsub    = Glu->lsub;
kusano 7d535a
    xlsub   = Glu->xlsub;
kusano 7d535a
    lusup   = Glu->lusup;
kusano 7d535a
    xlusup  = Glu->xlusup;
kusano 7d535a
    nzlumax = Glu->nzlumax;
kusano 7d535a
    jcolp1 = jcol + 1;
kusano 7d535a
    jsupno = supno[jcol];
kusano 7d535a
    
kusano 7d535a
    /* 
kusano 7d535a
     * For each nonz supernode segment of U[*,j] in topological order 
kusano 7d535a
     */
kusano 7d535a
    k = nseg - 1;
kusano 7d535a
    for (ksub = 0; ksub < nseg; ksub++) {
kusano 7d535a
kusano 7d535a
	krep = segrep[k];
kusano 7d535a
	k--;
kusano 7d535a
	ksupno = supno[krep];
kusano 7d535a
	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
kusano 7d535a
kusano 7d535a
	    fsupc = xsup[ksupno];
kusano 7d535a
	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
kusano 7d535a
kusano 7d535a
  	    /* Distance from the current supernode to the current panel; 
kusano 7d535a
	       d_fsupc=0 if fsupc > fpanelc. */
kusano 7d535a
  	    d_fsupc = fst_col - fsupc; 
kusano 7d535a
kusano 7d535a
	    luptr = xlusup[fst_col] + d_fsupc;
kusano 7d535a
	    lptr = xlsub[fsupc] + d_fsupc;
kusano 7d535a
kusano 7d535a
	    kfnz = repfnz[krep];
kusano 7d535a
	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
kusano 7d535a
kusano 7d535a
	    segsze = krep - kfnz + 1;
kusano 7d535a
	    nsupc = krep - fst_col + 1;
kusano 7d535a
	    nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
kusano 7d535a
	    nrow = nsupr - d_fsupc - nsupc;
kusano 7d535a
	    krep_ind = lptr + nsupc - 1;
kusano 7d535a
kusano 7d535a
	    ops[TRSV] += segsze * (segsze - 1);
kusano 7d535a
	    ops[GEMV] += 2 * nrow * segsze;
kusano 7d535a
kusano 7d535a
kusano 7d535a
	    /* 
kusano 7d535a
	     * Case 1: Update U-segment of size 1 -- col-col update 
kusano 7d535a
	     */
kusano 7d535a
	    if ( segsze == 1 ) {
kusano 7d535a
	  	ukj = dense[lsub[krep_ind]];
kusano 7d535a
		luptr += nsupr*(nsupc-1) + nsupc;
kusano 7d535a
kusano 7d535a
		for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
kusano 7d535a
		    irow = lsub[i];
kusano 7d535a
		    dense[irow] -=  ukj*lusup[luptr];
kusano 7d535a
		    luptr++;
kusano 7d535a
		}
kusano 7d535a
kusano 7d535a
	    } else if ( segsze <= 3 ) {
kusano 7d535a
		ukj = dense[lsub[krep_ind]];
kusano 7d535a
		luptr += nsupr*(nsupc-1) + nsupc-1;
kusano 7d535a
		ukj1 = dense[lsub[krep_ind - 1]];
kusano 7d535a
		luptr1 = luptr - nsupr;
kusano 7d535a
kusano 7d535a
		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
kusano 7d535a
		    ukj -= ukj1 * lusup[luptr1];
kusano 7d535a
		    dense[lsub[krep_ind]] = ukj;
kusano 7d535a
		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
kusano 7d535a
		    	irow = lsub[i];
kusano 7d535a
		    	luptr++;
kusano 7d535a
		    	luptr1++;
kusano 7d535a
		    	dense[irow] -= ( ukj*lusup[luptr]
kusano 7d535a
					+ ukj1*lusup[luptr1] );
kusano 7d535a
		    }
kusano 7d535a
		} else { /* Case 3: 3cols-col update */
kusano 7d535a
		    ukj2 = dense[lsub[krep_ind - 2]];
kusano 7d535a
		    luptr2 = luptr1 - nsupr;
kusano 7d535a
		    ukj1 -= ukj2 * lusup[luptr2-1];
kusano 7d535a
		    ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
kusano 7d535a
		    dense[lsub[krep_ind]] = ukj;
kusano 7d535a
		    dense[lsub[krep_ind-1]] = ukj1;
kusano 7d535a
		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
kusano 7d535a
		    	irow = lsub[i];
kusano 7d535a
		    	luptr++;
kusano 7d535a
		    	luptr1++;
kusano 7d535a
			luptr2++;
kusano 7d535a
		    	dense[irow] -= ( ukj*lusup[luptr]
kusano 7d535a
			     + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
kusano 7d535a
		    }
kusano 7d535a
		}
kusano 7d535a
kusano 7d535a
kusano 7d535a
kusano 7d535a
	    } else {
kusano 7d535a
	  	/*
kusano 7d535a
		 * Case: sup-col update
kusano 7d535a
		 * Perform a triangular solve and block update,
kusano 7d535a
		 * then scatter the result of sup-col update to dense
kusano 7d535a
		 */
kusano 7d535a
kusano 7d535a
		no_zeros = kfnz - fst_col;
kusano 7d535a
kusano 7d535a
	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
kusano 7d535a
	        isub = lptr + no_zeros;
kusano 7d535a
	        for (i = 0; i < segsze; i++) {
kusano 7d535a
	  	    irow = lsub[isub];
kusano 7d535a
		    tempv[i] = dense[irow];
kusano 7d535a
		    ++isub; 
kusano 7d535a
	        }
kusano 7d535a
kusano 7d535a
	        /* Dense triangular solve -- start effective triangle */
kusano 7d535a
		luptr += nsupr * no_zeros + no_zeros; 
kusano 7d535a
		
kusano 7d535a
#ifdef USE_VENDOR_BLAS
kusano 7d535a
#ifdef _CRAY
kusano 7d535a
		STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
kusano 7d535a
		       &nsupr, tempv, &incx );
kusano 7d535a
#else		
kusano 7d535a
		strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
kusano 7d535a
		       &nsupr, tempv, &incx );
kusano 7d535a
#endif		
kusano 7d535a
 		luptr += segsze;  /* Dense matrix-vector */
kusano 7d535a
		tempv1 = &tempv[segsze];
kusano 7d535a
                alpha = one;
kusano 7d535a
                beta = zero;
kusano 7d535a
#ifdef _CRAY
kusano 7d535a
		SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
kusano 7d535a
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
kusano 7d535a
#else
kusano 7d535a
		sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
kusano 7d535a
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
kusano 7d535a
#endif
kusano 7d535a
#else
kusano 7d535a
		slsolve ( nsupr, segsze, &lusup[luptr], tempv );
kusano 7d535a
kusano 7d535a
 		luptr += segsze;  /* Dense matrix-vector */
kusano 7d535a
		tempv1 = &tempv[segsze];
kusano 7d535a
		smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
kusano 7d535a
#endif
kusano 7d535a
		
kusano 7d535a
		
kusano 7d535a
                /* Scatter tempv[] into SPA dense[] as a temporary storage */
kusano 7d535a
                isub = lptr + no_zeros;
kusano 7d535a
                for (i = 0; i < segsze; i++) {
kusano 7d535a
                    irow = lsub[isub];
kusano 7d535a
                    dense[irow] = tempv[i];
kusano 7d535a
                    tempv[i] = zero;
kusano 7d535a
                    ++isub;
kusano 7d535a
                }
kusano 7d535a
kusano 7d535a
		/* Scatter tempv1[] into SPA dense[] */
kusano 7d535a
		for (i = 0; i < nrow; i++) {
kusano 7d535a
		    irow = lsub[isub];
kusano 7d535a
		    dense[irow] -= tempv1[i];
kusano 7d535a
		    tempv1[i] = zero;
kusano 7d535a
		    ++isub;
kusano 7d535a
		}
kusano 7d535a
	    }
kusano 7d535a
	    
kusano 7d535a
	} /* if jsupno ... */
kusano 7d535a
kusano 7d535a
    } /* for each segment... */
kusano 7d535a
kusano 7d535a
    /*
kusano 7d535a
     *	Process the supernodal portion of L\U[*,j]
kusano 7d535a
     */
kusano 7d535a
    nextlu = xlusup[jcol];
kusano 7d535a
    fsupc = xsup[jsupno];
kusano 7d535a
kusano 7d535a
    /* Copy the SPA dense into L\U[*,j] */
kusano 7d535a
    new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
kusano 7d535a
    while ( new_next > nzlumax ) {
kusano 7d535a
	if (mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))
kusano 7d535a
	    return (mem_error);
kusano 7d535a
	lusup = Glu->lusup;
kusano 7d535a
	lsub = Glu->lsub;
kusano 7d535a
    }
kusano 7d535a
kusano 7d535a
    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
kusano 7d535a
  	irow = lsub[isub];
kusano 7d535a
	lusup[nextlu] = dense[irow];
kusano 7d535a
        dense[irow] = zero;
kusano 7d535a
	++nextlu;
kusano 7d535a
    }
kusano 7d535a
kusano 7d535a
    xlusup[jcolp1] = nextlu;	/* Close L\U[*,jcol] */
kusano 7d535a
kusano 7d535a
    /* For more updates within the panel (also within the current supernode), 
kusano 7d535a
     * should start from the first column of the panel, or the first column 
kusano 7d535a
     * of the supernode, whichever is bigger. There are 2 cases:
kusano 7d535a
     *    1) fsupc < fpanelc, then fst_col := fpanelc
kusano 7d535a
     *    2) fsupc >= fpanelc, then fst_col := fsupc
kusano 7d535a
     */
kusano 7d535a
    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
kusano 7d535a
kusano 7d535a
    if ( fst_col < jcol ) {
kusano 7d535a
kusano 7d535a
  	/* Distance between the current supernode and the current panel.
kusano 7d535a
	   d_fsupc=0 if fsupc >= fpanelc. */
kusano 7d535a
  	d_fsupc = fst_col - fsupc;
kusano 7d535a
kusano 7d535a
	lptr = xlsub[fsupc] + d_fsupc;
kusano 7d535a
	luptr = xlusup[fst_col] + d_fsupc;
kusano 7d535a
	nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
kusano 7d535a
	nsupc = jcol - fst_col;	/* Excluding jcol */
kusano 7d535a
	nrow = nsupr - d_fsupc - nsupc;
kusano 7d535a
kusano 7d535a
	/* Points to the beginning of jcol in snode L\U(jsupno) */
kusano 7d535a
	ufirst = xlusup[jcol] + d_fsupc;	
kusano 7d535a
kusano 7d535a
	ops[TRSV] += nsupc * (nsupc - 1);
kusano 7d535a
	ops[GEMV] += 2 * nrow * nsupc;
kusano 7d535a
	
kusano 7d535a
#ifdef USE_VENDOR_BLAS
kusano 7d535a
#ifdef _CRAY
kusano 7d535a
	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
kusano 7d535a
	       &nsupr, &lusup[ufirst], &incx );
kusano 7d535a
#else
kusano 7d535a
	strsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
kusano 7d535a
	       &nsupr, &lusup[ufirst], &incx );
kusano 7d535a
#endif
kusano 7d535a
	
kusano 7d535a
	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
kusano 7d535a
kusano 7d535a
#ifdef _CRAY
kusano 7d535a
	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
kusano 7d535a
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
kusano 7d535a
#else
kusano 7d535a
	sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
kusano 7d535a
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
kusano 7d535a
#endif
kusano 7d535a
#else
kusano 7d535a
	slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
kusano 7d535a
kusano 7d535a
	smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
kusano 7d535a
		&lusup[ufirst], tempv );
kusano 7d535a
	
kusano 7d535a
        /* Copy updates from tempv[*] into lusup[*] */
kusano 7d535a
	isub = ufirst + nsupc;
kusano 7d535a
	for (i = 0; i < nrow; i++) {
kusano 7d535a
	    lusup[isub] -= tempv[i];
kusano 7d535a
	    tempv[i] = 0.0;
kusano 7d535a
	    ++isub;
kusano 7d535a
	}
kusano 7d535a
kusano 7d535a
#endif
kusano 7d535a
	
kusano 7d535a
	
kusano 7d535a
    } /* if fst_col < jcol ... */ 
kusano 7d535a
kusano 7d535a
    return 0;
kusano 7d535a
}