|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*! @file scolumn_bmod.c
|
|
kusano |
7d535a |
* \brief performs numeric block updates
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* -- SuperLU routine (version 3.0) --
|
|
kusano |
7d535a |
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
|
|
kusano |
7d535a |
* and Lawrence Berkeley National Lab.
|
|
kusano |
7d535a |
* October 15, 2003
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Copyright (c) 1994 by Xerox Corporation. All rights reserved.
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
|
|
kusano |
7d535a |
* EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Permission is hereby granted to use or copy this program for any
|
|
kusano |
7d535a |
* purpose, provided the above notices are retained on all copies.
|
|
kusano |
7d535a |
* Permission to modify the code and to distribute modified code is
|
|
kusano |
7d535a |
* granted, provided the above notices are retained, and a notice that
|
|
kusano |
7d535a |
* the code was modified is included with the above copyright notice.
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#include <stdio.h></stdio.h>
|
|
kusano |
7d535a |
#include <stdlib.h></stdlib.h>
|
|
kusano |
7d535a |
#include "slu_sdefs.h"
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
* Function prototypes
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
void susolve(int, int, float*, float*);
|
|
kusano |
7d535a |
void slsolve(int, int, float*, float*);
|
|
kusano |
7d535a |
void smatvec(int, int, int, float*, float*, float*);
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*! \brief
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Purpose:
|
|
kusano |
7d535a |
* ========
|
|
kusano |
7d535a |
* Performs numeric block updates (sup-col) in topological order.
|
|
kusano |
7d535a |
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
|
|
kusano |
7d535a |
* Special processing on the supernodal portion of L\U[*,j]
|
|
kusano |
7d535a |
* Return value: 0 - successful return
|
|
kusano |
7d535a |
* > 0 - number of bytes allocated when run out of space
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
int
|
|
kusano |
7d535a |
scolumn_bmod (
|
|
kusano |
7d535a |
const int jcol, /* in */
|
|
kusano |
7d535a |
const int nseg, /* in */
|
|
kusano |
7d535a |
float *dense, /* in */
|
|
kusano |
7d535a |
float *tempv, /* working array */
|
|
kusano |
7d535a |
int *segrep, /* in */
|
|
kusano |
7d535a |
int *repfnz, /* in */
|
|
kusano |
7d535a |
int fpanelc, /* in -- first column in the current panel */
|
|
kusano |
7d535a |
GlobalLU_t *Glu, /* modified */
|
|
kusano |
7d535a |
SuperLUStat_t *stat /* output */
|
|
kusano |
7d535a |
)
|
|
kusano |
7d535a |
{
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
_fcd ftcs1 = _cptofcd("L", strlen("L")),
|
|
kusano |
7d535a |
ftcs2 = _cptofcd("N", strlen("N")),
|
|
kusano |
7d535a |
ftcs3 = _cptofcd("U", strlen("U"));
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
int incx = 1, incy = 1;
|
|
kusano |
7d535a |
float alpha, beta;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* krep = representative of current k-th supernode
|
|
kusano |
7d535a |
* fsupc = first supernodal column
|
|
kusano |
7d535a |
* nsupc = no of columns in supernode
|
|
kusano |
7d535a |
* nsupr = no of rows in supernode (used as leading dimension)
|
|
kusano |
7d535a |
* luptr = location of supernodal LU-block in storage
|
|
kusano |
7d535a |
* kfnz = first nonz in the k-th supernodal segment
|
|
kusano |
7d535a |
* no_zeros = no of leading zeros in a supernodal U-segment
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
float ukj, ukj1, ukj2;
|
|
kusano |
7d535a |
int luptr, luptr1, luptr2;
|
|
kusano |
7d535a |
int fsupc, nsupc, nsupr, segsze;
|
|
kusano |
7d535a |
int nrow; /* No of rows in the matrix of matrix-vector */
|
|
kusano |
7d535a |
int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
|
|
kusano |
7d535a |
register int lptr, kfnz, isub, irow, i;
|
|
kusano |
7d535a |
register int no_zeros, new_next;
|
|
kusano |
7d535a |
int ufirst, nextlu;
|
|
kusano |
7d535a |
int fst_col; /* First column within small LU update */
|
|
kusano |
7d535a |
int d_fsupc; /* Distance between the first column of the current
|
|
kusano |
7d535a |
panel and the first column of the current snode. */
|
|
kusano |
7d535a |
int *xsup, *supno;
|
|
kusano |
7d535a |
int *lsub, *xlsub;
|
|
kusano |
7d535a |
float *lusup;
|
|
kusano |
7d535a |
int *xlusup;
|
|
kusano |
7d535a |
int nzlumax;
|
|
kusano |
7d535a |
float *tempv1;
|
|
kusano |
7d535a |
float zero = 0.0;
|
|
kusano |
7d535a |
float one = 1.0;
|
|
kusano |
7d535a |
float none = -1.0;
|
|
kusano |
7d535a |
int mem_error;
|
|
kusano |
7d535a |
flops_t *ops = stat->ops;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
xsup = Glu->xsup;
|
|
kusano |
7d535a |
supno = Glu->supno;
|
|
kusano |
7d535a |
lsub = Glu->lsub;
|
|
kusano |
7d535a |
xlsub = Glu->xlsub;
|
|
kusano |
7d535a |
lusup = Glu->lusup;
|
|
kusano |
7d535a |
xlusup = Glu->xlusup;
|
|
kusano |
7d535a |
nzlumax = Glu->nzlumax;
|
|
kusano |
7d535a |
jcolp1 = jcol + 1;
|
|
kusano |
7d535a |
jsupno = supno[jcol];
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
* For each nonz supernode segment of U[*,j] in topological order
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
k = nseg - 1;
|
|
kusano |
7d535a |
for (ksub = 0; ksub < nseg; ksub++) {
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
krep = segrep[k];
|
|
kusano |
7d535a |
k--;
|
|
kusano |
7d535a |
ksupno = supno[krep];
|
|
kusano |
7d535a |
if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
fsupc = xsup[ksupno];
|
|
kusano |
7d535a |
fst_col = SUPERLU_MAX ( fsupc, fpanelc );
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Distance from the current supernode to the current panel;
|
|
kusano |
7d535a |
d_fsupc=0 if fsupc > fpanelc. */
|
|
kusano |
7d535a |
d_fsupc = fst_col - fsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
luptr = xlusup[fst_col] + d_fsupc;
|
|
kusano |
7d535a |
lptr = xlsub[fsupc] + d_fsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
kfnz = repfnz[krep];
|
|
kusano |
7d535a |
kfnz = SUPERLU_MAX ( kfnz, fpanelc );
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
segsze = krep - kfnz + 1;
|
|
kusano |
7d535a |
nsupc = krep - fst_col + 1;
|
|
kusano |
7d535a |
nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */
|
|
kusano |
7d535a |
nrow = nsupr - d_fsupc - nsupc;
|
|
kusano |
7d535a |
krep_ind = lptr + nsupc - 1;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
ops[TRSV] += segsze * (segsze - 1);
|
|
kusano |
7d535a |
ops[GEMV] += 2 * nrow * segsze;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
* Case 1: Update U-segment of size 1 -- col-col update
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
if ( segsze == 1 ) {
|
|
kusano |
7d535a |
ukj = dense[lsub[krep_ind]];
|
|
kusano |
7d535a |
luptr += nsupr*(nsupc-1) + nsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
dense[irow] -= ukj*lusup[luptr];
|
|
kusano |
7d535a |
luptr++;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} else if ( segsze <= 3 ) {
|
|
kusano |
7d535a |
ukj = dense[lsub[krep_ind]];
|
|
kusano |
7d535a |
luptr += nsupr*(nsupc-1) + nsupc-1;
|
|
kusano |
7d535a |
ukj1 = dense[lsub[krep_ind - 1]];
|
|
kusano |
7d535a |
luptr1 = luptr - nsupr;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
if ( segsze == 2 ) { /* Case 2: 2cols-col update */
|
|
kusano |
7d535a |
ukj -= ukj1 * lusup[luptr1];
|
|
kusano |
7d535a |
dense[lsub[krep_ind]] = ukj;
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
luptr++;
|
|
kusano |
7d535a |
luptr1++;
|
|
kusano |
7d535a |
dense[irow] -= ( ukj*lusup[luptr]
|
|
kusano |
7d535a |
+ ukj1*lusup[luptr1] );
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
} else { /* Case 3: 3cols-col update */
|
|
kusano |
7d535a |
ukj2 = dense[lsub[krep_ind - 2]];
|
|
kusano |
7d535a |
luptr2 = luptr1 - nsupr;
|
|
kusano |
7d535a |
ukj1 -= ukj2 * lusup[luptr2-1];
|
|
kusano |
7d535a |
ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
|
|
kusano |
7d535a |
dense[lsub[krep_ind]] = ukj;
|
|
kusano |
7d535a |
dense[lsub[krep_ind-1]] = ukj1;
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
luptr++;
|
|
kusano |
7d535a |
luptr1++;
|
|
kusano |
7d535a |
luptr2++;
|
|
kusano |
7d535a |
dense[irow] -= ( ukj*lusup[luptr]
|
|
kusano |
7d535a |
+ ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} else {
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
* Case: sup-col update
|
|
kusano |
7d535a |
* Perform a triangular solve and block update,
|
|
kusano |
7d535a |
* then scatter the result of sup-col update to dense
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
no_zeros = kfnz - fst_col;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Copy U[*,j] segment from dense[*] to tempv[*] */
|
|
kusano |
7d535a |
isub = lptr + no_zeros;
|
|
kusano |
7d535a |
for (i = 0; i < segsze; i++) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
tempv[i] = dense[irow];
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Dense triangular solve -- start effective triangle */
|
|
kusano |
7d535a |
luptr += nsupr * no_zeros + no_zeros;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#ifdef USE_VENDOR_BLAS
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, tempv, &incx );
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
strsv_( "L", "N", "U", &segsze, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, tempv, &incx );
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
luptr += segsze; /* Dense matrix-vector */
|
|
kusano |
7d535a |
tempv1 = &tempv[segsze];
|
|
kusano |
7d535a |
alpha = one;
|
|
kusano |
7d535a |
beta = zero;
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, tempv, &incx, &beta, tempv1, &incy );
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, tempv, &incx, &beta, tempv1, &incy );
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
slsolve ( nsupr, segsze, &lusup[luptr], tempv );
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
luptr += segsze; /* Dense matrix-vector */
|
|
kusano |
7d535a |
tempv1 = &tempv[segsze];
|
|
kusano |
7d535a |
smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Scatter tempv[] into SPA dense[] as a temporary storage */
|
|
kusano |
7d535a |
isub = lptr + no_zeros;
|
|
kusano |
7d535a |
for (i = 0; i < segsze; i++) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
dense[irow] = tempv[i];
|
|
kusano |
7d535a |
tempv[i] = zero;
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Scatter tempv1[] into SPA dense[] */
|
|
kusano |
7d535a |
for (i = 0; i < nrow; i++) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
dense[irow] -= tempv1[i];
|
|
kusano |
7d535a |
tempv1[i] = zero;
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* if jsupno ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* for each segment... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
* Process the supernodal portion of L\U[*,j]
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
nextlu = xlusup[jcol];
|
|
kusano |
7d535a |
fsupc = xsup[jsupno];
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Copy the SPA dense into L\U[*,j] */
|
|
kusano |
7d535a |
new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
|
|
kusano |
7d535a |
while ( new_next > nzlumax ) {
|
|
kusano |
7d535a |
if (mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))
|
|
kusano |
7d535a |
return (mem_error);
|
|
kusano |
7d535a |
lusup = Glu->lusup;
|
|
kusano |
7d535a |
lsub = Glu->lsub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
lusup[nextlu] = dense[irow];
|
|
kusano |
7d535a |
dense[irow] = zero;
|
|
kusano |
7d535a |
++nextlu;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* For more updates within the panel (also within the current supernode),
|
|
kusano |
7d535a |
* should start from the first column of the panel, or the first column
|
|
kusano |
7d535a |
* of the supernode, whichever is bigger. There are 2 cases:
|
|
kusano |
7d535a |
* 1) fsupc < fpanelc, then fst_col := fpanelc
|
|
kusano |
7d535a |
* 2) fsupc >= fpanelc, then fst_col := fsupc
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
fst_col = SUPERLU_MAX ( fsupc, fpanelc );
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
if ( fst_col < jcol ) {
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Distance between the current supernode and the current panel.
|
|
kusano |
7d535a |
d_fsupc=0 if fsupc >= fpanelc. */
|
|
kusano |
7d535a |
d_fsupc = fst_col - fsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
lptr = xlsub[fsupc] + d_fsupc;
|
|
kusano |
7d535a |
luptr = xlusup[fst_col] + d_fsupc;
|
|
kusano |
7d535a |
nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */
|
|
kusano |
7d535a |
nsupc = jcol - fst_col; /* Excluding jcol */
|
|
kusano |
7d535a |
nrow = nsupr - d_fsupc - nsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Points to the beginning of jcol in snode L\U(jsupno) */
|
|
kusano |
7d535a |
ufirst = xlusup[jcol] + d_fsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
ops[TRSV] += nsupc * (nsupc - 1);
|
|
kusano |
7d535a |
ops[GEMV] += 2 * nrow * nsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#ifdef USE_VENDOR_BLAS
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, &lusup[ufirst], &incx );
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
strsv_( "L", "N", "U", &nsupc, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, &lusup[ufirst], &incx );
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
alpha = none; beta = one; /* y := beta*y + alpha*A*x */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
|
|
kusano |
7d535a |
&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
|
|
kusano |
7d535a |
&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
|
|
kusano |
7d535a |
&lusup[ufirst], tempv );
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Copy updates from tempv[*] into lusup[*] */
|
|
kusano |
7d535a |
isub = ufirst + nsupc;
|
|
kusano |
7d535a |
for (i = 0; i < nrow; i++) {
|
|
kusano |
7d535a |
lusup[isub] -= tempv[i];
|
|
kusano |
7d535a |
tempv[i] = 0.0;
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* if fst_col < jcol ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
return 0;
|
|
kusano |
7d535a |
}
|