|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*! @file zpanel_bmod.c
|
|
kusano |
7d535a |
* \brief Performs numeric block updates
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* -- SuperLU routine (version 3.0) --
|
|
kusano |
7d535a |
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
|
|
kusano |
7d535a |
* and Lawrence Berkeley National Lab.
|
|
kusano |
7d535a |
* October 15, 2003
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Copyright (c) 1994 by Xerox Corporation. All rights reserved.
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
|
|
kusano |
7d535a |
* EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Permission is hereby granted to use or copy this program for any
|
|
kusano |
7d535a |
* purpose, provided the above notices are retained on all copies.
|
|
kusano |
7d535a |
* Permission to modify the code and to distribute modified code is
|
|
kusano |
7d535a |
* granted, provided the above notices are retained, and a notice that
|
|
kusano |
7d535a |
* the code was modified is included with the above copyright notice.
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#include <stdio.h></stdio.h>
|
|
kusano |
7d535a |
#include <stdlib.h></stdlib.h>
|
|
kusano |
7d535a |
#include "slu_zdefs.h"
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
* Function prototypes
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
void zlsolve(int, int, doublecomplex *, doublecomplex *);
|
|
kusano |
7d535a |
void zmatvec(int, int, int, doublecomplex *, doublecomplex *, doublecomplex *);
|
|
kusano |
7d535a |
extern void zcheck_tempv();
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*! \brief
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Purpose
|
|
kusano |
7d535a |
* =======
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Performs numeric block updates (sup-panel) in topological order.
|
|
kusano |
7d535a |
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
|
|
kusano |
7d535a |
* Special processing on the supernodal portion of L\U[*,j]
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Before entering this routine, the original nonzeros in the panel
|
|
kusano |
7d535a |
* were already copied into the spa[m,w].
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
* Updated/Output parameters-
|
|
kusano |
7d535a |
* dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned
|
|
kusano |
7d535a |
* collectively in the m-by-w vector dense[*].
|
|
kusano |
7d535a |
*
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
void
|
|
kusano |
7d535a |
zpanel_bmod (
|
|
kusano |
7d535a |
const int m, /* in - number of rows in the matrix */
|
|
kusano |
7d535a |
const int w, /* in */
|
|
kusano |
7d535a |
const int jcol, /* in */
|
|
kusano |
7d535a |
const int nseg, /* in */
|
|
kusano |
7d535a |
doublecomplex *dense, /* out, of size n by w */
|
|
kusano |
7d535a |
doublecomplex *tempv, /* working array */
|
|
kusano |
7d535a |
int *segrep, /* in */
|
|
kusano |
7d535a |
int *repfnz, /* in, of size n by w */
|
|
kusano |
7d535a |
GlobalLU_t *Glu, /* modified */
|
|
kusano |
7d535a |
SuperLUStat_t *stat /* output */
|
|
kusano |
7d535a |
)
|
|
kusano |
7d535a |
{
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#ifdef USE_VENDOR_BLAS
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
_fcd ftcs1 = _cptofcd("L", strlen("L")),
|
|
kusano |
7d535a |
ftcs2 = _cptofcd("N", strlen("N")),
|
|
kusano |
7d535a |
ftcs3 = _cptofcd("U", strlen("U"));
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
int incx = 1, incy = 1;
|
|
kusano |
7d535a |
doublecomplex alpha, beta;
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
register int k, ksub;
|
|
kusano |
7d535a |
int fsupc, nsupc, nsupr, nrow;
|
|
kusano |
7d535a |
int krep, krep_ind;
|
|
kusano |
7d535a |
doublecomplex ukj, ukj1, ukj2;
|
|
kusano |
7d535a |
int luptr, luptr1, luptr2;
|
|
kusano |
7d535a |
int segsze;
|
|
kusano |
7d535a |
int block_nrow; /* no of rows in a block row */
|
|
kusano |
7d535a |
register int lptr; /* Points to the row subscripts of a supernode */
|
|
kusano |
7d535a |
int kfnz, irow, no_zeros;
|
|
kusano |
7d535a |
register int isub, isub1, i;
|
|
kusano |
7d535a |
register int jj; /* Index through each column in the panel */
|
|
kusano |
7d535a |
int *xsup, *supno;
|
|
kusano |
7d535a |
int *lsub, *xlsub;
|
|
kusano |
7d535a |
doublecomplex *lusup;
|
|
kusano |
7d535a |
int *xlusup;
|
|
kusano |
7d535a |
int *repfnz_col; /* repfnz[] for a column in the panel */
|
|
kusano |
7d535a |
doublecomplex *dense_col; /* dense[] for a column in the panel */
|
|
kusano |
7d535a |
doublecomplex *tempv1; /* Used in 1-D update */
|
|
kusano |
7d535a |
doublecomplex *TriTmp, *MatvecTmp; /* used in 2-D update */
|
|
kusano |
7d535a |
doublecomplex zero = {0.0, 0.0};
|
|
kusano |
7d535a |
doublecomplex one = {1.0, 0.0};
|
|
kusano |
7d535a |
doublecomplex comp_temp, comp_temp1;
|
|
kusano |
7d535a |
register int ldaTmp;
|
|
kusano |
7d535a |
register int r_ind, r_hi;
|
|
kusano |
7d535a |
static int first = 1, maxsuper, rowblk, colblk;
|
|
kusano |
7d535a |
flops_t *ops = stat->ops;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
xsup = Glu->xsup;
|
|
kusano |
7d535a |
supno = Glu->supno;
|
|
kusano |
7d535a |
lsub = Glu->lsub;
|
|
kusano |
7d535a |
xlsub = Glu->xlsub;
|
|
kusano |
7d535a |
lusup = Glu->lusup;
|
|
kusano |
7d535a |
xlusup = Glu->xlusup;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
if ( first ) {
|
|
kusano |
7d535a |
maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) );
|
|
kusano |
7d535a |
rowblk = sp_ienv(4);
|
|
kusano |
7d535a |
colblk = sp_ienv(5);
|
|
kusano |
7d535a |
first = 0;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
ldaTmp = maxsuper + rowblk;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
* For each nonz supernode segment of U[*,j] in topological order
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
k = nseg - 1;
|
|
kusano |
7d535a |
for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* krep = representative of current k-th supernode
|
|
kusano |
7d535a |
* fsupc = first supernodal column
|
|
kusano |
7d535a |
* nsupc = no of columns in a supernode
|
|
kusano |
7d535a |
* nsupr = no of rows in a supernode
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
krep = segrep[k--];
|
|
kusano |
7d535a |
fsupc = xsup[supno[krep]];
|
|
kusano |
7d535a |
nsupc = krep - fsupc + 1;
|
|
kusano |
7d535a |
nsupr = xlsub[fsupc+1] - xlsub[fsupc];
|
|
kusano |
7d535a |
nrow = nsupr - nsupc;
|
|
kusano |
7d535a |
lptr = xlsub[fsupc];
|
|
kusano |
7d535a |
krep_ind = lptr + nsupc - 1;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
repfnz_col = repfnz;
|
|
kusano |
7d535a |
dense_col = dense;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
TriTmp = tempv;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Sequence through each column in panel -- triangular solves */
|
|
kusano |
7d535a |
for (jj = jcol; jj < jcol + w; jj++,
|
|
kusano |
7d535a |
repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
kfnz = repfnz_col[krep];
|
|
kusano |
7d535a |
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
segsze = krep - kfnz + 1;
|
|
kusano |
7d535a |
luptr = xlusup[fsupc];
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
ops[TRSV] += 4 * segsze * (segsze - 1);
|
|
kusano |
7d535a |
ops[GEMV] += 8 * nrow * segsze;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Case 1: Update U-segment of size 1 -- col-col update */
|
|
kusano |
7d535a |
if ( segsze == 1 ) {
|
|
kusano |
7d535a |
ukj = dense_col[lsub[krep_ind]];
|
|
kusano |
7d535a |
luptr += nsupr*(nsupc-1) + nsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
|
|
kusano |
7d535a |
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
|
|
kusano |
7d535a |
++luptr;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} else if ( segsze <= 3 ) {
|
|
kusano |
7d535a |
ukj = dense_col[lsub[krep_ind]];
|
|
kusano |
7d535a |
ukj1 = dense_col[lsub[krep_ind - 1]];
|
|
kusano |
7d535a |
luptr += nsupr*(nsupc-1) + nsupc-1;
|
|
kusano |
7d535a |
luptr1 = luptr - nsupr;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
if ( segsze == 2 ) {
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
|
|
kusano |
7d535a |
z_sub(&ukj, &ukj, &comp_temp);
|
|
kusano |
7d535a |
dense_col[lsub[krep_ind]] = ukj;
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
luptr++; luptr1++;
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
|
|
kusano |
7d535a |
zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
|
|
kusano |
7d535a |
z_add(&comp_temp, &comp_temp, &comp_temp1);
|
|
kusano |
7d535a |
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
} else {
|
|
kusano |
7d535a |
ukj2 = dense_col[lsub[krep_ind - 2]];
|
|
kusano |
7d535a |
luptr2 = luptr1 - nsupr;
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
|
|
kusano |
7d535a |
z_sub(&ukj1, &ukj1, &comp_temp);
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
|
|
kusano |
7d535a |
zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
|
|
kusano |
7d535a |
z_add(&comp_temp, &comp_temp, &comp_temp1);
|
|
kusano |
7d535a |
z_sub(&ukj, &ukj, &comp_temp);
|
|
kusano |
7d535a |
dense_col[lsub[krep_ind]] = ukj;
|
|
kusano |
7d535a |
dense_col[lsub[krep_ind-1]] = ukj1;
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
luptr++; luptr1++; luptr2++;
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
|
|
kusano |
7d535a |
zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
|
|
kusano |
7d535a |
z_add(&comp_temp, &comp_temp, &comp_temp1);
|
|
kusano |
7d535a |
zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
|
|
kusano |
7d535a |
z_add(&comp_temp, &comp_temp, &comp_temp1);
|
|
kusano |
7d535a |
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} else { /* segsze >= 4 */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Copy U[*,j] segment from dense[*] to TriTmp[*], which
|
|
kusano |
7d535a |
holds the result of triangular solves. */
|
|
kusano |
7d535a |
no_zeros = kfnz - fsupc;
|
|
kusano |
7d535a |
isub = lptr + no_zeros;
|
|
kusano |
7d535a |
for (i = 0; i < segsze; ++i) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
TriTmp[i] = dense_col[irow]; /* Gather */
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* start effective triangle */
|
|
kusano |
7d535a |
luptr += nsupr * no_zeros + no_zeros;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#ifdef USE_VENDOR_BLAS
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, TriTmp, &incx );
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
ztrsv_( "L", "N", "U", &segsze, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, TriTmp, &incx );
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* else ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* for jj ... end tri-solves */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Block row updates; push all the way into dense[*] block */
|
|
kusano |
7d535a |
for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
|
|
kusano |
7d535a |
block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
|
|
kusano |
7d535a |
luptr = xlusup[fsupc] + nsupc + r_ind;
|
|
kusano |
7d535a |
isub1 = lptr + nsupc + r_ind;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
repfnz_col = repfnz;
|
|
kusano |
7d535a |
TriTmp = tempv;
|
|
kusano |
7d535a |
dense_col = dense;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Sequence through each column in panel -- matrix-vector */
|
|
kusano |
7d535a |
for (jj = jcol; jj < jcol + w; jj++,
|
|
kusano |
7d535a |
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
kfnz = repfnz_col[krep];
|
|
kusano |
7d535a |
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
segsze = krep - kfnz + 1;
|
|
kusano |
7d535a |
if ( segsze <= 3 ) continue; /* skip unrolled cases */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Perform a block update, and scatter the result of
|
|
kusano |
7d535a |
matrix-vector to dense[]. */
|
|
kusano |
7d535a |
no_zeros = kfnz - fsupc;
|
|
kusano |
7d535a |
luptr1 = luptr + nsupr * no_zeros;
|
|
kusano |
7d535a |
MatvecTmp = &TriTmp[maxsuper];
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#ifdef USE_VENDOR_BLAS
|
|
kusano |
7d535a |
alpha = one;
|
|
kusano |
7d535a |
beta = zero;
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
|
|
kusano |
7d535a |
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
|
|
kusano |
7d535a |
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
|
|
kusano |
7d535a |
TriTmp, MatvecTmp);
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Scatter MatvecTmp[*] into SPA dense[*] temporarily
|
|
kusano |
7d535a |
* such that MatvecTmp[*] can be re-used for the
|
|
kusano |
7d535a |
* the next blok row update. dense[] will be copied into
|
|
kusano |
7d535a |
* global store after the whole panel has been finished.
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
isub = isub1;
|
|
kusano |
7d535a |
for (i = 0; i < block_nrow; i++) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
z_sub(&dense_col[irow], &dense_col[irow],
|
|
kusano |
7d535a |
&MatvecTmp[i]);
|
|
kusano |
7d535a |
MatvecTmp[i] = zero;
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* for jj ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* for each block row ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Scatter the triangular solves into SPA dense[*] */
|
|
kusano |
7d535a |
repfnz_col = repfnz;
|
|
kusano |
7d535a |
TriTmp = tempv;
|
|
kusano |
7d535a |
dense_col = dense;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
for (jj = jcol; jj < jcol + w; jj++,
|
|
kusano |
7d535a |
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
|
|
kusano |
7d535a |
kfnz = repfnz_col[krep];
|
|
kusano |
7d535a |
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
segsze = krep - kfnz + 1;
|
|
kusano |
7d535a |
if ( segsze <= 3 ) continue; /* skip unrolled cases */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
no_zeros = kfnz - fsupc;
|
|
kusano |
7d535a |
isub = lptr + no_zeros;
|
|
kusano |
7d535a |
for (i = 0; i < segsze; i++) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
dense_col[irow] = TriTmp[i];
|
|
kusano |
7d535a |
TriTmp[i] = zero;
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* for jj ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} else { /* 1-D block modification */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Sequence through each column in the panel */
|
|
kusano |
7d535a |
for (jj = jcol; jj < jcol + w; jj++,
|
|
kusano |
7d535a |
repfnz_col += m, dense_col += m) {
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
kfnz = repfnz_col[krep];
|
|
kusano |
7d535a |
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
segsze = krep - kfnz + 1;
|
|
kusano |
7d535a |
luptr = xlusup[fsupc];
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
ops[TRSV] += 4 * segsze * (segsze - 1);
|
|
kusano |
7d535a |
ops[GEMV] += 8 * nrow * segsze;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Case 1: Update U-segment of size 1 -- col-col update */
|
|
kusano |
7d535a |
if ( segsze == 1 ) {
|
|
kusano |
7d535a |
ukj = dense_col[lsub[krep_ind]];
|
|
kusano |
7d535a |
luptr += nsupr*(nsupc-1) + nsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
|
|
kusano |
7d535a |
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
|
|
kusano |
7d535a |
++luptr;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} else if ( segsze <= 3 ) {
|
|
kusano |
7d535a |
ukj = dense_col[lsub[krep_ind]];
|
|
kusano |
7d535a |
luptr += nsupr*(nsupc-1) + nsupc-1;
|
|
kusano |
7d535a |
ukj1 = dense_col[lsub[krep_ind - 1]];
|
|
kusano |
7d535a |
luptr1 = luptr - nsupr;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
if ( segsze == 2 ) {
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
|
|
kusano |
7d535a |
z_sub(&ukj, &ukj, &comp_temp);
|
|
kusano |
7d535a |
dense_col[lsub[krep_ind]] = ukj;
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
++luptr; ++luptr1;
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
|
|
kusano |
7d535a |
zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
|
|
kusano |
7d535a |
z_add(&comp_temp, &comp_temp, &comp_temp1);
|
|
kusano |
7d535a |
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
} else {
|
|
kusano |
7d535a |
ukj2 = dense_col[lsub[krep_ind - 2]];
|
|
kusano |
7d535a |
luptr2 = luptr1 - nsupr;
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
|
|
kusano |
7d535a |
z_sub(&ukj1, &ukj1, &comp_temp);
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
|
|
kusano |
7d535a |
zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
|
|
kusano |
7d535a |
z_add(&comp_temp, &comp_temp, &comp_temp1);
|
|
kusano |
7d535a |
z_sub(&ukj, &ukj, &comp_temp);
|
|
kusano |
7d535a |
dense_col[lsub[krep_ind]] = ukj;
|
|
kusano |
7d535a |
dense_col[lsub[krep_ind-1]] = ukj1;
|
|
kusano |
7d535a |
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
|
|
kusano |
7d535a |
irow = lsub[i];
|
|
kusano |
7d535a |
++luptr; ++luptr1; ++luptr2;
|
|
kusano |
7d535a |
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
|
|
kusano |
7d535a |
zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
|
|
kusano |
7d535a |
z_add(&comp_temp, &comp_temp, &comp_temp1);
|
|
kusano |
7d535a |
zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
|
|
kusano |
7d535a |
z_add(&comp_temp, &comp_temp, &comp_temp1);
|
|
kusano |
7d535a |
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} else { /* segsze >= 4 */
|
|
kusano |
7d535a |
/*
|
|
kusano |
7d535a |
* Perform a triangular solve and block update,
|
|
kusano |
7d535a |
* then scatter the result of sup-col update to dense[].
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
no_zeros = kfnz - fsupc;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Copy U[*,j] segment from dense[*] to tempv[*]:
|
|
kusano |
7d535a |
* The result of triangular solve is in tempv[*];
|
|
kusano |
7d535a |
* The result of matrix vector update is in dense_col[*]
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
isub = lptr + no_zeros;
|
|
kusano |
7d535a |
for (i = 0; i < segsze; ++i) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
tempv[i] = dense_col[irow]; /* Gather */
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* start effective triangle */
|
|
kusano |
7d535a |
luptr += nsupr * no_zeros + no_zeros;
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
#ifdef USE_VENDOR_BLAS
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, tempv, &incx );
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
ztrsv_( "L", "N", "U", &segsze, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, tempv, &incx );
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
luptr += segsze; /* Dense matrix-vector */
|
|
kusano |
7d535a |
tempv1 = &tempv[segsze];
|
|
kusano |
7d535a |
alpha = one;
|
|
kusano |
7d535a |
beta = zero;
|
|
kusano |
7d535a |
#ifdef _CRAY
|
|
kusano |
7d535a |
CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, tempv, &incx, &beta, tempv1, &incy );
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
|
|
kusano |
7d535a |
&nsupr, tempv, &incx, &beta, tempv1, &incy );
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
#else
|
|
kusano |
7d535a |
zlsolve ( nsupr, segsze, &lusup[luptr], tempv );
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
luptr += segsze; /* Dense matrix-vector */
|
|
kusano |
7d535a |
tempv1 = &tempv[segsze];
|
|
kusano |
7d535a |
zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
|
|
kusano |
7d535a |
#endif
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Scatter tempv[*] into SPA dense[*] temporarily, such
|
|
kusano |
7d535a |
* that tempv[*] can be used for the triangular solve of
|
|
kusano |
7d535a |
* the next column of the panel. They will be copied into
|
|
kusano |
7d535a |
* ucol[*] after the whole panel has been finished.
|
|
kusano |
7d535a |
*/
|
|
kusano |
7d535a |
isub = lptr + no_zeros;
|
|
kusano |
7d535a |
for (i = 0; i < segsze; i++) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
dense_col[irow] = tempv[i];
|
|
kusano |
7d535a |
tempv[i] = zero;
|
|
kusano |
7d535a |
isub++;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
/* Scatter the update from tempv1[*] into SPA dense[*] */
|
|
kusano |
7d535a |
/* Start dense rectangular L */
|
|
kusano |
7d535a |
for (i = 0; i < nrow; i++) {
|
|
kusano |
7d535a |
irow = lsub[isub];
|
|
kusano |
7d535a |
z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
|
|
kusano |
7d535a |
tempv1[i] = zero;
|
|
kusano |
7d535a |
++isub;
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* else segsze>=4 ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* for each column in the panel... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* else 1-D update ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
} /* for each updating supernode ... */
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
}
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|
|
kusano |
7d535a |
|