#include "linalg.h" typedef void (*linop_blas_t)(QLA_Complex *out, QLA_Complex *in, void *args); /* regular CG */ QOP_status_t QOPPCV(invert_cg_blas)(linop_blas_t linop, void *args, QOP_invert_arg_t *inv_arg, QOP_resid_arg_t *res_arg, QLA_Complex *out, QLA_Complex *in, int n) { QLA_Real a; QLA_Real rsq, oldrsq, pkp, relnorm2; QLA_Real insq; QLA_Real rsqstop; QLA_Complex *r, *p, *Mp; int iteration=0, total_iterations=0, nrestart=-1; int restart_iterations=inv_arg->restart; int max_iterations=inv_arg->max_iter; int max_restarts=inv_arg->max_restarts; if(max_restarts<0) max_restarts = 5; r = (QLA_Complex *) malloc(n*sizeof(QLA_Complex)); p = (QLA_Complex *) malloc(n*sizeof(QLA_Complex)); Mp = (QLA_Complex *) malloc(n*sizeof(QLA_Complex)); insq = norm2_v(in, n); rsqstop = res_arg->rsqmin * insq; VERB(LOW, "CG: rsqstop = %g\n", rsqstop); rsq = 0; relnorm2 = 1.; oldrsq = rsq; /* Default output values unless reassigned */ res_arg->final_rsq = 0; res_arg->final_rel = 0; res_arg->final_iter = 0; res_arg->final_restart = 0; /* Special case of exactly zero source */ if(insq == 0.){ VERB(LOW, "CG: exiting because of zero source\n"); free(Mp); free(p); free(r); v_eq_zero(out, n); return QOP_SUCCESS; } while(1) { if( (total_iterations==0) || (iteration>=restart_iterations) || (total_iterations>=max_iterations) || ((rsqstop <= 0 || rsqrelmin <= 0 || relnorm2relmin)) ){ /* only way out */ /* stop when we exhaust iterations */ if( (total_iterations>=max_iterations) || (nrestart>=max_restarts) ) break; /* otherwise, restart */ nrestart++; /* compute true residual */ linop(Mp, out, args); iteration = 1; total_iterations++; v_eq_v_minus_v(r, in, Mp, n); rsq = norm2_v(r, n); /* compute FNAL norm if requested */ if(res_arg->relmin > 0) relnorm2 = relnorm2_v(r, out, n); VERB(LOW, "CG: (re)start: iter %i rsq = %g rel = %g\n", total_iterations, rsq, relnorm2); /* stop here if converged */ if( ((rsqstop <= 0 || rsqrelmin <= 0 || relnorm2relmin)) || (total_iterations>=max_iterations) ) break; //v_eq_v(p, r, n); QLA_Real s = 1.0 / rsq; v_eq_r_times_v(p, s, r, n); } else { //r_eq_re_V_dot_V(&b, Mp, r, subset); //b = -a*b/oldrsq; //b *= rsq / oldrsq; //b = rsq; //v_eq_r_times_v_plus_v(p, b, p, r, n); //v_teq_r(p, b, n); //v_peq_v(p, r, n); QLA_Real s = 1.0 / rsq; v_peq_r_times_v(p, s, r, n); } linop(Mp, p, args); iteration++; total_iterations++; pkp = re_v_dot_v(p, Mp, n); //a = rsq / pkp; //v_peq_r_times_v(out, a, p, n); //v_meq_r_times_v(r, a, Mp, n); //QLA_Real s = a / b; a = 1.0 / pkp; v_peq_r_times_v(out, a, p, n); v_meq_r_times_v(r, a, Mp, n); oldrsq = rsq; rsq = norm2_v(r, n); /* compute FNAL norm if requested */ if(res_arg->relmin > 0) relnorm2 = relnorm2_v(r, out, n); VERB(HI, "CG: iter %i rsq = %g rel = %g\n", total_iterations, rsq, relnorm2); } VERB(LOW, "CG: done: iter %i rsq = %g rel = %g\n", total_iterations, rsq, relnorm2); free(r); free(p); free(Mp); res_arg->final_rsq = rsq/insq; res_arg->final_rel = relnorm2; res_arg->final_iter = total_iterations; res_arg->final_restart = nrestart; return QOP_SUCCESS; } /* milti-shift CG */ QOP_status_t QOPPCV(invert_cgms_blas)(linop_blas_t linop, void *args, QOP_invert_arg_t *inv_arg, QOP_resid_arg_t **res_arg, QLA_Real *shifts, int nshifts, QLA_Complex **out, QLA_Complex *in, int n) { double a[nshifts], b[nshifts], s[nshifts]; double bo[nshifts], z[nshifts], zo[nshifts], zn[nshifts]; double rsq, oldrsq, pkp, relnorm2; double insq; double rsqstop; QLA_Real t; int iteration=0, i, imin, imax; QLA_Complex *r, *Mp, *pm[nshifts]; imin = 0; for(i=1; ishifts[imax]) imax = i; r = (QLA_Complex *) malloc(n*sizeof(QLA_Complex)); Mp = (QLA_Complex *) malloc(n*sizeof(QLA_Complex)); for(i=0; irsqmin * insq; VERB(LOW, "CGMS: rsqstop = %g\n", rsqstop); rsq = insq; relnorm2 = 0; //printf("start %g\n", rsq); while(1) { oldrsq = rsq; linop(Mp, pm[imin], args); //if(shifts[imin]!=0.0) V_peq_r_times_V(Mp, shifts+imin, p, subset); iteration++; pkp = s[imin]*s[imin] * re_v_dot_v(pm[imin], Mp, n); b[imin] = rsq / pkp; zn[imin] = 1; for(i=0; irelmin > 0){ t = b[imax] * s[imax]; v_meq_r_times_v(r, t, Mp, n); relnorm2 = relnorm2_v(r, out[imax], n); } VERB(HI, "CGMS: iter %i rsq = %g rel = %g\n", iteration, rsq, relnorm2); if( (iteration%inv_arg->restart==0) || (iteration>=inv_arg->max_iter) || ((rsqstop <= 0 || rsqrelmin <= 0 || relnorm2relmin)) ) { /* only way out */ break; } a[imin] = rsq / oldrsq; for(i=0; ifinal_rsq = rsq/insq; res_arg[i]->final_rel = relnorm2; res_arg[i]->final_iter = iteration; res_arg[i]->final_restart = 0; } VERB(LOW, "CGMS: done: iter %i rsq = %g\n", iteration, rsq); return QOP_SUCCESS; } /***************************************************************************/ typedef struct { Vector *in, *out; QDP_Subset subset; QOPPCV(linop_t) *linop; int _n; } linop_args_t; static void linop_blas(QLA_Complex *out, QLA_Complex *in, void *args) { linop_args_t *a = (linop_args_t *)args; int _n = a->_n; insert_packed_V(a->in, in, a->subset); //insert_packed_V(a->out, out, a->subset); a->linop(a->out, a->in, a->subset); extract_packed_V(out, a->out, a->subset); //int n = QDP_subset_len(a->subset)*csize_V; //v_eq_v(out, in, n); } /* regular CG */ QOP_status_t QOPPCV(invert_cg)(QOPPCV(linop_t) *linop, QOP_invert_arg_t *inv_arg, QOP_resid_arg_t *res_arg, Vector *out, Vector *in, Vector *p, QDP_Subset subset vIndexDef) { QLA_Complex *bout, *bin; linop_args_t args; int n = QDP_subset_len(subset)*csize_V; QOP_status_t st; args.linop = linop; //create_V(args.in); args.in = p; create_V(args.out); args.subset = subset; args._n = _N; bin = (QLA_Complex *) malloc(n*sizeof(QLA_Complex)); bout = (QLA_Complex *) malloc(n*sizeof(QLA_Complex)); extract_packed_V(bin, in, subset); extract_packed_V(bout, out, subset); //V_eq_V(p, out, subset); //linop(args.out, p, subset); //V_meq_V(args.out, in, subset); //QLA_Real rsq; //r_eq_norm2_V(&rsq, args.out, subset); //VERB(LOW, "CG: (re)start: iter 0 rsq = %g\n", rsq); //linop_blas(bout, bout, &args); //v_meq_v(bout, bin, n); //VERB(LOW, "CG: (re)start: iter 0 rsq = %g\n", norm2_v(bout, n)); st = QOPPCV(invert_cg_blas)(linop_blas, (void *)&args, inv_arg, res_arg, bout, bin, n); insert_packed_V(out, bout, subset); free(bin); free(bout); //destroy_V(args.in); destroy_V(args.out); return st; } /* milti-shift CG */ QOP_status_t QOPPCV(invert_cgms)(QOPPCV(linop_t) *linop, QOP_invert_arg_t *inv_arg, QOP_resid_arg_t **res_arg, QLA_Real *shifts, int nshifts, Vector **out, Vector *in, Vector *p, QDP_Subset subset vIndexDef) { QLA_Complex **bout, *bin; linop_args_t args; int n = QDP_subset_len(subset)*csize_V; QOP_status_t st; int i; args.linop = linop; args.in = p; create_V(args.out); args.subset = subset; bin = (QLA_Complex *) malloc(n*sizeof(QLA_Complex)); extract_packed_V((void *)bin, in, subset); bout = (QLA_Complex **) malloc(nshifts*sizeof(QLA_Complex *)); for(i=0; i