invcg2_timing_hacks_3.h

Go to the documentation of this file.
00001 // -*- C++ -*-
00002 // $Id: invcg2_timing_hacks_3.h,v 3.0 2006/04/03 04:59:14 edwards Exp $
00003 /*! \file
00004  *  \brief Conjugate-Gradient algorithm for a generic Linear Operator
00005  */
00006 
00007 #ifndef __invcg2_timing_hacks_2_h__
00008 #define __invcg2_timing_hacks_2_h__
00009 
00010 #include "chromabase.h"
00011 #include "linearop.h"
00012 #include "actions/ferm/linop/dslash_w.h"
00013 
00014 //! Highly optimised Conjugate-Gradient (CGNE) algorithm for an Even-Odd Preconditioned Wilson linear operator
00015                                                                                 
00016 // Perversly theser are the types used in our axpys.
00017 typedef OLattice< PSpinVector< PColorVector< RComplex< PScalar<REAL> >, Nc>, Ns> > LFerm;
00018                                                                                 
00019 typedef OScalar< PScalar < PScalar < RScalar< PScalar < REAL > > > > > LScal;
00020 typedef OScalar< PScalar < PScalar < RScalar< PScalar < DOUBLE > > > > > LDble;                                                                                
// Get at the REAL embedded in an LScal: peel off all five wrapper levels
// with successive elem() calls.  The argument is parenthesised so that any
// expression (e.g. a dereferenced pointer) can be passed safely.
#define AT_REAL(a)  ((a).elem().elem().elem().elem().elem())
00023                                                                                 
// Get the first element of a vector over a subset: the address of the real
// part of component (0,0) at index s.start() of a.  Both arguments are
// parenthesised so that arbitrary expressions can be passed safely.
#define FIRST_ELEM(a,s) (&((a).elem((s).start()).elem(0).elem(0).real().elem()))
00026                                                                                
00027 void InvCG2EvenOddPrecWilsLinOpTHack(const WilsonDslash &D,
00028                                 const LFerm& chi,
00029                                 LFerm& psi,
00030                                 const LScal& mass,
00031                                 const LScal& RsdCG,
00032                                 int MaxCG,
00033                                 int& n_count);
00034  
00035 
// GNUC vector type: four packed single-precision floats (16 bytes), aligned
// on a 16-byte boundary.  Declared with vector_size, which is the supported
// spelling; the older mode(V4SF) form is deprecated by the compiler.
typedef float v4sf __attribute__((vector_size(16), aligned(16)));
00038 
00039 // vaxpy3 and norm put together
00040 inline
00041 void vaxpy3_norm(REAL *Out,REAL *scalep,REAL *InScale, REAL *Add,int n_3vec,
00042                  REAL* dsum)
00043 {
00044 #ifdef DEBUG_BLAS
00045   QDPIO::cout << "SSE_TEST: vaxpy3_norm" << endl;
00046 #endif
00047 
00048   int n_loops = n_3vec;
00049 
00050   v4sf vscalep = __builtin_ia32_loadss(scalep);
00051   asm("shufps\t$0,%0,%0" : "+x" (vscalep));
00052 
00053   REAL fzero = 0.0;
00054   register v4sf vsum = __builtin_ia32_loadss(&fzero);
00055   asm("shufps\t$0,%0,%0" : "+x" (vsum));
00056 
00057   for (; n_loops-- > 0; )
00058   {
00059     register v4sf vtmp;
00060 
00061     vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 0)), __builtin_ia32_loadaps(Add+ 0));
00062     vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00063     __builtin_ia32_storeaps(Out+ 0, vtmp);
00064 
00065     vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 4)), __builtin_ia32_loadaps(Add+ 4));
00066     vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00067     __builtin_ia32_storeaps(Out+ 4, vtmp);
00068 
00069     vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 8)), __builtin_ia32_loadaps(Add+ 8));
00070     vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00071     __builtin_ia32_storeaps(Out+ 8, vtmp);
00072 
00073     vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+12)), __builtin_ia32_loadaps(Add+12));
00074     vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00075     __builtin_ia32_storeaps(Out+12, vtmp);
00076 
00077     vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+16)), __builtin_ia32_loadaps(Add+16));
00078     vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00079     __builtin_ia32_storeaps(Out+16, vtmp);
00080 
00081     vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+20)), __builtin_ia32_loadaps(Add+20));
00082     vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00083     __builtin_ia32_storeaps(Out+20, vtmp);
00084 
00085     Out += 24; InScale += 24; Add += 24;
00086   }
00087 
00088   REAL fsum[4];
00089   __builtin_ia32_storeaps(fsum, vsum);
00090   *dsum = (REAL)(fsum[0] + fsum[1] + fsum[2] + fsum[3]);
00091 }
00092 
00093 
00094 
00095 #endif

Generated on Fri Mar 19 04:33:27 2010 for CHROMA by  doxygen 1.4.7