00001
00002
00003
00004
00005
00006
00007 #ifndef __invcg2_timing_hacks_2_h__
00008 #define __invcg2_timing_hacks_2_h__
00009
00010 #include "chromabase.h"
00011 #include "linearop.h"
00012 #include "actions/ferm/linop/dslash_w.h"
00013
00014
00015
00016
00017 typedef OLattice< PSpinVector< PColorVector< RComplex< PScalar<REAL> >, Nc>, Ns> > LFerm;
00018
00019 typedef OScalar< PScalar < PScalar < RScalar< PScalar < REAL > > > > > LScal;
00020 typedef OScalar< PScalar < PScalar < RScalar< PScalar < DOUBLE > > > > > LDble;
00021
// AT_REAL(a): applies five chained elem() calls to `a`, i.e. one per wrapper
// layer of the LScal/LDble typedefs in this header, yielding the innermost
// value. The argument is parenthesized so that expressions such as `*ptr`
// expand correctly.
#define AT_REAL(a) ((a).elem().elem().elem().elem().elem())

// FIRST_ELEM(a,s): address of the element reached from `a` through
// elem(s.start()) -> elem(0) -> elem(0) -> real() -> elem().
// NOTE(review): presumably `a` is an LFerm and `s` a subset, giving a pointer
// to the real part of the first spin/colour component at the subset's first
// site -- confirm against callers.
// Both arguments are parenthesized for safe expansion of arbitrary expressions.
#define FIRST_ELEM(a,s) (&((a).elem((s).start()).elem(0).elem(0).real().elem()))
00026
00027 void InvCG2EvenOddPrecWilsLinOpTHack(const WilsonDslash &D,
00028 const LFerm& chi,
00029 LFerm& psi,
00030 const LScal& mass,
00031 const LScal& RsdCG,
00032 int MaxCG,
00033 int& n_count);
00034
00035
00036
// Vector of four packed floats (16 bytes), 16-byte aligned.
// Declared with vector_size rather than the deprecated mode(V4SF) spelling;
// both name the same 4 x float vector type.
typedef float v4sf __attribute__((vector_size(16), aligned(16)));
00038
00039
00040 inline
00041 void vaxpy3_norm(REAL *Out,REAL *scalep,REAL *InScale, REAL *Add,int n_3vec,
00042 REAL* dsum)
00043 {
00044 #ifdef DEBUG_BLAS
00045 QDPIO::cout << "SSE_TEST: vaxpy3_norm" << endl;
00046 #endif
00047
00048 int n_loops = n_3vec;
00049
00050 v4sf vscalep = __builtin_ia32_loadss(scalep);
00051 asm("shufps\t$0,%0,%0" : "+x" (vscalep));
00052
00053 REAL fzero = 0.0;
00054 register v4sf vsum = __builtin_ia32_loadss(&fzero);
00055 asm("shufps\t$0,%0,%0" : "+x" (vsum));
00056
00057 for (; n_loops-- > 0; )
00058 {
00059 register v4sf vtmp;
00060
00061 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 0)), __builtin_ia32_loadaps(Add+ 0));
00062 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00063 __builtin_ia32_storeaps(Out+ 0, vtmp);
00064
00065 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 4)), __builtin_ia32_loadaps(Add+ 4));
00066 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00067 __builtin_ia32_storeaps(Out+ 4, vtmp);
00068
00069 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 8)), __builtin_ia32_loadaps(Add+ 8));
00070 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00071 __builtin_ia32_storeaps(Out+ 8, vtmp);
00072
00073 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+12)), __builtin_ia32_loadaps(Add+12));
00074 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00075 __builtin_ia32_storeaps(Out+12, vtmp);
00076
00077 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+16)), __builtin_ia32_loadaps(Add+16));
00078 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00079 __builtin_ia32_storeaps(Out+16, vtmp);
00080
00081 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+20)), __builtin_ia32_loadaps(Add+20));
00082 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
00083 __builtin_ia32_storeaps(Out+20, vtmp);
00084
00085 Out += 24; InScale += 24; Add += 24;
00086 }
00087
00088 REAL fsum[4];
00089 __builtin_ia32_storeaps(fsum, vsum);
00090 *dsum = (REAL)(fsum[0] + fsum[1] + fsum[2] + fsum[3]);
00091 }
00092
00093
00094
00095 #endif