Numerical.gemv.C.SSE Intrinsics.0.aligned m128
From BenchIT-Wiki
Contents |
short description
A kernel which computes the product of a matrix and a vector and stores the result in another vector. Data type used: packed single-precision floating point. You need SSE support and the corresponding intrinsic functions to compile and run this kernel.
requirements
A C compiler and an Intel Pentium III(R)-compatible system (i.e. a CPU with SSE support). To compile with gcc, use the flag -msse.
parameters
kernel specific parameters
BENCHIT_SGEMV_C_SSE_ALIGN_MIN
the minimum problem size to measure
BENCHIT_SGEMV_C_SSE_ALIGN_INCREMENT
the amount by which the problem size is incremented at every step
BENCHIT_SGEMV_C_SSE_ALIGN_MAX
the maximum problem size to measure
additional parameters
You can overwrite every environment variable in this parameters file. Some important ones have been listed and commented out. For explanation of these variables please look at LOCALDEFS
expected results
The results should show a curve which starts with a high fp-performance, but falls when the data doesn't fit in the cache anymore.
detailed description
One measurement-routine is encapsulated in the file work.c:
void ssealignIJ_(int sizeVector,int sizeAusgabe,float alpha,float beta, float* x, float *A, float *y)
{
int i,j,iXsize;
// upper limit for j-loops
int upperLimitJ=sizeAusgabe-sizeAusgabe%4;
// upper limit for i-loops
int upperLimitI=sizeVector-sizeVector%4;
// xmm Register
__m128 xmm_gamma,xmm_y,xmm_x,xmm_a0,xmm_a1,xmm_a2,xmm_a3;
// load beta into all places of xmm_gamma
xmm_gamma=_mm_load1_ps(&beta);
// calculate y=beta*y parallel
for (j=0;j<upperLimitJ;j=j+4)
{
xmm_y=_mm_load_ps(&y[j]);
xmm_y=_mm_mul_ps(xmm_y,xmm_gamma);
_mm_store_ps(&y[j],xmm_y);
}
// and maybe sequential (if the size of y is not a multiple of 4)
for (j=upperLimitJ;j<sizeAusgabe;j++)
{
y[j]=beta*y[j];
}
//
// now : x=x, A=A, y=beta*y
//
// load alpha into all places of xmm_gamma
xmm_gamma=_mm_load1_ps(&alpha);
// if sizeAusgabe is a multiple of 4, every A[m][4n] (m,n E N+) can be loaded aligned
if (sizeAusgabe%4==0)
{
for (i=0;i<upperLimitI;i=i+4)
{
// temporary variable for i*sizeAusgabe
iXsize=i*sizeAusgabe;
// load x[i] and x[i+1] (the next 4 entries for vector x)
xmm_a0=_mm_load_ps(&x[i]);
// multiply them with gamma
xmm_a0=_mm_mul_ps(xmm_a0,xmm_gamma);
xmm_a3=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(3,3,3,3));
xmm_a2=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(2,2,2,2));
// write gamma*x[i+1] into both sides of xmm_a1
xmm_a1=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(1,1,1,1));
// write gamma*x[i] into both sides of xmm_a0
xmm_a0=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(0,0,0,0));
// do for the four next elements of A
for (j=0;j<upperLimitJ;j=j+4)
{
// load destination vector y
xmm_y=_mm_load_ps(&y[j]);
// load next 4 elements of A [i][j] and a[i][j+1]
xmm_x=_mm_load_ps(&A[iXsize+j]);
// multiply them with gamma*x[i]
xmm_x=_mm_mul_ps(xmm_x,xmm_a0);
// add y[ j ] = y[ j ] + A[ i ][ j ] * gamma * x[ i ]
// y[ j+1 ] = y[ j+1 ] + A[ i ][ j+1 ] * gamma * x[ i ]
xmm_y=_mm_add_ps(xmm_y,xmm_x);
// load A [i+1][j] and A[i+1][j+1]
xmm_x=_mm_load_ps(&A[((i+1)*(sizeAusgabe))+j]);
// see above
xmm_x=_mm_mul_ps(xmm_x,xmm_a1);
xmm_y=_mm_add_ps(xmm_y,xmm_x);
xmm_x=_mm_load_ps(&A[((i+2)*(sizeAusgabe))+j]);
xmm_x=_mm_mul_ps(xmm_x,xmm_a2);
xmm_y=_mm_add_ps(xmm_y,xmm_x);
xmm_x=_mm_load_ps(&A[((i+3)*(sizeAusgabe))+j]);
xmm_x=_mm_mul_ps(xmm_x,xmm_a3);
xmm_y=_mm_add_ps(xmm_y,xmm_x);
// store
// y[j]=y[j]+A[i][j]*gamma*x[i]+A[i+1]*gamma*x[i]
// y[j+1] equivalent
_mm_store_ps(&y[j],xmm_y);
}
//(upperLimit is the same as sizeAusgabe)
}
for (i=upperLimitI;i<sizeVector;i++)
{
// temporary variable for i*sizeAusgabe
iXsize=i*sizeAusgabe;
// load x[i]
xmm_a0=_mm_load1_ps(&x[i]);
// multiply it with gamma
xmm_a0=_mm_mul_ps(xmm_a0,xmm_gamma);
for (j=0;j<upperLimitJ;j=j+4)
{
// load destination vector y
xmm_y=_mm_load_ps(&y[j]);
// load next 4 elements of A [i][j] and a[i][j+1]
xmm_x=_mm_load_ps(&A[iXsize+j]);
// multiply them with gamma*x[i]
xmm_x=_mm_mul_ps(xmm_x,xmm_a0);
// add y[ j ] = y[ j ] + A[ i ][ j ] * gamma * x[ i ]
// y[ j+1 ] = y[ j+1 ] + A[ i ][ j+1 ] * gamma * x[ i ]
xmm_y=_mm_add_ps(xmm_y,xmm_x);
// store
// y[j]=y[j]+A[i][j]*gamma*x[i]
// y[j+1] equivalent
_mm_store_ps(&y[j],xmm_y);
}
}
}
// A isn't aligned
else
{
for (i=0;i<upperLimitI;i=i+4)
{
// temporary variable for i*sizeAusgabe
iXsize=i*sizeAusgabe;
// load x[i] and x[i+1] (the next 4 entries for vector x)
xmm_a0=_mm_load_ps(&x[i]);
// multiply them with gamma
xmm_a0=_mm_mul_ps(xmm_a0,xmm_gamma);
xmm_a3=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(3,3,3,3));
xmm_a2=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(2,2,2,2));
// write gamma*x[i+1] into both sides of xmm_a1
xmm_a1=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(1,1,1,1));
// write gamma*x[i] into both sides of xmm_a0
xmm_a0=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(0,0,0,0));
// do for the two next elements of A
for (j=0;j<upperLimitJ;j=j+4)
{
// load destination vector y
xmm_y=_mm_load_ps(&y[j]);
// load next 4 elements of A [i][j] and a[i][j+1]
xmm_x=_mm_load_ps(&A[iXsize+j]);
// multiply them with gamma*x[i]
xmm_x=_mm_mul_ps(xmm_x,xmm_a0);
// add y[ j ] = y[ j ] + A[ i ][ j ] * gamma * x[ i ]
// y[ j+1 ] = y[ j+1 ] + A[ i ][ j+1 ] * gamma * x[ i ]
xmm_y=_mm_add_ps(xmm_y,xmm_x);
// load A [i+1][j] and A[i+1][j+1]
xmm_x=_mm_loadu_ps(&A[((i+1)*(sizeAusgabe))+j]);
// see above
xmm_x=_mm_mul_ps(xmm_x,xmm_a1);
xmm_y=_mm_add_ps(xmm_y,xmm_x);
xmm_x=_mm_loadu_ps(&A[((i+2)*(sizeAusgabe))+j]);
xmm_x=_mm_mul_ps(xmm_x,xmm_a2);
xmm_y=_mm_add_ps(xmm_y,xmm_x);
xmm_x=_mm_loadu_ps(&A[((i+3)*(sizeAusgabe))+j]);
xmm_x=_mm_mul_ps(xmm_x,xmm_a3);
xmm_y=_mm_add_ps(xmm_y,xmm_x);
// store
// y[j]=y[j]+A[i][j]*gamma*x[i]+A[i+1]*gamma*x[i]
// y[j+1] equivalent
_mm_store_ps(&y[j],xmm_y);
}
for (j=upperLimitJ;j<sizeAusgabe;j++)
{
y[j]=y[j]+alpha*(A[iXsize+j]*x[i]+A[((i+1)*sizeAusgabe)+j]*x[i+1]);
}
}
for (i=upperLimitI;i<sizeVector;i++)
{
// temporary variable for i*sizeAusgabe
iXsize=i*sizeAusgabe;
// load x[i]
xmm_a0=_mm_load1_ps(&x[i]);
// multiply it with gamma
xmm_a0=_mm_mul_ps(xmm_a0,xmm_gamma);
for (j=0;j<upperLimitJ;j=j+4)
{
// load destination vector y
xmm_y=_mm_load_ps(&y[j]);
// load next 4 elements of A [i][j] and a[i][j+1]
xmm_x=_mm_loadu_ps(&A[iXsize+j]);
// multiply them with gamma*x[i]
xmm_x=_mm_mul_ps(xmm_x,xmm_a0);
// add y[ j ] = y[ j ] + A[ i ][ j ] * gamma * x[ i ]
// y[ j+1 ] = y[ j+1 ] + A[ i ][ j+1 ] * gamma * x[ i ]
xmm_y=_mm_add_ps(xmm_y,xmm_x);
// store
// y[j]=y[j]+A[i][j]*gamma*x[i]
// y[j+1] equivalent
_mm_store_ps(&y[j],xmm_y);
}
for (j=upperLimitJ;j<sizeAusgabe;j++)
{
y[j]=y[j]+alpha*(A[iXsize+j]*x[i]+A[((i+1)*sizeAusgabe)+j]*x[i+1]);
}
}
}
}
The routine ssealignJI differs in the order of the loops i and j.
