Numerical.gemv.C.SSE Intrinsics.0.aligned m128

From BenchIT-Wiki

Jump to: navigation, search

Contents

short description

A kernel that computes the product of a matrix and a vector and stores the result in another vector. Data type used: packed single precision. SSE and the corresponding intrinsic functions are required to compile and run this kernel.

requirements

A C compiler and an Intel Pentium III(R) compatible system. To compile with gcc, use the flag -msse.

parameters

kernel specific parameters

BENCHIT_SGEMV_C_SSE_ALIGN_MIN

the minimum problem size to measure

BENCHIT_SGEMV_C_SSE_ALIGN_INCREMENT

the increment of the problem size at each step

BENCHIT_SGEMV_C_SSE_ALIGN_MAX

the maximum problem size to measure

additional parameters

You can override every environment variable in this parameter file. Some important ones have been listed and commented out. For an explanation of these variables, please look at LOCALDEFS.

expected results

The results should show a curve that starts with high floating-point performance but drops once the data no longer fits in the cache.

detailed description

One measurement-routine is encapsulated in the file work.c:

void ssealignIJ_(int sizeVector,int sizeAusgabe,float alpha,float beta, float* x, float *A, float *y)
{
	int i,j,iXsize;
	// upper limit for j-loops
	int upperLimitJ=sizeAusgabe-sizeAusgabe%4;
	// upper limit for i-loops
	int upperLimitI=sizeVector-sizeVector%4;
	// xmm Register
	__m128 xmm_gamma,xmm_y,xmm_x,xmm_a0,xmm_a1,xmm_a2,xmm_a3;
	// load beta into all places of xmm_gamma
	xmm_gamma=_mm_load1_ps(&beta);
	// calculate y=beta*y parallel
	for (j=0;j<upperLimitJ;j=j+4)
	{
		xmm_y=_mm_load_ps(&y[j]);
		xmm_y=_mm_mul_ps(xmm_y,xmm_gamma);
		_mm_store_ps(&y[j],xmm_y);
	}
	// and maybe sequential (if the size of y is not a multiple of 4)
	for (j=upperLimitJ;j<sizeAusgabe;j++)
	{
		y[j]=beta*y[j];
	}
	//
	// now : x=x, A=A, y=beta*y
	//

	// load alpha into all places of xmm_gamma
	xmm_gamma=_mm_load1_ps(&alpha);

	// if sizeAusgabe is a multiple of 4, every A[m][4n] (m,n E N+) can be loaded aligned
	if (sizeAusgabe%4==0)
	{
		for (i=0;i<upperLimitI;i=i+4)
		{
			// temporary variable for i*sizeAusgabe
			iXsize=i*sizeAusgabe;
			// load x[i] and x[i+1] (the next 4 entries for vector x)
			xmm_a0=_mm_load_ps(&x[i]);
			// multiply them with gamma
			xmm_a0=_mm_mul_ps(xmm_a0,xmm_gamma);
			xmm_a3=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(3,3,3,3));
			xmm_a2=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(2,2,2,2));
			// write gamma*x[i+1] into both sides of xmm_a1
			xmm_a1=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(1,1,1,1));
			// write gamma*x[i] into both sides of xmm_a0
			xmm_a0=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(0,0,0,0));
			// do for the four next elements of A
			for (j=0;j<upperLimitJ;j=j+4)
			{
				// load destination vector y
				xmm_y=_mm_load_ps(&y[j]);
				// load next 4 elements of A [i][j] and a[i][j+1]
				xmm_x=_mm_load_ps(&A[iXsize+j]);
				// multiply them with gamma*x[i]
				xmm_x=_mm_mul_ps(xmm_x,xmm_a0);
				// add y[  j  ] = y[  j  ] + A[  i  ][  j  ] * gamma * x[  i  ]
				//     y[ j+1 ] = y[ j+1 ] + A[  i  ][ j+1 ] * gamma * x[  i  ]
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				// load A [i+1][j] and A[i+1][j+1]
				xmm_x=_mm_load_ps(&A[((i+1)*(sizeAusgabe))+j]);
				// see above
				xmm_x=_mm_mul_ps(xmm_x,xmm_a1);
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				xmm_x=_mm_load_ps(&A[((i+2)*(sizeAusgabe))+j]);
				xmm_x=_mm_mul_ps(xmm_x,xmm_a2);
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				xmm_x=_mm_load_ps(&A[((i+3)*(sizeAusgabe))+j]);
				xmm_x=_mm_mul_ps(xmm_x,xmm_a3);
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				// store
				// y[j]=y[j]+A[i][j]*gamma*x[i]+A[i+1]*gamma*x[i]
				// y[j+1] equivalent
				_mm_store_ps(&y[j],xmm_y);
			}
			//(upperLimit is the same as sizeAusgabe)
		}
		for (i=upperLimitI;i<sizeVector;i++)
		{
			// temporary variable for i*sizeAusgabe
			iXsize=i*sizeAusgabe;
			// load x[i]
			xmm_a0=_mm_load1_ps(&x[i]);
			// multiply it with gamma
			xmm_a0=_mm_mul_ps(xmm_a0,xmm_gamma);
			for (j=0;j<upperLimitJ;j=j+4)
			{
				// load destination vector y
				xmm_y=_mm_load_ps(&y[j]);
				// load next 4 elements of A [i][j] and a[i][j+1]
				xmm_x=_mm_load_ps(&A[iXsize+j]);
				// multiply them with gamma*x[i]
				xmm_x=_mm_mul_ps(xmm_x,xmm_a0);
				// add y[  j  ] = y[  j  ] + A[  i  ][  j  ] * gamma * x[  i  ]
				//     y[ j+1 ] = y[ j+1 ] + A[  i  ][ j+1 ] * gamma * x[  i  ]
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				// store
				// y[j]=y[j]+A[i][j]*gamma*x[i]
				// y[j+1] equivalent
				_mm_store_ps(&y[j],xmm_y);
			}
			
		}
	}
	// A isn't aligned
	else
	{
		for (i=0;i<upperLimitI;i=i+4)
		{
			// temporary variable for i*sizeAusgabe
			iXsize=i*sizeAusgabe;
			// load x[i] and x[i+1] (the next 4 entries for vector x)
			xmm_a0=_mm_load_ps(&x[i]);
			// multiply them with gamma
			xmm_a0=_mm_mul_ps(xmm_a0,xmm_gamma);
			xmm_a3=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(3,3,3,3));
			xmm_a2=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(2,2,2,2));
			// write gamma*x[i+1] into both sides of xmm_a1
			xmm_a1=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(1,1,1,1));
			// write gamma*x[i] into both sides of xmm_a0
			xmm_a0=_mm_shuffle_ps(xmm_a0,xmm_a0,_MM_SHUFFLE(0,0,0,0));
			// do for the two next elements of A
			for (j=0;j<upperLimitJ;j=j+4)
			{
				// load destination vector y
				xmm_y=_mm_load_ps(&y[j]);
				// load next 4 elements of A [i][j] and a[i][j+1]
				xmm_x=_mm_load_ps(&A[iXsize+j]);
				// multiply them with gamma*x[i]
				xmm_x=_mm_mul_ps(xmm_x,xmm_a0);
				// add y[  j  ] = y[  j  ] + A[  i  ][  j  ] * gamma * x[  i  ]
				//     y[ j+1 ] = y[ j+1 ] + A[  i  ][ j+1 ] * gamma * x[  i  ]
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				// load A [i+1][j] and A[i+1][j+1]
				xmm_x=_mm_loadu_ps(&A[((i+1)*(sizeAusgabe))+j]);
				// see above
				xmm_x=_mm_mul_ps(xmm_x,xmm_a1);
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				xmm_x=_mm_loadu_ps(&A[((i+2)*(sizeAusgabe))+j]);
				xmm_x=_mm_mul_ps(xmm_x,xmm_a2);
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				xmm_x=_mm_loadu_ps(&A[((i+3)*(sizeAusgabe))+j]);
				xmm_x=_mm_mul_ps(xmm_x,xmm_a3);
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				// store
				// y[j]=y[j]+A[i][j]*gamma*x[i]+A[i+1]*gamma*x[i]
				// y[j+1] equivalent
				_mm_store_ps(&y[j],xmm_y);
			}
			for (j=upperLimitJ;j<sizeAusgabe;j++)
			{
				y[j]=y[j]+alpha*(A[iXsize+j]*x[i]+A[((i+1)*sizeAusgabe)+j]*x[i+1]);
			}
		}
		for (i=upperLimitI;i<sizeVector;i++)
		{
			// temporary variable for i*sizeAusgabe
			iXsize=i*sizeAusgabe;
			// load x[i]
			xmm_a0=_mm_load1_ps(&x[i]);
			// multiply it with gamma
			xmm_a0=_mm_mul_ps(xmm_a0,xmm_gamma);
			for (j=0;j<upperLimitJ;j=j+4)
			{
				// load destination vector y
				xmm_y=_mm_load_ps(&y[j]);
				// load next 4 elements of A [i][j] and a[i][j+1]
				xmm_x=_mm_loadu_ps(&A[iXsize+j]);
				// multiply them with gamma*x[i]
				xmm_x=_mm_mul_ps(xmm_x,xmm_a0);
				// add y[  j  ] = y[  j  ] + A[  i  ][  j  ] * gamma * x[  i  ]
				//     y[ j+1 ] = y[ j+1 ] + A[  i  ][ j+1 ] * gamma * x[  i  ]
				xmm_y=_mm_add_ps(xmm_y,xmm_x);
				// store
				// y[j]=y[j]+A[i][j]*gamma*x[i]
				// y[j+1] equivalent
				_mm_store_ps(&y[j],xmm_y);
			}
			for (j=upperLimitJ;j<sizeAusgabe;j++)
			{
				y[j]=y[j]+alpha*(A[iXsize+j]*x[i]+A[((i+1)*sizeAusgabe)+j]*x[i+1]);
			}
		}
	}
}

The routine ssealignJI differs in the order of the loops i and j.

see also

Personal tools