= 0 ; i < _size; ++i)
{
if (fabs(_A[i] - _B[i]) > 1.0e-3)
{
printf("%f \t %f\n",_A[i],_B[i]);
return false;
}
}
return true;
}
/*
 * Benchmark driver: multiplies two randomly filled matrices once on the CPU
 * (MatrixMulCPU) and once on the GPU (matrixMulGPU), prints the time each
 * took, and compares the two results with checkError.
 *
 * WA/HA, WB/HB and WC/HC are taken from outside this function; presumably
 * they are the width/height of A, B and C -- TODO confirm at their definition.
 *
 * Returns 0 once the benchmark has run (whether or not the results agree),
 * and 1 if InitCUDA() reports failure.
 */
int main(int argc, char* argv[])
{
    /* Fixed seed; presumably consumed by randomInit so runs are repeatable. */
    srand(13);

    if (!InitCUDA()) {
        /* Nothing was computed: report failure to the caller instead of 0. */
        return 1;
    }

    /* Input matrices. */
    float* A = myNewOnCPU(WA * HA);
    float* B = myNewOnCPU(WB * HB);
    randomInit(A, WA * HA);
    randomInit(B, WB * HB);

    /* C receives the GPU result, C2 the CPU reference; both start zeroed. */
    float* C = myNewOnCPU(WC * HC);
    memset(C, 0, sizeof(float) * WC * HC);
    float* C2 = myNewOnCPU(WC * HC);
    memset(C2, 0, sizeof(float) * WC * HC);

    /* CPU reference run, timed with the millisecond tick counter. */
    unsigned int tick1 = GetTickCount();
    MatrixMulCPU(C2, A, B, WA, HA, WB);
    /* Keep the difference unsigned and print it with %u: the tick count is
       an unsigned value, so %d would mismatch the argument type. */
    unsigned int cpuElapsedMs = GetTickCount() - tick1;
    printf("CPU use Time : %ums\n", cpuElapsedMs);

    /* GPU run, timed with a cutil timer around the whole matrixMulGPU call. */
    unsigned int timer = 0;
    cutilCheckError(cutCreateTimer(&timer));
    cutilCheckError(cutStartTimer(timer));
    matrixMulGPU(C, A, B, WA, HA, WB);
    cutilCheckError(cutStopTimer(timer));
    printf("GPU use time: %f (ms) \n", cutGetTimerValue(timer));
    cutilCheckError(cutDeleteTimer(timer));

    /* Compare the GPU result against the CPU reference. */
    if (checkError(C, C2, WC * HC))
    {
        printf("Accept\n");
    }
    else
    {
        printf("Worng Answer\n");
    }

    myDeleteOnCPU(A);
    myDeleteOnCPU(B);
    myDeleteOnCPU(C);
    myDeleteOnCPU(C2);
    return 0;
}
运算结果如下:
版本0:

版本1:

可以看出,GPU 的并行计算性能比 CPU 好很多,而且版本1的性能优于版本0。