GPU speed up, matrix multiplication

조회 수: 22 (최근 30일)
yang li
yang li 2018년 4월 23일
답변: Joss Knight 2018년 4월 28일
I use a GPU (Tesla K80) to speed up matrix multiplication in MATLAB R2016a with CUDA 7.5. At first the procedure runs fast, about 0.0001 s per loop; after a certain number of iterations it slows down to about 0.04 s per loop. ############################ main.m clear; A = 100 * 100000; C = 100 * 100000; for i = 1:10000 tic; B = MatrixMul(A, C); toc; end ############## MatrixMul.cu
if true
#include <climits>

#include "mex.h"
#include "gpu/mxGPUArray.h"
void _global_ TimesTwo(double const * const A, double const * const C, double * const B, int const N, int const rowsA, int const rowsC, int const colsA, int const colsC) { int const i = blockDim.x * blockIdx.x + threadIdx.x; int j; if (i < rowsA * rowsC) { int co_x = i % rowsA; int co_y = i / rowsA; B[i] = 0; for (j = 0; j < colsA; j++) { B[i] += A[ rowsA * j + co_x] * C[ rowsC * j + co_y]; } } }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[]) { mxGPUArray const *A; mxGPUArray const *C; mxGPUArray *B;
double const *d_A; double const *d_C; double *d_B; int N;
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput"; char const * const errMsg = "Invalid input to MEX file."; int const threadsPerBlock = 256; int blocksPerGrid;
mxInitGPU(); A = mxGPUCreateFromMxArray(prhs[0]); C = mxGPUCreateFromMxArray(prhs[1]);
d_A = (double const *)(mxGPUGetDataReadOnly(A)); d_C = (double const *)(mxGPUGetDataReadOnly(C));
const mwSize *dimsA = mxGPUGetDimensions(A); const mwSize *dimsC = mxGPUGetDimensions(C);
size_t nrowsA = dimsA[0]; size_t ncolsA = dimsA[1]; size_t nrowsC = dimsC[0]; size_t ncolsC = dimsC[1]; mwSize dims[2] = {nrowsA, nrowsC};
B = mxGPUCreateGPUArray(2, dims, mxGPUGetClassID(A), mxGPUGetComplexity(A), MX_GPU_DO_NOT_INITIALIZE); d_B = (double *)(mxGPUGetData(B));
N = (int)(nrowsA * nrowsC); blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; TimesTwo<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_B, N, nrowsA, nrowsC, ncolsA, ncolsC);
plhs[0] = mxGPUCreateMxArrayOnGPU(B); mxGPUDestroyGPUArray(A); mxGPUDestroyGPUArray(B); mxGPUDestroyGPUArray(C); }
<<
>>
end

답변 (1개)

Joss Knight
Joss Knight 2018년 4월 28일
tic and toc are not giving the correct timings for your first set of iterations, because your kernels are launching asynchronously. You need to use gputimeit or add a call to wait(gpuDevice).
Also, your kernel is not efficient; you should be using cuBLAS to perform matrix multiplication.

카테고리

Help CenterFile Exchange에서 GPU CUDA and MEX Programming에 대해 자세히 알아보기

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!

Translated by