超並列プロセサ - GeForceアーキテクチャとCUDAプログラミング

そして、CUT_CHECK_ERROR関数を呼び、GPUでエラーが発生したかどうかをチェックする。続いて、cudaMemcpy関数を呼び、デバイスメモリからホストメモリに結果を転送する。

    // setup execution parameters
    dim3 threads(BLOCK_SIZE、 BLOCK_SIZE);
    dim3 grid(WC / threads.x, HC / threads.y);

    // execute the kernel
    matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

    // check if kernel execution generated and error
    CUT_CHECK_ERROR("Kernel execution failed");

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, mem_size_C,
                              cudaMemcpyDeviceToHost) );

そしてrunTestの最後の部分では、cutStopTimerでタイマーを止めて、経過時間をプリントし、cutDeleteTimerでタイマーを削除する。computeGold関数はGPUを使わずCPUで行列積を計算する関数で、cutCompareL2fe関数で両者の結果を比較して誤差が1E-6以下であればパス、そうでなければエラーを出力し、ホストとデバイスメモリを開放している。

    // stop and destroy timer
    CUT_SAFE_CALL(cutStopTimer(timer));
    printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    // compute reference solution
    float* reference = (float*) malloc(mem_size_C);
    computeGold(reference, h_A, h_B, HA, WA, WB);

    // check result
    CUTBoolean res = cutCompareL2fe(reference, h_C, size_C, 1e-6f);
    printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");
    if (res!=1) printDiff(reference, h_C, WC, HC);

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));
}

行列積を計算する本体であるmatrixMul_kernel.cuの最初の部分は次のようになっている。それぞれのスレッドはblockIdx.xとblockIdx.yでブロック内の位置、threadIdx.xとthreadIdx.yでグリッド内の自分の位置を知り、行列Cのどの要素の計算を行うかを判断する。

//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width

__global__ void
matrixMul( float* C, float* A, float* B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

そして、行列積の計算を行、列数がBLOCK_SIZEの大きさの部分行列を単位として行うので、行列Aの最初と最後の部分行列の位置であるaBeginとaEndを計算し、aStepをBLOCK_SIZEとする。更に、行列Bに対しても同様に、bBeginとbStepを計算する。

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

そして、それぞれのスレッドが、次の図のように、まず、行列Aと行列Bの0と書かれた部分行列の中の自分の分担する要素をシェアードメモリに入れて積を計算する。次に、aStepとbStepを加えて、1と書かれた部分行列A、Bの分担する要素をシェアードメモリに入れて積を求めという操作を最後のブロックまで繰り返す。これで結果の行列CのBLOCK_SIZE*bx+tx、BLOCK_SIZE*by+ty要素が求まる。