在這裡其實有一個還沒提到的概念, 被稱作shared memory, 這是CUDA架構下的一種記憶體架構, 可以提升計算速度, 部過這個概念要等到下一篇文章我才會寫, 因此下面的程式碼是沒有shared memory的版本!
此外, 這裡的陣列大小選取都是blocksize的倍數, 因為若不是blocksize的倍數, 記憶體在分配上會有技術問題要處理, 此處也先省略
#include <iostream> #include <cstdlib> #include <ctime>
#include <cuda_runtime.h>
#include "device_launch_parameters.h" #define BLOCK_SIZE 2 #define A_row 2 #define A_col 3 #define B_row A_col #define B_col 2 #define C_row A_row #define C_col B_col using namespace std; // 宣告Matrix class class Matrix{ public: int width; int height; float* elements; }; __global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) { float ans = 0; int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; for (int p = 0; p < A.width; ++p) ans += A.elements[row * A.width + p] * B.elements[p * B.width + col]; C.elements[row * C.width + col] = ans; } int main(void) { float A[A_row * BLOCK_SIZE][A_col * BLOCK_SIZE]; float B[B_row * BLOCK_SIZE][B_col * BLOCK_SIZE]; float C[A_row * BLOCK_SIZE][B_col * BLOCK_SIZE]={0}; //隨機分配值給矩陣 for(int i=0;i< A_row * BLOCK_SIZE;i++) { for(int j=0;j< A_col * BLOCK_SIZE;j++) { A[i][j]=rand(); } } for(int i=0;i< B_col * BLOCK_SIZE;i++) { for(int j=0;j< B_row * BLOCK_SIZE;j++) { B[i][j]=rand(); } } Matrix h_A,h_B,h_C; //把給定的矩陣轉為Matrix類別 h_A.height = A_row * BLOCK_SIZE; h_B.height = B_row * BLOCK_SIZE; h_C.height = C_row * BLOCK_SIZE; h_A.width = A_col * BLOCK_SIZE; h_B.width = B_col * BLOCK_SIZE; h_C.width = C_col * BLOCK_SIZE;
h_A.elements = (float*)malloc(A_row*A_col*BLOCK_SIZE*BLOCK_SIZE*sizeof(float));
h_B.elements = (float*)malloc(B_row*B_col*BLOCK_SIZE*BLOCK_SIZE*sizeof(float));
h_C.elements = (float*)malloc(C_row*C_col*BLOCK_SIZE*BLOCK_SIZE*sizeof(float));
for(int i=0;i< A_row * BLOCK_SIZE;i++) { for(int j=0;j< A_col * BLOCK_SIZE;j++) { *(h_A.elements + i*h_A.width + j) = A[i][j]; } } for(int i=0;i< B_row * BLOCK_SIZE;i++) { for(int j=0;j< B_col * BLOCK_SIZE;j++) { *(h_B.elements + i*h_B.width + j) = B[i][j]; } } // 把h_A 和 h_B 複製給 device Matrix d_A; d_A.width = h_A.width; d_A.height = h_A.height; size_t size = h_A.width * h_A.height * sizeof(float); cudaMalloc(&d_A.elements, size); cudaMemcpy(d_A.elements, h_A.elements, size, cudaMemcpyHostToDevice); Matrix d_B; d_B.width = h_B.width; d_B.height = h_B.height; size = h_B.width * h_B.height * sizeof(float); cudaMalloc(&d_B.elements, size); cudaMemcpy(d_B.elements, h_B.elements, size, cudaMemcpyHostToDevice); // 在 device memory 裡創出 d_C 的記憶體空間 Matrix d_C; d_C.width = h_C.width; d_C.height = h_C.height; size = h_C.width * h_C.height * sizeof(float); cudaMalloc(&d_C.elements, size); // 實際計算 dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); dim3 dimGrid(h_B.width / dimBlock.x, h_A.height / dimBlock.y); MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C); // 把算好的結果從d_C 複製回 h_C cudaMemcpy(h_C.elements, d_C.elements, size, cudaMemcpyDeviceToHost); // 驗算 int ok=1; for(int i=0;ok && i< A_row * BLOCK_SIZE ;i++) { for(int j=0;ok && j< B_col * BLOCK_SIZE;j++) { for(int k=0;k< A_col * BLOCK_SIZE;k++) { C[i][j]+=A[i][k]*B[k][j]; } if(C[i][j]!=*(h_C.elements+i*h_C.width+j)) { cout<<"fail"; ok=0; break; } } } if(ok==1) { cout<<"success"<<endl; } //釋放記憶體空間 cudaFree(d_A.elements); cudaFree(d_B.elements); cudaFree(d_C.elements);
system("pause"); }
reference:
1. Nvidia developer zone