Preface:
The goal of this article is to set up programs in which each thread is responsible for a single computation, as an aid to learning and understanding how threads are organized.
The running example is the addition of two 2D arrays.
How the data is stored in memory
The data is stored linearly, in row-major order.
For example, a 16*8 array occupies one contiguous block of 128 addresses in memory.
In the figure below, each small cell represents one row of the data.
To let the GPU play to its strengths, each thread should process different data; avoid having one thread process several elements, and avoid threads accessing memory in a disorganized way.
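As a minimal CPU-side sketch of this row-major layout (the array size and variable names are only for this illustration), the element at (row, col) lives at linear index row * nx + col:

#include <cassert>

int main()
{
    const int nx = 16;   // elements per row (columns)
    const int ny = 8;    // number of rows
    int data[ny * nx];   // 128 contiguous elements, row-major

    // Element (row, col) is stored at linear index row * nx + col.
    for (int row = 0; row < ny; ++row)
        for (int col = 0; col < nx; ++col)
            data[row * nx + col] = row * nx + col;

    assert(data[3 * nx + 5] == 53); // row 3, column 5 -> linear index 53
    return 0;
}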
Organizing the thread model
2D grid, 2D block
As the figure shows, each differently colored square is one thread block. Since the array size is 16*8 = 128, we first set the block dimensions to (4,4), from which the grid dimensions work out to (4,2).
Definitions:
gridsize (4,2)
blocksize (4,4)
The goal is a one-to-one correspondence between the threads and the layout of the array in memory.
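The grid dimensions come from rounding-up (ceiling) division, the same expression used in the launch configuration further down:

gridDim.x = (nx + block.x - 1) / block.x = (16 + 4 - 1) / 4 = 4
gridDim.y = (ny + block.y - 1) / block.y = (8 + 4 - 1) / 4 = 2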
Mapping between threads and the 2D matrix
ix = threadIdx.x + blockIdx.x * blockDim.x;
iy = threadIdx.y + blockIdx.y * blockDim.y;
The global linear index then follows, as shown in the figure below (mapping between threads and the 2D matrix):
idx = iy * gridDim.x * blockDim.x + ix;
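A concrete check (the thread coordinates below are just an example): with blocksize (4,4) and gridsize (4,2), gridDim.x * blockDim.x = 4 * 4 = 16, which here equals nx, so idx is simply the row-major index iy * nx + ix. For the thread with threadIdx = (1,2) in block blockIdx = (3,1):

ix = 1 + 3 * 4 = 13
iy = 2 + 1 * 4 = 6
idx = 6 * 16 + 13 = 109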
The code below implements the addition of two 2D arrays with a 2D grid of 2D blocks:
#include "cuda_runtime_api.h"
#include <device_launch_parameters.h>
#include <iostream>
static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
__global__ void addMatrix(int* A, int* B, int* C, const int nx, const int ny)
{
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y;
int idx = iy * gridDim.x * blockDim.x + ix;
if (ix < nx && iy < ny)
{
C[idx] = A[idx] + B[idx];
}
}
int main()
{
const int nx = 16;
const int ny = 8;
const int nxy = nx * ny;
size_t stBytesCount = nxy * sizeof(int);
int* ipHost_A = new int[nxy];
int* ipHost_B = new int[nxy];
int* ipHost_C = new int[nxy];
for (size_t i = 0; i < nxy; i++)
{
ipHost_A[i] = i;
ipHost_B[i] = i + 1;
}
memset(ipHost_C, 0, stBytesCount);
int* ipDevice_A, * ipDevice_B, * ipDevice_C;
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_A, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_B, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_C, stBytesCount));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice));
dim3 block(4,4);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
addMatrix <<<grid, block >>> (ipDevice_A, ipDevice_B, ipDevice_C, nx, ny);
CUDA_CHECK_RETURN(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < nxy; i++)
{
if (i % 4 == 0 && i)
std::cout << std::endl;
std::cout << ipHost_A[i] << " + " << ipHost_B[i] << " = " << ipHost_C[i] << "\t";
}
cudaFree(ipDevice_A);
cudaFree(ipDevice_B);
cudaFree(ipDevice_C);
delete []ipHost_A;
delete []ipHost_B;
delete []ipHost_C;
ipHost_A =nullptr;
ipHost_B =nullptr;
ipHost_C =nullptr;
return 0;
}
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
if (err == cudaSuccess)
return;
std::cerr << statement << " returned: " << cudaGetErrorName(err) << " \t : " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
exit(1);
}
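Each listing in this article can be built and run with a plain nvcc invocation (the file name add_matrix.cu is just a placeholder for wherever the code is saved):

nvcc -o add_matrix add_matrix.cu
./add_matrix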
2D grid, 1D block
As the figure shows, each differently colored square is one thread block. Since the array size is 16*8 = 128, we set the block dimensions to (4,1), from which the grid dimensions work out to (4,8).
Mapping between threads and the 2D matrix
The thread block defined here is one-dimensional, so blockDim.y = 1 and threadIdx.y is always 0.
ix = threadIdx.x + blockIdx.x * blockDim.x;
iy = threadIdx.y + blockIdx.y * blockDim.y;
The global linear index then follows, as shown in the figure below (mapping between threads and the 2D matrix):
idx = iy * gridDim.x * blockDim.x + ix;
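A concrete check (again, the thread coordinates are just an example): with blocksize (4,1) and gridsize (4,8), gridDim.x * blockDim.x = 4 * 4 = 16 = nx. For the thread with threadIdx = (3,0) in block blockIdx = (2,5):

ix = 3 + 2 * 4 = 11
iy = 0 + 5 * 1 = 5
idx = 5 * 16 + 11 = 91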
The code below implements the addition of two 2D arrays with a 2D grid of 1D blocks:
#include "cuda_runtime_api.h"
#include <device_launch_parameters.h>
#include <iostream>
static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
__global__ void kernel_addMatrix(int* A, int* B, int* C, const int nx, const int ny)
{
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y; // 因为block是一维的,所以threadIdx.y始终是0
int idx = iy * gridDim.x * blockDim.x + ix;
if (ix < nx && iy < ny)
{
C[idx] = A[idx] + B[idx];
}
}
int main()
{
const int nx = 16;
const int ny = 8;
const int nxy = nx * ny;
size_t stBytesCount = nxy * sizeof(int);
int* ipHost_A = new int[nxy];
int* ipHost_B = new int[nxy];
int* ipHost_C = new int[nxy];
for (size_t i = 0; i < nxy; i++)
{
ipHost_A[i] = i;
ipHost_B[i] = i + 1;
}
memset(ipHost_C, 0, stBytesCount);
int* ipDevice_A, * ipDevice_B, * ipDevice_C;
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_A, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_B, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_C, stBytesCount));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice));
dim3 block(4, 1);
dim3 grid((nx + block.x - 1) / block.x, ny);
kernel_addMatrix <<<grid, block >>> (ipDevice_A, ipDevice_B, ipDevice_C, nx, ny);
cudaThreadSynchronize();
CUDA_CHECK_RETURN(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < nxy; i++)
{
if (i % 4 == 0 && i)
std::cout << std::endl;
std::cout << ipHost_A[i] << " + " << ipHost_B[i] << " = " << ipHost_C[i] << "\t";
}
cudaFree(ipDevice_A);
cudaFree(ipDevice_B);
cudaFree(ipDevice_C);
delete[]ipHost_A;
delete[]ipHost_B;
delete[]ipHost_C;
ipHost_A = nullptr;
ipHost_B = nullptr;
ipHost_C = nullptr;
return 0;
}
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
if (err == cudaSuccess)
return;
std::cerr << statement << " returned: " << cudaGetErrorName(err) << " \t : " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
exit(1);
}
Sample output: each line of the printout shows four sums of the form A[i] + B[i] = C[i], starting with 0 + 1 = 1, 1 + 2 = 3, 2 + 3 = 5, 3 + 4 = 7.
1D grid, 1D block
In the previous examples the number of GPU threads equaled the number of array elements. When there are fewer threads than elements, each thread no longer performs a single operation but must process several elements, which means the kernel needs a loop.
For this article's example, the grid is defined as (4,1) and the block as (4,1), i.e. 16 threads in total. The work assigned to each thread is distributed as shown in the figure below:
In this example, each thread processes one column of the data.
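Concretely, take the thread with ix = 5 as an example: with offset = gridDim.x * blockDim.x = 4 * 4 = 16 = nx, that thread walks down its column and processes

idx = i * 16 + 5 for i = 0..7, i.e. 5, 21, 37, 53, 69, 85, 101, 117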
Mapping between threads and the 2D matrix
ix = threadIdx.x + blockIdx.x * blockDim.x;
iy = threadIdx.y + blockIdx.y * blockDim.y;
Here both the grid and the block are one-dimensional, so threadIdx.y and blockIdx.y are always 0.
The code is as follows:
#include "cuda_runtime_api.h"
#include <device_launch_parameters.h>
#include <iostream>
static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
__global__ void _addMatrix(int* A, int* B, int* C, const int nx, const int ny)
{
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = 0;
int offset = gridDim.x * blockDim.x;
if (ix < nx)
{
for (size_t i = 0; i < ny; i++)
{
int idx = i * offset + ix;
if (idx < nx*ny)
{
C[idx] = A[idx] + B[idx];
}
}
}
}
int main()
{
const int nx = 16;
const int ny = 8;
const int nxy = nx * ny;
size_t stBytesCount = nxy * sizeof(int);
int* ipHost_A = new int[nxy];
int* ipHost_B = new int[nxy];
int* ipHost_C = new int[nxy];
for (size_t i = 0; i < nxy; i++)
{
ipHost_A[i] = i;
ipHost_B[i] = i + 1;
}
memset(ipHost_C, 0, stBytesCount);
int* ipDevice_A, * ipDevice_B, * ipDevice_C;
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_A, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_B, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_C, stBytesCount));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice));
dim3 block(4, 1);
dim3 grid(4, 1);
_addMatrix << <grid, block >> > (ipDevice_A, ipDevice_B, ipDevice_C, nx, ny);
cudaThreadSynchronize();
CUDA_CHECK_RETURN(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < nxy; i++)
{
if (i % 4 == 0 && i)
std::cout << std::endl;
std::cout << ipHost_A[i] << " + " << ipHost_B[i] << " = " << ipHost_C[i] << "\t";
}
cudaFree(ipDevice_A);
cudaFree(ipDevice_B);
cudaFree(ipDevice_C);
delete[]ipHost_A;
delete[]ipHost_B;
delete[]ipHost_C;
ipHost_A = nullptr;
ipHost_B = nullptr;
ipHost_C = nullptr;
return 0;
}
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
if (err == cudaSuccess)
return;
std::cerr << statement << " returned: " << cudaGetErrorName(err) << " \t : " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
exit(1);
}