Preface:
The goal of this article is to set up programs in which each thread is responsible for a single computation, as an aid to learning and understanding how threads are organized.
The running example is the addition of two 2D arrays.
How the data is stored in memory
The data is stored linearly, in row-major order.
For example, a 16*8 array occupies one contiguous block of 128 addresses in memory.
In the figure below, each small cell represents one row of the data.
To let the GPU play to its strengths, each thread should process different data; avoid having one thread process several elements, and avoid threads accessing memory in a disorganized way.
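As a minimal CPU-side sketch of this row-major layout (the array size and variable names are only for this illustration), the element at (row, col) lives at linear index row * nx + col:

#include <cassert>

int main()
{
    const int nx = 16;   // elements per row (columns)
    const int ny = 8;    // number of rows
    int data[ny * nx];   // 128 contiguous elements, row-major

    // Element (row, col) is stored at linear index row * nx + col.
    for (int row = 0; row < ny; ++row)
        for (int col = 0; col < nx; ++col)
            data[row * nx + col] = row * nx + col;

    assert(data[3 * nx + 5] == 53); // row 3, column 5 -> linear index 53
    return 0;
}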
Organizing the thread model
2D grid, 2D block
As the figure shows, each differently colored square is one thread block. Since the array size is 16*8 = 128, we first set the block dimensions to (4,4), from which the grid dimensions work out to (4,2).
Definitions:
gridsize (4,2)
blocksize (4,4)
The goal is a one-to-one correspondence between the threads and the layout of the array in memory.
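The grid dimensions come from rounding-up (ceiling) division, the same expression used in the launch configuration further down:

gridDim.x = (nx + block.x - 1) / block.x = (16 + 4 - 1) / 4 = 4
gridDim.y = (ny + block.y - 1) / block.y = (8 + 4 - 1) / 4 = 2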
Mapping between threads and the 2D matrix
ix = threadIdx.x + blockIdx.x * blockDim.x;
iy = threadIdx.y + blockIdx.y * blockDim.y;
The global linear index then follows, as shown in the figure below (mapping between threads and the 2D matrix):
idx = iy * gridDim.x * blockDim.x + ix;
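A concrete check (the thread coordinates below are just an example): with blocksize (4,4) and gridsize (4,2), gridDim.x * blockDim.x = 4 * 4 = 16, which here equals nx, so idx is simply the row-major index iy * nx + ix. For the thread with threadIdx = (1,2) in block blockIdx = (3,1):

ix = 1 + 3 * 4 = 13
iy = 2 + 1 * 4 = 6
idx = 6 * 16 + 13 = 109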
The code below implements the addition of two 2D arrays with a 2D grid of 2D blocks:
#include "cuda_runtime_api.h"
#include <device_launch_parameters.h>
#include <iostream>
static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
__global__ void addMatrix(int* A, int* B, int* C, const int nx, const int ny)
{
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y;
int idx = iy * gridDim.x * blockDim.x + ix;
if (ix < nx && iy < ny)
{
C[idx] = A[idx] + B[idx];
}
}
int main()
{
const int nx = 16;
const int ny = 8;
const int nxy = nx * ny;
size_t stBytesCount = nxy * sizeof(int);
int* ipHost_A = new int[nxy];
int* ipHost_B = new int[nxy];
int* ipHost_C = new int[nxy];
for (size_t i = 0; i < nxy; i++)
{
ipHost_A[i] = i;
ipHost_B[i] = i + 1;
}
memset(ipHost_C, 0, stBytesCount);
int* ipDevice_A, * ipDevice_B, * ipDevice_C;
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_A, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_B, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_C, stBytesCount));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice));
dim3 block(4,4);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
addMatrix <<<grid, block >>> (ipDevice_A, ipDevice_B, ipDevice_C, nx, ny);
CUDA_CHECK_RETURN(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < nxy; i++)
{
if (i % 4 == 0 && i)
std::cout << std::endl;
std::cout << ipHost_A[i] << " + " << ipHost_B[i] << " = " << ipHost_C[i] << "\t";
}
cudaFree(ipDevice_A);
cudaFree(ipDevice_B);
cudaFree(ipDevice_C);
delete []ipHost_A;
delete []ipHost_B;
delete []ipHost_C;
ipHost_A =nullptr;
ipHost_B =nullptr;
ipHost_C =nullptr;
return 0;
}
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
if (err == cudaSuccess)
return;
std::cerr << statement << " returned: " << cudaGetErrorName(err) << " \t : " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
exit(1);
}
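Each listing in this article can be built and run with a plain nvcc invocation (the file name add_matrix.cu is just a placeholder for wherever the code is saved):

nvcc -o add_matrix add_matrix.cu
./add_matrix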
2D grid, 1D block
As the figure shows, each differently colored square is one thread block. Since the array size is 16*8 = 128, we set the block dimensions to (4,1), from which the grid dimensions work out to (4,8).
Mapping between threads and the 2D matrix
The thread block defined here is one-dimensional, so blockDim.y = 1 and threadIdx.y is always 0.
ix = threadIdx.x + blockIdx.x * blockDim.x;
iy = threadIdx.y + blockIdx.y * blockDim.y;
The global linear index then follows, as shown in the figure below (mapping between threads and the 2D matrix):
idx = iy * gridDim.x * blockDim.x + ix;
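A concrete check (again, the thread coordinates are just an example): with blocksize (4,1) and gridsize (4,8), gridDim.x * blockDim.x = 4 * 4 = 16 = nx. For the thread with threadIdx = (3,0) in block blockIdx = (2,5):

ix = 3 + 2 * 4 = 11
iy = 0 + 5 * 1 = 5
idx = 5 * 16 + 11 = 91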
The code below implements the addition of two 2D arrays with a 2D grid of 1D blocks:
#include "cuda_runtime_api.h"
#include <device_launch_parameters.h>
#include <iostream>
static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
__global__ void kernel_addMatrix(int* A, int* B, int* C, const int nx, const int ny)
{
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y; // 因为block是一维的,所以threadIdx.y始终是0
int idx = iy * gridDim.x * blockDim.x + ix;
if (ix < nx && iy < ny)
{
C[idx] = A[idx] + B[idx];
}
}
int main()
{
const int nx = 16;
const int ny = 8;
const int nxy = nx * ny;
size_t stBytesCount = nxy * sizeof(int);
int* ipHost_A = new int[nxy];
int* ipHost_B = new int[nxy];
int* ipHost_C = new int[nxy];
for (size_t i = 0; i < nxy; i++)
{
ipHost_A[i] = i;
ipHost_B[i] = i + 1;
}
memset(ipHost_C, 0, stBytesCount);
int* ipDevice_A, * ipDevice_B, * ipDevice_C;
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_A, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_B, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_C, stBytesCount));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice));
dim3 block(4, 1);
dim3 grid((nx + block.x - 1) / block.x, ny);
kernel_addMatrix <<<grid, block >>> (ipDevice_A, ipDevice_B, ipDevice_C, nx, ny);
cudaThreadSynchronize();
CUDA_CHECK_RETURN(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < nxy; i++)
{
if (i % 4 == 0 && i)
std::cout << std::endl;
std::cout << ipHost_A[i] << " + " << ipHost_B[i] << " = " << ipHost_C[i] << "\t";
}
cudaFree(ipDevice_A);
cudaFree(ipDevice_B);
cudaFree(ipDevice_C);
delete[]ipHost_A;
delete[]ipHost_B;
delete[]ipHost_C;
ipHost_A = nullptr;
ipHost_B = nullptr;
ipHost_C = nullptr;
return 0;
}
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
if (err == cudaSuccess)
return;
std::cerr << statement << " returned: " << cudaGetErrorName(err) << " \t : " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
exit(1);
}
Sample output: each line of the printout shows four sums of the form A[i] + B[i] = C[i], starting with 0 + 1 = 1, 1 + 2 = 3, 2 + 3 = 5, 3 + 4 = 7.
1D grid, 1D block
In the previous examples the number of GPU threads equaled the number of array elements. When there are fewer threads than elements, each thread no longer performs a single operation but must process several elements, which means the kernel needs a loop.
For this article's example, the grid is defined as (4,1) and the block as (4,1), i.e. 16 threads in total. The work assigned to each thread is distributed as shown in the figure below:
In this example, each thread processes one column of the data.
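Concretely, take the thread with ix = 5 as an example: with offset = gridDim.x * blockDim.x = 4 * 4 = 16 = nx, that thread walks down its column and processes

idx = i * 16 + 5 for i = 0..7, i.e. 5, 21, 37, 53, 69, 85, 101, 117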
Mapping between threads and the 2D matrix
ix = threadIdx.x + blockIdx.x * blockDim.x;
iy = threadIdx.y + blockIdx.y * blockDim.y;
Here both the grid and the block are one-dimensional, so threadIdx.y and blockIdx.y are always 0.
The code is as follows:
#include "cuda_runtime_api.h"
#include <device_launch_parameters.h>
#include <iostream>
static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
__global__ void _addMatrix(int* A, int* B, int* C, const int nx, const int ny)
{
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = 0;
int offset = gridDim.x * blockDim.x;
if (ix < nx)
{
for (size_t i = 0; i < ny; i++)
{
int idx = i * offset + ix;
if (idx < nx*ny)
{
C[idx] = A[idx] + B[idx];
}
}
}
}
int main()
{
const int nx = 16;
const int ny = 8;
const int nxy = nx * ny;
size_t stBytesCount = nxy * sizeof(int);
int* ipHost_A = new int[nxy];
int* ipHost_B = new int[nxy];
int* ipHost_C = new int[nxy];
for (size_t i = 0; i < nxy; i++)
{
ipHost_A[i] = i;
ipHost_B[i] = i + 1;
}
memset(ipHost_C, 0, stBytesCount);
int* ipDevice_A, * ipDevice_B, * ipDevice_C;
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_A, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_B, stBytesCount));
CUDA_CHECK_RETURN(cudaMalloc((void**)&ipDevice_C, stBytesCount));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice));
dim3 block(4, 1);
dim3 grid(4, 1);
_addMatrix << <grid, block >> > (ipDevice_A, ipDevice_B, ipDevice_C, nx, ny);
cudaThreadSynchronize();
CUDA_CHECK_RETURN(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < nxy; i++)
{
if (i % 4 == 0 && i)
std::cout << std::endl;
std::cout << ipHost_A[i] << " + " << ipHost_B[i] << " = " << ipHost_C[i] << "\t";
}
cudaFree(ipDevice_A);
cudaFree(ipDevice_B);
cudaFree(ipDevice_C);
delete[]ipHost_A;
delete[]ipHost_B;
delete[]ipHost_C;
ipHost_A = nullptr;
ipHost_B = nullptr;
ipHost_C = nullptr;
return 0;
}
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
if (err == cudaSuccess)
return;
std::cerr << statement << " returned: " << cudaGetErrorName(err) << " \t : " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
exit(1);
}