2.1 在CPU上执行sum array
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
// Element-wise vector addition on the host: C[i] = A[i] + B[i] for i in [0, N).
// Serves as the CPU reference implementation for the GPU kernel.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    const float *stopA = A + N;
    while (A < stopA) {
        *C++ = *A++ + *B++;
    }
}
// Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
// Seed the RNG only once per process: the original reseeded with
// time() on every call, so two calls within the same second
// (e.g. for h_A and then h_B) produced identical arrays.
void initialData(float *ip, int size)
{
    static bool seeded = false;
    if (!seeded) {
        time_t t;
        srand((unsigned int) time(&t));
        seeded = true;
    }
    for (int i = 0; i < size; i++) {
        // rand() & 0xff yields 0..255, scaled down to 0.0..25.5
        ip[i] = (float) (rand() & 0xff) / 10.0f;
    }
}
// Host-only vector-addition demo: allocate three arrays, fill two with
// random data, sum them on the CPU, then release everything.
int main(int argc , char **argv)
{
    int nElem = 1024;
    size_t nBytes = nElem * sizeof(float);

    float *h_A = (float *) malloc(nBytes);
    float *h_B = (float *) malloc(nBytes);
    float *h_C = (float *) malloc(nBytes);
    // Bail out on allocation failure instead of dereferencing NULL below.
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host malloc failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    sumArraysOnHost(h_A, h_B, h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
2.2 查看grid, block的索引维度
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime.h>
// Kernel: every thread prints its own threadIdx/blockIdx together with
// the blockDim/gridDim of the launch, to visualize the index space.
// Device-side printf is serialized and slow -- debugging/teaching only;
// the print order across threads and blocks is unspecified.
__global__ void checkIndex(void)
{
printf("threadidx: (%d ,%d ,%d) blockidx:(%d ,%d ,%d) blockdim: (%d ,%d ,%d) gridDim: (%d ,%d ,%d)\n",
threadIdx.x, threadIdx.y, threadIdx.z,
blockIdx.x, blockIdx.y, blockIdx.z,
blockDim.x,blockDim.y,blockDim.z,
gridDim.x, gridDim.y, gridDim.z
);
}
// Launch checkIndex with a 1-D grid covering nElem items (3 threads per
// block) and print the chosen configuration from the host.
int main(int argc , char **argv)
{
    int nElem = 6;
    dim3 block(3);
    dim3 grid((nElem + block.x - 1) / block.x);   // ceil-div so the grid covers nElem

    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    checkIndex<<<grid, block>>>();
    // A kernel launch itself returns no status: pick up configuration
    // errors with cudaGetLastError(), then synchronize so device printf
    // output is flushed before the process tears the context down.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("launch error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    cudaDeviceSynchronize();
    cudaDeviceReset();
    return 0;
}
输出
grid.x 2 grid.y 1 grid.z 1
block.x 3 block.y 1 block.z 1
threadidx: (0 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (1 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (2 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (0 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (1 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (2 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
2.4 cuda核函数
kernel_name<<<grid, block>>> (argument list);
2.5 核函数限定符
global 在设备端执行,可从主机端调用;在计算能力≥3.5且启用动态并行(dynamic parallelism)的设备上也可从设备端调用,必须有void返回类型
device 在设备端执行,设备端调用
host 在主机端执行,主机端调用
2.6 在GPU中sum,和HOST sum array做对比
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
// Error-checking wrapper for CUDA runtime calls: print file/line and the
// decoded error string, then abort. The do { } while (0) form makes the
// macro expand to a single statement, so `CHECK(x);` composes safely with
// if/else bodies written without braces (a bare { } block would not).
#define CHECK(call) \
do {\
const cudaError_t error = (call); \
if (error != cudaSuccess)\
{\
printf("Error: %s: %d\n", __FILE__, __LINE__);\
printf("code :%d reason :%s\n", error , cudaGetErrorString(error));\
exit(1);\
}\
} while (0)
// Compare a host-computed reference against the GPU result element by
// element; report the first mismatch, or "array matches" if none.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // Compute |host - gpu| by hand in double precision. The original
        // called abs(), which with only <stdlib.h> in scope is the
        // *integer* overload: any fractional difference truncated to 0,
        // so mismatches smaller than 1.0 were silently accepted.
        double diff = (double) hostRef[i] - (double) gpuRef[i];
        if (diff < 0.0) diff = -diff;
        if (diff > epsilon)
        {
            match = 0;
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}
// CPU reference for the GPU kernel: element-wise C[k] = A[k] + B[k].
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int k = 0; k < N; ++k) {
        C[k] = A[k] + B[k];
    }
}
// Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
// Seed the RNG only once per process: the original reseeded with
// time() on every call, so h_A and h_B (initialized back to back,
// usually within the same second) came out identical.
void initialData(float *ip, int size)
{
    static bool seeded = false;
    if (!seeded) {
        time_t t;
        srand((unsigned int) time(&t));
        seeded = true;
    }
    for (int i = 0; i < size; i++) {
        // rand() & 0xff yields 0..255, scaled down to 0.0..25.5
        ip[i] = (float) (rand() & 0xff) / 10.0f;
    }
}
// Kernel: one thread per element, C[i] = A[i] + B[i].
// NOTE(review): there is no `if (i < n)` bounds guard, so the launch
// configuration must cover *exactly* the array length
// (gridDim.x * blockDim.x == nElem) or threads past the end read/write
// out of bounds. The launch in main() satisfies this (32 elements, one
// 32-thread block) -- confirm before reusing with other sizes.
__global__ void sumArraysOnGPU(float *A, float *B, float *C)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
C[i] = A[i] + B[i];
}
// Wall-clock timestamp in seconds (microsecond resolution), built from
// gettimeofday(). Useful for coarse host-side timing of code sections.
double cpusec()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    double seconds = (double) now.tv_sec;
    double micros = (double) now.tv_usec;
    return seconds + micros * 1.e-6;
}
// Vector addition on the GPU, validated against a CPU reference.
// Every CUDA runtime call goes through CHECK (defined above) -- the
// original defined the macro but never used it, so failures were silent.
int main(int argc , char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // set up host data
    int nElem = 32;
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *) malloc(nBytes);
    float *h_B = (float *) malloc(nBytes);
    float *hostRef = (float *) malloc(nBytes);
    float *gpuRef = (float *) malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        printf("host malloc failed\n");
        return 1;
    }
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // one thread per element; nElem is a multiple of block.x, which the
    // unguarded kernel requires (see sumArraysOnGPU)
    dim3 block(nElem);
    dim3 grid(nElem / block.x);
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
    printf("execution config <<<%d, %d>>>\n", grid.x, block.x);
    // launches return no status themselves: fetch launch-config errors here
    CHECK(cudaGetLastError());

    // copy kernel result back to host (cudaMemcpy synchronizes with the kernel)
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
输出
execution config <<<1, 32>>>
array matches
2.8 尝试用nvprof 检查执行时间,报错找不到dll文件,可以参考
https://blog.csdn.net/qq_41607336/article/details/126741908
解决,
但最后还是报warning, 跟显卡相关
======= Warning: nvprof is not supported on devices with compute capability 8.0 and higher.
2.9 加上device信息打印,以及cudaevent 计时, 参考https://blog.csdn.net/qq_42681630/article/details/144895351
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
typedef unsigned long DWORD;
// Error-checking wrapper for CUDA runtime calls: print file/line and the
// decoded error string, then abort. The do { } while (0) form makes the
// macro expand to a single statement, so `CHECK(x);` composes safely with
// if/else bodies written without braces (a bare { } block would not).
#define CHECK(call) \
do {\
const cudaError_t error = (call); \
if (error != cudaSuccess)\
{\
printf("Error: %s: %d\n", __FILE__, __LINE__);\
printf("code :%d reason :%s\n", error , cudaGetErrorString(error));\
exit(1);\
}\
} while (0)
// Compare a host-computed reference against the GPU result element by
// element; report the first mismatch, or "array matches" if none.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // Compute |host - gpu| by hand in double precision. The original
        // called abs(), which with only <stdlib.h> in scope is the
        // *integer* overload: any fractional difference truncated to 0,
        // so mismatches smaller than 1.0 were silently accepted.
        double diff = (double) hostRef[i] - (double) gpuRef[i];
        if (diff < 0.0) diff = -diff;
        if (diff > epsilon)
        {
            match = 0;
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}
// CPU reference for the GPU kernel: element-wise C[i] = A[i] + B[i].
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    int i = 0;
    while (i < N) {
        C[i] = A[i] + B[i];
        ++i;
    }
}
// Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
// Seed the RNG only once per process: the original reseeded with
// time() on every call, so h_A and h_B (initialized back to back,
// usually within the same second) came out identical.
void initialData(float *ip, int size)
{
    static bool seeded = false;
    if (!seeded) {
        time_t t;
        srand((unsigned int) time(&t));
        seeded = true;
    }
    for (int i = 0; i < size; i++) {
        // rand() & 0xff yields 0..255, scaled down to 0.0..25.5
        ip[i] = (float) (rand() & 0xff) / 10.0f;
    }
}
// Kernel: one thread per element, C[i] = A[i] + B[i].
// NOTE(review): there is no `if (i < n)` bounds guard, so the launch
// configuration must cover *exactly* the array length
// (gridDim.x * blockDim.x == nElem) or threads past the end read/write
// out of bounds. The launch in main() satisfies this (1<<24 elements,
// 1024-thread blocks divide evenly) -- confirm before reusing.
__global__ void sumArraysOnGPU(float *A, float *B, float *C)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
C[i] = A[i] + B[i];
}
// GPU vector addition with a device-property printout and cudaEvent
// timing of the kernel, validated against a CPU reference.
// Fixes vs. original: size_t device properties printed with %zu instead
// of %ld (wrong on LLP64/Windows); the host printf is moved out of the
// start/stop event window so it no longer inflates the measured time;
// all CUDA calls are wrapped in CHECK and launch errors are fetched.
int main(int argc , char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    printf( "Compute capability: %d.%d\n", deviceprop.major, deviceprop.minor );
    printf( "Clock rate: %d\n", deviceprop.clockRate );
    printf( "Memory Clock rate: %d\n", deviceprop.memoryClockRate );
    printf( "Memory busWidth: %d\n", deviceprop.memoryBusWidth );
    printf( " --- Memory Information for device ---\n");
    printf( "Total global mem: %zu\n", deviceprop.totalGlobalMem );
    printf( "Total constant Mem: %zu\n", deviceprop.totalConstMem );
    printf( "Max mem pitch: %zu\n", deviceprop.memPitch );
    printf( "Texture Alignment: %zu\n", deviceprop.textureAlignment );
    printf( " --- MP Information for device ---\n" );
    printf( "Multiprocessor count: %d\n", deviceprop.multiProcessorCount );
    printf( "Shared mem per mp: %zu\n", deviceprop.sharedMemPerBlock );
    printf( "Registers per mp: %d\n", deviceprop.regsPerBlock );
    printf( "Threads in warp: %d\n", deviceprop.warpSize );
    printf( "Max threads per block: %d\n", deviceprop.maxThreadsPerBlock );
    printf( "Max thread dimensions: (%d, %d, %d)\n",
            deviceprop.maxThreadsDim[0], deviceprop.maxThreadsDim[1],
            deviceprop.maxThreadsDim[2] );
    printf( "Max grid dimensions: (%d, %d, %d)\n",
            deviceprop.maxGridSize[0], deviceprop.maxGridSize[1],
            deviceprop.maxGridSize[2] );
    printf( "\n" );

    // set up host data
    int nElem = 1<<24;
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *) malloc(nBytes);
    float *h_B = (float *) malloc(nBytes);
    float *hostRef = (float *) malloc(nBytes);
    float *gpuRef = (float *) malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        printf("host malloc failed\n");
        return 1;
    }
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // 1024 threads per block; nElem = 1<<24 divides evenly, which the
    // unguarded kernel requires (see sumArraysOnGPU)
    int iLen = 1024;
    dim3 block(iLen);
    dim3 grid((nElem + block.x - 1) / block.x);
    // print the config BEFORE recording the start event: host work between
    // the two cudaEventRecord calls would be included in the elapsed time
    printf("execution config <<<%d, %d>>>\n", grid.x, block.x);

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
    CHECK(cudaEventRecord(stop));
    CHECK(cudaGetLastError());               // launch-configuration errors
    CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Kernel execution time: %f ms\n", milliseconds);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // copy kernel result back to host
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
输出:
Using Device 0 : NVIDIA GeForce RTX 4090
Compute capability: 8.9
Clock rate: 2520000
Memory Clock rate: 10501000
Memory busWidth: 384
— Memory Information for device —
Total global mem: 25756696576
Total constant Mem: 65536
Max mem pitch: 2147483647
Texture Alignment: 512
— MP Information for device —
Multiprocessor count: 128
Shared mem per mp: 49152
Registers per mp: 65536
Threads in warp: 32
Max threads per block: 1024
Max thread dimensions: (1024, 1024, 64)
Max grid dimensions: (2147483647, 65535, 65535)
execution config <<<16384, 1024>>>
execution config <<<16384, 1024>>>
Kernel execution time: 0.558752 ms
array matches