2.1 在CPU上执行sum array
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
// Element-wise vector addition on the host: C[i] = A[i] + B[i] for i in [0, N).
// Serves as the CPU reference implementation for the GPU kernel.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    const float *stopA = A + N;
    while (A < stopA) {
        *C++ = *A++ + *B++;
    }
}
// Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
// Seed the RNG only once per process: the original reseeded with
// time() on every call, so two calls within the same second
// (e.g. for h_A and then h_B) produced identical arrays.
void initialData(float *ip, int size)
{
    static bool seeded = false;
    if (!seeded) {
        time_t t;
        srand((unsigned int) time(&t));
        seeded = true;
    }
    for (int i = 0; i < size; i++) {
        // rand() & 0xff yields 0..255, scaled down to 0.0..25.5
        ip[i] = (float) (rand() & 0xff) / 10.0f;
    }
}
// Host-only vector-addition demo: allocate three arrays, fill two with
// random data, sum them on the CPU, then release everything.
int main(int argc , char **argv)
{
    int nElem = 1024;
    size_t nBytes = nElem * sizeof(float);

    float *h_A = (float *) malloc(nBytes);
    float *h_B = (float *) malloc(nBytes);
    float *h_C = (float *) malloc(nBytes);
    // Bail out on allocation failure instead of dereferencing NULL below.
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host malloc failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    sumArraysOnHost(h_A, h_B, h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
2.2 查看grid, block的索引维度
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime.h>
// Kernel: every thread prints its own threadIdx/blockIdx together with
// the blockDim/gridDim of the launch, to visualize the index space.
// Device-side printf is serialized and slow -- debugging/teaching only;
// the print order across threads and blocks is unspecified.
__global__ void checkIndex(void)
{
printf("threadidx: (%d ,%d ,%d) blockidx:(%d ,%d ,%d) blockdim: (%d ,%d ,%d) gridDim: (%d ,%d ,%d)\n",
threadIdx.x, threadIdx.y, threadIdx.z,
blockIdx.x, blockIdx.y, blockIdx.z,
blockDim.x,blockDim.y,blockDim.z,
gridDim.x, gridDim.y, gridDim.z
);
}
// Launch checkIndex with a 1-D grid covering nElem items (3 threads per
// block) and print the chosen configuration from the host.
int main(int argc , char **argv)
{
    int nElem = 6;
    dim3 block(3);
    dim3 grid((nElem + block.x - 1) / block.x);   // ceil-div so the grid covers nElem

    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    checkIndex<<<grid, block>>>();
    // A kernel launch itself returns no status: pick up configuration
    // errors with cudaGetLastError(), then synchronize so device printf
    // output is flushed before the process tears the context down.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("launch error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    cudaDeviceSynchronize();
    cudaDeviceReset();
    return 0;
}
输出
grid.x 2 grid.y 1 grid.z 1
block.x 3 block.y 1 block.z 1
threadidx: (0 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (1 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (2 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (0 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (1 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (2 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
2.4 cuda核函数
kernel_name<<<grid, block>>> (argument list);
2.5 核函数限定符
global 在设备端执行,可从主机端调用;在计算能力≥3.5且启用动态并行(dynamic parallelism)的设备上也可从设备端调用,必须有void返回类型
device 在设备端执行,设备端调用
host 在主机端执行,主机端调用
2.6 在GPU中sum,和HOST sum array做对比
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
// Error-checking wrapper for CUDA runtime calls: print file/line and the
// decoded error string, then abort. The do { } while (0) form makes the
// macro expand to a single statement, so `CHECK(x);` composes safely with
// if/else bodies written without braces (a bare { } block would not).
#define CHECK(call) \
do {\
const cudaError_t error = (call); \
if (error != cudaSuccess)\
{\
printf("Error: %s: %d\n", __FILE__, __LINE__);\
printf("code :%d reason :%s\n", error , cudaGetErrorString(error));\
exit(1);\
}\
} while (0)
// Compare a host-computed reference against the GPU result element by
// element; report the first mismatch, or "array matches" if none.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // Compute |host - gpu| by hand in double precision. The original
        // called abs(), which with only <stdlib.h> in scope is the
        // *integer* overload: any fractional difference truncated to 0,
        // so mismatches smaller than 1.0 were silently accepted.
        double diff = (double) hostRef[i] - (double) gpuRef[i];
        if (diff < 0.0) diff = -diff;
        if (diff > epsilon)
        {
            match = 0;
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}
// CPU reference for the GPU kernel: element-wise C[k] = A[k] + B[k].
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int k = 0; k < N; ++k) {
        C[k] = A[k] + B[k];
    }
}
// Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
// Seed the RNG only once per process: the original reseeded with
// time() on every call, so h_A and h_B (initialized back to back,
// usually within the same second) came out identical.
void initialData(float *ip, int size)
{
    static bool seeded = false;
    if (!seeded) {
        time_t t;
        srand((unsigned int) time(&t));
        seeded = true;
    }
    for (int i = 0; i < size; i++) {
        // rand() & 0xff yields 0..255, scaled down to 0.0..25.5
        ip[i] = (float) (rand() & 0xff) / 10.0f;
    }
}
// Kernel: one thread per element, C[i] = A[i] + B[i].
// NOTE(review): there is no `if (i < n)` bounds guard, so the launch
// configuration must cover *exactly* the array length
// (gridDim.x * blockDim.x == nElem) or threads past the end read/write
// out of bounds. The launch in main() satisfies this (32 elements, one
// 32-thread block) -- confirm before reusing with other sizes.
__global__ void sumArraysOnGPU(float *A, float *B, float *C)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
C[i] = A[i] + B[i];
}
// Wall-clock timestamp in seconds (microsecond resolution), built from
// gettimeofday(). Useful for coarse host-side timing of code sections.
double cpusec()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    double seconds = (double) now.tv_sec;
    double micros = (double) now.tv_usec;
    return seconds + micros * 1.e-6;
}
// Vector addition on the GPU, validated against a CPU reference.
// Every CUDA runtime call goes through CHECK (defined above) -- the
// original defined the macro but never used it, so failures were silent.
int main(int argc , char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // set up host data
    int nElem = 32;
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *) malloc(nBytes);
    float *h_B = (float *) malloc(nBytes);
    float *hostRef = (float *) malloc(nBytes);
    float *gpuRef = (float *) malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        printf("host malloc failed\n");
        return 1;
    }
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // one thread per element; nElem is a multiple of block.x, which the
    // unguarded kernel requires (see sumArraysOnGPU)
    dim3 block(nElem);
    dim3 grid(nElem / block.x);
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
    printf("execution config <<<%d, %d>>>\n", grid.x, block.x);
    // launches return no status themselves: fetch launch-config errors here
    CHECK(cudaGetLastError());

    // copy kernel result back to host (cudaMemcpy synchronizes with the kernel)
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
输出
execution config <<<1, 32>>>
array matches
2.8 尝试用nvprof 检查执行时间,报错找不到dll文件,可以参考
https://blog.csdn.net/qq_41607336/article/details/126741908
解决,
但最后还是报warning, 跟显卡相关
======= Warning: nvprof is not supported on devices with compute capability 8.0 and higher.
2.9 加上device信息打印,以及cudaevent 计时, 参考https://blog.csdn.net/qq_42681630/article/details/144895351
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
typedef unsigned long DWORD;
// Error-checking wrapper for CUDA runtime calls: print file/line and the
// decoded error string, then abort. The do { } while (0) form makes the
// macro expand to a single statement, so `CHECK(x);` composes safely with
// if/else bodies written without braces (a bare { } block would not).
#define CHECK(call) \
do {\
const cudaError_t error = (call); \
if (error != cudaSuccess)\
{\
printf("Error: %s: %d\n", __FILE__, __LINE__);\
printf("code :%d reason :%s\n", error , cudaGetErrorString(error));\
exit(1);\
}\
} while (0)
// Compare a host-computed reference against the GPU result element by
// element; report the first mismatch, or "array matches" if none.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // Compute |host - gpu| by hand in double precision. The original
        // called abs(), which with only <stdlib.h> in scope is the
        // *integer* overload: any fractional difference truncated to 0,
        // so mismatches smaller than 1.0 were silently accepted.
        double diff = (double) hostRef[i] - (double) gpuRef[i];
        if (diff < 0.0) diff = -diff;
        if (diff > epsilon)
        {
            match = 0;
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}
// CPU reference for the GPU kernel: element-wise C[i] = A[i] + B[i].
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    int i = 0;
    while (i < N) {
        C[i] = A[i] + B[i];
        ++i;
    }
}
// Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
// Seed the RNG only once per process: the original reseeded with
// time() on every call, so h_A and h_B (initialized back to back,
// usually within the same second) came out identical.
void initialData(float *ip, int size)
{
    static bool seeded = false;
    if (!seeded) {
        time_t t;
        srand((unsigned int) time(&t));
        seeded = true;
    }
    for (int i = 0; i < size; i++) {
        // rand() & 0xff yields 0..255, scaled down to 0.0..25.5
        ip[i] = (float) (rand() & 0xff) / 10.0f;
    }
}
// Kernel: one thread per element, C[i] = A[i] + B[i].
// NOTE(review): there is no `if (i < n)` bounds guard, so the launch
// configuration must cover *exactly* the array length
// (gridDim.x * blockDim.x == nElem) or threads past the end read/write
// out of bounds. The launch in main() satisfies this (1<<24 elements,
// 1024-thread blocks divide evenly) -- confirm before reusing.
__global__ void sumArraysOnGPU(float *A, float *B, float *C)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
C[i] = A[i] + B[i];
}
// GPU vector addition with a device-property printout and cudaEvent
// timing of the kernel, validated against a CPU reference.
// Fixes vs. original: size_t device properties printed with %zu instead
// of %ld (wrong on LLP64/Windows); the host printf is moved out of the
// start/stop event window so it no longer inflates the measured time;
// all CUDA calls are wrapped in CHECK and launch errors are fetched.
int main(int argc , char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    printf( "Compute capability: %d.%d\n", deviceprop.major, deviceprop.minor );
    printf( "Clock rate: %d\n", deviceprop.clockRate );
    printf( "Memory Clock rate: %d\n", deviceprop.memoryClockRate );
    printf( "Memory busWidth: %d\n", deviceprop.memoryBusWidth );
    printf( " --- Memory Information for device ---\n");
    printf( "Total global mem: %zu\n", deviceprop.totalGlobalMem );
    printf( "Total constant Mem: %zu\n", deviceprop.totalConstMem );
    printf( "Max mem pitch: %zu\n", deviceprop.memPitch );
    printf( "Texture Alignment: %zu\n", deviceprop.textureAlignment );
    printf( " --- MP Information for device ---\n" );
    printf( "Multiprocessor count: %d\n", deviceprop.multiProcessorCount );
    printf( "Shared mem per mp: %zu\n", deviceprop.sharedMemPerBlock );
    printf( "Registers per mp: %d\n", deviceprop.regsPerBlock );
    printf( "Threads in warp: %d\n", deviceprop.warpSize );
    printf( "Max threads per block: %d\n", deviceprop.maxThreadsPerBlock );
    printf( "Max thread dimensions: (%d, %d, %d)\n",
            deviceprop.maxThreadsDim[0], deviceprop.maxThreadsDim[1],
            deviceprop.maxThreadsDim[2] );
    printf( "Max grid dimensions: (%d, %d, %d)\n",
            deviceprop.maxGridSize[0], deviceprop.maxGridSize[1],
            deviceprop.maxGridSize[2] );
    printf( "\n" );

    // set up host data
    int nElem = 1<<24;
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *) malloc(nBytes);
    float *h_B = (float *) malloc(nBytes);
    float *hostRef = (float *) malloc(nBytes);
    float *gpuRef = (float *) malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        printf("host malloc failed\n");
        return 1;
    }
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // 1024 threads per block; nElem = 1<<24 divides evenly, which the
    // unguarded kernel requires (see sumArraysOnGPU)
    int iLen = 1024;
    dim3 block(iLen);
    dim3 grid((nElem + block.x - 1) / block.x);
    // print the config BEFORE recording the start event: host work between
    // the two cudaEventRecord calls would be included in the elapsed time
    printf("execution config <<<%d, %d>>>\n", grid.x, block.x);

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
    CHECK(cudaEventRecord(stop));
    CHECK(cudaGetLastError());               // launch-configuration errors
    CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Kernel execution time: %f ms\n", milliseconds);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // copy kernel result back to host
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
输出:
Using Device 0 : NVIDIA GeForce RTX 4090
Compute capability: 8.9
Clock rate: 2520000
Memory Clock rate: 10501000
Memory busWidth: 384
— Memory Information for device —
Total global mem: 25756696576
Total constant Mem: 65536
Max mem pitch: 2147483647
Texture Alignment: 512
— MP Information for device —
Multiprocessor count: 128
Shared mem per mp: 49152
Registers per mp: 65536
Threads in warp: 32
Max threads per block: 1024
Max thread dimensions: (1024, 1024, 64)
Max grid dimensions: (2147483647, 65535, 65535)
execution config <<<16384, 1024>>>
execution config <<<16384, 1024>>>
Kernel execution time: 0.558752 ms
array matches