安装编译环境
https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.85_windows.exe
CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)
#set(CMAKE_CUDA_ARCHITECTURES 52;70;75;86)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
project(hellocuda LANGUAGES CXX CUDA)
add_executable(main main.cu hello.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
# --use_fast_math sinf 替换成 __sinf
# --ftz=true 会把极小数(denormal)退化为0
# --prec-div=false 降低除法的精度换取速度。
# --prec-sqrt=false 降低开方的精度换取速度。
# --fmad 因为非常重要,所以默认就是开启的,会自动把 a * b + c 优化成乘加(FMA)指令。
# 开启 --use_fast_math 后会自动开启上述所有
CudaAllocator.h
template <class T>
struct CudaAllocator {
using value_type = T;
T* allocate(size_t size) {
T* ptr = nullptr;
checkCudaErrors(cudaMallocManaged(&ptr, size * sizeof(T)));
return ptr;
}
void deallocate(T* ptr, size_t size = 0) {
checkCudaErrors(cudaFree(ptr));
}
template <class ...Args>
void construct(T* p, Args &&...args) {
if constexpr (!(sizeof...(Args) == 0 && std::is_pod_v<T>))
::new((void*)p) T(std::forward<Args>(args)...);
}
};
hello.cu
#include <cstdio>
#include <cuda_runtime.h>
__device__ void say_hello3() { // 定义
printf("Hello, world!\n");
}
main.cu
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include <thrust/universal_vector.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/for_each.h>
__device__ void say_hello3(); // 声明
__device__ __inline__ void say_hello() {
printf("Hello, world, world from GPU!\n");
}
__host__ void say_hello_host() {
printf("Hello, world from CPU!\n");
}
__host__ __device__ void say_hello2() {
#ifdef __CUDA_ARCH__
printf("Hello, world from GPU architecture %d!\n", __CUDA_ARCH__);
#else
printf("Hello, world from CPU!\n");
#endif
}
__global__ void another() {
int localVal = 1;
printf("another: Thread %d of %d\n", threadIdx.x, blockDim.x);
}
__global__ void kernel() {
unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int tnum = blockDim.x * gridDim.x;
printf("Flattened Thread %d of %d\n", tid, tnum);
say_hello2();
say_hello3();
another << <1, tnum >> > (); // 核函数调用核函数
printf("kernel: called another with %d threads\n", tnum);
}
__global__ void kernel2() {
printf("Block (%d,%d,%d) of (%d,%d,%d), Thread (%d,%d,%d) of (%d,%d,%d)\n",
blockIdx.x, blockIdx.y, blockIdx.z,
gridDim.x, gridDim.y, gridDim.z,
threadIdx.x, threadIdx.y, threadIdx.z,
blockDim.x, blockDim.y, blockDim.z);
}
// 默认host
constexpr const char* cuthead(const char* p) {
return p + 1;
}
// 内存访问
__global__ void kernel_memory(int* pret) {
*pret = 42;
}
// 数组并行处理
__global__ void kernel_array(int* arr, int n) {
for (int i = blockDim.x * blockIdx.x + threadIdx.x;
i < n; i += blockDim.x * gridDim.x) {
arr[i] = i;
}
}
// 并行处理模板
template <class Func>
__global__ void parallel_for(int n, Func func) {
for (int i = blockDim.x * blockIdx.x + threadIdx.x;
i < n; i += blockDim.x * gridDim.x) {
func(i);
}
}
// 自定义加减乘除
__device__ __inline__ int float_atomic_add(float* dst, float src) {
int old = __float_as_int(*dst), expect;
do {
expect = old;
old = atomicCAS((int*)dst, expect,
__float_as_int(__int_as_float(expect) + src));
} while (expect != old);
return old;
}
// map
template <int blockSize, class T>
__global__ void parallel_sum_kernel(T* sum, T const* arr, int n) {
__shared__ volatile int local_sum[blockSize];
int j = threadIdx.x;
int i = blockIdx.x;
T temp_sum = 0;
for (int t = i * blockSize + j; t < n; t += blockSize * gridDim.x) {
temp_sum += arr[t];
}
local_sum[j] = temp_sum;
__syncthreads();
if constexpr (blockSize >= 1024) {
if (j < 512)
local_sum[j] += local_sum[j + 512];
__syncthreads();
}
if constexpr (blockSize >= 512) {
if (j < 256)
local_sum[j] += local_sum[j + 256];
__syncthreads();
}
if constexpr (blockSize >= 256) {
if (j < 128)
local_sum[j] += local_sum[j + 128];
__syncthreads();
}
if constexpr (blockSize >= 128) {
if (j < 64)
local_sum[j] += local_sum[j + 64];
__syncthreads();
}
if (j < 32) {
if constexpr (blockSize >= 64)
local_sum[j] += local_sum[j + 32];
if constexpr (blockSize >= 32)
local_sum[j] += local_sum[j + 16];
if constexpr (blockSize >= 16)
local_sum[j] += local_sum[j + 8];
if constexpr (blockSize >= 8)
local_sum[j] += local_sum[j + 4];
if constexpr (blockSize >= 4)
local_sum[j] += local_sum[j + 2];
if (j == 0) {
sum[i] = local_sum[0] + local_sum[1];
}
}
}
// reduce
template <int reduceScale = 4096, int blockSize = 256, int cutoffSize = reduceScale * 2, class T>
int parallel_sum(T const* arr, int n) {
if (n > cutoffSize) {
std::vector<int, CudaAllocator<int>> sum(n / reduceScale);
parallel_sum_kernel<blockSize> << <n / reduceScale, blockSize >> > (sum.data(), arr, n);
return parallel_sum(sum.data(), n / reduceScale);
}
else {
checkCudaErrors(cudaDeviceSynchronize());
T final_sum = 0;
for (int i = 0; i < n; i++) {
final_sum += arr[i];
}
return final_sum;
}
}
// 共享内存
template <int blockSize, class T>
__global__ void parallel_transpose(T* out, T const* in, int nx, int ny) {
int x = blockIdx.x * blockSize + threadIdx.x;
int y = blockIdx.y * blockSize + threadIdx.y;
if (x >= nx || y >= ny) return;
__shared__ T tmp[blockSize * blockSize];
int rx = blockIdx.y * blockSize + threadIdx.x;
int ry = blockIdx.x * blockSize + threadIdx.y;
tmp[threadIdx.y * blockSize + threadIdx.x] = in[ry * nx + rx];
__syncthreads();
out[y * nx + x] = tmp[threadIdx.x * blockSize + threadIdx.y];
}
void testArray() {
int n = 65535;
int* arr;
checkCudaErrors(cudaMallocManaged(&arr, n * sizeof(int)));
int nthreads = 128;
int nblocks = (n + nthreads + 1) / nthreads;
kernel_array << <nblocks, nthreads >> > (arr, n);
checkCudaErrors(cudaDeviceSynchronize());
for (int i = 0; i < n; i++) {
printf("arr[%d]: %d\n", i, arr[i]);
}
cudaFree(arr);
}
void testVector() {
int n = 65536;
std::vector<int, CudaAllocator<int>> arr(n);
parallel_for << <32, 128 >> > (n, [arr = arr.data()] __device__(int i) {
arr[i] = i;
});
checkCudaErrors(cudaDeviceSynchronize());
for (int i = 0; i < n; i++) {
printf("arr[%d] = %d\n", i, arr[i]);
}
}
void testMath() {
int n = 1 << 25;
std::vector<float, CudaAllocator<float>> gpu(n);
std::vector<float> cpu(n);
for (int i = 0; i < n; i++) {
cpu[i] = sinf(i);
}
parallel_for << <n / 512, 128 >> > (n, [gpu = gpu.data()] __device__(int i) {
gpu[i] = __sinf(i);
});
checkCudaErrors(cudaDeviceSynchronize());
//for (int i = 0; i < n; i++) {
//printf("diff %d = %f\n", i, gpu[i] - cpu[i]);
//}
}
void testTrust() {
int n = 65536;
thrust::universal_vector<float> x(n); // CPU或GPU
thrust::host_vector<float> x_host(n); // CPU
thrust::device_vector<float> x_dev = x_host; // GPU
thrust::for_each(x_dev.begin(), x_dev.end(), [] __device__(float& x) {
x += 100.f;
});
}
void testAtomic() {
// atomicAdd(dst, src)
// atomicSub
// atomicOr
// atomicAnd
// atomicXor
// atomicMax
// atomicMin
// atomicExch old = *dst; *dst = src
// atomicCAS automic::exchange
}
void test_share() {
int nx = 1 << 14, ny = 1 << 14;
std::vector<int, CudaAllocator<int>> in(nx * ny);
std::vector<int, CudaAllocator<int>> out(nx * ny);
parallel_transpose<32> << <dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1) >> >
(out.data(), in.data(), nx, ny);
}
// host 可以调用 global;global 可以调用 device;device 可以调用 device
// 实际上 GPU 的板块相当于 CPU 的线程,GPU 的线程相当于 CPU 的SIMD
// 数学函数sqrtf,rsqrtf,cbrtf,rcbrtf,powf,sinf,cosf,sinpif,cospif,sincosf,sincospif,logf,log2f,log10f,expf,exp2f,exp10f,tanf,atanf,asinf,acosf,fmodf,fabsf,fminf,fmaxf
// 低精度,高性能 __sinf,__expf、__logf、__cosf、__powf, __fdividef(x, y)
// old = atomicAdd(dst, src) 相当于 old = *dst; *dst += src
int main() {
// 三重尖括号里的,第一个参数表示板块数量,第二个参数决定着启动 kernel 时所用 GPU 的线程数量
kernel<<<1, 1>>>(); // CPU调用GPU函数异步执行
kernel2<<<dim3(2, 1, 1), dim3(2, 2, 2)>>>();
cudaDeviceSynchronize(); // CPU等待GPU完成
say_hello_host();
int* pret;
checkCudaErrors(cudaMallocManaged(&pret, sizeof(int)));
kernel_memory << <1, 1 >> > (pret);
checkCudaErrors(cudaDeviceSynchronize());
printf("result: %d\n", *pret);
cudaFree(pret);
testArray();
float sin10 = sinf(10.f);
/// map-reduce
int n = 1 << 24;
std::vector<int, CudaAllocator<int>> arr(n);
int final_sum = parallel_sum(arr.data(), n);
return 0;
}
参考
cuda-samples/Common/helper_string.h at 5f97d7d0dff880bc6567faa4c5e62e389a6d6999 · NVIDIA/cuda-samples · GitHub
https://github.com/NVIDIA/cuda-samples/blob/5f97d7d0dff880bc6567faa4c5e62e389a6d6999/Common/helper_cuda.h#L31
https://www.nvidia.cn/docs/IO/51635/NVIDIA_CUDA_Programming_Guide_1.1_chs.pdf
https://developer.download.nvidia.cn/CUDA/training/StreamsAndConcurrencyWebinar.pdf
CUDA Pro Tip: Write Flexible Kernels with Grid-Stride Loops | NVIDIA Technical Blog