0,环境
ubuntu 22.04
gcc-11
x86_64 18cores/36threads
256GB RAM
rocm 6.0.2
Radeon VII
1,第一次尝试
构建命令:
cmake -G "Unix Makefiles" ../llvm \
-DLLVM_ENABLE_PROJECTS="clang;lld;lldb;mlir;openmp" \
-DLLVM_BUILD_EXAMPLES=ON \
-DLLVM_TARGETS_TO_BUILD="host;AMDGPU" \
-DCMAKE_BUILD_TYPE=Debug \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_RUNTIMES=all \
-DLLVM_BUILD_LLVM_DYLIB=ON \
-DCMAKE_INSTALL_PREFIX=../../local_amdgpu
make -j32
效果:
由于 ncurses 库没有搞定,所以失败了:
估计是lldb的layout src之类的命令会用到 ncurses,
对策:不编译 lldb项目
2,第二次尝试
删掉了 lldb 和 openmp,调整了TARGET,加入了 NVPTX,
重来:
cmake -G "Unix Makefiles" ../llvm \
-DLLVM_ENABLE_PROJECTS="clang;lld;mlir" \
-DLLVM_BUILD_EXAMPLES=ON \
-DLLVM_TARGETS_TO_BUILD="Native;NVPTX;AMDGPU" \
-DCMAKE_BUILD_TYPE=Debug \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_RUNTIMES=all \
-DLLVM_BUILD_LLVM_DYLIB=ON \
-DCMAKE_INSTALL_PREFIX=../../local_amdgpu
make -j34
编译成功,
安装 make install
效果:
3,测试编译rocm hip源码
3.1 源码示例
vectorAdd.hip
#include <stdio.h>
#include <hip/hip_runtime.h>
__global__ void vectorAdd(const float *A, const float *B, float *C) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
C[i] = A[i] + B[i] + 0.0f;
}
int main(void) {
hipError_t err = hipSuccess;
int numElements = 512;
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / (float)RAND_MAX;
h_B[i] = rand() / (float)RAND_MAX;
}
float *d_A = NULL;
err = hipMalloc((void **)&d_A, size);
if (err != hipSuccess) {
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
float *d_B = NULL;
err = hipMalloc((void **)&d_B, size);
if (err != hipSuccess) {
fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
float *d_C = NULL;
err = hipMalloc((void **)&d_C, size);
if (err != hipSuccess) {
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("Copy input data from the host memory to the CUDA device\n");
err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr,
"Failed to copy vector A from host to device (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr,
"Failed to copy vector B from host to device (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
err = hipGetLastError();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("Copy output data from the CUDA device to the host memory\n");
err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr,
"Failed to copy vector C from device to host (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
err = hipFree(d_A);
if (err != hipSuccess) {
fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
err = hipFree(d_B);
if (err != hipSuccess) {
fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
err = hipFree(d_C);
if (err != hipSuccess) {
fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
hipGetErrorString(err));
exit(EXIT_FAILURE);
}
free(h_A);
free(h_B);
free(h_C);
printf("Done\n");
return 0;
}
3.2 编译运行
$ ../../local_amdgpu/bin/clang++ hello_vectorAdd.hip --rocm-path=/opt/rocm
这样简单的运行 clang++,还不太确定是不是仅仅起到了编译驱动的作用,后面进一步分析。
运行成功,
效果:
未完待续。。。