源码:
hello_vectorAdd.hip:
__global__ void vectorAdd(const float *A, const float *B, float *C) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
C[i] = A[i] + B[i] + 0.0f;
}
Makefile:
x.O1.s: hello_vectorAdd.hip
../../local_amdgpu/bin/clang++ ./hello_vectorAdd.hip -O1 -S --cuda-device-only -o x.O1.s
all:
../../local_amdgpu/bin/clang++ ./hello.hip -O1 -save-temps --cuda-device-only
结果:
核心部分:
_Z9vectorAddPKfS0_Pf: ; @_Z9vectorAddPKfS0_Pf
; %bb.0: ; %entry
s_load_dword s7, s[4:5], 0x24
s_load_dwordx4 s[0:3], s[4:5], 0x0
s_nop 0
s_load_dwordx2 s[4:5], s[4:5], 0x10
s_waitcnt lgkmcnt(0)
s_and_b32 s7, s7, 0xffff
s_mul_i32 s6, s6, s7
v_add_u32_e32 v0, s6, v0
v_ashrrev_i32_e32 v1, 31, v0
v_lshlrev_b64 v[0:1], 2, v[0:1]
v_mov_b32_e32 v3, s1
v_add_co_u32_e32 v2, vcc, s0, v0
v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
global_load_dword v4, v[2:3], off
v_mov_b32_e32 v3, s3
v_add_co_u32_e32 v2, vcc, s2, v0
v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
global_load_dword v2, v[2:3], off
v_mov_b32_e32 v3, s5
v_add_co_u32_e32 v0, vcc, s4, v0
v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
s_waitcnt vmcnt(0)
v_add_f32_e32 v2, v4, v2
v_add_f32_e32 v2, 0, v2
global_store_dword v[0:1], v2, off
s_endpgm
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx906"
.amdhsa_code_object_version 5
.protected _Z9vectorAddPKfS0_Pf ; -- Begin function _Z9vectorAddPKfS0_Pf
.globl _Z9vectorAddPKfS0_Pf
.p2align 8
.type _Z9vectorAddPKfS0_Pf,@function
_Z9vectorAddPKfS0_Pf: ; @_Z9vectorAddPKfS0_Pf
; %bb.0: ; %entry
s_load_dword s7, s[4:5], 0x24
s_load_dwordx4 s[0:3], s[4:5], 0x0
s_nop 0
s_load_dwordx2 s[4:5], s[4:5], 0x10
s_waitcnt lgkmcnt(0)
s_and_b32 s7, s7, 0xffff
s_mul_i32 s6, s6, s7
v_add_u32_e32 v0, s6, v0
v_ashrrev_i32_e32 v1, 31, v0
v_lshlrev_b64 v[0:1], 2, v[0:1]
v_mov_b32_e32 v3, s1
v_add_co_u32_e32 v2, vcc, s0, v0
v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
global_load_dword v4, v[2:3], off
v_mov_b32_e32 v3, s3
v_add_co_u32_e32 v2, vcc, s2, v0
v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
global_load_dword v2, v[2:3], off
v_mov_b32_e32 v3, s5
v_add_co_u32_e32 v0, vcc, s4, v0
v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
s_waitcnt vmcnt(0)
v_add_f32_e32 v2, v4, v2
v_add_f32_e32 v2, 0, v2
global_store_dword v[0:1], v2, off
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel _Z9vectorAddPKfS0_Pf
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 280
.amdhsa_user_sgpr_count 6
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_flat_scratch_init 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 0
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 5
.amdhsa_next_free_sgpr 8
.amdhsa_reserve_flat_scratch 0
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size _Z9vectorAddPKfS0_Pf, .Lfunc_end0-_Z9vectorAddPKfS0_Pf
; -- End function
.section .AMDGPU.csdata,"",@progbits
; Kernel info:
; codeLenInByte = 136
; NumSgprs: 12
; NumVgprs: 5
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 12
; NumVGPRsForWavesPerEU: 5
; Occupancy: 8
; WaveLimiterHint : 0
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,"aG",@progbits,_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,comdat
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,"aG",@progbits,_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,comdat
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,"aG",@progbits,_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,comdat
.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
.type __hip_cuid_70e60f577689ac5a,@object ; @__hip_cuid_70e60f577689ac5a
.section .bss,"aw",@nobits
.globl __hip_cuid_70e60f577689ac5a
__hip_cuid_70e60f577689ac5a:
.byte 0 ; 0x0
.size __hip_cuid_70e60f577689ac5a, 1
.ident "clang version 19.0.0git (git@github.com:ROCm/llvm-project.git bba83842d40d65b75cedced64cd444623e0930ec)"
.ident "AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"
.section ".note.GNU-stack","",@progbits
.addrsig
.addrsig_sym __hip_cuid_70e60f577689ac5a
.amdgpu_metadata
---
amdhsa.kernels:
- .args:
- .address_space: global
.name: A.coerce
.offset: 0
.size: 8
.value_kind: global_buffer
- .address_space: global
.name: B.coerce
.offset: 8
.size: 8
.value_kind: global_buffer
- .address_space: global
.name: C.coerce
.offset: 16
.size: 8
.value_kind: global_buffer
- .offset: 24
.size: 4
.value_kind: hidden_block_count_x
- .offset: 28
.size: 4
.value_kind: hidden_block_count_y
- .offset: 32
.size: 4
.value_kind: hidden_block_count_z
- .offset: 36
.size: 2
.value_kind: hidden_group_size_x
- .offset: 38
.size: 2
.value_kind: hidden_group_size_y
- .offset: 40
.size: 2
.value_kind: hidden_group_size_z
- .offset: 42
.size: 2
.value_kind: hidden_remainder_x
- .offset: 44
.size: 2
.value_kind: hidden_remainder_y
- .offset: 46
.size: 2
.value_kind: hidden_remainder_z
- .offset: 64
.size: 8
.value_kind: hidden_global_offset_x
- .offset: 72
.size: 8
.value_kind: hidden_global_offset_y
- .offset: 80
.size: 8
.value_kind: hidden_global_offset_z
- .offset: 88
.size: 2
.value_kind: hidden_grid_dims
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 280
.language: OpenCL C
.language_version:
- 2
- 0
.max_flat_workgroup_size: 1024
.name: _Z9vectorAddPKfS0_Pf
.private_segment_fixed_size: 0
.sgpr_count: 12
.sgpr_spill_count: 0
.symbol: _Z9vectorAddPKfS0_Pf.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 5
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx906
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
debug:
调试从 .bc -> .s 的过程
$ gdb ../../local_amdgpu/bin/llc
(gdb) set args hello-hip-amdgcn-amd-amdhsa-gfx906.bc -o hello.bc.gdb.s
生成了 asm 文件:
$ ls