端午假期安装好了vs c++2022,并写了个简单的汇编代码,证明MASM真的可以运行。今天需要搞一个实实在在的C++和ASM混合编程的例子,因为用纯汇编的求伯君写WPS的时代一去不复返了。个别关键函数用汇编,充分发挥CPU的特色功能,偶尔还是需要的。
昨天找的随书代码的位置在github上:GitHub - Apress/modern-x86-assembly-language-programming-3e: Source Code for 'Modern X86 Assembly Language Programming' by Daniel Kusswurm
这是第三版,最新的书。又从Z-Library上下载了这本英文书,导入微信读书,自动翻译为中文,Z-Library加微信读书,使我实现了读书ziyou(啥时候财务ziyou,还远)。
这本书的附录A就举了怎样在vs2022环境建立一个C++加ASM的例子,今天咱们就逐步跟着书上学这个例子。
首先创建project
• Create a C++ project • Enable MASM support • Add an assembly language file • Set project properties • Edit the source code • Build and run the project
- 启动VS
- New Project
- Select Console App
- Project name:Example1
- Solution name:TestSolution
- Create
- Build>Configuration Manager,choose <Edit...>
- select X86, Remove--我的环境是Win32
其次,配置ASM环境的步骤:
- View>Solution Explorer
- right-click Example1 and select Build Dependencies>Build Customizations
- check masm
- Add New Item
- select .cpp for the file type
- Example1_fasm.asm Add
第三步是设置project属性
- Example1 and select Properties
- All Configurations All Platforms
- C/C++>Code Generation: set Enable Enhanced Instruction Set to Advanced Vector Extensions (/arch:AVX), or AVX2 or AVX512
- C/C++>Output Files: change Assembler Output to Assembly, Machine Code and Source (/FAcs)
- Microsoft Macro Assembler>Listing File: set Enable Assembly Generated Code Listing to Yes (/Sg)
- Change the Assembled Code Listing File text field to $(IntDir)\%(filename).lst
- Click OK
$(IntDir)\%(filename).lst --这是1还是L?是小写字母 l:.lst 是 listing(列表文件)的缩写,不是数字 1。
最后一步就是写源码了
- AppendixA\TestSolution\Example1\Example1.cpp
- AppendixA\TestSolution\Example1\Example1_fasm.asm
Example1.cpp
#include <iostream>
#include <iomanip>
#include <string>
#include <cmath>
// AVX version of the element-wise add z[i] = x[i] + y[i] for i in [0, n).
// The body lives in Example1_fasm.asm (proc CalcZ_avx); extern "C" keeps the
// symbol name unmangled so it matches the assembly procedure name.
extern "C" void CalcZ_avx(float* z, const float* x, const float* y, size_t n);
// Scalar reference implementation used to check CalcZ_avx.
// Writes z[i] = x[i] + y[i] for every i in [0, n); does nothing when n == 0.
static void CalcZ_cpp(float* z, const float* x, const float* y, size_t n)
{
    const float* const x_end = x + n;

    // Walk the three arrays in lock-step; each sum is read before it is stored.
    while (x != x_end)
    {
        const float sum = *x + *y;
        *z = sum;
        ++x;
        ++y;
        ++z;
    }
}
// Demo driver: fills two input arrays, adds them with the C++ reference
// routine and with the assembly routine, then prints both results side by
// side and stops at the first row where they disagree by more than eps.
int main(void)
{
    constexpr size_t n = 20;        // number of elements in each array
    constexpr size_t w = 10;        // width of every printed column
    constexpr float eps = 1.0e-6f;  // largest tolerated |z1 - z2|
    constexpr char nl = '\n';

    float x[n], y[n], z1[n], z2[n];

    // Fill the inputs and clear both result arrays.
    for (size_t k = 0; k < n; ++k)
    {
        x[k] = k * 10.0f + 10.0f;
        y[k] = k * 1000.0f + 1000.0f;
        z1[k] = 0.0f;
        z2[k] = 0.0f;
    }

    // Run the reference implementation, then the AVX implementation.
    CalcZ_cpp(z1, x, y, n);
    CalcZ_avx(z2, x, y, n);

    // Header row followed by a separator line.
    std::cout << std::fixed << std::setprecision(1);
    const char* const headings[] = { "i", "x", "y", "z1", "z2" };
    for (const char* title : headings)
        std::cout << std::setw(w) << title;
    std::cout << nl;
    std::cout << std::string(50, '-') << nl;

    // One row per element; the row is printed before it is checked.
    for (size_t row = 0; row < n; ++row)
    {
        std::cout << std::setw(w) << row
                  << std::setw(w) << x[row]
                  << std::setw(w) << y[row]
                  << std::setw(w) << z1[row]
                  << std::setw(w) << z2[row] << nl;

        const bool mismatch = fabs(z1[row] - z2[row]) > eps;
        if (mismatch)
        {
            std::cout << "Compare error!\n";
            break;
        }
    }

    return 0;
}
Example1_fasm.asm
;------------------------------------------------------------------------------
; Example1_fasm.asm
;
; MASM implementation of CalcZ_avx: z[i] = x[i] + y[i] for i in [0, n).
;------------------------------------------------------------------------------
;------------------------------------------------------------------------------
; void CalcZ_avx(float* z, const float* x, const float* y, size_t n);
;
; Register usage, as shown by the instructions below:
;   rcx = z (destination array)   rdx = x   r8 = y
;   r9  = n, counted down to the number of elements still to process
;   rax = byte offset of the current element(s), shared by all three arrays
;
; Loop1 adds NSE elements per iteration with ymm registers while at least NSE
; elements remain; Loop2 then adds any leftover elements one at a time.
; Both loops advance rax at the TOP of the loop, so rax is always primed to
; one step before the first offset that will actually be used.
;------------------------------------------------------------------------------
NSE equ 8 ;number of F32 elements handled per Loop1 iteration
SF equ 4 ;scale factor for F32 (bytes per element)
.code
CalcZ_avx proc
; Validate arguments
test r9,r9 ;n == 0?
jz Done ;jump if yes (nothing to do)
; Initialize
mov rax,-SF ;rax = array offset primed for Loop2 (first add makes it 0)
cmp r9,NSE ;n < NSE?
jb Loop2 ;jump if yes (too few elements for Loop1)
mov rax,-NSE*SF ;rax = array offset primed for Loop1 (first add makes it 0)
; Calculate z[i:i+7] = x[i:i+7] + y[i:i+7]
Loop1: add rax,NSE*SF ;advance offset to the next group of NSE elements
vmovups ymm0,ymmword ptr [rdx+rax] ;ymm0 = x[i:i+7] (unaligned load)
vmovups ymm1,ymmword ptr [r8+rax] ;ymm1 = y[i:i+7] (unaligned load)
vaddps ymm2,ymm0,ymm1 ;ymm2 = x[i:i+7] + y[i:i+7]
vmovups ymmword ptr [rcx+rax],ymm2 ;save z[i:i+7]
sub r9,NSE ;n -= NSE
cmp r9,NSE ;n >= NSE?
jae Loop1 ;jump if yes (another full group remains)
test r9,r9 ;n == 0?
jz Done ;jump if yes (no leftover elements)
; rax still points at the group just processed; move it to one element before
; the first unprocessed element, because Loop2 adds SF before using rax.
add rax,NSE*SF-SF ;adjust array offset for Loop2
; Calculate z[i] = x[i] + y[i] for remaining elements
Loop2: add rax,SF ;advance offset to the next single element
vmovss xmm0,real4 ptr [rdx+rax] ;xmm0 = x[i]
vmovss xmm1,real4 ptr [r8+rax] ;xmm1 = y[i]
vaddss xmm2,xmm0,xmm1 ;xmm2 = x[i] + y[i]
vmovss real4 ptr [rcx+rax],xmm2 ;save z[i]
sub r9,1 ;n -= 1
jnz Loop2 ;repeat until no elements remain
; Every exit path, including n == 0, passes through vzeroupper.
Done: vzeroupper ;zero the upper bits of the ymm registers (presumably to avoid AVX/SSE transition penalties in the caller)
ret ;return to caller
CalcZ_avx endp
end
最终构建运行即可
代码有点高大上,用了AVX:Loop1每次用256位的ymm寄存器同时处理8个float,Loop2再逐个处理剩下不足8个的元素(两个loop是先后执行的,并不是同时运行)。慢慢看书了解含义吧,还挺复杂的。
这个例子太高深了,再举个简单的例子,把数组倒序输出。