现代X86汇编-C和ASM混合编程举例

端午假期安装好了vs c++2022,并写了个简单的汇编代码，证明MASM真的可以运行。今天需要搞一个实实在在的C++和ASM混合编程的例子，因为用纯汇编的求伯君写WPS的时代一去不复返了。个别关键函数用汇编，充分发挥CPU的特色功能，偶尔还是需要的。

昨天找的随书代码的位置在github上：GitHub - Apress/modern-x86-assembly-language-programming-3e: Source Code for 'Modern X86 Assembly Language Programming' by Daniel Kusswurm

这是第三版，最新的书。又从Z-Library上下载了这本英文书，导入微信读书，自动翻译为中文。Z-Library加微信读书，使我实现了读书ziyou（啥时候财务ziyou，还远）。

这本书的附录A就举了怎样在vs2022环境建立一个C++加ASM的例子，今天咱们就逐步跟着书上学这个例子。

首先创建project

• Create a C++ project • Enable MASM support • Add an assembly language file • Set project properties • Edit the source code • Build and run the project

启动VS
New Project
Select Console App
Project name:Example1
Solution name:TestSolution
Create
Build>Configuration Manager,choose <Edit...>
Select x86 and click Remove -- 在我的环境里这一项对应的是Win32

其次，配置ASM环境的步骤：

View>Solution Explorer
Right-click Example1 and select Build Dependencies > Build Customizations
check masm
Add New Item
Select C++ File (.cpp) for the file type
Name: Example1_fasm.asm, then click Add

第三步是设置project属性

Example1 and select Properties
All Configurations All Platforms
C/C++ > Code Generation: set Enable Enhanced Instruction Set to Advanced Vector Extensions (/arch:AVX), or AVX2 or AVX512
C/C++ > Output Files: change Assembler Output to Assembly, Machine Code and Source (/FAcs)
Microsoft Macro Assembler > Listing File: set Enable Assembly Generated Code Listing to Yes (/Sg)
Change the Assembled Code Listing File text field to $(IntDir)\%(filename).lst
Click OK

$(IntDir)\%(filename).lst --注意扩展名是 .lst，其中第一个字符是小写字母 L（listing 的缩写），不是数字 1。

最后一步就是写源码了

AppendixA\TestSolution\Example1\Example1.cpp
AppendixA\TestSolution\Example1\Example1_fasm.asm

Example1.cpp

#include <iostream>
#include <iomanip>
#include <string>
#include <cmath>

// AVX implementation written in MASM (see Example1_fasm.asm): stores
// x[i] + y[i] into z[i] for i in [0, n). Declared extern "C" so the C++
// side refers to the unmangled symbol name CalcZ_avx used by the assembly proc.
extern "C" void CalcZ_avx(float* z, const float* x, const float* y, size_t n);

// Scalar reference implementation: writes x[k] + y[k] to z[k] for each
// k in [0, n), in ascending order. Does nothing when n is 0.
static void CalcZ_cpp(float* z, const float* x, const float* y, size_t n)
{
    const float* const x_end = x + n;   // one past the last input element

    while (x != x_end)
    {
        *z = *x + *y;
        ++z;
        ++x;
        ++y;
    }
}

// Demo driver: fills two input arrays, computes their element-wise sum with
// both the C++ reference (CalcZ_cpp) and the assembly routine (CalcZ_avx),
// prints a table of the results and stops at the first mismatch.
int main(void)
{
    constexpr size_t n = 20;
    float x[n], y[n], z1[n], z2[n];

    // Initialize the data arrays
    for (size_t i = 0; i < n; i++)
    {
        const float fi = static_cast<float>(i);     // explicit size_t -> float

        x[i] = fi * 10.0f + 10.0f;
        y[i] = fi * 1000.0f + 1000.0f;
        z1[i] = z2[i] = 0.0f;
    }

    // Exercise the calculating functions
    CalcZ_cpp(z1, x, y, n);
    CalcZ_avx(z2, x, y, n);

    // Display the results
    constexpr char nl = '\n';
    constexpr size_t w = 10;            // column width of the table
    constexpr float eps = 1.0e-6f;      // absolute tolerance for z1 vs z2

    std::cout << std::fixed << std::setprecision(1);

    std::cout << std::setw(w) << "i";
    std::cout << std::setw(w) << "x";
    std::cout << std::setw(w) << "y";
    std::cout << std::setw(w) << "z1";
    std::cout << std::setw(w) << "z2" << nl;
    std::cout << std::string(50, '-') << nl;

    for (size_t i = 0; i < n; i++)
    {
        std::cout << std::setw(w) << i;
        std::cout << std::setw(w) << x[i];
        std::cout << std::setw(w) << y[i];
        std::cout << std::setw(w) << z1[i];
        std::cout << std::setw(w) << z2[i] << nl;

        // std::fabs (not the unqualified name): <cmath> is only guaranteed to
        // declare the function in namespace std, and std::fabs has a float
        // overload so the comparison stays in single precision.
        if (std::fabs(z1[i] - z2[i]) > eps)
        {
            std::cout << "Compare error!\n";
            break;
        }
    }

}

Example1_fasm.asm

;------------------------------------------------------------------------------
; Example1_fasm.asm
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
; void CalcZ_avx(float* z, const float* x, const float* y, size_t n);
;
; Computes z[i] = x[i] + y[i] for i in [0, n).
;
; Register usage in this procedure:
;   rcx = z (destination), rdx = x, r8 = y, r9 = n (remaining element count)
;   rax = byte offset of the current element(s) from the start of each array
;
; Full groups of NSE elements are processed with 256-bit AVX instructions in
; Loop1; any remaining elements (fewer than NSE) are processed one at a time
; in Loop2. Both loops increment rax at the top of the loop, so rax is always
; pre-biased by one step (negative on first entry) before a loop is entered.
; Unaligned moves (vmovups) are used, so no array alignment is required.
;------------------------------------------------------------------------------

NSE     equ 8                                   ;num_simd_elements (F32 values per ymm register)
SF      equ 4                                   ;scale factor for F32 (bytes per element)

        .code
CalcZ_avx proc

; Validate arguments
        test r9,r9                              ;n == 0?
        jz Done                                 ;jump if yes

; Initialize
        mov rax,-SF                             ;rax = array offset (Loop2)
        cmp r9,NSE                              ;n < NSE? (unsigned compare)
        jb Loop2                                ;jump if yes
        mov rax,-NSE*SF                         ;rax = array offset (Loop1)

; Calculate z[i:i+7] = x[i:i+7] + y[i:i+7]
Loop1:  add rax,NSE*SF                          ;update array offset
        vmovups ymm0,ymmword ptr [rdx+rax]      ;ymm0 = x[i:i+7]
        vmovups ymm1,ymmword ptr [r8+rax]       ;ymm1 = y[i:i+7]
        vaddps ymm2,ymm0,ymm1                   ;ymm2 = x[i:i+7] + y[i:i+7]
        vmovups ymmword ptr [rcx+rax],ymm2      ;save z[i:i+7]

        sub r9,NSE                              ;n -= NSE
        cmp r9,NSE                              ;n >= NSE?
        jae Loop1                               ;jump if yes

        test r9,r9                              ;n == 0?
        jz Done                                 ;jump if yes
        add rax,NSE*SF-SF                       ;adjust array offset for Loop2 (last group's offset + 28;
                                                ;Loop2 adds SF to reach the next unprocessed element)

; Calculate z[i] = x[i] + y[i] for remaining elements
Loop2:  add rax,SF                              ;update array offset
        vmovss xmm0,real4 ptr [rdx+rax]         ;xmm0 = x[i]
        vmovss xmm1,real4 ptr [r8+rax]          ;xmm1 = y[i]
        vaddss xmm2,xmm0,xmm1                   ;xmm2 = x[i] + y[i]
        vmovss real4 ptr [rcx+rax],xmm2         ;save z[i]

        sub r9,1                                ;n -= 1
        jnz Loop2                               ;repeat until done

Done:   vzeroupper                              ;zero the upper 128 bits of the ymm registers before returning
        ret                                     ;return to caller
CalcZ_avx endp
        end

最终构建运行即可