// 使用SIMD指令优化的向量加法
//<mmintrin.h> MMX
//<xmmintrin.h> SSE
//<emmintrin.h> SSE2
//<pmmintrin.h> SSE3
//<tmmintrin.h> SSSE3
//<smmintrin.h> SSE4.1
//<nmmintrin.h> SSE4.2
//<wmmintrin.h> AES
//<immintrin.h> AVX, AVX2, FMA, BMI, POPCNT, AVX512
//<x86intrin.h> Auto(GCC)
//<intrin.h> Auto(MSVC)
#include <emmintrin.h> // 包含SSE2指令集
#include <valarray>
#include <iostream>
#include <chrono>
#include <vector>
// Returns the lane-wise sum of two 128-bit values, each treated as four
// packed 32-bit integers (SSE2 _mm_add_epi32).
__m128i vector_add(__m128i a, __m128i b) {
    const __m128i lane_sums = _mm_add_epi32(a, b);
    return lane_sums;
}
// Element-wise integer addition: c[i] = a[i] + b[i] for every i in [0, size).
//
// a, b  : input arrays holding at least `size` ints each.
// c     : output array with room for at least `size` ints.
// size  : number of elements to add; values <= 0 do nothing.
//
// Whole groups of four ints are added with one SSE2 instruction; any remaining
// 1-3 elements are added with scalar code, so nothing outside [0, size) is read
// or written. The pointers do not need to be 16-byte aligned.
void add_vectors(int* a, int* b, int* c, int size) {
    int i = 0;

    // `size - i >= 4` cannot overflow (i is never negative), unlike `i + 4 <= size`.
    for (; size - i >= 4; i += 4) {
        // Unaligned load/store: an arbitrary int* (e.g. std::vector storage) is
        // only guaranteed to be aligned for int, and the aligned variants
        // require a 16-byte-aligned address.
        const __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a + i));
        const __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b + i));
        const __m128i vc = _mm_add_epi32(va, vb);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(c + i), vc);
    }

    // Scalar tail for the elements that do not fill a whole 128-bit register.
    // The sum is formed in unsigned arithmetic so overflow wraps, matching
    // _mm_add_epi32, instead of being signed-overflow undefined behaviour.
    for (; i < size; ++i) {
        c[i] = static_cast<int>(static_cast<unsigned>(a[i]) + static_cast<unsigned>(b[i]));
    }
}
// Small demo/benchmark.
//
// 1. Prints the element-wise sum of two std::valarray<float> values.
// 2. For 10 ints and then for 10000 ints, prints how long add_vectors() takes
//    ("simd cost") and how long a plain scalar loop takes ("cost").
//
// Both timings of a pair cover the same number of elements. Each value is the
// steady_clock tick count of a single run, so treat it as a rough indication.
// Always returns 0.
int test() {
    std::valarray<float> a = { 1.0, 2.0, 3.0, 4.0 };
    std::valarray<float> b = { 5.0, 6.0, 7.0, 8.0 };
    std::valarray<float> c = a + b;
    for (const float element : c) {
        std::cout << element << " ";
    }
    std::cout << std::endl;

    // Runs one SIMD-vs-scalar comparison over `count` ints and prints both timings.
    const auto compare = [](size_t count) {
        std::vector<int> lhs(count);
        std::vector<int> rhs(count);
        for (size_t i = 0; i < count; ++i) {
            lhs[i] = static_cast<int>(i);
            rhs[i] = static_cast<int>(i + 1);
        }
        std::vector<int> sum(count);

        // Only whole groups of four ints (one 128-bit register holds four) are
        // handed to add_vectors; the remaining 0-3 elements are added with
        // scalar code inside the timed region, so every access stays in bounds
        // and the SIMD path covers all `count` elements.
        const size_t grouped = count - count % 4;

        const std::chrono::steady_clock::time_point simd_begin = std::chrono::steady_clock::now();
        add_vectors(lhs.data(), rhs.data(), sum.data(), static_cast<int>(grouped));
        for (size_t i = grouped; i < count; ++i) {
            sum[i] = lhs[i] + rhs[i];
        }
        const std::chrono::steady_clock::time_point simd_end = std::chrono::steady_clock::now();
        std::cout << "simd cost " << (simd_end - simd_begin).count() << std::endl;

        // Scalar reference. Unchecked indexing is used so the comparison is not
        // skewed by the bounds checks of at(); the index is always < count.
        const std::chrono::steady_clock::time_point scalar_begin = std::chrono::steady_clock::now();
        for (size_t i = 0; i < count; ++i) {
            sum[i] = lhs[i] + rhs[i];
        }
        const std::chrono::steady_clock::time_point scalar_end = std::chrono::steady_clock::now();
        std::cout << "cost " << (scalar_end - scalar_begin).count() << std::endl;
    };

    compare(10);     // small data set
    compare(10000);  // large data set
    return 0;
}
输出
6 8 10 12
simd cost 500
cost 400
simd cost 5700
cost 49200
总结: 数据量小时, SIMD 没有明显效果; 数据量大时, 提升显著。(以上耗时均为单次运行的 steady_clock 计数, 仅供粗略参考。)
参考
GitHub - parallel101/simdtutor: x86-64 SIMD矢量优化系列教程
GitHub - google/highway: Performance-portable, length-agnostic SIMD with runtime dispatch