Purpose
OpenGL is old, but if you count OpenGL ES it is probably the most widely deployed GPU API. Doing computation with compute shaders frees you, to a degree, from being tied to NVIDIA cards, and from being tied to any single platform among Windows, Linux, and macOS.
Compute shaders probably don't squeeze every last drop of performance out of the hardware, but they still achieve considerable parallelism.
The compute shader concurrency model
A compute shader splits a concurrent task into a three-dimensional grid of workgroups; that is, a task can be indexed along three dimensions, which I understand is mainly for indexing convenience. Each workgroup in turn contains multiple work items (invocations), again arranged in three dimensions. The difference between the two is that work items within the same workgroup can share variables (declared with the shared keyword) and cooperate with each other, similar to threads, so together they can do fairly complex work. Workgroups, by contrast, are more like processes: they are more independent and cannot easily share data.
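As a minimal sketch of this intra-group cooperation (the buffer names and the 64-item local size here are illustrative, not taken from the test program below): the work items of one workgroup each write into a shared array, synchronize with barrier(), and then a single work item combines the group's results.

#version 460 core
layout(local_size_x = 64) in;
layout(std430, binding = 0) readonly buffer SrcBuf { float data[]; } src;
layout(std430, binding = 1) writeonly buffer DstBuf { float data[]; } dst;

shared float partial[64]; // visible to all 64 work items of the same workgroup

void main() {
    uint lid = gl_LocalInvocationID.x;
    partial[lid] = src.data[gl_GlobalInvocationID.x];
    barrier(); // wait until every work item in the group has written its slot
    if (lid == 0) { // one work item reduces the whole group
        float sum = 0.0;
        for (uint k = 0u; k < 64u; ++k) sum += partial[k];
        dst.data[gl_WorkGroupID.x] = sum; // one partial sum per workgroup
    }
}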
A shader implements the logic of a single work item. The number of work items per workgroup (the local size) is declared in the shader and apparently cannot be set dynamically, while the number of workgroups is specified through the API and can be set dynamically.
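Concretely (a sketch; the 8×8 local size and the width/height variables are illustrative): the local size is baked into the shader as a layout declaration, while the workgroup count is an ordinary runtime argument.

// In the shader, fixed at compile time:
//     layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
// On the host, chosen at runtime:
int width = 1024, height = 768;    // problem size known only at run time
GLuint groupsX = (width + 7) / 8;  // round up so the grid covers the whole problem
GLuint groupsY = (height + 7) / 8;
glDispatchCompute(groupsX, groupsY, 1);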
The built-in variables used for indexing inside a compute shader:
Suppose a task has (10,5,3) workgroups and each workgroup has (2,3,4) work items.
- gl_NumWorkGroups: the total number of workgroups, a 3-component vector, here (10,5,3)
- gl_WorkGroupSize: the number of work items per workgroup, a 3-component vector, here (2,3,4)
- gl_WorkGroupID: the ID of the current workgroup, a 3-component vector in the range ([0-9],[0-4],[0-2])
- gl_LocalInvocationID: the local ID of the current work item within its workgroup, a 3-component vector in the range ([0-1],[0-2],[0-3])
- gl_GlobalInvocationID: the global ID of the current work item, a 3-component vector; it equals gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID:
- for example, work item (1,2,1) of workgroup (5,2,1) has gl_GlobalInvocationID = (5*2+1, 2*3+2, 1*4+1) = (11,8,5)
- gl_LocalInvocationIndex: the linear index of the current work item within its workgroup, a single number: gl_LocalInvocationID flattened to one dimension for unique indexing inside the group (see the sketch below)
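To make these relations concrete, a small sketch using the (2,3,4) local size from the example above, recomputing the two derived built-ins by hand:

#version 460 core
layout(local_size_x = 2, local_size_y = 3, local_size_z = 4) in;

void main() {
    // gl_GlobalInvocationID = workgroup ID scaled by the local size, plus the local ID
    uvec3 globalId = gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID;
    // gl_LocalInvocationIndex flattens the local ID (x varies fastest, then y, then z)
    uint flatIndex = gl_LocalInvocationID.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y
                   + gl_LocalInvocationID.y * gl_WorkGroupSize.x
                   + gl_LocalInvocationID.x;
    // here globalId == gl_GlobalInvocationID and flatIndex == gl_LocalInvocationIndex
}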
Performance test
Comparing single-core CPU performance against the integrated GPU. My PC:
Windows 11 Pro
CPU: 13th Gen Intel® Core™ i5-1340P, 1.90 GHz
GPU: Intel® Iris® Xe Graphics
The test multiplies a 1000×2000 matrix by a 2000×1000 matrix; this integrated GPU is pretty weak.
Code
#include <iostream>
#include <string>
#include <vector>
#include <random>
#include <chrono>
#include "glad/glad.h"
#include "GLFW/glfw3.h"
#include "img_util.h"
#include <glm/glm.hpp>
#include <glm/gtc/matrix_transform.hpp>
#include <glm/gtc/type_ptr.hpp>
std::string loadShaderSource(const std::string& shaderPath) {
FILE* file = fopen(shaderPath.c_str(), "rb"); // binary mode so ftell() matches what fread() returns
std::vector<char> res;
if (file == NULL) {
std::cout << "open shader file error:" << shaderPath << std::endl;
return "";
}
// determine the file size
fseek(file, 0, SEEK_END);
long fileSize = ftell(file);
fseek(file, 0, SEEK_SET);
if (fileSize > 0) {
res.resize(fileSize);
size_t readSize = fread(res.data(), sizeof(char), fileSize, file);
if (readSize != static_cast<size_t>(fileSize)) { // a short read means the load failed
std::cout << "read shader file error:" << shaderPath
<< " fileSize: " << fileSize << ", readSize: " << readSize << std::endl;
fclose(file);
return "";
}
}
fclose(file);
return std::string(res.begin(), res.end());
}
void randomFloatVector(std::vector<float>& vec, float min_value = -1.0f, float max_value= 1.0f) {
size_t size = vec.size();
// create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dis(min_value, max_value);
// fill the vector with uniform random values
for (size_t i = 0; i < size; ++i) {
vec[i] = dis(gen);
}
}
GLFWwindow* initWindow() {
glfwInit();
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
GLFWwindow* window = glfwCreateWindow(800, 600, "LearnOpenGL", NULL, NULL);
if (window == NULL)
{
std::cout << "Failed to create GLFW window" << std::endl;
glfwTerminate();
return NULL;
}
glfwMakeContextCurrent(window);
if (!gladLoadGLLoader((GLADloadproc)glfwGetProcAddress))
{
std::cout << "Failed to initialize GLAD" << std::endl;
return NULL;
}
glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_NORMAL);
return window;
}
int main(int argc, char** argv) {
GLFWwindow* window = initWindow();
if (!window) {
std::cout << "init window failed" << std::endl;
return -1;
}
int m = 1000;
int n = 2000;
std::string computeShaderPath = "D:\\projects\\cmake_proj\\shaders\\compute_shaders\\matmul.comp";
std::cout << "init data begin" << std::endl;
std::vector<float> xData(m * n);
randomFloatVector(xData);
std::vector<float> wData(m * n);
randomFloatVector(wData);
std::vector<float> outDataGpu(m * m);
std::vector<float> outDataCpu(m * m);
GLuint xBuffer, wBuffer, outBuffer;
std::string computeShaderSource = loadShaderSource(computeShaderPath);
// create the buffers
glGenBuffers(1, &xBuffer);
glGenBuffers(1, &wBuffer);
glGenBuffers(1, &outBuffer);
// bind and initialize the x buffer
glBindBuffer(GL_SHADER_STORAGE_BUFFER, xBuffer);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(float) * m * n, xData.data(), GL_STATIC_DRAW);
// bind and initialize the w buffer
glBindBuffer(GL_SHADER_STORAGE_BUFFER, wBuffer);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(float) * m * n, wData.data(), GL_STATIC_DRAW);
// bind and initialize the out buffer
glBindBuffer(GL_SHADER_STORAGE_BUFFER, outBuffer);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(float) * m * m, nullptr, GL_STATIC_DRAW);
// create the compute shader
std::cout << "load shader begin" << std::endl;
GLuint computeShaderProgram = glCreateProgram();
GLuint computeShader = glCreateShader(GL_COMPUTE_SHADER);
const char* source = computeShaderSource.c_str();
glShaderSource(computeShader, 1, &source, nullptr);
glCompileShader(computeShader);
// check for compile errors
GLint success;
glGetShaderiv(computeShader, GL_COMPILE_STATUS, &success);
if (!success) {
char infoLog[512];
glGetShaderInfoLog(computeShader, 512, nullptr, infoLog);
std::cerr << "Compute shader compilation failed: " << infoLog << std::endl;
}
// link the shader program
glAttachShader(computeShaderProgram, computeShader);
glLinkProgram(computeShaderProgram);
// check for link errors
glGetProgramiv(computeShaderProgram, GL_LINK_STATUS, &success);
if (!success) {
char infoLog[512];
glGetProgramInfoLog(computeShaderProgram, 512, nullptr, infoLog);
std::cerr << "Compute shader program linking failed: " << infoLog << std::endl;
}
// timing
std::cout << "gpu compute begin" << std::endl;
auto clk = std::chrono::high_resolution_clock();
auto bg = clk.now();
glUseProgram(computeShaderProgram);
// set the uniforms n and m
glUniform1i(glGetUniformLocation(computeShaderProgram, "n"), n);
glUniform1i(glGetUniformLocation(computeShaderProgram, "m"), m);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, xBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, wBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, outBuffer);
// dispatch an m x m x 1 grid of workgroups (the local size in the shader is 1x1x1)
glDispatchCompute(m, m, 1);
// make the shader writes visible to the glGetBufferSubData readback below
glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
// read back the result
glBindBuffer(GL_SHADER_STORAGE_BUFFER, outBuffer);
glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(float) * m * m, outDataGpu.data());
auto ed = clk.now();
auto gpuTime = std::chrono::duration_cast<std::chrono::microseconds>(ed - bg).count() / 1000.0;
std::cout << "gpu time:" << gpuTime << "ms" << std::endl;
bg = clk.now();
for (int i = 0;i < m; ++i) {
for (int j = 0;j < m; ++j) {
float val = 0.0;
for (int k = 0;k < n;++k) {
val += xData[i * n + k] * wData[k * m + j];
}
outDataCpu[i * m + j] = val;
}
}
ed = clk.now();
auto cpuTime = std::chrono::duration_cast<std::chrono::microseconds>(ed - bg).count() / 1000.0;
std::cout << "cpu time:" << cpuTime << "ms" << std::endl;
float diff = 0.0;
for (int i = 0;i < m*m; ++i) {
diff += fabs(outDataGpu[i] - outDataCpu[i]);
}
std::cout << "diff: " << diff << ", avg:" << diff / (m * m) <<", cpu / gpu: " << cpuTime / gpuTime << std::endl;
// release resources
glDeleteBuffers(1, &xBuffer);
glDeleteBuffers(1, &wBuffer);
glDeleteBuffers(1, &outBuffer);
glDeleteShader(computeShader);
glDeleteProgram(computeShaderProgram);
return 0;
}
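A note on the measurement: the timing above wraps uniform setup, the dispatch, and the synchronous read-back together, which is fine for an end-to-end comparison. To isolate the GPU execution time, an OpenGL timer query could be used instead; a minimal sketch (not part of the test above):

GLuint query;
glGenQueries(1, &query);
glBeginQuery(GL_TIME_ELAPSED, query); // GL_TIME_ELAPSED is core since OpenGL 3.3
glDispatchCompute(m, m, 1);
glEndQuery(GL_TIME_ELAPSED);
GLuint64 ns = 0;
glGetQueryObjectui64v(query, GL_QUERY_RESULT, &ns); // blocks until the GPU result is ready
std::cout << "dispatch time: " << ns / 1e6 << " ms" << std::endl;
glDeleteQueries(1, &query);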
Shader
#version 460 core
uniform int n;
uniform int m;
layout(local_size_x = 1, local_size_y = 1) in;
layout(std430, binding = 0) readonly buffer Input0 {
float data[];
} x;
layout(std430, binding = 1) readonly buffer Input1 {
float data[];
} w;
layout(std430, binding = 2) writeonly buffer Output0 {
float data[];
} xout;
void main() {
int i = int(gl_GlobalInvocationID.x); // row i of x
int j = int(gl_GlobalInvocationID.y); // column j of w
float val = 0.0;
for (int k=0; k<n; ++k) {
val += x.data[i*n + k] * w.data[k*m + j];
}
xout.data[i*m + j] = val;
}
Every dimension of the local workgroup size is set to 1, so each workgroup holds a single work item and gl_GlobalInvocationID is identical to gl_WorkGroupID.
Results
Compared with a single CPU thread, the GPU achieves roughly an 8-10x speedup.
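A 1×1×1 local size leaves all intra-workgroup parallelism unused. A natural follow-up, which I have not benchmarked here (the 16×16 tile size is an arbitrary choice), is to give each workgroup 256 work items and dispatch correspondingly fewer groups:

// Shader side: replace the local_size declaration with
//     layout(local_size_x = 16, local_size_y = 16) in;
// and guard the body with if (i < m && j < m) in case m is not a multiple of 16.
// Host side: shrink the grid so groups * local size still covers m * m outputs.
glDispatchCompute((m + 15) / 16, (m + 15) / 16, 1);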