Chapter 6 introduces constant memory and performance measurement, using a simple ray tracer as its running example.
Constant Memory
NVIDIA hardware provides 64 KB of read-only constant memory. A variable is placed in constant memory by declaring it with the __constant__ qualifier. Data read from constant memory is cached, so later reads of the same address do not have to go back to constant memory; in addition, a value fetched by one thread is broadcast to the other threads of the same half-warp, which then do not need to issue their own reads. All of this is handled by CUDA automatically, with no intervention from the programmer.
Just keep two things in mind:
1. Constant memory is for read-only data.
2. The threads should be reading the same address at the same time; otherwise constant memory not only fails to help, the serialized reads can actually make things worse (see the sketch below).
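As a quick illustration of the two points above (a minimal sketch of my own, not from the book; the table name lut and its contents are made up), a constant-memory variable is declared at file scope and filled from the host with cudaMemcpyToSymbol rather than cudaMalloc/cudaMemcpy:

#include <cuda_runtime.h>
#include <stdio.h>

#define N 256

// read-only lookup table, placed in the 64 KB constant space
__constant__ float lut[N];

__global__ void sum_lut( float *out ) {
    // every thread reads the same lut[k] at each iteration, so the value is
    // served from the constant cache and broadcast instead of being re-read
    float sum = 0;
    for (int k = 0; k < N; k++)
        sum += lut[k];
    out[threadIdx.x] = sum;
}

int main( void ) {
    float host_lut[N];
    for (int k = 0; k < N; k++) host_lut[k] = 0.5f * k;

    // no cudaMalloc for lut: the symbol already exists, we only fill it
    cudaMemcpyToSymbol( lut, host_lut, sizeof(host_lut) );

    float *dev_out;
    cudaMalloc( (void**)&dev_out, 32 * sizeof(float) );
    sum_lut<<<1,32>>>( dev_out );

    float host_out[32];
    cudaMemcpy( host_out, dev_out, sizeof(host_out), cudaMemcpyDeviceToHost );
    printf( "%f\n", host_out[0] );
    cudaFree( dev_out );
    return 0;
}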
Ray Tracing
Ray tracing is another way of turning a three-dimensional scene into a two-dimensional image. Each pixel of the image can be viewed as the result of light from some object in the scene arriving at that pixel. Following the light backwards along its path, we find where the ray intersects an object in the scene, and derive the pixel's color from the object's color at that point. The book's example builds a scene of 20 spheres and assumes the image plane is perpendicular to the z-axis, so all rays are parallel to the z-axis; the ray leaving pixel (ox, oy) therefore intersects a sphere at a point whose x and y coordinates are the same (ox, oy). Let the sphere's center be (x, y, z) and its radius r. (The book presumably takes the pixel as the ray's origin, which is why the pixel gets the coordinates (ox, oy) and the sphere center is left with (x, y, z), making the sphere equation look a little odd.) From the sphere equation

(ox − x)² + (oy − y)² + (oz − z)² = r²

the z coordinate oz of the intersection point is easy to obtain. Here the choice of an image perpendicular to the z-axis pays off again: the z coordinate of an intersection directly tells us how far it is from the image plane. Each pixel keeps only the color of the intersection nearest the image plane, since farther intersections are occluded by it.
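Solving the equation for oz gives (a short derivation; the name dz matches the hit routine below):

(oz − z)² = r² − (ox − x)² − (oy − y)²
oz = z ± √( r² − (ox − x)² − (oy − y)² ) = z ± dz

The hit function takes the + root, z + dz, i.e. the intersection on the side of the sphere facing the viewer (a larger oz means closer to the image, since the z-axis points out of the image).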
Modeling the Spheres
The example in the book defines a Sphere structure.
#define INF 2e10f
struct Sphere {
float r,b,g;
float radius;
float x,y,z;
__device__ float hit( float ox, float oy, float *n ) {
float dx = ox - x;
float dy = oy - y;
if (dx*dx + dy*dy < radius*radius) {
float dz = sqrtf( radius*radius - dx*dx - dy*dy );
*n = dz / sqrtf( radius * radius );
return dz + z;
}
return -INF;
}
};
In __device__ float hit( float ox, float oy, float *n ), the parameters ox and oy are the coordinates of the pixel in the image. The parameter n is a float pointer: if the ray from (ox, oy) hits the sphere, n returns the ratio between dz, the distance of the intersection point from the sphere's center along the z direction, and the sphere's radius; this value is later combined with the sphere's color when computing the pixel color (dz is largest at the center of the sphere's silhouette and falls to zero at its rim, so n acts as a simple shading factor that darkens the sphere toward its edge). The function's return value is the z coordinate of the intersection point (the z-axis points out of the page, so a larger value means closer to the image). If there is no intersection, hit returns negative infinity (-INF).
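As a quick sanity check of the geometry (a host-only sketch of my own that mirrors the same math, not code from the book), consider a sphere centered at the origin with radius 100:

#include <math.h>
#include <stdio.h>

// host-side copy of the intersection math from Sphere::hit, for illustration only
float hit_host( float ox, float oy, float x, float y, float z,
                float radius, float *n ) {
    float dx = ox - x;
    float dy = oy - y;
    if (dx*dx + dy*dy < radius*radius) {
        float dz = sqrtf( radius*radius - dx*dx - dy*dy );
        *n = dz / radius;    // shading factor: 1 at the silhouette center, 0 at the rim
        return dz + z;       // z coordinate of the intersection facing the viewer
    }
    return -2e10f;           // same "miss" sentinel as -INF in the book
}

int main( void ) {
    float n;
    // pixel straight in front of the center: oz = 100, n = 1
    printf( "%f %f\n", hit_host( 0, 0, 0, 0, 0, 100, &n ), n );
    // pixel (30,40): dx*dx + dy*dy = 2500, oz = sqrt(7500) ≈ 86.6, n ≈ 0.866
    printf( "%f %f\n", hit_host( 30, 40, 0, 0, 0, 100, &n ), n );
    return 0;
}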
Scene Generation
In main, 20 spheres are first generated on the host, with random positions, colors and radii, and then copied to the GPU memory pointed to by s; the image that the GPU produces with the ray-tracing algorithm is stored in dev_bitmap. Both are GPU global memory allocated with cudaMalloc; the constant memory introduced in this chapter is not used yet.
...
#define DIM 1024
#define rnd( x ) (x * rand() / RAND_MAX)
#define SPHERES 20
...
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;
Sphere *s;
};
int main( void ) {
DataBlock data;
CPUBitmap bitmap( DIM, DIM, &data );
unsigned char *dev_bitmap;
Sphere *s;
// allocate memory on the GPU for the output bitmap
HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
bitmap.image_size() ) );
// allocate memory for the Sphere dataset
HANDLE_ERROR( cudaMalloc( (void**)&s,
sizeof(Sphere) * SPHERES ) );
// allocate temp memory, initialize it, copy to
// memory on the GPU, then free our temp memory
Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
for (int i=0; i<SPHERES; i++) {
temp_s[i].r = rnd( 1.0f );
temp_s[i].g = rnd( 1.0f );
temp_s[i].b = rnd( 1.0f );
temp_s[i].x = rnd( 1000.0f ) - 500;
temp_s[i].y = rnd( 1000.0f ) - 500;
temp_s[i].z = rnd( 1000.0f ) - 500;
temp_s[i].radius = rnd( 100.0f ) + 20;
}
HANDLE_ERROR( cudaMemcpy( s, temp_s,
sizeof(Sphere) * SPHERES,
cudaMemcpyHostToDevice ) );
free( temp_s );
...
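For reference, the rnd macro above simply scales rand(); a tiny stand-alone illustration (my own, not from the book):

#include <stdio.h>
#include <stdlib.h>

#define rnd( x ) (x * rand() / RAND_MAX)

int main( void ) {
    // rnd(1.0f)           -> float in [0, 1]        (color channel)
    // rnd(1000.0f) - 500  -> roughly [-500, 500]    (sphere coordinate)
    // rnd(100.0f) + 20    -> [20, 120]              (sphere radius)
    printf( "%f %f %f\n", rnd( 1.0f ), rnd( 1000.0f ) - 500, rnd( 100.0f ) + 20 );
    return 0;
}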
The Kernel Function
The computation of x, y and offset is by now familiar. ox and oy are obtained by subtracting DIM/2 from x and y, which moves the origin (the z-axis) to the center of the image.
The kernel loops over the 20 spheres in the scene and checks whether each is hit by the backward ray belonging to the pixel this thread is responsible for. It calls each sphere's hit function; the return value is the z coordinate of the intersection, and if it is larger than any value seen so far (i.e. the intersection is closer to the image plane), the pixel color is recomputed from the current sphere's color and shading factor.
The final color is written to the memory pointed to by ptr.
__global__ void kernel( Sphere *s, unsigned char *ptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = (x - DIM/2);
float oy = (y - DIM/2);
float r=0, g=0, b=0;
float maxz = -INF;
for(int i=0; i<SPHERES; i++) {
float n;
float t = s[i].hit( ox, oy, &n );
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
ptr[offset*4 + 0] = (int)(r * 255);
ptr[offset*4 + 1] = (int)(g * 255);
ptr[offset*4 + 2] = (int)(b * 255);
ptr[offset*4 + 3] = 255;
}
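To make the index arithmetic concrete (using the DIM = 1024 image and the 16×16 blocks launched below; the particular thread is just an example): with blockDim = (16,16) and gridDim = (64,64), blockDim.x * gridDim.x = 1024 = DIM. For threadIdx = (3,5) in blockIdx = (2,1):

x      = 3 + 2*16     = 35
y      = 5 + 1*16     = 21
offset = 35 + 21*1024 = 21539
ox     = 35 - 512     = -477
oy     = 21 - 512     = -491

ptr[offset*4 + 0..3] then hold the R, G, B, A bytes of that pixel.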
Continuing with the rest of main:
...
// generate a bitmap from our sphere data
dim3 grids(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<grids,threads>>>( s, dev_bitmap );
// copy our bitmap back from the GPU for display
HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
bitmap.image_size(),
cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaFree( dev_bitmap ) );
HANDLE_ERROR( cudaFree( s ) );
// display
bitmap.display_and_exit();
}
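Note that the kernel launch itself is not wrapped in HANDLE_ERROR (a launch does not directly return a cudaError_t). If you want to catch launch failures explicitly, one common pattern is the following optional sketch, my addition rather than part of the book's code:

kernel<<<grids,threads>>>( s, dev_bitmap );
// the launch is asynchronous; this catches configuration errors right away
HANDLE_ERROR( cudaGetLastError() );
// and this catches errors that only show up once the kernel has actually run
HANDLE_ERROR( cudaDeviceSynchronize() );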
Complete Code: Global Memory Version
#include "../common/book.h"
#include "../common/cpu_bitmap.h"
#define DIM 1024
#define rnd( x ) (x * rand() / RAND_MAX)
#define INF 2e10f
struct Sphere {
float r,b,g;
float radius;
float x,y,z;
__device__ float hit( float ox, float oy, float *n ) {
float dx = ox - x;
float dy = oy - y;
if (dx*dx + dy*dy < radius*radius) {
float dz = sqrtf( radius*radius - dx*dx - dy*dy );
*n = dz / sqrtf( radius * radius );
return dz + z;
}
return -INF;
}
};
#define SPHERES 20
__global__ void kernel( Sphere *s, unsigned char *ptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = (x - DIM/2);
float oy = (y - DIM/2);
float r=0, g=0, b=0;
float maxz = -INF;
for(int i=0; i<SPHERES; i++) {
float n;
float t = s[i].hit( ox, oy, &n );
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
ptr[offset*4 + 0] = (int)(r * 255);
ptr[offset*4 + 1] = (int)(g * 255);
ptr[offset*4 + 2] = (int)(b * 255);
ptr[offset*4 + 3] = 255;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;
Sphere *s;
};
int main( void ) {
DataBlock data;
CPUBitmap bitmap( DIM, DIM, &data );
unsigned char *dev_bitmap;
Sphere *s;
// allocate memory on the GPU for the output bitmap
HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
bitmap.image_size() ) );
// allocate memory for the Sphere dataset
HANDLE_ERROR( cudaMalloc( (void**)&s,
sizeof(Sphere) * SPHERES ) );
// allocate temp memory, initialize it, copy to
// memory on the GPU, then free our temp memory
Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
for (int i=0; i<SPHERES; i++) {
temp_s[i].r = rnd( 1.0f );
temp_s[i].g = rnd( 1.0f );
temp_s[i].b = rnd( 1.0f );
temp_s[i].x = rnd( 1000.0f ) - 500;
temp_s[i].y = rnd( 1000.0f ) - 500;
temp_s[i].z = rnd( 1000.0f ) - 500;
temp_s[i].radius = rnd( 100.0f ) + 20;
}
HANDLE_ERROR( cudaMemcpy( s, temp_s,
sizeof(Sphere) * SPHERES,
cudaMemcpyHostToDevice ) );
free( temp_s );
// generate a bitmap from our sphere data
dim3 grids(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<grids,threads>>>( s, dev_bitmap );
// copy our bitmap back from the GPU for display
HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
bitmap.image_size(),
cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaFree( dev_bitmap ) );
HANDLE_ERROR( cudaFree( s ) );
// display
bitmap.display_and_exit();
}
The resulting image:
Using Constant Memory
Given the properties of constant memory, the scene data in the ray-tracing example is a natural fit for it: the sphere data never changes once generated, and every thread needs to read it. Moreover, the threads of a warp all loop over the same sphere s[i] in lock-step, so they read the same address at the same time, which is exactly the access pattern constant memory rewards.
A constant-memory variable is not allocated with cudaMalloc; it has to be declared as a statically sized global variable, like this:
__constant__ Sphere s[SPHERES];
How, then, do we get the sphere data generated by the CPU in main into constant memory? CUDA provides the cudaMemcpyToSymbol function.
HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,
sizeof(Sphere) * SPHERES) );
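cudaMemcpyToSymbol copies host data into the __constant__ symbol (host-to-device is its default direction). Should you want to read the data back, for example to verify the copy, the counterpart cudaMemcpyFromSymbol goes the other way; this check is my own illustration and is not part of the book's example:

Sphere check[SPHERES];
HANDLE_ERROR( cudaMemcpyFromSymbol( check, s, sizeof(Sphere) * SPHERES ) );
// check[] now holds the same sphere data that was uploaded from temp_s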
The kernel now takes only the pointer to the image pixels. The constant-memory variable s is a global on the GPU, so the kernel reads it directly instead of receiving it as a parameter.
__global__ void kernel( unsigned char *ptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = (x - DIM/2);
float oy = (y - DIM/2);
float r=0, g=0, b=0;
float maxz = -INF;
for(int i=0; i<SPHERES; i++) {
float n;
float t = s[i].hit( ox, oy, &n );
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
ptr[offset*4 + 0] = (int)(r * 255);
ptr[offset*4 + 1] = (int)(g * 255);
ptr[offset*4 + 2] = (int)(b * 255);
ptr[offset*4 + 3] = 255;
}
Complete Code: Constant Memory Version
#include "../common/book.h"
#include "../common/cpu_bitmap.h"
#define DIM 1024
#define rnd( x ) (x * rand() / RAND_MAX)
#define INF 2e10f
struct Sphere {
float r,b,g;
float radius;
float x,y,z;
__device__ float hit( float ox, float oy, float *n ) {
float dx = ox - x;
float dy = oy - y;
if (dx*dx + dy*dy < radius*radius) {
float dz = sqrtf( radius*radius - dx*dx - dy*dy );
*n = dz / sqrtf( radius * radius );
return dz + z;
}
return -INF;
}
};
#define SPHERES 20
__constant__ Sphere s[SPHERES];
__global__ void kernel( unsigned char *ptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = (x - DIM/2);
float oy = (y - DIM/2);
float r=0, g=0, b=0;
float maxz = -INF;
for(int i=0; i<SPHERES; i++) {
float n;
float t = s[i].hit( ox, oy, &n );
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
ptr[offset*4 + 0] = (int)(r * 255);
ptr[offset*4 + 1] = (int)(g * 255);
ptr[offset*4 + 2] = (int)(b * 255);
ptr[offset*4 + 3] = 255;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;
};
int main( void ) {
DataBlock data;
CPUBitmap bitmap( DIM, DIM, &data );
unsigned char *dev_bitmap;
// allocate memory on the GPU for the output bitmap
HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
bitmap.image_size() ) );
// allocate temp memory, initialize it, copy to constant
// memory on the GPU, then free our temp memory
Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
for (int i=0; i<SPHERES; i++) {
temp_s[i].r = rnd( 1.0f );
temp_s[i].g = rnd( 1.0f );
temp_s[i].b = rnd( 1.0f );
temp_s[i].x = rnd( 1000.0f ) - 500;
temp_s[i].y = rnd( 1000.0f ) - 500;
temp_s[i].z = rnd( 1000.0f ) - 500;
temp_s[i].radius = rnd( 100.0f ) + 20;
}
HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,
sizeof(Sphere) * SPHERES) );
free( temp_s );
// generate a bitmap from our sphere data
dim3 grids(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<grids,threads>>>( dev_bitmap );
// copy our bitmap back from the GPU for display
HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
bitmap.image_size(),
cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaFree( dev_bitmap ) );
// display
bitmap.display_and_exit();
}
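The chapter also introduces performance measurement with CUDA events, which is how the global-memory and constant-memory versions can be compared. A minimal timing sketch wrapped around the kernel launch above (the variable names are mine, not necessarily the book's):

cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );

HANDLE_ERROR( cudaEventRecord( start, 0 ) );
kernel<<<grids,threads>>>( dev_bitmap );
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );   // wait until the stop event has really happened

float elapsedTime;                              // milliseconds
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
printf( "Time to generate: %3.1f ms\n", elapsedTime );

HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );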