#include "stdio.h"
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>

// Defining number of elements in Array
#define N 50000

// Defining Kernel function for vector addition
/**
 * Element-wise vector addition: d_c[i] = d_a[i] + d_b[i] for every i in [0, N).
 *
 * Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
 * index and advances by the total number of launched threads
 * (blockDim.x * gridDim.x), so all N elements are covered regardless of the
 * launch configuration. No shared memory is used.
 *
 * The sum is formed in unsigned arithmetic so that it wraps modulo 2^32
 * instead of invoking signed-overflow undefined behaviour.
 *
 * @param d_a device pointer to the first input array (at least N ints)
 * @param d_b device pointer to the second input array (at least N ints)
 * @param d_c device pointer to the output array (at least N ints)
 */
__global__ void gpuAdd(int* d_a, int* d_b, int* d_c) {
    const int stride = blockDim.x * gridDim.x;
    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; tid += stride) {
        d_c[tid] = static_cast<int>(static_cast<unsigned>(d_a[tid]) +
                                    static_cast<unsigned>(d_b[tid]));
    }
}

// Evaluates a CUDA runtime call; on failure prints the file, line and CUDA
// error string to stderr and terminates the program with a failure status.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cuda_check_err_));                    \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/**
 * Adds two N-element integer vectors on the GPU, reports the elapsed time
 * measured with CUDA events, and verifies the result against a host-side sum.
 *
 * The timed interval starts before the device allocations and ends after the
 * result has been copied back, so it covers allocation, both host-to-device
 * copies, the kernel, and the device-to-host copy.
 *
 * @return 0 after printing the verification verdict; the process exits with
 *         a failure status instead if any CUDA runtime call fails.
 */
int main(void) {
    const size_t bytes = N * sizeof(int);

    // Host arrays live on the heap: three N-element int arrays (~600 KB)
    // as automatic variables can overflow small default stacks.
    std::vector<int> h_a(N), h_b(N), h_c(N);

    // Device pointers
    int* d_a = NULL;
    int* d_b = NULL;
    int* d_c = NULL;

    // Create the events used to record the start and stop times
    cudaEvent_t e_start, e_stop;
    CUDA_CHECK(cudaEventCreate(&e_start));
    CUDA_CHECK(cudaEventCreate(&e_stop));

    // Record the first timestamp
    CUDA_CHECK(cudaEventRecord(e_start, 0));

    // Allocate device memory
    CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c, bytes));

    // Initialize the input arrays. 2*i*i exceeds INT_MAX once i > 32767, so
    // the product is computed in unsigned arithmetic (well-defined wrap) and
    // converted back, rather than overflowing a signed int.
    for (int i = 0; i < N; i++) {
        const unsigned u = static_cast<unsigned>(i);
        h_a[i] = static_cast<int>(2u * u * u);
        h_b[i] = i;
    }

    // Copy input arrays from host to device memory
    CUDA_CHECK(cudaMemcpy(d_a, &h_a[0], bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, &h_b[0], bytes, cudaMemcpyHostToDevice));

    // Launch just enough blocks to give every element its own thread; the
    // kernel's striding loop keeps any other configuration correct as well.
    const int threadsPerBlock = 512;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    gpuAdd<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c);
    CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

    // Copy result back to host memory from device memory
    CUDA_CHECK(cudaMemcpy(&h_c[0], d_c, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaDeviceSynchronize());

    // Record the second timestamp
    CUDA_CHECK(cudaEventRecord(e_stop, 0));
    // Wait until all GPU work up to the stop event has completed
    CUDA_CHECK(cudaEventSynchronize(e_stop));

    // Compute the time difference between the two events
    float elapsedTime = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, e_start, e_stop));
    printf("Time to add %d numbers: %3.1f ms\n", N, elapsedTime);

    int Correct = 1;
    printf("Vector addition on GPU \n");
    // Compare every element with a host-side reference computed with the
    // same wrapping arithmetic as the kernel.
    for (int i = 0; i < N; i++) {
        const int expected = static_cast<int>(static_cast<unsigned>(h_a[i]) +
                                              static_cast<unsigned>(h_b[i]));
        if (expected != h_c[i]) {
            Correct = 0;
        }
    }
    if (Correct == 1) {
        printf("GPU has computed Sum Correctly\n");
    } else {
        printf("There is an Error in GPU Computation\n");
    }

    // Release the events and the device memory
    CUDA_CHECK(cudaEventDestroy(e_start));
    CUDA_CHECK(cudaEventDestroy(e_stop));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    return 0;
}