// Able to handle when `num_entries % num_streams != 0`.

const uint64_t num_entries = 10;
const uint64_t num_iters = 1UL << 10;

cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
cudaMalloc    (&data_gpu, sizeof(uint64_t)*num_entries);

// Set the number of streams to not evenly divide num_entries.
const uint64_t num_streams = 3;

cudaStream_t streams[num_streams];
for (uint64_t stream = 0; stream < num_streams; stream++)
    cudaStreamCreate(&streams[stream]);

// Use round-up division (`sdiv`, defined in helper.cu) so `num_streams*chunk_size`
// is never less than `num_entries`.
// This can result in `num_streams*chunk_size` being greater than `num_entries`, meaning
// we will need to guard against out-of-range errors in the final "tail" stream (see below).
const uint64_t chunk_size = sdiv(num_entries, num_streams);

for (uint64_t stream = 0; stream < num_streams; stream++) {

    const uint64_t lower = chunk_size*stream;
    // For the tail stream, `lower+chunk_size` could be out of range, so here we guard against that.
    const uint64_t upper = min(lower+chunk_size, num_entries);
    // Since the tail stream's width may not be `chunk_size`,
    // we need to calculate a separate `width` value.
    const uint64_t width = upper-lower;

    // Use `width` instead of `chunk_size`.
    cudaMemcpyAsync(data_gpu+lower, data_cpu+lower,
                    sizeof(uint64_t)*width, cudaMemcpyHostToDevice,
                    streams[stream]);

    // Use `width` instead of `chunk_size`.
    decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
        (data_gpu+lower, width, num_iters);

    // Use `width` instead of `chunk_size`.
    cudaMemcpyAsync(data_cpu+lower, data_gpu+lower,
                    sizeof(uint64_t)*width, cudaMemcpyDeviceToHost,
                    streams[stream]);
}

// Destroy streams.
for (uint64_t stream = 0; stream < num_streams; stream++)
    cudaStreamDestroy(streams[stream]);
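For reference, the only helper used above that is not part of the CUDA runtime is `sdiv`, which per the comment performs round-up (ceiling) division. A minimal sketch of what it might look like (the actual definition lives in helper.cu and may differ):

#include <cstdint>

// Hypothetical sketch of a round-up (ceiling) division helper like `sdiv`;
// the real course helper may be written differently.
__host__ __device__ inline uint64_t sdiv(uint64_t a, uint64_t b) {
    return (a + b - 1) / b;
}

With the values above, `sdiv(10, 3) == 4`, so the three chunks cover [0, 4), [4, 8), and [8, 10): the first two streams each process 4 entries, while the tail stream's `upper` is clamped to `num_entries` and its `width` shrinks to 2.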
The complete code is as follows:
#include <cstdint>
#include <iostream>

#include "helpers.cuh"
#include "encryption.cuh"

void encrypt_cpu(uint64_t* data, uint64_t num_entries,
                 uint64_t num_iters, bool parallel=true) {

    #pragma omp parallel for if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        data[entry] = permute64(entry, num_iters);
}

__global__
void decrypt_gpu(uint64_t* data, uint64_t num_entries,
                 uint64_t num_iters) {

    const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
    const uint64_t stride = blockDim.x*gridDim.x;

    for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
        data[entry] = unpermute64(data[entry], num_iters);
}

bool check_result_cpu(uint64_t* data, uint64_t num_entries,
                      bool parallel=true) {

    uint64_t counter = 0;

    #pragma omp parallel for reduction(+: counter) if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        counter += data[entry] == entry;

    return counter == num_entries;
}

int main (int argc, char* argv[]) {

    Timer timer;
    Timer overall;

    const uint64_t num_entries = 1UL << 26;
    const uint64_t num_iters = 1UL << 10;
    const bool openmp = true;

    // Define the number of streams.
    const uint64_t num_streams = 32;

    // Use round-up division to calculate chunk size.
    const uint64_t chunk_size = sdiv(num_entries, num_streams);

    timer.start();
    uint64_t* data_cpu, * data_gpu;
    cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
    cudaMalloc    (&data_gpu, sizeof(uint64_t)*num_entries);
    timer.stop("allocate memory");
    check_last_error();

    timer.start();
    encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
    timer.stop("encrypt data on CPU");

    timer.start();
    // Create array for storing streams.
    cudaStream_t streams[num_streams];

    // Create number of streams and store in array.
    for (uint64_t stream = 0; stream < num_streams; stream++)
        cudaStreamCreate(&streams[stream]);
    timer.stop("create streams");
    check_last_error();

    overall.start();
    timer.start();

    // For each stream...
    for (uint64_t stream = 0; stream < num_streams; stream++) {

        // ...calculate index into global data (`lower`) and size of data for it to process (`width`).
        const uint64_t lower = chunk_size*stream;
        const uint64_t upper = min(lower+chunk_size, num_entries);
        const uint64_t width = upper-lower;

        // ...copy stream's chunk to device.
        cudaMemcpyAsync(data_gpu+lower, data_cpu+lower,
                        sizeof(uint64_t)*width, cudaMemcpyHostToDevice,
                        streams[stream]);

        // ...compute stream's chunk.
        decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
            (data_gpu+lower, width, num_iters);

        // ...copy stream's chunk to host.
        cudaMemcpyAsync(data_cpu+lower, data_gpu+lower,
                        sizeof(uint64_t)*width, cudaMemcpyDeviceToHost,
                        streams[stream]);
    }

    // Synchronize streams before checking results on host.
    for (uint64_t stream = 0; stream < num_streams; stream++)
        cudaStreamSynchronize(streams[stream]);

    // Note modification of timer instance use.
    timer.stop("asynchronous H2D->kernel->D2H");
    overall.stop("total time on GPU");
    check_last_error();

    timer.start();
    const bool success = check_result_cpu(data_cpu, num_entries, openmp);
    std::cout << "STATUS: test "
              << ( success ? "passed" : "failed")
              << std::endl;
    timer.stop("checking result on CPU");

    timer.start();
    // Destroy streams.
    for (uint64_t stream = 0; stream < num_streams; stream++)
        cudaStreamDestroy(streams[stream]);
    timer.stop("destroy streams");
    check_last_error();

    timer.start();
    cudaFreeHost(data_cpu);
    cudaFree    (data_gpu);
    timer.stop("free memory");
    check_last_error();
}
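The listing also relies on `Timer` and `check_last_error` from helpers.cuh, which are not shown here. Below is a minimal reconstruction of what they might look like, assuming a CUDA-event-based timer; the course's actual helpers may be implemented differently:

#include <cstdio>
#include <cuda_runtime.h>

// Print the most recent CUDA error, if any (hypothetical version of the
// helpers.cuh utility; the real one may abort or print differently).
inline void check_last_error() {
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::printf("CUDA error: %s\n", cudaGetErrorString(err));
}

// Simple timer built on CUDA events; stop() prints the elapsed time with a label.
struct Timer {
    cudaEvent_t begin, end;
    Timer()  { cudaEventCreate(&begin); cudaEventCreate(&end); }
    ~Timer() { cudaEventDestroy(begin); cudaEventDestroy(end); }
    void start() { cudaEventRecord(begin); }
    void stop(const char* label) {
        cudaEventRecord(end);
        cudaEventSynchronize(end);   // wait so the elapsed time is valid
        float ms = 0;
        cudaEventElapsedTime(&ms, begin, end);
        std::printf("TIMING: %f ms (%s)\n", ms, label);
    }
};

The `80*32` grid suggests the code was tuned for an 80-SM (Volta-class) GPU, so something like `nvcc -arch=sm_70 -O3 -Xcompiler -fopenmp streams.cu -o streams` should build it; the architecture flag and file name here are assumptions, not taken from the source.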