OpenMP — multiple host threads launching CUDA kernels together
I have encountered a strange situation. Here is our code:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/*
 * Print a few capability fields of device g and make it the current device.
 * (The "thrdpermp" value is hard-coded: 2496 cores / 13 SMs = 192.)
 * NOTE(review): return value of cudaSetDevice is not checked here.
 */
void initcuda(int g)
{
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, g) == cudaSuccess)
        printf("mp cnt: %d ,concurrent kernels:%d , asyncenginecount:%d , thrdpermp: %d\n",
               prop.multiProcessorCount, prop.concurrentKernels,
               prop.asyncEngineCount, 192);
    cudaSetDevice(g);
}

/*
 * Busy-loop kernel used purely as a timing load: each thread counts its own
 * element from -1e5 up to 1e5 in 1e-2 steps.  The launch must provide exactly
 * one thread per element of mem — there is no bounds check on tid.
 */
__global__ void cudajob(float *mem)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    mem[tid] = -1e5;
    while (mem[tid] < 1.0e5) {
        mem[tid] = mem[tid] + 1e-2;
    }
}

/*
 * Allocate n device buffers and n streams, launch one cudajob per stream,
 * then destroy/free everything.
 *
 * NOTE(review): cudaMalloc, cudaFree and cudaStreamDestroy synchronize with
 * outstanding device work, so when several OpenMP host threads run this
 * concurrently, one thread's cleanup loop can serialize against another
 * thread's still-running kernels.  The usual fix is to hoist allocation and
 * cleanup out of the timed region (or barrier between launch and cleanup).
 */
void wrapper(int n, int b)
{
    float **dmem = (float **)malloc(n * sizeof(float *));
    cudaStream_t *stream = (cudaStream_t *)malloc(sizeof(cudaStream_t) * n);
    dim3 grid = dim3(b, 1, 1);
    dim3 block = dim3(192, 1, 1); /* 2496/13 = 192 threads per block */

    for (int i = 0; i < n; i++) {
        cudaMalloc((void **)&dmem[i], 192 * b * sizeof(float));
        cudaStreamCreate(&stream[i]);
    }

    /* One kernel per stream so they can run concurrently on the device. */
    for (int i = 0; i < n; i++)
        cudajob<<<grid, block, 0, stream[i]>>>(dmem[i]);

    /* These calls block until each stream's kernel has finished. */
    for (int i = 0; i < n; i++) {
        cudaStreamDestroy(stream[i]);
        cudaFree(dmem[i]);
    }
    free(stream);
    free(dmem);
}

/*
 * Usage: ./main <kernels-per-thread> <num-host-threads> <blocks-per-kernel>
 * Each OpenMP thread times its own wrapper() call; the barrier lines up all
 * threads' start times so the per-thread timings are comparable.
 */
int main(int argc, char *argv[])
{
    if (argc < 4) { /* guard: original read argv[1..3] unchecked */
        fprintf(stderr, "usage: %s n nthreads b\n", argv[0]);
        return 1;
    }
    initcuda(0);
    int n = atoi(argv[1]);
    int nthreads = atoi(argv[2]);
    int b = atoi(argv[3]);
    double t1 = omp_get_wtime(); /* double: omp_get_wtime returns double */
#pragma omp parallel num_threads(nthreads) firstprivate(nthreads, n, b)
    {
#pragma omp barrier /* start every thread's timer together */
        double time = omp_get_wtime();
        int id = omp_get_thread_num();
        wrapper(n, b);
        time = omp_get_wtime() - time;
        printf("num threads: %d, time: %f\n", id, time);
    }
    printf("total: %f\n", omp_get_wtime() - t1);
    return 0;
}
So if I run ./main 1 8 1, that means 8 host threads, each of them launching 1 kernel. The actual run times suggest the kernels do not launch simultaneously:
mp cnt: 13 ,concurrent kernels:1 , asyncenginecount:2 , thrdpermp: 192 num threads: 0, time: 3.788108 num threads: 6, time: 6.661960 num threads: 7, time: 9.535245 num threads: 2, time: 12.408561 num threads: 5, time: 12.410481 num threads: 1, time: 12.411650 num threads: 4, time: 12.412888 num threads: 3, time: 12.414572 total: 12.414601
After debugging I found the problem may be caused by the cleanup of memory and streams. If I comment out the cudaFree, cudaStreamDestroy and free calls, the run times suggest concurrent execution:
mp cnt: 13 ,concurrent kernels:1 , asyncenginecount:2 , thrdpermp: 192 num threads: 7, time: 3.805691 num threads: 1, time: 3.806201 num threads: 3, time: 3.806624 num threads: 2, time: 3.806695 num threads: 6, time: 3.807018 num threads: 5, time: 3.807456 num threads: 0, time: 3.807486 num threads: 4, time: 3.807792 total: 3.807799
Finally I found that if I add an OpenMP barrier right after the kernel-launch calls, the cleanup no longer causes the problem:
/* Launch one kernel into each stream (asynchronous with respect to host). */
for (int i = 0; i < n; i++)
    cudajob<<<grid, block, 0, stream[i]>>>(dmem[i]);

/*
 * Barrier: wait until every OpenMP thread has issued its launches before any
 * thread begins cleanup.  cudaStreamDestroy/cudaFree synchronize with device
 * work, so without the barrier one thread's cleanup can delay — and thereby
 * serialize — another thread's not-yet-issued launches.
 */
#pragma omp barrier

/* Cleanup: these calls block until each stream's kernel has completed. */
for (int i = 0; i < n; i++) {
    cudaStreamDestroy(stream[i]);
    cudaFree(dmem[i]);
}
So I think that when multiple host threads try to clean up memory and streams on the device at the same time, they may compete with each other — but I am not sure.
Is that right? And can I remove the omp barrier? I don't think it should be necessary for our problem.
Yes — cudaMalloc, cudaFree, and cudaStreamCreate are synchronous, which means they tend to serialize activity: they force CUDA calls issued before them to complete before they themselves execute.
The usual recommendation is to perform such allocations outside of time-critical code: figure out how many allocations you need, allocate them up front, use (and perhaps re-use) them during the main processing loop, and free whatever is needed at the end.
Comments
Post a Comment