openmp - Multiple host threads launch CUDA kernels together


I have encountered a strange situation. Here is our code:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

void initCUDA(int g) {
    cudaDeviceProp prop;
    // Print a few device properties (192 = 2496 cores / 13 SMs on this GPU).
    if (cudaGetDeviceProperties(&prop, g) == cudaSuccess)
        printf("mp cnt: %d, concurrent kernels: %d, asyncEngineCount: %d, thrdPerMP: %d\n",
               prop.multiProcessorCount, prop.concurrentKernels, prop.asyncEngineCount, 192);
    cudaSetDevice(g);
}

__global__ void cudaJob(float *mem) {
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    mem[tid] = -1e5;
    while (mem[tid] < 1.0e5) {
        mem[tid] = mem[tid] + 1e-2;
    }
}

void wrapper(int n, int b) {
    float **dmem = (float **)malloc(n * sizeof(float *));
    cudaStream_t *stream = (cudaStream_t *)malloc(sizeof(cudaStream_t) * n);

    dim3 grid = dim3(b, 1, 1);
    dim3 block = dim3(192, 1, 1); // 2496/13 = 192

    for (int i = 0; i < n; i++) {
        cudaMalloc((void **)&dmem[i], 192 * b * sizeof(float));
        cudaStreamCreate(&stream[i]);
    }

    for (int i = 0; i < n; i++)
        cudaJob<<<grid, block, 0, stream[i]>>>(dmem[i]);

    for (int i = 0; i < n; i++) {
        cudaStreamDestroy(stream[i]);
        cudaFree(dmem[i]);
    }

    free(stream);
    free(dmem);
}

int main(int argc, char *argv[]) {
    initCUDA(0);
    int n = atoi(argv[1]);        // kernels per thread
    int nthreads = atoi(argv[2]); // number of OpenMP threads
    int b = atoi(argv[3]);        // blocks per kernel
    float t1 = omp_get_wtime();
#pragma omp parallel num_threads(nthreads) firstprivate(nthreads, n, b)
    {
#pragma omp barrier
        float time = omp_get_wtime();
        int id = omp_get_thread_num();
        wrapper(n, b);
        time = omp_get_wtime() - time;
        printf("num threads: %d, time: %f\n", id, time);
    }
    printf("total: %f\n", omp_get_wtime() - t1);
    return 0;
}

So if I run ./main 1 8 1, that means 8 threads, each of which launches 1 kernel. The actual run times suggest the kernels did not launch simultaneously:

    mp cnt: 13, concurrent kernels: 1, asyncEngineCount: 2, thrdPerMP: 192
    num threads: 0, time: 3.788108
    num threads: 6, time: 6.661960
    num threads: 7, time: 9.535245
    num threads: 2, time: 12.408561
    num threads: 5, time: 12.410481
    num threads: 1, time: 12.411650
    num threads: 4, time: 12.412888
    num threads: 3, time: 12.414572
    total: 12.414601

After debugging, I found that the problem may be caused by the cleanup of memory and streams. If I comment out the cudaFree, cudaStreamDestroy, and free calls, the run times suggest the kernels run concurrently:

    mp cnt: 13, concurrent kernels: 1, asyncEngineCount: 2, thrdPerMP: 192
    num threads: 7, time: 3.805691
    num threads: 1, time: 3.806201
    num threads: 3, time: 3.806624
    num threads: 2, time: 3.806695
    num threads: 6, time: 3.807018
    num threads: 5, time: 3.807456
    num threads: 0, time: 3.807486
    num threads: 4, time: 3.807792
    total: 3.807799

At last, I found that if I add an omp barrier right after the kernel launch calls, the cleanup no longer causes the problem:

for (int i = 0; i < n; i++)
    cudaJob<<<grid, block, 0, stream[i]>>>(dmem[i]);

#pragma omp barrier // every thread finishes launching before any thread starts cleanup

for (int i = 0; i < n; i++) {
    cudaStreamDestroy(stream[i]);
    cudaFree(dmem[i]);
}

So I think that when multiple host threads try to clean up memory and streams on the device, they may compete with each other, but I am not sure.

Is that right? And can I remove the omp barrier? I don't think it should be necessary for our problem.

Yes, cudaMalloc, cudaFree, and cudaStreamCreate are synchronous, which means they tend to serialize activity by forcing CUDA calls issued before them to complete before they execute.
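For example, here is a minimal sketch of that effect, reusing cudaJob, dmem, and stream from the question's code (the comments describe the behavior referred to above, not a precise specification of the driver):

// A kernel launch is asynchronous: the host thread queues the work and returns immediately.
cudaJob<<<grid, block, 0, stream[0]>>>(dmem[0]);

// cudaFree is synchronous: this host thread blocks here until outstanding device
// work has completed, including kernels launched by *other* host threads.
cudaFree(dmem[0]);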

The usual recommendation is to perform such allocations outside of time-critical code: figure out how many allocations you need, allocate them up-front, use (and perhaps re-use) them during your main processing loop, then free whatever is needed at the end.
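A minimal sketch of that restructuring, building on the question's code (the setup/launch/teardown split and names are my own, and error checking is omitted as in the original):

// Allocate device buffers and create streams once, before the timed region.
void setup(int n, int b, float **dmem, cudaStream_t *stream) {
    for (int i = 0; i < n; i++) {
        cudaMalloc((void **)&dmem[i], 192 * b * sizeof(float));
        cudaStreamCreate(&stream[i]);
    }
}

// Time-critical part: nothing but asynchronous kernel launches.
void launch(int n, int b, float **dmem, cudaStream_t *stream) {
    dim3 grid(b, 1, 1), block(192, 1, 1);
    for (int i = 0; i < n; i++)
        cudaJob<<<grid, block, 0, stream[i]>>>(dmem[i]);
}

// Destroy streams and free buffers once, after all timing is done.
void teardown(int n, float **dmem, cudaStream_t *stream) {
    for (int i = 0; i < n; i++) {
        cudaStreamDestroy(stream[i]);
        cudaFree(dmem[i]);
    }
}

With this split, each OpenMP thread calls setup once, times only launch (adding a cudaStreamSynchronize or cudaDeviceSynchronize if kernel completion should be included in the measurement), and calls teardown after the timed region, so the blocking calls can no longer interleave with other threads' launches.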

