CUDA initialization error after fork -


i "initialization error" after calling fork(). if run same program without fork, works fine.

if (fork() == 0) {     ...     cudamalloc(....);     ... } 

what cause this?

a complete example below. if comment out cudagetdevicecount call, works fine.

#include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <sys/types.h> #include <sys/wait.h> #include <cuda_runtime.h>  #define perr(call) \   if (call) {\    fprintf(stderr, "%s:%d error [%s] on "#call"\n", __file__, __line__,\       cudageterrorstring(cudagetlasterror()));\    exit(1);\   }  int main(int argc, char **argv) {   float *v_d;   int gpucount;    cudagetdevicecount(&gpucount);    if (fork() == 0) {     cudasetdevice(0);     perr(cudamalloc(&v_d, 1000*sizeof(float)));   }   wait(null);   return 0; } 

simple makefile:

progs = fork cuda_path = /usr/local/cuda cxxflags = -g -o0 -wall cxxincludes = -i$(cuda_path)/include nvcc := $(cuda_path)/bin/nvcc -ccbin $(cxx) -xcompiler "$(cxxflags)"  fork: fork.cxx         $(nvcc) $^ -o $@ $(libs)  clean:         (rm $(progs) *.o) 

in case, trying number of devices available within parent process. work-around it:

  if (fork() == 0) {     perr(cudagetdevicecount(&gpucount));     return(gpucount);   }   wait(&gpucount);   gpucount =  wexitstatus(gpucount); 

fork() creates child process. processes have own address spaces. cuda context cannot shared between 2 different processes, many reasons, 1 of various pointers meaningless in different address space.

if create cuda context before fork(), cannot use within child process. cudasetdevice(0); call attempts share cuda context, implicitly created in parent process when call cudagetdevicecount();

the solution, you've hinted at, either cuda work in parent process or in child process. if in multi-device system, should possible allocate separate devices separate processes (the cuda simpleipc sample code this). (the key point not create cuda context before fork.)

you may interested in this question/answer , this one.

here's worked example (requires 2 cuda devices) showing child process , parent process using separate gpus:

$ cat t345.cu #include <unistd.h>     /* symbolic constants */ #include <sys/types.h>  /* primitive system data types */ #include <errno.h>      /* errors */ #include <stdio.h>      /* input/output */ #include <sys/wait.h>   /* wait process termination */ #include <stdlib.h>     /* general utilities */   #define cudacheckerrors(msg) \     { \         cudaerror_t __err = cudagetlasterror(); \         if (__err != cudasuccess) { \             fprintf(stderr, "fatal error: %s (%s @ %s:%d)\n", \                 msg, cudageterrorstring(__err), \                 __file__, __line__); \             fprintf(stderr, "*** failed - aborting\n"); \             exit(1); \         } \     } while (0)   __global__ void addkernel(int *data){   *data += 1; }  int main() {     pid_t childpid; /* variable store child's pid */     int retval;     /* child process: user-provided return code */     int status;     /* parent process: child's exit status */      /* 1 int variable needed because each process have        own instance of variable        here, 2 int variables used clarity */      /* create new process */     childpid = fork();      if (childpid >= 0) /* fork succeeded */     {         if (childpid == 0) /* fork() returns 0 child process */         {             printf("child: child process!\n");             printf("child: here's pid: %d\n", getpid());             printf("child: parent's pid is: %d\n", getppid());             printf("child: value of copy of childpid is: %d\n", childpid);             int *h_a, *d_a;             h_a = (int *)malloc(sizeof(int));             cudasetdevice(0);             cudacheckerrors("child cudasetdevice fail");             cudamalloc(&d_a, sizeof(int));             cudacheckerrors("cudamalloc fail");             *h_a = 1;             cudamemcpy(d_a, h_a, sizeof(int), cudamemcpyhosttodevice);             cudacheckerrors("cudamemcpy h2d fail");             addkernel<<<1,1>>>(d_a);             cudadevicesynchronize();             cudacheckerrors("kernel fail");             cudamemcpy(h_a, d_a, sizeof(int), cudamemcpydevicetohost);             cudacheckerrors("cudamemcpy d2h fail");             printf("child: result: %d\n", *h_a);              printf("child: sleeping 1 second...\n");             sleep(1); /* sleep 1 second */             cudadevicereset();             printf("child: enter exit value (0 255): ");             scanf(" %d", &retval);             printf("child: goodbye!\n");             exit(retval); /* child exits user-provided return code */         }         else /* fork() returns new pid parent process */         {             printf("parent: parent process!\n");             printf("parent: here's pid: %d\n", getpid());             printf("parent: value of copy of childpid %d\n", childpid);             printf("parent: wait child exit.\n");             int *h_a, *d_a;             h_a = (int *)malloc(sizeof(int));             cudasetdevice(1);             cudacheckerrors("parent cudasetdevice fail");             cudamalloc(&d_a, sizeof(int));             cudacheckerrors("cudamalloc fail");             *h_a = 2;             cudamemcpy(d_a, h_a, sizeof(int), cudamemcpyhosttodevice);             cudacheckerrors("cudamemcpy h2d fail");             addkernel<<<1,1>>>(d_a);             cudadevicesynchronize();             cudacheckerrors("kernel fail");             cudamemcpy(h_a, d_a, sizeof(int), cudamemcpydevicetohost);             cudacheckerrors("cudamemcpy d2h fail");             printf("parent: result: %d\n", *h_a);             wait(&status); /* wait child exit, , store status */             printf("parent: child's exit code is: %d\n", wexitstatus(status));             cudasetdevice(0);             cudacheckerrors("parent cudasetdevice  2 fail");             int *h_a2, *d_a2;             cudamalloc(&d_a2, sizeof(int));             cudacheckerrors("cudamalloc fail");             h_a2 = (int *)malloc(sizeof(int));             *h_a2 = 5;             cudamemcpy(d_a2, h_a2, sizeof(int), cudamemcpyhosttodevice);             cudacheckerrors("cudamemcpy h2d fail");             addkernel<<<1,1>>>(d_a2);             cudadevicesynchronize();             cudacheckerrors("kernel fail");             cudamemcpy(h_a2, d_a2, sizeof(int), cudamemcpydevicetohost);             cudacheckerrors("cudamemcpy d2h fail");             printf("parent: result2: %d\n", *h_a2);             printf("parent: goodbye!\n");             exit(0);  /* parent exits */         }     }     else /* fork returns -1 on failure */     {         perror("fork"); /* display error message */         exit(0);     } } $ nvcc -arch=sm_20 -o t345 t345.cu $ ./t345 child: child process! child: here's pid: 23603 child: parent's pid is: 23602 child: value of copy of childpid is: 0 parent: parent process! parent: here's pid: 23602 parent: value of copy of childpid 23603 parent: wait child exit. child: result: 2 child: sleeping 1 second... parent: result: 3 child: enter exit value (0 255): 10 child: goodbye! parent: child's exit code is: 10 parent: result2: 6 parent: goodbye! $ 

(modified here)


Comments

Popular posts from this blog

apache - Remove .php and add trailing slash in url using htaccess not loading css -

javascript - jQuery show full size image on click -