CUDA initialization error after fork -
i "initialization error" after calling fork(). if run same program without fork, works fine.
if (fork() == 0) { ... cudamalloc(....); ... }
what cause this?
a complete example below. if comment out cudagetdevicecount call, works fine.
#include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <sys/types.h> #include <sys/wait.h> #include <cuda_runtime.h> #define perr(call) \ if (call) {\ fprintf(stderr, "%s:%d error [%s] on "#call"\n", __file__, __line__,\ cudageterrorstring(cudagetlasterror()));\ exit(1);\ } int main(int argc, char **argv) { float *v_d; int gpucount; cudagetdevicecount(&gpucount); if (fork() == 0) { cudasetdevice(0); perr(cudamalloc(&v_d, 1000*sizeof(float))); } wait(null); return 0; }
simple makefile:
progs = fork cuda_path = /usr/local/cuda cxxflags = -g -o0 -wall cxxincludes = -i$(cuda_path)/include nvcc := $(cuda_path)/bin/nvcc -ccbin $(cxx) -xcompiler "$(cxxflags)" fork: fork.cxx $(nvcc) $^ -o $@ $(libs) clean: (rm $(progs) *.o)
in case, trying number of devices available within parent process. work-around it:
if (fork() == 0) { perr(cudagetdevicecount(&gpucount)); return(gpucount); } wait(&gpucount); gpucount = wexitstatus(gpucount);
fork()
creates child process. processes have own address spaces. cuda context cannot shared between 2 different processes, many reasons, 1 of various pointers meaningless in different address space.
if create cuda context before fork()
, cannot use within child process. cudasetdevice(0);
call attempts share cuda context, implicitly created in parent process when call cudagetdevicecount();
the solution, you've hinted at, either cuda work in parent process or in child process. if in multi-device system, should possible allocate separate devices separate processes (the cuda simpleipc sample code this). (the key point not create cuda context before fork.)
you may interested in this question/answer , this one.
here's worked example (requires 2 cuda devices) showing child process , parent process using separate gpus:
$ cat t345.cu #include <unistd.h> /* symbolic constants */ #include <sys/types.h> /* primitive system data types */ #include <errno.h> /* errors */ #include <stdio.h> /* input/output */ #include <sys/wait.h> /* wait process termination */ #include <stdlib.h> /* general utilities */ #define cudacheckerrors(msg) \ { \ cudaerror_t __err = cudagetlasterror(); \ if (__err != cudasuccess) { \ fprintf(stderr, "fatal error: %s (%s @ %s:%d)\n", \ msg, cudageterrorstring(__err), \ __file__, __line__); \ fprintf(stderr, "*** failed - aborting\n"); \ exit(1); \ } \ } while (0) __global__ void addkernel(int *data){ *data += 1; } int main() { pid_t childpid; /* variable store child's pid */ int retval; /* child process: user-provided return code */ int status; /* parent process: child's exit status */ /* 1 int variable needed because each process have own instance of variable here, 2 int variables used clarity */ /* create new process */ childpid = fork(); if (childpid >= 0) /* fork succeeded */ { if (childpid == 0) /* fork() returns 0 child process */ { printf("child: child process!\n"); printf("child: here's pid: %d\n", getpid()); printf("child: parent's pid is: %d\n", getppid()); printf("child: value of copy of childpid is: %d\n", childpid); int *h_a, *d_a; h_a = (int *)malloc(sizeof(int)); cudasetdevice(0); cudacheckerrors("child cudasetdevice fail"); cudamalloc(&d_a, sizeof(int)); cudacheckerrors("cudamalloc fail"); *h_a = 1; cudamemcpy(d_a, h_a, sizeof(int), cudamemcpyhosttodevice); cudacheckerrors("cudamemcpy h2d fail"); addkernel<<<1,1>>>(d_a); cudadevicesynchronize(); cudacheckerrors("kernel fail"); cudamemcpy(h_a, d_a, sizeof(int), cudamemcpydevicetohost); cudacheckerrors("cudamemcpy d2h fail"); printf("child: result: %d\n", *h_a); printf("child: sleeping 1 second...\n"); sleep(1); /* sleep 1 second */ cudadevicereset(); printf("child: enter exit value (0 255): "); scanf(" %d", &retval); printf("child: goodbye!\n"); exit(retval); /* child exits user-provided return code */ } else /* fork() returns new pid parent process */ { printf("parent: parent process!\n"); printf("parent: here's pid: %d\n", getpid()); printf("parent: value of copy of childpid %d\n", childpid); printf("parent: wait child exit.\n"); int *h_a, *d_a; h_a = (int *)malloc(sizeof(int)); cudasetdevice(1); cudacheckerrors("parent cudasetdevice fail"); cudamalloc(&d_a, sizeof(int)); cudacheckerrors("cudamalloc fail"); *h_a = 2; cudamemcpy(d_a, h_a, sizeof(int), cudamemcpyhosttodevice); cudacheckerrors("cudamemcpy h2d fail"); addkernel<<<1,1>>>(d_a); cudadevicesynchronize(); cudacheckerrors("kernel fail"); cudamemcpy(h_a, d_a, sizeof(int), cudamemcpydevicetohost); cudacheckerrors("cudamemcpy d2h fail"); printf("parent: result: %d\n", *h_a); wait(&status); /* wait child exit, , store status */ printf("parent: child's exit code is: %d\n", wexitstatus(status)); cudasetdevice(0); cudacheckerrors("parent cudasetdevice 2 fail"); int *h_a2, *d_a2; cudamalloc(&d_a2, sizeof(int)); cudacheckerrors("cudamalloc fail"); h_a2 = (int *)malloc(sizeof(int)); *h_a2 = 5; cudamemcpy(d_a2, h_a2, sizeof(int), cudamemcpyhosttodevice); cudacheckerrors("cudamemcpy h2d fail"); addkernel<<<1,1>>>(d_a2); cudadevicesynchronize(); cudacheckerrors("kernel fail"); cudamemcpy(h_a2, d_a2, sizeof(int), cudamemcpydevicetohost); cudacheckerrors("cudamemcpy d2h fail"); printf("parent: result2: %d\n", *h_a2); printf("parent: goodbye!\n"); exit(0); /* parent exits */ } } else /* fork returns -1 on failure */ { perror("fork"); /* display error message */ exit(0); } } $ nvcc -arch=sm_20 -o t345 t345.cu $ ./t345 child: child process! child: here's pid: 23603 child: parent's pid is: 23602 child: value of copy of childpid is: 0 parent: parent process! parent: here's pid: 23602 parent: value of copy of childpid 23603 parent: wait child exit. child: result: 2 child: sleeping 1 second... parent: result: 3 child: enter exit value (0 255): 10 child: goodbye! parent: child's exit code is: 10 parent: result2: 6 parent: goodbye! $
(modified here)
Comments
Post a Comment