cuda - CUBLAS: Incorrect inversion for matrix with zero pivot -


since cuda 5.5, cublas library contains routines batched matrix factorization , inversion (cublas<t>getrfbatched , cublas<t>getribatched respectively).

getting guide documentation, wrote test code inversion of n x n matrix using these routines. code gives correct output if matrix has non 0 pivots. setting pivot 0 results in incorrect results. have verified results using matlab.

i realize providing row major matrices input while cublas expects column major matrices, shouldn't matter transpose result. sure, tested on column major input, getting same behavior.

i confused as, cublas<t>getribatched expects pivot exchange information array p input, output cublas<t>getrfbatched. so, if 0 pivots eliminated row exchange, inversion routine should handle automatically.

how perform inversion of matrices contain 0 pivot using cublas?

following self contained compile-able example different test cases:

#include <cstdio> #include <cstdlib> #include <cuda_runtime.h> #include <cublas_v2.h>  #define cudacall(call)                                                                                                          \                                                                                                                              \     {                                                                                                                           \         cudaerror_t err = (call);                                                                                               \         if(cudasuccess != err)                                                                                                  \         {                                                                                                                       \             fprintf(stderr,"cuda error:\nfile = %s\nline = %d\nreason = %s\n", __file__, __line__, cudageterrorstring(err));    \             cudadevicereset();                                                                                                  \             exit(exit_failure);                                                                                                 \         }                                                                                                                       \     }                                                                                                                           \     while (0)  #define cublascall(call)                                                                                        \                                                                                                              \     {                                                                                                           \         cublasstatus_t status = (call);                                                                         \         if(cublas_status_success != status)                                                                     \         {                                                                                                       \             fprintf(stderr,"cublas error:\nfile = %s\nline = %d\ncode = %d\n", __file__, __line__, status);     \             cudadevicereset();                                                                                  \             exit(exit_failure);                                                                                 \         }                                                                                                       \                                                                                                                 \     }                                                                                                           \     while(0)   void invert_device(float* src_d, float* dst_d, int n) {     cublashandle_t handle;     cublascall(cublascreate_v2(&handle));      int batchsize = 1;      int *p, *info;      cudacall(cudamalloc<int>(&p,n * batchsize * sizeof(int)));     cudacall(cudamalloc<int>(&info,batchsize * sizeof(int)));      int lda = n;      float *a[] = { src_d };     float** a_d;     cudacall(cudamalloc<float*>(&a_d,sizeof(a)));     cudacall(cudamemcpy(a_d,a,sizeof(a),cudamemcpyhosttodevice));      cublascall(cublassgetrfbatched(handle,n,a_d,lda,p,info,batchsize));      int infoh = 0;     cudacall(cudamemcpy(&infoh,info,sizeof(int),cudamemcpydevicetohost));      if(infoh == n)     {         fprintf(stderr, "factorization failed: matrix singular\n");         cudadevicereset();         exit(exit_failure);     }      float* c[] = { dst_d };     float** c_d;     cudacall(cudamalloc<float*>(&c_d,sizeof(c)));     cudacall(cudamemcpy(c_d,c,sizeof(c),cudamemcpyhosttodevice));      cublascall(cublassgetribatched(handle,n,a_d,lda,p,c_d,lda,info,batchsize));      cudacall(cudamemcpy(&infoh,info,sizeof(int),cudamemcpydevicetohost));      if(infoh != 0)     {         fprintf(stderr, "inversion failed: matrix singular\n");         cudadevicereset();         exit(exit_failure);     }      cudafree(p), cudafree(info), cublasdestroy_v2(handle); }  void invert(float* src, float* dst, int n) {     float* src_d, *dst_d;      cudacall(cudamalloc<float>(&src_d,n * n * sizeof(float)));     cudacall(cudamemcpy(src_d,src,n * n * sizeof(float),cudamemcpyhosttodevice));     cudacall(cudamalloc<float>(&dst_d,n * n * sizeof(float)));      invert_device(src_d,dst_d,n);      cudacall(cudamemcpy(dst,dst_d,n * n * sizeof(float),cudamemcpydevicetohost));      cudafree(src_d), cudafree(dst_d); }  void test_invert() {     const int n = 3;      //random matrix full pivots     float full_pivots[n*n] = { 0.5, 3, 4,                                  1, 3, 10,                                  4 , 9, 16 };      //almost same above matrix first pivot 0     float zero_pivot[n*n] = { 0, 3, 4,                                1, 3, 10,                               4 , 9, 16 };      float zero_pivot_col_major[n*n] = { 0, 1, 4,                                          3, 3, 9,                                         4 , 10, 16 };      float another_zero_pivot[n*n] = { 0, 3, 4,                                        1, 5, 6,                                       9, 8, 2 };      float another_full_pivot[n * n] = { 22, 3, 4,                                          1, 5, 6,                                         9, 8, 2 };      float singular[n*n] = {1,2,3,                            4,5,6,                            7,8,9};       //select matrix setting "a"     float* = zero_pivot;        fprintf(stdout, "input:\n\n");     for(int i=0; i<n; i++)     {         for(int j=0; j<n; j++)             fprintf(stdout,"%f\t",a[i*n+j]);         fprintf(stdout,"\n");     }      fprintf(stdout,"\n\n");      invert(a,a,n);      fprintf(stdout, "inverse:\n\n");     for(int i=0; i<n; i++)     {         for(int j=0; j<n; j++)             fprintf(stdout,"%f\t",a[i*n+j]);         fprintf(stdout,"\n");     }  }  int main() {     test_invert();      int n;  scanf("%d",&n);     return 0; } 

there seems bug in current cublas library implementation of cublas<t>getrfbatched matrices of dimension (n) such 3<=n<=16, when there "zero pivot" say.

a possible workaround "identity-extend" a matrix inverted, when n<17, size of 17x17 (using matlab nomenclature):

 lu = getrf( [a 0 ; 0 i]); 

continuing, can use cublas<t>getribatched in "ordinary" fashion:

 inva = getri( lu(1:3,1:3) ) 

(you can leave @ n=17, call getri way, , extract result first 3x3 rows , columns of inva.)

here worked example, borrowing code supplied, showing inversion of supplied 3x3 zero_pivot matrix, using zero_pivot_war matrix "identity-extended" workaround:

$ cat t340.cu #include <cstdio> #include <cstdlib> #include <cuda_runtime.h> #include <cublas_v2.h>  #define cudacall(call)                                                                                                          \                                                                                                                              \     {                                                                                                                           \         cudaerror_t err = (call);                                                                                               \         if(cudasuccess != err)                                                                                                  \         {                                                                                                                       \             fprintf(stderr,"cuda error:\nfile = %s\nline = %d\nreason = %s\n", __file__, __line__, cudageterrorstring(err));    \             cudadevicereset();                                                                                                  \             exit(exit_failure);                                                                                                 \         }                                                                                                                       \     }                                                                                                                           \     while (0)  #define cublascall(call)                                                                                        \                                                                                                              \     {                                                                                                           \         cublasstatus_t status = (call);                                                                         \         if(cublas_status_success != status)                                                                     \         {                                                                                                       \             fprintf(stderr,"cublas error:\nfile = %s\nline = %d\ncode = %d\n", __file__, __line__, status);     \             cudadevicereset();                                                                                  \             exit(exit_failure);                                                                                 \         }                                                                                                       \                                                                                                                 \     }                                                                                                           \     while(0)   void invert_device(float* src_d, float* dst_d, int n) {     cublashandle_t handle;     cublascall(cublascreate_v2(&handle));      int batchsize = 1;      int *p, *info;      cudacall(cudamalloc<int>(&p,17 * batchsize * sizeof(int)));     cudacall(cudamalloc<int>(&info,batchsize * sizeof(int)));      int lda = 17;      float *a[] = { src_d };     float** a_d;     cudacall(cudamalloc<float*>(&a_d,sizeof(a)));     cudacall(cudamemcpy(a_d,a,sizeof(a),cudamemcpyhosttodevice));      cublascall(cublassgetrfbatched(handle,17,a_d,lda,p,info,batchsize));      int infoh = 0;     cudacall(cudamemcpy(&infoh,info,sizeof(int),cudamemcpydevicetohost));      if(infoh == 17)     {         fprintf(stderr, "factorization failed: matrix singular\n");         cudadevicereset();         exit(exit_failure);     }      float* c[] = { dst_d };     float** c_d;     cudacall(cudamalloc<float*>(&c_d,sizeof(c)));     cudacall(cudamemcpy(c_d,c,sizeof(c),cudamemcpyhosttodevice));      cublascall(cublassgetribatched(handle,n,a_d,lda,p,c_d,n,info,batchsize));      cudacall(cudamemcpy(&infoh,info,sizeof(int),cudamemcpydevicetohost));      if(infoh != 0)     {         fprintf(stderr, "inversion failed: matrix singular\n");         cudadevicereset();         exit(exit_failure);     }      cudafree(p), cudafree(info), cublasdestroy_v2(handle); }  void invert(float* src, float* dst, int n) {     float* src_d, *dst_d;      cudacall(cudamalloc<float>(&src_d,17 * 17 * sizeof(float)));     cudacall(cudamemcpy(src_d,src,17 * 17 * sizeof(float),cudamemcpyhosttodevice));     cudacall(cudamalloc<float>(&dst_d,n * n * sizeof(float)));      invert_device(src_d,dst_d,n);      cudacall(cudamemcpy(dst,dst_d,n * n * sizeof(float),cudamemcpydevicetohost));      cudafree(src_d), cudafree(dst_d); }  void test_invert() {     const int n = 3;      //random matrix full pivots /*    float full_pivots[n*n] = { 0.5, 3, 4,                                 1, 3, 10,                                 4 , 9, 16 };      //almost same above matrix first pivot 0     float zero_pivot[n*n] = { 0, 3, 4,                               1, 3, 10,                               4 , 9, 16 };      float zero_pivot_col_major[n*n] = { 0, 1, 4,                                         3, 3, 9,                                         4 , 10, 16 }; */     float zero_pivot_war[17*17] = {                                         0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,                                         1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,                                         4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,                                         0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,                                         0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,                                         0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,                                         0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,                                         0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,                                         0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,                                         0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,                                         0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,                                         0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,                                         0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,                                         0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,                                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,                                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,                                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 }; /*     float another_zero_pivot[n*n] = { 0, 3, 4,                                       1, 5, 6,                                       9, 8, 2 };      float another_full_pivot[n * n] = { 22, 3, 4,                                         1, 5, 6,                                         9, 8, 2 };      float singular[n*n] = {1,2,3,                            4,5,6,                            7,8,9}; */     float result[n*n];      //select matrix setting "a"     float* = zero_pivot_war;      fprintf(stdout, "input:\n\n");     for(int i=0; i<n; i++)     {         for(int j=0; j<n; j++)             fprintf(stdout,"%f\t",a[i*17+j]);         fprintf(stdout,"\n");     }      fprintf(stdout,"\n\n");      invert(a,result,n);      fprintf(stdout, "inverse:\n\n");     for(int i=0; i<n; i++)     {         for(int j=0; j<n; j++)             fprintf(stdout,"%f\t",result[i*n+j]);         fprintf(stdout,"\n");     }  }  int main() {     test_invert();  //    int n;  scanf("%d",&n);     return 0; } $ nvcc -arch=sm_20 -o t340 t340.cu -lcublas $ cuda-memcheck ./t340 ========= cuda-memcheck input:  0.000000        3.000000        4.000000 1.000000        3.000000        10.000000 4.000000        9.000000        16.000000   inverse:  -0.700000       -0.200000       0.300000 0.400000        -0.266667       0.066667 -0.050000       0.200000        -0.050000 ========= error summary: 0 errors $ 

the above result appears me correct based on simple test elsewhere.

i don't have further technical details nature of possible bug in cublas. can tell, present in both cuda 5.5 , cuda 6.0 rc. detailed bug discussions nvidia-supplied assets (e.g. cublas library) should taken on the nvidia developer forums or directly @ bug filing portal on developer.nvidia.com (you must registered developer file bug).


Comments

Popular posts from this blog

apache - Remove .php and add trailing slash in url using htaccess not loading css -

inno setup - TLabel or TNewStaticText - change .Font.Style on Focus like Cursor changes with .Cursor -