CUDA - CUBLAS: Incorrect inversion for matrix with zero pivot
Since CUDA 5.5, the CUBLAS library contains routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched respectively).
Taking guidance from the documentation, I wrote test code for the inversion of an n x n matrix using these routines. The code gives correct output only if the matrix has all non-zero pivots. Setting any pivot to zero produces incorrect results. I have verified the results using MATLAB.
I realize I am providing row-major matrices as input while CUBLAS expects column-major matrices, but that shouldn't matter, as it would only transpose the result. To be sure, I also tested with column-major input and got the same behavior.
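(The reasoning behind "it would only transpose the result" is the standard identity that inversion and transposition commute,

    inv(Aᵀ) = (inv(A))ᵀ

so a column-major routine fed row-major data should return the transpose of the inverse rather than a wrong answer.)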
I am confused because cublas<t>getriBatched expects the pivot exchange information array P as input, which is exactly the output of cublas<t>getrfBatched. So, if any zero pivots are eliminated by row exchanges during factorization, the inversion routine should handle them automatically.
How can I perform inversion of matrices that contain a zero pivot using CUBLAS?
The following is a self-contained, compilable example with different test cases:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Abort with a diagnostic if a CUDA runtime call fails.
#define cudacall(call)                                  \
    do {                                                \
        cudaError_t err = (call);                       \
        if (cudaSuccess != err) {                       \
            fprintf(stderr, "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            cudaDeviceReset();                          \
            exit(EXIT_FAILURE);                         \
        }                                               \
    } while (0)

// Abort with a diagnostic if a CUBLAS call fails.
#define cublascall(call)                                \
    do {                                                \
        cublasStatus_t status = (call);                 \
        if (CUBLAS_STATUS_SUCCESS != status) {          \
            fprintf(stderr, "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", \
                    __FILE__, __LINE__, status);        \
            cudaDeviceReset();                          \
            exit(EXIT_FAILURE);                         \
        }                                               \
    } while (0)

// Invert an n x n matrix already resident on the device (batch size 1).
void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;

    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P, n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO, batchSize * sizeof(int)));

    int lda = n;

    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d, sizeof(A)));
    cudacall(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice));

    // LU factorization with partial pivoting
    cublascall(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if (INFOh == n)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d, sizeof(C)));
    cudacall(cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice));

    // Inversion from the LU factors, using the pivot array P produced above
    cublascall(cublasSgetriBatched(handle, n, A_d, lda, P, C_d, lda, INFO, batchSize));

    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if (INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}

// Host wrapper: copy src to the device, invert, copy the result back to dst.
void invert(float* src, float* dst, int n)
{
    float *src_d, *dst_d;

    cudacall(cudaMalloc<float>(&src_d, n * n * sizeof(float)));
    cudacall(cudaMemcpy(src_d, src, n * n * sizeof(float), cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&dst_d, n * n * sizeof(float)));

    invert_device(src_d, dst_d, n);

    cudacall(cudaMemcpy(dst, dst_d, n * n * sizeof(float), cudaMemcpyDeviceToHost));

    cudaFree(src_d), cudaFree(dst_d);
}

void test_invert()
{
    const int n = 3;

    // random matrix with full pivots
    float full_pivots[n*n] = { 0.5, 3, 4,
                               1,   3, 10,
                               4,   9, 16 };

    // almost the same as the matrix above, but with the first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4, 9, 16 };

    float zero_pivot_col_major[n*n] = { 0, 1,  4,
                                        3, 3,  9,
                                        4, 10, 16 };

    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n*n] = { 22, 3, 4,
                                      1,  5, 6,
                                      9,  8, 2 };

    float singular[n*n] = { 1, 2, 3,
                            4, 5, 6,
                            7, 8, 9 };

    // select the test matrix by setting "a"
    float* a = zero_pivot;

    fprintf(stdout, "Input:\n\n");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
            fprintf(stdout, "%f\t", a[i*n+j]);
        fprintf(stdout, "\n");
    }

    fprintf(stdout, "\n\n");

    invert(a, a, n);

    fprintf(stdout, "Inverse:\n\n");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
            fprintf(stdout, "%f\t", a[i*n+j]);
        fprintf(stdout, "\n");
    }
}

int main()
{
    test_invert();

    int n;
    scanf("%d", &n);

    return 0;
}
There appears to be a bug in the current CUBLAS library implementation of cublas<t>getrfBatched for matrices of dimension n such that 3 <= n <= 16, when there is a "zero pivot", as you say.
A possible workaround is to "identity-extend" the A matrix to be inverted, when n < 17, to a size of 17x17 (using MATLAB nomenclature):
LU = getrf( [A 0 ; 0 I] );
Continuing, you can then use cublas<t>getriBatched in the "ordinary" fashion:
invA = getri( LU(1:3,1:3) )
(You can also leave everything at n = 17, call getri that way, and then extract the result as the first 3x3 rows and columns of invA.)
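If you'd rather not write the 17x17 array out by hand, a small host-side helper can build the identity-extended buffer. The sketch below is illustrative only (the name pad_to_17 is arbitrary, not part of the original code); it assumes row-major storage with leading dimension 17, matching the worked example that follows:

#include <cstring>

// Embed an n x n matrix A (n <= 16) into a 17x17 buffer laid out as
// [A 0 ; 0 I], stored row-major with leading dimension 17.
void pad_to_17(const float* A, int n, float* Awar /* 17*17 floats */)
{
    memset(Awar, 0, 17 * 17 * sizeof(float));
    for (int i = 0; i < n; i++)          // copy A into the top-left block
        for (int j = 0; j < n; j++)
            Awar[i * 17 + j] = A[i * n + j];
    for (int k = n; k < 17; k++)         // 1s on the remaining diagonal
        Awar[k * 17 + k] = 1.0f;
}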
Here is a fully worked example, borrowing from the code you supplied, showing inversion of your 3x3 zero_pivot matrix via a zero_pivot_war matrix, i.e. the "identity-extended" workaround:
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Abort with a diagnostic if a CUDA runtime call fails.
#define cudacall(call)                                  \
    do {                                                \
        cudaError_t err = (call);                       \
        if (cudaSuccess != err) {                       \
            fprintf(stderr, "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            cudaDeviceReset();                          \
            exit(EXIT_FAILURE);                         \
        }                                               \
    } while (0)

// Abort with a diagnostic if a CUBLAS call fails.
#define cublascall(call)                                \
    do {                                                \
        cublasStatus_t status = (call);                 \
        if (CUBLAS_STATUS_SUCCESS != status) {          \
            fprintf(stderr, "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", \
                    __FILE__, __LINE__, status);        \
            cudaDeviceReset();                          \
            exit(EXIT_FAILURE);                         \
        }                                               \
    } while (0)

void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;

    int *P, *INFO;
    // workaround: size the pivot array for the 17x17 identity-extended matrix
    cudacall(cudaMalloc<int>(&P, 17 * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO, batchSize * sizeof(int)));

    int lda = 17;

    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d, sizeof(A)));
    cudacall(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice));

    // factor the full 17x17 identity-extended matrix
    cublascall(cublasSgetrfBatched(handle, 17, A_d, lda, P, INFO, batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if (INFOh == 17)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d, sizeof(C)));
    cudacall(cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice));

    // invert only the leading n x n block (lda stays 17, ldc is n)
    cublascall(cublasSgetriBatched(handle, n, A_d, lda, P, C_d, n, INFO, batchSize));

    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if (INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}

void invert(float* src, float* dst, int n)
{
    float *src_d, *dst_d;

    // the source buffer holds the full 17x17 identity-extended matrix
    cudacall(cudaMalloc<float>(&src_d, 17 * 17 * sizeof(float)));
    cudacall(cudaMemcpy(src_d, src, 17 * 17 * sizeof(float), cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&dst_d, n * n * sizeof(float)));

    invert_device(src_d, dst_d, n);

    cudacall(cudaMemcpy(dst, dst_d, n * n * sizeof(float), cudaMemcpyDeviceToHost));

    cudaFree(src_d), cudaFree(dst_d);
}

void test_invert()
{
    const int n = 3;

    // random matrix with full pivots
/*
    float full_pivots[n*n] = { 0.5, 3, 4,
                               1,   3, 10,
                               4,   9, 16 };

    // almost the same as the matrix above, but with the first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4, 9, 16 };

    float zero_pivot_col_major[n*n] = { 0, 1,  4,
                                        3, 3,  9,
                                        4, 10, 16 };
*/
    // the 3x3 zero_pivot matrix, identity-extended to 17x17
    float zero_pivot_war[17*17] = {
        0,3,4, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };
/*
    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n*n] = { 22, 3, 4,
                                      1,  5, 6,
                                      9,  8, 2 };

    float singular[n*n] = { 1, 2, 3,
                            4, 5, 6,
                            7, 8, 9 };
*/
    float result[n*n];

    // select the test matrix by setting "a"
    float* a = zero_pivot_war;

    fprintf(stdout, "Input:\n\n");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
            fprintf(stdout, "%f\t", a[i*17+j]);
        fprintf(stdout, "\n");
    }

    fprintf(stdout, "\n\n");

    invert(a, result, n);

    fprintf(stdout, "Inverse:\n\n");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
            fprintf(stdout, "%f\t", result[i*n+j]);
        fprintf(stdout, "\n");
    }
}

int main()
{
    test_invert();

    // int n;  scanf("%d",&n);

    return 0;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:

0.000000        3.000000        4.000000
1.000000        3.000000        10.000000
4.000000        9.000000        16.000000


Inverse:

-0.700000       -0.200000       0.300000
0.400000        -0.266667       0.066667
-0.050000       0.200000        -0.050000
========= ERROR SUMMARY: 0 errors
$
The above result appears to me to be correct, based on a simple test elsewhere.
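If you want to verify such a result yourself, a simple host-side check (an illustrative sketch, not part of the original code; the name check_inverse3 is arbitrary) is to multiply the input matrix by the computed inverse and confirm the product is close to the 3x3 identity:

#include <cmath>

// Return true if A * invA is within tol of the 3x3 identity matrix.
// Both arrays are assumed to be row-major 3x3.
bool check_inverse3(const float* A, const float* invA, float tol = 1e-4f)
{
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 3; j++)
        {
            float s = 0.0f;
            for (int k = 0; k < 3; k++)
                s += A[i * 3 + k] * invA[k * 3 + j];
            float expected = (i == j) ? 1.0f : 0.0f;
            if (fabsf(s - expected) > tol)
                return false;
        }
    return true;
}

Applied to the 3x3 zero_pivot matrix and the inverse printed above, the product does come out as the identity to within rounding.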
I don't have any further technical details about the nature of the possible bug in CUBLAS. As far as I can tell, it is present in both CUDA 5.5 and CUDA 6.0 RC. Detailed bug discussions of NVIDIA-supplied assets (e.g. the CUBLAS library) should be taken up on the NVIDIA developer forums or directly via the bug filing portal at developer.nvidia.com (you must be a registered developer to file a bug).