diff --git a/MATLAB/Source/POCS_TV.cu b/MATLAB/Source/POCS_TV.cu index 4a3861ec..e34a589b 100644 --- a/MATLAB/Source/POCS_TV.cu +++ b/MATLAB/Source/POCS_TV.cu @@ -183,7 +183,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -231,7 +231,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -397,9 +397,11 @@ do { \ // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/POCS_TV2.cu b/MATLAB/Source/POCS_TV2.cu index 0f2e0b00..8eca93fc 100644 --- a/MATLAB/Source/POCS_TV2.cu +++ b/MATLAB/Source/POCS_TV2.cu @@ -204,7 +204,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -252,7 +252,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -417,9 +417,11 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/Siddon_projection.cu b/MATLAB/Source/Siddon_projection.cu index b74f2f54..60a798cc 100644 --- a/MATLAB/Source/Siddon_projection.cu +++ b/MATLAB/Source/Siddon_projection.cu @@ -349,9 +349,11 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons //Pagelock memory for synchronous copy. // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ diff --git a/MATLAB/Source/ray_interpolated_projection.cu b/MATLAB/Source/ray_interpolated_projection.cu index f96b30e0..3fd2155c 100644 --- a/MATLAB/Source/ray_interpolated_projection.cu +++ b/MATLAB/Source/ray_interpolated_projection.cu @@ -290,9 +290,11 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c //Pagelock memory for synchronous copy. // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
- int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & splits>1){ diff --git a/MATLAB/Source/tvdenoising.cu b/MATLAB/Source/tvdenoising.cu index 1c4d895e..6f446967 100644 --- a/MATLAB/Source/tvdenoising.cu +++ b/MATLAB/Source/tvdenoising.cu @@ -262,9 +262,11 @@ do { \ // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported & splits>1){ cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/voxel_backprojection.cu b/MATLAB/Source/voxel_backprojection.cu index 03cdd8b5..99a3001f 100644 --- a/MATLAB/Source/voxel_backprojection.cu +++ b/MATLAB/Source/voxel_backprojection.cu @@ -311,9 +311,11 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa cudaCheckErrors("Error"); //Pagelock memory for synchronous copy. // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
- int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ diff --git a/MATLAB/Source/voxel_backprojection2.cu b/MATLAB/Source/voxel_backprojection2.cu index 128c6b04..de920137 100644 --- a/MATLAB/Source/voxel_backprojection2.cu +++ b/MATLAB/Source/voxel_backprojection2.cu @@ -352,9 +352,11 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //Pagelock memory for synchronous copy. // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
if (isHostRegisterSupported & split_image>1){ diff --git a/MATLAB/Source/voxel_backprojection_parallel.cu b/MATLAB/Source/voxel_backprojection_parallel.cu index 08982256..98faab16 100644 --- a/MATLAB/Source/voxel_backprojection_parallel.cu +++ b/MATLAB/Source/voxel_backprojection_parallel.cu @@ -298,9 +298,11 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re } //Pagelock memory for synchronous copy. // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported){ cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); } diff --git a/Python/setup.py b/Python/setup.py index 261d96f2..714b2bd7 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -16,7 +16,7 @@ # Code from https://github.com/pytorch/pytorch/blob/master/torch/utils/cpp_extension.py COMPUTE_CAPABILITY_ARGS = [ # '-gencode=arch=compute_20,code=sm_20', #deprecated - #'-gencode=arch=compute_30,code=sm_30',#deprecated + '-gencode=arch=compute_30,code=sm_30', '-gencode=arch=compute_37,code=sm_37', '-gencode=arch=compute_52,code=sm_52', '-gencode=arch=compute_60,code=sm_60', diff --git a/Python/tigre/Source/POCS_TV.cu b/Python/tigre/Source/POCS_TV.cu index 4a3861ec..e34a589b 100644 --- a/Python/tigre/Source/POCS_TV.cu +++ b/Python/tigre/Source/POCS_TV.cu @@ -183,7 +183,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -231,7 +231,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (CUDART_VERSION >= 9000) 
if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -397,9 +397,11 @@ do { \ // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/POCS_TV2.cu b/Python/tigre/Source/POCS_TV2.cu index 0f2e0b00..8eca93fc 100644 --- a/Python/tigre/Source/POCS_TV2.cu +++ b/Python/tigre/Source/POCS_TV2.cu @@ -204,7 +204,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -252,7 +252,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -417,9 +417,11 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/Siddon_projection.cu b/Python/tigre/Source/Siddon_projection.cu index 04896ad2..d611d0d1 100644 --- a/Python/tigre/Source/Siddon_projection.cu +++ b/Python/tigre/Source/Siddon_projection.cu @@ -353,9 +353,11 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons //Pagelock memory for synchronous copy. // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ diff --git a/Python/tigre/Source/ray_interpolated_projection.cu b/Python/tigre/Source/ray_interpolated_projection.cu index 2df2b6e0..0a0e64b4 100644 --- a/Python/tigre/Source/ray_interpolated_projection.cu +++ b/Python/tigre/Source/ray_interpolated_projection.cu @@ -295,9 +295,11 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c //Pagelock memory for synchronous copy. 
// Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & splits>1){ diff --git a/Python/tigre/Source/tvdenoising.cu b/Python/tigre/Source/tvdenoising.cu index 1c4d895e..6f446967 100644 --- a/Python/tigre/Source/tvdenoising.cu +++ b/Python/tigre/Source/tvdenoising.cu @@ -262,9 +262,11 @@ do { \ // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported & splits>1){ cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/voxel_backprojection.cu b/Python/tigre/Source/voxel_backprojection.cu index 60b4bb9c..4ab87021 100644 --- a/Python/tigre/Source/voxel_backprojection.cu +++ b/Python/tigre/Source/voxel_backprojection.cu @@ -317,9 +317,11 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa cudaCheckErrors("Error"); //Pagelock memory for synchronous copy. // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ diff --git a/Python/tigre/Source/voxel_backprojection2.cu b/Python/tigre/Source/voxel_backprojection2.cu index 204051bf..3bc6b3e7 100644 --- a/Python/tigre/Source/voxel_backprojection2.cu +++ b/Python/tigre/Source/voxel_backprojection2.cu @@ -356,9 +356,12 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //Pagelock memory for synchronous copy. 
// Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & split_image>1){ diff --git a/Python/tigre/Source/voxel_backprojection_parallel.cu b/Python/tigre/Source/voxel_backprojection_parallel.cu index c48fb75f..e05dd376 100644 --- a/Python/tigre/Source/voxel_backprojection_parallel.cu +++ b/Python/tigre/Source/voxel_backprojection_parallel.cu @@ -302,9 +302,11 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re } //Pagelock memory for synchronous copy. // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported; + // We already queried the GPU and assumed they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported){ cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); }