Merge pull request #243 from gfardell/compute_30
Compute 30 build
AnderBiguri authored Jan 28, 2021
2 parents c658883 + 981afaf commit 28332a3
Showing 17 changed files with 74 additions and 41 deletions.
10 changes: 6 additions & 4 deletions MATLAB/Source/POCS_TV.cu
@@ -183,7 +183,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
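The guard above (and the identical ones in the other reduction hunks below) switches from a device compute-capability test to a CUDA toolkit version test. A plausible reading is that the protected branch finishes the block reduction with warp shuffles, and the *_sync shuffle intrinsics only exist from CUDA 9.0 onwards, so the toolkit version is the relevant compile-time condition. The sketch below illustrates that pattern only; the guarded body in POCS_TV.cu is collapsed in this view, and the function name, fallback branch, and the CUDART_VERSION spelling (the runtime's version macro) are assumptions for illustration.

#include <cuda_runtime.h>

// Sketch: finish a block reduction once only the first warp holds live partial sums.
// Assumes sdata[] holds at least 64 partial sums in shared memory and tid is threadIdx.x.
__device__ float warpReduceTail(volatile float* sdata, int tid)
{
    float mySum = 0.0f;
#if CUDART_VERSION >= 9000
    if (tid < 32)
    {
        mySum = sdata[tid] + sdata[tid + 32];
        // Warp shuffles (the *_sync variants need CUDA 9+) replace shared-memory traffic.
        for (int offset = 16; offset > 0; offset >>= 1)
            mySum += __shfl_down_sync(0xFFFFFFFFu, mySum, offset);
    }
#else
    // Pre-CUDA-9 fallback: classic unrolled volatile shared-memory reduction.
    if (tid < 32)
    {
        sdata[tid] += sdata[tid + 32];
        sdata[tid] += sdata[tid + 16];
        sdata[tid] += sdata[tid + 8];
        sdata[tid] += sdata[tid + 4];
        sdata[tid] += sdata[tid + 2];
        sdata[tid] += sdata[tid + 1];
        mySum = sdata[tid];
    }
#endif
    return mySum; // the full sum is valid in lane 0
}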
@@ -231,7 +231,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -397,9 +397,11 @@ do { \


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // splits>2 is completely empirical observation
 if (isHostRegisterSupported & splits>2){
 cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
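The second recurring change in this commit follows the same logic: the attribute cudaDevAttrHostRegisterSupported only appears in the runtime API from CUDA 9.2, which is presumably why the query is now compiled only when CUDART_VERSION >= 9020 and why isHostRegisterSupported is initialised to 0. On older toolkits the flag simply stays 0 and the code skips page-locking instead of reading an uninitialised value. A minimal sketch of the pattern follows; the function, buffer, and parameter names are illustrative, not TIGRE's.

#include <cuda_runtime.h>
#include <cstddef>

// Sketch: page-lock an existing host buffer only when the device supports
// cudaHostRegister and the transfer is big enough to repay the cost of pinning.
void maybePinHostBuffer(float* buffer, size_t numFloats, bool bigTransfer)
{
    int isHostRegisterSupported = 0;   // stays 0 when the attribute query is compiled out
#if CUDART_VERSION >= 9020
    cudaDeviceGetAttribute(&isHostRegisterSupported, cudaDevAttrHostRegisterSupported, 0);
#endif
    if (isHostRegisterSupported && bigTransfer)
    {
        cudaHostRegister(buffer, numFloats * sizeof(float), cudaHostRegisterPortable);
        // ... asynchronous copies can now overlap with kernel execution ...
        cudaHostUnregister(buffer);    // must be undone before the buffer is freed
    }
}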
10 changes: 6 additions & 4 deletions MATLAB/Source/POCS_TV2.cu
@@ -204,7 +204,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -252,7 +252,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -417,9 +417,11 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // splits>2 is completely empirical observation
 if (isHostRegisterSupported & splits>2){
 cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
6 changes: 4 additions & 2 deletions MATLAB/Source/Siddon_projection.cu
@@ -349,9 +349,11 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & (splits>1 |deviceCount>1)){
6 changes: 4 additions & 2 deletions MATLAB/Source/ray_interpolated_projection.cu
@@ -290,9 +290,11 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & splits>1){
6 changes: 4 additions & 2 deletions MATLAB/Source/tvdenoising.cu
@@ -262,9 +262,11 @@ do { \


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 if (isHostRegisterSupported & splits>1){
 cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
 cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
6 changes: 4 additions & 2 deletions MATLAB/Source/voxel_backprojection.cu
@@ -311,9 +311,11 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa
 cudaCheckErrors("Error");
 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){
6 changes: 4 additions & 2 deletions MATLAB/Source/voxel_backprojection2.cu
@@ -352,9 +352,11 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & split_image>1){
6 changes: 4 additions & 2 deletions MATLAB/Source/voxel_backprojection_parallel.cu
@@ -298,9 +298,11 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re
 }
 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 if (isHostRegisterSupported){
 cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable);
 }
2 changes: 1 addition & 1 deletion Python/setup.py
@@ -16,7 +16,7 @@

 # Code from https://github.com/pytorch/pytorch/blob/master/torch/utils/cpp_extension.py
 COMPUTE_CAPABILITY_ARGS = [ # '-gencode=arch=compute_20,code=sm_20', #deprecated
-#'-gencode=arch=compute_30,code=sm_30',#deprecated
+'-gencode=arch=compute_30,code=sm_30',
 '-gencode=arch=compute_37,code=sm_37',
 '-gencode=arch=compute_52,code=sm_52',
 '-gencode=arch=compute_60,code=sm_60',
10 changes: 6 additions & 4 deletions Python/tigre/Source/POCS_TV.cu
@@ -183,7 +183,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -231,7 +231,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -397,9 +397,11 @@ do { \


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // splits>2 is completely empirical observation
 if (isHostRegisterSupported & splits>2){
 cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
10 changes: 6 additions & 4 deletions Python/tigre/Source/POCS_TV2.cu
@@ -204,7 +204,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -252,7 +252,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -417,9 +417,11 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // splits>2 is completely empirical observation
 if (isHostRegisterSupported & splits>2){
 cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
6 changes: 4 additions & 2 deletions Python/tigre/Source/Siddon_projection.cu
@@ -353,9 +353,11 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & (splits>1 |deviceCount>1)){
6 changes: 4 additions & 2 deletions Python/tigre/Source/ray_interpolated_projection.cu
@@ -295,9 +295,11 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & splits>1){
6 changes: 4 additions & 2 deletions Python/tigre/Source/tvdenoising.cu
@@ -262,9 +262,11 @@ do { \


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 if (isHostRegisterSupported & splits>1){
 cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
 cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
6 changes: 4 additions & 2 deletions Python/tigre/Source/voxel_backprojection.cu
@@ -317,9 +317,11 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa
 cudaCheckErrors("Error");
 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){
7 changes: 5 additions & 2 deletions Python/tigre/Source/voxel_backprojection2.cu
@@ -356,9 +356,12 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
+
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & split_image>1){
6 changes: 4 additions & 2 deletions Python/tigre/Source/voxel_backprojection_parallel.cu
@@ -302,9 +302,11 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re
 }
 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 if (isHostRegisterSupported){
 cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable);
 }
