Merge pull request #243 from gfardell/compute_30
Compute 30 build
AnderBiguri authored Jan 28, 2021
2 parents c658883 + 981afaf commit 28332a3
Showing 17 changed files with 74 additions and 41 deletions.
10 changes: 6 additions & 4 deletions MATLAB/Source/POCS_TV.cu
@@ -183,7 +183,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
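The guard above (and the identical ones in the other reduction hunks below) switches from a device compute-capability test to a CUDA toolkit version test. A plausible reading is that the protected branch finishes the block reduction with warp shuffles, and the *_sync shuffle intrinsics only exist from CUDA 9.0 onwards, so the toolkit version is the relevant compile-time condition. The sketch below illustrates that pattern only; the guarded body in POCS_TV.cu is collapsed in this view, and the function name, fallback branch, and the CUDART_VERSION spelling (the runtime's version macro) are assumptions for illustration.

#include <cuda_runtime.h>

// Sketch: finish a block reduction once only the first warp holds live partial sums.
// Assumes sdata[] holds at least 64 partial sums in shared memory and tid is threadIdx.x.
__device__ float warpReduceTail(volatile float* sdata, int tid)
{
    float mySum = 0.0f;
#if CUDART_VERSION >= 9000
    if (tid < 32)
    {
        mySum = sdata[tid] + sdata[tid + 32];
        // Warp shuffles (the *_sync variants need CUDA 9+) replace shared-memory traffic.
        for (int offset = 16; offset > 0; offset >>= 1)
            mySum += __shfl_down_sync(0xFFFFFFFFu, mySum, offset);
    }
#else
    // Pre-CUDA-9 fallback: classic unrolled volatile shared-memory reduction.
    if (tid < 32)
    {
        sdata[tid] += sdata[tid + 32];
        sdata[tid] += sdata[tid + 16];
        sdata[tid] += sdata[tid + 8];
        sdata[tid] += sdata[tid + 4];
        sdata[tid] += sdata[tid + 2];
        sdata[tid] += sdata[tid + 1];
        mySum = sdata[tid];
    }
#endif
    return mySum; // the full sum is valid in lane 0
}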
@@ -231,7 +231,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -397,9 +397,11 @@ do { \


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // splits>2 is completely empirical observation
 if (isHostRegisterSupported & splits>2){
 cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
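The second recurring change in this commit follows the same logic: the attribute cudaDevAttrHostRegisterSupported only appears in the runtime API from CUDA 9.2, which is presumably why the query is now compiled only when CUDART_VERSION >= 9020 and why isHostRegisterSupported is initialised to 0. On older toolkits the flag simply stays 0 and the code skips page-locking instead of reading an uninitialised value. A minimal sketch of the pattern follows; the function, buffer, and parameter names are illustrative, not TIGRE's.

#include <cuda_runtime.h>
#include <cstddef>

// Sketch: page-lock an existing host buffer only when the device supports
// cudaHostRegister and the transfer is big enough to repay the cost of pinning.
void maybePinHostBuffer(float* buffer, size_t numFloats, bool bigTransfer)
{
    int isHostRegisterSupported = 0;   // stays 0 when the attribute query is compiled out
#if CUDART_VERSION >= 9020
    cudaDeviceGetAttribute(&isHostRegisterSupported, cudaDevAttrHostRegisterSupported, 0);
#endif
    if (isHostRegisterSupported && bigTransfer)
    {
        cudaHostRegister(buffer, numFloats * sizeof(float), cudaHostRegisterPortable);
        // ... asynchronous copies can now overlap with kernel execution ...
        cudaHostUnregister(buffer);    // must be undone before the buffer is freed
    }
}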
10 changes: 6 additions & 4 deletions MATLAB/Source/POCS_TV2.cu
@@ -204,7 +204,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -252,7 +252,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -417,9 +417,11 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // splits>2 is completely empirical observation
 if (isHostRegisterSupported & splits>2){
 cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
6 changes: 4 additions & 2 deletions MATLAB/Source/Siddon_projection.cu
@@ -349,9 +349,11 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & (splits>1 |deviceCount>1)){
6 changes: 4 additions & 2 deletions MATLAB/Source/ray_interpolated_projection.cu
@@ -290,9 +290,11 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & splits>1){
6 changes: 4 additions & 2 deletions MATLAB/Source/tvdenoising.cu
@@ -262,9 +262,11 @@ do { \


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 if (isHostRegisterSupported & splits>1){
 cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
 cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
6 changes: 4 additions & 2 deletions MATLAB/Source/voxel_backprojection.cu
@@ -311,9 +311,11 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa
 cudaCheckErrors("Error");
 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){
6 changes: 4 additions & 2 deletions MATLAB/Source/voxel_backprojection2.cu
@@ -352,9 +352,11 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & split_image>1){
6 changes: 4 additions & 2 deletions MATLAB/Source/voxel_backprojection_parallel.cu
@@ -298,9 +298,11 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re
 }
 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 if (isHostRegisterSupported){
 cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable);
 }
2 changes: 1 addition & 1 deletion Python/setup.py
@@ -16,7 +16,7 @@

 # Code from https://github.com/pytorch/pytorch/blob/master/torch/utils/cpp_extension.py
 COMPUTE_CAPABILITY_ARGS = [ # '-gencode=arch=compute_20,code=sm_20', #deprecated
-#'-gencode=arch=compute_30,code=sm_30',#deprecated
+'-gencode=arch=compute_30,code=sm_30',
 '-gencode=arch=compute_37,code=sm_37',
 '-gencode=arch=compute_52,code=sm_52',
 '-gencode=arch=compute_60,code=sm_60',
10 changes: 6 additions & 4 deletions Python/tigre/Source/POCS_TV.cu
@@ -183,7 +183,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -231,7 +231,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -397,9 +397,11 @@ do { \


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // splits>2 is completely empirical observation
 if (isHostRegisterSupported & splits>2){
 cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
10 changes: 6 additions & 4 deletions Python/tigre/Source/POCS_TV2.cu
@@ -204,7 +204,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -252,7 +252,7 @@ do { \
 __syncthreads();


-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDART_VERSION >= 9000)
 if ( tid < 32 )
 {
 mySum = sdata[tid] + sdata[tid + 32];
@@ -417,9 +417,11 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // splits>2 is completely empirical observation
 if (isHostRegisterSupported & splits>2){
 cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
6 changes: 4 additions & 2 deletions Python/tigre/Source/Siddon_projection.cu
@@ -353,9 +353,11 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & (splits>1 |deviceCount>1)){
6 changes: 4 additions & 2 deletions Python/tigre/Source/ray_interpolated_projection.cu
@@ -295,9 +295,11 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & splits>1){
6 changes: 4 additions & 2 deletions Python/tigre/Source/tvdenoising.cu
@@ -262,9 +262,11 @@ do { \


 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 if (isHostRegisterSupported & splits>1){
 cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
 cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable);
6 changes: 4 additions & 2 deletions Python/tigre/Source/voxel_backprojection.cu
@@ -317,9 +317,11 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa
 cudaCheckErrors("Error");
 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){
7 changes: 5 additions & 2 deletions Python/tigre/Source/voxel_backprojection2.cu
@@ -356,9 +356,12 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float

 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
+
 // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to
 // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big.
 if (isHostRegisterSupported & split_image>1){
6 changes: 4 additions & 2 deletions Python/tigre/Source/voxel_backprojection_parallel.cu
@@ -302,9 +302,11 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re
 }
 //Pagelock memory for synchronous copy.
 // Lets try to make the host memory pinned:
-// We laredy queried the GPU and assuemd they are the same, thus should have the same attributes.
-int isHostRegisterSupported;
+// We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes.
+int isHostRegisterSupported = 0;
+#if CUDART_VERSION >= 9020
 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0);
+#endif
 if (isHostRegisterSupported){
 cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable);
 }
