Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FBP filtering on GPU #423

Draft
wants to merge 10 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
256 changes: 256 additions & 0 deletions Common/CUDA/FbpFiltration.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
/*-------------------------------------------------------------------------
*
* CUDA functions for convolution
*
* Applies the convolution filter in the Fourier space.
* The filter should be given in the Fourier transformed form.
*
* CODE by Tomoyuki SADAKANE
* ---------------------------------------------------------------------------
* ---------------------------------------------------------------------------
* Copyright (c) 2015, University of Bath and CERN- European Organization for
* Nuclear Research
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* ---------------------------------------------------------------------------
*
* Contact: [email protected]
* Codes : https://github.com/CERN/TIGRE
* ---------------------------------------------------------------------------
*/

#include "TIGRE_common.hpp"
#include "FbpFiltration.hpp"
#include <string>

#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
mexPrintf("%s \n",msg);\
mexErrMsgIdAndTxt("apply_filtration",cudaGetErrorString(__err));\
} \
} while (0)

void cudafftCheckError(cufftResult_t fftResult, const std::string& rstrMsg) {
std::string strError = "Unknown error";
if (fftResult == CUFFT_SUCCESS){ return; }
else if (fftResult == CUFFT_INVALID_PLAN ) { strError = "The plan parameter is not a valid handle. Handle is not valid when the plan is locked."; }
else if (fftResult == CUFFT_ALLOC_FAILED ) { strError = "The allocation of GPU resources for the plan failed."; }
else if (fftResult == CUFFT_INVALID_VALUE ) { strError = "One or more invalid parameters were passed to the API."; }
else if (fftResult == CUFFT_INTERNAL_ERROR) { strError = "An internal driver error was detected."; }
else if (fftResult == CUFFT_SETUP_FAILED ) { strError = "The cuFFT library failed to initialize."; }
else if (fftResult == CUFFT_INVALID_SIZE ) { strError = "The nx or batch parameter is not a supported size."; }
mexPrintf("%s \n", rstrMsg.c_str());
mexErrMsgIdAndTxt("ApplyFiltration", strError.c_str());
}

__global__ void ApplyFilter(cufftComplex* pcfInOut, size_t uiULen, size_t uiVLen, float* pfFilter, float fULInv) {

size_t uiU = threadIdx.x + blockIdx.x * blockDim.x;
size_t uiV = threadIdx.y + blockIdx.y * blockDim.y;
if (uiV >= uiVLen || uiU >= uiULen) {
return;
}
pcfInOut[uiU+uiULen*uiV].x *= pfFilter[uiU]*fULInv;
pcfInOut[uiU+uiULen*uiV].y *= pfFilter[uiU]*fULInv;
}

//! Apply filter in the Fourier space
void apply_filtration (const float* pfIn, size_t uiULen, size_t uiVLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids) {
// Prepare for MultiGPU
int deviceCount = gpuids.GetLength();
cudaCheckErrors("Device query fail");
if (deviceCount == 0) {
mexErrMsgIdAndTxt("apply_filtration","There are no available device(s) that support CUDA\n");
}
//
// CODE assumes
// 1.-All available devices are usable by this code
// 2.-All available devices are equal, they are the same machine (warning thrown)
// Check the available devices, and if they are the same
if (!gpuids.AreEqualDevices()) {
mexWarnMsgIdAndTxt("apply_filtration","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed.");
}
// USE THE FIRST GPU ONLY!!!!!!!!!!!!!!!!!
cudaSetDevice(gpuids[0]);

const size_t uiLen = uiULen * uiVLen;
const float fULInv = 1./uiULen;

float* d_pfInOut = nullptr;
cudaMalloc((void **)&d_pfInOut, uiLen * sizeof(float));
cudaCheckErrors("apply_filtration fail cudaMalloc 1");
cudaMemcpy(d_pfInOut, pfIn, uiLen* sizeof(float), cudaMemcpyHostToDevice); // Sync only. pfIn is not pinned.
cudaCheckErrors("apply_filtration fail cudaMemcpy 1");

size_t uiBufferSize = (uiULen+1)/2+1; // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/

cufftHandle cudafftPlanFwd;
cufftHandle cudafftPlanInv;
const int iBatch = uiVLen;
cufftResult_t fftresult;
fftresult = cufftPlan1d(&cudafftPlanFwd, uiULen, CUFFT_R2C, iBatch);
cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 1");
fftresult = cufftPlan1d(&cudafftPlanInv, uiULen, CUFFT_C2R, iBatch);
cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 2");

float* d_pfFilter = nullptr;
cudaMalloc((void **)&d_pfFilter, uiULen * sizeof(float));
cudaCheckErrors("apply_filtration fail cudaMalloc 2");
cudaMemcpy(d_pfFilter, pfFilter, uiULen * sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("apply_filtration fail cudaMemcpy 2");

cufftComplex* d_pcfWork = nullptr;
cudaMalloc((void **)&d_pcfWork, uiBufferSize * uiVLen*sizeof(cufftComplex));
cudaCheckErrors("apply_filtration fail cudaMalloc 3");

{
const int divU = 128;//PIXEL_SIZE_BLOCK;
const int divV = 1;//PIXEL_SIZE_BLOCK;
dim3 grid((uiULen+divU-1)/divU,(uiVLen+divV-1)/divV,1);
dim3 block(divU,divV,1);
cufftSetStream(cudafftPlanFwd, 0);
cufftSetStream(cudafftPlanInv, 0);
fftresult = cufftExecR2C (cudafftPlanFwd, d_pfInOut, d_pcfWork);
cudafftCheckError(fftresult, "apply_filtration fail cufftExecR2C");
ApplyFilter<<<grid, block>>>(d_pcfWork, uiBufferSize, uiVLen, d_pfFilter, fULInv);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiULen
fftresult = cufftExecC2R (cudafftPlanInv, d_pcfWork, d_pfInOut);
cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2R");
}
cudaMemcpy(pfOut, d_pfInOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("apply_filtration fail cudaMemcpy 3");

cudaFree(d_pcfWork); d_pcfWork = nullptr;
cudaFree(d_pfInOut); d_pfInOut = nullptr;
cudaFree(d_pfFilter); d_pfFilter = nullptr;
cufftDestroy(cudafftPlanFwd);
cufftDestroy(cudafftPlanInv);
}


//! Apply filter in the Fourier space
void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatchTotal, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) {
// Prepare for MultiGPU
int deviceCount = gpuids.GetLength();
cudaCheckErrors("Device query fail");
if (deviceCount == 0) {
mexErrMsgIdAndTxt("apply_filtration2","There are no available device(s) that support CUDA\n");
}
//
// CODE assumes
// 1.-All available devices are usable by this code
// 2.-All available devices are equal, they are the same machine (warning thrown)
// Check the available devices, and if they are the same
if (!gpuids.AreEqualDevices()) {
mexWarnMsgIdAndTxt("apply_filtration2","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed.");
}
const float* pfIn = pfInAll+uiOffset;
const float fFLInv = 1./uiFLen;
const size_t uiBufferSize = (uiFLen+1)/2+1; // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/
const size_t uiPaddingLen = (uiFLen-uiULen) / 2;
const size_t uiBatch0 = uiBatchTotal / deviceCount;

float** pd_pfProjWide = (float**)malloc(deviceCount*sizeof(float*));
float** pd_pfFilter = (float**)malloc(deviceCount*sizeof(float*));
cufftComplex** pd_pcfWork = (cufftComplex**)malloc(deviceCount*sizeof(cufftComplex*));

size_t* puiBatch = (size_t*)malloc(deviceCount*sizeof(size_t));
cufftHandle* pcudafftPlanFwd = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle));
cufftHandle* pcudafftPlanInv = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle));

cufftResult_t fftresult;

for (int dev = 0; dev < deviceCount; ++dev) {
pd_pfProjWide[dev] = nullptr;
pd_pfFilter[dev] = nullptr;
pd_pcfWork[dev] = nullptr;
puiBatch[dev] = dev!=deviceCount-1? uiBatch0 : uiBatchTotal - uiBatch0*dev;
size_t uiBatch = puiBatch[dev];

// Allocate extended projection
cudaSetDevice(gpuids[dev]);
cudaCheckErrors("apply_filtration2 fail cudaSetDevice");
cudaMalloc((void**)&pd_pfProjWide[dev], uiFLen*uiBatch*sizeof(float));
cudaCheckErrors("apply_filtration2 fail cudaMalloc wide");
// Fill 0
cudaMemset(pd_pfProjWide[dev], 0, uiFLen*uiBatch*sizeof(float));
cudaCheckErrors("apply_filtration2 fail cudaMemset");
// Insert projection
cudaMemcpy2D(&pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), &pfIn[dev*uiBatch0*uiULen], uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice);
cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 1");

// Filter
cudaMalloc((void **)&pd_pfFilter[dev], uiFLen * sizeof(float));
cudaCheckErrors("apply_filtration2 fail cudaMalloc filter");
cudaMemcpy(pd_pfFilter[dev], pfFilter, uiFLen * sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("apply_filtration2 fail cudaMemcpy 2");

// Work space for FFT.
cudaMalloc((void **)&pd_pcfWork[dev], uiBufferSize * uiBatch*sizeof(cufftComplex));
cudaCheckErrors("apply_filtration2 fail cudaMalloc work");
}
for (int dev = 0; dev < deviceCount; ++dev) {
cudaSetDevice(gpuids[dev]);
const size_t uiBatch = puiBatch[dev];
fftresult = cufftPlan1d(&pcudafftPlanFwd[dev], uiFLen, CUFFT_R2C, uiBatch);
cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 1");
fftresult = cufftPlan1d(&pcudafftPlanInv[dev], uiFLen, CUFFT_C2R, uiBatch);
cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 2");
const int divU = 128;//PIXEL_SIZE_BLOCK;
const int divV = 1;//PIXEL_SIZE_BLOCK;
dim3 grid((uiFLen+divU-1)/divU,(uiBatch+divV-1)/divV,1);
dim3 block(divU,divV,1);
fftresult = cufftExecR2C (pcudafftPlanFwd[dev], pd_pfProjWide[dev], pd_pcfWork[dev]);
cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecR2C");
ApplyFilter<<<grid, block>>>(pd_pcfWork[dev], uiBufferSize, uiBatch, pd_pfFilter[dev], fFLInv*fScale);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiFLen *
fftresult = cufftExecC2R (pcudafftPlanInv[dev], pd_pcfWork[dev], pd_pfProjWide[dev]);
cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecC2R");
}
for (int dev = 0; dev < deviceCount; ++dev) {
cudaSetDevice(gpuids[dev]);
const size_t uiBatch = puiBatch[dev];
cudaMemcpy2D(&pfOut[dev*uiBatch0*uiULen], uiULen*sizeof(float), &pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost);
cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 2");

cudaFree(pd_pcfWork[dev]); pd_pcfWork[dev] = nullptr;
cudaFree(pd_pfProjWide[dev]); pd_pfProjWide[dev] = nullptr;
cudaFree(pd_pfFilter[dev]); pd_pfFilter[dev] = nullptr;
cufftDestroy(pcudafftPlanFwd[dev]);
cufftDestroy(pcudafftPlanInv[dev]);
cudaCheckErrors("apply_filtration2 fail free");
}
cudaDeviceSynchronize();

free(pd_pfProjWide); pd_pfProjWide = nullptr;
free(pd_pfFilter); pd_pfFilter = nullptr;
free(puiBatch); puiBatch = nullptr;
free(pd_pcfWork); pd_pcfWork = nullptr;
free(pcudafftPlanFwd); pcudafftPlanFwd = nullptr;
free(pcudafftPlanInv); pcudafftPlanInv = nullptr;
}
54 changes: 54 additions & 0 deletions Common/CUDA/FbpFiltration.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*-------------------------------------------------------------------------
*
* Header CUDA functions for convolution
*
* Applies the convolution filter in the Fourier space.
* The filter should be given in the Fourier transformed form.
*
* CODE by Tomoyuki SADAKANE
* ---------------------------------------------------------------------------
* ---------------------------------------------------------------------------
* Copyright (c) 2015, University of Bath and CERN- European Organization for
* Nuclear Research
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* ---------------------------------------------------------------------------
*
* Contact: [email protected]
* Codes : https://github.com/CERN/TIGRE
* ---------------------------------------------------------------------------
*/

#include "TIGRE_common.hpp"
#include "GpuIds.hpp"

#include <complex>
#include <cufft.h>

void apply_filtration(const float* pfIn, size_t uiXLen, size_t uiYLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids);
void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatch, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) ;
1 change: 0 additions & 1 deletion MATLAB/Algorithms/FBP.m
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@

%% Weight
%proj=data
proj=permute(proj,[2 1 3]);

%% filter
proj_filt = filtering(proj,geo,angles,parker); % Not sure if offsets are good in here
Expand Down
18 changes: 11 additions & 7 deletions MATLAB/Algorithms/FDK.m
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
% Codes: https://github.com/CERN/TIGRE/
% Coded by: Kyungsang Kim, modified by Ander Biguri, Brandon Nelson
%--------------------------------------------------------------------------
[filter,parker,dowang,gpuids]=parse_inputs(angles,varargin);
[filter,parker,dowang,usegpufft,gpuids]=parse_inputs(angles,varargin);

geo=checkGeo(geo,angles);
geo.filter=filter;
Expand All @@ -59,9 +59,7 @@
end

%% Weight
proj=permute(proj,[2 1 3]);
for ii=1:size(angles,2)

us = ((-geo.nDetector(1)/2+0.5):1:(geo.nDetector(1)/2-0.5))*geo.dDetector(1) + offset(1,ii);
vs = ((-geo.nDetector(2)/2+0.5):1:(geo.nDetector(2)/2-0.5))*geo.dDetector(2) + offset(2,ii);
[uu,vv] = meshgrid(us,vs); % detector
Expand All @@ -70,10 +68,10 @@
w = (geo.DSD(ii))./sqrt((geo.DSD(ii))^2+uu.^2 + vv.^2);

% Multiply the weights with projection data
proj(:,:,ii) = proj(:,:,ii).*w';
proj(:,:,ii) = w.*proj(:,:,ii);
end
%% Fourier transform based filtering
proj = filtering(proj,geo,angles,parker); % Not sure if offsets are good in here
proj = filtering(proj,geo,angles,parker, 'usegpufft', usegpufft); % Not sure if offsets are good in here

% RMFIELD Remove fields from a structure array.
geo=rmfield(geo,'filter');
Expand Down Expand Up @@ -113,9 +111,9 @@
end


function [filter, parker, wang, gpuids]=parse_inputs(angles,argin)
function [filter, parker, wang, usegpufft, gpuids]=parse_inputs(angles,argin)

opts = {'filter','parker','wang','gpuids'};
opts = {'filter','parker','wang','usegpufft','gpuids'};
defaults=ones(length(opts),1);

% Check inputs
Expand Down Expand Up @@ -176,6 +174,12 @@
end
filter=val;
end
case 'usegpufft'
if default
usegpufft=2;
else
usegpufft=val;
end
case 'gpuids'
if default
gpuids = GpuIds();
Expand Down
Loading