From 2e765745f68fc0777019d938668370e83a1e9a58 Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Sun, 8 Jan 2023 18:05:29 +0900 Subject: [PATCH 01/10] Add CUDA implementation of FBP filtering * Added Common files --- Common/CUDA/FbpFiltration.cu | 164 ++++++++++++++++++++++++++++++++++ Common/CUDA/FbpFiltration.hpp | 53 +++++++++++ 2 files changed, 217 insertions(+) create mode 100644 Common/CUDA/FbpFiltration.cu create mode 100644 Common/CUDA/FbpFiltration.hpp diff --git a/Common/CUDA/FbpFiltration.cu b/Common/CUDA/FbpFiltration.cu new file mode 100644 index 00000000..eac45c63 --- /dev/null +++ b/Common/CUDA/FbpFiltration.cu @@ -0,0 +1,164 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for convolution + * + * Applies the convolution filter in the Fourier space. + * The filter should be given in the Fourier transformed form. + * + * CODE by Tomoyuki SADAKANE + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + +#include "TIGRE_common.hpp" +#include "FbpFiltration.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("apply_filtration",cudaGetErrorString(__err));\ + } \ +} while (0) + +void cudafftCheckError(cufftResult_t fftResult, const std::string& rstrMsg) { + std::string strError = "Unknown error"; + if (fftResult == CUFFT_SUCCESS){ return; } + else if (fftResult == CUFFT_INVALID_PLAN ) { strError = "The plan parameter is not a valid handle. Handle is not valid when the plan is locked."; } + else if (fftResult == CUFFT_ALLOC_FAILED ) { strError = "The allocation of GPU resources for the plan failed."; } + else if (fftResult == CUFFT_INVALID_VALUE ) { strError = "One or more invalid parameters were passed to the API."; } + else if (fftResult == CUFFT_INTERNAL_ERROR) { strError = "An internal driver error was detected."; } + else if (fftResult == CUFFT_SETUP_FAILED ) { strError = "The cuFFT library failed to initialize."; } + else if (fftResult == CUFFT_INVALID_SIZE ) { strError = "The nx or batch parameter is not a supported size."; } + mexPrintf("%s \n", rstrMsg.c_str()); + mexErrMsgIdAndTxt("ApplyFiltration", strError.c_str()); +} + +__global__ void ApplyFilter(cufftComplex* pcfInOut, size_t uiULen, size_t uiVLen, float* pfFilter, float fULInv) { + + size_t uiU = threadIdx.x + blockIdx.x * blockDim.x; + size_t uiV = threadIdx.y + blockIdx.y * blockDim.y; + if (uiV >= uiVLen || uiU >= uiULen) { + return; + } + pcfInOut[uiU+uiULen*uiV].x *= pfFilter[uiU]*fULInv; + pcfInOut[uiU+uiULen*uiV].y *= pfFilter[uiU]*fULInv; +} + +//! Apply filter in the Fourier space +void apply_filtration(const float* pfIn, size_t uiULen, size_t uiVLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids){ + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("apply_filtration","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("apply_filtration","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + // USE THE FIRST GPU ONLY!!!!!!!!!!!!!!!!! + cudaSetDevice(gpuids[0]); + + const size_t uiLen = uiULen * uiVLen; + cufftComplex* h_pcfInOut = (cufftComplex*)malloc(uiLen*sizeof(cufftComplex)); + + if (!h_pcfInOut) { + mexErrMsgIdAndTxt("ApplyFiltration", "apply_filtration fail cudaMallocHost 1"); + } + for (int iV = 0; iV < uiVLen; ++iV) { + for (int iU = 0; iU < uiULen; ++iU) { + h_pcfInOut[iU+uiULen*iV] = cufftComplex{pfIn[iU+uiULen*iV], 0}; + } + } + + const float fULInv = 1./uiULen; + + cufftHandle cudafftPlan; + const int iBatch = uiVLen; + cufftResult_t fftresult; + fftresult = cufftPlan1d(&cudafftPlan, uiULen, CUFFT_C2C, iBatch); + cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 1"); + fftresult = cufftPlan1d(&cudafftPlan, uiULen, CUFFT_C2C, iBatch); + cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 2"); + + float* d_pfFilter = nullptr; + cufftComplex* d_pcfInOut = nullptr; + cudaMalloc((void **)&d_pcfInOut, uiLen * sizeof(cufftComplex)); + cudaCheckErrors("apply_filtration fail cudaMalloc 1"); + cudaMalloc((void **)&d_pfFilter, uiULen * sizeof(float)); + cudaCheckErrors("apply_filtration fail cudaMalloc 2"); + cudaMemcpy(d_pcfInOut, h_pcfInOut, uiLen* sizeof(cufftComplex), cudaMemcpyHostToDevice); + cudaCheckErrors("apply_filtration fail cudaMemcpy 1"); + cudaMemcpy(d_pfFilter, pfFilter, uiULen * sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("apply_filtration fail cudaMemcpy 2"); + + { + const int divU = 128;//PIXEL_SIZE_BLOCK; + const int divV = 1;//PIXEL_SIZE_BLOCK; + dim3 grid((uiULen+divU-1)/divU,(uiVLen+divV-1)/divV,1); + dim3 block(divU,divV,1); + cufftSetStream(cudafftPlan, 0); + fftresult = cufftExecC2C (cudafftPlan, d_pcfInOut, d_pcfInOut, CUFFT_FORWARD); + cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2C CUFFT_FORWARD"); + ApplyFilter<<>>(d_pcfInOut, uiULen, uiVLen, d_pfFilter, fULInv);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiULen + fftresult = cufftExecC2C (cudafftPlan, d_pcfInOut, d_pcfInOut, CUFFT_INVERSE); + cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2C CUFFT_INVERSE"); + } + + cudaMemcpy(h_pcfInOut, d_pcfInOut, uiLen*sizeof(cufftComplex), cudaMemcpyDeviceToHost); + cudaCheckErrors("apply_filtration fail cudaMemcpy 3"); + + cudaFree(d_pcfInOut); d_pcfInOut = nullptr; + cudaFree(d_pfFilter); d_pfFilter = nullptr; + + cufftSetStream(cudafftPlan, 0); + cufftDestroy(cudafftPlan); + + for (int iV = 0; iV < uiVLen; ++iV) { + for (int iU = 0; iU < uiULen; ++iU) { + pfOut[iU+uiULen*iV] = h_pcfInOut[iU+uiULen*iV].x; + } + } + free(h_pcfInOut); h_pcfInOut = nullptr; +} diff --git a/Common/CUDA/FbpFiltration.hpp b/Common/CUDA/FbpFiltration.hpp new file mode 100644 index 00000000..88c8eade --- /dev/null +++ b/Common/CUDA/FbpFiltration.hpp @@ -0,0 +1,53 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA functions for convolution + * + * Applies the convolution filter in the Fourier space. + * The filter should be given in the Fourier transformed form. + * + * CODE by Tomoyuki SADAKANE + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + +#include "TIGRE_common.hpp" +#include "GpuIds.hpp" + +#include +#include + +void apply_filtration(const float* pfIn, size_t uiXLen, size_t uiYLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids); From 235c0050f378d497136e5bd21a66642ac929e7cb Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Sun, 8 Jan 2023 20:41:58 +0900 Subject: [PATCH 02/10] Add CUDA implementation of FBP filtering * Modified and added Matlab files --- MATLAB/Algorithms/FDK.m | 14 +- MATLAB/Compile.m | 2 + .../cuda_interface/ApplyFbpFiltration.cpp | 130 ++++++++++++++++++ MATLAB/Utilities/filtering.m | 24 ++-- MATLAB/mex_CUDA_win64_MSV2019.xml | 2 +- MATLAB/mex_CUDA_win64_MVS2013.xml | 2 +- MATLAB/mex_CUDA_win64_MVS2015.xml | 2 +- MATLAB/mex_CUDA_win64_MVS2022.xml | 2 +- 8 files changed, 161 insertions(+), 17 deletions(-) create mode 100644 MATLAB/Utilities/cuda_interface/ApplyFbpFiltration.cpp diff --git a/MATLAB/Algorithms/FDK.m b/MATLAB/Algorithms/FDK.m index 58affe95..5533f105 100644 --- a/MATLAB/Algorithms/FDK.m +++ b/MATLAB/Algorithms/FDK.m @@ -38,7 +38,7 @@ % Codes: https://github.com/CERN/TIGRE/ % Coded by: Kyungsang Kim, modified by Ander Biguri, Brandon Nelson %-------------------------------------------------------------------------- -[filter,parker,dowang,gpuids]=parse_inputs(angles,varargin); +[filter,parker,dowang,usegpufft,gpuids]=parse_inputs(angles,varargin); geo=checkGeo(geo,angles); geo.filter=filter; @@ -73,7 +73,7 @@ proj(:,:,ii) = proj(:,:,ii).*w'; end %% Fourier transform based filtering -proj = filtering(proj,geo,angles,parker); % Not sure if offsets are good in here +proj = filtering(proj,geo,angles,parker,usegpufft); % Not sure if offsets are good in here % RMFIELD Remove fields from a structure array. geo=rmfield(geo,'filter'); @@ -113,9 +113,9 @@ end -function [filter, parker, wang, gpuids]=parse_inputs(angles,argin) +function [filter, parker, wang, usegpufft, gpuids]=parse_inputs(angles,argin) -opts = {'filter','parker','wang','gpuids'}; +opts = {'filter','parker','wang','usegpufft','gpuids'}; defaults=ones(length(opts),1); % Check inputs @@ -176,6 +176,12 @@ end filter=val; end + case 'usegpufft' + if default + usegpufft=true; + else + usegpufft=val; + end case 'gpuids' if default gpuids = GpuIds(); diff --git a/MATLAB/Compile.m b/MATLAB/Compile.m index fdb45b07..2f72e465 100644 --- a/MATLAB/Compile.m +++ b/MATLAB/Compile.m @@ -97,6 +97,7 @@ mex('-largeArrayDims', './Utilities/cuda_interface/AwminTV.cpp', '../Common/CUDA/POCS_TV2.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/cuda_interface/tvDenoise.cpp', '../Common/CUDA/tvdenoising.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/cuda_interface/AddNoise.cpp', '../Common/CUDA/RandomNumberGenerator.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) + mex('-largeArrayDims', './Utilities/cuda_interface/ApplyFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/IO/VarianCBCT/mexReadXim.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/GPU/getGpuName_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/GPU/getGpuCount_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) @@ -109,6 +110,7 @@ mex( './Utilities/cuda_interface/AwminTV.cpp', '../Common/CUDA/POCS_TV2.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex( './Utilities/cuda_interface/tvDenoise.cpp', '../Common/CUDA/tvdenoising.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex( './Utilities/cuda_interface/AddNoise.cpp', '../Common/CUDA/RandomNumberGenerator.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) + mex( './Utilities/cuda_interface/ApplyFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex( './Utilities/IO/VarianCBCT/mexReadXim.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex( './Utilities/GPU/getGpuName_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('./Utilities/GPU/getGpuCount_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) diff --git a/MATLAB/Utilities/cuda_interface/ApplyFbpFiltration.cpp b/MATLAB/Utilities/cuda_interface/ApplyFbpFiltration.cpp new file mode 100644 index 00000000..ef9cc8e6 --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/ApplyFbpFiltration.cpp @@ -0,0 +1,130 @@ +/*------------------------------------------------------------------------- + * + * MATLAB MEX functions for convolution. Check inputs and parses + * MATLAB data to C++ data. + * + * + * CODE by Tomoyuki SADAKANE + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +/** + * MEX gateway + * ApplyFbpFiltration(mat2dProj, mat1dFlt, "gpuids", gpuids); + */ + +void mexFunction(int nlhs, mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + GpuIds gpuids; + if (nrhs==4) { + size_t iM = mxGetM(prhs[3]); + if (iM != 1) { + mexErrMsgIdAndTxt( "TIGRE:MEX:ApplyFbpFiltration:unknown","5th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[3]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "TIGRE:MEX:ApplyFbpFiltration:unknown","5th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[3]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs < 2) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "At least two input argumet required."); + } else if (nrhs==2 || nrhs==4){ + size_t mrows = mxGetM(prhs[0]); + size_t ncols = mxGetN(prhs[0]); + if (mrows ==1 && ncols ==1) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "1st parameter should not be 1x1"); + } + mrows = mxGetM(prhs[1]); + ncols = mxGetN(prhs[1]); + if (mrows==1 || ncols !=1) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "2nd parameter should be Mx1 (M>1)"); + } + } else if (nrhs>4) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "Too many imput argumets"); + } + /////////////// First input argumet. Projection + mxArray const * const proj = prhs[0]; + float* pfProj = static_cast(mxGetData(proj)); + const mwSize numDimsProj = mxGetNumberOfDimensions(proj); // get dim of proj + const mwSize *size_proj= mxGetDimensions(proj); //get size of proj + // printf("numDimsProj = %d\n", numDimsProj); + // for (int iI = 0; iI < numDimsProj; ++iI) { + // printf("size_proj[%d] = %d\n", iI, size_proj[iI]); + // } + /////////////// Second input argumet. Filter + const mxArray* filter = prhs[1]; + float* pfFilter = static_cast(mxGetData(filter)); + const mwSize numDimsFilter = mxGetNumberOfDimensions(filter); // get dim of filter + const mwSize *size_filter= mxGetDimensions(filter); //get size of filter + // printf("numDimsFilter = %d\n", numDimsFilter); + // for (int iI = 0; iI < numDimsFilter; ++iI) { + // printf("size_filter[%d] = %d\n", iI, size_filter[iI]); + // } + if (size_filter[0] != size_proj[0]) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "Width of projection must be equal to filter."); + } + ////////////// + //prepare outputs + // Allocate output projection + plhs[0] = mxCreateNumericArray(numDimsProj, size_proj, mxSINGLE_CLASS, mxREAL); + float *pfProjOut =(float*) mxGetPr(plhs[0]); + // call CUDA filtering + apply_filtration(pfProj, size_proj[0], size_proj[1], pfFilter, pfProjOut, gpuids); +} diff --git a/MATLAB/Utilities/filtering.m b/MATLAB/Utilities/filtering.m index 8974e875..54cb2726 100644 --- a/MATLAB/Utilities/filtering.m +++ b/MATLAB/Utilities/filtering.m @@ -1,4 +1,4 @@ -function [ proj ] = filtering(proj,geo,angles,parker) +function [ proj ] = filtering(proj,geo,angles,parker,usegpu) %FILTERING Summary of this function goes here % Detailed explanation goes here %-------------------------------------------------------------------------- @@ -30,19 +30,25 @@ d = 1; % cut off (0~1) [filt] = Filter(geo.filter, ramp_kernel, filt_len, d); -filt = repmat(filt',[1 geo.nDetector(2)]); - +if usegpu + filt = filt'; + gpuids = GpuIds(); +else + filt = repmat(filt',[1 geo.nDetector(2)]); +end for ii=1:size(angles,2) fproj = (zeros(filt_len,geo.nDetector(2),'single')); fproj(round(filt_len/2-geo.nDetector(1)/2+1):round(filt_len/2+geo.nDetector(1)/2),:) = proj(:,:,ii); - - fproj = fft(fproj); - - fproj = fproj.*filt; - - fproj = (real(ifft(fproj))); + + if usegpu + fproj = ApplyFbpFiltration(fproj, filt, gpuids); + else + fproj = fft(fproj); + fproj = fproj.*filt; + fproj = (real(ifft(fproj))); + end if parker proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/ (pi/angle_step) )/2*(geo.DSD(ii)/geo.DSO(ii)); diff --git a/MATLAB/mex_CUDA_win64_MSV2019.xml b/MATLAB/mex_CUDA_win64_MSV2019.xml index 5f2f0a12..892262c7 100644 --- a/MATLAB/mex_CUDA_win64_MSV2019.xml +++ b/MATLAB/mex_CUDA_win64_MSV2019.xml @@ -44,7 +44,7 @@ LINKFLAGS="/nologo /manifest" LINKTYPE="/DLL " LINKEXPORT="/EXPORT:mexFunction" - LINKLIBS="/LIBPATH:"$MATLABROOT\extern\lib\$ARCH\microsoft" libmx.lib libmex.lib libmat.lib cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib" + LINKLIBS="/LIBPATH:"$MATLABROOT\extern\lib\$ARCH\microsoft" libmx.lib libmex.lib libmat.lib cudart.lib cufft.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib" LINKDEBUGFLAGS="/debug /PDB:"$TEMPNAME$LDEXT.pdb"" LINKOPTIMFLAGS="" OBJEXT=".obj" diff --git a/MATLAB/mex_CUDA_win64_MVS2013.xml b/MATLAB/mex_CUDA_win64_MVS2013.xml index e6620936..91d9f15d 100644 --- a/MATLAB/mex_CUDA_win64_MVS2013.xml +++ b/MATLAB/mex_CUDA_win64_MVS2013.xml @@ -46,7 +46,7 @@ LINKFLAGS="/nologo /manifest" LINKTYPE="/DLL " LINKEXPORT="/EXPORT:mexFunction" - LINKLIBS="/LIBPATH:"$MATLABROOT\extern\lib\$ARCH\microsoft" libmx.lib libmex.lib libmat.lib gpu.lib cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib" + LINKLIBS="/LIBPATH:"$MATLABROOT\extern\lib\$ARCH\microsoft" libmx.lib libmex.lib libmat.lib gpu.lib cudart.lib cufft.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib" LINKDEBUGFLAGS="/debug /PDB:"$TEMPNAME$LDEXT.pdb"" LINKOPTIMFLAGS="" OBJEXT=".obj" diff --git a/MATLAB/mex_CUDA_win64_MVS2015.xml b/MATLAB/mex_CUDA_win64_MVS2015.xml index 0eda60bc..430efcf0 100644 --- a/MATLAB/mex_CUDA_win64_MVS2015.xml +++ b/MATLAB/mex_CUDA_win64_MVS2015.xml @@ -44,7 +44,7 @@ LINKFLAGS="/nologo /manifest" LINKTYPE="/DLL " LINKEXPORT="/EXPORT:mexFunction" - LINKLIBS="/LIBPATH:"$MATLABROOT\extern\lib\$ARCH\microsoft" libmx.lib libmex.lib libmat.lib cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib" + LINKLIBS="/LIBPATH:"$MATLABROOT\extern\lib\$ARCH\microsoft" libmx.lib libmex.lib libmat.lib cudart.lib cufft.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib" LINKDEBUGFLAGS="/debug /PDB:"$TEMPNAME$LDEXT.pdb"" LINKOPTIMFLAGS="" OBJEXT=".obj" diff --git a/MATLAB/mex_CUDA_win64_MVS2022.xml b/MATLAB/mex_CUDA_win64_MVS2022.xml index cc140d68..705e18a0 100644 --- a/MATLAB/mex_CUDA_win64_MVS2022.xml +++ b/MATLAB/mex_CUDA_win64_MVS2022.xml @@ -44,7 +44,7 @@ LINKFLAGS="/nologo /manifest" LINKTYPE="/DLL " LINKEXPORT="/EXPORT:mexFunction" - LINKLIBS="/LIBPATH:"$MATLABROOT\extern\lib\$ARCH\microsoft" libmx.lib libmex.lib libmat.lib cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib" + LINKLIBS="/LIBPATH:"$MATLABROOT\extern\lib\$ARCH\microsoft" libmx.lib libmex.lib libmat.lib cudart.lib cufft.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib" LINKDEBUGFLAGS="/debug /PDB:"$TEMPNAME$LDEXT.pdb"" LINKOPTIMFLAGS="" OBJEXT=".obj" From 249c7a7ec0ad8813f4f51444dad4fdc310210123 Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Sun, 8 Jan 2023 20:30:06 +0900 Subject: [PATCH 03/10] Add CUDA implementation of FBP filtering * Modified and added Python files --- Python/setup.py | 22 ++++++- .../algorithms/single_pass_algorithms.py | 5 +- .../cuda_interface/_FbpFiltration.pyx | 57 +++++++++++++++++ Python/tigre/utilities/filtering.py | 61 +++++++++++-------- 4 files changed, 118 insertions(+), 27 deletions(-) create mode 100644 Python/tigre/utilities/cuda_interface/_FbpFiltration.pyx diff --git a/Python/setup.py b/Python/setup.py index ad35b470..e61ac519 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -509,6 +509,26 @@ def include_headers(filename_list, sdist=False): ) +FbpFiltration_ext = Extension( + "_FbpFiltration", + sources=include_headers( + [ + "../Common/CUDA/TIGRE_common.cpp", + "../Common/CUDA/FbpFiltration.cu", + "../Common/CUDA/GpuIds.cpp", + "../Common/CUDA/gpuUtils.cu", + "tigre/utilities/cuda_interface/_FbpFiltration.pyx", + ], + sdist=sys.argv[1] == "sdist", + ), + define_macros=[("IS_FOR_PYTIGRE", None)], + library_dirs=[CUDA["lib64"]], + libraries=["cudart", "cufft"], + language="c++", + runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, + include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], +) + setup( name="pytigre", version="2.4.0", @@ -516,7 +536,7 @@ def include_headers(filename_list, sdist=False): packages=find_packages(), include_package_data=True, data_files=[("data", ["../Common/data/head.mat"])], - ext_modules=[Ax_ext, Atb_ext, tvdenoising_ext, minTV_ext, AwminTV_ext, gpuUtils_ext, RandomNumberGenerator_ext], + ext_modules=[Ax_ext, Atb_ext, tvdenoising_ext, minTV_ext, AwminTV_ext, gpuUtils_ext, RandomNumberGenerator_ext, FbpFiltration_ext], py_modules=["tigre.py"], cmdclass={"build_ext": BuildExtension}, install_requires=["Cython", "matplotlib", "numpy", "scipy", "tqdm"], diff --git a/Python/tigre/algorithms/single_pass_algorithms.py b/Python/tigre/algorithms/single_pass_algorithms.py index 3fac0621..b15030fe 100644 --- a/Python/tigre/algorithms/single_pass_algorithms.py +++ b/Python/tigre/algorithms/single_pass_algorithms.py @@ -73,6 +73,9 @@ def FDK(proj, geo, angles, **kwargs): verbose = kwargs["verbose"] if "verbose" in kwargs else False gpuids = kwargs["gpuids"] if "gpuids" in kwargs else None + + use_gpu_for_filtration = kwargs["use_gpu_for_filtration"] if "use_gpu_for_filtration" in kwargs else True + geo = copy.deepcopy(geo) geo.check_geo(angles) geo.checknans() @@ -88,7 +91,7 @@ def FDK(proj, geo, angles, **kwargs): w = geo.DSD[0] / np.sqrt((geo.DSD[0] ** 2 + xx ** 2 + yy ** 2)) np.multiply(proj, w, out=proj_filt) - proj_filt = filtering(proj_filt, geo, angles, parker=False, verbose=verbose) + proj_filt = filtering(proj_filt, geo, angles, parker=False, verbose=verbose, use_gpu=use_gpu_for_filtration, gpuids=gpuids) return Atb(proj_filt, geo, geo.angles, "FDK", gpuids=gpuids) diff --git a/Python/tigre/utilities/cuda_interface/_FbpFiltration.pyx b/Python/tigre/utilities/cuda_interface/_FbpFiltration.pyx new file mode 100644 index 00000000..6705b219 --- /dev/null +++ b/Python/tigre/utilities/cuda_interface/_FbpFiltration.pyx @@ -0,0 +1,57 @@ +# This file is part of the TIGRE Toolbox + +# Copyright (c) 2015, University of Bath and +# CERN-European Organization for Nuclear Research +# All rights reserved. + +# License: Open Source under BSD. +# See the full license at +# https://github.com/CERN/TIGRE/license.txt + +# Contact: tigre.toolbox@gmail.com +# Codes: https://github.com/CERN/TIGRE/ +# -------------------------------------------------------------------------- +# Coded by: Tomoyuki SADAKANE + +cimport numpy as np +import numpy as np +from tigre.utilities.errors import TigreCudaCallError +from tigre.utilities.cuda_interface._gpuUtils cimport GpuIds as c_GpuIds, convert_to_c_gpuids, free_c_gpuids + +np.import_array() + +from libc.stdlib cimport malloc, free + +cdef extern from "numpy/arrayobject.h": + void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) + void PyArray_CLEARFLAGS(np.ndarray arr, int flags) + +cdef extern from "FbpFiltration.hpp": + cdef void apply_filtration(const float* pf32In, size_t uiXLen, size_t uiYLen, float* pf32Filter, float* pfOut, const c_GpuIds gpuids) + +def cuda_raise_errors(error_code): + if error_code: + raise TigreCudaCallError('FFT::', error_code) + +def apply(np.ndarray[np.float32_t, ndim=2] src, np.ndarray[np.float32_t, ndim=1] flt, gpuids=None): + cdef c_GpuIds* c_gpuids = convert_to_c_gpuids(gpuids) + if not c_gpuids: + raise MemoryError() + + cdef np.npy_intp size_img[2] + size_img[0]= src.shape[0] + size_img[1]= src.shape[1] + + cdef float* c_imgout = malloc(size_img[0] *size_img[1]* sizeof(float)) + + src = np.ascontiguousarray(src) + flt = np.ascontiguousarray(flt) + + cdef float* c_src = src.data + cdef float* c_flt = flt.data + cdef size_t uiXLen = size_img[1] + cdef size_t uiYLen = size_img[0] + cuda_raise_errors(apply_filtration(c_src, uiXLen, uiYLen, c_flt, c_imgout, c_gpuids[0])) + imgout = np.PyArray_SimpleNewFromData(2, size_img, np.NPY_FLOAT32, c_imgout) + PyArray_ENABLEFLAGS(imgout, np.NPY_OWNDATA) + return imgout diff --git a/Python/tigre/utilities/filtering.py b/Python/tigre/utilities/filtering.py index f7aedc17..38256f30 100644 --- a/Python/tigre/utilities/filtering.py +++ b/Python/tigre/utilities/filtering.py @@ -9,10 +9,11 @@ import numpy as np from tigre.utilities.parkerweight import parkerweight +import _FbpFiltration as fbpfiltration # TODO: Fix parker -def filtering(proj, geo, angles, parker, verbose=False): +def filtering(proj, geo, angles, parker, verbose=False, use_gpu=True, gpuids=None): if parker: proj=parkerweight(proj.transpose(0,2,1),geo,angles,parker).transpose(0,2,1) @@ -21,34 +22,44 @@ def filtering(proj, geo, angles, parker, verbose=False): d=1 filt=filter(geo.filter,ramp_kernel[0],filt_len,d,verbose=verbose) - filt=np.kron(np.ones((np.int64(geo.nDetector[0]),1),dtype=np.float32),filt) padding = int((filt_len-geo.nDetector[1])//2 ) scale_factor = (geo.DSD[0]/geo.DSO[0]) * (2 * np.pi/ len(angles)) / ( 4 * geo.dDetector[0] ) + if use_gpu: + fproj=np.empty((geo.nDetector[0],filt_len),dtype=np.float32) + for idx_proj in range(0,angles.shape[0]): + fproj.fill(0) + fproj[:,padding:padding+geo.nDetector[1]]=proj[idx_proj] - #filter 2 projection at a time packing in to complex container - fproj=np.empty((geo.nDetector[0],filt_len),dtype=np.complex64) - for i in range(0,angles.shape[0]-1,2): - fproj.fill(0) - fproj.real[:,padding:padding+geo.nDetector[1]]=proj[i] - fproj.imag[:,padding:padding+geo.nDetector[1]]=proj[i+1] - - fproj=fft(fproj,axis=1) - fproj=fproj*filt - fproj=ifft(fproj,axis=1) - - proj[i]=fproj.real[:,padding:padding+geo.nDetector[1]] * scale_factor - proj[i+1]=fproj.imag[:,padding:padding+geo.nDetector[1]] * scale_factor - - #if odd number of projections filter last solo - if angles.shape[0] % 2: - fproj.fill(0) - fproj.real[:,padding:padding+geo.nDetector[1]]=proj[angles.shape[0]-1] - - fproj=fft(fproj,axis=1) - fproj=fproj*filt - fproj=np.real(ifft(fproj,axis=1)) - proj[angles.shape[0]-1]=fproj[:,padding:padding+geo.nDetector[1]] * scale_factor + fproj = fbpfiltration.apply(fproj, filt, gpuids) + + proj[idx_proj]=fproj[:,padding:padding+geo.nDetector[1]] * scale_factor + else: + filt=np.kron(np.ones((np.int64(geo.nDetector[0]),1),dtype=np.float32),filt) + + #filter 2 projection at a time packing in to complex container + fproj=np.empty((geo.nDetector[0],filt_len),dtype=np.complex64) + for i in range(0,angles.shape[0]-1,2): + fproj.fill(0) + fproj.real[:,padding:padding+geo.nDetector[1]]=proj[i] + fproj.imag[:,padding:padding+geo.nDetector[1]]=proj[i+1] + + fproj=fft(fproj,axis=1) + fproj=fproj*filt + fproj=ifft(fproj,axis=1) + + proj[i]=fproj.real[:,padding:padding+geo.nDetector[1]] * scale_factor + proj[i+1]=fproj.imag[:,padding:padding+geo.nDetector[1]] * scale_factor + + #if odd number of projections filter last solo + if angles.shape[0] % 2: + fproj.fill(0) + fproj.real[:,padding:padding+geo.nDetector[1]]=proj[angles.shape[0]-1] + + fproj=fft(fproj,axis=1) + fproj=fproj*filt + fproj=np.real(ifft(fproj,axis=1)) + proj[angles.shape[0]-1]=fproj[:,padding:padding+geo.nDetector[1]] * scale_factor return proj From 3e9bc036862f3891601122b1aaecad4f8d1b701e Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Mon, 9 Jan 2023 01:01:54 +0900 Subject: [PATCH 04/10] Optimise FbpFiltration.cu * Use cufftExecR2C and cufftExecC2R to omit slow copy. --- Common/CUDA/FbpFiltration.cu | 68 +++++++++++++++--------------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/Common/CUDA/FbpFiltration.cu b/Common/CUDA/FbpFiltration.cu index eac45c63..116ca69f 100644 --- a/Common/CUDA/FbpFiltration.cu +++ b/Common/CUDA/FbpFiltration.cu @@ -82,7 +82,7 @@ __global__ void ApplyFilter(cufftComplex* pcfInOut, size_t uiULen, size_t uiVLen } //! Apply filter in the Fourier space -void apply_filtration(const float* pfIn, size_t uiULen, size_t uiVLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids){ +void apply_filtration (const float* pfIn, size_t uiULen, size_t uiVLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids) { // Prepare for MultiGPU int deviceCount = gpuids.GetLength(); cudaCheckErrors("Device query fail"); @@ -101,64 +101,54 @@ void apply_filtration(const float* pfIn, size_t uiULen, size_t uiVLen, const flo cudaSetDevice(gpuids[0]); const size_t uiLen = uiULen * uiVLen; - cufftComplex* h_pcfInOut = (cufftComplex*)malloc(uiLen*sizeof(cufftComplex)); + const float fULInv = 1./uiULen; - if (!h_pcfInOut) { - mexErrMsgIdAndTxt("ApplyFiltration", "apply_filtration fail cudaMallocHost 1"); - } - for (int iV = 0; iV < uiVLen; ++iV) { - for (int iU = 0; iU < uiULen; ++iU) { - h_pcfInOut[iU+uiULen*iV] = cufftComplex{pfIn[iU+uiULen*iV], 0}; - } - } + float* d_pfInOut = nullptr; + cudaMalloc((void **)&d_pfInOut, uiLen * sizeof(float)); + cudaCheckErrors("apply_filtration fail cudaMalloc 1"); + cudaMemcpy(d_pfInOut, pfIn, uiLen* sizeof(float), cudaMemcpyHostToDevice); // Sync only. pfIn is not pinned. + cudaCheckErrors("apply_filtration fail cudaMemcpy 1"); - const float fULInv = 1./uiULen; + size_t uiBufferSize = (uiULen+1)/2+1; // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/ - cufftHandle cudafftPlan; + cufftHandle cudafftPlanFwd; + cufftHandle cudafftPlanInv; const int iBatch = uiVLen; cufftResult_t fftresult; - fftresult = cufftPlan1d(&cudafftPlan, uiULen, CUFFT_C2C, iBatch); + fftresult = cufftPlan1d(&cudafftPlanFwd, uiULen, CUFFT_R2C, iBatch); cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 1"); - fftresult = cufftPlan1d(&cudafftPlan, uiULen, CUFFT_C2C, iBatch); + fftresult = cufftPlan1d(&cudafftPlanInv, uiULen, CUFFT_C2R, iBatch); cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 2"); - float* d_pfFilter = nullptr; - cufftComplex* d_pcfInOut = nullptr; - cudaMalloc((void **)&d_pcfInOut, uiLen * sizeof(cufftComplex)); - cudaCheckErrors("apply_filtration fail cudaMalloc 1"); + float* d_pfFilter = nullptr; cudaMalloc((void **)&d_pfFilter, uiULen * sizeof(float)); cudaCheckErrors("apply_filtration fail cudaMalloc 2"); - cudaMemcpy(d_pcfInOut, h_pcfInOut, uiLen* sizeof(cufftComplex), cudaMemcpyHostToDevice); - cudaCheckErrors("apply_filtration fail cudaMemcpy 1"); cudaMemcpy(d_pfFilter, pfFilter, uiULen * sizeof(float), cudaMemcpyHostToDevice); cudaCheckErrors("apply_filtration fail cudaMemcpy 2"); + cufftComplex* d_pcfWork = nullptr; + cudaMalloc((void **)&d_pcfWork, uiBufferSize * uiVLen*sizeof(cufftComplex)); + cudaCheckErrors("apply_filtration fail cudaMalloc 3"); + { const int divU = 128;//PIXEL_SIZE_BLOCK; const int divV = 1;//PIXEL_SIZE_BLOCK; dim3 grid((uiULen+divU-1)/divU,(uiVLen+divV-1)/divV,1); dim3 block(divU,divV,1); - cufftSetStream(cudafftPlan, 0); - fftresult = cufftExecC2C (cudafftPlan, d_pcfInOut, d_pcfInOut, CUFFT_FORWARD); - cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2C CUFFT_FORWARD"); - ApplyFilter<<>>(d_pcfInOut, uiULen, uiVLen, d_pfFilter, fULInv);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiULen - fftresult = cufftExecC2C (cudafftPlan, d_pcfInOut, d_pcfInOut, CUFFT_INVERSE); - cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2C CUFFT_INVERSE"); + cufftSetStream(cudafftPlanFwd, 0); + cufftSetStream(cudafftPlanInv, 0); + fftresult = cufftExecR2C (cudafftPlanFwd, d_pfInOut, d_pcfWork); + cudafftCheckError(fftresult, "apply_filtration fail cufftExecR2C"); + ApplyFilter<<>>(d_pcfWork, uiBufferSize, uiVLen, d_pfFilter, fULInv);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiULen + fftresult = cufftExecC2R (cudafftPlanInv, d_pcfWork, d_pfInOut); + cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2R"); } - - cudaMemcpy(h_pcfInOut, d_pcfInOut, uiLen*sizeof(cufftComplex), cudaMemcpyDeviceToHost); + cudaMemcpy(pfOut, d_pfInOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); cudaCheckErrors("apply_filtration fail cudaMemcpy 3"); - cudaFree(d_pcfInOut); d_pcfInOut = nullptr; + cudaFree(d_pcfWork); d_pcfWork = nullptr; + cudaFree(d_pfInOut); d_pfInOut = nullptr; cudaFree(d_pfFilter); d_pfFilter = nullptr; - - cufftSetStream(cudafftPlan, 0); - cufftDestroy(cudafftPlan); - - for (int iV = 0; iV < uiVLen; ++iV) { - for (int iU = 0; iU < uiULen; ++iU) { - pfOut[iU+uiULen*iV] = h_pcfInOut[iU+uiULen*iV].x; - } - } - free(h_pcfInOut); h_pcfInOut = nullptr; + cufftDestroy(cudafftPlanFwd); + cufftDestroy(cudafftPlanInv); } From c0245a4f31eae54b33721026a5740a0961d01a78 Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Mon, 16 Jan 2023 15:12:46 +0900 Subject: [PATCH 05/10] Add another imlementation for CUDA FBP filtration * Padding and FBP filtration are applied in `apply_filtration2` function --- Common/CUDA/FbpFiltration.cu | 75 +++++++++++++++++++++++++++++++++++ Common/CUDA/FbpFiltration.hpp | 1 + 2 files changed, 76 insertions(+) diff --git a/Common/CUDA/FbpFiltration.cu b/Common/CUDA/FbpFiltration.cu index 116ca69f..dfd513b1 100644 --- a/Common/CUDA/FbpFiltration.cu +++ b/Common/CUDA/FbpFiltration.cu @@ -152,3 +152,78 @@ void apply_filtration (const float* pfIn, size_t uiULen, size_t uiVLen, const fl cufftDestroy(cudafftPlanFwd); cufftDestroy(cudafftPlanInv); } + + +//! Apply filter in the Fourier space +void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatch, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) { + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("apply_filtration","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("apply_filtration","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + // USING THE FIRST GPU ONLY + const float* pfIn = pfInAll+uiOffset; + cudaSetDevice(gpuids[0]); + cudaCheckErrors("apply_filtration fail cudaSetDevice"); + size_t uiPaddingLen = (uiFLen-uiULen) / 2; + float* d_pfProjWide = nullptr; + cudaMalloc((void**)&d_pfProjWide, uiFLen*uiBatch*sizeof(float)); + cudaCheckErrors("apply_filtration fail cudaMalloc wide"); + cudaMemset(d_pfProjWide, 0, uiFLen*uiBatch*sizeof(float)); + cudaCheckErrors("apply_filtration fail cudaMemset"); + cudaMemcpy2D(&d_pfProjWide[uiPaddingLen], uiFLen*sizeof(float), pfIn, uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice); + cudaCheckErrors("apply_filtration fail cudaMemcpy2D"); + + const float fFLInv = 1./uiFLen; + + size_t uiBufferSize = (uiFLen+1)/2+1; // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/ + + cufftHandle cudafftPlanFwd; + cufftHandle cudafftPlanInv; + cufftResult_t fftresult; + fftresult = cufftPlan1d(&cudafftPlanFwd, uiFLen, CUFFT_R2C, uiBatch); + cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 1"); + fftresult = cufftPlan1d(&cudafftPlanInv, uiFLen, CUFFT_C2R, uiBatch); + cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 2"); + + float* d_pfFilter = nullptr; + cudaMalloc((void **)&d_pfFilter, uiFLen * sizeof(float)); + cudaCheckErrors("apply_filtration fail cudaMalloc 2"); + cudaMemcpy(d_pfFilter, pfFilter, uiFLen * sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("apply_filtration fail cudaMemcpy 2"); + + cufftComplex* d_pcfWork = nullptr; + cudaMalloc((void **)&d_pcfWork, uiBufferSize * uiBatch*sizeof(cufftComplex)); + cudaCheckErrors("apply_filtration fail cudaMalloc 3"); + + { + const int divU = 128;//PIXEL_SIZE_BLOCK; + const int divV = 1;//PIXEL_SIZE_BLOCK; + dim3 grid((uiFLen+divU-1)/divU,(uiBatch+divV-1)/divV,1); + dim3 block(divU,divV,1); + cufftSetStream(cudafftPlanFwd, 0); + cufftSetStream(cudafftPlanInv, 0); + fftresult = cufftExecR2C (cudafftPlanFwd, d_pfProjWide, d_pcfWork); + cudafftCheckError(fftresult, "apply_filtration fail cufftExecR2C"); + ApplyFilter<<>>(d_pcfWork, uiBufferSize, uiBatch, d_pfFilter, fFLInv*fScale);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiFLen * + fftresult = cufftExecC2R (cudafftPlanInv, d_pcfWork, d_pfProjWide); + cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2R"); + } + cudaMemcpy2D(pfOut, uiULen*sizeof(float), &d_pfProjWide[uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost); + cudaCheckErrors("apply_filtration fail cudaMemcpy 3"); + + cudaFree(d_pcfWork); d_pcfWork = nullptr; + cudaFree(d_pfProjWide); d_pfProjWide = nullptr; + cudaFree(d_pfFilter); d_pfFilter = nullptr; + cufftDestroy(cudafftPlanFwd); + cufftDestroy(cudafftPlanInv); +} \ No newline at end of file diff --git a/Common/CUDA/FbpFiltration.hpp b/Common/CUDA/FbpFiltration.hpp index 88c8eade..ab8fd792 100644 --- a/Common/CUDA/FbpFiltration.hpp +++ b/Common/CUDA/FbpFiltration.hpp @@ -51,3 +51,4 @@ #include void apply_filtration(const float* pfIn, size_t uiXLen, size_t uiYLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids); +void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatch, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) ; From 054de4496b5a79cc819f7a28425eceb11c37df19 Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Mon, 16 Jan 2023 15:33:50 +0900 Subject: [PATCH 06/10] Add another imlementation for CUDA FBP filtration * Modified matlab files --- MATLAB/Algorithms/FDK.m | 4 +- MATLAB/Compile.m | 2 + .../ApplyPaddingAndFbpFiltration.cpp | 136 ++++++++++++++++++ MATLAB/Utilities/filtering.m | 112 ++++++++++++--- 4 files changed, 235 insertions(+), 19 deletions(-) create mode 100644 MATLAB/Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp diff --git a/MATLAB/Algorithms/FDK.m b/MATLAB/Algorithms/FDK.m index 5533f105..0aa8b8d5 100644 --- a/MATLAB/Algorithms/FDK.m +++ b/MATLAB/Algorithms/FDK.m @@ -73,7 +73,7 @@ proj(:,:,ii) = proj(:,:,ii).*w'; end %% Fourier transform based filtering -proj = filtering(proj,geo,angles,parker,usegpufft); % Not sure if offsets are good in here +proj = filtering(proj,geo,angles,parker, 'usegpufft', usegpufft, 'gpuids', gpuids); % Not sure if offsets are good in here % RMFIELD Remove fields from a structure array. geo=rmfield(geo,'filter'); @@ -178,7 +178,7 @@ end case 'usegpufft' if default - usegpufft=true; + usegpufft=2; else usegpufft=val; end diff --git a/MATLAB/Compile.m b/MATLAB/Compile.m index 2f72e465..c45686cb 100644 --- a/MATLAB/Compile.m +++ b/MATLAB/Compile.m @@ -98,6 +98,7 @@ mex('-largeArrayDims', './Utilities/cuda_interface/tvDenoise.cpp', '../Common/CUDA/tvdenoising.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/cuda_interface/AddNoise.cpp', '../Common/CUDA/RandomNumberGenerator.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/cuda_interface/ApplyFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) + mex('-largeArrayDims', './Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/IO/VarianCBCT/mexReadXim.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/GPU/getGpuName_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('-largeArrayDims', './Utilities/GPU/getGpuCount_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) @@ -111,6 +112,7 @@ mex( './Utilities/cuda_interface/tvDenoise.cpp', '../Common/CUDA/tvdenoising.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex( './Utilities/cuda_interface/AddNoise.cpp', '../Common/CUDA/RandomNumberGenerator.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex( './Utilities/cuda_interface/ApplyFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) + mex( './Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex( './Utilities/IO/VarianCBCT/mexReadXim.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex( './Utilities/GPU/getGpuName_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) mex('./Utilities/GPU/getGpuCount_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS) diff --git a/MATLAB/Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp b/MATLAB/Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp new file mode 100644 index 00000000..3fb78493 --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp @@ -0,0 +1,136 @@ +/*------------------------------------------------------------------------- + * + * MATLAB MEX functions for convolution. Check inputs and parses + * MATLAB data to C++ data. + * + * + * CODE by Tomoyuki SADAKANE + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +/** + * MEX gateway + * ApplyPaddingAndFbpFiltration(mat2dProj, idx_min, idx_max, mat1dFlt, scale, gpuids); + */ + +void mexFunction(int nlhs, mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + const int kiArgcMax = 6; + GpuIds gpuids; + if (nrhs==kiArgcMax) { + size_t iM = mxGetM(prhs[kiArgcMax-1]); + if (iM != 1) { + mexErrMsgIdAndTxt( "TIGRE:MEX:ApplyFbpFiltration:unknown","gpuids parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[kiArgcMax-1]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "TIGRE:MEX:ApplyFbpFiltration:unknown","gpuids parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[kiArgcMax-1]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs < kiArgcMax-1) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "At least two input argumet required."); + } else if (nrhs==kiArgcMax-1 || nrhs==kiArgcMax){ + size_t mrows = mxGetM(prhs[0]); + size_t ncols = mxGetN(prhs[0]); + if (mrows ==1 && ncols ==1) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "1st parameter should not be 1x1"); + } + mrows = mxGetM(prhs[3]); + ncols = mxGetN(prhs[3]); + if (mrows==1 || ncols !=1) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "2nd parameter should be Mx1 (M>1)"); + } + } else if (nrhs<5) { + mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "Too many imput argumets"); + } + /////////////// First input argumet. Projection + mxArray const * const proj = prhs[0]; + float* pfProj = static_cast(mxGetData(proj)); + const mwSize numDimsProj = mxGetNumberOfDimensions(proj); // get dim of proj + const mwSize *size_proj= mxGetDimensions(proj); //get size of proj + // printf("numDimsProj = %d\n", numDimsProj); + // for (int iI = 0; iI < numDimsProj; ++iI) { + // printf("size_proj[%d] = %d\n", iI, size_proj[iI]); + // } + // 2nd and 3rd inputs: From idx_begin-th to idx_end-th projections are processed. + // Note: idx_end is is not included in the range. + size_t idx_begin = (size_t)mxGetScalar(prhs[1]); + size_t idx_end = (size_t)mxGetScalar(prhs[2]); + /////////////// 4th input argumet. Filter + const mxArray* filter = prhs[3]; + float* pfFilter = static_cast(mxGetData(filter)); + const mwSize numDimsFilter = mxGetNumberOfDimensions(filter); // get dim of filter + const mwSize *size_filter= mxGetDimensions(filter); //get size of filter + // 5th input: Scaling + float fScale = (float)mxGetScalar(prhs[4]); + + size_t uiOffset = (idx_begin-1)*size_proj[0]*size_proj[1]; + ////////////// + //prepare outputs + // Allocate output projection + mwSize size_ret[3]; + size_ret[0] = size_proj[0]; + size_ret[1] = size_proj[1]; + size_ret[2] = idx_end-idx_begin; + plhs[0] = mxCreateNumericArray(3, size_ret, mxSINGLE_CLASS, mxREAL); + float *pfProjOut =(float*) mxGetPr(plhs[0]); + // call CUDA filtering + apply_filtration2(pfProj, uiOffset, size_ret[0], size_ret[1]*size_ret[2], pfFilter, size_filter[0], fScale, pfProjOut, gpuids); +} diff --git a/MATLAB/Utilities/filtering.m b/MATLAB/Utilities/filtering.m index 54cb2726..5a1eb31c 100644 --- a/MATLAB/Utilities/filtering.m +++ b/MATLAB/Utilities/filtering.m @@ -1,4 +1,4 @@ -function [ proj ] = filtering(proj,geo,angles,parker,usegpu) +function [ proj ] = filtering(proj,geo,angles,parker,varargin) %FILTERING Summary of this function goes here % Detailed explanation goes here %-------------------------------------------------------------------------- @@ -19,6 +19,7 @@ % Codes: https://github.com/CERN/TIGRE/ % Coded by: Kyungsang Kim, modified by Ander Biguri %-------------------------------------------------------------------------- +[usegpufft,gpuids]=parse_inputs(varargin); if parker proj = permute(ParkerWeight(permute(proj,[2 1 3]),geo,angles,parker),[2 1 3]); @@ -30,32 +31,53 @@ d = 1; % cut off (0~1) [filt] = Filter(geo.filter, ramp_kernel, filt_len, d); -if usegpu +if usegpufft>0 filt = filt'; - gpuids = GpuIds(); else filt = repmat(filt',[1 geo.nDetector(2)]); end -for ii=1:size(angles,2) - - fproj = (zeros(filt_len,geo.nDetector(2),'single')); + +if usegpufft==2 + bundle_size = 32; %len(gpuids) + n_bundles = floor((size(angles,2)+bundle_size-1)/bundle_size); + n_angles = size(angles, 2); + for ii=1:n_bundles + + if ii ~= n_bundles + bundle_size_actual = bundle_size; + else + bundle_size_actual = n_angles-(ii-1)*bundle_size; + end + idx_begin = (ii-1)*bundle_size+1; + idx_end = (ii-1)*bundle_size+bundle_size_actual+1; + proj_flt = ApplyPaddingAndFbpFiltration(proj, idx_begin, idx_end, filt, 1, gpuids.devices); + bundle_range = idx_begin:(idx_end-1); + if parker + proj_flt = proj_flt/2/geo.dDetector(1)*(2*pi/ (pi/angle_step) )/2*(geo.DSD(bundle_range)/geo.DSO(bundle_range)); + else + proj_flt = proj_flt/2/geo.dDetector(1)*(2*pi/ size(angles,2) )/2*(geo.DSD(bundle_range)/geo.DSO(bundle_range)); + end + proj(:,:,bundle_range) = proj_flt; + end +else - fproj(round(filt_len/2-geo.nDetector(1)/2+1):round(filt_len/2+geo.nDetector(1)/2),:) = proj(:,:,ii); + for ii=1:size(angles,2) + + fproj = (zeros(filt_len,geo.nDetector(2),'single')); + + fproj(round(filt_len/2-geo.nDetector(1)/2+1):round(filt_len/2+geo.nDetector(1)/2),:) = proj(:,:,ii); - if usegpu - fproj = ApplyFbpFiltration(fproj, filt, gpuids); - else fproj = fft(fproj); fproj = fproj.*filt; fproj = (real(ifft(fproj))); + + if parker + proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/ (pi/angle_step) )/2*(geo.DSD(ii)/geo.DSO(ii)); + else + proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/ size(angles,2) )/2*(geo.DSD(ii)/geo.DSO(ii)); + end + end - - if parker - proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/ (pi/angle_step) )/2*(geo.DSD(ii)/geo.DSO(ii)); - else - proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/ size(angles,2) )/2*(geo.DSD(ii)/geo.DSO(ii)); - end - end proj=permute(proj,[2 1 3]); @@ -98,3 +120,59 @@ return end + +function [usegpufft, gpuids]=parse_inputs(argin) + +opts = {'usegpufft','gpuids'}; +defaults=ones(length(opts),1); + +% Check inputs +nVarargs = length(argin); +if mod(nVarargs,2) + error('TIGRE:filtering:InvalidInput','Invalid number of inputs') +end + +% check if option has been passed as input +for ii=1:2:nVarargs + ind=find(ismember(opts,lower(argin{ii}))); + if ~isempty(ind) + defaults(ind)=0; + end +end + +for ii=1:length(opts) + opt=opts{ii}; + default=defaults(ii); + % if one option is not default, then extract value from input + if default==0 + ind=double.empty(0,1);jj=1; + while isempty(ind) + ind=find(isequal(opt,lower(argin{jj}))); + jj=jj+1; + end + if isempty(ind) + error('TIGRE:filtering:InvalidInput',['Optional parameter "' argin{jj} '" does not exist' ]); + end + val=argin{jj}; + end + + switch opt + case 'usegpufft' + if default + usegpufft=2; + else + usegpufft=val; + end + case 'gpuids' + if default + gpuids = GpuIds(); + else + gpuids = val; + end + + otherwise + error('TIGRE:filtering:InvalidInput',['Invalid input name:', num2str(opt),'\n No such option in FDK()']); + end +end + +end From 18c768088695574362af5e686a114680bd46166d Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Mon, 16 Jan 2023 15:35:50 +0900 Subject: [PATCH 07/10] Add another imlementation for CUDA FBP filtration * Modified python files --- Python/tigre/algorithms/single_pass_algorithms.py | 2 +- Python/tigre/utilities/filtering.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Python/tigre/algorithms/single_pass_algorithms.py b/Python/tigre/algorithms/single_pass_algorithms.py index b15030fe..6110f735 100644 --- a/Python/tigre/algorithms/single_pass_algorithms.py +++ b/Python/tigre/algorithms/single_pass_algorithms.py @@ -74,7 +74,7 @@ def FDK(proj, geo, angles, **kwargs): gpuids = kwargs["gpuids"] if "gpuids" in kwargs else None - use_gpu_for_filtration = kwargs["use_gpu_for_filtration"] if "use_gpu_for_filtration" in kwargs else True + use_gpu_for_filtration = kwargs["use_gpu_for_filtration"] if "use_gpu_for_filtration" in kwargs else 2 geo = copy.deepcopy(geo) geo.check_geo(angles) diff --git a/Python/tigre/utilities/filtering.py b/Python/tigre/utilities/filtering.py index 38256f30..08a2ffc1 100644 --- a/Python/tigre/utilities/filtering.py +++ b/Python/tigre/utilities/filtering.py @@ -5,6 +5,8 @@ import numpy as np from scipy.fft import fft, ifft +import matplotlib.pyplot as plt + import warnings import numpy as np @@ -25,14 +27,20 @@ def filtering(proj, geo, angles, parker, verbose=False, use_gpu=True, gpuids=Non padding = int((filt_len-geo.nDetector[1])//2 ) scale_factor = (geo.DSD[0]/geo.DSO[0]) * (2 * np.pi/ len(angles)) / ( 4 * geo.dDetector[0] ) - if use_gpu: + if use_gpu==2: + bundle_size = 32#len(gpuids) + n_bundles = (angles.shape[0]+bundle_size-1)//bundle_size + for idx_bundle in range(0,n_bundles): + bundle_size_actual = bundle_size if idx_bundle != n_bundles-1 else angles.shape[0]-idx_bundle*bundle_size + idx_begin = idx_bundle*bundle_size + idx_end = idx_bundle*bundle_size+bundle_size_actual + proj[idx_begin:idx_end] = fbpfiltration.apply_padding_and_filtering(proj, idx_begin, idx_end, filt, scale_factor, gpuids) + elif use_gpu==1: fproj=np.empty((geo.nDetector[0],filt_len),dtype=np.float32) for idx_proj in range(0,angles.shape[0]): fproj.fill(0) fproj[:,padding:padding+geo.nDetector[1]]=proj[idx_proj] - fproj = fbpfiltration.apply(fproj, filt, gpuids) - proj[idx_proj]=fproj[:,padding:padding+geo.nDetector[1]] * scale_factor else: filt=np.kron(np.ones((np.int64(geo.nDetector[0]),1),dtype=np.float32),filt) From 46157e44094044004521758588dcec44fcfe1e45 Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Sat, 21 Jan 2023 18:11:45 +0900 Subject: [PATCH 08/10] Make FPD filtering support multiple GPUs --- Common/CUDA/FbpFiltration.cu | 121 +++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 47 deletions(-) diff --git a/Common/CUDA/FbpFiltration.cu b/Common/CUDA/FbpFiltration.cu index dfd513b1..0a5db4bf 100644 --- a/Common/CUDA/FbpFiltration.cu +++ b/Common/CUDA/FbpFiltration.cu @@ -155,12 +155,12 @@ void apply_filtration (const float* pfIn, size_t uiULen, size_t uiVLen, const fl //! Apply filter in the Fourier space -void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatch, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) { +void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatchTotal, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) { // Prepare for MultiGPU int deviceCount = gpuids.GetLength(); cudaCheckErrors("Device query fail"); if (deviceCount == 0) { - mexErrMsgIdAndTxt("apply_filtration","There are no available device(s) that support CUDA\n"); + mexErrMsgIdAndTxt("apply_filtration2","There are no available device(s) that support CUDA\n"); } // // CODE assumes @@ -168,62 +168,89 @@ void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, si // 2.-All available devices are equal, they are the same machine (warning thrown) // Check the available devices, and if they are the same if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("apply_filtration","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + mexWarnMsgIdAndTxt("apply_filtration2","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); } - // USING THE FIRST GPU ONLY const float* pfIn = pfInAll+uiOffset; - cudaSetDevice(gpuids[0]); - cudaCheckErrors("apply_filtration fail cudaSetDevice"); - size_t uiPaddingLen = (uiFLen-uiULen) / 2; - float* d_pfProjWide = nullptr; - cudaMalloc((void**)&d_pfProjWide, uiFLen*uiBatch*sizeof(float)); - cudaCheckErrors("apply_filtration fail cudaMalloc wide"); - cudaMemset(d_pfProjWide, 0, uiFLen*uiBatch*sizeof(float)); - cudaCheckErrors("apply_filtration fail cudaMemset"); - cudaMemcpy2D(&d_pfProjWide[uiPaddingLen], uiFLen*sizeof(float), pfIn, uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice); - cudaCheckErrors("apply_filtration fail cudaMemcpy2D"); - const float fFLInv = 1./uiFLen; + const size_t uiBufferSize = (uiFLen+1)/2+1; // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/ + const size_t uiPaddingLen = (uiFLen-uiULen) / 2; + const size_t uiBatch0 = uiBatchTotal / deviceCount; - size_t uiBufferSize = (uiFLen+1)/2+1; // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/ + float** pd_pfProjWide = (float**)malloc(deviceCount*sizeof(float*)); + float** pd_pfFilter = (float**)malloc(deviceCount*sizeof(float*)); + cufftComplex** pd_pcfWork = (cufftComplex**)malloc(deviceCount*sizeof(cufftComplex*)); - cufftHandle cudafftPlanFwd; - cufftHandle cudafftPlanInv; - cufftResult_t fftresult; - fftresult = cufftPlan1d(&cudafftPlanFwd, uiFLen, CUFFT_R2C, uiBatch); - cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 1"); - fftresult = cufftPlan1d(&cudafftPlanInv, uiFLen, CUFFT_C2R, uiBatch); - cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 2"); + size_t* puiBatch = (size_t*)malloc(deviceCount*sizeof(size_t)); + cufftHandle* pcudafftPlanFwd = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle)); + cufftHandle* pcudafftPlanInv = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle)); - float* d_pfFilter = nullptr; - cudaMalloc((void **)&d_pfFilter, uiFLen * sizeof(float)); - cudaCheckErrors("apply_filtration fail cudaMalloc 2"); - cudaMemcpy(d_pfFilter, pfFilter, uiFLen * sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("apply_filtration fail cudaMemcpy 2"); + cufftResult_t fftresult; + + for (int dev = 0; dev < deviceCount; ++dev) { + pd_pfProjWide[dev] = nullptr; + pd_pfFilter[dev] = nullptr; + pd_pcfWork[dev] = nullptr; + puiBatch[dev] = dev!=deviceCount-1? uiBatch0 : uiBatchTotal - uiBatch0*dev; + size_t uiBatch = puiBatch[dev]; - cufftComplex* d_pcfWork = nullptr; - cudaMalloc((void **)&d_pcfWork, uiBufferSize * uiBatch*sizeof(cufftComplex)); - cudaCheckErrors("apply_filtration fail cudaMalloc 3"); + // Allocate extended projection + cudaSetDevice(gpuids[dev]); + cudaCheckErrors("apply_filtration2 fail cudaSetDevice"); + cudaMalloc((void**)&pd_pfProjWide[dev], uiFLen*uiBatch*sizeof(float)); + cudaCheckErrors("apply_filtration2 fail cudaMalloc wide"); + // Fill 0 + cudaMemset(pd_pfProjWide[dev], 0, uiFLen*uiBatch*sizeof(float)); + cudaCheckErrors("apply_filtration2 fail cudaMemset"); + // Insert projection + cudaMemcpy2D(&pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), &pfIn[dev*uiBatch0*uiULen], uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice); + cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 1"); - { + // Filter + cudaMalloc((void **)&pd_pfFilter[dev], uiFLen * sizeof(float)); + cudaCheckErrors("apply_filtration2 fail cudaMalloc filter"); + cudaMemcpy(pd_pfFilter[dev], pfFilter, uiFLen * sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("apply_filtration2 fail cudaMemcpy 2"); + + // Work space for FFT. + cudaMalloc((void **)&pd_pcfWork[dev], uiBufferSize * uiBatch*sizeof(cufftComplex)); + cudaCheckErrors("apply_filtration2 fail cudaMalloc work"); + } + for (int dev = 0; dev < deviceCount; ++dev) { + cudaSetDevice(gpuids[dev]); + const size_t uiBatch = puiBatch[dev]; + fftresult = cufftPlan1d(&pcudafftPlanFwd[dev], uiFLen, CUFFT_R2C, uiBatch); + cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 1"); + fftresult = cufftPlan1d(&pcudafftPlanInv[dev], uiFLen, CUFFT_C2R, uiBatch); + cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 2"); const int divU = 128;//PIXEL_SIZE_BLOCK; const int divV = 1;//PIXEL_SIZE_BLOCK; dim3 grid((uiFLen+divU-1)/divU,(uiBatch+divV-1)/divV,1); dim3 block(divU,divV,1); - cufftSetStream(cudafftPlanFwd, 0); - cufftSetStream(cudafftPlanInv, 0); - fftresult = cufftExecR2C (cudafftPlanFwd, d_pfProjWide, d_pcfWork); - cudafftCheckError(fftresult, "apply_filtration fail cufftExecR2C"); - ApplyFilter<<>>(d_pcfWork, uiBufferSize, uiBatch, d_pfFilter, fFLInv*fScale);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiFLen * - fftresult = cufftExecC2R (cudafftPlanInv, d_pcfWork, d_pfProjWide); - cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2R"); + fftresult = cufftExecR2C (pcudafftPlanFwd[dev], pd_pfProjWide[dev], pd_pcfWork[dev]); + cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecR2C"); + ApplyFilter<<>>(pd_pcfWork[dev], uiBufferSize, uiBatch, pd_pfFilter[dev], fFLInv*fScale);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiFLen * + fftresult = cufftExecC2R (pcudafftPlanInv[dev], pd_pcfWork[dev], pd_pfProjWide[dev]); + cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecC2R"); } - cudaMemcpy2D(pfOut, uiULen*sizeof(float), &d_pfProjWide[uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost); - cudaCheckErrors("apply_filtration fail cudaMemcpy 3"); + for (int dev = 0; dev < deviceCount; ++dev) { + cudaSetDevice(gpuids[dev]); + const size_t uiBatch = puiBatch[dev]; + cudaMemcpy2D(&pfOut[dev*uiBatch0*uiULen], uiULen*sizeof(float), &pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost); + cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 2"); - cudaFree(d_pcfWork); d_pcfWork = nullptr; - cudaFree(d_pfProjWide); d_pfProjWide = nullptr; - cudaFree(d_pfFilter); d_pfFilter = nullptr; - cufftDestroy(cudafftPlanFwd); - cufftDestroy(cudafftPlanInv); -} \ No newline at end of file + cudaFree(pd_pcfWork[dev]); pd_pcfWork[dev] = nullptr; + cudaFree(pd_pfProjWide[dev]); pd_pfProjWide[dev] = nullptr; + cudaFree(pd_pfFilter[dev]); pd_pfFilter[dev] = nullptr; + cufftDestroy(pcudafftPlanFwd[dev]); + cufftDestroy(pcudafftPlanInv[dev]); + cudaCheckErrors("apply_filtration2 fail free"); + } + cudaDeviceSynchronize(); + + free(pd_pfProjWide); pd_pfProjWide = nullptr; + free(pd_pfFilter); pd_pfFilter = nullptr; + free(puiBatch); puiBatch = nullptr; + free(pd_pcfWork); pd_pcfWork = nullptr; + free(pcudafftPlanFwd); pcudafftPlanFwd = nullptr; + free(pcudafftPlanInv); pcudafftPlanInv = nullptr; +} From aed4379921663406b2efd88f08964111ba2e4074 Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Wed, 1 Feb 2023 23:28:59 +0900 Subject: [PATCH 09/10] Refactoring Matlab FDK and FBP * Moved transposing projections into filtering --- MATLAB/Algorithms/FBP.m | 1 - MATLAB/Algorithms/FDK.m | 6 ++---- MATLAB/Utilities/filtering.m | 4 +++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/MATLAB/Algorithms/FBP.m b/MATLAB/Algorithms/FBP.m index 428d0421..8775c440 100644 --- a/MATLAB/Algorithms/FBP.m +++ b/MATLAB/Algorithms/FBP.m @@ -43,7 +43,6 @@ %% Weight %proj=data -proj=permute(proj,[2 1 3]); %% filter proj_filt = filtering(proj,geo,angles,parker); % Not sure if offsets are good in here diff --git a/MATLAB/Algorithms/FDK.m b/MATLAB/Algorithms/FDK.m index 0aa8b8d5..900af71a 100644 --- a/MATLAB/Algorithms/FDK.m +++ b/MATLAB/Algorithms/FDK.m @@ -59,9 +59,7 @@ end %% Weight -proj=permute(proj,[2 1 3]); for ii=1:size(angles,2) - us = ((-geo.nDetector(1)/2+0.5):1:(geo.nDetector(1)/2-0.5))*geo.dDetector(1) + offset(1,ii); vs = ((-geo.nDetector(2)/2+0.5):1:(geo.nDetector(2)/2-0.5))*geo.dDetector(2) + offset(2,ii); [uu,vv] = meshgrid(us,vs); % detector @@ -70,10 +68,10 @@ w = (geo.DSD(ii))./sqrt((geo.DSD(ii))^2+uu.^2 + vv.^2); % Multiply the weights with projection data - proj(:,:,ii) = proj(:,:,ii).*w'; + proj(:,:,ii) = w.*proj(:,:,ii); end %% Fourier transform based filtering -proj = filtering(proj,geo,angles,parker, 'usegpufft', usegpufft, 'gpuids', gpuids); % Not sure if offsets are good in here +proj = filtering(proj,geo,angles,parker); % Not sure if offsets are good in here % RMFIELD Remove fields from a structure array. geo=rmfield(geo,'filter'); diff --git a/MATLAB/Utilities/filtering.m b/MATLAB/Utilities/filtering.m index 5a1eb31c..0a550cb4 100644 --- a/MATLAB/Utilities/filtering.m +++ b/MATLAB/Utilities/filtering.m @@ -22,7 +22,7 @@ [usegpufft,gpuids]=parse_inputs(varargin); if parker - proj = permute(ParkerWeight(permute(proj,[2 1 3]),geo,angles,parker),[2 1 3]); + proj = ParkerWeight(proj,geo,angles,parker); diff_angles = diff(angles(1,:)); angle_step = mean(abs(diff_angles)); % to be used later end @@ -37,6 +37,8 @@ filt = repmat(filt',[1 geo.nDetector(2)]); end +proj=permute(proj,[2 1 3]); + if usegpufft==2 bundle_size = 32; %len(gpuids) n_bundles = floor((size(angles,2)+bundle_size-1)/bundle_size); From 7c97471bd0b1184e75f06340045a884955353fdc Mon Sep 17 00:00:00 2001 From: Tomoyuki SADAKANE <40597344+tsadakane@users.noreply.github.com> Date: Thu, 2 Feb 2023 23:02:45 +0900 Subject: [PATCH 10/10] Fix FDK option usegpufft option not working --- MATLAB/Algorithms/FDK.m | 2 +- MATLAB/Utilities/filtering.m | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/MATLAB/Algorithms/FDK.m b/MATLAB/Algorithms/FDK.m index 900af71a..00500523 100644 --- a/MATLAB/Algorithms/FDK.m +++ b/MATLAB/Algorithms/FDK.m @@ -71,7 +71,7 @@ proj(:,:,ii) = w.*proj(:,:,ii); end %% Fourier transform based filtering -proj = filtering(proj,geo,angles,parker); % Not sure if offsets are good in here +proj = filtering(proj,geo,angles,parker, 'usegpufft', usegpufft); % Not sure if offsets are good in here % RMFIELD Remove fields from a structure array. geo=rmfield(geo,'filter'); diff --git a/MATLAB/Utilities/filtering.m b/MATLAB/Utilities/filtering.m index 0a550cb4..7a8cc874 100644 --- a/MATLAB/Utilities/filtering.m +++ b/MATLAB/Utilities/filtering.m @@ -68,11 +68,13 @@ fproj = (zeros(filt_len,geo.nDetector(2),'single')); fproj(round(filt_len/2-geo.nDetector(1)/2+1):round(filt_len/2+geo.nDetector(1)/2),:) = proj(:,:,ii); - - fproj = fft(fproj); - fproj = fproj.*filt; - fproj = (real(ifft(fproj))); - + if usegpufft==1 + fproj = ApplyFbpFiltration(fproj, filt, gpuids); + else + fproj = fft(fproj); + fproj = fproj.*filt; + fproj = (real(ifft(fproj))); + end if parker proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/ (pi/angle_step) )/2*(geo.DSD(ii)/geo.DSO(ii)); else