7#ifndef HEFFTE_BACKEND_ROCM_H
8#define HEFFTE_BACKEND_ROCM_H
10#include "heffte_r2r_executor.h"
12#ifdef Heffte_ENABLE_ROCM
14#ifndef __HIP_PLATFORM_HCC__
15#define __HIP_PLATFORM_HCC__
17#include <hip/hip_runtime.h>
18#include <rocfft/rocfft.h>
19#include "heffte_backend_vector.h"
21#ifdef Heffte_ENABLE_MAGMA
22#include "heffte_magma_helpers.h"
61 throw std::runtime_error(std::string(
function_name) +
" failed with error code: " + std::to_string(
status));
70 template<
typename precision_type,
typename index>
78 template<
typename precision_type,
typename index>
85 template<
typename scalar_type,
typename index>
94 template<
typename scalar_type,
typename index>
103 template<
typename scalar_type,
typename index>
112 template<
typename scalar_type,
typename index>
114 index buff_line_stride, index buff_plane_stride,
int map0,
int map1,
int map2,
123 template<
typename precision>
126 template<
typename precision>
129 template<
typename precision>
132 template<
typename precision>
145 template<
typename precision>
148 template<
typename precision>
151 template<
typename precision>
154 template<
typename precision>
167 template<
typename precision>
170 template<
typename precision>
173 template<
typename precision>
176 template<
typename precision>
180 return 4 * (length-1);
263 template<
typename scalar_type>
270 template<
typename scalar_type>
272 if (pntr ==
nullptr)
return;
276 template<
typename scalar_type>
278 if (stream ==
nullptr)
284 template<
typename scalar_type>
289 template<
typename scalar_type>
294 template<
typename scalar_type>
297 "device_to_host (rocm)");
300 template<
typename scalar_type>
303 "device_to_device (rocm)");
306 template<
typename scalar_type>
309 "host_to_device (rocm)");
322 template<
typename T>
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
333 template<
typename T>
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
344 template<
typename T>
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
355 template<
typename T>
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
366template<
typename precision_type, direction dir>
423template<
typename precision_type, direction dir>
443 1, &stride, dist, 1, &stride, dist
469 size_t size[2] = {
size1, size2};
480 2, embed.data(), dist, 2, embed.data(), dist
498 std::array<size_t, 3> size = {
size1, size2,
size3};
507 nullptr,
nullptr, 3,
nullptr, 1, 3,
nullptr, 1
516 3, size.data(), 1,
desc),
557 template<
typename index>
565 block_stride(
box.osize(0) *
box.osize(1)),
566 total_size(
box.count()),
573 template<
typename index>
577 blocks(1), block_stride(0), total_size(
box.count()),
586 embed = {
static_cast<size_t>(stride),
static_cast<size_t>(size)};
587 howmanyffts =
box.size[2];
589 stride =
box.size[0];
591 embed = {
static_cast<size_t>(stride),
static_cast<size_t>(size) *
static_cast<size_t>(stride)};
592 howmanyffts =
box.size[0];
596 embed = {
static_cast<size_t>(stride),
static_cast<size_t>(
box.size[1]) *
static_cast<size_t>(
box.size[0])};
597 howmanyffts =
box.size[1];
602 template<
typename index>
605 size(
box.size[0]), size2(
box.size[1]), howmanyffts(
box.size[2]),
607 blocks(1), block_stride(0),
608 total_size(
box.count()),
614 template<
typename precision_type, direction dir>
615 void execute(std::complex<precision_type>
data[], std::complex<precision_type> *workspace)
const{
616 if (std::is_same<precision_type, float>::value){
618 make_plan(ccomplex_forward);
620 make_plan(ccomplex_backward);
623 make_plan(zcomplex_forward);
625 make_plan(zcomplex_backward);
631 size_t wsize = (std::is_same<precision_type, float>::value) ?
638 for(
int i=0;
i<blocks;
i++){
641 (std::is_same<precision_type, float>::value) ?
650 void forward(std::complex<float>
data[], std::complex<float> *workspace)
const override{
654 void forward(std::complex<double>
data[], std::complex<double> *workspace)
const override{
658 void backward(std::complex<float>
data[], std::complex<float> *workspace)
const override{
662 void backward(std::complex<double>
data[], std::complex<double> *workspace)
const override{
667 void forward(
float const indata[], std::complex<float>
outdata[], std::complex<float> *workspace)
const override{
672 void forward(
double const indata[], std::complex<double>
outdata[], std::complex<double> *workspace)
const override{
688 int box_size()
const override{
return total_size; }
693 make_plan(ccomplex_forward);
694 make_plan(ccomplex_backward);
695 make_plan(zcomplex_forward);
696 make_plan(zcomplex_backward);
698 std::max( std::max(ccomplex_forward->size_work(), ccomplex_backward->size_work()) /
sizeof(std::complex<float>),
699 std::max(zcomplex_forward->size_work(), zcomplex_backward->size_work()) /
sizeof(std::complex<double>) ) + 1;
705 template<
typename scalar_type, direction dir>
717 mutable hipStream_t stream;
719 int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;
720 std::array<size_t, 2> embed;
721 mutable std::unique_ptr<plan_rocfft<std::complex<float>,
direction::forward>> ccomplex_forward;
722 mutable std::unique_ptr<plan_rocfft<std::complex<float>,
direction::backward>> ccomplex_backward;
723 mutable std::unique_ptr<plan_rocfft<std::complex<double>,
direction::forward>> zcomplex_forward;
724 mutable std::unique_ptr<plan_rocfft<std::complex<double>,
direction::backward>> zcomplex_backward;
748 template<
typename index>
757 rblock_stride(
box.osize(0) *
box.osize(1)),
758 cblock_stride(
box.osize(0) * (
box.osize(1)/2 + 1)),
767 template<
typename precision_type>
769 if (std::is_same<precision_type, float>::value){
779 size_t wsize = (std::is_same<precision_type, float>::value) ? sforward->size_work() : dforward->size_work();
784 reinterpret_cast<unsigned char *
>(workspace) +
wsize);
787 for(
int i=0;
i<blocks;
i++){
789 void *
cdata =
reinterpret_cast<void*
>(
outdata +
i * cblock_stride);
791 (std::is_same<precision_type, float>::value) ? *sforward : *dforward,
797 template<
typename precision_type>
799 if (std::is_same<precision_type, float>::value){
800 make_plan(sbackward);
802 make_plan(dbackward);
809 size_t wsize = (std::is_same<precision_type, float>::value) ? sbackward->size_work() : dbackward->size_work();
813 std::complex<precision_type> *
copy_indata =
reinterpret_cast<std::complex<precision_type>*
>(
814 reinterpret_cast<unsigned char *
>(workspace) +
wsize);
817 for(
int i=0;
i<blocks;
i++){
819 void *
rdata =
reinterpret_cast<void*
>(
outdata +
i * rblock_stride);
821 (std::is_same<precision_type, float>::value) ? *sbackward : *dbackward,
827 void forward(
float const indata[], std::complex<float>
outdata[], std::complex<float> *workspace)
const override{
835 void forward(
double const indata[], std::complex<double>
outdata[], std::complex<double> *workspace)
const override{
853 make_plan(sbackward);
854 make_plan(dbackward);
857 std::max( std::max(sforward->size_work() +
box_size() *
sizeof(
float), sbackward->size_work() +
complex_size() *
sizeof(std::complex<float>)) /
sizeof(std::complex<float>),
858 std::max(dforward->size_work() +
box_size() *
sizeof(
double), dbackward->size_work() +
complex_size() *
sizeof(std::complex<double>)) /
sizeof(std::complex<double>) ) + 1;
863 template<
typename scalar_type, direction dir>
865 if (!plan) plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(
new plan_rocfft<scalar_type, dir>(size, howmanyffts, stride, rdist, cdist));
868 mutable hipStream_t stream;
870 int size, howmanyffts, stride, blocks;
871 int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;
872 mutable std::unique_ptr<plan_rocfft<float, direction::forward>> sforward;
873 mutable std::unique_ptr<plan_rocfft<double, direction::forward>> dforward;
874 mutable std::unique_ptr<plan_rocfft<float, direction::backward>> sbackward;
875 mutable std::unique_ptr<plan_rocfft<double, direction::backward>> dbackward;
936 template<
typename scalar_type,
typename index>
941 template<
typename scalar_type,
typename index>
951template<>
struct transpose_packer<tag::gpu>{
953 template<
typename scalar_type,
typename index>
958 template<
typename scalar_type,
typename index>
960 rocm::transpose_unpack<scalar_type>(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride,
961 plan.buff_line_stride, plan.buff_plane_stride, plan.map[0], plan.map[1], plan.map[2],
buffer,
data);
965namespace data_scaling {
970 template<
typename scalar_type,
typename index>
971 static void apply(hipStream_t stream, index num_entries, scalar_type *data,
double scale_factor){
972 rocm::scale_data<scalar_type, long long>(stream,
static_cast<long long>(num_entries), data, scale_factor);
978 template<
typename precision_type,
typename index>
979 static void apply(hipStream_t stream, index num_entries, std::complex<precision_type> *data,
double scale_factor){
980 apply<precision_type>(stream, 2*num_entries,
reinterpret_cast<precision_type*
>(data), scale_factor);
990 static const bool use_reorder =
true;
998 static const bool use_reorder =
true;
1006 static const bool use_reorder =
true;
1014 static const bool use_reorder =
true;
Base class for all backend executors.
Definition heffte_common.h:561
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition heffte_common.h:594
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition heffte_common.h:570
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition heffte_common.h:566
Wrapper to rocFFT API for real-to-complex transform with shortening of the data.
Definition heffte_backend_rocm.h:737
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition heffte_backend_rocm.h:846
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Backward transform, single precision.
Definition heffte_backend_rocm.h:831
rocfft_executor_r2c(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition heffte_backend_rocm.h:749
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Forward transform, single precision.
Definition heffte_backend_rocm.h:827
int box_size() const override
Returns the size of the box with real data.
Definition heffte_backend_rocm.h:844
void forward(precision_type const indata[], std::complex< precision_type > outdata[], std::complex< precision_type > *workspace) const
Forward transform, single precision.
Definition heffte_backend_rocm.h:768
void backward(std::complex< precision_type > indata[], precision_type outdata[], std::complex< precision_type > *workspace) const
Backward transform, single precision.
Definition heffte_backend_rocm.h:798
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition heffte_backend_rocm.h:850
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Backward transform, double precision.
Definition heffte_backend_rocm.h:839
size_t workspace_size() const override
Return the size of the needed workspace.
Definition heffte_backend_rocm.h:848
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Forward transform, double precision.
Definition heffte_backend_rocm.h:835
Wrapper around the rocFFT API.
Definition heffte_backend_rocm.h:548
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the real data to complex and performs double-complex forward transform.
Definition heffte_backend_rocm.h:672
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Performs backward float-complex transform and truncates the complex part of the result.
Definition heffte_backend_rocm.h:677
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the real data to complex and performs float-complex forward transform.
Definition heffte_backend_rocm.h:667
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition heffte_backend_rocm.h:682
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition heffte_backend_rocm.h:558
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition heffte_backend_rocm.h:692
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition heffte_backend_rocm.h:574
int box_size() const override
Returns the size of the box.
Definition heffte_backend_rocm.h:688
void forward(std::complex< double > data[], std::complex< double > *workspace) const override
Forward fft, double-complex case.
Definition heffte_backend_rocm.h:654
void backward(std::complex< float > data[], std::complex< float > *workspace) const override
Backward fft, float-complex case.
Definition heffte_backend_rocm.h:658
size_t workspace_size() const override
Return the size of the needed workspace.
Definition heffte_backend_rocm.h:690
void execute(std::complex< precision_type > data[], std::complex< precision_type > *workspace) const
Perform an in-place FFT on the data in the given direction.
Definition heffte_backend_rocm.h:615
rocfft_executor(hipStream_t active_stream, box3d< index > const box)
Merges three FFTs into one.
Definition heffte_backend_rocm.h:603
void backward(std::complex< double > data[], std::complex< double > *workspace) const override
Backward fft, double-complex case.
Definition heffte_backend_rocm.h:662
void forward(std::complex< float > data[], std::complex< float > *workspace) const override
Forward fft, float-complex case.
Definition heffte_backend_rocm.h:650
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition heffte_geometry.h:159
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition heffte_geometry.h:169
@ backward
Inverse DFT transform.
@ forward
Forward DFT transform.
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition heffte_backend_cuda.h:837
void check_error(hipError_t status, const char *function_name)
Checks the status of a ROCm command and in case of a failure, converts it to a C++ exception.
Definition heffte_backend_rocm.h:51
void convert(hipStream_t stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.
void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.
void scale_data(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.
void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.
void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.
Namespace containing all HeFFTe methods and classes.
Definition heffte_backend_cuda.h:38
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition heffte_backend_rocm.h:322
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition heffte_backend_rocm.h:355
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition heffte_backend_rocm.h:333
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition heffte_backend_rocm.h:344
Defines the container for the temporary buffers.
Definition heffte_common.h:237
Type-tag for the cuFFT backend.
Definition heffte_common.h:162
static void copy_device_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the device.
Definition heffte_backend_rocm.h:301
cudaStream_t stream_type
The stream type for the device.
Definition heffte_backend_cuda.h:232
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using CUDA arrays.
Definition heffte_backend_rocm.h:277
static scalar_type * allocate(hipStream_t, size_t num_entries)
Allocate memory.
Definition heffte_backend_rocm.h:264
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition heffte_backend_rocm.h:290
static void copy_host_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the host to the device.
Definition heffte_backend_rocm.h:307
static void copy_device_to_host(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the host.
Definition heffte_backend_rocm.h:295
static void copy_n(hipStream_t stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition heffte_backend_rocm.h:285
static void free(hipStream_t, scalar_type *pntr)
Free memory.
Definition heffte_backend_rocm.h:271
Common data-transfer operations, must be specializes for each location (cpu/gpu).
Definition heffte_common.h:59
Defines inverse mapping from the location tag to a default backend tag.
Definition heffte_common.h:430
The CUDA backend uses a CUDA stream.
Definition heffte_backend_cuda.h:202
device_instance(hipStream_t new_stream=nullptr)
Constructor, sets up the stream.
Definition heffte_backend_rocm.h:231
void synchronize_device() const
Syncs the execution with the queue.
Definition heffte_backend_rocm.h:237
hipStream_t stream() const
Returns the nullptr (const case).
Definition heffte_backend_rocm.h:235
cudaStream_t stream_type
The type for the internal stream.
Definition heffte_backend_cuda.h:214
hipStream_t stream()
Returns the nullptr.
Definition heffte_backend_rocm.h:233
hipStream_t _stream
The CUDA stream to be used in all operations.
Definition heffte_backend_rocm.h:239
Holds the auxiliary variables needed by each backend.
Definition heffte_common.h:408
Allows to define whether a specific backend interface has been enabled.
Definition heffte_common.h:226
Type-tag for the Cosine Transform of type 1 using the rocFFT backend.
Definition heffte_common.h:199
Type-tag for the Cosine Transform using the rocFFT backend.
Definition heffte_common.h:189
Type-tag for the Sine Transform using the rocFFT backend.
Definition heffte_common.h:194
Type-tag for the rocFFT backend.
Definition heffte_common.h:184
Defines a set of default plan options for a given backend.
Definition heffte_common.h:761
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition heffte_backend_rocm.h:942
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition heffte_backend_rocm.h:937
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition heffte_pack3d.h:83
Indicates the structure that will be used by the fft backend.
Definition heffte_common.h:663
Wrapper around cufftHandle plans, set for float or double complex.
Definition heffte_backend_cuda.h:346
~plan_rocfft()
Destructor, deletes the plan.
Definition heffte_backend_rocm.h:524
plan_rocfft(size_t size1, size_t size2, std::array< size_t, 2 > const &embed, size_t batch, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition heffte_backend_rocm.h:468
plan_rocfft(size_t size, size_t batch, size_t stride, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition heffte_backend_rocm.h:433
plan_rocfft(size_t size1, size_t size2, size_t size3)
Constructor, takes inputs identical to cufftPlan3d()
Definition heffte_backend_rocm.h:497
size_t size_work() const
Return the worksize.
Definition heffte_backend_rocm.h:528
Plan for the r2c single precision transform.
Definition heffte_backend_rocm.h:367
~plan_rocfft()
Destructor, deletes the plan.
Definition heffte_backend_rocm.h:406
size_t size_work() const
Return the worksize.
Definition heffte_backend_rocm.h:410
plan_rocfft(size_t size, size_t batch, size_t stride, size_t rdist, size_t cdist)
Constructor and initializer of the plan.
Definition heffte_backend_rocm.h:377
Implementation of Cosine Transform of type 1 pre-post processing methods using CUDA.
Definition heffte_backend_rocm.h:165
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_rocm.h:179
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
Implementation of Cosine Transform pre-post processing methods using CUDA.
Definition heffte_backend_rocm.h:121
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_rocm.h:135
Indicates whether the rocfft_setup() method has been called.
Definition heffte_backend_rocm.h:188
static bool is_initialized
Static (global) variable indicating if rocfft_setup() has been called.
Definition heffte_backend_rocm.h:190
static void make()
Make initialize.
Definition heffte_backend_rocm.h:192
Implementation of Sine Transform pre-post processing methods using CUDA.
Definition heffte_backend_rocm.h:143
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_rocm.h:157
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition heffte_common.h:45
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition heffte_backend_rocm.h:954
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition heffte_backend_rocm.h:959