7#ifndef HEFFTE_RESHAPE3D_H
8#define HEFFTE_RESHAPE3D_H
10#include "heffte_plan_logic.h"
11#include "heffte_backends.h"
31#ifdef Heffte_ENABLE_CUDA
32namespace gpu {
using namespace cuda; }
34#ifdef Heffte_ENABLE_ROCM
35namespace gpu {
using namespace rocm; }
37#ifdef Heffte_ENABLE_ONEAPI
38namespace gpu {
using namespace oneapi; }
51template<
typename index>
59template<
typename index>
114template<
typename index>
117 for(
auto const &
s :
shapers)
if (
s) max_size = std::max(max_size,
s->size_workspace());
132template<
typename location_tag,
template<
typename device>
class packer,
typename index>
138 template<
typename b,
template<
typename d>
class p,
typename i>
friend std::unique_ptr<reshape3d_alltoall<b, p, i>>
159 template<
typename scalar_type>
163 size_t size_workspace() const override {
    // The size of the workspace must include padding: the result is
    // 2 * num_entries * packplan.size().
    // num_entries is an int, so it is widened to size_t before multiplying;
    // otherwise 2 * num_entries would be evaluated in int and could overflow
    // before the conversion to size_t takes place.
    return 2 * static_cast<size_t>(num_entries) * packplan.size();
}
172 std::vector<int>&&, std::vector<int>&&,
int);
175 int const me, nprocs;
176 bool const use_gpu_aware;
178 std::vector<pack_plan_3d<index>> packplan, unpackplan;
179 std::vector<int> send_offset, recv_offset;
180 int const num_entries;
205template<
typename location_tag,
template<
typename device>
class packer = direct_packer,
typename index>
206std::unique_ptr<reshape3d_alltoall<location_tag, packer, index>>
225template<
typename location_tag,
template<
typename device>
class packer,
typename index>
231 template<
typename b,
template<
typename d>
class p,
typename i>
friend std::unique_ptr<reshape3d_alltoallv<b, p, i>>
252 template<
typename scalar_type>
262 std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int>
const &send_proc,
263 std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int>
const &recv_proc,
267 int const me, nprocs;
268 bool const use_gpu_aware;
270 std::vector<int>
const send_offset;
271 std::vector<int>
const send_size;
272 std::vector<int>
const recv_offset;
273 std::vector<int>
const recv_size;
274 int const send_total, recv_total;
276 std::vector<pack_plan_3d<index>>
const packplan, unpackplan;
279 std::vector<int> counts, displacements, map;
280 iotripple(std::vector<int>
const &
pgroup, std::vector<int>
const &
proc, std::vector<int>
const &
sizes) :
285 for(
size_t i=0;
i<
proc.size();
i++){
297 iotripple
const send, recv;
322template<
typename location_tag,
template<
typename device>
class packer = direct_packer,
typename index>
323std::unique_ptr<reshape3d_alltoallv<location_tag, packer, index>>
340template<
typename location_tag,
template<
typename device>
class packer,
typename index>
346 template<
typename b,
template<
typename d>
class p,
typename i>
friend std::unique_ptr<reshape3d_pointtopoint<b, p, i>>
367 template<
typename scalar_type>
371 template<
typename scalar_type>
380 std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int> &&send_proc,
381 std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int> &&recv_proc,
382 std::vector<int> &&recv_loc,
386 int const me, nprocs;
387 bool const self_to_self;
389 bool const use_gpu_aware;
390 mutable std::vector<MPI_Request> requests;
391 mutable std::vector<MPI_Request> isends;
393 std::vector<int>
const send_proc;
394 std::vector<int>
const send_offset;
395 std::vector<int>
const send_size;
396 std::vector<int>
const recv_proc;
397 std::vector<int>
const recv_offset;
398 std::vector<int>
const recv_size;
399 std::vector<int>
const recv_loc;
400 int const send_total, recv_total;
402 std::vector<pack_plan_3d<index>>
const packplan, unpackplan;
429template<
typename location_tag,
template<
typename device>
class packer =
direct_packer,
typename index>
430std::unique_ptr<reshape3d_pointtopoint<location_tag, packer, index>>
443template<
typename location_tag,
typename index>
449 reshape3d_base<index>(cplan.size[0] * cplan.size[1] * cplan.size[2], cplan.size[0] * cplan.size[1] * cplan.size[2]),
472 template<
typename scalar_type>
478 destination +
j * this->input_size);
480 for(
int j=0; j<batch_size; j++)
481 transpose_packer<location_tag>().unpack(this->
stream(), plan, source + j * this->
input_size,
482 destination + j * this->input_size);
486 pack_plan_3d<index>
const plan;
504template<
typename backend_tag,
typename index>
514 return std::unique_ptr<reshape3d_base<index>>();
518 std::vector<pack_plan_3d<index>>
plans;
526 return std::unique_ptr<reshape3d_base<index>>();
Reshape algorithm based on the MPI_Alltoall() method.
Definition heffte_reshape3d.h:133
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition heffte_reshape3d.h:154
~reshape3d_alltoall()
Destructor, frees the comm generated by the constructor.
Definition heffte_reshape3d.h:136
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition heffte_reshape3d.h:150
size_t size_workspace() const override
The size of the workspace must include padding.
Definition heffte_reshape3d.h:163
friend std::unique_ptr< reshape3d_alltoall< b, p, i > > make_reshape3d_alltoall(typename backend::device_instance< b >::stream_type, std::vector< box3d< i > > const &, std::vector< box3d< i > > const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition heffte_reshape3d.h:142
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoall::apply() algorithm for all scalar types.
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition heffte_reshape3d.h:146
Reshape algorithm based on the MPI_Alltoallv() method.
Definition heffte_reshape3d.h:226
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition heffte_reshape3d.h:247
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition heffte_reshape3d.h:243
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition heffte_reshape3d.h:239
~reshape3d_alltoallv()
Destructor, frees the comm generated by the constructor.
Definition heffte_reshape3d.h:229
friend std::unique_ptr< reshape3d_alltoallv< b, p, i > > make_reshape3d_alltoallv(typename backend::device_instance< b >::stream_type, std::vector< box3d< i > > const &, std::vector< box3d< i > > const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoallv::apply() algorithm for all scalar types.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition heffte_reshape3d.h:235
Base reshape interface.
Definition heffte_reshape3d.h:60
reshape3d_base(index cinput_size, index coutput_size)
Constructor that sets the input and output sizes.
Definition heffte_reshape3d.h:63
index const output_size
Stores the size of the output.
Definition heffte_reshape3d.h:86
virtual void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const =0
Apply the reshape, double precision complex.
virtual ~reshape3d_base()=default
Default virtual destructor.
index size_intput() const
Returns the input size.
Definition heffte_reshape3d.h:76
scalar_type * cpu_send_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition heffte_reshape3d.h:93
virtual void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const =0
Apply the reshape, single precision complex.
virtual void apply(int batch_size, double const source[], double destination[], double workspace[]) const =0
Apply the reshape, double precision.
index const input_size
Stores the size of the input.
Definition heffte_reshape3d.h:84
index size_output() const
Returns the output size.
Definition heffte_reshape3d.h:78
virtual void apply(int batch_size, float const source[], float destination[], float workspace[]) const =0
Apply the reshape, single precision.
scalar_type * cpu_recv_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition heffte_reshape3d.h:99
std::vector< float > send_unaware
Temp buffers for the gpu-unaware algorithms.
Definition heffte_reshape3d.h:105
virtual size_t size_workspace() const
Returns the workspace size.
Definition heffte_reshape3d.h:80
std::vector< float > recv_unaware
Temp buffers for the gpu-unaware algorithms.
Definition heffte_reshape3d.h:107
Reshape algorithm based on the MPI_Send() and MPI_Irecv() methods.
Definition heffte_reshape3d.h:341
~reshape3d_pointtopoint()=default
Destructor, frees the comm generated by the constructor.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition heffte_reshape3d.h:350
void no_gpuaware_send_recv(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm that does not use GPU-Aware MPI.
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm for all scalar types.
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition heffte_reshape3d.h:358
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition heffte_reshape3d.h:362
friend std::unique_ptr< reshape3d_pointtopoint< b, p, i > > make_reshape3d_pointtopoint(typename backend::device_instance< b >::stream_type, std::vector< box3d< i > > const &, std::vector< box3d< i > > const &, reshape_algorithm, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition heffte_reshape3d.h:354
Special case of the reshape that does not involve MPI communication but applies a transpose instead.
Definition heffte_reshape3d.h:444
reshape3d_transpose(typename backend::device_instance< location_tag >::stream_type q, pack_plan_3d< index > const cplan)
Constructor using the provided unpack plan.
Definition heffte_reshape3d.h:447
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition heffte_reshape3d.h:467
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition heffte_reshape3d.h:455
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition heffte_reshape3d.h:463
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition heffte_reshape3d.h:459
reshape_algorithm
Defines list of potential communication algorithms.
Definition heffte_plan_logic.h:48
@ alltoall
Using the MPI_Alltoall options, with padding on the data.
@ alltoallv
Using the MPI_Alltoallv options, no padding on the data (default option).
bool match(std::vector< box3d< index > > const &shape0, std::vector< box3d< index > > const &shape1)
Compares two vectors of boxes, returns true if all boxes match.
Definition heffte_geometry.h:246
int comm_rank(MPI_Comm const comm)
Returns the rank of this process within the specified comm.
Definition heffte_utils.h:79
void comm_free(MPI_Comm const comm)
Calls free on the MPI comm.
Definition heffte_utils.h:175
size_t get_workspace_size(std::array< std::unique_ptr< reshape3d_base< index > >, 4 > const &shapers)
Returns the maximum workspace size used by the shapers.
Definition heffte_reshape3d.h:115
std::unique_ptr< reshape3d_pointtopoint< location_tag, packer, index > > make_reshape3d_pointtopoint(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index > > const &input_boxes, std::vector< box3d< index > > const &output_boxes, reshape_algorithm algorithm, bool use_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_alltoall< location_tag, packer, index > > make_reshape3d_alltoall(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index > > const &input_boxes, std::vector< box3d< index > > const &output_boxes, bool uses_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_alltoallv< location_tag, packer, index > > make_reshape3d_alltoallv(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index > > const &input_boxes, std::vector< box3d< index > > const &output_boxes, bool use_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_base< index > > make_reshape3d(typename backend::device_instance< typename backend::buffer_traits< backend_tag >::location >::stream_type stream, std::vector< box3d< index > > const &input_boxes, std::vector< box3d< index > > const &output_boxes, MPI_Comm const comm, plan_options const options)
Factory method to create a reshape3d instance.
Definition heffte_reshape3d.h:505
void compute_overlap_map_transpose_pack(int me, int nprocs, box3d< index > const destination, std::vector< box3d< index > > const &boxes, std::vector< int > &proc, std::vector< int > &offset, std::vector< int > &sizes, std::vector< pack_plan_3d< index > > &plans)
Generates an unpack plan where the boxes and the destination do not have the same order.
Namespace containing all HeFFTe methods and classes.
Definition heffte_backend_cuda.h:38
Common data-transfer operations, must be specialized for each location (cpu/gpu).
Definition heffte_common.h:59
Holds the auxiliary variables needed by each backend.
Definition heffte_common.h:408
device_instance(void *=nullptr)
Empty constructor.
Definition heffte_common.h:410
void * stream()
Returns the nullptr.
Definition heffte_common.h:414
Defines the direct packer without implementation, use the specializations to get the CPU or GPU implementation.
Definition heffte_pack3d.h:83
Wrapper around cufftHandle plans, set for float or double complex.
Definition heffte_backend_cuda.h:346
Defines a set of tweaks and options to use in the plan generation.
Definition heffte_plan_logic.h:131
reshape_algorithm algorithm
Defines the communication algorithm.
Definition heffte_plan_logic.h:148
bool use_gpu_aware
Defines whether to use MPI calls directly from the GPU or to move to the CPU first.
Definition heffte_plan_logic.h:152
Indicates the use of cpu backend and that all input/output data and arrays will be bound to the cpu.
Definition heffte_common.h:38