Semi-Lagrangian Library
Modular library for kinetic and gyrokinetic simulations of plasmas in fusion energy devices.
sll_m_decomposition.F90
1 !**************************************************************
2 !
3 ! Domain decomposition module for Selalib (1D, ..., 6D)
4 !
5 ! Some design aspects are borrowed from sll_remap.F90.
6 !
7 ! Note:
8 ! For memory reasons, this code is implemented separately
9 ! from the comm/port model provided by the existing module
10 ! located at "point_to_point_communications".
11 ! In 6d, we cannot afford allocating 2*12=24 buffers to
12 ! implement the non-blocking ghost cell exchange using
13 ! the comm/port model.
14 !
15 !**************************************************************
21 
22 
23 ! Preprocessor macro: Use single precision for the halo exchange, i.e.
24 ! basically a poor man's lossy compression to speed up MPI communication.
25 ! NOTE: The define should be passed by the build system, please see
26 ! the build script <mpcdf.sh> for an example.
27 !!!#define USE_HALO_REAL32
28 #ifdef USE_HALO_REAL32
29 #define HALO_DTYPE sll_real32
30 #else
31 #define HALO_DTYPE sll_real64
32 #endif
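! For illustration (numbers chosen arbitrarily, not taken from a benchmark): with
! USE_HALO_REAL32 a halo message of 2*32**5 = 67,108,864 elements shrinks from
! roughly 512 MiB (real64) to roughly 256 MiB (real32), at the cost of exchanging
! the ghost cells in reduced precision.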
33 
34 
36 !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
37 #include "sll_assert.h"
38 #include "sll_memory.h"
39 #include "sll_working_precision.h"
40 
41  use sll_m_collective, only: &
43 
44  use mpi, only: &
45  mpi_cart_create, &
46  mpi_cart_get, &
47  mpi_cart_sub, &
48  mpi_cart_shift, &
49  mpi_double_precision, &
50  mpi_real, &
51  mpi_send, &
52  mpi_recv, &
53  mpi_sendrecv, &
54  mpi_status_ignore, &
55  mpi_success, &
56  mpi_allreduce, &
57  mpi_integer, &
58  mpi_sum, &
59  mpi_group_incl, &
60  mpi_comm_create, &
61  mpi_group_free, &
62  mpi_barrier, &
63  mpi_finalize, &
64  mpi_byte, &
65  mpi_max_processor_name, &
66  mpi_get_processor_name, &
67  mpi_scatter, &
68  mpi_character, &
69  mpi_comm_split, &
70  mpi_comm_free
71 
73 
74 #ifdef _OPENMP
75  use omp_lib
76 #define OMP_COLLAPSE collapse(2)
77 #define OMP_SCHEDULE schedule(static)
78 !#define OMP_SCHEDULE schedule(dynamic)
79 #endif
80 
81 #ifdef USE_FMEMPOOL
82  use fmempool
83 #endif
84 
86 
87  implicit none
88 
89  public :: &
114 
115  public :: &
120 
121  private
122 
125  ! topology-associated MPI communicator
126  sll_int32 :: comm
127  ! MPI rank
128  sll_int32 :: rank
129  ! MPI size
130  sll_int32 :: nprocs
131  ! store optional information (for debugging purposes)
132  sll_int32 :: info = -1
133  ! array containing the number of MPI processes to use along the n-th dimension
134  sll_int32 :: procs(6)
135  ! array indicating if a periodic boundary condition exists at the n-th dimension
136  logical :: periodic(6)
137  ! coordinates of the current MPI process within the cartesian topology
138  sll_int32 :: coords(6)
139  ! MPI ranks of the topological neighbors of the current MPI process
140  sll_int32 :: neighbors(12)
142 
145  ! topology-associated MPI communicator
146  sll_int32 :: comm
147  ! MPI rank
148  sll_int32 :: rank
149  ! MPI size
150  sll_int32 :: nprocs
151  ! store optional information (for debugging purposes)
152  sll_int32 :: info = -1
153  ! array containing the number of MPI processes to use along the n-th dimension
154  sll_int32 :: procs(3)
155  ! array indicating if a periodic boundary condition exists at the n-th dimension
156  logical :: periodic(3)
157  ! coordinates of the current MPI process within the cartesian topology
158  sll_int32 :: coords(3)
159  ! MPI ranks of the topological neighbors of the current MPI process
160  sll_int32 :: neighbors(6)
162 
165  ! --- primary: local array extents usable for computation, width of the halo
166  sll_int32, dimension(:) :: mn(6) ! min index, w/o halo
167  sll_int32, dimension(:) :: mx(6) ! max index, w/o halo
168  sll_int32, dimension(:) :: hw(6) ! halo width
169  ! --- derived: total allocated extents of the local array, including halo cells
170  sll_int32, dimension(:) :: lo(6) ! min index, w/ halo
171  sll_int32, dimension(:) :: hi(6) ! max index, w/ halo
172  ! --- derived: auxiliary variables, array sizes
173  sll_int32, dimension(:) :: nw(6) ! net width of the array, w/o halo
174  sll_int32, dimension(:) :: gw(6) ! gross width of the array, w/ halo
175  ! --- derived: ranges to copy between send (tx) and receive (rx) buffers and arrays
176  sll_int32, dimension(:) :: tx_lolo(6) ! lower/left neighbor, lower bound
177  sll_int32, dimension(:) :: tx_lohi(6) ! lower/left neighbor, upper bound
178  sll_int32, dimension(:) :: tx_hilo(6) ! upper/right neighbor, lower bound
179  sll_int32, dimension(:) :: tx_hihi(6) ! upper/right neighbor, upper bound
180  sll_int32, dimension(:) :: rx_lolo(6)
181  sll_int32, dimension(:) :: rx_lohi(6)
182  sll_int32, dimension(:) :: rx_hilo(6)
183  sll_int32, dimension(:) :: rx_hihi(6)
184  end type decomposition_local_6d
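! Worked example (illustrative, single dimension i): for a global grid of 32 points
! split over topology%procs(i) = 4 processes, halo_width(i) = 2, and coords(i) = 1,
! the allocator further below yields
!   mn(i) =  9, mx(i) = 16, hw(i) = 2                   ! local computational range
!   lo(i) =  7, hi(i) = 18                              ! allocated range incl. halo
!   nw(i) =  8, gw(i) = 12                              ! net / gross widths
!   tx_lolo/lohi(i) =  9/10, tx_hilo/hihi(i) = 15/16    ! cells sent to the left / right
!   rx_lolo/lohi(i) =  7/ 8, rx_hilo/hihi(i) = 17/18    ! halo cells received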
185 
186 
187 ! --- "slim" halo redesign
189  sll_int32, dimension(:) :: mn(6) ! min index of buf
190  sll_int32, dimension(:) :: mx(6) ! max index of buf
191  sll_int32, dimension(:) :: nw(6) ! net width of buf
192 #ifdef USE_FMEMPOOL
193  halo_dtype, pointer :: buf(:,:,:,:,:,:) => null() ! halo buffer
194 #else
195  halo_dtype, dimension(:,:,:,:,:,:), allocatable :: buf
196 #endif
197  end type halo_buffer_6d
198 
199 
201  sll_int32, dimension(:) :: mn(3) ! min index of buf
202  sll_int32, dimension(:) :: mx(3) ! max index of buf
203  sll_int32, dimension(:) :: nw(3) ! net width of buf
204  halo_dtype, dimension(:,:,:), allocatable :: buf ! halo buffer
205  end type halo_buffer_3d
206 
207 
210  ! --- primary: local array extents usable for computation, width of the halo
211  sll_int32, dimension(:) :: mn(6) ! min index, w/o halo
212  sll_int32, dimension(:) :: mx(6) ! max index, w/o halo
213  sll_int32, dimension(:) :: nw(6) ! net width of the array, w/o halo
214  ! --- dynamic part
215  sll_int32 :: id ! dimension to which the halos currently apply (valid: 1, ..., 6; invalid: -1)
216  type(halo_buffer_6d) :: halo_left
217  type(halo_buffer_6d) :: halo_right
218  ! --- boundary condition buffers for spline interpolation
219  halo_dtype, allocatable :: bc_left_send(:,:,:,:,:)
220  halo_dtype, allocatable :: bc_right_send(:,:,:,:,:)
221  halo_dtype, allocatable :: bc_left(:,:,:,:,:)
222  halo_dtype, allocatable :: bc_right(:,:,:,:,:)
223  ! --- cell (DG) interpolation-specific entries
224  sll_int32, dimension(:) :: mn_cell(6) ! min cell
225  sll_int32, dimension(:) :: mx_cell(6) ! max cell
226  sll_int32, dimension(:) :: n_cells(6) ! local number of cells
228 
229 
230 
232  ! --- primary: local array extents usable for computation, width of the halo
233  sll_int32, dimension(:) :: mn(3) ! min index, w/o halo
234  sll_int32, dimension(:) :: mx(3) ! max index, w/o halo
235  sll_int32, dimension(:) :: nw(3) ! net width of the array, w/o halo
236  ! --- dynamic part
237  sll_int32 :: id ! dimension to which the halos currently apply (valid: 1, ..., 3; invalid: -1)
238  type(halo_buffer_3d) :: halo_left
239  type(halo_buffer_3d) :: halo_right
240  ! --- cell (DG) interpolation-specific entries
241  sll_int32, dimension(:) :: mn_cell(6) ! min cell
242  sll_int32, dimension(:) :: mx_cell(6) ! max cell
243  sll_int32, dimension(:) :: n_cells(6) ! local number of cells
245 
246 
247 
250  ! --- primary: local array extents usable for computation, width of the halo
251  sll_int32, dimension(:) :: mn(3) ! min index, w/o halo
252  sll_int32, dimension(:) :: mx(3) ! max index, w/o halo
253  sll_int32, dimension(:) :: hw(3) ! halo width
254  ! --- derived: total allocated extents of the local array, including halo cells
255  sll_int32, dimension(:) :: lo(3) ! min index, w/ halo
256  sll_int32, dimension(:) :: hi(3) ! max index, w/ halo
257  ! --- derived: auxiliary variables, array sizes
258  sll_int32, dimension(:) :: nw(3) ! net width of the array, w/o halo
259  sll_int32, dimension(:) :: gw(3) ! gross width of the array, w/ halo
260  ! --- derived: ranges to copy between send (tx) and receive (rx) buffers and arrays
261  sll_int32, dimension(:) :: tx_lolo(3) ! lower/left neighbor, lower bound
262  sll_int32, dimension(:) :: tx_lohi(3) ! lower/left neighbor, upper bound
263  sll_int32, dimension(:) :: tx_hilo(3) ! upper/right neighbor, lower bound
264  sll_int32, dimension(:) :: tx_hihi(3) ! upper/right neighbor, upper bound
265  sll_int32, dimension(:) :: rx_lolo(3)
266  sll_int32, dimension(:) :: rx_lohi(3)
267  sll_int32, dimension(:) :: rx_hilo(3)
268  sll_int32, dimension(:) :: rx_hihi(3)
269  end type decomposition_local_3d
270 
273  sll_int32, dimension(:) :: global(6)
274  type(decomposition_local_6d) :: local
275  end type sll_t_decomposition_6d
276 
277 
280  sll_int32, dimension(:) :: global(6)
281  sll_int32, dimension(:) :: n_cells(6) ! local number of cells (DG)
284 
285 
288  sll_int32, dimension(:) :: global(3)
289  type(decomposition_local_3d) :: local
290  end type sll_t_decomposition_3d
291 
292 
295  sll_int32, dimension(:) :: global(3)
298 
299 
301  !module procedure sll_f_new_cartesian_topology_3d
302  module procedure sll_f_new_cartesian_topology_6d
303  end interface sll_o_new_cartesian_topology
304 
308 
309  ! --- DEPRECATED - should be sll_f_*, see below!
311  module procedure sll_f_apply_halo_exchange_6d_real64
312  end interface sll_o_apply_halo_exchange
313 
315  module procedure sll_f_apply_halo_exchange_6d_real64
318  end interface sll_f_apply_halo_exchange
319 
320 
321 contains
322 
323 
325  subroutine get_transposed_process_map(procs_per_dimension, rank_map)
326  integer, parameter :: nd=6
327  sll_int32, intent(in) :: procs_per_dimension(nd)
328  sll_int32, intent(inout) :: rank_map(0:)
329  integer :: i,j,k,l,m,n,rk
330  integer, allocatable :: mpi_grid(:,:,:,:,:,:)
331 
332  allocate(mpi_grid(0:procs_per_dimension(1)-1,&
333  0:procs_per_dimension(2)-1,&
334  0:procs_per_dimension(3)-1,&
335  0:procs_per_dimension(4)-1,&
336  0:procs_per_dimension(5)-1,&
337  0:procs_per_dimension(6)-1))
338 
339  ! set up integer array in row major (C) order with MPI ranks ('rk'),
340  ! mimicking a compact layout of MPI processes onto nodes
341  rk = 0
342  do i=0,procs_per_dimension(1)-1
343  do j=0,procs_per_dimension(2)-1
344  do k=0,procs_per_dimension(3)-1
345  do l=0,procs_per_dimension(4)-1
346  do m=0,procs_per_dimension(5)-1
347  do n=0,procs_per_dimension(6)-1
348  mpi_grid(i,j,k,l,m,n) = rk
349  rk = rk + 1
350  enddo
351  enddo
352  enddo
353  enddo
354  enddo
355  enddo
356 
357  ! fill rank map such that the original process grid is transposed
358  rk = 0
359  do n=0,procs_per_dimension(6)-1
360  do m=0,procs_per_dimension(5)-1
361  do l=0,procs_per_dimension(4)-1
362  do k=0,procs_per_dimension(3)-1
363  do j=0,procs_per_dimension(2)-1
364  do i=0,procs_per_dimension(1)-1
365  rank_map(rk) = mpi_grid(i,j,k,l,m,n)
366  rk = rk + 1
367  enddo
368  enddo
369  enddo
370  enddo
371  enddo
372  enddo
373 
374  deallocate(mpi_grid)
375  end subroutine
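! Example (illustrative): for procs_per_dimension = (/2,1,1,1,1,2/) the row-major grid
! above assigns ranks 0..3 to the coordinates (0,..,0), (0,..,1), (1,..,0), (1,..,1);
! transposing the loop order therefore yields rank_map = (/0, 2, 1, 3/).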
376 
377 
379  function sll_f_new_cartesian_topology_6d(top_collective, procs_per_dimension, periodic)
381  type(sll_t_collective_t), intent(in) :: top_collective
382  integer, parameter :: nd=6
383  sll_int32, intent(in) :: procs_per_dimension(nd)
384  logical, intent(in) :: periodic(nd)
385  ! reordering of MPI ranks with MPI_Cart_create()
386  logical :: q_reorder
387  sll_int32 :: i, ierr
388  integer :: name_len, my_rank, num_ranks, fd, in1, in2, new_rank, comm_temp
389  character(len=MPI_MAX_PROCESSOR_NAME) :: hostname
390  character(len=MPI_MAX_PROCESSOR_NAME), allocatable :: hostnames(:)
391  integer, allocatable :: ranks_reordered(:)
392  logical :: q_block, q_block_6d_py, q_transpose
393 
394  my_rank = top_collective%rank
395  num_ranks = top_collective%size
396 
397  ! find out if and how we are supposed to modify the process grid
398  q_transpose = sll_f_query_environment("SLL_TRANSPOSE_PROCESS_GRID", .false.)
399  q_block = sll_f_query_environment("SLL_USE_PROCESS_BLOCKING", .false.)
400  q_reorder = sll_f_query_environment("SLL_MPI_CART_CREATE_REORDER", .false.)
401  inquire(file="./block6d.py", exist=q_block_6d_py)
402 
403  ! catch non-useful logical cases
404  if (((q_transpose).or.(q_block)).and.(q_reorder)) then
405  q_reorder = .false.
406  endif
407  if ((q_transpose).and.(q_block)) then
408  q_transpose = .false.
409  q_block = .false.
410  endif
411  if ((q_block).and.(.not.(q_block_6d_py))) then
412  if (my_rank == 0) then
413  write(*,*) " MPI topology: disabled blocking due to missing <block6d.py>"
414  endif
415  q_block = .false.
416  endif
417  if ((q_block).and.(top_collective%size < 64)) then
418  if (my_rank == 0) then
419  write(*,*) " MPI topology: disabled blocking due to small number of processes"
420  endif
421  q_block = .false.
422  endif
423 
424  sll_allocate(sll_f_new_cartesian_topology_6d, ierr)
425 
426  sll_f_new_cartesian_topology_6d%procs = procs_per_dimension
427  sll_f_new_cartesian_topology_6d%periodic = periodic
428 
429  if ((q_transpose).or.(q_block)) then
430 ! if (my_rank == 0) then
431  allocate(ranks_reordered(0:num_ranks-1))
432 ! endif
433 
434  ! fill the 'ranks_reordered' array, either blocked or transposed
435  if (q_block) then
436  ! implementation transferred from './blocking_prototype/cart.f90'
437  call mpi_get_processor_name(hostname, name_len, ierr)
438  if (my_rank == 0) then
439  write(*,*) " MPI topology: blocked process topology"
440 
441  allocate(hostnames(0:num_ranks-1))
442 
443  ! collect hostname table on rank 0
444  hostnames(0) = hostname
445  do i=1,num_ranks-1
446  call mpi_recv(hostnames(i), mpi_max_processor_name, mpi_character, &
447  i, 0, top_collective%comm, mpi_status_ignore, ierr)
448  enddo
449 
450  ! write dims and hostnames table to text file
451  fd = 42
452  open(unit=fd, file='rank_hostnames.dat', status='replace', action='write')
453  write(fd,*) num_ranks
454  write(fd,*) nd
455  do i=1,nd
456  write(fd,*) sll_f_new_cartesian_topology_6d%procs(i)
457  enddo
458  do i=0,num_ranks-1
459  write(fd,'(I6,A,A)') i, " ", trim(hostnames(i))
460  enddo
461  close(fd)
462  deallocate(hostnames)
463 
464  ! Reorder the MPI ranks in a blocked way;
465  ! this is currently done separately in a Python script.
466  ! If this turns out to be successful, a Fortran implementation will follow.
467  ! call system("python ./block6d.py")
468  call execute_command_line("python ./block6d.py")
469 
470  ! read reordered rank numbers
471  open(unit=fd, file='rank_reordered.dat', status='old', action='read')
472  do i=0,num_ranks-1
473  read(fd,*) in1, in2
474  ranks_reordered(i) = in2
475  enddo
476  close(fd)
477  else
478  call mpi_send(hostname, mpi_max_processor_name, mpi_character, &
479  0, 0, top_collective%comm, ierr)
480  endif
481  else
482  ! q_transpose branch
483  if (my_rank == 0) then
484  write(*,*) " MPI topology: transposed process topology"
485  call get_transposed_process_map(procs_per_dimension, ranks_reordered)
486  endif
487  endif
488 
489  ! distribute the individual new rank to all the processes
490  call mpi_scatter(ranks_reordered, 1, mpi_integer, &
491  new_rank, 1, mpi_integer, &
492  0, top_collective%comm, ierr)
493  if (my_rank == 0) then
494  deallocate(ranks_reordered)
495  endif
496 
497  ! renumber the MPI ranks
498  call mpi_comm_split(top_collective%comm, 0, new_rank, comm_temp, ierr)
499  ! create a new cartesian topology based on the renumbered ranks
500  call mpi_cart_create(comm_temp, nd, &
503  q_reorder,&
505  ierr)
506  call mpi_comm_free(comm_temp, ierr)
507  else
508  ! native Cartesian topology creation
509  if ((my_rank == 0).and.(q_reorder)) then
510  write(*,*) " MPI topology: MPI_Cart_create() _may_ reorder processes."
511  endif
512  call mpi_cart_create(top_collective%comm, nd,&
515  q_reorder,&
517  ierr)
518  sll_assert(ierr == mpi_success)
519  endif
520 
521  call mpi_comm_rank(sll_f_new_cartesian_topology_6d%comm,&
523  ierr)
524  sll_assert(ierr == mpi_success)
525 
526  call mpi_comm_size(sll_f_new_cartesian_topology_6d%comm,&
528  ierr)
529  sll_assert(ierr == mpi_success)
530 
531  ! print new rank mapping
532  if ((q_transpose).or.(q_block)) then
533  write(*,'(A,I6,A,I6)') " : ", my_rank, " --> ", sll_f_new_cartesian_topology_6d%rank
534  endif
535 
536  ! query the coordinates of the current process within the cartesian topology
538  call mpi_cart_get(sll_f_new_cartesian_topology_6d%comm, nd,&
542  ierr)
543  sll_assert(ierr == mpi_success)
544 
545  ! determine the neighbors within the cartesian topology
546  sll_f_new_cartesian_topology_6d%neighbors = -1
547  do i=1,nd
548  call mpi_cart_shift(sll_f_new_cartesian_topology_6d%comm, i-1, 1, &
549  sll_f_new_cartesian_topology_6d%neighbors(2*i-1), &
550  sll_f_new_cartesian_topology_6d%neighbors(2*i), &
551  ierr)
552  sll_assert(ierr == mpi_success)
553  enddo
555 
559 
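! Usage sketch (illustrative; 'world' stands for any sll_t_collective_t object, e.g.
! the global collective provided by sll_m_collective):
!   type(sll_t_cartesian_topology_6d), pointer :: topo6d
!   topo6d => sll_f_new_cartesian_topology_6d(world, (/2,2,2,2,2,2/), &
!                     (/.true.,.true.,.true.,.true.,.true.,.true./))
! creates a fully periodic 2x2x2x2x2x2 process grid on 64 MPI processes.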
561  function sll_f_new_cartesian_topology_3d(top_collective, procs_per_dimension, periodic)
563  type(sll_t_collective_t), intent(in) :: top_collective
564  integer, parameter :: nd=3
565  sll_int32, intent(in) :: procs_per_dimension(nd)
566  logical, intent(in) :: periodic(nd)
567  ! disallow reordering of MPI ranks with MPI_Cart_create()
568  logical, parameter :: reorder = .false.
569  sll_int32 :: i, ierr
570 
571  sll_allocate(sll_f_new_cartesian_topology_3d, ierr)
572 
573  sll_f_new_cartesian_topology_3d%procs = procs_per_dimension
574  sll_f_new_cartesian_topology_3d%periodic = periodic
575 
576  ! create a cartesian process topology, return a new communicator
577  call mpi_cart_create(top_collective%comm, nd,&
580  reorder,&
582  ierr)
583  sll_assert(ierr == mpi_success)
584 
585 
586  call mpi_comm_rank(sll_f_new_cartesian_topology_3d%comm,&
588  ierr)
589  sll_assert(ierr == mpi_success)
590 
591  call mpi_comm_size(sll_f_new_cartesian_topology_3d%comm,&
593  ierr)
594  sll_assert(ierr == mpi_success)
595 
596 
597  ! query the coordinates of the current process within the cartesian topology
599  call mpi_cart_get(sll_f_new_cartesian_topology_3d%comm, nd,&
603  ierr)
604  sll_assert(ierr == mpi_success)
605 
606  ! determine the neighbors within the cartesian topology
607  sll_f_new_cartesian_topology_3d%neighbors = -1
608  do i=1,nd
609  call mpi_cart_shift(sll_f_new_cartesian_topology_3d%comm, i-1, 1, &
610  sll_f_new_cartesian_topology_3d%neighbors(2*i-1), &
611  sll_f_new_cartesian_topology_3d%neighbors(2*i), &
612  ierr)
613  sll_assert(ierr == mpi_success)
614  enddo
616 
620 
621 
625  type(sll_t_cartesian_topology_6d), pointer, intent(in) :: t6d
626  logical, dimension(6), intent(in) :: keep_dim
627  sll_int32, parameter :: nd = 3
628  sll_int32 :: i, j, ierr
629 
630  ! assert that we actually pick three dimensions from the 6D topology
631  j = 0
632  do i=1,6
633  if (keep_dim(i) .eqv. .true.) j = j + 1
634  end do
635  sll_assert(j == nd)
636 
637  sll_allocate(sll_f_new_cartesian_topology_3d_from_6d, ierr)
638  t3d => sll_f_new_cartesian_topology_3d_from_6d ! create a convenient pointer alias
639 
640  ! Create a 3d topology by projecting the velocity coordinates (dims 4,5,6)
641  ! down to the spatial coordinates (dims 1,2,3). As a result,
642  ! we obtain a set of new communicators with 3D Cartesian topology.
643  call mpi_cart_sub(t6d%comm, keep_dim, &
644  t3d%comm, ierr)
645  sll_assert(ierr == mpi_success)
646 
647  call mpi_comm_rank(t3d%comm,&
648  t3d%rank,&
649  ierr)
650  sll_assert(ierr == mpi_success)
651 
652  call mpi_comm_size(t3d%comm,&
653  t3d%nprocs,&
654  ierr)
655  sll_assert(ierr == mpi_success)
656 
657  ! Debug: Construct an identifier that is unique among the processors of the new 3D group.
658  i = t6d%rank
659  call mpi_allreduce(i, t3d%info, 1,&
660  mpi_integer, mpi_sum,&
661  t3d%comm, ierr)
662  sll_assert(ierr == mpi_success)
663 
664  ! Query the coordinates of the current process within the cartesian topology
665  t3d%coords = -1
666  t3d%procs = -1
667  t3d%periodic = .false.
668  call mpi_cart_get(t3d%comm, nd,&
669  t3d%procs,&
670  t3d%periodic,&
671  t3d%coords,&
672  ierr)
673  sll_assert(ierr == mpi_success)
674 
675  ! Determine the neighbors within the cartesian topology
676  t3d%neighbors = -1
677  do i=1,nd
678  call mpi_cart_shift(t3d%comm, i-1, 1, &
679  t3d%neighbors(2*i-1), &
680  t3d%neighbors(2*i), &
681  ierr)
682  sll_assert(ierr == mpi_success)
683  enddo
685 
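! Usage sketch (illustrative; 'topo6d' and 'topo3d' are assumed pointers of the
! respective topology types): keeping only the spatial dimensions of a 6D topology,
!   keep_dim = (/.true., .true., .true., .false., .false., .false./)
!   topo3d => sll_f_new_cartesian_topology_3d_from_6d(topo6d, keep_dim)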
686 
690  topo_3d_o ! just a convenient pointer alias
691  type(sll_t_cartesian_topology_6d), pointer, intent(in) :: topo_6d
692  type(sll_t_cartesian_topology_3d), pointer, intent(in) :: topo_3d
693  sll_int32 :: i, j, n3d, ierr
694  sll_int32, allocatable :: topo_3d_rank_table(:)
695  sll_int32, allocatable :: topo_6d_rank_grouped_by_3d_rank(:)
696  sll_int32 :: group_6d, group_3d
697 
698  ! build a table that contains the 3D-ranks of each process in the 6D communicator
699  allocate(topo_3d_rank_table(0:topo_6d%nprocs-1))
700  topo_3d_rank_table(:) = -1
701  call mpi_allgather(topo_3d%rank, 1, mpi_integer,&
702  topo_3d_rank_table, 1, mpi_integer,&
703  topo_6d%comm, ierr)
704  sll_assert(ierr == mpi_success)
705  !write(*,*) topo_3d_rank_table
706 
707  ! determine the number of co-existing 3d communicators
708  n3d = 0
709  do i=0, topo_6d%nprocs-1
710  if (topo_3d%rank == topo_3d_rank_table(i)) then
711  n3d = n3d + 1
712  end if
713  end do
714 
715  ! build a table of the 6d ranks that have identical 3d ranks
716  allocate(topo_6d_rank_grouped_by_3d_rank(n3d))
717  topo_6d_rank_grouped_by_3d_rank(:) = -1
718  j = 1
719  do i=0, topo_6d%nprocs-1
720  if (topo_3d%rank == topo_3d_rank_table(i)) then
721  topo_6d_rank_grouped_by_3d_rank(j) = i
722  j = j + 1
723  end if
724  end do
725  !write(*,"(8192(I))") topo_6d%rank, topo_6d_rank_grouped_by_3d_rank
726 
727 
728  ! obtain the group belonging to the 6d communicator
729  call mpi_comm_group(topo_6d%comm, group_6d, ierr)
730  sll_assert(ierr == mpi_success)
731 
732  ! TODO : handle (re)ordering of the ranks in order to be compatible
733  ! with sll_remap (i.e. the data <--> rank mapping must be handled)
734 
735  ! form new groups defined by the 6d rank tables with identical 3d ranks (see above)
736  call mpi_group_incl(group_6d, n3d, topo_6d_rank_grouped_by_3d_rank, group_3d, ierr)
737  sll_assert(ierr == mpi_success)
738 
739  ! create the new topology
740  sll_allocate(topo_3d_o, ierr)
741  sll_assert(ierr == mpi_success)
742  call mpi_comm_create(topo_6d%comm, group_3d, topo_3d_o%comm, ierr)
743  sll_assert(ierr == mpi_success)
744  !
745  call mpi_comm_rank(topo_3d_o%comm,&
746  topo_3d_o%rank,&
747  ierr)
748  sll_assert(ierr == mpi_success)
749  !
750  call mpi_comm_size(topo_3d_o%comm,&
751  topo_3d_o%nprocs,&
752  ierr)
753  sll_assert(ierr == mpi_success)
754 
755  deallocate(topo_3d_rank_table)
756  deallocate(topo_6d_rank_grouped_by_3d_rank)
757 
758  call mpi_group_free(group_3d, ierr)
759  sll_assert(ierr == mpi_success)
760 
761  call mpi_group_free(group_6d, ierr)
762  sll_assert(ierr == mpi_success)
763 
766 
767 
768  function sll_f_new_cartesian_domain_decomposition_6d(topology, grid_size, halo_width)
770  type(sll_t_cartesian_topology_6d), pointer, intent(in) :: topology
771  integer, parameter :: nd=6
772  sll_int32, intent(in) :: grid_size(nd)
773  sll_int32, intent(in) :: halo_width(nd)
774  sll_int32 :: i, ierr
775  sll_int32 :: lp, l0, l1
776 
777  ! --- initial checks
778  do i=1,nd
779  sll_assert( halo_width(i) >= 0 )
780  end do
781  do i=1,nd
782  ! for the moment, the global grid must be divisible evenly among the MPI processes
783  sll_assert( mod(grid_size(i), topology%procs(i)) == 0 )
784  end do
785 
786  ! --- copy and calculate all the necessary index values ---
788 
790 
791  ! --- loop over dimensions and compute index values for each dimension
792  do i=1,nd
793  ! compute the local number of grid points
794  lp = grid_size(i) / topology%procs(i)
795  ! compute the lower local index bound (the coords array starts at zero)
796  l0 = 1 + topology%coords(i) * lp
797  ! compute the upper local index bound (the coords array starts at zero)
798  l1 = (topology%coords(i) + 1) * lp
799 
800  sll_assert( lp/2 >= halo_width(i) )
801 
804  sll_f_new_cartesian_domain_decomposition_6d%local%hw(i) = halo_width(i)
805  sll_f_new_cartesian_domain_decomposition_6d%local%lo(i) = l0 - halo_width(i)
806  sll_f_new_cartesian_domain_decomposition_6d%local%hi(i) = l1 + halo_width(i)
807 
808  sll_f_new_cartesian_domain_decomposition_6d%local%tx_lolo(i) = l0
809  sll_f_new_cartesian_domain_decomposition_6d%local%tx_lohi(i) = l0 + (halo_width(i) - 1)
810  sll_f_new_cartesian_domain_decomposition_6d%local%tx_hilo(i) = l1 - (halo_width(i) - 1)
811  sll_f_new_cartesian_domain_decomposition_6d%local%tx_hihi(i) = l1
812  sll_f_new_cartesian_domain_decomposition_6d%local%rx_lolo(i) = l0 - halo_width(i)
813  sll_f_new_cartesian_domain_decomposition_6d%local%rx_lohi(i) = l0 - 1
814  sll_f_new_cartesian_domain_decomposition_6d%local%rx_hilo(i) = l1 + 1
815  sll_f_new_cartesian_domain_decomposition_6d%local%rx_hihi(i) = l1 + halo_width(i)
816  end do
817 
818  ! compute net array width
821 
822  ! compute gross array width
825 
827 
828 
832 
833 
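! Usage sketch (illustrative; 'topo6d', 'grid_size', 'halo_width', and 'f6d' are
! placeholder names, not part of this module):
!   type(sll_t_decomposition_6d), pointer :: decomp
!   sll_real64, allocatable :: f6d(:,:,:,:,:,:)
!   decomp => sll_f_new_cartesian_domain_decomposition_6d(topo6d, grid_size, halo_width)
!   allocate(f6d(decomp%local%lo(1):decomp%local%hi(1), decomp%local%lo(2):decomp%local%hi(2), &
!                decomp%local%lo(3):decomp%local%hi(3), decomp%local%lo(4):decomp%local%hi(4), &
!                decomp%local%lo(5):decomp%local%hi(5), decomp%local%lo(6):decomp%local%hi(6)))
!   call sll_o_apply_halo_exchange(topo6d, decomp, f6d)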
834  ! --- allocator for redesigned ("slim") decomposition with dynamic halos
837  type(sll_t_cartesian_topology_6d), pointer, intent(in) :: topology
838  integer, parameter :: nd=6
839  sll_int32, intent(in) :: grid_size(nd)
840  sll_int32 :: i, ierr
841  sll_int32 :: lp, l0, l1
842 
843  ! --- initial checks
844  do i=1,nd
845  ! for the moment, the global grid must be divisible among the MPI processes
846  sll_assert( mod(grid_size(i), topology%procs(i)) == 0 )
847  end do
848 
849  ! --- copy and calculate all the necessary index values
852  ! loop over dimensions and compute index values for each dimension
853  do i=1,nd
854  ! compute the local number of grid points
855  lp = grid_size(i) / topology%procs(i)
856  ! compute the lower local index bound (the coords array starts at zero)
857  l0 = 1 + topology%coords(i) * lp
858  ! compute the upper local index bound (the coords array starts at zero)
859  l1 = (topology%coords(i) + 1) * lp
860 
864  end do
865 
867 
868 ! #ifdef USE_FMEMPOOL
869 ! nullify(sll_f_new_cartesian_domain_decomposition_slim_6d%local%halo_left%buf)
870 ! nullify(sll_f_new_cartesian_domain_decomposition_slim_6d%local%halo_right%buf)
871 ! #endif
873 
874 
878 
879 
882  type(sll_t_cartesian_topology_3d), pointer, intent(in) :: topology
883  integer, parameter :: nd=3
884  sll_int32, intent(in) :: grid_size(nd)
885  sll_int32 :: i, ierr
886  sll_int32 :: lp, l0, l1
887 
888  ! --- initial checks
889  do i=1,nd
890  ! for the moment, the global grid must be divisible among the MPI processes
891  sll_assert( mod(grid_size(i), topology%procs(i)) == 0 )
892  end do
893 
894  ! --- copy and calculate all the necessary index values
897  ! loop over dimensions and compute index values for each dimension
898  do i=1,nd
899  ! compute the local number of grid points
900  lp = grid_size(i) / topology%procs(i)
901  ! compute the lower local index bound (the coords array starts at zero)
902  l0 = 1 + topology%coords(i) * lp
903  ! compute the upper local index bound (the coords array starts at zero)
904  l1 = (topology%coords(i) + 1) * lp
905 
909  end do
910 
913 
914 
918 
919 
920  ! --- allocator for redesigned ("slim") decomposition with dynamic halos
921  function sll_f_new_cartesian_cell_domain_decomposition_slim_6d(topology, n_cells, degree)
923  type(sll_t_cartesian_topology_6d), pointer, intent(in) :: topology
924  integer, parameter :: nd=6
925  sll_int32, intent(in) :: n_cells(nd)
926  sll_int32, intent(in) :: degree(nd)
927 
928  sll_int32 :: i, ierr
929  sll_int32 :: lp, l0, l1
930 
931  ! --- initial checks
932  do i=1,nd
933  ! for the moment, the global grid must be divisible among the MPI processes
934  sll_assert( mod(n_cells(i), topology%procs(i)) == 0 )
935  end do
936 
937  ! --- copy and calculate all the necessary index values
939 
941  ! GRID indices: loop over dimensions and compute index values for each dimension
942  do i=1,nd
943  ! compute the local number of grid points
944  lp = (n_cells(i) / topology%procs(i)) * degree(i)
945  ! compute the lower local index bound (the coords array starts at zero)
946  l0 = 1 + topology%coords(i) * lp
947  ! compute the upper local index bound (the coords array starts at zero)
948  l1 = (topology%coords(i) + 1) * lp
949 
953  end do
954 
956  ! CELL indices: loop over dimensions and compute index values for each dimension
957  do i=1,nd
958  ! compute the local number of cells
959  lp = n_cells(i) / topology%procs(i)
960  ! compute the lower local index bound (the coords array starts at zero)
961  l0 = 1 + topology%coords(i) * lp
962  ! compute the upper local index bound (the coords array starts at zero)
963  l1 = (topology%coords(i) + 1) * lp
964 
968  end do
969 
971 
972 ! #ifdef USE_FMEMPOOL
973 ! nullify(sll_f_new_cartesian_cell_domain_decomposition_slim_6d%local%halo_left%buf)
974 ! nullify(sll_f_new_cartesian_cell_domain_decomposition_slim_6d%local%halo_right%buf)
975 ! #endif
977 
980 !#ifdef USE_FMEMPOOL
981 ! call mp_cleanup()
982 !#endif
984 
985 
986  function sll_f_new_cartesian_domain_decomposition_3d(topology, grid_size, halo_width)
988  type(sll_t_cartesian_topology_3d), pointer, intent(in) :: topology
989  integer, parameter :: nd=3
990  sll_int32, intent(in) :: grid_size(nd)
991  sll_int32, intent(in) :: halo_width(nd)
992  sll_int32 :: i, ierr
993  sll_int32 :: lp, l0, l1
994 
995  ! --- initial checks
996  do i=1,nd
997  sll_assert( halo_width(i) >= 0 )
998  end do
999  do i=1,nd
1000  ! for the moment, the global grid must be divisible evenly among the MPI processes
1001  sll_assert( mod(grid_size(i), topology%procs(i)) == 0 )
1002  end do
1003 
1004  ! --- copy and calculate all the necessary index values ---
1006 
1008 
1009  ! --- loop over dimensions and compute index values for each dimension
1010  do i=1,nd
1011  ! compute the local number of grid points
1012  lp = grid_size(i) / topology%procs(i)
1013  ! compute the lower local index bound (the coords array starts at zero)
1014  l0 = 1 + topology%coords(i) * lp
1015  ! compute the upper local index bound (the coords array starts at zero)
1016  l1 = (topology%coords(i) + 1) * lp
1017 
1018  sll_assert( lp/2 >= halo_width(i) )
1019 
1022  sll_f_new_cartesian_domain_decomposition_3d%local%hw(i) = halo_width(i)
1023  sll_f_new_cartesian_domain_decomposition_3d%local%lo(i) = l0 - halo_width(i)
1024  sll_f_new_cartesian_domain_decomposition_3d%local%hi(i) = l1 + halo_width(i)
1025 
1026  sll_f_new_cartesian_domain_decomposition_3d%local%tx_lolo(i) = l0
1027  sll_f_new_cartesian_domain_decomposition_3d%local%tx_lohi(i) = l0 + (halo_width(i) - 1)
1028  sll_f_new_cartesian_domain_decomposition_3d%local%tx_hilo(i) = l1 - (halo_width(i) - 1)
1029  sll_f_new_cartesian_domain_decomposition_3d%local%tx_hihi(i) = l1
1030  sll_f_new_cartesian_domain_decomposition_3d%local%rx_lolo(i) = l0 - halo_width(i)
1031  sll_f_new_cartesian_domain_decomposition_3d%local%rx_lohi(i) = l0 - 1
1032  sll_f_new_cartesian_domain_decomposition_3d%local%rx_hilo(i) = l1 + 1
1033  sll_f_new_cartesian_domain_decomposition_3d%local%rx_hihi(i) = l1 + halo_width(i)
1034  end do
1035 
1036  ! compute net array width
1039 
1040  ! compute gross array width
1044 
1045 
1049 
1050 
1051  subroutine sll_s_copy_array_to_buffer_6d_real64(arr, arr_lo, arr_hi, buf, ranges, n_threads)
1052  sll_int32, dimension(6), intent(in) :: arr_lo
1053  sll_int32, dimension(6), intent(in) :: arr_hi
1054  sll_real64, dimension(arr_lo(1):arr_hi(1), arr_lo(2):arr_hi(2), arr_lo(3):arr_hi(3), arr_lo(4):arr_hi(4), arr_lo(5):arr_hi(5), arr_lo(6):arr_hi(6)), &
1055  intent(in) :: arr
1056  halo_dtype, dimension(:), intent(out) :: buf
1057  sll_int32, dimension(6,2), intent(in) :: ranges
1058  sll_int32, intent(in), optional :: n_threads
1059  sll_int32 :: i,j,k,l,m,n
1060  sll_int32 :: ii,ij,ik,il,im,in ! original indices, computed from zero-based ones
1061  sll_int32 :: wi,wj,wk,wl,wm,wn ! widths of the array, for zero-based indexing
1062  sll_int32 :: oj,ok,ol,om,on ! offsets of the buffer, for zero-based indexing
1063  sll_int32 :: idx, n_omp_threads
1064 
1065 #ifdef _OPENMP
1066  if (present(n_threads)) then
1067  n_omp_threads = n_threads
1068  else
1069  n_omp_threads = omp_get_max_threads()
1070  endif
1071 #else
1072  n_omp_threads = 1
1073 #endif
1074 
1075  wi = ranges(1,2) - ranges(1,1) + 1
1076  wj = ranges(2,2) - ranges(2,1) + 1
1077  wk = ranges(3,2) - ranges(3,1) + 1
1078  wl = ranges(4,2) - ranges(4,1) + 1
1079  wm = ranges(5,2) - ranges(5,1) + 1
1080  wn = ranges(6,2) - ranges(6,1) + 1
1081 !$omp parallel num_threads(n_omp_threads) default(shared) private(i,j,k,l,m,n, ii,ij,ik,il,im,in, oj,ok,ol,om,on, idx)
1082 !$omp do OMP_SCHEDULE OMP_COLLAPSE
1083  do n=0,wn-1
1084  do m=0,wm-1
1085  ! --- enable OMP collapse
1086  in = n + ranges(6,1)
1087  on = n * wi * wj * wk * wl * wm
1088  ! ---
1089  im = m + ranges(5,1)
1090  om = m * wi * wj * wk * wl
1091  do l=0,wl-1
1092  il = l + ranges(4,1)
1093  ol = l * wi * wj * wk
1094  do k=0,wk-1
1095  ik = k + ranges(3,1)
1096  ok = k * wi * wj
1097  do j=0,wj-1
1098  ij = j + ranges(2,1)
1099  oj = j * wi
1100  do i=0,wi-1
1101  ii = i + ranges(1,1)
1102  ! linear index calculation
1103  idx = 1 + i + oj + ok + ol + om + on
1104  buf(idx) = arr(ii,ij,ik,il,im,in)
1105  enddo
1106  enddo
1107  enddo
1108  enddo
1109  enddo
1110  enddo
1111 !$omp end do
1112 !$omp end parallel
1113 ! --- previous version, a little slower, non-parallel, kept for documentation purposes
1114 ! idx=1
1115 ! do n=ranges(6,1),ranges(6,2)
1116 ! do m=ranges(5,1),ranges(5,2)
1117 ! do l=ranges(4,1),ranges(4,2)
1118 ! do k=ranges(3,1),ranges(3,2)
1119 ! do j=ranges(2,1),ranges(2,2)
1120 ! do i=ranges(1,1),ranges(1,2)
1121 ! buf(idx) = arr(i,j,k,l,m,n)
1122 ! idx = idx + 1
1123 ! enddo
1124 ! enddo
1125 ! enddo
1126 ! enddo
1127 ! enddo
1128 ! enddo
1130 
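! Index note (illustrative): the packing above is column-major, identical to Fortran
! array order; e.g. for widths wi = 4, wj = 3 the element (i,j) = (2,1) of the first
! k/l/m/n slab ends up at idx = 1 + 2 + 1*4 = 7 in 'buf'.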
1131 
1132  subroutine sll_s_copy_array_to_buffer_3d_real64(arr, arr_lo, arr_hi, buf, ranges, n_threads)
1133  sll_int32, dimension(3), intent(in) :: arr_lo
1134  sll_int32, dimension(3), intent(in) :: arr_hi
1135  sll_real64, dimension(arr_lo(1):arr_hi(1), arr_lo(2):arr_hi(2), arr_lo(3):arr_hi(3)), intent(in) :: arr
1136  halo_dtype, dimension(:), intent(out) :: buf
1137  sll_int32, dimension(3,2), intent(in) :: ranges
1138  sll_int32, intent(in), optional :: n_threads
1139 
1140  sll_int32 :: i,j,k
1141  sll_int32 :: ii,ij,ik ! original indices, computed from zero-based ones
1142  sll_int32 :: wi,wj,wk ! widths of the array, for zero-based indexing
1143  sll_int32 :: oj,ok ! offsets of the buffer, for zero-based indexing
1144  sll_int32 :: idx, n_omp_threads
1145 
1146 #ifdef _OPENMP
1147  if (present(n_threads)) then
1148  n_omp_threads = n_threads
1149  else
1150  n_omp_threads = omp_get_max_threads()
1151  endif
1152 #else
1153  n_omp_threads = 1
1154 #endif
1155 
1156  wi = ranges(1,2) - ranges(1,1) + 1
1157  wj = ranges(2,2) - ranges(2,1) + 1
1158  wk = ranges(3,2) - ranges(3,1) + 1
1159 !$omp parallel num_threads(n_omp_threads) default(shared) private(i,j,k,ii,ij,ik,oj,ok,idx)
1160 !$omp do OMP_SCHEDULE OMP_COLLAPSE
1161  do k=0,wk-1
1162  do j=0,wj-1
1163  ik = k + ranges(3,1)
1164  ok = k * wi * wj
1165  ij = j + ranges(2,1)
1166  oj = j * wi
1167  do i=0,wi-1
1168  ii = i + ranges(1,1)
1169  idx = 1 + i + oj + ok
1170  buf(idx) = arr(ii,ij,ik)
1171  enddo
1172  enddo
1173  enddo
1174 !$omp end do
1175 !$omp end parallel
1177 
1178 
1179 
1180  subroutine copy_buffer_to_array_6d_real64(buf, arr, arr_lo, arr_hi, ranges, n_threads)
1181  halo_dtype, dimension(:), intent(in) :: buf
1182  sll_int32, dimension(6), intent(in) :: arr_lo
1183  sll_int32, dimension(6), intent(in) :: arr_hi
1184  sll_real64, dimension(arr_lo(1):arr_hi(1), arr_lo(2):arr_hi(2), arr_lo(3):arr_hi(3), arr_lo(4):arr_hi(4), arr_lo(5):arr_hi(5), arr_lo(6):arr_hi(6)),&
1185  intent(out) :: arr
1186  sll_int32, dimension(6,2), intent(in) :: ranges
1187  sll_int32, intent(in), optional :: n_threads
1188  sll_int32 :: i,j,k,l,m,n
1189  sll_int32 :: ii,ij,ik,il,im,in ! original indices, computed from zero-based ones
1190  sll_int32 :: wi,wj,wk,wl,wm,wn ! widths of the array, for zero-based indexing
1191  sll_int32 :: oj,ok,ol,om,on ! offsets of the buffer, for zero-based indexing
1192  sll_int32 :: idx, n_omp_threads
1193 
1194 #ifdef _OPENMP
1195  if (present(n_threads)) then
1196  n_omp_threads = n_threads
1197  else
1198  n_omp_threads = omp_get_max_threads()
1199  endif
1200 #else
1201  n_omp_threads = 1
1202 #endif
1203 
1204  wi = ranges(1,2) - ranges(1,1) + 1
1205  wj = ranges(2,2) - ranges(2,1) + 1
1206  wk = ranges(3,2) - ranges(3,1) + 1
1207  wl = ranges(4,2) - ranges(4,1) + 1
1208  wm = ranges(5,2) - ranges(5,1) + 1
1209  wn = ranges(6,2) - ranges(6,1) + 1
1210 !$omp parallel num_threads(n_omp_threads) default(shared) private(i,j,k,l,m,n, ii,ij,ik,il,im,in, oj,ok,ol,om,on, idx)
1211 !$omp do OMP_SCHEDULE OMP_COLLAPSE
1212  do n=0,wn-1
1213  do m=0,wm-1
1214  ! --- enable OMP collapse
1215  in = n + ranges(6,1)
1216  on = n * wi * wj * wk * wl * wm
1217  ! ---
1218  im = m + ranges(5,1)
1219  om = m * wi * wj * wk * wl
1220  do l=0,wl-1
1221  il = l + ranges(4,1)
1222  ol = l * wi * wj * wk
1223  do k=0,wk-1
1224  ik = k + ranges(3,1)
1225  ok = k * wi * wj
1226  do j=0,wj-1
1227  ij = j + ranges(2,1)
1228  oj = j * wi
1229  do i=0,wi-1
1230  ii = i + ranges(1,1)
1231  ! linear index calculation
1232  idx = 1 + i + oj + ok + ol + om + on
1233  arr(ii,ij,ik,il,im,in) = buf(idx)
1234  enddo
1235  enddo
1236  enddo
1237  enddo
1238  enddo
1239  enddo
1240 !$omp end do
1241 !$omp end parallel
1242 ! --- previous version, a little slower, non-parallel, kept for documentation purposes
1243 ! idx=1
1244 ! do n=ranges(6,1),ranges(6,2)
1245 ! do m=ranges(5,1),ranges(5,2)
1246 ! do l=ranges(4,1),ranges(4,2)
1247 ! do k=ranges(3,1),ranges(3,2)
1248 ! do j=ranges(2,1),ranges(2,2)
1249 ! do i=ranges(1,1),ranges(1,2)
1250 ! arr(i,j,k,l,m,n) = buf(idx)
1251 ! idx = idx + 1
1252 ! enddo
1253 ! enddo
1254 ! enddo
1255 ! enddo
1256 ! enddo
1257 ! enddo
1258  end subroutine copy_buffer_to_array_6d_real64
1259 
1260 
1261  subroutine sll_f_apply_halo_exchange_6d_real64(topo, decomp, arr, dim_mask_in)
1262  integer, parameter :: nd = 6 ! we handle 6 dimensions
1263  ! --- arguments
1264  type(sll_t_cartesian_topology_6d), intent(in) :: topo
1265  type(sll_t_decomposition_6d), intent(in) :: decomp
1266  sll_real64, dimension(:,:,:,:,:,:), intent(inout) :: arr(decomp%local%lo(1):decomp%local%hi(1), &
1267  decomp%local%lo(2):decomp%local%hi(2), &
1268  decomp%local%lo(3):decomp%local%hi(3), &
1269  decomp%local%lo(4):decomp%local%hi(4), &
1270  decomp%local%lo(5):decomp%local%hi(5), &
1271  decomp%local%lo(6):decomp%local%hi(6))
1272  logical, dimension(:), intent(in), optional :: dim_mask_in(nd)
1273  ! --- MPI communication-related variables and buffers
1274  sll_int64, save :: bufsize = 0
1275  sll_int64 :: nxc ! total number of elements to be exchanged
1276  sll_int64 :: off ! offset where to start from
1277  sll_int64 :: rem ! remaining elements yet to be exchanged
1278  sll_int32 :: nel ! number of elements to be exchanged at a single MPI call
1279  integer, dimension(:,:) :: r_rx(nd,2) ! index ranges for the direct copy operations
1280  integer, dimension(:,:) :: r_tx(nd,2)
1281 #ifdef USE_HALO_REAL32
1282  sll_int32, parameter :: nxc_max = 128000000
1283  integer, parameter :: mpi_precision = mpi_real
1284  integer, parameter :: word_size = 4
1285 #else
1286  sll_int32, parameter :: nxc_max = 64000000
1287  integer, parameter :: mpi_precision = mpi_double_precision
1288  integer, parameter :: word_size = 8
1289 #endif
1290  halo_dtype, dimension(:), allocatable, target, save :: sendbuf, recvbuf
1291  integer :: mpi_tag
1292  ! ---
1293  integer :: id, jd
1294  integer :: ierr
1295 
1296 ! integer :: nbytes_in
1297 ! integer :: off_elements_thread, off_bytes_thread, nel_thread, omp_rank, omp_size
1298 ! sll_int32 :: n_packets
1299 
1300  logical, save :: first_call = .true.
1301  logical, save :: sll_use_mpi_sendrecv = .true.
1302  integer :: i,j,k,l,m,n
1303  ! --- DEBUG
1304  integer, save :: dump_buffer_invocation_count = 0
1305  integer, save :: dump_f6d_invocation_count = 0
1306  integer, save :: dump_dd_info_invocation_count = 0
1307  ! --- dimension selection mask
1308  logical, dimension(:) :: dim_mask(nd)
1309 
1310  dim_mask = .true. ! by default, we exchange all 6 dimensions
1311  if (present(dim_mask_in)) dim_mask = dim_mask_in
1312 
1313  mpi_tag = 0
1314 
1315  ! --- query environment variables
1316  if (first_call) then
1317 #ifdef USE_HALO_REAL32
1318  if (topo%rank == 0) then
1319  write(*,*) "sll_m_decomposition::apply_halo_exchange() uses single precision messages"
1320  endif
1321 #endif
1322  sll_use_mpi_sendrecv = sll_f_query_environment("SLL_USE_MPI_SENDRECV", .true.)
1323  if (topo%rank == 0) then
1324  if (sll_use_mpi_sendrecv) then
1325  write(*,*) "sll_m_decomposition::apply_halo_exchange() uses MPI_SENDRECV()"
1326  else
1327  write(*,*) "sll_m_decomposition::apply_halo_exchange() uses MPI_SEND() and MPI_RECV()"
1328  endif
1329  end if
1330  first_call = .false.
1331  endif
1332 
1333  ! --- loop over dimensions and exchange data between neighbors
1334  do id=1,nd
1335  if (.not. dim_mask(id)) cycle
1336 
1337  if (topo%procs(id) == 1) then
1338  ! --- we copy the ghost cells directly, assuming periodic BCs
1339  ! (1) copy to the left
1340  r_tx(:,1) = decomp%local%mn(:)
1341  r_tx(:,2) = decomp%local%mx(:)
1342  r_tx(id,1) = decomp%local%tx_lolo(id)
1343  r_tx(id,2) = decomp%local%tx_lohi(id)
1344  r_rx(:,1) = decomp%local%mn(:)
1345  r_rx(:,2) = decomp%local%mx(:)
1346  r_rx(id,1) = decomp%local%rx_hilo(id)
1347  r_rx(id,2) = decomp%local%rx_hihi(id)
1348 !$omp parallel default(shared) private(i,j,k,l,m,n)
1349 !$omp do OMP_SCHEDULE OMP_COLLAPSE
1350  do n = 0, r_rx(6,2)-r_rx(6,1)
1351  do m = 0, r_rx(5,2)-r_rx(5,1)
1352  do l = 0, r_rx(4,2)-r_rx(4,1)
1353  do k = 0, r_rx(3,2)-r_rx(3,1)
1354  do j = 0, r_rx(2,2)-r_rx(2,1)
1355  do i = 0, r_rx(1,2)-r_rx(1,1)
1356  arr(r_rx(1,1)+i, r_rx(2,1)+j, r_rx(3,1)+k, r_rx(4,1)+l, r_rx(5,1)+m, r_rx(6,1)+n) = &
1357  arr(r_tx(1,1)+i, r_tx(2,1)+j, r_tx(3,1)+k, r_tx(4,1)+l, r_tx(5,1)+m, r_tx(6,1)+n)
1358  enddo
1359  enddo
1360  enddo
1361  enddo
1362  enddo
1363  enddo
1364 !$omp end do
1365 !$omp end parallel
1366  ! (2) copy to the right
1367  r_tx(:,1) = decomp%local%mn(:)
1368  r_tx(:,2) = decomp%local%mx(:)
1369  r_tx(id,1) = decomp%local%tx_hilo(id)
1370  r_tx(id,2) = decomp%local%tx_hihi(id)
1371  r_rx(:,1) = decomp%local%mn(:)
1372  r_rx(:,2) = decomp%local%mx(:)
1373  r_rx(id,1) = decomp%local%rx_lolo(id)
1374  r_rx(id,2) = decomp%local%rx_lohi(id)
1375 !$omp parallel default(shared) private(i,j,k,l,m,n)
1376 !$omp do OMP_SCHEDULE OMP_COLLAPSE
1377  do n = 0, r_rx(6,2)-r_rx(6,1)
1378  do m = 0, r_rx(5,2)-r_rx(5,1)
1379  do l = 0, r_rx(4,2)-r_rx(4,1)
1380  do k = 0, r_rx(3,2)-r_rx(3,1)
1381  do j = 0, r_rx(2,2)-r_rx(2,1)
1382  do i = 0, r_rx(1,2)-r_rx(1,1)
1383  arr(r_rx(1,1)+i, r_rx(2,1)+j, r_rx(3,1)+k, r_rx(4,1)+l, r_rx(5,1)+m, r_rx(6,1)+n) = &
1384  arr(r_tx(1,1)+i, r_tx(2,1)+j, r_tx(3,1)+k, r_tx(4,1)+l, r_tx(5,1)+m, r_tx(6,1)+n)
1385  enddo
1386  enddo
1387  enddo
1388  enddo
1389  enddo
1390  enddo
1391 !$omp end do
1392 !$omp end parallel
1393  else
1394  ! --- calculate the number of items to be exchanged
1395  nxc = int(decomp%local%hw(id), i64)
1396  do jd=1,nd
1397  if (jd == id) then
1398  cycle
1399  else
1400  nxc = nxc * decomp%local%nw(jd)
1401  endif
1402  end do
1403 
1404  if (nxc > bufsize) then
1405  if (allocated(sendbuf)) deallocate(sendbuf)
1406  if (allocated(recvbuf)) deallocate(recvbuf)
1407  allocate(sendbuf(nxc))
1408  allocate(recvbuf(nxc))
1409  bufsize = nxc
1410  end if
1411 
1412  ! --- (1) copy halo cells to the left neighbor
1413  r_tx(:,1) = decomp%local%mn(:)
1414  r_tx(:,2) = decomp%local%mx(:)
1415  r_tx(id,1) = decomp%local%tx_lolo(id)
1416  r_tx(id,2) = decomp%local%tx_lohi(id)
1417  call sll_s_copy_array_to_buffer_6d_real64(arr, decomp%local%lo, decomp%local%hi, sendbuf, r_tx)
1418 
1419  ! --- split MPI communication into pieces smaller than nxc_max
1420  rem = nxc
1421  off = 1
1422  do while (rem > 0)
1423  if (rem > nxc_max) then
1424  nel = nxc_max
1425  else
1426  nel = int(rem, i32)
1427  endif
1428  if (sll_use_mpi_sendrecv) then
1429  call mpi_sendrecv(sendbuf(off), nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag,&
1430  recvbuf(off), nel, mpi_precision, topo%neighbors(2*id), mpi_tag,&
1431  topo%comm, mpi_status_ignore, ierr)
1432  sll_assert(ierr == mpi_success)
1433  else ! sll_use_mpi_sendrecv
1434  ! --- odd processor coordinate numbers send first to the left
1435  if (mod(topo%coords(id), 2) > 0) then
1436  call mpi_send(sendbuf(off), nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag, topo%comm, ierr)
1437  sll_assert(ierr == mpi_success)
1438  call mpi_recv(recvbuf(off), nel, mpi_precision, topo%neighbors(2*id), mpi_tag, topo%comm, mpi_status_ignore, ierr)
1439  sll_assert(ierr == mpi_success)
1440  else
1441  ! --- even processor coordinate numbers receive first from the right
1442  call mpi_recv(recvbuf(off), nel, mpi_precision, topo%neighbors(2*id), mpi_tag, topo%comm, mpi_status_ignore, ierr)
1443  sll_assert(ierr == mpi_success)
1444  call mpi_send(sendbuf(off), nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag, topo%comm, ierr)
1445  sll_assert(ierr == mpi_success)
1446  endif
1447  endif ! sll_use_mpi_sendrecv
1448  off = off + nel
1449  rem = rem - nel
1450  enddo ! while(rem>0)
1451 
1452  r_rx(:,1) = decomp%local%mn(:)
1453  r_rx(:,2) = decomp%local%mx(:)
1454  r_rx(id,1) = decomp%local%rx_hilo(id)
1455  r_rx(id,2) = decomp%local%rx_hihi(id)
1456  call copy_buffer_to_array_6d_real64(recvbuf, arr, decomp%local%lo, decomp%local%hi, r_rx)
1457 
1458 
1459  ! --- (2) copy halo cells to the right neighbor
1460  r_tx(:,1) = decomp%local%mn(:)
1461  r_tx(:,2) = decomp%local%mx(:)
1462  r_tx(id,1) = decomp%local%tx_hilo(id)
1463  r_tx(id,2) = decomp%local%tx_hihi(id)
1464  call sll_s_copy_array_to_buffer_6d_real64(arr, decomp%local%lo, decomp%local%hi, sendbuf, r_tx)
1465  rem = nxc
1466  off = 1
1467  do while (rem > 0)
1468  if (rem > nxc_max) then
1469  nel = nxc_max
1470  else
1471  nel = int(rem, i32)
1472  endif
1473  if (sll_use_mpi_sendrecv) then
1474  call mpi_sendrecv(sendbuf(off), nel, mpi_precision, topo%neighbors(2*id), 1,&
1475  recvbuf(off), nel, mpi_precision, topo%neighbors(2*id-1), 1,&
1476  topo%comm, mpi_status_ignore, ierr)
1477  sll_assert(ierr == mpi_success)
1478  else ! sll_use_mpi_sendrecv
1479  ! --- odd processor coordinate numbers send first to the right
1480  if (mod(topo%coords(id), 2) > 0) then
1481  call mpi_send(sendbuf(off), nel, mpi_precision, topo%neighbors(2*id), &
1482  mpi_tag, topo%comm, ierr)
1483  sll_assert(ierr == mpi_success)
1484  call mpi_recv(recvbuf(off), nel, mpi_precision, topo%neighbors(2*id-1), &
1485  mpi_tag, topo%comm, mpi_status_ignore, ierr)
1486  sll_assert(ierr == mpi_success)
1487  else
1488  ! --- even processor coordinate numbers receive first from the left
1489  call mpi_recv(recvbuf(off), nel, mpi_precision, topo%neighbors(2*id-1), &
1490  mpi_tag, topo%comm, mpi_status_ignore, ierr)
1491  sll_assert(ierr == mpi_success)
1492  call mpi_send(sendbuf(off), nel, mpi_precision, topo%neighbors(2*id), &
1493  mpi_tag, topo%comm, ierr)
1494  sll_assert(ierr == mpi_success)
1495  endif
1496  endif ! sll_use_mpi_sendrecv
1497  off = off + nel
1498  rem = rem - nel
1499  enddo
1500 
1501  r_rx(:,1) = decomp%local%mn(:)
1502  r_rx(:,2) = decomp%local%mx(:)
1503  r_rx(id,1) = decomp%local%rx_lolo(id)
1504  r_rx(id,2) = decomp%local%rx_lohi(id)
1505  call copy_buffer_to_array_6d_real64(recvbuf, arr, decomp%local%lo, decomp%local%hi, r_rx)
1506 
1507  endif ! if branch (topo%nprocs > 1)
1508  enddo ! id (loop over dimensions)
1509 
1511 
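! Sizing note (illustrative numbers): for a local block with nw(jd) = 32 in the five
! untouched dimensions and hw(id) = 2, nxc = 2 * 32**5 = 67,108,864 elements; in the
! real64 case this exceeds nxc_max = 64,000,000, so the exchange above is split into
! two messages of 64,000,000 and 3,108,864 elements.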
1512 
1513 ! call MPI_Sendrecv(sendbuf, nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag,&
1514 ! decomp%local%halo_right%buf, nel, mpi_precision, topo%neighbors(2*id), mpi_tag,&
1515 ! topo%comm, MPI_STATUS_IGNORE, ierr)
1516  subroutine mpi_sendrecv_compressed_6d_real64(sendbuf, recvbuf, nel, rank_send,&
1517  rank_recv, mpi_comm, verbose, mpi_tag)
1518  use iso_c_binding
1519  halo_dtype, pointer :: sendbuf(:)
1520  halo_dtype, pointer :: recvbuf(:,:,:,:,:,:), recv_view_1d(:)
1521  sll_int32 :: nel, rank_send, rank_recv, mpi_comm
1522  logical, intent(in), optional :: verbose
1523  integer, intent(in), optional :: mpi_tag
1524  logical :: verb
1525  integer :: tag
1526 
1527  if (present(verbose)) then
1528  verb = verbose
1529  else
1530  verb = .false.
1531  endif
1532 
1533  if (present(mpi_tag)) then
1534  tag = mpi_tag
1535  else
1536  tag = 0
1537  endif
1538 
1539  call c_f_pointer(c_loc(recvbuf), recv_view_1d, [nel])
1540  call mpi_sendrecv_compressed(sendbuf, recv_view_1d, nel, rank_send, rank_recv, mpi_comm, verb, tag)
1541  nullify(recv_view_1d)
1542  end subroutine mpi_sendrecv_compressed_6d_real64
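! Note (illustrative): c_f_pointer(c_loc(recvbuf), recv_view_1d, [nel]) above creates
! a 1D alias of the contiguous 6D halo buffer, so that e.g. a (2,4,4,4,4,4) buffer can
! be passed on to mpi_sendrecv_compressed() as a flat vector of nel = 2*4**5 = 2048
! elements.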
1543 
1544  subroutine mpi_sendrecv_compressed(sendbuf, recvbuf, nel, rank_send, rank_recv,&
1545  mpi_comm, verbose, mpi_tag)
1546  halo_dtype, pointer :: sendbuf(:)
1547  halo_dtype, pointer :: recvbuf(:)
1548  sll_int32 :: nel, rank_send, rank_recv, mpi_comm
1549  logical, intent(in), optional :: verbose
1550  integer, intent(in), optional :: mpi_tag
1551  sll_int32 :: ierr
1552 #ifdef USE_HALO_REAL32
1553  integer, parameter :: mpi_precision = mpi_real
1554 #else
1555  integer, parameter :: mpi_precision = mpi_double_precision
1556 #endif
1557  logical :: do_compress
1558  type(sll_t_compressed_buffer) :: comp_send, comp_recv ! data structures containing compressed data and offsets
1559  integer :: omp_size
1560  logical :: verb
1561  integer :: tag
1562 
1563  if (present(verbose)) then
1564  verb = verbose
1565  else
1566  verb = .false.
1567  endif
1568 
1569  if (present(mpi_tag)) then
1570  tag = mpi_tag
1571  else
1572  tag = 0
1573  endif
1574 
1575 #ifdef _OPENMP
1576  omp_size = omp_get_max_threads()
1577 #else
1578  omp_size = 1
1579 #endif
1580 
1581 #ifdef USE_HALO_REAL32
1582  do_compress = .false.
1583 #else
1584  if (modulo(nel, zfp_blocksize) /= 0) then
1585  write(*,*) "mpi_sendrecv_compressed() : disabled due to blocksize mismatch"
1586  do_compress = .false.
1587 ! elseif (modulo(nel/zfp_blocksize, omp_size) /= 0)
1588 ! write(*,*) "mpi_sendrecv_compressed() : disabled due to n_slices vs. omp_size mismatch"
1589 ! do_compress = .false.
1590  else
1591  do_compress = .true.
1592  endif
1593 #endif
1594 
1595  if (do_compress) then
1596 #ifndef USE_HALO_REAL32
1597  ! compress halo data into the comp_send object
1598  call deflate_buffer_real64(sendbuf, comp_send, n_doubles=nel)
1599 
1600  ! ! Communicate the index part of the comp_send object, assuming that each MPI rank
1601  ! ! uses the same number of threads (i.e. comp%n_slices). We concatenate the integers
1602  ! ! and arrays into a contiguous buffer, for simplicity. Could be done better using a
1603  ! ! MPI data type, of course.
1604  ! n_idx = concatenate_index_arrays(comp_send, comp_idx_send) ! allocates comp_idx_send internally!
1605  ! allocate(comp_idx_recv(0:n_idx-1))
1606  ! call MPI_Sendrecv(comp_idx_send, n_idx, MPI_INTEGER, rank_send, mpi_tag,&
1607  ! comp_idx_recv, n_idx, MPI_INTEGER, rank_recv, mpi_tag,&
1608  ! mpi_comm, MPI_STATUS_IGNORE, ierr)
1609  ! SLL_ASSERT_ALWAYS(ierr == MPI_SUCCESS)
1610  ! call decatenate_index_arrays(comp_recv, comp_idx_recv)
1611  ! deallocate(comp_idx_send); nullify(comp_idx_send)
1612  ! deallocate(comp_idx_recv); nullify(comp_idx_recv)
1613  !
1614  ! if (verb) then
1615  ! call print_compression_information(comp_send, .false.)
1616  ! endif
1617  !
1618  ! ! communicate the compressed buffer
1619  ! allocate(comp_recv%buffer(0:comp_recv%n_bytes_deflated_total-1))
1620  ! call MPI_Sendrecv(comp_send%buffer, comp_send%n_bytes_deflated_total, MPI_BYTE, rank_send, mpi_tag,&
1621  ! comp_recv%buffer, comp_recv%n_bytes_deflated_total, MPI_BYTE, rank_recv, mpi_tag,&
1622  ! mpi_comm, MPI_STATUS_IGNORE, ierr)
1623  ! SLL_ASSERT_ALWAYS(ierr == MPI_SUCCESS)
1624  call sll_s_mpi_sendrecv_compressed_core(comp_send, comp_recv, rank_send, rank_recv, mpi_comm, verb, tag)
1625 
1626  ! decompress halo data into recvbuf
1627  call inflate_buffer_real64(recvbuf, comp_recv)
1628 
1629  call deallocate_compressed_buffer_obj(comp_send)
1630  call deallocate_compressed_buffer_obj(comp_recv)
1631 #endif
1632  else
1633  call mpi_sendrecv(sendbuf, nel, mpi_precision, rank_send, tag,&
1634  recvbuf, nel, mpi_precision, rank_recv, tag,&
1635  mpi_comm, mpi_status_ignore, ierr)
1636  sll_assert_always(ierr == mpi_success)
1637  endif
1638  end subroutine mpi_sendrecv_compressed
1639 
1640 
1642  subroutine sll_s_mpi_sendrecv_compressed_core(comp_send, comp_recv, rank_send, rank_recv, mpi_comm, verbose, mpi_tag)
1643  type(sll_t_compressed_buffer) :: comp_send, comp_recv ! data structures containing compressed data and offsets
1644  sll_int32 :: rank_send, rank_recv, mpi_comm
1645  logical, intent(in), optional :: verbose
1646  integer, intent(in), optional :: mpi_tag
1647 
1648  integer :: ierr, n_idx
1649  integer, pointer :: comp_idx_send(:), comp_idx_recv(:)
1650  logical :: verb
1651  integer :: tag
1652 
1653  if (present(verbose)) then
1654  verb = verbose
1655  else
1656  verb = .false.
1657  endif
1658 
1659  if (present(mpi_tag)) then
1660  tag = mpi_tag
1661  else
1662  tag = 0
1663  endif
1664 
1665  ! Communicate the index part of the comp_send object, assuming that each MPI rank
1666  ! uses the same number of threads (i.e. comp%n_slices). We concatenate the integers
1667  ! and arrays into a contiguous buffer, for simplicity. Could be done better using an
1668  ! MPI data type, of course.
1669  n_idx = concatenate_index_arrays(comp_send, comp_idx_send) ! allocates comp_idx_send internally!
1670  allocate(comp_idx_recv(0:n_idx-1))
1671  call mpi_sendrecv(comp_idx_send, n_idx, mpi_integer, rank_send, mpi_tag,&
1672  comp_idx_recv, n_idx, mpi_integer, rank_recv, mpi_tag,&
1673  mpi_comm, mpi_status_ignore, ierr)
1674  sll_assert_always(ierr == mpi_success)
1675  call decatenate_index_arrays(comp_recv, comp_idx_recv)
1676  deallocate(comp_idx_send); nullify(comp_idx_send)
1677  deallocate(comp_idx_recv); nullify(comp_idx_recv)
1678 
1679  if (verb) then
1680  call print_compression_information(comp_send, .false.)
1681  endif
1682 
1683  ! communicate the compressed buffer
1684  allocate(comp_recv%buffer(0:comp_recv%n_bytes_deflated_total-1))
1685  call mpi_sendrecv(comp_send%buffer, comp_send%n_bytes_deflated_total, mpi_byte, rank_send, tag,&
1686  comp_recv%buffer, comp_recv%n_bytes_deflated_total, mpi_byte, rank_recv, tag,&
1687  mpi_comm, mpi_status_ignore, ierr)
1688  sll_assert_always(ierr == mpi_success)
1689  end subroutine sll_s_mpi_sendrecv_compressed_core
1690 
1691 
1692  ! --- halo exchange routine compatible with the "slim" redesign using dynamic buffers ---
1693  subroutine sll_f_apply_halo_exchange_slim_6d_real64(topo, decomp, arr, id, hw_left, hw_right)
1694  integer, parameter :: nd = 6 ! we handle 6 dimensions
1695  ! --- arguments
1696  type(sll_t_cartesian_topology_6d), intent(in) :: topo
1697  type(sll_t_decomposition_slim_6d), intent(inout) :: decomp
1698  sll_real64, dimension(:,:,:,:,:,:), intent(inout) :: arr(decomp%local%mn(1):decomp%local%mx(1), &
1699  decomp%local%mn(2):decomp%local%mx(2), &
1700  decomp%local%mn(3):decomp%local%mx(3), &
1701  decomp%local%mn(4):decomp%local%mx(4), &
1702  decomp%local%mn(5):decomp%local%mx(5), &
1703  decomp%local%mn(6):decomp%local%mx(6))
1704  sll_int32, intent(in) :: id, hw_left, hw_right
1705  sll_int32 :: halo_block(6,2)
1706  halo_block(:,1) = decomp%local%mn
1707  halo_block(:,2) = decomp%local%mx
1708  call sll_s_apply_halo_exchange_slim_6d_real64( topo, decomp, arr, id, hw_left, hw_right, halo_block )
1710 
1711 
1712  ! --- halo exchange routine compatible with the "slim" redesign using dynamic buffers ---
1713  subroutine sll_s_apply_halo_exchange_slim_6d_real64(topo, decomp, arr, id, hw_left, hw_right, halo_block)
1714  integer, parameter :: nd = 6 ! we handle 6 dimensions
1715  ! --- arguments
1716  type(sll_t_cartesian_topology_6d), intent(in) :: topo
1717  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomp
1718  sll_real64, intent(inout) :: arr(decomp%local%mn(1):decomp%local%mx(1), &
1719  decomp%local%mn(2):decomp%local%mx(2), &
1720  decomp%local%mn(3):decomp%local%mx(3), &
1721  decomp%local%mn(4):decomp%local%mx(4), &
1722  decomp%local%mn(5):decomp%local%mx(5), &
1723  decomp%local%mn(6):decomp%local%mx(6))
1724  sll_int32, intent(in) :: id, hw_left, hw_right
1725  sll_int32, intent(in) :: halo_block(6,2)
1726  integer :: jd, i, j, k, l, m, n
1727  integer :: ierr
1728  logical, save :: first_call = .true.
1729 
1730  ! --- MPI communication-related variables and buffers
1731  sll_int64, save :: bufsize = 0
1732  sll_int64 :: nxc ! total number of elements to be exchanged
1733  sll_int32 :: nel ! number of elements to be exchanged at a single MPI call
1734  integer, dimension(:,:) :: r_rx(nd,2) ! index ranges for copy operations
1735  integer, dimension(:,:) :: r_tx(nd,2)
1736 #ifdef USE_HALO_REAL32
1737  sll_int32, parameter :: nxc_max = 2147483647
1738  integer, parameter :: mpi_precision = mpi_real
1739  integer, parameter :: word_size = 4
1740 #else
1741  sll_int32, parameter :: nxc_max = 2147483647
1742  integer, parameter :: mpi_precision = mpi_double_precision
1743  integer, parameter :: word_size = 8
1744 #endif
1745  halo_dtype, pointer, save :: sendbuf(:)
1746  halo_dtype, pointer :: recvbuf(:,:,:,:,:,:)
1747  integer :: mpi_tag
1748  logical, save :: use_compression
1749  logical, save :: compression_verbose
1750  integer, save :: prec
1751  ! ------
1752  logical, parameter :: sendbuf_dump = .false.
1753  integer, save :: dump_counter = 0
1754  character(len=32) :: dump_filename
1755 
1756  mpi_tag = 0
1757 
1758  if (first_call) then
1759  first_call = .false.
1760  compression_verbose = .false.
1761 #ifdef USE_HALO_REAL32
1762  if (topo%rank == 0) then
1763  write(*,*) "sll_m_decomposition::apply_halo_exchange() uses single precision messages"
1764  endif
1765  use_compression = .false.
1766 #else
1767 #ifdef USE_ZFP
1768  use_compression = sll_f_query_environment("SLL_USE_COMPRESSION", .false.)
1769  if (use_compression) then
1770  prec = sll_f_query_environment("SLL_ZFP_PRECISION", 32)
1771  call set_compression_precision(prec)
1772  if (topo%rank == 0) then
1773  write(*,*) "sll_m_decomposition::apply_halo_exchange() uses message compression"
1774  compression_verbose = sll_f_query_environment("SLL_COMPRESSION_VERBOSE", .false.)
1775  endif
1776  endif
1777 #else
1778  use_compression = .false.
1779 #endif
1780 #endif
1781  nullify(sendbuf)
1782  endif
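  ! Environment switches read in the first_call block above (via sll_f_query_environment):
  ! SLL_USE_COMPRESSION toggles ZFP message compression (only available when USE_ZFP is
  ! defined and USE_HALO_REAL32 is not), SLL_ZFP_PRECISION selects the ZFP precision
  ! (default 32), and SLL_COMPRESSION_VERBOSE enables compression statistics output.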
1783 
1784 
1785  ! --- copy halo cells to the left neighbor / into the right halo buffer
1786  !
1787  ! A one-dimensional example:
1788  !
1789  ! |---|----------|**| # left neighbor
1790  !
1791  ! |---|**--------|$$| # current process
1792  !
1793  ! |---|$$--------|%%| # right neighbor
1794  !
1795 
1796  decomp%local%id = id
1797  decomp%local%halo_right%mn(:) = halo_block(:,1)
1798  decomp%local%halo_right%mx(:) = halo_block(:,2)
1799  decomp%local%halo_right%nw(:) = decomp%local%halo_right%mx(:)-decomp%local%halo_right%mn(:)+1
1800 
1801  decomp%local%halo_right%mn(id) = decomp%local%mx(id) + 1
1802  decomp%local%halo_right%mx(id) = decomp%local%mx(id) + hw_right
1803  decomp%local%halo_right%nw(id) = hw_right
1804 
1805  if (hw_right > 0) then
1806  ! --- change the decomposition object's state as required by the requested halo width
1807 #ifdef USE_FMEMPOOL
1808  if (associated(decomp%local%halo_right%buf)) then
1809  call mp_release(decomp%local%halo_right%buf)
1810  endif
1811  call mp_acquire(decomp%local%halo_right%buf, decomp%local%halo_right%mn, decomp%local%halo_right%mx)
1812 #else
1813  if (allocated(decomp%local%halo_right%buf)) &
1814  deallocate(decomp%local%halo_right%buf)
1815  allocate(decomp%local%halo_right%buf( &
1816  decomp%local%halo_right%mn(1):decomp%local%halo_right%mx(1), &
1817  decomp%local%halo_right%mn(2):decomp%local%halo_right%mx(2), &
1818  decomp%local%halo_right%mn(3):decomp%local%halo_right%mx(3), &
1819  decomp%local%halo_right%mn(4):decomp%local%halo_right%mx(4), &
1820  decomp%local%halo_right%mn(5):decomp%local%halo_right%mx(5), &
1821  decomp%local%halo_right%mn(6):decomp%local%halo_right%mx(6) ))
1822 #endif
1823 
1824 ! call mp_statistics()
1825  recvbuf => decomp%local%halo_right%buf
1826 ! write(*,*) "###", lbound(recvbuf), ubound(recvbuf), size(recvbuf)
1827 
1828  ! --- calculate rx and tx index ranges
1829  ! index ranges on the computation array to be sent
1830  r_tx(:,1) = decomp%local%halo_right%mn(:)!decomp%local%mn(:)
1831  r_tx(:,2) = decomp%local%halo_right%mx(:)!decomp%local%mx(:)
1832  r_tx(id,1) = decomp%local%mn(id)
1833  r_tx(id,2) = decomp%local%mn(id) + hw_right - 1
1834  ! index ranges on the buffer are simply its extents
1835  r_rx(:,1) = decomp%local%halo_right%mn(:)
1836  r_rx(:,2) = decomp%local%halo_right%mx(:)
1837 
1838  if (topo%procs(id) == 1) then
1839  ! --- we copy the ghost cells directly, assuming periodic BCs for the moment
1840 !$omp parallel default(shared) private(i,j,k,l,m,n)
1841 !$omp do OMP_SCHEDULE OMP_COLLAPSE
1842  do n = 0, r_rx(6,2)-r_rx(6,1)
1843  do m = 0, r_rx(5,2)-r_rx(5,1)
1844  do l = 0, r_rx(4,2)-r_rx(4,1)
1845  do k = 0, r_rx(3,2)-r_rx(3,1)
1846  do j = 0, r_rx(2,2)-r_rx(2,1)
1847  do i = 0, r_rx(1,2)-r_rx(1,1)
1848  recvbuf(r_rx(1,1)+i, r_rx(2,1)+j, r_rx(3,1)+k, &
1849  r_rx(4,1)+l, r_rx(5,1)+m, r_rx(6,1)+n) = &
1850  arr(r_tx(1,1)+i, r_tx(2,1)+j, r_tx(3,1)+k, &
1851  r_tx(4,1)+l, r_tx(5,1)+m, r_tx(6,1)+n)
1852  enddo
1853  enddo
1854  enddo
1855  enddo
1856  enddo
1857  enddo
1858 !$omp end do
1859 !$omp end parallel
1860  else
1861  ! calculate the total number of elements to be exchanged
1862  nxc = int(hw_right, i64)
1863  do jd=1,nd
1864  if (jd == id) then
1865  cycle
1866  else
1867  nxc = nxc * decomp%local%halo_right%nw(jd)
1868  endif
1869  end do
1870  if (nxc > bufsize) then
1871  if (associated(sendbuf)) deallocate(sendbuf)
1872  allocate(sendbuf(nxc))
1873  bufsize = nxc
1874  end if
1875 
1876  call sll_s_copy_array_to_buffer_6d_real64(arr, decomp%local%mn, decomp%local%mx, sendbuf, r_tx)
1877 
1878  sll_assert_always(nxc <= nxc_max) ! check if message size is within the allowed limit
1879  nel = nxc ! 64 bit -> 32 bit
1880 
1881  if (use_compression) then
1882  if (sendbuf_dump) then
1883  write(dump_filename,'(a,i1.1,a,i4.4,a)') "L", id, "_", dump_counter, ".txt"
1884  write(*,*) trim(dump_filename)
1885  open(unit=88, file=trim(dump_filename), status='replace')
1886  write(88,'(E24.16)') (sendbuf(i), i=1,nel)
1887  close(88)
1888  endif
1889  call mpi_sendrecv_compressed_6d_real64(sendbuf, recvbuf, nel, &
1890  topo%neighbors(2*id-1), topo%neighbors(2*id), topo%comm, &
1891  compression_verbose)
1892  else
1893  call mpi_sendrecv(sendbuf, nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag,&
1894  recvbuf, nel, mpi_precision, topo%neighbors(2*id), mpi_tag,&
1895  topo%comm, mpi_status_ignore, ierr)
1896  sll_assert_always(ierr == mpi_success)
1897  endif
1898  endif ! if branch (topo%procs(id) > 1)
1899  nullify(recvbuf)
1900  else
1901  ! --- nothing to do
1902  continue
1903  endif ! (hw_right > 0)
1904 
1905 
1906  ! --- copy halo cells to the right neighbor / into the left halo buffer
1907  !
1908  ! A one-dimensional example:
1909  !
1910  ! |%%%|-------***|--| # left neighbor
1911  !
1912  ! |***|-------$$$|--| # current process
1913  !
1914  ! |$$$|-------%%%|--| # right neighbor
1915  !
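  ! Legend: '$$$' marks the cells at the high end of dimension id that are sent to
  ! the right neighbor, '***' marks the left halo buffer that is filled with the
  ! corresponding cells received from the left neighbor.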
1916  decomp%local%id = id
1917  decomp%local%halo_left%mn(:) = halo_block(:,1)
1918  decomp%local%halo_left%mx(:) = halo_block(:,2)
1919  decomp%local%halo_left%nw(:) = decomp%local%halo_left%mx(:)-decomp%local%halo_left%mn(:)+1
1920 
1921  decomp%local%halo_left%mx(id) = decomp%local%mn(id) - 1
1922  decomp%local%halo_left%mn(id) = decomp%local%mn(id) - hw_left
1923  decomp%local%halo_left%nw(id) = hw_left
1924 
1925  if (hw_left > 0) then
1926  ! --- change the decomposition object's state as required by the requested halo width
1927 #ifdef USE_FMEMPOOL
1928  if (associated(decomp%local%halo_left%buf)) then
1929  call mp_release(decomp%local%halo_left%buf)
1930  endif
1931  call mp_acquire(decomp%local%halo_left%buf, decomp%local%halo_left%mn, decomp%local%halo_left%mx)
1932 #else
1933  if (allocated(decomp%local%halo_left%buf)) &
1934  deallocate(decomp%local%halo_left%buf)
1935  allocate(decomp%local%halo_left%buf( &
1936  decomp%local%halo_left%mn(1):decomp%local%halo_left%mx(1), &
1937  decomp%local%halo_left%mn(2):decomp%local%halo_left%mx(2), &
1938  decomp%local%halo_left%mn(3):decomp%local%halo_left%mx(3), &
1939  decomp%local%halo_left%mn(4):decomp%local%halo_left%mx(4), &
1940  decomp%local%halo_left%mn(5):decomp%local%halo_left%mx(5), &
1941  decomp%local%halo_left%mn(6):decomp%local%halo_left%mx(6) ))
1942 #endif
1943 
1944  recvbuf => decomp%local%halo_left%buf
1945 
1946  ! --- calculate rx and tx index ranges
1947  ! index ranges on the computation array to be sent
1948  r_tx(:,1) = decomp%local%halo_left%mn(:)!decomp%local%mn(:)
1949  r_tx(:,2) = decomp%local%halo_left%mx(:)!decomp%local%mx(:)
1950  r_tx(id,1) = decomp%local%mx(id) - hw_left + 1
1951  r_tx(id,2) = decomp%local%mx(id)
1952  ! index ranges on the buffer are just its extents
1953  r_rx(:,1) = decomp%local%halo_left%mn(:)
1954  r_rx(:,2) = decomp%local%halo_left%mx(:)
1955 
1956  if (topo%procs(id) == 1) then
1957  ! --- we copy the ghost cells directly, assuming periodic BCs for the moment
1958 !$omp parallel default(shared) private(i,j,k,l,m,n)
1959 !$omp do OMP_SCHEDULE OMP_COLLAPSE
1960  do n = 0, r_rx(6,2)-r_rx(6,1)
1961  do m = 0, r_rx(5,2)-r_rx(5,1)
1962  do l = 0, r_rx(4,2)-r_rx(4,1)
1963  do k = 0, r_rx(3,2)-r_rx(3,1)
1964  do j = 0, r_rx(2,2)-r_rx(2,1)
1965  do i = 0, r_rx(1,2)-r_rx(1,1)
1966  recvbuf(r_rx(1,1)+i, r_rx(2,1)+j, r_rx(3,1)+k, &
1967  r_rx(4,1)+l, r_rx(5,1)+m, r_rx(6,1)+n) = &
1968  arr(r_tx(1,1)+i, r_tx(2,1)+j, r_tx(3,1)+k, &
1969  r_tx(4,1)+l, r_tx(5,1)+m, r_tx(6,1)+n)
1970  enddo
1971  enddo
1972  enddo
1973  enddo
1974  enddo
1975  enddo
1976 !$omp end do
1977 !$omp end parallel
1978  else
1979  ! calculate the total number of elements to be exchanged
1980  nxc = int(hw_left, i64)
1981  do jd=1,nd
1982  if (jd == id) then
1983  cycle
1984  else
1985  nxc = nxc * decomp%local%halo_left%nw(jd)
1986  endif
1987  end do
1988  if (nxc > bufsize) then
1989  if (associated(sendbuf)) deallocate(sendbuf)
1990  allocate(sendbuf(nxc))
1991  bufsize = nxc
1992  end if
1993 
1994  call sll_s_copy_array_to_buffer_6d_real64(arr, decomp%local%mn, decomp%local%mx, sendbuf, r_tx)
1995 
1996  sll_assert_always(nxc <= nxc_max)
1997  nel = nxc
1998 
1999  if (use_compression) then
2000  if (sendbuf_dump) then
2001  write(dump_filename,'(a,i1.1,a,i4.4,a)') "R", id, "_", dump_counter, ".txt"
2002  write(*,*) trim(dump_filename)
2003  open(unit=88, file=trim(dump_filename), status='replace')
2004  write(88,'(E24.16)') (sendbuf(i), i=1,nel)
2005  close(88)
2006  endif
2007  call mpi_sendrecv_compressed_6d_real64(sendbuf, recvbuf, nel, &
2008  topo%neighbors(2*id), topo%neighbors(2*id-1), topo%comm, &
2009  compression_verbose)
2010  else
2011  call mpi_sendrecv(sendbuf, nel, mpi_precision, topo%neighbors(2*id), mpi_tag,&
2012  recvbuf, nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag,&
2013  topo%comm, mpi_status_ignore, ierr)
2014  sll_assert_always(ierr == mpi_success)
2015  endif
2016  endif ! if branch (topo%procs(id) > 1)
2017  nullify(recvbuf)
2018  else
2019  ! --- nothing to do
2020  continue
2021  endif ! (hw_left > 0)
2022 
2023  dump_counter = dump_counter + 1
2024 
2025 #ifdef USE_FMEMPOOL
2026 ! call mp_statistics()
2027 #endif
2028  end subroutine sll_s_apply_halo_exchange_slim_6d_real64
2029 
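  ! Usage sketch for the slim 6d halo exchange: exchange halos of width 2 on both
  ! sides of dimension 1; topo6d, decomp6d and f6d are placeholder names for
  ! objects created with the constructors of this module.
  !
  !   call sll_f_apply_halo_exchange_slim_6d_real64(topo6d, decomp6d, f6d, 1, 2, 2)
  !   ! decomp6d%local%halo_left%buf and decomp6d%local%halo_right%buf now hold
  !   ! the received ghost cells for dimension 1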
2030 
2031 
2032 ! --- 3D HALO EXCHANGE BELOW ---
2033 
2034 
2035 
2036  ! --- halo exchange routine compatible with the "slim" redesign using dynamic buffers ---
2037  subroutine sll_f_apply_halo_exchange_slim_3d_real64(topo, decomp, arr, id, hw_left, hw_right)
2038  integer, parameter :: nd = 3 ! we handle 3 dimensions
2039  type(sll_t_cartesian_topology_3d), intent(in) :: topo
2040  type(sll_t_decomposition_slim_3d), intent(inout) :: decomp
2041  sll_real64, dimension(:,:,:), intent(inout) :: arr(decomp%local%mn(1):decomp%local%mx(1), &
2042  decomp%local%mn(2):decomp%local%mx(2), &
2043  decomp%local%mn(3):decomp%local%mx(3))
2044  sll_int32, intent(in) :: id, hw_left, hw_right
2045  sll_int32 :: halo_block(3,2)
2046  halo_block(:,1) = decomp%local%mn
2047  halo_block(:,2) = decomp%local%mx
2048  call sll_s_apply_halo_exchange_slim_3d_real64( topo, decomp, arr, id, hw_left, hw_right, halo_block )
2049  end subroutine sll_f_apply_halo_exchange_slim_3d_real64
2050 
2051 
2052  ! --- halo exchange routine compatible with the "slim" redesign using dynamic buffers ---
2053  subroutine sll_s_apply_halo_exchange_slim_3d_real64(topo, decomp, arr, id, hw_left, hw_right, halo_block)
2054  integer, parameter :: nd = 3 ! we handle 3 dimensions
2055  type(sll_t_cartesian_topology_3d), intent(in) :: topo
2056  type(sll_t_decomposition_slim_3d), target, intent(inout) :: decomp
2057  sll_real64, intent(inout) :: arr(decomp%local%mn(1):decomp%local%mx(1), &
2058  decomp%local%mn(2):decomp%local%mx(2), &
2059  decomp%local%mn(3):decomp%local%mx(3))
2060  sll_int32, intent(in) :: id, hw_left, hw_right
2061  sll_int32, intent(in) :: halo_block(3,2)
2062  integer :: jd, i, j, k
2063  integer :: ierr
2064  logical, save :: first_call = .true.
2065 
2066  ! --- MPI communication-related variables and buffers
2067  sll_int64, save :: bufsize = 0
2068  sll_int64 :: nxc ! total number of elements to be exchanged
2069  sll_int32 :: nel ! number of elements to be exchanged at a single MPI call
2070  integer, dimension(:,:) :: r_rx(nd,2) ! index ranges for copy operations
2071  integer, dimension(:,:) :: r_tx(nd,2)
2072 #ifdef USE_HALO_REAL32
2073  sll_int32, parameter :: nxc_max = 2147483647
2074  integer, parameter :: mpi_precision = mpi_real
2075  integer, parameter :: word_size = 4
2076 #else
2077  sll_int32, parameter :: nxc_max = 2147483647
2078  integer, parameter :: mpi_precision = mpi_double_precision
2079  integer, parameter :: word_size = 8
2080 #endif
2081 ! HALO_DTYPE, dimension(:), allocatable, target, save :: sendbuf
2082  halo_dtype, pointer, save :: sendbuf(:)
2083  halo_dtype, pointer :: recvbuf(:,:,:)
2084  integer :: mpi_tag
2085 
2086  ! ------
2087 
2088 
2089  mpi_tag = 0
2090 
2091  if (first_call) then
2092 #ifdef USE_HALO_REAL32
2093  if (topo%rank == 0) then
2094  write(*,*) "sll_m_decomposition::sll_s_apply_halo_exchange_slim_3d_real64() uses single precision messages"
2095  endif
2096 #endif
2097  first_call = .false.
2098  nullify(sendbuf)
2099  endif
2100 
2101 
2102  decomp%local%id = id
2103  decomp%local%halo_right%mn(:) = halo_block(:,1)
2104  decomp%local%halo_right%mx(:) = halo_block(:,2)
2105  decomp%local%halo_right%nw(:) = decomp%local%halo_right%mx(:)-decomp%local%halo_right%mn(:)+1
2106 
2107  decomp%local%halo_right%mn(id) = decomp%local%mx(id) + 1
2108  decomp%local%halo_right%mx(id) = decomp%local%mx(id) + hw_right
2109  decomp%local%halo_right%nw(id) = hw_right
2110 
2111  if (hw_right > 0) then
2112  ! --- change the decomposition object's state as required by the requested halo_width
2113  if (allocated(decomp%local%halo_right%buf)) &
2114  deallocate(decomp%local%halo_right%buf)
2115 
2116  allocate(decomp%local%halo_right%buf( &
2117  decomp%local%halo_right%mn(1):decomp%local%halo_right%mx(1), &
2118  decomp%local%halo_right%mn(2):decomp%local%halo_right%mx(2), &
2119  decomp%local%halo_right%mn(3):decomp%local%halo_right%mx(3)))
2120 
2121  recvbuf => decomp%local%halo_right%buf
2122 
2123  ! --- calculate rx and tx index ranges
2124  ! index ranges on the computation array to be sent
2125  r_tx(:,1) = decomp%local%halo_right%mn(:)!decomp%local%mn(:)
2126  r_tx(:,2) = decomp%local%halo_right%mx(:)!decomp%local%mx(:)
2127  r_tx(id,1) = decomp%local%mn(id)
2128  r_tx(id,2) = decomp%local%mn(id) + hw_right - 1
2129  ! index ranges on the buffer are simply its extents
2130  r_rx(:,1) = decomp%local%halo_right%mn(:)
2131  r_rx(:,2) = decomp%local%halo_right%mx(:)
2132 
2133  if (topo%procs(id) == 1) then
2134  ! --- we copy the ghost cells directly, assuming periodic BCs for the moment
2135 !$omp parallel default(shared) private(i,j,k)
2136 !$omp do OMP_SCHEDULE OMP_COLLAPSE
2137  do k = 0, r_rx(3,2)-r_rx(3,1)
2138  do j = 0, r_rx(2,2)-r_rx(2,1)
2139  do i = 0, r_rx(1,2)-r_rx(1,1)
2140  recvbuf(r_rx(1,1)+i, r_rx(2,1)+j, r_rx(3,1)+k) = &
2141  arr(r_tx(1,1)+i, r_tx(2,1)+j, r_tx(3,1)+k)
2142  enddo
2143  enddo
2144  enddo
2145 !$omp end do
2146 !$omp end parallel
2147  else
2148  ! calculate the total number of elements to be exchanged
2149  nxc = int(hw_right, i64)
2150  do jd=1,nd
2151  if (jd == id) then
2152  cycle
2153  else
2154  nxc = nxc * decomp%local%halo_right%nw(jd)
2155  endif
2156  end do
2157  if (nxc > bufsize) then
2158  if (associated(sendbuf)) deallocate(sendbuf)
2159  allocate(sendbuf(nxc))
2160  bufsize = nxc
2161  end if
2162 
2163  call sll_s_copy_array_to_buffer_3d_real64(arr, decomp%local%mn, decomp%local%mx, sendbuf, r_tx)
2164 
2165  sll_assert_always(nxc <= nxc_max) ! check if message size is within the allowed limit
2166  nel = nxc ! 64 bit -> 32 bit
2167  call mpi_sendrecv(sendbuf, nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag,&
2168  decomp%local%halo_right%buf, nel, mpi_precision, topo%neighbors(2*id), mpi_tag,&
2169  topo%comm, mpi_status_ignore, ierr)
2170  sll_assert_always(ierr == mpi_success)
2171  endif ! if branch (topo%procs(id) > 1)
2172  nullify(recvbuf)
2173  else
2174  ! --- nothing to do
2175  continue
2176  endif ! (hw_right > 0)
2177 
2178 
2179  ! --- copy halo cells to the right neighbor / into the left halo buffer
2180 
2181  decomp%local%id = id
2182  decomp%local%halo_left%mn(:) = halo_block(:,1)
2183  decomp%local%halo_left%mx(:) = halo_block(:,2)
2184  decomp%local%halo_left%nw(:) = decomp%local%halo_left%mx(:)-decomp%local%halo_left%mn(:)+1
2185 
2186  decomp%local%halo_left%mx(id) = decomp%local%mn(id) - 1
2187  decomp%local%halo_left%mn(id) = decomp%local%mn(id) - hw_left
2188  decomp%local%halo_left%nw(id) = hw_left
2189 
2190  if (hw_left > 0) then
2191  ! --- change the decomposition object's state as required by the requested halo width
2192  if (allocated(decomp%local%halo_left%buf)) &
2193  deallocate(decomp%local%halo_left%buf)
2194  allocate(decomp%local%halo_left%buf( &
2195  decomp%local%halo_left%mn(1):decomp%local%halo_left%mx(1), &
2196  decomp%local%halo_left%mn(2):decomp%local%halo_left%mx(2), &
2197  decomp%local%halo_left%mn(3):decomp%local%halo_left%mx(3)))
2198 
2199  recvbuf => decomp%local%halo_left%buf
2200 
2201  ! --- calculate rx and tx index ranges
2202  ! index ranges on the computation array to be sent
2203  r_tx(:,1) = decomp%local%halo_left%mn(:)!decomp%local%mn(:)
2204  r_tx(:,2) = decomp%local%halo_left%mx(:)!decomp%local%mx(:)
2205  r_tx(id,1) = decomp%local%mx(id) - hw_left + 1
2206  r_tx(id,2) = decomp%local%mx(id)
2207  ! index ranges on the buffer are just its extents
2208  r_rx(:,1) = decomp%local%halo_left%mn(:)
2209  r_rx(:,2) = decomp%local%halo_left%mx(:)
2210 
2211  if (topo%procs(id) == 1) then
2212  ! --- we copy the ghost cells directly, assuming periodic BCs for the moment
2213 !$omp parallel default(shared) private(i,j,k)
2214 !$omp do OMP_SCHEDULE OMP_COLLAPSE
2215  do k = 0, r_rx(3,2)-r_rx(3,1)
2216  do j = 0, r_rx(2,2)-r_rx(2,1)
2217  do i = 0, r_rx(1,2)-r_rx(1,1)
2218  recvbuf(r_rx(1,1)+i, r_rx(2,1)+j, r_rx(3,1)+k) = &
2219  arr(r_tx(1,1)+i, r_tx(2,1)+j, r_tx(3,1)+k)
2220  enddo
2221  enddo
2222  enddo
2223 !$omp end do
2224 !$omp end parallel
2225  else
2226  ! calculate the total number of elements to be exchanged
2227  nxc = int(hw_left, i64)
2228  do jd=1,nd
2229  if (jd == id) then
2230  cycle
2231  else
2232  nxc = nxc * decomp%local%halo_left%nw(jd)
2233  endif
2234  end do
2235  if (nxc > bufsize) then
2236  if (associated(sendbuf)) deallocate(sendbuf)
2237  allocate(sendbuf(nxc))
2238  bufsize = nxc
2239  end if
2240 
2241  call sll_s_copy_array_to_buffer_3d_real64(arr, decomp%local%mn, decomp%local%mx, sendbuf, r_tx)
2242 
2243  sll_assert_always(nxc <= nxc_max)
2244  nel = nxc
2245  call mpi_sendrecv(sendbuf, nel, mpi_precision, topo%neighbors(2*id), mpi_tag,&
2246  decomp%local%halo_left%buf, nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag,&
2247  topo%comm, mpi_status_ignore, ierr)
2248  sll_assert_always(ierr == mpi_success)
2249  endif ! if branch (topo%procs(id) > 1)
2250  nullify(recvbuf)
2251  else
2252  ! --- nothing to do
2253  continue
2254  endif ! (hw_left > 0)
2255  end subroutine sll_s_apply_halo_exchange_slim_3d_real64
2256 
2257 
2258  subroutine sll_s_apply_bc_exchange_slim_6d_real64(topo, decomp, id)
2259  type(sll_t_cartesian_topology_6d), intent(in) :: topo
2260  type(sll_t_decomposition_slim_6d), intent(inout) :: decomp
2261  sll_int32, intent(in) :: id
2262 
2263  integer :: ierr
2264  sll_int32 :: nel ! number of elements to be exchanged at a single MPI call
2265 #ifdef USE_HALO_REAL32
2266  integer, parameter :: mpi_precision = mpi_real
2267 #else
2268  integer, parameter :: mpi_precision = mpi_double_precision
2269 #endif
2270  integer, parameter :: mpi_tag=1024 ! large tag number to enable overlap with halo exchange
2271 
2272  if (allocated(decomp%local%bc_left_send) .and. allocated(decomp%local%bc_right)) then
2273  nel = size(decomp%local%bc_left_send)
2274  call mpi_sendrecv(decomp%local%bc_left_send, nel, mpi_precision, topo%neighbors(2*id), mpi_tag,&
2275  decomp%local%bc_right, nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag,&
2276  topo%comm, mpi_status_ignore, ierr)
2277  sll_assert_always(ierr == mpi_success)
2278  endif
2279 
2280  if (allocated(decomp%local%bc_right_send) .and. allocated(decomp%local%bc_left)) then
2281  nel = size(decomp%local%bc_right_send)
2282  call mpi_sendrecv(decomp%local%bc_right_send, nel, mpi_precision, topo%neighbors(2*id-1), mpi_tag,&
2283  decomp%local%bc_left, nel, mpi_precision, topo%neighbors(2*id), mpi_tag,&
2284  topo%comm, mpi_status_ignore, ierr)
2285  sll_assert_always(ierr == mpi_success)
2286  endif
2287  end subroutine sll_s_apply_bc_exchange_slim_6d_real64
2288 
2289 
2290  function sll_f_select_dim(id)
2291  logical, dimension(:) :: sll_f_select_dim(6)
2292  sll_int32, optional :: id
2293 
2294  if (present(id)) then
2295  sll_f_select_dim = .false.
2296  sll_f_select_dim(id) = .true.
2297  else
2298  sll_f_select_dim = .true.
2299  endif
2300  end function sll_f_select_dim
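  ! Usage sketch: sll_f_select_dim(3) returns [.false., .false., .true., .false.,
  ! .false., .false.], while sll_f_select_dim() with no argument returns a mask
  ! that selects all six dimensions.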
2301 
2302 
2303  subroutine sll_s_allocate_bc_buffers_6d(decomp, id)
2304  type(sll_t_decomposition_slim_6d), intent(inout) :: decomp
2305  sll_int32, intent(in) :: id ! dimension to be skipped
2306 
2307  sll_int32 :: idx_mn(5), idx_mx(5)
2308  sll_int32 :: i, j
2309 
2310  j = 1
2311  do i=1, 6
2312  if (i == id) cycle
2313  idx_mn(j) = decomp%local%mn(i)
2314  idx_mx(j) = decomp%local%mx(i)
2315  j = j + 1
2316  enddo
2317 
2318  call sll_s_allocate_bc_buffers_6d_part(decomp, id, idx_mn, idx_mx)
2319  end subroutine sll_s_allocate_bc_buffers_6d
2320 
2321  ! Helper routine to allocate boundary condition buffers
2322  subroutine sll_s_allocate_bc_buffers_6d_part(decomp, id, idx_mn, idx_mx)
2323  type(sll_t_decomposition_slim_6d), intent(inout) :: decomp
2324  sll_int32, intent(in) :: id ! dimension to be skipped
2325 
2326  sll_int32, intent(in) :: idx_mn(5), idx_mx(5)
2327 !!$ sll_int32 :: i, j
2328 !!$
2329 !!$ j = 1
2330 !!$ do i=1, 6
2331 !!$ if (i == id) cycle
2332 !!$ idx_mn(j) = decomp%local%mn(i)
2333 !!$ idx_mx(j) = decomp%local%mx(i)
2334 !!$ j = j + 1
2335 !!$ enddo
2336 
2337  call sll_s_deallocate_bc_buffers(decomp)
2338 
2339  allocate(decomp%local%bc_right_send(idx_mn(1):idx_mx(1), &
2340  idx_mn(2):idx_mx(2), &
2341  idx_mn(3):idx_mx(3), &
2342  idx_mn(4):idx_mx(4), &
2343  idx_mn(5):idx_mx(5)))
2344  allocate(decomp%local%bc_left_send( idx_mn(1):idx_mx(1), &
2345  idx_mn(2):idx_mx(2), &
2346  idx_mn(3):idx_mx(3), &
2347  idx_mn(4):idx_mx(4), &
2348  idx_mn(5):idx_mx(5)))
2349  allocate(decomp%local%bc_right( idx_mn(1):idx_mx(1), &
2350  idx_mn(2):idx_mx(2), &
2351  idx_mn(3):idx_mx(3), &
2352  idx_mn(4):idx_mx(4), &
2353  idx_mn(5):idx_mx(5)))
2354  allocate(decomp%local%bc_left( idx_mn(1):idx_mx(1), &
2355  idx_mn(2):idx_mx(2), &
2356  idx_mn(3):idx_mx(3), &
2357  idx_mn(4):idx_mx(4), &
2358  idx_mn(5):idx_mx(5)))
2359  end subroutine sll_s_allocate_bc_buffers_6d_part
2360 
2361 
2362  ! safety/helper routine to deallocate boundary condition buffers
2363  subroutine sll_s_deallocate_bc_buffers(decomp)
2364  type(sll_t_decomposition_slim_6d), intent(inout) :: decomp
2366  if (allocated(decomp%local%bc_right_send)) deallocate(decomp%local%bc_right_send)
2367  if (allocated(decomp%local%bc_left_send)) deallocate(decomp%local%bc_left_send)
2368  if (allocated(decomp%local%bc_right)) deallocate(decomp%local%bc_right)
2369  if (allocated(decomp%local%bc_left)) deallocate(decomp%local%bc_left)
2370  end subroutine sll_s_deallocate_bc_buffers
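  ! Usage sketch for the boundary-condition buffers: allocate, fill the send
  ! buffers, exchange, use the receive buffers, release. decomp6d and topo6d are
  ! placeholder names; filling the send buffers is application specific.
  !
  !   call sll_s_allocate_bc_buffers_6d(decomp6d, id)
  !   ! ... fill decomp6d%local%bc_left_send and decomp6d%local%bc_right_send ...
  !   call sll_s_apply_bc_exchange_slim_6d_real64(topo6d, decomp6d, id)
  !   ! ... use decomp6d%local%bc_left and decomp6d%local%bc_right ...
  !   call sll_s_deallocate_bc_buffers(decomp6d)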
2371 
2372 
2373  ! DEBUG output routine
2374  subroutine dump_ascii(file, arr)
2375  character(len=*), intent(in) :: file
2376 #ifdef USE_HALO_REAL32
2377  sll_real32, dimension(:), intent(in) :: arr
2378 #else
2379  sll_real64, dimension(:), intent(in) :: arr
2380 #endif
2381  integer, parameter :: fd = 67
2382  integer :: i
2383  open(unit=fd, file=file)
2384  do i=1, size(arr)
2385  write(fd,*) arr(i)
2386  end do
2387  close(fd)
2388  end subroutine
2389 
2390 
2391  ! DEBUG output routine
2392  subroutine dump_binary(filename, array)
2393  character(len=*), intent(in) :: filename
2394  sll_real64, intent(in) :: array(:)
2395  integer, parameter :: iunit = 67
2396  open(iunit, file=trim(filename), status='replace', form='unformatted')
2397  write(iunit) array
2398  close(iunit)
2399  end subroutine dump_binary
2400 
2401 
2402  ! DEBUG output routine
2403  subroutine dump_ascii_6d(file, decomp, arr)
2404  character(len=*), intent(in) :: file
2405  type(sll_t_decomposition_6d), intent(in) :: decomp
2406  sll_real64, dimension(:,:,:,:,:,:), intent(inout) :: arr(decomp%local%lo(1):decomp%local%hi(1), &
2407  decomp%local%lo(2):decomp%local%hi(2), &
2408  decomp%local%lo(3):decomp%local%hi(3), &
2409  decomp%local%lo(4):decomp%local%hi(4), &
2410  decomp%local%lo(5):decomp%local%hi(5), &
2411  decomp%local%lo(6):decomp%local%hi(6))
2412  integer, parameter :: fd = 67
2413  sll_int32 :: i,j,k,l,m,n
2414  open(unit=fd, file=file)
2415  do n=decomp%local%mn(6),decomp%local%mx(6)
2416  do m=decomp%local%mn(5),decomp%local%mx(5)
2417  do l=decomp%local%mn(4),decomp%local%mx(4)
2418  do k=decomp%local%mn(3),decomp%local%mx(3)
2419  do j=decomp%local%mn(2),decomp%local%mx(2)
2420  do i=decomp%local%mn(1),decomp%local%mx(1)
2421  write(fd,'(I3,I3,I3,I3,I3,I3,F20.16)') i, j, k, l, m, n, arr(i,j,k,l,m,n)
2422  enddo
2423  enddo
2424  enddo
2425  enddo
2426  enddo
2427  enddo
2428  close(fd)
2429  end subroutine
2430 
2431 
2432  subroutine dump_dd_information(file, topo, decomp)
2433  character(len=*), intent(in) :: file
2434  type(sll_t_cartesian_topology_6d), intent(in) :: topo
2435  type(sll_t_decomposition_6d), intent(in) :: decomp
2436  integer, parameter :: fd = 67
2437  open(unit=fd, file=file)
2438  write(fd,'(A,1I4)') "topo%rank : ", topo%rank
2439  write(fd,'(A,1I4)') "topo%nprocs : ", topo%nprocs
2440  write(fd,'(A,6I4)') "topo%procs : ", topo%procs
2441  write(fd,'(A,6L4)') "topo%periodic : ", topo%periodic
2442  write(fd,'(A,6I4)') "topo%coords : ", topo%coords
2443  write(fd,'(A,12I4)') "topo%neighbors : ", topo%neighbors
2444  write(fd,'(A,6I4)') "decomp%global : ", decomp%global
2445  write(fd,'(A,6I4)') "decomp%local%mn : ", decomp%local%mn
2446  write(fd,'(A,6I4)') "decomp%local%mx : ", decomp%local%mx
2447  write(fd,'(A,6I4)') "decomp%local%hw : ", decomp%local%hw
2448  write(fd,'(A,6I4)') "decomp%local%lo : ", decomp%local%lo
2449  write(fd,'(A,6I4)') "decomp%local%hi : ", decomp%local%hi
2450  write(fd,'(A,6I4)') "decomp%local%nw : ", decomp%local%nw
2451  write(fd,'(A,6I4)') "decomp%local%gw : ", decomp%local%gw
2452  write(fd,'(A,6I4)') "decomp%local%tx_lolo : ", decomp%local%tx_lolo
2453  write(fd,'(A,6I4)') "decomp%local%tx_lohi : ", decomp%local%tx_lohi
2454  write(fd,'(A,6I4)') "decomp%local%tx_hilo : ", decomp%local%tx_hilo
2455  write(fd,'(A,6I4)') "decomp%local%tx_hihi : ", decomp%local%tx_hihi
2456  write(fd,'(A,6I4)') "decomp%local%rx_lolo : ", decomp%local%rx_lolo
2457  write(fd,'(A,6I4)') "decomp%local%rx_lohi : ", decomp%local%rx_lohi
2458  write(fd,'(A,6I4)') "decomp%local%rx_hilo : ", decomp%local%rx_hilo
2459  write(fd,'(A,6I4)') "decomp%local%rx_hihi : ", decomp%local%rx_hihi
2460  close(fd)
2461  end subroutine
2462 
2463  ! dummy routine to check correct linking, do not call
2464  subroutine dummy_mempool()
2465 #ifdef USE_FMEMPOOL
2467 #endif
2468  end subroutine dummy_mempool
2469 
2470 
2471  function sll_f_set_process_grid(mpi_world_size, process_grid_par) result(process_grid)
2472  integer :: process_grid(6), i, j
2473  integer, intent(in) :: mpi_world_size ! number of MPI processes
2474  integer, intent(in), optional :: process_grid_par(6) ! process grid set via parameter file, shall be initialized to zero otherwise
2475  logical :: swap_process_grid
2476 
2477  swap_process_grid = sll_f_query_environment("SLL_SWAP_PROCESS_GRID", .false.)
2478 
2479  if ((present(process_grid_par)) .and. (product(process_grid_par) == mpi_world_size)) then
2480  ! apply the process grid passed as a parameter (i.e. set via the parameter file)
2481  process_grid = process_grid_par
2482  else
2486 
2487  ! divide starting with the last dimension (may be swapped later, see below)
2488  select case(mpi_world_size)
2489  case(1)
2490  process_grid = [1,1,1,1,1,1]
2491  case(2)
2492  process_grid = [1,1,1,1,1,2]
2493  case(4)
2494  process_grid = [1,1,1,1,2,2]
2495  case(8)
2496  process_grid = [1,1,1,2,2,2]
2497  case(16)
2498  process_grid = [1,1,2,2,2,2]
2499  case(32)
2500  process_grid = [1,2,2,2,2,2]
2501  case(64)
2502  process_grid = [2,2,2,2,2,2]
2503  case(96)
2504  process_grid = [2,2,2,2,2,3]
2505  case(128)
2506  process_grid = [2,2,2,2,2,4]
2507  case(256)
2508  process_grid = [2,2,2,2,4,4]
2509  case(512)
2510  process_grid = [2,2,2,4,4,4]
2511  case(1024)
2512  process_grid = [2,2,4,4,4,4]
2513  case(2048)
2514  process_grid = [2,4,4,4,4,4]
2515  case(4096)
2516  process_grid = [4,4,4,4,4,4]
2517  case(8192)
2518  process_grid = [4,4,4,4,4,8]
2519  case(16384)
2520  process_grid = [4,4,4,4,8,8]
2521  case(32768)
2522  process_grid = [4,4,4,8,8,8]
2523  case(65536)
2524  process_grid = [4,4,8,8,8,8]
2525  case(131072)
2526  process_grid = [4,8,8,8,8,8]
2527  case(262144)
2528  process_grid = [8,8,8,8,8,8]
2529  case default
2530  write(*,*) "Error: No process topology implemented for ", mpi_world_size, " processes. STOP."
2531  end select
2532 
2533  if (swap_process_grid) then
2534  do i=1,3
2535  j = process_grid(7-i)
2536  process_grid(7-i) = process_grid(i)
2537  process_grid(i) = j
2538  enddo
2539  endif
2540  endif
2541  end function sll_f_set_process_grid
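  ! Usage sketch: for 64 MPI processes the automatic choice is
  !   procs = sll_f_set_process_grid(64)                 ! --> [2,2,2,2,2,2]
  ! while a grid from a parameter file is honored if it matches the world size,
  !   procs = sll_f_set_process_grid(64, [1,1,1,4,4,4])  ! --> [1,1,1,4,4,4]
  ! Setting the environment variable SLL_SWAP_PROCESS_GRID reverses the order of
  ! the automatically chosen grid.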
2542 
2543 end module sll_m_decomposition
2544 