Report Typos and Errors    
Semi-Lagrangian Library
Modular library for kinetic and gyrokinetic simulations of plasmas in fusion energy devices.
sll_m_advection_6d_lagrange_dd_slim.F90
Go to the documentation of this file.
1 
8 
9 
10 #ifdef USE_HALO_REAL32
11 #define HALO_DTYPE sll_real32
12 #else
13 #define HALO_DTYPE sll_real64
14 #endif
15 
16 ! --- confirmed 20160825: cache blocking ON gives the same result as OFF ---
17 
19 !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
20 #include "sll_assert.h"
21 #include "sll_errors.h"
22 #include "sll_working_precision.h"
23 
24  use sll_m_constants, only : &
26 
27  use sll_m_interpolators_1d_base, only: &
29 
33 
34  use sll_m_decomposition, only: &
43 
46 
52 
53 #ifdef _OPENMP
54  use omp_lib
55 ! TODO : confirm safety of collapse(2)
56 #define OMP_COLLAPSE collapse(2)
57 #define OMP_SCHEDULE schedule(static)
58 #endif
59 
60 #ifdef USE_FMEMPOOL
61  use fmempool
62 #endif
63 
64  implicit none
65 
66  ! --- Lagrange routines ---
67  public :: &
88 
89  private
90 !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
91 
93  sll_int32 :: lagrange_width(2)
94 
95  sll_real64, allocatable :: displacement_eta1(:)
96  sll_real64, allocatable :: displacement_eta2(:)
97  sll_real64, allocatable :: displacement_eta3(:)
98 
99  sll_int32 , allocatable :: halo_blocks_eta1(:,:,:)
100  sll_int32 , allocatable :: halo_blocks_eta2(:,:,:)
101  sll_int32 , allocatable :: halo_blocks_eta3(:,:,:)
102 
103  sll_int32 , allocatable :: halo_width_eta1(:,:)
104  sll_int32 , allocatable :: halo_width_eta2(:,:)
105  sll_int32 , allocatable :: halo_width_eta3(:,:)
106 
107  sll_int32 :: n_halo_blocks(3)
109 
110 
111 contains
112 
113 
115  class(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
116 
117  if (allocated(self%displacement_eta1)) deallocate(self%displacement_eta1)
118  if (allocated(self%displacement_eta2)) deallocate(self%displacement_eta2)
119  if (allocated(self%displacement_eta3)) deallocate(self%displacement_eta3)
120 
121  if (allocated(self%halo_blocks_eta1)) deallocate(self%halo_blocks_eta1)
122  if (allocated(self%halo_blocks_eta2)) deallocate(self%halo_blocks_eta2)
123  if (allocated(self%halo_blocks_eta3)) deallocate(self%halo_blocks_eta3)
124 
125  if (allocated(self%halo_width_eta1)) deallocate(self%halo_width_eta1)
126  if (allocated(self%halo_width_eta2)) deallocate(self%halo_width_eta2)
127  if (allocated(self%halo_width_eta3)) deallocate(self%halo_width_eta3)
129 
130 
131  subroutine sll_s_advection_6d_lagrange_dd_slim_init(self, lagrange_width )
132  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
133  sll_int32, intent( in ) :: lagrange_width(2)
134  self%lagrange_width = lagrange_width
135 #ifdef DISABLE_CACHE_BLOCKING
136  write(*,*) "sll_m_advection_6d_lagrange_dd_slim :: cache blocking disabled"
137 #endif
139 
140 
141  ! --- calculate depth of the buffers used for cache blocking ---
142  function get_wx(decomposition, id_in)
143  type(sll_t_decomposition_slim_6d), intent(in) :: decomposition
144  sll_int32, intent(in), optional :: id_in
145  sll_int32 :: get_wx, wx, id
146 #ifndef DISABLE_CACHE_BLOCKING
147  ! --- by default we want to block in the first dimension
148  if (present(id_in)) then
149  id = id_in
150  else
151  id = 1
152  endif
153  ! --- calculate width of prefetch cache buffer
154 ! wx = 8 ! SandyBridge cache line is 64 Bytes long
155 ! do while(wx > 1)
156 ! if (mod(decomposition%local%nw(id),wx) == 0) then
157 ! exit
158 ! else
159 ! wx = wx / 2
160 ! endif
161 ! enddo
162  wx = decomposition%local%nw(id)
163 #else
164  wx = 1
165 #endif
166  get_wx = wx
167  end function get_wx
168 
169 
170 
171 ! --- SETUP ROUTINES FOR LAGRANGE INTERPOLATION WITH VARIABLE DISPLACEMENT BELOW ---
172 
173  subroutine sll_s_advection_6d_lagrange_dd_slim_set_eta123(self, decomposition, displacement_eta1, &
174  displacement_eta2, displacement_eta3)
175  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
176  type(sll_t_decomposition_slim_6d), intent(in) :: decomposition
177 
178  sll_real64, intent(in) :: displacement_eta1(:)
179  sll_real64, intent(in) :: displacement_eta2(:)
180  sll_real64, intent(in) :: displacement_eta3(:)
181 
182  allocate(self%displacement_eta1(decomposition%local%mn(4):decomposition%local%mx(4)))
183  self%displacement_eta1 = displacement_eta1
184  allocate(self%displacement_eta2(decomposition%local%mn(5):decomposition%local%mx(5)))
185  self%displacement_eta2 = displacement_eta2
186  allocate(self%displacement_eta3(decomposition%local%mn(6):decomposition%local%mx(6)))
187  self%displacement_eta3 = displacement_eta3
188 
189  call make_blocks_lagrange(4, self%lagrange_width(1), decomposition, &
190  self%displacement_eta1, self%halo_blocks_eta1, &
191  self%halo_width_eta1, self%n_halo_blocks(1))
192  call make_blocks_lagrange(5, self%lagrange_width(1), decomposition, &
193  self%displacement_eta2, self%halo_blocks_eta2, &
194  self%halo_width_eta2, self%n_halo_blocks(2))
195  call make_blocks_lagrange(6, self%lagrange_width(1), decomposition, &
196  self%displacement_eta3, self%halo_blocks_eta3, &
197  self%halo_width_eta3, self%n_halo_blocks(3))
199 
200 
202  subroutine make_blocks_lagrange(ind, stencil, decomposition, disp, halo_blocks , halo_width, n_halo_blocks)
203  sll_int32, intent( in ) :: ind
204  sll_int32, intent( in ) :: stencil
205  type(sll_t_decomposition_slim_6d), intent( in ) :: decomposition
206  sll_real64, intent( inout ) :: disp(decomposition%local%mn(ind):decomposition%local%mx(ind))
207  sll_int32, intent( out ), allocatable :: halo_blocks(:,:,:)
208  sll_int32, intent( out ), allocatable :: halo_width(:,:)
209  sll_int32, intent( out ) :: n_halo_blocks
210 
211  sll_int32 :: index_range(2)
212  sll_int32 :: blocks, j, bl
213  sll_int32 :: box1, box2
214 
215  index_range = [ decomposition%local%mn(ind), decomposition%local%mx(ind) ]
216 
217  ! Determine the number of blocks: We want to allow for displacements of maximally
218  ! stencil/2 cells (such that the total number of halo cells can be stencil-1 in each
219  ! of the blocks.
220  ! For a stencil of width "stencil" (even) the maximum number of blocks is
221  ! stencil
222  blocks = stencil
223  ! However, not all blocks might be present, so let us check first how many blocks there are or rather the interval of boxes that are there
224  ! Since we assume monotonic displacement, we only need to identify the box of the first and last value
225  bl = index_range(1)
226  if ( abs(disp(bl) ) == 0.0_f64) bl = bl+1
227  box1 = floor( disp(bl) )
228  bl = index_range(2)
229  if ( abs(disp(bl) ) == 0.0_f64) bl = bl-1
230  box2 = floor( disp(bl) )
231 
232  ! The allowed boxes in this numbering are -stencil/2, ...., stencil/2-1 (otherwise the displacement was too large in modulus)
233  sll_assert( box1 >= -stencil/2 )
234  sll_assert( box1 < stencil/2 )
235  sll_assert( box2 >= -stencil/2 )
236  sll_assert( box2 < stencil/2 )
237  ! Compute number of blocks
238  blocks = abs(box2-box1)+1
239 
240  ! Now that we know the number of blocks, we can allocate the array holding the block information
241  allocate( halo_blocks(6, 2, blocks) )
242  allocate( halo_width(2, blocks) )
243  do j=1,blocks
244  halo_blocks(:, 1, j) = decomposition%local%mn
245  halo_blocks(:, 2, j) = decomposition%local%mx
246  end do
247 
248  ! We have to distinguish increasing and decreasing displacement
249  if (box1 > box2 ) then
250  j = index_range(1)
251  do bl = 1,blocks!box1, box2
252  if ( abs(disp(j)) == 0.0_f64 ) j = j+1
253  halo_width(1, bl ) = stencil/2 - (box1-bl+1)-1
254  halo_width(2, bl ) = stencil/2 + (box1-bl+1)
255  halo_blocks(ind, 1, bl ) = j
256  do while( disp(j) > box1-bl+1 )
257  j = j+1
258  if ( j > index_range(2)) exit
259  end do
260  if ( abs(disp(j-1)) == 0.0_f64 ) then
261  halo_blocks(ind, 2, bl ) = j-2
262  else
263  halo_blocks(ind, 2, bl ) = j-1
264  end if
265  end do
266  else
267  j = index_range(1)
268  do bl=box1, box2
269  if ( abs(disp(j)) == 0.0_f64 ) j = j+1
270  halo_width(1, bl-box1+1) = stencil/2 - bl-1
271  halo_width(2, bl-box1+1) = stencil/2 + bl
272  halo_blocks(ind, 1, bl-box1+1) = j
273  do while ( (disp(j) < bl+1) )
274  j = j+1
275  if ( j > index_range(2) ) exit
276  end do
277  if ( abs(disp(j-1)) == 0.0_f64 ) then
278  halo_blocks(ind, 2, bl-box1+1) = j-2
279  else
280  halo_blocks(ind, 2, bl-box1+1) = j-1
281  end if
282  end do
283  end if
284 
285  n_halo_blocks = blocks
286  end subroutine make_blocks_lagrange
287 
288 
289 
291  subroutine sll_s_advection_6d_lagrange_dd_slim_fadvect_eta1(self, topology, decomposition, f6d)
292  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
293  type(sll_t_cartesian_topology_6d), intent( in ) :: topology
294  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
295  sll_real64, intent(inout) :: f6d(&
296  decomposition%local%mn(1):decomposition%local%mx(1), &
297  decomposition%local%mn(2):decomposition%local%mx(2), &
298  decomposition%local%mn(3):decomposition%local%mx(3), &
299  decomposition%local%mn(4):decomposition%local%mx(4), &
300  decomposition%local%mn(5):decomposition%local%mx(5), &
301  decomposition%local%mn(6):decomposition%local%mx(6))
302 
303  sll_int32, parameter :: id = 1 ! eta1
304  sll_int32 :: j
305 
306  do j=1, self%n_halo_blocks(id)
308  decomposition, &
309  f6d, &
310  id, &
311  self%halo_width_eta1(1, j), &
312  self%halo_width_eta1(2, j), &
313  self%halo_blocks_eta1(:,:,j))
314 
316  self, &
317  decomposition, &
318  self%halo_blocks_eta1(id+3, :, j), &
319  self%displacement_eta1, &
320  f6d)
321  end do
323 
325  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta1_core(self, decomposition, eta4_cut, displacement, f6d)
326  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
327  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
328  sll_int32, intent(in) :: eta4_cut(2)
329  sll_real64, intent(in) :: displacement(decomposition%local%mn(4):decomposition%local%mx(4))
330  sll_real64, intent(inout) :: f6d(&
331  decomposition%local%mn(1):decomposition%local%mx(1), &
332  decomposition%local%mn(2):decomposition%local%mx(2), &
333  decomposition%local%mn(3):decomposition%local%mx(3), &
334  decomposition%local%mn(4):decomposition%local%mx(4), &
335  decomposition%local%mn(5):decomposition%local%mx(5), &
336  decomposition%local%mn(6):decomposition%local%mx(6))
337  sll_int32, parameter :: id = 1 ! eta1
338  sll_int32 :: i,j,k,l,m,n,w,wx, lw
339  sll_real64, pointer :: buf_i(:,:)
340  sll_real64, pointer :: buf_o(:,:)
341 #ifdef USE_FMEMPOOL
342  sll_int32 :: mn(2), mx(2)
343 #endif
344  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
345  sll_int32, pointer :: loop_mn(:), loop_mx(:)
346  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
347  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
348 
349  ! cache values to avoid multiple dereferencing inside the loops
350  l_mn = decomposition%local%halo_left%mn(id)
351  l_mx = decomposition%local%halo_left%mx(id)
352  c_mn = decomposition%local%mn(id)
353  c_mx = decomposition%local%mx(id)
354  r_mn = decomposition%local%halo_right%mn(id)
355  r_mx = decomposition%local%halo_right%mx(id)
356  l_halo => decomposition%local%halo_left%buf
357  r_halo => decomposition%local%halo_right%buf
358  loop_mn => decomposition%local%mn
359  loop_mx => decomposition%local%mx
360  ! It turns out that blocking over the second dimension is not significantly faster,
361  ! even slower that no blocking, so be disable it by default!
362 #ifdef CACHE_BLOCKING_J
363  ! NOTE: Buffering is done here for the second dim "j".
364  ! This implementation uses 0-based indexing in w!
365  wx = get_wx(decomposition, 2)
366 #else
367  wx = 1
368 #endif
369  lw = self%lagrange_width(1)
370 
371 #ifdef USE_FMEMPOOL
372  mn = [l_mn, 0]
373  mx = [r_mx, wx-1]
374 #endif
375 
376 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
377 #ifdef USE_FMEMPOOL
378  call mp_acquire(buf_i, mn, mx)
379  call mp_acquire(buf_o, mn, mx)
380 #else
381  allocate(buf_i(l_mn:r_mx, 0:wx-1))
382  allocate(buf_o(l_mn:r_mx, 0:wx-1))
383 #endif
384 !$omp do OMP_COLLAPSE OMP_SCHEDULE
385  do n=loop_mn(6), loop_mx(6)
386  do m=loop_mn(5), loop_mx(5)
387  do l=eta4_cut(1), eta4_cut(2)
388  do k=loop_mn(3), loop_mx(3)
389 #ifdef CACHE_BLOCKING_J
390  do j=loop_mn(2), loop_mx(2), wx
391  do w=0,wx-1
392  do i=l_mn, l_mx
393  buf_i(i,w) = l_halo(i,j+w,k,l,m,n)
394  enddo
395  enddo
396  do w=0,wx-1
397  do i=c_mn, c_mx
398  buf_i(i,w) = f6d(i,j+w,k,l,m,n)
399  enddo
400  enddo
401  do w=0,wx-1
402  do i=r_mn, r_mx
403  buf_i(i,w) = r_halo(i,j+w,k,l,m,n)
404  enddo
405  enddo
406  ! perform interpolation
407  do w=0,wx-1
409  buf_i(:,w), &
410  buf_o(:,w), &
411  displacement(l), &
412  lw)
413  enddo
414  ! copy-back interpolated values
415 ! f6d(:,j,k,l,m,n) = buf_o(c_mn:c_mx)
416  do w=0,wx-1
417  do i=c_mn, c_mx
418  f6d(i,j+w,k,l,m,n) = buf_o(i,w)
419  enddo
420  enddo
421  end do
422 #else
423  do j=loop_mn(2), loop_mx(2)
424  ! (1) fill input buffer piecewise
425 ! if ( l_mn <= l_mx ) then
426 ! buf_i(l_mn:l_mx) = l_halo(:,j,k,l,m,n)
427 ! end if
428 ! buf_i(c_mn:c_mx) = f6d(:,j,k,l,m,n)
429 ! if (r_mn <= r_mx ) then
430 ! buf_i(r_mn:r_mx) = r_halo(:,j,k,l,m,n)
431 ! end if
432  do i=l_mn, l_mx
433  buf_i(i,0) = l_halo(i,j,k,l,m,n)
434  enddo
435  do i=c_mn, c_mx
436  buf_i(i,0) = f6d(i,j,k,l,m,n)
437  enddo
438  do i=r_mn, r_mx
439  buf_i(i,0) = r_halo(i,j,k,l,m,n)
440  enddo
441  ! (2) perform interpolation
443  buf_i(:,0), &
444  buf_o(:,0), &
445  displacement(l), &
446  lw)
447  ! (3) copy-back interpolated values
448 ! f6d(:,j,k,l,m,n) = buf_o(c_mn:c_mx)
449  do i=c_mn, c_mx
450  f6d(i,j,k,l,m,n) = buf_o(i,0)
451  enddo
452  end do
453 #endif
454  end do
455  end do
456  end do
457  end do
458 !$omp end do
459 #ifdef USE_FMEMPOOL
460  call mp_release(buf_i)
461  call mp_release(buf_o)
462 #else
463  deallocate(buf_i)
464  deallocate(buf_o)
465 #endif
466 !$omp end parallel
468 
469 
471  subroutine sll_s_advection_6d_lagrange_dd_slim_fadvect_eta2(self, topology, decomposition, f6d)
472  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
473  type(sll_t_cartesian_topology_6d), intent( in ) :: topology
474  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
475  sll_real64, intent(inout) :: f6d(&
476  decomposition%local%mn(1):decomposition%local%mx(1), &
477  decomposition%local%mn(2):decomposition%local%mx(2), &
478  decomposition%local%mn(3):decomposition%local%mx(3), &
479  decomposition%local%mn(4):decomposition%local%mx(4), &
480  decomposition%local%mn(5):decomposition%local%mx(5), &
481  decomposition%local%mn(6):decomposition%local%mx(6))
482  sll_int32, parameter :: id = 2 ! eta2
483  sll_int32 :: j
484 
485  do j=1, self%n_halo_blocks(id)
487  decomposition, &
488  f6d, &
489  id, &
490  self%halo_width_eta2(1, j), &
491  self%halo_width_eta2(2, j), &
492  self%halo_blocks_eta2(:,:,j))
493 
495  self, &
496  decomposition, &
497  self%halo_blocks_eta2(id+3, :, j), &
498  self%displacement_eta2, &
499  f6d)
500  end do
502 
504  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta2_core(self, decomposition, eta5_cut, displacement, f6d)
505  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
506  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
507  sll_int32, intent(in) :: eta5_cut(2)
508  sll_real64, intent(in) :: displacement(decomposition%local%mn(5):decomposition%local%mx(5))
509  sll_real64, intent(inout) :: f6d(&
510  decomposition%local%mn(1):decomposition%local%mx(1), &
511  decomposition%local%mn(2):decomposition%local%mx(2), &
512  decomposition%local%mn(3):decomposition%local%mx(3), &
513  decomposition%local%mn(4):decomposition%local%mx(4), &
514  decomposition%local%mn(5):decomposition%local%mx(5), &
515  decomposition%local%mn(6):decomposition%local%mx(6))
516  sll_int32, parameter :: id = 2 ! eta2
517  sll_int32 :: i,j,k,l,m,n,w,wx, lw
518  sll_real64, pointer :: buf_i(:,:)
519  sll_real64, pointer :: buf_o(:,:)
520 #ifdef USE_FMEMPOOL
521  sll_int32 :: mn(2), mx(2)
522 #endif
523  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
524  sll_int32, pointer :: loop_mn(:), loop_mx(:)
525  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
526  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
527 
528  ! cache values to avoid multiple dereferencing inside the loops
529  l_mn = decomposition%local%halo_left%mn(id)
530  l_mx = decomposition%local%halo_left%mx(id)
531  c_mn = decomposition%local%mn(id)
532  c_mx = decomposition%local%mx(id)
533  r_mn = decomposition%local%halo_right%mn(id)
534  r_mx = decomposition%local%halo_right%mx(id)
535  l_halo => decomposition%local%halo_left%buf
536  r_halo => decomposition%local%halo_right%buf
537  loop_mn => decomposition%local%mn
538  loop_mx => decomposition%local%mx
539  wx = get_wx(decomposition)
540  lw = self%lagrange_width(1)
541 
542 #ifdef USE_FMEMPOOL
543  mn = [l_mn, 0]
544  mx = [r_mx, wx-1]
545 #endif
546 
547 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
548 #ifdef USE_FMEMPOOL
549  call mp_acquire(buf_i, mn, mx)
550  call mp_acquire(buf_o, mn, mx)
551 #else
552  allocate(buf_i(l_mn:r_mx, 0:wx-1))
553  allocate(buf_o(l_mn:r_mx, 0:wx-1))
554 #endif
555 !$omp do OMP_COLLAPSE OMP_SCHEDULE
556  do n=loop_mn(6), loop_mx(6)
557  do m=eta5_cut(1), eta5_cut(2)
558  do l=loop_mn(4), loop_mx(4)
559  do k=loop_mn(3), loop_mx(3)
560 #ifndef DISABLE_CACHE_BLOCKING
561  do i=loop_mn(1), loop_mx(1), wx
562  ! (1) fill input buffer piecewise
563  do j=l_mn,l_mx
564  do w=0,wx-1
565  buf_i(j,w) = l_halo(i+w,j,k,l,m,n)
566  enddo
567  enddo
568  do j=c_mn,c_mx
569  do w=0,wx-1
570  buf_i(j,w) = f6d(i+w,j,k,l,m,n)
571  enddo
572  enddo
573  do j=r_mn,r_mx
574  do w=0,wx-1
575  buf_i(j,w) = r_halo(i+w,j,k,l,m,n)
576  enddo
577  enddo
578  ! (2) perform interpolation
579  do w=0,wx-1
581  buf_i(:,w), &
582  buf_o(:,w), &
583  displacement(m), &
584  lw)
585  enddo
586  ! (3) copy-back interpolated values
587 ! f6d(i,:,k,l,m,n) = buf_o(c_mn:c_mx)
588  do j=c_mn,c_mx
589  do w=0,wx-1
590  f6d(i+w,j,k,l,m,n) = buf_o(j,w)
591  enddo
592  enddo
593  end do
594 #else
595  do i=loop_mn(1), loop_mx(1)
596  ! (1) fill input buffer piecewise
597  do j=l_mn,l_mx
598  buf_i(j,0) = l_halo(i,j,k,l,m,n)
599  enddo
600  do j=c_mn,c_mx
601  buf_i(j,0) = f6d(i,j,k,l,m,n)
602  enddo
603  do j=r_mn,r_mx
604  buf_i(j,0) = r_halo(i,j,k,l,m,n)
605  enddo
606  ! (2) perform interpolation
608  buf_i(:,0), &
609  buf_o(:,0), &
610  displacement(m), &
611  lw)
612  ! (3) copy-back interpolated values
613 ! f6d(i,:,k,l,m,n) = buf_o(c_mn:c_mx)
614  do j=c_mn,c_mx
615  f6d(i,j,k,l,m,n) = buf_o(j,0)
616  enddo
617  end do
618 #endif
619  end do
620  end do
621  end do
622  end do
623 !$omp end do
624 #ifdef USE_FMEMPOOL
625  call mp_release(buf_i)
626  call mp_release(buf_o)
627 #else
628  deallocate(buf_i)
629  deallocate(buf_o)
630 #endif
631 !$omp end parallel
633 
634 
636  subroutine sll_s_advection_6d_lagrange_dd_slim_fadvect_eta3(self, topology, decomposition, f6d)
637  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
638  type(sll_t_cartesian_topology_6d), intent( in ) :: topology
639  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
640  sll_real64, intent(inout) :: f6d(&
641  decomposition%local%mn(1):decomposition%local%mx(1), &
642  decomposition%local%mn(2):decomposition%local%mx(2), &
643  decomposition%local%mn(3):decomposition%local%mx(3), &
644  decomposition%local%mn(4):decomposition%local%mx(4), &
645  decomposition%local%mn(5):decomposition%local%mx(5), &
646  decomposition%local%mn(6):decomposition%local%mx(6))
647 
648  sll_int32, parameter :: id = 3 ! eta3
649  sll_int32 :: j
650 
651  do j=1, self%n_halo_blocks(id)
653  decomposition, &
654  f6d, &
655  id, &
656  self%halo_width_eta3(1, j), &
657  self%halo_width_eta3(2, j), &
658  self%halo_blocks_eta3(:,:,j))
659 
661  self, &
662  decomposition, &
663  self%halo_blocks_eta3(id+3, :, j), &
664  self%displacement_eta3, &
665  f6d)
666  end do
668 
670  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta3_core(self, decomposition, eta6_cut, displacement, f6d)
671  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
672  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
673  sll_int32, intent(in) :: eta6_cut(2)
674  sll_real64, intent(in) :: displacement(decomposition%local%mn(6):decomposition%local%mx(6))
675  sll_real64, intent(inout) :: f6d(&
676  decomposition%local%mn(1):decomposition%local%mx(1), &
677  decomposition%local%mn(2):decomposition%local%mx(2), &
678  decomposition%local%mn(3):decomposition%local%mx(3), &
679  decomposition%local%mn(4):decomposition%local%mx(4), &
680  decomposition%local%mn(5):decomposition%local%mx(5), &
681  decomposition%local%mn(6):decomposition%local%mx(6))
682  sll_int32, parameter :: id = 3 ! eta3
683  sll_int32 :: i,j,k,l,m,n,w,wx, lw
684  sll_real64, pointer :: buf_i(:,:)
685  sll_real64, pointer :: buf_o(:,:)
686 #ifdef USE_FMEMPOOL
687  sll_int32 :: mn(2), mx(2)
688 #endif
689  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
690  sll_int32, pointer :: loop_mn(:), loop_mx(:)
691  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
692  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
693 
694  ! cache values to avoid multiple dereferencing inside the loops
695  l_mn = decomposition%local%halo_left%mn(id)
696  l_mx = decomposition%local%halo_left%mx(id)
697  c_mn = decomposition%local%mn(id)
698  c_mx = decomposition%local%mx(id)
699  r_mn = decomposition%local%halo_right%mn(id)
700  r_mx = decomposition%local%halo_right%mx(id)
701  l_halo => decomposition%local%halo_left%buf
702  r_halo => decomposition%local%halo_right%buf
703  loop_mn => decomposition%local%mn
704  loop_mx => decomposition%local%mx
705  wx = get_wx(decomposition)
706  lw = self%lagrange_width(1)
707 
708 #ifdef USE_FMEMPOOL
709  mn = [l_mn, 0]
710  mx = [r_mx, wx-1]
711 #endif
712 
713 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
714 #ifdef USE_FMEMPOOL
715  call mp_acquire(buf_i, mn, mx)
716  call mp_acquire(buf_o, mn, mx)
717 #else
718  allocate(buf_i(l_mn:r_mx, 0:wx-1))
719  allocate(buf_o(l_mn:r_mx, 0:wx-1))
720 #endif
721 !$omp do OMP_COLLAPSE OMP_SCHEDULE
722  do n=eta6_cut(1), eta6_cut(2)
723  do m=loop_mn(5), loop_mx(5)
724  do l=loop_mn(4), loop_mx(4)
725  do j=loop_mn(2), loop_mx(2)
726 #ifndef DISABLE_CACHE_BLOCKING
727  do i=loop_mn(1), loop_mx(1), wx
728  ! (1) fill input buffer piecewise
729  do k=l_mn,l_mx
730  do w=0,wx-1
731  buf_i(k,w) = l_halo(i+w,j,k,l,m,n)
732  enddo
733  enddo
734  do k=c_mn,c_mx
735  do w=0,wx-1
736  buf_i(k,w) = f6d(i+w,j,k,l,m,n)
737  enddo
738  enddo
739  do k=r_mn,r_mx
740  do w=0,wx-1
741  buf_i(k,w) = r_halo(i+w,j,k,l,m,n)
742  enddo
743  enddo
744  ! (2) perform interpolation
745  do w=0,wx-1
747  buf_i(:,w), &
748  buf_o(:,w), &
749  displacement(n), &
750  lw)
751  enddo
752  ! (3) copy-back interpolated values
753 ! f6d(i,j,:,l,m,n) = buf_o(c_mn:c_mx)
754  do k=c_mn,c_mx
755  do w=0,wx-1
756  f6d(i+w,j,k,l,m,n) = buf_o(k,w)
757  enddo
758  enddo
759  end do
760 #else
761  do i=loop_mn(1), loop_mx(1)
762  ! (1) fill input buffer piecewise
763  do k=l_mn,l_mx
764  buf_i(k,0) = l_halo(i,j,k,l,m,n)
765  enddo
766  do k=c_mn,c_mx
767  buf_i(k,0) = f6d(i,j,k,l,m,n)
768  enddo
769  do k=r_mn,r_mx
770  buf_i(k,0) = r_halo(i,j,k,l,m,n)
771  enddo
772  ! (2) perform interpolation
774  buf_i(:,0), &
775  buf_o(:,0), &
776  displacement(n), &
777  lw)
778  ! (3) copy-back interpolated values
779 ! f6d(i,j,:,l,m,n) = buf_o(c_mn:c_mx)
780  do k=c_mn,c_mx
781  f6d(i,j,k,l,m,n) = buf_o(k,0)
782  enddo
783  end do
784 #endif
785  end do
786  end do
787  end do
788  end do
789 !$omp end do
790 #ifdef USE_FMEMPOOL
791  call mp_release(buf_i)
792  call mp_release(buf_o)
793 #else
794  deallocate(buf_i)
795  deallocate(buf_o)
796 #endif
797 !$omp end parallel
799 
800 
801 ! --- LAGRANGE INTERPOLATION ROUTINES WITH *FIXED* DISCPLACEMENT BELOW ---
802 ! (previously sll_m_advection_6d_lagrange_dd_slim.BAK.F90)
803 
804 
806  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta1(self, decomposition, displacement, f6d)
807  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
808  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
809  sll_real64, intent(in) :: displacement(decomposition%local%mn(4):decomposition%local%mx(4))
810  sll_real64, intent(inout) :: f6d(&
811  decomposition%local%mn(1):decomposition%local%mx(1), &
812  decomposition%local%mn(2):decomposition%local%mx(2), &
813  decomposition%local%mn(3):decomposition%local%mx(3), &
814  decomposition%local%mn(4):decomposition%local%mx(4), &
815  decomposition%local%mn(5):decomposition%local%mx(5), &
816  decomposition%local%mn(6):decomposition%local%mx(6))
817  sll_int32, parameter :: id = 1 ! eta1
818  sll_int32 :: i,j,k,l,m,n,w,wx, lw
819  sll_real64, pointer :: buf_i(:,:)
820  sll_real64, pointer :: buf_o(:,:)
821 #ifdef USE_FMEMPOOL
822  sll_int32 :: mn(2), mx(2)
823 #endif
824  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
825  sll_int32, pointer :: loop_mn(:), loop_mx(:)
826  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
827  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
828 
829  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
830  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(1)-1)/2)
831  ! cache values to avoid multiple dereferencing inside the loops
832  l_mn = decomposition%local%halo_left%mn(id)
833  l_mx = decomposition%local%halo_left%mx(id)
834  c_mn = decomposition%local%mn(id)
835  c_mx = decomposition%local%mx(id)
836  r_mn = decomposition%local%halo_right%mn(id)
837  r_mx = decomposition%local%halo_right%mx(id)
838  l_halo => decomposition%local%halo_left%buf
839  r_halo => decomposition%local%halo_right%buf
840  loop_mn => decomposition%local%mn
841  loop_mx => decomposition%local%mx
842  ! NOTE: Buffering is done here for the second dim "j".
843  ! Seems to be marginally beneficial.
844  wx = get_wx(decomposition, 2)
845  lw = self%lagrange_width(1)
846 
847 ! !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
848 ! allocate(buf_i(l_mn:r_mx))
849 ! allocate(buf_o(l_mn:r_mx))
850 ! !$omp do OMP_COLLAPSE OMP_SCHEDULE
851 ! do n=decomposition%local%mn(6), decomposition%local%mx(6)
852 ! do m=decomposition%local%mn(5), decomposition%local%mx(5)
853 ! do l=decomposition%local%mn(4), decomposition%local%mx(4)
854 ! do k=decomposition%local%mn(3), decomposition%local%mx(3)
855 ! do j=decomposition%local%mn(2), decomposition%local%mx(2)
856 ! ! (1) fill input buffer piecewise
857 ! buf_i(l_mn:l_mx) = l_halo(:,j,k,l,m,n)
858 ! buf_i(c_mn:c_mx) = f6d(:,j,k,l,m,n)
859 ! buf_i(r_mn:r_mx) = r_halo(:,j,k,l,m,n)
860 ! ! (2) perform interpolation
861 ! call sll_s_lagrange_interpolation_1d_fast_disp_fixed_haloc_cells( &
862 ! buf_i, &
863 ! buf_o, &
864 ! displacement(l), &
865 ! self%lagrange_width(1))
866 ! ! (3) copy-back interpolated values
867 ! f6d(:,j,k,l,m,n) = buf_o(c_mn:c_mx)
868 ! end do
869 ! end do
870 ! end do
871 ! end do
872 ! end do
873 ! !$omp end do
874 ! deallocate(buf_i)
875 ! deallocate(buf_o)
876 ! !$omp end parallel
877 
878 #ifdef USE_FMEMPOOL
879  mn = [l_mn, 0]
880  mx = [r_mx, wx-1]
881 #endif
882 
883 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
884 #ifdef USE_FMEMPOOL
885  call mp_acquire(buf_i, mn, mx)
886  call mp_acquire(buf_o, mn, mx)
887 #else
888  allocate(buf_i(l_mn:r_mx, 0:wx-1))
889  allocate(buf_o(l_mn:r_mx, 0:wx-1))
890 #endif
891 !$omp do OMP_COLLAPSE OMP_SCHEDULE
892  do n=loop_mn(6), loop_mx(6)
893  do m=loop_mn(5), loop_mx(5)
894  do l=loop_mn(4), loop_mx(4)
895  do k=loop_mn(3), loop_mx(3)
896 #ifndef DISABLE_CACHE_BLOCKING
897  do j=loop_mn(2), loop_mx(2), wx
898  do w=0,wx-1
899  do i=l_mn,l_mx
900  buf_i(i,w) = l_halo(i,j+w,k,l,m,n)
901  end do
902  end do
903  do w=0,wx-1
904  do i=c_mn,c_mx
905  buf_i(i,w) = f6d(i,j+w,k,l,m,n)
906  end do
907  end do
908  do w=0,wx-1
909  do i=r_mn,r_mx
910  buf_i(i,w) = r_halo(i,j+w,k,l,m,n)
911  end do
912  end do
913  do w=0,wx-1
915  buf_i(:,w), &
916  buf_o(:,w), &
917  displacement(l), &
918  lw)
919  end do
920  do w=0,wx-1
921  do i=c_mn,c_mx
922  f6d(i,j+w,k,l,m,n) = buf_o(i,w)
923  end do
924  end do
925  end do
926 #else
927  do j=loop_mn(2), loop_mx(2)
928  ! (1) fill input buffer piecewise
929 ! buf_i(l_mn:l_mx,1) = l_halo(:,j,k,l,m,n)
930 ! buf_i(c_mn:c_mx,1) = f6d(:,j,k,l,m,n)
931 ! buf_i(r_mn:r_mx,1) = r_halo(:,j,k,l,m,n)
932  do i=l_mn,l_mx
933  buf_i(i,0) = l_halo(i,j,k,l,m,n)
934  enddo
935  do i=c_mn,c_mx
936  buf_i(i,0) = f6d(i,j,k,l,m,n)
937  enddo
938  do i=r_mn,r_mx
939  buf_i(i,0) = r_halo(i,j,k,l,m,n)
940  enddo
941  ! (2) perform interpolation
943  buf_i(:,0), &
944  buf_o(:,0), &
945  displacement(l), &
946  lw)
947  ! (3) copy-back interpolated values
948 ! f6d(:,j,k,l,m,n) = buf_o(c_mn:c_mx,1)
949  do i=c_mn,c_mx
950  f6d(i,j,k,l,m,n) = buf_o(i,0)
951  enddo
952  end do
953 #endif
954  end do
955  end do
956  end do
957  end do
958 !$omp end do
959 #ifdef USE_FMEMPOOL
960  call mp_release(buf_i)
961  call mp_release(buf_o)
962 #else
963  deallocate(buf_i)
964  deallocate(buf_o)
965 #endif
966 !$omp end parallel
968 
970  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta1_dispeta45(self, decomposition, displacement, f6d)
971  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
972  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
973  sll_real64, intent(in) :: displacement(decomposition%local%mn(4):decomposition%local%mx(4), decomposition%local%mn(5):decomposition%local%mx(5))
974  sll_real64, intent(inout) :: f6d(&
975  decomposition%local%mn(1):decomposition%local%mx(1), &
976  decomposition%local%mn(2):decomposition%local%mx(2), &
977  decomposition%local%mn(3):decomposition%local%mx(3), &
978  decomposition%local%mn(4):decomposition%local%mx(4), &
979  decomposition%local%mn(5):decomposition%local%mx(5), &
980  decomposition%local%mn(6):decomposition%local%mx(6))
981  sll_int32, parameter :: id = 1 ! eta1
982  sll_int32 :: i,j,k,l,m,n,w,wx, lw
983  sll_real64, pointer :: buf_i(:,:)
984  sll_real64, pointer :: buf_o(:,:)
985 #ifdef USE_FMEMPOOL
986  sll_int32 :: mn(2), mx(2)
987 #endif
988  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
989  sll_int32, pointer :: loop_mn(:), loop_mx(:)
990  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
991  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
992 
993  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
994  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(1)-1)/2)
995  ! cache values to avoid multiple dereferencing inside the loops
996  l_mn = decomposition%local%halo_left%mn(id)
997  l_mx = decomposition%local%halo_left%mx(id)
998  c_mn = decomposition%local%mn(id)
999  c_mx = decomposition%local%mx(id)
1000  r_mn = decomposition%local%halo_right%mn(id)
1001  r_mx = decomposition%local%halo_right%mx(id)
1002  l_halo => decomposition%local%halo_left%buf
1003  r_halo => decomposition%local%halo_right%buf
1004  loop_mn => decomposition%local%mn
1005  loop_mx => decomposition%local%mx
1006  ! NOTE: Buffering is done here for the second dim "j".
1007  ! Seems to be marginally beneficial.
1008  wx = get_wx(decomposition, 2)
1009  lw = self%lagrange_width(1)
1010 
1011 #ifdef USE_FMEMPOOL
1012  mn = [l_mn, 0]
1013  mx = [r_mx, wx-1]
1014 #endif
1015 
1016 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
1017 #ifdef USE_FMEMPOOL
1018  call mp_acquire(buf_i, mn, mx)
1019  call mp_acquire(buf_o, mn, mx)
1020 #else
1021  allocate(buf_i(l_mn:r_mx, 0:wx-1))
1022  allocate(buf_o(l_mn:r_mx, 0:wx-1))
1023 #endif
1024 !$omp do OMP_COLLAPSE OMP_SCHEDULE
1025  do n=loop_mn(6), loop_mx(6)
1026  do m=loop_mn(5), loop_mx(5)
1027  do l=loop_mn(4), loop_mx(4)
1028  do k=loop_mn(3), loop_mx(3)
1029 #ifndef DISABLE_CACHE_BLOCKING
1030  do j=loop_mn(2), loop_mx(2), wx
1031  do w=0,wx-1
1032  do i=l_mn,l_mx
1033  buf_i(i,w) = l_halo(i,j+w,k,l,m,n)
1034  end do
1035  end do
1036  do w=0,wx-1
1037  do i=c_mn,c_mx
1038  buf_i(i,w) = f6d(i,j+w,k,l,m,n)
1039  end do
1040  end do
1041  do w=0,wx-1
1042  do i=r_mn,r_mx
1043  buf_i(i,w) = r_halo(i,j+w,k,l,m,n)
1044  end do
1045  end do
1046  do w=0,wx-1
1048  buf_i(:,w), &
1049  buf_o(:,w), &
1050  displacement(l,m), &
1051  lw)
1052  end do
1053  do w=0,wx-1
1054  do i=c_mn,c_mx
1055  f6d(i,j+w,k,l,m,n) = buf_o(i,w)
1056  end do
1057  end do
1058  end do
1059 #else
1060  do j=loop_mn(2), loop_mx(2)
1061  ! (1) fill input buffer piecewise
1062 ! buf_i(l_mn:l_mx,1) = l_halo(:,j,k,l,m,n)
1063 ! buf_i(c_mn:c_mx,1) = f6d(:,j,k,l,m,n)
1064 ! buf_i(r_mn:r_mx,1) = r_halo(:,j,k,l,m,n)
1065  do i=l_mn,l_mx
1066  buf_i(i,0) = l_halo(i,j,k,l,m,n)
1067  enddo
1068  do i=c_mn,c_mx
1069  buf_i(i,0) = f6d(i,j,k,l,m,n)
1070  enddo
1071  do i=r_mn,r_mx
1072  buf_i(i,0) = r_halo(i,j,k,l,m,n)
1073  enddo
1074  ! (2) perform interpolation
1076  buf_i(:,0), &
1077  buf_o(:,0), &
1078  displacement(l,m), &
1079  lw)
1080  ! (3) copy-back interpolated values
1081 ! f6d(:,j,k,l,m,n) = buf_o(c_mn:c_mx,1)
1082  do i=c_mn,c_mx
1083  f6d(i,j,k,l,m,n) = buf_o(i,0)
1084  enddo
1085  end do
1086 #endif
1087  end do
1088  end do
1089  end do
1090  end do
1091 !$omp end do
1092 #ifdef USE_FMEMPOOL
1093  call mp_release(buf_i)
1094  call mp_release(buf_o)
1095 #else
1096  deallocate(buf_i)
1097  deallocate(buf_o)
1098 #endif
1099 !$omp end parallel
1101 
1102 
1104  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta2(self, decomposition, displacement, f6d)
1105  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
1106  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
1107  sll_real64, intent(in) :: displacement(decomposition%local%mn(5):decomposition%local%mx(5))
1108  sll_real64, intent(inout) :: f6d(&
1109  decomposition%local%mn(1):decomposition%local%mx(1), &
1110  decomposition%local%mn(2):decomposition%local%mx(2), &
1111  decomposition%local%mn(3):decomposition%local%mx(3), &
1112  decomposition%local%mn(4):decomposition%local%mx(4), &
1113  decomposition%local%mn(5):decomposition%local%mx(5), &
1114  decomposition%local%mn(6):decomposition%local%mx(6))
1115  sll_int32, parameter :: id = 2 ! eta2
1116  sll_int32 :: i,j,k,l,m,n,w,wx, lw
1117  sll_real64, pointer :: buf_i(:,:)
1118  sll_real64, pointer :: buf_o(:,:)
1119 #ifdef USE_FMEMPOOL
1120  sll_int32 :: mn(2), mx(2)
1121 #endif
1122  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
1123  sll_int32, pointer :: loop_mn(:), loop_mx(:)
1124  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
1125  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
1126 
1127  ! NOTE: Current restriction for the memory-saving `slim` halos,
1128  ! to be lowered later for the real one-sided halo implementation.
1129  ! One-sided halos require a more clever index handling (though straight-forward).
1130  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
1131  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(1)-1)/2)
1132  ! cache values to avoid multiple dereferencing inside the loops
1133  l_mn = decomposition%local%halo_left%mn(id)
1134  l_mx = decomposition%local%halo_left%mx(id)
1135  c_mn = decomposition%local%mn(id)
1136  c_mx = decomposition%local%mx(id)
1137  r_mn = decomposition%local%halo_right%mn(id)
1138  r_mx = decomposition%local%halo_right%mx(id)
1139  l_halo => decomposition%local%halo_left%buf
1140  r_halo => decomposition%local%halo_right%buf
1141  loop_mn => decomposition%local%mn
1142  loop_mx => decomposition%local%mx
1143  wx = get_wx(decomposition)
1144  lw = self%lagrange_width(1)
1145 
1146 ! !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
1147 ! allocate(buf_i(l_mn:r_mx))
1148 ! allocate(buf_o(l_mn:r_mx))
1149 ! !$omp do OMP_COLLAPSE OMP_SCHEDULE
1150 ! do n=decomposition%local%mn(6), decomposition%local%mx(6)
1151 ! do m=decomposition%local%mn(5), decomposition%local%mx(5)
1152 ! do l=decomposition%local%mn(4), decomposition%local%mx(4)
1153 ! do k=decomposition%local%mn(3), decomposition%local%mx(3)
1154 ! do i=decomposition%local%mn(1), decomposition%local%mx(1)
1155 ! ! buf_i = f6d(i,:,k,l,m,n)
1156 ! ! (1) fill input buffer piecewise
1157 ! buf_i(l_mn:l_mx) = l_halo(i,:,k,l,m,n)
1158 ! buf_i(c_mn:c_mx) = f6d(i,:,k,l,m,n)
1159 ! buf_i(r_mn:r_mx) = r_halo(i,:,k,l,m,n)
1160 ! ! (2) perform interpolation
1161 ! call sll_s_lagrange_interpolation_1d_fast_disp_fixed_haloc_cells( &
1162 ! buf_i, &
1163 ! buf_o, &
1164 ! displacement(m), &
1165 ! self%lagrange_width(1))
1166 ! ! (3) copy-back interpolated values
1167 ! f6d(i,:,k,l,m,n) = buf_o(c_mn:c_mx)
1168 ! ! f6d(i,:,k,l,m,n) = buf_o
1169 ! end do
1170 ! end do
1171 ! end do
1172 ! end do
1173 ! end do
1174 ! !$omp end do
1175 ! deallocate(buf_i)
1176 ! deallocate(buf_o)
1177 ! !$omp end parallel
1178 
1179 #ifdef USE_FMEMPOOL
1180  mn = [l_mn, 0]
1181  mx = [r_mx, wx-1]
1182 #endif
1183 
1184 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
1185 #ifdef USE_FMEMPOOL
1186  call mp_acquire(buf_i, mn, mx)
1187  call mp_acquire(buf_o, mn, mx)
1188 #else
1189  allocate(buf_i(l_mn:r_mx, 0:wx-1))
1190  allocate(buf_o(l_mn:r_mx, 0:wx-1))
1191 #endif
1192 !$omp do OMP_COLLAPSE OMP_SCHEDULE
1193  do n=loop_mn(6), loop_mx(6)
1194  do m=loop_mn(5), loop_mx(5)
1195  do l=loop_mn(4), loop_mx(4)
1196  do k=loop_mn(3), loop_mx(3)
1197 #ifndef DISABLE_CACHE_BLOCKING
1198  do i=loop_mn(1), loop_mx(1), wx
1199  do j=l_mn,l_mx
1200  do w=0,wx-1
1201  buf_i(j,w) = l_halo(i+w,j,k,l,m,n)
1202  end do
1203  end do
1204  do j=c_mn,c_mx
1205  do w=0,wx-1
1206  buf_i(j,w) = f6d(i+w,j,k,l,m,n)
1207  end do
1208  end do
1209  do j=r_mn,r_mx
1210  do w=0,wx-1
1211  buf_i(j,w) = r_halo(i+w,j,k,l,m,n)
1212  end do
1213  end do
1214  do w=0,wx-1
1216  buf_i(:,w), &
1217  buf_o(:,w), &
1218  displacement(m), &
1219  lw)
1220  end do
1221  do j=c_mn,c_mx
1222  do w=0,wx-1
1223  f6d(i+w,j,k,l,m,n) = buf_o(j,w)
1224  end do
1225  end do
1226  end do
1227 #else
1228  do i=loop_mn(1), loop_mx(1)
1229  ! (1) fill input buffer piecewise
1230 ! buf_i(l_mn:l_mx,1) = l_halo(i,:,k,l,m,n)
1231 ! buf_i(c_mn:c_mx,1) = f6d(i,:,k,l,m,n)
1232 ! buf_i(r_mn:r_mx,1) = r_halo(i,:,k,l,m,n)
1233  do j=l_mn,l_mx
1234  buf_i(j,0) = l_halo(i,j,k,l,m,n)
1235  enddo
1236  do j=c_mn,c_mx
1237  buf_i(j,0) = f6d(i,j,k,l,m,n)
1238  enddo
1239  do j=r_mn,r_mx
1240  buf_i(j,0) = r_halo(i,j,k,l,m,n)
1241  enddo
1242  ! (2) perform interpolation
1244  buf_i(:,0), &
1245  buf_o(:,0), &
1246  displacement(m), &
1247  lw)
1248 ! f6d(i,:,k,l,m,n) = buf_o(c_mn:c_mx,1)
1249  do j=c_mn,c_mx
1250  f6d(i,j,k,l,m,n) = buf_o(j,0)
1251  enddo
1252  end do
1253 #endif
1254  end do
1255  end do
1256  end do
1257  end do
1258 !$omp end do
1259 #ifdef USE_FMEMPOOL
1260  call mp_release(buf_i)
1261  call mp_release(buf_o)
1262 #else
1263  deallocate(buf_i)
1264  deallocate(buf_o)
1265 #endif
1266 !$omp end parallel
1268 
1269 
1271  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta2_dispeta45(self, decomposition, displacement, f6d)
1272  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
1273  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
1274  sll_real64, intent(in) :: displacement(&
1275  decomposition%local%mn(4):decomposition%local%mx(4),&
1276  decomposition%local%mn(5):decomposition%local%mx(5))
1277  sll_real64, intent(inout) :: f6d(&
1278  decomposition%local%mn(1):decomposition%local%mx(1), &
1279  decomposition%local%mn(2):decomposition%local%mx(2), &
1280  decomposition%local%mn(3):decomposition%local%mx(3), &
1281  decomposition%local%mn(4):decomposition%local%mx(4), &
1282  decomposition%local%mn(5):decomposition%local%mx(5), &
1283  decomposition%local%mn(6):decomposition%local%mx(6))
1284  sll_int32, parameter :: id = 2 ! eta2
1285  sll_int32 :: i,j,k,l,m,n,w,wx, lw
1286  sll_real64, pointer :: buf_i(:,:)
1287  sll_real64, pointer :: buf_o(:,:)
1288 #ifdef USE_FMEMPOOL
1289  sll_int32 :: mn(2), mx(2)
1290 #endif
1291  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
1292  sll_int32, pointer :: loop_mn(:), loop_mx(:)
1293  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
1294  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
1295 
1296  ! NOTE: Current restriction for the memory-saving `slim` halos,
1297  ! to be lowered later for the real one-sided halo implementation.
1298  ! One-sided halos require a more clever index handling (though straight-forward).
1299  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
1300  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(1)-1)/2)
1301  ! cache values to avoid multiple dereferencing inside the loops
1302  l_mn = decomposition%local%halo_left%mn(id)
1303  l_mx = decomposition%local%halo_left%mx(id)
1304  c_mn = decomposition%local%mn(id)
1305  c_mx = decomposition%local%mx(id)
1306  r_mn = decomposition%local%halo_right%mn(id)
1307  r_mx = decomposition%local%halo_right%mx(id)
1308  l_halo => decomposition%local%halo_left%buf
1309  r_halo => decomposition%local%halo_right%buf
1310  loop_mn => decomposition%local%mn
1311  loop_mx => decomposition%local%mx
1312  wx = get_wx(decomposition)
1313  lw = self%lagrange_width(1)
1314 
1315 #ifdef USE_FMEMPOOL
1316  mn = [l_mn, 0]
1317  mx = [r_mx, wx-1]
1318 #endif
1319 
1320 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
1321 #ifdef USE_FMEMPOOL
1322  call mp_acquire(buf_i, mn, mx)
1323  call mp_acquire(buf_o, mn, mx)
1324 #else
1325  allocate(buf_i(l_mn:r_mx, 0:wx-1))
1326  allocate(buf_o(l_mn:r_mx, 0:wx-1))
1327 #endif
1328 !$omp do OMP_COLLAPSE OMP_SCHEDULE
1329  do n=loop_mn(6), loop_mx(6)
1330  do m=loop_mn(5), loop_mx(5)
1331  do l=loop_mn(4), loop_mx(4)
1332  do k=loop_mn(3), loop_mx(3)
1333 #ifndef DISABLE_CACHE_BLOCKING
1334  do i=loop_mn(1), loop_mx(1), wx
1335  do j=l_mn,l_mx
1336  do w=0,wx-1
1337  buf_i(j,w) = l_halo(i+w,j,k,l,m,n)
1338  end do
1339  end do
1340  do j=c_mn,c_mx
1341  do w=0,wx-1
1342  buf_i(j,w) = f6d(i+w,j,k,l,m,n)
1343  end do
1344  end do
1345  do j=r_mn,r_mx
1346  do w=0,wx-1
1347  buf_i(j,w) = r_halo(i+w,j,k,l,m,n)
1348  end do
1349  end do
1350  do w=0,wx-1
1352  buf_i(:,w), &
1353  buf_o(:,w), &
1354  displacement(l,m), &
1355  lw)
1356  end do
1357  do j=c_mn,c_mx
1358  do w=0,wx-1
1359  f6d(i+w,j,k,l,m,n) = buf_o(j,w)
1360  end do
1361  end do
1362  end do
1363 #else
1364  do i=loop_mn(1), loop_mx(1)
1365  ! (1) fill input buffer piecewise
1366 ! buf_i(l_mn:l_mx,1) = l_halo(i,:,k,l,m,n)
1367 ! buf_i(c_mn:c_mx,1) = f6d(i,:,k,l,m,n)
1368 ! buf_i(r_mn:r_mx,1) = r_halo(i,:,k,l,m,n)
1369  do j=l_mn,l_mx
1370  buf_i(j,0) = l_halo(i,j,k,l,m,n)
1371  enddo
1372  do j=c_mn,c_mx
1373  buf_i(j,0) = f6d(i,j,k,l,m,n)
1374  enddo
1375  do j=r_mn,r_mx
1376  buf_i(j,0) = r_halo(i,j,k,l,m,n)
1377  enddo
1378  ! (2) perform interpolation
1380  buf_i(:,0), &
1381  buf_o(:,0), &
1382  displacement(l,m), &
1383  lw)
1384 ! f6d(i,:,k,l,m,n) = buf_o(c_mn:c_mx,1)
1385  do j=c_mn,c_mx
1386  f6d(i,j,k,l,m,n) = buf_o(j,0)
1387  enddo
1388  end do
1389 #endif
1390  end do
1391  end do
1392  end do
1393  end do
1394 !$omp end do
1395 #ifdef USE_FMEMPOOL
1396  call mp_release(buf_i)
1397  call mp_release(buf_o)
1398 #else
1399  deallocate(buf_i)
1400  deallocate(buf_o)
1401 #endif
1402 !$omp end parallel
1404 
1405 
1406 
1407 
1408 
1410  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta3(self, decomposition, displacement, f6d)
1411  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
1412  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
1413  sll_real64, intent(in) :: displacement(decomposition%local%mn(6):decomposition%local%mx(6))
1414  sll_real64, intent(inout) :: f6d(&
1415  decomposition%local%mn(1):decomposition%local%mx(1), &
1416  decomposition%local%mn(2):decomposition%local%mx(2), &
1417  decomposition%local%mn(3):decomposition%local%mx(3), &
1418  decomposition%local%mn(4):decomposition%local%mx(4), &
1419  decomposition%local%mn(5):decomposition%local%mx(5), &
1420  decomposition%local%mn(6):decomposition%local%mx(6))
1421  sll_int32, parameter :: id = 3 ! eta3
1422  sll_int32 :: i,j,k,l,m,n,w,wx, lw
1423  sll_real64, pointer :: buf_i(:,:)
1424  sll_real64, pointer :: buf_o(:,:)
1425 #ifdef USE_FMEMPOOL
1426  sll_int32 :: mn(2), mx(2)
1427 #endif
1428  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
1429  sll_int32, pointer :: loop_mn(:), loop_mx(:)
1430  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
1431  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
1432 
1433  ! NOTE: Current restriction for the memory-saving `slim` halos,
1434  ! to be lowered later for the real one-sided halo implementation.
1435  ! One-sided halos require a more clever index handling (though straight-forward).
1436  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
1437  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(1)-1)/2)
1438  ! cache values to avoid multiple dereferencing inside the loops
1439  l_mn = decomposition%local%halo_left%mn(id)
1440  l_mx = decomposition%local%halo_left%mx(id)
1441  c_mn = decomposition%local%mn(id)
1442  c_mx = decomposition%local%mx(id)
1443  r_mn = decomposition%local%halo_right%mn(id)
1444  r_mx = decomposition%local%halo_right%mx(id)
1445  l_halo => decomposition%local%halo_left%buf
1446  r_halo => decomposition%local%halo_right%buf
1447  loop_mn => decomposition%local%mn
1448  loop_mx => decomposition%local%mx
1449  wx = get_wx(decomposition)
1450  lw = self%lagrange_width(1)
1451 
1452 #ifdef USE_FMEMPOOL
1453  mn = [l_mn, 0]
1454  mx = [r_mx, wx-1]
1455 #endif
1456 
1457 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
1458 #ifdef USE_FMEMPOOL
1459  call mp_acquire(buf_i, mn, mx)
1460  call mp_acquire(buf_o, mn, mx)
1461 #else
1462  allocate(buf_i(l_mn:r_mx, 0:wx-1))
1463  allocate(buf_o(l_mn:r_mx, 0:wx-1))
1464 #endif
1465 !$omp do OMP_COLLAPSE OMP_SCHEDULE
1466  do n=loop_mn(6), loop_mx(6)
1467  do m=loop_mn(5), loop_mx(5)
1468  do l=loop_mn(4), loop_mx(4)
1469  do j=loop_mn(2), loop_mx(2)
1470 ! do i=loop_mn(1), loop_mx(1)
1471 ! ! (1) fill input buffer piecewise
1472 ! buf_i(l_mn:l_mx) = l_halo(i,j,:,l,m,n)
1473 ! buf_i(c_mn:c_mx) = f6d(i,j,:,l,m,n)
1474 ! buf_i(r_mn:r_mx) = r_halo(i,j,:,l,m,n)
1475 ! ! (2) perform interpolation
1476 ! call sll_s_lagrange_interpolation_1d_fast_disp_fixed_haloc_cells( &
1477 ! buf_i, &
1478 ! buf_o, &
1479 ! displacement(n), &
1480 ! self%lagrange_width(1))
1481 ! ! (3) copy-back interpolated values
1482 ! f6d(i,j,:,l,m,n) = buf_o(c_mn:c_mx)
1483 ! end do
1484 #ifndef DISABLE_CACHE_BLOCKING
1485  do i=loop_mn(1), loop_mx(1), wx
1486  do k=l_mn,l_mx
1487  do w=0,wx-1
1488  buf_i(k,w) = l_halo(i+w,j,k,l,m,n)
1489  end do
1490  end do
1491  do k=c_mn,c_mx
1492  do w=0,wx-1
1493  buf_i(k,w) = f6d(i+w,j,k,l,m,n)
1494  end do
1495  end do
1496  do k=r_mn,r_mx
1497  do w=0,wx-1
1498  buf_i(k,w) = r_halo(i+w,j,k,l,m,n)
1499  end do
1500  end do
1501  do w=0,wx-1
1503  buf_i(:,w), &
1504  buf_o(:,w), &
1505  displacement(n), &
1506  lw)
1507  end do
1508  do k=c_mn,c_mx
1509  do w=0,wx-1
1510  f6d(i+w,j,k,l,m,n) = buf_o(k,w)
1511  end do
1512  end do
1513  end do
1514 #else
1515  do i=loop_mn(1), loop_mx(1)
1516  ! (1) fill input buffer piecewise
1517 ! buf_i(l_mn:l_mx,1) = l_halo(i,j,:,l,m,n)
1518 ! buf_i(c_mn:c_mx,1) = f6d(i,j,:,l,m,n)
1519 ! buf_i(r_mn:r_mx,1) = r_halo(i,j,:,l,m,n)
1520  do k=l_mn,l_mx
1521  buf_i(k,0) = l_halo(i,j,k,l,m,n)
1522  enddo
1523  do k=c_mn,c_mx
1524  buf_i(k,0) = f6d(i,j,k,l,m,n)
1525  enddo
1526  do k=r_mn,r_mx
1527  buf_i(k,0) = r_halo(i,j,k,l,m,n)
1528  enddo
1529  ! (2) perform interpolation
1531  buf_i(:,0), &
1532  buf_o(:,0), &
1533  displacement(n), &
1534  lw)
1535 ! f6d(i,j,:,l,m,n) = buf_o(c_mn:c_mx,1)
1536  do k=c_mn,c_mx
1537  f6d(i,j,k,l,m,n) = buf_o(k,0)
1538  enddo
1539  end do
1540 #endif
1541  end do
1542  end do
1543  end do
1544  end do
1545 !$omp end do
1546 #ifdef USE_FMEMPOOL
1547  call mp_release(buf_i)
1548  call mp_release(buf_o)
1549 #else
1550  deallocate(buf_i)
1551  deallocate(buf_o)
1552 #endif
1553 !$omp end parallel
1555 
1556 
1558  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta4(self, decomposition, displacement, f6d)
1559  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
1560  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
1561  sll_real64, intent(in) :: displacement(&
1562  decomposition%local%mn(1):decomposition%local%mx(1), &
1563  decomposition%local%mn(2):decomposition%local%mx(2), &
1564  decomposition%local%mn(3):decomposition%local%mx(3))
1565  sll_real64, intent(inout) :: f6d(&
1566  decomposition%local%mn(1):decomposition%local%mx(1), &
1567  decomposition%local%mn(2):decomposition%local%mx(2), &
1568  decomposition%local%mn(3):decomposition%local%mx(3), &
1569  decomposition%local%mn(4):decomposition%local%mx(4), &
1570  decomposition%local%mn(5):decomposition%local%mx(5), &
1571  decomposition%local%mn(6):decomposition%local%mx(6))
1572  sll_int32, parameter :: id = 4 ! eta4
1573  sll_int32 :: i,j,k,l,m,n,w,wx, lw
1574  sll_real64, pointer :: buf_i(:,:)
1575  sll_real64, pointer :: buf_o(:,:)
1576 #ifdef USE_FMEMPOOL
1577  sll_int32 :: mn(2), mx(2)
1578 #endif
1579  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
1580  sll_int32, pointer :: loop_mn(:), loop_mx(:)
1581  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
1582  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
1583 
1584  ! NOTE: Current restriction for the memory-saving `slim` halos,
1585  ! to be lowered later for the real one-sided halo implementation.
1586  ! One-sided halos require a more clever index handling (though straight-forward).
1587  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
1588  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(2)-1)/2)
1589  ! cache values to avoid multiple dereferencing inside the loops
1590  l_mn = decomposition%local%halo_left%mn(id)
1591  l_mx = decomposition%local%halo_left%mx(id)
1592  c_mn = decomposition%local%mn(id)
1593  c_mx = decomposition%local%mx(id)
1594  r_mn = decomposition%local%halo_right%mn(id)
1595  r_mx = decomposition%local%halo_right%mx(id)
1596  l_halo => decomposition%local%halo_left%buf
1597  r_halo => decomposition%local%halo_right%buf
1598  loop_mn => decomposition%local%mn
1599  loop_mx => decomposition%local%mx
1600  wx = get_wx(decomposition)
1601  lw = self%lagrange_width(2)
1602 
1603 #ifdef USE_FMEMPOOL
1604  mn = [l_mn, 0]
1605  mx = [r_mx, wx-1]
1606 #endif
1607 
1608 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
1609 #ifdef USE_FMEMPOOL
1610  call mp_acquire(buf_i, mn, mx)
1611  call mp_acquire(buf_o, mn, mx)
1612 #else
1613  allocate(buf_i(l_mn:r_mx, 0:wx-1))
1614  allocate(buf_o(l_mn:r_mx, 0:wx-1))
1615 #endif
1616 !$omp do OMP_COLLAPSE OMP_SCHEDULE
1617  do n=loop_mn(6), loop_mx(6)
1618  do m=loop_mn(5), loop_mx(5)
1619  do k=loop_mn(3), loop_mx(3)
1620  do j=loop_mn(2), loop_mx(2)
1621 ! do i=loop_mn(1), loop_mx(1)
1622 ! ! (1) fill input buffer piecewise
1623 ! buf_i(l_mn:l_mx) = l_halo(i,j,k,:,m,n)
1624 ! buf_i(c_mn:c_mx) = f6d(i,j,k,:,m,n)
1625 ! buf_i(r_mn:r_mx) = r_halo(i,j,k,:,m,n)
1626 ! ! (2) perform interpolation
1627 ! call sll_s_lagrange_interpolation_1d_fast_disp_fixed_haloc_cells( &
1628 ! buf_i, &
1629 ! buf_o, &
1630 ! displacement(i,j,k), &
1631 ! self%lagrange_width(2))
1632 ! ! (3) copy-back interpolated values
1633 ! f6d(i,j,k,:,m,n) = buf_o(c_mn:c_mx)
1634 ! end do
1635 #ifndef DISABLE_CACHE_BLOCKING
1636  do i=loop_mn(1), loop_mx(1), wx
1637  do l=l_mn,l_mx
1638  do w=0,wx-1
1639  buf_i(l,w) = l_halo(i+w,j,k,l,m,n)
1640  end do
1641  end do
1642  do l=c_mn,c_mx
1643  do w=0,wx-1
1644  buf_i(l,w) = f6d(i+w,j,k,l,m,n)
1645  end do
1646  end do
1647  do l=r_mn,r_mx
1648  do w=0,wx-1
1649  buf_i(l,w) = r_halo(i+w,j,k,l,m,n)
1650  end do
1651  end do
1652  do w=0,wx-1
1654  buf_i(:,w), &
1655  buf_o(:,w), &
1656  displacement(i+w,j,k), &
1657  lw)
1658  end do
1659  do l=c_mn,c_mx
1660  do w=0,wx-1
1661  f6d(i+w,j,k,l,m,n) = buf_o(l,w)
1662  end do
1663  end do
1664  end do
1665 #else
1666  do i=loop_mn(1), loop_mx(1)
1667  ! (1) fill input buffer piecewise
1668 ! buf_i(l_mn:l_mx) = l_halo(i,j,k,:,m,n)
1669 ! buf_i(c_mn:c_mx) = f6d(i,j,k,:,m,n)
1670 ! buf_i(r_mn:r_mx) = r_halo(i,j,k,:,m,n)
1671  do l=l_mn,l_mx
1672  buf_i(l,0) = l_halo(i,j,k,l,m,n)
1673  enddo
1674  do l=c_mn,c_mx
1675  buf_i(l,0) = f6d(i,j,k,l,m,n)
1676  enddo
1677  do l=r_mn,r_mx
1678  buf_i(l,0) = r_halo(i,j,k,l,m,n)
1679  enddo
1680  ! (2) perform interpolation
1682  buf_i(:,0), &
1683  buf_o(:,0), &
1684  displacement(i,j,k), &
1685  lw)
1686  do l=c_mn,c_mx
1687  f6d(i,j,k,l,m,n) = buf_o(l,0)
1688  enddo
1689  end do
1690 #endif
1691  end do
1692  end do
1693  end do
1694  end do
1695 !$omp end do
1696 #ifdef USE_FMEMPOOL
1697  call mp_release(buf_i)
1698  call mp_release(buf_o)
1699 #else
1700  deallocate(buf_i)
1701  deallocate(buf_o)
1702 #endif
1703 !$omp end parallel
1705 
1706 
1708  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta5(self, decomposition, displacement, f6d)
1709  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
1710  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
1711  sll_real64, intent(in) :: displacement(&
1712  decomposition%local%mn(1):decomposition%local%mx(1), &
1713  decomposition%local%mn(2):decomposition%local%mx(2), &
1714  decomposition%local%mn(3):decomposition%local%mx(3))
1715  sll_real64, intent(inout) :: f6d(&
1716  decomposition%local%mn(1):decomposition%local%mx(1), &
1717  decomposition%local%mn(2):decomposition%local%mx(2), &
1718  decomposition%local%mn(3):decomposition%local%mx(3), &
1719  decomposition%local%mn(4):decomposition%local%mx(4), &
1720  decomposition%local%mn(5):decomposition%local%mx(5), &
1721  decomposition%local%mn(6):decomposition%local%mx(6))
1722  sll_int32, parameter :: id = 5 ! eta5
1723  sll_int32 :: i,j,k,l,m,n,w,wx, lw
1724  sll_real64, pointer :: buf_i(:,:)
1725  sll_real64, pointer :: buf_o(:,:)
1726 #ifdef USE_FMEMPOOL
1727  sll_int32 :: mn(2), mx(2)
1728 #endif
1729  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
1730  sll_int32, pointer :: loop_mn(:), loop_mx(:)
1731  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
1732  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
1733 
1734  ! NOTE: Current restriction for the memory-saving `slim` halos,
1735  ! to be lowered later for the real one-sided halo implementation.
1736  ! One-sided halos require a more clever index handling (though straight-forward).
1737  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
1738  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(2)-1)/2)
1739  ! cache values to avoid multiple dereferencing inside the loops
1740  l_mn = decomposition%local%halo_left%mn(id)
1741  l_mx = decomposition%local%halo_left%mx(id)
1742  c_mn = decomposition%local%mn(id)
1743  c_mx = decomposition%local%mx(id)
1744  r_mn = decomposition%local%halo_right%mn(id)
1745  r_mx = decomposition%local%halo_right%mx(id)
1746  l_halo => decomposition%local%halo_left%buf
1747  r_halo => decomposition%local%halo_right%buf
1748  loop_mn => decomposition%local%mn
1749  loop_mx => decomposition%local%mx
1750  wx = get_wx(decomposition)
1751  lw = self%lagrange_width(2)
1752 
1753 #ifdef USE_FMEMPOOL
1754  mn = [l_mn, 0]
1755  mx = [r_mx, wx-1]
1756 #endif
1757 
1758 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
1759 #ifdef USE_FMEMPOOL
1760  call mp_acquire(buf_i, mn, mx)
1761  call mp_acquire(buf_o, mn, mx)
1762 #else
1763  allocate(buf_i(l_mn:r_mx, 0:wx-1))
1764  allocate(buf_o(l_mn:r_mx, 0:wx-1))
1765 #endif
1766 !$omp do OMP_COLLAPSE OMP_SCHEDULE
1767  do n=loop_mn(6), loop_mx(6)
1768  do l=loop_mn(4), loop_mx(4)
1769  do k=loop_mn(3), loop_mx(3)
1770  do j=loop_mn(2), loop_mx(2)
1771 ! do i=loop_mn(1), loop_mx(1)
1772 ! ! (1) fill input buffer piecewise
1773 ! buf_i(l_mn:l_mx) = l_halo(i,j,k,l,:,n)
1774 ! buf_i(c_mn:c_mx) = f6d(i,j,k,l,:,n)
1775 ! buf_i(r_mn:r_mx) = r_halo(i,j,k,l,:,n)
1776 ! ! (2) perform interpolation
1777 ! call sll_s_lagrange_interpolation_1d_fast_disp_fixed_haloc_cells( &
1778 ! buf_i, &
1779 ! buf_o, &
1780 ! displacement(i,j,k), &
1781 ! self%lagrange_width(2))
1782 ! ! (3) copy-back interpolated values
1783 ! f6d(i,j,k,l,:,n) = buf_o(c_mn:c_mx)
1784 ! end do
1785 #ifndef DISABLE_CACHE_BLOCKING
1786  do i=loop_mn(1), loop_mx(1), wx
1787  do m=l_mn,l_mx
1788  do w=0,wx-1
1789  buf_i(m,w) = l_halo(i+w,j,k,l,m,n)
1790  end do
1791  end do
1792  do m=c_mn,c_mx
1793  do w=0,wx-1
1794  buf_i(m,w) = f6d(i+w,j,k,l,m,n)
1795  end do
1796  end do
1797  do m=r_mn,r_mx
1798  do w=0,wx-1
1799  buf_i(m,w) = r_halo(i+w,j,k,l,m,n)
1800  end do
1801  end do
1802 
1803  do w=0,wx-1
1805  buf_i(:,w), &
1806  buf_o(:,w), &
1807  displacement(i+w,j,k), &
1808  lw)
1809  end do
1810  do m=c_mn,c_mx
1811  do w=0,wx-1
1812  f6d(i+w,j,k,l,m,n) = buf_o(m,w)
1813  end do
1814  end do
1815  end do
1816 #else
1817  do i=loop_mn(1), loop_mx(1)
1818  ! (1) fill input buffer piecewise
1819  do m=l_mn,l_mx
1820  buf_i(m,0) = l_halo(i,j,k,l,m,n)
1821  enddo
1822  do m=c_mn,c_mx
1823  buf_i(m,0) = f6d(i,j,k,l,m,n)
1824  enddo
1825  do m=r_mn,r_mx
1826  buf_i(m,0) = r_halo(i,j,k,l,m,n)
1827  enddo
1828  ! (2) perform interpolation
1830  buf_i(:,0), &
1831  buf_o(:,0), &
1832  displacement(i,j,k), &
1833  lw)
1834  do m=c_mn,c_mx
1835  f6d(i,j,k,l,m,n) = buf_o(m,0)
1836  enddo
1837  end do
1838 #endif
1839  end do
1840  end do
1841  end do
1842  end do
1843 !$omp end do
1844 #ifdef USE_FMEMPOOL
1845  call mp_release(buf_i)
1846  call mp_release(buf_o)
1847 #else
1848  deallocate(buf_i)
1849  deallocate(buf_o)
1850 #endif
1851 !$omp end parallel
1853 
1854 
1856  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta6(self, decomposition, displacement, f6d)
1857  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
1858  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
1859  sll_real64, intent(in) :: displacement(&
1860  decomposition%local%mn(1):decomposition%local%mx(1), &
1861  decomposition%local%mn(2):decomposition%local%mx(2), &
1862  decomposition%local%mn(3):decomposition%local%mx(3))
1863  sll_real64, intent(inout) :: f6d(&
1864  decomposition%local%mn(1):decomposition%local%mx(1), &
1865  decomposition%local%mn(2):decomposition%local%mx(2), &
1866  decomposition%local%mn(3):decomposition%local%mx(3), &
1867  decomposition%local%mn(4):decomposition%local%mx(4), &
1868  decomposition%local%mn(5):decomposition%local%mx(5), &
1869  decomposition%local%mn(6):decomposition%local%mx(6))
1870  sll_int32, parameter :: id = 6 ! eta6
1871  sll_int32 :: i,j,k,l,m,n,w,wx, lw
1872  sll_real64, pointer :: buf_i(:,:)
1873  sll_real64, pointer :: buf_o(:,:)
1874 #ifdef USE_FMEMPOOL
1875  sll_int32 :: mn(2), mx(2)
1876 #endif
1877  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
1878  sll_int32, pointer :: loop_mn(:), loop_mx(:)
1879  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
1880  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
1881 
1882  ! NOTE: Current restriction for the memory-saving `slim` halos,
1883  ! to be lowered later for the real one-sided halo implementation.
1884  ! One-sided halos require a more clever index handling (though straight-forward).
1885  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
1886  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(2)-1)/2)
1887  ! cache values to avoid multiple dereferencing inside the loops
1888  l_mn = decomposition%local%halo_left%mn(id)
1889  l_mx = decomposition%local%halo_left%mx(id)
1890  c_mn = decomposition%local%mn(id)
1891  c_mx = decomposition%local%mx(id)
1892  r_mn = decomposition%local%halo_right%mn(id)
1893  r_mx = decomposition%local%halo_right%mx(id)
1894  l_halo => decomposition%local%halo_left%buf
1895  r_halo => decomposition%local%halo_right%buf
1896  loop_mn => decomposition%local%mn
1897  loop_mx => decomposition%local%mx
1898  wx = get_wx(decomposition)
1899  lw = self%lagrange_width(2)
1900 
1901 #ifdef USE_FMEMPOOL
1902  mn = [l_mn, 0]
1903  mx = [r_mx, wx-1]
1904 #endif
1905 
1906 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o)
1907 #ifdef USE_FMEMPOOL
1908  call mp_acquire(buf_i, mn, mx)
1909  call mp_acquire(buf_o, mn, mx)
1910 #else
1911  allocate(buf_i(l_mn:r_mx, 0:wx-1))
1912  allocate(buf_o(l_mn:r_mx, 0:wx-1))
1913 #endif
1914 !$omp do OMP_COLLAPSE OMP_SCHEDULE
1915  do m=loop_mn(5), loop_mx(5)
1916  do l=loop_mn(4), loop_mx(4)
1917  do k=loop_mn(3), loop_mx(3)
1918  do j=loop_mn(2), loop_mx(2)
1919 
1920 ! do i=loop_mn(1), loop_mx(1)
1921 ! ! (1) fill input buffer piecewise
1922 ! buf_i(l_mn:l_mx) = l_halo(i,j,k,l,m,:)
1923 ! buf_i(c_mn:c_mx) = f6d(i,j,k,l,m,:)
1924 ! buf_i(r_mn:r_mx) = r_halo(i,j,k,l,m,:)
1925 ! ! (2) perform interpolation
1926 ! call sll_s_lagrange_interpolation_1d_fast_disp_fixed_haloc_cells( &
1927 ! buf_i, &
1928 ! buf_o, &
1929 ! displacement(i,j,k), &
1930 ! self%lagrange_width(2))
1931 ! ! (3) copy-back interpolated values
1932 ! f6d(i,j,k,l,m,:) = buf_o(c_mn:c_mx)
1933 ! end do
1934 #ifndef DISABLE_CACHE_BLOCKING
1935  do i=loop_mn(1), loop_mx(1), wx
1936  do n=l_mn,l_mx
1937  do w=0,wx-1
1938  buf_i(n,w) = l_halo(i+w,j,k,l,m,n)
1939  end do
1940  end do
1941  do n=c_mn,c_mx
1942  do w=0,wx-1
1943  buf_i(n,w) = f6d(i+w,j,k,l,m,n)
1944  end do
1945  end do
1946  do n=r_mn,r_mx
1947  do w=0,wx-1
1948  buf_i(n,w) = r_halo(i+w,j,k,l,m,n)
1949  end do
1950  end do
1951  do w=0,wx-1
1953  buf_i(:,w), &
1954  buf_o(:,w), &
1955  displacement(i+w,j,k), &
1956  lw)
1957  end do
1958  do n=c_mn,c_mx
1959  do w=0,wx-1
1960  f6d(i+w,j,k,l,m,n) = buf_o(n,w)
1961  end do
1962  end do
1963  end do
1964 #else
1965  do i=loop_mn(1), loop_mx(1)
1966  ! (1) fill input buffer piecewise
1967  do n=l_mn,l_mx
1968  buf_i(n,0) = l_halo(i,j,k,l,m,n)
1969  enddo
1970  do n=c_mn,c_mx
1971  buf_i(n,0) = f6d(i,j,k,l,m,n)
1972  enddo
1973  do n=r_mn,r_mx
1974  buf_i(n,0) = r_halo(i,j,k,l,m,n)
1975  enddo
1976  ! (2) perform interpolation
1978  buf_i(:,0), &
1979  buf_o(:,0), &
1980  displacement(i,j,k), &
1981  lw)
1982 ! f6d(i,j,k,:,m,n) = buf_o(c_mn:c_mx)
1983  do n=c_mn,c_mx
1984  f6d(i,j,k,l,m,n) = buf_o(n,0)
1985  enddo
1986  end do
1987 #endif
1988  end do
1989  end do
1990  end do
1991  end do
1992 !$omp end do
1993 #ifdef USE_FMEMPOOL
1994  call mp_release(buf_i)
1995  call mp_release(buf_o)
1996 #else
1997  deallocate(buf_i)
1998  deallocate(buf_o)
1999 #endif
2000 !$omp end parallel
2002 
2003 
2004 
2006  subroutine sll_s_advection_6d_clagrange_dd_slim_advect_eta1_d45(self, decomposition, displacement, f6d)
2007 
2008  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
2009  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
2010  sll_real64, intent(in) :: displacement(decomposition%local%mn(4):decomposition%local%mx(4),decomposition%local%mn(5):decomposition%local%mx(5))
2011  sll_real64, intent(inout) :: f6d(&
2012  decomposition%local%mn(1):decomposition%local%mx(1), &
2013  decomposition%local%mn(2):decomposition%local%mx(2), &
2014  decomposition%local%mn(3):decomposition%local%mx(3), &
2015  decomposition%local%mn(4):decomposition%local%mx(4), &
2016  decomposition%local%mn(5):decomposition%local%mx(5), &
2017  decomposition%local%mn(6):decomposition%local%mx(6))
2018  ! ---
2019  sll_int32, parameter :: id = 1 ! eta1
2020  sll_int32 :: i,j,k,l,m,n,w,wx, n_omp_threads, deg
2021  sll_real64, pointer :: buf_i(:,:)
2022  sll_real64, pointer :: buf_o(:,:)
2023  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx, l_mn_l, r_mx_l
2024  sll_int32, pointer :: loop_mn(:), loop_mx(:)
2025  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
2026  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
2027 #ifdef USE_FMEMPOOL
2028  sll_int32 :: mn(2), mx(2)
2029 #endif
2030  sll_int32 :: si
2031  sll_real64 :: alpha
2032 
2033  ! cache values to avoid multiple dereferencing inside the loops
2034  l_mn = decomposition%local%halo_left%mn(id) !decomposition%local%halo_left%mn(id)
2035  l_mx = decomposition%local%halo_left%mx(id) !decomposition%local%halo_left%mx(id)
2036  c_mn = decomposition%local%mn(id)
2037  c_mx = decomposition%local%mx(id)
2038  r_mn = decomposition%local%halo_right%mn(id) !decomposition%local%halo_right%mn(id)
2039  r_mx = decomposition%local%halo_right%mx(id) !decomposition%local%halo_right%mx(id)
2040  l_halo => decomposition%local%halo_left%buf
2041  r_halo => decomposition%local%halo_right%buf
2042  loop_mn => decomposition%local%mn
2043  loop_mx => decomposition%local%mx
2044  ! NOTE: Buffering is done here for the second dim "j".
2045  ! Seems to be marginally beneficial.
2046  wx = get_wx(decomposition, 2)
2047  deg = self%lagrange_width(1)
2048 #ifdef USE_FMEMPOOL
2049  mn = [l_mn, 0]
2050  mx = [r_mx, wx-1]
2051 #endif
2052 
2053 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o,si,alpha,l_mn_l,r_mx_l)
2054 ! allocate(buf_i(l_mn:r_mx,0:wx-1))
2055 ! allocate(buf_o(l_mn:r_mx,0:wx-1))
2056 #ifdef USE_FMEMPOOL
2057  call mp_acquire(buf_i, mn, mx)
2058  call mp_acquire(buf_o, mn, mx)
2059 #else
2060  allocate(buf_i(l_mn:r_mx, 0:wx-1))
2061  allocate(buf_o(l_mn:r_mx, 0:wx-1))
2062 #endif
2063 !$omp do OMP_COLLAPSE OMP_SCHEDULE
2064  do n=loop_mn(6), loop_mx(6)
2065  do m=loop_mn(5), loop_mx(5)
2066  do l=loop_mn(4), loop_mx(4)
2067 
2068  si = floor(displacement(l,m))
2069  alpha = displacement(l,m) - real(si,f64)
2070  l_mn_l = -deg/2+1+si+c_mn
2071  r_mx_l = c_mx+deg/2+si
2072 
2073  do k=loop_mn(3), loop_mx(3)
2074 #ifndef DISABLE_CACHE_BLOCKING
2075  do j=loop_mn(2), loop_mx(2), wx
2076  do w=0,wx-1
2077  do i=l_mn_l,l_mx
2078  buf_i(i,w) = l_halo(i,j+w,k,l,m,n)
2079  end do
2080  end do
2081  do w=0,wx-1
2082  do i=c_mn,c_mx
2083  buf_i(i,w) = f6d(i,j+w,k,l,m,n)
2084  end do
2085  end do
2086  do w=0,wx-1
2087  do i=r_mn,r_mx_l
2088  buf_i(i,w) = r_halo(i,j+w,k,l,m,n)
2089  end do
2090  end do
2091  do w=0,wx-1
2093  buf_i(l_mn_l:r_mx_l,w), &
2094  buf_o(l_mn_l:r_mx_l,w), &
2095  alpha, &
2096  deg)
2097  end do
2098  do w=0,wx-1
2099  do i=c_mn,c_mx
2100  f6d(i,j+w,k,l,m,n) = buf_o(i+si,w)
2101  end do
2102  end do
2103  end do
2104 #else
2105  do j=loop_mn(2), loop_mx(2)
2106  ! (1) fill input buffer piecewise
2107  do i=l_mn_l,l_mx
2108  buf_i(i,0) = l_halo(i,j,k,l,m,n)
2109  enddo
2110  do i=c_mn,c_mx
2111  buf_i(i,0) = f6d(i,j,k,l,m,n)
2112  enddo
2113  do i=r_mn,r_mx_l
2114  buf_i(i,0) = r_halo(i,j,k,l,m,n)
2115  enddo
2116  ! (2) perform interpolation
2118  buf_i(l_mn_l:r_mx_l,0), &
2119  buf_o(l_mn_l:r_mx_l,0), &
2120  alpha, &
2121  deg)
2122  ! (3) copy-back interpolated values
2123  do i=c_mn,c_mx
2124  f6d(i,j,k,l,m,n) = buf_o(i+si,0)
2125  enddo
2126  end do
2127 #endif
2128  end do
2129  end do
2130  end do
2131  end do
2132 !$omp end do
2133 #ifdef USE_FMEMPOOL
2134  call mp_release(buf_i)
2135  call mp_release(buf_o)
2136 #else
2137  deallocate(buf_i)
2138  deallocate(buf_o)
2139 #endif
2140 !$omp end parallel
2142 
2143 
2145  subroutine sll_s_advection_6d_clagrange_dd_slim_advect_eta2_d45(self, decomposition, displacement, f6d)
2146 
2147  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
2148  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
2149  sll_real64, intent(in) :: displacement(decomposition%local%mn(4):decomposition%local%mx(4),decomposition%local%mn(5):decomposition%local%mx(5))
2150  sll_real64, intent(inout) :: f6d(&
2151  decomposition%local%mn(1):decomposition%local%mx(1), &
2152  decomposition%local%mn(2):decomposition%local%mx(2), &
2153  decomposition%local%mn(3):decomposition%local%mx(3), &
2154  decomposition%local%mn(4):decomposition%local%mx(4), &
2155  decomposition%local%mn(5):decomposition%local%mx(5), &
2156  decomposition%local%mn(6):decomposition%local%mx(6))
2157  ! ---
2158  sll_int32, parameter :: id = 2 ! eta2
2159  sll_int32 :: i,j,k,l,m,n,w,wx, n_omp_threads, deg
2160  sll_real64, pointer :: buf_i(:,:)
2161  sll_real64, pointer :: buf_o(:,:)
2162  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx, l_mn_l, r_mx_l
2163  sll_int32, pointer :: loop_mn(:), loop_mx(:)
2164  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
2165  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
2166 #ifdef USE_FMEMPOOL
2167  sll_int32 :: mn(2), mx(2)
2168 #endif
2169  sll_int32 :: si
2170  sll_real64 :: alpha
2171 
2172  ! cache values to avoid multiple dereferencing inside the loops
2173  l_mn = decomposition%local%halo_left%mn(id) !decomposition%local%halo_left%mn(id)
2174  l_mx = decomposition%local%halo_left%mx(id) !decomposition%local%halo_left%mx(id)
2175  c_mn = decomposition%local%mn(id)
2176  c_mx = decomposition%local%mx(id)
2177  r_mn = decomposition%local%halo_right%mn(id) !decomposition%local%halo_right%mn(id)
2178  r_mx = decomposition%local%halo_right%mx(id) !decomposition%local%halo_right%mx(id)
2179  l_halo => decomposition%local%halo_left%buf
2180  r_halo => decomposition%local%halo_right%buf
2181  loop_mn => decomposition%local%mn
2182  loop_mx => decomposition%local%mx
2183  wx = get_wx(decomposition, 1)
2184  deg = self%lagrange_width(1)
2185 #ifdef USE_FMEMPOOL
2186  mn = [l_mn, 0]
2187  mx = [r_mx, wx-1]
2188 #endif
2189 
2190 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o,si,alpha,l_mn_l,r_mx_l)
2191 ! allocate(buf_i(l_mn:r_mx,0:wx-1))
2192 ! allocate(buf_o(l_mn:r_mx,0:wx-1))
2193 #ifdef USE_FMEMPOOL
2194  call mp_acquire(buf_i, mn, mx)
2195  call mp_acquire(buf_o, mn, mx)
2196 #else
2197  allocate(buf_i(l_mn:r_mx, 0:wx-1))
2198  allocate(buf_o(l_mn:r_mx, 0:wx-1))
2199 #endif
2200 !$omp do OMP_COLLAPSE OMP_SCHEDULE
2201  do n=loop_mn(6), loop_mx(6)
2202  do m=loop_mn(5), loop_mx(5)
2203  do l=loop_mn(4), loop_mx(4)
2204 
2205  si = floor(displacement(l,m))
2206  alpha = displacement(l,m) - real(si,f64)
2207  l_mn_l = -deg/2+1+si+c_mn
2208  r_mx_l = c_mx+deg/2+si
2209 
2210  do k=loop_mn(3), loop_mx(3)
2211 #ifndef DISABLE_CACHE_BLOCKING
2212  do i=loop_mn(1), loop_mx(1), wx
2213  do j=l_mn_l,l_mx
2214  do w=0,wx-1
2215  buf_i(j,w) = l_halo(i+w,j,k,l,m,n)
2216  end do
2217  end do
2218  do j=c_mn,c_mx
2219  do w=0,wx-1
2220  buf_i(j,w) = f6d(i+w,j,k,l,m,n)
2221  end do
2222  end do
2223  do j=r_mn,r_mx_l
2224  do w=0,wx-1
2225  buf_i(j,w) = r_halo(i+w,j,k,l,m,n)
2226  end do
2227  end do
2228  do w=0,wx-1
2230  buf_i(l_mn_l:r_mx_l,w), &
2231  buf_o(l_mn_l:r_mx_l,w), &
2232  alpha, &
2233  deg)
2234  end do
2235  do j=c_mn,c_mx
2236  do w=0,wx-1
2237  f6d(i+w,j,k,l,m,n) = buf_o(j+si,w)
2238  end do
2239  end do
2240  end do
2241 #else
2242  do i=loop_mn(1), loop_mx(1)
2243  do j=l_mn_l,l_mx
2244  buf_i(j,0) = l_halo(i,j,k,l,m,n)
2245  enddo
2246  do j=c_mn,c_mx
2247  buf_i(j,0) = f6d(i,j,k,l,m,n)
2248  enddo
2249  do j=r_mn,r_mx_l
2250  buf_i(j,0) = r_halo(i,j,k,l,m,n)
2251  enddo
2253  buf_i(l_mn_l:r_mx_l,0), &
2254  buf_o(l_mn_l:r_mx_l,0), &
2255  alpha, &
2256  deg)
2257  do j=c_mn,c_mx
2258  f6d(i,j,k,l,m,n) = buf_o(j+si,0)
2259  enddo
2260  end do
2261 #endif
2262  end do
2263  end do
2264  end do
2265  end do
2266 !$omp end do
2267 #ifdef USE_FMEMPOOL
2268  call mp_release(buf_i)
2269  call mp_release(buf_o)
2270 #else
2271  deallocate(buf_i)
2272  deallocate(buf_o)
2273 #endif
2274 !$omp end parallel
2276 
2277 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2278 !
2279 !
2280 ! From here the block of the advection routines for x with x dependent velocities
2281 !
2282 !
2283 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2284 
2286  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta1_givenv(self, decomposition, rdelta_x, velocity_grid, velocity_shift, rotation_matrix, propagation_matrix, f6d)
2287  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
2288  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
2289  type(sll_t_array), intent(in) :: velocity_grid(6)
2290  sll_real64, intent(in) :: velocity_shift(decomposition%local%mn(1):decomposition%local%mx(1), &
2291  decomposition%local%mn(2):decomposition%local%mx(2), &
2292  decomposition%local%mn(3):decomposition%local%mx(3),3)
2293  sll_real64, intent(in) :: rotation_matrix(3,3)
2294  sll_real64, intent(in) :: propagation_matrix(3,3)
2295  sll_real64, intent(in) :: rdelta_x
2296  sll_real64, intent(inout) :: f6d(&
2297  decomposition%local%mn(1):decomposition%local%mx(1), &
2298  decomposition%local%mn(2):decomposition%local%mx(2), &
2299  decomposition%local%mn(3):decomposition%local%mx(3), &
2300  decomposition%local%mn(4):decomposition%local%mx(4), &
2301  decomposition%local%mn(5):decomposition%local%mx(5), &
2302  decomposition%local%mn(6):decomposition%local%mx(6))
2303  sll_int32, parameter :: id = 1 ! eta1
2304  sll_int32 :: i,j,k,l,m,n,w,wx, lw
2305  sll_real64, pointer :: buf_i(:,:)
2306  sll_real64, pointer :: buf_o(:,:)
2307  sll_real64, pointer :: displacement(:,:)
2308 #ifdef USE_FMEMPOOL
2309  sll_int32 :: mn(2), mx(2), mno(2), mxo(2)
2310 #endif
2311  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
2312  sll_int32, pointer :: loop_mn(:), loop_mx(:)
2313  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
2314  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
2315  sll_real64 :: v_coord(3)
2316 
2317  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
2318  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(1)-1)/2)
2319  ! cache values to avoid multiple dereferencing inside the loops
2320  l_mn = decomposition%local%halo_left%mn(id)
2321  l_mx = decomposition%local%halo_left%mx(id)
2322  c_mn = decomposition%local%mn(id)
2323  c_mx = decomposition%local%mx(id)
2324  r_mn = decomposition%local%halo_right%mn(id)
2325  r_mx = decomposition%local%halo_right%mx(id)
2326  l_halo => decomposition%local%halo_left%buf
2327  r_halo => decomposition%local%halo_right%buf
2328  loop_mn => decomposition%local%mn
2329  loop_mx => decomposition%local%mx
2330  ! NOTE: Buffering is done here for the second dim "j".
2331  ! Seems to be marginally beneficial.
2332  wx = get_wx(decomposition, 2)
2333  lw = self%lagrange_width(1)
2334 
2335 #ifdef USE_FMEMPOOL
2336  mn = [l_mn, 0]
2337  mx = [r_mx, wx-1]
2338  mno = [c_mn, 0]
2339  mxo = [c_mx, wx-1]
2340 #endif
2341 
2342 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o,displacement, v_coord)
2343 #ifdef USE_FMEMPOOL
2344  call mp_acquire(buf_i, mn, mx)
2345  call mp_acquire(buf_o, mno, mxo)
2346 #else
2347  allocate(buf_i(l_mn:r_mx, 0:wx-1))
2348  allocate(buf_o(c_mn:c_mx, 0:wx-1))
2349 #endif
2350  allocate(displacement(c_mn:c_mx, 0:wx-1))
2351 !$omp do OMP_COLLAPSE OMP_SCHEDULE
2352  do n=loop_mn(6), loop_mx(6)
2353  do m=loop_mn(5), loop_mx(5)
2354  do l=loop_mn(4), loop_mx(4)
2355  do k=loop_mn(3), loop_mx(3)
2356 #ifndef DISABLE_CACHE_BLOCKING
2357  do j=loop_mn(2), loop_mx(2), wx
2358  do w=0,wx-1
2359  do i=l_mn,l_mx
2360  buf_i(i,w) = l_halo(i,j+w,k,l,m,n)
2361  end do
2362  end do
2363  do w=0,wx-1
2364  do i=c_mn,c_mx
2365  buf_i(i,w) = f6d(i,j+w,k,l,m,n)
2366  end do
2367  end do
2368  do w=0,wx-1
2369  do i=r_mn,r_mx
2370  buf_i(i,w) = r_halo(i,j+w,k,l,m,n)
2371  end do
2372  end do
2373 
2374 
2375  ! Compute the displacement vector
2376 
2377  do w=0,wx-1
2378  do i=c_mn, c_mx
2379 
2380  v_coord(1) = rotation_matrix(1,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2381  rotation_matrix(1,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2382  rotation_matrix(1,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2383  velocity_shift(i,j+w,k,1)
2384  v_coord(2) = rotation_matrix(2,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2385  rotation_matrix(2,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2386  rotation_matrix(2,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2387  velocity_shift(i,j+w,k,2)
2388  v_coord(3) = rotation_matrix(3,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2389  rotation_matrix(3,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2390  rotation_matrix(3,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2391  velocity_shift(i,j+w,k,3)
2392 
2393  displacement(i,w) = rdelta_x * ( &
2394  propagation_matrix(1,1) * v_coord(1)+&
2395  propagation_matrix(1,2) * v_coord(2)+&
2396  propagation_matrix(1,3) * v_coord(3) )
2397  if (abs(displacement(i,w))> 1.0_f64 ) then
2398  print*, i,j+w,k,l,m,n, displacement(i,w)
2399  stop
2400  end if
2401  !print*, v_coord
2402  end do
2403  end do
2404  !print*, displacement
2405 
2406  do w=0,wx-1
2407  ! (2) perform interpolation
2409  buf_i(:,w), &
2410  buf_o(:,w), &
2411  displacement(:,w), &
2412  lw, &
2413  0)
2414  end do
2415  do w=0,wx-1
2416  do i=c_mn,c_mx
2417  f6d(i,j+w,k,l,m,n) = buf_o(i,w)
2418  end do
2419  end do
2420  end do
2421 #else
2422  do j=loop_mn(2), loop_mx(2)
2423 
2424  ! (1) fill input buffer piecewise
2425 ! buf_i(l_mn:l_mx,1) = l_halo(:,j,k,l,m,n)
2426 ! buf_i(c_mn:c_mx,1) = f6d(:,j,k,l,m,n)
2427 ! buf_i(r_mn:r_mx,1) = r_halo(:,j,k,l,m,n)
2428  do i=l_mn,l_mx
2429  buf_i(i,0) = l_halo(i,j,k,l,m,n)
2430  enddo
2431  do i=c_mn,c_mx
2432  buf_i(i,0) = f6d(i,j,k,l,m,n)
2433  enddo
2434  do i=r_mn,r_mx
2435  buf_i(i,0) = r_halo(i,j,k,l,m,n)
2436  enddo
2437 
2438  ! Compute the displacement vector
2439  do i=c_mn, c_mx
2440  v_coord(1) = rotation_matrix(1,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2441  rotation_matrix(1,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2442  rotation_matrix(1,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2443  velocity_shift(i,j,k,1)
2444  v_coord(2) = rotation_matrix(2,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2445  rotation_matrix(2,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2446  rotation_matrix(2,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2447  velocity_shift(i,j,k,2)
2448  v_coord(3) = rotation_matrix(3,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2449  rotation_matrix(3,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2450  rotation_matrix(3,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2451  velocity_shift(i,j,k,3)
2452 
2453  displacement(i,0) = rdelta_x * ( &
2454  propagation_matrix(1,1) * v_coord(1)+&
2455  propagation_matrix(1,2) * v_coord(2)+&
2456  propagation_matrix(1,3) * v_coord(3) )
2457  end do
2458 
2459  ! (2) perform interpolation
2461  buf_i(:,0), &
2462  buf_o(:,0), &
2463  displacement, &
2464  lw, &
2465  0)
2466 
2467  ! (3) copy-back interpolated values
2468  ! f6d(:,j,k,l,m,n) = buf_o(c_mn:c_mx,1)
2469  do i=c_mn,c_mx
2470  f6d(i,j,k,l,m,n) = buf_o(i,0)
2471  enddo
2472  end do
2473 #endif
2474  end do
2475  end do
2476  end do
2477  end do
2478 !$omp end do
2479 #ifdef USE_FMEMPOOL
2480  call mp_release(buf_i)
2481  call mp_release(buf_o)
2482 #else
2483  deallocate(buf_i)
2484  deallocate(buf_o)
2485 #endif
2486  deallocate(displacement)
2487 !$omp end parallel
2489 
2491  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta2_givenv(self, decomposition, rdelta_x, velocity_grid, velocity_shift, rotation_matrix, propagation_matrix, f6d)
2492  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
2493  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
2494  type(sll_t_array), intent(in) :: velocity_grid(6)
2495  sll_real64, intent(in) :: velocity_shift(decomposition%local%mn(1):decomposition%local%mx(1), &
2496  decomposition%local%mn(2):decomposition%local%mx(2), &
2497  decomposition%local%mn(3):decomposition%local%mx(3),3)
2498  sll_real64, intent(in) :: rotation_matrix(3,3)
2499  sll_real64, intent(in) :: propagation_matrix(3,3)
2500  sll_real64, intent(in) :: rdelta_x
2501  sll_real64, intent(inout) :: f6d(&
2502  decomposition%local%mn(1):decomposition%local%mx(1), &
2503  decomposition%local%mn(2):decomposition%local%mx(2), &
2504  decomposition%local%mn(3):decomposition%local%mx(3), &
2505  decomposition%local%mn(4):decomposition%local%mx(4), &
2506  decomposition%local%mn(5):decomposition%local%mx(5), &
2507  decomposition%local%mn(6):decomposition%local%mx(6))
2508  sll_int32, parameter :: id = 2 ! eta2
2509  sll_int32 :: i,j,k,l,m,n,w,wx, lw
2510  sll_real64, pointer :: buf_i(:,:)
2511  sll_real64, pointer :: buf_o(:,:)
2512  sll_real64, pointer :: displacement(:,:)
2513 #ifdef USE_FMEMPOOL
2514  sll_int32 :: mn(2), mx(2), mno(2), mxo(2)
2515 #endif
2516  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
2517  sll_int32, pointer :: loop_mn(:), loop_mx(:)
2518  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
2519  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
2520  sll_real64 :: v_coord(3)
2521 
2522  ! NOTE: Current restriction for the memory-saving `slim` halos,
2523  ! to be lowered later for the real one-sided halo implementation.
2524  ! One-sided halos require a more clever index handling (though straight-forward).
2525  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
2526  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(1)-1)/2)
2527  ! cache values to avoid multiple dereferencing inside the loops
2528  l_mn = decomposition%local%halo_left%mn(id)
2529  l_mx = decomposition%local%halo_left%mx(id)
2530  c_mn = decomposition%local%mn(id)
2531  c_mx = decomposition%local%mx(id)
2532  r_mn = decomposition%local%halo_right%mn(id)
2533  r_mx = decomposition%local%halo_right%mx(id)
2534  l_halo => decomposition%local%halo_left%buf
2535  r_halo => decomposition%local%halo_right%buf
2536  loop_mn => decomposition%local%mn
2537  loop_mx => decomposition%local%mx
2538  wx = get_wx(decomposition)
2539  lw = self%lagrange_width(1)
2540 
2541 #ifdef USE_FMEMPOOL
2542  mn = [l_mn, 0]
2543  mx = [r_mx, wx-1]
2544  mno = [c_mn, 0]
2545  mxo = [c_mx, wx-1]
2546 #endif
2547 
2548 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o,displacement, v_coord)
2549 #ifdef USE_FMEMPOOL
2550  call mp_acquire(buf_i, mn, mx)
2551  call mp_acquire(buf_o, mno, mxo)
2552 #else
2553  allocate(buf_i(l_mn:r_mx, 0:wx-1))
2554  allocate(buf_o(c_mn:c_mx, 0:wx-1))
2555 #endif
2556  allocate(displacement(c_mn:c_mx, 0:wx-1))
2557 !$omp do OMP_COLLAPSE OMP_SCHEDULE
2558  do n=loop_mn(6), loop_mx(6)
2559  do m=loop_mn(5), loop_mx(5)
2560  do l=loop_mn(4), loop_mx(4)
2561  do k=loop_mn(3), loop_mx(3)
2562 #ifndef DISABLE_CACHE_BLOCKING
2563  do i=loop_mn(1), loop_mx(1), wx
2564  do j=l_mn,l_mx
2565  do w=0,wx-1
2566  buf_i(j,w) = l_halo(i+w,j,k,l,m,n)
2567  end do
2568  end do
2569  do j=c_mn,c_mx
2570  do w=0,wx-1
2571  buf_i(j,w) = f6d(i+w,j,k,l,m,n)
2572  end do
2573  end do
2574  do j=r_mn,r_mx
2575  do w=0,wx-1
2576  buf_i(j,w) = r_halo(i+w,j,k,l,m,n)
2577  end do
2578  end do
2579  ! Compute the displacement vector
2580  do j=c_mn, c_mx
2581  do w=0,wx-1
2582  v_coord(1) = rotation_matrix(1,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2583  rotation_matrix(1,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2584  rotation_matrix(1,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2585  velocity_shift(i+w,j,k,1)
2586  v_coord(2) = rotation_matrix(2,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2587  rotation_matrix(2,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2588  rotation_matrix(2,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2589  velocity_shift(i+w,j,k,2)
2590  v_coord(3) = rotation_matrix(3,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2591  rotation_matrix(3,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2592  rotation_matrix(3,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2593  velocity_shift(i+w,j,k,3)
2594 
2595  displacement(j,w) = rdelta_x * ( &
2596  propagation_matrix(2,1) * v_coord(1)+&
2597  propagation_matrix(2,2) * v_coord(2)+&
2598  propagation_matrix(2,3) * v_coord(3) )
2599  if (abs(displacement(j,w))> 1.0_f64 ) then
2600  print*, i+w,j,k,l,m,n, displacement(j,w)
2601  stop
2602  end if
2603  !displacement(j,w) = 0.0_f64!- delta_tx * velocity_grid(i+w,j,k,2)
2604  end do
2605  end do
2606  do w=0,wx-1
2607  ! (2) perform interpolation
2609  buf_i(:,w), &
2610  buf_o(:,w), &
2611  displacement(:,w), &
2612  lw, &
2613  0)
2614  end do
2615  do j=c_mn,c_mx
2616  do w=0,wx-1
2617  f6d(i+w,j,k,l,m,n) = buf_o(j,w)
2618  end do
2619  end do
2620  end do
2621 #else
2622  do i=loop_mn(1), loop_mx(1)
2623  ! (1) fill input buffer piecewise
2624 ! buf_i(l_mn:l_mx,1) = l_halo(i,:,k,l,m,n)
2625 ! buf_i(c_mn:c_mx,1) = f6d(i,:,k,l,m,n)
2626 ! buf_i(r_mn:r_mx,1) = r_halo(i,:,k,l,m,n)
2627  do j=l_mn,l_mx
2628  buf_i(j,0) = l_halo(i,j,k,l,m,n)
2629  enddo
2630  do j=c_mn,c_mx
2631  buf_i(j,0) = f6d(i,j,k,l,m,n)
2632  enddo
2633  do j=r_mn,r_mx
2634  buf_i(j,0) = r_halo(i,j,k,l,m,n)
2635  enddo
2636  ! Compute the displacement vector
2637  do j=c_mn, c_mx
2638  v_coord(1) = rotation_matrix(1,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2639  rotation_matrix(1,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2640  rotation_matrix(1,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2641  velocity_shift(i,j,k,1)
2642  v_coord(2) = rotation_matrix(2,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2643  rotation_matrix(2,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2644  rotation_matrix(2,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2645  velocity_shift(i,j,k,2)
2646  v_coord(3) = rotation_matrix(3,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2647  rotation_matrix(3,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2648  rotation_matrix(3,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2649  velocity_shift(i,j,k,3)
2650 
2651  displacement(j,0) = rdelta_x * ( &
2652  propagation_matrix(2,1) * v_coord(1)+&
2653  propagation_matrix(2,2) * v_coord(2)+&
2654  propagation_matrix(2,3) * v_coord(3) )
2655  !displacement(j,0) = - delta_tx * velocity_grid(i,j,k,2)
2656  end do
2657  ! (2) perform interpolation
2659  buf_i(:,0), &
2660  buf_o(:,0), &
2661  displacement(:,0), &
2662  lw, &
2663  0)
2664  ! f6d(i,:,k,l,m,n) = buf_o(c_mn:c_mx,1)
2665  do j=c_mn,c_mx
2666  f6d(i,j,k,l,m,n) = buf_o(j,0)
2667  enddo
2668  end do
2669 #endif
2670  end do
2671  end do
2672  end do
2673  end do
2674 !$omp end do
2675 #ifdef USE_FMEMPOOL
2676  call mp_release(buf_i)
2677  call mp_release(buf_o)
2678 #else
2679  deallocate(buf_i)
2680  deallocate(buf_o)
2681 #endif
2682  deallocate(displacement)
2683 !$omp end parallel
2685 
2687  subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta3_givenv(self, decomposition, rdelta_x, velocity_grid, velocity_shift, rotation_matrix, propagation_matrix, f6d)
2688  type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self
2689  type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
2690  type(sll_t_array), intent(in) :: velocity_grid(6)
2691  sll_real64, intent(in) :: velocity_shift(decomposition%local%mn(1):decomposition%local%mx(1), &
2692  decomposition%local%mn(2):decomposition%local%mx(2), &
2693  decomposition%local%mn(3):decomposition%local%mx(3),3)
2694  sll_real64, intent(in) :: rotation_matrix(3,3)
2695  sll_real64, intent(in) :: propagation_matrix(3,3)
2696  sll_real64, intent(in) :: rdelta_x
2697  sll_real64, intent(inout) :: f6d(&
2698  decomposition%local%mn(1):decomposition%local%mx(1), &
2699  decomposition%local%mn(2):decomposition%local%mx(2), &
2700  decomposition%local%mn(3):decomposition%local%mx(3), &
2701  decomposition%local%mn(4):decomposition%local%mx(4), &
2702  decomposition%local%mn(5):decomposition%local%mx(5), &
2703  decomposition%local%mn(6):decomposition%local%mx(6))
2704 
2705  sll_int32, parameter :: id = 3 ! eta3
2706  sll_int32 :: i,j,k,l,m,n,w,wx, lw
2707  sll_real64, pointer :: buf_i(:,:)
2708  sll_real64, pointer :: buf_o(:,:)
2709  sll_real64, pointer :: displacement(:,:)
2710 #ifdef USE_FMEMPOOL
2711  sll_int32 :: mn(2), mx(2), mno(2), mxo(2)
2712 #endif
2713  sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx
2714  sll_int32, pointer :: loop_mn(:), loop_mx(:)
2715  halo_dtype, pointer :: l_halo(:,:,:,:,:,:)
2716  halo_dtype, pointer :: r_halo(:,:,:,:,:,:)
2717  sll_real64 :: v_coord(3)
2718 
2719  ! NOTE: Current restriction for the memory-saving `slim` halos,
2720  ! to be lowered later for the real one-sided halo implementation.
2721  ! One-sided halos require a more clever index handling (though straight-forward).
2722  sll_assert_always(decomposition%local%halo_left%nw(id) == decomposition%local%halo_right%nw(id))
2723  sll_assert_always(decomposition%local%halo_left%nw(id) == (self%lagrange_width(1)-1)/2)
2724  ! cache values to avoid multiple dereferencing inside the loops
2725  l_mn = decomposition%local%halo_left%mn(id)
2726  l_mx = decomposition%local%halo_left%mx(id)
2727  c_mn = decomposition%local%mn(id)
2728  c_mx = decomposition%local%mx(id)
2729  r_mn = decomposition%local%halo_right%mn(id)
2730  r_mx = decomposition%local%halo_right%mx(id)
2731  l_halo => decomposition%local%halo_left%buf
2732  r_halo => decomposition%local%halo_right%buf
2733  loop_mn => decomposition%local%mn
2734  loop_mx => decomposition%local%mx
2735  wx = get_wx(decomposition)
2736  lw = self%lagrange_width(1)
2737 
2738 #ifdef USE_FMEMPOOL
2739  mn = [l_mn, 0]
2740  mx = [r_mx, wx-1]
2741  mno = [c_mn, 0]
2742  mxo = [c_mx, wx-1]
2743 #endif
2744 
2745 !$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o,displacement,v_coord)
2746 #ifdef USE_FMEMPOOL
2747  call mp_acquire(buf_i, mn, mx)
2748  call mp_acquire(buf_o, mno, mxo)
2749 #else
2750  allocate(buf_i(l_mn:r_mx, 0:wx-1))
2751  allocate(buf_o(c_mn:c_mx, 0:wx-1))
2752 #endif
2753  allocate(displacement(c_mn:c_mx, 0:wx-1))
2754 !$omp do OMP_COLLAPSE OMP_SCHEDULE
2755  do n=loop_mn(6), loop_mx(6)
2756  do m=loop_mn(5), loop_mx(5)
2757  do l=loop_mn(4), loop_mx(4)
2758  do j=loop_mn(2), loop_mx(2)
2759 #ifndef DISABLE_CACHE_BLOCKING
2760  do i=loop_mn(1), loop_mx(1), wx
2761  do k=l_mn,l_mx
2762  do w=0,wx-1
2763  buf_i(k,w) = l_halo(i+w,j,k,l,m,n)
2764  end do
2765  end do
2766  do k=c_mn,c_mx
2767  do w=0,wx-1
2768  buf_i(k,w) = f6d(i+w,j,k,l,m,n)
2769  end do
2770  end do
2771  do k=r_mn,r_mx
2772  do w=0,wx-1
2773  buf_i(k,w) = r_halo(i+w,j,k,l,m,n)
2774  end do
2775  end do
2776  ! Compute the displacement vector
2777  do k=c_mn, c_mx
2778  do w=0,wx-1
2779  v_coord(1) = rotation_matrix(1,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2780  rotation_matrix(1,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2781  rotation_matrix(1,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2782  velocity_shift(i+w,j,k,1)
2783  v_coord(2) = rotation_matrix(2,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2784  rotation_matrix(2,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2785  rotation_matrix(2,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2786  velocity_shift(i+w,j,k,2)
2787  v_coord(3) = rotation_matrix(3,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2788  rotation_matrix(3,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2789  rotation_matrix(3,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2790  velocity_shift(i+w,j,k,3)
2791 
2792  displacement(k,w) = rdelta_x * ( &
2793  propagation_matrix(3,1) * v_coord(1)+&
2794  propagation_matrix(3,2) * v_coord(2)+&
2795  propagation_matrix(3,3) * v_coord(3) )
2796  end do
2797  end do
2798  do w=0,wx-1
2799  ! (2) perform interpolation
2801  buf_i(:,w), &
2802  buf_o(:,w), &
2803  displacement(:,w), &
2804  lw, &
2805  0)
2806  end do
2807  do k=c_mn,c_mx
2808  do w=0,wx-1
2809  f6d(i+w,j,k,l,m,n) = buf_o(k,w)
2810  end do
2811  end do
2812  end do
2813 #else
2814  do i=loop_mn(1), loop_mx(1)
2815  ! (1) fill input buffer piecewise
2816 ! buf_i(l_mn:l_mx,1) = l_halo(i,:,k,l,m,n)
2817 ! buf_i(c_mn:c_mx,1) = f6d(i,:,k,l,m,n)
2818 ! buf_i(r_mn:r_mx,1) = r_halo(i,:,k,l,m,n)
2819  do k=l_mn,l_mx
2820  buf_i(k,0) = l_halo(i,j,k,l,m,n)
2821  enddo
2822  do k=c_mn,c_mx
2823  buf_i(k,0) = f6d(i,j,k,l,m,n)
2824  enddo
2825  do k=r_mn,r_mx
2826  buf_i(k,0) = r_halo(i,j,k,l,m,n)
2827  enddo ! Compute the displacement vector
2828  do k=c_mn, c_mx
2829  !displacement(k,0) = - delta_tx * velocity_grid(i,j,k,3)
2830  v_coord(1) = rotation_matrix(1,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2831  rotation_matrix(1,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2832  rotation_matrix(1,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2833  velocity_shift(i,j,k,1)
2834  v_coord(2) = rotation_matrix(2,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2835  rotation_matrix(2,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2836  rotation_matrix(2,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2837  velocity_shift(i,j,k,2)
2838  v_coord(3) = rotation_matrix(3,1) * velocity_grid(4)%vals(l-loop_mn(4)+1) + &
2839  rotation_matrix(3,2) * velocity_grid(5)%vals(m-loop_mn(5)+1) + &
2840  rotation_matrix(3,3) * velocity_grid(6)%vals(n-loop_mn(6)+1) + &
2841  velocity_shift(i,j,k,3)
2842 
2843  displacement(k,0) = rdelta_x * ( &
2844  propagation_matrix(3,1) * v_coord(1)+&
2845  propagation_matrix(3,2) * v_coord(2)+&
2846  propagation_matrix(3,3) * v_coord(3) )
2847  end do
2848  ! (2) perform interpolation
2850  buf_i(:,0), &
2851  buf_o(:,0), &
2852  displacement(:,0), &
2853  lw, &
2854  0)
2855  ! f6d(i,:,k,l,m,n) = buf_o(c_mn:c_mx,1)
2856  do k=c_mn,c_mx
2857  f6d(i,j,k,l,m,n) = buf_o(k,0)
2858  enddo
2859  end do
2860 #endif
2861  end do
2862  end do
2863  end do
2864  end do
2865 !$omp end do
2866 #ifdef USE_FMEMPOOL
2867  call mp_release(buf_i)
2868  call mp_release(buf_o)
2869 #else
2870  deallocate(buf_i)
2871  deallocate(buf_o)
2872 #endif
2873  deallocate(displacement)
2874 !$omp end parallel
2876 
2877 
2878 !!$ !> Advection along eta1 with displacement with centered lagrange interpolation
2879 !!$ subroutine sll_s_advection_6d_clagrange_dd_slim_advect_eta1_givenv(self, decomposition, velocity_grid, delta_t, f6d)
2880 !!$
2881 !!$ type(sll_t_advection_6d_lagrange_dd_slim), intent(inout) :: self !< advector object
2882 !!$ type(sll_t_decomposition_slim_6d), target, intent(inout) :: decomposition
2883 !!$ sll_real64, intent(in) :: velocity_grid(decomposition%local%mn(1):decomposition%local%mx(1),decomposition%local%mn(2):decomposition%local%mx(2),decomposition%local%mn(3):decomposition%local%mx(3),3) !< displacement vector
2884 !!$ sll_real64, intent(in) :: delta_t
2885 !!$ sll_real64, intent(inout) :: f6d(&
2886 !!$ decomposition%local%mn(1):decomposition%local%mx(1), &
2887 !!$ decomposition%local%mn(2):decomposition%local%mx(2), &
2888 !!$ decomposition%local%mn(3):decomposition%local%mx(3), &
2889 !!$ decomposition%local%mn(4):decomposition%local%mx(4), &
2890 !!$ decomposition%local%mn(5):decomposition%local%mx(5), &
2891 !!$ decomposition%local%mn(6):decomposition%local%mx(6)) !< value of the function on input and advected function values on output
2892 !!$ ! ---
2893 !!$ sll_int32, parameter :: id = 1 ! eta1
2894 !!$ sll_int32 :: i,j,k,l,m,n,w,wx, n_omp_threads, deg
2895 !!$ sll_real64, pointer :: buf_i(:,:)
2896 !!$ sll_real64, pointer :: buf_o(:,:)
2897 !!$ sll_int32 :: l_mn, l_mx, c_mn, c_mx, r_mn, r_mx, l_mn_l, r_mx_l
2898 !!$ sll_int32, pointer :: loop_mn(:), loop_mx(:)
2899 !!$ HALO_DTYPE, pointer :: l_halo(:,:,:,:,:,:)
2900 !!$ HALO_DTYPE, pointer :: r_halo(:,:,:,:,:,:)
2901 !!$#ifdef USE_FMEMPOOL
2902 !!$ sll_int32 :: mn(2), mx(2)
2903 !!$#endif
2904 !!$ sll_int32 :: si
2905 !!$ sll_real64 :: alpha
2906 !!$ sll_real64 :: displacement
2907 !!$
2908 !!$ ! cache values to avoid multiple dereferencing inside the loops
2909 !!$ l_mn = decomposition%local%halo_left%mn(id) !decomposition%local%halo_left%mn(id)
2910 !!$ l_mx = decomposition%local%halo_left%mx(id) !decomposition%local%halo_left%mx(id)
2911 !!$ c_mn = decomposition%local%mn(id)
2912 !!$ c_mx = decomposition%local%mx(id)
2913 !!$ r_mn = decomposition%local%halo_right%mn(id) !decomposition%local%halo_right%mn(id)
2914 !!$ r_mx = decomposition%local%halo_right%mx(id) !decomposition%local%halo_right%mx(id)
2915 !!$ l_halo => decomposition%local%halo_left%buf
2916 !!$ r_halo => decomposition%local%halo_right%buf
2917 !!$ loop_mn => decomposition%local%mn
2918 !!$ loop_mx => decomposition%local%mx
2919 !!$ ! NOTE: Buffering is done here for the second dim "j".
2920 !!$ ! Seems to be marginally beneficial.
2921 !!$ wx = get_wx(decomposition, 2)
2922 !!$ deg = self%lagrange_width(1)
2923 !!$#ifdef USE_FMEMPOOL
2924 !!$ mn = [l_mn, 0]
2925 !!$ mx = [r_mx, wx-1]
2926 !!$#endif
2927 !!$
2928 !!$!$omp parallel default(shared) private(i,j,k,l,m,n,w,buf_i,buf_o,si,alpha,l_mn_l,r_mx_l)
2929 !!$! allocate(buf_i(l_mn:r_mx,0:wx-1))
2930 !!$! allocate(buf_o(l_mn:r_mx,0:wx-1))
2931 !!$#ifdef USE_FMEMPOOL
2932 !!$ call mp_acquire(buf_i, mn, mx)
2933 !!$ call mp_acquire(buf_o, mn, mx)
2934 !!$#else
2935 !!$ allocate(buf_i(l_mn:r_mx, 0:wx-1))
2936 !!$ allocate(buf_o(l_mn:r_mx, 0:wx-1))
2937 !!$#endif
2938 !!$!$omp do OMP_COLLAPSE OMP_SCHEDULE
2939 !!$ do n=loop_mn(6), loop_mx(6)
2940 !!$ do m=loop_mn(5), loop_mx(5)
2941 !!$ do l=loop_mn(4), loop_mx(4)
2942 !!$
2943 !!$ do k=loop_mn(3), loop_mx(3)
2944 !!$#ifndef DISABLE_CACHE_BLOCKING
2945 !!$ do j=loop_mn(2), loop_mx(2), wx
2946 !!$ do w=0,wx-1
2947 !!$ do i=l_mn_l,l_mx
2948 !!$ buf_i(i,w) = l_halo(i,j+w,k,l,m,n)
2949 !!$ end do
2950 !!$ end do
2951 !!$ do w=0,wx-1
2952 !!$ do i=c_mn,c_mx
2953 !!$ buf_i(i,w) = f6d(i,j+w,k,l,m,n)
2954 !!$ end do
2955 !!$ end do
2956 !!$ do w=0,wx-1
2957 !!$ do i=r_mn,r_mx_l
2958 !!$ buf_i(i,w) = r_halo(i,j+w,k,l,m,n)
2959 !!$ end do
2960 !!$ end do
2961 !!$ do w=0,wx-1
2962 !!$ call sll_s_lagrange_interpolation_1d_fast_disp_even_halo_cells( &
2963 !!$ buf_i(l_mn_l:r_mx_l,w), &
2964 !!$ buf_o(l_mn_l:r_mx_l,w), &
2965 !!$ alpha, &
2966 !!$ deg)
2967 !!$ end do
2968 !!$ do w=0,wx-1
2969 !!$ do i=c_mn,c_mx
2970 !!$ f6d(i,j+w,k,l,m,n) = buf_o(i+si,w)
2971 !!$ end do
2972 !!$ end do
2973 !!$ end do
2974 !!$#else
2975 !!$ do j=loop_mn(2), loop_mx(2)
2976 !!$
2977 !!$ displacement = -delta_t * velocity_grid(c_mn:c_mx,j,k,1)
2978 !!$
2979 !!$ si = floor(displacement(l,m))
2980 !!$ alpha = displacement(l,m) - real(si,f64)
2981 !!$ l_mn_l = -deg/2+1+si+c_mn
2982 !!$ r_mx_l = c_mx+deg/2+si
2983 !!$
2984 !!$ ! (1) fill input buffer piecewise
2985 !!$ do i=l_mn_l,l_mx
2986 !!$ buf_i(i,0) = l_halo(i,j,k,l,m,n)
2987 !!$ enddo
2988 !!$ do i=c_mn,c_mx
2989 !!$ buf_i(i,0) = f6d(i,j,k,l,m,n)
2990 !!$ enddo
2991 !!$ do i=r_mn,r_mx_l
2992 !!$ buf_i(i,0) = r_halo(i,j,k,l,m,n)
2993 !!$ enddo
2994 !!$ ! (2) perform interpolation
2995 !!$ call sll_s_lagrange_interpolation_1d_fast_disp_even_halo_cells( &
2996 !!$ buf_i(l_mn_l:r_mx_l,0), &
2997 !!$ buf_o(l_mn_l:r_mx_l,0), &
2998 !!$ alpha, &
2999 !!$ deg)
3000 !!$ ! (3) copy-back interpolated values
3001 !!$ do i=c_mn,c_mx
3002 !!$ f6d(i,j,k,l,m,n) = buf_o(i+si,0)
3003 !!$ enddo
3004 !!$ end do
3005 !!$#endif
3006 !!$ end do
3007 !!$ end do
3008 !!$ end do
3009 !!$ end do
3010 !!$!$omp end do
3011 !!$#ifdef USE_FMEMPOOL
3012 !!$ call mp_release(buf_i)
3013 !!$ call mp_release(buf_o)
3014 !!$#else
3015 !!$ deallocate(buf_i)
3016 !!$ deallocate(buf_o)
3017 !!$#endif
3018 !!$!$omp end parallel
3019 !!$ end subroutine sll_s_advection_6d_clagrange_dd_slim_advect_eta1_d45
3020 
3021 
3022 
Plain Fortran implementation of a memory pool.
Definition: fmempool.F90:5
Module implementing Lagrange advection for the setting of a domain decomposition in 6d with extra buf...
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta1(self, decomposition, displacement, f6d)
Advection along eta1 with displacement dependent on eta4.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta1_givenv(self, decomposition, rdelta_x, velocity_grid, velocity_shift, rotation_matrix, propagation_matrix, f6d)
Advection along eta1 with displacement dependent on eta45 (fixed interval Lagrange interpolation )
subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta2_core(self, decomposition, eta5_cut, displacement, f6d)
Advection along eta2 with displacement dependent on eta5.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_set_eta123(self, decomposition, displacement_eta1, displacement_eta2, displacement_eta3)
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta3(self, decomposition, displacement, f6d)
Advection along eta3 with displacement dependent on eta6.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta2(self, decomposition, displacement, f6d)
Advection along eta2 with displacement dependent on eta5.
subroutine make_blocks_lagrange(ind, stencil, decomposition, disp, halo_blocks, halo_width, n_halo_blocks)
Helper function to calculate the communication blocks for the Lagrange interpolation.
subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta1_core(self, decomposition, eta4_cut, displacement, f6d)
Advection along eta1 with displacement dependent on eta4.
subroutine, public sll_s_advection_6d_clagrange_dd_slim_advect_eta2_d45(self, decomposition, displacement, f6d)
Advection along eta2 with displacement dependent on eta4-5.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta2_givenv(self, decomposition, rdelta_x, velocity_grid, velocity_shift, rotation_matrix, propagation_matrix, f6d)
Advection along eta2 with displacement dependent on eta45.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta5(self, decomposition, displacement, f6d)
Advection along eta5 with displacement dependent on eta1-3.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_fadvect_eta3(self, topology, decomposition, f6d)
Advection along eta3 with displacement dependent on eta6.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_init(self, lagrange_width)
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta2_dispeta45(self, decomposition, displacement, f6d)
Advection along eta2 with displacement dependent on eta45.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta1_dispeta45(self, decomposition, displacement, f6d)
Advection along eta1 with displacement dependent on eta45.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta6(self, decomposition, displacement, f6d)
Advection along eta6 with displacement dependent on eta1-3.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_fadvect_eta1(self, topology, decomposition, f6d)
Advection along eta1 with displacement dependent on eta4.
subroutine sll_s_advection_6d_lagrange_dd_slim_advect_eta3_core(self, decomposition, eta6_cut, displacement, f6d)
Advection along eta3 with displacement dependent on eta6.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta4(self, decomposition, displacement, f6d)
Advection along eta4 with displacement dependent on eta1-3.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_fadvect_eta2(self, topology, decomposition, f6d)
Advection along eta2 with displacement dependent on eta5.
subroutine, public sll_s_advection_6d_clagrange_dd_slim_advect_eta1_d45(self, decomposition, displacement, f6d)
Advection along eta1 with displacement dependent on eta4-5.
subroutine, public sll_s_advection_6d_lagrange_dd_slim_advect_eta3_givenv(self, decomposition, rdelta_x, velocity_grid, velocity_shift, rotation_matrix, propagation_matrix, f6d)
Advection along eta3 with displacement dependent on eta45.
integer(kind=i32) function get_wx(decomposition, id_in)
Fortran module where set some physical and mathematical constants.
real(kind=f64), parameter, public sll_p_twopi
Module providing data structures and tools to implement domain decompositions.
subroutine, public sll_s_apply_bc_exchange_slim_6d_real64(topo, decomp, id)
subroutine, public sll_s_deallocate_bc_buffers(decomp)
subroutine, public sll_s_allocate_bc_buffers_6d(decomp, id)
subroutine, public sll_f_apply_halo_exchange_slim_6d_real64(topo, decomp, arr, id, hw_left, hw_right)
subroutine, public sll_s_apply_halo_exchange_slim_6d_real64(topo, decomp, arr, id, hw_left, hw_right, halo_block)
subroutine, public sll_s_allocate_bc_buffers_6d_part(decomp, id, idx_mn, idx_mx)
Data types that collect the parameters for various types of initial distributions and define the corr...
Module for 1D interpolation and reconstruction.
Module for 1D Lagrange interpolation on a uniform grid (only odd order)
subroutine, public sll_s_lagrange_interpolation_1d_fast_disp_even_halo_cells(fi, fp, p, stencil)
Even Lagrange interpolation with halo cells and no interval shift, i.e. p must be between zero and on...
subroutine, public sll_s_lagrange_interpolation_1d_fast_disp_fixed_haloc_cells(fi, fp, p, stencil)
Lagrange interpolation with halo cell boundaries, where the input array already contains halo cells....
subroutine, public sll_s_lagrange_interpolation_1d_fast_disp_centered_halo_cells(fi, fp, p, stencil)
subroutine, public sll_s_lagrange_interpolation_1d_fast_haloc_cells(fi, fp, p, stencil, index_shift)
Lagrange interpolation with halo cell boundaries, where the input array already contains halo cells....
Interpolator class and methods of Lagrange 1D interpolator.
integer(kind=i32), parameter, public sll_p_lagrange_fixed
Flag to specify Lagrange interpolation on a fixed interval centered around the point that is displace...
integer(kind=i32), parameter, public sll_p_lagrange_centered
Flag to specify Lagrange interpolation centered around the interpolation point.
Information on the 6D cartesian process topology.
6D decomposition, slim redesign, global array size information and local information.
Abstract class for 1D interpolation and reconstruction.
    Report Typos and Errors