!! Copyright (C) 2022, 2024 F. Bonafé, S. Ohlmann
!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!
#include "global.h"


!> @brief Implementation details for regridding
!!
!! For commensurate grids, restriction and prolongation are chosen such that they are adjoint operators.
!! <ol>
!! <li> Simple mapping:
!!   - find all points of output mesh that are also on the local input mesh
!!   - get their partition number on the output mesh
!!   - use the partition_transfer class to communicate those points
!!   - use the corresponding mappings for the input and output buffers
!!
!! <li> Restriction: mapping from fine to coarse meshes.
!!
!!   The strategy is as follows:
!!   - find all points of the coarse mesh that correspond to local points of
!!     the fine mesh
!!   - find all points of the coarse mesh around those that have been found
!!     that are connected by an order-1 cube stencil - the idea is that those
!!     points have contributions in the restriction operation from points on
!!     the fine mesh
!!   - create the stencil for the restriction and compute the weights; the
!!     size is 2*(eta-1)+1 in each dimension, where eta is the ratio of the
!!     grid spacings
!!   - save the indices of all of these points for communication
!!   - when doing the transfer:
!!       <ul>
!!       <li> apply the restriction by looping over the locally matching coarse
!!            points and applying the restriction stencil, adding up the
!!            contribution from the points on the fine mesh
!!       <li> communicate those values
!!       <li> do a reduction of the received values; each rank can receive
!!            contributions from different ranks, depending on the parallelization
!!       </ul>
!! <li> Prolongation: mapping from coarse to fine meshes using linear interpolation in nd space
!!
!!   The strategy is slightly different from the restriction operator:
!!   - first, get all local coarse points corresponding to local fine points
!!     on the output mesh, thus loop over the output mesh here
!!   - get all points on the coarse input mesh reachable by a cube stencil
!!   - these are now the points needed for the output mesh
!!   - use the inverse partition transfer to communicate the points; this is
!!     needed because the normal partition transfer can only communicate
!!     points that are locally available, but we also potentially need some
!!     from other cores
!!   - when doing the transfer, loop over the coarse points and apply the
!!     transfer stencil (which corresponds to linear interpolation) to the
!!     adjacent fine mesh points. For each point of the fine mesh, the contributions
!!     of the coarse mesh points are accumulated.
!! </ol>
!!
!! For curvilinear or non-commmensurate grids, a generic interpolation scheme is used:
!! - for each point in the output mesh, either take the value from the input mesh if it
!!   is present there as well or get the hexahedron with the 8 vertices surrounding it on
!!   the input mesh
!! - determine the interpolation coefficients on these hexahedra and apply the interpolation
!!   when doing the transfer
!! This scheme is simpler than the one above, but it is not adjoint in both directions.
module regridding_oct_m
  use affine_coordinates_oct_m
  use coordinate_system_oct_m
  use debug_oct_m
  use global_oct_m
  use grid_oct_m
  use iihash_oct_m
  use lalg_adv_oct_m
  use math_oct_m
  use mesh_oct_m
  use mesh_function_oct_m
  use messages_oct_m
  use namespace_oct_m
  use parser_oct_m
  use partition_oct_m
  use partition_transfer_oct_m
  use profiling_oct_m
  use space_oct_m
  use stencil_oct_m
  use utils_oct_m

  implicit none
  public :: regridding_t

  !> @brief contains the information of the meshes and provides the transfer functions
  !!
  type :: regridding_t
    private
    class(mesh_t), pointer :: mesh_in, mesh_out                          !< meshes for the transfer
    type(partition_transfer_t) :: partition_transfer                     !<
    integer :: nsend, nrecv, dim
    integer,     allocatable :: order_in(:), order_out(:)                !< sorting of points due to partitioning
    integer(int64), allocatable :: order_in_global(:), order_out_global(:)  !< sorting of points due to partitioning
    logical, allocatable :: overlap_map(:)
    logical :: do_restriction, do_prolongation
    type(stencil_t) :: transfer_stencil
    real(real64), allocatable :: weights(:)
    real(real64), allocatable :: weights_generic(:, :)
    integer, allocatable :: stencil_points(:, :)
    integer, allocatable :: eta(:)
    integer :: interpolation_level !< can be NEAREST_NEIGHBOR or LINEAR
    integer :: scale_norms !< the method with which the norms are scaled after regridding
    logical :: generic_interpolation !< method for generic combinations of grids
  contains
    procedure :: dregridding_do_transfer_1, zregridding_do_transfer_1 !< @copydoc regridding_oct_m::dregridding_dot_transfer_1
    procedure :: dregridding_do_transfer_2, zregridding_do_transfer_2 !< @copydoc regridding_oct_m::dregridding_dot_transfer_2
    generic :: do_transfer => dregridding_do_transfer_1, zregridding_do_transfer_1
    generic :: do_transfer => dregridding_do_transfer_2, zregridding_do_transfer_2
    final :: regridding_finalize
  end type regridding_t

  interface regridding_t
    procedure regridding_init
  end interface regridding_t

  integer, parameter :: &
    NEAREST_NEIGHBOR = 0, &
    LINEAR = 1, &
    SCALE_NONE = 0, &
    SCALE_NORM2 = 1

contains

  ! ---------------------------------------------------------
  !> Generate a re-mapping of points from mesh_in to mesh_out
  function regridding_init(mesh_out, mesh_in, space_in, namespace) result(this)
    class(mesh_t), target, intent(in) :: mesh_out
    class(mesh_t), target, intent(in) :: mesh_in
    class(space_t),        intent(in) :: space_in
    type(namespace_t),     intent(in) :: namespace
    class(regridding_t), pointer :: this

    integer :: ip_in, ip_out, index(1:space_in%dim), idim, ii, is, size_array
    integer(int64), allocatable :: global_indices_in(:), order_in(:), order_out(:)
    integer(int64), allocatable :: order_in_original(:), order_out_original(:)
    integer(int64), allocatable :: global_indices_in_tmp(:)
    integer(int64), allocatable :: global_indices_in_original(:)
    integer, allocatable :: partition_in(:)
    integer(int64) :: ipg_coarse, ipg_in
    real(real64) :: spacing_ratio(1:space_in%dim)
    real(real64) :: x_fine(1:space_in%dim), x_coarse(1:space_in%dim), x_primitive(1:space_in%dim)
    real(real64) :: x_out(1:space_in%dim), x_in(1:space_in%dim), rmin, r_current
    real(real64), allocatable :: M(:, :), p(:)
    real(real64) :: scaling(1:space_in%dim, 1:space_in%dim), rotation(1:space_in%dim, 1:space_in%dim)
    integer :: i_coarse, ip_fine, ip_mindist, cube_size
    integer :: index_stencil(1:space_in%dim), shift(1:space_in%dim)
    logical :: same_eta, on_coarse_grid, found, on_in_grid
    type(lihash_t) :: global_indices
    class(mesh_t), pointer :: mesh_coarse, mesh_fine
    class(coordinate_system_t), pointer :: coarse_coord, fine_coord

    PUSH_SUB_WITH_PROFILE(regridding_init)

    SAFE_ALLOCATE(this)

    nullify(coarse_coord)
    nullify(fine_coord)

    !%Variable RegriddingInterpolationLevel
    !%Type integer
    !%Default regridding_linear
    !%Section Mesh
    !%Description
    !% Choose the interpolation order for the regridding. 0 is equivalent to nearest neighbor
    !% interpolation. The default is linear interpolation.
    !%Option regridding_nearest_neighbor 0
    !% Use the nearest neighbor for the regridding. This is faster than the linear interpolation.
    !%Option regridding_linear 1
    !% Use a trilinear interpolation. This is implemented similar to restriction and prolongation
    !% operations in multigrid methods. This ensures that both directions are adjoint to each other.
    !%End
    call parse_variable(namespace, "RegriddingInterpolationLevel", LINEAR, this%interpolation_level)

    !%Variable RegriddingRescale
    !%Type integer
    !%Default scale_norm2
    !%Section Mesh
    !%Description
    !% Rescale the regridded quantities. Not using a rescaling can lead to bad results if the
    !% ratio of the grid spacings is large. The default is to rescale by the 2-norm of the quantity
    !% except for generic interpolations (i.e. between curvilinear or non-commensurate grids).
    !%Option scale_none 0
    !% Do not rescale the regridded quantities.
    !%Option scale_norm2 1
    !% Scale the regridded quantities by the 2-norm of the quantity on the overlapping
    !% region of the grid.
    !%End
    call parse_variable(namespace, "RegriddingRescale", SCALE_NORM2, this%scale_norms)

    ! check some conditions which are not yet supported
    if (.not. (mesh_out%coord_system%dim == space_in%dim)) then
      message(1) = "For regridding, both grids need to have the same space dimension."
      call messages_fatal(1, namespace=namespace)
    end if
    ! use generic interpolation if the grids are not commensurate or curvilinear
    this%generic_interpolation = mesh_out%coord_system%local_basis .or. &
      mesh_in%coord_system%local_basis .or. &
      .not. (same_type_as(mesh_out%coord_system, mesh_in%coord_system))

    this%dim = space_in%dim
    spacing_ratio(:) = mesh_out%spacing(1:this%dim)/mesh_in%spacing(1:this%dim)

    this%do_restriction = all(spacing_ratio > M_ONE)
    this%do_prolongation = all(spacing_ratio < M_ONE)
    ! invert spacing ratio for prolongations
    if (this%do_prolongation) spacing_ratio = M_ONE/spacing_ratio
    ! get the integer ratio of the spacings
    SAFE_ALLOCATE(this%eta(1:this%dim))
    do idim = 1, this%dim
      this%eta(idim) = nint(spacing_ratio(idim))
    end do
    if (any(abs((spacing_ratio - this%eta)/spacing_ratio) > 10.0_real64*M_EPSILON)) then
      this%generic_interpolation = .true.
    end if
    same_eta = .true.
    do idim = 2, this%dim
      same_eta = same_eta .and. this%eta(idim) == this%eta(idim-1)
    end do
    if (.not. same_eta) then
      this%generic_interpolation = .true.
    end if
    ! use always generic interpolation for nearest neighbor interpolation
    if (this%interpolation_level == NEAREST_NEIGHBOR) then
      this%generic_interpolation = .true.
    end if

    if (this%generic_interpolation) then
      this%do_prolongation = .true.
      this%do_restriction = .false.
      this%scale_norms = SCALE_NONE
    end if

    this%mesh_in => mesh_in
    this%mesh_out => mesh_out
    if (.not. this%do_prolongation) then
      mesh_fine => this%mesh_in
      mesh_coarse => this%mesh_out
    else
      mesh_coarse => this%mesh_in
      mesh_fine => this%mesh_out
    end if

    ! collect all locally available points in mesh_coarse that are also in mesh_fine
    call lihash_init(global_indices)
    size_array = mesh_fine%np
    SAFE_ALLOCATE(global_indices_in_tmp(size_array))
    SAFE_ALLOCATE(global_indices_in_original(size_array))

    if (.not. this%generic_interpolation) then
      ii = 0
      do ip_fine = 1, mesh_fine%np
        call mesh_local_index_to_coords(mesh_fine, ip_fine, index)
        on_coarse_grid = .true.
        if (this%do_restriction .or. this%do_prolongation) then
          do idim = 1, this%dim
            on_coarse_grid = on_coarse_grid .and. mod(index(idim), this%eta(idim)) == 0
          end do
        end if
        if (on_coarse_grid) then
          ! translate between fine and coarse grid
          index(:) = index(:) / this%eta(:)
          ipg_coarse = mesh_global_index_from_coords(mesh_coarse, index)
          call insert_global_point(mesh_coarse, ipg_coarse, ii)
        else if (this%do_restriction .or. this%do_prolongation) then
          ! now get all coarse points that surround the fine point
          ! start always from the same corner of the cube
          ! need a cube of size of the interpolation level, so shift to the corner of that cube
          index(:) = floor(real(index(:)) / this%eta(:)) - (this%interpolation_level-1)/2
          cube_size = 1+this%interpolation_level
          do is = 1, cube_size**this%dim
            ! get index of coarse point in the surrounding parallelepiped
            ! using the corresponding index as number in base cube_size
            call convert_to_base(is-1, cube_size, shift)
            index_stencil(:) = index(:) + shift(:)
            ipg_coarse = mesh_global_index_from_coords(mesh_coarse, index_stencil)
            call insert_global_point(mesh_coarse, ipg_coarse, ii)
          end do
        end if
      end do
      call lihash_end(global_indices)
    else
      ii = 0
      do ip_out = 1, mesh_out%np
        call mesh_local_index_to_coords(mesh_out, ip_out, index)
        x_out = mesh_out%coord_system%to_cartesian(real(index, real64)*mesh_out%spacing)
        x_in = mesh_in%coord_system%from_cartesian(x_out)/mesh_in%spacing
        on_in_grid = all(abs((x_in - nint(x_in))) < 10.0_real64*M_EPSILON)
        if (on_in_grid) then
          ! translate between out and in grid
          ipg_in = mesh_global_index_from_coords(mesh_in, nint(x_in))
          call insert_global_point(mesh_in, ipg_in, ii)
        else
          ! now get all points in the input mesh that surround the point in the output mesh
          ! start always from the same corner of the cube
          ! need a cube of size of the interpolation level, so shift to the corner of that cube
          index(:) = floor(x_in) - (max(this%interpolation_level, 1)-1)/2
          cube_size = 1+max(this%interpolation_level, 1)
          do is = 1, cube_size**this%dim
            ! get index of coarse point in the surrounding parallelepiped
            ! using the corresponding index as number in base cube_size
            call convert_to_base(is-1, cube_size, shift)
            index_stencil(:) = index(:) + shift(:)
            ipg_in = mesh_global_index_from_coords(mesh_in, index_stencil)
            call insert_global_point(mesh_in, ipg_in, ii)
          end do
        end if
      end do
      call lihash_end(global_indices)
    end if

    SAFE_ALLOCATE(global_indices_in(ii))
    SAFE_ALLOCATE(partition_in(ii))
    global_indices_in(1:ii) = global_indices_in_tmp(1:ii)
    SAFE_DEALLOCATE_A(global_indices_in_tmp)

    if (mesh_coarse%parallel_in_domains) then
      ! determine where the points of the coarse mesh are stored
      call partition_get_partition_number(mesh_coarse%partition, ii, global_indices_in, partition_in)
    else
      partition_in = 1
    end if

    ! Init partition transfer
    ! need inverse direction for prolongations
    call partition_transfer_init(this%partition_transfer, ii, global_indices_in, &
      mesh_in%mpi_grp, mesh_out%mpi_grp, partition_in, &
      this%nsend, this%nrecv, order_in, order_out, inverse=this%do_prolongation)

    ! we need to transfer the global indices of the boundary point and the corresponding
    ! inner point for periodic meshes
    if (space_in%is_periodic()) then
      global_indices_in(1:ii) = global_indices_in_original(1:ii)
      call partition_transfer_init(this%partition_transfer, ii, global_indices_in, &
        mesh_in%mpi_grp, mesh_out%mpi_grp, partition_in, &
        this%nsend, this%nrecv, order_in_original, order_out_original, inverse=this%do_prolongation)
    else
      order_in_original = order_in
      order_out_original = order_out
    end if

    ! convert from global to local indices
    SAFE_ALLOCATE(this%order_in(1:this%nsend))
    SAFE_ALLOCATE(this%order_in_global(1:this%nsend))
    SAFE_ALLOCATE(this%order_out(1:this%nrecv))
    SAFE_ALLOCATE(this%order_out_global(1:this%nrecv))

    ! get the mapping for mesh_in in the order given by the global indices of mesh_out
    do ip_in = 1, this%nsend
      if (this%do_restriction) then
        ! in this case, we have an index on mesh_out
        this%order_in_global(ip_in) = order_in_original(ip_in)
        this%order_in(ip_in) = 0
      else if (this%do_prolongation) then
        ! in this case, we have an index on mesh_in
        this%order_in_global(ip_in) = order_in(ip_in)
        this%order_in(ip_in) = mesh_global2local(mesh_in, this%order_in_global(ip_in))
      else
        ! convert back from the global index of mesh_out to a local index of mesh_in
        call mesh_global_index_to_coords(mesh_out, order_in_original(ip_in), index)
        this%order_in_global(ip_in) = mesh_global_index_from_coords(mesh_in, index)
        this%order_in(ip_in) = mesh_global2local(mesh_in, this%order_in_global(ip_in))
      end if
      if (.not. this%do_restriction) then
        if (this%order_in(ip_in) == 0) then
          write(message(1),'(a,i10,a,i10)') "Error in regridding part 1: mesh point ", &
            this%order_in(ip_in), " is not stored in partition ", mesh_in%pv%partno
          call messages_fatal(1, namespace=namespace)
        end if
      end if
    end do

    ! for mapping back to the global grid after the transfer, convert to local indices of mesh_out
    do ip_out = 1, this%nrecv
      if (.not. this%do_prolongation) then
        this%order_out_global(ip_out) = order_out(ip_out)
        this%order_out(ip_out) = mesh_global2local(mesh_out, order_out(ip_out))
      else
        ! store the global index of mesh_in
        this%order_out_global(ip_out) = order_out_original(ip_out)
        this%order_out(ip_out) = 0
      end if
      if (.not. this%do_prolongation) then
        if (this%order_out(ip_out) == 0) then
          write(message(1),'(a,i10,a,i10)') "Error in regridding part 2: mesh point ", &
            this%order_out(ip_out), " is not stored in partition ", mesh_out%pv%partno
          call messages_fatal(1, namespace=namespace)
        end if
      end if
    end do

    ! remove rotations for affine coordinates because they can lead to singularities in the
    ! computation of the weights; the weights are independent of rotations
    ! remove the rotation for the coarse mesh and use the same rotation matrix for the fine mesh
    select type(coord_system => mesh_coarse%coord_system)
    class is (affine_coordinates_t)
      scaling = lalg_remove_rotation(this%dim, coord_system%basis%vectors)
      coarse_coord => affine_coordinates_t(namespace, this%dim, scaling)
      ! get rotation matrix from polar decomposition A = U P -> U = A P^{-1}
      call lalg_inverse(this%dim, scaling, "dir")
      rotation = matmul(coord_system%basis%vectors, scaling)
    class default
      coarse_coord => mesh_coarse%coord_system
      rotation = diagonal_matrix(this%dim, M_ONE)
    end select
    select type(coord_system => mesh_fine%coord_system)
    class is (affine_coordinates_t)
      ! here we compute the new basis as U^T A
      fine_coord => affine_coordinates_t(namespace, this%dim, &
        matmul(transpose(rotation), coord_system%basis%vectors))
    class default
      fine_coord => mesh_fine%coord_system
    end select
    ! create transfer stencil
    if (.not. this%generic_interpolation) then
      if (this%do_restriction .or. this%do_prolongation) then
        ! determine weights by requiring a set of polynomials to be exactly reproduced
        ! on the corners of the polyhedron generated by neighbouring grid points
        call get_transfer_stencil(this%transfer_stencil, this%dim, this%eta(1), this%interpolation_level)
        SAFE_ALLOCATE(this%weights(1:this%transfer_stencil%size))
        ! get M matrix
        cube_size = 1+this%interpolation_level
        SAFE_ALLOCATE(M(1:cube_size**this%dim, 1:cube_size**this%dim))
        SAFE_ALLOCATE(p(1:cube_size**this%dim))
        do ii = 1, cube_size**this%dim
          ! get index of coarse point in the surrounding parallelepiped
          ! using the corresponding index as number in base cube_size
          call convert_to_base(ii-1, cube_size, shift)
          x_primitive = (shift - (this%interpolation_level-1)/2)*mesh_coarse%spacing
          x_coarse = coarse_coord%to_cartesian(x_primitive)
          ! fill one row of M with the polynomials
          M(ii, :) = evaluate_polynomials(x_coarse)
        end do
        call lalg_inverse(cube_size**this%dim, M, "dir")
        do ii = 1, this%transfer_stencil%size
          ! Cartesian coordinates of point in fine mesh
          shift = -floor(real(this%transfer_stencil%points(:, ii), real64)/real(this%eta, real64))
          x_fine = real(this%transfer_stencil%points(:, ii), real64) + shift*this%eta
          x_fine = fine_coord%to_cartesian(x_fine * mesh_fine%spacing)
          ! get polynomials
          p = evaluate_polynomials(x_fine)
          ! determine coefficients for interpolation
          p = matmul(p, M)
          ! take the coefficient for the correct coarse point
          call convert_from_base(shift+(this%interpolation_level-1)/2, cube_size, i_coarse)
          i_coarse = i_coarse + 1
          this%weights(ii) = p(i_coarse)
        end do
        ! for the restriction, we need to take into account the ratio of volume elements
        ! like this, restriction is adjoint to prolongation
        if (this%do_restriction) then
          this%weights(:) = this%weights(:) * mesh_fine%volume_element/mesh_coarse%volume_element
        end if
        SAFE_DEALLOCATE_A(M)
        SAFE_DEALLOCATE_A(p)
      end if
    else
      ! compute interpolation weights for each point
      ! we always need at least a cube of size 2
      cube_size = 1+max(this%interpolation_level, 1)
      SAFE_ALLOCATE(this%weights_generic(1:cube_size**this%dim, 1:mesh_out%np))
      this%weights_generic = M_ZERO
      SAFE_ALLOCATE(this%stencil_points(1:cube_size**this%dim, 1:mesh_out%np))
      this%stencil_points = -1

      call lihash_init(global_indices)
      ! create hash map for the global indices of the input mesh
      do ip_out = 1, this%nrecv
        ! it is called ip_out because it is the output of the partition_transfer
        call lihash_insert(global_indices, this%order_out_global(ip_out), ip_out)
      end do

      SAFE_ALLOCATE(M(1:cube_size**this%dim, 1:cube_size**this%dim))
      SAFE_ALLOCATE(p(1:cube_size**this%dim))
      do ip_out = 1, mesh_out%np
        call mesh_local_index_to_coords(mesh_out, ip_out, index)
        x_out = fine_coord%to_cartesian(real(index, real64)*mesh_out%spacing)
        x_in = coarse_coord%from_cartesian(x_out)/mesh_in%spacing
        on_in_grid = all(abs((x_in - nint(x_in))) < 10.0_real64*M_EPSILON)
        if (on_in_grid) then
          ! the points coincide, so use only one point in the stencil with a weight of 1
          ipg_in = mesh_global_index_from_coords(mesh_in, nint(x_in))
          this%stencil_points(1, ip_out) = lihash_lookup(global_indices, ipg_in, found)
          this%weights_generic(1, ip_out) = M_ONE
        else
          ! now get all in points that surround the out point
          ! start always from the same corner of the cube
          ! need a cube of size of the interpolation level, so shift to the corner of that cube
          index(:) = floor(x_in) - (max(this%interpolation_level, 1)-1)/2
          rmin = M_HUGE
          ip_mindist = -1
          do is = 1, cube_size**this%dim
            ! get index of coarse point in the surrounding parallelepiped
            ! using the corresponding index as number in base cube_size
            call convert_to_base(is-1, cube_size, shift)
            index_stencil(:) = index(:) + shift(:)
            ipg_in = mesh_global_index_from_coords(mesh_in, index_stencil)
            ! the lookup will return -1 for points that are not locally available in the input mesh
            ! those will be ignored during the regridding
            ! the points we need from other processes are already in this hash table
            this%stencil_points(is, ip_out) = lihash_lookup(global_indices, ipg_in, found)
            x_in = coarse_coord%to_cartesian(index_stencil*mesh_in%spacing)
            ! compute M
            ! fill one row of M with the polynomials
            M(is, :) = evaluate_polynomials(x_in)
            ! determine closest point for nearest neighbor interpolation
            r_current = norm2(x_in - x_out)
            ! avoid rounding errors by requesting the distance to be smaller
            ! by a certain amount
            if (r_current < rmin - 10.*M_EPSILON .and. found) then
              rmin = r_current
              ip_mindist = this%stencil_points(is, ip_out)
            end if
          end do
          if (this%interpolation_level == NEAREST_NEIGHBOR) then
            ! only use one point for th nearest-neighbor interpolation
            this%weights_generic(1, ip_out) = M_ONE
            this%stencil_points(:, ip_out) = -1
            this%stencil_points(1, ip_out) = ip_mindist
          else
            call lalg_inverse(cube_size**this%dim, M, "dir")
            ! get polynomials at point to be interpolated
            p = evaluate_polynomials(x_out)
            ! determine coefficients for interpolation
            this%weights_generic(:, ip_out) = matmul(p, M)
          end if
        end if
      end do
      SAFE_DEALLOCATE_A(M)
      SAFE_DEALLOCATE_A(p)
      call lihash_end(global_indices)
    end if

    select case (this%scale_norms)
    case(SCALE_NONE)
      ! do nothing
    case(SCALE_NORM2)
      ! create overlap map
      SAFE_ALLOCATE(this%overlap_map(1:this%mesh_in%np))
      this%overlap_map = this%mesh_out%box%contains_points(this%mesh_in%np, this%mesh_in%x)
    end select

    SAFE_DEALLOCATE_A(partition_in)
    SAFE_DEALLOCATE_A(global_indices_in)
    SAFE_DEALLOCATE_A(order_in)
    SAFE_DEALLOCATE_A(order_out)

    ! deallocate coordinate systems correctly
    select type(coord_system => mesh_coarse%coord_system)
    class is (affine_coordinates_t)
      SAFE_DEALLOCATE_P(coarse_coord)
    class default
      nullify(coarse_coord)
    end select
    select type(coord_system => mesh_fine%coord_system)
    class is (affine_coordinates_t)
      SAFE_DEALLOCATE_P(fine_coord)
    class default
      nullify(fine_coord)
    end select

    POP_SUB_WITH_PROFILE(regridding_init)
  contains
    subroutine insert_global_point(mesh, ipg, ii)
      class(mesh_t),  intent(in)    :: mesh
      integer(int64), intent(in)    :: ipg
      integer,        intent(inout) :: ii

      integer :: ip_tmp

      ! take boundary into account for periodic meshes
      if (ipg > 0 .and. ipg <= mesh%np_global .or. &
        ipg > 0 .and. space_in%is_periodic()) then
        ip_tmp = lihash_lookup(global_indices, ipg, found)
        if (.not. found) then
          ii = ii + 1
          ! enlarge array if necessary
          if (ii >= size_array) then
            size_array = size_array * 2
            call make_array_larger(global_indices_in_tmp, size_array)
            call make_array_larger(global_indices_in_original, size_array)
          end if
          if (ipg <= mesh%np_global) then
            global_indices_in_tmp(ii) = ipg
            global_indices_in_original(ii) = ipg
          else if (ipg > mesh%np_global) then
            global_indices_in_tmp(ii) = mesh_periodic_point_global(mesh, space_in, ipg)
            global_indices_in_original(ii) = ipg
          end if
          call lihash_insert(global_indices, ipg, ii)
        end if
      end if
    end subroutine insert_global_point

    function evaluate_polynomials(x)
      real(real64), intent(in) :: x(1:this%dim)
      real(real64) :: evaluate_polynomials(1:(1+this%interpolation_level)**this%dim)
      integer :: cube_size, i, j, shift(1:this%dim)

      evaluate_polynomials = M_ONE
      cube_size = 1+this%interpolation_level
      do i = 1, cube_size**this%dim
        ! get index of coarse point in the surrounding parallelepiped
        ! using the corresponding index as number in base cube_size
        call convert_to_base(i-1, cube_size, shift)
        ! evaluate polynomials with the corresponding powers
        do j = 1, this%dim
          evaluate_polynomials(i) = evaluate_polynomials(i) * x(j)**shift(j)
        end do
      end do
    end function evaluate_polynomials

    subroutine get_transfer_stencil(this, dim, eta, order)
      type(stencil_t), intent(inout) :: this
      integer,         intent(in)    :: dim
      integer,         intent(in)    :: eta
      integer,         intent(in)    :: order

      integer :: i, offset, cube_size, shift(1:dim)

      PUSH_SUB(get_transfer_stencil)

      cube_size = eta*(order + 1) - 1
      call stencil_allocate(this, dim, cube_size**dim)

      offset = eta * (order/2 + 1) - 1
      do i = 1, cube_size**dim
        call convert_to_base(i-1, cube_size, shift)
        this%points(:, i) = shift(:) - offset
      end do

      call stencil_init_center(this)

      POP_SUB(get_transfer_stencil)
    end subroutine get_transfer_stencil
  end function regridding_init

  subroutine regridding_finalize(this)
    type(regridding_t), intent(inout) :: this

    PUSH_SUB(regridding_finalize)

    call partition_transfer_end(this%partition_transfer)
    SAFE_DEALLOCATE_A(this%eta)
    SAFE_DEALLOCATE_A(this%order_in)
    SAFE_DEALLOCATE_A(this%order_in_global)
    SAFE_DEALLOCATE_A(this%order_out)
    SAFE_DEALLOCATE_A(this%order_out_global)
    SAFE_DEALLOCATE_A(this%weights)
    select case (this%scale_norms)
    case(SCALE_NONE)
      ! do nothing
    case(SCALE_NORM2)
      SAFE_DEALLOCATE_A(this%overlap_map)
    end select

    POP_SUB(regridding_finalize)
  end subroutine regridding_finalize

#include "real.F90"
#include "regridding_inc.F90"
#include "undef.F90"

#include "complex.F90"
#include "regridding_inc.F90"
#include "undef.F90"

end module regridding_oct_m
