!! Copyright (C) 2009 X. Andrade
!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!

#include "global.h"

module nonlocal_pseudopotential_oct_m
  use accel_oct_m
  use batch_oct_m
  use batch_ops_oct_m
  use blas_oct_m
  use comm_oct_m
  use debug_oct_m
  use electron_space_oct_m
  use epot_oct_m
  use global_oct_m
  use hardware_oct_m
  use hgh_projector_oct_m
  use kb_projector_oct_m
  use lalg_basic_oct_m
  use math_oct_m
  use mesh_oct_m
  use messages_oct_m
  use mpi_oct_m
  use profiling_oct_m
  use projector_oct_m
  use projector_matrix_oct_m
  use ps_oct_m
  use rkb_projector_oct_m
  use space_oct_m
  use states_elec_oct_m
  use states_elec_dim_oct_m
  use submesh_oct_m
  use types_oct_m
  use wfs_elec_oct_m

  implicit none

  private

  public ::                                         &
    nonlocal_pseudopotential_t,                     &
    projection_t

  !> @brief nonlocal part of the pseudopotential
  !!
  type nonlocal_pseudopotential_t
    private
    type(projector_matrix_t), allocatable, public :: projector_matrices(:) !< projectors
    integer,                               public :: nprojector_matrices         !< number of projector matrices
    logical,                               public :: apply_projector_matrices    !< flag whether to apply projection matrices
    logical,                               public :: has_non_local_potential     !< flag whether non-local potential exists
    integer                                       :: full_projection_size
    integer,                               public :: max_npoints
    integer,                               public :: total_points
    integer                                       :: max_nprojs
    logical                                       :: projector_mix
    complex(real64),          allocatable, public :: projector_phases(:, :, :, :)
    integer,                  allocatable, public :: projector_to_atom(:)
    integer                                       :: nregions    !< number of non-overlapping regions
    integer,                  allocatable         :: regions(:)  !< list of atomd in each region.
    integer,                               public :: nphase      !< @brief number of phases:
    !!
    !!                                                              * 0 for finite systems without magnetic fields
    !!                                                              * 1 for periodic systems or magnetic fields
    !!                                                              * 3 for spiral boundary conditions
    type(accel_mem_t)                             :: buff_offsets
    type(accel_mem_t)                             :: buff_matrices
    type(accel_mem_t)                             :: buff_maps
    type(accel_mem_t)                             :: buff_scals
    type(accel_mem_t)                             :: buff_position
    type(accel_mem_t)                             :: buff_pos
    type(accel_mem_t)                             :: buff_invmap
    type(accel_mem_t),                     public :: buff_projector_phases
    type(accel_mem_t)                             :: buff_mix
    logical                                       :: projector_self_overlap  !< if .true. some projectors overlap with themselves
    real(real64),             pointer,     public :: spin(:,:,:) => null()
  contains

    procedure :: init => nonlocal_pseudopotential_init
    !< @copydoc nonlocal_pseudopotential_oct_m::nonlocal_pseudopotential_init
    procedure :: build => nonlocal_pseudopotential_build_proj
    !< @copydoc nonlocal_pseudopotential_oct_m::nonlocal_pseudopotential_build_proj
    procedure :: end => nonlocal_pseudopotential_destroy_proj
    !< @copydoc nonlocal_pseudopotential_oct_m::nonlocal_pseudopotential_destroy_proj
    procedure :: has_self_overlap => nonlocal_pseudopotential_self_overlap
    !< @copydoc nonlocal_pseudopotential_oct_m::nonlocal_pseudopotential_self_overlap
    procedure :: dstart => dnonlocal_pseudopotential_start
    !< @copydoc nonlocal_pseudopotential_oct_m::dnonlocal_pseudopotential_nlocal_start
    procedure :: zstart => znonlocal_pseudopotential_start
    !< @copydoc nonlocal_pseudopotential_oct_m::znonlocal_pseudopotential_nlocal_start
    procedure :: dfinish => dnonlocal_pseudopotential_finish
    !< @copydoc nonlocal_pseudopotential_oct_m::dnonlocal_pseudopotential_nlocal_finish
    procedure :: zfinish => znonlocal_pseudopotential_finish
    !< @copydoc nonlocal_pseudopotential_oct_m::znonlocal_pseudopotential_nlocal_finish
    procedure :: dforce => dnonlocal_pseudopotential_force
    !< @copydoc nonlocal_pseudopotential_oct_m::dnonlocal_pseudopotential_force
    procedure :: zforce => znonlocal_pseudopotential_force
    !< @copydoc nonlocal_pseudopotential_oct_m::znonlocal_pseudopotential_force
    procedure :: dposition_commutator => dnonlocal_pseudopotential_position_commutator
    !< @copydoc nonlocal_pseudopotential_oct_m::dnonlocal_pseudopotential_position_commutator
    procedure :: zposition_commutator => znonlocal_pseudopotential_position_commutator
    !< @copydoc nonlocal_pseudopotential_oct_m::znonlocal_pseudopotential_position_commutator
    procedure :: dr_vn_local => dnonlocal_pseudopotential_r_vnlocal
    !< @copydoc nonlocal_pseudopotential_oct_m::dnonlocal_pseudopotential_r_vnl
    procedure :: zr_vn_local => znonlocal_pseudopotential_r_vnlocal
    !< @copydoc nonlocal_pseudopotential_oct_m::znonlocal_pseudopotential_r_vnl
  end type nonlocal_pseudopotential_t

  !> @brief Class for projections of wave functions
  !
  type projection_t
    private
    real(real64), allocatable     :: dprojection(:, :) !< real array, dimensions (1:nst_linear, 1:this%full_projection_size)
    complex(real64), allocatable     :: zprojection(:, :) !< real array, dimensions (1:nst_linear, 1:this%full_projection_size)
    type(accel_mem_t)      :: buff_projection
    type(accel_mem_t)      :: buff_spin_to_phase
  end type projection_t

contains

  ! ---------------------------------------------------------
  !>@brief initialize the nonlocal_pseudopotential_t object
  !!
  subroutine nonlocal_pseudopotential_init(this)
    class(nonlocal_pseudopotential_t), intent(inout) :: this

    PUSH_SUB(nonlocal_pseudopotential_init)

    this%apply_projector_matrices = .false.
    this%has_non_local_potential = .false.
    this%nprojector_matrices = 0

    this%projector_self_overlap = .false.

    POP_SUB(nonlocal_pseudopotential_init)
  end subroutine nonlocal_pseudopotential_init


  !--------------------------------------------------------
  !>@brief Destroy the data of nonlocal_pseudopotential_t
  subroutine nonlocal_pseudopotential_destroy_proj(this)
    class(nonlocal_pseudopotential_t), intent(inout) :: this

    integer :: iproj

    PUSH_SUB(nonlocal_pseudopotential_destroy_proj)

    if (allocated(this%projector_matrices)) then

      if (accel_is_enabled()) then
        call accel_release_buffer(this%buff_offsets)
        call accel_release_buffer(this%buff_matrices)
        call accel_release_buffer(this%buff_maps)
        call accel_release_buffer(this%buff_scals)
        call accel_release_buffer(this%buff_position)
        call accel_release_buffer(this%buff_pos)
        call accel_release_buffer(this%buff_invmap)
        if (this%projector_mix) call accel_release_buffer(this%buff_mix)
        if (allocated(this%projector_phases)) call accel_release_buffer(this%buff_projector_phases)
      end if

      do iproj = 1, this%nprojector_matrices
        call projector_matrix_deallocate(this%projector_matrices(iproj))
      end do
      SAFE_DEALLOCATE_A(this%regions)
      SAFE_DEALLOCATE_A(this%projector_matrices)
      SAFE_DEALLOCATE_A(this%projector_phases)
      SAFE_DEALLOCATE_A(this%projector_to_atom)
    end if

    POP_SUB(nonlocal_pseudopotential_destroy_proj)
  end subroutine nonlocal_pseudopotential_destroy_proj

  !-----------------------------------------------------------------
  !> @brief build the projectors for the application of pseudo-potentials
  !!
  !! This routine also determines the regions for the projectors
  !! and whether any projector overlaps with itself in a periodic
  !! system.
  !
  subroutine nonlocal_pseudopotential_build_proj(this, space, mesh, epot)
    class(nonlocal_pseudopotential_t), target, intent(inout) :: this
    class(space_t),                         intent(in)    :: space
    class(mesh_t),                    intent(in)    :: mesh
    type(epot_t),             target, intent(in)    :: epot

    integer :: iatom, iproj, ll, lmax, lloc, mm, ic, jc
    integer :: nmat, imat, ip, iorder
    integer :: nregion, jatom, katom, iregion
    integer, allocatable :: order(:), head(:), region_count(:)
    logical, allocatable :: atom_counted(:)
    logical :: overlap
    type(projector_matrix_t), pointer :: pmat
    type(kb_projector_t),     pointer :: kb_p
    type(rkb_projector_t),    pointer :: rkb_p
    type(hgh_projector_t),    pointer :: hgh_p

    PUSH_SUB(nonlocal_pseudopotential_build_proj)

    call profiling_in("ATOM_COLORING")

    ! this is most likely a very inefficient algorithm, O(natom**2) or
    ! O(natom**3), probably it should be replaced by something better.

    SAFE_ALLOCATE(order(1:epot%natoms))         ! order(iregion) = ?
    SAFE_ALLOCATE(head(1:epot%natoms + 1))      ! head(iregion) points to the first atom in region iregion
    SAFE_ALLOCATE(region_count(1:epot%natoms))  ! region_count(iregion): number of atoms in that region
    SAFE_ALLOCATE(atom_counted(1:epot%natoms))

    this%projector_self_overlap = .false.
    atom_counted = .false.
    order = -1

    head(1) = 1
    nregion = 0
    do
      nregion = nregion + 1
      ASSERT(nregion <= epot%natoms)

      region_count(nregion) = 0

      do iatom = 1, epot%natoms
        if (atom_counted(iatom)) cycle

        overlap = .false.

        if (.not. projector_is(epot%proj(iatom), PROJ_NONE)) then
          ASSERT(associated(epot%proj(iatom)%sphere%mesh))
          do jatom = 1, region_count(nregion)
            katom = order(head(nregion) + jatom - 1)
            if (projector_is(epot%proj(katom), PROJ_NONE)) cycle
            overlap = submesh_overlap(epot%proj(iatom)%sphere, epot%proj(katom)%sphere, space)
            if (overlap) exit
          end do
        end if

        if (.not. overlap) then
          ! iatom did not overlap with any previously counted atoms:
          ! iatom will be added to the current region
          region_count(nregion) = region_count(nregion) + 1
          order(head(nregion) - 1 + region_count(nregion)) = iatom
          atom_counted(iatom) = .true.
        end if

      end do

      head(nregion + 1) = head(nregion) + region_count(nregion)

      if (all(atom_counted)) exit
    end do

    SAFE_DEALLOCATE_A(atom_counted)
    SAFE_DEALLOCATE_A(region_count)

    call messages_write('The atoms can be separated in ')
    call messages_write(nregion)
    call messages_write(' non-overlapping groups.')
    call messages_info(debug_only=.true.)

    do iregion = 1, nregion
      do iatom = head(iregion), head(iregion + 1) - 1
        if (.not. projector_is(epot%proj(order(iatom)), PROJ_KB)) cycle
        do jatom = head(iregion), iatom - 1
          if (.not. projector_is(epot%proj(order(jatom)), PROJ_KB)) cycle
          ASSERT(.not. submesh_overlap(epot%proj(order(iatom))%sphere, epot%proj(order(jatom))%sphere, space))
        end do
      end do
    end do

    call profiling_out("ATOM_COLORING")

    ! deallocate previous projectors
    call this%end()

    ! count projectors
    this%nprojector_matrices = 0
    this%apply_projector_matrices = .false.
    this%has_non_local_potential = .false.
    this%nregions = nregion

    !We determine if we have only local potential or not.
    do iorder = 1, epot%natoms
      iatom = order(iorder)

      if (.not. projector_is_null(epot%proj(iatom))) then
        this%has_non_local_potential = .true.
        exit
      end if
    end do

    do iorder = 1, epot%natoms
      iatom = order(iorder)

      if (.not. projector_is_null(epot%proj(iatom))) then
        this%nprojector_matrices = this%nprojector_matrices + 1
        this%apply_projector_matrices = .true.
      end if
    end do

    ! This is currently the only not supported case
    if (mesh%use_curvilinear)  this%apply_projector_matrices = .false.

    if (.not. this%apply_projector_matrices) then
      SAFE_DEALLOCATE_A(order)
      SAFE_DEALLOCATE_A(head)

      POP_SUB(nonlocal_pseudopotential_build_proj)
      return
    end if


    SAFE_ALLOCATE(this%projector_matrices(1:this%nprojector_matrices))
    SAFE_ALLOCATE(this%regions(1:this%nprojector_matrices + 1))
    SAFE_ALLOCATE(this%projector_to_atom(1:epot%natoms))

    this%full_projection_size = 0
    this%regions(this%nregions + 1) = this%nprojector_matrices + 1

    this%projector_mix = .false.

    iproj = 0
    do iregion = 1, this%nregions
      this%regions(iregion) = iproj + 1
      do iorder = head(iregion), head(iregion + 1) - 1

        iatom = order(iorder)

        if (projector_is(epot%proj(iatom), PROJ_NONE)) cycle

        iproj = iproj + 1

        pmat => this%projector_matrices(iproj)

        this%projector_to_atom(iproj) = iatom

        lmax = epot%proj(iatom)%lmax
        lloc = epot%proj(iatom)%lloc

        if (projector_is(epot%proj(iatom), PROJ_KB)) then

          ! count the number of projectors for this matrix
          nmat = 0
          do ll = 0, lmax
            if (ll == lloc) cycle
            do mm = -ll, ll
              nmat = nmat + epot%proj(iatom)%kb_p(ll, mm)%n_c
            end do
          end do

          call projector_matrix_allocate(pmat, nmat, epot%proj(iatom)%sphere, has_mix_matrix = .false.)

          ! generate the matrix
          pmat%dprojectors = M_ZERO
          imat = 1
          do ll = 0, lmax
            if (ll == lloc) cycle
            do mm = -ll, ll
              kb_p =>  epot%proj(iatom)%kb_p(ll, mm)
              do ic = 1, kb_p%n_c
                call lalg_copy(pmat%npoints, kb_p%p(:, ic), pmat%dprojectors(:, imat))
                pmat%scal(imat) = kb_p%e(ic)*mesh%vol_pp(1)
                imat = imat + 1
              end do
            end do
          end do

          this%projector_self_overlap = this%projector_self_overlap .or. epot%proj(iatom)%sphere%overlap

        else if (projector_is(epot%proj(iatom), PROJ_HGH)) then

          this%projector_mix = .true.

          ! count the number of projectors for this matrix
          nmat = 0
          do ll = 0, lmax
            if (ll == lloc) cycle
            do mm = -ll, ll
              nmat = nmat + 3
            end do
          end do

          call projector_matrix_allocate(pmat, nmat, epot%proj(iatom)%sphere, &
            has_mix_matrix = .true., is_cmplx = (epot%reltype == SPIN_ORBIT))

          ! generate the matrix
          if (epot%reltype == SPIN_ORBIT) then
            pmat%zprojectors = M_ZERO
            pmat%zmix = M_ZERO
          else
            pmat%dprojectors = M_ZERO
            pmat%dmix = M_ZERO
          end if

          imat = 1
          do ll = 0, lmax
            if (ll == lloc) cycle
            do mm = -ll, ll
              hgh_p =>  epot%proj(iatom)%hgh_p(ll, mm)

              ! HGH pseudos mix different components, so we need to
              ! generate a matrix that mixes the projections
              if (epot%reltype == SPIN_ORBIT .or. epot%reltype == FULLY_RELATIVISTIC_ZORA) then
                do ic = 1, 3
                  do jc = 1, 3
                    pmat%zmix(imat - 1 + ic, imat - 1 + jc, 1) = hgh_p%h(ic, jc) + M_HALF*mm*hgh_p%k(ic, jc)
                    pmat%zmix(imat - 1 + ic, imat - 1 + jc, 2) = hgh_p%h(ic, jc) - M_HALF*mm*hgh_p%k(ic, jc)

                    if (mm < ll) then
                      pmat%zmix(imat - 1 + ic, imat + 3 - 1 + jc, 3) = M_HALF*hgh_p%k(ic, jc) * &
                        sqrt(real(ll*(ll+1)-mm*(mm+1), real64))
                    end if

                    if (-mm < ll) then
                      pmat%zmix(imat - 1 + ic, imat - 3 - 1 + jc, 4) = M_HALF*hgh_p%k(ic, jc) * &
                        sqrt(real(ll*(ll+1)-mm*(mm-1), real64))
                    end if
                  end do
                end do
              else
                do ic = 1, 3
                  do jc = 1, 3
                    pmat%dmix(imat - 1 + ic, imat - 1 + jc) = hgh_p%h(ic, jc)
                  end do
                end do
              end if

              do ic = 1, 3
                if (epot%reltype == SPIN_ORBIT .or. epot%reltype == FULLY_RELATIVISTIC_ZORA) then
                  call lalg_copy(pmat%npoints, hgh_p%zp(:, ic), pmat%zprojectors(:, imat))
                else
                  call lalg_copy(pmat%npoints, hgh_p%dp(:, ic), pmat%dprojectors(:, imat))
                end if
                pmat%scal(imat) = mesh%volume_element
                imat = imat + 1
              end do

            end do
          end do

          this%projector_self_overlap = this%projector_self_overlap .or. epot%proj(iatom)%sphere%overlap

        else if (projector_is(epot%proj(iatom), PROJ_RKB)) then
          ASSERT(epot%reltype == SPIN_ORBIT)

          this%projector_mix = .true.

          ! count the number of projectors for this matrix
          nmat = 0
          if (lloc /= 0) nmat = nmat + epot%proj(iatom)%kb_p(1, 1)%n_c

          do ll = 1, lmax
            if (ll == lloc) cycle
            do mm = -ll, ll
              nmat = nmat + epot%proj(iatom)%rkb_p(ll, mm)%n_c
            end do
          end do

          call projector_matrix_allocate(pmat, nmat, epot%proj(iatom)%sphere, &
            has_mix_matrix = .true., is_cmplx = .true.)

          pmat%zprojectors = M_ZERO
          pmat%zmix = M_ZERO

          imat = 1
          if (lloc /= 0) then
            kb_p => epot%proj(iatom)%kb_p(1, 1)

            do ic = 1, kb_p%n_c
              pmat%zmix(ic, ic, 1:2) = kb_p%e(ic)
              do ip = 1, pmat%npoints
                pmat%zprojectors(ip, ic) = kb_p%p(ip, ic)
              end do
              pmat%scal(ic) = mesh%volume_element
            end do
            imat = kb_p%n_c + 1
            nullify(kb_p)
          end if

          do ll = 1, lmax
            if (ll == lloc) cycle
            do mm = -ll, ll
              rkb_p =>  epot%proj(iatom)%rkb_p(ll, mm)

              ! See rkb_projector.F90 for understanding the indices
              do ic = 0, rkb_p%n_c/2-1
                pmat%zmix(imat + ic*2, imat + ic*2, 1) = rkb_p%f(ic*2+1, 1, 1)
                pmat%zmix(imat + ic*2, imat + ic*2, 2) = rkb_p%f(ic*2+1, 2, 2)

                pmat%zmix(imat + ic*2+1, imat + ic*2+1, 1) = rkb_p%f(ic*2+2, 1, 1)
                pmat%zmix(imat + ic*2+1, imat + ic*2+1, 2) = rkb_p%f(ic*2+2, 2, 2)

                if (mm < ll) then
                  pmat%zmix(imat + ic*2+rkb_p%n_c, imat + ic*2, 4) =  rkb_p%f(ic*2+1, 2, 1)
                  pmat%zmix(imat + ic*2+1+rkb_p%n_c, imat + ic*2+1, 4) =  rkb_p%f(ic*2+2, 2, 1)
                end if

                if (-mm < ll) then
                  pmat%zmix(imat + ic*2-rkb_p%n_c, imat + ic*2, 3) =  rkb_p%f(ic*2+1, 1, 2)
                  pmat%zmix(imat + ic*2+1-rkb_p%n_c, imat + ic*2+1, 3) =  rkb_p%f(ic*2+2, 1, 2)
                end if
              end do

              do ic = 1, rkb_p%n_c
                call lalg_copy(pmat%npoints, rkb_p%ket(:, ic, 1, 1), pmat%zprojectors(:, imat))
                pmat%scal(imat) = mesh%volume_element
                imat = imat + 1
              end do
            end do

            nullify(rkb_p)
          end do

          this%projector_self_overlap = this%projector_self_overlap .or. epot%proj(iatom)%sphere%overlap

        else
          cycle
        end if

        pmat%map => epot%proj(iatom)%sphere%map
        pmat%position => epot%proj(iatom)%sphere%rel_x

        pmat%regions = epot%proj(iatom)%sphere%regions

        this%full_projection_size = this%full_projection_size + pmat%nprojs

      end do
    end do

    if (mesh%parallel_in_domains) then
      call mesh%mpi_grp%allreduce_inplace(this%projector_self_overlap, 1, MPI_LOGICAL, MPI_LOR)
    end if

    SAFE_DEALLOCATE_A(order)
    SAFE_DEALLOCATE_A(head)

    this%total_points = 0
    this%max_npoints = 0
    this%max_nprojs = 0
    do imat = 1, this%nprojector_matrices
      pmat => this%projector_matrices(imat)

      this%max_npoints = max(this%max_npoints, pmat%npoints)
      this%max_nprojs = max(this%max_nprojs, pmat%nprojs)
      this%total_points = this%total_points + pmat%npoints
    end do

    if (accel_is_enabled()) call build_accel()

    POP_SUB(nonlocal_pseudopotential_build_proj)

  contains

    subroutine build_accel()

      integer              :: matrix_size, scal_size
      integer, allocatable :: cnt(:), invmap(:, :), invmap2(:), pos(:)
      integer, allocatable :: offsets(:, :)
      integer, parameter   :: OFFSET_SIZE = 6 ! also defined in share/opencl/projectors.cl
      integer, parameter   :: POINTS = 1, PROJS = 2, MATRIX = 3, MAP = 4, SCAL = 5, MIX = 6 ! update OFFSET_SIZE
      integer              :: ip, is, ii, ipos, mix_offset

      PUSH_SUB(nonlocal_pseudopotential_build_proj.build_accel)

      SAFE_ALLOCATE(offsets(1:OFFSET_SIZE, 1:this%nprojector_matrices))
      SAFE_ALLOCATE(cnt(1:mesh%np))

      cnt = 0

      ! Here we construct the offsets for accessing various arrays within the GPU kernels.
      ! The offset(:,:) array contains a number of sizes and offsets, describing how to address the arrays.
      ! This allows to transfer all these numbers to the GPU in one memory transfer.
      !
      ! For each projection matrix (addressed by imap), we have:
      !
      ! offset(POINTS, imap) : number of points of the sphere imap
      ! offset(PROJS, imap)  : number of projectors for imap
      ! offset(MATRIX, imap) : address offset: cumulative of pmat%npoints * pmat%nprojs
      ! offset(MAP, imap)    : address offset: cumulative of pmat%npoints for each imap
      ! offset(SCAL, imap)   : address_offset: cumulative of pmat%nprojs
      ! offset(MIX, imap)    : address_offset: cumulative of pmat%nprojs**2 or 4*pmat%nprojs**2 for complex mixing

      ! first we count
      matrix_size = 0
      this%total_points = 0
      scal_size = 0
      this%max_npoints = 0
      this%max_nprojs = 0
      mix_offset = 0
      do imat = 1, this%nprojector_matrices
        pmat => this%projector_matrices(imat)

        this%max_npoints = max(this%max_npoints, pmat%npoints)
        this%max_nprojs = max(this%max_nprojs, pmat%nprojs)

        offsets(POINTS, imat) = pmat%npoints
        offsets(PROJS, imat) = pmat%nprojs

        offsets(MATRIX, imat) = matrix_size
        matrix_size = matrix_size + pmat%npoints*pmat%nprojs

        offsets(MAP, imat) = this%total_points
        this%total_points = this%total_points + pmat%npoints

        offsets(SCAL, imat) = scal_size
        scal_size = scal_size + pmat%nprojs

        offsets(MIX, imat) = mix_offset
        if (allocated(pmat%dmix)) then
          mix_offset = mix_offset + pmat%nprojs**2
        else if (allocated(pmat%zmix)) then
          mix_offset = mix_offset + 4 * pmat%nprojs**2
        else
          offsets(MIX, imat) = -1
        end if

        do is = 1, pmat%npoints
          ip = pmat%map(is)
          cnt(ip) = cnt(ip) + 1
        end do
      end do

      SAFE_ALLOCATE(invmap(1:max(maxval(cnt), 1), 1:mesh%np))
      SAFE_ALLOCATE(invmap2(1:max(maxval(cnt)*mesh%np, 1)))
      SAFE_ALLOCATE(pos(1:mesh%np + 1))

      cnt = 0
      ii = 0
      do imat = 1, this%nprojector_matrices
        pmat => this%projector_matrices(imat)
        do is = 1, pmat%npoints
          ip = pmat%map(is)
          cnt(ip) = cnt(ip) + 1
          invmap(cnt(ip), ip) = ii
          ii = ii + 1
        end do
      end do

      ipos = 0
      pos(1) = 0
      do ip = 1, mesh%np
        do ii = 1, cnt(ip)
          ipos = ipos + 1
          invmap2(ipos) = invmap(ii, ip)
        end do
        pos(ip + 1) = ipos
      end do

      ! allocate
      if (this%projector_matrices(1)%is_cmplx) then
        call accel_create_buffer(this%buff_matrices, ACCEL_MEM_READ_ONLY, TYPE_CMPLX, matrix_size)
      else
        call accel_create_buffer(this%buff_matrices, ACCEL_MEM_READ_ONLY, TYPE_FLOAT, matrix_size)
      end if
      call accel_create_buffer(this%buff_maps, ACCEL_MEM_READ_ONLY, TYPE_INTEGER, this%total_points)
      call accel_create_buffer(this%buff_position, ACCEL_MEM_READ_ONLY, TYPE_FLOAT, 3*this%total_points)
      call accel_create_buffer(this%buff_scals, ACCEL_MEM_READ_ONLY, TYPE_FLOAT, scal_size)

      if (mix_offset > 0) then
        if (allocated(this%projector_matrices(1)%zmix)) then
          call accel_create_buffer(this%buff_mix, ACCEL_MEM_READ_ONLY, TYPE_CMPLX, mix_offset)
        else
          call accel_create_buffer(this%buff_mix, ACCEL_MEM_READ_ONLY, TYPE_FLOAT, mix_offset)
        end if
      end if

      ! now copy
      do imat = 1, this%nprojector_matrices
        pmat => this%projector_matrices(imat)
        ! in parallel some spheres might not have points
        if (pmat%npoints > 0) then
          if (pmat%is_cmplx) then
            call accel_write_buffer(this%buff_matrices, pmat%nprojs*pmat%npoints, pmat%zprojectors, offset = offsets(MATRIX, imat))
          else
            call accel_write_buffer(this%buff_matrices, pmat%nprojs*pmat%npoints, pmat%dprojectors, offset = offsets(MATRIX, imat))
          end if
          call accel_write_buffer(this%buff_maps, pmat%npoints, pmat%map, offset = offsets(MAP, imat))
          call accel_write_buffer(this%buff_position, 3*pmat%npoints, pmat%position, offset = 3*offsets(MAP, imat))
        end if
        call accel_write_buffer(this%buff_scals, pmat%nprojs, pmat%scal, offset = offsets(SCAL, imat))
        if (offsets(MIX, imat) /= -1) then
          if (allocated(pmat%zmix)) then
            call accel_write_buffer(this%buff_mix, 4*pmat%nprojs**2, pmat%zmix, offset = offsets(MIX, imat))
          else
            call accel_write_buffer(this%buff_mix, pmat%nprojs**2, pmat%dmix, offset = offsets(MIX, imat))
          end if
        end if
      end do

      ! write the offsets
      call accel_create_buffer(this%buff_offsets, ACCEL_MEM_READ_ONLY, TYPE_INTEGER, OFFSET_SIZE*this%nprojector_matrices)
      call accel_write_buffer(this%buff_offsets, OFFSET_SIZE*this%nprojector_matrices, offsets)

      ! the inverse map
      call accel_create_buffer(this%buff_pos, ACCEL_MEM_READ_ONLY, TYPE_INTEGER, mesh%np + 1)
      call accel_write_buffer(this%buff_pos, mesh%np + 1, pos)

      call accel_create_buffer(this%buff_invmap, ACCEL_MEM_READ_ONLY, TYPE_INTEGER, ipos)
      call accel_write_buffer(this%buff_invmap, ipos, invmap2)

      SAFE_DEALLOCATE_A(offsets)
      SAFE_DEALLOCATE_A(cnt)
      SAFE_DEALLOCATE_A(invmap)
      SAFE_DEALLOCATE_A(invmap2)
      SAFE_DEALLOCATE_A(pos)

      POP_SUB(nonlocal_pseudopotential_build_proj.build_accel)
    end subroutine build_accel

  end subroutine nonlocal_pseudopotential_build_proj

  ! ----------------------------------------------------------------------------------
  !>@brief Returns .true. if the Hamiltonian contains projectors, which overlap with themself
  !
  logical pure function nonlocal_pseudopotential_self_overlap(this) result(projector_self_overlap)
    class(nonlocal_pseudopotential_t), intent(in) :: this

    projector_self_overlap = this%projector_self_overlap
  end function nonlocal_pseudopotential_self_overlap

#include "undef.F90"
#include "real.F90"
#include "nonlocal_pseudopotential_inc.F90"

#include "undef.F90"
#include "complex.F90"
#include "nonlocal_pseudopotential_inc.F90"

end module nonlocal_pseudopotential_oct_m

!! Local Variables:
!! mode: f90
!! coding: utf-8
!! End:
