! ---------------------------------------------------------
!> The routine calculates the expectation value of the momentum
!! operator
!! \f[
!! <p> = < phi*(ist, k) | -i \nabla | phi(ist, ik) >
!!\f]
! ---------------------------------------------------------
subroutine X(elec_momentum_me)(gr, st, space, kpoints, momentum)
  type(grid_t),                 intent(in)  :: gr
  type(states_elec_t),          intent(in)  :: st
  type(space_t),                intent(in)  :: space
  type(kpoints_t),              intent(in)  :: kpoints
  real(real64),                 intent(out) :: momentum(:,:,:)

  integer             :: idim, ist, ik, idir
  complex(real64)     :: expect_val_p
  R_TYPE, allocatable :: psi(:, :), grad(:,:,:)
  real(real64)        :: kpoint(space%dim)
#if defined(HAVE_MPI)
  integer             :: tmp
  real(real64), allocatable  :: lmomentum(:), gmomentum(:)
#endif
  real(real64), allocatable  :: lmom(:, :, :)
  integer             :: k_start, k_end, k_n, ndim, nst

  PUSH_SUB(X(elec_momentum_me))

  SAFE_ALLOCATE(psi(1:gr%np_part, 1:st%d%dim))
  SAFE_ALLOCATE(grad(1:gr%np, 1:space%dim, 1:st%d%dim))
  nst = st%nst

  do ik = st%d%kpt%start, st%d%kpt%end
    do ist = st%st_start, st%st_end

      call states_elec_get_state(st, gr, ist, ik, psi)

      do idim = 1, st%d%dim
        call X(derivatives_grad)(gr%der, psi(:, idim), grad(:, 1:space%dim, idim))
      end do

      do idir = 1, space%dim
        ! since the expectation value of the momentum operator is real
        ! for square integrable wfns this integral should be purely imaginary
        ! for complex wfns but real for real wfns (see case distinction below)
        expect_val_p = X(mf_dotp)(gr, st%d%dim, psi, grad(:, idir, :))

        ! In the case of real wavefunctions we do not include the
        ! -i prefactor of p = -i \nabla
        if (states_are_real(st)) then
          momentum(idir, ist, ik) = real(expect_val_p)
        else
          momentum(idir, ist, ik) = real(-M_zI * expect_val_p)
        end if
      end do

      ! have to add the momentum vector in the case of periodic systems,
      ! since psi contains only u_k
      kpoint(:) = kpoints%get_point(st%d%get_kpoint_index(ik))
      do idir = 1, space%periodic_dim
        momentum(idir, ist, ik) = momentum(idir, ist, ik) + kpoint(idir)
      end do

    end do

    ! Exchange momenta in the states parallel case.
#if defined(HAVE_MPI)
    if (st%parallel_in_states) then
      SAFE_ALLOCATE(lmomentum(1:st%lnst))
      SAFE_ALLOCATE(gmomentum(1:nst))

      do idir = 1, st%d%dim
        lmomentum(1:st%lnst) = momentum(idir, st%st_start:st%st_end, ik)
        call lmpi_gen_allgatherv(st%lnst, lmomentum, tmp, gmomentum, st%mpi_grp)
        momentum(idir, 1:nst, ik) = gmomentum(1:nst)
      end do

      SAFE_DEALLOCATE_A(lmomentum)
      SAFE_DEALLOCATE_A(gmomentum)
    end if
#endif
  end do

  ! Handle kpoints parallelization
  if (st%d%kpt%parallel) then
    k_start = st%d%kpt%start
    k_end = st%d%kpt%end
    k_n = st%d%kpt%nlocal
    ndim = ubound(momentum, dim = 1)

    ASSERT(.not. st%parallel_in_states)

    SAFE_ALLOCATE(lmom(1:ndim, 1:nst, 1:k_n))

    lmom(1:ndim, 1:nst, 1:k_n) = momentum(1:ndim, 1:nst, k_start:k_end)

    call st%d%kpt%mpi_grp%allgatherv(lmom, ndim * nst * k_n, MPI_DOUBLE_PRECISION, momentum, &
      st%d%kpt%num(:) * nst * ndim, (st%d%kpt%range(1, :) - 1)*nst*ndim, MPI_DOUBLE_PRECISION)

    SAFE_DEALLOCATE_A(lmom)
  end if

  SAFE_DEALLOCATE_A(psi)
  SAFE_DEALLOCATE_A(grad)

  POP_SUB(X(elec_momentum_me))
end subroutine X(elec_momentum_me)


! ---------------------------------------------------------
!> It calculates the expectation value of the angular
!! momentum of the states. If l2 is passed, it also
!! calculates the expectation value of the square of the
!! angular momentum of the state phi.
! ---------------------------------------------------------
subroutine X(elec_angular_momentum_me)(gr, st, space, ll, l2)
  type(grid_t),                         intent(in)  :: gr
  type(states_elec_t),                  intent(in)  :: st
  type(space_t),                        intent(in)  :: space
  real(real64), contiguous,             intent(out) :: ll(:, :, :) !< (st%nst, st%nik, 1 or 3)
  real(real64), contiguous,   optional, intent(out) :: l2(:, :)    !< (st%nst, st%nik)

  integer :: idim, ist, ik
  R_TYPE, allocatable :: psi(:), lpsi(:, :)

  PUSH_SUB(X(elec_angular_momentum_me))

  ASSERT(space%dim /= 1)

  SAFE_ALLOCATE(psi(1:gr%np_part))

  select case (space%dim)
  case (3)
    SAFE_ALLOCATE(lpsi(1:gr%np_part, 1:3))
  case (2)
    SAFE_ALLOCATE(lpsi(1:gr%np_part, 1))
  end select

  ll = M_ZERO
  if (present(l2)) l2 = M_ZERO

  do ik = st%d%kpt%start, st%d%kpt%end
    do ist = st%st_start, st%st_end
      do idim = 1, st%d%dim
        call states_elec_get_state(st, gr, idim, ist, ik, psi)

#if defined(R_TREAL)
        ll = M_ZERO
#else
        call X(physics_op_L)(gr%der, psi, lpsi)

        ll(ist, ik, 1) = ll(ist, ik, 1) + real(X(mf_dotp)(gr, psi, lpsi(:, 1), reduce = .false.), real64)
        if (space%dim == 3) then
          ll(ist, ik, 2) = ll(ist, ik, 2) + real(X(mf_dotp)(gr, psi, lpsi(:, 2), reduce = .false.), real64)
          ll(ist, ik, 3) = ll(ist, ik, 3) + real(X(mf_dotp)(gr, psi, lpsi(:, 3), reduce = .false.), real64)
        end if
#endif
        if (present(l2)) then
          call X(physics_op_L2)(gr%der, psi(:), lpsi(:, 1))
          l2(ist, ik) = l2(ist, ik) + real(X(mf_dotp)(gr, psi(:), lpsi(:, 1), reduce = .false.), real64)
        end if
      end do
    end do
  end do

  if (gr%parallel_in_domains) then
#if !defined(R_TREAL)
    call gr%allreduce(ll)
#endif
    if (present(l2)) then
      call gr%allreduce(l2)
    end if
  end if

  SAFE_DEALLOCATE_A(psi)
  SAFE_DEALLOCATE_A(lpsi)
  POP_SUB(X(elec_angular_momentum_me))
end subroutine X(elec_angular_momentum_me)


! Now we have three "ks_multipoles" routines, for the 1, 2, and 3D
! cases. Eventually they can probably be merged back into one, once
! we write the 2D case in more general form, using "cylindrical multipoles",
! analogous to the spherical ones.

! ---------------------------------------------------------
!> Prints out the multipole matrix elements between KS states.
!!
!! It prints the states to the file opened in iunit.
!! It prints the (ll,mm) multipole moment, for
!! the Kohn-Sham states in the irreducible subspace ik.
!!
subroutine X(elec_ks_multipoles_3d_me)(gr, st, space, ll, mm, ik, elements)
  type(grid_t),                  intent(in)  :: gr
  type(states_elec_t),           intent(in)  :: st
  type(space_t),                 intent(in)  :: space
  integer,                       intent(in)  :: ll, mm, ik
  R_TYPE,                        intent(out) :: elements(:,:)

  integer :: ist, jst, ip
  real(real64), allocatable :: multipole(:)
  real(real64) :: rr, xx(1:space%dim), ylm
  R_TYPE, allocatable :: psii(:, :), psij(:, :)

  PUSH_SUB(X(elec_ks_multipoles_3d_me))

  SAFE_ALLOCATE(psii(1:gr%np, 1:st%d%dim))
  SAFE_ALLOCATE(psij(1:gr%np, 1:st%d%dim))

  SAFE_ALLOCATE(multipole(1:gr%np))

  do ip = 1, gr%np
    call mesh_r(gr, ip, rr, coords = xx)
    call loct_ylm(1, xx(1), xx(2), xx(3), ll, mm, ylm)
    multipole(ip) = rr**ll * ylm
  end do

  ASSERT(.not. st%parallel_in_states)
  ! how to do this properly? states_elec_matrix
  do ist = 1, st%nst

    call states_elec_get_state(st, gr, ist, ik, psii)

    do jst = 1, st%nst

      call states_elec_get_state(st, gr, jst, ik, psij)

      psij(1:gr%np, 1) = psij(1:gr%np, 1)*multipole(1:gr%np)

      elements(ist, jst) = X(mf_dotp)(gr, st%d%dim, psii, psij)
    end do
  end do

  SAFE_DEALLOCATE_A(psii)
  SAFE_DEALLOCATE_A(psij)
  SAFE_DEALLOCATE_A(multipole)

  POP_SUB(X(elec_ks_multipoles_3d_me))
end subroutine X(elec_ks_multipoles_3d_me)

! ---------------------------------------------------------
!> Prints out the dipole matrix elements (X or Y) between single
!! orbitals, in the 1d case.
!!
!! It prints the states to the file opened in iunit.
!! It prints the moment, for single orbital states
!! irreducible subspace ik. It only prints the first order moments
!! X or Y. Eventually it should print the circular multipoles of
!! arbitrary order, similar to the 3D case.
!!
!! The argument ll should be 1 (X) or 2 (Y).
!!
subroutine X(elec_ks_multipoles_2d_me)(gr, st, ll, ik, elements)
  type(grid_t),                  intent(in)  :: gr
  type(states_elec_t),           intent(in)  :: st
  integer,                       intent(in)  :: ll, ik
  R_TYPE,                        intent(out) :: elements(:,:)

  integer :: ist, jst, ip
  real(real64), allocatable :: dipole(:)
  R_TYPE, allocatable :: psii(:, :), psij(:, :)

  PUSH_SUB(X(elec_ks_multipoles_2d_me))

  SAFE_ALLOCATE(psii(1:gr%np, 1:st%d%dim))
  SAFE_ALLOCATE(psij(1:gr%np, 1:st%d%dim))
  SAFE_ALLOCATE(dipole(1:gr%np))

  do ip = 1, gr%np
    dipole(ip) = gr%x(ip, ll)
  end do

  ASSERT(.not. st%parallel_in_states)
  ! how to do this properly? states_elec_matrix
  do ist = 1, st%nst

    call states_elec_get_state(st, gr, ist, ik, psii)

    do jst = 1, st%nst

      call states_elec_get_state(st, gr, jst, ik, psij)

      psij(1:gr%np, 1) = psij(1:gr%np, 1) * dipole(1:gr%np)

      elements(ist, jst) = X(mf_dotp)(gr, st%d%dim, psii, psij)
    end do
  end do

  SAFE_DEALLOCATE_A(psii)
  SAFE_DEALLOCATE_A(psij)
  SAFE_DEALLOCATE_A(dipole)

  POP_SUB(X(elec_ks_multipoles_2d_me))
end subroutine X(elec_ks_multipoles_2d_me)


! ---------------------------------------------------------
!> Prints out the multipole matrix elements (X**l) between single
!! orbitals, in the 1d case.
!!
!! It prints the states to the file opened in iunit.
!! It prints the moment of ll-th order, for single orbital states
!! irreducible subspace ik.
!!
subroutine X(elec_ks_multipoles_1d_me)(gr, st, ll, ik, elements)
  type(grid_t),                  intent(in)  :: gr
  type(states_elec_t),           intent(in)  :: st
  integer,                       intent(in)  :: ll, ik
  R_TYPE,                        intent(out) :: elements(:,:)

  integer :: ip, ist, jst
  real(real64), allocatable :: dipole(:)
  R_TYPE, allocatable :: psii(:, :), psij(:, :)

  PUSH_SUB(X(elec_ks_multipoles_1d_me))

  SAFE_ALLOCATE(psii(1:gr%np, 1:st%d%dim))
  SAFE_ALLOCATE(psij(1:gr%np, 1:st%d%dim))

  SAFE_ALLOCATE(dipole(1:gr%np))

  do ip = 1, gr%np
    dipole(ip) = gr%x(ip, 1)**ll
  end do

  ASSERT(.not. st%parallel_in_states)
  do ist = 1, st%nst

    call states_elec_get_state(st, gr, ist, ik, psii)

    do jst = 1, st%nst

      call states_elec_get_state(st, gr, jst, ik, psij)

      psij(1:gr%np, 1) = psij(1:gr%np, 1) * dipole (1:gr%np)

      elements(ist, jst) = X(mf_dotp)(gr, st%d%dim, psii, psij)
    end do
  end do

  SAFE_DEALLOCATE_A(psii)
  SAFE_DEALLOCATE_A(psij)
  SAFE_DEALLOCATE_A(dipole)

  POP_SUB(X(elec_ks_multipoles_1d_me))
end subroutine X(elec_ks_multipoles_1d_me)

! ---------------------------------------------------------
!> Computes the dipole matrix elements between KS states.
subroutine X(elec_dipole_me)(gr, st, namespace, hm, ions, ik, st_start, st_end, elements)
  type(grid_t),                  intent(in)  :: gr
  type(states_elec_t),           intent(in)  :: st
  type(namespace_t),             intent(in)  :: namespace
  type(hamiltonian_elec_t),      intent(in)  :: hm
  type(ions_t),                  intent(in)  :: ions
  integer,                       intent(in)  :: ik
  integer,                       intent(in)  :: st_start
  integer,                       intent(in)  :: st_end
  R_TYPE,                        intent(out) :: elements(:,:,:)

  integer :: ist, jst, ip, idir, idim, ispin
  R_TYPE :: dip_element
  R_TYPE, allocatable :: psii(:, :), psij(:, :), gpsii(:,:,:)

  PUSH_SUB(X(elec_dipole_me))

  ASSERT(.not. st%parallel_in_states)

  ispin = st%d%get_spin_index(ik)

  SAFE_ALLOCATE(psii(1:gr%np_part, 1:st%d%dim))
  SAFE_ALLOCATE(psij(1:gr%np, 1:st%d%dim))
  SAFE_ALLOCATE(gpsii(1:gr%np, 1:ions%space%dim, 1:st%d%dim))

  do ist = st_start, st_end
    call states_elec_get_state(st, gr, ist, ik, psii)

    if (.not. ions%space%is_periodic()) then
      do idim = 1, st%d%dim
        do idir = 1, ions%space%dim
          do ip = 1, gr%np
            gpsii(ip, idir, idim) = psii(ip, idim) * gr%x(ip, idir)
          end do
        end do
      end do
    else
      do idim = 1, st%d%dim
        call boundaries_set(gr%der%boundaries, gr, psii(:, idim))
      end do
      !We need the phase here as the routines for the nonlocal contributions assume that the wavefunctions have a phase.
#ifdef R_TCOMPLEX
      if (hm%phase%is_allocated()) then
        call hm%phase%apply_to_single(psii, gr%np_part, st%d%dim, ik, .false.)
      end if
#endif

      do idim = 1, st%d%dim
        call X(derivatives_grad)(gr%der, psii(:, idim), gpsii(:, :, idim), set_bc = .false.)
      end do

      !A nonlocal contribution from the MGGA potential must be included
      !This must be done first, as this is like a position-dependent mass
      call hm%ks_pot%X(current_mass_renormalization)(gpsii, gr%der%dim, st%d%dim, ispin)

      !A nonlocal contribution from the pseudopotential must be included
      call X(projector_commute_r_allatoms_alldir)(hm%ep%proj, ions, gr, st%d%dim, gr%der%boundaries, &
        ik, psii, gpsii)

      !A nonlocal contribution from the scissor must be included
      if (hm%scissor%apply) then
        call scissor_commute_r(hm%scissor, gr, ik, psii, gpsii)
      end if

      call X(lda_u_commute_r_single)(hm%lda_u, gr, ions%space, st%d, namespace, ist, ik, &
        psii, gpsii, hm%phase%is_allocated())

      call X(exchange_operator_commute_r)(hm%exxop, namespace, gr, st%d, ik, psii, gpsii)

#ifdef R_TCOMPLEX
      if (hm%phase%is_allocated()) then
        do idim = 1, st%d%dim
          do idir = 1, ions%space%dim
            call hm%phase%apply_to_single(gpsii(:, idir, idim:idim), gr%np, 1, ik, .true.)
          end do
        end do
      end if
#endif


    end if

    do jst = ist, st_end
      call states_elec_get_state(st, gr, jst, ik, psij)

      do idir = 1, ions%space%dim
        dip_element = M_ZERO
        do idim = 1, st%d%dim
          dip_element = dip_element + X(mf_dotp)(gr, gpsii(:, idir, idim), psij(:, idim))
        end do

        if (ions%space%is_periodic()) then
          if (abs(st%eigenval(ist, ik) - st%eigenval(jst, ik)) > 1e-5_real64) then
            dip_element = -dip_element/((st%eigenval(ist, ik) - st%eigenval(jst, ik)))
          else
            dip_element = R_TOTYPE(M_ZERO)
          end if
        end if

        elements(idir, ist - st_start + 1, jst - st_start + 1) = dip_element
        elements(idir, jst - st_start + 1, ist - st_start + 1) = R_CONJ(dip_element)

      end do
    end do
  end do

  SAFE_DEALLOCATE_A(gpsii)
  SAFE_DEALLOCATE_A(psii)
  SAFE_DEALLOCATE_A(psij)

  POP_SUB(X(elec_dipole_me))
end subroutine X(elec_dipole_me)


subroutine X(calculate_expectation_values_matrix)(namespace, hm, der, st, eigen, terms, diagonal_states)
  type(namespace_t),        intent(in)    :: namespace
  type(hamiltonian_elec_t), intent(in)    :: hm
  type(derivatives_t),      intent(in)    :: der
  type(states_elec_t),      intent(inout) :: st
  R_TYPE,                   intent(out)   :: eigen(st%st_start:,st%st_start:,st%d%kpt%start:) !:st%st_end,:st%st_end,:st%d%kpt%end
  integer, optional,        intent(in)    :: terms
  logical, optional,        intent(in)    :: diagonal_states ! if true (default), computes only <i|H|i>, otherwise <i|H|j>

  integer :: ik, ib, jb
  type(wfs_elec_t) :: hpsib

  PUSH_SUB(X(calculate_expectation_values_matrix))

  call profiling_in(TOSTRING(X(calc_exp_values_matrix)))

  eigen = M_ZERO


  do ik = st%d%kpt%start, st%d%kpt%end
    ! States are groupped in blocks. The dimension of each block depends on the hardware.
    ! The following do loop goes through all the groups of states (the parallelization here is DOMAINS or KPT)
    do ib = st%group%block_start, st%group%block_end

      if (hm%apply_packed()) call st%group%psib(ib, ik)%do_pack()

      call st%group%psib(ib, ik)%copy_to(hpsib)

      ! Perform H|psi_i> of the group of states
      call X(hamiltonian_elec_apply_batch)(hm, namespace, der%mesh, st%group%psib(ib, ik), hpsib, terms = terms)

      ! Now loop again over the groups of states, paying attention to avoid the combinations <i|H|j> == conj(<j|H|i>)
      do jb = ib, st%group%block_end
        if (optional_default(diagonal_states, .true.) .and. jb > ib) exit

        ! Perform the operation <psi_j|hpsi_i>, where hpsi_i = H|psi_i>. Normally it would be a vector multiplication,
        ! but since we are considering a block of states it is actually a matrix multiplication
        call X(mesh_batch_dotp_matrix)(der%mesh, st%group%psib(jb, ik), hpsib, eigen(:, :, ik), reduce = .false.)
        ! Remark: despite it seems that we are filling the upper part of the matrix,
        ! mesh_batch_dotp_matrix is actually computing only the lower triangular part
      end do

      if (hm%apply_packed()) call st%group%psib(ib, ik)%do_unpack(copy = .false.)
      call hpsib%end()
    end do
  end do

  if (der%mesh%parallel_in_domains) then
    call der%mesh%allreduce(eigen(st%st_start:st%st_end, st%st_start:st%st_end, st%d%kpt%start:st%d%kpt%end))
  end if

  call profiling_out(TOSTRING(X(calc_exp_values_matrix)))
  POP_SUB(X(calculate_expectation_values_matrix))
end subroutine X(calculate_expectation_values_matrix)

! ---------------------------------------------------------
subroutine X(elec_one_body_me)(gr, st, namespace, hm, iindex, jindex, oneint)
  type(grid_t),                  intent(in)    :: gr
  type(states_elec_t),           intent(inout) :: st
  type(namespace_t),             intent(in)    :: namespace
  type(hamiltonian_elec_t),      intent(in)    :: hm
  integer,                       intent(out)   :: iindex(:)
  integer,                       intent(out)   :: jindex(:)
  R_TYPE,                        intent(out)   :: oneint(:)

  integer :: ist, jst, np, iint, st_start, st_end
  R_TYPE  :: me
  R_TYPE, allocatable :: psii(:, :), psij(:, :)
  R_TYPE, allocatable :: one_bodies(:, :, :)

  PUSH_SUB(X(elec_one_body_me))

  SAFE_ALLOCATE(psii(1:gr%np, 1:st%d%dim))
  SAFE_ALLOCATE(psij(1:gr%np_part, 1:st%d%dim))

  if (st%d%ispin == SPINORS) then
    call messages_not_implemented("One-body integrals with spinors", namespace=namespace)
  end if
  ASSERT(.not. st%parallel_in_states) ! TODO: Add states parallelization support

  iint = 1
  ! Select case on theory level
  select case(hm%theory_level)
  case (HARTREE_FOCK)
    st_start = st%st_start
    st_end = st%st_end
    ! For Hartree Fock, the one body me should include only the kinetic energy and the external potential
    SAFE_ALLOCATE(one_bodies(st_start:st_end, st_start:st_end, st%d%kpt%start:st%d%kpt%end))

    call X(calculate_expectation_values_matrix)(namespace, hm, gr%der, st, one_bodies, &
      terms = TERM_KINETIC + TERM_LOCAL_EXTERNAL + TERM_NON_LOCAL_POTENTIAL, diagonal_states = .false.)

    ! Now we have to fill the arrays iindex/jindex/oneint
    do ist = st_start, st_end
      do jst = st_start, ist
        iindex(iint) = ist
        jindex(iint) = jst
        oneint(iint) = one_bodies(ist, jst, 1)
        iint = iint + 1
      end do
    end do
    SAFE_DEALLOCATE_A(one_bodies)

  case (KOHN_SHAM_DFT)
    np = gr%np

    do ist = 1, st%nst

      call states_elec_get_state(st, gr, ist, 1, psii)

      do jst = 1, st%nst
        if (jst > ist) cycle

        call states_elec_get_state(st, gr, jst, 1, psij)

        psij(1:np, 1) = R_CONJ(psii(1:np, 1)) * hm%ks_pot%vhxc(1:np, 1) * psij(1:np, 1)

        me = - X(mf_integrate)(gr, psij(:, 1))

        if (ist == jst) me = me + st%eigenval(ist,1)

        iindex(iint) = ist
        jindex(iint) = jst
        oneint(iint) = me
        iint = iint + 1
      end do
    end do
  case default
    call messages_not_implemented("One-body integrals with TheoryLevel not DFT or Hartree-Fock", namespace=namespace)
  end select

  SAFE_DEALLOCATE_A(psii)
  SAFE_DEALLOCATE_A(psij)

  POP_SUB(X(elec_one_body_me))
end subroutine X(elec_one_body_me)

! ---------------------------------------------------------
subroutine X(elec_two_body_me)(gr, st, space, namespace, kpoints, psolver, st_min, st_max, iindex, jindex, kindex, lindex, &
  twoint, phase, singularity, exc_k)
  type(grid_t),                   intent(in)    :: gr
  type(states_elec_t), target,    intent(inout) :: st
  class(space_t),                 intent(in)    :: space
  type(namespace_t),              intent(in)    :: namespace
  type(kpoints_t),                intent(in)    :: kpoints
  type(poisson_t),                intent(inout) :: psolver
  integer,                        intent(in)    :: st_min, st_max
  integer,                        intent(out)   :: iindex(:,:)
  integer,                        intent(out)   :: jindex(:,:)
  integer,                        intent(out)   :: kindex(:,:)
  integer,                        intent(out)   :: lindex(:,:)
  R_TYPE, contiguous,             intent(out)   :: twoint(:)
  type(phase_t),       optional,  intent(in)    :: phase
  type(singularity_t), optional,  intent(in)    :: singularity
  logical,             optional,  intent(in)    :: exc_k

  integer :: ist, jst, kst, lst, ijst, klst, ikpt, jkpt, kkpt, lkpt
  integer :: ist_global, jst_global, kst_global, lst_global, nst, nst_tot
  integer :: iint, ikpoint, jkpoint, ip, ibind, npath
  R_TYPE  :: me
  R_TYPE, allocatable :: nn(:), vv(:), two_body_int(:,:), tmp(:,:)
  R_TYPE, pointer :: psii(:), psij(:), psik(:), psil(:)
  real(real64) :: qq(space%dim)
  logical :: exc_k_
  class(wfs_elec_t), pointer :: wfs
  type(fourier_space_op_t) :: coulb

  PUSH_SUB(X(elec_two_body_me))

  SAFE_ALLOCATE(nn(1:gr%np))
  SAFE_ALLOCATE(vv(1:gr%np))
  SAFE_ALLOCATE(tmp(1:gr%np, 1:1))
  SAFE_ALLOCATE(two_body_int(1:gr%np, 1:1))

  if (st%d%ispin == SPINORS) then
    call messages_not_implemented("Two-body integrals with spinors", namespace=namespace)
  end if

  ASSERT(present(phase) .eqv. present(singularity))
#ifdef R_TCOMPLEX
  ASSERT(present(phase))
#endif

  npath = kpoints%nkpt_in_path()

  if (st%are_packed()) call st%unpack()

  ijst = 0
  iint = 1

  nst_tot = (st_max-st_min+1)*st%nik
  nst = (st_max-st_min+1)

  exc_k_ = .false.
  if (present(exc_k)) exc_k_ = exc_k

  if (present(singularity)) then
    qq = M_ZERO
    call poisson_build_kernel(psolver, namespace, space, coulb, qq, M_ZERO, M_ONE, M_ZERO)
  end if

  do ist_global = 1, nst_tot
    ist = mod(ist_global - 1, nst) + 1
    ikpt = (ist_global - ist) / nst + 1
    ikpoint = st%d%get_kpoint_index(ikpt)

    wfs => st%group%psib(st%group%iblock(ist+st_min-1), ikpt)
    ASSERT(wfs%status() /= BATCH_DEVICE_PACKED)
    ibind = wfs%inv_index((/ist+st_min-1, 1/))
    if (wfs%status() == BATCH_NOT_PACKED) then
      psii => wfs%X(ff_linear)(:, ibind)
    else if (wfs%status() == BATCH_PACKED) then
      psii => wfs%X(ff_pack)(ibind, :)
    else
      ASSERT(.false.) ! TODO: Add GPU support
    end if

    do jst_global = 1, nst_tot
      jst = mod(jst_global - 1, nst) + 1
      jkpt = (jst_global - jst) / nst + 1
      jkpoint = st%d%get_kpoint_index(jkpt)

      if (exc_k_ .and. ist /= jst) cycle

      if (present(singularity)) then
        qq(:) = kpoints%get_point(ikpoint, absolute_coordinates=.false.) &
          - kpoints%get_point(jkpoint, absolute_coordinates=.false.)
        ! In case of k-points, the poisson solver must contains k-q
        ! in the Coulomb potential, and must be changed for each q point
        call poisson_build_kernel(psolver, namespace, space, coulb, qq, &
          M_ZERO, M_ONE, M_ZERO, &
          -(kpoints%full%npoints-npath)*kpoints%latt%rcell_volume*(singularity%Fk(jkpoint)-singularity%FF))
      end if

#ifndef R_TCOMPLEX
      if (jst_global > ist_global) cycle
#endif
      ijst=ijst+1

      wfs => st%group%psib(st%group%iblock(jst+st_min-1), jkpt)
      ibind = wfs%inv_index((/jst+st_min-1, 1/))
      if (wfs%status() == BATCH_NOT_PACKED) then
        psij => wfs%X(ff_linear)(:, ibind)
      else if (wfs%status() == BATCH_PACKED) then
        psij => wfs%X(ff_pack)(ibind, :)
      else
        ASSERT(.false.) ! TODO: Add GPU support
      end if

      nn(1:gr%np) = R_CONJ(psii(1:gr%np)) * psij(1:gr%np)
      if (present(singularity)) then
        call X(poisson_solve)(psolver, namespace, vv, nn, all_nodes=.false., kernel=coulb)
      else
        call X(poisson_solve)(psolver, namespace, vv, nn, all_nodes=.false.)
      end if

      !We now put back the phase that we treated analytically using the Poisson solver
#ifdef R_TCOMPLEX
      do ip = 1, gr%np
        vv(ip) = vv(ip) * exp(M_zI*sum(qq(:) * gr%x(ip, :)))
      end do
#endif

      klst=0
      do kst_global = 1, nst_tot
        kst = mod(kst_global - 1, nst) + 1
        kkpt = (kst_global - kst) / nst + 1

        if (exc_k_ .and. kkpt /= jkpt) cycle

        wfs => st%group%psib(st%group%iblock(kst + st_min - 1), kkpt)
        ibind = wfs%inv_index((/kst + st_min - 1, 1/))
        if (wfs%status() == BATCH_NOT_PACKED) then
          psik => wfs%X(ff_linear)(:, ibind)
        else if (wfs%status() == BATCH_PACKED) then
          psik => wfs%X(ff_pack)(ibind, :)
        else
          ASSERT(.false.) ! TODO: Add GPU support
        end if

        !$omp parallel do
        do ip = 1, gr%np
          tmp(ip, 1) = vv(ip)*R_CONJ(psik(ip))
        end do
        !$omp end parallel do
        if (present(phase)) then
#ifdef R_TCOMPLEX
          call phase%apply_to_single(tmp, gr%np, 1, kkpt, .true.)
#endif
        end if

        do lst_global = 1, nst_tot
          lst = mod(lst_global - 1, nst) + 1
          lkpt = (lst_global - lst)/nst + 1

#ifndef R_TCOMPLEX
          if (lst_global > kst_global) cycle
          klst=klst+1
          if (klst > ijst) cycle
#endif

          if (exc_k_ .and. kst /= lst) cycle
          if (exc_k_ .and. lkpt /= ikpt) cycle
          wfs => st%group%psib(st%group%iblock(lst+st_min-1), lkpt)
          ibind = wfs%inv_index((/lst+st_min-1, 1/))
          if (wfs%status() == BATCH_NOT_PACKED) then
            psil => wfs%X(ff_linear)(:, ibind)
          else if (wfs%status() == BATCH_PACKED) then
            psil => wfs%X(ff_pack)(ibind, :)
          else
            ASSERT(.false.) ! TODO: Add GPU support
          end if

          !$omp parallel do
          do ip = 1, gr%np
            two_body_int(ip, 1) = tmp(ip, 1)*psil(ip)
          end do
          !$omp end parallel do
          if (present(phase)) then
#ifdef R_TCOMPLEX
            call phase%apply_to_single(two_body_int, gr%np, 1, lkpt, .false.)
#endif
          end if

          me = X(mf_integrate)(gr, two_body_int(:, 1), reduce = .false.)

          iindex(1,iint) = ist + st_min - 1
          iindex(2,iint) = ikpt
          jindex(1,iint) = jst + st_min - 1
          jindex(2,iint) = jkpt
          kindex(1,iint) = kst + st_min - 1
          kindex(2,iint) = kkpt
          lindex(1,iint) = lst + st_min - 1
          lindex(2,iint) = lkpt
          twoint(iint) = me
          iint = iint + 1

        end do
      end do
    end do
  end do

  call gr%allreduce(twoint)

  if (present(singularity)) then
    call fourier_space_op_end(coulb)
  end if

  SAFE_DEALLOCATE_A(nn)
  SAFE_DEALLOCATE_A(vv)
  SAFE_DEALLOCATE_A(tmp)
  SAFE_DEALLOCATE_A(two_body_int)

  POP_SUB(X(elec_two_body_me))
end subroutine X(elec_two_body_me)

!! Local Variables:
!! mode: f90
!! coding: utf-8
!! End:
