!! Copyright (C) 2002-2014 M. Marques, A. Castro, A. Rubio,
!! G. Bertsch, J. Alberdi-Rodriguez, M. Oliveira
!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!


! ---------------------------------------------------------
!> This routine fills state psi with an atomic orbital -- provided
!! by the pseudopotential structure in geo.
! ---------------------------------------------------------
subroutine X(lcao_atomic_orbital) (this, iorb, mesh, st, ions, psi, spin_channel)
  type(lcao_t),             intent(inout) :: this
  integer,                  intent(in)    :: iorb
  class(mesh_t),            intent(in)    :: mesh
  type(states_elec_t),      intent(in)    :: st
  type(ions_t),     target, intent(in)    :: ions
  R_TYPE,                   intent(inout) :: psi(:, :)
  integer,                  intent(in)    :: spin_channel

  class(species_t), pointer :: species
  integer :: idim, iatom, jj, ispin, ii, ll, mm
  R_TYPE, allocatable :: orbital(:)
  real(real64) :: radius
  type(submesh_t) :: sphere
#ifdef R_TCOMPLEX
  real(real64), allocatable :: dorbital(:)
#endif

  call profiling_in(TOSTRING(X(ATOMIC_ORBITAL)))
  PUSH_SUB(X(lcao_atomic_orbital))

  ASSERT(iorb >= 1)
  ASSERT(iorb <= this%maxorbs)

  psi(1:mesh%np, 1:st%d%dim) = R_TOTYPE(M_ZERO)

  iatom = this%atom(iorb)
  jj = this%level(iorb)
  idim = this%ddim(iorb)
  species => ions%atom(iatom)%species
  ASSERT(jj <= species%get_niwfs())
  ispin = max(spin_channel, idim)

  call species%get_iwf_ilm(jj, ispin, ii, ll, mm)
  ! For all-electron species, we want to use the principal quantum number
  if (species%is_full()) call species%get_iwf_n(jj, ispin, ii)

  radius = this%orbital_scale_factor*species%get_iwf_radius(ii, ispin)
  ! make sure that if the spacing is too large, the orbitals fit in a few points at least
  radius = max(radius, M_TWO*maxval(mesh%spacing))

  call submesh_init(sphere, ions%space, mesh, ions%latt, ions%pos(:, iatom), radius)

#ifdef R_TCOMPLEX
  if (.not. this%complex_ylms) then
    SAFE_ALLOCATE(dorbital(1:sphere%np))
    call datomic_orbital_get_submesh(species, sphere, ii, ll, mm, ispin, dorbital)
    call submesh_add_to_mesh(sphere, dorbital, psi(:, idim))
    SAFE_DEALLOCATE_A(dorbital)
  else
#endif

    SAFE_ALLOCATE(orbital(1:sphere%np))

    call X(atomic_orbital_get_submesh)(species, sphere, ii, ll, mm, ispin, orbital)
    call submesh_add_to_mesh(sphere, orbital, psi(:, idim))

    SAFE_DEALLOCATE_A(orbital)

#ifdef R_TCOMPLEX
  end if
#endif

  if (sphere%np == 0) this%is_empty(iorb) = .true.

  call submesh_end(sphere)



  POP_SUB(X(lcao_atomic_orbital))
  call profiling_out(TOSTRING(X(ATOMIC_ORBITAL)))

end subroutine X(lcao_atomic_orbital)

! ---------------------------------------------------------

subroutine X(lcao_wf)(this, st, gr, ions, hm, namespace, start)
  type(lcao_t),             intent(inout) :: this
  type(states_elec_t),      intent(inout) :: st
  type(grid_t),             intent(in)    :: gr
  type(ions_t),             intent(in)    :: ions
  type(hamiltonian_elec_t), intent(in)    :: hm
  type(namespace_t),        intent(in)    :: namespace
  integer, optional,        intent(in)    :: start

  integer :: nst, ik, n1, n2, idim, lcao_start, ie, maxmtxel
  R_TYPE, allocatable :: hpsi(:, :, :), overlap(:, :, :)
  real(real64), allocatable :: ev(:)
  R_TYPE, allocatable :: hamilt(:, :, :), lcaopsi(:, :, :), lcaopsi2(:, :), zeropsi(:)
  integer :: kstart, kend, ispin
  integer :: spin_channels
  integer :: iunit_h, iunit_s, iunit_e
  real(real64), allocatable :: tmp(:, :)
  integer :: iatom, jatom
  real(real64) :: dist2
  R_TYPE :: phase
  ! Variables used for some commented debug statements
  !character(len=256) :: filename
  !integer :: ierr
  real(real64), parameter :: tol_small_elem = 1.0e-14_real64

  PUSH_SUB(X(lcao_wf))

  write(message(1),'(a,i6,a)') 'Info: Performing initial LCAO calculation with ', &
    this%norbs,' orbitals.'
  call messages_info(1, namespace=namespace)


  nst = min(st%nst, this%norbs)
  kstart = st%d%kpt%start
  kend = st%d%kpt%end

  lcao_start = optional_default(start, 1)

  !In case of spinors, everything is taken care of by st%d%dim
  spin_channels = st%d%spin_channels
  if (st%d%ispin == SPINORS) spin_channels = 1

  ! Allocation of variables

  SAFE_ALLOCATE(lcaopsi(1:gr%np_part, 1:st%d%dim, 1:spin_channels))
  SAFE_ALLOCATE(lcaopsi2(1:gr%np, 1:st%d%dim))
  SAFE_ALLOCATE(hpsi(1:gr%np, 1:st%d%dim, kstart:kend))
  SAFE_ALLOCATE(hamilt(1:this%norbs, 1:this%norbs, kstart:kend))
  hamilt = M_ZERO
  SAFE_ALLOCATE(overlap(1:this%norbs, 1:this%norbs, 1:spin_channels))
  overlap = M_ZERO


  ie = 0
  maxmtxel = this%norbs * (this%norbs + 1)/2

  message(1) = "Info: Getting Hamiltonian matrix elements."
  call messages_info(1, namespace=namespace)

  if (mpi_grp_is_root(mpi_world)) call loct_progress_bar(-1, maxmtxel)

  do n1 = 1, this%norbs
    iatom = this%atom(n1)

    do ispin = 1, spin_channels
      call X(get_ao)(this, st, gr, ions, n1, ispin, lcaopsi(:, :, ispin), use_psi = .true.)
      do idim = 1, st%d%dim
        call boundaries_set(gr%der%boundaries, gr, lcaopsi(:, idim, ispin))
      end do

      ! Uncomment to output all the atomic orbitals used in the LCAO calculation
      !if (debug%info) then
      !  write(filename, '(a,i4.4,a,i1)') 'lcao-orb', n1, '-sp', ispin
      !  call X(io_function_output)(OPTION__OUTPUTFORMAT__XCRYSDEN, "debug/lcao", filename, namespace, &
      !   ions%space, gr, lcaopsi(:, 1, ispin),  sqrt(units_out%length**(-gr%box%dim)), &
      !    ierr, pos = ions%pos, atoms=ions%atom, grp = st%dom_st_kpt_mpi_grp)
      !end if
    end do

    do ik = kstart, kend
      ispin = st%d%get_spin_index(ik)
      call X(hamiltonian_elec_apply_single)(hm, namespace, gr, lcaopsi(:, :, ispin), &
        hpsi(:, :, ik), n1, ik, set_bc = .false.)
    end do

    do n2 = n1, this%norbs
      jatom = this%atom(n2)
      dist2 = sum((ions%pos(:, iatom) - ions%pos(:, jatom))**2)

      ! Note that here we are making an approximation for the Hamiltonian matrix,
      ! as the nonlocal part of the pseudopotential, or any nonlocal operator
      ! might still couple the two wavefunctions (from atom i and atom j).
      if (.not. ions%space%is_periodic() .and. (dist2 > (this%radius(iatom) + this%radius(jatom) + this%lapdist)**2)) cycle
      if (this%is_empty(n2) .and. .not. ions%atom(this%atom(n2))%species%is_full()) cycle !No contribution on this domain

      do ispin = 1, spin_channels

        call X(get_ao)(this, st, gr, ions, n2, ispin, lcaopsi2, use_psi = .true.)

        overlap(n1, n2, ispin) = X(mf_dotp)(gr, st%d%dim, lcaopsi(:, :, ispin), lcaopsi2, reduce=.false.)
        do ik = kstart, kend
          if (ispin /= st%d%get_spin_index(ik)) cycle
          hamilt(n1, n2, ik) = X(mf_dotp)(gr, st%d%dim, hpsi(:, :, ik), lcaopsi2, reduce=.false.)
        end do
      end do

      ie = ie + 1
    end do


    if (mpi_grp_is_root(mpi_world)) call loct_progress_bar(ie, maxmtxel)
  end do

  if (mpi_grp_is_root(mpi_world)) write(stdout, '(1x)')

  SAFE_DEALLOCATE_A(hpsi)

  if (gr%parallel_in_domains) then
    call gr%allreduce(hamilt)
    call gr%allreduce(overlap)
  end if

  if (debug%info .and. mpi_grp_is_root(mpi_world)) then
    iunit_h = io_open('debug/lcao/hamiltonian', namespace, action='write')
    iunit_s = io_open('debug/lcao/overlap', namespace, action='write')
    iunit_e = io_open('debug/lcao/eigenvectors', namespace, action='write')
    write(iunit_h,'(4a6,a15)') 'iorb', 'jorb', 'ik', 'spin', 'hamiltonian'
    write(iunit_s,'(3a6,a15)') 'iorb', 'jorb', 'spin', 'overlap'
    write(iunit_e,'(4a6,a15)') 'ieig', 'jorb', 'ik', 'spin', 'coefficient'
  end if

  !Symmetrization and debug
  do n1 = 1, this%norbs
    do n2 = n1, this%norbs
      do ispin = 1, spin_channels

        if(abs(overlap(n1, n2, ispin)) < tol_small_elem)  overlap(n1, n2, ispin) = M_ZERO
        overlap(n2, n1, ispin) = R_CONJ(overlap(n1, n2, ispin))
        if(n1 == n2) overlap(n2, n1, ispin) = real(overlap(n2, n1, ispin), real64)

        if (debug%info .and. mpi_grp_is_root(mpi_world)) then
          write(iunit_s,'(3i6,2es15.6)') n1, n2, ispin, overlap(n1, n2, ispin)
        end if

        do ik = kstart, kend
          if (ispin /= st%d%get_spin_index(ik)) cycle

          if(abs(hamilt(n1, n2, ik)) < tol_small_elem)  hamilt(n1, n2, ik) = M_ZERO
#ifdef R_TCOMPLEX
          ! We also treat the purely imaginary and purely real cases. This is interested for isolated systems
          if(abs(real(hamilt(n1, n2, ik), real64)) < tol_small_elem) then
            hamilt(n1, n2, ik) = cmplx(M_ZERO, aimag( hamilt(n1, n2, ik)), real64)
          end if
          if(abs(aimag(hamilt(n1, n2, ik))) < tol_small_elem) then
            hamilt(n1, n2, ik) = cmplx(real(hamilt(n1, n2, ik), real64), M_ZERO, real64)
          end if
#endif
          hamilt(n2, n1, ik) = R_CONJ(hamilt(n1, n2, ik))
          if(n1 == n2) hamilt(n2, n1, ik) = real(hamilt(n2, n1, ik), real64)

          if (debug%info .and. mpi_grp_is_root(mpi_world)) then
            write(iunit_h,'(4i6,2es15.6)') n1, n2, ik, ispin, units_from_atomic(units_out%energy, hamilt(n1, n2, ik))
          end if
        end do
      end do
    end do
  end do

  if (debug%info .and. mpi_grp_is_root(mpi_world)) then
    call io_close(iunit_h)
    call io_close(iunit_s)
  end if

  SAFE_ALLOCATE(ev(1:this%norbs))
  SAFE_ALLOCATE(zeropsi(1:gr%np))
  zeropsi = R_TOTYPE(M_ZERO)

  do ik = kstart, kend
    ispin = st%d%get_spin_index(ik)

    call lalg_geneigensolve(this%norbs, hamilt(:, :, ik), overlap(:, :, ispin), ev, preserve_mat=.true.)

    ! the eigenvectors are not unique due to phases and degenerate subspaces, but
    ! they must be consistent among processors
    do n1 = 1, this%norbs
      phase = hamilt(n1, n1, ik)
      if (abs(phase) < 1.0e-6_real64) then
        n2 = maxloc(abs(hamilt(:, n1, ik)), dim=1)
        phase = hamilt(n2, n1, ik)
      end if
      phase = phase / abs(phase)
      call lalg_scal(this%norbs, M_ONE/phase, hamilt(:, n1, ik))
    end do

    ! each node should receive all the eigenvalues
    st%eigenval(lcao_start:nst, ik) = ev(lcao_start:nst)

    do n1 = max(lcao_start, st%st_start), st%st_end
      do idim = 1, st%d%dim
        call states_elec_set_state(st, gr, idim, n1, ik, zeropsi)
      end do
    end do
  end do

  SAFE_DEALLOCATE_A(zeropsi)

  if (debug%info .and. mpi_grp_is_root(mpi_world)) then
    do ik =  kstart, kend
      ispin = st%d%get_spin_index(ik)
      do n2 = 1, this%norbs
        do n1 = 1, this%norbs
          write(iunit_e,'(4i6,2es15.6)') n2, n1, ik, ispin, hamilt(n1, n2, ik)
        end do
      end do
    end do
    call io_close(iunit_e)
  end if

  if (st%d%kpt%parallel) then
    ASSERT(.not. st%parallel_in_states)
    SAFE_ALLOCATE(tmp(1:st%nst, kstart:kend))
    tmp(1:nst, kstart:kend) = st%eigenval(1:nst, kstart:kend)
    ! This needs to be consistent with the initialization to not break serial/parallel compatibility
    if (nst<st%nst) tmp(nst+1:st%nst, kstart:kend) = 1e10_real64
    call st%d%kpt%mpi_grp%allgatherv(tmp(:, kstart:), st%nst * (kend - kstart + 1), MPI_DOUBLE_PRECISION, &
      st%eigenval, st%d%kpt%num(:) * st%nst, (st%d%kpt%range(1, :)-1) * st%nst, MPI_DOUBLE_PRECISION)
    SAFE_DEALLOCATE_A(tmp)
  end if

  ! Change of basis
  do n2 = 1, this%norbs

    if (this%is_empty(n2) .and. .not. ions%atom(this%atom(n2))%species%is_full()) cycle !No contribution on this domain

    !n2 fixes the spinor dimension, as we have two orbitals per spinor dimensions.
    !Otherwise we use hamilt(n2,n1,ik) twice, which is not what we want to do
    idim = this%ddim(n2)
    do ispin = 1, spin_channels
      call X(get_ao)(this, st, gr, ions, n2, ispin, lcaopsi2, use_psi = .false.)

      do ik = kstart, kend
        if (ispin /= st%d%get_spin_index(ik)) cycle
        do n1 = max(lcao_start, st%st_start), min(this%norbs, st%st_end)
          call states_elec_get_state(st, gr, idim, n1, ik, lcaopsi(:, 1, 1))
          call lalg_axpy(gr%np, hamilt(n2, n1, ik), lcaopsi2(:, idim), lcaopsi(:, 1, 1))
          call states_elec_set_state(st, gr, idim, n1, ik, lcaopsi(:, 1, 1))
        end do
      end do
    end do
  end do

  SAFE_DEALLOCATE_A(ev)
  SAFE_DEALLOCATE_A(hamilt)
  SAFE_DEALLOCATE_A(overlap)

  SAFE_DEALLOCATE_A(lcaopsi)
  SAFE_DEALLOCATE_A(lcaopsi2)

  SAFE_DEALLOCATE_A(this%X(buff))
  SAFE_DEALLOCATE_A(this%X(buff_single))

  POP_SUB(X(lcao_wf))
end subroutine X(lcao_wf)

! ---------------------------------------------------------
subroutine X(init_orbitals)(this, namespace, st, gr, ions, start)
  type(lcao_t),        intent(inout) :: this
  type(namespace_t),   intent(in)    :: namespace
  type(states_elec_t), intent(inout) :: st
  type(grid_t),        intent(in)    :: gr
  type(ions_t),        intent(in)    :: ions
  integer, optional,   intent(in)    :: start

  integer :: iorb, ispin, ist, ik, size, spin_channels
  integer :: nst, kstart, kend, lcao_start
  R_TYPE, allocatable :: ao(:, :)

  PUSH_SUB(X(init_orbitals))

  nst = st%nst
  kstart = st%d%kpt%start
  kend = st%d%kpt%end

  lcao_start = optional_default(start, 1)
  if (st%parallel_in_states .and. st%st_start > lcao_start) lcao_start = st%st_start

  ! We calculate the atomic orbitals first. To save memory we put
  ! all the orbitals we can in the part of st%Xpsi that we are going
  ! to overwrite and then the rest is stored in a single-precision
  ! buffer.

  SAFE_ALLOCATE(ao(1:gr%np, 1:st%d%dim))

  this%ck = 0

  iorb = 1
  ispin = 1

  ! first store in st%Xpsi
  ist_loop: do ist = lcao_start, st%st_end
    do ik = kstart, kend

      this%cst(iorb, ispin) = ist
      this%ck(iorb, ispin) = ik

      call X(lcao_atomic_orbital)(this, iorb, gr, st, ions, ao, ispin)
      call states_elec_set_state(st, gr, ist, ik, ao)

      if (ispin < st%d%spin_channels .and. st%d%ispin /= SPINORS) then
        ispin = ispin + 1
      else
        iorb = iorb + 1
        if (iorb > this%norbs) exit ist_loop
        ispin = 1
      end if

    end do
  end do ist_loop

  if (ispin < st%d%spin_channels .and. st%d%ispin /= SPINORS) then
    iorb = iorb - 1 ! we have not completed all the spin channels
    this%ck(iorb, 1:st%d%spin_channels) = 0
  end if

  ! if there are any orbitals left, allocate extra space for them

  if (iorb <= this%norbs) then

    size = (this%norbs - iorb + 1) * st%d%spin_channels
    if(this%save_memory) then
      write(message(1), '(a, i5, a)') "Info: Single-precision storage for ", size, " extra orbitals will be allocated."
      call messages_info(1, namespace=namespace)
    else
      write(message(1), '(a, i5, a)') "Info: Double-precision storage for ", size, " extra orbitals will be allocated."
      call messages_info(1, namespace=namespace)
    end if

    spin_channels = 1
    if (st%d%ispin == SPIN_POLARIZED) spin_channels = 2

    if(this%save_memory) then
      SAFE_ALLOCATE(this%X(buff_single)(1:gr%np, 1:st%d%dim, iorb:this%norbs, 1:spin_channels))
    else
      SAFE_ALLOCATE(this%X(buff)(1:gr%np, 1:st%d%dim, iorb:this%norbs, 1:spin_channels))
    end if

    do iorb = iorb, this%norbs
      do ispin = 1, spin_channels
        call X(lcao_atomic_orbital)(this, iorb, gr, st, ions, ao, ispin)
        ! Atomic orbitals used for the LCAO are in single-precision, so we need to convert them
        if(this%save_memory) then
#ifdef R_TCOMPLEX
          this%X(buff_single)(1:gr%np, 1:st%d%dim, iorb, ispin) = cmplx(ao(1:gr%np, 1:st%d%dim))!, kind=4)
#else
          this%X(buff_single)(1:gr%np, 1:st%d%dim, iorb, ispin) = real(ao(1:gr%np, 1:st%d%dim))!, 4)
#endif
        else
          this%X(buff)(1:gr%np, 1:st%d%dim, iorb, ispin) = ao(1:gr%np, 1:st%d%dim)
        end if
      end do
    end do

  end if

  this%initialized_orbitals = .true.

  SAFE_DEALLOCATE_A(ao)

  POP_SUB(X(init_orbitals))

end subroutine X(init_orbitals)


! ---------------------------------------------------------
subroutine X(get_ao)(this, st, mesh, ions, iorb, ispin, ao, use_psi)
  type(lcao_t),        intent(inout) :: this
  type(states_elec_t), intent(in)    :: st
  class(mesh_t),       intent(in)    :: mesh
  type(ions_t),        intent(in)    :: ions
  integer,             intent(in)    :: iorb
  integer,             intent(in)    :: ispin
  R_TYPE, contiguous,  intent(out)   :: ao(:, :)
  logical,             intent(in)    :: use_psi

  PUSH_SUB(X(get_ao))

  if (this%ck(iorb, ispin) == 0 .and. this%initialized_orbitals) then
    if(this%save_memory) then
      ao(1:mesh%np, 1:st%d%dim) = this%X(buff_single)(1:mesh%np, 1:st%d%dim, iorb, ispin)
    else
      ao(1:mesh%np, 1:st%d%dim) = this%X(buff)(1:mesh%np, 1:st%d%dim, iorb, ispin)
    end if
  else
    if (use_psi .and. this%initialized_orbitals) then
      call states_elec_get_state(st, mesh, this%cst(iorb, ispin), this%ck(iorb, ispin), ao)
    else
      call X(lcao_atomic_orbital)(this, iorb, mesh, st, ions, ao, ispin)
    end if
  end if

  POP_SUB(X(get_ao))

end subroutine X(get_ao)

! ---------------------------------------------------------

subroutine X(lcao_alt_init_orbitals)(this, namespace, st, gr, ions, start)
  type(lcao_t),        intent(inout) :: this
  type(namespace_t),   intent(in)    :: namespace
  type(states_elec_t), intent(inout) :: st
  type(grid_t),        intent(in)    :: gr
  type(ions_t),        intent(in)    :: ions
  integer, optional,   intent(in)    :: start

  integer :: iatom, norbs, dof

  PUSH_SUB(X(lcao_alt_init_orbitals))

  if (present(start)) then
    ASSERT(start == 1)
  end if

  write(message(1), '(a,i6,a)') 'Info: Performing LCAO calculation with ', this%norbs, ' orbitals.'
  write(message(2), '(a)') ' '
  call messages_info(2, namespace=namespace)

  if (this%norbs < st%nst) then
    write(message(1), '(a)') 'Not enough atomic orbitals to initialize all states,'
    write(message(2), '(i6,a)') st%nst - this%norbs, ' states will be randomized.'
    if (this%derivative) then
      call messages_warning(2, namespace=namespace)
    else
      write(message(3), '(a)') 'You can double the number of atomic orbitals by setting'
      write(message(4), '(a)') 'LCAOExtraOrbitals to yes.'
      call messages_warning(4, namespace=namespace)
    end if
  end if

  dof = 0
  do iatom = 1, ions%natoms
    norbs = ions%atom(iatom)%species%get_niwfs()

    ! initialize the radial grid
    call submesh_init(this%sphere(iatom), ions%space, gr, ions%latt, ions%pos(:, iatom), this%radius(iatom))
    dof = dof + this%sphere(iatom)%np*this%mult*norbs
  end do

  if (this%keep_orb) then
    call messages_write('Info: LCAO requires')
    call messages_write(dof*8.0_real64, units = unit_megabytes, fmt = '(f10.1)')
    call messages_write(' of memory for atomic orbitals.')
    call messages_info(namespace=namespace)
  end if

  POP_SUB(X(lcao_alt_init_orbitals))
end subroutine X(lcao_alt_init_orbitals)

! ---------------------------------------------------------
!> The alternative implementation.
subroutine X(lcao_alt_wf) (this, st, gr, ions, hm, namespace, start)
  type(lcao_t),             intent(inout) :: this
  type(states_elec_t),      intent(inout) :: st
  type(grid_t),             intent(in)    :: gr
  type(ions_t),             intent(in)    :: ions
  type(hamiltonian_elec_t), intent(in)    :: hm
  type(namespace_t),        intent(in)    :: namespace
  integer,                  intent(in)    :: start

  integer :: iatom, jatom, ik, ispin, nev, ib, n1, n2
  integer :: ibasis, jbasis, iorb, jorb, norbs, block_evec_max, block_evec_size
  R_TYPE, allocatable :: hamiltonian(:, :), overlap(:, :), aa(:, :), bb(:, :)
  integer :: prow, pcol, ilbasis, jlbasis
  R_TYPE, allocatable :: psii(:, :, :), hpsi(:, :, :)
  type(wfs_elec_t) :: hpsib, psib
  real(real64), allocatable :: eval(:)
  R_TYPE, allocatable :: evec(:, :), levec(:, :), block_evec(:, :)
  real(real64) :: dist2
  integer :: iunit_h, iunit_s, iunit_e
  character(len=256) :: filename
  ! Variables used for some commented debug statements
  !integer :: ierr

  PUSH_SUB(X(lcao_alt_wf))

  ASSERT(start == 1)

  if (debug%info) then
    if (this%parallel .or. mpi_grp_is_root(mpi_world)) then
      ! Hamiltonian matrix
      filename = 'debug/lcao/hamiltonian'
      if (this%parallel) then
        write(filename, '(a,".",i6.6)') trim(filename), gr%mpi_grp%rank
      end if
      iunit_h = io_open(filename, namespace, action='write')
      write(iunit_h,'(4a6,a15)') 'iorb', 'jorb', 'ik', 'spin', 'hamiltonian'

      ! Overlap matrix
      filename = 'debug/lcao/overlap'
      if (this%parallel) then
        write(filename, '(a,".",i6.6)') trim(filename), gr%mpi_grp%rank
      end if
      iunit_s = io_open(filename, namespace, action='write')
      write(iunit_s,'(3a6,a15)') 'iorb', 'jorb', 'spin', 'overlap'
    end if

    if (.not. this%parallel .and. mpi_grp_is_root(mpi_world)) then
      ! Eigenvectors
      iunit_e = io_open('debug/lcao/eigenvectors', namespace, action='write')
      write(iunit_e,'(4a6,a15)') 'ieig', 'jorb', 'ik', 'spin', 'coefficient'
    end if
  end if

  if (.not. this%parallel) then
    if (mpi_grp_is_root(gr%mpi_grp)) then
      SAFE_ALLOCATE(hamiltonian(1:this%norbs, 1:this%norbs))
      SAFE_ALLOCATE(overlap(1:this%norbs, 1:this%norbs))
    end if
  else
    SAFE_ALLOCATE(hamiltonian(1:this%lsize(1), 1:this%lsize(2)))
    SAFE_ALLOCATE(overlap(1:this%lsize(1), 1:this%lsize(2)))
  end if

  SAFE_ALLOCATE(aa(1:this%maxorb, 1:this%maxorb))
  SAFE_ALLOCATE(bb(1:this%maxorb, 1:this%maxorb))
  SAFE_ALLOCATE(psii(1:gr%np_part, 1:st%d%dim, 1:this%maxorb))
  SAFE_ALLOCATE(hpsi(1:gr%np, 1:st%d%dim, 1:this%maxorb))

  call st%set_zero()
  ! As we use below an all_reduce, we need to set eigenvalues to zero.
  ! This could be done better with a allgatherv
  st%eigenval = M_ZERO

  do ispin = 1, st%d%spin_channels

    if (st%d%spin_channels > 1) then
      write(message(1), '(a)') ' '
      write(message(2), '(a,i1)') 'LCAO for spin channel ', ispin
      write(message(3), '(a)') ' '
      call messages_info(3, namespace=namespace)
    end if

    if (ispin > 1) then
      ! we need to deallocate previous orbitals
      do iatom = 1, ions%natoms
        call lcao_alt_end_orbital(this, iatom)
      end do
    end if

    ! iterate over the kpoints for this spin
    do ik = st%d%kpt%start, st%d%kpt%end
      if (ispin /= st%d%get_spin_index(ik)) cycle

      st%eigenval(:, ik) = 1e10_real64
      if (st%nik > st%d%spin_channels) then
        write(message(1), '(a)') ' '
        write(message(2), '(a,i5)') 'LCAO for k-point ', st%d%get_kpoint_index(ik)
        write(message(3), '(a)') ' '
        call messages_info(3, namespace=namespace)
      end if

      call messages_write('Calculating matrix elements.')
      call messages_info(namespace=namespace)

      call profiling_in(TOSTRING(X(LCAO_MATRIX)))

      if (this%parallel .or. mpi_grp_is_root(gr%mpi_grp)) then
        hamiltonian = R_TOTYPE(M_ZERO)
        overlap = R_TOTYPE(M_ZERO)
      end if
      if (mpi_grp_is_root(mpi_world)) call loct_progress_bar(-1, ions%natoms)

      do iatom = 1, ions%natoms
        norbs = this%norb_atom(iatom)

        if (this%complex_ylms) then
          call zlcao_alt_get_orbital(this, this%sphere(iatom), ions, ispin, iatom, this%norb_atom(iatom))
        else
          call dlcao_alt_get_orbital(this, this%sphere(iatom), ions, ispin, iatom, this%norb_atom(iatom))
        end if

        psii = M_ZERO

        call wfs_elec_init( psib, st%d%dim, this%atom_orb_basis(iatom, 1), this%atom_orb_basis(iatom, norbs), psii, ik)
        call wfs_elec_init(hpsib, st%d%dim, this%atom_orb_basis(iatom, 1), this%atom_orb_basis(iatom, norbs), hpsi, ik)

#ifdef R_TCOMPLEX
        if(this%complex_ylms) then
          call zsubmesh_batch_add(this%sphere(iatom), this%orbitals(iatom), psib)
        else
          call dzsubmesh_batch_add(this%sphere(iatom), this%orbitals(iatom), psib)
        end if
#else
        call dsubmesh_batch_add(this%sphere(iatom), this%orbitals(iatom), psib)
#endif
        call X(hamiltonian_elec_apply_batch)(hm, namespace, gr, psib, hpsib)

        do jatom = 1, ions%natoms
          if (.not. this%calc_atom(jatom)) cycle
          ! we only calculate the upper triangle
          if (jatom < iatom) cycle

          dist2 = sum((ions%pos(:, iatom) - ions%pos(:, jatom))**2)

          ! FIXME: this is only correct for KS DFT, but not Hartree-Fock or generalized KS
          if (.not. ions%space%is_periodic() .and. (dist2 > (this%radius(iatom) + this%radius(jatom) + this%lapdist)**2)) cycle

          if (this%complex_ylms) then
            call zlcao_alt_get_orbital(this, this%sphere(jatom), ions, ispin, jatom, this%norb_atom(jatom))
          else
            call dlcao_alt_get_orbital(this, this%sphere(jatom), ions, ispin, jatom, this%norb_atom(jatom))
          end if

          ibasis = this%atom_orb_basis(iatom, 1)
          jbasis = this%atom_orb_basis(jatom, 1)

          call X(submesh_batch_dotp_matrix)(this%sphere(jatom), hpsib, this%orbitals(jatom), aa)

          if (dist2 > (this%radius(iatom) + this%radius(jatom))**2) then
            bb = M_ZERO
          else
            call X(submesh_batch_dotp_matrix)(this%sphere(jatom), psib, this%orbitals(jatom), bb)
          end if

          if (.not. this%keep_orb) call lcao_alt_end_orbital(this, jatom)

          !now, store the result in the matrix

          if (this%parallel .or. mpi_grp_is_root(gr%mpi_grp)) then
            do iorb = 1, norbs
              n1 = ibasis - 1 + iorb

              ! Uncomment to output all the atomic orbitals used in the LCAO calculation
              !if (debug%info) then
              !  write(filename, '(a,i4.4,a,i1)') 'lcao-orb', n1
              !  call X(io_function_output)(OPTION__OUTPUTFORMAT__XCRYSDEN, "debug/lcao", filename, namespace, &
              !    gr, psii(:, 1, iorb), sqrt(units_out%length**(-gr%box%dim)), &
              !    ierr, ions = ions)
              !end if

              do jorb = 1, this%norb_atom(jatom)
                n2 = jbasis - 1 + jorb

                if (n2 < n1) cycle ! only upper triangle

                if (this%parallel) then
                  call lcao_local_index(this, ibasis - 1 + iorb,  jbasis - 1 + jorb, ilbasis, jlbasis, prow, pcol)
                  if (all((/prow, pcol/) == this%myroc)) then
                    hamiltonian(ilbasis, jlbasis) = aa(iorb, jorb)
                    overlap(ilbasis, jlbasis) = bb(iorb, jorb)

                    if (debug%info) then
                      write(iunit_h,'(4i6,2f15.6)') n1, n2, ik, ispin, &
                        units_from_atomic(units_out%energy, hamiltonian(ilbasis, jlbasis))
                      write(iunit_s,'(3i6,2f15.6)') n1, n2, ispin, overlap(ilbasis, jlbasis)
                    end if
                  end if

                else
                  hamiltonian(n1, n2) = aa(iorb, jorb)
                  overlap(n1, n2) = bb(iorb, jorb)

                  if (debug%info .and. mpi_grp_is_root(mpi_world)) then
                    write(iunit_h,'(4i6,2f15.6)') n1, n2, ik, ispin, units_from_atomic(units_out%energy, hamiltonian(n1, n2))
                    write(iunit_s,'(3i6,2f15.6)') n1, n2, ispin, overlap(n1, n2)
                  end if
                end if

              end do
            end do
          end if

        end do ! jatom

        call psib%end()
        call hpsib%end()

        if (.not. this%keep_orb) call lcao_alt_end_orbital(this, iatom)

        if (mpi_grp_is_root(mpi_world)) call loct_progress_bar(iatom, ions%natoms)
      end do ! iatom

      if (mpi_grp_is_root(mpi_world)) write(stdout, '(1x)')

      call messages_write('Diagonalizing Hamiltonian.')
      call messages_info(namespace=namespace)

      call profiling_out(TOSTRING(X(LCAO_MATRIX)))
      ! the number of eigenvectors we need
      nev = min(this%norbs, st%nst)

      SAFE_ALLOCATE(eval(1:this%norbs))

      if (this%parallel) then
        SAFE_ALLOCATE(levec(1:this%lsize(1), 1:this%lsize(2)))
        SAFE_ALLOCATE(evec(1:this%norbs, st%st_start:min(st%st_end, this%norbs)))
      else
        if (mpi_grp_is_root(gr%mpi_grp)) then
          SAFE_ALLOCATE(evec(1:this%norbs, 1:this%norbs))
        end if
      end if

      call profiling_in(TOSTRING(X(LCAO_DIAG)))
      if (this%parallel) then
        call diagonalization_parallel()
      else

        if (mpi_grp_is_root(gr%mpi_grp)) then
          call diagonalization_serial()
        end if

        if (gr%parallel_in_domains) then
          ! Broadcast the eigenvalues to all the nodes
          call gr%mpi_grp%bcast(eval(1), size(eval), MPI_DOUBLE_PRECISION, 0)
          ! We will not broadcast the eigenvectors at this point, because we do not
          ! want all the processes to store the full matrix at the same time, as this
          ! can use a lot of memory space.
        end if

        if (debug%info) then
          do n2 = 1, this%norbs
            do n1 = 1, this%norbs
              write(iunit_e,'(4i6,2f15.6)') n2, n1, ik, ispin, evec(n1, n2)
            end do
          end do
        end if
      end if
      call profiling_out(TOSTRING(X(LCAO_DIAG)))

      call profiling_in(TOSTRING(X(LCAO_WAVEFUNCTIONS)))

      call messages_write('Generating wavefunctions.')
      call messages_info(namespace=namespace)

      ! set the eigenvalues
      st%eigenval(1:nev, ik) = eval(1:nev)
      ! FIXME: we should calculate expectation values of the Hamiltonian here.
      ! The output will show ******* for the eigenvalues which looks like something horrible has gone wrong.

      SAFE_DEALLOCATE_A(eval)

      if (.not. this%parallel .and. gr%parallel_in_domains) then

        if (mpi_grp_is_root(mpi_world)) call loct_progress_bar(-1, this%norbs * nev)

        ! We will work on each batch of states at a time, broadcasting only the portion of evec needed by that batch of states.
        ! Unfortunately this means the loop over atoms will be repeated for each batch, which is not the most efficient way
        ! of doing this, but it avoids storing the full evec matrix in all processes.

        do ib = st%group%block_start, st%group%block_end
          if (nev < states_elec_block_min(st, ib)) cycle

          block_evec_max = min(nev, states_elec_block_max(st, ib))
          block_evec_size = block_evec_max - states_elec_block_min(st, ib) + 1

          SAFE_ALLOCATE(block_evec(1:this%norbs, 1:block_evec_size))

          if (mpi_grp_is_root(gr%mpi_grp)) then
            block_evec(1:this%norbs, 1:block_evec_size) = &
              evec(1:this%norbs, states_elec_block_min(st, ib):block_evec_max)
          end if
          call gr%mpi_grp%bcast(block_evec(1,1), size(block_evec), R_MPITYPE, 0)
          ibasis = 1
          do iatom = 1, ions%natoms
            norbs = this%norb_atom(iatom)

            if (this%complex_ylms) then
              call zlcao_alt_get_orbital(this, this%sphere(iatom), ions, ispin, iatom, this%norb_atom(iatom))
            else
              call dlcao_alt_get_orbital(this, this%sphere(iatom), ions, ispin, iatom, this%norb_atom(iatom))
            end if

            ! FIXME: this call handles spinors incorrectly.
            call X(submesh_batch_add_matrix)(this%sphere(iatom), block_evec(ibasis:, :), &
              this%orbitals(iatom), st%group%psib(ib, ik))

            ibasis = ibasis + norbs
            if (mpi_grp_is_root(mpi_world)) then
              call loct_progress_bar((states_elec_block_min(st, ib) - 1) * this%norbs + block_evec_size * (ibasis - 1), &
                this%norbs * nev)
            end if
          end do

          if (.not. this%keep_orb) then
            do iatom = 1, ions%natoms
              call lcao_alt_end_orbital(this, iatom)
            end do
          end if

          SAFE_DEALLOCATE_A(block_evec)
        end do

        if (mpi_grp_is_root(gr%mpi_grp)) then
          SAFE_DEALLOCATE_A(evec)
        end if

      else

        if (mpi_grp_is_root(mpi_world)) call loct_progress_bar(-1, this%norbs)
        ibasis = 1

        do iatom = 1, ions%natoms
          norbs = this%norb_atom(iatom)

          if (this%complex_ylms) then
            call zlcao_alt_get_orbital(this, this%sphere(iatom), ions, ispin, iatom, this%norb_atom(iatom))
          else
            call dlcao_alt_get_orbital(this, this%sphere(iatom), ions, ispin, iatom, this%norb_atom(iatom))
          end if

          do ib = st%group%block_start, st%group%block_end
            ! FIXME: this call handles spinors incorrectly.
            call X(submesh_batch_add_matrix)(this%sphere(iatom), evec(ibasis:, states_elec_block_min(st, ib):), &
              this%orbitals(iatom), st%group%psib(ib, ik))
          end do

          if (.not. this%keep_orb) call lcao_alt_end_orbital(this, iatom)

          ibasis = ibasis + norbs
          if (mpi_grp_is_root(mpi_world)) call loct_progress_bar(ibasis - 1, this%norbs)
        end do

        SAFE_DEALLOCATE_A(evec)
      end if

      if (mpi_grp_is_root(mpi_world)) write(stdout, '(1x)')
      call profiling_out(TOSTRING(X(LCAO_WAVEFUNCTIONS)))
    end do ! ik
  end do ! ispin

  if (debug%info .and. (this%parallel .or. mpi_grp_is_root(mpi_world))) then
    call io_close(iunit_h)
    call io_close(iunit_s)
    if(.not. this%parallel)  call io_close(iunit_e)
  end if

  if (st%d%kpt%parallel) then
    call comm_allreduce(st%d%kpt%mpi_grp, st%eigenval, dim=(/nev,st%nik/))
  end if


  do iatom = 1, ions%natoms
    call submesh_end(this%sphere(iatom))
    call lcao_alt_end_orbital(this, iatom)
    call this%orbitals(iatom)%end()
  end do

  if (this%parallel .or. mpi_grp_is_root(gr%mpi_grp)) then
    SAFE_DEALLOCATE_A(hamiltonian)
    SAFE_DEALLOCATE_A(overlap)
  end if

  SAFE_DEALLOCATE_A(aa)
  SAFE_DEALLOCATE_A(bb)
  SAFE_DEALLOCATE_A(psii)
  SAFE_DEALLOCATE_A(hpsi)

  POP_SUB(X(lcao_alt_wf))

contains

  !> \return evec the eigenvector
  subroutine diagonalization_parallel()

#ifdef HAVE_SCALAPACK
    integer              :: neval_found, info, lwork
    R_TYPE               :: tmp(3) !< size must be at least 3 according to ScaLAPACK
    R_TYPE,  allocatable :: work(:)
    integer, allocatable :: iwork(:), ifail(:)
#ifdef R_TCOMPLEX
    real(real64),   allocatable :: rwork(:)
#endif
    integer              :: ilbasis, jlbasis, proc(1:2), dest(1:2)
    integer              :: nevec_found, liwork, ii, node
    real(real64),   allocatable :: gap(:)
    integer, allocatable :: iclustr(:)
    integer, allocatable :: send_count(:), send_disp(:), recv_count(:), recv_disp(:), recv_pos(:, :, :)
    R_TYPE,  allocatable :: send_buffer(:, :), recv_buffer(:, :)
    real(real64)         :: orfac
#ifdef R_TCOMPLEX
    real(real64)         :: rtmp(3) !< size must be at least 3 according to ScaLAPACK
    integer              :: lrwork
#endif
#endif

    PUSH_SUB(X(lcao_alt_wf).diagonalization_parallel)

#ifdef HAVE_SCALAPACK
    SAFE_ALLOCATE(ifail(1:this%norbs))
    SAFE_ALLOCATE(iclustr(1:2*st%dom_st_proc_grid%nprocs))
    SAFE_ALLOCATE(gap(1:st%dom_st_proc_grid%nprocs))

    ! This means that we do not want reorthogonalization of close
    ! eigenvectors
    orfac = M_ZERO

#ifdef R_TREAL
    call scalapack_sygvx(ibtype = 1, jobz = 'V', range = 'I', uplo = 'U', &
      n = this%norbs, a = hamiltonian(1, 1), ia = 1, ja = 1, desca = this%desc(1), &
      b = overlap(1, 1), ib = 1, jb = 1, descb = this%desc(1), &
      vl = M_ZERO, vu = M_ONE, il = 1, iu = nev, abstol = this%diag_tol, &
      m = neval_found, nz = nevec_found, w = eval(1), orfac = orfac, &
      z = levec(1, 1), iz = 1, jz = 1, descz = this%desc(1), &
      work = tmp(1), lwork = -1, iwork = liwork, liwork = -1, &
      ifail = ifail(1), iclustr = iclustr(1), gap = gap(1), info = info)
#else
    call scalapack_hegvx(ibtype = 1, jobz = 'V', range = 'I', uplo = 'U', &
      n = this%norbs, a = hamiltonian(1, 1), ia = 1, ja = 1, desca = this%desc(1), &
      b = overlap(1, 1), ib = 1, jb = 1, descb = this%desc(1), &
      vl = M_ZERO, vu = M_ONE, il = 1, iu = nev, abstol = this%diag_tol, &
      m = neval_found, nz = nevec_found, w = eval(1), orfac = orfac, &
      z = levec(1, 1), iz = 1, jz = 1, descz = this%desc(1), &
      work = tmp(1), lwork = -1, rwork = rtmp(1), lrwork = -1, iwork = liwork, liwork = -1, &
      ifail = ifail(1), iclustr = iclustr(1), gap = gap(1), info = info)
#endif

    if (info /= 0) then
      write(message(1), '(a,i4,a)') &
        'Workspace query for LCAO parallel diagonalization failed. ScaLAPACK returned info code ', info, '.'
      call messages_warning(1, namespace=namespace)
    end if

    lwork = nint(R_REAL(tmp(1)))

    SAFE_ALLOCATE(work(1:lwork))
    SAFE_ALLOCATE(iwork(1:liwork))
    work = M_ZERO
    iwork = 0

#ifdef R_TREAL
    call scalapack_sygvx(ibtype = 1, jobz = 'V', range = 'I', uplo = 'U', &
      n = this%norbs, a = hamiltonian(1, 1), ia = 1, ja = 1, desca = this%desc(1), &
      b = overlap(1, 1), ib = 1, jb = 1, descb = this%desc(1), &
      vl = M_ZERO, vu = M_ONE, il = 1, iu = nev, abstol = this%diag_tol, &
      m = neval_found, nz = nevec_found, w = eval(1), orfac = orfac, &
      z = levec(1, 1), iz = 1, jz = 1, descz = this%desc(1), &
      work = work(1), lwork = lwork, iwork = iwork(1), liwork = liwork, &
      ifail = ifail(1), iclustr = iclustr(1), gap = gap(1), info = info)
#else
    lrwork = nint(rtmp(1))
    SAFE_ALLOCATE(rwork(1:lrwork))

    call scalapack_hegvx(ibtype = 1, jobz = 'V', range = 'I', uplo = 'U', &
      n = this%norbs, a = hamiltonian(1, 1), ia = 1, ja = 1, desca = this%desc(1), &
      b = overlap(1, 1), ib = 1, jb = 1, descb = this%desc(1), &
      vl = M_ZERO, vu = M_ONE, il = 1, iu = nev, abstol = this%diag_tol, &
      m = neval_found, nz = nevec_found, w = eval(1), orfac = orfac, &
      z = levec(1, 1), iz = 1, jz = 1, descz = this%desc(1), &
      work = work(1), lwork = lwork, rwork = rwork(1), lrwork = lrwork, iwork = iwork(1), liwork = liwork, &
      ifail = ifail(1), iclustr = iclustr(1), gap = gap(1), info = info)
    SAFE_DEALLOCATE_A(rwork)
#endif

    if (info /= 0) then
      write(message(1), '(a,i4,a)') 'LCAO parallel diagonalization failed. ScaLAPACK returned info code ', info, '.'
      call messages_warning(1, namespace=namespace)
    end if

    SAFE_DEALLOCATE_A(ifail)
    SAFE_DEALLOCATE_A(iwork)
    SAFE_DEALLOCATE_A(work)
    SAFE_DEALLOCATE_A(iclustr)
    SAFE_DEALLOCATE_A(gap)

    ! Now we have to rearrange the data between processors. We have
    ! the data in levec, distributed according to ScaLAPACK and we
    ! need to copy it to evec, distributed by columns according to
    ! state parallelization.

    SAFE_ALLOCATE(send_count(1:st%dom_st_mpi_grp%size))
    SAFE_ALLOCATE(recv_count(1:st%dom_st_mpi_grp%size))

    ! First we count the number of points (to allocate the buffers)
    send_count = 0
    recv_count = 0
    do ibasis = 1, this%norbs
      do jbasis = 1, nev

        call lcao_local_index(this, ibasis, jbasis, ilbasis, jlbasis, proc(1), proc(2))

        if (all(proc == this%myroc)) then
          ! we have to send the point

          ! we only count points per column (we have to send the
          ! same number to all points in a row)
          send_count(st%node(jbasis) + 1) = send_count(st%node(jbasis) + 1) + 1
        end if

        if (st%node(jbasis) == this%myroc(2)) then
          ! we have to receive
          call MPI_Cart_rank(st%dom_st_mpi_grp%comm, proc, node, mpi_err)
          recv_count(node + 1) = recv_count(node + 1) + 1
        end if

      end do
    end do

    SAFE_ALLOCATE(send_buffer(1:max(1, maxval(send_count)), 1:st%dom_st_mpi_grp%size))
    SAFE_ALLOCATE(recv_pos(1:2, max(1, maxval(recv_count)), 1:st%dom_st_mpi_grp%size))

    send_count = 0
    recv_count = 0
    do ibasis = 1, this%norbs
      do jbasis = 1, nev

        call lcao_local_index(this, ibasis, jbasis, ilbasis, jlbasis, proc(1), proc(2))

        if (all(proc == this%myroc)) then
          ! we have to send the point
          dest(2) = st%node(jbasis)

          do ii = 1, this%nproc(1)
            dest(1) = ii - 1

            ! get the node id from coordinates
            call MPI_Cart_rank(st%dom_st_mpi_grp%comm, dest, node, mpi_err)
            node = node + 1
            send_count(node) = send_count(node) + 1
            send_buffer(send_count(node), node) = levec(ilbasis, jlbasis)
          end do

        end if

        if (st%node(jbasis) == this%myroc(2)) then
          ! we have to receive
          call MPI_Cart_rank(st%dom_st_mpi_grp%comm, proc, node, mpi_err)
          node = node + 1

          recv_count(node) = recv_count(node) + 1
          ! where do we put it once received?
          recv_pos(1, recv_count(node), node) = ibasis
          recv_pos(2, recv_count(node), node) = jbasis
        end if

      end do
    end do

    SAFE_DEALLOCATE_A(levec)
    SAFE_ALLOCATE(recv_buffer(1:max(1, maxval(recv_count)), 1:st%dom_st_mpi_grp%size))

    SAFE_ALLOCATE(send_disp(1:st%dom_st_mpi_grp%size))
    SAFE_ALLOCATE(recv_disp(1:st%dom_st_mpi_grp%size))

    do node = 1, st%dom_st_mpi_grp%size
      send_disp(node) = ubound(send_buffer, dim = 1)*(node - 1)
      recv_disp(node) = ubound(recv_buffer, dim = 1)*(node - 1)
    end do

    call st%dom_st_mpi_grp%alltoallv(send_buffer, send_count, send_disp, R_MPITYPE, &
      recv_buffer, recv_count, recv_disp, R_MPITYPE)

    do node = 1, st%dom_st_mpi_grp%size
      do ii = 1, recv_count(node)
        evec(recv_pos(1, ii, node), recv_pos(2, ii, node)) = recv_buffer(ii, node)
      end do
    end do

    SAFE_DEALLOCATE_A(send_disp)
    SAFE_DEALLOCATE_A(send_count)
    SAFE_DEALLOCATE_A(send_buffer)
    SAFE_DEALLOCATE_A(recv_pos)
    SAFE_DEALLOCATE_A(recv_disp)
    SAFE_DEALLOCATE_A(recv_count)
    SAFE_DEALLOCATE_A(recv_buffer)

#endif /* HAVE_SCALAPACK */
    POP_SUB(X(lcao_alt_wf).diagonalization_parallel)
  end subroutine diagonalization_parallel

  subroutine diagonalization_serial()

    integer              :: neval_found, info, lwork
    R_TYPE               :: tmp(3) !< size must be at least 3 according to ScaLAPACK
    R_TYPE,  allocatable :: work(:)
    integer, allocatable :: iwork(:), ifail(:)
#ifdef R_TCOMPLEX
    real(real64),   allocatable :: rwork(:)
#endif

    PUSH_SUB(X(lcao_alt_wf).diagonalization_serial)

    SAFE_ALLOCATE(ifail(1:this%norbs))
    SAFE_ALLOCATE(iwork(1:5*this%norbs))

    ASSERT(allocated(hamiltonian))
    ASSERT(allocated(overlap))

#ifdef R_TREAL
    call lapack_sygvx(itype = 1, jobz = 'V', range = 'I', uplo = 'U', &
      n = this%norbs, a = hamiltonian(1, 1), lda = this%norbs, b = overlap(1, 1), ldb = this%norbs, &
      vl = M_ZERO, vu = M_ONE, il = 1, iu = nev, abstol = this%diag_tol, &
      m = neval_found, w = eval(1), z = evec(1, 1), ldz = this%norbs, &
      work = tmp(1), lwork = -1, iwork = iwork(1), ifail = ifail(1), info = info)
#else
    SAFE_ALLOCATE(rwork(1:7*this%norbs))

    call lapack_hegvx(itype = 1, jobz = 'V', range = 'I', uplo = 'U', &
      n = this%norbs, a = hamiltonian(1, 1), lda = this%norbs, b = overlap(1, 1), ldb = this%norbs, &
      vl = M_ZERO, vu = M_ONE, il = 1, iu = nev, abstol = this%diag_tol, &
      m = neval_found, w = eval(1), z = evec(1, 1), ldz = this%norbs, &
      work = tmp(1), lwork = -1, rwork = rwork(1), iwork = iwork(1), ifail = ifail(1), info = info)
#endif
    if (info /= 0) then
      write(message(1), '(a,i4,a)') 'Workspace query for LCAO diagonalization failed. LAPACK returned info code ', info, '.'
      call messages_warning(1, namespace=namespace)
    end if

    lwork = nint(R_REAL(tmp(1)))
    SAFE_ALLOCATE(work(1:lwork))

#ifdef R_TREAL
    call lapack_sygvx(itype = 1, jobz = 'V', range = 'I', uplo = 'U', &
      n = this%norbs, a = hamiltonian(1, 1), lda = this%norbs, b = overlap(1, 1), ldb = this%norbs, &
      vl = M_ZERO, vu = M_ONE, il = 1, iu = nev, abstol = this%diag_tol, &
      m = neval_found, w = eval(1), z = evec(1, 1), ldz = this%norbs, &
      work = work(1), lwork = lwork, iwork = iwork(1), ifail = ifail(1), info = info)
#else
    call lapack_hegvx(itype = 1, jobz = 'V', range = 'I', uplo = 'U', &
      n = this%norbs, a = hamiltonian(1, 1), lda = this%norbs, b = overlap(1, 1), ldb = this%norbs, &
      vl = M_ZERO, vu = M_ONE, il = 1, iu = nev, abstol = this%diag_tol, &
      m = neval_found, w = eval(1), z = evec(1, 1), ldz = this%norbs, &
      work = work(1), lwork = lwork,  rwork = rwork(1), iwork = iwork(1), ifail = ifail(1), info = info)

    SAFE_DEALLOCATE_A(rwork)
#endif

    if (info /= 0) then
      write(message(1), '(a,i4,a)') 'LCAO diagonalization failed. LAPACK returned info code ', info, '.'
      call messages_warning(1, namespace=namespace)
    end if

    SAFE_DEALLOCATE_A(ifail)
    SAFE_DEALLOCATE_A(iwork)
    SAFE_DEALLOCATE_A(work)


    POP_SUB(X(lcao_alt_wf).diagonalization_serial)
  end subroutine diagonalization_serial

end subroutine X(lcao_alt_wf)

 ! ---------------------------------------------------------

 !> This function generates the set of an atomic orbitals for an atom
 !! and stores it in the batch orbitalb. It can be called when the
 !! orbitals are already stored. In that case it does not do anything.
subroutine X(lcao_alt_get_orbital)(this, sphere, ions, ispin, iatom, norbs)
  type(lcao_t),      intent(inout) :: this
  type(submesh_t),   intent(in)    :: sphere
  type(ions_t),      intent(in)    :: ions
  integer,           intent(in)    :: ispin
  integer,           intent(in)    :: iatom
  integer,           intent(in)    :: norbs

  integer :: iorb, ii, ll, mm
  logical :: derivative

  PUSH_SUB(X(lcao_alt_get_orbital))

  if (.not. this%is_orbital_initialized(iatom)) then
    call profiling_in(TOSTRING(X(LCAO_ORBITALS)))

    ! FIXME: the second argument should be dim = st%d%dim, not 1!
    call X(batch_init)(this%orbitals(iatom), 1, 1, norbs, sphere%np)

    ! generate the orbitals
    do iorb = 1, norbs
      if (iorb > ions%atom(iatom)%species%get_niwfs()) then
        call ions%atom(iatom)%species%get_iwf_ilm(iorb - ions%atom(iatom)%species%get_niwfs(), ispin, ii, ll, mm)
        derivative = .true.
      else
        call ions%atom(iatom)%species%get_iwf_ilm(iorb, ispin, ii, ll, mm)
        derivative = .false.
      end if

      call X(atomic_orbital_get_submesh)(ions%atom(iatom)%species, sphere, ii, ll, mm, &
        ispin, this%orbitals(iatom)%X(ff)(:, 1, iorb), derivative = derivative)
    end do

    this%is_orbital_initialized(iatom) = .true.
    call profiling_out(TOSTRING(X(LCAO_ORBITALS)))
  end if

  POP_SUB(X(lcao_alt_get_orbital))

end subroutine X(lcao_alt_get_orbital)

!! Local Variables:
!! mode: f90
!! coding: utf-8
!! End:
