/*
 Copyright (C) 2016 X. Andrade

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2, or (at your option)
 any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 02110-1301, USA.

*/

#include <config.h>

#ifdef HAVE_CUDA
#if defined(HAVE_HIP)
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <hip/hiprtc.h>
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html
#define CUcontext hipCtx_t
#define cuCtxCreate hipCtxCreate
#define cuCtxDestroy hipCtxDestroy
#define cuCtxSetCacheConfig hipCtxSetCacheConfig
#define CUDA_ERROR_OUT_OF_MEMORY hipErrorOutOfMemory
#define CUDA_SUCCESS hipSuccess
#define CUdevice hipDevice_t
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR hipDeviceAttributeComputeCapabilityMajor
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR hipDeviceAttributeComputeCapabilityMinor
#define CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X hipDeviceAttributeMaxBlockDimX
#define CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y hipDeviceAttributeMaxBlockDimY
#define CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z hipDeviceAttributeMaxBlockDimZ
#define CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X hipDeviceAttributeMaxGridDimX
#define CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y hipDeviceAttributeMaxGridDimY
#define CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z hipDeviceAttributeMaxGridDimZ
#define CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK hipDeviceAttributeMaxSharedMemoryPerBlock
#define CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK hipDeviceAttributeMaxThreadsPerBlock
#define CU_DEVICE_ATTRIBUTE_WARP_SIZE hipDeviceAttributeWarpSize
#define cuDeviceGet hipDeviceGet
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cuDeviceGetCount hipGetDeviceCount
#define cuDeviceGetName hipDeviceGetName
#define CUdevprop hipDeviceProp_t
#define cuDeviceGetProperties hipGetDeviceProperties
#define CUdeviceptr hipDeviceptr_t
#define cuDeviceTotalMem hipDeviceTotalMem
#define cuDriverGetVersion hipDriverGetVersion
#define CU_FUNC_ATTRIBUTE_BINARY_VERSION HIP_FUNC_ATTRIBUTE_BINARY_VERSION
#define CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
#define CU_FUNC_ATTRIBUTE_PTX_VERSION HIP_FUNC_ATTRIBUTE_PTX_VERSION
#define CU_FUNC_CACHE_PREFER_L1 hipFuncCachePreferL1
#define cuFuncGetAttribute hipFuncGetAttribute
#define CUfunction hipFunction_t
#define cuGetErrorName hipDrvGetErrorName
#define cuInit hipInit
#ifdef __HIP_PLATFORM_AMD__
// these are missing from hip/nvidia_detail/nvidia_hip_runtime_api.h
#define CU_JIT_ERROR_LOG_BUFFER hipJitOptionErrorLogBuffer
#define CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES hipJitOptionErrorLogBufferSizeBytes
#endif
#define CUjit_option hipJitOption
#define cuLaunchKernel hipModuleLaunchKernel
#define cuMemAlloc hipMalloc
#define cuMemcpyDtoHAsync hipMemcpyDtoHAsync
#define cuMemcpyHtoDAsync hipMemcpyHtoDAsync
#define cuMemFree hipFree
#define CUmodule hipModule_t
#define cuModuleGetFunction hipModuleGetFunction
#define cuModuleLoadDataEx hipModuleLoadDataEx
#define cuModuleUnload hipModuleUnload
#define CUresult hipError_t
#define CUstream hipStream_t
#define cuStreamCreate hipStreamCreateWithFlags
#define cuStreamDestroy hipStreamDestroy
#define CU_STREAM_NON_BLOCKING hipStreamNonBlocking
#define cuStreamSynchronize hipStreamSynchronize
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_RTC_API_supported_by_HIP.html
#define nvrtcCompileProgram hiprtcCompileProgram
#define nvrtcCreateProgram hiprtcCreateProgram
#define nvrtcDestroyProgram hiprtcDestroyProgram
#define nvrtcGetErrorString hiprtcGetErrorString
#define nvrtcGetProgramLog hiprtcGetProgramLog
#define nvrtcGetProgramLogSize hiprtcGetProgramLogSize
#define nvrtcGetPTX hiprtcGetCode
#define nvrtcGetPTXSize hiprtcGetCodeSize
#define nvrtcProgram hiprtcProgram
#define nvrtcResult hiprtcResult
#define NVRTC_SUCCESS HIPRTC_SUCCESS
#else
#include <cuda.h>
#include <nvrtc.h>
#endif
// all kernels and transfers are submitted to this non-blocking stream
// -> allows operations from libraries to overlap with this stream
// Array of stream handles; cuda_init allocates it with number_streams
// entries and creates each one with CU_STREAM_NON_BLOCKING.
CUstream *phStream;
// Index into phStream of the stream used by the launch/copy/synchronize
// entry points in this file; changed through cuda_set_stream.
int current_stream;
// Number of streams created by cuda_init.
static int number_streams = 32;
#else
#include <stdint.h>
typedef intptr_t CUcontext;
typedef intptr_t CUdevice;
typedef intptr_t CUmodule;
typedef intptr_t CUfunction;
typedef intptr_t CUdeviceptr;
typedef intptr_t CUstream;
#endif

#include <cmath>
#include <stdlib.h> //we have to include this before cmath to workaround a bug in the PGI "compiler".

#include <iostream>

#include <fstream>

#include "string_f.h" /* fortran <-> c string compatibility issues */

#include <cassert>
#include <cstring>
#include <iterator>
#include <map>
#include <sstream>
#include <stdbool.h>
#include <vector>
#include <cstdlib>
#include <regex>

#include <fortran_types.h>

// Evaluates the run-time-compiler call `x` exactly once. If the result is not
// NVRTC_SUCCESS, prints the failing expression together with its error string
// to stderr and terminates the process with exit status 1.
#define NVRTC_SAFE_CALL(x)                                                     \
  do {                                                                         \
    const nvrtcResult nvrtc_status_ = x;                                       \
    if (nvrtc_status_ == NVRTC_SUCCESS)                                        \
      break;                                                                   \
    std::cerr << "\nerror: " #x " failed with error "                          \
              << nvrtcGetErrorString(nvrtc_status_) << '\n';                   \
    exit(1);                                                                   \
  } while (0)

// Evaluates the driver-API call `x` exactly once. If the result is not
// CUDA_SUCCESS, prints the failing expression and the error name to stderr
// (plus a hint for out-of-memory errors) and terminates the process with
// exit status 1.
//
// The error name is looked up defensively: when the lookup itself fails or
// yields a null string, a placeholder is printed instead of streaming an
// invalid pointer.
#define CUDA_SAFE_CALL(x)                                                      \
  do {                                                                         \
    CUresult result = x;                                                       \
    if (result != CUDA_SUCCESS) {                                              \
      const char *msg = nullptr;                                               \
      if (cuGetErrorName(result, &msg) != CUDA_SUCCESS || msg == nullptr)      \
        msg = "unknown error";                                                 \
      std::cerr << "\nerror: " #x " failed with error " << msg << '\n';        \
      if (result == CUDA_ERROR_OUT_OF_MEMORY) {                                \
        std::cerr << "Octopus could not allocate enough memory on the GPU.\n"; \
        std::cerr                                                              \
            << "Please use either more GPUs to distribute the memory or try "  \
               "StatesPack = no to keep the states mostly on the CPU.\n";      \
      }                                                                        \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)

using namespace std;

// Initializes the driver API, selects a device, creates a context on it and
// creates the pool of non-blocking streams used by this file.
//
//   context        out: newly allocated context handle
//   device         out: newly allocated device handle
//   stream         out: address of the first stream in the pool
//   device_number  in/out: requested base device; overwritten with the
//                  device index that was actually selected
//   rank           in: process rank, used to spread processes over devices
//
// Device selection: when CTEST_RESOURCE_GROUP_COUNT is set, every
// CTEST_RESOURCE_GROUP_<i>_GPUS variable must have the form
// "id:<n>,slots:<m>" and the device is ids[rank % count]. Otherwise the
// device is (device_number + rank) % ndevices.
//
// Any failure prints a message and terminates the process. Without HAVE_CUDA
// the function does nothing.
extern "C" void FC_FUNC_(cuda_init,
                         CUDA_INIT)(CUcontext **context, CUdevice **device,
                                    CUstream **stream, fint *device_number,
                                    fint *rank) {

#ifdef HAVE_CUDA
  CUDA_SAFE_CALL(cuInit(0));

  *context = new CUcontext;
  *device = new CUdevice;

  int ndevices;

  CUDA_SAFE_CALL(cuDeviceGetCount(&ndevices));

  if (ndevices == 0) {
    cerr << "Error: no CUDA devices available." << std::endl;
    exit(1);
  }

  const auto* group_count = std::getenv("CTEST_RESOURCE_GROUP_COUNT");
  auto resource_group = std::vector<int>();
  if (group_count != nullptr) {
    // Parse the count and build the pattern once instead of on every
    // iteration.
    const int ngroups = std::stoi(group_count);
    const auto re = std::regex("id:(\\d+),slots:(\\d+)");
    for (int i = 0; i < ngroups; ++i) {
      std::stringstream env_name;
      env_name << "CTEST_RESOURCE_GROUP_" << i << "_GPUS";
      const auto* rg_gpus = std::getenv(env_name.str().c_str());
      if (rg_gpus == nullptr){
        std::cerr << "CTEST_RESOURCE_GROUP " << i << " does not have _GPUS group" << std::endl;
        exit(1);
      }
      const auto rg_gpus_string = std::string(rg_gpus);
      std::smatch match;
      if (!std::regex_match(rg_gpus_string, match, re)){
        std::cerr << "Unexpected environment variable\n" << env_name.str() << " = " << rg_gpus << std::endl;
        exit(1);
      }
      // full match + two capture groups
      assert(match.size() == 3);
      resource_group.emplace_back(std::stoi(match[1]));
    }
    assert(ngroups >= 0 &&
           resource_group.size() == static_cast<size_t>(ngroups));
  }

  if (!resource_group.empty()){
    *device_number = resource_group[*rank % resource_group.size()];
    if (*device_number >= ndevices){
      std::cerr << "Requested unsupported GPU: " << *device_number << "/" << ndevices << std::endl;
      exit(1);
    }
  } else {
    *device_number = (*device_number + *rank) % ndevices;
  }

  CUDA_SAFE_CALL(cuDeviceGet(*device, *device_number));

  CUDA_SAFE_CALL(cuCtxCreate(*context, 0, **device));

#ifdef __HIP_PLATFORM_NVIDIA__
  // bug in hip/nvidia_detail/nvidia_hip_runtime_api.h
  CUDA_SAFE_CALL(cuCtxSetCacheConfig(static_cast<CUfunc_cache>(CU_FUNC_CACHE_PREFER_L1)));
#elif defined(__HIP_PLATFORM_AMD__)
  // hipErrorNotSupported
#else
  CUDA_SAFE_CALL(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1));
#endif

  // Create the whole stream pool, then make stream 0 the active one.
  phStream = new CUstream[number_streams];
  for (current_stream = 0; current_stream < number_streams; ++current_stream) {
    CUDA_SAFE_CALL(
        cuStreamCreate(&phStream[current_stream], CU_STREAM_NON_BLOCKING));
  }
  current_stream = 0;
  *stream = &phStream[current_stream];
#endif
}

// Tears down what cuda_init created: destroys every stream of the pool,
// releases the pool array, destroys the context and frees the context and
// device handles. Without HAVE_CUDA the function does nothing.
//
//   context  in: handle allocated by cuda_init (deleted here)
//   device   in: handle allocated by cuda_init (deleted here)
extern "C" void FC_FUNC_(cuda_end, CUDA_END)(CUcontext **context,
                                             CUdevice **device) {
#ifdef HAVE_CUDA

  // cuda_init creates number_streams streams; destroy all of them, not only
  // the currently selected one, and free the array that holds them.
  for (int i = 0; i < number_streams; ++i) {
    CUDA_SAFE_CALL(cuStreamDestroy(phStream[i]));
  }
  delete[] phStream;
  phStream = nullptr;
  current_stream = 0;

  CUDA_SAFE_CALL(cuCtxDestroy(**context));

  delete *context;
  delete *device;
#endif
}

// Allocates an empty name -> module-handle map and returns it through
// module_map.
extern "C" void
FC_FUNC_(cuda_module_map_init,
         CUDA_MODULE_MAP_INIT)(map<string, CUmodule *> **module_map) {
  typedef map<string, CUmodule *> module_cache;
  module_cache *fresh = new module_cache;
  *module_map = fresh;
}

// Unloads (when built with HAVE_CUDA) and frees every module stored in the
// map, then frees the map itself.
extern "C" void
FC_FUNC_(cuda_module_map_end,
         CUDA_MODULE_MAP_END)(map<string, CUmodule *> **module_map) {

  map<string, CUmodule *> &cache = **module_map;

  for (auto &entry : cache) {
    CUmodule *loaded = entry.second;
#ifdef HAVE_CUDA
    CUDA_SAFE_CALL(cuModuleUnload(*loaded));
#endif
    delete loaded;
  }

  delete *module_map;
}

// Compiles the kernel source file `fname` with the run-time compiler using
// `flags`, loads the result as a module and caches it in module_map under the
// key fname+flags. A second request with the same key returns the cached
// module without recompiling.
//
//   module_map  in/out: cache of already built modules
//   module      out: cached or newly allocated module handle
//   device      in: device whose architecture the code is compiled for
//   fname       in: Fortran string with the source file name
//   flags       in: Fortran string with additional compiler flags
//
// Compilation messages are written to stdout. Any compilation or load failure
// prints a message and terminates the process. Without HAVE_CUDA the function
// does nothing.
extern "C" void FC_FUNC_(cuda_build_program, CUDA_BUILD_PROGRAM)(
    map<string, CUmodule *> **module_map, CUmodule **module, CUdevice **device,
    STR_F_TYPE const fname, STR_F_TYPE const flags STR_ARG2) {
#ifdef HAVE_CUDA
  char *fname_c;
  char *flags_c;

  TO_C_STR1(fname, fname_c);
  TO_C_STR2(flags, flags_c);

  string map_descriptor = string(fname_c) + string(flags_c);

  map<string, CUmodule *>::iterator map_it =
      (**module_map).find(map_descriptor);
  if (map_it != (**module_map).end()) {
    *module = map_it->second;
    free(fname_c);
    free(flags_c); // both C strings must be released on the cache-hit path
    return;
  }

  // The program handed to the compiler is a single #include of the file.
  const string source = "#include \"" + string(fname_c) + "\"\n";

  nvrtcProgram prog;
  NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, source.c_str(), "kernel_include.c",
                                     0, NULL, NULL));

  int major = 0, minor = 0;
  CUDA_SAFE_CALL(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, **device));
  CUDA_SAFE_CALL(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, **device));

  // Built as a std::string so that a two-digit major version (e.g. 10.0)
  // cannot overflow a fixed-size buffer.
  const string compute_version = std::to_string(major) + std::to_string(minor);

  string gpu_architecture;
#ifdef __HIP_PLATFORM_AMD__
  CUdevprop prop;
  CUDA_SAFE_CALL(cuDeviceGetProperties(&prop, **device));
  gpu_architecture = string(" --gpu-architecture=") + prop.gcnArchName + string(" --no-default-config ");
#elif defined(HAVE_CUBIN)
  // use actual architecture for cubin generation
  gpu_architecture = string(" --gpu-architecture=sm_") + compute_version + string(" ");
#else
  gpu_architecture = string(" --gpu-architecture=compute_") + compute_version + string(" ");
#endif

  string all_flags =
#ifdef HAVE_HIP
      " -O3 -fcuda-flush-denormals-to-zero -ffp-contract=fast -DCUDA " +
#else
      " --ftz=true --fmad=true -DCUDA -default-device " +
#endif
      gpu_architecture + string(flags_c);

  free(flags_c);

  // Split the flag string on whitespace; the compiler wants one option per
  // array entry.
  stringstream flags_stream(all_flags);
  istream_iterator<string> iter(flags_stream);
  istream_iterator<string> end;
  vector<string> tokens(iter, end);

  vector<const char *> opts;
  opts.reserve(tokens.size());
  for (size_t ii = 0; ii < tokens.size(); ii++)
    opts.push_back(tokens[ii].c_str());

  nvrtcResult err =
      nvrtcCompileProgram(prog, static_cast<int>(opts.size()), opts.data());

  // Fetch the log before acting on the compile result so that diagnostics are
  // shown for failures as well. One extra zeroed byte keeps the buffer
  // non-empty and terminated.
  size_t logSize;
  NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
  vector<char> log(logSize + 1, '\0');
  NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log.data()));

  if (logSize > 1) {

    cout << "Cuda compilation messages" << endl;

    cout << "File    : " << fname_c << endl;

    cout << "Options : " << all_flags << endl;

    cout << log.data() << endl;
  }

  if (NVRTC_SUCCESS != err) {
    cerr << "Error in compiling" << endl;
    exit(1);
  }

  // Obtain PTX or CUBIN from the program.
  size_t ptxSize;
#ifdef HAVE_CUBIN
  // CUBIN avoids incompatibilities with the driver as the RTC library directly compiles to binary format
  NVRTC_SAFE_CALL(nvrtcGetCUBINSize(prog, &ptxSize));
  vector<char> ptx(ptxSize);
  NVRTC_SAFE_CALL(nvrtcGetCUBIN(prog, ptx.data()));
#else
  NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
  vector<char> ptx(ptxSize);
  NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx.data()));
#endif

  NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));

  *module = new CUmodule;

  const int num_options = 2;
  CUjit_option options[num_options];
  void *option_values[num_options];

  // Fixed-size, zero-initialised buffer: printing it is safe even when the
  // loader wrote nothing into it.
  const unsigned log_size = 4096;
  char log_buffer[log_size] = {0};

  options[0] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  option_values[0] = (void *)(long)log_size;

  options[1] = CU_JIT_ERROR_LOG_BUFFER;
  option_values[1] = (void *)log_buffer;

  CUresult result = cuModuleLoadDataEx(*module, ptx.data(), num_options,
                                       options, option_values);

  if (result != CUDA_SUCCESS) {
    std::cerr << log_buffer << std::endl;
    const char *msg = nullptr;
    if (cuGetErrorName(result, &msg) != CUDA_SUCCESS || msg == nullptr)
      msg = "unknown error";
    std::cerr << "\nerror: cuModuleLoadDataEx failed with error " << msg
              << '\n';
    exit(1);
  }

  (**module_map)[map_descriptor] = *module;

  free(fname_c);
#endif
}

// Allocates a function handle, returns it through `kernel` and fills it with
// the function called `kernel_name` looked up in `module`. A failed lookup
// terminates the process. Without HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_create_kernel,
                         CUDA_CREATE_KERNEL)(CUfunction **kernel,
                                             CUmodule **module,
                                             STR_F_TYPE kernel_name STR_ARG1) {
#ifdef HAVE_CUDA
  char *name_c;
  TO_C_STR1(kernel_name, name_c);

  CUfunction *handle = new CUfunction;
  *kernel = handle;
  CUDA_SAFE_CALL(cuModuleGetFunction(handle, **module, name_c));

  free(name_c);
#endif
}

// Unloads the module and frees its handle. Without HAVE_CUDA the function
// does nothing.
extern "C" void FC_FUNC_(cuda_release_module,
                         CUDA_RELEASE_MODULE)(CUmodule **module) {
#ifdef HAVE_CUDA
  CUmodule *handle = *module;
  CUDA_SAFE_CALL(cuModuleUnload(*handle));
  delete handle;
#endif
}

// Frees the function handle allocated by cuda_create_kernel. Without
// HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_release_kernel,
                         CUDA_RELEASE_KERNEL)(CUfunction **kernel) {
#ifdef HAVE_CUDA
  CUfunction *handle = *kernel;
  delete handle;
#endif
}

// Stores the device attribute CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK in
// *max_threads. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_max_threads_per_block,
                         CUDA_DEVICE_MAX_THREADS_PER_BLOCK)(CUdevice **device,
                                                            fint *max_threads) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int limit;
  CUDA_SAFE_CALL(cuDeviceGetAttribute(
      &limit, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev));
  *max_threads = limit;
#endif
}

// Stores the function attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK of
// `kernel` in *max_threads. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_kernel_max_threads_per_block,
                         CUDA_KERNEL_MAX_THREADS_PER_BLOCK)(CUfunction **kernel,
                                                            fint *max_threads) {
#ifdef HAVE_CUDA
  const CUfunction fn = **kernel;
  int limit;
  CUDA_SAFE_CALL(cuFuncGetAttribute(
      &limit, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fn));
  *max_threads = limit;
#endif
}


// Stores the device attribute CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X in
// *max_dim. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_max_block_dim_x,
                         CUDA_DEVICE_MAX_BLOCK_DIM_X)(CUdevice **device,
                                                            fint *max_dim) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int extent;
  CUDA_SAFE_CALL(
      cuDeviceGetAttribute(&extent, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev));
  *max_dim = extent;
#endif
}

// Stores the device attribute CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y in
// *max_dim. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_max_block_dim_y,
                         CUDA_DEVICE_MAX_BLOCK_DIM_Y)(CUdevice **device,
                                                            fint *max_dim) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int extent;
  CUDA_SAFE_CALL(
      cuDeviceGetAttribute(&extent, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev));
  *max_dim = extent;
#endif
}

// Stores the device attribute CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z in
// *max_dim. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_max_block_dim_z,
                         CUDA_DEVICE_MAX_BLOCK_DIM_Z)(CUdevice **device,
                                                            fint *max_dim) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int extent;
  CUDA_SAFE_CALL(
      cuDeviceGetAttribute(&extent, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev));
  *max_dim = extent;
#endif
}

// Stores the device attribute CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X in
// *max_dim. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_max_grid_dim_x,
                         CUDA_DEVICE_MAX_GRID_DIM_X)(CUdevice **device,
                                                            fint *max_dim) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int extent;
  CUDA_SAFE_CALL(
      cuDeviceGetAttribute(&extent, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev));
  *max_dim = extent;
#endif
}

// Stores the device attribute CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y in
// *max_dim. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_max_grid_dim_y,
                         CUDA_DEVICE_MAX_GRID_DIM_Y)(CUdevice **device,
                                                            fint *max_dim) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int extent;
  CUDA_SAFE_CALL(
      cuDeviceGetAttribute(&extent, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev));
  *max_dim = extent;
#endif
}

// Stores the device attribute CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z in
// *max_dim. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_max_grid_dim_z,
                         CUDA_DEVICE_MAX_GRID_DIM_Z)(CUdevice **device,
                                                            fint *max_dim) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int extent;
  CUDA_SAFE_CALL(
      cuDeviceGetAttribute(&extent, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev));
  *max_dim = extent;
#endif
}



// Stores the value reported by cuDeviceTotalMem for `device` in
// *total_memory. Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_total_memory,
                         CUDA_DEVICE_TOTAL_MEMORY)(CUdevice **device,
                                                   fint8 *total_memory) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  size_t bytes;
  CUDA_SAFE_CALL(cuDeviceTotalMem(&bytes, dev));
  *total_memory = bytes;
#endif
}

// Stores the device attribute
// CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK in *shared_memory. Without
// HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_shared_memory,
                         CUDA_DEVICE_SHARED_MEMORY)(CUdevice **device,
                                                    fint8 *shared_memory) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int per_block;
  CUDA_SAFE_CALL(cuDeviceGetAttribute(
      &per_block, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev));
  *shared_memory = per_block;
#endif
}

// Allocates a device-pointer handle, returns it through cuda_ptr and points
// it at a new device allocation of *size bytes. An allocation failure
// terminates the process. Without HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_mem_alloc, CUDA_MEM_ALLOC)(CUdeviceptr **cuda_ptr,
                                                         const fint8 *size) {
#ifdef HAVE_CUDA
  CUdeviceptr *handle = new CUdeviceptr;
  *cuda_ptr = handle;
#ifdef __HIP_PLATFORM_NVIDIA__
  // bug in hip/nvidia_detail/nvidia_hip_runtime_api.h
  CUDA_SAFE_CALL(cuMemAlloc(reinterpret_cast<void**>(handle), *size));
#else
  CUDA_SAFE_CALL(cuMemAlloc(handle, *size));
#endif
#endif
}

// Frees the device allocation referenced by the handle and then the handle
// itself. Without HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_mem_free, CUDA_MEM_FREE)(CUdeviceptr **cuda_ptr) {
#ifdef HAVE_CUDA
  CUdeviceptr *handle = *cuda_ptr;
#ifdef __HIP_PLATFORM_NVIDIA__
  // bug in hip/nvidia_detail/nvidia_hip_runtime_api.h
  CUDA_SAFE_CALL(cuMemFree(reinterpret_cast<void*>(*handle)));
#else
  CUDA_SAFE_CALL(cuMemFree(*handle));
#endif
  delete handle;
#endif
}

// Enqueues an asynchronous host-to-device copy of *size bytes from *data to
// the device address **cuda_ptr + *offset on the currently selected stream.
// Without HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_memcpy_htod,
                         CUDA_MEMCPY_HTOD)(CUdeviceptr **cuda_ptr,
                                           /* const */ void **data, fint8 *size, // https://github.com/ROCm/HIP/issues/3444
                                           fint8 *offset) {
#ifdef HAVE_CUDA
  CUdeviceptr destination = **cuda_ptr + *offset;
  CUstream active = phStream[current_stream];
  CUDA_SAFE_CALL(cuMemcpyHtoDAsync(destination, *data, *size, active));
#endif
}

// Enqueues an asynchronous device-to-host copy of *size bytes from the device
// address **cuda_ptr + *offset to *data on the currently selected stream.
// Without HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_memcpy_dtoh,
                         CUDA_MEMCPY_DTOH)(CUdeviceptr **cuda_ptr, void **data,
                                           fint8 *size, fint8 *offset) {
#ifdef HAVE_CUDA
  CUdeviceptr origin = **cuda_ptr + *offset;
  CUstream active = phStream[current_stream];
  CUDA_SAFE_CALL(cuMemcpyDtoHAsync(*data, origin, *size, active));
#endif
}

// Allocates an empty kernel-argument list and returns it through arg_array.
extern "C" void FC_FUNC_(cuda_alloc_arg_array,
                         CUDA_ALLOC_ARG_ARRAY)(vector<void *> **arg_array) {
  typedef vector<void *> arg_list;
  arg_list *fresh = new arg_list;
  *arg_array = fresh;
}

// Frees every stored argument copy and then the argument list itself.
extern "C" void FC_FUNC_(cuda_free_arg_array,
                         CUDA_FREE_ARG_ARRAY)(vector<void *> **arg_array) {

  vector<void *> *args = *arg_array;
  for (void *slot : *args) {
    free(slot);
  }
  delete args;
}

// Stores a copy of the device pointer **cuda_ptr as kernel argument number
// *arg_index, growing the argument list when needed.
//
// Storage for the slot is always reallocated: a slot left over from an
// earlier call may have been sized for a smaller argument, and copying
// sizeof(CUdeviceptr) bytes into it would write past the allocation.
extern "C" void FC_FUNC_(cuda_kernel_set_arg_buffer,
                         CUDA_KERNEL_SET_ARG_BUFFER)(vector<void *> **arg_array,
                                                     CUdeviceptr **cuda_ptr,
                                                     fint *arg_index) {

  vector<void *> &args = **arg_array;

  if (unsigned(*arg_index) >= args.size())
    args.resize(*arg_index + 1, NULL);

  void *&slot = args[*arg_index];
  free(slot); // free(NULL) is a no-op
  slot = malloc(sizeof(CUdeviceptr));
  if (slot == NULL) {
    std::cerr << "\nerror: could not allocate storage for kernel argument "
              << *arg_index << '\n';
    exit(1);
  }

  memcpy(slot, *cuda_ptr, sizeof(CUdeviceptr));
}

// Stores a copy of the *size bytes at *arg as kernel argument number
// *arg_index, growing the argument list when needed.
//
// Storage for the slot is always reallocated: a slot left over from an
// earlier call may have been sized for a smaller argument, and copying *size
// bytes into it would write past the allocation.
extern "C" void FC_FUNC_(cuda_kernel_set_arg_value,
                         CUDA_KERNEL_SET_ARG_VALUE)(vector<void *> **arg_array,
                                                    void **arg, fint *arg_index,
                                                    fint *size) {

  vector<void *> &args = **arg_array;

  if (unsigned(*arg_index) >= args.size())
    args.resize(*arg_index + 1, NULL);

  void *&slot = args[*arg_index];
  free(slot); // free(NULL) is a no-op
  slot = malloc(*size);
  if (slot == NULL) {
    // malloc(0) may legitimately return NULL; only a real failure is fatal.
    if (*size != 0) {
      std::cerr << "\nerror: could not allocate storage for kernel argument "
                << *arg_index << '\n';
      exit(1);
    }
    return;
  }

  memcpy(slot, *arg, *size);
}

// Blocks until the currently selected stream has finished its work. Without
// HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_context_synchronize, CUDA_CONTEXT_SYNCHRONIZE)() {
#ifdef HAVE_CUDA
  CUstream active = phStream[current_stream];
  CUDA_SAFE_CALL(cuStreamSynchronize(active));
#endif
}

// Blocks until every stream of the pool has finished its work, visiting them
// in index order. Without HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_synchronize_all_streams,
                         CUDA_SYNCHRONIZE_ALL_STREAMS)() {
#ifdef HAVE_CUDA
  int remaining = number_streams;
  CUstream *cursor = phStream;
  while (remaining > 0) {
    CUDA_SAFE_CALL(cuStreamSynchronize(*cursor));
    ++cursor;
    --remaining;
  }
#endif
}

// Launches `kernel` on the currently selected stream with the given grid and
// block dimensions (three entries each), *shared_mem bytes of dynamic shared
// memory and the arguments collected in arg_array. The stored argument copies
// are freed and the list is emptied afterwards. Without HAVE_CUDA the
// function does nothing.
extern "C" void FC_FUNC_(cuda_launch_kernel,
                         CUDA_LAUNCH_KERNEL)(CUfunction **kernel,
                                             fint8 *griddim, fint8 *blockdim,
                                             fint8 *shared_mem,
                                             vector<void *> **arg_array) {
#ifdef HAVE_CUDA

  /* Debugging aid, disabled:

  cout << "Kernel call" << endl;

  int nn;
  CUDA_SAFE_CALL(cuFuncGetAttribute(&nn,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, **kernel));
  cout << "SIZE   " << nn << endl;
  CUDA_SAFE_CALL(cuFuncGetAttribute(&nn, CU_FUNC_ATTRIBUTE_PTX_VERSION,
      **kernel));
  cout << "PTX    " << nn << endl;
  CUDA_SAFE_CALL(cuFuncGetAttribute(&nn, CU_FUNC_ATTRIBUTE_BINARY_VERSION,
      **kernel));
  cout << "BINARY " << nn << endl;

  for(unsigned ii = 0; ii < (**arg_array).size(); ii++)
    cout << ii << " " << (**arg_array)[ii] << endl;

  cout << "GRID  " << griddim[0] << " " << griddim[1] << " " << griddim[2]
       << endl;
  cout << "BLOCK " << blockdim[0] << " " << blockdim[1] << " " << blockdim[2]
       << endl;
  cout << "SHARED MEMORY  " << *shared_mem << endl; */

  vector<void *> &args = **arg_array;

  // Every argument slot must have been set before launching.
  assert(!args.empty());
  for (vector<void *>::const_iterator it = args.begin(); it != args.end();
       ++it)
    assert(*it != NULL);

  CUstream active = phStream[current_stream];
  CUDA_SAFE_CALL(cuLaunchKernel(**kernel, griddim[0], griddim[1], griddim[2],
                                blockdim[0], blockdim[1], blockdim[2],
                                *shared_mem, active, args.data(), NULL));

  // release the stored argument, this is not necessary in principle,
  // but it should help us to detect missing arguments.
  for (vector<void *>::iterator it = args.begin(); it != args.end(); ++it)
    free(*it);
  args.clear();
#endif
}

// Copies the device name reported by cuDeviceGetName into the Fortran string
// `name`. Without HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_device_name,
                         CUDA_DEVICE_NAME)(CUdevice **device,
                                           STR_F_TYPE name STR_ARG1) {
#ifdef HAVE_CUDA
  char buffer[200];
  const CUdevice dev = **device;
  CUDA_SAFE_CALL(cuDeviceGetName(buffer, sizeof(buffer), dev));
  TO_F_STR1(buffer, name);
#endif
}

// Stores the major and minor compute capability of `device` in *major and
// *minor. Without HAVE_CUDA the outputs are left untouched.
extern "C" void FC_FUNC_(cuda_device_capability,
                         CUDA_DEVICE_CAPABILITY)(CUdevice **device, fint *major,
                                                 fint *minor) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int capability[2] = {0, 0};
  CUDA_SAFE_CALL(cuDeviceGetAttribute(
      &capability[0], CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
  CUDA_SAFE_CALL(cuDeviceGetAttribute(
      &capability[1], CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
  *major = capability[0];
  *minor = capability[1];
#endif
}

// Stores the value reported by cuDriverGetVersion in *version. Without
// HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_driver_version,
                         CUDA_DRIVER_VERSION)(fint *version) {
#ifdef HAVE_CUDA
  int reported;
  CUDA_SAFE_CALL(cuDriverGetVersion(&reported));
  *version = reported;
#endif
}

// Stores the device attribute CU_DEVICE_ATTRIBUTE_WARP_SIZE in *warpSize.
// Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_device_get_warpsize,
                         CUDA_DEVICE_GET_WARPSIZE)(CUdevice **device,
                                                   fint *warpSize) {
#ifdef HAVE_CUDA
  const CUdevice dev = **device;
  int lanes = 0;
  CUDA_SAFE_CALL(
      cuDeviceGetAttribute(&lanes, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev));
  *warpSize = lanes;
#endif
}

// Returns the device address stored in the handle as a plain void pointer.
// Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_deref, CUDA_DEREF)(CUdeviceptr **cuda_ptr,
                                                 void **cuda_deref_ptr) {
#ifdef HAVE_CUDA
  const CUdeviceptr address = **cuda_ptr;
  *cuda_deref_ptr = (void *)address;
#endif
}

// Selects the stream used by subsequent launches, copies and synchronizations
// and returns its address through `stream`.
//
//   stream  out: address of the selected stream inside the pool
//   number  in: one-based stream number; it is wrapped into the pool size
//
// The wrapped index is forced to be non-negative: the built-in % yields a
// negative result for *number <= 0, which would index before the start of
// the pool. Without HAVE_CUDA the function does nothing.
extern "C" void FC_FUNC_(cuda_set_stream, CUDA_SET_STREAM)(CUstream **stream,
                                                           fint *number) {
#ifdef HAVE_CUDA
  int index = (*number - 1) % number_streams;
  if (index < 0)
    index += number_streams;
  current_stream = index;
  *stream = &phStream[current_stream];
#endif
}

// Stores the one-based number of the currently selected stream in *number.
// Without HAVE_CUDA the output is left untouched.
extern "C" void FC_FUNC_(cuda_get_stream, CUDA_GET_STREAM)(fint *number) {
#ifdef HAVE_CUDA
  const int one_based = current_stream + 1;
  *number = one_based;
#endif
}

// Allocates a new device-pointer handle, returns it through buffer_offset and
// sets it to the address **buffer advanced by *offset elements of
// sizeof(double) bytes each.
extern "C" void
FC_FUNC_(cuda_get_pointer_with_offset,
         CUDA_GET_POINTER_WITH_OFFSET)(CUdeviceptr **buffer, fint8 *offset,
                                       CUdeviceptr **buffer_offset) {
  CUdeviceptr *shifted = new CUdeviceptr;
  *buffer_offset = shifted;

  // Pointer arithmetic is done on double*, so the offset counts doubles.
  double *base = (double *)**buffer;
  *shifted = (CUdeviceptr)(base + (ptrdiff_t)*offset);
}

// Frees a device-pointer handle; the device memory it refers to is not
// touched.
extern "C" void FC_FUNC_(cuda_clean_pointer,
                         CUDA_CLEAN_POINTER)(CUdeviceptr **buffer) {
  CUdeviceptr *handle = *buffer;
  delete handle;
}
