/*******************************************************************************
* Copyright 2014-2020 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file GenerateProblem.cpp

 HPCG routine
 */

#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif

#include <limits.h>
#include <fstream>
#include "hpcg.hpp"

#include <cassert>
#include "GenerateProblem.hpp"
#include "UsmUtil.hpp"
#include "GenerateProblem_ref.hpp"


// Shorthand for a sycl::atomic_ref over T whose default memory order is
// relaxed and whose default scope is the whole device.
template <class T>
using device_atomic_ref =
    sycl::atomic_ref<T, sycl::memory_order::relaxed, sycl::memory_scope::device>;

/*!
  Routine to generate a sparse matrix, right hand side, initial guess, and exact solution.

  When HPCG_LOCAL_LONG_LONG is defined the work is delegated to GenerateProblem_ref().
  Otherwise the matrix is assembled by SYCL kernels submitted to main_queue: interior rows
  (full 27-point stencil) and boundary rows (clipped stencil, possibly referencing other
  ranks) are filled by separate kernels.

  If one of the checked allocations fails the routine returns early without completing A.

  @param[inout] A        The generated system matrix; A.geom is read, the storage and size fields are written
  @param[inout] b        The newly allocated and generated right hand side vector (if b!=0 on entry)
  @param[inout] x        The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
  @param[inout] xexact   The newly allocated solution vector with entries set to the exact solution (if xexact!=0 on entry)
  @param[in] main_queue  SYCL queue used for all allocations, kernel submissions and copies
  @param[in] runRealRef  Not referenced by this routine

  @see GenerateGeometry
*/

void GenerateProblem(SparseMatrix & A, Vector * b, Vector * x, Vector * xexact,
                     sycl::queue & main_queue, int runRealRef) {

  // The call to this reference version of GenerateProblem can be replaced with custom code.
  // However, the data structures must remain unchanged such that the CheckProblem function is satisfied.
  // Furthermore, any code must work for general unstructured sparse matrices.  Special knowledge about the
  // specific nature of the sparsity pattern may not be explicitly used.
#ifdef HPCG_LOCAL_LONG_LONG
  GenerateProblem_ref(A,b,x,xexact);
#else
  // Make local copies of geometry information.  Use global_int_t since the RHS products in the calculations
  // below may result in global range values.
  global_int_t nx = A.geom->nx;
  global_int_t ny = A.geom->ny;
  global_int_t nz = A.geom->nz;
  global_int_t gnx = A.geom->gnx;
  global_int_t gny = A.geom->gny;
  global_int_t gnz = A.geom->gnz;
  global_int_t gix0 = A.geom->gix0;
  global_int_t giy0 = A.geom->giy0;
  global_int_t giz0 = A.geom->giz0;
  int npx = A.geom->npx;
  int npy = A.geom->npy;

  sycl::event ev;

  local_int_t localNumberOfRows = nx*ny*nz; // This is the size of our subblock
  // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
  assert(localNumberOfRows>0); // Fails if the number of rows is not positive (can happen on int overflow)
  local_int_t numberOfNonzerosPerRow = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil

  global_int_t totalNumberOfRows = gnx*gny*gnz; // Total number of grid points in mesh
  // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
  assert(totalNumberOfRows>0); // Fails if the number of rows is not positive (can happen on int overflow)

  // Allocate arrays that are of length localNumberOfRows
  double ** matrixValues;
  double ** matrixDiagonal;
  local_int_t  ** mtxIndL;
  global_int_t ** mtxIndG;
  char * nonzerosInRow;
  global_int_t *localToGlobalMap = nullptr;
  matrixDiagonal = (double**) sparse_malloc_device(sizeof(double*)*localNumberOfRows, main_queue);
  matrixValues = (double**) sparse_malloc_device(sizeof(double*)*localNumberOfRows, main_queue);
  mtxIndL = ( local_int_t**) sparse_malloc_device(sizeof( local_int_t*)*localNumberOfRows,
                                                  main_queue);
  mtxIndG = (global_int_t**) sparse_malloc_device(sizeof(global_int_t*)*localNumberOfRows,
                                                  main_queue);
  nonzerosInRow = (char*) sparse_malloc_device(sizeof(char)*localNumberOfRows, main_queue);
  if (b!=0) InitializeVectorDevice(*b, localNumberOfRows, main_queue);
  if (x!=0) InitializeVectorDevice(*x, localNumberOfRows, main_queue);
  if (xexact!=0) InitializeVectorDevice(*xexact, localNumberOfRows, main_queue);
  localToGlobalMap = (global_int_t *) sparse_malloc_device( sizeof(global_int_t)*localNumberOfRows, main_queue);

  // Raw value pointers handed to the kernels; a null pointer means "vector not requested".
  double *bv = nullptr;
  double *xv = nullptr;
  double *xexactv = nullptr;
  if (b!=0) bv = b->values;
  if (x!=0) xv = x->values;
  if (xexact!=0) xexactv = xexact->values;

  A.localToGlobalMap = localToGlobalMap;
  // Now allocate the arrays pointed to
  local_int_t nnz = numberOfNonzerosPerRow*localNumberOfRows;
  global_int_t nnz_gl = ((global_int_t)numberOfNonzerosPerRow)*((global_int_t)localNumberOfRows);

  // Agree across all ranks on whether the local nonzero count still fits into an int.
  int local_flag = 1;
  if (nnz_gl > INT_MAX)
    local_flag = 0;
  int global_flag = local_flag;
#ifndef HPCG_NO_MPI
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Allreduce(&local_flag, &global_flag, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
#endif
  if (global_flag == 0)
  {
    if (A.geom->rank == 0) {
      HPCG_fout << "Error: one of the processes has overflowed local number of nonzeros.\n"
                << "The size of the problem is too large.\nRe-build HPCG with the flag -DHPCG_ILP64 in order to use long long \n"
                << "int as the local integer type.\n"
                << "Warning: For such sizes, non-optimized HPCG implementation will be used.\n";
      HPCG_fout.flush();
    }
#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif
    HPCG_Finalize();
#ifndef HPCG_NO_MPI
    MPI_Finalize();
#endif
    exit(-1);
  }

  local_int_t  *mtxL = (local_int_t*) sparse_malloc_device(sizeof(local_int_t )*nnz, main_queue);
  global_int_t *mtxG = (global_int_t*)sparse_malloc_device(sizeof(global_int_t)*nnz, main_queue);
  double       *mtxA = (double*)      sparse_malloc_device(sizeof(double      )*nnz, main_queue);
  // One entry per row that is not strictly interior to the local nx*ny*nz box.
  local_int_t *boundaryRows = (local_int_t*) sparse_malloc_device(
                            sizeof(local_int_t)*(nx*ny*nz - (nx-2)*(ny-2)*(nz-2)), main_queue);
  A.mtxL = mtxL;
  A.mtxG = mtxG;
  A.mtxA = mtxA;
  A.boundaryRows = boundaryRows;

  if ( A.mtxL == NULL || A.mtxG == NULL || A.mtxA == NULL || nonzerosInRow == NULL ||
       A.boundaryRows == NULL || mtxIndG == NULL || mtxIndL == NULL ||
       matrixValues == NULL || matrixDiagonal == NULL )
  {
      return;
  }

  A.numOfBoundaryRows = 0;

  // Single-element scratch counters: itmp_dev is accumulated atomically on the device,
  // itmp_host receives a copy of it for the host-side nonzero total.
  local_int_t *itmp_dev = (local_int_t *)sparse_malloc_shared(sizeof(local_int_t) * 1, main_queue);
  local_int_t *itmp_host = (local_int_t *)sparse_malloc_host(sizeof(local_int_t) * 1, main_queue);

  // Releases the scratch counters; used on every exit path taken after this point.
  auto release_counters = [&]() {
    if (itmp_dev  != NULL) sycl::free(itmp_dev,  main_queue.get_context());
    if (itmp_host != NULL) sycl::free(itmp_host, main_queue.get_context());
  };
  if ( itmp_dev == NULL || itmp_host == NULL ) {
    release_counters();
    return;
  }

  local_int_t numOfBoundaryRows = 0;

  //
  // Build the list of boundary rows in three chained kernels:
  //   0) the complete bottom (iz = 0) and top (iz = nz-1) planes,
  //   1) the iy = 0 and iy = ny-1 edges of every intermediate plane,
  //   2) the ix = 0 and ix = nx-1 edges (without corners) of every intermediate plane.
  //
  sycl::event setup_bdryRows_ev;
  {
    setup_bdryRows_ev = main_queue.submit([&](sycl::handler &cgh) {
      const local_int_t nBound = nx*ny;
      const local_int_t total_size = round_up_next_multiple(nBound, 256);
      auto kernel = [=] (sycl::nd_item<1> item) {
          local_int_t row = item.get_global_id(0);
          if(row < nBound) {
              local_int_t ix = row % nx;
              local_int_t iy = row / nx;

              boundaryRows[iy*nx + ix] = iy*nx + ix;
              boundaryRows[ny*nx + 2*(nz-2)*(nx+ny-2) + iy*nx + ix] = ((nz - 1)*ny + iy)*nx + ix;
          }
      };
      cgh.parallel_for<class GenerateProblemClass0>(sycl::nd_range<1>(total_size, 256), kernel);
    });
    numOfBoundaryRows += 2 * nx * ny;

    setup_bdryRows_ev = main_queue.submit([&](sycl::handler &cgh) {
      cgh.depends_on(setup_bdryRows_ev);
      const local_int_t nBound = (nz-2)*nx;
      const local_int_t total_size = round_up_next_multiple(nBound, 256);
      auto kernel = [=] (sycl::nd_item<1> item) {
          local_int_t row = item.get_global_id(0);
          if( row < nBound ) {
              local_int_t ix = row % nx;
              local_int_t iz = row / nx + 1;
              boundaryRows[ny*nx + 2*(iz-1)*(nx+ny-2) + ix ] = iz*ny*nx + ix;
              boundaryRows[ny*nx + 2*(iz-1)*(nx+ny-2) + nx + 2*(ny-2) + ix] = (iz*ny + (ny - 1))*nx + ix;
          }
      };
      cgh.parallel_for<class GenerateProblemClass1>(sycl::nd_range<1>(total_size, 256), kernel);
    });
    numOfBoundaryRows += 2 * nx * (nz-2);

    setup_bdryRows_ev = main_queue.submit([&](sycl::handler &cgh) {
      cgh.depends_on(setup_bdryRows_ev);
      const local_int_t nBound = (nz-2)*(ny-2);
      const local_int_t total_size = round_up_next_multiple(nBound, 256);
      auto kernel = [=] (sycl::nd_item<1> item) {
          local_int_t row = item.get_global_id(0);
          if( row < nBound ) {
              local_int_t iy = row % (ny-2) + 1;
              local_int_t iz = row / (ny-2) + 1;
              boundaryRows[ny*nx + 2*(iz-1)*(nx+ny-2) + nx + 2*(iy-1)]   = (iz*ny + iy)*nx;
              boundaryRows[ny*nx + 2*(iz-1)*(nx+ny-2) + nx + 2*(iy-1)+1] = (iz*ny + iy)*nx + nx - 1;
          }
      };
      cgh.parallel_for<class GenerateProblemClass2>(sycl::nd_range<1>(total_size, 256), kernel );
    });
    numOfBoundaryRows += 2 * (ny-2) * (nz-2);
  }

  A.numOfBoundaryRows = numOfBoundaryRows;
  local_int_t localNumberOfNonzeros = 0;

  // Per-rank counters of matrix entries that reference another rank (device copy plus a
  // host mirror that is only needed when there is more than one rank).
  // The host mirror is allocated first so that the failure paths below only have to free
  // buffers that this routine already releases with sycl::free.
  local_int_t *map_neib_r_host = nullptr;
  if (A.geom->size > 1) {
    map_neib_r_host = (local_int_t*) sparse_malloc_host( sizeof(local_int_t)*A.geom->size, main_queue);
    if ( map_neib_r_host == NULL ) {
      release_counters();
      return;
    }
  }

  local_int_t *map_neib_r = (local_int_t*) sparse_malloc_device( sizeof(local_int_t)*A.geom->size, main_queue);
  if ( map_neib_r == NULL ) {
    if ( map_neib_r_host != NULL ) sycl::free(map_neib_r_host, main_queue);
    release_counters();
    return;
  }

  // Zero the per-rank counters. The event is kept in its own variable so that the
  // boundary kernel, which increments these counters, can wait for the fill.
  sycl::event neib_fill_ev = main_queue.submit([&](sycl::handler &cgh) {
    cgh.fill<local_int_t>(map_neib_r, 0, A.geom->size);
  });


  //
  // fill the interior (regular part) of the mtxL/mtxA/mtxG arrays where the stencil is
  // fully utilized
  //
  ev = main_queue.submit([&](sycl::handler &cgh) {
      const local_int_t zw = nz-2;
      const local_int_t yw = ny-2;
      const local_int_t xw = nx-2;

      auto kernel = [=](sycl::item<3> item) {
            local_int_t zk = item.get_id(0) + 1; // shift over by 1 to be [1 , ... ,  nz-2]
            local_int_t yk = item.get_id(1) + 1; // shift over by 1 to be [1 , ... ,  ny-2]
            local_int_t xk = item.get_id(2) + 1; // shift over by 1 to be [1 , ... ,  nx-2]

            if( (zk < (nz-1)) && (yk < (ny-1)) && (xk < (nx-1)) ) {
              global_int_t giy = giy0 + yk;
              global_int_t giz = giz0 + zk;
              global_int_t gix = gix0 + xk;
              local_int_t  currentLocalRow  = zk*ny*nx + yk*nx + xk;
              global_int_t currentGlobalRow = giz*gny*gnx + giy*gnx + gix;

              localToGlobalMap[currentLocalRow] = currentGlobalRow;
              mtxIndG[currentLocalRow]      = mtxG + currentLocalRow * numberOfNonzerosPerRow;
              mtxIndL[currentLocalRow]      = mtxL + currentLocalRow * numberOfNonzerosPerRow;
              matrixValues[currentLocalRow] = mtxA + currentLocalRow * numberOfNonzerosPerRow;

              double *currentValuePointer = matrixValues[currentLocalRow]; // Pointer to current value in current row
              global_int_t *currentIndexPointerG = mtxIndG[currentLocalRow]; // Pointer to current index in current row
              local_int_t  *currentIndexPointerL = mtxIndL[currentLocalRow]; // Pointer to current index in current row

              // One pass per z-plane of the stencil; each pass writes the 9 entries of that plane.
              for (int sz = -1; sz <= 1; sz++) {

                *(currentValuePointer + 0) = -1.0;
                *(currentValuePointer + 1) = -1.0;
                *(currentValuePointer + 2) = -1.0;
                *(currentValuePointer + 3) = -1.0;
                *(currentValuePointer + 4) = -1.0;
                *(currentValuePointer + 5) = -1.0;
                *(currentValuePointer + 6) = -1.0;
                *(currentValuePointer + 7) = -1.0;
                *(currentValuePointer + 8) = -1.0;

                local_int_t offset = currentLocalRow + sz*ny*nx;
                *(currentIndexPointerL + 0) = offset - nx - 1;
                *(currentIndexPointerL + 1) = offset - nx;
                *(currentIndexPointerL + 2) = offset - nx + 1;
                *(currentIndexPointerL + 3) = offset - 1;
                *(currentIndexPointerL + 4) = offset;
                *(currentIndexPointerL + 5) = offset + 1;
                *(currentIndexPointerL + 6) = offset + nx - 1;
                *(currentIndexPointerL + 7) = offset + nx;
                *(currentIndexPointerL + 8) = offset + nx + 1;

                global_int_t offsetG = currentGlobalRow + sz*gny*gnx;
                *(currentIndexPointerG + 0) = offsetG - gnx - 1;
                *(currentIndexPointerG + 1) = offsetG - gnx;
                *(currentIndexPointerG + 2) = offsetG - gnx + 1;
                *(currentIndexPointerG + 3) = offsetG - 1;
                *(currentIndexPointerG + 4) = offsetG;
                *(currentIndexPointerG + 5) = offsetG + 1;
                *(currentIndexPointerG + 6) = offsetG + gnx - 1;
                *(currentIndexPointerG + 7) = offsetG + gnx;
                *(currentIndexPointerG + 8) = offsetG + gnx + 1;

                currentValuePointer  += 9;
                currentIndexPointerL += 9;
                currentIndexPointerG += 9;
              } // end sz for loop

              // The pointer is now 27 past the row start, so "-14" addresses entry 13, the stencil centre.
              *(currentValuePointer - 14) = 26.0; // update diagonal
              matrixDiagonal[currentLocalRow] = currentValuePointer - 14; // store pointer to diagonal value on row

              char numberOfNonzerosInRow = 27; // this submission is filling interior elements which have full stencil pattern
              nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
              if (bv!=0)      bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow-1));
              if (xv!=0)      xv[currentLocalRow] = 0.0;
              if (xexactv!=0) xexactv[currentLocalRow] = 1.0;
            } // if ()
      }; // kernel lambda
      cgh.parallel_for<class GenerateProblemClass4>(sycl::range<3>(zw, yw, xw), kernel);
  });
  localNumberOfNonzeros += (nz-2)*(nx-2)*(ny-2) * 27; // count all interior nodes with full stencil

  local_int_t npartz = A.geom->npartz;
  local_int_t *partz_nz = A.geom->partz_nz;
  int *partz_ids = A.geom->partz_ids;
  local_int_t rank = A.geom->rank;

  itmp_host[0] = 0; // now stand-in for local NNZ count
  // set itmp_dev to 0 prior to atomics usage in next kernel
  ev = main_queue.submit([&](sycl::handler &cgh) {
      cgh.depends_on(ev);
      cgh.fill<local_int_t>(itmp_dev, 0, 1);
  });

  //
  // fill the boundary rows, whose stencil is clipped at the global domain boundary and
  // whose columns may belong to other ranks
  //
  ev = main_queue.submit([&](sycl::handler &cgh) {
      cgh.depends_on(ev);
      cgh.depends_on(setup_bdryRows_ev);
      cgh.depends_on(neib_fill_ev); // map_neib_r must be zeroed before it is incremented below
      const local_int_t nBoundaryRows = A.numOfBoundaryRows;
      const local_int_t total_size = round_up_next_multiple(nBoundaryRows, 256);

      // Maps a global row/column index to the rank that owns it.
      auto ComputeRankOfMatrixRow2 = [=](const global_int_t gnx, const global_int_t gny,
                                         const local_int_t npartz, const local_int_t *partz_nz, const int *partz_ids,
                                         const local_int_t nx, const local_int_t ny,
                                         const int npx, const int npy, const global_int_t index)
      {
          global_int_t iz = index / (gny*gnx);
          const global_int_t iy = (index - iz*gny*gnx) / gnx;
          const global_int_t ix = index % gnx;

          // We now permit varying values for nz for any nx-by-ny plane of MPI processes.
          // npartz is the number of different groups of nx-by-ny groups of processes.
          // partz_ids is an array of length npartz where each value indicates the z process of the last process in the ith nx-by-ny group.
          // partz_nz is an array of length npartz containing the value of nz for the ith group.
          //
          //        With no variation, npartz = 1, partz_ids[0] = npz, partz_nz[0] = nz
          //
          int ipz = 0;
          int ipartz_ids = 0;
          for (int i = 0; i < npartz; ++i) {
              const int ipart_nz = partz_nz[i];
              ipartz_ids = partz_ids[i] - ipartz_ids;
              if (iz <= ipart_nz * ipartz_ids) {
                  ipz += iz / ipart_nz;
                  break;
              } else {
                  ipz += ipartz_ids;
                  iz -= ipart_nz * ipartz_ids;
              }
          }
          const int ipy  = iy / ny;
          const int ipx  = ix / nx;
          const int rank = ipz*npy*npx + ipy*npx + ipx;
          return rank;
      };

      auto kernel = [=](sycl::nd_item<1> item) {
        local_int_t row = item.get_global_id(0);

        if(row < nBoundaryRows) {
          const local_int_t currentLocalRow = boundaryRows[row];

          const local_int_t iz = currentLocalRow / (ny*nx);
          const local_int_t iy = (currentLocalRow / nx) % ny;
          const local_int_t ix = currentLocalRow % nx;

          const global_int_t giz = giz0 + iz;
          const global_int_t giy = giy0 + iy;
          const global_int_t gix = gix0 + ix;

          // Stencil extents in each direction, clipped to the global grid.
          const global_int_t sz_be = (-1 > -giz)     ? -1 : -giz;
          const global_int_t sz_en = (1 < gnz-giz-1) ? 1  : gnz-giz-1;

          const global_int_t sy_be = (-1 > -giy)     ? -1 : -giy;
          const global_int_t sy_en = (1 < gny-giy-1) ? 1  : gny-giy-1;

          const global_int_t sx_be = (-1 > -gix)     ? -1 : -gix;
          const global_int_t sx_en = (1 < gnx-gix-1) ? 1  : gnx-gix-1;

          const global_int_t currentGlobalRow = giz*gnx*gny + giy*gnx + gix;

          mtxIndG[currentLocalRow]      = mtxG + currentLocalRow*numberOfNonzerosPerRow;
          mtxIndL[currentLocalRow]      = mtxL + currentLocalRow*numberOfNonzerosPerRow;
          matrixValues[currentLocalRow] = mtxA + currentLocalRow*numberOfNonzerosPerRow;

          localToGlobalMap[currentLocalRow] = currentGlobalRow;

          char numberOfNonzerosInRow = 0;
          double *currentValuePointer = matrixValues[currentLocalRow]; // Pointer to current value in current row
          global_int_t *currentIndexPointerG = mtxIndG[currentLocalRow]; // Pointer to current index in current row
          local_int_t  *currentIndexPointerL = mtxIndL[currentLocalRow];

          for (global_int_t sz = sz_be; sz <= sz_en; sz++) {
            for (global_int_t sy = sy_be; sy <= sy_en; sy++) {
              for (global_int_t sx = sx_be; sx <= sx_en; sx++) {
                global_int_t g_col = currentGlobalRow + sz*gnx*gny + sy*gnx + sx;
                local_int_t  l_col = currentLocalRow  + sz*nx*ny   + sy*nx  + sx;
                if (g_col == currentGlobalRow) {
                    matrixDiagonal[currentLocalRow] = currentValuePointer;
                    *(currentValuePointer) = 26.0;
                } else {
                    *(currentValuePointer) = -1.0;
                }
                *(currentIndexPointerG) = g_col;
                const int rankIdOfColumnEntry = ComputeRankOfMatrixRow2(gnx, gny, npartz, partz_nz, partz_ids, (local_int_t)nx, (local_int_t)ny, npx, npy, g_col);
                if( rank == rankIdOfColumnEntry ) {
                    *(currentIndexPointerL) = l_col;
                } else {
                    // Many work-items hit the same per-rank counter, so the increment must be atomic.
                    device_atomic_ref<local_int_t>(map_neib_r[rankIdOfColumnEntry]).fetch_add(1);
                    // Off-rank column: store the owning rank encoded as a negative value.
                    *(currentIndexPointerL) = -1 - rankIdOfColumnEntry;
                }
                numberOfNonzerosInRow++;

                // move to next pointer locations
                currentValuePointer++;
                currentIndexPointerG++;
                currentIndexPointerL++;
              } // end sx loop
            } // end sy loop
          } // end sz loop
          nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
          device_atomic_ref<local_int_t>(itmp_dev[0]).fetch_add(numberOfNonzerosInRow);
          if( bv!=0 ) bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow-1));
          if( xv!=0 ) xv[currentLocalRow] = 0.0;
          if( xexactv!=0 ) xexactv[currentLocalRow] = 1.0;

        } // if row < nBoundaryRows
      };
      cgh.parallel_for<class GenerateProblemClass5>(sycl::nd_range<1>(total_size, 256), kernel);
  });
  ev = main_queue.memcpy(itmp_host, itmp_dev, 1*sizeof(local_int_t), {ev});
  A.work = map_neib_r; // save device version
  ev.wait();

  localNumberOfNonzeros += itmp_host[0]; // record unpredictable count of local nonzeros

  // compute number of MPI neighbors: every rank with a non-zero counter is a neighbor
  local_int_t number_of_neighbors = 0;
  if( A.geom->size > 1 ) {
      main_queue.memcpy(map_neib_r_host, map_neib_r, A.geom->size * sizeof(local_int_t), {ev}).wait();
      for( local_int_t i = 0; i < A.geom->size; i ++ ) {
          number_of_neighbors += (map_neib_r_host[i] > 0);
      }
      sycl::free(map_neib_r_host, main_queue);
  }


  // The following block replaces a previous host-side loop so that we can make A.boundaryRows
  // into device-side data (instead of malloc_shared). The cost is the additional shared
  // memory in gtl_keys, gtl_values which is immediately freed after filling A.globalToLocalMap.
  // A long-term fix might be to replace A.globalToLocalMap with some device-side data structure.
  {
      auto gtl_keys = sycl::malloc_shared<global_int_t>(A.numOfBoundaryRows, main_queue);
      auto gtl_values = sycl::malloc_shared<local_int_t>(A.numOfBoundaryRows, main_queue);
      local_int_t * A_boundaryRows = A.boundaryRows;
      auto gtl_ev = main_queue.submit([&](sycl::handler& cgh) {
          cgh.parallel_for(
              sycl::range<1>(A.numOfBoundaryRows),
              [=](sycl::id<1> id) {
                  local_int_t currentLocalRow = A_boundaryRows[id];

                  local_int_t iz = currentLocalRow/(ny*nx);
                  local_int_t iy = currentLocalRow/nx%ny;
                  local_int_t ix = currentLocalRow%nx;

                  global_int_t giz = giz0+iz;
                  global_int_t giy = giy0+iy;
                  global_int_t gix = gix0+ix;
                  global_int_t currentGlobalRow = giz*gnx*gny+giy*gnx+gix;

                  gtl_keys[id] = currentGlobalRow;
                  gtl_values[id] = currentLocalRow;
              });
      });
      gtl_ev.wait();
      for (local_int_t i = 0; i < A.numOfBoundaryRows; i++) {
          A.globalToLocalMap[gtl_keys[i]] = gtl_values[i];
      }
      sycl::free(gtl_keys, main_queue);
      sycl::free(gtl_values, main_queue);
  }


  global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
  // Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
  MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
  long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
  MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
  totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
  totalNumberOfNonzeros = localNumberOfNonzeros;
#endif
  // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
  // This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
  assert(totalNumberOfNonzeros>0); // Fails if the number of nonzeros is not positive (can happen on int overflow)

  A.title = 0;
#ifndef HPCG_NO_MPI
  A.numberOfSendNeighbors = number_of_neighbors;
#endif
  A.totalNumberOfRows = totalNumberOfRows;
  A.totalNumberOfNonzeros = totalNumberOfNonzeros;
  A.localNumberOfRows = localNumberOfRows;
  A.localNumberOfColumns = localNumberOfRows;
  A.localNumberOfNonzeros = localNumberOfNonzeros;
  A.nonzerosInRow = nonzerosInRow;
  A.mtxIndG = mtxIndG;
  A.mtxIndL = mtxIndL;
  A.matrixValues = matrixValues;
  A.matrixDiagonal = matrixDiagonal;

  release_counters();

#endif

  return;
}
