#include "SphereDataWriter.h"
#include <AMReX_FPC.H>

using namespace amrex;

namespace simflowny {

#ifdef AMREX_USE_HDF5_ASYNC
// Event-set id passed to the HDF5 *_async calls in this file. It stays 0 until
// WriteMultiLevelSpherePlot creates it with H5EScreate().
hid_t es_id_g = 0;
#endif

   // Number of azimuthal intervals of the sphere grid, declared with
   // AMREX_GPU_MANAGED. sphereMultifab assigns it from resolution[0].
   AMREX_GPU_MANAGED Real resolution_x;

   /*
    *************************************************************************
    * Trilinear interpolation of component n inside the cell whose lower
    * corner is (i, j, k).
    *
    * xfrac, yfrac and zfrac are the fractional offsets from that lower
    * corner along each axis. The eight corners (i..i+1, j..j+1, k..k+1) are
    * read from data, so all of them must be addressable by the caller.
    *************************************************************************
    */
   AMREX_GPU_DEVICE
   AMREX_FORCE_INLINE
   double
   interpolate(
      const Array4<const Real> data,
      const int i,
      const int j,
      const int k,
      const int n,
      const double xfrac,
      const double yfrac,
      const double zfrac)
   {
      // Weights of the lower (lo) and upper (hi) corner along each axis.
      const double wx_lo = 1 - xfrac;
      const double wy_lo = 1 - yfrac;
      const double wz_lo = 1 - zfrac;
      const double wx_hi = xfrac;
      const double wy_hi = yfrac;
      const double wz_hi = zfrac;

      // The corners are accumulated in a fixed order so the floating-point
      // result does not depend on how this routine is written.
      double value = wx_lo * wy_lo * wz_lo * data(i, j, k, n);
      value += wx_lo * wy_hi * wz_hi * data(i, j + 1, k + 1, n);
      value += wx_hi * wy_lo * wz_hi * data(i + 1, j, k + 1, n);
      value += wx_hi * wy_hi * wz_lo * data(i + 1, j + 1, k, n);
      value += wx_hi * wy_lo * wz_lo * data(i + 1, j, k, n);
      value += wx_lo * wy_hi * wz_lo * data(i, j + 1, k, n);
      value += wx_lo * wy_lo * wz_hi * data(i, j, k + 1, n);
      value += wx_hi * wy_hi * wz_hi * data(i + 1, j + 1, k + 1, n);
      return value;
   }

   // Configures the file-access property list used to open the plotfile.
   //
   // Without MPI the sec2 driver is selected. With MPI the MPI-IO driver is
   // selected on `comm`, and alignment, metadata block size, collective
   // metadata operations and the metadata cache configuration are set.
   #ifdef BL_USE_MPI
   static void SetHDF5fapl(hid_t fapl, MPI_Comm comm)
   #else
   static void SetHDF5fapl(hid_t fapl)
   #endif
   {
   #ifndef BL_USE_MPI
       H5Pset_fapl_sec2(fapl);
   #else
       H5Pset_fapl_mpio(fapl, comm, MPI_INFO_NULL);

       // Alignment threshold/value and metadata block size, in bytes
       const int align_bytes      = 16 * 1024;
       const int meta_block_bytes =  4 * 1024;
       H5Pset_alignment(fapl, align_bytes, align_bytes);
       H5Pset_meta_block_size(fapl, meta_block_bytes);

       // Collective metadata ops
       H5Pset_coll_metadata_write(fapl, true);
       H5Pset_all_coll_metadata_ops(fapl, true);

       // Metadata cache: fixed initial size, evictions and all automatic
       // resizing modes switched off
       H5AC_cache_config_t mdc;
       mdc.version = H5AC__CURR_CACHE_CONFIG_VERSION;
       H5Pget_mdc_config(fapl, &mdc);

       const int floor_size  = 16 * 1024;
       const int config_min  = static_cast<int>(mdc.min_size);
       mdc.set_initial_size  = 1;
       mdc.initial_size      = (config_min < floor_size) ? floor_size : config_min;
       mdc.evictions_enabled = 0;
       mdc.incr_mode         = H5C_incr__off;
       mdc.flash_incr_mode   = H5C_flash_incr__off;
       mdc.decr_mode         = H5C_decr__off;
       H5Pset_mdc_config (fapl, &mdc);
   #endif

   }

   // Creates a scalar, null-terminated string attribute `name` on `loc` and
   // writes `str` into it.
   //
   // Returns 1 on success and -1 if the attribute could not be created or
   // written. Every HDF5 handle opened here is released on all paths.
   static int CreateWriteHDF5AttrString(hid_t loc, const char *name, const char* str)
   {
       BL_ASSERT(name);
       BL_ASSERT(str);

       int status = 1;

       const hid_t space = H5Screate(H5S_SCALAR);
       const hid_t atype = H5Tcopy(H5T_C_S1);
       // The stored size includes the terminating NUL.
       H5Tset_size(atype, strlen(str)+1);
       H5Tset_strpad(atype,H5T_STR_NULLTERM);

       const hid_t attr = H5Acreate(loc, name, atype, space, H5P_DEFAULT, H5P_DEFAULT);
       if (attr < 0) {
           printf("%s: Error with H5Acreate [%s]\n", __func__, name);
           status = -1;
       } else {
           const herr_t ret = H5Awrite(attr, atype, str);
           if (ret < 0) {
               printf("%s: Error with H5Awrite[%s]\n", __func__, name);
               status = -1;
           }
           H5Aclose(attr);
       }

       // Release the type and dataspace whether or not the write succeeded.
       if (atype >= 0) { H5Tclose(atype); }
       if (space >= 0) { H5Sclose(space); }

       return status;
   }

   // Creates a 1-D attribute `name` of `n` native doubles on `loc` and writes
   // `data` into it.
   //
   // Returns 1 on success and -1 if the attribute could not be created or
   // written. The dataspace and attribute handles are released on all paths.
   static int CreateWriteHDF5AttrDouble(hid_t loc, const char *name, hsize_t n, const double *data)
   {
       int status = 1;
       hsize_t dims = n;

       const hid_t attr_space = H5Screate_simple(1, &dims, NULL);

       const hid_t attr = H5Acreate(loc, name, H5T_NATIVE_DOUBLE, attr_space, H5P_DEFAULT, H5P_DEFAULT);
       if (attr < 0) {
           printf("%s: Error with H5Acreate [%s]\n", __func__, name);
           status = -1;
       } else {
           // H5Awrite takes a const buffer, so no cast is needed.
           const herr_t ret = H5Awrite(attr, H5T_NATIVE_DOUBLE, data);
           if (ret < 0) {
               printf("%s: Error with H5Awrite [%s]\n", __func__, name);
               status = -1;
           }
           H5Aclose(attr);
       }

       if (attr_space >= 0) { H5Sclose(attr_space); }
       return status;
   }

   // Creates a 1-D attribute `name` of `n` native ints on `loc` and writes
   // `data` into it.
   //
   // Returns 1 on success and -1 if the attribute could not be created or
   // written. The dataspace and attribute handles are released on all paths.
   static int CreateWriteHDF5AttrInt(hid_t loc, const char *name, hsize_t n, const int *data)
   {
       int status = 1;
       hsize_t dims = n;

       const hid_t attr_space = H5Screate_simple(1, &dims, NULL);

       const hid_t attr = H5Acreate(loc, name, H5T_NATIVE_INT, attr_space, H5P_DEFAULT, H5P_DEFAULT);
       if (attr < 0) {
           printf("%s: Error with H5Acreate [%s]\n", __func__, name);
           status = -1;
       } else {
           // H5Awrite takes a const buffer, so no cast is needed.
           const herr_t ret = H5Awrite(attr, H5T_NATIVE_INT, data);
           if (ret < 0) {
               printf("%s: Error with H5Awrite [%s]\n", __func__, name);
               status = -1;
           }
           H5Aclose(attr);
       }

       if (attr_space >= 0) { H5Sclose(attr_space); }
       return status;
   }

   // Writes the root-level and "level_0" metadata of the sphere plotfile
   // into the already opened file `fid`.
   //
   // The sphere is described as a single-level 2-D data set (dim = 2,
   // num_levels = 1, finest_level = 0, ref_ratio = 1):
   //   - root attributes: version_name, filetype, num_components,
   //     component_<n>, dim, time, finest_level, coordinate_system,
   //     data_centering, num_levels, prob_lo
   //   - group "Chombo_global": SpaceDim
   //   - group "level_0": ref_ratio, vec_dx, prob_domain, steps, ngrid,
   //     time, ngrow
   //
   // mf          supplies ngrid (number of boxes) and ngrow
   // varnames    one name per component
   // geom        only geom[0].Coord() is read
   // level_steps only level_steps[0] is written
   // ref_ratio   not used by this routine
   // resolution  {azimuthal, polar} interval counts; cell sizes are
   //             2*pi/resolution[0] and pi/resolution[1]
   static void
   WriteGenericSpherePlotfileHeaderHDF5 (hid_t fid,
                                  MultiFab& mf,
                                  const Vector<std::string> &varnames,
                                  const Vector<Geometry> &geom,
                                  Real time,
                                  const Vector<int> &level_steps,
                                  const Vector<IntVect> &ref_ratio,
                                  const Vector<int> resolution,
                                  const std::string &versionName)
   {
      BL_PROFILE("WriteGenericSpherePlotfileHeaderHDF5()");

      int finest_level(0);

      CreateWriteHDF5AttrString(fid, "version_name", versionName.c_str());
      CreateWriteHDF5AttrString(fid, "filetype", "VanillaAMRFileType");

      int ncomp = static_cast<int>(varnames.size());
      CreateWriteHDF5AttrInt(fid, "num_components", 1, &ncomp);

      char comp_name[32];
      for (int ivar = 0; ivar < ncomp; ++ivar) {
         // Bounded formatting: never writes past comp_name.
         snprintf(comp_name, sizeof(comp_name), "component_%d", ivar);
         CreateWriteHDF5AttrString(fid, comp_name, varnames[ivar].c_str());
      }

      int ndim = 2;
      CreateWriteHDF5AttrInt(fid, "dim", 1, &ndim);
      double cur_time = static_cast<double>(time);
      CreateWriteHDF5AttrDouble(fid, "time", 1, &cur_time);
      CreateWriteHDF5AttrInt(fid, "finest_level", 1, &finest_level);

      int coord = static_cast<int>(geom[0].Coord());
      CreateWriteHDF5AttrInt(fid, "coordinate_system", 1, &coord);

      // Struct and matching compound type for prob_lo and the cell sizes
      struct VecStruct {
         double x,y;
      };

      hid_t vec_dtype = H5Tcreate (H5T_COMPOUND, sizeof(VecStruct));
      H5Tinsert (vec_dtype, "x", HOFFSET(VecStruct, x), H5T_NATIVE_DOUBLE);
      H5Tinsert (vec_dtype, "y", HOFFSET(VecStruct, y), H5T_NATIVE_DOUBLE);

      // For VisIt Chombo plot
      int data_centering = 7;    // Node-centered data
      CreateWriteHDF5AttrInt(fid, "data_centering", 1, &data_centering);
      int nlevels = 1;
      CreateWriteHDF5AttrInt(fid, "num_levels", 1, &nlevels);

      VecStruct prob_lo;
      prob_lo.x = 0;
      prob_lo.y = 0;

      hid_t prob_lo_space = H5Screate(H5S_SCALAR);
      hid_t prob_lo_attr = H5Acreate(fid, "prob_lo", vec_dtype, prob_lo_space, H5P_DEFAULT, H5P_DEFAULT);
      H5Awrite(prob_lo_attr, vec_dtype, &prob_lo);
      H5Aclose(prob_lo_attr);
      H5Sclose(prob_lo_space);

      hid_t grp = H5Gcreate(fid, "Chombo_global", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
      CreateWriteHDF5AttrInt(grp, "SpaceDim", 1, &ndim);
      H5Gclose(grp);

      // Compound type for a 2-D box: lo_i, lo_j, hi_i, hi_j
      hid_t comp_dtype = H5Tcreate (H5T_COMPOUND, 2 * 2 * sizeof(int));
      H5Tinsert (comp_dtype, "lo_i", 0 * sizeof(int), H5T_NATIVE_INT);
      H5Tinsert (comp_dtype, "lo_j", 1 * sizeof(int), H5T_NATIVE_INT);
      H5Tinsert (comp_dtype, "hi_i", 2 * sizeof(int), H5T_NATIVE_INT);
      H5Tinsert (comp_dtype, "hi_j", 3 * sizeof(int), H5T_NATIVE_INT);

      const char* level_name = "level_0";
      grp = H5Gcreate(fid, level_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
      if (grp < 0) {
         std::cout << "H5Gcreate [" << level_name << "] failed!" << std::endl;
         // Nothing can be attached to an invalid group; release the
         // datatypes instead of issuing further calls on a bad handle.
         H5Tclose(comp_dtype);
         H5Tclose(vec_dtype);
         return;
      }

      int ratio = 1;
      CreateWriteHDF5AttrInt(grp, "ref_ratio", 1, &ratio);

      VecStruct cellSizes;
      cellSizes.x = 2 * M_PI / resolution[0];
      cellSizes.y =  M_PI / resolution[1];

      hid_t dx_space = H5Screate(H5S_SCALAR);
      hid_t dx_attr = H5Acreate(grp, "vec_dx", vec_dtype, dx_space, H5P_DEFAULT, H5P_DEFAULT);
      H5Awrite(dx_attr, vec_dtype, &cellSizes);
      H5Aclose(dx_attr);
      H5Sclose(dx_space);

      // Domain box laid out as {lo_i, lo_j, hi_i, hi_j}
      int domain[2*2];
      for (int i = 0; i < 2; ++i) {
         domain[i] = 0;
         domain[i+2] = resolution[i];
      }

      hid_t domain_space = H5Screate(H5S_SCALAR);
      hid_t domain_attr = H5Acreate(grp, "prob_domain", comp_dtype, domain_space, H5P_DEFAULT, H5P_DEFAULT);
      H5Awrite(domain_attr, comp_dtype, domain);
      H5Aclose(domain_attr);
      H5Sclose(domain_space);

      CreateWriteHDF5AttrInt(grp, "steps", 1, &level_steps[0]);

      int ngrid = static_cast<int>(mf.boxArray().size());
      CreateWriteHDF5AttrInt(grp, "ngrid", 1, &ngrid);
      CreateWriteHDF5AttrDouble(grp, "time", 1, &cur_time);

      int ngrow = mf.nGrow();
      CreateWriteHDF5AttrInt(grp, "ngrow", 1, &ngrow);

      H5Gclose(grp);

      H5Tclose(comp_dtype);
      // vec_dtype was previously never released.
      H5Tclose(vec_dtype);
   }

   /*
    * Samples the multi-level data onto a spherical surface and writes the
    * result to "<plotfilename>.h5".
    *
    * Steps, in the order the code performs them:
    *   1. sphereMultifab() builds the sphere MultiFab from levels
    *      0..nlevels-1 using radius, center and resolution.
    *   2. The I/O processor creates (truncates) the file, writes the header
    *      via WriteGenericSpherePlotfileHeaderHDF5 and closes it.
    *   3. After a barrier every rank reopens the file read/write and, inside
    *      group "level_0", creates the datasets "boxes", "data:offsets=0"
    *      and "data:datatype=0".
    *   4. The I/O processor writes the boxes and offsets; every rank writes
    *      its own slice of the flattened data through dxpl_col.
    *
    * plotfilename  output path without the ".h5" suffix
    * nlevels       number of levels taken from mf / geom / level_steps
    * mf            per-level source data; mf[0]->nComp() components
    * varnames      one name per component
    * radius,center sphere definition; center must have 3 entries
    * resolution    2 entries, forwarded to sphereMultifab and the header
    * versionName   stored as the "version_name" attribute
    */
   void
   WriteMultiLevelSpherePlot (const std::string &plotfilename,
                                  int nlevels,
                                  const Vector<const MultiFab*> &mf,
                                  const Vector<std::string> &varnames,
                                  const Vector<Geometry> &geom,
                                  Real time,
                                  const Vector<int> &level_steps,
                                  const Vector<IntVect> &ref_ratio,
                                  const Real radius,
                                  const Vector<Real> center,
                                  const Vector<int> resolution,
                                  const std::string &versionName)
   {
      BL_PROFILE("WriteMultiLevelSpherePlot()");
      BL_ASSERT(nlevels <= mf.size());
      BL_ASSERT(nlevels <= geom.size());
      BL_ASSERT(nlevels <= level_steps.size());
      BL_ASSERT(mf[0]->nComp() == varnames.size());
      BL_ASSERT(resolution.size() == 2);
      BL_ASSERT(center.size() == 3);

      // Project all levels onto the sphere; everything below writes this MultiFab.
      int finest_level = nlevels-1;
      MultiFab sphere_levels_mf = sphereMultifab(mf,geom,finest_level,radius,center,resolution);

      int myProc(ParallelDescriptor::MyProc());
      int nProcs(ParallelDescriptor::NProcs());

   #ifdef AMREX_USE_HDF5_ASYNC
      // For the HDF5 async VOL, block until all previous tasks have completed.
      // On the first call the event set is created and a finalize hook is registered.
      if (es_id_g != 0) {
         async_vol_es_wait();
      }
      else {
         ExecOnFinalize(async_vol_es_wait_close);
         es_id_g = H5EScreate();
      }
   #endif

      herr_t  ret;
      int ncomp = mf[0]->nComp();
      std::string filename(plotfilename + ".h5");

      // Write out root level metadata
      hid_t fapl, dxpl_col, dxpl_ind, dcpl_id, fid, grp;

      // Only the I/O processor creates the file and writes the header,
      // using default (serial) access properties.
      if(ParallelDescriptor::IOProcessor()) {
         BL_PROFILE_VAR("H5writeMetadata", h5dwm);
         // Create the HDF5 file
         fid = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
         if (fid < 0)
            FileOpenFailed(filename.c_str());

         WriteGenericSpherePlotfileHeaderHDF5(fid, sphere_levels_mf, varnames, geom, time, level_steps, ref_ratio, resolution, versionName);
         H5Fclose(fid);
         BL_PROFILE_VAR_STOP(h5dwm);
      }

      // The header must be on disk before the other ranks reopen the file.
      ParallelDescriptor::Barrier();
      // Compound type describing one 2-D box: lo_i, lo_j, hi_i, hi_j.
      hid_t babox_id;
      babox_id = H5Tcreate (H5T_COMPOUND, 2 * 2 * sizeof(int));
      H5Tinsert (babox_id, "lo_i", 0 * sizeof(int), H5T_NATIVE_INT);
      H5Tinsert (babox_id, "lo_j", 1 * sizeof(int), H5T_NATIVE_INT);
      H5Tinsert (babox_id, "hi_i", 2 * sizeof(int), H5T_NATIVE_INT);
      H5Tinsert (babox_id, "hi_j", 3 * sizeof(int), H5T_NATIVE_INT);

      // NOTE(review): center_id is created here and closed at the end of the
      // function but is not used for any dataset or attribute in between.
      hid_t center_id = H5Tcreate (H5T_COMPOUND, 2 * sizeof(int));
      H5Tinsert (center_id, "i", 0 * sizeof(int), H5T_NATIVE_INT);
      H5Tinsert (center_id, "j", 1 * sizeof(int), H5T_NATIVE_INT);

      // fapl: file access; dxpl_col: transfer list used for the bulk data
      // write (set to collective under MPI); dxpl_ind: transfer list left at
      // defaults, used for the I/O-processor-only writes.
      fapl = H5Pcreate (H5P_FILE_ACCESS);
      dxpl_col = H5Pcreate(H5P_DATASET_XFER);
      dxpl_ind = H5Pcreate(H5P_DATASET_XFER);

   #ifdef BL_USE_MPI
      SetHDF5fapl(fapl, ParallelDescriptor::Communicator());
      H5Pset_dxpl_mpio(dxpl_col, H5FD_MPIO_COLLECTIVE);
   #else
      SetHDF5fapl(fapl);
   #endif

      dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
      H5Pset_fill_time(dcpl_id, H5D_FILL_TIME_NEVER);
   #if (defined AMREX_USE_HDF5_ZFP) || (defined AMREX_USE_HDF5_SZ)
      // NOTE(review): this section reads `compression`, which is neither a
      // parameter nor a local of this function; confirm where it is declared
      // before enabling AMREX_USE_HDF5_ZFP / AMREX_USE_HDF5_SZ.
      const char *chunk_env = NULL;
      std::string mode_env, value_env;
      double comp_value = -1.0;
      hsize_t chunk_dim = 1024;

      // Chunk length may be overridden through the HDF5_CHUNK_SIZE environment variable.
      chunk_env = getenv("HDF5_CHUNK_SIZE");
      if (chunk_env != NULL)
         chunk_dim = atoi(chunk_env);

      H5Pset_chunk(dcpl_id, 1, &chunk_dim);
      H5Pset_alloc_time(dcpl_id, H5D_ALLOC_TIME_LATE);

      // `compression` is split at '@' into "<mode>@<value>".
      std::string::size_type pos = compression.find('@');
      if (pos != std::string::npos) {
         mode_env = compression.substr(0, pos);
         value_env = compression.substr(pos+1);
         if (!value_env.empty()) {
            comp_value = atof(value_env.c_str());
         }
      }

   #ifdef AMREX_USE_HDF5_ZFP
      pos = compression.find("ZFP");
      if (pos != std::string::npos) {
         ret = H5Z_zfp_initialize();
         if (ret < 0) amrex::Abort("ZFP initialize failed!");
      }
   #endif

   #ifdef AMREX_USE_HDF5_SZ
      pos = compression.find("SZ");
      if (pos != std::string::npos) {
         // For SZ the value part is passed to H5Z_SZ_Init as a config file path.
         ret = H5Z_SZ_Init((char*)value_env.c_str());
         if (ret < 0) {
            std::cout << "SZ config file:" << value_env.c_str() << std::endl;
            amrex::Abort("SZ initialize failed, check SZ config file!");
         }
      }
   #endif

      if (!mode_env.empty() && mode_env != "None") {
         if (mode_env == "ZLIB")
            H5Pset_deflate(dcpl_id, (int)comp_value);
   #ifdef AMREX_USE_HDF5_ZFP
         else if (mode_env == "ZFP_RATE")
            H5Pset_zfp_rate(dcpl_id, comp_value);
         else if (mode_env == "ZFP_PRECISION")
            H5Pset_zfp_precision(dcpl_id, (unsigned int)comp_value);
         else if (mode_env == "ZFP_ACCURACY")
            H5Pset_zfp_accuracy(dcpl_id, comp_value);
         else if (mode_env == "ZFP_REVERSIBLE")
            H5Pset_zfp_reversible(dcpl_id);
         // NOTE(review): this second "ZLIB" branch cannot be reached; the
         // first `if` above already handles that mode.
         else if (mode_env == "ZLIB")
            H5Pset_deflate(dcpl_id, (int)comp_value);
   #endif

         if (ParallelDescriptor::MyProc() == 0) {
            std::cout << "\nHDF5 plotfile using " << mode_env << ", " <<
                value_env << ", " << chunk_dim << std::endl;
         }
      }
   #endif

      BL_PROFILE_VAR("H5writeAllLevel", h5dwd);

      // All processes open the file
   #ifdef AMREX_USE_HDF5_ASYNC
      // Only use async for writing actual data
      fid = H5Fopen_async(filename.c_str(), H5F_ACC_RDWR, fapl, es_id_g);
   #else
      fid = H5Fopen(filename.c_str(), H5F_ACC_RDWR, fapl);
   #endif

      if (fid < 0)
         FileOpenFailed(filename.c_str());

      // If the FArrayBox output descriptor differs from the native Real
      // format, the data are converted while being packed into the buffer.
      auto whichRD = FArrayBox::getDataDescriptor();
      bool doConvert(*whichRD != FPC::NativeRealDescriptor());
      int whichRDBytes(whichRD->numBytes());

      // Write data for the single sphere level ("level_0")
      char level_name[32];
      sprintf(level_name, "level_0");
#ifdef AMREX_USE_HDF5_ASYNC
      grp = H5Gopen_async(fid, level_name, H5P_DEFAULT, es_id_g);
#else
      grp = H5Gopen(fid, level_name, H5P_DEFAULT);
#endif
      if (grp < 0) { std::cout << "H5Gopen [" << level_name << "] failed!" << std::endl; }

      // Get the boxes assigned to all ranks and calculate their offsets and sizes
      Vector<int> procMap = sphere_levels_mf.DistributionMap().ProcessorMap();
      const BoxArray& grids = sphere_levels_mf.boxArray();
      hid_t boxdataset, boxdataspace;
      hid_t offsetdataset, offsetdataspace;
      std::string bdsname("boxes");
      std::string odsname("data:offsets=0");
      std::string dataname("data:datatype=0");
      hsize_t  flatdims[1];
      flatdims[0] = grids.size();
      boxdataspace = H5Screate_simple(1, flatdims, NULL);

#ifdef AMREX_USE_HDF5_ASYNC
      boxdataset = H5Dcreate_async(grp, bdsname.c_str(), babox_id, boxdataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT, es_id_g);
#else
      boxdataset = H5Dcreate(grp, bdsname.c_str(), babox_id, boxdataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
#endif
      if (boxdataset < 0) { std::cout << "H5Dcreate [" << bdsname << "] failed!" << std::endl; }

      // Create a boxarray sorted by rank
      std::map<int, Vector<Box> > gridMap;
      for(int i(0); i < grids.size(); ++i) {
         int gridProc(procMap[i]);
         Vector<Box> &boxesAtProc = gridMap[gridProc];
         boxesAtProc.push_back(grids[i]);
      }

      // Flatten the per-rank map back into arrays ordered by owning rank.
      BoxArray sortedGrids(grids.size());
      Vector<int> sortedProcs(grids.size());
      int bIndex(0);
      for(auto it = gridMap.begin(); it != gridMap.end(); ++it) {
         int proc = it->first;
         Vector<Box> &boxesAtProc = it->second;
         for(int ii(0); ii < boxesAtProc.size(); ++ii) {
             sortedGrids.set(bIndex, boxesAtProc[ii]);
             sortedProcs[bIndex] = proc;
             ++bIndex;
         }
      }

      // One offset per box plus a final entry holding the total length.
      hsize_t  oflatdims[1];
      oflatdims[0] = sortedGrids.size() + 1;
      offsetdataspace = H5Screate_simple(1, oflatdims, NULL);
#ifdef AMREX_USE_HDF5_ASYNC
      offsetdataset = H5Dcreate_async(grp, odsname.c_str(), H5T_NATIVE_LLONG, offsetdataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT, es_id_g);
#else
      offsetdataset = H5Dcreate(grp, odsname.c_str(), H5T_NATIVE_LLONG, offsetdataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
#endif
      if(offsetdataset < 0) { std::cout << "create offset dataset failed! ret = " << offsetdataset << std::endl;}

      // offsets[b] is the start of box b in the flattened data set, in
      // elements (points * components).
      Vector<unsigned long long> offsets(sortedGrids.size() + 1, 0);
      unsigned long long currentOffset(0L);
      for(int b(0); b < sortedGrids.size(); ++b) {
         offsets[b] = currentOffset;
         currentOffset += sortedGrids[b].numPts() * ncomp;
      }
      offsets[sortedGrids.size()] = currentOffset;

      // Per-rank start offset and element count inside the flattened data set.
      Vector<unsigned long long> procOffsets(nProcs, 0);
      Vector<unsigned long long> procBufferSize(nProcs, 0);
      unsigned long long totalOffset(0);
      for(auto it = gridMap.begin(); it != gridMap.end(); ++it) {
         int proc = it->first;
         Vector<Box> &boxesAtProc = it->second;
         procOffsets[proc] = totalOffset;
         procBufferSize[proc] = 0L;
         for(int b(0); b < boxesAtProc.size(); ++b) {
             procBufferSize[proc] += boxesAtProc[b].numPts() * ncomp;
         }
         totalOffset += procBufferSize[proc];
      }

      if(ParallelDescriptor::IOProcessor()) {
         // Pack each box as {lo_i, lo_j, hi_i, hi_j}; the stored upper
         // corner is bigEnd - 1.
         int vbCount(0);
         Vector<int> vbox(sortedGrids.size() * 2 * 2);
         for(int b(0); b < sortedGrids.size(); ++b) {
            for(int i(0); i < 2; ++i) {
               vbox[(vbCount * 2 * 2) + i] = sortedGrids[b].smallEnd(i);
               vbox[(vbCount * 2 * 2) + i + 2] = sortedGrids[b].bigEnd(i) - 1;
            }
            ++vbCount;
         }

         // Only proc zero needs to write out this information
         // NOTE(review): `offsets` holds unsigned long long values but is
         // written with memory type H5T_NATIVE_LLONG (signed).
#ifdef AMREX_USE_HDF5_ASYNC
         ret = H5Dwrite_async(offsetdataset, H5T_NATIVE_LLONG, H5S_ALL, H5S_ALL, dxpl_ind, &(offsets[0]), es_id_g);
#else
         ret = H5Dwrite(offsetdataset, H5T_NATIVE_LLONG, H5S_ALL, H5S_ALL, dxpl_ind, &(offsets[0]));
#endif
         if(ret < 0) { std::cout << "Write offset dataset failed! ret = " << ret << std::endl; }

#ifdef AMREX_USE_HDF5_ASYNC
         ret = H5Dwrite_async(boxdataset, babox_id, H5S_ALL, H5S_ALL, dxpl_ind, &(vbox[0]), es_id_g);
#else
         ret = H5Dwrite(boxdataset, babox_id, H5S_ALL, H5S_ALL, dxpl_ind, &(vbox[0]));
#endif
         if(ret < 0) { std::cout << "Write box dataset failed! ret = " << ret << std::endl; }
      } // end IOProcessor

      hsize_t hs_procsize[1], hs_allprocsize[1], ch_offset[1];

      ch_offset[0]       = procOffsets[myProc];          // ---- offset on this proc
      hs_procsize[0]     = procBufferSize[myProc];       // ---- size of buffer on this proc
      hs_allprocsize[0]  = offsets[sortedGrids.size()];  // ---- size of buffer on all procs

      hid_t dataspace    = H5Screate_simple(1, hs_allprocsize, NULL);
      hid_t memdataspace = H5Screate_simple(1, hs_procsize, NULL);

      /* fprintf(stderr, "Rank %d: level %d, offset %ld, size %ld, all size %ld\n", myProc, level, ch_offset[0], hs_procsize[0], hs_allprocsize[0]); */

      // Ranks that own no data select nothing in the file dataspace; they
      // still reach the H5Dwrite call below.
      if (hs_procsize[0] == 0)
         H5Sselect_none(dataspace);
      else
         H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, ch_offset, NULL, hs_procsize, NULL);

      // Staging buffer for this rank's data; strip ghost cells first if the
      // sphere MultiFab has any.
      Vector<Real> a_buffer(procBufferSize[myProc], -1.0);
      const MultiFab* data;
      std::unique_ptr<MultiFab> mf_tmp;
      if (sphere_levels_mf.nGrowVect() != 0) {
         mf_tmp = std::make_unique<MultiFab>(sphere_levels_mf.boxArray(),
                                             sphere_levels_mf.DistributionMap(),
                                             sphere_levels_mf.nComp(), 0, MFInfo(),
                                             sphere_levels_mf.Factory());
         MultiFab::Copy(*mf_tmp, sphere_levels_mf, 0, 0, sphere_levels_mf.nComp(), 0);
         data = mf_tmp.get();
      } else {
         data = &sphere_levels_mf;
      }
      // Pack every local fab back to back into a_buffer.
      Long writeDataItems(0), writeDataSize(0);
      for(MFIter mfi(*data); mfi.isValid(); ++mfi) {
         const FArrayBox &fab = (*data)[mfi];
         writeDataItems = fab.box().numPts() * (*data).nComp();
         if(doConvert) {
            RealDescriptor::convertFromNativeFormat(static_cast<void *> (a_buffer.dataPtr()+writeDataSize),
                                                     writeDataItems, fab.dataPtr(), *whichRD);
         } else {    // ---- copy from the fab
            memcpy(static_cast<void *> (a_buffer.dataPtr()+writeDataSize),
                    fab.dataPtr(), writeDataItems * whichRDBytes);
         }
         writeDataSize += writeDataItems;
      }

      BL_PROFILE_VAR("H5DwriteData", h5dwg);

#ifdef AMREX_USE_HDF5_SZ
      if (mode_env == "SZ") {
         size_t cd_nelmts;
         unsigned int* cd_values = NULL;
         unsigned filter_config;
         // NOTE(review): cd_values is filled in by SZ_metaDataToCdArray and is
         // not released in this function; confirm ownership.
         SZ_metaDataToCdArray(&cd_nelmts, &cd_values, SZ_DOUBLE, 0, 0, 0, 0, hs_allprocsize[0]);
         H5Pset_filter(dcpl_id, H5Z_FILTER_SZ, H5Z_FLAG_MANDATORY, cd_nelmts, cd_values);
      }
#endif

      // NOTE(review): a_buffer holds amrex::Real but the dataset and the
      // write both use H5T_NATIVE_DOUBLE; this presumably assumes Real is
      // double (confirm for single-precision builds).
#ifdef AMREX_USE_HDF5_ASYNC
      hid_t dataset = H5Dcreate_async(grp, dataname.c_str(), H5T_NATIVE_DOUBLE, dataspace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT, es_id_g);
#else
      hid_t dataset = H5Dcreate(grp, dataname.c_str(), H5T_NATIVE_DOUBLE, dataspace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
#endif
      if(dataset < 0)
         std::cout << ParallelDescriptor::MyProc() << "create data failed!  ret = " << dataset << std::endl;

      // NOTE(review): in the async build a_buffer is a local that goes out of
      // scope when this function returns; confirm the async VOL has consumed
      // the buffer by then.
#ifdef AMREX_USE_HDF5_ASYNC
      ret = H5Dwrite_async(dataset, H5T_NATIVE_DOUBLE, memdataspace, dataspace, dxpl_col, a_buffer.dataPtr(), es_id_g);
#else
      ret = H5Dwrite(dataset, H5T_NATIVE_DOUBLE, memdataspace, dataspace, dxpl_col, a_buffer.dataPtr());
#endif
      if(ret < 0) { std::cout << ParallelDescriptor::MyProc() << "Write data failed!  ret = " << ret << std::endl;}
      BL_PROFILE_VAR_STOP(h5dwg);

      // Release dataspaces, datasets, group, types, property lists and
      // finally the file.
      H5Sclose(memdataspace);
      H5Sclose(dataspace);
      H5Sclose(offsetdataspace);
      H5Sclose(boxdataspace);

#ifdef AMREX_USE_HDF5_ASYNC
      H5Dclose_async(dataset, es_id_g);
      H5Dclose_async(offsetdataset, es_id_g);
      H5Dclose_async(boxdataset, es_id_g);
      H5Gclose_async(grp, es_id_g);
#else
      H5Dclose(dataset);
      H5Dclose(offsetdataset);
      H5Dclose(boxdataset);
      H5Gclose(grp);
#endif

      BL_PROFILE_VAR_STOP(h5dwd);

      H5Tclose(center_id);
      H5Tclose(babox_id);
      H5Pclose(fapl);
      H5Pclose(dxpl_col);
      H5Pclose(dxpl_ind);
      H5Pclose(dcpl_id);

   #ifdef AMREX_USE_HDF5_ASYNC
      H5Fclose_async(fid, es_id_g);
   #else
      H5Fclose(fid);
   #endif
   }

   /*
    * Builds a nodal MultiFab holding the AMR data sampled on a sphere.
    *
    * The sphere is discretised with (resolution[0]+1) x (resolution[1]+1)
    * nodes; node (i, j) is stored at flat index i + j * (resolution[0]+1).
    * For every component and every level 0..finest_level,
    * calculateSphericalPoint() is run for each sphere node against each
    * local tile; later (finer) levels overwrite earlier ones.
    *
    * Across ranks, the finest level that mapped each node is found with a
    * max-reduction. Ranks whose local value comes from a coarser level reset
    * it to the sentinel, and a min-reduction then selects the remaining
    * value. Nodes no rank could map keep the sentinel
    * std::numeric_limits<Real>::max().
    *
    * mf_v          per-level source data; mf_v[0]->nComp() sets the
    *               component count of the result
    * geom          per-level geometry (cell sizes and problem lower corner)
    * finest_level  last level index to sample
    * radius,center sphere definition (center[0..2])
    * resolution    {azimuthal, polar} interval counts
    */
   MultiFab
   sphereMultifab (const Vector<const MultiFab*> &mf_v,
                  const Vector<Geometry> geom,
                  int finest_level,
                  const Real radius,
                  const Vector<Real> center,
                  const Vector<int> resolution)
   {
      const double delta_phi = M_PI / resolution[1];
      const double delta_theta = 2 * M_PI / resolution[0];

      // Flat layout of the sphere grid, kept in integer arithmetic.
      const int res_x = resolution[0];
      const int row_stride = res_x + 1;
      const int num_points = row_stride * (resolution[1] + 1);

      // Sentinel for "not mapped". It uses Real's own maximum so it is
      // representable whatever precision Real has.
      const Real unmapped = std::numeric_limits<Real>::max();

      // Keep the file-scope copy up to date. It is written once, before any
      // kernel is launched; the kernels below capture res_x / row_stride by
      // value instead of reading this global.
      resolution_x = res_x;

      const double center_x = center[0];
      const double center_y = center[1];
      const double center_z = center[2];

      //1 create box for the sphere mapping
      Box sphere_box(IntVect(AMREX_D_DECL(0,0,0)), IntVect(AMREX_D_DECL(resolution[0],resolution[1],0)), IntVect::TheNodeVector());
      BoxArray new_ba(sphere_box);
      //2 create multifabs for mapping
      DistributionMapping new_dm {new_ba};
      MultiFab sphere_mf(amrex::convert(new_ba,IntVect::TheNodeVector()), new_dm, mf_v[0]->nComp(), 0);

      //3 maps sphere box indices with original data
      for (int n = 0; n < sphere_mf.nComp(); n++) {

         Gpu::ManagedVector<Real> sphere(num_points);
         Gpu::ManagedVector<int> minLevel(num_points);
         Gpu::ManagedVector<int> minLevelLocal(num_points);

         sphere.assign(num_points, unmapped);
         minLevelLocal.assign(num_points, -1);
         minLevel.assign(num_points, -1);

         // The vectors are not resized below, so these pointers stay valid.
         Real* sphere_ptr = sphere.dataPtr();
         int* minLevel_ptr = minLevel.dataPtr();
         int* minLevelLocal_ptr = minLevelLocal.dataPtr();

         for (int lev = 0; lev <= finest_level; lev++) {
            const MultiFab* data_mf = mf_v[lev];

            const auto dx     = geom[lev].CellSizeArray();
            const auto problo = geom[lev].ProbLoArray();

      #ifdef AMREX_USE_OMP
      #pragma omp parallel if (Gpu::notInLaunchRegion())
      #endif
            {

              for (MFIter mfi(*data_mf, TilingIfNotGPU()); mfi.isValid(); ++mfi)
              {
                  // Index bounds of the current tile
                  const Box& bx = mfi.tilebox();

                  const auto loVec = amrex::lbound(bx);
                  const auto hiVec = amrex::ubound(bx);

                  // Grab fab pointers from state multifabs
                  const Array4<const Real> data  = data_mf->const_array(mfi);

                  // Every sphere node is tested against this tile; only the
                  // nodes falling inside it are written.
                  amrex::ParallelFor(sphere_box,
                  [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                  {
                     calculateSphericalPoint(data, sphere_ptr, minLevel_ptr, minLevelLocal_ptr, i, j, center_x, center_y, center_z, res_x, radius, delta_phi, delta_theta, problo, dx, loVec, hiVec, lev, n);
                  });
               }
            }
         }

         //Communicate the finest level that mapped each node
         ParallelDescriptor::Barrier();
         if (ParallelDescriptor::NProcs() > 1) {
            ParallelDescriptor::ReduceIntMax(minLevel.dataPtr(), minLevel.size());
         }
         ParallelDescriptor::Barrier();

         //Reset values mapped locally from a coarser level than the finest available
         for (int idx = 0; idx < num_points; idx++) {
            if (minLevelLocal[idx] < minLevel[idx]) {
               sphere[idx] = unmapped;
            }
         }
         ParallelDescriptor::Barrier();

         //Reduce to minimum value to get all finer point mapped values
         if (ParallelDescriptor::NProcs() > 1) {
            ParallelDescriptor::ReduceRealMin(sphere.dataPtr(), sphere.size());
         }

   #ifdef AMREX_USE_OMP
   #pragma omp parallel if (Gpu::notInLaunchRegion())
   #endif
         {

            for (MFIter mfi(sphere_mf, TilingIfNotGPU()); mfi.isValid(); ++mfi)
            {
               const Box& bx = mfi.tilebox();

               Array4<Real> data  = sphere_mf.array(mfi);
               // Copy the flat per-node values into component n.
               amrex::ParallelFor(bx,
               [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
               {
                        data(i,j,k,n) = sphere_ptr[i + j * row_stride];
               });
            }
         }
      }

     return sphere_mf;
   }
   

   /*
    *************************************************************************
    *
    * Evaluates one node (i, j) of the sphere grid against one tile.
    *
    * The node is converted to Cartesian coordinates from its angles
    * (theta = i * delta_theta, phi = j * delta_phi), the sphere centre and
    * the radius. If the cell containing that point has its lower index
    * inside [loVec, hiVec], the component `comp` is interpolated there and
    * stored in sphere[], and `lev` is recorded in minLevel[] and
    * minLevelLocal[]. Otherwise nothing is written.
    *
    *************************************************************************
    */

   AMREX_GPU_DEVICE
   AMREX_FORCE_INLINE
   void
   calculateSphericalPoint(
      const Array4<const Real> data, 
      Real *sphere,
      int *minLevel,
      int *minLevelLocal,
      const int i,
      const int j,
      const double center_x,
      const double center_y,
      const double center_z,
      const int resolution_x,
      const Real radius,
      const double delta_phi,
      const double delta_theta,
      const GpuArray<Real,BL_SPACEDIM> problo, 
      const GpuArray<Real,BL_SPACEDIM> dx, 
      const Dim3 loVec, 
      const Dim3 hiVec, 
      const int lev,
      const int comp)
   {
      // Cartesian position of the sphere node
      const Real phi = j * delta_phi;
      const Real theta = i * delta_theta;
      const Real x = center_x + radius * cos(theta) * sin(phi);
      const Real y = center_y + radius * sin(theta) * sin(phi);
      const Real z = center_z + radius * cos(phi);

      // Index of the lower corner of the cell that contains the point
      const double ii = ((x - problo[0]) / dx[0]);
      const int ibase = floor(ii);
      const double jj = ((y - problo[1]) / dx[1]);
      const int jbase = floor(jj);
      const double kk = ((z - problo[2]) / dx[2]);
      const int kbase = floor(kk);

      // Leave the outputs untouched when that corner is not in this tile
      const bool in_x = loVec.x <= ibase && hiVec.x >= ibase;
      const bool in_y = loVec.y <= jbase && hiVec.y >= jbase;
      const bool in_z = loVec.z <= kbase && hiVec.z >= kbase;
      if (!(in_x && in_y && in_z)) {
         return;
      }

      // Fractional distance of the point from the lower corner
      const Real xfrac = ( x - ((ibase*dx[0]) + problo[0])) / dx[0];
      const Real yfrac = ( y - ((jbase*dx[1]) + problo[1])) / dx[1];
      const Real zfrac = ( z - ((kbase*dx[2]) + problo[2])) / dx[2];

      // Flat position of node (i, j) in the per-node arrays
      const int index = i + j * (resolution_x + 1);

      minLevel[index] = lev;
      minLevelLocal[index] = lev;
      sphere[index] = interpolate(data, ibase, jbase, kbase, comp, xfrac, yfrac, zfrac);
   }

}

