! $Id$ ! ! MODULE_DOC: This module contains GPU related types and functions to be used with the ASTAROTH nucleus. ! ! CPARAM logical, parameter :: lgpu = .true. ! !************************************************************************** ! module GPU ! use Cdata use General, only: keep_compiler_quiet, lpointer, ioptest, loptest use Messages !$ use, intrinsic :: iso_c_binding use iso_c_binding implicit none include 'gpu.h' !$ interface !$ subroutine random_initial_condition() bind(C) !$ endsubroutine random_initial_condition !$ end interface external initialize_gpu_c external register_gpu_c external finalize_gpu_c external get_farray_ptr_gpu_c external rhs_gpu_c external before_boundary_gpu_c external update_after_substep_gpu_c external radtransfer_gpu_c external load_farray_c external reload_gpu_config_c external test_rhs_c external copy_farray_c external update_on_gpu_arr_by_ind_c external update_on_gpu_scal_by_ind_c external pos_real_ptr_c external gpu_set_dt_c external torchtrain_c external torchinfer_c external get_gpu_reduced_vars_c external test_bcs_c external split_update_gpu_c integer, external :: update_on_gpu_arr_by_name_c integer, external :: update_on_gpu_scal_by_name_c !integer(KIND=ikind8) :: pFarr_GPU_in, pFarr_GPU_out type(C_PTR) :: pFarr_GPU_in, pFarr_GPU_out ! Since on the GPU calculation of df and update of f happen without synchronization ! of the decomposed portions of the subdomains (or different processes) we can't compute ! dt 'on the fly' but take it from the previous timestep (calculated at the moment on ! the first substep but the last would make more sense). This means to calculate rhs an extra time ! on the first substep to compute dt as is done normally on the CPUs logical :: lcpu_timestep_on_gpu=.false. ! Whether we compute the timestep in single precision regardless of the precision of real or not. ! We are a bit brave and do it by default in single since the exact value of the timestep ! Should never be that important: your safety factors and other controls are surely more important ! Then the neglibeble amount of round-off. ! Anyways one is supposed to be able to turn it off for testing logical :: lsingle_precision_timestep =.true. ! Astaroth empirically finds the best kernel configuration parameters by running them with different ! options and picking the best one. Sparser autotuning means prune the parameter space more than ! usual by picking only the most likely ones (empirically gives for large grids the same as the ! larger search, but is considerably faster). logical :: lac_sparse_autotuning=.true. ! Whether df is 0 or the accumulated value at the start of the kernel. ! For simplicity df is zero and then accumulated to the buffer, but sometimes you need to be careful like with ! short stopping time approximation in dustvelocity (the only case we are aware at the moment where the difference matter logical :: lcumulative_df_on_gpu=.false. ! Placeholder logical :: lskip_rtime_compilation=.false. ! By default only pde variables and those aux variables that are registered to be always read are read from the device. ! If this is true all variables are always read logical :: lread_all_vars_from_device = .false. ! Whether to use CUDA-aware MPI. If you have it you should always want to use it, but sometimes you do not have it or using ! it is more unstable than routing the communication via the host yourself. logical :: lcuda_aware_mpi=.true. ! Whether to test the agreement of bcs on GPU and CPU logical :: ltest_bcs =.false. ! Whether to test the agreement of RHS on GPU and CPU ! Will run both versions and compare the differences logical :: ltest_rhs =.false. ! At which timestep to perform the comparison ! It is useful to be able to vary this in case some samples need some ! integration time for some of the fields to have meaningful values integer :: it_test_rhs = 1 namelist /gpu_run_pars/ & ltest_bcs,lac_sparse_autotuning,lcpu_timestep_on_gpu,lsingle_precision_timestep,lcumulative_df_on_gpu,& lread_all_vars_from_device,lcuda_aware_mpi,ltest_rhs,it_test_rhs contains !*********************************************************************** subroutine train_gpu(loss, itsub, t) real :: loss integer :: itsub real(KIND=rkind8), intent(IN) :: t call torchtrain_c(loss, itsub, t) endsubroutine train_gpu !*********************************************************************** subroutine infer_gpu(flag) integer :: flag call torchinfer_c(flag) endsubroutine infer_gpu !*********************************************************************** subroutine initialize_GPU(f) ! use Mpicomm, only: MPI_COMM_PENCIL real, dimension(:,:,:,:), intent(IN) :: f integer :: lread_all_vars_from_device_int integer :: lcpu_timestep_on_gpu_int character(LEN=512) :: str ! if(ltest_rhs) lread_all_vars_from_device = .true. str='' if (lanelastic) str=trim(str)//', '//'anelastic' if (lboussinesq) str=trim(str)//', '//'boussinesq' if (lhyperresistivity_strict) str=trim(str)//', '//'hyperresi_strict' if (lhyperviscosity_strict) str=trim(str)//', '//'hypervisc_strict' if (lADI) str=trim(str)//', '//'implicit_physics' if (llorenz_gauge) str=trim(str)//', '//'lorenz_gauge' if (ltestscalar) str=trim(str)//', '//'testscalar' if (ltestfield) str=trim(str)//', '//'testfield' if (ltestflow) str=trim(str)//', '//'testflow' if (ldetonate) str=trim(str)//', '//'detonate' if (lopacity) str=trim(str)//', '//'opacity' if (lpointmasses) str=trim(str)//', '//'pointmasses' if (lsolid_cells) str=trim(str)//', '//'solid_cells' if (lparticles) str=trim(str)//', '//'particles' if (str/='') call fatal_error('initialize_GPU','no GPU implementation available for module(s) "'// & trim(str(3:))//'"') ! if (dt<=0.) dt = dtmin lread_all_vars_from_device_int = merge(1,0,lread_all_vars_from_device) lcpu_timestep_on_gpu_int = merge(1,0,lcpu_timestep_on_gpu) call initialize_gpu_c(f,MPI_COMM_PENCIL,t,nt,lread_all_vars_from_device_int,& lcpu_timestep_on_gpu_int) ! ! Load farray to gpu ! if (nt>0) call load_farray_to_GPU(f) !print'(a,1x,Z0,1x,Z0)', 'pFarr_GPU_in,pFarr_GPU_out=', pFarr_GPU_in,pFarr_GPU_out lverbose_performance_log = .true. endsubroutine initialize_GPU !************************************************************************** subroutine read_gpu_run_pars(iostat) ! use File_io, only: parallel_unit ! integer, intent(out) :: iostat ! read(parallel_unit, NML=gpu_run_pars, IOSTAT=iostat) ! endsubroutine read_gpu_run_pars !*********************************************************************** subroutine write_gpu_run_pars(unit) ! integer, intent(in) :: unit ! write(unit, NML=gpu_run_pars) ! endsubroutine write_gpu_run_pars !************************************************************************** subroutine register_GPU ! call register_gpu_c ! endsubroutine register_GPU !************************************************************************** subroutine finalize_gpu ! call finalize_gpu_c ! endsubroutine finalize_GPU !************************************************************************** subroutine get_farray_ptr_gpu call get_farray_ptr_gpu_c(pFarr_GPU_in) endsubroutine get_farray_ptr_gpu !************************************************************************** subroutine rhs_GPU(f,isubstep) ! use General, only: notanumber real, dimension (mx,my,mz,mfarray), intent(INOUT) :: f integer, intent(IN) :: isubstep ! call rhs_gpu_c(isubstep,t) ! endsubroutine rhs_GPU !************************************************************************** subroutine before_boundary_gpu(f,lrmv,isubstep,t) ! use General, only: notanumber real, dimension (mx,my,mz,mfarray), intent(INOUT) :: f integer, intent(IN) :: isubstep real(KIND=rkind8), intent(IN) :: t logical :: lrmv integer :: lrmv_int,lsubstepping_in_time_int ! !TP: pass int since integers are more compatible with C than logical to booleans if (lrmv) then lrmv_int = 1 else lrmv_int = 0 endif if(lsubstepping_in_time) then lsubstepping_in_time_int = 1 else lsubstepping_in_time_int = 0 endif call before_boundary_gpu_c(lrmv_int,isubstep,t,lsubstepping_in_time_int) ! endsubroutine before_boundary_gpu !************************************************************************** subroutine update_after_substep_gpu call update_after_substep_gpu_c endsubroutine update_after_substep_gpu !************************************************************************** subroutine gpu_set_dt ! call gpu_set_dt_c(t) ! endsubroutine gpu_set_dt !************************************************************************** function get_ptr_GPU(ind1,ind2,lout) result(pFarr) ! ! Fetches the address of the f-array counterpart on the GPU for slots from ind1 to ind2 ! and transforms it to a Fortran pointer. ! integer :: ind1 integer, optional :: ind2 logical, optional :: lout real, dimension(:,:,:,:), pointer :: pFarr integer :: i2 interface type(c_ptr) function pos_real_ptr_c(ptr,ind) import :: c_ptr, ikind8 type(c_ptr) :: ptr integer :: ind endfunction endinterface i2 = ioptest(ind2,ind1) if (loptest(lout)) then call c_f_pointer(pos_real_ptr_c(pFarr_GPU_out,ind1-1),pFarr,(/mx,my,mz,i2-ind1+1/)) else call c_f_pointer(pos_real_ptr_c(pFarr_GPU_in,ind1-1),pFarr,(/mx,my,mz,i2-ind1+1/)) endif endfunction get_ptr_GPU !************************************************************************** function get_ptr_GPU_training(ind1,ind2,lout) result(pFarr) ! For training, pointed-to array needs to be 5-dimensional (last dimension being the batch size) use Cparam use iso_c_binding integer :: ind1 integer, optional :: ind2 logical, optional :: lout real, dimension(:,:,:,:,:), pointer :: pFarr integer :: i2 interface type(c_ptr) function pos_real_ptr_c(ptr,ind) import :: c_ptr, ikind8 type(c_ptr) :: ptr integer :: ind endfunction endinterface i2 = ioptest(ind2,ind1) if (loptest(lout)) then call c_f_pointer(pos_real_ptr_c(pFarr_GPU_out,ind1-1),pFarr,(/mx,my,mz,i2-ind1+1,1/)) else call c_f_pointer(pos_real_ptr_c(pFarr_GPU_in,ind1-1),pFarr,(/mx,my,mz,i2-ind1+1,1/)) endif endfunction get_ptr_GPU_training !************************************************************************** subroutine copy_farray_from_GPU(f,nowait_) !$ use General, only: signal_wait real, dimension (mx,my,mz,mfarray), intent(OUT) :: f logical, optional :: nowait_ logical :: nowait nowait = loptest(nowait_) if (nowait) then call copy_farray_c(f) return endif ! !$ if (lfarray_copied) return ! ! Have to wait since if doing diagnostics don't want to overwrite f. ! !$ call signal_wait(lhelper_perf, .false.) call copy_farray_c(f) !$ lfarray_copied = .true. endsubroutine copy_farray_from_GPU !************************************************************************** subroutine load_farray_to_GPU(f) real, dimension (mx,my,mz,mfarray), intent(IN) :: f call load_farray_c endsubroutine load_farray_to_GPU !************************************************************************** subroutine reload_GPU_config call reload_gpu_config_c endsubroutine reload_GPU_config !************************************************************************** subroutine update_on_gpu(index, varname, value) ! ! Updates an element of the Astaroth configuration, identified by name or index, on the GPU. ! integer, intent(inout) :: index character(LEN=*),optional :: varname real, optional :: value if (index>=0) then if (present(value)) then call update_on_gpu_scal_by_ind_c(index,value) else call update_on_gpu_arr_by_ind_c(index) endif else if (present(value)) then index = update_on_gpu_scal_by_name_c(varname//char(0),value) else index = update_on_gpu_arr_by_name_c(varname//char(0)) endif if (index<0) call fatal_error('update_on_gpu','variable "'//trim(varname)//'" not found') endif endsubroutine update_on_gpu !************************************************************************** subroutine radtransfer_gpu call radtransfer_gpu_c endsubroutine !************************************************************************** subroutine get_gpu_reduced_vars(dst) real, dimension(10) :: dst call get_gpu_reduced_vars_c(dst) endsubroutine get_gpu_reduced_vars !************************************************************************** subroutine test_gpu_bcs call test_bcs_c endsubroutine test_gpu_bcs !************************************************************************** subroutine split_update_gpu(f) real, dimension (mx,my,mz,mfarray), intent(INOUT) :: f call split_update_gpu_c(f) endsubroutine split_update_gpu !************************************************************************** subroutine pushpars2c(p_par) use Syscalls, only: copy_addr use General , only: pos_in_array, string_to_enum integer, parameter :: n_pars=50 integer(KIND=ikind8), dimension(n_pars) :: p_par call copy_addr(lac_sparse_autotuning,p_par(2)) ! bool call copy_addr(lskip_rtime_compilation,p_par(3)) ! bool call copy_addr(lcumulative_df_on_gpu,p_par(4)) ! bool call copy_addr(lcuda_aware_mpi,p_par(6)) ! bool call copy_addr(ltest_bcs,p_par(7)) ! bool call copy_addr(lsingle_precision_timestep,p_par(8)) ! bool endsubroutine pushpars2c !************************************************************************** endmodule GPU