From 0b6d255e77bebeaaf2c57c1a1f3b504c040ed443 Mon Sep 17 00:00:00 2001 From: Craig Warren Date: Thu, 3 Mar 2022 17:32:43 +0000 Subject: [PATCH] Work on templating for CUDA/OpenCL --- gprMax/cuda_opencl_el/knl_fields_updates.py | 138 +++++++- .../knl_pml_updates_electric_HORIPML.py | 299 ++++++++++++------ .../knl_pml_updates_magnetic_HORIPML.py | 298 +++++++++++------ gprMax/cuda_opencl_el/knl_snapshots.py | 54 +++- gprMax/cuda_opencl_el/knl_source_updates.py | 98 +++++- gprMax/cuda_opencl_el/knl_store_outputs.py | 31 +- gprMax/updates.py | 240 ++++---------- 7 files changed, 769 insertions(+), 389 deletions(-) diff --git a/gprMax/cuda_opencl_el/knl_fields_updates.py b/gprMax/cuda_opencl_el/knl_fields_updates.py index 644a6eac..5acda02f 100644 --- a/gprMax/cuda_opencl_el/knl_fields_updates.py +++ b/gprMax/cuda_opencl_el/knl_fields_updates.py @@ -18,7 +18,32 @@ from string import Template -update_electric = Template(""" +update_electric = { + 'args_cuda': Template(""" + __global__ void update_electric(int NX, + int NY, + int NZ, + const unsigned int* __restrict__ ID, + $REAL *Ex, + $REAL *Ey, + $REAL *Ez, + const $REAL* __restrict__ Hx, + const $REAL* __restrict__ Hy, + const $REAL* __restrict__ Hz) + """), + 'args_opencl': Template(""" + int NX, + int NY, + int NZ, + __global const unsigned int* restrict ID, + __global $REAL *Ex, + __global $REAL *Ey, + __global $REAL *Ez, + __global const $REAL * restrict Hx, + __global const $REAL * restrict Hy, + __global const $REAL * restrict Hz + """), + 'func': Template(""" // Electric field updates - normal materials. // // Args: @@ -35,6 +60,8 @@ update_electric = Template(""" int y_ID = ((i % ($NX_ID * $NY_ID * $NZ_ID)) % ($NY_ID * $NZ_ID)) / $NZ_ID; int z_ID = ((i % ($NX_ID * $NY_ID * $NZ_ID)) % ($NY_ID * $NZ_ID)) % $NZ_ID; + $CUDA_IDX + // Ex component if ((NY != 1 || NZ != 1) && x >= 0 && x < NX && y > 0 && y < NY && z > 0 && z < NZ) { int materialEx = ID[IDX4D_ID(0,x_ID,y_ID,z_ID)]; @@ -58,9 +85,34 @@ update_electric = Template(""" updatecoeffsE[IDX2D_MAT(materialEz,1)] * (Hy[IDX3D_FIELDS(x,y,z)] - Hy[IDX3D_FIELDS(x-1,y,z)]) - updatecoeffsE[IDX2D_MAT(materialEz,2)] * (Hx[IDX3D_FIELDS(x,y,z)] - Hx[IDX3D_FIELDS(x,y-1,z)]); } -""") + """)} -update_magnetic = Template(""" +update_magnetic = { + 'args_cuda': Template(""" + __global__ void update_magnetic(int NX, + int NY, + int NZ, + const unsigned int* __restrict__ ID, + $REAL *Hx, + $REAL *Hy, + $REAL *Hz, + const $REAL* __restrict__ Ex, + const $REAL* __restrict__ Ey, + const $REAL* __restrict__ Ez) + """), + 'args_opencl': Template(""" + int NX, + int NY, + int NZ, + __global const unsigned int* restrict ID, + __global $REAL *Hx, + __global $REAL *Hy, + __global $REAL *Hz, + __global const $REAL * restrict Ex, + __global const $REAL * restrict Ey, + __global const $REAL * restrict Ez + """), + 'func': Template(""" // Magnetic field updates - normal materials. // // Args: @@ -77,6 +129,8 @@ update_magnetic = Template(""" int y_ID = ((i % ($NX_ID * $NY_ID * $NZ_ID)) % ($NY_ID * $NZ_ID)) / $NZ_ID; int z_ID = ((i % ($NX_ID * $NY_ID * $NZ_ID)) % ($NY_ID * $NZ_ID)) % $NZ_ID; + $CUDA_IDX + // Hx component if (NX != 1 && x > 0 && x < NX && y >= 0 && y < NY && z >= 0 && z < NZ) { int materialHx = ID[IDX4D_ID(3,x_ID,y_ID,z_ID)]; @@ -100,9 +154,44 @@ update_magnetic = Template(""" updatecoeffsH[IDX2D_MAT(materialHz,1)] * (Ey[IDX3D_FIELDS(x+1,y,z)] - Ey[IDX3D_FIELDS(x,y,z)]) + updatecoeffsH[IDX2D_MAT(materialHz,2)] * (Ex[IDX3D_FIELDS(x,y+1,z)] - Ex[IDX3D_FIELDS(x,y,z)]); } -""") + """)} -update_electric_dispersive_A = Template(""" +update_electric_dispersive_A = { + 'args_cuda': Template(""" + __global__ void update_electric_dispersive_A(int NX, + int NY, + int NZ, + int MAXPOLES, + const $COMPLEX* __restrict__ updatecoeffsdispersive, + $COMPLEX *Tx, + $COMPLEX *Ty, + $COMPLEX *Tz, + const unsigned int* __restrict__ ID, + $REAL *Ex, + $REAL *Ey, + $REAL *Ez, + const $REAL* __restrict__ Hx, + const $REAL* __restrict__ Hy, + const $REAL* __restrict__ Hz) + """), + 'args_opencl': Template(""" + int NX, + int NY, + int NZ, + int MAXPOLES, + __global const $COMPLEX* restrict updatecoeffsdispersive, + __global $COMPLEX *Tx, + __global $COMPLEX *Ty, + __global $COMPLEX *Tz, + __global const unsigned int* restrict ID, + __global $REAL *Ex, + __global $REAL *Ey, + __global $REAL *Ez, + __global const $REAL* restrict Hx, + __global const $REAL* restrict Hy, + __global const $REAL* restrict Hz + """), + 'func': Template(""" // Electric field updates - dispersive materials - part A of updates to electric // field values when dispersive materials // (with multiple poles) are present. @@ -130,6 +219,8 @@ update_electric_dispersive_A = Template(""" int y_T = ((i % ($NX_T * $NY_T * $NZ_T)) % ($NY_T * $NZ_T)) / $NZ_T; int z_T = ((i % ($NX_T * $NY_T * $NZ_T)) % ($NY_T * $NZ_T)) % $NZ_T; + $CUDA_IDX + // Ex component if ((NY != 1 || NZ != 1) && x >= 0 && x < NX && y > 0 && y < NY && z > 0 && z < NZ) { int materialEx = ID[IDX4D_ID(0,x_ID,y_ID,z_ID)]; @@ -174,9 +265,38 @@ update_electric_dispersive_A = Template(""" updatecoeffsE[IDX2D_MAT(materialEz,2)] * (Hx[IDX3D_FIELDS(x,y,z)] - Hx[IDX3D_FIELDS(x,y-1,z)]) - updatecoeffsE[IDX2D_MAT(materialEz,4)] * phi; } -""") + """)} -update_electric_dispersive_B = Template(""" +update_electric_dispersive_B = { + 'args_cuda': Template(""" + __global__ void update_electric_dispersive_B(int NX, + int NY, + int NZ, + int MAXPOLES, + const $COMPLEX* __restrict__ updatecoeffsdispersive, + $COMPLEX *Tx, + $COMPLEX *Ty, + $COMPLEX *Tz, + const unsigned int* __restrict__ ID, + const $REAL* __restrict__ Ex, + const $REAL* __restrict__ Ey, + const $REAL* __restrict__ Ez) + """), + 'args_opencl': Template(""" + int NX, + int NY, + int NZ, + int MAXPOLES, + __global const $COMPLEX* restrict updatecoeffsdispersive, + __global $COMPLEX *Tx, + __global $COMPLEX *Ty, + __global $COMPLEX *Tz, + __global const unsigned int* restrict ID, + __global const $REAL* restrict Ex, + __global const $REAL* restrict Ey, + __global const $REAL* restrict Ez + """), + 'func': Template(""" // Electric field updates - dispersive materials - part B of updates to electric // field values when dispersive materials // (with multiple poles) are present. @@ -204,6 +324,8 @@ update_electric_dispersive_B = Template(""" int y_T = ((i % ($NX_T * $NY_T * $NZ_T)) % ($NY_T * $NZ_T)) / $NZ_T; int z_T = ((i % ($NX_T * $NY_T * $NZ_T)) % ($NY_T * $NZ_T)) % $NZ_T; + $CUDA_IDX + // Ex component if ((NY != 1 || NZ != 1) && x >= 0 && x < NX && y > 0 && y < NY && z > 0 && z < NZ) { int materialEx = ID[IDX4D_ID(0,x_ID,y_ID,z_ID)]; @@ -230,4 +352,4 @@ update_electric_dispersive_B = Template(""" updatecoeffsdispersive[IDX2D_MATDISP(materialEz,2+(pole*3))] * Ez[IDX3D_FIELDS(x,y,z)]; } } -""") \ No newline at end of file + """)} \ No newline at end of file diff --git a/gprMax/cuda_opencl_el/knl_pml_updates_electric_HORIPML.py b/gprMax/cuda_opencl_el/knl_pml_updates_electric_HORIPML.py index 3a6f50c7..8d291ed1 100644 --- a/gprMax/cuda_opencl_el/knl_pml_updates_electric_HORIPML.py +++ b/gprMax/cuda_opencl_el/knl_pml_updates_electric_HORIPML.py @@ -18,91 +18,166 @@ from string import Template -x_args = Template("int xs, " - "int xf, " - "int ys, " - "int yf, " - "int zs, " - "int zf, " - "int NX_PHI1, " - "int NY_PHI1, " - "int NZ_PHI1, " - "int NX_PHI2, " - "int NY_PHI2, " - "int NZ_PHI2, " - "int NY_R, " - "__global const unsigned int* restrict ID, " - "__global const $REAL* restrict Ex, " - "__global $REAL *Ey, " - "__global $REAL *Ez, " - "__global const $REAL* restrict Hx, " - "__global const $REAL* restrict Hy, " - "__global const $REAL* restrict Hz, " - "__global $REAL *PHI1, " - "__global $REAL *PHI2, " - "__global const $REAL* restrict RA, " - "__global const $REAL* restrict RB, " - "__global const $REAL* restrict RE, " - "__global const $REAL* restrict RF, " - "$REAL d") +x_args = {'cuda': Template(""" + __global__ void $FUNC(int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + const unsigned int* __restrict__ ID, + const $REAL* __restrict__ Ex, $REAL *Ey, + $REAL *Ez, + const $REAL* __restrict__ Hx, const $REAL* __restrict__ Hy, const $REAL* __restrict__ Hz, $REAL *PHI1, + $REAL *PHI2, + const $REAL* __restrict__ RA, const $REAL* __restrict__ RB, const $REAL* __restrict__ RE, const $REAL* __restrict__ RF, $REAL d) + """), + 'opencl': Template(""" + int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + __global const unsigned int* restrict ID, + __global const $REAL* restrict Ex, + __global $REAL *Ey, + __global $REAL *Ez, + __global const $REAL* restrict Hx, + __global const $REAL* restrict Hy, + __global const $REAL* restrict Hz, + __global $REAL *PHI1, + __global $REAL *PHI2, + __global const $REAL* restrict RA, + __global const $REAL* restrict RB, + __global const $REAL* restrict RE, + __global const $REAL* restrict RF, + $REAL d + """) + } -y_args = Template("int xs, " - "int xf, " - "int ys, " - "int yf, " - "int zs, " - "int zf, " - "int NX_PHI1, " - "int NY_PHI1, " - "int NZ_PHI1, " - "int NX_PHI2, " - "int NY_PHI2, " - "int NZ_PHI2, " - "int NY_R, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Ex, " - "__global const $REAL* restrict Ey, " - "__global $REAL *Ez, " - "__global const $REAL* restrict Hx, " - "__global const $REAL* restrict Hy, " - "__global const $REAL* restrict Hz, " - "__global $REAL *PHI1, " - "__global $REAL *PHI2, " - "__global const $REAL* restrict RA, " - "__global const $REAL* restrict RB, " - "__global const $REAL* restrict RE, " - "__global const $REAL* restrict RF, " - "$REAL d") +y_args = {'cuda': Template(""" + __global__ void $FUNC(int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + const unsigned int* __restrict__ ID, + $REAL *Ex, + const $REAL* __restrict__ Ey, + $REAL *Ez, + const $REAL* __restrict__ Hx, const $REAL* __restrict__ Hy, const $REAL* __restrict__ Hz, $REAL *PHI1, + $REAL *PHI2, + const $REAL* __restrict__ RA, const $REAL* __restrict__ RB, const $REAL* __restrict__ RE, const $REAL* __restrict__ RF, $REAL d) + """), + 'opencl': Template(""" + int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + __global const unsigned int* restrict ID, + __global $REAL *Ex, + __global const $REAL* restrict Ey, + __global $REAL *Ez, + __global const $REAL* restrict Hx, + __global const $REAL* restrict Hy, + __global const $REAL* restrict Hz, + __global $REAL *PHI1, + __global $REAL *PHI2, + __global const $REAL* restrict RA, + __global const $REAL* restrict RB, + __global const $REAL* restrict RE, + __global const $REAL* restrict RF, + $REAL d + """) + } -z_args = Template("int xs, " - "int xf, " - "int ys, " - "int yf, " - "int zs, " - "int zf, " - "int NX_PHI1, " - "int NY_PHI1, " - "int NZ_PHI1, " - "int NX_PHI2, " - "int NY_PHI2, " - "int NZ_PHI2, " - "int NY_R, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Ex, " - "__global $REAL *Ey, " - "__global const $REAL* restrict Ez, " - "__global const $REAL* restrict Hx, " - "__global const $REAL* restrict Hy, " - "__global const $REAL* restrict Hz, " - "__global $REAL *PHI1, " - "__global $REAL *PHI2, " - "__global const $REAL* restrict RA, " - "__global const $REAL* restrict RB, " - "__global const $REAL* restrict RE, " - "__global const $REAL* restrict RF, " - "$REAL d") +z_args = {'cuda': Template(""" + __global__ void $FUNC(int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + const unsigned int* __restrict__ ID, + $REAL *Ex, + $REAL *Ey, + const $REAL* __restrict__ Ez, + const $REAL* __restrict__ Hx, const $REAL* __restrict__ Hy, const $REAL* __restrict__ Hz, $REAL *PHI1, + $REAL *PHI2, + const $REAL* __restrict__ RA, const $REAL* __restrict__ RB, const $REAL* __restrict__ RE, const $REAL* __restrict__ RF, $REAL d) + """), + 'opencl': Template(""" + int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + __global const unsigned int* restrict ID, + __global $REAL *Ex, + __global $REAL *Ey, + __global const $REAL* restrict Ez, + __global const $REAL* restrict Hx, + __global const $REAL* restrict Hy, + __global const $REAL* restrict Hz, + __global $REAL *PHI1, + __global $REAL *PHI2, + __global const $REAL* restrict RA, + __global const $REAL* restrict RB, + __global const $REAL* restrict RE, + __global const $REAL* restrict RF, + $REAL d + """) + } -order1_xminus = {'args': x_args, +order1_xminus = {'args_cuda': x_args['cuda'], + 'args_opencl': x_args['opencl'], 'func': Template(""" // This function updates the Ey and Ez field components for the xminus slab. // @@ -113,6 +188,8 @@ order1_xminus = {'args': x_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -173,7 +250,8 @@ order1_xminus = {'args': x_args, } """)} -order2_xminus = {'args': x_args, +order2_xminus = {'args_cuda': x_args['cuda'], + 'args_opencl': x_args['opencl'], 'func': Template(""" // This function updates the Ey and Ez field components for the xminus slab. // @@ -184,6 +262,8 @@ order2_xminus = {'args': x_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -260,7 +340,8 @@ order2_xminus = {'args': x_args, } """)} -order1_xplus = {'args': x_args, +order1_xplus = {'args_cuda': x_args['cuda'], + 'args_opencl': x_args['opencl'], 'func': Template(""" // This function updates the Ey and Ez field components for the xplus slab. // @@ -271,6 +352,8 @@ order1_xplus = {'args': x_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -331,7 +414,8 @@ order1_xplus = {'args': x_args, } """)} -order2_xplus = {'args': x_args, +order2_xplus = {'args_cuda': x_args['cuda'], + 'args_opencl': x_args['opencl'], 'func': Template(""" // This function updates the Ey and Ez field components for the xplus slab. // @@ -342,6 +426,8 @@ order2_xplus = {'args': x_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -417,7 +503,8 @@ order2_xplus = {'args': x_args, } """)} -order1_yminus = {'args': y_args, +order1_yminus = {'args_cuda': y_args['cuda'], + 'args_opencl': y_args['opencl'], 'func': Template(""" // This function updates the Ex and Ez field components for the yminus slab. // @@ -428,6 +515,8 @@ order1_yminus = {'args': y_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -488,7 +577,8 @@ order1_yminus = {'args': y_args, } """)} -order2_yminus = {'args': y_args, +order2_yminus = {'args_cuda': y_args['cuda'], + 'args_opencl': y_args['opencl'], 'func': Template(""" // This function updates the Ex and Ez field components for the yminus slab. // @@ -499,6 +589,8 @@ order2_yminus = {'args': y_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -575,7 +667,8 @@ order2_yminus = {'args': y_args, } """)} -order1_yplus = {'args': y_args, +order1_yplus = {'args_cuda': y_args['cuda'], + 'args_opencl': y_args['opencl'], 'func': Template(""" // This function updates the Ex and Ez field components for the yplus slab. // @@ -586,6 +679,8 @@ order1_yplus = {'args': y_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -646,7 +741,8 @@ order1_yplus = {'args': y_args, } """)} -order2_yplus = {'args': y_args, +order2_yplus = {'args_cuda': y_args['cuda'], + 'args_opencl': y_args['opencl'], 'func': Template(""" // This function updates the Ex and Ez field components for the yplus slab. // @@ -657,6 +753,8 @@ order2_yplus = {'args': y_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -733,7 +831,8 @@ order2_yplus = {'args': y_args, } """)} -order1_zminus = {'args': z_args, +order1_zminus = {'args_cuda': z_args['cuda'], + 'args_opencl': z_args['opencl'], 'func': Template(""" // This function updates the Ex and Ey field components for the zminus slab. // @@ -744,6 +843,8 @@ order1_zminus = {'args': z_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -804,7 +905,8 @@ order1_zminus = {'args': z_args, } """)} -order2_zminus = {'args': z_args, +order2_zminus = {'args_cuda': z_args['cuda'], + 'args_opencl': z_args['opencl'], 'func': Template(""" // This function updates the Ex and Ey field components for the zminus slab. // @@ -815,6 +917,8 @@ order2_zminus = {'args': z_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -891,7 +995,8 @@ order2_zminus = {'args': z_args, } """)} -order1_zplus = {'args': z_args, +order1_zplus = {'args_cuda': z_args['cuda'], + 'args_opencl': z_args['opencl'], 'func': Template(""" // This function updates the Ex and Ey field components for the zplus slab. // @@ -902,6 +1007,8 @@ order1_zplus = {'args': z_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -962,7 +1069,8 @@ order1_zplus = {'args': z_args, } """)} -order2_zplus = {'args': z_args, +order2_zplus = {'args_cuda': z_args['cuda'], + 'args_opencl': z_args['opencl'], 'func': Template(""" // This function updates the Ex and Ey field components for the zplus slab. // @@ -973,6 +1081,8 @@ order2_zplus = {'args': z_args, // Phi, RA, RB, RE, RF: Access to PML electric coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -1047,5 +1157,4 @@ order2_zplus = {'args': z_args, (RA0 * dHx + RB0 * PHI2[IDX4D_PHI2(0,i2,j2,k2)]); PHI2[IDX4D_PHI2(0,i2,j2,k2)] = RE0 * PHI2[IDX4D_PHI2(0,i2,j2,k2)] - RF0 * dHx; } -""")} - +""")} \ No newline at end of file diff --git a/gprMax/cuda_opencl_el/knl_pml_updates_magnetic_HORIPML.py b/gprMax/cuda_opencl_el/knl_pml_updates_magnetic_HORIPML.py index 1201659d..63c3bdde 100644 --- a/gprMax/cuda_opencl_el/knl_pml_updates_magnetic_HORIPML.py +++ b/gprMax/cuda_opencl_el/knl_pml_updates_magnetic_HORIPML.py @@ -18,91 +18,168 @@ from string import Template -x_args = Template("int xs, " - "int xf, " - "int ys, " - "int yf, " - "int zs, " - "int zf, " - "int NX_PHI1, " - "int NY_PHI1, " - "int NZ_PHI1, " - "int NX_PHI2, " - "int NY_PHI2, " - "int NZ_PHI2, " - "int NY_R, " - "__global const unsigned int* restrict ID, " - "__global const $REAL* restrict Ex, " - "__global const $REAL* restrict Ey, " - "__global const $REAL* restrict Ez, " - "__global const $REAL* restrict Hx, " - "__global $REAL *Hy, " - "__global $REAL *Hz, " - "__global $REAL *PHI1, " - "__global $REAL *PHI2, " - "__global const $REAL* restrict RA, " - "__global const $REAL* restrict RB, " - "__global const $REAL* restrict RE, " - "__global const $REAL* restrict RF, " - "$REAL d") +x_args = {'cuda': Template(""" + __global__ void $FUNC(int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + const unsigned int* __restrict__ ID, + const $REAL* __restrict__ Ex, const $REAL* __restrict__ Ey, const $REAL* __restrict__ Ez, const $REAL* __restrict__ Hx, $REAL *Hy, + $REAL *Hz, + $REAL *PHI1, + $REAL *PHI2, + const $REAL* __restrict__ RA, const $REAL* __restrict__ RB, const $REAL* __restrict__ RE, const $REAL* __restrict__ RF, $REAL d) + """), + 'opencl': Template(""" + int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + __global const unsigned int* restrict ID, + __global const $REAL* restrict Ex, + __global const $REAL* restrict Ey, + __global const $REAL* restrict Ez, + __global const $REAL* restrict Hx, + __global $REAL *Hy, + __global $REAL *Hz, + __global $REAL *PHI1, + __global $REAL *PHI2, + __global const $REAL* restrict RA, + __global const $REAL* restrict RB, + __global const $REAL* restrict RE, + __global const $REAL* restrict RF, + $REAL d + """) + } -y_args = Template("int xs, " - "int xf, " - "int ys, " - "int yf, " - "int zs, " - "int zf, " - "int NX_PHI1, " - "int NY_PHI1, " - "int NZ_PHI1, " - "int NX_PHI2, " - "int NY_PHI2, " - "int NZ_PHI2, " - "int NY_R, " - "__global const unsigned int* restrict ID, " - "__global const $REAL* restrict Ex, " - "__global const $REAL* restrict Ey, " - "__global const $REAL* restrict Ez, " - "__global $REAL *Hx, " - "__global const $REAL* restrict Hy, " - "__global $REAL *Hz, " - "__global $REAL *PHI1, " - "__global $REAL *PHI2, " - "__global const $REAL* restrict RA, " - "__global const $REAL* restrict RB, " - "__global const $REAL* restrict RE, " - "__global const $REAL* restrict RF, " - "$REAL d") +y_args = {'cuda': Template(""" + __global__ void $FUNC(int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + const unsigned int* __restrict__ ID, + const $REAL* __restrict__ Ex, const $REAL* __restrict__ Ey, const $REAL* __restrict__ Ez, + $REAL *Hx, + const $REAL* __restrict__ Hy, + $REAL *Hz, + $REAL *PHI1, + $REAL *PHI2, + const $REAL* __restrict__ RA, const $REAL* __restrict__ RB, const $REAL* __restrict__ RE, const $REAL* __restrict__ RF, $REAL d) + """), + 'opencl': Template(""" + int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + __global const unsigned int* restrict ID, + __global const $REAL* restrict Ex, + __global const $REAL* restrict Ey, + __global const $REAL* restrict Ez, + __global $REAL *Hx, + __global const $REAL* restrict Hy, + __global $REAL *Hz, + __global $REAL *PHI1, + __global $REAL *PHI2, + __global const $REAL* restrict RA, + __global const $REAL* restrict RB, + __global const $REAL* restrict RE, + __global const $REAL* restrict RF, + $REAL d + """) + } -z_args = Template("int xs, " - "int xf, " - "int ys, " - "int yf, " - "int zs, " - "int zf, " - "int NX_PHI1, " - "int NY_PHI1, " - "int NZ_PHI1, " - "int NX_PHI2, " - "int NY_PHI2, " - "int NZ_PHI2, " - "int NY_R, " - "__global const unsigned int* restrict ID, " - "__global const $REAL* restrict Ex, " - "__global const $REAL* restrict Ey, " - "__global const $REAL* restrict Ez, " - "__global $REAL *Hx, " - "__global $REAL *Hy, " - "__global const $REAL* restrict Hz, " - "__global $REAL *PHI1, " - "__global $REAL *PHI2, " - "__global const $REAL* restrict RA, " - "__global const $REAL* restrict RB, " - "__global const $REAL* restrict RE, " - "__global const $REAL* restrict RF, " - "$REAL d") +z_args = {'cuda': Template(""" + __global__ void $FUNC(int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + const unsigned int* __restrict__ ID, + const $REAL* __restrict__ Ex, const $REAL* __restrict__ Ey, const $REAL* __restrict__ Ez, + $REAL *Hx, + $REAL *Hy, + const $REAL* __restrict__ Hz, + $REAL *PHI1, + $REAL *PHI2, + const $REAL* __restrict__ RA, const $REAL* __restrict__ RB, const $REAL* __restrict__ RE, const $REAL* __restrict__ RF, $REAL d) + """), + 'opencl': Template(""" + int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int NX_PHI1, + int NY_PHI1, + int NZ_PHI1, + int NX_PHI2, + int NY_PHI2, + int NZ_PHI2, + int NY_R, + __global const unsigned int* restrict ID, + __global const $REAL* restrict Ex, + __global const $REAL* restrict Ey, + __global const $REAL* restrict Ez, + __global $REAL *Hx, + __global $REAL *Hy, + __global const $REAL* restrict Hz, + __global $REAL *PHI1, + __global $REAL *PHI2, + __global const $REAL* restrict RA, + __global const $REAL* restrict RB, + __global const $REAL* restrict RE, + __global const $REAL* restrict RF, + $REAL d + """) + } -order1_xminus = {'args': x_args, +order1_xminus = {'args_cuda': x_args['cuda'], + 'args_opencl': x_args['opencl'], 'func': Template(""" // This function updates the Hy and Hz field components for the xminus slab. // @@ -113,6 +190,8 @@ order1_xminus = {'args': x_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -173,7 +252,8 @@ order1_xminus = {'args': x_args, } """)} -order2_xminus = {'args': x_args, +order2_xminus = {'args_cuda': x_args['cuda'], + 'args_opencl': x_args['opencl'], 'func': Template(""" // This function updates the Hy and Hz field components for the xminus slab. // @@ -184,6 +264,8 @@ order2_xminus = {'args': x_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -260,7 +342,8 @@ order2_xminus = {'args': x_args, } """)} -order1_xplus = {'args': x_args, +order1_xplus = {'args_cuda': x_args['cuda'], + 'args_opencl': x_args['opencl'], 'func': Template(""" // This function updates the Hy and Hz field components for the xplus slab. // @@ -271,6 +354,8 @@ order1_xplus = {'args': x_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -331,7 +416,8 @@ order1_xplus = {'args': x_args, } """)} -order2_xplus = {'args': x_args, +order2_xplus = {'args_cuda': x_args['cuda'], + 'args_opencl': x_args['opencl'], 'func': Template(""" // This function updates the Hy and Hz field components for the xplus slab. // @@ -342,6 +428,8 @@ order2_xplus = {'args': x_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -418,7 +506,8 @@ order2_xplus = {'args': x_args, } """)} -order1_yminus = {'args': y_args, +order1_yminus = {'args_cuda': y_args['cuda'], + 'args_opencl': y_args['opencl'], 'func': Template(""" // This function updates the Hx and Hz field components for the yminus slab. // @@ -429,6 +518,8 @@ order1_yminus = {'args': y_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -489,7 +580,8 @@ order1_yminus = {'args': y_args, } """)} -order2_yminus = {'args': y_args, +order2_yminus = {'args_cuda': y_args['cuda'], + 'args_opencl': y_args['opencl'], 'func': Template(""" // This function updates the Hx and Hz field components for the yminus slab. // @@ -500,6 +592,8 @@ order2_yminus = {'args': y_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -575,7 +669,8 @@ order2_yminus = {'args': y_args, } """)} -order1_yplus = {'args': y_args, +order1_yplus = {'args_cuda': y_args['cuda'], + 'args_opencl': y_args['opencl'], 'func': Template(""" // This function updates the Hx and Hz field components for the yplus slab. // @@ -586,6 +681,8 @@ order1_yplus = {'args': y_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -646,7 +743,8 @@ order1_yplus = {'args': y_args, } """)} -order2_yplus = {'args': y_args, +order2_yplus = {'args_cuda': y_args['cuda'], + 'args_opencl': y_args['opencl'], 'func': Template(""" // This function updates the Hx and Hz field components for the yplus slab. // @@ -657,6 +755,8 @@ order2_yplus = {'args': y_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -731,7 +831,8 @@ order2_yplus = {'args': y_args, } """)} -order1_zminus = {'args': z_args, +order1_zminus = {'args_cuda': z_args['cuda'], + 'args_opencl': z_args['opencl'], 'func': Template(""" // This function updates the Hx and Hy field components for the zminus slab. // @@ -742,6 +843,8 @@ order1_zminus = {'args': z_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -802,7 +905,8 @@ order1_zminus = {'args': z_args, } """)} -order2_zminus = {'args': z_args, +order2_zminus = {'args_cuda': z_args['cuda'], + 'args_opencl': z_args['opencl'], 'func': Template(""" // This function updates the Hx and Hy field components for the zminus slab. // @@ -813,6 +917,8 @@ order2_zminus = {'args': z_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -889,7 +995,8 @@ order2_zminus = {'args': z_args, } """)} -order1_zplus = {'args': z_args, +order1_zplus = {'args_cuda': z_args['cuda'], + 'args_opencl': z_args['opencl'], 'func': Template(""" // This function updates the Hx and Hy field components for the zplus slab. // @@ -900,6 +1007,8 @@ order1_zplus = {'args': z_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); @@ -960,7 +1069,8 @@ order1_zplus = {'args': z_args, } """)} -order2_zplus = {'args': z_args, +order2_zplus = {'args_cuda': z_args['cuda'], + 'args_opencl': z_args['opencl'], 'func': Template(""" // This function updates the Hx and Hy field components for the zplus slab. // @@ -971,6 +1081,8 @@ order2_zplus = {'args': z_args, // Phi, RA, RB, RE, RF: Access to PML magnetic coefficient arrays // d: Spatial discretisation, e.g. dx, dy or dz + $CUDA_IDX + // Convert the linear index to subscripts for PML PHI1 (4D) arrays int p1 = i / (NX_PHI1 * NY_PHI1 * NZ_PHI1); int i1 = (i % (NX_PHI1 * NY_PHI1 * NZ_PHI1)) / (NY_PHI1 * NZ_PHI1); diff --git a/gprMax/cuda_opencl_el/knl_snapshots.py b/gprMax/cuda_opencl_el/knl_snapshots.py index 2da13396..322f1859 100644 --- a/gprMax/cuda_opencl_el/knl_snapshots.py +++ b/gprMax/cuda_opencl_el/knl_snapshots.py @@ -19,7 +19,55 @@ from string import Template -store_snapshot = Template(""" +store_snapshot = {'args_cuda': Template(""" + __global__ void store_snapshot(int p, + int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int dx, + int dy, + int dz, + const $REAL* __restrict__ Ex, + const $REAL* __restrict__ Ey, + const $REAL* __restrict__ Ez, + const $REAL* __restrict__ Hx, + const $REAL* __restrict__ Hy, + const $REAL* __restrict__ Hz, + $REAL *snapEx, + $REAL *snapEy, + $REAL *snapEz, + $REAL *snapHx, + $REAL *snapHy, + $REAL *snapHz) + """), + 'args_opencl': Template(""" + int p, + int xs, + int xf, + int ys, + int yf, + int zs, + int zf, + int dx, + int dy, + int dz, + __global const $REAL* restrict Ex, + __global const $REAL* restrict Ey, + __global const $REAL* restrict Ez, + __global const $REAL* restrict Hx, + __global const $REAL* restrict Hy, + __global const $REAL* restrict Hz, + __global $REAL *snapEx, + __global $REAL *snapEy, + __global $REAL *snapEz, + __global $REAL *snapHx, + __global $REAL *snapHy, + __global $REAL *snapHz + """), + 'func': Template(""" // Stores field values for a snapshot. // // Args: @@ -29,6 +77,7 @@ store_snapshot = Template(""" // E, H: Access to field component arrays. // snapEx, snapEy, snapEz, snapHx, snapHy, snapHz: Access to arrays to store snapshots. + $CUDA_IDX // Convert the linear index to subscripts for 4D SNAPS array int x = (i % ($NX_SNAPS * $NY_SNAPS * $NZ_SNAPS)) / ($NY_SNAPS * $NZ_SNAPS); @@ -69,4 +118,5 @@ store_snapshot = Template(""" snapHz[IDX4D_SNAPS(p,x,y,z)] = (Hz[IDX3D_FIELDS(xx,yy,zz)] + Hz[IDX3D_FIELDS(xx,yy,zz+1)]) / 2; } -""") \ No newline at end of file +""") +} \ No newline at end of file diff --git a/gprMax/cuda_opencl_el/knl_source_updates.py b/gprMax/cuda_opencl_el/knl_source_updates.py index a7ca3372..71932f04 100644 --- a/gprMax/cuda_opencl_el/knl_source_updates.py +++ b/gprMax/cuda_opencl_el/knl_source_updates.py @@ -18,7 +18,35 @@ from string import Template -update_hertzian_dipole = Template(""" +update_hertzian_dipole = {'args_cuda': Template(""" + __global__ void update_hertzian_dipole(int NHERTZDIPOLE, + int iteration, + $REAL dx, + $REAL dy, + $REAL dz, + const int* __restrict__ srcinfo1, + const $REAL* __restrict__ srcinfo2, + const $REAL* __restrict__ srcwaveforms, + const unsigned int* __restrict__ ID, + $REAL *Ex, + $REAL *Ey, + $REAL *Ez) + """), + 'args_opencl': Template(""" + int NHERTZDIPOLE, + int iteration, + $REAL dx, + $REAL dy, + $REAL dz, + __global const int* restrict srcinfo1, + __global const $REAL* restrict srcinfo2, + __global const $REAL* restrict srcwaveforms, + __global const unsigned int* restrict ID, + __global $REAL *Ex, + __global $REAL *Ey, + __global $REAL *Ez + """), + 'func': Template(""" // Updates electric field values for Hertzian dipole sources. // // Args: @@ -30,6 +58,7 @@ update_hertzian_dipole = Template(""" // srcwaveforms: Source waveform values. // ID, E: Access to ID and field component arrays. + $CUDA_IDX if (i < NHERTZDIPOLE) { @@ -64,8 +93,37 @@ update_hertzian_dipole = Template(""" } } """) +} -update_magnetic_dipole = Template(""" +update_magnetic_dipole = {'args_cuda': Template(""" + __global__ void update_magnetic_dipole(int NMAGDIPOLE, + int iteration, + $REAL dx, + $REAL dy, + $REAL dz, + const int* __restrict__ srcinfo1, + const $REAL* __restrict__ srcinfo2, + const $REAL* __restrict__ srcwaveforms, + const unsigned int* __restrict__ ID, + $REAL *Hx, + $REAL *Hy, + $REAL *Hz) + """), + 'args_opencl': Template(""" + int NMAGDIPOLE, + int iteration, + $REAL dx, + $REAL dy, + $REAL dz, + __global const int* restrict rcinfo1, + __global const $REAL* restrict rcinfo2, + __global const $REAL* restrict rcwaveforms, + __global const unsigned int* estrict ID, + __global $REAL *Hx, + __global $REAL *Hy, + __global $REAL *Hz + """), + 'func': Template(""" // Updates electric field values for Hertzian dipole sources. // // Args: @@ -77,6 +135,7 @@ update_magnetic_dipole = Template(""" // srcwaveforms: Source waveform values. // ID, H: Access to ID and field component arrays. + $CUDA_IDX if (i < NMAGDIPOLE) { @@ -109,8 +168,37 @@ update_magnetic_dipole = Template(""" } } """) +} -update_voltage_source = Template(""" +update_voltage_source = {'args_cuda': Template(""" + __global__ void update_voltage_source(int NVOLTSRC, + int iteration, + $REAL dx, + $REAL dy, + $REAL dz, + const int* __restrict__ srcinfo1, + const $REAL* __restrict__ srcinfo2, + const $REAL* __restrict__ srcwaveforms, + const unsigned int* __restrict__ ID, + $REAL *Ex, + $REAL *Ey, + $REAL *Ez) + """), + 'args_opencl': Template(""" + int NVOLTSRC, + int iteration, + $REAL dx, + $REAL dy, + $REAL dz, + __global const int* restrict rcinfo1, + __global const $REAL* restrict rcinfo2, + __global const $REAL* restrict rcwaveforms, + __global const unsigned int* estrict ID, + __global $REAL *Ex, + __global $REAL *Ey, + __global $REAL *Ez + """), + 'func': Template(""" // Updates electric field values for voltage sources. // // Args: @@ -122,6 +210,7 @@ update_voltage_source = Template(""" // srcwaveforms: Source waveform values. // ID, E: Access to ID and field component arrays. + $CUDA_IDX if (i < NVOLTSRC) { @@ -170,4 +259,5 @@ update_voltage_source = Template(""" } } } -""") \ No newline at end of file +""") +} \ No newline at end of file diff --git a/gprMax/cuda_opencl_el/knl_store_outputs.py b/gprMax/cuda_opencl_el/knl_store_outputs.py index 496ea3d9..96aa599b 100644 --- a/gprMax/cuda_opencl_el/knl_store_outputs.py +++ b/gprMax/cuda_opencl_el/knl_store_outputs.py @@ -19,7 +19,31 @@ from string import Template -store_outputs = Template(""" +store_outputs = {'args_cuda': Template(""" + __global__ void store_outputs(int NRX, + int iteration, + const int* __restrict__ rxcoords, + $REAL *rxs, + const $REAL* __restrict__ Ex, + const $REAL* __restrict__ Ey, + const $REAL* __restrict__ Ez, + const $REAL* __restrict__ Hx, + const $REAL* __restrict__ Hy, + const $REAL* __restrict__ Hz) + """), + 'args_opencl': Template(""" + int NRX, + int iteration, + __global const int* restrict rxcoords, + __global $REAL *rxs, + __global const $REAL* restrict Ex, + __global const $REAL* restrict Ey, + __global const $REAL* restrict Ez, + __global const $REAL* restrict Hx, + __global const $REAL* restrict Hy, + __global const $REAL* restrict Hz + """), + 'func': Template(""" // Stores field component values for every receiver in the model. // // Args: @@ -27,6 +51,8 @@ store_outputs = Template(""" // rxs: array to store field components for receivers - rows // are field components; columns are iterations; pages are receiver. + $CUDA_IDX + if (i < NRX) { int x, y, z; x = rxcoords[IDX2D_RXCOORDS(i,0)]; @@ -39,4 +65,5 @@ store_outputs = Template(""" rxs[IDX3D_RXS(4,iteration,i)] = Hy[IDX3D_FIELDS(x,y,z)]; rxs[IDX3D_RXS(5,iteration,i)] = Hz[IDX3D_FIELDS(x,y,z)]; } -""") \ No newline at end of file +""") +} \ No newline at end of file diff --git a/gprMax/updates.py b/gprMax/updates.py index 412ae5a8..3fcae201 100644 --- a/gprMax/updates.py +++ b/gprMax/updates.py @@ -791,45 +791,23 @@ class OpenCLUpdates: NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max) + subs = {'CUDA_IDX': '', + 'NX_FIELDS': self.grid.nx + 1, + 'NY_FIELDS': self.grid.ny + 1, + 'NZ_FIELDS': self.grid.nz + 1, + 'NX_ID': self.grid.ID.shape[1], + 'NY_ID': self.grid.ID.shape[2], + 'NZ_ID': self.grid.ID.shape[3]} + self.update_electric_dev = self.elwise(self.ctx, - Template("int NX, " - "int NY, " - "int NZ, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Ex, " - "__global $REAL *Ey, " - "__global $REAL *Ez, " - "__global const $REAL * restrict Hx, " - "__global const $REAL * restrict Hy, " - "__global const $REAL * restrict Hz").substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_fields_updates.update_electric.substitute({ - 'NX_FIELDS': self.grid.nx + 1, - 'NY_FIELDS': self.grid.ny + 1, - 'NZ_FIELDS': self.grid.nz + 1, - 'NX_ID': self.grid.ID.shape[1], - 'NY_ID': self.grid.ID.shape[2], - 'NZ_ID': self.grid.ID.shape[3]}), + knl_fields_updates.update_electric['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_fields_updates.update_electric['func'].substitute(subs), 'update_electric', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) self.update_magnetic_dev = self.elwise(self.ctx, - Template("int NX, " - "int NY, " - "int NZ, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Hx, " - "__global $REAL *Hy, " - "__global $REAL *Hz, " - "__global const $REAL * restrict Ex, " - "__global const $REAL * restrict Ey, " - "__global const $REAL * restrict Ez").substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_fields_updates.update_magnetic.substitute({ - 'NX_FIELDS': self.grid.nx + 1, - 'NY_FIELDS': self.grid.ny + 1, - 'NZ_FIELDS': self.grid.nz + 1, - 'NX_ID': self.grid.ID.shape[1], - 'NY_ID': self.grid.ID.shape[2], - 'NZ_ID': self.grid.ID.shape[3]}), + knl_fields_updates.update_magnetic['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_fields_updates.update_magnetic['func'].substitute(subs), 'update_magnetic', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) @@ -838,64 +816,26 @@ class OpenCLUpdates: # If there are any dispersive materials (updates are split into two # parts as they require present and updated electric field values). if config.get_model_config().materials['maxpoles'] > 0: + subs = {'CUDA_IDX': '', + 'REAL': config.sim_config.dtypes['C_float_or_double'], + 'REALFUNC': config.get_model_config().materials['crealfunc'], + 'NX_FIELDS': self.grid.nx + 1, + 'NY_FIELDS': self.grid.ny + 1, + 'NZ_FIELDS': self.grid.nz + 1, + 'NX_ID': self.grid.ID.shape[1], + 'NY_ID': self.grid.ID.shape[2], + 'NZ_ID': self.grid.ID.shape[3], + 'NX_T': NX_T, + 'NY_T': NY_T, + 'NZ_T': NZ_T} self.dispersive_update_a = self.elwise(self.ctx, - Template("int NX, " - "int NY, " - "int NZ, " - "int MAXPOLES, " - "__global const $COMPLEX* restrict updatecoeffsdispersive, " - "__global $COMPLEX *Tx, " - "__global $COMPLEX *Ty, " - "__global $COMPLEX *Tz, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Ex, " - "__global $REAL *Ey, " - "__global $REAL *Ez, " - "__global const $REAL* restrict Hx, " - "__global const $REAL* restrict Hy, " - "__global const $REAL* restrict Hz").substitute({'REAL': config.sim_config.dtypes['C_float_or_double'], 'COMPLEX': config.get_model_config().materials['dispersiveCdtype']}), - knl_fields_updates.update_electric_dispersive_A.substitute({ - 'REAL': config.sim_config.dtypes['C_float_or_double'], - 'REALFUNC': config.get_model_config().materials['crealfunc'], - 'NX_FIELDS': self.grid.nx + 1, - 'NY_FIELDS': self.grid.ny + 1, - 'NZ_FIELDS': self.grid.nz + 1, - 'NX_ID': self.grid.ID.shape[1], - 'NY_ID': self.grid.ID.shape[2], - 'NZ_ID': self.grid.ID.shape[3], - 'NX_T': NX_T, - 'NY_T': NY_T, - 'NZ_T': NZ_T}), + knl_fields_updates.update_electric_dispersive_A['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double'], 'COMPLEX': config.get_model_config().materials['dispersiveCdtype']}), + knl_fields_updates.update_electric_dispersive_A['func'].substitute(subs), 'update_electric_dispersive_A', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) self.dispersive_update_b = self.elwise(self.ctx, - Template("int NX, " - "int NY, " - "int NZ, " - "int MAXPOLES, " - "__global const $COMPLEX* restrict updatecoeffsdispersive, " - "__global $COMPLEX *Tx, " - "__global $COMPLEX *Ty, " - "__global $COMPLEX *Tz, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Ex, " - "__global $REAL *Ey, " - "__global $REAL *Ez, " - "__global const $REAL* restrict Hx, " - "__global const $REAL* restrict Hy, " - "__global const $REAL* restrict Hz").substitute({'REAL': config.sim_config.dtypes['C_float_or_double'] ,'COMPLEX': config.get_model_config().materials['dispersiveCdtype']}), - knl_fields_updates.update_electric_dispersive_B.substitute({ - 'REAL': config.sim_config.dtypes['C_float_or_double'], - 'REALFUNC': config.get_model_config().materials['crealfunc'], - 'NX_FIELDS': self.grid.nx + 1, - 'NY_FIELDS': self.grid.ny + 1, - 'NZ_FIELDS': self.grid.nz + 1, - 'NX_ID': self.grid.ID.shape[1], - 'NY_ID': self.grid.ID.shape[2], - 'NZ_ID': self.grid.ID.shape[3], - 'NX_T': NX_T, - 'NY_T': NY_T, - 'NZ_T': NZ_T}), + knl_fields_updates.update_electric_dispersive_B['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double'] ,'COMPLEX': config.get_model_config().materials['dispersiveCdtype']}), + knl_fields_updates.update_electric_dispersive_B['func'].substitute(subs), 'update_electric_dispersive_B', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) @@ -911,6 +851,15 @@ class OpenCLUpdates: knl_pml_updates_electric = import_module('gprMax.cuda_opencl_el.knl_pml_updates_electric_' + self.grid.pmlformulation) knl_pml_updates_magnetic = import_module('gprMax.cuda_opencl_el.knl_pml_updates_magnetic_' + self.grid.pmlformulation) + subs = {'CUDA_IDX': '', + 'REAL': config.sim_config.dtypes['C_float_or_double'], + 'NX_FIELDS': self.grid.nx + 1, + 'NY_FIELDS': self.grid.ny + 1, + 'NZ_FIELDS': self.grid.nz + 1, + 'NX_ID': self.grid.ID.shape[1], + 'NY_ID': self.grid.ID.shape[2], + 'NZ_ID': self.grid.ID.shape[3]} + # Set workgroup size, initialise arrays on compute device, and get # kernel functions for pml in self.grid.pmls: @@ -921,29 +870,15 @@ class OpenCLUpdates: knl_magnetic_name = getattr(knl_pml_updates_magnetic, knl_name) pml.update_electric_dev = self.elwise(self.ctx, - knl_electric_name['args'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_electric_name['func'].substitute({ - 'REAL': config.sim_config.dtypes['C_float_or_double'], - 'NX_FIELDS': self.grid.nx + 1, - 'NY_FIELDS': self.grid.ny + 1, - 'NZ_FIELDS': self.grid.nz + 1, - 'NX_ID': self.grid.ID.shape[1], - 'NY_ID': self.grid.ID.shape[2], - 'NZ_ID': self.grid.ID.shape[3]}), + knl_electric_name['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_electric_name['func'].substitute(subs), 'pml_updates_electric_' + knl_name, preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) pml.update_magnetic_dev = self.elwise(self.ctx, - knl_magnetic_name['args'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_magnetic_name['func'].substitute({ - 'REAL': config.sim_config.dtypes['C_float_or_double'], - 'NX_FIELDS': self.grid.nx + 1, - 'NY_FIELDS': self.grid.ny + 1, - 'NZ_FIELDS': self.grid.nz + 1, - 'NX_ID': self.grid.ID.shape[1], - 'NY_ID': self.grid.ID.shape[2], - 'NZ_ID': self.grid.ID.shape[3]}), + knl_magnetic_name['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_magnetic_name['func'].substitute(subs), 'pml_updates_magnetic_' + knl_name, preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) @@ -954,17 +889,8 @@ class OpenCLUpdates: """ self.rxcoords_dev, self.rxs_dev = htod_rx_arrays(self.grid, self.queue) self.store_outputs_dev = self.elwise(self.ctx, - Template("int NRX, " - "int iteration, " - "__global const int* restrict rxcoords, " - "__global $REAL *rxs, " - "__global const $REAL* restrict Ex, " - "__global const $REAL* restrict Ey, " - "__global const $REAL* restrict Ez, " - "__global const $REAL* restrict Hx, " - "__global const $REAL* restrict Hy, " - "__global const $REAL* restrict Hz").substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_store_outputs.store_outputs.substitute(), + knl_store_outputs.store_outputs['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_store_outputs.store_outputs['func'].substitute({'CUDA_IDX': ''}), 'store_outputs', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) @@ -975,55 +901,22 @@ class OpenCLUpdates: if self.grid.hertziandipoles: self.srcinfo1_hertzian_dev, self.srcinfo2_hertzian_dev, self.srcwaves_hertzian_dev = htod_src_arrays(self.grid.hertziandipoles, self.grid, self.queue) self.update_hertzian_dipole_dev = self.elwise(self.ctx, - Template("int NHERTZDIPOLE, " - "int iteration, " - "$REAL dx, " - "$REAL dy, " - "$REAL dz, " - "__global const int* restrict srcinfo1, " - "__global const $REAL* restrict srcinfo2, " - "__global const $REAL* restrict srcwaveforms, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Ex, " - "__global $REAL *Ey, " - "__global $REAL *Ez").substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_source_updates.update_hertzian_dipole.substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_source_updates.update_hertzian_dipole['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_source_updates.update_hertzian_dipole['func'].substitute({'CUDA_IDX': '', 'REAL': config.sim_config.dtypes['C_float_or_double']}), 'update_hertzian_dipole', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) if self.grid.magneticdipoles: self.srcinfo1_magnetic_dev, self.srcinfo2_magnetic_dev, self.srcwaves_magnetic_dev = htod_src_arrays(self.grid.magneticdipoles, self.grid, self.queue) self.update_magnetic_dipole_dev = self.elwise(self.ctx, - Template("int NMAGDIPOLE, " - "int iteration, " - "$REAL dx, " - "$REAL dy, " - "$REAL dz, " - "__global const int* restrict srcinfo1, " - "__global const $REAL* restrict srcinfo2, " - "__global const $REAL* restrict srcwaveforms, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Hx, " - "__global $REAL *Hy, " - "__global $REAL *Hz").substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_source_updates.update_magnetic_dipole.substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_source_updates.update_magnetic_dipole['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_source_updates.update_magnetic_dipole['func'].substitute({'CUDA_IDX': '', 'REAL': config.sim_config.dtypes['C_float_or_double']}), 'update_magnetic_dipole', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) if self.grid.voltagesources: self.srcinfo1_voltage_dev, self.srcinfo2_voltage_dev,self.srcwaves_voltage_dev = htod_src_arrays(self.grid.voltagesources, self.grid, self.queue) self.update_voltage_source_dev = self.elwise(self.ctx, - Template("int NVOLTSRC, " - "int iteration, " - "$REAL dx, " - "$REAL dy, " - "$REAL dz, " - "__global const int* restrict srcinfo1, " - "__global const $REAL* restrict srcinfo2, " - "__global const $REAL* restrict srcwaveforms, " - "__global const unsigned int* restrict ID, " - "__global $REAL *Ex, " - "__global $REAL *Ey, " - "__global $REAL *Ez").substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_source_updates.update_voltage_source.substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), 'update_voltage_source', preamble=self.knl_common, + knl_source_updates.update_voltage_source['args_opencl'].substitute({'CUDA_IDX': '', 'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_source_updates.update_voltage_source['func'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), 'update_voltage_source', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) def _set_snapshot_knl(self): @@ -1032,31 +925,11 @@ class OpenCLUpdates: """ self.snapEx_dev, self.snapEy_dev, self.snapEz_dev, self.snapHx_dev, self.snapHy_dev, self.snapHz_dev = htod_snapshot_array(self.grid, self.queue) self.store_snapshot_dev = self.elwise(self.ctx, - Template("int p, " - "int xs, " - "int xf, " - "int ys, " - "int yf, " - "int zs, " - "int zf, " - "int dx, " - "int dy, " - "int dz, " - "__global const $REAL* restrict Ex, " - "__global const $REAL* restrict Ey, " - "__global const $REAL* restrict Ez, " - "__global const $REAL* restrict Hx, " - "__global const $REAL* restrict Hy, " - "__global const $REAL* restrict Hz, " - "__global $REAL *snapEx, " - "__global $REAL *snapEy, " - "__global $REAL *snapEz, " - "__global $REAL *snapHx, " - "__global $REAL *snapHy, " - "__global $REAL *snapHz").substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), - knl_snapshots.store_snapshot.substitute({'NX_SNAPS': Snapshot.nx_max, - 'NY_SNAPS': Snapshot.ny_max, - 'NZ_SNAPS': Snapshot.nz_max}), + knl_snapshots.store_snapshot['args_opencl'].substitute({'REAL': config.sim_config.dtypes['C_float_or_double']}), + knl_snapshots.store_snapshot['func'].substitute( {'CUDA_IDX': '', + 'NX_SNAPS': Snapshot.nx_max, + 'NY_SNAPS': Snapshot.ny_max, + 'NZ_SNAPS': Snapshot.nz_max}), 'store_snapshot', preamble=self.knl_common, options=config.sim_config.devices['compiler_opts']) @@ -1260,10 +1133,7 @@ class OpenCLUpdates: self.grid.ID_dev, self.grid.Ex_dev, self.grid.Ey_dev, - self.grid.Ez_dev, - self.grid.Hx_dev, - self.grid.Hy_dev, - self.grid.Hz_dev) + self.grid.Ez_dev) event.wait() self.compute_time += (event.profile.end - event.profile.start)*1e-9