21#include "../Concepts.hpp"
22#include "../GenericID.hpp"
23#include "../Helpers.hpp"
24#include "../Index.hpp"
25#include "../tdv/SinglePassStrategies.hpp"
61template <concepts::TransitionFunction TransFunc,
62 tdv::single_pass::KernelArgument<TransFunc> TDVKernelArgument,
64 typename in_pipe,
typename out_pipe>
65 requires(n_processing_elements % TransFunc::n_subiterations == 0)
68 using Cell =
typename TransFunc::Cell;
69 using TDV =
typename TransFunc::TimeDependentValue;
70 using TDVLocalState =
typename TDVKernelArgument::LocalState;
76 static constexpr uindex_t stencil_diameter = StencilImpl::diameter;
78 static constexpr uindex_t iters_per_pass = n_processing_elements / TransFunc::n_subiterations;
81 return n_processing_elements * TransFunc::stencil_radius * (grid_height + 1);
85 return grid_width * grid_height + calc_pipeline_latency(grid_height);
93 static constexpr unsigned long bits_1d =
94 std::bit_width(std::max(max_grid_width, max_grid_height));
95 using index_1d_t = ac_int<bits_1d + 1, true>;
96 using uindex_1d_t = ac_int<bits_1d, false>;
98 static constexpr unsigned long bits_2d = 2 * bits_1d;
99 using index_2d_t = ac_int<bits_2d + 1, true>;
100 using uindex_2d_t = ac_int<bits_2d, false>;
102 static constexpr unsigned long bits_pes =
103 std::max<int>(2, std::bit_width(n_processing_elements));
104 using index_pes_t = ac_int<bits_pes + 1, true>;
105 using uindex_pes_t = ac_int<bits_pes, false>;
107 static constexpr unsigned long bits_n_iterations =
108 std::bit_width(calc_n_iterations(max_grid_width, max_grid_height));
109 using index_n_iterations_t = ac_int<bits_n_iterations + 1, true>;
110 using uindex_n_iterations_t = ac_int<bits_n_iterations, false>;
135 TDVKernelArgument tdv_kernel_argument)
136 : trans_func(trans_func), i_iteration(i_iteration), target_i_iteration(target_i_iteration),
137 grid_width(grid_width), grid_height(grid_height), halo_value(halo_value),
138 tdv_kernel_argument(tdv_kernel_argument) {
139 assert(grid_height <= max_grid_height);
146 [[intel::fpga_register]] index_1d_t c[n_processing_elements];
147 [[intel::fpga_register]] index_1d_t r[n_processing_elements];
148 TDVLocalState tdv_local_state(tdv_kernel_argument);
151 index_1d_t prev_c = 0;
152 index_1d_t prev_r = 0;
154 for (uindex_pes_t i = 0; i < uindex_pes_t(n_processing_elements); i++) {
155 c[i] = prev_c - TransFunc::stencil_radius;
156 r[i] = prev_r - TransFunc::stencil_radius;
157 if (r[i] < index_pes_t(0)) {
172 [[intel::fpga_memory,
173 intel::numbanks(2 * std::bit_ceil(n_processing_elements))]]
Padded<Cell>
174 cache[2][max_grid_height][std::bit_ceil(n_processing_elements)][stencil_diameter - 1];
175 [[intel::fpga_register]] Cell stencil_buffer[n_processing_elements][stencil_diameter]
178 uindex_n_iterations_t n_iterations = calc_n_iterations(grid_width, grid_height);
179 for (uindex_n_iterations_t i = 0; i < n_iterations; i++) {
181 if (i < uindex_n_iterations_t(grid_width * grid_height)) {
182 carry = in_pipe::read();
188 for (uindex_pes_t i_processing_element = 0;
189 i_processing_element < uindex_pes_t(n_processing_elements);
190 i_processing_element++) {
192 for (uindex_stencil_t r = 0; r < uindex_stencil_t(stencil_diameter - 1); r++) {
194 for (uindex_stencil_t c = 0; c < uindex_stencil_t(stencil_diameter); c++) {
195 stencil_buffer[i_processing_element][c][r] =
196 stencil_buffer[i_processing_element][c][r + 1];
203 for (uindex_stencil_t cache_c = 0; cache_c < uindex_stencil_t(stencil_diameter);
206 if (cache_c == uindex_stencil_t(stencil_diameter - 1)) {
209 new_value = cache[c[i_processing_element][0]][r[i_processing_element]]
210 [i_processing_element][cache_c]
214 stencil_buffer[i_processing_element][cache_c][stencil_diameter - 1] = new_value;
216 cache[(~c[i_processing_element])[0]][r[i_processing_element]]
217 [i_processing_element][cache_c - 1]
223 (i_iteration + i_processing_element / TransFunc::n_subiterations).to_uint();
225 (i_processing_element % TransFunc::n_subiterations).to_uint();
227 if (pe_iteration < target_i_iteration) {
228 TDV tdv = tdv_local_state.get_time_dependent_value(
229 (i_processing_element / TransFunc::n_subiterations).to_uint());
231 UID(grid_width, grid_height), pe_iteration, pe_subiteration,
234 bool h_halo_mask[stencil_diameter];
235 bool v_halo_mask[stencil_diameter];
237 for (uindex_stencil_t mask_i = 0; mask_i < uindex_stencil_t(stencil_diameter);
242 if (mask_i < uindex_stencil_t(TransFunc::stencil_radius)) {
243 h_halo_mask[mask_i] = c[i_processing_element] >=
244 index_1d_t(TransFunc::stencil_radius - mask_i);
245 v_halo_mask[mask_i] = r[i_processing_element] >=
246 index_1d_t(TransFunc::stencil_radius - mask_i);
247 }
else if (mask_i == uindex_stencil_t(TransFunc::stencil_radius)) {
248 h_halo_mask[mask_i] =
true;
249 v_halo_mask[mask_i] =
true;
251 h_halo_mask[mask_i] =
252 c[i_processing_element] <
253 grid_width + index_1d_t(TransFunc::stencil_radius - mask_i);
254 v_halo_mask[mask_i] =
255 r[i_processing_element] <
256 grid_height + index_1d_t(TransFunc::stencil_radius - mask_i);
261 for (uindex_stencil_t cell_c = 0; cell_c < uindex_stencil_t(stencil_diameter);
264 for (uindex_stencil_t cell_r = 0;
265 cell_r < uindex_stencil_t(stencil_diameter); cell_r++) {
266 if (h_halo_mask[cell_c] && v_halo_mask[cell_r]) {
267 stencil[StencilUID(cell_c, cell_r)] =
268 stencil_buffer[i_processing_element][cell_c][cell_r];
270 stencil[StencilUID(cell_c, cell_r)] = halo_value;
277 carry = stencil_buffer[i_processing_element][TransFunc::stencil_radius]
278 [TransFunc::stencil_radius];
281 r[i_processing_element] += 1;
282 if (r[i_processing_element] == index_1d_t(grid_height)) {
283 r[i_processing_element] = 0;
284 c[i_processing_element] += 1;
288 if (i >= uindex_n_iterations_t(calc_pipeline_latency(grid_height))) {
289 out_pipe::write(carry);
295 TransFunc trans_func;
301 TDVKernelArgument tdv_kernel_argument;
341 using Cell = F::Cell;
342 using TDV =
typename F::TimeDependentValue;
412 : params(params), n_processed_cells(0), work_events(), walltime(0.0) {}
433 throw std::range_error(
"The grid is too tall for the stencil update kernel.");
436 throw std::range_error(
"The grid is too wide for the stencil update kernel.");
438 using in_pipe = sycl::pipe<class monotile_in_pipe, Cell>;
439 using out_pipe = sycl::pipe<class monotile_out_pipe, Cell>;
441 constexpr uindex_t iters_per_pass = n_processing_elements / F::n_subiterations;
443 using TDVGlobalState = TDVStrategy::template GlobalState<F, iters_per_pass>;
444 using TDVKernelArgument =
typename TDVGlobalState::KernelArgument;
445 using ExecutionKernelImpl =
447 max_grid_height, in_pipe, out_pipe>;
449 sycl::queue input_kernel_queue =
450 sycl::queue(params.
device, {sycl::property::queue::in_order{}});
451 sycl::queue output_kernel_queue =
452 sycl::queue(params.device, {sycl::property::queue::in_order{}});
454 sycl::queue update_kernel_queue =
455 sycl::queue(params.device, {cl::sycl::property::queue::enable_profiling{},
456 sycl::property::queue::in_order{}});
458 GridImpl swap_grid_a = source_grid.make_similar();
459 GridImpl swap_grid_b = source_grid.make_similar();
461 GridImpl *pass_source = &source_grid;
462 GridImpl *pass_target = &swap_grid_b;
464 F trans_func = params.transition_function;
465 TDVGlobalState tdv_global_state(trans_func, params.iteration_offset, params.n_iterations);
467 auto walltime_start = std::chrono::high_resolution_clock::now();
469 uindex_t target_n_iterations = params.iteration_offset + params.n_iterations;
470 for (
uindex_t i = params.iteration_offset; i < target_n_iterations; i += iters_per_pass) {
471 pass_source->template submit_read<in_pipe>(input_kernel_queue);
472 uindex_t iters_in_this_pass = std::min(iters_per_pass, target_n_iterations - i);
474 sycl::event work_event = update_kernel_queue.submit([&](sycl::handler &cgh) {
475 TDVKernelArgument tdv_kernel_argument(tdv_global_state, cgh, i, iters_in_this_pass);
476 ExecutionKernelImpl exec_kernel(
477 trans_func, i, target_n_iterations, source_grid.get_grid_width(),
478 source_grid.get_grid_height(), params.halo_value, tdv_kernel_argument);
479 cgh.single_task<ExecutionKernelImpl>(exec_kernel);
481 if (params.profiling) {
482 work_events.push_back(work_event);
485 pass_target->template submit_write<out_pipe>(output_kernel_queue);
487 if (i == params.iteration_offset) {
488 pass_source = &swap_grid_b;
489 pass_target = &swap_grid_a;
491 std::swap(pass_source, pass_target);
495 if (params.blocking) {
496 output_kernel_queue.wait();
499 auto walltime_end = std::chrono::high_resolution_clock::now();
500 std::chrono::duration<double> walltime = walltime_end - walltime_start;
501 this->walltime += walltime.count();
504 params.n_iterations * source_grid.get_grid_width() * source_grid.get_grid_height();
525 double kernel_runtime = 0.0;
526 for (sycl::event work_event : work_events) {
527 const double timesteps_per_second = 1000000000.0;
530 .get_profiling_info<cl::sycl::info::event_profiling::command_start>()) /
531 timesteps_per_second;
534 work_event.get_profiling_info<cl::sycl::info::event_profiling::command_end>()) /
535 timesteps_per_second;
536 kernel_runtime += end - start;
538 return kernel_runtime;
553 std::vector<sycl::event> work_events;
T value
Definition Helpers.hpp:0
A generic, two-dimensional index.
Definition GenericID.hpp:32
The stencil buffer.
Definition Stencil.hpp:48
ac_int< bits_stencil, false > uindex_stencil_t
An unsigned index type for column and row indices in this stencil.
Definition Stencil.hpp:60
ac_int< bits_stencil, true > index_stencil_t
A signed index type for column and row indices in this stencil.
Definition Stencil.hpp:57
A grid class for the monotile architecture.
Definition Grid.hpp:58
uindex_t get_grid_width() const
Return the width, or number of columns, of the grid.
Definition Grid.hpp:128
uindex_t get_grid_height() const
Return the height, or number of rows, of the grid.
Definition Grid.hpp:133
The execution kernel of the monotile architecture.
Definition StencilUpdate.hpp:66
void operator()() const
Execute the kernel.
Definition StencilUpdate.hpp:145
StencilUpdateKernel(TransFunc trans_func, uindex_t i_iteration, uindex_t target_i_iteration, uindex_t grid_width, uindex_t grid_height, Cell halo_value, TDVKernelArgument tdv_kernel_argument)
Create and configure the execution kernel.
Definition StencilUpdate.hpp:133
A grid updater that applies an iterative stencil code to a grid.
Definition StencilUpdate.hpp:339
double get_kernel_runtime() const
Return the accumulated total runtime of the execution kernel.
Definition StencilUpdate.hpp:524
double get_walltime() const
Return the accumulated runtime of the updater, measured from the host side.
Definition StencilUpdate.hpp:547
Params & get_params()
Return a reference to the parameters.
Definition StencilUpdate.hpp:419
StencilUpdate(Params params)
Create a new stencil updater object.
Definition StencilUpdate.hpp:411
GridImpl operator()(GridImpl &source_grid)
Compute a new grid based on the source grid, using the configured transition function.
Definition StencilUpdate.hpp:431
uindex_t get_n_processed_cells() const
Return the accumulated total number of cells processed by this updater.
Definition StencilUpdate.hpp:516
A technical definition of a stencil transition function.
Definition Concepts.hpp:62
Requirements for a TDV implementation strategy.
Definition SinglePassStrategies.hpp:99
Definition AccessorSubscript.hpp:24
GenericID< uindex_t > UID
An unsigned, two-dimensional index.
Definition GenericID.hpp:80
BOOST_PP_CAT(BOOST_PP_CAT(uint, STENCIL_INDEX_WIDTH), _t) uindex_t
An unsigned integer of configurable width.
Definition Index.hpp:42
GenericID< index_t > ID
A signed, two-dimensional index.
Definition GenericID.hpp:75
A container with padding to the next power of two.
Definition Helpers.hpp:45
Parameters for the stencil updater.
Definition StencilUpdate.hpp:351
uindex_t n_iterations
The number of iterations to compute.
Definition StencilUpdate.hpp:375
uindex_t iteration_offset
The iteration index offset.
Definition StencilUpdate.hpp:370
bool profiling
Enable profiling.
Definition StencilUpdate.hpp:405
sycl::device device
The device to use for computations.
Definition StencilUpdate.hpp:385
Cell halo_value
The cell value to present for cells outside of the grid.
Definition StencilUpdate.hpp:362
bool blocking
Should the stencil updater block until completion, or return immediately after all kernels have been ...
Definition StencilUpdate.hpp:396
F transition_function
An instance of the transition function type.
Definition StencilUpdate.hpp:357
A TDV implementation strategy that inlines the TDV function into the transition function.
Definition SinglePassStrategies.hpp:114