21#include "../Concepts.hpp"
22#include "../GenericID.hpp"
23#include "../Helpers.hpp"
24#include "../Index.hpp"
25#include "../tdv/SinglePassStrategies.hpp"
56template <concepts::TransitionFunction TransFunc,
57 tdv::single_pass::KernelArgument<TransFunc> TDVKernelArgument,
59 typename in_pipe,
typename out_pipe>
60 requires(n_processing_elements % TransFunc::n_subiterations == 0)
63 using Cell =
typename TransFunc::Cell;
64 using TDV =
typename TransFunc::TimeDependentValue;
66 using TDVLocalState =
typename TDVKernelArgument::LocalState;
68 static constexpr uindex_t stencil_diameter = StencilImpl::diameter;
70 static constexpr uindex_t halo_radius = TransFunc::stencil_radius * n_processing_elements;
72 static constexpr uindex_t max_input_tile_width = 2 * halo_radius + output_tile_width;
74 static constexpr uindex_t input_tile_height = 2 * halo_radius + output_tile_height;
76 static constexpr uindex_t n_input_cells = max_input_tile_width * input_tile_height;
83 static constexpr unsigned long bits_1d =
84 std::bit_width(std::max(max_input_tile_width, input_tile_height));
85 using index_1d_t = ac_int<bits_1d + 1, true>;
86 using uindex_1d_t = ac_int<bits_1d, false>;
88 static constexpr unsigned long bits_2d = 2 * bits_1d;
89 using index_2d_t = ac_int<bits_2d + 1, true>;
90 using uindex_2d_t = ac_int<bits_2d, false>;
94 static constexpr unsigned long bits_pes =
95 std::max<int>(2, std::bit_width(n_processing_elements));
96 using index_pes_t = ac_int<bits_pes + 1, true>;
97 using uindex_pes_t = ac_int<bits_pes, false>;
128 uindex_t grid_height, Cell halo_value,
129 TDVKernelArgument tdv_kernel_argument)
130 : trans_func(trans_func), i_iteration(i_iteration), target_i_iteration(target_i_iteration),
131 grid_c_offset(grid_c_offset), grid_r_offset(grid_r_offset), grid_width(grid_width),
132 grid_height(grid_height), halo_value(halo_value),
133 tdv_kernel_argument(tdv_kernel_argument) {
134 assert(grid_c_offset % output_tile_width == 0);
135 assert(grid_r_offset % output_tile_height == 0);
142 TDVLocalState tdv_local_state(tdv_kernel_argument);
144 uindex_1d_t input_tile_c = 0;
145 uindex_1d_t input_tile_r = 0;
154 [[intel::fpga_memory,
155 intel::numbanks(2 * std::bit_ceil(n_processing_elements))]]
Padded<Cell>
156 cache[2][input_tile_height][std::bit_ceil(n_processing_elements)][stencil_diameter - 1];
157 [[intel::fpga_register]] Cell stencil_buffer[n_processing_elements][stencil_diameter]
160 uindex_1d_t output_tile_section_width =
161 std::min(output_tile_width, grid_width - grid_c_offset);
162 uindex_1d_t output_tile_section_height =
163 std::min(output_tile_height, grid_height - grid_r_offset);
164 uindex_1d_t input_tile_section_width = output_tile_section_width + 2 * halo_radius;
165 uindex_1d_t input_tile_section_height = output_tile_section_height + 2 * halo_radius;
166 uindex_2d_t n_iterations = input_tile_section_width * input_tile_section_height;
168 for (uindex_2d_t i = 0; i < n_iterations; i++) {
169 [[intel::fpga_register]] Cell carry = in_pipe::read();
172 for (uindex_pes_t i_processing_element = 0;
173 i_processing_element < uindex_pes_t(n_processing_elements);
174 i_processing_element++) {
181 for (uindex_stencil_t r = 0; r < uindex_stencil_t(stencil_diameter - 1); r++) {
183 for (uindex_stencil_t c = 0; c < uindex_stencil_t(stencil_diameter); c++) {
184 stencil_buffer[i_processing_element][c][r] =
185 stencil_buffer[i_processing_element][c][r + 1];
189 index_1d_t rel_input_grid_c =
190 index_1d_t(input_tile_c) -
191 index_1d_t((stencil_diameter - 1) +
192 (n_processing_elements + i_processing_element - 2) *
193 TransFunc::stencil_radius);
194 index_t input_grid_c = grid_c_offset + rel_input_grid_c.to_int64();
195 index_1d_t rel_input_grid_r =
196 index_1d_t(input_tile_r) -
197 index_1d_t((stencil_diameter - 1) +
198 (n_processing_elements + i_processing_element - 2) *
199 TransFunc::stencil_radius);
200 index_t input_grid_r = grid_r_offset + rel_input_grid_r.to_int64();
205 for (uindex_stencil_t cache_c = 0; cache_c < uindex_stencil_t(stencil_diameter);
208 if (cache_c == uindex_stencil_t(stencil_diameter - 1)) {
209 bool is_halo = (grid_c_offset == 0 && rel_input_grid_c < 0);
210 is_halo |= (grid_r_offset == 0 && rel_input_grid_r < 0);
211 is_halo |= input_grid_c >= grid_width || input_grid_r >= grid_height;
213 new_value = is_halo ? halo_value : carry;
216 cache[input_tile_c[0]][input_tile_r][i_processing_element][cache_c]
220 stencil_buffer[i_processing_element][cache_c][stencil_diameter - 1] = new_value;
222 cache[(~input_tile_c)[0]][input_tile_r][i_processing_element][cache_c - 1]
228 (i_iteration + i_processing_element / TransFunc::n_subiterations).to_uint();
230 (i_processing_element % TransFunc::n_subiterations).to_uint();
231 index_t output_grid_c = input_grid_c -
index_t(TransFunc::stencil_radius);
232 index_t output_grid_r = input_grid_r -
index_t(TransFunc::stencil_radius);
233 TDV tdv = tdv_local_state.get_time_dependent_value(i_processing_element /
234 TransFunc::n_subiterations);
236 pe_iteration, pe_subiteration, tdv,
237 stencil_buffer[i_processing_element]);
239 if (pe_iteration < target_i_iteration) {
242 carry = stencil_buffer[i_processing_element][TransFunc::stencil_radius]
243 [TransFunc::stencil_radius];
247 bool is_valid_output =
248 input_tile_c >= uindex_1d_t((stencil_diameter - 1) * n_processing_elements);
250 input_tile_r >= uindex_1d_t((stencil_diameter - 1) * n_processing_elements);
252 if (is_valid_output) {
253 out_pipe::write(carry);
256 if (input_tile_r == input_tile_section_height - 1) {
266 TransFunc trans_func;
274 TDVKernelArgument tdv_kernel_argument;
310 using Cell = F::Cell;
311 using TDVGlobalState =
typename TDVStrategy::template GlobalState<F, n_processing_elements>;
312 using TDVKernelArgument =
typename TDVGlobalState::KernelArgument;
389 : params(params), n_processed_cells(0), work_events(), walltime(0.0) {}
409 using in_pipe = sycl::pipe<class tiling_in_pipe, Cell>;
410 using out_pipe = sycl::pipe<class tiling_out_pipe, Cell>;
411 using ExecutionKernelImpl =
StencilUpdateKernel<F, TDVKernelArgument, n_processing_elements,
412 tile_width, tile_height, in_pipe, out_pipe>;
418 sycl::queue input_kernel_queue =
419 sycl::queue(params.
device, {sycl::property::queue::in_order{}});
420 sycl::queue output_kernel_queue =
421 sycl::queue(params.device, {sycl::property::queue::in_order{}});
422 sycl::queue working_queue =
423 sycl::queue(params.device, {cl::sycl::property::queue::enable_profiling{},
424 sycl::property::queue::in_order{}});
426 GridImpl swap_grid_a = source_grid.make_similar();
427 GridImpl swap_grid_b = source_grid.make_similar();
429 uindex_t iters_per_pass = n_processing_elements / F::n_subiterations;
430 GridImpl *pass_source = &source_grid;
431 GridImpl *pass_target = &swap_grid_b;
433 UID tile_range = source_grid.get_tile_range();
434 uindex_t grid_width = source_grid.get_grid_width();
435 uindex_t grid_height = source_grid.get_grid_height();
437 F trans_func = params.transition_function;
438 TDVGlobalState tdv_global_state(trans_func, params.iteration_offset, params.n_iterations);
440 auto walltime_start = std::chrono::high_resolution_clock::now();
442 uindex_t target_n_iterations = params.iteration_offset + params.n_iterations;
443 for (
uindex_t i = params.iteration_offset; i < target_n_iterations; i += iters_per_pass) {
444 uindex_t iters_in_this_pass = std::min(iters_per_pass, target_n_iterations - i);
446 for (
uindex_t i_tile_c = 0; i_tile_c < tile_range.c; i_tile_c++) {
447 for (
uindex_t i_tile_r = 0; i_tile_r < tile_range.r; i_tile_r++) {
448 pass_source->template submit_read<in_pipe>(input_kernel_queue, i_tile_c,
449 i_tile_r, params.halo_value);
451 auto work_event = working_queue.submit([&](sycl::handler &cgh) {
452 TDVKernelArgument tdv_kernel_argument(tdv_global_state, cgh, i,
454 uindex_t c_offset = i_tile_c * tile_width;
455 uindex_t r_offset = i_tile_r * tile_height;
457 ExecutionKernelImpl exec_kernel(trans_func, i, target_n_iterations,
458 c_offset, r_offset, grid_width, grid_height,
459 params.halo_value, tdv_kernel_argument);
461 cgh.single_task<ExecutionKernelImpl>(exec_kernel);
463 if (params.profiling) {
464 work_events.push_back(work_event);
467 pass_target->template submit_write<out_pipe>(output_kernel_queue, i_tile_c,
472 if (i == params.iteration_offset) {
473 pass_source = &swap_grid_b;
474 pass_target = &swap_grid_a;
476 std::swap(pass_source, pass_target);
480 if (params.blocking) {
481 output_kernel_queue.wait();
484 auto walltime_end = std::chrono::high_resolution_clock::now();
485 std::chrono::duration<double> walltime = walltime_end - walltime_start;
486 this->walltime += walltime.count();
489 params.n_iterations * source_grid.get_grid_width() * source_grid.get_grid_height();
510 double kernel_runtime = 0.0;
511 for (sycl::event work_event : work_events) {
512 const double timesteps_per_second = 1000000000.0;
515 .get_profiling_info<cl::sycl::info::event_profiling::command_start>()) /
516 timesteps_per_second;
519 work_event.get_profiling_info<cl::sycl::info::event_profiling::command_end>()) /
520 timesteps_per_second;
521 kernel_runtime += end - start;
523 return kernel_runtime;
538 std::vector<sycl::event> work_events;
T value
Definition Helpers.hpp:0
A generic, two-dimensional index.
Definition GenericID.hpp:32
The stencil buffer.
Definition Stencil.hpp:48
ac_int< bits_stencil, false > uindex_stencil_t
An unsigned index type for column and row indices in this stencil.
Definition Stencil.hpp:60
ac_int< bits_stencil, true > index_stencil_t
A signed index type for column and row indices in this stencil.
Definition Stencil.hpp:57
A grid class for the tiling architecture.
Definition Grid.hpp:74
A kernel that executes a stencil transition function on a tile.
Definition StencilUpdate.hpp:61
void operator()() const
Execute the configured operations.
Definition StencilUpdate.hpp:141
StencilUpdateKernel(TransFunc trans_func, uindex_t i_iteration, uindex_t target_i_iteration, uindex_t grid_c_offset, uindex_t grid_r_offset, uindex_t grid_width, uindex_t grid_height, Cell halo_value, TDVKernelArgument tdv_kernel_argument)
Create and configure the execution kernel.
Definition StencilUpdate.hpp:126
A grid updater that applies an iterative stencil code to a grid.
Definition StencilUpdate.hpp:308
Grid< Cell, tile_width, tile_height, halo_radius > GridImpl
A shorthand for the used and supported grid type.
Definition StencilUpdate.hpp:323
double get_walltime() const
Return the accumulated runtime of the updater, measured from the host side.
Definition StencilUpdate.hpp:532
StencilUpdate(Params params)
Create a new stencil updater object.
Definition StencilUpdate.hpp:388
uindex_t get_n_processed_cells() const
Return the accumulated total number of cells processed by this updater.
Definition StencilUpdate.hpp:501
static constexpr uindex_t halo_radius
The radius of an input's tile halo.
Definition StencilUpdate.hpp:318
Params & get_params()
Return a reference to the parameters.
Definition StencilUpdate.hpp:396
GridImpl operator()(GridImpl &source_grid)
Compute a new grid based on the source grid, using the configured transition function.
Definition StencilUpdate.hpp:408
double get_kernel_runtime() const
Return the accumulated total runtime of the execution kernel.
Definition StencilUpdate.hpp:509
A technical definition of a stencil transition function.
Definition Concepts.hpp:62
Requirements for a TDV implementation strategy.
Definition SinglePassStrategies.hpp:99
Definition AccessorSubscript.hpp:24
GenericID< uindex_t > UID
An unsigned, two-dimensional index.
Definition GenericID.hpp:80
BOOST_PP_CAT(BOOST_PP_CAT(uint, STENCIL_INDEX_WIDTH), _t) uindex_t
An unsigned integer of configurable width.
Definition Index.hpp:42
GenericID< index_t > ID
A signed, two-dimensional index.
Definition GenericID.hpp:75
BOOST_PP_CAT(BOOST_PP_CAT(int, STENCIL_INDEX_WIDTH), _t) index_t
A signed integer of configurable width.
Definition Index.hpp:56
A container with padding to the next power of two.
Definition Helpers.hpp:45
A TDV implementation strategy that inlines the TDV function into the transition function.
Definition SinglePassStrategies.hpp:114
Parameters for the stencil updater.
Definition StencilUpdate.hpp:328
bool blocking
Should the stencil updater block until completion, or return immediately after all kernels have been ...
Definition StencilUpdate.hpp:373
uindex_t iteration_offset
The iteration index offset.
Definition StencilUpdate.hpp:347
uindex_t n_iterations
The number of iterations to compute.
Definition StencilUpdate.hpp:352
Cell halo_value
The cell value to present for cells outside of the grid.
Definition StencilUpdate.hpp:339
F transition_function
An instance of the transition function type.
Definition StencilUpdate.hpp:334
bool profiling
Enable profiling.
Definition StencilUpdate.hpp:382
sycl::device device
The device to use for computations.
Definition StencilUpdate.hpp:362