StencilStream v3.0.0
SYCL-based Stencil Simulation Framework Targeting FPGAs
Loading...
Searching...
No Matches
StencilUpdate.hpp
Go to the documentation of this file.
1/*
2 * Copyright © 2020-2024 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
3 * University
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
6 * associated documentation files (the “Software”), to deal in the Software without restriction,
7 * including without limitation the rights to use, copy, modify, merge, publish, distribute,
8 * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in all copies or
12 * substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
15 * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
16 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
17 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 */
20#pragma once
21#include "../Concepts.hpp"
22#include "../GenericID.hpp"
23#include "../Helpers.hpp"
24#include "../Index.hpp"
25#include "../tdv/SinglePassStrategies.hpp"
26#include "Grid.hpp"
27#include <chrono>
28#include <type_traits>
29
30namespace stencil {
31namespace monotile {
32
61template <concepts::TransitionFunction TransFunc,
62 tdv::single_pass::KernelArgument<TransFunc> TDVKernelArgument,
63 uindex_t n_processing_elements, uindex_t max_grid_width, uindex_t max_grid_height,
64 typename in_pipe, typename out_pipe>
65 requires(n_processing_elements % TransFunc::n_subiterations == 0)
67 private:
68 using Cell = typename TransFunc::Cell;
69 using TDV = typename TransFunc::TimeDependentValue;
70 using TDVLocalState = typename TDVKernelArgument::LocalState;
72
76 static constexpr uindex_t stencil_diameter = StencilImpl::diameter;
77
78 static constexpr uindex_t iters_per_pass = n_processing_elements / TransFunc::n_subiterations;
79
80 static constexpr uindex_t calc_pipeline_latency(uindex_t grid_height) {
81 return n_processing_elements * TransFunc::stencil_radius * (grid_height + 1);
82 }
83
84 static constexpr uindex_t calc_n_iterations(uindex_t grid_width, uindex_t grid_height) {
85 return grid_width * grid_height + calc_pipeline_latency(grid_height);
86 }
87
88 using index_stencil_t = typename StencilImpl::index_stencil_t;
89 using uindex_stencil_t = typename StencilImpl::uindex_stencil_t;
90 using StencilID = typename StencilImpl::StencilID;
91 using StencilUID = typename StencilImpl::StencilUID;
92
93 static constexpr unsigned long bits_1d =
94 std::bit_width(std::max(max_grid_width, max_grid_height));
95 using index_1d_t = ac_int<bits_1d + 1, true>;
96 using uindex_1d_t = ac_int<bits_1d, false>;
97
98 static constexpr unsigned long bits_2d = 2 * bits_1d;
99 using index_2d_t = ac_int<bits_2d + 1, true>;
100 using uindex_2d_t = ac_int<bits_2d, false>;
101
102 static constexpr unsigned long bits_pes =
103 std::max<int>(2, std::bit_width(n_processing_elements));
104 using index_pes_t = ac_int<bits_pes + 1, true>;
105 using uindex_pes_t = ac_int<bits_pes, false>;
106
107 static constexpr unsigned long bits_n_iterations =
108 std::bit_width(calc_n_iterations(max_grid_width, max_grid_height));
109 using index_n_iterations_t = ac_int<bits_n_iterations + 1, true>;
110 using uindex_n_iterations_t = ac_int<bits_n_iterations, false>;
111
112 public:
133 StencilUpdateKernel(TransFunc trans_func, uindex_t i_iteration, uindex_t target_i_iteration,
134 uindex_t grid_width, uindex_t grid_height, Cell halo_value,
135 TDVKernelArgument tdv_kernel_argument)
136 : trans_func(trans_func), i_iteration(i_iteration), target_i_iteration(target_i_iteration),
137 grid_width(grid_width), grid_height(grid_height), halo_value(halo_value),
138 tdv_kernel_argument(tdv_kernel_argument) {
139 assert(grid_height <= max_grid_height);
140 }
141
145 void operator()() const {
146 [[intel::fpga_register]] index_1d_t c[n_processing_elements];
147 [[intel::fpga_register]] index_1d_t r[n_processing_elements];
148 TDVLocalState tdv_local_state(tdv_kernel_argument);
149
150 // Initializing (output) column and row counters.
151 index_1d_t prev_c = 0;
152 index_1d_t prev_r = 0;
153#pragma unroll
154 for (uindex_pes_t i = 0; i < uindex_pes_t(n_processing_elements); i++) {
155 c[i] = prev_c - TransFunc::stencil_radius;
156 r[i] = prev_r - TransFunc::stencil_radius;
157 if (r[i] < index_pes_t(0)) {
158 r[i] += grid_height;
159 c[i] -= 1;
160 }
161 prev_c = c[i];
162 prev_r = r[i];
163 }
164
165 /*
166 * The intel::numbanks attribute requires a power of two as it's argument and if the
167 * number of processing elements isn't a power of two, it would produce an error. Therefore,
168 * we calculate the next power of two and use it to allocate the cache. The compiler is
169 * smart enough to see that these additional banks in the cache aren't used and therefore
170 * optimizes them away.
171 */
172 [[intel::fpga_memory,
173 intel::numbanks(2 * std::bit_ceil(n_processing_elements))]] Padded<Cell>
174 cache[2][max_grid_height][std::bit_ceil(n_processing_elements)][stencil_diameter - 1];
175 [[intel::fpga_register]] Cell stencil_buffer[n_processing_elements][stencil_diameter]
176 [stencil_diameter];
177
178 uindex_n_iterations_t n_iterations = calc_n_iterations(grid_width, grid_height);
179 for (uindex_n_iterations_t i = 0; i < n_iterations; i++) {
180 Cell carry;
181 if (i < uindex_n_iterations_t(grid_width * grid_height)) {
182 carry = in_pipe::read();
183 } else {
184 carry = halo_value;
185 }
186
187#pragma unroll
188 for (uindex_pes_t i_processing_element = 0;
189 i_processing_element < uindex_pes_t(n_processing_elements);
190 i_processing_element++) {
191#pragma unroll
192 for (uindex_stencil_t r = 0; r < uindex_stencil_t(stencil_diameter - 1); r++) {
193#pragma unroll
194 for (uindex_stencil_t c = 0; c < uindex_stencil_t(stencil_diameter); c++) {
195 stencil_buffer[i_processing_element][c][r] =
196 stencil_buffer[i_processing_element][c][r + 1];
197 }
198 }
199
200 // Update the stencil buffer and cache with previous cache contents and the new
201 // input cell.
202#pragma unroll
203 for (uindex_stencil_t cache_c = 0; cache_c < uindex_stencil_t(stencil_diameter);
204 cache_c++) {
205 Cell new_value;
206 if (cache_c == uindex_stencil_t(stencil_diameter - 1)) {
207 new_value = carry;
208 } else {
209 new_value = cache[c[i_processing_element][0]][r[i_processing_element]]
210 [i_processing_element][cache_c]
211 .value;
212 }
213
214 stencil_buffer[i_processing_element][cache_c][stencil_diameter - 1] = new_value;
215 if (cache_c > 0) {
216 cache[(~c[i_processing_element])[0]][r[i_processing_element]]
217 [i_processing_element][cache_c - 1]
218 .value = new_value;
219 }
220 }
221
222 uindex_t pe_iteration =
223 (i_iteration + i_processing_element / TransFunc::n_subiterations).to_uint();
224 uindex_t pe_subiteration =
225 (i_processing_element % TransFunc::n_subiterations).to_uint();
226
227 if (pe_iteration < target_i_iteration) {
228 TDV tdv = tdv_local_state.get_time_dependent_value(
229 (i_processing_element / TransFunc::n_subiterations).to_uint());
230 StencilImpl stencil(ID(c[i_processing_element], r[i_processing_element]),
231 UID(grid_width, grid_height), pe_iteration, pe_subiteration,
232 tdv);
233
234 bool h_halo_mask[stencil_diameter];
235 bool v_halo_mask[stencil_diameter];
236#pragma unroll
237 for (uindex_stencil_t mask_i = 0; mask_i < uindex_stencil_t(stencil_diameter);
238 mask_i++) {
239 // These computation assume that the central cell is in the grid. If it's
240 // not, the resulting value of this processing element will be discarded
241 // anyways, so this is safe.
242 if (mask_i < uindex_stencil_t(TransFunc::stencil_radius)) {
243 h_halo_mask[mask_i] = c[i_processing_element] >=
244 index_1d_t(TransFunc::stencil_radius - mask_i);
245 v_halo_mask[mask_i] = r[i_processing_element] >=
246 index_1d_t(TransFunc::stencil_radius - mask_i);
247 } else if (mask_i == uindex_stencil_t(TransFunc::stencil_radius)) {
248 h_halo_mask[mask_i] = true;
249 v_halo_mask[mask_i] = true;
250 } else {
251 h_halo_mask[mask_i] =
252 c[i_processing_element] <
253 grid_width + index_1d_t(TransFunc::stencil_radius - mask_i);
254 v_halo_mask[mask_i] =
255 r[i_processing_element] <
256 grid_height + index_1d_t(TransFunc::stencil_radius - mask_i);
257 }
258 }
259
260#pragma unroll
261 for (uindex_stencil_t cell_c = 0; cell_c < uindex_stencil_t(stencil_diameter);
262 cell_c++) {
263#pragma unroll
264 for (uindex_stencil_t cell_r = 0;
265 cell_r < uindex_stencil_t(stencil_diameter); cell_r++) {
266 if (h_halo_mask[cell_c] && v_halo_mask[cell_r]) {
267 stencil[StencilUID(cell_c, cell_r)] =
268 stencil_buffer[i_processing_element][cell_c][cell_r];
269 } else {
270 stencil[StencilUID(cell_c, cell_r)] = halo_value;
271 }
272 }
273 }
274
275 carry = trans_func(stencil);
276 } else {
277 carry = stencil_buffer[i_processing_element][TransFunc::stencil_radius]
278 [TransFunc::stencil_radius];
279 }
280
281 r[i_processing_element] += 1;
282 if (r[i_processing_element] == index_1d_t(grid_height)) {
283 r[i_processing_element] = 0;
284 c[i_processing_element] += 1;
285 }
286 }
287
288 if (i >= uindex_n_iterations_t(calc_pipeline_latency(grid_height))) {
289 out_pipe::write(carry);
290 }
291 }
292 }
293
294 private:
295 TransFunc trans_func;
296 uindex_t i_iteration;
297 uindex_t target_i_iteration;
298 uindex_t grid_width;
299 uindex_t grid_height;
300 Cell halo_value;
301 TDVKernelArgument tdv_kernel_argument;
302};
303
334template <concepts::TransitionFunction F, uindex_t n_processing_elements = 1,
335 uindex_t max_grid_width = 1024, uindex_t max_grid_height = 1024,
338 uindex_t word_size = 64>
340 private:
341 using Cell = F::Cell;
342 using TDV = typename F::TimeDependentValue;
343
344 public:
347
351 struct Params {
358
362 Cell halo_value = Cell();
363
371
376
385 sycl::device device = sycl::device();
386
396 bool blocking = false;
397
405 bool profiling = false;
406 };
407
412 : params(params), n_processed_cells(0), work_events(), walltime(0.0) {}
413
419 Params &get_params() { return params; }
420
432 if (source_grid.get_grid_height() > max_grid_height) {
433 throw std::range_error("The grid is too tall for the stencil update kernel.");
434 }
435 if (source_grid.get_grid_width() > max_grid_width) {
436 throw std::range_error("The grid is too wide for the stencil update kernel.");
437 }
438 using in_pipe = sycl::pipe<class monotile_in_pipe, Cell>;
439 using out_pipe = sycl::pipe<class monotile_out_pipe, Cell>;
440
441 constexpr uindex_t iters_per_pass = n_processing_elements / F::n_subiterations;
442
443 using TDVGlobalState = TDVStrategy::template GlobalState<F, iters_per_pass>;
444 using TDVKernelArgument = typename TDVGlobalState::KernelArgument;
445 using ExecutionKernelImpl =
446 StencilUpdateKernel<F, TDVKernelArgument, n_processing_elements, max_grid_width,
447 max_grid_height, in_pipe, out_pipe>;
448
449 sycl::queue input_kernel_queue =
450 sycl::queue(params.device, {sycl::property::queue::in_order{}});
451 sycl::queue output_kernel_queue =
452 sycl::queue(params.device, {sycl::property::queue::in_order{}});
453
454 sycl::queue update_kernel_queue =
455 sycl::queue(params.device, {cl::sycl::property::queue::enable_profiling{},
456 sycl::property::queue::in_order{}});
457
458 GridImpl swap_grid_a = source_grid.make_similar();
459 GridImpl swap_grid_b = source_grid.make_similar();
460
461 GridImpl *pass_source = &source_grid;
462 GridImpl *pass_target = &swap_grid_b;
463
464 F trans_func = params.transition_function;
465 TDVGlobalState tdv_global_state(trans_func, params.iteration_offset, params.n_iterations);
466
467 auto walltime_start = std::chrono::high_resolution_clock::now();
468
469 uindex_t target_n_iterations = params.iteration_offset + params.n_iterations;
470 for (uindex_t i = params.iteration_offset; i < target_n_iterations; i += iters_per_pass) {
471 pass_source->template submit_read<in_pipe>(input_kernel_queue);
472 uindex_t iters_in_this_pass = std::min(iters_per_pass, target_n_iterations - i);
473
474 sycl::event work_event = update_kernel_queue.submit([&](sycl::handler &cgh) {
475 TDVKernelArgument tdv_kernel_argument(tdv_global_state, cgh, i, iters_in_this_pass);
476 ExecutionKernelImpl exec_kernel(
477 trans_func, i, target_n_iterations, source_grid.get_grid_width(),
478 source_grid.get_grid_height(), params.halo_value, tdv_kernel_argument);
479 cgh.single_task<ExecutionKernelImpl>(exec_kernel);
480 });
481 if (params.profiling) {
482 work_events.push_back(work_event);
483 }
484
485 pass_target->template submit_write<out_pipe>(output_kernel_queue);
486
487 if (i == params.iteration_offset) {
488 pass_source = &swap_grid_b;
489 pass_target = &swap_grid_a;
490 } else {
491 std::swap(pass_source, pass_target);
492 }
493 }
494
495 if (params.blocking) {
496 output_kernel_queue.wait();
497 }
498
499 auto walltime_end = std::chrono::high_resolution_clock::now();
500 std::chrono::duration<double> walltime = walltime_end - walltime_start;
501 this->walltime += walltime.count();
502
503 n_processed_cells +=
504 params.n_iterations * source_grid.get_grid_width() * source_grid.get_grid_height();
505
506 return *pass_source;
507 }
508
516 uindex_t get_n_processed_cells() const { return n_processed_cells; }
517
524 double get_kernel_runtime() const {
525 double kernel_runtime = 0.0;
526 for (sycl::event work_event : work_events) {
527 const double timesteps_per_second = 1000000000.0;
528 double start =
529 double(work_event
530 .get_profiling_info<cl::sycl::info::event_profiling::command_start>()) /
531 timesteps_per_second;
532 double end =
533 double(
534 work_event.get_profiling_info<cl::sycl::info::event_profiling::command_end>()) /
535 timesteps_per_second;
536 kernel_runtime += end - start;
537 }
538 return kernel_runtime;
539 }
540
547 double get_walltime() const { return walltime; }
548
549 private:
550 Params params;
551 uindex_t n_processed_cells;
552 double walltime;
553 std::vector<sycl::event> work_events;
554};
555
556} // namespace monotile
557} // namespace stencil
T value
Definition Helpers.hpp:0
A generic, two-dimensional index.
Definition GenericID.hpp:32
The stencil buffer.
Definition Stencil.hpp:48
ac_int< bits_stencil, false > uindex_stencil_t
An unsigned index type for column and row indices in this stencil.
Definition Stencil.hpp:60
ac_int< bits_stencil, true > index_stencil_t
A signed index type for column and row indices in this stencil.
Definition Stencil.hpp:57
A grid class for the monotile architecture.
Definition Grid.hpp:58
uindex_t get_grid_width() const
Return the width, or number of columns, of the grid.
Definition Grid.hpp:128
uindex_t get_grid_height() const
Return the height, or number of rows, of the grid.
Definition Grid.hpp:133
The execution kernel of the monotile architecture.
Definition StencilUpdate.hpp:66
void operator()() const
Execute the kernel.
Definition StencilUpdate.hpp:145
StencilUpdateKernel(TransFunc trans_func, uindex_t i_iteration, uindex_t target_i_iteration, uindex_t grid_width, uindex_t grid_height, Cell halo_value, TDVKernelArgument tdv_kernel_argument)
Create and configure the execution kernel.
Definition StencilUpdate.hpp:133
A grid updater that applies an iterative stencil code to a grid.
Definition StencilUpdate.hpp:339
double get_kernel_runtime() const
Return the accumulated total runtime of the execution kernel.
Definition StencilUpdate.hpp:524
double get_walltime() const
Return the accumulated runtime of the updater, measured from the host side.
Definition StencilUpdate.hpp:547
Params & get_params()
Return a reference to the parameters.
Definition StencilUpdate.hpp:419
StencilUpdate(Params params)
Create a new stencil updater object.
Definition StencilUpdate.hpp:411
GridImpl operator()(GridImpl &source_grid)
Compute a new grid based on the source grid, using the configured transition function.
Definition StencilUpdate.hpp:431
uindex_t get_n_processed_cells() const
Return the accumulated total number of cells processed by this updater.
Definition StencilUpdate.hpp:516
A technical definition of a stencil transition function.
Definition Concepts.hpp:62
Requirements for a TDV implementation strategy.
Definition SinglePassStrategies.hpp:99
Definition AccessorSubscript.hpp:24
GenericID< uindex_t > UID
An unsigned, two-dimensional index.
Definition GenericID.hpp:80
BOOST_PP_CAT(BOOST_PP_CAT(uint, STENCIL_INDEX_WIDTH), _t) uindex_t
An unsigned integer of configurable width.
Definition Index.hpp:42
GenericID< index_t > ID
A signed, two-dimensional index.
Definition GenericID.hpp:75
A container with padding to the next power of two.
Definition Helpers.hpp:45
Parameters for the stencil updater.
Definition StencilUpdate.hpp:351
uindex_t n_iterations
The number of iterations to compute.
Definition StencilUpdate.hpp:375
uindex_t iteration_offset
The iteration index offset.
Definition StencilUpdate.hpp:370
bool profiling
Enable profiling.
Definition StencilUpdate.hpp:405
sycl::device device
The device to use for computations.
Definition StencilUpdate.hpp:385
Cell halo_value
The cell value to present for cells outside of the grid.
Definition StencilUpdate.hpp:362
bool blocking
Should the stencil updater block until completion, or return immediately after all kernels have been ...
Definition StencilUpdate.hpp:396
F transition_function
An instance of the transition function type.
Definition StencilUpdate.hpp:357
A TDV implementation strategy that inlines the TDV function into the transition function.
Definition SinglePassStrategies.hpp:114