StencilStream v3.0.0
SYCL-based Stencil Simulation Framework Targeting FPGAs
Loading...
Searching...
No Matches
StencilUpdate.hpp
Go to the documentation of this file.
1/*
2 * Copyright © 2020-2024 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
3 * University
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
6 * associated documentation files (the “Software”), to deal in the Software without restriction,
7 * including without limitation the rights to use, copy, modify, merge, publish, distribute,
8 * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in all copies or
12 * substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
15 * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
16 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
17 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 */
20#pragma once
21#include "../Concepts.hpp"
22#include "../GenericID.hpp"
23#include "../Helpers.hpp"
24#include "../Index.hpp"
25#include "../tdv/SinglePassStrategies.hpp"
26#include "Grid.hpp"
27
28#include <chrono>
29#include <optional>
30
31namespace stencil {
32namespace tiling {
33
56template <concepts::TransitionFunction TransFunc,
57 tdv::single_pass::KernelArgument<TransFunc> TDVKernelArgument,
58 uindex_t n_processing_elements, uindex_t output_tile_width, uindex_t output_tile_height,
59 typename in_pipe, typename out_pipe>
60 requires(n_processing_elements % TransFunc::n_subiterations == 0)
62 private:
63 using Cell = typename TransFunc::Cell;
64 using TDV = typename TransFunc::TimeDependentValue;
66 using TDVLocalState = typename TDVKernelArgument::LocalState;
67
68 static constexpr uindex_t stencil_diameter = StencilImpl::diameter;
69
70 static constexpr uindex_t halo_radius = TransFunc::stencil_radius * n_processing_elements;
71
72 static constexpr uindex_t max_input_tile_width = 2 * halo_radius + output_tile_width;
73
74 static constexpr uindex_t input_tile_height = 2 * halo_radius + output_tile_height;
75
76 static constexpr uindex_t n_input_cells = max_input_tile_width * input_tile_height;
77
78 using index_stencil_t = typename StencilImpl::index_stencil_t;
79 using uindex_stencil_t = typename StencilImpl::uindex_stencil_t;
80 using StencilID = typename StencilImpl::StencilID;
81 using StencilUID = typename StencilImpl::StencilUID;
82
83 static constexpr unsigned long bits_1d =
84 std::bit_width(std::max(max_input_tile_width, input_tile_height));
85 using index_1d_t = ac_int<bits_1d + 1, true>;
86 using uindex_1d_t = ac_int<bits_1d, false>;
87
88 static constexpr unsigned long bits_2d = 2 * bits_1d;
89 using index_2d_t = ac_int<bits_2d + 1, true>;
90 using uindex_2d_t = ac_int<bits_2d, false>;
91
92 // bits_pes must not be 1 since modulo operations on ac_int<1, false> don't work with oneAPI
93 // 2022.2. If they work with a future version of oneAPI, this can be reverted.
94 static constexpr unsigned long bits_pes =
95 std::max<int>(2, std::bit_width(n_processing_elements));
96 using index_pes_t = ac_int<bits_pes + 1, true>;
97 using uindex_pes_t = ac_int<bits_pes, false>;
98
99 public:
126 StencilUpdateKernel(TransFunc trans_func, uindex_t i_iteration, uindex_t target_i_iteration,
127 uindex_t grid_c_offset, uindex_t grid_r_offset, uindex_t grid_width,
128 uindex_t grid_height, Cell halo_value,
129 TDVKernelArgument tdv_kernel_argument)
130 : trans_func(trans_func), i_iteration(i_iteration), target_i_iteration(target_i_iteration),
131 grid_c_offset(grid_c_offset), grid_r_offset(grid_r_offset), grid_width(grid_width),
132 grid_height(grid_height), halo_value(halo_value),
133 tdv_kernel_argument(tdv_kernel_argument) {
134 assert(grid_c_offset % output_tile_width == 0);
135 assert(grid_r_offset % output_tile_height == 0);
136 }
137
141 void operator()() const {
142 TDVLocalState tdv_local_state(tdv_kernel_argument);
143
144 uindex_1d_t input_tile_c = 0;
145 uindex_1d_t input_tile_r = 0;
146
147 /*
148 * The intel::numbanks attribute requires a power of two as it's argument and if the
149 * number of processing elements isn't a power of two, it would produce an error. Therefore,
150 * we calculate the next power of two and use it to allocate the cache. The compiler is
151 * smart enough to see that these additional banks in the cache aren't used and therefore
152 * optimizes them away.
153 */
154 [[intel::fpga_memory,
155 intel::numbanks(2 * std::bit_ceil(n_processing_elements))]] Padded<Cell>
156 cache[2][input_tile_height][std::bit_ceil(n_processing_elements)][stencil_diameter - 1];
157 [[intel::fpga_register]] Cell stencil_buffer[n_processing_elements][stencil_diameter]
158 [stencil_diameter];
159
160 uindex_1d_t output_tile_section_width =
161 std::min(output_tile_width, grid_width - grid_c_offset);
162 uindex_1d_t output_tile_section_height =
163 std::min(output_tile_height, grid_height - grid_r_offset);
164 uindex_1d_t input_tile_section_width = output_tile_section_width + 2 * halo_radius;
165 uindex_1d_t input_tile_section_height = output_tile_section_height + 2 * halo_radius;
166 uindex_2d_t n_iterations = input_tile_section_width * input_tile_section_height;
167
168 for (uindex_2d_t i = 0; i < n_iterations; i++) {
169 [[intel::fpga_register]] Cell carry = in_pipe::read();
170
171#pragma unroll
172 for (uindex_pes_t i_processing_element = 0;
173 i_processing_element < uindex_pes_t(n_processing_elements);
174 i_processing_element++) {
175 /*
176 * Shift up every value in the stencil_buffer.
177 * This operation does not touch the values in the bottom row, which will be filled
178 * from the cache and the new input value later.
179 */
180#pragma unroll
181 for (uindex_stencil_t r = 0; r < uindex_stencil_t(stencil_diameter - 1); r++) {
182#pragma unroll
183 for (uindex_stencil_t c = 0; c < uindex_stencil_t(stencil_diameter); c++) {
184 stencil_buffer[i_processing_element][c][r] =
185 stencil_buffer[i_processing_element][c][r + 1];
186 }
187 }
188
189 index_1d_t rel_input_grid_c =
190 index_1d_t(input_tile_c) -
191 index_1d_t((stencil_diameter - 1) +
192 (n_processing_elements + i_processing_element - 2) *
193 TransFunc::stencil_radius);
194 index_t input_grid_c = grid_c_offset + rel_input_grid_c.to_int64();
195 index_1d_t rel_input_grid_r =
196 index_1d_t(input_tile_r) -
197 index_1d_t((stencil_diameter - 1) +
198 (n_processing_elements + i_processing_element - 2) *
199 TransFunc::stencil_radius);
200 index_t input_grid_r = grid_r_offset + rel_input_grid_r.to_int64();
201
202 // Update the stencil buffer and cache with previous cache contents and the new
203 // input cell.
204#pragma unroll
205 for (uindex_stencil_t cache_c = 0; cache_c < uindex_stencil_t(stencil_diameter);
206 cache_c++) {
207 Cell new_value;
208 if (cache_c == uindex_stencil_t(stencil_diameter - 1)) {
209 bool is_halo = (grid_c_offset == 0 && rel_input_grid_c < 0);
210 is_halo |= (grid_r_offset == 0 && rel_input_grid_r < 0);
211 is_halo |= input_grid_c >= grid_width || input_grid_r >= grid_height;
212
213 new_value = is_halo ? halo_value : carry;
214 } else {
215 new_value =
216 cache[input_tile_c[0]][input_tile_r][i_processing_element][cache_c]
217 .value;
218 }
219
220 stencil_buffer[i_processing_element][cache_c][stencil_diameter - 1] = new_value;
221 if (cache_c > 0) {
222 cache[(~input_tile_c)[0]][input_tile_r][i_processing_element][cache_c - 1]
223 .value = new_value;
224 }
225 }
226
227 uindex_t pe_iteration =
228 (i_iteration + i_processing_element / TransFunc::n_subiterations).to_uint();
229 uindex_t pe_subiteration =
230 (i_processing_element % TransFunc::n_subiterations).to_uint();
231 index_t output_grid_c = input_grid_c - index_t(TransFunc::stencil_radius);
232 index_t output_grid_r = input_grid_r - index_t(TransFunc::stencil_radius);
233 TDV tdv = tdv_local_state.get_time_dependent_value(i_processing_element /
234 TransFunc::n_subiterations);
235 StencilImpl stencil(ID(output_grid_c, output_grid_r), UID(grid_width, grid_height),
236 pe_iteration, pe_subiteration, tdv,
237 stencil_buffer[i_processing_element]);
238
239 if (pe_iteration < target_i_iteration) {
240 carry = trans_func(stencil);
241 } else {
242 carry = stencil_buffer[i_processing_element][TransFunc::stencil_radius]
243 [TransFunc::stencil_radius];
244 }
245 }
246
247 bool is_valid_output =
248 input_tile_c >= uindex_1d_t((stencil_diameter - 1) * n_processing_elements);
249 is_valid_output &=
250 input_tile_r >= uindex_1d_t((stencil_diameter - 1) * n_processing_elements);
251
252 if (is_valid_output) {
253 out_pipe::write(carry);
254 }
255
256 if (input_tile_r == input_tile_section_height - 1) {
257 input_tile_r = 0;
258 input_tile_c++;
259 } else {
260 input_tile_r++;
261 }
262 }
263 }
264
265 private:
266 TransFunc trans_func;
267 uindex_t i_iteration;
268 uindex_t target_i_iteration;
269 uindex_t grid_c_offset;
270 uindex_t grid_r_offset;
271 uindex_t grid_width;
272 uindex_t grid_height;
273 Cell halo_value;
274 TDVKernelArgument tdv_kernel_argument;
275};
276
304template <concepts::TransitionFunction F, uindex_t n_processing_elements = 1,
305 uindex_t tile_width = 1024, uindex_t tile_height = 1024,
309 private:
310 using Cell = F::Cell;
311 using TDVGlobalState = typename TDVStrategy::template GlobalState<F, n_processing_elements>;
312 using TDVKernelArgument = typename TDVGlobalState::KernelArgument;
313
314 public:
318 static constexpr uindex_t halo_radius = F::stencil_radius * n_processing_elements;
319
324
328 struct Params {
335
339 Cell halo_value = Cell();
340
348
353
362 sycl::device device = sycl::device();
363
373 bool blocking = false;
374
382 bool profiling = false;
383 };
384
389 : params(params), n_processed_cells(0), work_events(), walltime(0.0) {}
390
396 Params &get_params() { return params; }
397
409 using in_pipe = sycl::pipe<class tiling_in_pipe, Cell>;
410 using out_pipe = sycl::pipe<class tiling_out_pipe, Cell>;
411 using ExecutionKernelImpl = StencilUpdateKernel<F, TDVKernelArgument, n_processing_elements,
412 tile_width, tile_height, in_pipe, out_pipe>;
413
414 if (params.n_iterations == 0) {
415 return GridImpl(source_grid);
416 }
417
418 sycl::queue input_kernel_queue =
419 sycl::queue(params.device, {sycl::property::queue::in_order{}});
420 sycl::queue output_kernel_queue =
421 sycl::queue(params.device, {sycl::property::queue::in_order{}});
422 sycl::queue working_queue =
423 sycl::queue(params.device, {cl::sycl::property::queue::enable_profiling{},
424 sycl::property::queue::in_order{}});
425
426 GridImpl swap_grid_a = source_grid.make_similar();
427 GridImpl swap_grid_b = source_grid.make_similar();
428
429 uindex_t iters_per_pass = n_processing_elements / F::n_subiterations;
430 GridImpl *pass_source = &source_grid;
431 GridImpl *pass_target = &swap_grid_b;
432
433 UID tile_range = source_grid.get_tile_range();
434 uindex_t grid_width = source_grid.get_grid_width();
435 uindex_t grid_height = source_grid.get_grid_height();
436
437 F trans_func = params.transition_function;
438 TDVGlobalState tdv_global_state(trans_func, params.iteration_offset, params.n_iterations);
439
440 auto walltime_start = std::chrono::high_resolution_clock::now();
441
442 uindex_t target_n_iterations = params.iteration_offset + params.n_iterations;
443 for (uindex_t i = params.iteration_offset; i < target_n_iterations; i += iters_per_pass) {
444 uindex_t iters_in_this_pass = std::min(iters_per_pass, target_n_iterations - i);
445
446 for (uindex_t i_tile_c = 0; i_tile_c < tile_range.c; i_tile_c++) {
447 for (uindex_t i_tile_r = 0; i_tile_r < tile_range.r; i_tile_r++) {
448 pass_source->template submit_read<in_pipe>(input_kernel_queue, i_tile_c,
449 i_tile_r, params.halo_value);
450
451 auto work_event = working_queue.submit([&](sycl::handler &cgh) {
452 TDVKernelArgument tdv_kernel_argument(tdv_global_state, cgh, i,
453 iters_in_this_pass);
454 uindex_t c_offset = i_tile_c * tile_width;
455 uindex_t r_offset = i_tile_r * tile_height;
456
457 ExecutionKernelImpl exec_kernel(trans_func, i, target_n_iterations,
458 c_offset, r_offset, grid_width, grid_height,
459 params.halo_value, tdv_kernel_argument);
460
461 cgh.single_task<ExecutionKernelImpl>(exec_kernel);
462 });
463 if (params.profiling) {
464 work_events.push_back(work_event);
465 }
466
467 pass_target->template submit_write<out_pipe>(output_kernel_queue, i_tile_c,
468 i_tile_r);
469 }
470 }
471
472 if (i == params.iteration_offset) {
473 pass_source = &swap_grid_b;
474 pass_target = &swap_grid_a;
475 } else {
476 std::swap(pass_source, pass_target);
477 }
478 }
479
480 if (params.blocking) {
481 output_kernel_queue.wait();
482 }
483
484 auto walltime_end = std::chrono::high_resolution_clock::now();
485 std::chrono::duration<double> walltime = walltime_end - walltime_start;
486 this->walltime += walltime.count();
487
488 n_processed_cells +=
489 params.n_iterations * source_grid.get_grid_width() * source_grid.get_grid_height();
490
491 return *pass_source;
492 }
493
501 uindex_t get_n_processed_cells() const { return n_processed_cells; }
502
509 double get_kernel_runtime() const {
510 double kernel_runtime = 0.0;
511 for (sycl::event work_event : work_events) {
512 const double timesteps_per_second = 1000000000.0;
513 double start =
514 double(work_event
515 .get_profiling_info<cl::sycl::info::event_profiling::command_start>()) /
516 timesteps_per_second;
517 double end =
518 double(
519 work_event.get_profiling_info<cl::sycl::info::event_profiling::command_end>()) /
520 timesteps_per_second;
521 kernel_runtime += end - start;
522 }
523 return kernel_runtime;
524 }
525
532 double get_walltime() const { return walltime; }
533
534 private:
535 Params params;
536 uindex_t n_processed_cells;
537 double walltime;
538 std::vector<sycl::event> work_events;
539};
540
541} // namespace tiling
542} // namespace stencil
T value
Definition Helpers.hpp:0
A generic, two-dimensional index.
Definition GenericID.hpp:32
The stencil buffer.
Definition Stencil.hpp:48
ac_int< bits_stencil, false > uindex_stencil_t
An unsigned index type for column and row indices in this stencil.
Definition Stencil.hpp:60
ac_int< bits_stencil, true > index_stencil_t
A signed index type for column and row indices in this stencil.
Definition Stencil.hpp:57
A grid class for the tiling architecture.
Definition Grid.hpp:74
A kernel that executes a stencil transition function on a tile.
Definition StencilUpdate.hpp:61
void operator()() const
Execute the configured operations.
Definition StencilUpdate.hpp:141
StencilUpdateKernel(TransFunc trans_func, uindex_t i_iteration, uindex_t target_i_iteration, uindex_t grid_c_offset, uindex_t grid_r_offset, uindex_t grid_width, uindex_t grid_height, Cell halo_value, TDVKernelArgument tdv_kernel_argument)
Create and configure the execution kernel.
Definition StencilUpdate.hpp:126
A grid updater that applies an iterative stencil code to a grid.
Definition StencilUpdate.hpp:308
Grid< Cell, tile_width, tile_height, halo_radius > GridImpl
A shorthand for the used and supported grid type.
Definition StencilUpdate.hpp:323
double get_walltime() const
Return the accumulated runtime of the updater, measured from the host side.
Definition StencilUpdate.hpp:532
StencilUpdate(Params params)
Create a new stencil updater object.
Definition StencilUpdate.hpp:388
uindex_t get_n_processed_cells() const
Return the accumulated total number of cells processed by this updater.
Definition StencilUpdate.hpp:501
static constexpr uindex_t halo_radius
The radius of an input's tile halo.
Definition StencilUpdate.hpp:318
Params & get_params()
Return a reference to the parameters.
Definition StencilUpdate.hpp:396
GridImpl operator()(GridImpl &source_grid)
Compute a new grid based on the source grid, using the configured transition function.
Definition StencilUpdate.hpp:408
double get_kernel_runtime() const
Return the accumulated total runtime of the execution kernel.
Definition StencilUpdate.hpp:509
A technical definition of a stencil transition function.
Definition Concepts.hpp:62
Requirements for a TDV implementation strategy.
Definition SinglePassStrategies.hpp:99
Definition AccessorSubscript.hpp:24
GenericID< uindex_t > UID
An unsigned, two-dimensional index.
Definition GenericID.hpp:80
BOOST_PP_CAT(BOOST_PP_CAT(uint, STENCIL_INDEX_WIDTH), _t) uindex_t
An unsigned integer of configurable width.
Definition Index.hpp:42
GenericID< index_t > ID
A signed, two-dimensional index.
Definition GenericID.hpp:75
BOOST_PP_CAT(BOOST_PP_CAT(int, STENCIL_INDEX_WIDTH), _t) index_t
A signed integer of configurable width.
Definition Index.hpp:56
A container with padding to the next power of two.
Definition Helpers.hpp:45
A TDV implementation strategy that inlines the TDV function into the transition function.
Definition SinglePassStrategies.hpp:114
Parameters for the stencil updater.
Definition StencilUpdate.hpp:328
bool blocking
Should the stencil updater block until completion, or return immediately after all kernels have been ...
Definition StencilUpdate.hpp:373
uindex_t iteration_offset
The iteration index offset.
Definition StencilUpdate.hpp:347
uindex_t n_iterations
The number of iterations to compute.
Definition StencilUpdate.hpp:352
Cell halo_value
The cell value to present for cells outside of the grid.
Definition StencilUpdate.hpp:339
F transition_function
An instance of the transition function type.
Definition StencilUpdate.hpp:334
bool profiling
Enable profiling.
Definition StencilUpdate.hpp:382
sycl::device device
The device to use for computations.
Definition StencilUpdate.hpp:362