Example: Automatic Buffer Synchronization

The following examples use the STREAM benchmark kernel. See Example: STREAM Benchmark for details on the kernel and on how to build the bitstream.

The kernel's first compute unit can be executed with automatic synchronization of the buffers as follows:

# Number of Float64 elements in each of the three STREAM vectors.
array_size = 2^20

# Host-side vectors, allocated through the `MemAlign(4096)` allocator.
a = Array{Float64}(MemAlign(4096), array_size)
b = Array{Float64}(MemAlign(4096), array_size)
c = Array{Float64}(MemAlign(4096), array_size)

# Random input data for `a` and `b`; the result vector `c` starts out zeroed.
a .= rand(array_size)
b .= rand(array_size)
fill!(c, 0)

# Load the bitstream and select the kernel's first compute unit (`k1`).
uuid = load_xclbin!("stream.xclbin")
stream = XRT.Kernel(uuid, "stream_calc:{k1}")

# Wrap each host vector in a buffer object, using the `group_id` of the
# kernel argument position (0, 1, 2) it is passed at in `XRT.Run` below.
xa = XRT.BOArray(a, group_id(stream, 0))
xb = XRT.BOArray(b, group_id(stream, 1))
xc = XRT.BOArray(c, group_id(stream, 2))

# Run the kernel; the macro synchronizes all buffers to and from the device.
XRT.@sync_buffers XRT.Run(stream, xa, xb, xc, 2.0, array_size, 1)

# Copy the buffer contents back into the host vectors.
a .= xa[:]
b .= xb[:]
c .= xc[:]

# The kernel is expected to have computed c = 2a + b element-wise.
@test all(isapprox.(c, 2 .* a .+ b; atol = 0.01))

This code snippet synchronizes all buffers to and from the device. To differentiate between input and output buffers, the XRT.BOArray can be replaced with the directional XRT.ToDeviceBOArray and XRT.FromDeviceBOArray:

# `a` and `b` are the kernel inputs, so they use the to-device buffer type.
xa = XRT.ToDeviceBOArray(a, group_id(stream, 0))
xb = XRT.ToDeviceBOArray(b, group_id(stream, 1))
# `c` receives the kernel output, so it uses the from-device buffer type.
xc = XRT.FromDeviceBOArray(c, group_id(stream, 2))

Alternatively, the plain XRT.BOArrays can still be used while the synchronization back to the host is handled manually. In that case, the XRT.@sync_buffers macro can be restricted to synchronizing the buffers in only one direction, here to the device:

# Synchronize the buffers only towards the device and keep the run handle in `r`.
XRT.@sync_buffers direction=XRT.TO_DEVICE r = XRT.Run(stream, xa, xb, xc, 2.0, array_size, 1)
# In this case the `wait` call is explicitly needed before reading back results.
wait(r)

# Manually synchronize the output buffer from the device, then copy it to the host vector.
sync!(xc, XRT.FROM_DEVICE)
c[:] .= xc[:]

# The kernel is expected to have computed c = 2a + b element-wise.
@test all(.≈(c, 2 .* a .+ b, atol = 0.01))

Parallel kernel execution

The automatic buffer synchronization can also be used when running kernels in parallel on multiple FPGAs. The STREAM kernel consists of two compute units. It is possible to run both in parallel on different FPGAs:

# Number of Float64 elements in each STREAM vector.
array_size = 2^20

# Host vectors for the first compute unit / first FPGA.
a1 = Array{Float64}(MemAlign(4096), array_size)
b1 = Array{Float64}(MemAlign(4096), array_size)
c1 = Array{Float64}(MemAlign(4096), array_size)

# Host vectors for the second compute unit / second FPGA.
a2 = Array{Float64}(MemAlign(4096), array_size)
b2 = Array{Float64}(MemAlign(4096), array_size)
c2 = Array{Float64}(MemAlign(4096), array_size)

# Random inputs; the result vectors start out zeroed.
a1[:] .= rand(array_size)
b1[:] .= rand(array_size)
c1[:] .= 0

a2[:] .= rand(array_size)
b2[:] .= rand(array_size)
c2[:] .= 0

# Load the bitstream onto each device and create each kernel from the UUID
# returned for *its own* device (`uuid1` for device 1, `uuid2` for device 2).
uuid1 = load_xclbin!("stream.xclbin"; device=XRT.device(1))
k1 = XRT.Kernel(uuid1, "stream_calc:{k1}")

uuid2 = load_xclbin!("stream.xclbin"; device=XRT.device(2))
k2 = XRT.Kernel(uuid2, "stream_calc:{k2}")

# Buffer objects for kernel `k1`, one per kernel argument position (0, 1, 2).
xa1 = XRT.BOArray(a1, group_id(k1, 0))
xb1 = XRT.BOArray(b1, group_id(k1, 1))
xc1 = XRT.BOArray(c1, group_id(k1, 2))

# Buffer objects for kernel `k2`, one per kernel argument position (0, 1, 2).
xa2 = XRT.BOArray(a2, group_id(k2, 0))
xb2 = XRT.BOArray(b2, group_id(k2, 1))
xc2 = XRT.BOArray(c2, group_id(k2, 2))

# Both runs are placed in one `@sync_buffers` block so that the buffers of
# both kernels are synchronized automatically around the executions.
XRT.@sync_buffers begin
    XRT.Run(k1, xa1, xb1, xc1, 2.0, array_size, 1)
    XRT.Run(k2, xa2, xb2, xc2, 2.0, array_size, 1)
end

# Copy the buffer contents back into the host vectors.
a1[:] .= xa1[:]
b1[:] .= xb1[:]
c1[:] .= xc1[:]

a2[:] .= xa2[:]
b2[:] .= xb2[:]
c2[:] .= xc2[:]

# Each compute unit is expected to have computed c = 2a + b element-wise.
@test all(.≈(c1, 2 .* a1 .+ b1, atol = 0.01))

@test all(.≈(c2, 2 .* a2 .+ b2, atol = 0.01))