sycl_source/kernel.cpp

/*
 * SYCL kernel code for BM3D
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 * Copyright (c) 2021-2023 WolframRhodium
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

// functions "dct_pack8_interleave4" and "idct_pack8_interleave4"
// are modified from code generated by fftw-3.3.9
// WolframRhodium, 8 May 2021

#include <cfloat>
#include <type_traits>

#include <sycl/sycl.hpp>

// Arithmetic shorthands used by the fftw-derived dct() code further down.
// Each one expands to a plain multiply followed by an add/subtract (no explicit
// fused-multiply-add call is made); all arguments are parenthesised so that
// arbitrary expressions can be passed.
//   FMA(a, b, c)  ->  a * b + c
//   FMS(a, b, c)  ->  a * b - c
//   FNMS(a, b, c) ->  c - a * b
#define FMA(a, b, c) (((a) * (b)) + (c))
#define FMS(a, b, c) (((a) * (b)) - (c))
#define FNMS(a, b, c) ((c) - ((a) * (b)))

// Host-side entry point (forward declaration; the definition is further down
// in this file).
//
// From the visible part of the definition: the contents of `h_res` are copied
// into `d_src`, `d_res` is zero-filled, and one bm3d<temporal, chroma, final_>
// specialisation -- selected from `radius != 0`, `chroma` and `final_` -- is
// submitted to `stream` after both of those operations.
// NOTE(review): the "DtoH shape" remark on `h_res` suggests the result is
// copied back into `h_res` as well, and that the returned event covers that
// copy -- confirm against the full definition.
sycl::event launch(
    /* shape: [(chroma ? 3 : 1), (2 * radius + 1), 2, height, stride] */
    float * d_res,
    /* shape: [(final_ ? 2 : 1), (chroma ? 3 : 1), (2 * radius + 1), height, stride] */
    float * d_src,
    /* HtoD shape: [(final_ ? 2 : 1), (chroma ? 3 : 1), (2 * radius + 1), height, stride] */
    /* DtoH shape: [(chroma ? 3 : 1), (2 * radius + 1), 2, height, stride] */
    float * h_res,
    int width, int height, int stride,
    float sigma, int block_step, int bm_range,
    int radius, int ps_num, int ps_range,
    bool chroma, float sigma_u, float sigma_v,
    bool final_, float extractor,
    sycl::queue & stream
);

// Sub-group size requested for the kernels in this file through
// [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]. It can be overridden from the
// build command line; 8 is used when nothing is supplied.
#ifndef SUBGROUP_SIZE
// ponte vecchio (xe-hpc) should set this to 16
#define SUBGROUP_SIZE 8
#endif

// Row pitch, in floats, of the work-group local scratch buffer used by
// transpose_pack8_interleave4(); the buffer is allocated with 8 such rows.
// NOTE(review): the extra "+ 1" on top of the 32 work-items presumably pads
// each row to avoid local-memory bank conflicts -- confirm.
static constexpr int smem_stride = 32 + 1;

// Sub-group counterpart of the CUDA "shuffle up" primitive with a logical
// segment size of `width` lanes, see
// https://docs.nvidia.com/cuda/archive/12.2.2/cuda-c-programming-guide/index.html#id36
//
// Every work-item receives `var` as held by the work-item `delta` lanes below
// it. A lane whose position inside its `width`-wide segment is smaller than
// `delta` has no such source and gets its own value back.
template <int width, typename T>
    requires std::is_trivially_copyable_v<T>
static inline T shuffle_up(T var, int delta, sycl::sub_group sub_group) {
    const int lane = static_cast<int>(sub_group.get_local_id()[0]);

    // lanes at the bottom of a segment read from themselves
    const bool has_source = (lane % width) >= delta;
    const int source_lane = has_source ? (lane - delta) : lane;

    return sub_group.shuffle(var, source_lane);
}

// Sub-group counterpart of the CUDA indexed shuffle with a logical segment
// size of `width` lanes, see
// https://docs.nvidia.com/cuda/archive/12.2.2/cuda-c-programming-guide/index.html#id36
//
// Every work-item receives `var` as held by lane `src_lane` of its own
// `width`-wide segment of the sub-group.
//
// As in the CUDA primitive, `width` must be a power of two and a `src_lane`
// outside [0, width) is taken modulo `width`, so the source always stays
// inside the caller's segment (and inside the sub-group).
template <int width, typename T>
    requires std::is_trivially_copyable_v<T>
static inline T shuffle(T var, int src_lane, sycl::sub_group sub_group) {
    static_assert(
        width > 0 && (width & (width - 1)) == 0,
        "width must be a power of two"
    );

    // wrap the requested lane into the segment; a no-op for in-range values
    src_lane &= width - 1;

    // the whole sub-group is a single segment: lane ids are already segment-relative
    if (sub_group.get_max_local_range()[0] == width) {
        return sub_group.shuffle(var, src_lane);
    }

    // first lane of the segment the calling work-item belongs to
    const int segment_base = static_cast<int>(sub_group.get_local_id()[0]) & -width;
    return sub_group.shuffle(var, segment_base + src_lane);
}

template <auto transform_impl, int stride=256, int howmany=8, int howmany_stride=32>
static inline void transform_pack8_interleave4(
    float * __restrict__ data, float * __restrict__ buffer
) {

    #pragma unroll
    for (int iter = 0; iter < howmany; ++iter, data += howmany_stride) {
        float v[8];

        #pragma unroll
        for (int i = 0; i < 8; ++i) {
            v[i] = data[i * stride];
        }

        transform_impl(v);

        #pragma unroll
        for (int i = 0; i < 8; ++i) {
            data[i * stride] = v[i];
        }
    }
}

// modified from fftw-3.3.9 generated code:
// fftw-3.3.9/rdft/scalar/r2r/e10_8.c and e01_8.c
// (normalized, scaled) DCT-II/DCT-III
//
// In-place 8-point transform of v[0..7].
//   forward == true  : the branch derived from e10_8.c (DCT-II)
//   forward == false : the branch derived from e01_8.c (DCT-III)
// The branch is selected at compile time. The result is not normalised to
// unit gain: for a constant input, a forward pass followed by an inverse pass
// returns the input multiplied by 16. The filtering routines in this file
// multiply by 1/4096 (= 1/16^3) after the three forward passes.
//
// The statements are straight-line butterfly code; temporaries keep the names
// produced by the fftw generator. FMA/FMS/FNMS are the macros defined at the
// top of this file.
template <bool forward>
static inline void dct(float v[8]) {
    if constexpr (forward) {
        // twiddle constants (the KP names encode the decimal digits)
        float KP414213562 {+0.414213562373095048801688724209698078569671875}; // ~ tan(pi/8)
        float KP1_847759065 {+1.847759065022573512256366378793576573644833252}; // ~ 2*cos(pi/8)
        float KP198912367 {+0.198912367379658006911597622644676228597850501}; // ~ tan(pi/16)
        float KP1_961570560 {+1.961570560806460898252364472268478073947867462}; // ~ 2*cos(pi/16)
        float KP1_414213562 {+1.414213562373095048801688724209698078569671875}; // ~ sqrt(2)
        float KP668178637 {+0.668178637919298919997757686523080761552472251}; // ~ tan(3*pi/16)
        float KP1_662939224 {+1.662939224605090474157576755235811513477121624}; // ~ 2*cos(3*pi/16)
        float KP707106781 {+0.707106781186547524400844362104849039284835938}; // ~ 1/sqrt(2)

        // first stage: sums and differences of the mirrored input pairs
        // (0,7), (4,3), (2,5) and (1,6)
        auto T1 = v[0];
        auto T2 = v[7];
        auto T3 = T1 - T2;
        auto Tj = T1 + T2;
        auto Tc = v[4];
        auto Td = v[3];
        auto Te = Tc - Td;
        auto Tk = Tc + Td;
        auto T4 = v[2];
        auto T5 = v[5];
        auto T6 = T4 - T5;
        auto T7 = v[1];
        auto T8 = v[6];
        auto T9 = T7 - T8;
        auto Ta = T6 + T9;
        auto Tn = T7 + T8;
        auto Tf = T6 - T9;
        auto Tm = T4 + T5;
        // odd outputs 3 and 5 (built from the pair differences)
        auto Tb = FNMS(KP707106781, Ta, T3);
        auto Tg = FNMS(KP707106781, Tf, Te);
        v[3] = KP1_662939224 * (FMA(KP668178637, Tg, Tb));
        v[5] = -(KP1_662939224 * (FNMS(KP668178637, Tb, Tg)));
        // even outputs 0 and 4 (built from the pair sums); v[0] is sqrt(2) * sum of inputs
        auto Tp = Tj + Tk;
        auto Tq = Tm + Tn;
        v[4] = KP1_414213562 * (Tp - Tq);
        v[0] = KP1_414213562 * (Tp + Tq);
        // odd outputs 1 and 7
        auto Th = FMA(KP707106781, Ta, T3);
        auto Ti = FMA(KP707106781, Tf, Te);
        v[1] = KP1_961570560 * (FNMS(KP198912367, Ti, Th));
        v[7] = KP1_961570560 * (FMA(KP198912367, Th, Ti));
        // even outputs 2 and 6
        auto Tl = Tj - Tk;
        auto To = Tm - Tn;
        v[2] = KP1_847759065 * (FNMS(KP414213562, To, Tl));
        v[6] = KP1_847759065 * (FMA(KP414213562, Tl, To));
    } else {
        // same constant values as in the forward branch
        float KP1_662939224 {+1.662939224605090474157576755235811513477121624}; // ~ 2*cos(3*pi/16)
        float KP668178637 {+0.668178637919298919997757686523080761552472251}; // ~ tan(3*pi/16)
        float KP1_961570560 {+1.961570560806460898252364472268478073947867462}; // ~ 2*cos(pi/16)
        float KP198912367 {+0.198912367379658006911597622644676228597850501}; // ~ tan(pi/16)
        float KP1_847759065 {+1.847759065022573512256366378793576573644833252}; // ~ 2*cos(pi/8)
        float KP707106781 {+0.707106781186547524400844362104849039284835938}; // ~ 1/sqrt(2)
        float KP414213562 {+0.414213562373095048801688724209698078569671875}; // ~ tan(pi/8)
        float KP1_414213562 {+1.414213562373095048801688724209698078569671875}; // ~ sqrt(2)

        // even-indexed coefficients 0, 4, 2, 6
        auto T1 = v[0] * KP1_414213562;
        auto T2 = v[4];
        auto T3 = FMA(KP1_414213562, T2, T1);
        auto Tj = FNMS(KP1_414213562, T2, T1);
        auto T4 = v[2];
        auto T5 = v[6];
        auto T6 = FMA(KP414213562, T5, T4);
        auto Tk = FMS(KP414213562, T4, T5);
        // odd-indexed coefficients 1, 7, 5, 3
        auto T8 = v[1];
        auto Td = v[7];
        auto T9 = v[5];
        auto Ta = v[3];
        auto Tb = T9 + Ta;
        auto Te = Ta - T9;
        auto Tc = FMA(KP707106781, Tb, T8);
        auto Tn = FNMS(KP707106781, Te, Td);
        auto Tf = FMA(KP707106781, Te, Td);
        auto Tm = FNMS(KP707106781, Tb, T8);
        // each even/odd combination yields one mirrored output pair:
        // (7,0), (5,2), (3,4) and (6,1)
        auto T7 = FMA(KP1_847759065, T6, T3);
        auto Tg = FMA(KP198912367, Tf, Tc);
        v[7] = FNMS(KP1_961570560, Tg, T7);
        v[0] = FMA(KP1_961570560, Tg, T7);
        auto Tp = FNMS(KP1_847759065, Tk, Tj);
        auto Tq = FMA(KP668178637, Tm, Tn);
        v[5] = FNMS(KP1_662939224, Tq, Tp);
        v[2] = FMA(KP1_662939224, Tq, Tp);
        auto Th = FNMS(KP1_847759065, T6, T3);
        auto Ti = FNMS(KP198912367, Tc, Tf);
        v[3] = FNMS(KP1_961570560, Ti, Th);
        v[4] = FMA(KP1_961570560, Ti, Th);
        auto Tl = FMA(KP1_847759065, Tk, Tj);
        auto To = FNMS(KP668178637, Tn, Tm);
        v[6] = FNMS(KP1_662939224, To, Tl);
        v[1] = FMA(KP1_662939224, To, Tl);
    }
}

// 2-D transposition
// launched by blockDim(x=32, y=1, z=1)
//
// For each of the `howmany` vectors (vector n starts at data[n * howmany_stride],
// its 8 elements are `stride` floats apart) the 8 work-items that share
// lane_id / 8 swap values through `buffer`: element i of lane (8 * g + s)
// becomes what element s of lane (8 * g + i) held before the call.
// `buffer` is the work-group local scratch area with row pitch smem_stride.
template <int stride=256, int howmany=8, int howmany_stride=32>
static inline void transpose_pack8_interleave4(
    float * __restrict__ data, float * __restrict__ buffer,
    sycl::nd_item<2> it
) {

    const int lane_id = static_cast<int>(it.get_group().get_local_id(1));

    // column this work-item fills, and the row segment it reads back
    float * const store_col = buffer + lane_id;
    const float * const load_row = buffer + (lane_id % 8) * smem_stride + (lane_id & -8);

    #pragma unroll
    for (int n = 0; n < howmany; ++n) {
        float * const vec = data + n * howmany_stride;

        // reads of the previous round must be finished before the scratch is overwritten
        it.barrier(sycl::access::fence_space::local_space);

        #pragma unroll
        for (int i = 0; i < 8; ++i) {
            store_col[i * smem_stride] = vec[i * stride];
        }

        // all columns must be written before any row is read
        it.barrier(sycl::access::fence_space::local_space);

        #pragma unroll
        for (int i = 0; i < 8; ++i) {
            vec[i * stride] = load_row[i];
        }
    }
}

// Hard-thresholds 64 transform coefficients in place.
// launched by blockDim(x=32, y=1, z=1)
//
// A coefficient whose magnitude is at least `sigma` is kept and scaled by
// 1/4096; every other coefficient is set to zero. The coefficient at index 0
// of the work-item whose sub-group lane is a multiple of 8 is always kept
// (threshold 0). The per-work-item count of kept coefficients is then summed
// over groups of 8 lanes, and the reciprocal of that sum is returned.
template <int stride=32>
static inline float hard_thresholding(float * data, float sigma, sycl::sub_group sub_group) {
    const bool holds_dc = (static_cast<int>(sub_group.get_local_id()[0]) % 8) == 0;

    // number of non-zero coefficients
    float k {};

    #pragma unroll
    for (int i = 0; i < 64; ++i) {
        float & slot = data[i * stride];
        const float val = slot;

        // protects DC component
        const float thr = (i == 0 && holds_dc) ? 0.0f : sigma;

        const bool keep = fabsf(val) >= thr;

        k += keep ? 1.0f : 0.0f;

        slot = keep ? (val * (1.0f / 4096.0f)) : 0.0f;
    }

    // butterfly sum across the 8 lanes of a group
    #pragma unroll
    for (int mask = 4; mask > 0; mask /= 2) {
        k += sub_group.shuffle_xor(k, mask);
    }

    return 1.0f / k;
}

// hard thresholding
// launched by blockDim(x=32, y=1, z=1)
//
// Filters the 64 values held by this work-item in `denoising_patch`
// (8 vectors of 8 consecutive values each):
//   1. forward transform: twice { dct over each 8-value vector, then an
//      8x8 exchange between the lanes of a group }, followed by a dct
//      across the 8 vectors;
//   2. hard_thresholding() with `sigma`;
//   3. the same sequence with the inverse dct.
// `buffer` is the local scratch used by the lane exchange. Returns the
// weight computed by hard_thresholding().
static inline float collaborative_hard(
    float * __restrict__ denoising_patch, float sigma, float * __restrict__ buffer,
    sycl::nd_item<2> it
) {

    constexpr int within_vector = 1;                  // step between the values of one vector
    constexpr int across_vectors = within_vector * 8; // step between consecutive vectors

    // forward 3-D transform
    #pragma unroll
    for (int pass = 0; pass < 2; ++pass) {
        transform_pack8_interleave4<dct<true>, within_vector, 8, across_vectors>(denoising_patch, buffer);
        transpose_pack8_interleave4<within_vector, 8, across_vectors>(denoising_patch, buffer, it);
    }
    transform_pack8_interleave4<dct<true>, across_vectors, 8, within_vector>(denoising_patch, buffer);

    // shrink the spectrum
    const float adaptive_weight = hard_thresholding<within_vector>(denoising_patch, sigma, it.get_sub_group());

    // inverse 3-D transform
    #pragma unroll
    for (int pass = 0; pass < 2; ++pass) {
        transform_pack8_interleave4<dct<false>, within_vector, 8, across_vectors>(denoising_patch, buffer);
        transpose_pack8_interleave4<within_vector, 8, across_vectors>(denoising_patch, buffer, it);
    }
    transform_pack8_interleave4<dct<false>, across_vectors, 8, within_vector>(denoising_patch, buffer);

    return adaptive_weight;
}

// Applies empirical Wiener shrinkage to 64 transform coefficients in place.
// launched by blockDim(x=32, y=1, z=1)
//
// Each coefficient of `data` is multiplied by
//   ref^2 / (ref^2 + sigma^2)
// computed from the matching coefficient of `ref`, and by 1/4096. The
// coefficient at index 0 of the work-item whose sub-group lane is a multiple
// of 8 uses a factor of 1 instead. The squared factors are summed per
// work-item and then over groups of 8 lanes; the reciprocal of that sum is
// returned.
template <int stride=32>
static inline float wiener_filtering(
    float * __restrict__ data, float * __restrict__ ref, float sigma,
    sycl::sub_group sub_group
) {

    const bool holds_dc = (static_cast<int>(sub_group.get_local_id()[0]) % 8) == 0;

    // squared l2-norm of coefficients
    float k {};

    #pragma unroll
    for (int i = 0; i < 64; ++i) {
        const int pos = i * stride;
        const float ref_val = ref[pos];

        float coeff = (ref_val * ref_val) / (ref_val * ref_val + sigma * sigma);
        if (i == 0 && holds_dc) {
            coeff = 1.0f; // protects DC component
        }

        k += coeff * coeff;
        data[pos] = (data[pos] * coeff) * (1.0f / 4096.0f);
    }

    // butterfly sum across the 8 lanes of a group
    #pragma unroll
    for (int mask = 4; mask > 0; mask /= 2) {
        k += sub_group.shuffle_xor(k, mask);
    }

    return 1.0f / k;
}

// wiener filtering
// launched by blockDim(x=32, y=1, z=1)
//
// Filters the 64 values held by this work-item in `denoising_patch`
// (8 vectors of 8 consecutive values each), guided by `ref_patch`:
//   1. forward transform of `denoising_patch`: twice { dct over each 8-value
//      vector, then an 8x8 exchange between the lanes of a group }, followed
//      by a dct across the 8 vectors;
//   2. the same forward transform of `ref_patch` (left in the transform
//      domain on return);
//   3. wiener_filtering() with `sigma`;
//   4. the inverse sequence on `denoising_patch`.
// `buffer` is the local scratch used by the lane exchange. Returns the
// weight computed by wiener_filtering().
static inline float collaborative_wiener(
    float * __restrict__ denoising_patch, float * __restrict__ ref_patch,
    float sigma, float * __restrict__ buffer,
    sycl::nd_item<2> it
) {

    constexpr int within_vector = 1;                  // step between the values of one vector
    constexpr int across_vectors = within_vector * 8; // step between consecutive vectors

    // forward 3-D transform of the data to be filtered
    #pragma unroll
    for (int pass = 0; pass < 2; ++pass) {
        transform_pack8_interleave4<dct<true>, within_vector, 8, across_vectors>(denoising_patch, buffer);
        transpose_pack8_interleave4<within_vector, 8, across_vectors>(denoising_patch, buffer, it);
    }
    transform_pack8_interleave4<dct<true>, across_vectors, 8, within_vector>(denoising_patch, buffer);

    // forward 3-D transform of the reference
    #pragma unroll
    for (int pass = 0; pass < 2; ++pass) {
        transform_pack8_interleave4<dct<true>, within_vector, 8, across_vectors>(ref_patch, buffer);
        transpose_pack8_interleave4<within_vector, 8, across_vectors>(ref_patch, buffer, it);
    }
    transform_pack8_interleave4<dct<true>, across_vectors, 8, within_vector>(ref_patch, buffer);

    // shrink the spectrum
    const float adaptive_weight = wiener_filtering<within_vector>(
        denoising_patch, ref_patch, sigma, it.get_sub_group()
    );

    // inverse 3-D transform
    #pragma unroll
    for (int pass = 0; pass < 2; ++pass) {
        transform_pack8_interleave4<dct<false>, within_vector, 8, across_vectors>(denoising_patch, buffer);
        transpose_pack8_interleave4<within_vector, 8, across_vectors>(denoising_patch, buffer, it);
    }
    transform_pack8_interleave4<dct<false>, across_vectors, 8, within_vector>(denoising_patch, buffer);

    return adaptive_weight;
}

// BM3D kernel
//
// Work decomposition (from the index arithmetic below): a work-group has 32
// work-items along dimension 1. lane_id / 8 selects one of 4 horizontally
// adjacent reference blocks, lane_id % 8 selects the column of the 8x8 block
// that the work-item loads (8 rows per work-item).
//
// Steps per reference block:
//   1. block matching in the centre frame: the 8 candidates with the lowest
//      squared error are kept, one per lane, sorted ascending by lane;
//   2. (temporal) predictive search in the neighbouring frames around the
//      best `ps_num` positions of the previously searched frame;
//   3. the reference block itself is inserted if it is not among the matches;
//   4. per plane: the 8 matched blocks are gathered, filtered by
//      collaborative_hard() (or collaborative_wiener() when final_), and the
//      weighted result plus the weight are accumulated into `res` atomically.
//
// Template parameters:
//   temporal - use `_radius` neighbouring frames on each side (else radius = 0)
//   chroma   - process 3 planes (sigma, sigma_u, sigma_v) instead of 1
//   final_   - `src` holds two clips; the first guides the Wiener filtering
//              of the second
template <bool temporal=false, bool chroma=false, bool final_=false>
static void bm3d(
    /* shape: [(chroma ? 3 : 1), (2 * radius + 1), 2, height, stride] */
    float * __restrict__ res,
    /* shape: [(final_ ? 2 : 1), (chroma ? 3 : 1), (2 * radius + 1), height, stride] */
    const float * __restrict__ src,
    int width, int height, int stride,
    float sigma, int block_step, int bm_range,
    int _radius, int ps_num, int ps_range,
    [[maybe_unused]] float sigma_u, [[maybe_unused]] float sigma_v,
    float extractor, // used for deterministic summation
    sycl::nd_item<2> it
    #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
    , sycl::local_accessor<float, 1> buffer_accessor
    #endif
) {

    // work-group local scratch (8 rows of smem_stride floats) for the transposes
    #ifdef SYCL_EXT_ONEAPI_LOCAL_MEMORY
    float * buffer = *sycl::ext::oneapi::group_local_memory_for_overwrite<float[8 * smem_stride]>(it.get_group()).get();
    #else
    float * buffer = buffer_accessor.get_pointer().get();
    #endif

    int lane_id = it.get_local_id(1);

    const int sub_lane_id = lane_id % 8; // 0 ~ 7
    // top-left corner of the reference block handled by this group of 8 lanes
    int x = (4 * it.get_group(1) + lane_id / 8) * block_step;
    int y = block_step * it.get_group(0);
    // NOTE(review): x depends on lane_id / 8, so only part of a work-group may
    // leave here. Those work-items never reach the work-group barriers in
    // transpose_pack8_interleave4() nor the later sub-group shuffles; whether
    // that is safe depends on the SYCL implementation -- confirm.
    if (x >= width - 8 + block_step || y >= height - 8 + block_step) {
        return;
    }

    // the last block in each direction is pulled back so it ends at the image edge
    x = sycl::min(x, width - 8);
    y = sycl::min(y, height - 8);

    int radius = 0;
    if constexpr (temporal) {
        radius = _radius;
    }

    // element counts of one frame, one plane and one clip in `src`
    int temporal_stride = height * stride;
    int temporal_width = 2 * radius + 1;
    int plane_stride = temporal_width * temporal_stride;
    int clip_stride = (chroma ? 3 : 1) * temporal_width * temporal_stride;

    // this work-item's column of the reference block, taken from the centre frame
    float current_patch[8];
    const float * const srcpc = &src[radius * temporal_stride + sub_lane_id];

    {
        const float * srcp = &srcpc[y * stride + x];

        #pragma unroll
        for (int i = 0; i < 8; ++i) {
            current_patch[i] = srcp[i * stride];
        }
    }

    // best-match list: lane s of each group of 8 holds the s-th lowest error
    // seen so far together with the position it was found at
    float errors8 = FLT_MAX;
    int index8_x = 0;
    int index8_y = 0;

    // block matching in the centre frame
    {
        // search window, clamped to positions where an 8x8 block fits
        int left = sycl::max(x - bm_range, 0);
        int right = sycl::min(x + bm_range, width - 8);
        int top = sycl::max(y - bm_range, 0);
        int bottom = sycl::min(y + bm_range, height - 8);

        const float * srcp_row = &srcpc[top * stride + left];
        for (int row_i = top; row_i <= bottom; ++row_i) {
            const float * srcp_col = srcp_row;
            for (int col_i = left; col_i <= right; ++col_i) {
                // squared error of this work-item's column, in two partial sums
                float errors[2] { 0.0f };

                const float * srcp = srcp_col;

                #pragma unroll
                for (int i = 0; i < 8; ++i) {
                    float val = current_patch[i] - srcp[i * stride];
                    errors[i % 2] += val * val;
                }

                float error = errors[0] + errors[1];

                auto sub_group = it.get_sub_group();

                // sum the column errors over the 8 lanes -> error of the whole block
                error += sub_group.shuffle_xor(error, 1);
                error += sub_group.shuffle_xor(error, 2);
                error += sub_group.shuffle_xor(error, 4);

                // entry currently held by the lane below (own entry for lane 0)
                auto pre_error = shuffle_up<8>(errors8, 1, sub_group);
                int pre_index_x = shuffle_up<8>(index8_x, 1, sub_group);
                int pre_index_y = shuffle_up<8>(index8_y, 1, sub_group);

                int flag = error < errors8;
                int pre_flag = shuffle_up<8>(flag, 1, sub_group);

                // sorted insertion: the first lane whose entry is worse takes the
                // candidate, every following lane takes its predecessor's old entry
                if (flag) {
                    int first = (sub_lane_id == 0) || (!pre_flag);
                    errors8 = first ? error : pre_error;
                    index8_x = first ? col_i : pre_index_x;
                    index8_y = first ? row_i : pre_index_y;
                }

                ++srcp_col;
            }

            srcp_row += stride;
        }
    }
    // frame index of each match; every match so far is in the centre frame
    [[maybe_unused]] int index8_z = radius;

    // predictive search in the neighbouring frames
    if /* constexpr */ (temporal) {
        int center_index8_x = index8_x;
        int center_index8_y = index8_y;

        // walk backwards (-1) and forwards (+1) in time, starting from the
        // matches of the centre frame each time
        #pragma unroll
        for (int direction = -1; direction <= 1; direction += 2) {
            int last_index8_x = center_index8_x;
            int last_index8_y = center_index8_y;

            for (int t = 1; t <= radius; ++t) {
                /*
                membermask =
                    (((x & -32) >= bm_range + t * ps_range) &&
                     ((x & -32) + bm_range + t * ps_range <= width - 32) &&
                     (y >= bm_range + t * ps_range) &&
                     (y + bm_range + t * ps_range <= height - 8))
                    ? 0xFFFFFFFF
                    : 0xFF << (lane_id & -8);
                */
                int temporal_index = radius + direction * t;
                // best-match list of this frame only (same layout as errors8)
                float frame_errors8 = FLT_MAX;
                int frame_index8_x = 0;
                int frame_index8_y = 0;

                const float * temporal_srcpc = &src[temporal_index * temporal_stride + sub_lane_id];

                // search a (2 * ps_range + 1)^2 window around each of the best
                // `ps_num` positions found in the previously searched frame
                for (int i = 0; i < ps_num; ++i) {
                    auto sub_group = it.get_sub_group();

                    int xx = shuffle<8>(last_index8_x, i, sub_group);
                    int yy = shuffle<8>(last_index8_y, i, sub_group);

                    int left = sycl::max(xx - ps_range, 0);
                    int right = sycl::min(xx + ps_range, width - 8);
                    int top = sycl::max(yy - ps_range, 0);
                    int bottom = sycl::min(yy + ps_range, height - 8);

                    const float * srcp_row = &temporal_srcpc[top * stride + left];
                    for (int row_i = top; row_i <= bottom; ++row_i) {
                        const float * srcp_col = srcp_row;
                        for (int col_i = left; col_i <= right; ++col_i) {
                            float errors[2] { 0.0f };

                            const float * srcp = srcp_col;

                            #pragma unroll
                            for (int j = 0; j < 8; ++j) {
                                float val = current_patch[j] - srcp[j * stride];
                                errors[j % 2] += val * val;
                            }

                            float error = errors[0] + errors[1];

                            error += sub_group.shuffle_xor(error, 1);
                            error += sub_group.shuffle_xor(error, 2);
                            error += sub_group.shuffle_xor(error, 4);

                            float pre_error = shuffle_up<8>(frame_errors8, 1, sub_group);
                            int pre_index_x = shuffle_up<8>(frame_index8_x, 1, sub_group);
                            int pre_index_y = shuffle_up<8>(frame_index8_y, 1, sub_group);

                            int flag = error < frame_errors8;
                            int pre_flag = shuffle_up<8>(flag, 1, sub_group);

                            // same sorted insertion as in the centre frame
                            if (flag) {
                                int first = (sub_lane_id == 0) || (!pre_flag);
                                frame_errors8 = first ? error : pre_error;
                                frame_index8_x = first ? col_i : pre_index_x;
                                frame_index8_y = first ? row_i : pre_index_y;
                            }

                            ++srcp_col;
                        }

                        srcp_row += stride;
                    }
                }

                // merge the best `ps_num` entries of this frame into the overall list
                for (int i = 0; i < ps_num; ++i) {
                    auto sub_group = it.get_sub_group();

                    float tmp_error = shuffle<8>(frame_errors8, i, sub_group);
                    int tmp_x = shuffle<8>(frame_index8_x, i, sub_group);
                    int tmp_y = shuffle<8>(frame_index8_y, i, sub_group);

                    int flag = tmp_error < errors8;
                    int pre_flag = shuffle_up<8>(flag, 1, sub_group);
                    float pre_error = shuffle_up<8>(errors8, 1, sub_group);
                    int pre_index_x = shuffle_up<8>(index8_x, 1, sub_group);
                    int pre_index_y = shuffle_up<8>(index8_y, 1, sub_group);
                    int pre_index_z = shuffle_up<8>(index8_z, 1, sub_group);

                    if (flag) {
                        int first = (sub_lane_id == 0) || (!pre_flag);
                        errors8 = first ? tmp_error : pre_error;
                        index8_x = first ? tmp_x : pre_index_x;
                        index8_y = first ? tmp_y : pre_index_y;
                        index8_z = first ? temporal_index : pre_index_z;
                    }
                }

                // the next frame in this direction is searched around this frame's matches
                last_index8_x = frame_index8_x;
                last_index8_y = frame_index8_y;
            }
        }
    }

    // insert center block
    {
        // does this lane's entry point at the reference block itself?
        int flag;
        if constexpr (temporal) {
            flag = index8_x == x && index8_y == y && index8_z == radius;
        } else {
            flag = index8_x == x && index8_y == y;
        }

        auto sub_group = it.get_sub_group();

        // number of lanes (out of 8) whose entry is the reference block
        flag += sub_group.shuffle_xor(flag, 1);
        flag += sub_group.shuffle_xor(flag, 2);
        flag += sub_group.shuffle_xor(flag, 4);

        float pre_error = shuffle_up<8>(errors8, 1, sub_group);
        int pre_index_x = shuffle_up<8>(index8_x, 1, sub_group);
        int pre_index_y = shuffle_up<8>(index8_y, 1, sub_group);
        [[maybe_unused]] int pre_index_z;
        if constexpr (temporal) {
            pre_index_z = shuffle_up<8>(index8_z, 1, sub_group);
        }
        // not present: shift the list up by one lane (dropping the last entry)
        // and put the reference block with error 0 into lane 0
        if (!flag) {
            int first = (sub_lane_id == 0);
            errors8 = first ? 0.0f : pre_error;
            index8_x = first ? x : pre_index_x;
            index8_y = first ? y : pre_index_y;
            if constexpr (temporal) {
                index8_z = first ? radius : pre_index_z;
            }
        }
    }

    // per work-item storage: element [i * 8 + j] is row j of matched block i,
    // in the column owned by this work-item
    float denoising_patch[64];
    [[maybe_unused]] float ref_patch[64];

    int num_planes = 1;
    if constexpr (chroma) {
        num_planes = 3;
    }
    #pragma unroll
    for (int plane = 0; plane < num_planes; ++plane) {
        // planes 1 and 2 use their own noise level
        if (plane == 1) {
            sigma = sigma_u;
        } else if (plane == 2) {
            sigma = sigma_v;
        }

        // a plane with (near) zero sigma is skipped; only the plane pointers advance
        if constexpr (chroma) {
            if (sigma < FLT_EPSILON) {
                src += plane_stride;
                res += plane_stride * 2;
                continue;
            }
        }

        float adaptive_weight;
        if /* constexpr */ (final_) {
            // gather the matched blocks from both clips: refp reads the first
            // clip of `src`, srcp the same position clip_stride floats further
            #pragma unroll
            for (int i = 0; i < 8; ++i) {
                auto sub_group = it.get_sub_group();

                // position of the i-th match, broadcast from lane i of the group
                int tmp_x = shuffle<8>(index8_x, i, sub_group);
                int tmp_y = shuffle<8>(index8_y, i, sub_group);
                const float * refp;
                if constexpr (temporal) {
                    int tmp_z = shuffle<8>(index8_z, i, sub_group);
                    refp = &src[tmp_z * temporal_stride + tmp_y * stride + tmp_x + sub_lane_id];
                } else {
                    refp = &src[tmp_y * stride + tmp_x + sub_lane_id];
                }
                const float * srcp = &refp[clip_stride];

                #pragma unroll
                for (int j = 0; j < 8; ++j) {
                    ref_patch[i * 8 + j] = refp[j * stride];
                    denoising_patch[i * 8 + j] = srcp[j * stride];
                }
            }

            adaptive_weight = collaborative_wiener(denoising_patch, ref_patch, sigma, buffer, it);
        } else {
            // gather the matched blocks from the (single) clip
            #pragma unroll
            for (int i = 0; i < 8; ++i) {
                auto sub_group = it.get_sub_group();

                // position of the i-th match, broadcast from lane i of the group
                int tmp_x = shuffle<8>(index8_x, i, sub_group);
                int tmp_y = shuffle<8>(index8_y, i, sub_group);
                const float * srcp;
                if constexpr (temporal) {
                    int tmp_z = shuffle<8>(index8_z, i, sub_group);
                    srcp = &src[tmp_z * temporal_stride + tmp_y * stride + tmp_x + sub_lane_id];
                } else {
                    srcp = &src[tmp_y * stride + tmp_x + sub_lane_id];
                }

                #pragma unroll
                for (int j = 0; j < 8; ++j) {
                    denoising_patch[i * 8 + j] = srcp[j * stride];
                }
            }

            adaptive_weight = collaborative_hard(denoising_patch, sigma, buffer, it);
        }

        // per frame, `res` stores the weighted-value image followed by the
        // weight image (temporal_stride floats later)
        float * const wdstpc = &res[sub_lane_id];
        float * const weightpc = &res[temporal_stride + sub_lane_id];

        // scatter every filtered block back to the position it was taken from
        #pragma unroll
        for (int i = 0; i < 8; ++i) {
            auto sub_group = it.get_sub_group();

            int tmp_x = shuffle<8>(index8_x, i, sub_group);
            int tmp_y = shuffle<8>(index8_y, i, sub_group);
            int offset;
            if constexpr (temporal) {
                int tmp_z = shuffle<8>(index8_z, i, sub_group);
                // a frame occupies 2 * temporal_stride floats in `res`
                offset = tmp_z * 2 * temporal_stride + tmp_y * stride + tmp_x;
            } else {
                offset = tmp_y * stride + tmp_x;
            }

            float * wdstp = &wdstpc[offset];
            float * weightp = &weightpc[offset];

            #pragma unroll
            for (int j = 0; j < 8; ++j) {
                float wdst_val = adaptive_weight * denoising_patch[i * 8 + j];
                float weight_val = adaptive_weight;

                // pre-rounding
                // (adding and subtracting `extractor` is what the parameter
                // comment refers to as making the summation deterministic)
                wdst_val = (wdst_val + extractor) - extractor;
                weight_val = (weight_val + extractor) - extractor;

                // different work-items may hit the same pixel, hence the atomics
                auto wdst = sycl::atomic_ref<
                    float,
                    sycl::memory_order::relaxed,
                    sycl::memory_scope::device,
                    sycl::access::address_space::global_space
                >(wdstp[j * stride]);
                wdst.fetch_add(wdst_val);

                auto weight = sycl::atomic_ref<
                    float,
                    sycl::memory_order::relaxed,
                    sycl::memory_scope::device,
                    sycl::access::address_space::global_space
                >(weightp[j * stride]);
                weight.fetch_add(weight_val);
            }
        }

        // advance to the next plane
        src += plane_stride;
        res += plane_stride * 2;
    }
}

sycl::event launch(
    float * d_res, float * d_src, float * h_res,
    int width, int height, int stride,
    float sigma, int block_step, int bm_range,
    int radius, int ps_num, int ps_range,
    bool chroma, float sigma_u, float sigma_v, bool final_,
    float extractor,
    sycl::queue & stream
) {

    size_t pitch { stride * sizeof(float) };
    int temporal_width { 2 * radius + 1 };
    int num_planes { chroma ? 3 : 1 };

    auto memcpy_h_to_d_node = stream.memcpy(
        d_src,
        h_res,
        (final_ ? 2 : 1) * num_planes * temporal_width * height * pitch
    );

    auto memset_node = stream.memset(
        d_res,
        0,
        num_planes * temporal_width * 2 * height * pitch
    );

    auto kernel_node = stream.submit([&](sycl::handler & h) {
        h.depends_on(memcpy_h_to_d_node);
        h.depends_on(memset_node);

        sycl::range<2> block_dims { 1, 32 };
        sycl::range<2> grid_dims {
            static_cast<size_t>((height + (block_step - 1)) / block_step * block_dims[0]),
            static_cast<size_t>((width + (4 * block_step - 1)) / (4 * block_step) * block_dims[1])
        };

        #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
        sycl::local_accessor<float, 1> buffer_accessor(8 * smem_stride, h);
        #endif

        if (radius) {
            if (chroma) {
                if (final_) {
                    auto bm3d_kernel = [=](sycl::nd_item<2> it)
                        [[sycl::reqd_work_group_size(1, 32)]]
                        [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]
                        #if defined SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT && SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT
                        [[intel::kernel_args_restrict]]
                        #endif
                    {
                        bm3d<true, true, true>(
                            d_res, d_src,
                            width, height, stride,
                            sigma, block_step, bm_range,
                            radius, ps_num, ps_range,
                            sigma_u, sigma_v, extractor,
                            it
                            #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
                            , buffer_accessor
                            #endif
                        );
                    };

                    h.parallel_for(sycl::nd_range { grid_dims, block_dims }, bm3d_kernel);
                } else {
                    auto bm3d_kernel = [=](sycl::nd_item<2> it)
                        [[sycl::reqd_work_group_size(1, 32)]]
                        [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]
                        #if defined SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT && SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT
                        [[intel::kernel_args_restrict]]
                        #endif
                    {
                        bm3d<true, true, false>(
                            d_res, d_src,
                            width, height, stride,
                            sigma, block_step, bm_range,
                            radius, ps_num, ps_range,
                            sigma_u, sigma_v, extractor,
                            it
                            #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
                            , buffer_accessor
                            #endif
                        );
                    };

                    h.parallel_for(sycl::nd_range { grid_dims, block_dims }, bm3d_kernel);
                }
            } else {
                if (final_) {
                    auto bm3d_kernel = [=](sycl::nd_item<2> it)
                        [[sycl::reqd_work_group_size(1, 32)]]
                        [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]
                        #if defined SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT && SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT
                        [[intel::kernel_args_restrict]]
                        #endif
                    {
                        bm3d<true, false, true>(
                            d_res, d_src,
                            width, height, stride,
                            sigma, block_step, bm_range,
                            radius, ps_num, ps_range,
                            sigma_u, sigma_v, extractor,
                            it
                            #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
                            , buffer_accessor
                            #endif
                        );
                    };

                    h.parallel_for(sycl::nd_range { grid_dims, block_dims }, bm3d_kernel);
                } else {
                    auto bm3d_kernel = [=](sycl::nd_item<2> it)
                        [[sycl::reqd_work_group_size(1, 32)]]
                        [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]
                        #if defined SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT && SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT
                        [[intel::kernel_args_restrict]]
                        #endif
                    {
                        bm3d<true, false, false>(
                            d_res, d_src,
                            width, height, stride,
                            sigma, block_step, bm_range,
                            radius, ps_num, ps_range,
                            sigma_u, sigma_v, extractor,
                            it
                            #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
                            , buffer_accessor
                            #endif
                        );
                    };

                    h.parallel_for(sycl::nd_range { grid_dims, block_dims }, bm3d_kernel);
                }
            }
        } else {
            if (chroma) {
                if (final_) {
                    auto bm3d_kernel = [=](sycl::nd_item<2> it)
                        [[sycl::reqd_work_group_size(1, 32)]]
                        [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]
                        #if defined SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT && SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT
                        [[intel::kernel_args_restrict]]
                        #endif
                    {
                        bm3d<false, true, true>(
                            d_res, d_src,
                            width, height, stride,
                            sigma, block_step, bm_range,
                            radius, ps_num, ps_range,
                            sigma_u, sigma_v, extractor,
                            it
                            #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
                            , buffer_accessor
                            #endif
                        );
                    };

                    h.parallel_for(sycl::nd_range { grid_dims, block_dims }, bm3d_kernel);
                } else {
                    auto bm3d_kernel = [=](sycl::nd_item<2> it)
                        [[sycl::reqd_work_group_size(1, 32)]]
                        [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]
                        #if defined SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT && SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT
                        [[intel::kernel_args_restrict]]
                        #endif
                    {
                        bm3d<false, true, false>(
                            d_res, d_src,
                            width, height, stride,
                            sigma, block_step, bm_range,
                            radius, ps_num, ps_range,
                            sigma_u, sigma_v, extractor,
                            it
                            #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
                            , buffer_accessor
                            #endif
                        );
                    };

                    h.parallel_for(sycl::nd_range { grid_dims, block_dims }, bm3d_kernel);
                }
            } else {
                if (final_) {
                    auto bm3d_kernel = [=](sycl::nd_item<2> it)
                        [[sycl::reqd_work_group_size(1, 32)]]
                        [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]
                        #if defined SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT && SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT
                        [[intel::kernel_args_restrict]]
                        #endif
                    {
                        bm3d<false, false, true>(
                            d_res, d_src,
                            width, height, stride,
                            sigma, block_step, bm_range,
                            radius, ps_num, ps_range,
                            sigma_u, sigma_v, extractor,
                            it
                            #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
                            , buffer_accessor
                            #endif
                        );
                    };

                    h.parallel_for(sycl::nd_range { grid_dims, block_dims }, bm3d_kernel);
                } else {
                    auto bm3d_kernel = [=](sycl::nd_item<2> it)
                        [[sycl::reqd_work_group_size(1, 32)]]
                        [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]]
                        #if defined SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT && SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT
                        [[intel::kernel_args_restrict]]
                        #endif
                    {
                        bm3d<false, false, false>(
                            d_res, d_src,
                            width, height, stride,
                            sigma, block_step, bm_range,
                            radius, ps_num, ps_range,
                            sigma_u, sigma_v, extractor,
                            it
                            #ifndef SYCL_EXT_ONEAPI_LOCAL_MEMORY
                            , buffer_accessor
                            #endif
                        );
                    };

                    h.parallel_for(sycl::nd_range { grid_dims, block_dims }, bm3d_kernel);
                }
            }
        }
    });

    // Submit the device-to-host copy of the results (d_res -> h_res) as a
    // separate command group. The event returned by submit() is kept so it
    // can be handed back to the caller.
    auto memcpy_d_to_h_node = stream.submit([&](sycl::handler & h) {
        // Do not start the copy until the command behind kernel_node has completed.
        h.depends_on(kernel_node);

        // handler::memcpy takes its size argument in bytes.
        // NOTE(review): presumably `pitch` is the row size in bytes, which would
        // make this product match the DtoH shape
        // [(chroma ? 3 : 1), (2 * radius + 1), 2, height, stride] documented on
        // the launch() declaration -- confirm where num_planes, temporal_width
        // and pitch are defined.
        h.memcpy(h_res, d_res, num_planes * temporal_width * 2 * height * pitch);
    });

    return memcpy_d_to_h_node;
}