Program Listing for File switchkey.cu

↰ Return to documentation for file (src/lib/kernel/switchkey.cu)
// Copyright 2024-2026 Alişah Özcan
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
// Developer: Alişah Özcan

#include <heongpu/kernel/switchkey.cuh>

namespace heongpu
{

    __global__ void cipher_broadcast_kernel(Data64* input, Data64* output,
                                            Modulus64* modulus, int n_power,
                                            int rns_mod_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count

        int location = (rns_mod_count * block_y) << n_power;
        Data64 input_ = input[idx + (block_y << n_power)];
        Data64 one_ = 1;
#pragma unroll
        for (int i = 0; i < rns_mod_count; i++)
        {
            output[idx + (i << n_power) + location] =
                OPERATOR_GPU_64::mult(one_, input_, modulus[i]);
        }
    }

    __global__ void
    cipher_broadcast_leveled_kernel(Data64* input, Data64* output,
                                    Modulus64* modulus, int first_rns_mod_count,
                                    int current_rns_mod_count, int n_power)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Current Decomposition Modulus Count

        int location = (current_rns_mod_count * block_y) << n_power;

        Data64 input_ = input[idx + (block_y << n_power)];
        int level = first_rns_mod_count - current_rns_mod_count;
#pragma unroll
        for (int i = 0; i < current_rns_mod_count; i++)
        {
            int mod_index;
            if (i < gridDim.y)
            {
                mod_index = i;
            }
            else
            {
                mod_index = i + level;
            }

            Data64 result =
                OPERATOR_GPU_64::reduce_forced(input_, modulus[mod_index]);

            output[idx + (i << n_power) + location] = result;
        }
    }

    __global__ void keyswitch_multiply_accumulate_kernel(
        Data64* input, const Data64* __restrict__ relinkey, Data64* output,
        Modulus64* modulus, int n_power, int Q_tilda_size, int iteration_count1,
        int iteration_count2)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // RNS Modulus Count

        int Q_tilda_size_reg = Q_tilda_size;
        int key_offset1 = Q_tilda_size_reg << n_power;
        int key_offset2 = Q_tilda_size_reg << (n_power + 1);
        int index = idx + (block_y << n_power);

        Modulus64 modulus_reg = modulus[block_y];

        Data64 ct_0_sum = 0;
        Data64 ct_1_sum = 0;
#pragma unroll
        for (int i = 0; i < iteration_count1; i++)
        {
            Data64 in_piece1 =
                input[index + (((4 * i) * Q_tilda_size_reg) << n_power)];
            Data64 rk0_1 = __ldg(&relinkey[index + (key_offset2 * (4 * i))]);
            Data64 rk1_1 =
                __ldg(&relinkey[index + (key_offset2 * (4 * i)) + key_offset1]);

            Data64 in_piece2 =
                input[index + ((((4 * i) + 1) * Q_tilda_size_reg) << n_power)];
            Data64 rk0_2 =
                __ldg(&relinkey[index + (key_offset2 * ((4 * i) + 1))]);
            Data64 rk1_2 = __ldg(
                &relinkey[index + (key_offset2 * ((4 * i) + 1)) + key_offset1]);

            Data64 in_piece3 =
                input[index + ((((4 * i) + 2) * Q_tilda_size_reg) << n_power)];
            Data64 rk0_3 =
                __ldg(&relinkey[index + (key_offset2 * ((4 * i) + 2))]);
            Data64 rk1_3 = __ldg(
                &relinkey[index + (key_offset2 * ((4 * i) + 2)) + key_offset1]);

            Data64 in_piece4 =
                input[index + ((((4 * i) + 3) * Q_tilda_size_reg) << n_power)];
            Data64 rk0_4 =
                __ldg(&relinkey[index + (key_offset2 * ((4 * i) + 3))]);
            Data64 rk1_4 = __ldg(
                &relinkey[index + (key_offset2 * ((4 * i) + 3)) + key_offset1]);

            Data64 mult0_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk0_1, modulus_reg);
            Data64 mult1_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk1_1, modulus_reg);

            Data64 mult0_2 =
                OPERATOR_GPU_64::mult(in_piece2, rk0_2, modulus_reg);
            Data64 mult1_2 =
                OPERATOR_GPU_64::mult(in_piece2, rk1_2, modulus_reg);

            Data64 mult0_3 =
                OPERATOR_GPU_64::mult(in_piece3, rk0_3, modulus_reg);
            Data64 mult1_3 =
                OPERATOR_GPU_64::mult(in_piece3, rk1_3, modulus_reg);

            Data64 mult0_4 =
                OPERATOR_GPU_64::mult(in_piece4, rk0_4, modulus_reg);
            Data64 mult1_4 =
                OPERATOR_GPU_64::mult(in_piece4, rk1_4, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_1, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_1, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_2, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_2, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_3, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_3, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_4, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_4, modulus_reg);
        }

        int loop_offset = iteration_count1 * 4;
#pragma unroll
        for (int i = loop_offset; i < loop_offset + iteration_count2; i++)
        {
            Data64 in_piece1 =
                input[index + ((i * Q_tilda_size_reg) << n_power)];
            Data64 rk0_1 = __ldg(&relinkey[index + (key_offset2 * i)]);
            Data64 rk1_1 =
                __ldg(&relinkey[index + (key_offset2 * i) + key_offset1]);

            Data64 mult0_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk0_1, modulus_reg);
            Data64 mult1_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk1_1, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_1, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_1, modulus_reg);
        }

        output[index] = ct_0_sum;
        output[index + key_offset1] = ct_1_sum;
    }

    __global__ void keyswitch_multiply_accumulate_leveled_kernel(
        Data64* input, const Data64* __restrict__ relinkey, Data64* output,
        Modulus64* modulus, int first_rns_mod_count,
        int current_decomp_mod_count, int iteration_count1,
        int iteration_count2, int n_power)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // RNS Modulus Count

        int key_index = (block_y == current_decomp_mod_count)
                            ? (first_rns_mod_count - 1)
                            : block_y;

        int current_decomp_mod_count_plus_one = current_decomp_mod_count + 1;
        int key_offset1 = first_rns_mod_count << n_power;
        int key_offset2 = first_rns_mod_count << (n_power + 1);
        int index = idx + (block_y << n_power);

        Modulus64 modulus_reg = modulus[key_index];

        Data64 ct_0_sum = 0;
        Data64 ct_1_sum = 0;
#pragma unroll
        for (int i = 0; i < iteration_count1; i++)
        {
            Data64 in_piece1 =
                input[index + (((4 * i) * current_decomp_mod_count_plus_one)
                               << n_power)];
            Data64 rk0_1 = __ldg(&relinkey[idx + (key_index << n_power) +
                                           (key_offset2 * (4 * i))]);
            Data64 rk1_1 =
                __ldg(&relinkey[idx + (key_index << n_power) +
                                (key_offset2 * (4 * i)) + key_offset1]);

            Data64 in_piece2 =
                input[index +
                      ((((4 * i) + 1) * current_decomp_mod_count_plus_one)
                       << n_power)];
            Data64 rk0_2 = __ldg(&relinkey[idx + (key_index << n_power) +
                                           (key_offset2 * ((4 * i) + 1))]);
            Data64 rk1_2 =
                __ldg(&relinkey[idx + (key_index << n_power) +
                                (key_offset2 * ((4 * i) + 1)) + key_offset1]);

            Data64 in_piece3 =
                input[index +
                      ((((4 * i) + 2) * current_decomp_mod_count_plus_one)
                       << n_power)];
            Data64 rk0_3 = __ldg(&relinkey[idx + (key_index << n_power) +
                                           (key_offset2 * ((4 * i) + 2))]);
            Data64 rk1_3 =
                __ldg(&relinkey[idx + (key_index << n_power) +
                                (key_offset2 * ((4 * i) + 2)) + key_offset1]);

            Data64 in_piece4 =
                input[index +
                      ((((4 * i) + 3) * current_decomp_mod_count_plus_one)
                       << n_power)];
            Data64 rk0_4 = __ldg(&relinkey[idx + (key_index << n_power) +
                                           (key_offset2 * ((4 * i) + 3))]);
            Data64 rk1_4 =
                __ldg(&relinkey[idx + (key_index << n_power) +
                                (key_offset2 * ((4 * i) + 3)) + key_offset1]);

            Data64 mult0_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk0_1, modulus_reg);
            Data64 mult1_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk1_1, modulus_reg);

            Data64 mult0_2 =
                OPERATOR_GPU_64::mult(in_piece2, rk0_2, modulus_reg);
            Data64 mult1_2 =
                OPERATOR_GPU_64::mult(in_piece2, rk1_2, modulus_reg);

            Data64 mult0_3 =
                OPERATOR_GPU_64::mult(in_piece3, rk0_3, modulus_reg);
            Data64 mult1_3 =
                OPERATOR_GPU_64::mult(in_piece3, rk1_3, modulus_reg);

            Data64 mult0_4 =
                OPERATOR_GPU_64::mult(in_piece4, rk0_4, modulus_reg);
            Data64 mult1_4 =
                OPERATOR_GPU_64::mult(in_piece4, rk1_4, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_1, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_1, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_2, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_2, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_3, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_3, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_4, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_4, modulus_reg);
        }

        int loop_offset = iteration_count1 * 4;
#pragma unroll
        for (int i = loop_offset; i < loop_offset + iteration_count2; i++)
        {
            Data64 in_piece1 =
                input[index +
                      ((i * current_decomp_mod_count_plus_one) << n_power)];
            Data64 rk0_1 = __ldg(
                &relinkey[idx + (key_index << n_power) + (key_offset2 * i)]);
            Data64 rk1_1 = __ldg(&relinkey[idx + (key_index << n_power) +
                                           (key_offset2 * i) + key_offset1]);

            Data64 mult0_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk0_1, modulus_reg);
            Data64 mult1_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk1_1, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_1, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_1, modulus_reg);
        }

        output[index] = ct_0_sum;
        output[index + (current_decomp_mod_count_plus_one << n_power)] =
            ct_1_sum;
    }

    __global__ void keyswitch_multiply_accumulate_leveled_method_II_kernel(
        Data64* input, const Data64* __restrict__ relinkey, Data64* output,
        Modulus64* modulus, int first_rns_mod_count,
        int current_decomp_mod_count, int current_rns_mod_count,
        int iteration_count1, int iteration_count2, int level, int n_power)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // RNS Modulus Count

        int key_index =
            (block_y < current_decomp_mod_count) ? block_y : (block_y + level);

        int key_offset1 = first_rns_mod_count << n_power;
        int key_offset2 = first_rns_mod_count << (n_power + 1);
        int index = idx + (block_y << n_power);
        int index2 = idx + (key_index << n_power);

        Modulus64 modulus_reg = modulus[key_index];

        Data64 ct_0_sum = 0;
        Data64 ct_1_sum = 0;
#pragma unroll
        for (int i = 0; i < iteration_count1; i++)
        {
            Data64 in_piece1 =
                input[index + (((4 * i) * (current_rns_mod_count)) << n_power)];
            Data64 rk0_1 = __ldg(&relinkey[index2 + (key_offset2 * (4 * i))]);
            Data64 rk1_1 = __ldg(
                &relinkey[index2 + (key_offset2 * (4 * i)) + key_offset1]);

            Data64 in_piece2 =
                input[index +
                      ((((4 * i) + 1) * (current_rns_mod_count)) << n_power)];
            Data64 rk0_2 =
                __ldg(&relinkey[index2 + (key_offset2 * ((4 * i) + 1))]);
            Data64 rk1_2 =
                __ldg(&relinkey[index2 + (key_offset2 * ((4 * i) + 1)) +
                                key_offset1]);

            Data64 in_piece3 =
                input[index +
                      ((((4 * i) + 2) * (current_rns_mod_count)) << n_power)];
            Data64 rk0_3 =
                __ldg(&relinkey[index2 + (key_offset2 * ((4 * i) + 2))]);
            Data64 rk1_3 =
                __ldg(&relinkey[index2 + (key_offset2 * ((4 * i) + 2)) +
                                key_offset1]);

            Data64 in_piece4 =
                input[index +
                      ((((4 * i) + 3) * (current_rns_mod_count)) << n_power)];
            Data64 rk0_4 =
                __ldg(&relinkey[index2 + (key_offset2 * ((4 * i) + 3))]);
            Data64 rk1_4 =
                __ldg(&relinkey[index2 + (key_offset2 * ((4 * i) + 3)) +
                                key_offset1]);

            Data64 mult0_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk0_1, modulus_reg);
            Data64 mult1_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk1_1, modulus_reg);

            Data64 mult0_2 =
                OPERATOR_GPU_64::mult(in_piece2, rk0_2, modulus_reg);
            Data64 mult1_2 =
                OPERATOR_GPU_64::mult(in_piece2, rk1_2, modulus_reg);

            Data64 mult0_3 =
                OPERATOR_GPU_64::mult(in_piece3, rk0_3, modulus_reg);
            Data64 mult1_3 =
                OPERATOR_GPU_64::mult(in_piece3, rk1_3, modulus_reg);

            Data64 mult0_4 =
                OPERATOR_GPU_64::mult(in_piece4, rk0_4, modulus_reg);
            Data64 mult1_4 =
                OPERATOR_GPU_64::mult(in_piece4, rk1_4, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_1, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_1, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_2, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_2, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_3, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_3, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_4, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_4, modulus_reg);
        }

        int loop_offset = iteration_count1 * 4;
#pragma unroll
        for (int i = loop_offset; i < loop_offset + iteration_count2; i++)
        {
            Data64 in_piece1 =
                input[index + ((i * (current_rns_mod_count)) << n_power)];
            Data64 rk0_1 = __ldg(&relinkey[index2 + (key_offset2 * i)]);
            Data64 rk1_1 =
                __ldg(&relinkey[index2 + (key_offset2 * i) + key_offset1]);

            Data64 mult0_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk0_1, modulus_reg);
            Data64 mult1_1 =
                OPERATOR_GPU_64::mult(in_piece1, rk1_1, modulus_reg);

            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0_1, modulus_reg);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1_1, modulus_reg);
        }

        output[index] = ct_0_sum;
        output[index + (current_rns_mod_count << n_power)] = ct_1_sum;
    }

    __global__ void divide_round_lastq_kernel(Data64* input, Data64* ct,
                                              Data64* output,
                                              Modulus64* modulus, Data64* half,
                                              Data64* half_mod,
                                              Data64* last_q_modinv,
                                              int n_power, int decomp_mod_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        Data64 last_ct = input[idx + (decomp_mod_count << n_power) +
                               (((decomp_mod_count + 1) << n_power) * block_z)];

        last_ct =
            OPERATOR_GPU_64::add(last_ct, half[0], modulus[decomp_mod_count]);

        last_ct = OPERATOR_GPU_64::reduce_forced(last_ct, modulus[block_y]);

        last_ct =
            OPERATOR_GPU_64::sub(last_ct, half_mod[block_y], modulus[block_y]);

        Data64 input_ = input[idx + (block_y << n_power) +
                              (((decomp_mod_count + 1) << n_power) * block_z)];

        input_ = OPERATOR_GPU_64::sub(input_, last_ct, modulus[block_y]);

        input_ = OPERATOR_GPU_64::mult(input_, last_q_modinv[block_y],
                                       modulus[block_y]);

        Data64 ct_in = ct[idx + (block_y << n_power) +
                          (((decomp_mod_count) << n_power) * block_z)];

        ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

        output[idx + (block_y << n_power) +
               (((decomp_mod_count) << n_power) * block_z)] = ct_in;
    }

    __global__ void divide_round_lastq_switchkey_kernel(
        Data64* input, Data64* ct, Data64* output, Modulus64* modulus,
        Data64* half, Data64* half_mod, Data64* last_q_modinv, int n_power,
        int decomp_mod_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        Data64 last_ct = input[idx + (decomp_mod_count << n_power) +
                               (((decomp_mod_count + 1) << n_power) * block_z)];

        last_ct =
            OPERATOR_GPU_64::add(last_ct, half[0], modulus[decomp_mod_count]);

        last_ct = OPERATOR_GPU_64::reduce_forced(last_ct, modulus[block_y]);

        last_ct =
            OPERATOR_GPU_64::sub(last_ct, half_mod[block_y], modulus[block_y]);

        Data64 input_ = input[idx + (block_y << n_power) +
                              (((decomp_mod_count + 1) << n_power) * block_z)];

        input_ = OPERATOR_GPU_64::sub(input_, last_ct, modulus[block_y]);

        input_ = OPERATOR_GPU_64::mult(input_, last_q_modinv[block_y],
                                       modulus[block_y]);

        Data64 ct_in = 0ULL;
        if (block_z == 0)
        {
            ct_in = ct[idx + (block_y << n_power) +
                       (((decomp_mod_count) << n_power) * block_z)];
        }

        ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

        output[idx + (block_y << n_power) +
               (((decomp_mod_count) << n_power) * block_z)] = ct_in;
    }

    __global__ void divide_round_lastq_extended_kernel(
        Data64* input, Data64* ct, Data64* output, Modulus64* modulus,
        Data64* half, Data64* half_mod, Data64* last_q_modinv, int n_power,
        int Q_prime_size, int Q_size, int P_size)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count (Q_size)
        int block_z = blockIdx.z; // Cipher Size (2)

        // Max P size is 15.
        Data64 last_ct[15];
        for (int i = 0; i < P_size; i++)
        {
            last_ct[i] = input[idx + ((Q_size + i) << n_power) +
                               ((Q_prime_size << n_power) * block_z)];
        }

        Data64 input_ = input[idx + (block_y << n_power) +
                              ((Q_prime_size << n_power) * block_z)];

        int location_ = 0;
        for (int i = 0; i < P_size; i++)
        {
            Data64 last_ct_add_half_ = last_ct[(P_size - 1 - i)];
            last_ct_add_half_ = OPERATOR_GPU_64::add(
                last_ct_add_half_, half[i], modulus[(Q_prime_size - 1 - i)]);
            for (int j = 0; j < (P_size - 1 - i); j++)
            {
                Data64 temp1 = OPERATOR_GPU_64::reduce_forced(
                    last_ct_add_half_, modulus[Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(temp1,
                                             half_mod[location_ + Q_size + j],
                                             modulus[Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(last_ct[j], temp1,
                                             modulus[Q_size + j]);

                last_ct[j] = OPERATOR_GPU_64::mult(
                    temp1, last_q_modinv[location_ + Q_size + j],
                    modulus[Q_size + j]);
            }

            Data64 temp1 = OPERATOR_GPU_64::reduce_forced(last_ct_add_half_,
                                                          modulus[block_y]);
            temp1 = OPERATOR_GPU_64::sub(temp1, half_mod[location_ + block_y],
                                         modulus[block_y]);

            temp1 = OPERATOR_GPU_64::sub(input_, temp1, modulus[block_y]);

            input_ = OPERATOR_GPU_64::mult(
                temp1, last_q_modinv[location_ + block_y], modulus[block_y]);

            location_ = location_ + (Q_prime_size - 1 - i);
        }

        Data64 ct_in =
            ct[idx + (block_y << n_power) + (((Q_size) << n_power) * block_z)];

        ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

        output[idx + (block_y << n_power) + (((Q_size) << n_power) * block_z)] =
            ct_in;
    }

    __global__ void divide_round_lastq_extended_switchkey_kernel(
        Data64* input, Data64* ct, Data64* output, Modulus64* modulus,
        Data64* half, Data64* half_mod, Data64* last_q_modinv, int n_power,
        int Q_prime_size, int Q_size, int P_size)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count (Q_size)
        int block_z = blockIdx.z; // Cipher Size (2)

        // Max P size is 15.
        Data64 last_ct[15];
        for (int i = 0; i < P_size; i++)
        {
            last_ct[i] = input[idx + ((Q_size + i) << n_power) +
                               ((Q_prime_size << n_power) * block_z)];
        }

        Data64 input_ = input[idx + (block_y << n_power) +
                              ((Q_prime_size << n_power) * block_z)];

        int location_ = 0;
        for (int i = 0; i < P_size; i++)
        {
            Data64 last_ct_add_half_ = last_ct[(P_size - 1 - i)];
            last_ct_add_half_ = OPERATOR_GPU_64::add(
                last_ct_add_half_, half[i], modulus[(Q_prime_size - 1 - i)]);
            for (int j = 0; j < (P_size - 1 - i); j++)
            {
                Data64 temp1 = OPERATOR_GPU_64::reduce_forced(
                    last_ct_add_half_, modulus[Q_size + j]);
                temp1 = OPERATOR_GPU_64::sub(temp1,
                                             half_mod[location_ + Q_size + j],
                                             modulus[Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(last_ct[j], temp1,
                                             modulus[Q_size + j]);

                last_ct[j] = OPERATOR_GPU_64::mult(
                    temp1, last_q_modinv[location_ + Q_size + j],
                    modulus[Q_size + j]);
            }

            Data64 temp1 = OPERATOR_GPU_64::reduce_forced(last_ct_add_half_,
                                                          modulus[block_y]);
            temp1 = OPERATOR_GPU_64::sub(temp1, half_mod[location_ + block_y],
                                         modulus[block_y]);

            temp1 = OPERATOR_GPU_64::sub(input_, temp1, modulus[block_y]);

            input_ = OPERATOR_GPU_64::mult(
                temp1, last_q_modinv[location_ + block_y], modulus[block_y]);

            location_ = location_ + (Q_prime_size - 1 - i);
        }

        Data64 ct_in = 0ULL;
        if (block_z == 0)
        {
            ct_in = ct[idx + (block_y << n_power) +
                       (((Q_size) << n_power) * block_z)];
        }

        ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

        output[idx + (block_y << n_power) + (((Q_size) << n_power) * block_z)] =
            ct_in;
    }

    __global__ void DivideRoundLastqNewP_leveled(
        Data64* input, Data64* ct, Data64* output, Modulus64* modulus,
        Data64* half, Data64* half_mod, Data64* last_q_modinv, int n_power,
        int Q_prime_size, int Q_size, int P_size)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count (Q_size)
        int block_z = blockIdx.z; // Cipher Size (2)

        // Max P size is 15.
        Data64 last_ct[15];
        for (int i = 0; i < P_size; i++)
        {
            last_ct[i] = input[idx + ((Q_size + i) << n_power) +
                               ((Q_prime_size << n_power) * block_z)];
        }

        Data64 input_ = input[idx + (block_y << n_power) +
                              ((Q_prime_size << n_power) * block_z)];

        int location_ = 0;
        for (int i = 0; i < P_size; i++)
        {
            Data64 last_ct_add_half_ = last_ct[(P_size - 1 - i)];
            last_ct_add_half_ = OPERATOR_GPU_64::add(
                last_ct_add_half_, half[i], modulus[(Q_prime_size - 1 - i)]);
            for (int j = 0; j < (P_size - 1 - i); j++)
            {
                Data64 temp1 = OPERATOR_GPU_64::reduce_forced(
                    last_ct_add_half_, modulus[Q_size + j]);
                temp1 = OPERATOR_GPU_64::sub(temp1,
                                             half_mod[location_ + Q_size + j],
                                             modulus[Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(last_ct[j], temp1,
                                             modulus[Q_size + j]);

                last_ct[j] = OPERATOR_GPU_64::mult(
                    temp1, last_q_modinv[location_ + Q_size + j],
                    modulus[Q_size + j]);
            }

            Data64 temp1 = OPERATOR_GPU_64::reduce_forced(last_ct_add_half_,
                                                          modulus[block_y]);
            temp1 = OPERATOR_GPU_64::sub(temp1, half_mod[location_ + block_y],
                                         modulus[block_y]);

            temp1 = OPERATOR_GPU_64::sub(input_, temp1, modulus[block_y]);

            input_ = OPERATOR_GPU_64::mult(
                temp1, last_q_modinv[location_ + block_y], modulus[block_y]);

            location_ = location_ + (Q_prime_size - 1 - i);
        }

        Data64 ct_in =
            ct[idx + (block_y << n_power) + (((Q_size) << n_power) * block_z)];

        ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

        output[idx + (block_y << n_power) + (((Q_size) << n_power) * block_z)] =
            ct_in;
    }

    __global__ void divide_round_lastq_leveled_stage_one_kernel(
        Data64* input, Data64* output, Modulus64* modulus, Data64* half,
        Data64* half_mod, int n_power, int first_decomp_count,
        int current_decomp_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Cipher Size (2)

        Data64 last_ct =
            input[idx + (current_decomp_count << n_power) +
                  (((current_decomp_count + 1) << n_power) * block_y)];

        last_ct =
            OPERATOR_GPU_64::add(last_ct, half[0], modulus[first_decomp_count]);

#pragma unroll
        for (int i = 0; i < current_decomp_count; i++)
        {
            Data64 last_ct_i =
                OPERATOR_GPU_64::reduce_forced(last_ct, modulus[i]);

            last_ct_i =
                OPERATOR_GPU_64::sub(last_ct_i, half_mod[i], modulus[i]);

            output[idx + (i << n_power) +
                   (((current_decomp_count) << n_power) * block_y)] = last_ct_i;
        }
    }

    __global__ void divide_round_lastq_leveled_stage_two_kernel(
        Data64* input_last, Data64* input, Data64* ct, Data64* output,
        Modulus64* modulus, Data64* last_q_modinv, int n_power,
        int current_decomp_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        Data64 last_ct =
            input_last[idx + (block_y << n_power) +
                       (((current_decomp_count) << n_power) * block_z)];

        Data64 input_ =
            input[idx + (block_y << n_power) +
                  (((current_decomp_count + 1) << n_power) * block_z)];

        input_ = OPERATOR_GPU_64::sub(input_, last_ct, modulus[block_y]);

        input_ = OPERATOR_GPU_64::mult(input_, last_q_modinv[block_y],
                                       modulus[block_y]);

        Data64 ct_in = ct[idx + (block_y << n_power) +
                          (((current_decomp_count) << n_power) * block_z)];

        ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

        output[idx + (block_y << n_power) +
               (((current_decomp_count) << n_power) * block_z)] = ct_in;
    }

    __global__ void divide_round_lastq_leveled_stage_two_switchkey_kernel(
        Data64* input_last, Data64* input, Data64* ct, Data64* output,
        Modulus64* modulus, Data64* last_q_modinv, int n_power,
        int current_decomp_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        Data64 last_ct =
            input_last[idx + (block_y << n_power) +
                       (((current_decomp_count) << n_power) * block_z)];

        Data64 input_ =
            input[idx + (block_y << n_power) +
                  (((current_decomp_count + 1) << n_power) * block_z)];

        input_ = OPERATOR_GPU_64::sub(input_, last_ct, modulus[block_y]);

        input_ = OPERATOR_GPU_64::mult(input_, last_q_modinv[block_y],
                                       modulus[block_y]);

        Data64 ct_in = 0ULL;
        if (block_z == 0)
        {
            ct_in = ct[idx + (block_y << n_power) +
                       (((current_decomp_count) << n_power) * block_z)];
        }

        ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

        output[idx + (block_y << n_power) +
               (((current_decomp_count) << n_power) * block_z)] = ct_in;
    }

    // FOR RESCALE

    __global__ void move_cipher_leveled_kernel(Data64* input, Data64* output,
                                               int n_power,
                                               int current_decomp_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // current_decomp_count - 1
        int block_z = blockIdx.z; // Cipher Size (2)

        Data64 r_input =
            input[idx + (block_y << n_power) +
                  (((current_decomp_count + 1) << n_power) * block_z)];

        output[idx + (block_y << n_power) +
               (((current_decomp_count + 1) << n_power) * block_z)] = r_input;
    }

    __global__ void divide_round_lastq_rescale_kernel(
        Data64* input_last, Data64* input, Data64* output, Modulus64* modulus,
        Data64* last_q_modinv, int n_power, int current_decomp_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        Data64 last_ct =
            input_last[idx + (block_y << n_power) +
                       (((current_decomp_count) << n_power) * block_z)];

        Data64 input_ =
            input[idx + (block_y << n_power) +
                  (((current_decomp_count + 1) << n_power) * block_z)];

        input_ = OPERATOR_GPU_64::sub(input_, last_ct, modulus[block_y]);

        input_ = OPERATOR_GPU_64::mult(input_, last_q_modinv[block_y],
                                       modulus[block_y]);

        output[idx + (block_y << n_power) +
               (((current_decomp_count) << n_power) * block_z)] = input_;
    }

    __global__ void base_conversion_DtoB_relin_kernel(
        Data64* ciphertext, Data64* output, Modulus64* modulus,
        Modulus64* B_base, Data64* base_change_matrix_D_to_B,
        Data64* Mi_inv_D_to_B, Data64* prod_D_to_B, int* I_j_, int* I_location_,
        int n_power, int l, int d_tilda, int d, int r_prime)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // d

        const int I_j = I_j_[block_y];
        int I_location = I_location_[block_y];

        int location = idx + (I_location << n_power);
        int location_out = idx + ((block_y * r_prime) << n_power);
        int matrix_index = I_location * r_prime;

        Data64 partial[20];
        float r = 0;
        float div;
        float mod;
#pragma unroll
        for (int i = 0; i < I_j; i++)
        {
            Data64 temp = ciphertext[location + (i << n_power)];
            partial[i] = OPERATOR_GPU_64::mult(
                temp, Mi_inv_D_to_B[I_location + i], modulus[I_location + i]);
            div = static_cast<float>(partial[i]);
            mod = static_cast<float>(modulus[I_location + i].value);
            r += (div / mod);
        }

        // r = round(r);
        r = roundf(r);
        Data64 r_ = static_cast<Data64>(r);

        for (int i = 0; i < r_prime; i++)
        {
            Data64 temp = 0;
#pragma unroll
            for (int j = 0; j < I_j; j++)
            {
                Data64 mult = OPERATOR_GPU_64::mult(
                    partial[j],
                    base_change_matrix_D_to_B[j + (i * I_j) + matrix_index],
                    B_base[i]);
                temp = OPERATOR_GPU_64::add(temp, mult, B_base[i]);
            }

            Data64 r_mul = OPERATOR_GPU_64::mult(
                r_, prod_D_to_B[i + (block_y * r_prime)], B_base[i]);
            r_mul = OPERATOR_GPU_64::sub(temp, r_mul, B_base[i]);
            output[location_out + (i << n_power)] = r_mul;
        }
    }

    __global__ void base_conversion_DtoQtilde_relin_kernel(
        Data64* ciphertext, Data64* output, Modulus64* modulus,
        Data64* base_change_matrix_D_to_Qtilda, Data64* Mi_inv_D_to_Qtilda,
        Data64* prod_D_to_Qtilda, int* I_j_, int* I_location_, int n_power,
        int l, int Q_tilda, int d)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // d

        const int I_j = I_j_[block_y];
        int I_location = I_location_[block_y];

        int location = idx + (I_location << n_power);
        int location_out = idx + ((block_y * Q_tilda) << n_power);
        int matrix_index = I_location * Q_tilda;

        Data64 partial[20];
        float r = 0;
        float div;
        float mod;
#pragma unroll
        for (int i = 0; i < I_j; i++)
        {
            Data64 temp = ciphertext[location + (i << n_power)];
            partial[i] =
                OPERATOR_GPU_64::mult(temp, Mi_inv_D_to_Qtilda[I_location + i],
                                      modulus[I_location + i]);
            div = static_cast<float>(partial[i]);
            mod = static_cast<float>(modulus[I_location + i].value);
            r += (div / mod);
        }

        // r = round(r);
        r = roundf(r);
        Data64 r_ = static_cast<Data64>(r);

        for (int i = 0; i < Q_tilda; i++)
        {
            Data64 temp = 0;
#pragma unroll
            for (int j = 0; j < I_j; j++)
            {
                Data64 mult = OPERATOR_GPU_64::mult(
                    partial[j],
                    base_change_matrix_D_to_Qtilda[j + (i * I_j) +
                                                   matrix_index],
                    modulus[i]);
                temp = OPERATOR_GPU_64::add(temp, mult, modulus[i]);
            }

            Data64 r_mul = OPERATOR_GPU_64::mult(
                r_, prod_D_to_Qtilda[i + (block_y * Q_tilda)], modulus[i]);
            r_mul = OPERATOR_GPU_64::sub(temp, r_mul, modulus[i]);
            output[location_out + (i << n_power)] = r_mul;
        }
    }

    __global__ void base_conversion_DtoB_relin_leveled_kernel(
        Data64* ciphertext, Data64* output, Modulus64* modulus,
        Modulus64* B_base, Data64* base_change_matrix_D_to_B,
        Data64* Mi_inv_D_to_B, Data64* prod_D_to_B, int* I_j_, int* I_location_,
        int n_power, int d_tilda, int d, int r_prime, int* mod_index)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // d

        const int I_j = I_j_[block_y];
        int I_location = I_location_[block_y];

        int location = idx + (I_location << n_power);
        int location_out = idx + ((block_y * r_prime) << n_power);
        int matrix_index = I_location * r_prime;

        Data64 partial[20];
        float r = 0;
        float div;
        float mod;
#pragma unroll
        for (int i = 0; i < I_j; i++)
        {
            Data64 temp = ciphertext[location + (i << n_power)];
            partial[i] =
                OPERATOR_GPU_64::mult(temp, Mi_inv_D_to_B[I_location + i],
                                      modulus[mod_index[I_location + i]]);
            div = static_cast<float>(partial[i]);
            mod = static_cast<float>(modulus[mod_index[I_location + i]].value);
            r += (div / mod);
        }

        // r = roundf(r);
        r = round(r);
        Data64 r_ = static_cast<Data64>(r);

        for (int i = 0; i < r_prime; i++)
        {
            Data64 temp = 0;
#pragma unroll
            for (int j = 0; j < I_j; j++)
            {
                Data64 mult = OPERATOR_GPU_64::mult(
                    partial[j],
                    base_change_matrix_D_to_B[j + (i * I_j) + matrix_index],
                    B_base[i]);
                temp = OPERATOR_GPU_64::add(temp, mult, B_base[i]);
            }

            Data64 r_mul = OPERATOR_GPU_64::mult(
                r_, prod_D_to_B[i + (block_y * r_prime)], B_base[i]);
            r_mul = OPERATOR_GPU_64::sub(temp, r_mul, B_base[i]);
            output[location_out + (i << n_power)] = r_mul;
        }
    }

    __global__ void base_conversion_DtoQtilde_relin_leveled_kernel(
        Data64* ciphertext, Data64* output, Modulus64* modulus,
        Data64* base_change_matrix_D_to_Qtilda, Data64* Mi_inv_D_to_Qtilda,
        Data64* prod_D_to_Qtilda, int* I_j_, int* I_location_, int n_power,
        int d, int current_Qtilda_size, int current_Q_size, int level,
        int* mod_index)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // d

        const int I_j = I_j_[block_y];
        int I_location = I_location_[block_y];

        int location = idx + (I_location << n_power);
        int location_out = idx + ((block_y * current_Qtilda_size) << n_power);
        int matrix_index = I_location * current_Qtilda_size;

        Data64 partial[20];
        float r = 0;
        float div;
        float mod;
#pragma unroll
        for (int i = 0; i < I_j; i++)
        {
            Data64 temp = ciphertext[location + (i << n_power)];
            partial[i] =
                OPERATOR_GPU_64::mult(temp, Mi_inv_D_to_Qtilda[I_location + i],
                                      modulus[I_location + i]);
            div = static_cast<float>(partial[i]);
            mod = static_cast<float>(modulus[I_location + i].value);
            r += (div / mod);
        }

        // r = roundf(r);
        r = round(r);
        Data64 r_ = static_cast<Data64>(r);

        for (int i = 0; i < current_Qtilda_size; i++)
        {
            int mod_location = (i < current_Q_size) ? i : (i + level);

            Data64 temp = 0;
#pragma unroll
            for (int j = 0; j < I_j; j++)
            {
                Data64 mult = OPERATOR_GPU_64::reduce_forced(
                    partial[j], modulus[mod_location]);
                mult = OPERATOR_GPU_64::mult(
                    mult,
                    base_change_matrix_D_to_Qtilda[j + (i * I_j) +
                                                   matrix_index],
                    modulus[mod_location]);
                temp = OPERATOR_GPU_64::add(temp, mult, modulus[mod_location]);
            }

            Data64 r_mul = OPERATOR_GPU_64::mult(
                r_, prod_D_to_Qtilda[i + (block_y * current_Qtilda_size)],
                modulus[mod_location]);
            r_mul = OPERATOR_GPU_64::sub(temp, r_mul, modulus[mod_location]);
            output[location_out + (i << n_power)] = r_mul;
        }
    }

    __global__ void multiply_accumulate_extended_kernel(
        Data64* input, Data64* relinkey, Data64* output, Modulus64* B_prime,
        int n_power, int d_tilda, int d, int r_prime)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // r_prime
        int block_z = blockIdx.z; // d_tilda

        int key_offset1 = (r_prime * d_tilda) << n_power;
        int key_offset2 = (r_prime * d_tilda) << (n_power + 1);

        int offset1 = idx + (block_y << n_power);
        int offset2 = offset1 + ((r_prime << n_power) * block_z);

        Modulus modulus = B_prime[block_y];

        Data64* relinkey_ = relinkey + offset2;

        Data64 ct_0_sum = 0;
        Data64 ct_1_sum = 0;

#pragma unroll 2
        for (int i = 0; i < d; i++)
        {
            Data64 in_piece =
                __ldg(input + offset1 + ((i * r_prime) << n_power));

            Data64 rk0 = __ldg(relinkey_ + (key_offset2 * i));
            Data64 mult0 = OPERATOR_GPU_64::mult(in_piece, rk0, modulus);
            ct_0_sum = OPERATOR_GPU_64::add(ct_0_sum, mult0, modulus);

            Data64 rk1 = __ldg(relinkey_ + (key_offset2 * i) + key_offset1);
            Data64 mult1 = OPERATOR_GPU_64::mult(in_piece, rk1, modulus);
            ct_1_sum = OPERATOR_GPU_64::add(ct_1_sum, mult1, modulus);
        }

        output[offset2] = ct_0_sum;
        output[offset2 + key_offset1] = ct_1_sum;
    }

    __global__ void base_conversion_BtoD_relin_kernel(
        Data64* input, Data64* output, Modulus64* modulus, Modulus64* B_base,
        Data64* base_change_matrix_B_to_D, Data64* Mi_inv_B_to_D,
        Data64* prod_B_to_D, int* I_j_, int* I_location_, int n_power,
        int l_tilda, int d_tilda, int d, int r_prime)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // d_tilda
        int block_z = blockIdx.z; // 2

        int I_j = I_j_[block_y];
        int I_location = I_location_[block_y];
        int location_out =
            idx + (I_location << n_power) + ((l_tilda << n_power) * block_z);

        int location = idx + ((r_prime << n_power) * block_y) +
                       (((d_tilda * r_prime) << n_power) * block_z);
        int matrix_index = I_location * r_prime;

        Data64 partial[20];
        float r = 0;
#pragma unroll
        for (int i = 0; i < r_prime; i++)
        {
            Data64 temp = input[location + (i << n_power)];
            partial[i] =
                OPERATOR_GPU_64::mult(temp, Mi_inv_B_to_D[i], B_base[i]);
            float div = static_cast<float>(partial[i]);
            float mod = static_cast<float>(B_base[i].value);
            r += (div / mod);
        }
        // r = roundf(r);
        r = round(r);
        Data64 r_ = static_cast<Data64>(r);

#pragma unroll
        for (int i = 0; i < I_j; i++)
        {
            Data64 temp = 0;
#pragma unroll
            for (int j = 0; j < r_prime; j++)
            {
                Data64 partial_ = OPERATOR_GPU_64::reduce(
                    partial[j], modulus[i + I_location]); // new

                Data64 mult = OPERATOR_GPU_64::mult(
                    partial_,
                    base_change_matrix_B_to_D[j + (i * r_prime) + matrix_index],
                    modulus[i + I_location]);
                temp =
                    OPERATOR_GPU_64::add(temp, mult, modulus[i + I_location]);
            }

            Data64 r_mul = OPERATOR_GPU_64::mult(
                r_, prod_B_to_D[i + I_location], modulus[i + I_location]);
            temp = OPERATOR_GPU_64::sub(temp, r_mul, modulus[i + I_location]);

            temp =
                OPERATOR_GPU_64::reduce(temp, modulus[i + I_location]); // new

            output[location_out + (i << n_power)] = temp;
        }
    }

    __global__ void base_conversion_BtoD_relin_leveled_kernel(
        Data64* input, Data64* output, Modulus64* modulus, Modulus64* B_base,
        Data64* base_change_matrix_B_to_D, Data64* Mi_inv_B_to_D,
        Data64* prod_B_to_D, int* I_j_, int* I_location_, int n_power,
        int l_tilda, int d_tilda, int d, int r_prime, int* mod_index)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // d_tilda
        int block_z = blockIdx.z; // 2

        int I_j = I_j_[block_y];
        int I_location = I_location_[block_y];
        int location_out =
            idx + (I_location << n_power) + ((l_tilda << n_power) * block_z);

        int location = idx + ((r_prime << n_power) * block_y) +
                       (((d_tilda * r_prime) << n_power) * block_z);
        int matrix_index = I_location * r_prime;

        Data64 partial[20];
        float r = 0;
#pragma unroll
        for (int i = 0; i < r_prime; i++)
        {
            Data64 temp = input[location + (i << n_power)];
            partial[i] =
                OPERATOR_GPU_64::mult(temp, Mi_inv_B_to_D[i], B_base[i]);
            float div = static_cast<float>(partial[i]);
            float mod = static_cast<float>(B_base[i].value);
            r += (div / mod);
        }
        // r = roundf(r);
        r = round(r);
        Data64 r_ = static_cast<Data64>(r);

#pragma unroll
        for (int i = 0; i < I_j; i++)
        {
            Data64 temp = 0;
#pragma unroll
            for (int j = 0; j < r_prime; j++)
            {
                // Data64 partial_ = OPERATOR_GPU_64::reduce(partial[j],
                // modulus[mod_index[I_location + i]]); // new
                Data64 partial_ = OPERATOR_GPU_64::reduce_forced(
                    partial[j], modulus[mod_index[I_location + i]]); // new

                Data64 mult = OPERATOR_GPU_64::mult(
                    partial_,
                    base_change_matrix_B_to_D[j + (i * r_prime) + matrix_index],
                    modulus[mod_index[I_location + i]]);
                temp = OPERATOR_GPU_64::add(temp, mult,
                                            modulus[mod_index[I_location + i]]);
            }

            Data64 r_mul =
                OPERATOR_GPU_64::mult(r_, prod_B_to_D[i + I_location],
                                      modulus[mod_index[I_location + i]]);
            temp = OPERATOR_GPU_64::sub(temp, r_mul,
                                        modulus[mod_index[I_location + i]]);

            // temp = OPERATOR_GPU_64::reduce(temp, modulus[mod_index[I_location
            // + i]]);// new
            temp = OPERATOR_GPU_64::reduce_forced(
                temp, modulus[mod_index[I_location + i]]); // new

            output[location_out + (i << n_power)] = temp;
        }
    }

    __global__ void divide_round_lastq_extended_leveled_kernel(
        Data64* input, Data64* output, Modulus64* modulus, Data64* half,
        Data64* half_mod, Data64* last_q_modinv, int n_power, int Q_prime_size,
        int Q_size, int first_Q_prime_size, int first_Q_size, int P_size)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count (Q_size)
        int block_z = blockIdx.z; // Cipher Size (2)

        // Max P size is 15.
        Data64 last_ct[15];
        for (int i = 0; i < P_size; i++)
        {
            last_ct[i] = input[idx + ((Q_size + i) << n_power) +
                               ((Q_prime_size << n_power) * block_z)];
        }

        Data64 input_ = input[idx + (block_y << n_power) +
                              ((Q_prime_size << n_power) * block_z)];

        int location_ = 0;
        for (int i = 0; i < P_size; i++)
        {
            Data64 last_ct_add_half_ = last_ct[(P_size - 1 - i)];
            last_ct_add_half_ =
                OPERATOR_GPU_64::add(last_ct_add_half_, half[i],
                                     modulus[(first_Q_prime_size - 1 - i)]);
            for (int j = 0; j < (P_size - 1 - i); j++)
            {
                Data64 temp1 = OPERATOR_GPU_64::reduce_forced(
                    last_ct_add_half_, modulus[first_Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(
                    temp1, half_mod[location_ + first_Q_size + j],
                    modulus[first_Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(last_ct[j], temp1,
                                             modulus[first_Q_size + j]);

                last_ct[j] = OPERATOR_GPU_64::mult(
                    temp1, last_q_modinv[location_ + first_Q_size + j],
                    modulus[first_Q_size + j]);
            }

            Data64 temp1 = OPERATOR_GPU_64::reduce_forced(last_ct_add_half_,
                                                          modulus[block_y]);

            temp1 = OPERATOR_GPU_64::sub(temp1, half_mod[location_ + block_y],
                                         modulus[block_y]);

            temp1 = OPERATOR_GPU_64::sub(input_, temp1, modulus[block_y]);

            input_ = OPERATOR_GPU_64::mult(
                temp1, last_q_modinv[location_ + block_y], modulus[block_y]);

            location_ = location_ + (first_Q_prime_size - 1 - i);
        }

        output[idx + (block_y << n_power) + (((Q_size) << n_power) * block_z)] =
            input_;
    }

    __global__ void global_memory_replace_kernel(Data64* input, Data64* output,
                                                 int n_power)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count (Q_size)
        int block_z = blockIdx.z; // Cipher Size (2)

        int location =
            idx + (block_y << n_power) + ((gridDim.y << n_power) * block_z);

        Data64 in_reg = input[location];

        output[location] = in_reg;
    }

    __global__ void
    global_memory_replace_offset_kernel(Data64* input, Data64* output,
                                        int current_decomposition_count,
                                        int n_power)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count (Q_size)
        int block_z = blockIdx.z; // Cipher Size (2)

        int location_in = idx + (block_y << n_power) +
                          ((current_decomposition_count << n_power) * block_z);
        int location_out =
            idx + (block_y << n_power) +
            (((current_decomposition_count - 1) << n_power) * block_z);

        Data64 in_reg = input[location_in];

        output[location_out] = in_reg;
    }

    __global__ void
    cipher_broadcast_switchkey_kernel(Data64* cipher, Data64* out0,
                                      Data64* out1, Modulus64* modulus,
                                      int n_power, int decomp_mod_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        int rns_mod_count = (decomp_mod_count + 1);

        Data64 result_value = cipher[idx + (block_y << n_power) +
                                     ((decomp_mod_count << n_power) * block_z)];

        if (block_z == 0)
        {
            out0[idx + (block_y << n_power) +
                 ((decomp_mod_count << n_power) * block_z)] = result_value;
        }
        else
        {
            int location = (rns_mod_count * block_y) << n_power;

            for (int i = 0; i < rns_mod_count; i++)
            {
                out1[idx + (i << n_power) + location] = result_value;
            }
        }
    }

    __global__ void cipher_broadcast_switchkey_method_II_kernel(
        Data64* cipher, Data64* out0, Data64* out1, Modulus64* modulus,
        int n_power, int decomp_mod_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        Data64 result_value = cipher[idx + (block_y << n_power) +
                                     ((decomp_mod_count << n_power) * block_z)];

        if (block_z == 0)
        {
            out0[idx + (block_y << n_power)] = result_value;
        }
        else
        {
            out1[idx + (block_y << n_power)] = result_value;
        }
    }

    __global__ void cipher_broadcast_switchkey_leveled_kernel(
        Data64* cipher, Data64* out0, Data64* out1, Modulus64* modulus,
        int n_power, int first_rns_mod_count, int current_rns_mod_count,
        int current_decomp_mod_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        Data64 result_value =
            cipher[idx + (block_y << n_power) +
                   ((current_decomp_mod_count << n_power) * block_z)];

        if (block_z == 0)
        {
            out0[idx + (block_y << n_power) +
                 ((current_decomp_mod_count << n_power) * block_z)] =
                result_value;
        }
        else
        {
            int location = (current_rns_mod_count * block_y) << n_power;
            int level = first_rns_mod_count - current_rns_mod_count;
            for (int i = 0; i < current_rns_mod_count; i++)
            {
                int mod_index;
                if (i < current_decomp_mod_count)
                {
                    mod_index = i;
                }
                else
                {
                    mod_index = i + level;
                }

                Data64 reduced_result = OPERATOR_GPU_64::reduce_forced(
                    result_value, modulus[mod_index]);

                out1[idx + (i << n_power) + location] = reduced_result;
            }
        }
    }

    __global__ void addition_switchkey(Data64* in1, Data64* in2, Data64* out,
                                       Modulus64* modulus, int n_power)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // ring size
        int idy = blockIdx.y; // rns count
        int idz = blockIdx.z; // cipher count

        int location = idx + (idy << n_power) + ((gridDim.y * idz) << n_power);

        if (idz == 0)
        {
            out[location] = OPERATOR_GPU_64::add(in1[location], in2[location],
                                                 modulus[idy]);
        }
        else
        {
            out[location] = in1[location];
        }
    }

    __global__ void negacyclic_shift_poly_coeffmod_kernel(Data64* cipher_in,
                                                          Data64* cipher_out,
                                                          Modulus64* modulus,
                                                          int shift,
                                                          int n_power)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Cipher Size (2)

        int coeff_count_minus_one = (1 << n_power) - 1;

        int index_raw = idx + shift;
        int index = index_raw & coeff_count_minus_one;
        Data64 result_value = cipher_in[idx + (block_y << n_power) +
                                        ((gridDim.y << n_power) * block_z)];

        if ((index_raw >> n_power) & 1)
        {
            result_value = (modulus[block_y].value - result_value);
        }

        cipher_out[index + (block_y << n_power) +
                   ((gridDim.y << n_power) * block_z)] = result_value;
    }

    // Double Hoisting Kernels

    // NTT-domain Galois automorphism X -> X^galois_elt, computed on the fly.
    //
    // In the bit-reversed NTT layout, slot j holds the evaluation of the
    // polynomial at the primitive 2N-th root of unity psi raised to the
    // odd power (2·br(j) + 1), where br(j) is the n_power-bit reversal
    // of j.  The automorphism X -> X^k maps evaluation point psi^e to
    // psi^(k·e mod 2N), so it becomes a pure index gather in NTT domain
    // (no coefficient negation needed, unlike the coefficient-domain
    // version).
    //
    // Per-thread formula (one output slot per thread):
    //   1. br_j    = bit-reverse of destination index j      (via __brev)
    //   2. exp_j   = 2·br_j + 1         (odd evaluation exponent at slot j)
    //   3. new_exp = k · exp_j mod 2N    (automorphed exponent, still odd)
    //   4. s       = (new_exp - 1) / 2   (logical NTT index of source slot)
    //   5. src_idx = bit-reverse of s    (physical array position to gather)
    //
    // Grid: dim3((n >> 8), pql_count, 2), block: 256
    //   blockIdx.x · 256 + threadIdx.x  ->  ring element index  (N threads)
    //   blockIdx.y                       ->  RNS / PQ_l limb
    //   blockIdx.z                       ->  ciphertext component (0 or 1)
    __global__ void galois_permute_ntt_pql_kernel(Data64* input, Data64* output,
                                                  int galois_elt, int n_power,
                                                  int pql_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        int block_y = blockIdx.y;
        int block_z = blockIdx.z;

        // __brev returns a full 32-bit reversal; shift right to keep only
        // the low n_power bits (i.e. an n_power-bit bit-reversal).
        int shift = 32 - n_power;
        int two_N = 2 << n_power; // = 2N
        int br_j = __brev(idx) >> shift; // bit-reverse destination idx
        int exp_j = 2 * br_j + 1; // odd eval exponent at slot j
        int new_exp = (int) (((long long) galois_elt * exp_j) %
                             two_N); // automorphed exponent
        int s = (new_exp - 1) >> 1; // logical NTT index of source
        int src_idx =
            __brev(s) >> shift; // physical (bit-reversed) source position

        output[idx + (block_y << n_power) +
               ((pql_count << n_power) * block_z)] =
            input[src_idx + (block_y << n_power) +
                  ((pql_count << n_power) * block_z)];
    }

    // Creates P*c in PQ_l NTT domain from Q_l NTT input.
    // Q limbs: (P mod q_j) * c[j]. P limbs: 0.
    // Grid: dim3((n >> 8), pql_count, 1), 256
    __global__ void broadcast_scale_P_kernel(Data64* c_ntt, Data64* output,
                                             Data64* P_mod_q,
                                             Modulus64* pq_modulus, int n_power,
                                             int current_decomp_count,
                                             int pql_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // PQ_l limb index

        Data64 result;
        if (block_y < current_decomp_count)
        {
            // Q limb: P_mod_q[j] * c[j] mod q_j
            Data64 c_val = c_ntt[idx + (block_y << n_power)];
            result = OPERATOR_GPU_64::mult(c_val, P_mod_q[block_y],
                                           pq_modulus[block_y]);
        }
        else
        {
            // P limb: P mod p_k = 0
            result = 0;
        }

        output[idx + (block_y << n_power)] = result;
    }

    // Element-wise modular addition for 2-component PQ_l polynomials.
    // Grid: dim3((n >> 8), pql_count, 2), 256
    __global__ void addition_pql_kernel(Data64* in1, Data64* in2, Data64* out,
                                        Modulus64* pq_modulus, int n_power,
                                        int pql_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // PQ_l limb index
        int block_z = blockIdx.z; // Cipher component (0 or 1)

        int offset =
            idx + (block_y << n_power) + ((pql_count << n_power) * block_z);

        Data64 val1 = in1[offset];
        Data64 val2 = in2[offset];

        out[offset] = OPERATOR_GPU_64::add(val1, val2, pq_modulus[block_y]);
    }

    // Optimized Hoisting-Rotations

    __global__ void ckks_duplicate_kernel(Data64* cipher, Data64* output,
                                          Modulus64* modulus, int n_power,
                                          int first_rns_mod_count,
                                          int current_rns_mod_count,
                                          int current_decomp_mod_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count

        Data64 result_value = cipher[idx + (block_y << n_power) +
                                     ((current_decomp_mod_count << n_power))];

        int location = (current_rns_mod_count * block_y) << n_power;
        int level = first_rns_mod_count - current_rns_mod_count;

        for (int i = 0; i < current_rns_mod_count; i++)
        {
            int mod_index;
            if (i < current_decomp_mod_count)
            {
                mod_index = i;
            }
            else
            {
                mod_index = i + level;
            }

            Data64 reduced_result = OPERATOR_GPU_64::reduce_forced(
                result_value, modulus[mod_index]);

            output[idx + (i << n_power) + location] = reduced_result;
        }
    }

    __global__ void bfv_duplicate_kernel(Data64* cipher, Data64* output1,
                                         Data64* output2, Modulus64* modulus,
                                         int n_power, int rns_mod_count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count
        int block_z = blockIdx.z; // Decomposition Modulus Count

        Data64 result_value = cipher[idx + (block_y << n_power) +
                                     ((gridDim.y << n_power) * block_z)];

        if (block_z == 0)
        {
            output1[idx + (block_y << n_power)] = result_value;
        }
        else
        {
            int location = (rns_mod_count * block_y) << n_power;

            for (int i = 0; i < rns_mod_count; i++)
            {
                Data64 reduced_result =
                    OPERATOR_GPU_64::reduce_forced(result_value, modulus[i]);

                output2[idx + (i << n_power) + location] = reduced_result;
            }
        }
    }

    __global__ void divide_round_lastq_permute_ckks_kernel(
        Data64* input, Data64* input2, Data64* output, Modulus64* modulus,
        Data64* half, Data64* half_mod, Data64* last_q_modinv, int galois_elt,
        int n_power, int Q_prime_size, int Q_size, int first_Q_prime_size,
        int first_Q_size, int P_size)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count (Q_size)
        int block_z = blockIdx.z; // Cipher Size (2)

        // Max P size is 15.
        Data64 last_ct[15];
        for (int i = 0; i < P_size; i++)
        {
            last_ct[i] = input[idx + ((Q_size + i) << n_power) +
                               ((Q_prime_size << n_power) * block_z)];
        }

        Data64 input_ = input[idx + (block_y << n_power) +
                              ((Q_prime_size << n_power) * block_z)];

        int location_ = 0;
        for (int i = 0; i < P_size; i++)
        {
            Data64 last_ct_add_half_ = last_ct[(P_size - 1 - i)];
            last_ct_add_half_ =
                OPERATOR_GPU_64::add(last_ct_add_half_, half[i],
                                     modulus[(first_Q_prime_size - 1 - i)]);
            for (int j = 0; j < (P_size - 1 - i); j++)
            {
                Data64 temp1 = OPERATOR_GPU_64::reduce_forced(
                    last_ct_add_half_, modulus[first_Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(
                    temp1, half_mod[location_ + first_Q_size + j],
                    modulus[first_Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(last_ct[j], temp1,
                                             modulus[first_Q_size + j]);

                last_ct[j] = OPERATOR_GPU_64::mult(
                    temp1, last_q_modinv[location_ + first_Q_size + j],
                    modulus[first_Q_size + j]);
            }

            Data64 temp1 = OPERATOR_GPU_64::reduce_forced(last_ct_add_half_,
                                                          modulus[block_y]);

            temp1 = OPERATOR_GPU_64::sub(temp1, half_mod[location_ + block_y],
                                         modulus[block_y]);

            temp1 = OPERATOR_GPU_64::sub(input_, temp1, modulus[block_y]);

            input_ = OPERATOR_GPU_64::mult(
                temp1, last_q_modinv[location_ + block_y], modulus[block_y]);

            location_ = location_ + (first_Q_prime_size - 1 - i);
        }

        if (block_z == 0)
        {
            Data64 ct_in = input2[idx + (block_y << n_power)];

            ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

            //
            int coeff_count_minus_one = (1 << n_power) - 1;

            int index_raw = idx * galois_elt;
            int index = index_raw & coeff_count_minus_one;

            if ((index_raw >> n_power) & 1)
            {
                ct_in = (modulus[block_y].value - ct_in);
            }
            //

            output[index + (block_y << n_power) +
                   (((Q_size) << n_power) * block_z)] = ct_in;
        }
        else
        {
            //
            int coeff_count_minus_one = (1 << n_power) - 1;

            int index_raw = idx * galois_elt;
            int index = index_raw & coeff_count_minus_one;

            if ((index_raw >> n_power) & 1)
            {
                input_ = (modulus[block_y].value - input_);
            }
            //

            output[index + (block_y << n_power) +
                   (((Q_size) << n_power) * block_z)] = input_;
        }
    }

    __global__ void divide_round_lastq_permute_bfv_kernel(
        Data64* input, Data64* ct, Data64* output, Modulus64* modulus,
        Data64* half, Data64* half_mod, Data64* last_q_modinv, int galois_elt,
        int n_power, int Q_prime_size, int Q_size, int P_size)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x; // Ring Sizes
        int block_y = blockIdx.y; // Decomposition Modulus Count (Q_size)
        int block_z = blockIdx.z; // Cipher Size (2)

        // Max P size is 15.
        Data64 last_ct[15];
        for (int i = 0; i < P_size; i++)
        {
            last_ct[i] = input[idx + ((Q_size + i) << n_power) +
                               ((Q_prime_size << n_power) * block_z)];
        }

        Data64 input_ = input[idx + (block_y << n_power) +
                              ((Q_prime_size << n_power) * block_z)];

        int location_ = 0;
        for (int i = 0; i < P_size; i++)
        {
            Data64 last_ct_add_half_ = last_ct[(P_size - 1 - i)];
            last_ct_add_half_ = OPERATOR_GPU_64::add(
                last_ct_add_half_, half[i], modulus[(Q_prime_size - 1 - i)]);
            for (int j = 0; j < (P_size - 1 - i); j++)
            {
                Data64 temp1 = OPERATOR_GPU_64::reduce_forced(
                    last_ct_add_half_, modulus[Q_size + j]);
                temp1 = OPERATOR_GPU_64::sub(temp1,
                                             half_mod[location_ + Q_size + j],
                                             modulus[Q_size + j]);

                temp1 = OPERATOR_GPU_64::sub(last_ct[j], temp1,
                                             modulus[Q_size + j]);

                last_ct[j] = OPERATOR_GPU_64::mult(
                    temp1, last_q_modinv[location_ + Q_size + j],
                    modulus[Q_size + j]);
            }

            Data64 temp1 = OPERATOR_GPU_64::reduce_forced(last_ct_add_half_,
                                                          modulus[block_y]);
            temp1 = OPERATOR_GPU_64::sub(temp1, half_mod[location_ + block_y],
                                         modulus[block_y]);

            temp1 = OPERATOR_GPU_64::sub(input_, temp1, modulus[block_y]);

            input_ = OPERATOR_GPU_64::mult(
                temp1, last_q_modinv[location_ + block_y], modulus[block_y]);

            location_ = location_ + (Q_prime_size - 1 - i);
        }

        if (block_z == 0)
        {
            Data64 ct_in = ct[idx + (block_y << n_power)];

            ct_in = OPERATOR_GPU_64::add(ct_in, input_, modulus[block_y]);

            //
            int coeff_count_minus_one = (1 << n_power) - 1;

            int index_raw = idx * galois_elt;
            int index = index_raw & coeff_count_minus_one;

            if ((index_raw >> n_power) & 1)
            {
                ct_in = (modulus[block_y].value - ct_in);
            }
            //

            output[index + (block_y << n_power) +
                   (((Q_size) << n_power) * block_z)] = ct_in;
        }
        else
        {
            //
            int coeff_count_minus_one = (1 << n_power) - 1;

            int index_raw = idx * galois_elt;
            int index = index_raw & coeff_count_minus_one;

            if ((index_raw >> n_power) & 1)
            {
                input_ = (modulus[block_y].value - input_);
            }
            //

            output[index + (block_y << n_power) +
                   (((Q_size) << n_power) * block_z)] = input_;
        }
    }

} // namespace heongpu