Program Listing for File small_ntt.cu
↰ Return to documentation for file (src/lib/kernel/small_ntt.cu)
// Copyright 2024-2026 Alişah Özcan
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
// Developer: Alişah Özcan
#include <heongpu/kernel/small_ntt.cuh>
namespace heongpu
{
template <typename T>
__device__ void
SmallForwardNTT(T* polynomial_in_shared, const Root<T>* root_of_unity_table,
const Modulus<T> modulus, bool reduction_poly_check)
{
const int idx_x = threadIdx.x;
const int N_power = 10;
int t_2 = N_power - 1;
int t_ = 9;
int m = 1;
int t = 1 << t_;
int in_shared_address = ((idx_x >> t_) << t_) + idx_x;
int current_root_index;
#pragma unroll
for (int lp = 0; lp < 4; lp++)
{
if (reduction_poly_check)
{ // X_N_minus
current_root_index = (idx_x >> t_2);
}
else
{ // X_N_plus
current_root_index = m + (idx_x >> t_2);
}
gpuntt::CooleyTukeyUnit(polynomial_in_shared[in_shared_address],
polynomial_in_shared[in_shared_address + t],
root_of_unity_table[current_root_index],
modulus);
t = t >> 1;
t_2 -= 1;
t_ -= 1;
m <<= 1;
in_shared_address = ((idx_x >> t_) << t_) + idx_x;
__syncthreads();
}
#pragma unroll
for (int lp = 0; lp < 6; lp++)
{
if (reduction_poly_check)
{ // X_N_minus
current_root_index = (idx_x >> t_2);
}
else
{ // X_N_plus
current_root_index = m + (idx_x >> t_2);
}
gpuntt::CooleyTukeyUnit(polynomial_in_shared[in_shared_address],
polynomial_in_shared[in_shared_address + t],
root_of_unity_table[current_root_index],
modulus);
t = t >> 1;
t_2 -= 1;
t_ -= 1;
m <<= 1;
in_shared_address = ((idx_x >> t_) << t_) + idx_x;
}
__syncthreads();
}
template <typename T>
__device__ void
SmallInverseNTT(T* polynomial_in_shared, const Root<T>* root_of_unity_table,
const Modulus<T> modulus, const Ninverse<T> n_inverse,
bool reduction_poly_check)
{
const int idx_x = threadIdx.x;
int t_2 = 0;
int t_ = 0;
int m = 1 << 9;
int t = 1 << t_;
int in_shared_address = ((idx_x >> t_) << t_) + idx_x;
int current_root_index;
#pragma unroll
for (int lp = 0; lp < 10; lp++)
{
if (reduction_poly_check)
{ // X_N_minus
current_root_index = (idx_x >> t_2);
}
else
{ // X_N_plus
current_root_index = m + (idx_x >> t_2);
}
gpuntt::GentlemanSandeUnit(
polynomial_in_shared[in_shared_address],
polynomial_in_shared[in_shared_address + t],
root_of_unity_table[current_root_index], modulus);
t = t << 1;
t_2 += 1;
t_ += 1;
m >>= 1;
in_shared_address = ((idx_x >> t_) << t_) + idx_x;
__syncthreads();
}
polynomial_in_shared[idx_x] = OPERATOR_GPU<T>::mult(
polynomial_in_shared[idx_x], n_inverse, modulus);
polynomial_in_shared[idx_x + blockDim.x] = OPERATOR_GPU<T>::mult(
polynomial_in_shared[idx_x + blockDim.x], n_inverse, modulus);
__syncthreads();
}
template __device__ void SmallForwardNTT<Data32>(
Data32* polynomial_in_shared, const Root<Data32>* root_of_unity_table,
const Modulus<Data32> modulus, bool reduction_poly_check);
template __device__ void SmallForwardNTT<Data64>(
Data64* polynomial_in_shared, const Root<Data64>* root_of_unity_table,
const Modulus<Data64> modulus, bool reduction_poly_check);
template __device__ void SmallInverseNTT<Data32>(
Data32* polynomial_in_shared, const Root<Data32>* root_of_unity_table,
const Modulus<Data32> modulus, const Ninverse<Data32> n_inverse,
bool reduction_poly_check);
template __device__ void SmallInverseNTT<Data64>(
Data64* polynomial_in_shared, const Root<Data64>* root_of_unity_table,
const Modulus<Data64> modulus, const Ninverse<Data64> n_inverse,
bool reduction_poly_check);
} // namespace heongpu