Program Listing for File encoder.cu
↰ Return to documentation for file (src/lib/host/bfv/encoder.cu)
// Copyright 2024-2026 Alişah Özcan
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
// Developer: Alişah Özcan
#include <heongpu/host/bfv/encoder.cuh>
namespace heongpu
{
__host__ HEEncoder<Scheme::BFV>::HEEncoder(HEContext<Scheme::BFV> context)
{
if (!context || !context->context_generated_)
{
throw std::invalid_argument("HEContext is not generated!");
}
context_ = std::move(context);
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
// Encode - Decode Index
std::vector<Data64> encode_index;
int m = n << 1;
int gen = 3;
int pos = 1;
int index = 0;
int location = 0;
for (int i = 0; i < int(n / 2); i++)
{
index = (pos - 1) >> 1;
location = gpuntt::bitreverse(index, n_power);
encode_index.push_back(location);
pos *= gen;
pos &= (m - 1);
}
for (int i = int(n / 2); i < n; i++)
{
index = (m - pos - 1) >> 1;
location = gpuntt::bitreverse(index, n_power);
encode_index.push_back(location);
pos *= gen;
pos &= (m - 1);
}
encoding_location_ =
std::make_shared<DeviceVector<Data64>>(encode_index);
}
__host__ void
HEEncoder<Scheme::BFV>::encode_bfv(Plaintext<Scheme::BFV>& plain,
const std::vector<uint64_t>& message,
const cudaStream_t stream)
{
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
const int slot_count = ctx->n;
DeviceVector<Data64> output_memory(n, stream);
DeviceVector<Data64> message_gpu(slot_count, stream);
cudaMemcpyAsync(message_gpu.data(), message.data(),
message.size() * sizeof(Data64), cudaMemcpyHostToDevice,
stream);
HEONGPU_CUDA_CHECK(cudaGetLastError());
encode_kernel_bfv<<<dim3((n >> 8), 1, 1), 256, 0, stream>>>(
output_memory.data(), message_gpu.data(),
encoding_location_->data(), ctx->plain_modulus2_->data(),
message.size());
HEONGPU_CUDA_CHECK(cudaGetLastError());
gpuntt::ntt_rns_configuration<Data64> cfg_intt = {
.n_power = n_power,
.ntt_type = gpuntt::INVERSE,
.ntt_layout = gpuntt::PerPolynomial,
.reduction_poly = gpuntt::ReductionPolynomial::X_N_plus,
.zero_padding = false,
.mod_inverse = ctx->n_plain_inverse_->data(),
.stream = stream};
gpuntt::GPU_INTT_Inplace(output_memory.data(),
ctx->plain_intt_tables_->data(),
ctx->plain_modulus2_->data(), cfg_intt, 1, 1);
plain.memory_set(std::move(output_memory));
}
__host__ void
HEEncoder<Scheme::BFV>::encode_bfv(Plaintext<Scheme::BFV>& plain,
const std::vector<int64_t>& message,
const cudaStream_t stream)
{
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
const int slot_count = ctx->n;
DeviceVector<Data64> output_memory(n, stream);
DeviceVector<Data64> message_gpu(slot_count, stream);
cudaMemcpyAsync(message_gpu.data(), message.data(),
message.size() * sizeof(Data64), cudaMemcpyHostToDevice,
stream);
HEONGPU_CUDA_CHECK(cudaGetLastError());
encode_kernel_bfv<<<dim3((n >> 8), 1, 1), 256, 0, stream>>>(
output_memory.data(), message_gpu.data(),
encoding_location_->data(), ctx->plain_modulus2_->data(),
message.size());
HEONGPU_CUDA_CHECK(cudaGetLastError());
gpuntt::ntt_rns_configuration<Data64> cfg_intt = {
.n_power = n_power,
.ntt_type = gpuntt::INVERSE,
.ntt_layout = gpuntt::PerPolynomial,
.reduction_poly = gpuntt::ReductionPolynomial::X_N_plus,
.zero_padding = false,
.mod_inverse = ctx->n_plain_inverse_->data(),
.stream = stream};
gpuntt::GPU_INTT_Inplace(output_memory.data(),
ctx->plain_intt_tables_->data(),
ctx->plain_modulus2_->data(), cfg_intt, 1, 1);
plain.memory_set(std::move(output_memory));
}
__host__ void
HEEncoder<Scheme::BFV>::encode_bfv(Plaintext<Scheme::BFV>& plain,
const HostVector<uint64_t>& message,
const cudaStream_t stream)
{
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
const int slot_count = ctx->n;
DeviceVector<Data64> output_memory(n, stream);
DeviceVector<Data64> message_gpu(slot_count, stream);
cudaMemcpyAsync(message_gpu.data(), message.data(),
message.size() * sizeof(Data64), cudaMemcpyHostToDevice,
stream);
HEONGPU_CUDA_CHECK(cudaGetLastError());
encode_kernel_bfv<<<dim3((n >> 8), 1, 1), 256, 0, stream>>>(
output_memory.data(), message_gpu.data(),
encoding_location_->data(), ctx->plain_modulus2_->data(),
message.size());
HEONGPU_CUDA_CHECK(cudaGetLastError());
gpuntt::ntt_rns_configuration<Data64> cfg_intt = {
.n_power = n_power,
.ntt_type = gpuntt::INVERSE,
.ntt_layout = gpuntt::PerPolynomial,
.reduction_poly = gpuntt::ReductionPolynomial::X_N_plus,
.zero_padding = false,
.mod_inverse = ctx->n_plain_inverse_->data(),
.stream = stream};
gpuntt::GPU_INTT_Inplace(output_memory.data(),
ctx->plain_intt_tables_->data(),
ctx->plain_modulus2_->data(), cfg_intt, 1, 1);
plain.memory_set(std::move(output_memory));
}
__host__ void
HEEncoder<Scheme::BFV>::encode_bfv(Plaintext<Scheme::BFV>& plain,
const HostVector<int64_t>& message,
const cudaStream_t stream)
{
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
const int slot_count = ctx->n;
DeviceVector<Data64> output_memory(n, stream);
DeviceVector<Data64> message_gpu(slot_count, stream);
cudaMemcpyAsync(message_gpu.data(), message.data(),
message.size() * sizeof(Data64), cudaMemcpyHostToDevice,
stream);
HEONGPU_CUDA_CHECK(cudaGetLastError());
encode_kernel_bfv<<<dim3((n >> 8), 1, 1), 256, 0, stream>>>(
output_memory.data(), message_gpu.data(),
encoding_location_->data(), ctx->plain_modulus2_->data(),
message.size());
HEONGPU_CUDA_CHECK(cudaGetLastError());
gpuntt::ntt_rns_configuration<Data64> cfg_intt = {
.n_power = n_power,
.ntt_type = gpuntt::INVERSE,
.ntt_layout = gpuntt::PerPolynomial,
.reduction_poly = gpuntt::ReductionPolynomial::X_N_plus,
.zero_padding = false,
.mod_inverse = ctx->n_plain_inverse_->data(),
.stream = stream};
gpuntt::GPU_INTT_Inplace(output_memory.data(),
ctx->plain_intt_tables_->data(),
ctx->plain_modulus2_->data(), cfg_intt, 1, 1);
plain.memory_set(std::move(output_memory));
}
__host__ void
HEEncoder<Scheme::BFV>::decode_bfv(std::vector<uint64_t>& message,
Plaintext<Scheme::BFV>& plain,
const cudaStream_t stream)
{
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
const int slot_count = ctx->n;
DeviceVector<Data64> temp_memory(slot_count + n, stream);
Data64* message_gpu = temp_memory.data();
Data64* temp_plain = message_gpu + slot_count;
gpuntt::ntt_rns_configuration<Data64> cfg_ntt = {
.n_power = n_power,
.ntt_type = gpuntt::FORWARD,
.ntt_layout = gpuntt::PerPolynomial,
.reduction_poly = gpuntt::ReductionPolynomial::X_N_plus,
.zero_padding = false,
.stream = stream};
gpuntt::GPU_NTT(plain.data(), temp_plain,
ctx->plain_ntt_tables_->data(),
ctx->plain_modulus2_->data(), cfg_ntt, 1, 1);
HEONGPU_CUDA_CHECK(cudaGetLastError());
decode_kernel_bfv<<<dim3((n >> 8), 1, 1), 256, 0, stream>>>(
message_gpu, temp_plain, encoding_location_->data());
HEONGPU_CUDA_CHECK(cudaGetLastError());
message.resize(slot_count);
cudaMemcpyAsync(message.data(), message_gpu,
slot_count * sizeof(uint64_t), cudaMemcpyDeviceToHost,
stream);
HEONGPU_CUDA_CHECK(cudaGetLastError());
}
__host__ void
HEEncoder<Scheme::BFV>::decode_bfv(std::vector<int64_t>& message,
Plaintext<Scheme::BFV>& plain,
const cudaStream_t stream)
{
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
const int slot_count = ctx->n;
DeviceVector<Data64> temp_memory(slot_count + n, stream);
Data64* message_gpu = temp_memory.data();
Data64* temp_plain = message_gpu + slot_count;
gpuntt::ntt_rns_configuration<Data64> cfg_ntt = {
.n_power = n_power,
.ntt_type = gpuntt::FORWARD,
.ntt_layout = gpuntt::PerPolynomial,
.reduction_poly = gpuntt::ReductionPolynomial::X_N_plus,
.zero_padding = false,
.stream = stream};
gpuntt::GPU_NTT(plain.data(), temp_plain,
ctx->plain_ntt_tables_->data(),
ctx->plain_modulus2_->data(), cfg_ntt, 1, 1);
decode_kernel_bfv<<<dim3((n >> 8), 1, 1), 256, 0, stream>>>(
message_gpu, temp_plain, encoding_location_->data());
HEONGPU_CUDA_CHECK(cudaGetLastError());
unsigned_signed_convert<<<dim3((slot_count >> 8), 1, 1), 256, 0,
stream>>>(message_gpu, message_gpu,
ctx->plain_modulus2_->data());
HEONGPU_CUDA_CHECK(cudaGetLastError());
message.resize(slot_count);
cudaMemcpyAsync(message.data(), message_gpu,
slot_count * sizeof(int64_t), cudaMemcpyDeviceToHost,
stream);
HEONGPU_CUDA_CHECK(cudaGetLastError());
}
__host__ void
HEEncoder<Scheme::BFV>::decode_bfv(HostVector<uint64_t>& message,
Plaintext<Scheme::BFV>& plain,
const cudaStream_t stream)
{
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
const int slot_count = ctx->n;
DeviceVector<Data64> temp_memory(slot_count + n, stream);
Data64* message_gpu = temp_memory.data();
Data64* temp_plain = message_gpu + slot_count;
gpuntt::ntt_rns_configuration<Data64> cfg_ntt = {
.n_power = n_power,
.ntt_type = gpuntt::FORWARD,
.ntt_layout = gpuntt::PerPolynomial,
.reduction_poly = gpuntt::ReductionPolynomial::X_N_plus,
.zero_padding = false,
.stream = stream};
gpuntt::GPU_NTT(plain.data(), temp_plain,
ctx->plain_ntt_tables_->data(),
ctx->plain_modulus2_->data(), cfg_ntt, 1, 1);
decode_kernel_bfv<<<dim3((n >> 8), 1, 1), 256, 0, stream>>>(
message_gpu, temp_plain, encoding_location_->data());
HEONGPU_CUDA_CHECK(cudaGetLastError());
message.resize(slot_count);
cudaMemcpyAsync(message.data(), message_gpu,
slot_count * sizeof(uint64_t), cudaMemcpyDeviceToHost,
stream);
HEONGPU_CUDA_CHECK(cudaGetLastError());
}
__host__ void
HEEncoder<Scheme::BFV>::decode_bfv(HostVector<int64_t>& message,
Plaintext<Scheme::BFV>& plain,
const cudaStream_t stream)
{
const auto* ctx = context_.get();
const int n = ctx->n;
const int n_power = ctx->n_power;
const int slot_count = ctx->n;
DeviceVector<Data64> temp_memory(slot_count + n, stream);
Data64* message_gpu = temp_memory.data();
Data64* temp_plain = message_gpu + slot_count;
gpuntt::ntt_rns_configuration<Data64> cfg_ntt = {
.n_power = n_power,
.ntt_type = gpuntt::FORWARD,
.ntt_layout = gpuntt::PerPolynomial,
.reduction_poly = gpuntt::ReductionPolynomial::X_N_plus,
.zero_padding = false,
.stream = stream};
gpuntt::GPU_NTT(plain.data(), temp_plain,
ctx->plain_ntt_tables_->data(),
ctx->plain_modulus2_->data(), cfg_ntt, 1, 1);
decode_kernel_bfv<<<dim3((n >> 8), 1, 1), 256, 0, stream>>>(
message_gpu, temp_plain, encoding_location_->data());
HEONGPU_CUDA_CHECK(cudaGetLastError());
unsigned_signed_convert<<<dim3((slot_count >> 8), 1, 1), 256, 0,
stream>>>(message_gpu, message_gpu,
ctx->plain_modulus2_->data());
HEONGPU_CUDA_CHECK(cudaGetLastError());
message.resize(slot_count);
cudaMemcpyAsync(message.data(), message_gpu,
slot_count * sizeof(int64_t), cudaMemcpyDeviceToHost,
stream);
HEONGPU_CUDA_CHECK(cudaGetLastError());
}
} // namespace heongpu