mirror of
https://github.com/QuilibriumNetwork/ceremonyclient.git
synced 2026-02-21 18:37:26 +08:00
* v2.1.0 [omit consensus and adjacent] - this commit will be amended with the full release after the file copy is complete * 2.1.0 main node rollup
352 lines
11 KiB
C++
352 lines
11 KiB
C++
#include "emp-tool/emp-tool.h"
|
|
#include <iostream>
|
|
using namespace std;
|
|
using namespace emp;
|
|
|
|
void mul128_0(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
|
|
__m128i tmp3, tmp4, tmp5, tmp6;
|
|
tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
|
|
tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
|
|
tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
|
|
tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
|
|
|
|
tmp4 = _mm_xor_si128(tmp4, tmp5);
|
|
tmp5 = _mm_slli_si128(tmp4, 8);
|
|
tmp4 = _mm_srli_si128(tmp4, 8);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp5);
|
|
tmp6 = _mm_xor_si128(tmp6, tmp4);
|
|
// initial mul now in tmp3, tmp6
|
|
*res1 = tmp3;
|
|
*res2 = tmp6;
|
|
}
|
|
|
|
void mul128_1(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
|
|
__m128i tmp3, tmp4, tmp5, tmp6;
|
|
tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
|
|
tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
|
|
tmp4 = _mm_shuffle_epi32(a,78);
|
|
tmp5 = _mm_shuffle_epi32(b,78);
|
|
tmp4 = _mm_xor_si128(tmp4, a);
|
|
tmp5 = _mm_xor_si128(tmp5, b);
|
|
tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00);
|
|
tmp4 = _mm_xor_si128(tmp4, tmp3);
|
|
tmp4 = _mm_xor_si128(tmp4, tmp6);
|
|
tmp5 = _mm_slli_si128(tmp4, 8);
|
|
tmp4 = _mm_srli_si128(tmp4, 8);
|
|
*res1 = _mm_xor_si128(tmp3, tmp5);
|
|
*res2 = _mm_xor_si128(tmp6, tmp4);
|
|
}
|
|
|
|
__m128i reflect_xmm(__m128i X) {
|
|
__m128i tmp1,tmp2;
|
|
__m128i AND_MASK =
|
|
_mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
|
|
__m128i LOWER_MASK =
|
|
_mm_set_epi32(0x0f070b03, 0x0d050901, 0x0e060a02, 0x0c040800);
|
|
__m128i HIGHER_MASK =
|
|
_mm_set_epi32(0xf070b030, 0xd0509010, 0xe060a020, 0xc0408000);
|
|
__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
|
|
tmp2 = _mm_srli_epi16(X, 4);
|
|
tmp1 = _mm_and_si128(X, AND_MASK);
|
|
tmp2 = _mm_and_si128(tmp2, AND_MASK);
|
|
tmp1 = _mm_shuffle_epi8(HIGHER_MASK ,tmp1); tmp2 = _mm_shuffle_epi8(LOWER_MASK ,tmp2); tmp1 = _mm_xor_si128(tmp1, tmp2);
|
|
return _mm_shuffle_epi8(tmp1, BSWAP_MASK);
|
|
}
|
|
|
|
void gfmul_0(__m128i a, __m128i b, __m128i *res){
|
|
__m128i tmp3, tmp4, tmp5, tmp6,
|
|
tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
|
|
__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
|
|
tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
|
|
tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
|
|
tmp4 = _mm_shuffle_epi32(a,78);
|
|
tmp5 = _mm_shuffle_epi32(b,78);
|
|
tmp4 = _mm_xor_si128(tmp4, a);
|
|
tmp5 = _mm_xor_si128(tmp5, b);
|
|
tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00);
|
|
tmp4 = _mm_xor_si128(tmp4, tmp3);
|
|
tmp4 = _mm_xor_si128(tmp4, tmp6);
|
|
tmp5 = _mm_slli_si128(tmp4, 8);
|
|
tmp4 = _mm_srli_si128(tmp4, 8);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp5);
|
|
tmp6 = _mm_xor_si128(tmp6, tmp4);
|
|
tmp7 = _mm_srli_epi32(tmp6, 31);
|
|
tmp8 = _mm_srli_epi32(tmp6, 30);
|
|
tmp9 = _mm_srli_epi32(tmp6, 25);
|
|
tmp7 = _mm_xor_si128(tmp7, tmp8);
|
|
tmp7 = _mm_xor_si128(tmp7, tmp9);
|
|
tmp8 = _mm_shuffle_epi32(tmp7, 147);
|
|
|
|
tmp7 = _mm_and_si128(XMMMASK, tmp8);
|
|
tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp8);
|
|
tmp6 = _mm_xor_si128(tmp6, tmp7);
|
|
tmp10 = _mm_slli_epi32(tmp6, 1);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp10);
|
|
tmp11 = _mm_slli_epi32(tmp6, 2);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp11);
|
|
tmp12 = _mm_slli_epi32(tmp6, 7);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp12);
|
|
|
|
*res = _mm_xor_si128(tmp3, tmp6);
|
|
}
|
|
void gfmul_1(__m128i a, __m128i b, __m128i *res){
|
|
__m128i tmp3, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
|
|
__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
|
|
mul128_0(a, b, &tmp3, &tmp6);
|
|
tmp7 = _mm_srli_epi32(tmp6, 31);
|
|
tmp8 = _mm_srli_epi32(tmp6, 30);
|
|
tmp9 = _mm_srli_epi32(tmp6, 25);
|
|
tmp7 = _mm_xor_si128(tmp7, tmp8);
|
|
tmp7 = _mm_xor_si128(tmp7, tmp9);
|
|
tmp8 = _mm_shuffle_epi32(tmp7, 147);
|
|
|
|
tmp7 = _mm_and_si128(XMMMASK, tmp8);
|
|
tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp8);
|
|
tmp6 = _mm_xor_si128(tmp6, tmp7);
|
|
tmp10 = _mm_slli_epi32(tmp6, 1);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp10);
|
|
tmp11 = _mm_slli_epi32(tmp6, 2);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp11);
|
|
tmp12 = _mm_slli_epi32(tmp6, 7);
|
|
tmp3 = _mm_xor_si128(tmp3, tmp12);
|
|
|
|
*res = _mm_xor_si128(tmp3, tmp6);
|
|
}
|
|
|
|
void uni_hash_coeff_gen_batch_1(block* coeff, block seed, int sz) {
|
|
coeff[0] = seed;
|
|
for(int i = 1; i < sz; i++) {
|
|
gfmul(coeff[i-1], seed, coeff+i);
|
|
}
|
|
}
|
|
|
|
void uni_hash_coeff_gen_batch_2(block* coeff, block seed, int sz) {
|
|
coeff[0] = seed;
|
|
block multiplier;
|
|
gfmul(seed, seed, &multiplier);
|
|
coeff[1] = multiplier;
|
|
for(int i = 2; i < sz; i+=2) {
|
|
gfmul(coeff[i-2], multiplier, coeff+i);
|
|
gfmul(coeff[i-1], multiplier, coeff+i+1);
|
|
}
|
|
}
|
|
|
|
void uni_hash_coeff_gen_batch_4(block* coeff, block seed, int sz) {
|
|
coeff[0] = seed;
|
|
gfmul(seed, seed, &coeff[1]);
|
|
gfmul(coeff[1], seed, &coeff[2]);
|
|
block multiplier;
|
|
gfmul(coeff[2], seed, &multiplier);
|
|
coeff[3] = multiplier;
|
|
for(int i = 4; i < sz; i+=4) {
|
|
gfmul(coeff[i-4], multiplier, &coeff[i]);
|
|
gfmul(coeff[i-3], multiplier, &coeff[i+1]);
|
|
gfmul(coeff[i-2], multiplier, &coeff[i+2]);
|
|
gfmul(coeff[i-1], multiplier, &coeff[i+3]);
|
|
}
|
|
}
|
|
|
|
#define HASH_BLOCK_SZ 128
|
|
|
|
void hash1(block *ret, block r, block *a, int sz) {
|
|
PRG prg;
|
|
prg.reseed(&r);
|
|
block *chi = new block[sz];
|
|
prg.random_block(chi, sz);
|
|
block r2=zero_block, tmp = zero_block;
|
|
int i = 0;
|
|
for(; i < sz/HASH_BLOCK_SZ; ++i) {
|
|
vector_inn_prdt_sum_red<HASH_BLOCK_SZ>(&tmp, a+i*HASH_BLOCK_SZ, chi+i*HASH_BLOCK_SZ);
|
|
r2 = r2 ^ tmp;
|
|
}
|
|
if(sz % HASH_BLOCK_SZ != 0)
|
|
vector_inn_prdt_sum_red(&tmp, a+i*HASH_BLOCK_SZ, chi+i*HASH_BLOCK_SZ, sz%128);
|
|
*ret = r2 ^ tmp;
|
|
delete[] chi;
|
|
}
|
|
|
|
void hash2(block *ret, block r, block *a, int sz) {
|
|
block *coeff = new block[sz];
|
|
int i = 0;
|
|
for(; i < sz/HASH_BLOCK_SZ; ++i) {
|
|
uni_hash_coeff_gen<HASH_BLOCK_SZ>(coeff+i*HASH_BLOCK_SZ, r);
|
|
}
|
|
if(sz % HASH_BLOCK_SZ != 0)
|
|
uni_hash_coeff_gen(coeff+i*HASH_BLOCK_SZ, r, sz%HASH_BLOCK_SZ);
|
|
i = 0;
|
|
block r2=zero_block, tmp = zero_block;
|
|
for(; i < sz/HASH_BLOCK_SZ; ++i) {
|
|
vector_inn_prdt_sum_red<HASH_BLOCK_SZ>(&tmp, a+i*HASH_BLOCK_SZ, coeff+i*HASH_BLOCK_SZ);
|
|
r2 = r2 ^ tmp;
|
|
}
|
|
if(sz % HASH_BLOCK_SZ != 0)
|
|
vector_inn_prdt_sum_red(&tmp, a+i*HASH_BLOCK_SZ, coeff+i*HASH_BLOCK_SZ, sz%HASH_BLOCK_SZ);
|
|
*ret = r2 ^ tmp;
|
|
delete[] coeff;
|
|
}
|
|
|
|
int main() {
|
|
PRG prg;
|
|
block a, b;
|
|
block res[4];
|
|
int bench_size = 1024*1024;
|
|
|
|
cout << "Simplified GF2^128 multiplication ... ";
|
|
|
|
for(int i = 0; i < 65536; ++i) {
|
|
prg.random_block(&a, 1);
|
|
prg.random_block(&b, 1);
|
|
mul128_0(a, b, res, res+1);
|
|
mul128_1(a, b, res+2, res+3);
|
|
if(cmpBlock(res, res+2, 2) == false) {
|
|
cout << "incorrect multiplication" << endl;
|
|
abort();
|
|
}
|
|
}
|
|
cout << "correct" << endl;
|
|
|
|
cout << "Benchmark: ";
|
|
auto start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
mul128_0(res[0], res[1], res+2, res+3);
|
|
mul128_0(res[2], res[3], res, res+1);
|
|
}
|
|
cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "Benchmark: ";
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
mul128_1(res[0], res[1], res+2, res+3);
|
|
mul128_1(res[2], res[3], res, res+1);
|
|
}
|
|
cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nSimplified GF2^128 multiplication ... ";
|
|
|
|
for(int i = 0; i < 65536; ++i) {
|
|
prg.random_block(&a, 1);
|
|
prg.random_block(&b, 1);
|
|
gfmul(a, b, res);
|
|
gfmul_reflect(reflect_xmm(a), reflect_xmm(b), res+1);
|
|
res[1] = reflect_xmm(res[1]);
|
|
if(cmpBlock(res, res+1, 1) == false) {
|
|
cout << "incorrect multiplication" << endl;
|
|
abort();
|
|
}
|
|
}
|
|
cout << "correct" << endl;
|
|
|
|
cout << "Benchmark: ";
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
gfmul_0(res[0], res[1], res+2);
|
|
gfmul_0(res[2], res[3], res);
|
|
}
|
|
cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "Benchmark: ";
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
gfmul_1(res[0], res[1], res+2);
|
|
gfmul_1(res[2], res[3], res);
|
|
}
|
|
cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nVector inner product (with reduction)\n";
|
|
block v0[128], v1[128];
|
|
block r[2];
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
prg.random_block(v0, 128);
|
|
prg.random_block(v1, 128);
|
|
vector_inn_prdt_sum_red(r, v0, v1, 128);
|
|
}
|
|
cout << "Benchmark: PRG+MUL: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nVector inner product (without reduction)\n";
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
prg.random_block(v0, 128);
|
|
prg.random_block(v1, 128);
|
|
vector_inn_prdt_sum_no_red(r, v0, v1, 128);
|
|
}
|
|
cout << "Benchmark: PRG+MUL: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nUniversal hash coefficients (no batch)\n";
|
|
block seed;
|
|
block coefficient[128];
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
prg.random_block(&seed, 1);
|
|
uni_hash_coeff_gen_batch_1(coefficient, seed, 128);
|
|
}
|
|
cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nUniversal hash coefficients (batch 2)\n";
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
prg.random_block(&seed, 1);
|
|
uni_hash_coeff_gen_batch_2(coefficient, seed, 128);
|
|
}
|
|
cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nUniversal hash coefficients (batch 4)\n";
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
prg.random_block(&seed, 1);
|
|
uni_hash_coeff_gen_batch_4(coefficient, seed, 128);
|
|
}
|
|
cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
prg.random_block(&seed, 1);
|
|
uni_hash_coeff_gen(coefficient, seed, 126);
|
|
}
|
|
cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nGalois field packing\n";
|
|
start = clock_start();
|
|
GaloisFieldPacking pack;
|
|
block data[128], res1[128];
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
prg.random_block(data, 128);
|
|
pack.packing(res1, data);
|
|
}
|
|
cout << "Benchmark: PRG+packing: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nXOR of all elements in a vector\n";
|
|
start = clock_start();
|
|
for(int i = 0; i < bench_size; ++i) {
|
|
prg.random_block(data, 128);
|
|
vector_self_xor(res1, data, 128);
|
|
}
|
|
cout << "Benchmark: PRG+XOR: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nHash method 1: inner product with Chi vector (PRG)\n";
|
|
start = clock_start();
|
|
int hash_len = 1024*128;
|
|
block *to_hash = new block[hash_len];
|
|
block ret;
|
|
for(int i = 0; i < 1024; ++i) {
|
|
prg.random_block(r, 1);
|
|
prg.random_block(to_hash, hash_len);
|
|
hash1(&ret, r[0], to_hash, hash_len);
|
|
}
|
|
cout << "Benchmark: PRG+XOR: " << 1024/(time_from(start))*1e6 << " operations/second" << endl;
|
|
|
|
cout << "\nHash method 2: almost universal hash\n";
|
|
start = clock_start();
|
|
for(int i = 0; i < 1024; ++i) {
|
|
prg.random_block(r, 1);
|
|
prg.random_block(to_hash, hash_len);
|
|
hash2(&ret, r[0], to_hash, hash_len);
|
|
}
|
|
cout << "Benchmark: PRG+XOR: " << 1024/(time_from(start))*1e6 << " operations/second" << endl;
|
|
delete[] to_hash;
|
|
|
|
return 0;
|
|
}
|