ceremonyclient/emp-tool/test/f2k.cpp

#include "emp-tool/emp-tool.h"
#include <iostream>
using namespace std;
using namespace emp;

void mul128_0(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
	__m128i tmp3, tmp4, tmp5, tmp6;
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);
	// initial mul now in tmp3, tmp6
	*res1 = tmp3;
	*res2 = tmp6;
}

void mul128_1(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
	__m128i tmp3, tmp4, tmp5, tmp6;
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
	tmp4 = _mm_shuffle_epi32(a,78);
	tmp5 = _mm_shuffle_epi32(b,78);
	tmp4 = _mm_xor_si128(tmp4, a);
	tmp5 = _mm_xor_si128(tmp5, b);
	tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00);
	tmp4 = _mm_xor_si128(tmp4, tmp3);
	tmp4 = _mm_xor_si128(tmp4, tmp6);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	*res1 = _mm_xor_si128(tmp3, tmp5);
	*res2 = _mm_xor_si128(tmp6, tmp4);
}

__m128i reflect_xmm(__m128i X) {
	__m128i tmp1,tmp2;
	__m128i AND_MASK =
		_mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
	__m128i LOWER_MASK =
		_mm_set_epi32(0x0f070b03, 0x0d050901, 0x0e060a02, 0x0c040800);
	__m128i HIGHER_MASK =
		_mm_set_epi32(0xf070b030, 0xd0509010, 0xe060a020, 0xc0408000);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
	tmp2 = _mm_srli_epi16(X, 4);
	tmp1 = _mm_and_si128(X, AND_MASK);
	tmp2 = _mm_and_si128(tmp2, AND_MASK);
	tmp1 = _mm_shuffle_epi8(HIGHER_MASK ,tmp1); tmp2 = _mm_shuffle_epi8(LOWER_MASK ,tmp2); tmp1 = _mm_xor_si128(tmp1, tmp2);
	return _mm_shuffle_epi8(tmp1, BSWAP_MASK);
}

void gfmul_0(__m128i a, __m128i b, __m128i *res){
	__m128i tmp3, tmp4, tmp5, tmp6,
			  tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
	__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
	tmp4 = _mm_shuffle_epi32(a,78);
	tmp5 = _mm_shuffle_epi32(b,78);
	tmp4 = _mm_xor_si128(tmp4, a);
	tmp5 = _mm_xor_si128(tmp5, b);
	tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00);
	tmp4 = _mm_xor_si128(tmp4, tmp3);
	tmp4 = _mm_xor_si128(tmp4, tmp6);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);
	tmp7 = _mm_srli_epi32(tmp6, 31);
	tmp8 = _mm_srli_epi32(tmp6, 30);
	tmp9 = _mm_srli_epi32(tmp6, 25);
	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_shuffle_epi32(tmp7, 147);

	tmp7 = _mm_and_si128(XMMMASK, tmp8);
	tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp8);
	tmp6 = _mm_xor_si128(tmp6, tmp7);
	tmp10 = _mm_slli_epi32(tmp6, 1);
	tmp3 = _mm_xor_si128(tmp3, tmp10);
	tmp11 = _mm_slli_epi32(tmp6, 2);
	tmp3 = _mm_xor_si128(tmp3, tmp11);
	tmp12 = _mm_slli_epi32(tmp6, 7);
	tmp3 = _mm_xor_si128(tmp3, tmp12);

	*res = _mm_xor_si128(tmp3, tmp6);
}
void gfmul_1(__m128i a, __m128i b, __m128i *res){
	__m128i tmp3, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
	__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
	mul128_0(a, b, &tmp3, &tmp6);
	tmp7 = _mm_srli_epi32(tmp6, 31);
	tmp8 = _mm_srli_epi32(tmp6, 30);
	tmp9 = _mm_srli_epi32(tmp6, 25);
	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_shuffle_epi32(tmp7, 147);

	tmp7 = _mm_and_si128(XMMMASK, tmp8);
	tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp8);
	tmp6 = _mm_xor_si128(tmp6, tmp7);
	tmp10 = _mm_slli_epi32(tmp6, 1);
	tmp3 = _mm_xor_si128(tmp3, tmp10);
	tmp11 = _mm_slli_epi32(tmp6, 2);
	tmp3 = _mm_xor_si128(tmp3, tmp11);
	tmp12 = _mm_slli_epi32(tmp6, 7);
	tmp3 = _mm_xor_si128(tmp3, tmp12);

	*res = _mm_xor_si128(tmp3, tmp6);
}

void uni_hash_coeff_gen_batch_1(block* coeff, block seed, int sz) {
	coeff[0] = seed;
	for(int i = 1; i < sz; i++) {
		gfmul(coeff[i-1], seed, coeff+i);
	}
}

void uni_hash_coeff_gen_batch_2(block* coeff, block seed, int sz) {
	coeff[0] = seed;
	block multiplier;
	gfmul(seed, seed, &multiplier);
	coeff[1] = multiplier;
	for(int i = 2; i < sz; i+=2) {
		gfmul(coeff[i-2], multiplier, coeff+i);
		gfmul(coeff[i-1], multiplier, coeff+i+1);
	}
}

void uni_hash_coeff_gen_batch_4(block* coeff, block seed, int sz) {
	coeff[0] = seed;
	gfmul(seed, seed, &coeff[1]);
	gfmul(coeff[1], seed, &coeff[2]);
	block multiplier;
	gfmul(coeff[2], seed, &multiplier);
	coeff[3] = multiplier;
	for(int i = 4; i < sz; i+=4) {
		gfmul(coeff[i-4], multiplier, &coeff[i]);
		gfmul(coeff[i-3], multiplier, &coeff[i+1]);
		gfmul(coeff[i-2], multiplier, &coeff[i+2]);
		gfmul(coeff[i-1], multiplier, &coeff[i+3]);
	}
}

#define HASH_BLOCK_SZ 128

void hash1(block *ret, block r, block *a, int sz) {
	PRG prg;
	prg.reseed(&r);
	block *chi = new block[sz];
	prg.random_block(chi, sz);
	block r2=zero_block, tmp = zero_block;
	int i = 0;
	for(; i < sz/HASH_BLOCK_SZ; ++i) {
		vector_inn_prdt_sum_red<HASH_BLOCK_SZ>(&tmp, a+i*HASH_BLOCK_SZ, chi+i*HASH_BLOCK_SZ);
		r2 = r2 ^ tmp;
	}
	if(sz % HASH_BLOCK_SZ != 0)
		vector_inn_prdt_sum_red(&tmp, a+i*HASH_BLOCK_SZ, chi+i*HASH_BLOCK_SZ, sz%128);
	*ret = r2 ^ tmp;
	delete[] chi;
}

void hash2(block *ret, block r, block *a, int sz) {
	block *coeff = new block[sz];
	int i = 0;
	for(; i < sz/HASH_BLOCK_SZ; ++i) {
		uni_hash_coeff_gen<HASH_BLOCK_SZ>(coeff+i*HASH_BLOCK_SZ, r);
	}
	if(sz % HASH_BLOCK_SZ != 0)
		uni_hash_coeff_gen(coeff+i*HASH_BLOCK_SZ, r, sz%HASH_BLOCK_SZ);
	i = 0;
	block r2=zero_block, tmp = zero_block;
	for(; i < sz/HASH_BLOCK_SZ; ++i) {
		vector_inn_prdt_sum_red<HASH_BLOCK_SZ>(&tmp, a+i*HASH_BLOCK_SZ, coeff+i*HASH_BLOCK_SZ);
		r2 = r2 ^ tmp;
	}
	if(sz % HASH_BLOCK_SZ != 0)
		vector_inn_prdt_sum_red(&tmp, a+i*HASH_BLOCK_SZ, coeff+i*HASH_BLOCK_SZ, sz%HASH_BLOCK_SZ);
	*ret = r2 ^ tmp;
	delete[] coeff;
}

int main() {
	PRG prg;
	block a, b;
	block res[4];
	int bench_size = 1024*1024;

	cout << "Simplified GF2^128 multiplication ... ";

	for(int i = 0; i < 65536; ++i) {
		prg.random_block(&a, 1);
		prg.random_block(&b, 1);
		mul128_0(a, b, res, res+1);
		mul128_1(a, b, res+2, res+3);
		if(cmpBlock(res, res+2, 2) == false) {
			cout << "incorrect multiplication" << endl;
			abort();
		}
	}
	cout << "correct" << endl;

	cout << "Benchmark: ";
	auto start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		mul128_0(res[0], res[1], res+2, res+3);
		mul128_0(res[2], res[3], res, res+1);
	}
	cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "Benchmark: ";
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		mul128_1(res[0], res[1], res+2, res+3);
		mul128_1(res[2], res[3], res, res+1);
	}
	cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nSimplified GF2^128 multiplication ... ";

	for(int i = 0; i < 65536; ++i) {
		prg.random_block(&a, 1);
		prg.random_block(&b, 1);
		gfmul(a, b, res);
		gfmul_reflect(reflect_xmm(a), reflect_xmm(b), res+1);
		res[1] = reflect_xmm(res[1]);
		if(cmpBlock(res, res+1, 1) == false) {
			cout << "incorrect multiplication" << endl;
			abort();
		}
	}
	cout << "correct" << endl;

	cout << "Benchmark: ";
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		gfmul_0(res[0], res[1], res+2);
		gfmul_0(res[2], res[3], res);
	}
	cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "Benchmark: ";
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		gfmul_1(res[0], res[1], res+2);
		gfmul_1(res[2], res[3], res);
	}
	cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nVector inner product (with reduction)\n";
	block v0[128], v1[128];
	block r[2];
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		prg.random_block(v0, 128);
		prg.random_block(v1, 128);
		vector_inn_prdt_sum_red(r, v0, v1, 128);
	}
	cout << "Benchmark: PRG+MUL: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nVector inner product (without reduction)\n";
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		prg.random_block(v0, 128);
		prg.random_block(v1, 128);
		vector_inn_prdt_sum_no_red(r, v0, v1, 128);
	}
	cout << "Benchmark: PRG+MUL: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nUniversal hash coefficients (no batch)\n";
	block seed;
	block coefficient[128];
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		prg.random_block(&seed, 1);
		uni_hash_coeff_gen_batch_1(coefficient, seed, 128);
	}
	cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nUniversal hash coefficients (batch 2)\n";
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		prg.random_block(&seed, 1);
		uni_hash_coeff_gen_batch_2(coefficient, seed, 128);
	}
	cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nUniversal hash coefficients (batch 4)\n";
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		prg.random_block(&seed, 1);
		uni_hash_coeff_gen_batch_4(coefficient, seed, 128);
	}
	cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;

	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		prg.random_block(&seed, 1);
		uni_hash_coeff_gen(coefficient, seed, 126);
	}
	cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nGalois field packing\n";
	start = clock_start();
	GaloisFieldPacking pack;
	block data[128], res1[128];
	for(int i = 0; i < bench_size; ++i) {
		prg.random_block(data, 128);
		pack.packing(res1, data);
	}
	cout << "Benchmark: PRG+packing: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nXOR of all elements in a vector\n";
	start = clock_start();
	for(int i = 0; i < bench_size; ++i) {
		prg.random_block(data, 128);
		vector_self_xor(res1, data, 128);
	}
	cout << "Benchmark: PRG+XOR: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nHash method 1: inner product with Chi vector (PRG)\n";
	start = clock_start();
	int hash_len = 1024*128;
	block *to_hash = new block[hash_len];
	block ret;
	for(int i = 0; i < 1024; ++i) {
		prg.random_block(r, 1);
		prg.random_block(to_hash, hash_len);
		hash1(&ret, r[0], to_hash, hash_len);
	}
	cout << "Benchmark: PRG+XOR: " << 1024/(time_from(start))*1e6 << " operations/second" << endl;

	cout << "\nHash method 2: almost universal hash\n";
	start = clock_start();
	for(int i = 0; i < 1024; ++i) {
		prg.random_block(r, 1);
		prg.random_block(to_hash, hash_len);
		hash2(&ret, r[0], to_hash, hash_len);
	}
	cout << "Benchmark: PRG+XOR: " << 1024/(time_from(start))*1e6 << " operations/second" << endl;
	delete[] to_hash;

	return 0;
}