#include "emp-tool/emp-tool.h" #include using namespace std; using namespace emp; void mul128_0(__m128i a, __m128i b, __m128i *res1, __m128i *res2) { __m128i tmp3, tmp4, tmp5, tmp6; tmp3 = _mm_clmulepi64_si128(a, b, 0x00); tmp4 = _mm_clmulepi64_si128(a, b, 0x10); tmp5 = _mm_clmulepi64_si128(a, b, 0x01); tmp6 = _mm_clmulepi64_si128(a, b, 0x11); tmp4 = _mm_xor_si128(tmp4, tmp5); tmp5 = _mm_slli_si128(tmp4, 8); tmp4 = _mm_srli_si128(tmp4, 8); tmp3 = _mm_xor_si128(tmp3, tmp5); tmp6 = _mm_xor_si128(tmp6, tmp4); // initial mul now in tmp3, tmp6 *res1 = tmp3; *res2 = tmp6; } void mul128_1(__m128i a, __m128i b, __m128i *res1, __m128i *res2) { __m128i tmp3, tmp4, tmp5, tmp6; tmp3 = _mm_clmulepi64_si128(a, b, 0x00); tmp6 = _mm_clmulepi64_si128(a, b, 0x11); tmp4 = _mm_shuffle_epi32(a,78); tmp5 = _mm_shuffle_epi32(b,78); tmp4 = _mm_xor_si128(tmp4, a); tmp5 = _mm_xor_si128(tmp5, b); tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00); tmp4 = _mm_xor_si128(tmp4, tmp3); tmp4 = _mm_xor_si128(tmp4, tmp6); tmp5 = _mm_slli_si128(tmp4, 8); tmp4 = _mm_srli_si128(tmp4, 8); *res1 = _mm_xor_si128(tmp3, tmp5); *res2 = _mm_xor_si128(tmp6, tmp4); } __m128i reflect_xmm(__m128i X) { __m128i tmp1,tmp2; __m128i AND_MASK = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f); __m128i LOWER_MASK = _mm_set_epi32(0x0f070b03, 0x0d050901, 0x0e060a02, 0x0c040800); __m128i HIGHER_MASK = _mm_set_epi32(0xf070b030, 0xd0509010, 0xe060a020, 0xc0408000); __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); tmp2 = _mm_srli_epi16(X, 4); tmp1 = _mm_and_si128(X, AND_MASK); tmp2 = _mm_and_si128(tmp2, AND_MASK); tmp1 = _mm_shuffle_epi8(HIGHER_MASK ,tmp1); tmp2 = _mm_shuffle_epi8(LOWER_MASK ,tmp2); tmp1 = _mm_xor_si128(tmp1, tmp2); return _mm_shuffle_epi8(tmp1, BSWAP_MASK); } void gfmul_0(__m128i a, __m128i b, __m128i *res){ __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12; __m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0); tmp3 = _mm_clmulepi64_si128(a, b, 0x00); tmp6 = _mm_clmulepi64_si128(a, b, 0x11); tmp4 = _mm_shuffle_epi32(a,78); tmp5 = _mm_shuffle_epi32(b,78); tmp4 = _mm_xor_si128(tmp4, a); tmp5 = _mm_xor_si128(tmp5, b); tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00); tmp4 = _mm_xor_si128(tmp4, tmp3); tmp4 = _mm_xor_si128(tmp4, tmp6); tmp5 = _mm_slli_si128(tmp4, 8); tmp4 = _mm_srli_si128(tmp4, 8); tmp3 = _mm_xor_si128(tmp3, tmp5); tmp6 = _mm_xor_si128(tmp6, tmp4); tmp7 = _mm_srli_epi32(tmp6, 31); tmp8 = _mm_srli_epi32(tmp6, 30); tmp9 = _mm_srli_epi32(tmp6, 25); tmp7 = _mm_xor_si128(tmp7, tmp8); tmp7 = _mm_xor_si128(tmp7, tmp9); tmp8 = _mm_shuffle_epi32(tmp7, 147); tmp7 = _mm_and_si128(XMMMASK, tmp8); tmp8 = _mm_andnot_si128(XMMMASK, tmp8); tmp3 = _mm_xor_si128(tmp3, tmp8); tmp6 = _mm_xor_si128(tmp6, tmp7); tmp10 = _mm_slli_epi32(tmp6, 1); tmp3 = _mm_xor_si128(tmp3, tmp10); tmp11 = _mm_slli_epi32(tmp6, 2); tmp3 = _mm_xor_si128(tmp3, tmp11); tmp12 = _mm_slli_epi32(tmp6, 7); tmp3 = _mm_xor_si128(tmp3, tmp12); *res = _mm_xor_si128(tmp3, tmp6); } void gfmul_1(__m128i a, __m128i b, __m128i *res){ __m128i tmp3, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12; __m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0); mul128_0(a, b, &tmp3, &tmp6); tmp7 = _mm_srli_epi32(tmp6, 31); tmp8 = _mm_srli_epi32(tmp6, 30); tmp9 = _mm_srli_epi32(tmp6, 25); tmp7 = _mm_xor_si128(tmp7, tmp8); tmp7 = _mm_xor_si128(tmp7, tmp9); tmp8 = _mm_shuffle_epi32(tmp7, 147); tmp7 = _mm_and_si128(XMMMASK, tmp8); tmp8 = _mm_andnot_si128(XMMMASK, tmp8); tmp3 = _mm_xor_si128(tmp3, tmp8); tmp6 = _mm_xor_si128(tmp6, tmp7); tmp10 = _mm_slli_epi32(tmp6, 1); tmp3 = _mm_xor_si128(tmp3, tmp10); tmp11 = _mm_slli_epi32(tmp6, 2); tmp3 = _mm_xor_si128(tmp3, tmp11); tmp12 = _mm_slli_epi32(tmp6, 7); tmp3 = _mm_xor_si128(tmp3, tmp12); *res = _mm_xor_si128(tmp3, tmp6); } void uni_hash_coeff_gen_batch_1(block* coeff, block seed, int sz) { coeff[0] = seed; for(int i = 1; i < sz; i++) { gfmul(coeff[i-1], seed, coeff+i); } } void uni_hash_coeff_gen_batch_2(block* coeff, block seed, int sz) { coeff[0] = seed; block multiplier; gfmul(seed, seed, &multiplier); coeff[1] = multiplier; for(int i = 2; i < sz; i+=2) { gfmul(coeff[i-2], multiplier, coeff+i); gfmul(coeff[i-1], multiplier, coeff+i+1); } } void uni_hash_coeff_gen_batch_4(block* coeff, block seed, int sz) { coeff[0] = seed; gfmul(seed, seed, &coeff[1]); gfmul(coeff[1], seed, &coeff[2]); block multiplier; gfmul(coeff[2], seed, &multiplier); coeff[3] = multiplier; for(int i = 4; i < sz; i+=4) { gfmul(coeff[i-4], multiplier, &coeff[i]); gfmul(coeff[i-3], multiplier, &coeff[i+1]); gfmul(coeff[i-2], multiplier, &coeff[i+2]); gfmul(coeff[i-1], multiplier, &coeff[i+3]); } } #define HASH_BLOCK_SZ 128 void hash1(block *ret, block r, block *a, int sz) { PRG prg; prg.reseed(&r); block *chi = new block[sz]; prg.random_block(chi, sz); block r2=zero_block, tmp = zero_block; int i = 0; for(; i < sz/HASH_BLOCK_SZ; ++i) { vector_inn_prdt_sum_red(&tmp, a+i*HASH_BLOCK_SZ, chi+i*HASH_BLOCK_SZ); r2 = r2 ^ tmp; } if(sz % HASH_BLOCK_SZ != 0) vector_inn_prdt_sum_red(&tmp, a+i*HASH_BLOCK_SZ, chi+i*HASH_BLOCK_SZ, sz%128); *ret = r2 ^ tmp; delete[] chi; } void hash2(block *ret, block r, block *a, int sz) { block *coeff = new block[sz]; int i = 0; for(; i < sz/HASH_BLOCK_SZ; ++i) { uni_hash_coeff_gen(coeff+i*HASH_BLOCK_SZ, r); } if(sz % HASH_BLOCK_SZ != 0) uni_hash_coeff_gen(coeff+i*HASH_BLOCK_SZ, r, sz%HASH_BLOCK_SZ); i = 0; block r2=zero_block, tmp = zero_block; for(; i < sz/HASH_BLOCK_SZ; ++i) { vector_inn_prdt_sum_red(&tmp, a+i*HASH_BLOCK_SZ, coeff+i*HASH_BLOCK_SZ); r2 = r2 ^ tmp; } if(sz % HASH_BLOCK_SZ != 0) vector_inn_prdt_sum_red(&tmp, a+i*HASH_BLOCK_SZ, coeff+i*HASH_BLOCK_SZ, sz%HASH_BLOCK_SZ); *ret = r2 ^ tmp; delete[] coeff; } int main() { PRG prg; block a, b; block res[4]; int bench_size = 1024*1024; cout << "Simplified GF2^128 multiplication ... "; for(int i = 0; i < 65536; ++i) { prg.random_block(&a, 1); prg.random_block(&b, 1); mul128_0(a, b, res, res+1); mul128_1(a, b, res+2, res+3); if(cmpBlock(res, res+2, 2) == false) { cout << "incorrect multiplication" << endl; abort(); } } cout << "correct" << endl; cout << "Benchmark: "; auto start = clock_start(); for(int i = 0; i < bench_size; ++i) { mul128_0(res[0], res[1], res+2, res+3); mul128_0(res[2], res[3], res, res+1); } cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl; cout << "Benchmark: "; start = clock_start(); for(int i = 0; i < bench_size; ++i) { mul128_1(res[0], res[1], res+2, res+3); mul128_1(res[2], res[3], res, res+1); } cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nSimplified GF2^128 multiplication ... "; for(int i = 0; i < 65536; ++i) { prg.random_block(&a, 1); prg.random_block(&b, 1); gfmul(a, b, res); gfmul_reflect(reflect_xmm(a), reflect_xmm(b), res+1); res[1] = reflect_xmm(res[1]); if(cmpBlock(res, res+1, 1) == false) { cout << "incorrect multiplication" << endl; abort(); } } cout << "correct" << endl; cout << "Benchmark: "; start = clock_start(); for(int i = 0; i < bench_size; ++i) { gfmul_0(res[0], res[1], res+2); gfmul_0(res[2], res[3], res); } cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl; cout << "Benchmark: "; start = clock_start(); for(int i = 0; i < bench_size; ++i) { gfmul_1(res[0], res[1], res+2); gfmul_1(res[2], res[3], res); } cout << bench_size*2/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nVector inner product (with reduction)\n"; block v0[128], v1[128]; block r[2]; start = clock_start(); for(int i = 0; i < bench_size; ++i) { prg.random_block(v0, 128); prg.random_block(v1, 128); vector_inn_prdt_sum_red(r, v0, v1, 128); } cout << "Benchmark: PRG+MUL: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nVector inner product (without reduction)\n"; start = clock_start(); for(int i = 0; i < bench_size; ++i) { prg.random_block(v0, 128); prg.random_block(v1, 128); vector_inn_prdt_sum_no_red(r, v0, v1, 128); } cout << "Benchmark: PRG+MUL: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nUniversal hash coefficients (no batch)\n"; block seed; block coefficient[128]; start = clock_start(); for(int i = 0; i < bench_size; ++i) { prg.random_block(&seed, 1); uni_hash_coeff_gen_batch_1(coefficient, seed, 128); } cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nUniversal hash coefficients (batch 2)\n"; start = clock_start(); for(int i = 0; i < bench_size; ++i) { prg.random_block(&seed, 1); uni_hash_coeff_gen_batch_2(coefficient, seed, 128); } cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nUniversal hash coefficients (batch 4)\n"; start = clock_start(); for(int i = 0; i < bench_size; ++i) { prg.random_block(&seed, 1); uni_hash_coeff_gen_batch_4(coefficient, seed, 128); } cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl; start = clock_start(); for(int i = 0; i < bench_size; ++i) { prg.random_block(&seed, 1); uni_hash_coeff_gen(coefficient, seed, 126); } cout << "Benchmark: PRG+Coefficient generation: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nGalois field packing\n"; start = clock_start(); GaloisFieldPacking pack; block data[128], res1[128]; for(int i = 0; i < bench_size; ++i) { prg.random_block(data, 128); pack.packing(res1, data); } cout << "Benchmark: PRG+packing: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nXOR of all elements in a vector\n"; start = clock_start(); for(int i = 0; i < bench_size; ++i) { prg.random_block(data, 128); vector_self_xor(res1, data, 128); } cout << "Benchmark: PRG+XOR: " << bench_size/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nHash method 1: inner product with Chi vector (PRG)\n"; start = clock_start(); int hash_len = 1024*128; block *to_hash = new block[hash_len]; block ret; for(int i = 0; i < 1024; ++i) { prg.random_block(r, 1); prg.random_block(to_hash, hash_len); hash1(&ret, r[0], to_hash, hash_len); } cout << "Benchmark: PRG+XOR: " << 1024/(time_from(start))*1e6 << " operations/second" << endl; cout << "\nHash method 2: almost universal hash\n"; start = clock_start(); for(int i = 0; i < 1024; ++i) { prg.random_block(r, 1); prg.random_block(to_hash, hash_len); hash2(&ret, r[0], to_hash, hash_len); } cout << "Benchmark: PRG+XOR: " << 1024/(time_from(start))*1e6 << " operations/second" << endl; delete[] to_hash; return 0; }