338 lines
13 KiB
Diff
338 lines
13 KiB
Diff
From b2f3a6407d2d6ec89522410d7ac4c56d310c92b1 Mon Sep 17 00:00:00 2001
|
|
From: Daiki Ueno <dueno@redhat.com>
|
|
Date: Mon, 18 Sep 2017 11:24:00 +0200
|
|
Subject: [PATCH] freebl: Reorganize AES-GCM source code based on hw/sw
|
|
implementation
|
|
|
|
diff --git a/lib/freebl/gcm-hw.c b/lib/freebl/gcm-hw.c
|
|
new file mode 100644
|
|
--- /dev/null
|
|
+++ b/lib/freebl/gcm-hw.c
|
|
@@ -0,0 +1,151 @@
|
|
+/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
+
|
|
+#ifdef FREEBL_NO_DEPEND
|
|
+#include "stubs.h"
|
|
+#endif
|
|
+#include "gcm.h"
|
|
+#include "secerr.h"
|
|
+
|
|
+#ifdef NSS_X86_OR_X64
|
|
+#include <wmmintrin.h> /* clmul */
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * Store the 64-bit value x into bytes[0..7], most significant byte first.
|
|
+ * Both arguments are expanded more than once, so they must be free of side
|
|
+ * effects. The do/while (0) wrapper makes the expansion a single statement;
|
|
+ * the caller supplies the terminating semicolon.
|
|
+ */
|
|
+#define WRITE64(x, bytes)       \
|
|
+    do {                        \
|
|
+        (bytes)[0] = (x) >> 56; \
|
|
+        (bytes)[1] = (x) >> 48; \
|
|
+        (bytes)[2] = (x) >> 40; \
|
|
+        (bytes)[3] = (x) >> 32; \
|
|
+        (bytes)[4] = (x) >> 24; \
|
|
+        (bytes)[5] = (x) >> 16; \
|
|
+        (bytes)[6] = (x) >> 8;  \
|
|
+        (bytes)[7] = (x);       \
|
|
+    } while (0)
|
|
+
|
|
+SECStatus
|
|
+gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf,
|
|
+ unsigned int maxout)
|
|
+{
|
|
+#ifdef NSS_X86_OR_X64
|
|
+ uint64_t tmp_out[2];
|
|
+ _mm_storeu_si128((__m128i *)tmp_out, ghash->x);
|
|
+ PORT_Assert(maxout >= 16);
|
|
+ WRITE64(tmp_out[0], outbuf + 8);
|
|
+ WRITE64(tmp_out[1], outbuf);
|
|
+ return SECSuccess;
|
|
+#else
|
|
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
|
|
+ return SECFailure;
|
|
+#endif /* NSS_X86_OR_X64 */
|
|
+}
|
|
+
|
|
+SECStatus
|
|
+gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
|
|
+ unsigned int count)
|
|
+{
|
|
+#ifdef NSS_X86_OR_X64
|
|
+ size_t i;
|
|
+ pre_align __m128i z_high post_align;
|
|
+ pre_align __m128i z_low post_align;
|
|
+ pre_align __m128i C post_align;
|
|
+ pre_align __m128i D post_align;
|
|
+ pre_align __m128i E post_align;
|
|
+ pre_align __m128i F post_align;
|
|
+ pre_align __m128i bin post_align;
|
|
+ pre_align __m128i Ci post_align;
|
|
+ pre_align __m128i tmp post_align;
|
|
+
|
|
+ for (i = 0; i < count; i++, buf += 16) {
|
|
+ bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
|
|
+ ((uint16_t)buf[2] << 8) | buf[3],
|
|
+ ((uint16_t)buf[4] << 8) | buf[5],
|
|
+ ((uint16_t)buf[6] << 8) | buf[7],
|
|
+ ((uint16_t)buf[8] << 8) | buf[9],
|
|
+ ((uint16_t)buf[10] << 8) | buf[11],
|
|
+ ((uint16_t)buf[12] << 8) | buf[13],
|
|
+ ((uint16_t)buf[14] << 8) | buf[15]);
|
|
+ Ci = _mm_xor_si128(bin, ghash->x);
|
|
+
|
|
+ /* Do binary mult ghash->X = Ci * ghash->H. */
|
|
+ C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
|
|
+ D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
|
|
+ E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
|
|
+ F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
|
|
+ tmp = _mm_xor_si128(E, F);
|
|
+ z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
|
|
+ z_high = _mm_unpackhi_epi64(z_high, D);
|
|
+ z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
|
|
+ z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);
|
|
+
|
|
+ /* Shift left by one bit (multiply by x) to compensate for GCM's bit-reflected representation of field elements. */
|
|
+ C = _mm_slli_si128(z_low, 8);
|
|
+ E = _mm_srli_epi64(C, 63);
|
|
+ D = _mm_slli_si128(z_high, 8);
|
|
+ F = _mm_srli_epi64(D, 63);
|
|
+ /* Carry over */
|
|
+ C = _mm_srli_si128(z_low, 8);
|
|
+ D = _mm_srli_epi64(C, 63);
|
|
+ z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
|
|
+ z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
|
|
+
|
|
+ /* Reduce */
|
|
+ C = _mm_slli_si128(z_low, 8);
|
|
+ /* D = z_low << 127 */
|
|
+ D = _mm_slli_epi64(C, 63);
|
|
+ /* E = z_low << 126 */
|
|
+ E = _mm_slli_epi64(C, 62);
|
|
+ /* F = z_low << 121 */
|
|
+ F = _mm_slli_epi64(C, 57);
|
|
+ /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
|
|
+ z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
|
|
+ C = _mm_srli_si128(z_low, 8);
|
|
+ /* D = z_low >> 1 */
|
|
+ D = _mm_slli_epi64(C, 63);
|
|
+ D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
|
|
+ /* E = z_low >> 2 */
|
|
+ E = _mm_slli_epi64(C, 62);
|
|
+ E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
|
|
+ /* F = z_low >> 7 */
|
|
+ F = _mm_slli_epi64(C, 57);
|
|
+ F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
|
|
+ /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
|
|
+ ghash->x = _mm_xor_si128(_mm_xor_si128(
|
|
+ _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
|
|
+ F);
|
|
+ }
|
|
+ return SECSuccess;
|
|
+#else
|
|
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
|
|
+ return SECFailure;
|
|
+#endif /* NSS_X86_OR_X64 */
|
|
+}
|
|
+
|
|
+SECStatus
|
|
+gcm_HashInit_hw(gcmHashContext *ghash)
|
|
+{
|
|
+#ifdef NSS_X86_OR_X64
|
|
+ ghash->ghash_mul = gcm_HashMult_hw;
|
|
+ ghash->x = _mm_setzero_si128();
|
|
+ /* MSVC requires __m64 to load epi64. */
|
|
+ ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
|
|
+ ghash->h_low >> 32, (uint32_t)ghash->h_low);
|
|
+ ghash->hw = PR_TRUE;
|
|
+ return SECSuccess;
|
|
+#else
|
|
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
|
|
+ return SECFailure;
|
|
+#endif /* NSS_X86_OR_X64 */
|
|
+}
|
|
+
|
|
+SECStatus
|
|
+gcm_HashZeroX_hw(gcmHashContext *ghash)
|
|
+{
|
|
+#ifdef NSS_X86_OR_X64
|
|
+ ghash->x = _mm_setzero_si128();
|
|
+ return SECSuccess;
|
|
+#else
|
|
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
|
|
+ return SECFailure;
|
|
+#endif /* NSS_X86_OR_X64 */
|
|
+}
|
|
+
|
|
diff --git a/lib/freebl/rijndael-hw.c b/lib/freebl/rijndael-hw.c
|
|
new file mode 100644
|
|
--- /dev/null
|
|
+++ b/lib/freebl/rijndael-hw.c
|
|
@@ -0,0 +1,170 @@
|
|
+/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
+
|
|
+#ifdef FREEBL_NO_DEPEND
|
|
+#include "stubs.h"
|
|
+#endif
|
|
+#include "rijndael.h"
|
|
+#include "secerr.h"
|
|
+
|
|
+#ifdef NSS_X86_OR_X64
|
|
+#include <wmmintrin.h> /* aes-ni */
|
|
+#endif
|
|
+
|
|
+#if defined(NSS_X86_OR_X64)
|
|
+#define EXPAND_KEY128(k, rcon, res) \
|
|
+ tmp_key = _mm_aeskeygenassist_si128(k, rcon); \
|
|
+ tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \
|
|
+ tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \
|
|
+ tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
|
|
+ tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
|
|
+ res = _mm_xor_si128(tmp, tmp_key)
|
|
+
|
|
+static void
|
|
+native_key_expansion128(AESContext *cx, const unsigned char *key)
|
|
+{
|
|
+ __m128i *keySchedule = cx->keySchedule;
|
|
+ pre_align __m128i tmp_key post_align;
|
|
+ pre_align __m128i tmp post_align;
|
|
+ keySchedule[0] = _mm_loadu_si128((__m128i *)key);
|
|
+ EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]);
|
|
+ EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]);
|
|
+ EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]);
|
|
+ EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]);
|
|
+ EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]);
|
|
+ EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]);
|
|
+ EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]);
|
|
+ EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]);
|
|
+ EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]);
|
|
+ EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]);
|
|
+}
|
|
+
|
|
+#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \
|
|
+ tmp2 = _mm_slli_si128(k0, 4); \
|
|
+ tmp1 = _mm_xor_si128(k0, tmp2); \
|
|
+ tmp2 = _mm_slli_si128(tmp2, 4); \
|
|
+ tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
|
|
+ tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \
|
|
+ res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55))
|
|
+
|
|
+#define EXPAND_KEY192_PART2(res, k1, k2) \
|
|
+ tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \
|
|
+ res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF))
|
|
+
|
|
+#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \
|
|
+ EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \
|
|
+ EXPAND_KEY192_PART2(carry, res1, tmp3); \
|
|
+ res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \
|
|
+ _mm_castsi128_pd(tmp3), 0)); \
|
|
+ res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \
|
|
+ _mm_castsi128_pd(carry), 1)); \
|
|
+ EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2)
|
|
+
|
|
+/*
|
|
+ * Expand a 192-bit AES key into the round keys cx->keySchedule[0..12]
|
|
+ * using the AES-NI key generation instructions.
|
|
+ *
|
|
+ * A 192-bit key is 24 bytes long: the first 16 bytes are loaded whole and
|
|
+ * only the remaining 8 bytes are read for keySchedule[1], so nothing past
|
|
+ * the end of the key is touched. The upper 64 bits of keySchedule[1] are
|
|
+ * zero at that point; EXPAND_KEY192 consumes only the low 64 bits of that
|
|
+ * slot and then overwrites its upper half, so the resulting schedule does
|
|
+ * not depend on them.
|
|
+ *
|
|
+ * The EXPAND_KEY192* macros use tmp1, tmp2 and tmp3 from this scope.
|
|
+ */
|
|
+static void
|
|
+native_key_expansion192(AESContext *cx, const unsigned char *key)
|
|
+{
|
|
+    __m128i *keySchedule = cx->keySchedule;
|
|
+    pre_align __m128i tmp1 post_align;
|
|
+    pre_align __m128i tmp2 post_align;
|
|
+    pre_align __m128i tmp3 post_align;
|
|
+    pre_align __m128i carry post_align;
|
|
+    keySchedule[0] = _mm_loadu_si128((const __m128i *)key);
|
|
+    /* 64-bit load: key bytes 16..23 only, upper half zeroed. */
|
|
+    keySchedule[1] = _mm_loadl_epi64((const __m128i *)(key + 16));
|
|
+    EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2],
|
|
+                  keySchedule[3], carry, 0x1, 0x2);
|
|
+    EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]);
|
|
+    EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5],
|
|
+                  keySchedule[6], carry, 0x4, 0x8);
|
|
+    EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]);
|
|
+    EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8],
|
|
+                  keySchedule[9], carry, 0x10, 0x20);
|
|
+    EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]);
|
|
+    EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11],
|
|
+                  keySchedule[12], carry, 0x40, 0x80);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * One 128-bit step of the AES-256 key schedule:
|
|
+ *   res = k1x ^ (k1x << 32) ^ (k1x << 64) ^ (k1x << 96) ^ T
|
|
+ * where T is the 32-bit word of _mm_aeskeygenassist_si128(k2x, rconx)
|
|
+ * selected and broadcast to all four lanes by the shuffle control X.
|
|
+ *
|
|
+ * Uses tmp_key, tmp1 and tmp2 from the enclosing scope. Like EXPAND_KEY128
|
|
+ * and EXPAND_KEY192_PART1/2, the expansion has no trailing semicolon; the
|
|
+ * caller supplies it.
|
|
+ */
|
|
+#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X)                           \
|
|
+    tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X);    \
|
|
+    tmp2 = _mm_slli_si128(k1x, 4);                                            \
|
|
+    tmp1 = _mm_xor_si128(k1x, tmp2);                                          \
|
|
+    tmp2 = _mm_slli_si128(tmp2, 4);                                           \
|
|
+    tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
|
|
+    res = _mm_xor_si128(tmp1, tmp_key)
|
|
+
|
|
+#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \
|
|
+ EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \
|
|
+ EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA)
|
|
+
|
|
+static void
|
|
+native_key_expansion256(AESContext *cx, const unsigned char *key)
|
|
+{
|
|
+ __m128i *keySchedule = cx->keySchedule;
|
|
+ pre_align __m128i tmp_key post_align;
|
|
+ pre_align __m128i tmp1 post_align;
|
|
+ pre_align __m128i tmp2 post_align;
|
|
+ keySchedule[0] = _mm_loadu_si128((__m128i *)key);
|
|
+ keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
|
|
+ EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0],
|
|
+ keySchedule[1], 0x01);
|
|
+ EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2],
|
|
+ keySchedule[3], 0x02);
|
|
+ EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4],
|
|
+ keySchedule[5], 0x04);
|
|
+ EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6],
|
|
+ keySchedule[7], 0x08);
|
|
+ EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8],
|
|
+ keySchedule[9], 0x10);
|
|
+ EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10],
|
|
+ keySchedule[11], 0x20);
|
|
+ EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12],
|
|
+ keySchedule[13], 0xFF);
|
|
+}
|
|
+
|
|
+#endif /* NSS_X86_OR_X64 */
|
|
+
|
|
+/*
|
|
+ * AES key expansion using aes-ni instructions.
|
|
+ */
|
|
+void
|
|
+rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
|
|
+ unsigned int Nk)
|
|
+{
|
|
+#ifdef NSS_X86_OR_X64
|
|
+ switch (Nk) {
|
|
+ case 4:
|
|
+ native_key_expansion128(cx, key);
|
|
+ return;
|
|
+ case 6:
|
|
+ native_key_expansion192(cx, key);
|
|
+ return;
|
|
+ case 8:
|
|
+ native_key_expansion256(cx, key);
|
|
+ return;
|
|
+ default:
|
|
+ /* This shouldn't happen. */
|
|
+ PORT_Assert(0);
|
|
+ }
|
|
+#else
|
|
+ PORT_Assert(0);
|
|
+#endif /* NSS_X86_OR_X64 */
|
|
+}
|
|
+
|
|
+void
|
|
+rijndael_native_encryptBlock(AESContext *cx,
|
|
+ unsigned char *output,
|
|
+ const unsigned char *input)
|
|
+{
|
|
+#ifdef NSS_X86_OR_X64
|
|
+ int i;
|
|
+ pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input);
|
|
+ m = _mm_xor_si128(m, cx->keySchedule[0]);
|
|
+ for (i = 1; i < cx->Nr; ++i) {
|
|
+ m = _mm_aesenc_si128(m, cx->keySchedule[i]);
|
|
+ }
|
|
+ m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]);
|
|
+ _mm_storeu_si128((__m128i *)output, m);
|
|
+#else
|
|
+ PORT_Assert(0);
|
|
+#endif /* NSS_X86_OR_X64 */
|
|
+}
|