forked from pool/libgcrypt
Dirk Mueller
057648ad6a
OBS-URL: https://build.opensuse.org/package/show/devel:libraries:c_c++/libgcrypt?expand=0&rev=39
--- cipher/blowfish-arm.S
+++ cipher/blowfish-arm.S
@@ -0,0 +1,743 @@
+/* blowfish-arm.S - ARM assembly implementation of Blowfish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of crypto context */
+#define s0 0
+#define s1 (s0 + (1 * 256) * 4)
+#define s2 (s0 + (2 * 256) * 4)
+#define s3 (s0 + (3 * 256) * 4)
+#define p (s3 + (1 * 256) * 4)
+
+/* register macros */
+#define CTXs0 %r0
+#define CTXs1 %r9
+#define CTXs2 %r8
+#define CTXs3 %r10
+#define RMASK %lr
+#define RKEYL %r2
+#define RKEYR %ip
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %r11
+#define RT1 %r7
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
|
||
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
|
||
+ ldrb rout, [rsrc, #((offs) + 0)]; \
|
||
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
|
||
+ orr rout, rout, rtmp, lsl #8; \
|
||
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
|
||
+ orr rout, rout, rtmp, lsl #16; \
|
||
+ ldrb rtmp, [rsrc, #((offs) + 3)]; \
|
||
+ orr rout, rout, rtmp, lsl #24;
|
||
+
|
||
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
|
||
+ mov rtmp0, rin, lsr #8; \
|
||
+ strb rin, [rdst, #((offs) + 0)]; \
|
||
+ mov rtmp1, rin, lsr #16; \
|
||
+ strb rtmp0, [rdst, #((offs) + 1)]; \
|
||
+ mov rtmp0, rin, lsr #24; \
|
||
+ strb rtmp1, [rdst, #((offs) + 2)]; \
|
||
+ strb rtmp0, [rdst, #((offs) + 3)];
|
||
+
|
||
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
|
||
+ ldrb rout, [rsrc, #((offs) + 3)]; \
|
||
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
|
||
+ orr rout, rout, rtmp, lsl #8; \
|
||
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
|
||
+ orr rout, rout, rtmp, lsl #16; \
|
||
+ ldrb rtmp, [rsrc, #((offs) + 0)]; \
|
||
+ orr rout, rout, rtmp, lsl #24;
|
||
+
|
||
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
|
||
+ mov rtmp0, rin, lsr #8; \
|
||
+ strb rin, [rdst, #((offs) + 3)]; \
|
||
+ mov rtmp1, rin, lsr #16; \
|
||
+ strb rtmp0, [rdst, #((offs) + 2)]; \
|
||
+ mov rtmp0, rin, lsr #24; \
|
||
+ strb rtmp1, [rdst, #((offs) + 1)]; \
|
||
+ strb rtmp0, [rdst, #((offs) + 0)];
|
||
+
|
||
+#ifdef __ARMEL__
|
||
+ #define ldr_unaligned_host ldr_unaligned_le
|
||
+ #define str_unaligned_host str_unaligned_le
|
||
+
|
||
+ /* bswap on little-endian */
|
||
+#ifdef HAVE_ARM_ARCH_V6
|
||
+ #define host_to_be(reg, rtmp) \
|
||
+ rev reg, reg;
|
||
+ #define be_to_host(reg, rtmp) \
|
||
+ rev reg, reg;
|
||
+#else
|
||
+ #define host_to_be(reg, rtmp) \
|
||
+ eor rtmp, reg, reg, ror #16; \
|
||
+ mov rtmp, rtmp, lsr #8; \
|
||
+ bic rtmp, rtmp, #65280; \
|
||
+ eor reg, rtmp, reg, ror #8;
|
||
+ #define be_to_host(reg, rtmp) \
|
||
+ eor rtmp, reg, reg, ror #16; \
|
||
+ mov rtmp, rtmp, lsr #8; \
|
||
+ bic rtmp, rtmp, #65280; \
|
||
+ eor reg, rtmp, reg, ror #8;
|
||
+#endif
|
||
+#else
|
||
+ #define ldr_unaligned_host ldr_unaligned_be
|
||
+ #define str_unaligned_host str_unaligned_be
|
||
+
|
||
+ /* nop on big-endian */
|
||
+ #define host_to_be(reg, rtmp) /*_*/
|
||
+ #define be_to_host(reg, rtmp) /*_*/
|
||
+#endif
|
||
+
|
||
+#define host_to_host(x, y) /*_*/
|
||
+
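On ARM cores older than ARMv6 there is no rev instruction, so the host_to_be/be_to_host fallback above byte-swaps a word with an eor/lsr/bic/eor sequence. A minimal C sketch of the same trick (helper names here are illustrative, not part of the patch):

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned n)
    {
      return (x >> n) | (x << (32 - n));
    }

    /* eor t, x, x, ror #16;  lsr t, t, #8;  bic t, t, #0xff00;  eor x, t, x, ror #8 */
    static inline uint32_t bswap32_no_rev(uint32_t x)
    {
      uint32_t t = x ^ ror32(x, 16);   /* bytes: (b3^b1)(b2^b0)(b1^b3)(b0^b2) */
      t = (t >> 8) & ~0x0000ff00u;     /* keep only the (b3^b1) and (b1^b3) terms */
      return t ^ ror32(x, 8);          /* cancels back to b0 b1 b2 b3 */
    }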
+/***********************************************************************
|
||
+ * 1-way blowfish
|
||
+ ***********************************************************************/
|
||
+#define F(l, r) \
|
||
+ and RT0, RMASK, l, lsr#(24 - 2); \
|
||
+ and RT1, RMASK, l, lsr#(16 - 2); \
|
||
+ ldr RT0, [CTXs0, RT0]; \
|
||
+ and RT2, RMASK, l, lsr#(8 - 2); \
|
||
+ ldr RT1, [CTXs1, RT1]; \
|
||
+ and RT3, RMASK, l, lsl#2; \
|
||
+ ldr RT2, [CTXs2, RT2]; \
|
||
+ add RT0, RT1; \
|
||
+ ldr RT3, [CTXs3, RT3]; \
|
||
+ eor RT0, RT2; \
|
||
+ add RT0, RT3; \
|
||
+ eor r, RT0;
|
||
+
|
||
+#define load_roundkey_enc(n) \
|
||
+ ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \
|
||
+ ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))];
|
||
+
|
||
+#define add_roundkey_enc() \
|
||
+ eor RL0, RKEYL; \
|
||
+ eor RR0, RKEYR;
|
||
+
|
||
+#define round_enc(n) \
|
||
+ add_roundkey_enc(); \
|
||
+ load_roundkey_enc(n); \
|
||
+ \
|
||
+ F(RL0, RR0); \
|
||
+ F(RR0, RL0);
|
||
+
|
||
+#define load_roundkey_dec(n) \
|
||
+ ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \
|
||
+ ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))];
|
||
+
|
||
+#define add_roundkey_dec() \
|
||
+ eor RL0, RKEYL; \
|
||
+ eor RR0, RKEYR;
|
||
+
|
||
+#define round_dec(n) \
|
||
+ add_roundkey_dec(); \
|
||
+ load_roundkey_dec(n); \
|
||
+ \
|
||
+ F(RL0, RR0); \
|
||
+ F(RR0, RL0);
|
||
+
|
||
+#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
|
||
+ ldr l0, [rin, #((offs) + 0)]; \
|
||
+ ldr r0, [rin, #((offs) + 4)]; \
|
||
+ convert(l0, rtmp); \
|
||
+ convert(r0, rtmp);
|
||
+
|
||
+#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
|
||
+ convert(l0, rtmp); \
|
||
+ convert(r0, rtmp); \
|
||
+ str l0, [rout, #((offs) + 0)]; \
|
||
+ str r0, [rout, #((offs) + 4)];
|
||
+
|
||
+#ifdef __ARM_FEATURE_UNALIGNED
|
||
+ /* unaligned word reads allowed */
|
||
+ #define read_block(rin, offs, l0, r0, rtmp0) \
|
||
+ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
|
||
+
|
||
+ #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
|
||
+ write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
|
||
+
|
||
+ #define read_block_host(rin, offs, l0, r0, rtmp0) \
|
||
+ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
|
||
+
|
||
+ #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
|
||
+ write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
|
||
+#else
|
||
+ /* need to handle unaligned reads by byte reads */
|
||
+ #define read_block(rin, offs, l0, r0, rtmp0) \
|
||
+ tst rin, #3; \
|
||
+ beq 1f; \
|
||
+ ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
|
||
+ ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
|
||
+ b 2f; \
|
||
+ 1:;\
|
||
+ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
|
||
+ 2:;
|
||
+
|
||
+ #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
|
||
+ tst rout, #3; \
|
||
+ beq 1f; \
|
||
+ str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
|
||
+ str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
|
||
+ b 2f; \
|
||
+ 1:;\
|
||
+ write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
|
||
+ 2:;
|
||
+
|
||
+ #define read_block_host(rin, offs, l0, r0, rtmp0) \
|
||
+ tst rin, #3; \
|
||
+ beq 1f; \
|
||
+ ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
|
||
+ ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
|
||
+ b 2f; \
|
||
+ 1:;\
|
||
+ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
|
||
+ 2:;
|
||
+
|
||
+ #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
|
||
+ tst rout, #3; \
|
||
+ beq 1f; \
|
||
+ str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
|
||
+ str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
|
||
+ b 2f; \
|
||
+ 1:;\
|
||
+	write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
|
||
+ 2:;
|
||
+#endif
|
||
+
|
||
+.align 3
|
||
+.type __blowfish_enc_blk1,%function;
|
||
+
|
||
+__blowfish_enc_blk1:
|
||
+ /* input:
|
||
+ * preloaded: CTX
|
||
+ * [RL0, RR0]: src
|
||
+ * output:
|
||
+ * [RR0, RL0]: dst
|
||
+ */
|
||
+ push {%lr};
|
||
+
|
||
+ add CTXs1, CTXs0, #(s1 - s0);
|
||
+ add CTXs2, CTXs0, #(s2 - s0);
|
||
+ mov RMASK, #(0xff << 2); /* byte mask */
|
||
+ add CTXs3, CTXs1, #(s3 - s1);
|
||
+
|
||
+ load_roundkey_enc(0);
|
||
+ round_enc(2);
|
||
+ round_enc(4);
|
||
+ round_enc(6);
|
||
+ round_enc(8);
|
||
+ round_enc(10);
|
||
+ round_enc(12);
|
||
+ round_enc(14);
|
||
+ round_enc(16);
|
||
+ add_roundkey_enc();
|
||
+
|
||
+ pop {%pc};
|
||
+.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
|
||
+
|
||
+.align 8
|
||
+.globl _gcry_blowfish_arm_do_encrypt
|
||
+.type _gcry_blowfish_arm_do_encrypt,%function;
|
||
+
|
||
+_gcry_blowfish_arm_do_encrypt:
|
||
+ /* input:
|
||
+ * %r0: ctx, CTX
|
||
+ * %r1: u32 *ret_xl
|
||
+ * %r2: u32 *ret_xr
|
||
+ */
|
||
+ push {%r2, %r4-%r11, %ip, %lr};
|
||
+
|
||
+ ldr RL0, [%r1];
|
||
+ ldr RR0, [%r2];
|
||
+
|
||
+ bl __blowfish_enc_blk1;
|
||
+
|
||
+ pop {%r2};
|
||
+ str RR0, [%r1];
|
||
+ str RL0, [%r2];
|
||
+
|
||
+ pop {%r4-%r11, %ip, %pc};
|
||
+.size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt;
|
||
+
|
||
+.align 3
|
||
+.globl _gcry_blowfish_arm_encrypt_block
|
||
+.type _gcry_blowfish_arm_encrypt_block,%function;
|
||
+
|
||
+_gcry_blowfish_arm_encrypt_block:
|
||
+ /* input:
|
||
+ * %r0: ctx, CTX
|
||
+ * %r1: dst
|
||
+ * %r2: src
|
||
+ */
|
||
+ push {%r4-%r11, %ip, %lr};
|
||
+
|
||
+ read_block(%r2, 0, RL0, RR0, RT0);
|
||
+
|
||
+ bl __blowfish_enc_blk1;
|
||
+
|
||
+ write_block(%r1, 0, RR0, RL0, RT0, RT1);
|
||
+
|
||
+ pop {%r4-%r11, %ip, %pc};
|
||
+.size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block;
|
||
+
|
||
+.align 3
|
||
+.globl _gcry_blowfish_arm_decrypt_block
|
||
+.type _gcry_blowfish_arm_decrypt_block,%function;
|
||
+
|
||
+_gcry_blowfish_arm_decrypt_block:
|
||
+ /* input:
|
||
+ * %r0: ctx, CTX
|
||
+ * %r1: dst
|
||
+ * %r2: src
|
||
+ */
|
||
+ push {%r4-%r11, %ip, %lr};
|
||
+
|
||
+ add CTXs1, CTXs0, #(s1 - s0);
|
||
+ add CTXs2, CTXs0, #(s2 - s0);
|
||
+ mov RMASK, #(0xff << 2); /* byte mask */
|
||
+ add CTXs3, CTXs1, #(s3 - s1);
|
||
+
|
||
+ read_block(%r2, 0, RL0, RR0, RT0);
|
||
+
|
||
+ load_roundkey_dec(17);
|
||
+ round_dec(15);
|
||
+ round_dec(13);
|
||
+ round_dec(11);
|
||
+ round_dec(9);
|
||
+ round_dec(7);
|
||
+ round_dec(5);
|
||
+ round_dec(3);
|
||
+ round_dec(1);
|
||
+ add_roundkey_dec();
|
||
+
|
||
+ write_block(%r1, 0, RR0, RL0, RT0, RT1);
|
||
+
|
||
+ pop {%r4-%r11, %ip, %pc};
|
||
+.size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block;
|
||
+
|
||
+/***********************************************************************
|
||
+ * 2-way blowfish
|
||
+ ***********************************************************************/
|
||
+#define F2(n, l0, r0, l1, r1, set_nextk, dec) \
|
||
+ \
|
||
+ and RT0, RMASK, l0, lsr#(24 - 2); \
|
||
+ and RT1, RMASK, l0, lsr#(16 - 2); \
|
||
+ and RT2, RMASK, l0, lsr#(8 - 2); \
|
||
+ add RT1, #(s1 - s0); \
|
||
+ \
|
||
+ ldr RT0, [CTXs0, RT0]; \
|
||
+ and RT3, RMASK, l0, lsl#2; \
|
||
+ ldr RT1, [CTXs0, RT1]; \
|
||
+ add RT3, #(s3 - s2); \
|
||
+ ldr RT2, [CTXs2, RT2]; \
|
||
+ add RT0, RT1; \
|
||
+ ldr RT3, [CTXs2, RT3]; \
|
||
+ \
|
||
+ and RT1, RMASK, l1, lsr#(24 - 2); \
|
||
+ eor RT0, RT2; \
|
||
+ and RT2, RMASK, l1, lsr#(16 - 2); \
|
||
+ add RT0, RT3; \
|
||
+ add RT2, #(s1 - s0); \
|
||
+ and RT3, RMASK, l1, lsr#(8 - 2); \
|
||
+ eor r0, RT0; \
|
||
+ \
|
||
+ ldr RT1, [CTXs0, RT1]; \
|
||
+ and RT0, RMASK, l1, lsl#2; \
|
||
+ ldr RT2, [CTXs0, RT2]; \
|
||
+ add RT0, #(s3 - s2); \
|
||
+ ldr RT3, [CTXs2, RT3]; \
|
||
+ add RT1, RT2; \
|
||
+ ldr RT0, [CTXs2, RT0]; \
|
||
+ \
|
||
+ and RT2, RMASK, r0, lsr#(24 - 2); \
|
||
+ eor RT1, RT3; \
|
||
+ and RT3, RMASK, r0, lsr#(16 - 2); \
|
||
+ add RT1, RT0; \
|
||
+ add RT3, #(s1 - s0); \
|
||
+ and RT0, RMASK, r0, lsr#(8 - 2); \
|
||
+ eor r1, RT1; \
|
||
+ \
|
||
+ ldr RT2, [CTXs0, RT2]; \
|
||
+ and RT1, RMASK, r0, lsl#2; \
|
||
+ ldr RT3, [CTXs0, RT3]; \
|
||
+ add RT1, #(s3 - s2); \
|
||
+ ldr RT0, [CTXs2, RT0]; \
|
||
+ add RT2, RT3; \
|
||
+ ldr RT1, [CTXs2, RT1]; \
|
||
+ \
|
||
+ and RT3, RMASK, r1, lsr#(24 - 2); \
|
||
+ eor RT2, RT0; \
|
||
+ and RT0, RMASK, r1, lsr#(16 - 2); \
|
||
+ add RT2, RT1; \
|
||
+ add RT0, #(s1 - s0); \
|
||
+ and RT1, RMASK, r1, lsr#(8 - 2); \
|
||
+ eor l0, RT2; \
|
||
+ \
|
||
+ ldr RT3, [CTXs0, RT3]; \
|
||
+ and RT2, RMASK, r1, lsl#2; \
|
||
+ ldr RT0, [CTXs0, RT0]; \
|
||
+ add RT2, #(s3 - s2); \
|
||
+ ldr RT1, [CTXs2, RT1]; \
|
||
+ eor l1, RKEYL; \
|
||
+ ldr RT2, [CTXs2, RT2]; \
|
||
+ \
|
||
+ eor r0, RKEYR; \
|
||
+ add RT3, RT0; \
|
||
+ eor r1, RKEYR; \
|
||
+ eor RT3, RT1; \
|
||
+ eor l0, RKEYL; \
|
||
+ add RT3, RT2; \
|
||
+ set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \
|
||
+ eor l1, RT3; \
|
||
+ set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4)));
|
||
+
|
||
+#define load_n_add_roundkey_enc2(n) \
|
||
+ load_roundkey_enc(n); \
|
||
+ eor RL0, RKEYL; \
|
||
+ eor RR0, RKEYR; \
|
||
+ eor RL1, RKEYL; \
|
||
+ eor RR1, RKEYR; \
|
||
+ load_roundkey_enc((n) + 2);
|
||
+
|
||
+#define next_key(reg, offs) \
|
||
+ ldr reg, [CTXs2, #(offs)];
|
||
+
|
||
+#define dummy(x, y) /* do nothing */
|
||
+
|
||
+#define round_enc2(n, load_next_key) \
|
||
+ F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0);
|
||
+
|
||
+#define load_n_add_roundkey_dec2(n) \
|
||
+ load_roundkey_dec(n); \
|
||
+ eor RL0, RKEYL; \
|
||
+ eor RR0, RKEYR; \
|
||
+ eor RL1, RKEYL; \
|
||
+ eor RR1, RKEYR; \
|
||
+ load_roundkey_dec((n) - 2);
|
||
+
|
||
+#define round_dec2(n, load_next_key) \
|
||
+ F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1);
|
||
+
|
||
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
|
||
+ ldr l0, [rin, #(0)]; \
|
||
+ ldr r0, [rin, #(4)]; \
|
||
+ convert(l0, rtmp); \
|
||
+ ldr l1, [rin, #(8)]; \
|
||
+ convert(r0, rtmp); \
|
||
+ ldr r1, [rin, #(12)]; \
|
||
+ convert(l1, rtmp); \
|
||
+ convert(r1, rtmp);
|
||
+
|
||
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
|
||
+ convert(l0, rtmp); \
|
||
+ convert(r0, rtmp); \
|
||
+ convert(l1, rtmp); \
|
||
+ str l0, [rout, #(0)]; \
|
||
+ convert(r1, rtmp); \
|
||
+ str r0, [rout, #(4)]; \
|
||
+ str l1, [rout, #(8)]; \
|
||
+ str r1, [rout, #(12)];
|
||
+
|
||
+#ifdef __ARM_FEATURE_UNALIGNED
|
||
+ /* unaligned word reads allowed */
|
||
+ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
|
||
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
|
||
+
|
||
+ #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
|
||
+ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
|
||
+
|
||
+ #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
|
||
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
|
||
+
|
||
+ #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
|
||
+ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
|
||
+#else
|
||
+ /* need to handle unaligned reads by byte reads */
|
||
+ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
|
||
+ tst rin, #3; \
|
||
+ beq 1f; \
|
||
+ ldr_unaligned_be(l0, rin, 0, rtmp0); \
|
||
+ ldr_unaligned_be(r0, rin, 4, rtmp0); \
|
||
+ ldr_unaligned_be(l1, rin, 8, rtmp0); \
|
||
+ ldr_unaligned_be(r1, rin, 12, rtmp0); \
|
||
+ b 2f; \
|
||
+ 1:;\
|
||
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
|
||
+ 2:;
|
||
+
|
||
+ #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
|
||
+ tst rout, #3; \
|
||
+ beq 1f; \
|
||
+ str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
|
||
+ str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
|
||
+ str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
|
||
+ str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
|
||
+ b 2f; \
|
||
+ 1:;\
|
||
+ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
|
||
+ 2:;
|
||
+
|
||
+ #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
|
||
+ tst rin, #3; \
|
||
+ beq 1f; \
|
||
+ ldr_unaligned_host(l0, rin, 0, rtmp0); \
|
||
+ ldr_unaligned_host(r0, rin, 4, rtmp0); \
|
||
+ ldr_unaligned_host(l1, rin, 8, rtmp0); \
|
||
+ ldr_unaligned_host(r1, rin, 12, rtmp0); \
|
||
+ b 2f; \
|
||
+ 1:;\
|
||
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
|
||
+ 2:;
|
||
+
|
||
+ #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
|
||
+ tst rout, #3; \
|
||
+ beq 1f; \
|
||
+ str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
|
||
+ str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
|
||
+ str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
|
||
+ str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
|
||
+ b 2f; \
|
||
+ 1:;\
|
||
+ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
|
||
+ 2:;
|
||
+#endif
|
||
+
|
||
+.align 3
|
||
+.type _gcry_blowfish_arm_enc_blk2,%function;
|
||
+
|
||
+_gcry_blowfish_arm_enc_blk2:
|
||
+ /* input:
|
||
+ * preloaded: CTX
|
||
+ * [RL0, RR0], [RL1, RR1]: src
|
||
+ * output:
|
||
+ * [RR0, RL0], [RR1, RL1]: dst
|
||
+ */
|
||
+ push {RT0,%lr};
|
||
+
|
||
+ add CTXs2, CTXs0, #(s2 - s0);
|
||
+ mov RMASK, #(0xff << 2); /* byte mask */
|
||
+
|
||
+ load_n_add_roundkey_enc2(0);
|
||
+ round_enc2(2, next_key);
|
||
+ round_enc2(4, next_key);
|
||
+ round_enc2(6, next_key);
|
||
+ round_enc2(8, next_key);
|
||
+ round_enc2(10, next_key);
|
||
+ round_enc2(12, next_key);
|
||
+ round_enc2(14, next_key);
|
||
+ round_enc2(16, dummy);
|
||
+
|
||
+ host_to_be(RR0, RT0);
|
||
+ host_to_be(RL0, RT0);
|
||
+ host_to_be(RR1, RT0);
|
||
+ host_to_be(RL1, RT0);
|
||
+
|
||
+ pop {RT0,%pc};
|
||
+.size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2;
|
||
+
|
||
+.align 3
|
||
+.globl _gcry_blowfish_arm_cfb_dec;
|
||
+.type _gcry_blowfish_arm_cfb_dec,%function;
|
||
+
|
||
+_gcry_blowfish_arm_cfb_dec:
|
||
+ /* input:
|
||
+ * %r0: CTX
|
||
+ * %r1: dst (2 blocks)
|
||
+ * %r2: src (2 blocks)
|
||
+ * %r3: iv (64bit)
|
||
+ */
|
||
+ push {%r2, %r4-%r11, %ip, %lr};
|
||
+
|
||
+ mov %lr, %r3;
|
||
+
|
||
+ /* Load input (iv/%r3 is aligned, src/%r2 might not be) */
|
||
+ ldm %r3, {RL0, RR0};
|
||
+ host_to_be(RL0, RT0);
|
||
+ host_to_be(RR0, RT0);
|
||
+ read_block(%r2, 0, RL1, RR1, RT0);
|
||
+
|
||
+ /* Update IV, load src[1] and save to iv[0] */
|
||
+ read_block_host(%r2, 8, %r5, %r6, RT0);
|
||
+ stm %lr, {%r5, %r6};
|
||
+
|
||
+ bl _gcry_blowfish_arm_enc_blk2;
|
||
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
|
||
+
|
||
+	/* %r1: dst, %r0: src */
|
||
+ pop {%r0};
|
||
+
|
||
+ /* dst = src ^ result */
|
||
+ read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
|
||
+ eor %r5, %r4;
|
||
+ eor %r6, %r3;
|
||
+ eor %r7, %r10;
|
||
+ eor %r8, %r9;
|
||
+ write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
|
||
+
|
||
+ pop {%r4-%r11, %ip, %pc};
|
||
+.ltorg
|
||
+.size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec;
|
||
+
|
||
+.align 3
|
||
+.globl _gcry_blowfish_arm_ctr_enc;
|
||
+.type _gcry_blowfish_arm_ctr_enc,%function;
|
||
+
|
||
+_gcry_blowfish_arm_ctr_enc:
|
||
+ /* input:
|
||
+ * %r0: CTX
|
||
+ * %r1: dst (2 blocks)
|
||
+ * %r2: src (2 blocks)
|
||
+ * %r3: iv (64bit, big-endian)
|
||
+ */
|
||
+ push {%r2, %r4-%r11, %ip, %lr};
|
||
+
|
||
+ mov %lr, %r3;
|
||
+
|
||
+ /* Load IV (big => host endian) */
|
||
+ read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0);
|
||
+
|
||
+ /* Construct IVs */
|
||
+ adds RR1, RR0, #1; /* +1 */
|
||
+ adc RL1, RL0, #0;
|
||
+ adds %r6, RR1, #1; /* +2 */
|
||
+ adc %r5, RL1, #0;
|
||
+
|
||
+ /* Store new IV (host => big-endian) */
|
||
+ write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0);
|
||
+
|
||
+ bl _gcry_blowfish_arm_enc_blk2;
|
||
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
|
||
+
|
||
+	/* %r1: dst, %r0: src */
|
||
+ pop {%r0};
|
||
+
|
||
+ /* XOR key-stream with plaintext */
|
||
+ read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
|
||
+ eor %r5, %r4;
|
||
+ eor %r6, %r3;
|
||
+ eor %r7, %r10;
|
||
+ eor %r8, %r9;
|
||
+ write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
|
||
+
|
||
+ pop {%r4-%r11, %ip, %pc};
|
||
+.ltorg
|
||
+.size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc;
|
||
+
|
||
+.align 3
|
||
+.type _gcry_blowfish_arm_dec_blk2,%function;
|
||
+
|
||
+_gcry_blowfish_arm_dec_blk2:
|
||
+ /* input:
|
||
+ * preloaded: CTX
|
||
+ * [RL0, RR0], [RL1, RR1]: src
|
||
+ * output:
|
||
+ * [RR0, RL0], [RR1, RL1]: dst
|
||
+ */
|
||
+ add CTXs2, CTXs0, #(s2 - s0);
|
||
+ mov RMASK, #(0xff << 2); /* byte mask */
|
||
+
|
||
+ load_n_add_roundkey_dec2(17);
|
||
+ round_dec2(15, next_key);
|
||
+ round_dec2(13, next_key);
|
||
+ round_dec2(11, next_key);
|
||
+ round_dec2(9, next_key);
|
||
+ round_dec2(7, next_key);
|
||
+ round_dec2(5, next_key);
|
||
+ round_dec2(3, next_key);
|
||
+ round_dec2(1, dummy);
|
||
+
|
||
+ host_to_be(RR0, RT0);
|
||
+ host_to_be(RL0, RT0);
|
||
+ host_to_be(RR1, RT0);
|
||
+ host_to_be(RL1, RT0);
|
||
+
|
||
+ b .Ldec_cbc_tail;
|
||
+.ltorg
|
||
+.size _gcry_blowfish_arm_dec_blk2,.-_gcry_blowfish_arm_dec_blk2;
|
||
+
|
||
+.align 3
|
||
+.globl _gcry_blowfish_arm_cbc_dec;
|
||
+.type _gcry_blowfish_arm_cbc_dec,%function;
|
||
+
|
||
+_gcry_blowfish_arm_cbc_dec:
|
||
+ /* input:
|
||
+ * %r0: CTX
|
||
+ * %r1: dst (2 blocks)
|
||
+ * %r2: src (2 blocks)
|
||
+ * %r3: iv (64bit)
|
||
+ */
|
||
+ push {%r2-%r11, %ip, %lr};
|
||
+
|
||
+ read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
|
||
+
|
||
+ /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
|
||
+ * of function call. */
|
||
+ b _gcry_blowfish_arm_dec_blk2;
|
||
+.Ldec_cbc_tail:
|
||
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
|
||
+
|
||
+	/* %r0: src, %r1: dst, %r2: iv */
|
||
+ pop {%r0, %r2};
|
||
+
|
||
+ /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
|
||
+ read_block_host(%r0, 0, %r7, %r8, %r5);
|
||
+ /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
|
||
+ ldm %r2, {%r5, %r6};
|
||
+
|
||
+ /* out[1] ^= IV+1 */
|
||
+ eor %r10, %r7;
|
||
+ eor %r9, %r8;
|
||
+ /* out[0] ^= IV */
|
||
+ eor %r4, %r5;
|
||
+ eor %r3, %r6;
|
||
+
|
||
+ /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
|
||
+ read_block_host(%r0, 8, %r7, %r8, %r5);
|
||
+ /* store IV+2 to iv[0] (aligned). */
|
||
+ stm %r2, {%r7, %r8};
|
||
+
|
||
+ /* store result to dst[0-3]. Might be unaligned. */
|
||
+ write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
|
||
+
|
||
+ pop {%r4-%r11, %ip, %pc};
|
||
+.ltorg
|
||
+.size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec;
|
||
+
|
||
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
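For reference, the round_enc()/F() macro chain run by __blowfish_enc_blk1 above is the standard Blowfish encryption schedule over the s0..s3/p context layout defined at the top of the file. A C sketch of the equivalent computation (struct and function names are hypothetical, chosen only to mirror the offsets used above):

    #include <stdint.h>

    typedef struct {
      uint32_t s[4][256];   /* four S-boxes, matching the s0..s3 offsets */
      uint32_t p[18];       /* round keys, matching the p offset */
    } bf_ctx_t;

    static uint32_t bf_F(const bf_ctx_t *c, uint32_t x)
    {
      /* same add/eor/add chain as the F(l, r) macro; RMASK = 0xff << 2 in the
       * assembly is this byte mask pre-scaled to a 32-bit word offset */
      return ((c->s[0][x >> 24] + c->s[1][(x >> 16) & 0xff])
              ^ c->s[2][(x >> 8) & 0xff]) + c->s[3][x & 0xff];
    }

    static void bf_encrypt_block(const bf_ctx_t *c, uint32_t *xl, uint32_t *xr)
    {
      uint32_t l = *xl, r = *xr;
      for (int i = 0; i < 16; i += 2) {      /* round_enc(2) .. round_enc(16) */
        l ^= c->p[i];      r ^= bf_F(c, l);
        r ^= c->p[i + 1];  l ^= bf_F(c, r);
      }
      *xl = r ^ c->p[17];                    /* final swap, cf. dst = [RR0, RL0] */
      *xr = l ^ c->p[16];
    }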
--- cipher/serpent-armv7-neon.S
+++ cipher/serpent-armv7-neon.S
@@ -0,0 +1,869 @@
+/* serpent-armv7-neon.S - ARM/NEON assembly implementation of Serpent cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+/* ARM registers */
|
||
+#define RROUND r0
|
||
+
|
||
+/* NEON vector registers */
|
||
+#define RA0 q0
|
||
+#define RA1 q1
|
||
+#define RA2 q2
|
||
+#define RA3 q3
|
||
+#define RA4 q4
|
||
+#define RB0 q5
|
||
+#define RB1 q6
|
||
+#define RB2 q7
|
||
+#define RB3 q8
|
||
+#define RB4 q9
|
||
+
|
||
+#define RT0 q10
|
||
+#define RT1 q11
|
||
+#define RT2 q12
|
||
+#define RT3 q13
|
||
+
|
||
+#define RA0d0 d0
|
||
+#define RA0d1 d1
|
||
+#define RA1d0 d2
|
||
+#define RA1d1 d3
|
||
+#define RA2d0 d4
|
||
+#define RA2d1 d5
|
||
+#define RA3d0 d6
|
||
+#define RA3d1 d7
|
||
+#define RA4d0 d8
|
||
+#define RA4d1 d9
|
||
+#define RB0d0 d10
|
||
+#define RB0d1 d11
|
||
+#define RB1d0 d12
|
||
+#define RB1d1 d13
|
||
+#define RB2d0 d14
|
||
+#define RB2d1 d15
|
||
+#define RB3d0 d16
|
||
+#define RB3d1 d17
|
||
+#define RB4d0 d18
|
||
+#define RB4d1 d19
|
||
+#define RT0d0 d20
|
||
+#define RT0d1 d21
|
||
+#define RT1d0 d22
|
||
+#define RT1d1 d23
|
||
+#define RT2d0 d24
|
||
+#define RT2d1 d25
|
||
+
|
||
+/**********************************************************************
|
||
+ helper macros
|
||
+ **********************************************************************/
|
||
+
|
||
+#define transpose_4x4(_q0, _q1, _q2, _q3) \
|
||
+ vtrn.32 _q0, _q1; \
|
||
+ vtrn.32 _q2, _q3; \
|
||
+ vswp _q0##d1, _q2##d0; \
|
||
+ vswp _q1##d1, _q3##d0;
|
||
+
|
||
+/**********************************************************************
|
||
+ 8-way serpent
|
||
+ **********************************************************************/
|
||
+
|
||
+/*
|
||
+ * These are the S-Boxes of Serpent from following research paper.
|
||
+ *
|
||
+ * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
|
||
+ * (New York, New York, USA), p. 317–329, National Institute of Standards and
|
||
+ * Technology, 2000.
|
||
+ *
|
||
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
|
||
+ *
|
||
+ */
|
||
+#define SBOX0(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ veor a3, a3, a0; veor b3, b3, b0; vmov a4, a1; vmov b4, b1; \
|
||
+ vand a1, a1, a3; vand b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
|
||
+ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
|
||
+ veor a0, a0, a4; veor b0, b0, b4; veor a4, a4, a3; veor b4, b4, b3; \
|
||
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
|
||
+ veor a2, a2, a4; veor b2, b2, b4; vmvn a4, a4; vmvn b4, b4; \
|
||
+ vorr a4, a4, a1; vorr b4, b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
|
||
+ veor a1, a1, a4; veor b1, b1, b4; vorr a3, a3, a0; vorr b3, b3, b0; \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a3; veor b4, b3;
|
||
+
|
||
+#define SBOX0_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmvn a2, a2; vmvn b2, b2; vmov a4, a1; vmov b4, b1; \
|
||
+ vorr a1, a1, a0; vorr b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
|
||
+ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; veor a0, a0, a4; veor b0, b0, b4; \
|
||
+ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a3; vand b0, b0, b3; \
|
||
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
|
||
+ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
|
||
+ veor a2, a2, a1; veor b2, b2, b1; veor a3, a3, a0; veor b3, b3, b0; \
|
||
+ veor a3, a3, a1; veor b3, b3, b1;\
|
||
+ vand a2, a2, a3; vand b2, b2, b3;\
|
||
+ veor a4, a2; veor b4, b2;
|
||
+
|
||
+#define SBOX1(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmvn a0, a0; vmvn b0, b0; vmvn a2, a2; vmvn b2, b2; \
|
||
+ vmov a4, a0; vmov b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
|
||
+ veor a2, a2, a0; veor b2, b2, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
|
||
+ veor a3, a3, a2; veor b3, b3, b2; veor a1, a1, a0; veor b1, b1, b0; \
|
||
+ veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a1; vorr b4, b4, b1; \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; vorr a2, a2, a0; vorr b2, b2, b0; \
|
||
+ vand a2, a2, a4; vand b2, b2, b4; veor a0, a0, a1; veor b0, b0, b1; \
|
||
+ vand a1, a1, a2; vand b1, b1, b2;\
|
||
+ veor a1, a1, a0; veor b1, b1, b0; vand a0, a0, a2; vand b0, b0, b2; \
|
||
+ veor a0, a4; veor b0, b4;
|
||
+
|
||
+#define SBOX1_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmov a4, a1; vmov b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
|
||
+ vand a3, a3, a1; vand b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \
|
||
+ veor a3, a3, a0; veor b3, b3, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
|
||
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a4; veor b0, b0, b4; \
|
||
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a1, a1, a3; veor b1, b1, b3; \
|
||
+ veor a0, a0, a1; veor b0, b0, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
|
||
+ veor a1, a1, a0; veor b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
|
||
+ veor a4, a4, a1; veor b4, b4, b1; vorr a1, a1, a0; vorr b1, b1, b0; \
|
||
+ veor a1, a1, a0; veor b1, b1, b0;\
|
||
+ vorr a1, a1, a4; vorr b1, b1, b4;\
|
||
+ veor a3, a1; veor b3, b1;
|
||
+
|
||
+#define SBOX2(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmov a4, a0; vmov b4, b0; vand a0, a0, a2; vand b0, b0, b2; \
|
||
+ veor a0, a0, a3; veor b0, b0, b3; veor a2, a2, a1; veor b2, b2, b1; \
|
||
+ veor a2, a2, a0; veor b2, b2, b0; vorr a3, a3, a4; vorr b3, b3, b4; \
|
||
+ veor a3, a3, a1; veor b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \
|
||
+ vmov a1, a3; vmov b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \
|
||
+ veor a3, a3, a0; veor b3, b3, b0; vand a0, a0, a1; vand b0, b0, b1; \
|
||
+ veor a4, a4, a0; veor b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \
|
||
+ veor a1, a1, a4; veor b1, b1, b4; vmvn a4, a4; vmvn b4, b4;
|
||
+
|
||
+#define SBOX2_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
|
||
+ vmov a4, a3; vmov b4, b3; vand a3, a3, a2; vand b3, b3, b2; \
|
||
+ veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a2; vorr b1, b1, b2; \
|
||
+ veor a1, a1, a4; veor b1, b1, b4; vand a4, a4, a3; vand b4, b4, b3; \
|
||
+ veor a2, a2, a3; veor b2, b2, b3; vand a4, a4, a0; vand b4, b4, b0; \
|
||
+ veor a4, a4, a2; veor b4, b4, b2; vand a2, a2, a1; vand b2, b2, b1; \
|
||
+ vorr a2, a2, a0; vorr b2, b2, b0; vmvn a3, a3; vmvn b3, b3; \
|
||
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \
|
||
+ vand a0, a0, a1; vand b0, b0, b1; veor a3, a3, a4; veor b3, b3, b4; \
|
||
+ veor a3, a0; veor b3, b0;
|
||
+
|
||
+#define SBOX3(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmov a4, a0; vmov b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
|
||
+ veor a3, a3, a1; veor b3, b3, b1; vand a1, a1, a4; vand b1, b1, b4; \
|
||
+ veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a3; veor b2, b2, b3; \
|
||
+ vand a3, a3, a0; vand b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \
|
||
+ veor a3, a3, a4; veor b3, b3, b4; veor a0, a0, a1; veor b0, b0, b1; \
|
||
+ vand a4, a4, a0; vand b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \
|
||
+ veor a4, a4, a2; veor b4, b4, b2; vorr a1, a1, a0; vorr b1, b1, b0; \
|
||
+ veor a1, a1, a2; veor b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \
|
||
+ vmov a2, a1; vmov b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
|
||
+ veor a1, a0; veor b1, b0;
|
||
+
|
||
+#define SBOX3_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmov a4, a2; vmov b4, b2; veor a2, a2, a1; veor b2, b2, b1; \
|
||
+ veor a0, a0, a2; veor b0, b0, b2; vand a4, a4, a2; vand b4, b4, b2; \
|
||
+ veor a4, a4, a0; veor b4, b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \
|
||
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \
|
||
+ veor a1, a1, a4; veor b1, b1, b4; vand a3, a3, a2; vand b3, b3, b2; \
|
||
+ veor a3, a3, a1; veor b3, b3, b1; veor a1, a1, a0; veor b1, b1, b0; \
|
||
+ vorr a1, a1, a2; vorr b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \
|
||
+ veor a1, a1, a4; veor b1, b1, b4;\
|
||
+ veor a0, a1; veor b0, b1;
|
||
+
|
||
+#define SBOX4(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; vmvn a3, a3; vmvn b3, b3; \
|
||
+ veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
|
||
+ vmov a4, a1; vmov b4, b1; vand a1, a1, a3; vand b1, b1, b3; \
|
||
+ veor a1, a1, a2; veor b1, b1, b2; veor a4, a4, a3; veor b4, b4, b3; \
|
||
+ veor a0, a0, a4; veor b0, b0, b4; vand a2, a2, a4; vand b2, b2, b4; \
|
||
+ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a1; vand b0, b0, b1; \
|
||
+ veor a3, a3, a0; veor b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \
|
||
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
|
||
+ veor a0, a0, a2; veor b0, b0, b2; vand a2, a2, a3; vand b2, b2, b3; \
|
||
+ vmvn a0, a0; vmvn b0, b0; veor a4, a2; veor b4, b2;
|
||
+
|
||
+#define SBOX4_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmov a4, a2; vmov b4, b2; vand a2, a2, a3; vand b2, b2, b3; \
|
||
+ veor a2, a2, a1; veor b2, b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
|
||
+ vand a1, a1, a0; vand b1, b1, b0; veor a4, a4, a2; veor b4, b4, b2; \
|
||
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \
|
||
+ vmvn a0, a0; vmvn b0, b0; veor a3, a3, a4; veor b3, b3, b4; \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; vand a3, a3, a0; vand b3, b3, b0; \
|
||
+ veor a3, a3, a2; veor b3, b3, b2; veor a0, a0, a1; veor b0, b0, b1; \
|
||
+ vand a2, a2, a0; vand b2, b2, b0; veor a3, a3, a0; veor b3, b3, b0; \
|
||
+ veor a2, a2, a4; veor b2, b2, b4;\
|
||
+ vorr a2, a2, a3; vorr b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
|
||
+ veor a2, a1; veor b2, b1;
|
||
+
|
||
+#define SBOX5(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ veor a0, a0, a1; veor b0, b0, b1; veor a1, a1, a3; veor b1, b1, b3; \
|
||
+ vmvn a3, a3; vmvn b3, b3; vmov a4, a1; vmov b4, b1; \
|
||
+ vand a1, a1, a0; vand b1, b1, b0; veor a2, a2, a3; veor b2, b2, b3; \
|
||
+ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
|
||
+ veor a4, a4, a3; veor b4, b4, b3; vand a3, a3, a1; vand b3, b3, b1; \
|
||
+ veor a3, a3, a0; veor b3, b3, b0; veor a4, a4, a1; veor b4, b4, b1; \
|
||
+ veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a0; veor b2, b2, b0; \
|
||
+ vand a0, a0, a3; vand b0, b0, b3; vmvn a2, a2; vmvn b2, b2; \
|
||
+ veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a3; vorr b4, b4, b3; \
|
||
+ veor a2, a4; veor b2, b4;
|
||
+
|
||
+#define SBOX5_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmvn a1, a1; vmvn b1, b1; vmov a4, a3; vmov b4, b3; \
|
||
+ veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a0; vorr b3, b3, b0; \
|
||
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
|
||
+ vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \
|
||
+ veor a2, a2, a4; veor b2, b2, b4; vorr a4, a4, a0; vorr b4, b4, b0; \
|
||
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
|
||
+ vand a3, a3, a4; vand b3, b3, b4; veor a4, a4, a1; veor b4, b4, b1; \
|
||
+ veor a3, a3, a4; veor b3, b3, b4; vmvn a4, a4; vmvn b4, b4; \
|
||
+ veor a3, a0; veor b3, b0;
|
||
+
|
||
+#define SBOX6(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmvn a2, a2; vmvn b2, b2; vmov a4, a3; vmov b4, b3; \
|
||
+ vand a3, a3, a0; vand b3, b3, b0; veor a0, a0, a4; veor b0, b0, b4; \
|
||
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; veor a2, a2, a0; veor b2, b2, b0; \
|
||
+ vorr a0, a0, a1; vorr b0, b0, b1; veor a2, a2, a1; veor b2, b2, b1; \
|
||
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
|
||
+ veor a0, a0, a2; veor b0, b0, b2; veor a4, a4, a3; veor b4, b4, b3; \
|
||
+ veor a4, a4, a0; veor b4, b4, b0; vmvn a3, a3; vmvn b3, b3; \
|
||
+ vand a2, a2, a4; vand b2, b2, b4;\
|
||
+ veor a2, a3; veor b2, b3;
|
||
+
|
||
+#define SBOX6_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ veor a0, a0, a2; veor b0, b0, b2; vmov a4, a2; vmov b4, b2; \
|
||
+ vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \
|
||
+ vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \
|
||
+ veor a2, a2, a3; veor b2, b2, b3; vorr a4, a4, a0; vorr b4, b4, b0; \
|
||
+ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
|
||
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a3; vand b1, b1, b3; \
|
||
+ veor a1, a1, a0; veor b1, b1, b0; veor a0, a0, a3; veor b0, b0, b3; \
|
||
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a3, a3, a1; veor b3, b3, b1; \
|
||
+ veor a4, a0; veor b4, b0;
|
||
+
|
||
+#define SBOX7(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmov a4, a1; vmov b4, b1; vorr a1, a1, a2; vorr b1, b1, b2; \
|
||
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
|
||
+ veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a4; vorr b3, b3, b4; \
|
||
+ vand a3, a3, a0; vand b3, b3, b0; veor a4, a4, a2; veor b4, b4, b2; \
|
||
+ veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a4; vorr b1, b1, b4; \
|
||
+ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a4; vorr b0, b0, b4; \
|
||
+ veor a0, a0, a2; veor b0, b0, b2; veor a1, a1, a4; veor b1, b1, b4; \
|
||
+ veor a2, a2, a1; veor b2, b2, b1; vand a1, a1, a0; vand b1, b1, b0; \
|
||
+ veor a1, a1, a4; veor b1, b1, b4; vmvn a2, a2; vmvn b2, b2; \
|
||
+ vorr a2, a2, a0; vorr b2, b2, b0;\
|
||
+ veor a4, a2; veor b4, b2;
|
||
+
|
||
+#define SBOX7_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vmov a4, a2; vmov b4, b2; veor a2, a2, a0; veor b2, b2, b0; \
|
||
+ vand a0, a0, a3; vand b0, b0, b3; vorr a4, a4, a3; vorr b4, b4, b3; \
|
||
+ vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \
|
||
+ vorr a1, a1, a0; vorr b1, b1, b0; veor a0, a0, a2; veor b0, b0, b2; \
|
||
+ vand a2, a2, a4; vand b2, b2, b4; vand a3, a3, a4; vand b3, b3, b4; \
|
||
+ veor a1, a1, a2; veor b1, b1, b2; veor a2, a2, a0; veor b2, b2, b0; \
|
||
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a4, a4, a1; veor b4, b4, b1; \
|
||
+ veor a0, a0, a3; veor b0, b0, b3; veor a3, a3, a4; veor b3, b3, b4; \
|
||
+ vorr a4, a4, a0; vorr b4, b4, b0; veor a3, a3, a2; veor b3, b3, b2; \
|
||
+ veor a4, a2; veor b4, b2;
|
||
+
|
||
+/* Apply SBOX number WHICH to the block. */
|
||
+#define SBOX(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ SBOX##which (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4)
|
||
+
|
||
+/* Apply inverse SBOX number WHICH to the block. */
|
||
+#define SBOX_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ SBOX##which##_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4)
|
||
+
|
||
+/* XOR round key into block state in a0,a1,a2,a3. a4 used as temporary. */
|
||
+#define BLOCK_XOR_KEY(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vdup.32 RT3, RT0d0[0]; \
|
||
+ vdup.32 RT1, RT0d0[1]; \
|
||
+ vdup.32 RT2, RT0d1[0]; \
|
||
+ vdup.32 RT0, RT0d1[1]; \
|
||
+ veor a0, a0, RT3; veor b0, b0, RT3; \
|
||
+ veor a1, a1, RT1; veor b1, b1, RT1; \
|
||
+ veor a2, a2, RT2; veor b2, b2, RT2; \
|
||
+ veor a3, a3, RT0; veor b3, b3, RT0;
|
||
+
|
||
+#define BLOCK_LOAD_KEY_ENC() \
|
||
+ vld1.8 {RT0d0, RT0d1}, [RROUND]!;
|
||
+
|
||
+#define BLOCK_LOAD_KEY_DEC() \
|
||
+ vld1.8 {RT0d0, RT0d1}, [RROUND]; \
|
||
+ sub RROUND, RROUND, #16
|
||
+
|
||
+/* Apply the linear transformation to BLOCK. */
|
||
+#define LINEAR_TRANSFORMATION(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vshl.u32 a4, a0, #13; vshl.u32 b4, b0, #13; \
|
||
+ vshr.u32 a0, a0, #(32-13); vshr.u32 b0, b0, #(32-13); \
|
||
+ veor a0, a0, a4; veor b0, b0, b4; \
|
||
+ vshl.u32 a4, a2, #3; vshl.u32 b4, b2, #3; \
|
||
+ vshr.u32 a2, a2, #(32-3); vshr.u32 b2, b2, #(32-3); \
|
||
+ veor a2, a2, a4; veor b2, b2, b4; \
|
||
+ veor a1, a0, a1; veor b1, b0, b1; \
|
||
+ veor a1, a2, a1; veor b1, b2, b1; \
|
||
+ vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \
|
||
+ veor a3, a2, a3; veor b3, b2, b3; \
|
||
+ veor a3, a4, a3; veor b3, b4, b3; \
|
||
+ vshl.u32 a4, a1, #1; vshl.u32 b4, b1, #1; \
|
||
+ vshr.u32 a1, a1, #(32-1); vshr.u32 b1, b1, #(32-1); \
|
||
+ veor a1, a1, a4; veor b1, b1, b4; \
|
||
+ vshl.u32 a4, a3, #7; vshl.u32 b4, b3, #7; \
|
||
+ vshr.u32 a3, a3, #(32-7); vshr.u32 b3, b3, #(32-7); \
|
||
+ veor a3, a3, a4; veor b3, b3, b4; \
|
||
+ veor a0, a1, a0; veor b0, b1, b0; \
|
||
+ veor a0, a3, a0; veor b0, b3, b0; \
|
||
+ vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \
|
||
+ veor a2, a3, a2; veor b2, b3, b2; \
|
||
+ veor a2, a4, a2; veor b2, b4, b2; \
|
||
+ vshl.u32 a4, a0, #5; vshl.u32 b4, b0, #5; \
|
||
+ vshr.u32 a0, a0, #(32-5); vshr.u32 b0, b0, #(32-5); \
|
||
+ veor a0, a0, a4; veor b0, b0, b4; \
|
||
+ vshl.u32 a4, a2, #22; vshl.u32 b4, b2, #22; \
|
||
+ vshr.u32 a2, a2, #(32-22); vshr.u32 b2, b2, #(32-22); \
|
||
+ veor a2, a2, a4; veor b2, b2, b4;
+
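The LINEAR_TRANSFORMATION macro above applies Serpent's linear mixing layer to two groups of four blocks at once (the a* and b* register sets; after transpose_4x4 each q register holds one state word from four different blocks). A single-state C sketch of the same rotate/shift/xor schedule (illustrative only):

    #include <stdint.h>

    static inline uint32_t rol32(uint32_t x, unsigned n)
    {
      return (x << n) | (x >> (32 - n));
    }

    static void serpent_linear_transform(uint32_t x[4])
    {
      x[0] = rol32(x[0], 13);
      x[2] = rol32(x[2], 3);
      x[1] ^= x[0] ^ x[2];
      x[3] ^= x[2] ^ (x[0] << 3);
      x[1] = rol32(x[1], 1);
      x[3] = rol32(x[3], 7);
      x[0] ^= x[1] ^ x[3];
      x[2] ^= x[3] ^ (x[1] << 7);
      x[0] = rol32(x[0], 5);
      x[2] = rol32(x[2], 22);
    }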
+/* Apply the inverse linear transformation to BLOCK. */
|
||
+#define LINEAR_TRANSFORMATION_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
|
||
+ vshr.u32 a4, a2, #22; vshr.u32 b4, b2, #22; \
|
||
+ vshl.u32 a2, a2, #(32-22); vshl.u32 b2, b2, #(32-22); \
|
||
+ veor a2, a2, a4; veor b2, b2, b4; \
|
||
+ vshr.u32 a4, a0, #5; vshr.u32 b4, b0, #5; \
|
||
+ vshl.u32 a0, a0, #(32-5); vshl.u32 b0, b0, #(32-5); \
|
||
+ veor a0, a0, a4; veor b0, b0, b4; \
|
||
+ vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \
|
||
+ veor a2, a3, a2; veor b2, b3, b2; \
|
||
+ veor a2, a4, a2; veor b2, b4, b2; \
|
||
+ veor a0, a1, a0; veor b0, b1, b0; \
|
||
+ veor a0, a3, a0; veor b0, b3, b0; \
|
||
+ vshr.u32 a4, a3, #7; vshr.u32 b4, b3, #7; \
|
||
+ vshl.u32 a3, a3, #(32-7); vshl.u32 b3, b3, #(32-7); \
|
||
+ veor a3, a3, a4; veor b3, b3, b4; \
|
||
+ vshr.u32 a4, a1, #1; vshr.u32 b4, b1, #1; \
|
||
+ vshl.u32 a1, a1, #(32-1); vshl.u32 b1, b1, #(32-1); \
|
||
+ veor a1, a1, a4; veor b1, b1, b4; \
|
||
+ vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \
|
||
+ veor a3, a2, a3; veor b3, b2, b3; \
|
||
+ veor a3, a4, a3; veor b3, b4, b3; \
|
||
+ veor a1, a0, a1; veor b1, b0, b1; \
|
||
+ veor a1, a2, a1; veor b1, b2, b1; \
|
||
+ vshr.u32 a4, a2, #3; vshr.u32 b4, b2, #3; \
|
||
+ vshl.u32 a2, a2, #(32-3); vshl.u32 b2, b2, #(32-3); \
|
||
+ veor a2, a2, a4; veor b2, b2, b4; \
|
||
+ vshr.u32 a4, a0, #13; vshr.u32 b4, b0, #13; \
|
||
+ vshl.u32 a0, a0, #(32-13); vshl.u32 b0, b0, #(32-13); \
|
||
+ veor a0, a0, a4; veor b0, b0, b4;
|
||
+
|
||
+/* Apply a Serpent round to eight parallel blocks. This macro increments
|
||
+ `round'. */
|
||
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
|
||
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
|
||
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
|
||
+ BLOCK_LOAD_KEY_ENC (); \
|
||
+ SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
|
||
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4);
|
||
+
|
||
+/* Apply the last Serpent round to eight parallel blocks. This macro increments
|
||
+ `round'. */
|
||
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
|
||
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
|
||
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
|
||
+ BLOCK_LOAD_KEY_ENC (); \
|
||
+ SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
|
||
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4);
|
||
+
|
||
+/* Apply an inverse Serpent round to eight parallel blocks. This macro
|
||
+ increments `round'. */
|
||
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
|
||
+ na0, na1, na2, na3, na4, \
|
||
+ b0, b1, b2, b3, b4, \
|
||
+ nb0, nb1, nb2, nb3, nb4) \
|
||
+ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
|
||
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
|
||
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \
|
||
+ BLOCK_LOAD_KEY_DEC ();
|
||
+
|
||
+/* Apply the first inverse Serpent round to eight parallel blocks. This macro
|
||
+ increments `round'. */
|
||
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
|
||
+ na0, na1, na2, na3, na4, \
|
||
+ b0, b1, b2, b3, b4, \
|
||
+ nb0, nb1, nb2, nb3, nb4) \
|
||
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
|
||
+ BLOCK_LOAD_KEY_DEC (); \
|
||
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
|
||
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \
|
||
+ BLOCK_LOAD_KEY_DEC ();
|
||
+
|
||
+.align 3
|
||
+.type __serpent_enc_blk8,%function;
|
||
+__serpent_enc_blk8:
|
||
+ /* input:
|
||
+ * r0: round key pointer
|
||
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
|
||
+ * blocks
|
||
+ * output:
|
||
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
|
||
+ * ciphertext blocks
|
||
+ */
|
||
+
|
||
+ transpose_4x4(RA0, RA1, RA2, RA3);
|
||
+ BLOCK_LOAD_KEY_ENC ();
|
||
+ transpose_4x4(RB0, RB1, RB2, RB3);
|
||
+
|
||
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
|
||
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
|
||
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
|
||
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
|
||
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
|
||
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
|
||
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
|
||
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
|
||
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
|
||
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
|
||
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
|
||
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
|
||
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
|
||
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
|
||
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
|
||
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
|
||
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
|
||
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
|
||
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
|
||
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
|
||
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
|
||
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
|
||
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
|
||
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
|
||
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
|
||
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
|
||
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
|
||
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
|
||
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
|
||
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
|
||
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
|
||
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
|
||
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
|
||
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
|
||
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
|
||
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
|
||
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
|
||
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
|
||
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
|
||
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
|
||
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
|
||
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
|
||
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
|
||
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
|
||
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
|
||
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
|
||
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
|
||
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
|
||
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
|
||
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
|
||
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
|
||
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
|
||
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
|
||
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
|
||
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
|
||
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
|
||
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
|
||
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
|
||
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
|
||
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
|
||
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
|
||
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
|
||
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
|
||
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
|
||
+
|
||
+ transpose_4x4(RA4, RA1, RA2, RA0);
|
||
+ transpose_4x4(RB4, RB1, RB2, RB0);
|
||
+
|
||
+ bx lr;
|
||
+.size __serpent_enc_blk8,.-__serpent_enc_blk8;
|
||
+
|
||
+.align 3
|
||
+.type __serpent_dec_blk8,%function;
|
||
+__serpent_dec_blk8:
|
||
+ /* input:
|
||
+ * r0: round key pointer
|
||
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
|
||
+ * ciphertext blocks
|
||
+ * output:
|
||
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
|
||
+ * blocks
|
||
+ */
|
||
+
|
||
+ add RROUND, RROUND, #(32*16);
|
||
+
|
||
+ transpose_4x4(RA0, RA1, RA2, RA3);
|
||
+ BLOCK_LOAD_KEY_DEC ();
|
||
+ transpose_4x4(RB0, RB1, RB2, RB3);
|
||
+
|
||
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
|
||
+ RA3, RA0, RA1, RA4, RA2,
|
||
+ RB0, RB1, RB2, RB3, RB4,
|
||
+ RB3, RB0, RB1, RB4, RB2);
|
||
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
|
||
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
|
||
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
|
||
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
|
||
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
|
||
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
|
||
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
|
||
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
|
||
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
|
||
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
|
||
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
|
||
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
|
||
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
|
||
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
|
||
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
|
||
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
|
||
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
|
||
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
|
||
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
|
||
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
|
||
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
|
||
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
|
||
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
|
||
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
|
||
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
|
||
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
|
||
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
|
||
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
|
||
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
|
||
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
|
||
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
|
||
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
|
||
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
|
||
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
|
||
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
|
||
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
|
||
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
|
||
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
|
||
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
|
||
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
|
||
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
|
||
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
|
||
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
|
||
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
|
||
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
|
||
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
|
||
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
|
||
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
|
||
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
|
||
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
|
||
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
|
||
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
|
||
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
|
||
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
|
||
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
|
||
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
|
||
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
|
||
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
|
||
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
|
||
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
|
||
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
|
||
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
|
||
+
|
||
+ transpose_4x4(RA0, RA1, RA2, RA3);
|
||
+ transpose_4x4(RB0, RB1, RB2, RB3);
|
||
+
|
||
+ bx lr;
|
||
+.size __serpent_dec_blk8,.-__serpent_dec_blk8;
|
||
+
|
||
+.align 3
|
||
+.globl _gcry_serpent_neon_ctr_enc
|
||
+.type _gcry_serpent_neon_ctr_enc,%function;
|
||
+_gcry_serpent_neon_ctr_enc:
|
||
+ /* input:
|
||
+ * r0: ctx, CTX
|
||
+ * r1: dst (8 blocks)
|
||
+ * r2: src (8 blocks)
|
||
+ * r3: iv
|
||
+ */
|
||
+
|
||
+ vmov.u8 RT1d0, #0xff; /* u64: -1 */
|
||
+ push {r4,lr};
|
||
+ vadd.u64 RT2d0, RT1d0, RT1d0; /* u64: -2 */
|
||
+ vpush {RA4-RB2};
|
||
+
|
||
+ /* load IV and byteswap */
|
||
+ vld1.8 {RA0}, [r3];
|
||
+ vrev64.u8 RT0, RA0; /* be => le */
|
||
+ ldr r4, [r3, #8];
|
||
+
|
||
+ /* construct IVs */
|
||
+ vsub.u64 RA2d1, RT0d1, RT2d0; /* +2 */
|
||
+ vsub.u64 RA1d1, RT0d1, RT1d0; /* +1 */
|
||
+ cmp r4, #-1;
|
||
+
|
||
+ vsub.u64 RB0d1, RA2d1, RT2d0; /* +4 */
|
||
+ vsub.u64 RA3d1, RA2d1, RT1d0; /* +3 */
|
||
+ ldr r4, [r3, #12];
|
||
+
|
||
+ vsub.u64 RB2d1, RB0d1, RT2d0; /* +6 */
|
||
+ vsub.u64 RB1d1, RB0d1, RT1d0; /* +5 */
|
||
+
|
||
+ vsub.u64 RT2d1, RB2d1, RT2d0; /* +8 */
|
||
+ vsub.u64 RB3d1, RB2d1, RT1d0; /* +7 */
|
||
+
|
||
+ vmov RA1d0, RT0d0;
|
||
+ vmov RA2d0, RT0d0;
|
||
+ vmov RA3d0, RT0d0;
|
||
+ vmov RB0d0, RT0d0;
|
||
+ rev r4, r4;
|
||
+ vmov RB1d0, RT0d0;
|
||
+ vmov RB2d0, RT0d0;
|
||
+ vmov RB3d0, RT0d0;
|
||
+ vmov RT2d0, RT0d0;
|
||
+
|
||
+ /* check need for handling 64-bit overflow and carry */
|
||
+ beq .Ldo_ctr_carry;
|
||
+
|
||
+.Lctr_carry_done:
|
||
+ /* le => be */
|
||
+ vrev64.u8 RA1, RA1;
|
||
+ vrev64.u8 RA2, RA2;
|
||
+ vrev64.u8 RA3, RA3;
|
||
+ vrev64.u8 RB0, RB0;
|
||
+ vrev64.u8 RT2, RT2;
|
||
+ vrev64.u8 RB1, RB1;
|
||
+ vrev64.u8 RB2, RB2;
|
||
+ vrev64.u8 RB3, RB3;
|
||
+ /* store new IV */
|
||
+ vst1.8 {RT2}, [r3];
|
||
+
|
||
+ bl __serpent_enc_blk8;
|
||
+
|
||
+ vld1.8 {RT0, RT1}, [r2]!;
|
||
+ vld1.8 {RT2, RT3}, [r2]!;
|
||
+ veor RA4, RA4, RT0;
|
||
+ veor RA1, RA1, RT1;
|
||
+ vld1.8 {RT0, RT1}, [r2]!;
|
||
+ veor RA2, RA2, RT2;
|
||
+ veor RA0, RA0, RT3;
|
||
+ vld1.8 {RT2, RT3}, [r2]!;
|
||
+ veor RB4, RB4, RT0;
|
||
+ veor RT0, RT0;
|
||
+ veor RB1, RB1, RT1;
|
||
+ veor RT1, RT1;
|
||
+ veor RB2, RB2, RT2;
|
||
+ veor RT2, RT2;
|
||
+ veor RB0, RB0, RT3;
|
||
+ veor RT3, RT3;
|
||
+
|
||
+ vst1.8 {RA4}, [r1]!;
|
||
+ vst1.8 {RA1}, [r1]!;
|
||
+ veor RA1, RA1;
|
||
+ vst1.8 {RA2}, [r1]!;
|
||
+ veor RA2, RA2;
|
||
+ vst1.8 {RA0}, [r1]!;
|
||
+ veor RA0, RA0;
|
||
+ vst1.8 {RB4}, [r1]!;
|
||
+ veor RB4, RB4;
|
||
+ vst1.8 {RB1}, [r1]!;
|
||
+ vst1.8 {RB2}, [r1]!;
|
||
+ vst1.8 {RB0}, [r1]!;
|
||
+
|
||
+ vpop {RA4-RB2};
|
||
+
|
||
+ /* clear the used registers */
|
||
+ veor RA3, RA3;
|
||
+ veor RB3, RB3;
|
||
+
|
||
+ pop {r4,pc};
|
||
+
|
||
+.Ldo_ctr_carry:
|
||
+ cmp r4, #-8;
|
||
+ blo .Lctr_carry_done;
|
||
+ beq .Lcarry_RT2;
|
||
+
|
||
+ cmp r4, #-6;
|
||
+ blo .Lcarry_RB3;
|
||
+ beq .Lcarry_RB2;
|
||
+
|
||
+ cmp r4, #-4;
|
||
+ blo .Lcarry_RB1;
|
||
+ beq .Lcarry_RB0;
|
||
+
|
||
+ cmp r4, #-2;
|
||
+ blo .Lcarry_RA3;
|
||
+ beq .Lcarry_RA2;
|
||
+
|
||
+ vsub.u64 RA1d0, RT1d0;
|
||
+.Lcarry_RA2:
|
||
+ vsub.u64 RA2d0, RT1d0;
|
||
+.Lcarry_RA3:
|
||
+ vsub.u64 RA3d0, RT1d0;
|
||
+.Lcarry_RB0:
|
||
+ vsub.u64 RB0d0, RT1d0;
|
||
+.Lcarry_RB1:
|
||
+ vsub.u64 RB1d0, RT1d0;
|
||
+.Lcarry_RB2:
|
||
+ vsub.u64 RB2d0, RT1d0;
|
||
+.Lcarry_RB3:
|
||
+ vsub.u64 RB3d0, RT1d0;
|
||
+.Lcarry_RT2:
|
||
+ vsub.u64 RT2d0, RT1d0;
|
||
+
|
||
+ b .Lctr_carry_done;
|
||
+.size _gcry_serpent_neon_ctr_enc,.-_gcry_serpent_neon_ctr_enc;
|
||
+
|
||
+.align 3
|
||
+.globl _gcry_serpent_neon_cfb_dec
|
||
+.type _gcry_serpent_neon_cfb_dec,%function;
|
||
+_gcry_serpent_neon_cfb_dec:
|
||
+ /* input:
|
||
+ * r0: ctx, CTX
|
||
+ * r1: dst (8 blocks)
|
||
+ * r2: src (8 blocks)
|
||
+ * r3: iv
|
||
+ */
|
||
+
|
||
+ push {lr};
|
||
+ vpush {RA4-RB2};
|
||
+
|
||
+ /* Load input */
|
||
+ vld1.8 {RA0}, [r3];
|
||
+ vld1.8 {RA1, RA2}, [r2]!;
|
||
+ vld1.8 {RA3}, [r2]!;
|
||
+ vld1.8 {RB0}, [r2]!;
|
||
+ vld1.8 {RB1, RB2}, [r2]!;
|
||
+ vld1.8 {RB3}, [r2]!;
|
||
+
|
||
+ /* Update IV */
|
||
+ vld1.8 {RT0}, [r2]!;
|
||
+ vst1.8 {RT0}, [r3];
|
||
+ mov r3, lr;
|
||
+ sub r2, r2, #(8*16);
|
||
+
|
||
+ bl __serpent_enc_blk8;
|
||
+
|
||
+ vld1.8 {RT0, RT1}, [r2]!;
|
||
+ vld1.8 {RT2, RT3}, [r2]!;
|
||
+ veor RA4, RA4, RT0;
|
||
+ veor RA1, RA1, RT1;
|
||
+ vld1.8 {RT0, RT1}, [r2]!;
|
||
+ veor RA2, RA2, RT2;
|
||
+ veor RA0, RA0, RT3;
|
||
+ vld1.8 {RT2, RT3}, [r2]!;
|
||
+ veor RB4, RB4, RT0;
|
||
+ veor RT0, RT0;
|
||
+ veor RB1, RB1, RT1;
|
||
+ veor RT1, RT1;
|
||
+ veor RB2, RB2, RT2;
|
||
+ veor RT2, RT2;
|
||
+ veor RB0, RB0, RT3;
|
||
+ veor RT3, RT3;
|
||
+
|
||
+ vst1.8 {RA4}, [r1]!;
|
||
+ vst1.8 {RA1}, [r1]!;
|
||
+ veor RA1, RA1;
|
||
+ vst1.8 {RA2}, [r1]!;
|
||
+ veor RA2, RA2;
|
||
+ vst1.8 {RA0}, [r1]!;
|
||
+ veor RA0, RA0;
|
||
+ vst1.8 {RB4}, [r1]!;
|
||
+ veor RB4, RB4;
|
||
+ vst1.8 {RB1}, [r1]!;
|
||
+ vst1.8 {RB2}, [r1]!;
|
||
+ vst1.8 {RB0}, [r1]!;
|
||
+
|
||
+ vpop {RA4-RB2};
|
||
+
|
||
+ /* clear the used registers */
|
||
+ veor RA3, RA3;
|
||
+ veor RB3, RB3;
|
||
+
|
||
+ pop {pc};
|
||
+.size _gcry_serpent_neon_cfb_dec,.-_gcry_serpent_neon_cfb_dec;
|
||
+
|
||
+.align 3
|
||
+.globl _gcry_serpent_neon_cbc_dec
|
||
+.type _gcry_serpent_neon_cbc_dec,%function;
|
||
+_gcry_serpent_neon_cbc_dec:
|
||
+ /* input:
|
||
+ * r0: ctx, CTX
|
||
+ * r1: dst (8 blocks)
|
||
+ * r2: src (8 blocks)
|
||
+ * r3: iv
|
||
+ */
|
||
+
|
||
+ push {lr};
|
||
+ vpush {RA4-RB2};
|
||
+
|
||
+ vld1.8 {RA0, RA1}, [r2]!;
|
||
+ vld1.8 {RA2, RA3}, [r2]!;
|
||
+ vld1.8 {RB0, RB1}, [r2]!;
|
||
+ vld1.8 {RB2, RB3}, [r2]!;
|
||
+ sub r2, r2, #(8*16);
|
||
+
|
||
+ bl __serpent_dec_blk8;
|
||
+
|
||
+ vld1.8 {RB4}, [r3];
|
||
+ vld1.8 {RT0, RT1}, [r2]!;
|
||
+ vld1.8 {RT2, RT3}, [r2]!;
|
||
+ veor RA0, RA0, RB4;
|
||
+ veor RA1, RA1, RT0;
|
||
+ veor RA2, RA2, RT1;
|
||
+ vld1.8 {RT0, RT1}, [r2]!;
|
||
+ veor RA3, RA3, RT2;
|
||
+ veor RB0, RB0, RT3;
|
||
+ vld1.8 {RT2, RT3}, [r2]!;
|
||
+ veor RB1, RB1, RT0;
|
||
+ veor RT0, RT0;
|
||
+ veor RB2, RB2, RT1;
|
||
+ veor RT1, RT1;
|
||
+ veor RB3, RB3, RT2;
|
||
+ veor RT2, RT2;
|
||
+ vst1.8 {RT3}, [r3]; /* store new IV */
|
||
+ veor RT3, RT3;
|
||
+
|
||
+ vst1.8 {RA0, RA1}, [r1]!;
|
||
+ veor RA0, RA0;
|
||
+ veor RA1, RA1;
|
||
+ vst1.8 {RA2, RA3}, [r1]!;
|
||
+ veor RA2, RA2;
|
||
+ vst1.8 {RB0, RB1}, [r1]!;
|
||
+ veor RA3, RA3;
|
||
+ vst1.8 {RB2, RB3}, [r1]!;
|
||
+ veor RB3, RB3;
|
||
+
|
||
+ vpop {RA4-RB2};
|
||
+
|
||
+ /* clear the used registers */
|
||
+ veor RB4, RB4;
|
||
+
|
||
+ pop {pc};
|
||
+.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec;
|
||
+
|
||
+#endif
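Both CTR entry points in this patch (_gcry_blowfish_arm_ctr_enc and _gcry_serpent_neon_ctr_enc) treat the IV as a big-endian block counter: they encrypt counter, counter+1, ... to form the keystream, XOR it into the input, and store the next counter value back to the IV. The adds/adc and vsub-by-minus-one sequences above are unrolled forms of that increment; a generic, illustrative C sketch of the per-block step:

    #include <stddef.h>
    #include <stdint.h>

    /* Increment a big-endian counter block in place
     * (8 bytes for Blowfish, 16 bytes for Serpent). */
    static void ctr_inc_be(uint8_t *ctr, size_t len)
    {
      for (size_t i = len; i > 0; i--)
        if (++ctr[i - 1] != 0)
          break;
    }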