forked from pool/libgcrypt
libgcrypt/arm-missing-files.diff

--- cipher/blowfish-arm.S
+++ cipher/blowfish-arm.S
@@ -0,0 +1,743 @@
+/* blowfish-arm.S - ARM assembly implementation of Blowfish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of crypto context */
+#define s0 0
+#define s1 (s0 + (1 * 256) * 4)
+#define s2 (s0 + (2 * 256) * 4)
+#define s3 (s0 + (3 * 256) * 4)
+#define p (s3 + (1 * 256) * 4)
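+
+/* Note: the offsets above assume the usual libgcrypt Blowfish context
+ * layout, roughly (a sketch; the exact C-side field names may differ):
+ *
+ *   typedef struct {
+ *       u32 s0[256], s1[256], s2[256], s3[256];   .. four 1 KiB S-boxes
+ *       u32 p[16 + 2];                            .. 18-word P-array
+ *   } BLOWFISH_context;
+ */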
+
+/* register macros */
+#define CTXs0 %r0
+#define CTXs1 %r9
+#define CTXs2 %r8
+#define CTXs3 %r10
+#define RMASK %lr
+#define RKEYL %r2
+#define RKEYR %ip
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %r11
+#define RT1 %r7
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 0)]; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 3)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 0)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 1)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 2)]; \
+ strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 3)]; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 0)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 3)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 2)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 1)]; \
+ strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+ #define ldr_unaligned_host ldr_unaligned_le
+ #define str_unaligned_host str_unaligned_le
+
+ /* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+ #define host_to_be(reg, rtmp) \
+ rev reg, reg;
+ #define be_to_host(reg, rtmp) \
+ rev reg, reg;
+#else
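+ /* Fallback byte swap for pre-v6 cores without 'rev'.  The macros below
+  * compute, in C terms (ror = 32-bit rotate right):
+  *
+  *   t = x ^ ror(x, 16);
+  *   t = (t >> 8) & ~0xff00;
+  *   x = t ^ ror(x, 8);
+  *
+  * which is a plain 32-bit byte reversal. */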
+ #define host_to_be(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+ #define be_to_host(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+#endif
+#else
+ #define ldr_unaligned_host ldr_unaligned_be
+ #define str_unaligned_host str_unaligned_be
+
+ /* nop on big-endian */
+ #define host_to_be(reg, rtmp) /*_*/
+ #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F(l, r) \
+ and RT0, RMASK, l, lsr#(24 - 2); \
+ and RT1, RMASK, l, lsr#(16 - 2); \
+ ldr RT0, [CTXs0, RT0]; \
+ and RT2, RMASK, l, lsr#(8 - 2); \
+ ldr RT1, [CTXs1, RT1]; \
+ and RT3, RMASK, l, lsl#2; \
+ ldr RT2, [CTXs2, RT2]; \
+ add RT0, RT1; \
+ ldr RT3, [CTXs3, RT3]; \
+ eor RT0, RT2; \
+ add RT0, RT3; \
+ eor r, RT0;
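+
+/* F() above is the standard Blowfish F function folded into `r', i.e.
+ * (a C sketch, additions mod 2^32):
+ *
+ *   r ^= ((s0[l >> 24] + s1[(l >> 16) & 0xff])
+ *          ^ s2[(l >> 8) & 0xff]) + s3[l & 0xff];
+ *
+ * RMASK = 0xff << 2, so each extracted byte is already scaled to a word
+ * offset into the 32-bit S-box tables.
+ */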
+
+#define load_roundkey_enc(n) \
+ ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \
+ ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))];
+
+#define add_roundkey_enc() \
+ eor RL0, RKEYL; \
+ eor RR0, RKEYR;
+
+#define round_enc(n) \
+ add_roundkey_enc(); \
+ load_roundkey_enc(n); \
+ \
+ F(RL0, RR0); \
+ F(RR0, RL0);
+
+#define load_roundkey_dec(n) \
+ ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \
+ ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))];
+
+#define add_roundkey_dec() \
+ eor RL0, RKEYL; \
+ eor RR0, RKEYR;
+
+#define round_dec(n) \
+ add_roundkey_dec(); \
+ load_roundkey_dec(n); \
+ \
+ F(RL0, RR0); \
+ F(RR0, RL0);
+
+#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
+ ldr l0, [rin, #((offs) + 0)]; \
+ ldr r0, [rin, #((offs) + 4)]; \
+ convert(l0, rtmp); \
+ convert(r0, rtmp);
+
+#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
+ convert(l0, rtmp); \
+ convert(r0, rtmp); \
+ str l0, [rout, #((offs) + 0)]; \
+ str r0, [rout, #((offs) + 4)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads allowed */
+ #define read_block(rin, offs, l0, r0, rtmp0) \
+ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
+
+ #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
+ write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
+
+ #define read_block_host(rin, offs, l0, r0, rtmp0) \
+ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
+
+ #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
+ write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
+#else
+ /* need to handle unaligned reads by byte reads */
+ #define read_block(rin, offs, l0, r0, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
+ ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
+ 2:;
+
+ #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+ str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
+ 2:;
+
+ #define read_block_host(rin, offs, l0, r0, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
+ ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
+ 2:;
+
+ #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+ str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
+ 2:;
+#endif
+
+.align 3
+.type __blowfish_enc_blk1,%function;
+
+__blowfish_enc_blk1:
+ /* input:
+ * preloaded: CTX
+ * [RL0, RR0]: src
+ * output:
+ * [RR0, RL0]: dst
+ */
+ push {%lr};
+
+ add CTXs1, CTXs0, #(s1 - s0);
+ add CTXs2, CTXs0, #(s2 - s0);
+ mov RMASK, #(0xff << 2); /* byte mask */
+ add CTXs3, CTXs1, #(s3 - s1);
+
+ load_roundkey_enc(0);
+ round_enc(2);
+ round_enc(4);
+ round_enc(6);
+ round_enc(8);
+ round_enc(10);
+ round_enc(12);
+ round_enc(14);
+ round_enc(16);
+ add_roundkey_enc();
+
+ pop {%pc};
+.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
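+
+/* As a C sketch, the unrolled rounds above compute (F as in the comment
+ * near the top of this file):
+ *
+ *   for (i = 0; i < 16; i += 2) {
+ *       xl ^= p[i];   xr ^= p[i + 1];
+ *       xr ^= F(xl);  xl ^= F(xr);
+ *   }
+ *   xl ^= p[16];  xr ^= p[17];
+ *   output = (xr, xl);               .. halves are returned swapped
+ */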
+
+.align 8
+.globl _gcry_blowfish_arm_do_encrypt
+.type _gcry_blowfish_arm_do_encrypt,%function;
+
+_gcry_blowfish_arm_do_encrypt:
+ /* input:
+ * %r0: ctx, CTX
+ * %r1: u32 *ret_xl
+ * %r2: u32 *ret_xr
+ */
+ push {%r2, %r4-%r11, %ip, %lr};
+
+ ldr RL0, [%r1];
+ ldr RR0, [%r2];
+
+ bl __blowfish_enc_blk1;
+
+ pop {%r2};
+ str RR0, [%r1];
+ str RL0, [%r2];
+
+ pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt;
+
+.align 3
+.globl _gcry_blowfish_arm_encrypt_block
+.type _gcry_blowfish_arm_encrypt_block,%function;
+
+_gcry_blowfish_arm_encrypt_block:
+ /* input:
+ * %r0: ctx, CTX
+ * %r1: dst
+ * %r2: src
+ */
+ push {%r4-%r11, %ip, %lr};
+
+ read_block(%r2, 0, RL0, RR0, RT0);
+
+ bl __blowfish_enc_blk1;
+
+ write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block;
+
+.align 3
+.globl _gcry_blowfish_arm_decrypt_block
+.type _gcry_blowfish_arm_decrypt_block,%function;
+
+_gcry_blowfish_arm_decrypt_block:
+ /* input:
+ * %r0: ctx, CTX
+ * %r1: dst
+ * %r2: src
+ */
+ push {%r4-%r11, %ip, %lr};
+
+ add CTXs1, CTXs0, #(s1 - s0);
+ add CTXs2, CTXs0, #(s2 - s0);
+ mov RMASK, #(0xff << 2); /* byte mask */
+ add CTXs3, CTXs1, #(s3 - s1);
+
+ read_block(%r2, 0, RL0, RR0, RT0);
+
+ load_roundkey_dec(17);
+ round_dec(15);
+ round_dec(13);
+ round_dec(11);
+ round_dec(9);
+ round_dec(7);
+ round_dec(5);
+ round_dec(3);
+ round_dec(1);
+ add_roundkey_dec();
+
+ write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block;
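+
+/* For reference, the C side is expected to declare these entry points
+ * roughly as follows (an assumption for illustration only, not copied
+ * from blowfish.c):
+ *
+ *   void _gcry_blowfish_arm_do_encrypt(void *ctx, u32 *ret_xl, u32 *ret_xr);
+ *   void _gcry_blowfish_arm_encrypt_block(void *ctx, byte *out, const byte *in);
+ *   void _gcry_blowfish_arm_decrypt_block(void *ctx, byte *out, const byte *in);
+ */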
+
+/***********************************************************************
+ * 2-way blowfish
+ ***********************************************************************/
+#define F2(n, l0, r0, l1, r1, set_nextk, dec) \
+ \
+ and RT0, RMASK, l0, lsr#(24 - 2); \
+ and RT1, RMASK, l0, lsr#(16 - 2); \
+ and RT2, RMASK, l0, lsr#(8 - 2); \
+ add RT1, #(s1 - s0); \
+ \
+ ldr RT0, [CTXs0, RT0]; \
+ and RT3, RMASK, l0, lsl#2; \
+ ldr RT1, [CTXs0, RT1]; \
+ add RT3, #(s3 - s2); \
+ ldr RT2, [CTXs2, RT2]; \
+ add RT0, RT1; \
+ ldr RT3, [CTXs2, RT3]; \
+ \
+ and RT1, RMASK, l1, lsr#(24 - 2); \
+ eor RT0, RT2; \
+ and RT2, RMASK, l1, lsr#(16 - 2); \
+ add RT0, RT3; \
+ add RT2, #(s1 - s0); \
+ and RT3, RMASK, l1, lsr#(8 - 2); \
+ eor r0, RT0; \
+ \
+ ldr RT1, [CTXs0, RT1]; \
+ and RT0, RMASK, l1, lsl#2; \
+ ldr RT2, [CTXs0, RT2]; \
+ add RT0, #(s3 - s2); \
+ ldr RT3, [CTXs2, RT3]; \
+ add RT1, RT2; \
+ ldr RT0, [CTXs2, RT0]; \
+ \
+ and RT2, RMASK, r0, lsr#(24 - 2); \
+ eor RT1, RT3; \
+ and RT3, RMASK, r0, lsr#(16 - 2); \
+ add RT1, RT0; \
+ add RT3, #(s1 - s0); \
+ and RT0, RMASK, r0, lsr#(8 - 2); \
+ eor r1, RT1; \
+ \
+ ldr RT2, [CTXs0, RT2]; \
+ and RT1, RMASK, r0, lsl#2; \
+ ldr RT3, [CTXs0, RT3]; \
+ add RT1, #(s3 - s2); \
+ ldr RT0, [CTXs2, RT0]; \
+ add RT2, RT3; \
+ ldr RT1, [CTXs2, RT1]; \
+ \
+ and RT3, RMASK, r1, lsr#(24 - 2); \
+ eor RT2, RT0; \
+ and RT0, RMASK, r1, lsr#(16 - 2); \
+ add RT2, RT1; \
+ add RT0, #(s1 - s0); \
+ and RT1, RMASK, r1, lsr#(8 - 2); \
+ eor l0, RT2; \
+ \
+ ldr RT3, [CTXs0, RT3]; \
+ and RT2, RMASK, r1, lsl#2; \
+ ldr RT0, [CTXs0, RT0]; \
+ add RT2, #(s3 - s2); \
+ ldr RT1, [CTXs2, RT1]; \
+ eor l1, RKEYL; \
+ ldr RT2, [CTXs2, RT2]; \
+ \
+ eor r0, RKEYR; \
+ add RT3, RT0; \
+ eor r1, RKEYR; \
+ eor RT3, RT1; \
+ eor l0, RKEYL; \
+ add RT3, RT2; \
+ set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \
+ eor l1, RT3; \
+ set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4)));
+
+#define load_n_add_roundkey_enc2(n) \
+ load_roundkey_enc(n); \
+ eor RL0, RKEYL; \
+ eor RR0, RKEYR; \
+ eor RL1, RKEYL; \
+ eor RR1, RKEYR; \
+ load_roundkey_enc((n) + 2);
+
+#define next_key(reg, offs) \
+ ldr reg, [CTXs2, #(offs)];
+
+#define dummy(x, y) /* do nothing */
+
+#define round_enc2(n, load_next_key) \
+ F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0);
+
+#define load_n_add_roundkey_dec2(n) \
+ load_roundkey_dec(n); \
+ eor RL0, RKEYL; \
+ eor RR0, RKEYR; \
+ eor RL1, RKEYL; \
+ eor RR1, RKEYR; \
+ load_roundkey_dec((n) - 2);
+
+#define round_dec2(n, load_next_key) \
+ F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1);
+
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
+ ldr l0, [rin, #(0)]; \
+ ldr r0, [rin, #(4)]; \
+ convert(l0, rtmp); \
+ ldr l1, [rin, #(8)]; \
+ convert(r0, rtmp); \
+ ldr r1, [rin, #(12)]; \
+ convert(l1, rtmp); \
+ convert(r1, rtmp);
+
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
+ convert(l0, rtmp); \
+ convert(r0, rtmp); \
+ convert(l1, rtmp); \
+ str l0, [rout, #(0)]; \
+ convert(r1, rtmp); \
+ str r0, [rout, #(4)]; \
+ str l1, [rout, #(8)]; \
+ str r1, [rout, #(12)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads allowed */
+ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
+
+ #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
+
+ #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
+
+ #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
+#else
+ /* need to handle unaligned reads by byte reads */
+ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_be(l0, rin, 0, rtmp0); \
+ ldr_unaligned_be(r0, rin, 4, rtmp0); \
+ ldr_unaligned_be(l1, rin, 8, rtmp0); \
+ ldr_unaligned_be(r1, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
+ 2:;
+
+ #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
+ 2:;
+
+ #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_host(l0, rin, 0, rtmp0); \
+ ldr_unaligned_host(r0, rin, 4, rtmp0); \
+ ldr_unaligned_host(l1, rin, 8, rtmp0); \
+ ldr_unaligned_host(r1, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
+ 2:;
+
+ #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
+ 2:;
+#endif
+
+.align 3
+.type _gcry_blowfish_arm_enc_blk2,%function;
+
+_gcry_blowfish_arm_enc_blk2:
+ /* input:
+ * preloaded: CTX
+ * [RL0, RR0], [RL1, RR1]: src
+ * output:
+ * [RR0, RL0], [RR1, RL1]: dst
+ */
+ push {RT0,%lr};
+
+ add CTXs2, CTXs0, #(s2 - s0);
+ mov RMASK, #(0xff << 2); /* byte mask */
+
+ load_n_add_roundkey_enc2(0);
+ round_enc2(2, next_key);
+ round_enc2(4, next_key);
+ round_enc2(6, next_key);
+ round_enc2(8, next_key);
+ round_enc2(10, next_key);
+ round_enc2(12, next_key);
+ round_enc2(14, next_key);
+ round_enc2(16, dummy);
+
+ host_to_be(RR0, RT0);
+ host_to_be(RL0, RT0);
+ host_to_be(RR1, RT0);
+ host_to_be(RL1, RT0);
+
+ pop {RT0,%pc};
+.size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2;
+
+.align 3
+.globl _gcry_blowfish_arm_cfb_dec;
+.type _gcry_blowfish_arm_cfb_dec,%function;
+
+_gcry_blowfish_arm_cfb_dec:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit)
+ */
+ push {%r2, %r4-%r11, %ip, %lr};
+
+ mov %lr, %r3;
+
+ /* Load input (iv/%r3 is aligned, src/%r2 might not be) */
+ ldm %r3, {RL0, RR0};
+ host_to_be(RL0, RT0);
+ host_to_be(RR0, RT0);
+ read_block(%r2, 0, RL1, RR1, RT0);
+
+ /* Update IV, load src[1] and save to iv[0] */
+ read_block_host(%r2, 8, %r5, %r6, RT0);
+ stm %lr, {%r5, %r6};
+
+ bl _gcry_blowfish_arm_enc_blk2;
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+ /* %r1: dst, %r0: src */
+ pop {%r0};
+
+ /* dst = src ^ result */
+ read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
+ eor %r5, %r4;
+ eor %r6, %r3;
+ eor %r7, %r10;
+ eor %r8, %r9;
+ write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec;
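+
+/* CFB decryption of the two blocks above, as a C sketch (64-bit blocks):
+ *
+ *   dst[0] = src[0] ^ encrypt(iv);
+ *   dst[1] = src[1] ^ encrypt(src[0]);
+ *   iv     = src[1];                  .. becomes the IV for the next call
+ */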
+
+.align 3
+.globl _gcry_blowfish_arm_ctr_enc;
+.type _gcry_blowfish_arm_ctr_enc,%function;
+
+_gcry_blowfish_arm_ctr_enc:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit, big-endian)
+ */
+ push {%r2, %r4-%r11, %ip, %lr};
+
+ mov %lr, %r3;
+
+ /* Load IV (big => host endian) */
+ read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0);
+
+ /* Construct IVs */
+ adds RR1, RR0, #1; /* +1 */
+ adc RL1, RL0, #0;
+ adds %r6, RR1, #1; /* +2 */
+ adc %r5, RL1, #0;
+
+ /* Store new IV (host => big-endian) */
+ write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0);
+
+ bl _gcry_blowfish_arm_enc_blk2;
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+ /* %r1: dst, %r0: src */
+ pop {%r0};
+
+ /* XOR key-stream with plaintext */
+ read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
+ eor %r5, %r4;
+ eor %r6, %r3;
+ eor %r7, %r10;
+ eor %r8, %r9;
+ write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc;
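+
+/* CTR processing of the two blocks above, as a C sketch (ctr is the 64-bit
+ * big-endian counter kept in iv):
+ *
+ *   dst[0] = src[0] ^ encrypt(ctr);
+ *   dst[1] = src[1] ^ encrypt(ctr + 1);
+ *   ctr   += 2;                       .. written back to iv
+ */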
+
+.align 3
+.type _gcry_blowfish_arm_dec_blk2,%function;
+
+_gcry_blowfish_arm_dec_blk2:
+ /* input:
+ * preloaded: CTX
+ * [RL0, RR0], [RL1, RR1]: src
+ * output:
+ * [RR0, RL0], [RR1, RL1]: dst
+ */
+ add CTXs2, CTXs0, #(s2 - s0);
+ mov RMASK, #(0xff << 2); /* byte mask */
+
+ load_n_add_roundkey_dec2(17);
+ round_dec2(15, next_key);
+ round_dec2(13, next_key);
+ round_dec2(11, next_key);
+ round_dec2(9, next_key);
+ round_dec2(7, next_key);
+ round_dec2(5, next_key);
+ round_dec2(3, next_key);
+ round_dec2(1, dummy);
+
+ host_to_be(RR0, RT0);
+ host_to_be(RL0, RT0);
+ host_to_be(RR1, RT0);
+ host_to_be(RL1, RT0);
+
+ b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_blowfish_arm_dec_blk2,.-_gcry_blowfish_arm_dec_blk2;
+
+.align 3
+.globl _gcry_blowfish_arm_cbc_dec;
+.type _gcry_blowfish_arm_cbc_dec,%function;
+
+_gcry_blowfish_arm_cbc_dec:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit)
+ */
+ push {%r2-%r11, %ip, %lr};
+
+ read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+ /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+ * of function call. */
+ b _gcry_blowfish_arm_dec_blk2;
+.Ldec_cbc_tail:
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+ /* %r0: src, %r1: dst, %r2: iv */
+ pop {%r0, %r2};
+
+ /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+ read_block_host(%r0, 0, %r7, %r8, %r5);
+ /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+ ldm %r2, {%r5, %r6};
+
+ /* out[1] ^= IV+1 */
+ eor %r10, %r7;
+ eor %r9, %r8;
+ /* out[0] ^= IV */
+ eor %r4, %r5;
+ eor %r3, %r6;
+
+ /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+ read_block_host(%r0, 8, %r7, %r8, %r5);
+ /* store IV+2 to iv[0] (aligned). */
+ stm %r2, {%r7, %r8};
+
+ /* store result to dst[0-3]. Might be unaligned. */
+ write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec;
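+
+/* CBC decryption of the two blocks above, as a C sketch:
+ *
+ *   dst[0] = decrypt(src[0]) ^ iv;
+ *   dst[1] = decrypt(src[1]) ^ src[0];
+ *   iv     = src[1];                  .. becomes the IV for the next call
+ */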
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
--- cipher/serpent-armv7-neon.S
+++ cipher/serpent-armv7-neon.S
@@ -0,0 +1,869 @@
+/* serpent-armv7-neon.S - ARM/NEON assembly implementation of Serpent cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+/* ARM registers */
+#define RROUND r0
+
+/* NEON vector registers */
+#define RA0 q0
+#define RA1 q1
+#define RA2 q2
+#define RA3 q3
+#define RA4 q4
+#define RB0 q5
+#define RB1 q6
+#define RB2 q7
+#define RB3 q8
+#define RB4 q9
+
+#define RT0 q10
+#define RT1 q11
+#define RT2 q12
+#define RT3 q13
+
+#define RA0d0 d0
+#define RA0d1 d1
+#define RA1d0 d2
+#define RA1d1 d3
+#define RA2d0 d4
+#define RA2d1 d5
+#define RA3d0 d6
+#define RA3d1 d7
+#define RA4d0 d8
+#define RA4d1 d9
+#define RB0d0 d10
+#define RB0d1 d11
+#define RB1d0 d12
+#define RB1d1 d13
+#define RB2d0 d14
+#define RB2d1 d15
+#define RB3d0 d16
+#define RB3d1 d17
+#define RB4d0 d18
+#define RB4d1 d19
+#define RT0d0 d20
+#define RT0d1 d21
+#define RT1d0 d22
+#define RT1d1 d23
+#define RT2d0 d24
+#define RT2d1 d25
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+#define transpose_4x4(_q0, _q1, _q2, _q3) \
+ vtrn.32 _q0, _q1; \
+ vtrn.32 _q2, _q3; \
+ vswp _q0##d1, _q2##d0; \
+ vswp _q1##d1, _q3##d0;
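+
+/* transpose_4x4 is a 4x4 transpose of 32-bit words, with the four q
+ * registers as rows: on entry each register holds one complete 128-bit
+ * block (or counter); on exit register n holds word n of all four blocks,
+ * the word-sliced layout the S-box and LT macros below operate on.
+ * C sketch:
+ *
+ *   for (i = 0; i < 4; i++)
+ *       for (j = i + 1; j < 4; j++)
+ *           swap(m[i][j], m[j][i]);
+ */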
+
+/**********************************************************************
+ 8-way serpent
+ **********************************************************************/
+
+/*
+ * These are the S-Boxes of Serpent from the following research paper.
+ *
+ * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ * (New York, New York, USA), p. 317–329, National Institute of Standards and
+ * Technology, 2000.
+ *
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ */
+#define SBOX0(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a3, a3, a0; veor b3, b3, b0; vmov a4, a1; vmov b4, b1; \
+ vand a1, a1, a3; vand b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a0, a0, a4; veor b0, b0, b4; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
+ veor a2, a2, a4; veor b2, b2, b4; vmvn a4, a4; vmvn b4, b4; \
+ vorr a4, a4, a1; vorr b4, b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a1, a1, a4; veor b1, b1, b4; vorr a3, a3, a0; vorr b3, b3, b0; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a3; veor b4, b3;
+
+#define SBOX0_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a2, a2; vmvn b2, b2; vmov a4, a1; vmov b4, b1; \
+ vorr a1, a1, a0; vorr b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
+ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a0, a0, a4; veor b0, b0, b4; \
+ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a3; vand b0, b0, b3; \
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a2, a2, a1; veor b2, b2, b1; veor a3, a3, a0; veor b3, b3, b0; \
+ veor a3, a3, a1; veor b3, b3, b1;\
+ vand a2, a2, a3; vand b2, b2, b3;\
+ veor a4, a2; veor b4, b2;
+
+#define SBOX1(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a0, a0; vmvn b0, b0; vmvn a2, a2; vmvn b2, b2; \
+ vmov a4, a0; vmov b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a2, a2, a0; veor b2, b2, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a3, a3, a2; veor b3, b3, b2; veor a1, a1, a0; veor b1, b1, b0; \
+ veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a1; vorr b4, b4, b1; \
+ veor a1, a1, a3; veor b1, b1, b3; vorr a2, a2, a0; vorr b2, b2, b0; \
+ vand a2, a2, a4; vand b2, b2, b4; veor a0, a0, a1; veor b0, b0, b1; \
+ vand a1, a1, a2; vand b1, b1, b2;\
+ veor a1, a1, a0; veor b1, b1, b0; vand a0, a0, a2; vand b0, b0, b2; \
+ veor a0, a4; veor b0, b4;
+
+#define SBOX1_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a1; vmov b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
+ vand a3, a3, a1; vand b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a3, a3, a0; veor b3, b3, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a4; veor b0, b0, b4; \
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a0, a0, a1; veor b0, b0, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
+ veor a1, a1, a0; veor b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
+ veor a4, a4, a1; veor b4, b4, b1; vorr a1, a1, a0; vorr b1, b1, b0; \
+ veor a1, a1, a0; veor b1, b1, b0;\
+ vorr a1, a1, a4; vorr b1, b1, b4;\
+ veor a3, a1; veor b3, b1;
+
+#define SBOX2(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a0; vmov b4, b0; vand a0, a0, a2; vand b0, b0, b2; \
+ veor a0, a0, a3; veor b0, b0, b3; veor a2, a2, a1; veor b2, b2, b1; \
+ veor a2, a2, a0; veor b2, b2, b0; vorr a3, a3, a4; vorr b3, b3, b4; \
+ veor a3, a3, a1; veor b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \
+ vmov a1, a3; vmov b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \
+ veor a3, a3, a0; veor b3, b3, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a4, a4, a0; veor b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a1, a1, a4; veor b1, b1, b4; vmvn a4, a4; vmvn b4, b4;
+
+#define SBOX2_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
+ vmov a4, a3; vmov b4, b3; vand a3, a3, a2; vand b3, b3, b2; \
+ veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a2; vorr b1, b1, b2; \
+ veor a1, a1, a4; veor b1, b1, b4; vand a4, a4, a3; vand b4, b4, b3; \
+ veor a2, a2, a3; veor b2, b2, b3; vand a4, a4, a0; vand b4, b4, b0; \
+ veor a4, a4, a2; veor b4, b4, b2; vand a2, a2, a1; vand b2, b2, b1; \
+ vorr a2, a2, a0; vorr b2, b2, b0; vmvn a3, a3; vmvn b3, b3; \
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \
+ vand a0, a0, a1; vand b0, b0, b1; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a3, a0; veor b3, b0;
+
+#define SBOX3(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a0; vmov b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a3, a3, a1; veor b3, b3, b1; vand a1, a1, a4; vand b1, b1, b4; \
+ veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a3; veor b2, b2, b3; \
+ vand a3, a3, a0; vand b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \
+ veor a3, a3, a4; veor b3, b3, b4; veor a0, a0, a1; veor b0, b0, b1; \
+ vand a4, a4, a0; vand b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a4, a4, a2; veor b4, b4, b2; vorr a1, a1, a0; vorr b1, b1, b0; \
+ veor a1, a1, a2; veor b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \
+ vmov a2, a1; vmov b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
+ veor a1, a0; veor b1, b0;
+
+#define SBOX3_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a2; vmov b4, b2; veor a2, a2, a1; veor b2, b2, b1; \
+ veor a0, a0, a2; veor b0, b0, b2; vand a4, a4, a2; vand b4, b4, b2; \
+ veor a4, a4, a0; veor b4, b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a1, a1, a3; veor b1, b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \
+ veor a1, a1, a4; veor b1, b1, b4; vand a3, a3, a2; vand b3, b3, b2; \
+ veor a3, a3, a1; veor b3, b3, b1; veor a1, a1, a0; veor b1, b1, b0; \
+ vorr a1, a1, a2; vorr b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \
+ veor a1, a1, a4; veor b1, b1, b4;\
+ veor a0, a1; veor b0, b1;
+
+#define SBOX4(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a1, a1, a3; veor b1, b1, b3; vmvn a3, a3; vmvn b3, b3; \
+ veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
+ vmov a4, a1; vmov b4, b1; vand a1, a1, a3; vand b1, b1, b3; \
+ veor a1, a1, a2; veor b1, b1, b2; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a0, a0, a4; veor b0, b0, b4; vand a2, a2, a4; vand b2, b2, b4; \
+ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a3, a3, a0; veor b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a0, a0, a2; veor b0, b0, b2; vand a2, a2, a3; vand b2, b2, b3; \
+ vmvn a0, a0; vmvn b0, b0; veor a4, a2; veor b4, b2;
+
+#define SBOX4_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a2; vmov b4, b2; vand a2, a2, a3; vand b2, b2, b3; \
+ veor a2, a2, a1; veor b2, b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
+ vand a1, a1, a0; vand b1, b1, b0; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \
+ vmvn a0, a0; vmvn b0, b0; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a1, a1, a3; veor b1, b1, b3; vand a3, a3, a0; vand b3, b3, b0; \
+ veor a3, a3, a2; veor b3, b3, b2; veor a0, a0, a1; veor b0, b0, b1; \
+ vand a2, a2, a0; vand b2, b2, b0; veor a3, a3, a0; veor b3, b3, b0; \
+ veor a2, a2, a4; veor b2, b2, b4;\
+ vorr a2, a2, a3; vorr b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
+ veor a2, a1; veor b2, b1;
+
+#define SBOX5(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a0, a0, a1; veor b0, b0, b1; veor a1, a1, a3; veor b1, b1, b3; \
+ vmvn a3, a3; vmvn b3, b3; vmov a4, a1; vmov b4, b1; \
+ vand a1, a1, a0; vand b1, b1, b0; veor a2, a2, a3; veor b2, b2, b3; \
+ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
+ veor a4, a4, a3; veor b4, b4, b3; vand a3, a3, a1; vand b3, b3, b1; \
+ veor a3, a3, a0; veor b3, b3, b0; veor a4, a4, a1; veor b4, b4, b1; \
+ veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a0; veor b2, b2, b0; \
+ vand a0, a0, a3; vand b0, b0, b3; vmvn a2, a2; vmvn b2, b2; \
+ veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a3; vorr b4, b4, b3; \
+ veor a2, a4; veor b2, b4;
+
+#define SBOX5_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a1, a1; vmvn b1, b1; vmov a4, a3; vmov b4, b3; \
+ veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a0; vorr b3, b3, b0; \
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
+ vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a2, a2, a4; veor b2, b2, b4; vorr a4, a4, a0; vorr b4, b4, b0; \
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
+ vand a3, a3, a4; vand b3, b3, b4; veor a4, a4, a1; veor b4, b4, b1; \
+ veor a3, a3, a4; veor b3, b3, b4; vmvn a4, a4; vmvn b4, b4; \
+ veor a3, a0; veor b3, b0;
+
+#define SBOX6(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a2, a2; vmvn b2, b2; vmov a4, a3; vmov b4, b3; \
+ vand a3, a3, a0; vand b3, b3, b0; veor a0, a0, a4; veor b0, b0, b4; \
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a2, a2, a0; veor b2, b2, b0; \
+ vorr a0, a0, a1; vorr b0, b0, b1; veor a2, a2, a1; veor b2, b2, b1; \
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a4, a4, a0; veor b4, b4, b0; vmvn a3, a3; vmvn b3, b3; \
+ vand a2, a2, a4; vand b2, b2, b4;\
+ veor a2, a3; veor b2, b3;
+
+#define SBOX6_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a0, a0, a2; veor b0, b0, b2; vmov a4, a2; vmov b4, b2; \
+ vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \
+ vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \
+ veor a2, a2, a3; veor b2, b2, b3; vorr a4, a4, a0; vorr b4, b4, b0; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a3; vand b1, b1, b3; \
+ veor a1, a1, a0; veor b1, b1, b0; veor a0, a0, a3; veor b0, b0, b3; \
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a3, a3, a1; veor b3, b3, b1; \
+ veor a4, a0; veor b4, b0;
+
+#define SBOX7(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a1; vmov b4, b1; vorr a1, a1, a2; vorr b1, b1, b2; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a4; vorr b3, b3, b4; \
+ vand a3, a3, a0; vand b3, b3, b0; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a4; vorr b1, b1, b4; \
+ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a4; vorr b0, b0, b4; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a1, a1, a4; veor b1, b1, b4; \
+ veor a2, a2, a1; veor b2, b2, b1; vand a1, a1, a0; vand b1, b1, b0; \
+ veor a1, a1, a4; veor b1, b1, b4; vmvn a2, a2; vmvn b2, b2; \
+ vorr a2, a2, a0; vorr b2, b2, b0;\
+ veor a4, a2; veor b4, b2;
+
+#define SBOX7_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a2; vmov b4, b2; veor a2, a2, a0; veor b2, b2, b0; \
+ vand a0, a0, a3; vand b0, b0, b3; vorr a4, a4, a3; vorr b4, b4, b3; \
+ vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \
+ vorr a1, a1, a0; vorr b1, b1, b0; veor a0, a0, a2; veor b0, b0, b2; \
+ vand a2, a2, a4; vand b2, b2, b4; vand a3, a3, a4; vand b3, b3, b4; \
+ veor a1, a1, a2; veor b1, b1, b2; veor a2, a2, a0; veor b2, b2, b0; \
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a4, a4, a1; veor b4, b4, b1; \
+ veor a0, a0, a3; veor b0, b0, b3; veor a3, a3, a4; veor b3, b3, b4; \
+ vorr a4, a4, a0; vorr b4, b4, b0; veor a3, a3, a2; veor b3, b3, b2; \
+ veor a4, a2; veor b4, b2;
+
+/* Apply SBOX number WHICH to the block. */
+#define SBOX(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ SBOX##which (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4)
+
+/* Apply inverse SBOX number WHICH to the block. */
+#define SBOX_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ SBOX##which##_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4)
+
+/* XOR the current round key (held in RT0) into the block state in a0..a3
+ * and b0..b3; RT0..RT3 are used as temporaries. */
+#define BLOCK_XOR_KEY(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vdup.32 RT3, RT0d0[0]; \
+ vdup.32 RT1, RT0d0[1]; \
+ vdup.32 RT2, RT0d1[0]; \
+ vdup.32 RT0, RT0d1[1]; \
+ veor a0, a0, RT3; veor b0, b0, RT3; \
+ veor a1, a1, RT1; veor b1, b1, RT1; \
+ veor a2, a2, RT2; veor b2, b2, RT2; \
+ veor a3, a3, RT0; veor b3, b3, RT0;
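+
+/* In C terms the macro above does, for all eight blocks (a sketch):
+ *
+ *   for (b = 0; b < 8; b++)
+ *       for (j = 0; j < 4; j++)
+ *           x[b][j] ^= rk[j];
+ */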
+
+#define BLOCK_LOAD_KEY_ENC() \
+ vld1.8 {RT0d0, RT0d1}, [RROUND]!;
+
+#define BLOCK_LOAD_KEY_DEC() \
+ vld1.8 {RT0d0, RT0d1}, [RROUND]; \
+ sub RROUND, RROUND, #16
+
+/* Apply the linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vshl.u32 a4, a0, #13; vshl.u32 b4, b0, #13; \
+ vshr.u32 a0, a0, #(32-13); vshr.u32 b0, b0, #(32-13); \
+ veor a0, a0, a4; veor b0, b0, b4; \
+ vshl.u32 a4, a2, #3; vshl.u32 b4, b2, #3; \
+ vshr.u32 a2, a2, #(32-3); vshr.u32 b2, b2, #(32-3); \
+ veor a2, a2, a4; veor b2, b2, b4; \
+ veor a1, a0, a1; veor b1, b0, b1; \
+ veor a1, a2, a1; veor b1, b2, b1; \
+ vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \
+ veor a3, a2, a3; veor b3, b2, b3; \
+ veor a3, a4, a3; veor b3, b4, b3; \
+ vshl.u32 a4, a1, #1; vshl.u32 b4, b1, #1; \
+ vshr.u32 a1, a1, #(32-1); vshr.u32 b1, b1, #(32-1); \
+ veor a1, a1, a4; veor b1, b1, b4; \
+ vshl.u32 a4, a3, #7; vshl.u32 b4, b3, #7; \
+ vshr.u32 a3, a3, #(32-7); vshr.u32 b3, b3, #(32-7); \
+ veor a3, a3, a4; veor b3, b3, b4; \
+ veor a0, a1, a0; veor b0, b1, b0; \
+ veor a0, a3, a0; veor b0, b3, b0; \
+ vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \
+ veor a2, a3, a2; veor b2, b3, b2; \
+ veor a2, a4, a2; veor b2, b4, b2; \
+ vshl.u32 a4, a0, #5; vshl.u32 b4, b0, #5; \
+ vshr.u32 a0, a0, #(32-5); vshr.u32 b0, b0, #(32-5); \
+ veor a0, a0, a4; veor b0, b0, b4; \
+ vshl.u32 a4, a2, #22; vshl.u32 b4, b2, #22; \
+ vshr.u32 a2, a2, #(32-22); vshr.u32 b2, b2, #(32-22); \
+ veor a2, a2, a4; veor b2, b2, b4;
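+
+/* This is the standard Serpent linear transformation applied to each
+ * block's four words (a C sketch; rol = 32-bit rotate left):
+ *
+ *   x0 = rol(x0, 13);  x2 = rol(x2, 3);
+ *   x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
+ *   x1 = rol(x1, 1);   x3 = rol(x3, 7);
+ *   x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
+ *   x0 = rol(x0, 5);   x2 = rol(x2, 22);
+ */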
+
+/* Apply the inverse linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vshr.u32 a4, a2, #22; vshr.u32 b4, b2, #22; \
+ vshl.u32 a2, a2, #(32-22); vshl.u32 b2, b2, #(32-22); \
+ veor a2, a2, a4; veor b2, b2, b4; \
+ vshr.u32 a4, a0, #5; vshr.u32 b4, b0, #5; \
+ vshl.u32 a0, a0, #(32-5); vshl.u32 b0, b0, #(32-5); \
+ veor a0, a0, a4; veor b0, b0, b4; \
+ vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \
+ veor a2, a3, a2; veor b2, b3, b2; \
+ veor a2, a4, a2; veor b2, b4, b2; \
+ veor a0, a1, a0; veor b0, b1, b0; \
+ veor a0, a3, a0; veor b0, b3, b0; \
+ vshr.u32 a4, a3, #7; vshr.u32 b4, b3, #7; \
+ vshl.u32 a3, a3, #(32-7); vshl.u32 b3, b3, #(32-7); \
+ veor a3, a3, a4; veor b3, b3, b4; \
+ vshr.u32 a4, a1, #1; vshr.u32 b4, b1, #1; \
+ vshl.u32 a1, a1, #(32-1); vshl.u32 b1, b1, #(32-1); \
+ veor a1, a1, a4; veor b1, b1, b4; \
+ vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \
+ veor a3, a2, a3; veor b3, b2, b3; \
+ veor a3, a4, a3; veor b3, b4, b3; \
+ veor a1, a0, a1; veor b1, b0, b1; \
+ veor a1, a2, a1; veor b1, b2, b1; \
+ vshr.u32 a4, a2, #3; vshr.u32 b4, b2, #3; \
+ vshl.u32 a2, a2, #(32-3); vshl.u32 b2, b2, #(32-3); \
+ veor a2, a2, a4; veor b2, b2, b4; \
+ vshr.u32 a4, a0, #13; vshr.u32 b4, b0, #13; \
+ vshl.u32 a0, a0, #(32-13); vshl.u32 b0, b0, #(32-13); \
+ veor a0, a0, a4; veor b0, b0, b4;
+
+/* Apply a Serpent round to eight parallel blocks. */
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_LOAD_KEY_ENC (); \
+ SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4);
+
+/* Apply the last Serpent round to eight parallel blocks. */
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_LOAD_KEY_ENC (); \
+ SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4);
+
+/* Apply an inverse Serpent round to eight parallel blocks. */
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \
+ BLOCK_LOAD_KEY_DEC ();
+
+/* Apply the first inverse Serpent round to eight parallel blocks. */
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_LOAD_KEY_DEC (); \
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \
+ BLOCK_LOAD_KEY_DEC ();
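+
+/* Taken together, the ROUND* macros implement the usual Serpent schedule
+ * per block (a C sketch; K[] holds the 33 four-word round keys):
+ *
+ *   for (i = 0; i < 31; i++)
+ *       x = LT(S[i % 8](x ^ K[i]));
+ *   x = S[7](x ^ K[31]) ^ K[32];
+ *
+ * The *_INVERSE macros run the same schedule backwards for decryption.
+ */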
+
+.align 3
+.type __serpent_enc_blk8,%function;
+__serpent_enc_blk8:
+ /* input:
+ * r0: round key pointer
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+ * blocks
+ * output:
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
+ * ciphertext blocks
+ */
+
+ transpose_4x4(RA0, RA1, RA2, RA3);
+ BLOCK_LOAD_KEY_ENC ();
+ transpose_4x4(RB0, RB1, RB2, RB3);
+
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0);
+ transpose_4x4(RB4, RB1, RB2, RB0);
+
+ bx lr;
+.size __serpent_enc_blk8,.-__serpent_enc_blk8;
+
+.align 3
+.type __serpent_dec_blk8,%function;
+__serpent_dec_blk8:
+ /* input:
+ * r0: round key pointer
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * ciphertext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+ * blocks
+ */
+
+ add RROUND, RROUND, #(32*16);
+
+ transpose_4x4(RA0, RA1, RA2, RA3);
+ BLOCK_LOAD_KEY_DEC ();
+ transpose_4x4(RB0, RB1, RB2, RB3);
+
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
+
+ transpose_4x4(RA0, RA1, RA2, RA3);
+ transpose_4x4(RB0, RB1, RB2, RB3);
+
+ bx lr;
+.size __serpent_dec_blk8,.-__serpent_dec_blk8;
+
+.align 3
+.globl _gcry_serpent_neon_ctr_enc
+.type _gcry_serpent_neon_ctr_enc,%function;
+_gcry_serpent_neon_ctr_enc:
+ /* input:
+ * r0: ctx, CTX
+ * r1: dst (8 blocks)
+ * r2: src (8 blocks)
+ * r3: iv
+ */
+
+ vmov.u8 RT1d0, #0xff; /* u64: -1 */
+ push {r4,lr};
+ vadd.u64 RT2d0, RT1d0, RT1d0; /* u64: -2 */
+ vpush {RA4-RB2};
+
+ /* load IV and byteswap */
+ vld1.8 {RA0}, [r3];
+ vrev64.u8 RT0, RA0; /* be => le */
+ ldr r4, [r3, #8];
+
+ /* construct IVs */
+ vsub.u64 RA2d1, RT0d1, RT2d0; /* +2 */
+ vsub.u64 RA1d1, RT0d1, RT1d0; /* +1 */
+ cmp r4, #-1;
+
+ vsub.u64 RB0d1, RA2d1, RT2d0; /* +4 */
+ vsub.u64 RA3d1, RA2d1, RT1d0; /* +3 */
+ ldr r4, [r3, #12];
+
+ vsub.u64 RB2d1, RB0d1, RT2d0; /* +6 */
+ vsub.u64 RB1d1, RB0d1, RT1d0; /* +5 */
+
+ vsub.u64 RT2d1, RB2d1, RT2d0; /* +8 */
+ vsub.u64 RB3d1, RB2d1, RT1d0; /* +7 */
+
+ vmov RA1d0, RT0d0;
+ vmov RA2d0, RT0d0;
+ vmov RA3d0, RT0d0;
+ vmov RB0d0, RT0d0;
+ rev r4, r4;
+ vmov RB1d0, RT0d0;
+ vmov RB2d0, RT0d0;
+ vmov RB3d0, RT0d0;
+ vmov RT2d0, RT0d0;
+
+ /* check need for handling 64-bit overflow and carry */
+ beq .Ldo_ctr_carry;
+
+.Lctr_carry_done:
+ /* le => be */
+ vrev64.u8 RA1, RA1;
+ vrev64.u8 RA2, RA2;
+ vrev64.u8 RA3, RA3;
+ vrev64.u8 RB0, RB0;
+ vrev64.u8 RT2, RT2;
+ vrev64.u8 RB1, RB1;
+ vrev64.u8 RB2, RB2;
+ vrev64.u8 RB3, RB3;
+ /* store new IV */
+ vst1.8 {RT2}, [r3];
+
+ bl __serpent_enc_blk8;
+
+ vld1.8 {RT0, RT1}, [r2]!;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RA4, RA4, RT0;
+ veor RA1, RA1, RT1;
+ vld1.8 {RT0, RT1}, [r2]!;
+ veor RA2, RA2, RT2;
+ veor RA0, RA0, RT3;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RB4, RB4, RT0;
+ veor RT0, RT0;
+ veor RB1, RB1, RT1;
+ veor RT1, RT1;
+ veor RB2, RB2, RT2;
+ veor RT2, RT2;
+ veor RB0, RB0, RT3;
+ veor RT3, RT3;
+
+ vst1.8 {RA4}, [r1]!;
+ vst1.8 {RA1}, [r1]!;
+ veor RA1, RA1;
+ vst1.8 {RA2}, [r1]!;
+ veor RA2, RA2;
+ vst1.8 {RA0}, [r1]!;
+ veor RA0, RA0;
+ vst1.8 {RB4}, [r1]!;
+ veor RB4, RB4;
+ vst1.8 {RB1}, [r1]!;
+ vst1.8 {RB2}, [r1]!;
+ vst1.8 {RB0}, [r1]!;
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RA3, RA3;
+ veor RB3, RB3;
+
+ pop {r4,pc};
+
+.Ldo_ctr_carry:
+ cmp r4, #-8;
+ blo .Lctr_carry_done;
+ beq .Lcarry_RT2;
+
+ cmp r4, #-6;
+ blo .Lcarry_RB3;
+ beq .Lcarry_RB2;
+
+ cmp r4, #-4;
+ blo .Lcarry_RB1;
+ beq .Lcarry_RB0;
+
+ cmp r4, #-2;
+ blo .Lcarry_RA3;
+ beq .Lcarry_RA2;
+
+ vsub.u64 RA1d0, RT1d0;
+.Lcarry_RA2:
+ vsub.u64 RA2d0, RT1d0;
+.Lcarry_RA3:
+ vsub.u64 RA3d0, RT1d0;
+.Lcarry_RB0:
+ vsub.u64 RB0d0, RT1d0;
+.Lcarry_RB1:
+ vsub.u64 RB1d0, RT1d0;
+.Lcarry_RB2:
+ vsub.u64 RB2d0, RT1d0;
+.Lcarry_RB3:
+ vsub.u64 RB3d0, RT1d0;
+.Lcarry_RT2:
+ vsub.u64 RT2d0, RT1d0;
+
+ b .Lctr_carry_done;
+.size _gcry_serpent_neon_ctr_enc,.-_gcry_serpent_neon_ctr_enc;
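+
+/* Counter construction in the function above, conceptually (a C sketch;
+ * the counter is a 128-bit big-endian value kept in iv and consumed
+ * eight blocks at a time):
+ *
+ *   for (i = 0; i < 8; i++) {
+ *       keyblock[i] = ctr;            .. encrypted and XORed with src[i]
+ *       if (++ctr.lo == 0)            .. low 64 bits wrapped around,
+ *           ctr.hi++;                 .. propagate carry into high 64 bits
+ *   }
+ *
+ * The .Ldo_ctr_carry path handles the rare low-64-bit wrap; the fast path
+ * only adds 1..8 to the low half.
+ */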
+
+.align 3
+.globl _gcry_serpent_neon_cfb_dec
+.type _gcry_serpent_neon_cfb_dec,%function;
+_gcry_serpent_neon_cfb_dec:
+ /* input:
+ * r0: ctx, CTX
+ * r1: dst (8 blocks)
+ * r2: src (8 blocks)
+ * r3: iv
+ */
+
+ push {lr};
+ vpush {RA4-RB2};
+
+ /* Load input */
+ vld1.8 {RA0}, [r3];
+ vld1.8 {RA1, RA2}, [r2]!;
+ vld1.8 {RA3}, [r2]!;
+ vld1.8 {RB0}, [r2]!;
+ vld1.8 {RB1, RB2}, [r2]!;
+ vld1.8 {RB3}, [r2]!;
+
+ /* Update IV */
+ vld1.8 {RT0}, [r2]!;
+ vst1.8 {RT0}, [r3];
+ mov r3, lr;
+ sub r2, r2, #(8*16);
+
+ bl __serpent_enc_blk8;
+
+ vld1.8 {RT0, RT1}, [r2]!;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RA4, RA4, RT0;
+ veor RA1, RA1, RT1;
+ vld1.8 {RT0, RT1}, [r2]!;
+ veor RA2, RA2, RT2;
+ veor RA0, RA0, RT3;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RB4, RB4, RT0;
+ veor RT0, RT0;
+ veor RB1, RB1, RT1;
+ veor RT1, RT1;
+ veor RB2, RB2, RT2;
+ veor RT2, RT2;
+ veor RB0, RB0, RT3;
+ veor RT3, RT3;
+
+ vst1.8 {RA4}, [r1]!;
+ vst1.8 {RA1}, [r1]!;
+ veor RA1, RA1;
+ vst1.8 {RA2}, [r1]!;
+ veor RA2, RA2;
+ vst1.8 {RA0}, [r1]!;
+ veor RA0, RA0;
+ vst1.8 {RB4}, [r1]!;
+ veor RB4, RB4;
+ vst1.8 {RB1}, [r1]!;
+ vst1.8 {RB2}, [r1]!;
+ vst1.8 {RB0}, [r1]!;
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RA3, RA3;
+ veor RB3, RB3;
+
+ pop {pc};
+.size _gcry_serpent_neon_cfb_dec,.-_gcry_serpent_neon_cfb_dec;
+
+.align 3
+.globl _gcry_serpent_neon_cbc_dec
+.type _gcry_serpent_neon_cbc_dec,%function;
+_gcry_serpent_neon_cbc_dec:
+ /* input:
+ * r0: ctx, CTX
+ * r1: dst (8 blocks)
+ * r2: src (8 blocks)
+ * r3: iv
+ */
+
+ push {lr};
+ vpush {RA4-RB2};
+
+ vld1.8 {RA0, RA1}, [r2]!;
+ vld1.8 {RA2, RA3}, [r2]!;
+ vld1.8 {RB0, RB1}, [r2]!;
+ vld1.8 {RB2, RB3}, [r2]!;
+ sub r2, r2, #(8*16);
+
+ bl __serpent_dec_blk8;
+
+ vld1.8 {RB4}, [r3];
+ vld1.8 {RT0, RT1}, [r2]!;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RA0, RA0, RB4;
+ veor RA1, RA1, RT0;
+ veor RA2, RA2, RT1;
+ vld1.8 {RT0, RT1}, [r2]!;
+ veor RA3, RA3, RT2;
+ veor RB0, RB0, RT3;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RB1, RB1, RT0;
+ veor RT0, RT0;
+ veor RB2, RB2, RT1;
+ veor RT1, RT1;
+ veor RB3, RB3, RT2;
+ veor RT2, RT2;
+ vst1.8 {RT3}, [r3]; /* store new IV */
+ veor RT3, RT3;
+
+ vst1.8 {RA0, RA1}, [r1]!;
+ veor RA0, RA0;
+ veor RA1, RA1;
+ vst1.8 {RA2, RA3}, [r1]!;
+ veor RA2, RA2;
+ vst1.8 {RB0, RB1}, [r1]!;
+ veor RA3, RA3;
+ vst1.8 {RB2, RB3}, [r1]!;
+ veor RB3, RB3;
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RB4, RB4;
+
+ pop {pc};
+.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec;
+
+#endif