--- cipher/blowfish-arm.S
+++ cipher/blowfish-arm.S
@@ -0,0 +1,743 @@
+/* blowfish-arm.S - ARM assembly implementation of Blowfish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of crypto context */
+#define s0 0
+#define s1 (s0 + (1 * 256) * 4)
+#define s2 (s0 + (2 * 256) * 4)
+#define s3 (s0 + (3 * 256) * 4)
+#define p (s3 + (1 * 256) * 4)
+
+/* register macros */
+#define CTXs0 %r0
+#define CTXs1 %r9
+#define CTXs2 %r8
+#define CTXs3 %r10
+#define RMASK %lr
+#define RKEYL %r2
+#define RKEYR %ip
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %r11
+#define RT1 %r7
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 0)]; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 3)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 0)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 1)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 2)]; \
+ strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 3)]; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 0)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 3)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 2)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 1)]; \
+ strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+ #define ldr_unaligned_host ldr_unaligned_le
+ #define str_unaligned_host str_unaligned_le
+
+ /* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+ #define host_to_be(reg, rtmp) \
+ rev reg, reg;
+ #define be_to_host(reg, rtmp) \
+ rev reg, reg;
+#else
+ #define host_to_be(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+ #define be_to_host(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+#endif
+#else
+ #define ldr_unaligned_host ldr_unaligned_be
+ #define str_unaligned_host str_unaligned_be
+
+ /* nop on big-endian */
+ #define host_to_be(reg, rtmp) /*_*/
+ #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
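
Note on the S-box indexing used below: the 1-way code that follows masks each byte of the left half with RMASK = 0xff << 2, i.e. the byte value is extracted already multiplied by four so it can be used directly as a byte offset into a table of 32-bit words. As a plain-C reference for what the F() and round_enc() macros compute (an illustrative sketch only, not part of the patch; the s[4][256]/p[18] names merely stand in for the s0..s3 and p context offsets defined above):

#include <stdint.h>

/* Reference sketch of the Blowfish F function as implemented by F(l, r). */
static uint32_t blowfish_F (const uint32_t s[4][256], uint32_t x)
{
  uint32_t a = (x >> 24) & 0xff;  /* asm: and RTn, RMASK, l, lsr#(24 - 2) -- index pre-scaled by 4 */
  uint32_t b = (x >> 16) & 0xff;
  uint32_t c = (x >>  8) & 0xff;
  uint32_t d =  x        & 0xff;

  return ((s[0][a] + s[1][b]) ^ s[2][c]) + s[3][d];
}

/* One round_enc() pair: XOR in two subkeys, then apply F to each half. */
static void blowfish_round_pair (const uint32_t s[4][256], const uint32_t p[18],
                                 uint32_t *xl, uint32_t *xr, unsigned int n)
{
  *xl ^= p[n];                    /* add_roundkey_enc(): eor RL0, RKEYL */
  *xr ^= p[n + 1];                /*                     eor RR0, RKEYR */
  *xr ^= blowfish_F (s, *xl);     /* F(RL0, RR0) */
  *xl ^= blowfish_F (s, *xr);     /* F(RR0, RL0) */
}
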
+/*********************************************************************** + * 1-way blowfish + ***********************************************************************/ +#define F(l, r) \ + and RT0, RMASK, l, lsr#(24 - 2); \ + and RT1, RMASK, l, lsr#(16 - 2); \ + ldr RT0, [CTXs0, RT0]; \ + and RT2, RMASK, l, lsr#(8 - 2); \ + ldr RT1, [CTXs1, RT1]; \ + and RT3, RMASK, l, lsl#2; \ + ldr RT2, [CTXs2, RT2]; \ + add RT0, RT1; \ + ldr RT3, [CTXs3, RT3]; \ + eor RT0, RT2; \ + add RT0, RT3; \ + eor r, RT0; + +#define load_roundkey_enc(n) \ + ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \ + ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))]; + +#define add_roundkey_enc() \ + eor RL0, RKEYL; \ + eor RR0, RKEYR; + +#define round_enc(n) \ + add_roundkey_enc(); \ + load_roundkey_enc(n); \ + \ + F(RL0, RR0); \ + F(RR0, RL0); + +#define load_roundkey_dec(n) \ + ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \ + ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))]; + +#define add_roundkey_dec() \ + eor RL0, RKEYL; \ + eor RR0, RKEYR; + +#define round_dec(n) \ + add_roundkey_dec(); \ + load_roundkey_dec(n); \ + \ + F(RL0, RR0); \ + F(RR0, RL0); + +#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \ + ldr l0, [rin, #((offs) + 0)]; \ + ldr r0, [rin, #((offs) + 4)]; \ + convert(l0, rtmp); \ + convert(r0, rtmp); + +#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \ + convert(l0, rtmp); \ + convert(r0, rtmp); \ + str l0, [rout, #((offs) + 0)]; \ + str r0, [rout, #((offs) + 4)]; + +#ifdef __ARM_FEATURE_UNALIGNED + /* unaligned word reads allowed */ + #define read_block(rin, offs, l0, r0, rtmp0) \ + read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0) + + #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \ + write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0) + + #define read_block_host(rin, offs, l0, r0, rtmp0) \ + read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0) + + #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \ + write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0) +#else + /* need to handle unaligned reads by byte reads */ + #define read_block(rin, offs, l0, r0, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \ + ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \ + b 2f; \ + 1:;\ + read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \ + 2:; + + #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \ + str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \ + 2:; + + #define read_block_host(rin, offs, l0, r0, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \ + ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \ + b 2f; \ + 1:;\ + read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \ + 2:; + + #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \ + str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block_aligned(rout, offs, l0, r0, host_to_host); \ + 2:; +#endif + +.align 3 +.type __blowfish_enc_blk1,%function; + +__blowfish_enc_blk1: + /* input: + * preloaded: CTX + * [RL0, RR0]: src + * output: + * [RR0, RL0]: dst + */ + push {%lr}; + + add CTXs1, CTXs0, #(s1 - s0); + add CTXs2, CTXs0, #(s2 - s0); + mov RMASK, #(0xff << 2); /* byte mask */ + add CTXs3, 
CTXs1, #(s3 - s1); + + load_roundkey_enc(0); + round_enc(2); + round_enc(4); + round_enc(6); + round_enc(8); + round_enc(10); + round_enc(12); + round_enc(14); + round_enc(16); + add_roundkey_enc(); + + pop {%pc}; +.size __blowfish_enc_blk1,.-__blowfish_enc_blk1; + +.align 8 +.globl _gcry_blowfish_arm_do_encrypt +.type _gcry_blowfish_arm_do_encrypt,%function; + +_gcry_blowfish_arm_do_encrypt: + /* input: + * %r0: ctx, CTX + * %r1: u32 *ret_xl + * %r2: u32 *ret_xr + */ + push {%r2, %r4-%r11, %ip, %lr}; + + ldr RL0, [%r1]; + ldr RR0, [%r2]; + + bl __blowfish_enc_blk1; + + pop {%r2}; + str RR0, [%r1]; + str RL0, [%r2]; + + pop {%r4-%r11, %ip, %pc}; +.size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt; + +.align 3 +.globl _gcry_blowfish_arm_encrypt_block +.type _gcry_blowfish_arm_encrypt_block,%function; + +_gcry_blowfish_arm_encrypt_block: + /* input: + * %r0: ctx, CTX + * %r1: dst + * %r2: src + */ + push {%r4-%r11, %ip, %lr}; + + read_block(%r2, 0, RL0, RR0, RT0); + + bl __blowfish_enc_blk1; + + write_block(%r1, 0, RR0, RL0, RT0, RT1); + + pop {%r4-%r11, %ip, %pc}; +.size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block; + +.align 3 +.globl _gcry_blowfish_arm_decrypt_block +.type _gcry_blowfish_arm_decrypt_block,%function; + +_gcry_blowfish_arm_decrypt_block: + /* input: + * %r0: ctx, CTX + * %r1: dst + * %r2: src + */ + push {%r4-%r11, %ip, %lr}; + + add CTXs1, CTXs0, #(s1 - s0); + add CTXs2, CTXs0, #(s2 - s0); + mov RMASK, #(0xff << 2); /* byte mask */ + add CTXs3, CTXs1, #(s3 - s1); + + read_block(%r2, 0, RL0, RR0, RT0); + + load_roundkey_dec(17); + round_dec(15); + round_dec(13); + round_dec(11); + round_dec(9); + round_dec(7); + round_dec(5); + round_dec(3); + round_dec(1); + add_roundkey_dec(); + + write_block(%r1, 0, RR0, RL0, RT0, RT1); + + pop {%r4-%r11, %ip, %pc}; +.size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block; + +/*********************************************************************** + * 2-way blowfish + ***********************************************************************/ +#define F2(n, l0, r0, l1, r1, set_nextk, dec) \ + \ + and RT0, RMASK, l0, lsr#(24 - 2); \ + and RT1, RMASK, l0, lsr#(16 - 2); \ + and RT2, RMASK, l0, lsr#(8 - 2); \ + add RT1, #(s1 - s0); \ + \ + ldr RT0, [CTXs0, RT0]; \ + and RT3, RMASK, l0, lsl#2; \ + ldr RT1, [CTXs0, RT1]; \ + add RT3, #(s3 - s2); \ + ldr RT2, [CTXs2, RT2]; \ + add RT0, RT1; \ + ldr RT3, [CTXs2, RT3]; \ + \ + and RT1, RMASK, l1, lsr#(24 - 2); \ + eor RT0, RT2; \ + and RT2, RMASK, l1, lsr#(16 - 2); \ + add RT0, RT3; \ + add RT2, #(s1 - s0); \ + and RT3, RMASK, l1, lsr#(8 - 2); \ + eor r0, RT0; \ + \ + ldr RT1, [CTXs0, RT1]; \ + and RT0, RMASK, l1, lsl#2; \ + ldr RT2, [CTXs0, RT2]; \ + add RT0, #(s3 - s2); \ + ldr RT3, [CTXs2, RT3]; \ + add RT1, RT2; \ + ldr RT0, [CTXs2, RT0]; \ + \ + and RT2, RMASK, r0, lsr#(24 - 2); \ + eor RT1, RT3; \ + and RT3, RMASK, r0, lsr#(16 - 2); \ + add RT1, RT0; \ + add RT3, #(s1 - s0); \ + and RT0, RMASK, r0, lsr#(8 - 2); \ + eor r1, RT1; \ + \ + ldr RT2, [CTXs0, RT2]; \ + and RT1, RMASK, r0, lsl#2; \ + ldr RT3, [CTXs0, RT3]; \ + add RT1, #(s3 - s2); \ + ldr RT0, [CTXs2, RT0]; \ + add RT2, RT3; \ + ldr RT1, [CTXs2, RT1]; \ + \ + and RT3, RMASK, r1, lsr#(24 - 2); \ + eor RT2, RT0; \ + and RT0, RMASK, r1, lsr#(16 - 2); \ + add RT2, RT1; \ + add RT0, #(s1 - s0); \ + and RT1, RMASK, r1, lsr#(8 - 2); \ + eor l0, RT2; \ + \ + ldr RT3, [CTXs0, RT3]; \ + and RT2, RMASK, r1, lsl#2; \ + ldr RT0, [CTXs0, RT0]; \ + add RT2, #(s3 - s2); \ + ldr RT1, [CTXs2, 
RT1]; \ + eor l1, RKEYL; \ + ldr RT2, [CTXs2, RT2]; \ + \ + eor r0, RKEYR; \ + add RT3, RT0; \ + eor r1, RKEYR; \ + eor RT3, RT1; \ + eor l0, RKEYL; \ + add RT3, RT2; \ + set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \ + eor l1, RT3; \ + set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4))); + +#define load_n_add_roundkey_enc2(n) \ + load_roundkey_enc(n); \ + eor RL0, RKEYL; \ + eor RR0, RKEYR; \ + eor RL1, RKEYL; \ + eor RR1, RKEYR; \ + load_roundkey_enc((n) + 2); + +#define next_key(reg, offs) \ + ldr reg, [CTXs2, #(offs)]; + +#define dummy(x, y) /* do nothing */ + +#define round_enc2(n, load_next_key) \ + F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0); + +#define load_n_add_roundkey_dec2(n) \ + load_roundkey_dec(n); \ + eor RL0, RKEYL; \ + eor RR0, RKEYR; \ + eor RL1, RKEYL; \ + eor RR1, RKEYR; \ + load_roundkey_dec((n) - 2); + +#define round_dec2(n, load_next_key) \ + F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1); + +#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \ + ldr l0, [rin, #(0)]; \ + ldr r0, [rin, #(4)]; \ + convert(l0, rtmp); \ + ldr l1, [rin, #(8)]; \ + convert(r0, rtmp); \ + ldr r1, [rin, #(12)]; \ + convert(l1, rtmp); \ + convert(r1, rtmp); + +#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \ + convert(l0, rtmp); \ + convert(r0, rtmp); \ + convert(l1, rtmp); \ + str l0, [rout, #(0)]; \ + convert(r1, rtmp); \ + str r0, [rout, #(4)]; \ + str l1, [rout, #(8)]; \ + str r1, [rout, #(12)]; + +#ifdef __ARM_FEATURE_UNALIGNED + /* unaligned word reads allowed */ + #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0) + + #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0) + + #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0) + + #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0) +#else + /* need to handle unaligned reads by byte reads */ + #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_be(l0, rin, 0, rtmp0); \ + ldr_unaligned_be(r0, rin, 4, rtmp0); \ + ldr_unaligned_be(l1, rin, 8, rtmp0); \ + ldr_unaligned_be(r1, rin, 12, rtmp0); \ + b 2f; \ + 1:;\ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \ + 2:; + + #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \ + str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \ + str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \ + str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \ + 2:; + + #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_host(l0, rin, 0, rtmp0); \ + ldr_unaligned_host(r0, rin, 4, rtmp0); \ + ldr_unaligned_host(l1, rin, 8, rtmp0); \ + ldr_unaligned_host(r1, rin, 12, rtmp0); \ + b 2f; \ + 1:;\ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \ + 2:; + + #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \ + str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \ + str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \ + str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); 
\ + 2:; +#endif + +.align 3 +.type _gcry_blowfish_arm_enc_blk2,%function; + +_gcry_blowfish_arm_enc_blk2: + /* input: + * preloaded: CTX + * [RL0, RR0], [RL1, RR1]: src + * output: + * [RR0, RL0], [RR1, RL1]: dst + */ + push {RT0,%lr}; + + add CTXs2, CTXs0, #(s2 - s0); + mov RMASK, #(0xff << 2); /* byte mask */ + + load_n_add_roundkey_enc2(0); + round_enc2(2, next_key); + round_enc2(4, next_key); + round_enc2(6, next_key); + round_enc2(8, next_key); + round_enc2(10, next_key); + round_enc2(12, next_key); + round_enc2(14, next_key); + round_enc2(16, dummy); + + host_to_be(RR0, RT0); + host_to_be(RL0, RT0); + host_to_be(RR1, RT0); + host_to_be(RL1, RT0); + + pop {RT0,%pc}; +.size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2; + +.align 3 +.globl _gcry_blowfish_arm_cfb_dec; +.type _gcry_blowfish_arm_cfb_dec,%function; + +_gcry_blowfish_arm_cfb_dec: + /* input: + * %r0: CTX + * %r1: dst (2 blocks) + * %r2: src (2 blocks) + * %r3: iv (64bit) + */ + push {%r2, %r4-%r11, %ip, %lr}; + + mov %lr, %r3; + + /* Load input (iv/%r3 is aligned, src/%r2 might not be) */ + ldm %r3, {RL0, RR0}; + host_to_be(RL0, RT0); + host_to_be(RR0, RT0); + read_block(%r2, 0, RL1, RR1, RT0); + + /* Update IV, load src[1] and save to iv[0] */ + read_block_host(%r2, 8, %r5, %r6, RT0); + stm %lr, {%r5, %r6}; + + bl _gcry_blowfish_arm_enc_blk2; + /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + + /* %r1: dst, %r0: %src */ + pop {%r0}; + + /* dst = src ^ result */ + read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); + eor %r5, %r4; + eor %r6, %r3; + eor %r7, %r10; + eor %r8, %r9; + write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); + + pop {%r4-%r11, %ip, %pc}; +.ltorg +.size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec; + +.align 3 +.globl _gcry_blowfish_arm_ctr_enc; +.type _gcry_blowfish_arm_ctr_enc,%function; + +_gcry_blowfish_arm_ctr_enc: + /* input: + * %r0: CTX + * %r1: dst (2 blocks) + * %r2: src (2 blocks) + * %r3: iv (64bit, big-endian) + */ + push {%r2, %r4-%r11, %ip, %lr}; + + mov %lr, %r3; + + /* Load IV (big => host endian) */ + read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0); + + /* Construct IVs */ + adds RR1, RR0, #1; /* +1 */ + adc RL1, RL0, #0; + adds %r6, RR1, #1; /* +2 */ + adc %r5, RL1, #0; + + /* Store new IV (host => big-endian) */ + write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0); + + bl _gcry_blowfish_arm_enc_blk2; + /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + + /* %r1: dst, %r0: %src */ + pop {%r0}; + + /* XOR key-stream with plaintext */ + read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); + eor %r5, %r4; + eor %r6, %r3; + eor %r7, %r10; + eor %r8, %r9; + write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); + + pop {%r4-%r11, %ip, %pc}; +.ltorg +.size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc; + +.align 3 +.type _gcry_blowfish_arm_dec_blk2,%function; + +_gcry_blowfish_arm_dec_blk2: + /* input: + * preloaded: CTX + * [RL0, RR0], [RL1, RR1]: src + * output: + * [RR0, RL0], [RR1, RL1]: dst + */ + add CTXs2, CTXs0, #(s2 - s0); + mov RMASK, #(0xff << 2); /* byte mask */ + + load_n_add_roundkey_dec2(17); + round_dec2(15, next_key); + round_dec2(13, next_key); + round_dec2(11, next_key); + round_dec2(9, next_key); + round_dec2(7, next_key); + round_dec2(5, next_key); + round_dec2(3, next_key); + round_dec2(1, dummy); + + host_to_be(RR0, RT0); + host_to_be(RL0, RT0); + host_to_be(RR1, RT0); + host_to_be(RL1, RT0); + + b .Ldec_cbc_tail; +.ltorg +.size _gcry_blowfish_arm_dec_blk2,.-_gcry_blowfish_arm_dec_blk2; + +.align 3 +.globl 
_gcry_blowfish_arm_cbc_dec;
+.type _gcry_blowfish_arm_cbc_dec,%function;
+
+_gcry_blowfish_arm_cbc_dec:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit)
+ */
+ push {%r2-%r11, %ip, %lr};
+
+ read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+ /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+ * of function call. */
+ b _gcry_blowfish_arm_dec_blk2;
+.Ldec_cbc_tail:
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+ /* %r0: %src, %r1: dst, %r2: iv */
+ pop {%r0, %r2};
+
+ /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+ read_block_host(%r0, 0, %r7, %r8, %r5);
+ /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+ ldm %r2, {%r5, %r6};
+
+ /* out[1] ^= IV+1 */
+ eor %r10, %r7;
+ eor %r9, %r8;
+ /* out[0] ^= IV */
+ eor %r4, %r5;
+ eor %r3, %r6;
+
+ /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+ read_block_host(%r0, 8, %r7, %r8, %r5);
+ /* store IV+2 to iv[0] (aligned). */
+ stm %r2, {%r7, %r8};
+
+ /* store result to dst[0-3]. Might be unaligned. */
+ write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
--- cipher/serpent-armv7-neon.S
+++ cipher/serpent-armv7-neon.S
@@ -0,0 +1,869 @@
+/* serpent-armv7-neon.S - ARM/NEON assembly implementation of Serpent cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+/* ARM registers */
+#define RROUND r0
+
+/* NEON vector registers */
+#define RA0 q0
+#define RA1 q1
+#define RA2 q2
+#define RA3 q3
+#define RA4 q4
+#define RB0 q5
+#define RB1 q6
+#define RB2 q7
+#define RB3 q8
+#define RB4 q9
+
+#define RT0 q10
+#define RT1 q11
+#define RT2 q12
+#define RT3 q13
+
+#define RA0d0 d0
+#define RA0d1 d1
+#define RA1d0 d2
+#define RA1d1 d3
+#define RA2d0 d4
+#define RA2d1 d5
+#define RA3d0 d6
+#define RA3d1 d7
+#define RA4d0 d8
+#define RA4d1 d9
+#define RB0d0 d10
+#define RB0d1 d11
+#define RB1d0 d12
+#define RB1d1 d13
+#define RB2d0 d14
+#define RB2d1 d15
+#define RB3d0 d16
+#define RB3d1 d17
+#define RB4d0 d18
+#define RB4d1 d19
+#define RT0d0 d20
+#define RT0d1 d21
+#define RT1d0 d22
+#define RT1d1 d23
+#define RT2d0 d24
+#define RT2d1 d25
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+#define transpose_4x4(_q0, _q1, _q2, _q3) \
+ vtrn.32 _q0, _q1; \
+ vtrn.32 _q2, _q3; \
+ vswp _q0##d1, _q2##d0; \
+ vswp _q1##d1, _q3##d0;
+
+/**********************************************************************
+ 8-way serpent
+ **********************************************************************/
+
+/*
+ * These are the S-Boxes of Serpent from the following research paper.
+ *
+ * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ * (New York, New York, USA), p. 317–329, National Institute of Standards and
+ * Technology, 2000.
+ *
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ */
+#define SBOX0(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a3, a3, a0; veor b3, b3, b0; vmov a4, a1; vmov b4, b1; \
+ vand a1, a1, a3; vand b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a0, a0, a4; veor b0, b0, b4; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
+ veor a2, a2, a4; veor b2, b2, b4; vmvn a4, a4; vmvn b4, b4; \
+ vorr a4, a4, a1; vorr b4, b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a1, a1, a4; veor b1, b1, b4; vorr a3, a3, a0; vorr b3, b3, b0; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a3; veor b4, b3;
+
+#define SBOX0_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a2, a2; vmvn b2, b2; vmov a4, a1; vmov b4, b1; \
+ vorr a1, a1, a0; vorr b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
+ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a0, a0, a4; veor b0, b0, b4; \
+ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a3; vand b0, b0, b3; \
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a2, a2, a1; veor b2, b2, b1; veor a3, a3, a0; veor b3, b3, b0; \
+ veor a3, a3, a1; veor b3, b3, b1;\
+ vand a2, a2, a3; vand b2, b2, b3;\
+ veor a4, a2; veor b4, b2;
+
+#define SBOX1(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a0, a0; vmvn b0, b0; vmvn a2, a2; vmvn b2, b2; \
+ vmov a4, a0; vmov b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a2, a2, a0; veor b2, b2, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a3, a3, a2; veor b3, b3, b2; veor a1, a1, a0; veor b1, b1, b0; \
+ veor a0, a0, a4;
veor b0, b0, b4; vorr a4, a4, a1; vorr b4, b4, b1; \ + veor a1, a1, a3; veor b1, b1, b3; vorr a2, a2, a0; vorr b2, b2, b0; \ + vand a2, a2, a4; vand b2, b2, b4; veor a0, a0, a1; veor b0, b0, b1; \ + vand a1, a1, a2; vand b1, b1, b2;\ + veor a1, a1, a0; veor b1, b1, b0; vand a0, a0, a2; vand b0, b0, b2; \ + veor a0, a4; veor b0, b4; + +#define SBOX1_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmov a4, a1; vmov b4, b1; veor a1, a1, a3; veor b1, b1, b3; \ + vand a3, a3, a1; vand b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \ + veor a3, a3, a0; veor b3, b3, b0; vorr a0, a0, a1; vorr b0, b0, b1; \ + veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a4; veor b0, b0, b4; \ + vorr a0, a0, a2; vorr b0, b0, b2; veor a1, a1, a3; veor b1, b1, b3; \ + veor a0, a0, a1; veor b0, b0, b1; vorr a1, a1, a3; vorr b1, b1, b3; \ + veor a1, a1, a0; veor b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \ + veor a4, a4, a1; veor b4, b4, b1; vorr a1, a1, a0; vorr b1, b1, b0; \ + veor a1, a1, a0; veor b1, b1, b0;\ + vorr a1, a1, a4; vorr b1, b1, b4;\ + veor a3, a1; veor b3, b1; + +#define SBOX2(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmov a4, a0; vmov b4, b0; vand a0, a0, a2; vand b0, b0, b2; \ + veor a0, a0, a3; veor b0, b0, b3; veor a2, a2, a1; veor b2, b2, b1; \ + veor a2, a2, a0; veor b2, b2, b0; vorr a3, a3, a4; vorr b3, b3, b4; \ + veor a3, a3, a1; veor b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \ + vmov a1, a3; vmov b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \ + veor a3, a3, a0; veor b3, b3, b0; vand a0, a0, a1; vand b0, b0, b1; \ + veor a4, a4, a0; veor b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \ + veor a1, a1, a4; veor b1, b1, b4; vmvn a4, a4; vmvn b4, b4; + +#define SBOX2_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \ + vmov a4, a3; vmov b4, b3; vand a3, a3, a2; vand b3, b3, b2; \ + veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a2; vorr b1, b1, b2; \ + veor a1, a1, a4; veor b1, b1, b4; vand a4, a4, a3; vand b4, b4, b3; \ + veor a2, a2, a3; veor b2, b2, b3; vand a4, a4, a0; vand b4, b4, b0; \ + veor a4, a4, a2; veor b4, b4, b2; vand a2, a2, a1; vand b2, b2, b1; \ + vorr a2, a2, a0; vorr b2, b2, b0; vmvn a3, a3; vmvn b3, b3; \ + veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \ + vand a0, a0, a1; vand b0, b0, b1; veor a3, a3, a4; veor b3, b3, b4; \ + veor a3, a0; veor b3, b0; + +#define SBOX3(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmov a4, a0; vmov b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \ + veor a3, a3, a1; veor b3, b3, b1; vand a1, a1, a4; vand b1, b1, b4; \ + veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a3; veor b2, b2, b3; \ + vand a3, a3, a0; vand b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \ + veor a3, a3, a4; veor b3, b3, b4; veor a0, a0, a1; veor b0, b0, b1; \ + vand a4, a4, a0; vand b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \ + veor a4, a4, a2; veor b4, b4, b2; vorr a1, a1, a0; vorr b1, b1, b0; \ + veor a1, a1, a2; veor b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \ + vmov a2, a1; vmov b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \ + veor a1, a0; veor b1, b0; + +#define SBOX3_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmov a4, a2; vmov b4, b2; veor a2, a2, a1; veor b2, b2, b1; \ + veor a0, a0, a2; veor b0, b0, b2; vand a4, a4, a2; vand b4, b4, b2; \ + veor a4, a4, a0; veor b4, b4, b0; vand a0, a0, a1; vand b0, b0, b1; \ + veor a1, a1, a3; veor b1, b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \ + veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \ + veor a1, a1, 
a4; veor b1, b1, b4; vand a3, a3, a2; vand b3, b3, b2; \ + veor a3, a3, a1; veor b3, b3, b1; veor a1, a1, a0; veor b1, b1, b0; \ + vorr a1, a1, a2; vorr b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \ + veor a1, a1, a4; veor b1, b1, b4;\ + veor a0, a1; veor b0, b1; + +#define SBOX4(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + veor a1, a1, a3; veor b1, b1, b3; vmvn a3, a3; vmvn b3, b3; \ + veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \ + vmov a4, a1; vmov b4, b1; vand a1, a1, a3; vand b1, b1, b3; \ + veor a1, a1, a2; veor b1, b1, b2; veor a4, a4, a3; veor b4, b4, b3; \ + veor a0, a0, a4; veor b0, b0, b4; vand a2, a2, a4; vand b2, b2, b4; \ + veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a1; vand b0, b0, b1; \ + veor a3, a3, a0; veor b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \ + veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \ + veor a0, a0, a2; veor b0, b0, b2; vand a2, a2, a3; vand b2, b2, b3; \ + vmvn a0, a0; vmvn b0, b0; veor a4, a2; veor b4, b2; + +#define SBOX4_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmov a4, a2; vmov b4, b2; vand a2, a2, a3; vand b2, b2, b3; \ + veor a2, a2, a1; veor b2, b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \ + vand a1, a1, a0; vand b1, b1, b0; veor a4, a4, a2; veor b4, b4, b2; \ + veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \ + vmvn a0, a0; vmvn b0, b0; veor a3, a3, a4; veor b3, b3, b4; \ + veor a1, a1, a3; veor b1, b1, b3; vand a3, a3, a0; vand b3, b3, b0; \ + veor a3, a3, a2; veor b3, b3, b2; veor a0, a0, a1; veor b0, b0, b1; \ + vand a2, a2, a0; vand b2, b2, b0; veor a3, a3, a0; veor b3, b3, b0; \ + veor a2, a2, a4; veor b2, b2, b4;\ + vorr a2, a2, a3; vorr b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \ + veor a2, a1; veor b2, b1; + +#define SBOX5(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + veor a0, a0, a1; veor b0, b0, b1; veor a1, a1, a3; veor b1, b1, b3; \ + vmvn a3, a3; vmvn b3, b3; vmov a4, a1; vmov b4, b1; \ + vand a1, a1, a0; vand b1, b1, b0; veor a2, a2, a3; veor b2, b2, b3; \ + veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \ + veor a4, a4, a3; veor b4, b4, b3; vand a3, a3, a1; vand b3, b3, b1; \ + veor a3, a3, a0; veor b3, b3, b0; veor a4, a4, a1; veor b4, b4, b1; \ + veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a0; veor b2, b2, b0; \ + vand a0, a0, a3; vand b0, b0, b3; vmvn a2, a2; vmvn b2, b2; \ + veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a3; vorr b4, b4, b3; \ + veor a2, a4; veor b2, b4; + +#define SBOX5_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmvn a1, a1; vmvn b1, b1; vmov a4, a3; vmov b4, b3; \ + veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a0; vorr b3, b3, b0; \ + veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \ + vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \ + veor a2, a2, a4; veor b2, b2, b4; vorr a4, a4, a0; vorr b4, b4, b0; \ + veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \ + veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \ + vand a3, a3, a4; vand b3, b3, b4; veor a4, a4, a1; veor b4, b4, b1; \ + veor a3, a3, a4; veor b3, b3, b4; vmvn a4, a4; vmvn b4, b4; \ + veor a3, a0; veor b3, b0; + +#define SBOX6(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmvn a2, a2; vmvn b2, b2; vmov a4, a3; vmov b4, b3; \ + vand a3, a3, a0; vand b3, b3, b0; veor a0, a0, a4; veor b0, b0, b4; \ + veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a4; vorr b2, b2, b4; \ + veor a1, a1, a3; veor b1, b1, b3; veor a2, a2, a0; veor b2, b2, b0; \ + vorr a0, a0, a1; 
vorr b0, b0, b1; veor a2, a2, a1; veor b2, b2, b1; \ + veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \ + veor a0, a0, a2; veor b0, b0, b2; veor a4, a4, a3; veor b4, b4, b3; \ + veor a4, a4, a0; veor b4, b4, b0; vmvn a3, a3; vmvn b3, b3; \ + vand a2, a2, a4; vand b2, b2, b4;\ + veor a2, a3; veor b2, b3; + +#define SBOX6_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + veor a0, a0, a2; veor b0, b0, b2; vmov a4, a2; vmov b4, b2; \ + vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \ + vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \ + veor a2, a2, a3; veor b2, b2, b3; vorr a4, a4, a0; vorr b4, b4, b0; \ + veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \ + veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a3; vand b1, b1, b3; \ + veor a1, a1, a0; veor b1, b1, b0; veor a0, a0, a3; veor b0, b0, b3; \ + vorr a0, a0, a2; vorr b0, b0, b2; veor a3, a3, a1; veor b3, b3, b1; \ + veor a4, a0; veor b4, b0; + +#define SBOX7(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmov a4, a1; vmov b4, b1; vorr a1, a1, a2; vorr b1, b1, b2; \ + veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \ + veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a4; vorr b3, b3, b4; \ + vand a3, a3, a0; vand b3, b3, b0; veor a4, a4, a2; veor b4, b4, b2; \ + veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a4; vorr b1, b1, b4; \ + veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a4; vorr b0, b0, b4; \ + veor a0, a0, a2; veor b0, b0, b2; veor a1, a1, a4; veor b1, b1, b4; \ + veor a2, a2, a1; veor b2, b2, b1; vand a1, a1, a0; vand b1, b1, b0; \ + veor a1, a1, a4; veor b1, b1, b4; vmvn a2, a2; vmvn b2, b2; \ + vorr a2, a2, a0; vorr b2, b2, b0;\ + veor a4, a2; veor b4, b2; + +#define SBOX7_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vmov a4, a2; vmov b4, b2; veor a2, a2, a0; veor b2, b2, b0; \ + vand a0, a0, a3; vand b0, b0, b3; vorr a4, a4, a3; vorr b4, b4, b3; \ + vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \ + vorr a1, a1, a0; vorr b1, b1, b0; veor a0, a0, a2; veor b0, b0, b2; \ + vand a2, a2, a4; vand b2, b2, b4; vand a3, a3, a4; vand b3, b3, b4; \ + veor a1, a1, a2; veor b1, b1, b2; veor a2, a2, a0; veor b2, b2, b0; \ + vorr a0, a0, a2; vorr b0, b0, b2; veor a4, a4, a1; veor b4, b4, b1; \ + veor a0, a0, a3; veor b0, b0, b3; veor a3, a3, a4; veor b3, b3, b4; \ + vorr a4, a4, a0; vorr b4, b4, b0; veor a3, a3, a2; veor b3, b3, b2; \ + veor a4, a2; veor b4, b2; + +/* Apply SBOX number WHICH to to the block. */ +#define SBOX(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + SBOX##which (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) + +/* Apply inverse SBOX number WHICH to to the block. */ +#define SBOX_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + SBOX##which##_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) + +/* XOR round key into block state in a0,a1,a2,a3. a4 used as temporary. */ +#define BLOCK_XOR_KEY(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vdup.32 RT3, RT0d0[0]; \ + vdup.32 RT1, RT0d0[1]; \ + vdup.32 RT2, RT0d1[0]; \ + vdup.32 RT0, RT0d1[1]; \ + veor a0, a0, RT3; veor b0, b0, RT3; \ + veor a1, a1, RT1; veor b1, b1, RT1; \ + veor a2, a2, RT2; veor b2, b2, RT2; \ + veor a3, a3, RT0; veor b3, b3, RT0; + +#define BLOCK_LOAD_KEY_ENC() \ + vld1.8 {RT0d0, RT0d1}, [RROUND]!; + +#define BLOCK_LOAD_KEY_DEC() \ + vld1.8 {RT0d0, RT0d1}, [RROUND]; \ + sub RROUND, RROUND, #16 + +/* Apply the linear transformation to BLOCK. 
*/ +#define LINEAR_TRANSFORMATION(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vshl.u32 a4, a0, #13; vshl.u32 b4, b0, #13; \ + vshr.u32 a0, a0, #(32-13); vshr.u32 b0, b0, #(32-13); \ + veor a0, a0, a4; veor b0, b0, b4; \ + vshl.u32 a4, a2, #3; vshl.u32 b4, b2, #3; \ + vshr.u32 a2, a2, #(32-3); vshr.u32 b2, b2, #(32-3); \ + veor a2, a2, a4; veor b2, b2, b4; \ + veor a1, a0, a1; veor b1, b0, b1; \ + veor a1, a2, a1; veor b1, b2, b1; \ + vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \ + veor a3, a2, a3; veor b3, b2, b3; \ + veor a3, a4, a3; veor b3, b4, b3; \ + vshl.u32 a4, a1, #1; vshl.u32 b4, b1, #1; \ + vshr.u32 a1, a1, #(32-1); vshr.u32 b1, b1, #(32-1); \ + veor a1, a1, a4; veor b1, b1, b4; \ + vshl.u32 a4, a3, #7; vshl.u32 b4, b3, #7; \ + vshr.u32 a3, a3, #(32-7); vshr.u32 b3, b3, #(32-7); \ + veor a3, a3, a4; veor b3, b3, b4; \ + veor a0, a1, a0; veor b0, b1, b0; \ + veor a0, a3, a0; veor b0, b3, b0; \ + vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \ + veor a2, a3, a2; veor b2, b3, b2; \ + veor a2, a4, a2; veor b2, b4, b2; \ + vshl.u32 a4, a0, #5; vshl.u32 b4, b0, #5; \ + vshr.u32 a0, a0, #(32-5); vshr.u32 b0, b0, #(32-5); \ + veor a0, a0, a4; veor b0, b0, b4; \ + vshl.u32 a4, a2, #22; vshl.u32 b4, b2, #22; \ + vshr.u32 a2, a2, #(32-22); vshr.u32 b2, b2, #(32-22); \ + veor a2, a2, a4; veor b2, b2, b4; + +/* Apply the inverse linear transformation to BLOCK. */ +#define LINEAR_TRANSFORMATION_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + vshr.u32 a4, a2, #22; vshr.u32 b4, b2, #22; \ + vshl.u32 a2, a2, #(32-22); vshl.u32 b2, b2, #(32-22); \ + veor a2, a2, a4; veor b2, b2, b4; \ + vshr.u32 a4, a0, #5; vshr.u32 b4, b0, #5; \ + vshl.u32 a0, a0, #(32-5); vshl.u32 b0, b0, #(32-5); \ + veor a0, a0, a4; veor b0, b0, b4; \ + vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \ + veor a2, a3, a2; veor b2, b3, b2; \ + veor a2, a4, a2; veor b2, b4, b2; \ + veor a0, a1, a0; veor b0, b1, b0; \ + veor a0, a3, a0; veor b0, b3, b0; \ + vshr.u32 a4, a3, #7; vshr.u32 b4, b3, #7; \ + vshl.u32 a3, a3, #(32-7); vshl.u32 b3, b3, #(32-7); \ + veor a3, a3, a4; veor b3, b3, b4; \ + vshr.u32 a4, a1, #1; vshr.u32 b4, b1, #1; \ + vshl.u32 a1, a1, #(32-1); vshl.u32 b1, b1, #(32-1); \ + veor a1, a1, a4; veor b1, b1, b4; \ + vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \ + veor a3, a2, a3; veor b3, b2, b3; \ + veor a3, a4, a3; veor b3, b4, b3; \ + veor a1, a0, a1; veor b1, b0, b1; \ + veor a1, a2, a1; veor b1, b2, b1; \ + vshr.u32 a4, a2, #3; vshr.u32 b4, b2, #3; \ + vshl.u32 a2, a2, #(32-3); vshl.u32 b2, b2, #(32-3); \ + veor a2, a2, a4; veor b2, b2, b4; \ + vshr.u32 a4, a0, #13; vshr.u32 b4, b0, #13; \ + vshl.u32 a0, a0, #(32-13); vshl.u32 b0, b0, #(32-13); \ + veor a0, a0, a4; veor b0, b0, b4; + +/* Apply a Serpent round to eight parallel blocks. This macro increments + `round'. */ +#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ + b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ + BLOCK_LOAD_KEY_ENC (); \ + SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ + LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); + +/* Apply the last Serpent round to eight parallel blocks. This macro increments + `round'. 
*/ +#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ + b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ + BLOCK_LOAD_KEY_ENC (); \ + SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ + BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); + +/* Apply an inverse Serpent round to eight parallel blocks. This macro + increments `round'. */ +#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ + na0, na1, na2, na3, na4, \ + b0, b1, b2, b3, b4, \ + nb0, nb1, nb2, nb3, nb4) \ + LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ + SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ + BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \ + BLOCK_LOAD_KEY_DEC (); + +/* Apply the first inverse Serpent round to eight parallel blocks. This macro + increments `round'. */ +#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ + na0, na1, na2, na3, na4, \ + b0, b1, b2, b3, b4, \ + nb0, nb1, nb2, nb3, nb4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ + BLOCK_LOAD_KEY_DEC (); \ + SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ + BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \ + BLOCK_LOAD_KEY_DEC (); + +.align 3 +.type __serpent_enc_blk8,%function; +__serpent_enc_blk8: + /* input: + * r0: round key pointer + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext + * blocks + * output: + * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel + * ciphertext blocks + */ + + transpose_4x4(RA0, RA1, RA2, RA3); + BLOCK_LOAD_KEY_ENC (); + transpose_4x4(RB0, RB1, RB2, RB3); + + ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, + RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); + ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, + RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); + ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, + RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); + ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, + RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); + ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, + RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); + ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, + RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); + ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, + RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); + ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, + RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); + ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, + RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); + ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, + RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); + ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, + RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); + ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, + RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); + ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0, + RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); + ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, + RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); + ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, + RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3); + ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, 
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); + ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, + RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); + ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, + RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); + ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, + RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); + ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, + RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); + ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, + RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); + ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, + RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); + ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, + RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); + ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4, + RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); + ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, + RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); + ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, + RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); + ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, + RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); + ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, + RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); + ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, + RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); + ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, + RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); + ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, + RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); + ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, + RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); + + transpose_4x4(RA4, RA1, RA2, RA0); + transpose_4x4(RB4, RB1, RB2, RB0); + + bx lr; +.size __serpent_enc_blk8,.-__serpent_enc_blk8; + +.align 3 +.type __serpent_dec_blk8,%function; +__serpent_dec_blk8: + /* input: + * r0: round key pointer + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext + * blocks + */ + + add RROUND, RROUND, #(32*16); + + transpose_4x4(RA0, RA1, RA2, RA3); + BLOCK_LOAD_KEY_DEC (); + transpose_4x4(RB0, RB1, RB2, RB3); + + ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, + RA3, RA0, RA1, RA4, RA2, + RB0, RB1, RB2, RB3, RB4, + RB3, RB0, RB1, RB4, RB2); + ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, + RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); + ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, + RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); + ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, + RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); + ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, + RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); + ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, + RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4); + ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, + RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); + ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, + RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1); + 
ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, + RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); + ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, + RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); + ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, + RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); + ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, + RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); + ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, + RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); + ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, + RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); + ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, + RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); + ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, + RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3); + ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, + RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); + ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, + RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); + ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, + RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); + ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, + RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); + ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, + RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); + ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, + RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); + ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, + RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); + ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, + RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); + ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, + RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); + ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, + RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); + ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, + RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); + ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, + RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); + ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, + RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); + ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, + RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); + ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, + RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); + ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, + RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); + + transpose_4x4(RA0, RA1, RA2, RA3); + transpose_4x4(RB0, RB1, RB2, RB3); + + bx lr; +.size __serpent_dec_blk8,.-__serpent_dec_blk8; + +.align 3 +.globl _gcry_serpent_neon_ctr_enc +.type _gcry_serpent_neon_ctr_enc,%function; +_gcry_serpent_neon_ctr_enc: + /* input: + * r0: ctx, CTX + * r1: dst (8 blocks) + * r2: src (8 blocks) + * r3: iv + */ + + vmov.u8 RT1d0, #0xff; /* u64: -1 */ + push {r4,lr}; + vadd.u64 RT2d0, RT1d0, RT1d0; /* u64: -2 */ + vpush {RA4-RB2}; + + /* load IV and byteswap */ + vld1.8 {RA0}, 
[r3]; + vrev64.u8 RT0, RA0; /* be => le */ + ldr r4, [r3, #8]; + + /* construct IVs */ + vsub.u64 RA2d1, RT0d1, RT2d0; /* +2 */ + vsub.u64 RA1d1, RT0d1, RT1d0; /* +1 */ + cmp r4, #-1; + + vsub.u64 RB0d1, RA2d1, RT2d0; /* +4 */ + vsub.u64 RA3d1, RA2d1, RT1d0; /* +3 */ + ldr r4, [r3, #12]; + + vsub.u64 RB2d1, RB0d1, RT2d0; /* +6 */ + vsub.u64 RB1d1, RB0d1, RT1d0; /* +5 */ + + vsub.u64 RT2d1, RB2d1, RT2d0; /* +8 */ + vsub.u64 RB3d1, RB2d1, RT1d0; /* +7 */ + + vmov RA1d0, RT0d0; + vmov RA2d0, RT0d0; + vmov RA3d0, RT0d0; + vmov RB0d0, RT0d0; + rev r4, r4; + vmov RB1d0, RT0d0; + vmov RB2d0, RT0d0; + vmov RB3d0, RT0d0; + vmov RT2d0, RT0d0; + + /* check need for handling 64-bit overflow and carry */ + beq .Ldo_ctr_carry; + +.Lctr_carry_done: + /* le => be */ + vrev64.u8 RA1, RA1; + vrev64.u8 RA2, RA2; + vrev64.u8 RA3, RA3; + vrev64.u8 RB0, RB0; + vrev64.u8 RT2, RT2; + vrev64.u8 RB1, RB1; + vrev64.u8 RB2, RB2; + vrev64.u8 RB3, RB3; + /* store new IV */ + vst1.8 {RT2}, [r3]; + + bl __serpent_enc_blk8; + + vld1.8 {RT0, RT1}, [r2]!; + vld1.8 {RT2, RT3}, [r2]!; + veor RA4, RA4, RT0; + veor RA1, RA1, RT1; + vld1.8 {RT0, RT1}, [r2]!; + veor RA2, RA2, RT2; + veor RA0, RA0, RT3; + vld1.8 {RT2, RT3}, [r2]!; + veor RB4, RB4, RT0; + veor RT0, RT0; + veor RB1, RB1, RT1; + veor RT1, RT1; + veor RB2, RB2, RT2; + veor RT2, RT2; + veor RB0, RB0, RT3; + veor RT3, RT3; + + vst1.8 {RA4}, [r1]!; + vst1.8 {RA1}, [r1]!; + veor RA1, RA1; + vst1.8 {RA2}, [r1]!; + veor RA2, RA2; + vst1.8 {RA0}, [r1]!; + veor RA0, RA0; + vst1.8 {RB4}, [r1]!; + veor RB4, RB4; + vst1.8 {RB1}, [r1]!; + vst1.8 {RB2}, [r1]!; + vst1.8 {RB0}, [r1]!; + + vpop {RA4-RB2}; + + /* clear the used registers */ + veor RA3, RA3; + veor RB3, RB3; + + pop {r4,pc}; + +.Ldo_ctr_carry: + cmp r4, #-8; + blo .Lctr_carry_done; + beq .Lcarry_RT2; + + cmp r4, #-6; + blo .Lcarry_RB3; + beq .Lcarry_RB2; + + cmp r4, #-4; + blo .Lcarry_RB1; + beq .Lcarry_RB0; + + cmp r4, #-2; + blo .Lcarry_RA3; + beq .Lcarry_RA2; + + vsub.u64 RA1d0, RT1d0; +.Lcarry_RA2: + vsub.u64 RA2d0, RT1d0; +.Lcarry_RA3: + vsub.u64 RA3d0, RT1d0; +.Lcarry_RB0: + vsub.u64 RB0d0, RT1d0; +.Lcarry_RB1: + vsub.u64 RB1d0, RT1d0; +.Lcarry_RB2: + vsub.u64 RB2d0, RT1d0; +.Lcarry_RB3: + vsub.u64 RB3d0, RT1d0; +.Lcarry_RT2: + vsub.u64 RT2d0, RT1d0; + + b .Lctr_carry_done; +.size _gcry_serpent_neon_ctr_enc,.-_gcry_serpent_neon_ctr_enc; + +.align 3 +.globl _gcry_serpent_neon_cfb_dec +.type _gcry_serpent_neon_cfb_dec,%function; +_gcry_serpent_neon_cfb_dec: + /* input: + * r0: ctx, CTX + * r1: dst (8 blocks) + * r2: src (8 blocks) + * r3: iv + */ + + push {lr}; + vpush {RA4-RB2}; + + /* Load input */ + vld1.8 {RA0}, [r3]; + vld1.8 {RA1, RA2}, [r2]!; + vld1.8 {RA3}, [r2]!; + vld1.8 {RB0}, [r2]!; + vld1.8 {RB1, RB2}, [r2]!; + vld1.8 {RB3}, [r2]!; + + /* Update IV */ + vld1.8 {RT0}, [r2]!; + vst1.8 {RT0}, [r3]; + mov r3, lr; + sub r2, r2, #(8*16); + + bl __serpent_enc_blk8; + + vld1.8 {RT0, RT1}, [r2]!; + vld1.8 {RT2, RT3}, [r2]!; + veor RA4, RA4, RT0; + veor RA1, RA1, RT1; + vld1.8 {RT0, RT1}, [r2]!; + veor RA2, RA2, RT2; + veor RA0, RA0, RT3; + vld1.8 {RT2, RT3}, [r2]!; + veor RB4, RB4, RT0; + veor RT0, RT0; + veor RB1, RB1, RT1; + veor RT1, RT1; + veor RB2, RB2, RT2; + veor RT2, RT2; + veor RB0, RB0, RT3; + veor RT3, RT3; + + vst1.8 {RA4}, [r1]!; + vst1.8 {RA1}, [r1]!; + veor RA1, RA1; + vst1.8 {RA2}, [r1]!; + veor RA2, RA2; + vst1.8 {RA0}, [r1]!; + veor RA0, RA0; + vst1.8 {RB4}, [r1]!; + veor RB4, RB4; + vst1.8 {RB1}, [r1]!; + vst1.8 {RB2}, [r1]!; + vst1.8 {RB0}, [r1]!; + + vpop {RA4-RB2}; + + /* clear the used 
registers */ + veor RA3, RA3; + veor RB3, RB3; + + pop {pc}; +.size _gcry_serpent_neon_cfb_dec,.-_gcry_serpent_neon_cfb_dec; + +.align 3 +.globl _gcry_serpent_neon_cbc_dec +.type _gcry_serpent_neon_cbc_dec,%function; +_gcry_serpent_neon_cbc_dec: + /* input: + * r0: ctx, CTX + * r1: dst (8 blocks) + * r2: src (8 blocks) + * r3: iv + */ + + push {lr}; + vpush {RA4-RB2}; + + vld1.8 {RA0, RA1}, [r2]!; + vld1.8 {RA2, RA3}, [r2]!; + vld1.8 {RB0, RB1}, [r2]!; + vld1.8 {RB2, RB3}, [r2]!; + sub r2, r2, #(8*16); + + bl __serpent_dec_blk8; + + vld1.8 {RB4}, [r3]; + vld1.8 {RT0, RT1}, [r2]!; + vld1.8 {RT2, RT3}, [r2]!; + veor RA0, RA0, RB4; + veor RA1, RA1, RT0; + veor RA2, RA2, RT1; + vld1.8 {RT0, RT1}, [r2]!; + veor RA3, RA3, RT2; + veor RB0, RB0, RT3; + vld1.8 {RT2, RT3}, [r2]!; + veor RB1, RB1, RT0; + veor RT0, RT0; + veor RB2, RB2, RT1; + veor RT1, RT1; + veor RB3, RB3, RT2; + veor RT2, RT2; + vst1.8 {RT3}, [r3]; /* store new IV */ + veor RT3, RT3; + + vst1.8 {RA0, RA1}, [r1]!; + veor RA0, RA0; + veor RA1, RA1; + vst1.8 {RA2, RA3}, [r1]!; + veor RA2, RA2; + vst1.8 {RB0, RB1}, [r1]!; + veor RA3, RA3; + vst1.8 {RB2, RB3}, [r1]!; + veor RB3, RB3; + + vpop {RA4-RB2}; + + /* clear the used registers */ + veor RB4, RB4; + + pop {pc}; +.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec; + +#endif
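
For reference, both CBC-decrypt entry points in this patch (_gcry_blowfish_arm_cbc_dec and _gcry_serpent_neon_cbc_dec) follow the same bookkeeping: decrypt a batch of ciphertext blocks, XOR each result with the preceding ciphertext block (the IV for the first one), and store the last ciphertext block back as the new IV. A minimal C sketch of that pattern, assuming non-overlapping dst/src buffers (illustrative only; the function and callback names are hypothetical, not libgcrypt API):

#include <stdint.h>
#include <string.h>

typedef void (*decrypt_block_fn) (void *ctx, uint8_t *out, const uint8_t *in);

/* CBC decryption of nblocks blocks of blksize bytes:
 *   P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV; new IV = C[nblocks-1]. */
static void
cbc_dec_blocks (void *ctx, decrypt_block_fn decrypt_block,
                uint8_t *dst, const uint8_t *src, uint8_t *iv,
                size_t blksize, size_t nblocks)
{
  const uint8_t *prev = iv;
  size_t i, j;

  for (i = 0; i < nblocks; i++)
    {
      decrypt_block (ctx, dst + i * blksize, src + i * blksize);
      for (j = 0; j < blksize; j++)
        dst[i * blksize + j] ^= prev[j];    /* chain on previous ciphertext */
      prev = src + i * blksize;
    }

  memcpy (iv, src + (nblocks - 1) * blksize, blksize);  /* store new IV */
}
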