summaryrefslogtreecommitdiff
path: root/lib/crypto
diff options
context:
space:
mode:
authorEric Biggers <ebiggers@kernel.org>2025-10-25 22:50:26 -0700
committerEric Biggers <ebiggers@kernel.org>2025-11-05 20:02:35 -0800
commit1e29a750572a25200fcea995d91e5f6448f340c0 (patch)
treed19698b26f79f9af8bc3bb8ef8d655681650e9cc /lib/crypto
parentbe755eb2b021495e1935813abaf9e1f1de1d8c78 (diff)
lib/crypto: arm64/sha3: Migrate optimized code into library
Instead of exposing the arm64-optimized SHA-3 code via arm64-specific crypto_shash algorithms, just implement the sha3_absorb_blocks() and sha3_keccakf() library functions. This is much simpler, it makes the SHA-3 library functions arm64-optimized, and it fixes the longstanding issue where the arm64-optimized SHA-3 code was disabled by default. SHA-3 still remains available through crypto_shash, but individual architectures no longer need to handle it. Note: to see the diff from arch/arm64/crypto/sha3-ce-glue.c to lib/crypto/arm64/sha3.h, view this commit with 'git show -M10'. Reviewed-by: Ard Biesheuvel <ardb@kernel.org> Link: https://lore.kernel.org/r/20251026055032.1413733-10-ebiggers@kernel.org Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Diffstat (limited to 'lib/crypto')
-rw-r--r--lib/crypto/Kconfig5
-rw-r--r--lib/crypto/Makefile5
-rw-r--r--lib/crypto/arm64/sha3-ce-core.S213
-rw-r--r--lib/crypto/arm64/sha3.h62
4 files changed, 285 insertions, 0 deletions
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index a05f5a349cd8..587490ca6565 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -202,6 +202,11 @@ config CRYPTO_LIB_SHA3
The SHA3 library functions. Select this if your module uses any of
the functions from <crypto/sha3.h>.
+# Hidden bool: on when an architecture-optimized SHA-3 implementation is built
+# into the SHA-3 library. Only arm64 with kernel-mode NEON enables it here.
+config CRYPTO_LIB_SHA3_ARCH
+ bool
+ depends on CRYPTO_LIB_SHA3 && !UML
+ default y if ARM64 && KERNEL_MODE_NEON
+
config CRYPTO_LIB_SM3
tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 0cfdb511f32b..5515e73bfd5e 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -281,6 +281,11 @@ endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o
libsha3-y := sha3.o
+# Arch-optimized SHA-3: add the $(SRCARCH) subdirectory to the include path of
+# sha3.o (presumably so sha3.c can include the per-arch sha3.h -- confirm) and
+# link the arm64 Crypto Extensions assembly into libsha3.
+ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y)
+CFLAGS_sha3.o += -I$(src)/$(SRCARCH)
+libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o
+endif # CONFIG_CRYPTO_LIB_SHA3_ARCH
+
################################################################################
obj-$(CONFIG_MPILIB) += mpi/
diff --git a/lib/crypto/arm64/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S
new file mode 100644
index 000000000000..b62bd714839b
--- /dev/null
+++ b/lib/crypto/arm64/sha3-ce-core.S
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ /*
+ * For every vector register N, define the assembler symbols .Lv<N>.2d and
+ * .Lv<N>.16b, both equal to N. The macros below prepend ".L" to their
+ * operands, which turns a register name such as v5.16b into the register
+ * number that is ORed into the instruction encoding.
+ */
+ .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ .set .Lv\b\().2d, \b
+ .set .Lv\b\().16b, \b
+ .endr
+
+ /*
+ * ARMv8.2 Crypto Extensions instructions
+ *
+ * Each macro emits the raw instruction word with .inst instead of using
+ * the mnemonic (presumably so that assemblers lacking SHA-3 support can
+ * still build this file -- confirm).
+ *
+ * Operand semantics, to be verified against the Arm ARM:
+ * eor3 rd = rn ^ rm ^ ra
+ * rax1 rd = rn ^ rol64(rm, 1)
+ * bcax rd = rn ^ (rm & ~ra)
+ * xar rd = ror64(rn ^ rm, imm6)
+ */
+ .macro eor3, rd, rn, rm, ra
+ .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro rax1, rd, rn, rm
+ .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro bcax, rd, rn, rm, ra
+ .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ /* imm6 is a 6-bit immediate placed in bits [15:10] of the encoding. */
+ .macro xar, rd, rn, rm, imm6
+ .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
+ .endm
+
+ /*
+ * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+ * size_t nblocks, size_t block_size)
+ *
+ * Absorbs up to nblocks blocks from data into state: each block is XORed
+ * into the leading 64-bit lanes of the state, followed by 24 rounds of the
+ * permutation.
+ *
+ * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
+ * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
+ *
+ * nblocks must be at least 1: the block counter is decremented before it
+ * is ever tested, so a count of 0 would wrap around.
+ *
+ * Returns the number of blocks that were NOT processed. This is nonzero
+ * only when cond_yield took the early exit; the caller is expected to call
+ * again with the remaining blocks.
+ *
+ * Register use: x0 = state, x1 = data (advanced as it is read),
+ * x2 = blocks remaining, x3 = block_size, w8 = round counter,
+ * x9 = round-constant pointer, v0-v24 = the 25 state lanes,
+ * v25-v31 = temporaries.
+ */
+ .text
+SYM_FUNC_START(sha3_ce_transform)
+ /* load state */
+ /* One 64-bit lane per register: 6 x 4 lanes + 1 lane = 25 lanes. */
+ add x8, x0, #32
+ ld1 { v0.1d- v3.1d}, [x0]
+ ld1 { v4.1d- v7.1d}, [x8], #32
+ ld1 { v8.1d-v11.1d}, [x8], #32
+ ld1 {v12.1d-v15.1d}, [x8], #32
+ ld1 {v16.1d-v19.1d}, [x8], #32
+ ld1 {v20.1d-v23.1d}, [x8], #32
+ ld1 {v24.1d}, [x8]
+
+ /*
+ * Per-block loop: count this block as consumed, then reset the round
+ * counter and the round-constant pointer.
+ */
+0: sub x2, x2, #1
+ mov w8, #24
+ adr_l x9, .Lsha3_rcon
+
+ /* load input */
+ /* Lanes 0-8 (72 bytes) are absorbed for every supported block size. */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b}, [x1], #8
+ eor v0.8b, v0.8b, v25.8b
+ eor v1.8b, v1.8b, v26.8b
+ eor v2.8b, v2.8b, v27.8b
+ eor v3.8b, v3.8b, v28.8b
+ eor v4.8b, v4.8b, v29.8b
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v5.8b, v5.8b, v25.8b
+ eor v6.8b, v6.8b, v26.8b
+ eor v7.8b, v7.8b, v27.8b
+ eor v8.8b, v8.8b, v28.8b
+ cmp x3, #72
+ b.eq 3f /* SHA3-512 (block_size=72)? */
+
+ /* Lanes 9-12: 104 bytes absorbed so far. */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v9.8b, v9.8b, v25.8b
+ eor v10.8b, v10.8b, v26.8b
+ eor v11.8b, v11.8b, v27.8b
+ eor v12.8b, v12.8b, v28.8b
+ cmp x3, #104
+ b.eq 3f /* SHA3-384 (block_size=104)? */
+
+ /* Lanes 13-16: 136 bytes absorbed so far. */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v13.8b, v13.8b, v25.8b
+ eor v14.8b, v14.8b, v26.8b
+ eor v15.8b, v15.8b, v27.8b
+ eor v16.8b, v16.8b, v28.8b
+ cmp x3, #144
+ b.lt 3f /* SHA3-256 or SHAKE256 (block_size=136)? */
+ b.eq 2f /* SHA3-224 (block_size=144)? */
+
+ /* SHAKE128 (block_size=168) */
+ /* Lanes 17-20 bring the total to 168 bytes. */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v17.8b, v17.8b, v25.8b
+ eor v18.8b, v18.8b, v26.8b
+ eor v19.8b, v19.8b, v27.8b
+ eor v20.8b, v20.8b, v28.8b
+ b 3f
+2:
+ /* SHA3-224 (block_size=144) */
+ /* Only lane 17 is added, bringing the total to 144 bytes. */
+ ld1 {v25.8b}, [x1], #8
+ eor v17.8b, v17.8b, v25.8b
+
+ /* Round loop: executed 24 times per block (w8 counts down to 0). */
+3: sub w8, w8, #1
+
+ /*
+ * XOR together the five lanes of each column into v25-v29 (the theta
+ * column parities), two eor3 steps per column.
+ */
+ eor3 v29.16b, v4.16b, v9.16b, v14.16b
+ eor3 v26.16b, v1.16b, v6.16b, v11.16b
+ eor3 v28.16b, v3.16b, v8.16b, v13.16b
+ eor3 v25.16b, v0.16b, v5.16b, v10.16b
+ eor3 v27.16b, v2.16b, v7.16b, v12.16b
+ eor3 v29.16b, v29.16b, v19.16b, v24.16b
+ eor3 v26.16b, v26.16b, v16.16b, v21.16b
+ eor3 v28.16b, v28.16b, v18.16b, v23.16b
+ eor3 v25.16b, v25.16b, v15.16b, v20.16b
+ eor3 v27.16b, v27.16b, v17.16b, v22.16b
+
+ /*
+ * Combine neighbouring column parities into the per-column values
+ * that get XORed into the lanes below. The order matters: v26, v28,
+ * v25 and v27 are overwritten in place while later lines still read
+ * the not-yet-overwritten parities.
+ */
+ rax1 v30.2d, v29.2d, v26.2d // bc[0]
+ rax1 v26.2d, v26.2d, v28.2d // bc[2]
+ rax1 v28.2d, v28.2d, v25.2d // bc[4]
+ rax1 v25.2d, v25.2d, v27.2d // bc[1]
+ rax1 v27.2d, v27.2d, v29.2d // bc[3]
+
+ /*
+ * XOR the column value into each lane, rotate it and move it to its
+ * new register in a single xar per lane (lane 0 needs no rotation,
+ * hence the plain eor). The immediates are written as (64 - n).
+ * The sequence is a chain: each destination was read as a source by
+ * an earlier line, so the statements must not be reordered.
+ */
+ eor v0.16b, v0.16b, v30.16b
+ xar v29.2d, v1.2d, v25.2d, (64 - 1)
+ xar v1.2d, v6.2d, v25.2d, (64 - 44)
+ xar v6.2d, v9.2d, v28.2d, (64 - 20)
+ xar v9.2d, v22.2d, v26.2d, (64 - 61)
+ xar v22.2d, v14.2d, v28.2d, (64 - 39)
+ xar v14.2d, v20.2d, v30.2d, (64 - 18)
+ xar v31.2d, v2.2d, v26.2d, (64 - 62)
+ xar v2.2d, v12.2d, v26.2d, (64 - 43)
+ xar v12.2d, v13.2d, v27.2d, (64 - 25)
+ xar v13.2d, v19.2d, v28.2d, (64 - 8)
+ xar v19.2d, v23.2d, v27.2d, (64 - 56)
+ xar v23.2d, v15.2d, v30.2d, (64 - 41)
+ xar v15.2d, v4.2d, v28.2d, (64 - 27)
+ xar v28.2d, v24.2d, v28.2d, (64 - 14)
+ xar v24.2d, v21.2d, v25.2d, (64 - 2)
+ xar v8.2d, v8.2d, v27.2d, (64 - 55)
+ xar v4.2d, v16.2d, v25.2d, (64 - 45)
+ xar v16.2d, v5.2d, v30.2d, (64 - 36)
+ xar v5.2d, v3.2d, v27.2d, (64 - 28)
+ xar v27.2d, v18.2d, v27.2d, (64 - 21)
+ xar v3.2d, v17.2d, v26.2d, (64 - 15)
+ xar v25.2d, v11.2d, v25.2d, (64 - 10)
+ xar v26.2d, v7.2d, v26.2d, (64 - 6)
+ xar v30.2d, v10.2d, v30.2d, (64 - 3)
+
+ /*
+ * Row-wise bcax groups, five lanes at a time, writing the final lane
+ * registers v0-v24 for this round (the chi step).
+ */
+ bcax v20.16b, v31.16b, v22.16b, v8.16b
+ bcax v21.16b, v8.16b, v23.16b, v22.16b
+ bcax v22.16b, v22.16b, v24.16b, v23.16b
+ bcax v23.16b, v23.16b, v31.16b, v24.16b
+ bcax v24.16b, v24.16b, v8.16b, v31.16b
+
+ /*
+ * v31 is free now: load this round's constant, replicated into both
+ * 64-bit halves, and advance x9 to the next constant.
+ */
+ ld1r {v31.2d}, [x9], #8
+
+ bcax v17.16b, v25.16b, v19.16b, v3.16b
+ bcax v18.16b, v3.16b, v15.16b, v19.16b
+ bcax v19.16b, v19.16b, v16.16b, v15.16b
+ bcax v15.16b, v15.16b, v25.16b, v16.16b
+ bcax v16.16b, v16.16b, v3.16b, v25.16b
+
+ bcax v10.16b, v29.16b, v12.16b, v26.16b
+ bcax v11.16b, v26.16b, v13.16b, v12.16b
+ bcax v12.16b, v12.16b, v14.16b, v13.16b
+ bcax v13.16b, v13.16b, v29.16b, v14.16b
+ bcax v14.16b, v14.16b, v26.16b, v29.16b
+
+ bcax v7.16b, v30.16b, v9.16b, v4.16b
+ bcax v8.16b, v4.16b, v5.16b, v9.16b
+ bcax v9.16b, v9.16b, v6.16b, v5.16b
+ bcax v5.16b, v5.16b, v30.16b, v6.16b
+ bcax v6.16b, v6.16b, v4.16b, v30.16b
+
+ bcax v3.16b, v27.16b, v0.16b, v28.16b
+ bcax v4.16b, v28.16b, v1.16b, v0.16b
+ bcax v0.16b, v0.16b, v2.16b, v1.16b
+ bcax v1.16b, v1.16b, v27.16b, v2.16b
+ bcax v2.16b, v2.16b, v28.16b, v27.16b
+
+ /* XOR the round constant into lane 0 (the iota step). */
+ eor v0.16b, v0.16b, v31.16b
+
+ /* Next round, until all 24 are done. */
+ cbnz w8, 3b
+ /*
+ * cond_yield comes from <asm/assembler.h>. It may branch to 4f, i.e.
+ * save the state and return early with x2 blocks still pending
+ * (presumably when a reschedule is due -- see the macro definition).
+ * x8 and x9 are free here; both are reinitialised at 0: before use.
+ */
+ cond_yield 4f, x8, x9
+ /* Otherwise continue with the next block, if any remain. */
+ cbnz x2, 0b
+
+ /* save state */
+4: st1 { v0.1d- v3.1d}, [x0], #32
+ st1 { v4.1d- v7.1d}, [x0], #32
+ st1 { v8.1d-v11.1d}, [x0], #32
+ st1 {v12.1d-v15.1d}, [x0], #32
+ st1 {v16.1d-v19.1d}, [x0], #32
+ st1 {v20.1d-v23.1d}, [x0], #32
+ st1 {v24.1d}, [x0]
+ /* Return value: number of blocks left unprocessed. */
+ mov x0, x2
+ ret
+SYM_FUNC_END(sha3_ce_transform)
+
+ /*
+ * The 24 64-bit round constants, one per round. sha3_ce_transform reads
+ * them in order through x9 with a post-incrementing ld1r and XORs each one
+ * into state lane 0 at the end of its round.
+ */
+ .section ".rodata", "a"
+ .align 8
+.Lsha3_rcon:
+ .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
+ .quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
+ .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
+ .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
+ .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
+ .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
+ .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
+ .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
diff --git a/lib/crypto/arm64/sha3.h b/lib/crypto/arm64/sha3.h
new file mode 100644
index 000000000000..6dd5183056da
--- /dev/null
+++ b/lib/crypto/arm64/sha3.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+/*
+ * Static key gating the Crypto Extensions code path. It starts out false and
+ * is enabled by sha3_mod_init_arch() when the CPU reports the SHA3 feature.
+ */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+
+/*
+ * Implemented in sha3-ce-core.S. Absorbs up to @nblocks blocks of @block_size
+ * bytes from @data into @state and returns the number of blocks it did NOT
+ * process. @nblocks must be nonzero.
+ */
+asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size);
+
+/*
+ * Absorb @nblocks full blocks of @block_size bytes from @data into @state.
+ *
+ * Uses sha3_ce_transform() when the have_sha3 static key is enabled and SIMD
+ * may be used in the current context; otherwise falls back to
+ * sha3_absorb_blocks_generic().
+ *
+ * sha3_ce_transform() decrements its block counter before testing it, so it
+ * must never be entered with a count of zero. The pre-checked loop guarantees
+ * that and makes a zero-block call a no-op. The assembly may also return early
+ * and report how many blocks are still pending; in that case it is called
+ * again for the remainder, inside a fresh kernel_neon_begin()/end() section.
+ */
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+			       size_t nblocks, size_t block_size)
+{
+	if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+		while (nblocks) {
+			size_t rem;
+
+			kernel_neon_begin();
+			rem = sha3_ce_transform(state, data, nblocks,
+						block_size);
+			kernel_neon_end();
+			/* Step over the blocks that were actually consumed. */
+			data += (nblocks - rem) * block_size;
+			nblocks = rem;
+		}
+	} else {
+		sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+	}
+}
+
+/*
+ * Apply one Keccak-f permutation to @state, using the Crypto Extensions code
+ * when it is available and SIMD is usable, and sha3_keccakf_generic()
+ * otherwise.
+ */
+static void sha3_keccakf(struct sha3_state *state)
+{
+	/*
+	 * Absorbing an all-zero block leaves the state unchanged before the
+	 * permutation runs, so sha3_ce_transform() on zeroes is the plain
+	 * Keccak-f permutation. Any supported block size would work;
+	 * SHA3_512_BLOCK_SIZE is the shortest one.
+	 */
+	static const u8 zero_block[SHA3_512_BLOCK_SIZE];
+
+	if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+		kernel_neon_begin();
+		sha3_ce_transform(state, zero_block, 1, sizeof(zero_block));
+		kernel_neon_end();
+		return;
+	}
+
+	sha3_keccakf_generic(state);
+}
+
+/*
+ * Arch init hook: turn on the Crypto Extensions code path when the CPU
+ * reports the SHA3 feature. The self-referential #define makes the presence
+ * of this hook detectable with the preprocessor.
+ */
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+	if (!cpu_have_named_feature(SHA3))
+		return;
+
+	static_branch_enable(&have_sha3);
+}