Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/crypto/Kconfig                   16
-rw-r--r--  arch/arm/crypto/Makefile                   2
-rw-r--r--  arch/arm/crypto/blake2b-neon-core.S      347
-rw-r--r--  arch/arm/crypto/blake2b-neon-glue.c      104
-rw-r--r--  arch/arm64/configs/defconfig               2
-rw-r--r--  arch/arm64/crypto/Kconfig                 21
-rw-r--r--  arch/arm64/crypto/Makefile                 6
-rw-r--r--  arch/arm64/crypto/polyval-ce-core.S      361
-rw-r--r--  arch/arm64/crypto/polyval-ce-glue.c      158
-rw-r--r--  arch/arm64/crypto/sha3-ce-core.S         212
-rw-r--r--  arch/arm64/crypto/sha3-ce-glue.c         151
-rw-r--r--  arch/s390/configs/debug_defconfig          3
-rw-r--r--  arch/s390/configs/defconfig                3
-rw-r--r--  arch/s390/crypto/Kconfig                  20
-rw-r--r--  arch/s390/crypto/Makefile                  2
-rw-r--r--  arch/s390/crypto/sha.h                    51
-rw-r--r--  arch/s390/crypto/sha3_256_s390.c         157
-rw-r--r--  arch/s390/crypto/sha3_512_s390.c         157
-rw-r--r--  arch/s390/crypto/sha_common.c            117
-rw-r--r--  arch/x86/crypto/Kconfig                   10
-rw-r--r--  arch/x86/crypto/Makefile                   3
-rw-r--r--  arch/x86/crypto/polyval-clmulni_asm.S    321
-rw-r--r--  arch/x86/crypto/polyval-clmulni_glue.c   180
23 files changed, 3 insertions(+), 2401 deletions(-)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index c436eec22d86..f30d743df264 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -33,22 +33,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm using: - NEON (Advanced SIMD) extensions -config CRYPTO_BLAKE2B_NEON - tristate "Hash functions: BLAKE2b (NEON)" - depends on KERNEL_MODE_NEON - select CRYPTO_BLAKE2B - help - BLAKE2b cryptographic hash function (RFC 7693) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - - BLAKE2b digest algorithm optimized with ARM NEON instructions. - On ARM processors that have NEON support but not the ARMv8 - Crypto Extensions, typically this BLAKE2b implementation is - much faster than the SHA-2 family and slightly faster than - SHA-1. - config CRYPTO_AES_ARM tristate "Ciphers: AES" select CRYPTO_ALGAPI diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 6346a73effc0..86dd43313dbf 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o -obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o @@ -13,7 +12,6 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o aes-arm-y := aes-cipher-core.o aes-cipher-glue.o aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o -blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o diff --git a/arch/arm/crypto/blake2b-neon-core.S b/arch/arm/crypto/blake2b-neon-core.S deleted file mode 100644 index 0406a186377f..000000000000 --- a/arch/arm/crypto/blake2b-neon-core.S +++ /dev/null @@ -1,347 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * BLAKE2b digest algorithm, NEON accelerated - * - * Copyright 2020 Google LLC - * - * Author: Eric Biggers <ebiggers@google.com> - */ - -#include <linux/linkage.h> - - .text - .fpu neon - - // The arguments to blake2b_compress_neon() - STATE .req r0 - BLOCK .req r1 - NBLOCKS .req r2 - INC .req r3 - - // Pointers to the rotation tables - ROR24_TABLE .req r4 - ROR16_TABLE .req r5 - - // The original stack pointer - ORIG_SP .req r6 - - // NEON registers which contain the message words of the current block. - // M_0-M_3 are occasionally used for other purposes too. - M_0 .req d16 - M_1 .req d17 - M_2 .req d18 - M_3 .req d19 - M_4 .req d20 - M_5 .req d21 - M_6 .req d22 - M_7 .req d23 - M_8 .req d24 - M_9 .req d25 - M_10 .req d26 - M_11 .req d27 - M_12 .req d28 - M_13 .req d29 - M_14 .req d30 - M_15 .req d31 - - .align 4 - // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8 - // instruction. This is the most efficient way to implement these - // rotation amounts with NEON. (On Cortex-A53 it's the same speed as - // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.) -.Lror24_table: - .byte 3, 4, 5, 6, 7, 0, 1, 2 -.Lror16_table: - .byte 2, 3, 4, 5, 6, 7, 0, 1 - // The BLAKE2b initialization vector -.Lblake2b_IV: - .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b - .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 - .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f - .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 - -// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the -// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). 
The stack -// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9 -// (M_0-M_3), so that they can be reloaded if they are used as temporary -// registers. The macro arguments s0-s15 give the order in which the message -// words are used in this round. 'final' is 1 if this is the final round. -.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \ - s8, s9, s10, s11, s12, s13, s14, s15, final=0 - - // Mix the columns: - // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]), - // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]). - - // a += b + m[blake2b_sigma[r][2*i + 0]]; - vadd.u64 q0, q0, q2 - vadd.u64 q1, q1, q3 - vadd.u64 d0, d0, M_\s0 - vadd.u64 d1, d1, M_\s2 - vadd.u64 d2, d2, M_\s4 - vadd.u64 d3, d3, M_\s6 - - // d = ror64(d ^ a, 32); - veor q6, q6, q0 - veor q7, q7, q1 - vrev64.32 q6, q6 - vrev64.32 q7, q7 - - // c += d; - vadd.u64 q4, q4, q6 - vadd.u64 q5, q5, q7 - - // b = ror64(b ^ c, 24); - vld1.8 {M_0}, [ROR24_TABLE, :64] - veor q2, q2, q4 - veor q3, q3, q5 - vtbl.8 d4, {d4}, M_0 - vtbl.8 d5, {d5}, M_0 - vtbl.8 d6, {d6}, M_0 - vtbl.8 d7, {d7}, M_0 - - // a += b + m[blake2b_sigma[r][2*i + 1]]; - // - // M_0 got clobbered above, so we have to reload it if any of the four - // message words this step needs happens to be M_0. Otherwise we don't - // need to reload it here, as it will just get clobbered again below. -.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0 - vld1.8 {M_0}, [sp, :64] -.endif - vadd.u64 q0, q0, q2 - vadd.u64 q1, q1, q3 - vadd.u64 d0, d0, M_\s1 - vadd.u64 d1, d1, M_\s3 - vadd.u64 d2, d2, M_\s5 - vadd.u64 d3, d3, M_\s7 - - // d = ror64(d ^ a, 16); - vld1.8 {M_0}, [ROR16_TABLE, :64] - veor q6, q6, q0 - veor q7, q7, q1 - vtbl.8 d12, {d12}, M_0 - vtbl.8 d13, {d13}, M_0 - vtbl.8 d14, {d14}, M_0 - vtbl.8 d15, {d15}, M_0 - - // c += d; - vadd.u64 q4, q4, q6 - vadd.u64 q5, q5, q7 - - // b = ror64(b ^ c, 63); - // - // This rotation amount isn't a multiple of 8, so it has to be - // implemented using a pair of shifts, which requires temporary - // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards. - veor q8, q2, q4 - veor q9, q3, q5 - vshr.u64 q2, q8, #63 - vshr.u64 q3, q9, #63 - vsli.u64 q2, q8, #1 - vsli.u64 q3, q9, #1 - vld1.8 {q8-q9}, [sp, :256] - - // Mix the diagonals: - // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]), - // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]). - // - // There are two possible ways to do this: use 'vext' instructions to - // shift the rows of the matrix so that the diagonals become columns, - // and undo it afterwards; or just use 64-bit operations on 'd' - // registers instead of 128-bit operations on 'q' registers. We use the - // latter approach, as it performs much better on Cortex-A7. 
- - // a += b + m[blake2b_sigma[r][2*i + 0]]; - vadd.u64 d0, d0, d5 - vadd.u64 d1, d1, d6 - vadd.u64 d2, d2, d7 - vadd.u64 d3, d3, d4 - vadd.u64 d0, d0, M_\s8 - vadd.u64 d1, d1, M_\s10 - vadd.u64 d2, d2, M_\s12 - vadd.u64 d3, d3, M_\s14 - - // d = ror64(d ^ a, 32); - veor d15, d15, d0 - veor d12, d12, d1 - veor d13, d13, d2 - veor d14, d14, d3 - vrev64.32 d15, d15 - vrev64.32 d12, d12 - vrev64.32 d13, d13 - vrev64.32 d14, d14 - - // c += d; - vadd.u64 d10, d10, d15 - vadd.u64 d11, d11, d12 - vadd.u64 d8, d8, d13 - vadd.u64 d9, d9, d14 - - // b = ror64(b ^ c, 24); - vld1.8 {M_0}, [ROR24_TABLE, :64] - veor d5, d5, d10 - veor d6, d6, d11 - veor d7, d7, d8 - veor d4, d4, d9 - vtbl.8 d5, {d5}, M_0 - vtbl.8 d6, {d6}, M_0 - vtbl.8 d7, {d7}, M_0 - vtbl.8 d4, {d4}, M_0 - - // a += b + m[blake2b_sigma[r][2*i + 1]]; -.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0 - vld1.8 {M_0}, [sp, :64] -.endif - vadd.u64 d0, d0, d5 - vadd.u64 d1, d1, d6 - vadd.u64 d2, d2, d7 - vadd.u64 d3, d3, d4 - vadd.u64 d0, d0, M_\s9 - vadd.u64 d1, d1, M_\s11 - vadd.u64 d2, d2, M_\s13 - vadd.u64 d3, d3, M_\s15 - - // d = ror64(d ^ a, 16); - vld1.8 {M_0}, [ROR16_TABLE, :64] - veor d15, d15, d0 - veor d12, d12, d1 - veor d13, d13, d2 - veor d14, d14, d3 - vtbl.8 d12, {d12}, M_0 - vtbl.8 d13, {d13}, M_0 - vtbl.8 d14, {d14}, M_0 - vtbl.8 d15, {d15}, M_0 - - // c += d; - vadd.u64 d10, d10, d15 - vadd.u64 d11, d11, d12 - vadd.u64 d8, d8, d13 - vadd.u64 d9, d9, d14 - - // b = ror64(b ^ c, 63); - veor d16, d4, d9 - veor d17, d5, d10 - veor d18, d6, d11 - veor d19, d7, d8 - vshr.u64 q2, q8, #63 - vshr.u64 q3, q9, #63 - vsli.u64 q2, q8, #1 - vsli.u64 q3, q9, #1 - // Reloading q8-q9 can be skipped on the final round. -.if ! \final - vld1.8 {q8-q9}, [sp, :256] -.endif -.endm - -// -// void blake2b_compress_neon(struct blake2b_state *state, -// const u8 *block, size_t nblocks, u32 inc); -// -// Only the first three fields of struct blake2b_state are used: -// u64 h[8]; (inout) -// u64 t[2]; (inout) -// u64 f[2]; (in) -// - .align 5 -ENTRY(blake2b_compress_neon) - push {r4-r10} - - // Allocate a 32-byte stack buffer that is 32-byte aligned. - mov ORIG_SP, sp - sub ip, sp, #32 - bic ip, ip, #31 - mov sp, ip - - adr ROR24_TABLE, .Lror24_table - adr ROR16_TABLE, .Lror16_table - - mov ip, STATE - vld1.64 {q0-q1}, [ip]! // Load h[0..3] - vld1.64 {q2-q3}, [ip]! // Load h[4..7] -.Lnext_block: - adr r10, .Lblake2b_IV - vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1] - vld1.64 {q4-q5}, [r10]! // Load IV[0..3] - vmov r7, r8, d28 // Copy t[0] to (r7, r8) - vld1.64 {q6-q7}, [r10] // Load IV[4..7] - adds r7, r7, INC // Increment counter - bcs .Lslow_inc_ctr - vmov.i32 d28[0], r7 - vst1.64 {d28}, [ip] // Update t[0] -.Linc_ctr_done: - - // Load the next message block and finish initializing the state matrix - // 'v'. Fortunately, there are exactly enough NEON registers to fit the - // entire state matrix in q0-q7 and the entire message block in q8-15. - // - // However, _blake2b_round also needs some extra registers for rotates, - // so we have to spill some registers. It's better to spill the message - // registers than the state registers, as the message doesn't change. - // Therefore we store a copy of the first 32 bytes of the message block - // (q8-q9) in an aligned buffer on the stack so that they can be - // reloaded when needed. (We could just reload directly from the - // message buffer, but it's faster to use aligned loads.) - vld1.8 {q8-q9}, [BLOCK]! - veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1] - vld1.8 {q10-q11}, [BLOCK]! 
- veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1] - vld1.8 {q12-q13}, [BLOCK]! - vst1.8 {q8-q9}, [sp, :256] - mov ip, STATE - vld1.8 {q14-q15}, [BLOCK]! - - // Execute the rounds. Each round is provided the order in which it - // needs to use the message words. - _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 - _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 - _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 - _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 - _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 - _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 - _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 - _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 - _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 - _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \ - final=1 - - // Fold the final state matrix into the hash chaining value: - // - // for (i = 0; i < 8; i++) - // h[i] ^= v[i] ^ v[i + 8]; - // - vld1.64 {q8-q9}, [ip]! // Load old h[0..3] - veor q0, q0, q4 // v[0..1] ^= v[8..9] - veor q1, q1, q5 // v[2..3] ^= v[10..11] - vld1.64 {q10-q11}, [ip] // Load old h[4..7] - veor q2, q2, q6 // v[4..5] ^= v[12..13] - veor q3, q3, q7 // v[6..7] ^= v[14..15] - veor q0, q0, q8 // v[0..1] ^= h[0..1] - veor q1, q1, q9 // v[2..3] ^= h[2..3] - mov ip, STATE - subs NBLOCKS, NBLOCKS, #1 // nblocks-- - vst1.64 {q0-q1}, [ip]! // Store new h[0..3] - veor q2, q2, q10 // v[4..5] ^= h[4..5] - veor q3, q3, q11 // v[6..7] ^= h[6..7] - vst1.64 {q2-q3}, [ip]! // Store new h[4..7] - - // Advance to the next block, if there is one. - bne .Lnext_block // nblocks != 0? - - mov sp, ORIG_SP - pop {r4-r10} - mov pc, lr - -.Lslow_inc_ctr: - // Handle the case where the counter overflowed its low 32 bits, by - // carrying the overflow bit into the full 128-bit counter. 
- vmov r9, r10, d29 - adcs r8, r8, #0 - adcs r9, r9, #0 - adc r10, r10, #0 - vmov d28, r7, r8 - vmov d29, r9, r10 - vst1.64 {q14}, [ip] // Update t[0] and t[1] - b .Linc_ctr_done -ENDPROC(blake2b_compress_neon) diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c deleted file mode 100644 index 2ff443a91724..000000000000 --- a/arch/arm/crypto/blake2b-neon-glue.c +++ /dev/null @@ -1,104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * BLAKE2b digest algorithm, NEON accelerated - * - * Copyright 2020 Google LLC - */ - -#include <crypto/internal/blake2b.h> -#include <crypto/internal/hash.h> - -#include <linux/module.h> -#include <linux/sizes.h> - -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void blake2b_compress_neon(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc); - -static void blake2b_compress_arch(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc) -{ - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2B_BLOCK_SIZE); - - kernel_neon_begin(); - blake2b_compress_neon(state, block, blocks, inc); - kernel_neon_end(); - - nblocks -= blocks; - block += blocks * BLAKE2B_BLOCK_SIZE; - } while (nblocks); -} - -static int crypto_blake2b_update_neon(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch); -} - -static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out) -{ - return crypto_blake2b_finup(desc, in, inlen, out, - blake2b_compress_arch); -} - -#define BLAKE2B_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 200, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY | \ - CRYPTO_AHASH_ALG_BLOCK_ONLY | \ - CRYPTO_AHASH_ALG_FINAL_NONZERO, \ - .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2b_setkey, \ - .init = crypto_blake2b_init, \ - .update = crypto_blake2b_update_neon, \ - .finup = crypto_blake2b_finup_neon, \ - .descsize = sizeof(struct blake2b_state), \ - .statesize = BLAKE2B_STATE_SIZE, \ - } - -static struct shash_alg blake2b_neon_algs[] = { - BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE), - BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE), - BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE), - BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE), -}; - -static int __init blake2b_neon_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - - return crypto_register_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); -} - -static void __exit blake2b_neon_mod_exit(void) -{ - crypto_unregister_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); -} - -module_init(blake2b_neon_mod_init); -module_exit(blake2b_neon_mod_exit); - -MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated"); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); -MODULE_ALIAS_CRYPTO("blake2b-160"); -MODULE_ALIAS_CRYPTO("blake2b-160-neon"); -MODULE_ALIAS_CRYPTO("blake2b-256"); -MODULE_ALIAS_CRYPTO("blake2b-256-neon"); -MODULE_ALIAS_CRYPTO("blake2b-384"); -MODULE_ALIAS_CRYPTO("blake2b-384-neon"); -MODULE_ALIAS_CRYPTO("blake2b-512"); -MODULE_ALIAS_CRYPTO("blake2b-512-neon"); diff --git 
a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 1a48faad2473..997fa7cd9de5 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1783,10 +1783,10 @@ CONFIG_CRYPTO_CHACHA20=m CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_GHASH_ARM64_CE=y -CONFIG_CRYPTO_SHA3_ARM64=m CONFIG_CRYPTO_SM3_ARM64_CE=m CONFIG_CRYPTO_AES_ARM64_CE_BLK=y CONFIG_CRYPTO_AES_ARM64_BS=m diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 91f3093eee6a..bdd276a6e540 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -25,17 +25,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm64 using: - NEON (Advanced SIMD) extensions -config CRYPTO_SHA3_ARM64 - tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_SHA3 - help - SHA-3 secure hash algorithms (FIPS 202) - - Architecture: arm64 using: - - ARMv8.2 Crypto Extensions - config CRYPTO_SM3_NEON tristate "Hash functions: SM3 (NEON)" depends on KERNEL_MODE_NEON @@ -58,16 +47,6 @@ config CRYPTO_SM3_ARM64_CE Architecture: arm64 using: - ARMv8.2 Crypto Extensions -config CRYPTO_POLYVAL_ARM64_CE - tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_POLYVAL - help - POLYVAL hash function for HCTR2 - - Architecture: arm64 using: - - ARMv8 Crypto Extensions - config CRYPTO_AES_ARM64 tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS" select CRYPTO_AES diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index a8b2cdbe202c..1e330aa08d3f 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -5,9 +5,6 @@ # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> # -obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o -sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o - obj-$(CONFIG_CRYPTO_SM3_NEON) += sm3-neon.o sm3-neon-y := sm3-neon-glue.o sm3-neon-core.o @@ -32,9 +29,6 @@ sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o -obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o -polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o - obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm64/crypto/polyval-ce-core.S b/arch/arm64/crypto/polyval-ce-core.S deleted file mode 100644 index b5326540d2e3..000000000000 --- a/arch/arm64/crypto/polyval-ce-core.S +++ /dev/null @@ -1,361 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Implementation of POLYVAL using ARMv8 Crypto Extensions. - * - * Copyright 2021 Google LLC - */ -/* - * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions - * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8, - * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split - * finite field multiplication into two steps. - * - * In the first step, we consider h^i, m_i as normal polynomials of degree less - * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication - * is simply polynomial multiplication. - * - * In the second step, we compute the reduction of p(x) modulo the finite field - * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. - * - * This two step process is equivalent to computing h^8m_0 + ... 
+ h^1m_7 where - * multiplication is finite field multiplication. The advantage is that the - * two-step process only requires 1 finite field reduction for every 8 - * polynomial multiplications. Further parallelism is gained by interleaving the - * multiplications and polynomial reductions. - */ - -#include <linux/linkage.h> -#define STRIDE_BLOCKS 8 - -KEY_POWERS .req x0 -MSG .req x1 -BLOCKS_LEFT .req x2 -ACCUMULATOR .req x3 -KEY_START .req x10 -EXTRA_BYTES .req x11 -TMP .req x13 - -M0 .req v0 -M1 .req v1 -M2 .req v2 -M3 .req v3 -M4 .req v4 -M5 .req v5 -M6 .req v6 -M7 .req v7 -KEY8 .req v8 -KEY7 .req v9 -KEY6 .req v10 -KEY5 .req v11 -KEY4 .req v12 -KEY3 .req v13 -KEY2 .req v14 -KEY1 .req v15 -PL .req v16 -PH .req v17 -TMP_V .req v18 -LO .req v20 -MI .req v21 -HI .req v22 -SUM .req v23 -GSTAR .req v24 - - .text - - .arch armv8-a+crypto - .align 4 - -.Lgstar: - .quad 0xc200000000000000, 0xc200000000000000 - -/* - * Computes the product of two 128-bit polynomials in X and Y and XORs the - * components of the 256-bit product into LO, MI, HI. - * - * Given: - * X = [X_1 : X_0] - * Y = [Y_1 : Y_0] - * - * We compute: - * LO += X_0 * Y_0 - * MI += (X_0 + X_1) * (Y_0 + Y_1) - * HI += X_1 * Y_1 - * - * Later, the 256-bit result can be extracted as: - * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0] - * This step is done when computing the polynomial reduction for efficiency - * reasons. - * - * Karatsuba multiplication is used instead of Schoolbook multiplication because - * it was found to be slightly faster on ARM64 CPUs. - * - */ -.macro karatsuba1 X Y - X .req \X - Y .req \Y - ext v25.16b, X.16b, X.16b, #8 - ext v26.16b, Y.16b, Y.16b, #8 - eor v25.16b, v25.16b, X.16b - eor v26.16b, v26.16b, Y.16b - pmull2 v28.1q, X.2d, Y.2d - pmull v29.1q, X.1d, Y.1d - pmull v27.1q, v25.1d, v26.1d - eor HI.16b, HI.16b, v28.16b - eor LO.16b, LO.16b, v29.16b - eor MI.16b, MI.16b, v27.16b - .unreq X - .unreq Y -.endm - -/* - * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into - * them. - */ -.macro karatsuba1_store X Y - X .req \X - Y .req \Y - ext v25.16b, X.16b, X.16b, #8 - ext v26.16b, Y.16b, Y.16b, #8 - eor v25.16b, v25.16b, X.16b - eor v26.16b, v26.16b, Y.16b - pmull2 HI.1q, X.2d, Y.2d - pmull LO.1q, X.1d, Y.1d - pmull MI.1q, v25.1d, v26.1d - .unreq X - .unreq Y -.endm - -/* - * Computes the 256-bit polynomial represented by LO, HI, MI. Stores - * the result in PL, PH. - * [PH : PL] = - * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0] - */ -.macro karatsuba2 - // v4 = [HI_1 + MI_1 : HI_0 + MI_0] - eor v4.16b, HI.16b, MI.16b - // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0] - eor v4.16b, v4.16b, LO.16b - // v5 = [HI_0 : LO_1] - ext v5.16b, LO.16b, HI.16b, #8 - // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0] - eor v4.16b, v4.16b, v5.16b - // HI = [HI_0 : HI_1] - ext HI.16b, HI.16b, HI.16b, #8 - // LO = [LO_0 : LO_1] - ext LO.16b, LO.16b, LO.16b, #8 - // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1] - ext PH.16b, v4.16b, HI.16b, #8 - // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0] - ext PL.16b, LO.16b, v4.16b, #8 -.endm - -/* - * Computes the 128-bit reduction of PH : PL. Stores the result in dest. - * - * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = - * x^128 + x^127 + x^126 + x^121 + 1. - * - * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the - * product of two 128-bit polynomials in Montgomery form. We need to reduce it - * mod g(x). 
Also, since polynomials in Montgomery form have an "extra" factor - * of x^128, this product has two extra factors of x^128. To get it back into - * Montgomery form, we need to remove one of these factors by dividing by x^128. - * - * To accomplish both of these goals, we add multiples of g(x) that cancel out - * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low - * bits are zero, the polynomial division by x^128 can be done by right - * shifting. - * - * Since the only nonzero term in the low 64 bits of g(x) is the constant term, - * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can - * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + - * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to - * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T - * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. - * - * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits - * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 - * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * - * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : - * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). - * - * So our final computation is: - * T = T_1 : T_0 = g*(x) * P_0 - * V = V_1 : V_0 = g*(x) * (P_1 + T_0) - * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 - * - * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 - * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : - * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. - */ -.macro montgomery_reduction dest - DEST .req \dest - // TMP_V = T_1 : T_0 = P_0 * g*(x) - pmull TMP_V.1q, PL.1d, GSTAR.1d - // TMP_V = T_0 : T_1 - ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 - // TMP_V = P_1 + T_0 : P_0 + T_1 - eor TMP_V.16b, PL.16b, TMP_V.16b - // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 - eor PH.16b, PH.16b, TMP_V.16b - // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x) - pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d - eor DEST.16b, PH.16b, TMP_V.16b - .unreq DEST -.endm - -/* - * Compute Polyval on 8 blocks. - * - * If reduce is set, also computes the montgomery reduction of the - * previous full_stride call and XORs with the first message block. - * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. - * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. - * - * Sets PL, PH. - */ -.macro full_stride reduce - eor LO.16b, LO.16b, LO.16b - eor MI.16b, MI.16b, MI.16b - eor HI.16b, HI.16b, HI.16b - - ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 - ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64 - - karatsuba1 M7 KEY1 - .if \reduce - pmull TMP_V.1q, PL.1d, GSTAR.1d - .endif - - karatsuba1 M6 KEY2 - .if \reduce - ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 - .endif - - karatsuba1 M5 KEY3 - .if \reduce - eor TMP_V.16b, PL.16b, TMP_V.16b - .endif - - karatsuba1 M4 KEY4 - .if \reduce - eor PH.16b, PH.16b, TMP_V.16b - .endif - - karatsuba1 M3 KEY5 - .if \reduce - pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d - .endif - - karatsuba1 M2 KEY6 - .if \reduce - eor SUM.16b, PH.16b, TMP_V.16b - .endif - - karatsuba1 M1 KEY7 - eor M0.16b, M0.16b, SUM.16b - - karatsuba1 M0 KEY8 - karatsuba2 -.endm - -/* - * Handle any extra blocks after full_stride loop. 
- */ -.macro partial_stride - add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4) - sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4 - ld1 {KEY1.16b}, [KEY_POWERS], #16 - - ld1 {TMP_V.16b}, [MSG], #16 - eor SUM.16b, SUM.16b, TMP_V.16b - karatsuba1_store KEY1 SUM - sub BLOCKS_LEFT, BLOCKS_LEFT, #1 - - tst BLOCKS_LEFT, #4 - beq .Lpartial4BlocksDone - ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 - ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 - karatsuba1 M0 KEY8 - karatsuba1 M1 KEY7 - karatsuba1 M2 KEY6 - karatsuba1 M3 KEY5 -.Lpartial4BlocksDone: - tst BLOCKS_LEFT, #2 - beq .Lpartial2BlocksDone - ld1 {M0.16b, M1.16b}, [MSG], #32 - ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32 - karatsuba1 M0 KEY8 - karatsuba1 M1 KEY7 -.Lpartial2BlocksDone: - tst BLOCKS_LEFT, #1 - beq .LpartialDone - ld1 {M0.16b}, [MSG], #16 - ld1 {KEY8.16b}, [KEY_POWERS], #16 - karatsuba1 M0 KEY8 -.LpartialDone: - karatsuba2 - montgomery_reduction SUM -.endm - -/* - * Perform montgomery multiplication in GF(2^128) and store result in op1. - * - * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 - * If op1, op2 are in montgomery form, this computes the montgomery - * form of op1*op2. - * - * void pmull_polyval_mul(u8 *op1, const u8 *op2); - */ -SYM_FUNC_START(pmull_polyval_mul) - adr TMP, .Lgstar - ld1 {GSTAR.2d}, [TMP] - ld1 {v0.16b}, [x0] - ld1 {v1.16b}, [x1] - karatsuba1_store v0 v1 - karatsuba2 - montgomery_reduction SUM - st1 {SUM.16b}, [x0] - ret -SYM_FUNC_END(pmull_polyval_mul) - -/* - * Perform polynomial evaluation as specified by POLYVAL. This computes: - * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} - * where n=nblocks, h is the hash key, and m_i are the message blocks. - * - * x0 - pointer to precomputed key powers h^8 ... h^1 - * x1 - pointer to message blocks - * x2 - number of blocks to hash - * x3 - pointer to accumulator - * - * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, - * size_t nblocks, u8 *accumulator); - */ -SYM_FUNC_START(pmull_polyval_update) - adr TMP, .Lgstar - mov KEY_START, KEY_POWERS - ld1 {GSTAR.2d}, [TMP] - ld1 {SUM.16b}, [ACCUMULATOR] - subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS - blt .LstrideLoopExit - ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 - ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64 - full_stride 0 - subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS - blt .LstrideLoopExitReduce -.LstrideLoop: - full_stride 1 - subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS - bge .LstrideLoop -.LstrideLoopExitReduce: - montgomery_reduction SUM -.LstrideLoopExit: - adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS - beq .LskipPartial - partial_stride -.LskipPartial: - st1 {SUM.16b}, [ACCUMULATOR] - ret -SYM_FUNC_END(pmull_polyval_update) diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c deleted file mode 100644 index c4e653688ea0..000000000000 --- a/arch/arm64/crypto/polyval-ce-glue.c +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Glue code for POLYVAL using ARMv8 Crypto Extensions - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * Copyright 2021 Google LLC - */ - -/* - * Glue code based on ghash-clmulni-intel_glue.c. - * - * This implementation of POLYVAL uses montgomery multiplication accelerated by - * ARMv8 Crypto Extensions instructions to implement the finite field operations. 
- */ - -#include <asm/neon.h> -#include <crypto/internal/hash.h> -#include <crypto/polyval.h> -#include <crypto/utils.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#define NUM_KEY_POWERS 8 - -struct polyval_tfm_ctx { - /* - * These powers must be in the order h^8, ..., h^1. - */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; -}; - -struct polyval_desc_ctx { - u8 buffer[POLYVAL_BLOCK_SIZE]; -}; - -asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator); -asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); - -static void internal_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator) -{ - kernel_neon_begin(); - pmull_polyval_update(keys, in, nblocks, accumulator); - kernel_neon_end(); -} - -static void internal_polyval_mul(u8 *op1, const u8 *op2) -{ - kernel_neon_begin(); - pmull_polyval_mul(op1, op2); - kernel_neon_end(); -} - -static int polyval_arm64_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); - int i; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); - - for (i = NUM_KEY_POWERS-2; i >= 0; i--) { - memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); - internal_polyval_mul(tctx->key_powers[i], - tctx->key_powers[i+1]); - } - - return 0; -} - -static int polyval_arm64_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_arm64_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - unsigned int nblocks; - - do { - /* allow rescheduling every 4K bytes */ - nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; - internal_polyval_update(tctx, src, nblocks, dctx->buffer); - srclen -= nblocks * POLYVAL_BLOCK_SIZE; - src += nblocks * POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_arm64_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - - if (len) { - crypto_xor(dctx->buffer, src, len); - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); - - return 0; -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_arm64_init, - .update = polyval_arm64_update, - .finup = polyval_arm64_finup, - .setkey = polyval_arm64_setkey, - .descsize = sizeof(struct polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct polyval_tfm_ctx), - .cra_module = THIS_MODULE, - }, -}; - -static int __init polyval_ce_mod_init(void) -{ - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_ce_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_cpu_feature_match(PMULL, polyval_ce_mod_init) -module_exit(polyval_ce_mod_exit); - -MODULE_LICENSE("GPL"); 
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-ce"); diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S deleted file mode 100644 index 9c77313f5a60..000000000000 --- a/arch/arm64/crypto/sha3-ce-core.S +++ /dev/null @@ -1,212 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 - .set .Lv\b\().2d, \b - .set .Lv\b\().16b, \b - .endr - - /* - * ARMv8.2 Crypto Extensions instructions - */ - .macro eor3, rd, rn, rm, ra - .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) - .endm - - .macro rax1, rd, rn, rm - .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - .macro bcax, rd, rn, rm, ra - .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) - .endm - - .macro xar, rd, rn, rm, imm6 - .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16) - .endm - - /* - * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) - */ - .text -SYM_FUNC_START(sha3_ce_transform) - /* load state */ - add x8, x0, #32 - ld1 { v0.1d- v3.1d}, [x0] - ld1 { v4.1d- v7.1d}, [x8], #32 - ld1 { v8.1d-v11.1d}, [x8], #32 - ld1 {v12.1d-v15.1d}, [x8], #32 - ld1 {v16.1d-v19.1d}, [x8], #32 - ld1 {v20.1d-v23.1d}, [x8], #32 - ld1 {v24.1d}, [x8] - -0: sub w2, w2, #1 - mov w8, #24 - adr_l x9, .Lsha3_rcon - - /* load input */ - ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b-v31.8b}, [x1], #24 - eor v0.8b, v0.8b, v25.8b - eor v1.8b, v1.8b, v26.8b - eor v2.8b, v2.8b, v27.8b - eor v3.8b, v3.8b, v28.8b - eor v4.8b, v4.8b, v29.8b - eor v5.8b, v5.8b, v30.8b - eor v6.8b, v6.8b, v31.8b - - tbnz x3, #6, 2f // SHA3-512 - - ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b-v30.8b}, [x1], #16 - eor v7.8b, v7.8b, v25.8b - eor v8.8b, v8.8b, v26.8b - eor v9.8b, v9.8b, v27.8b - eor v10.8b, v10.8b, v28.8b - eor v11.8b, v11.8b, v29.8b - eor v12.8b, v12.8b, v30.8b - - tbnz x3, #4, 1f // SHA3-384 or SHA3-224 - - // SHA3-256 - ld1 {v25.8b-v28.8b}, [x1], #32 - eor v13.8b, v13.8b, v25.8b - eor v14.8b, v14.8b, v26.8b - eor v15.8b, v15.8b, v27.8b - eor v16.8b, v16.8b, v28.8b - b 3f - -1: tbz x3, #2, 3f // bit 2 cleared? 
SHA-384 - - // SHA3-224 - ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b}, [x1], #8 - eor v13.8b, v13.8b, v25.8b - eor v14.8b, v14.8b, v26.8b - eor v15.8b, v15.8b, v27.8b - eor v16.8b, v16.8b, v28.8b - eor v17.8b, v17.8b, v29.8b - b 3f - - // SHA3-512 -2: ld1 {v25.8b-v26.8b}, [x1], #16 - eor v7.8b, v7.8b, v25.8b - eor v8.8b, v8.8b, v26.8b - -3: sub w8, w8, #1 - - eor3 v29.16b, v4.16b, v9.16b, v14.16b - eor3 v26.16b, v1.16b, v6.16b, v11.16b - eor3 v28.16b, v3.16b, v8.16b, v13.16b - eor3 v25.16b, v0.16b, v5.16b, v10.16b - eor3 v27.16b, v2.16b, v7.16b, v12.16b - eor3 v29.16b, v29.16b, v19.16b, v24.16b - eor3 v26.16b, v26.16b, v16.16b, v21.16b - eor3 v28.16b, v28.16b, v18.16b, v23.16b - eor3 v25.16b, v25.16b, v15.16b, v20.16b - eor3 v27.16b, v27.16b, v17.16b, v22.16b - - rax1 v30.2d, v29.2d, v26.2d // bc[0] - rax1 v26.2d, v26.2d, v28.2d // bc[2] - rax1 v28.2d, v28.2d, v25.2d // bc[4] - rax1 v25.2d, v25.2d, v27.2d // bc[1] - rax1 v27.2d, v27.2d, v29.2d // bc[3] - - eor v0.16b, v0.16b, v30.16b - xar v29.2d, v1.2d, v25.2d, (64 - 1) - xar v1.2d, v6.2d, v25.2d, (64 - 44) - xar v6.2d, v9.2d, v28.2d, (64 - 20) - xar v9.2d, v22.2d, v26.2d, (64 - 61) - xar v22.2d, v14.2d, v28.2d, (64 - 39) - xar v14.2d, v20.2d, v30.2d, (64 - 18) - xar v31.2d, v2.2d, v26.2d, (64 - 62) - xar v2.2d, v12.2d, v26.2d, (64 - 43) - xar v12.2d, v13.2d, v27.2d, (64 - 25) - xar v13.2d, v19.2d, v28.2d, (64 - 8) - xar v19.2d, v23.2d, v27.2d, (64 - 56) - xar v23.2d, v15.2d, v30.2d, (64 - 41) - xar v15.2d, v4.2d, v28.2d, (64 - 27) - xar v28.2d, v24.2d, v28.2d, (64 - 14) - xar v24.2d, v21.2d, v25.2d, (64 - 2) - xar v8.2d, v8.2d, v27.2d, (64 - 55) - xar v4.2d, v16.2d, v25.2d, (64 - 45) - xar v16.2d, v5.2d, v30.2d, (64 - 36) - xar v5.2d, v3.2d, v27.2d, (64 - 28) - xar v27.2d, v18.2d, v27.2d, (64 - 21) - xar v3.2d, v17.2d, v26.2d, (64 - 15) - xar v25.2d, v11.2d, v25.2d, (64 - 10) - xar v26.2d, v7.2d, v26.2d, (64 - 6) - xar v30.2d, v10.2d, v30.2d, (64 - 3) - - bcax v20.16b, v31.16b, v22.16b, v8.16b - bcax v21.16b, v8.16b, v23.16b, v22.16b - bcax v22.16b, v22.16b, v24.16b, v23.16b - bcax v23.16b, v23.16b, v31.16b, v24.16b - bcax v24.16b, v24.16b, v8.16b, v31.16b - - ld1r {v31.2d}, [x9], #8 - - bcax v17.16b, v25.16b, v19.16b, v3.16b - bcax v18.16b, v3.16b, v15.16b, v19.16b - bcax v19.16b, v19.16b, v16.16b, v15.16b - bcax v15.16b, v15.16b, v25.16b, v16.16b - bcax v16.16b, v16.16b, v3.16b, v25.16b - - bcax v10.16b, v29.16b, v12.16b, v26.16b - bcax v11.16b, v26.16b, v13.16b, v12.16b - bcax v12.16b, v12.16b, v14.16b, v13.16b - bcax v13.16b, v13.16b, v29.16b, v14.16b - bcax v14.16b, v14.16b, v26.16b, v29.16b - - bcax v7.16b, v30.16b, v9.16b, v4.16b - bcax v8.16b, v4.16b, v5.16b, v9.16b - bcax v9.16b, v9.16b, v6.16b, v5.16b - bcax v5.16b, v5.16b, v30.16b, v6.16b - bcax v6.16b, v6.16b, v4.16b, v30.16b - - bcax v3.16b, v27.16b, v0.16b, v28.16b - bcax v4.16b, v28.16b, v1.16b, v0.16b - bcax v0.16b, v0.16b, v2.16b, v1.16b - bcax v1.16b, v1.16b, v27.16b, v2.16b - bcax v2.16b, v2.16b, v28.16b, v27.16b - - eor v0.16b, v0.16b, v31.16b - - cbnz w8, 3b - cond_yield 4f, x8, x9 - cbnz w2, 0b - - /* save state */ -4: st1 { v0.1d- v3.1d}, [x0], #32 - st1 { v4.1d- v7.1d}, [x0], #32 - st1 { v8.1d-v11.1d}, [x0], #32 - st1 {v12.1d-v15.1d}, [x0], #32 - st1 {v16.1d-v19.1d}, [x0], #32 - st1 {v20.1d-v23.1d}, [x0], #32 - st1 {v24.1d}, [x0] - mov w0, w2 - ret -SYM_FUNC_END(sha3_ce_transform) - - .section ".rodata", "a" - .align 8 -.Lsha3_rcon: - .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a - .quad 0x8000000080008000, 0x000000000000808b, 
0x0000000080000001 - .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a - .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a - .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089 - .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080 - .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081 - .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c deleted file mode 100644 index b4f1001046c9..000000000000 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/unaligned.h> - -MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-384"); -MODULE_ALIAS_CRYPTO("sha3-512"); - -asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks, - int md_len); - -static int sha3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - struct crypto_shash *tfm = desc->tfm; - unsigned int bs, ds; - int blocks; - - ds = crypto_shash_digestsize(tfm); - bs = crypto_shash_blocksize(tfm); - blocks = len / bs; - len -= blocks * bs; - do { - int rem; - - kernel_neon_begin(); - rem = sha3_ce_transform(sctx->st, data, blocks, ds); - kernel_neon_end(); - data += (blocks - rem) * bs; - blocks = rem; - } while (blocks); - return len; -} - -static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - struct crypto_shash *tfm = desc->tfm; - __le64 *digest = (__le64 *)out; - u8 block[SHA3_224_BLOCK_SIZE]; - unsigned int bs, ds; - int i; - - ds = crypto_shash_digestsize(tfm); - bs = crypto_shash_blocksize(tfm); - memcpy(block, src, len); - - block[len++] = 0x06; - memset(block + len, 0, bs - len); - block[bs - 1] |= 0x80; - - kernel_neon_begin(); - sha3_ce_transform(sctx->st, block, 1, ds); - kernel_neon_end(); - memzero_explicit(block , sizeof(block)); - - for (i = 0; i < ds / 8; i++) - put_unaligned_le64(sctx->st[i], digest++); - - if (ds & 4) - put_unaligned_le32(sctx->st[i], (__le32 *)digest); - - return 0; -} - -static struct shash_alg algs[] = { { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-224", - .base.cra_driver_name = "sha3-224-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_256_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - 
.base.cra_name = "sha3-256", - .base.cra_driver_name = "sha3-256-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-384", - .base.cra_driver_name = "sha3-384-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_384_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-512", - .base.cra_driver_name = "sha3-512-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -} }; - -static int __init sha3_neon_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha3_neon_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA3, sha3_neon_mod_init); -module_exit(sha3_neon_mod_fini); diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 8433f769f7e1..1df484ed6329 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -796,6 +796,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m @@ -809,8 +810,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 4414dabd04a6..df89105dd520 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -780,6 +780,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m @@ -794,8 +795,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig index 03f73fbd38b6..f838ca055f6d 100644 --- a/arch/s390/crypto/Kconfig +++ b/arch/s390/crypto/Kconfig @@ -2,26 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (s390)" -config CRYPTO_SHA3_256_S390 - tristate "Hash functions: SHA3-224 and SHA3-256" - select CRYPTO_HASH - help - SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. - -config CRYPTO_SHA3_512_S390 - tristate "Hash functions: SHA3-384 and SHA3-512" - select CRYPTO_HASH - help - SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. 
- config CRYPTO_GHASH_S390 tristate "Hash functions: GHASH" select CRYPTO_HASH diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 998f4b656b18..387a229e1038 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -3,8 +3,6 @@ # Cryptographic API # -obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h deleted file mode 100644 index b9cd9572dd35..000000000000 --- a/arch/s390/crypto/sha.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#ifndef _CRYPTO_ARCH_S390_SHA_H -#define _CRYPTO_ARCH_S390_SHA_H - -#include <crypto/hash.h> -#include <crypto/sha2.h> -#include <crypto/sha3.h> -#include <linux/build_bug.h> -#include <linux/types.h> - -/* must be big enough for the largest SHA variant */ -#define CPACF_MAX_PARMBLOCK_SIZE SHA3_STATE_SIZE -#define SHA_MAX_BLOCK_SIZE SHA3_224_BLOCK_SIZE - -struct s390_sha_ctx { - u64 count; /* message length in bytes */ - union { - u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)]; - struct { - u64 state[SHA512_DIGEST_SIZE / sizeof(u64)]; - u64 count_hi; - } sha512; - struct { - __le64 state[SHA3_STATE_SIZE / sizeof(u64)]; - } sha3; - }; - int func; /* KIMD function to use */ - bool first_message_part; -}; - -struct shash_desc; - -int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, - unsigned int len); -int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out); - -static inline void __check_s390_sha_ctx_size(void) -{ - BUILD_BUG_ON(S390_SHA_CTX_SIZE != sizeof(struct s390_sha_ctx)); -} - -#endif diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c deleted file mode 100644 index 03bb4f4bab70..000000000000 --- a/arch/s390/crypto/sha3_256_s390.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. - * - * s390 Version: - * Copyright IBM Corp. 
2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include <asm/cpacf.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include "sha.h" - -static int sha3_256_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->first_message_part = test_facility(86); - if (!sctx->first_message_part) - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_256_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u64 *u64; - } p = { .u8 = out }; - int i; - - if (sctx->first_message_part) { - memset(out, 0, SHA3_STATE_SIZE); - return 0; - } - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++); - return 0; -} - -static int sha3_256_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++)); - sctx->count = 0; - sctx->first_message_part = 0; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_224_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_256_import(desc, in); - sctx->func = CPACF_KIMD_SHA3_224; - return 0; -} - -static struct shash_alg sha3_256_alg = { - .digestsize = SHA3_256_DIGEST_SIZE, /* = 32 */ - .init = sha3_256_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_256_export, - .import = sha3_256_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-256", - .cra_driver_name = "sha3-256-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int sha3_224_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_256_init(desc); - sctx->func = CPACF_KIMD_SHA3_224; - return 0; -} - -static struct shash_alg sha3_224_alg = { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = sha3_224_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_256_export, /* same as for 256 */ - .import = sha3_224_import, /* function code different! 
*/ - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-224", - .cra_driver_name = "sha3-224-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha3_256_s390_init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_256)) - return -ENODEV; - - ret = crypto_register_shash(&sha3_256_alg); - if (ret < 0) - goto out; - - ret = crypto_register_shash(&sha3_224_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_256_alg); -out: - return ret; -} - -static void __exit sha3_256_s390_fini(void) -{ - crypto_unregister_shash(&sha3_224_alg); - crypto_unregister_shash(&sha3_256_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); -module_exit(sha3_256_s390_fini); - -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-256 and SHA3-224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c deleted file mode 100644 index a5c9690eecb1..000000000000 --- a/arch/s390/crypto/sha3_512_s390.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA512 and SHA384 Secure Hash Algorithm. - * - * Copyright IBM Corp. 2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include <asm/cpacf.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include "sha.h" - -static int sha3_512_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->first_message_part = test_facility(86); - if (!sctx->first_message_part) - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_512_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u64 *u64; - } p = { .u8 = out }; - int i; - - if (sctx->first_message_part) { - memset(out, 0, SHA3_STATE_SIZE); - return 0; - } - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++); - return 0; -} - -static int sha3_512_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++)); - sctx->count = 0; - sctx->first_message_part = 0; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_384_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_512_import(desc, in); - sctx->func = CPACF_KIMD_SHA3_384; - return 0; -} - -static struct shash_alg sha3_512_alg = { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = sha3_512_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_512_export, - .import = sha3_512_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-512", - .cra_driver_name = "sha3-512-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; 
- -MODULE_ALIAS_CRYPTO("sha3-512"); - -static int sha3_384_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_512_init(desc); - sctx->func = CPACF_KIMD_SHA3_384; - return 0; -} - -static struct shash_alg sha3_384_alg = { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = sha3_384_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_512_export, /* same as for 512 */ - .import = sha3_384_import, /* function code different! */ - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-384", - .cra_driver_name = "sha3-384-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_384_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_sha_ctx), - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha3-384"); - -static int __init init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_512)) - return -ENODEV; - ret = crypto_register_shash(&sha3_512_alg); - if (ret < 0) - goto out; - ret = crypto_register_shash(&sha3_384_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_512_alg); -out: - return ret; -} - -static void __exit fini(void) -{ - crypto_unregister_shash(&sha3_512_alg); - crypto_unregister_shash(&sha3_384_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-512 and SHA3-384 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c deleted file mode 100644 index d6f839618794..000000000000 --- a/arch/s390/crypto/sha_common.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 
- * Author(s): Jan Glauber (jang@de.ibm.com)
- */
-
-#include <crypto/internal/hash.h>
-#include <linux/export.h>
-#include <linux/module.h>
-#include <asm/cpacf.h>
-#include "sha.h"
-
-int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data,
-			   unsigned int len)
-{
-	unsigned int bsize = crypto_shash_blocksize(desc->tfm);
-	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
-	unsigned int n;
-	int fc;
-
-	fc = ctx->func;
-	if (ctx->first_message_part)
-		fc |= CPACF_KIMD_NIP;
-
-	/* process as many blocks as possible */
-	n = (len / bsize) * bsize;
-	ctx->count += n;
-	switch (ctx->func) {
-	case CPACF_KLMD_SHA_512:
-	case CPACF_KLMD_SHA3_384:
-		if (ctx->count < n)
-			ctx->sha512.count_hi++;
-		break;
-	}
-	cpacf_kimd(fc, ctx->state, data, n);
-	ctx->first_message_part = 0;
-	return len - n;
-}
-EXPORT_SYMBOL_GPL(s390_sha_update_blocks);
-
-static int s390_crypto_shash_parmsize(int func)
-{
-	switch (func) {
-	case CPACF_KLMD_SHA_1:
-		return 20;
-	case CPACF_KLMD_SHA_256:
-		return 32;
-	case CPACF_KLMD_SHA_512:
-		return 64;
-	case CPACF_KLMD_SHA3_224:
-	case CPACF_KLMD_SHA3_256:
-	case CPACF_KLMD_SHA3_384:
-	case CPACF_KLMD_SHA3_512:
-		return 200;
-	default:
-		return -EINVAL;
-	}
-}
-
-int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len,
-		   u8 *out)
-{
-	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
-	int mbl_offset, fc;
-	u64 bits;
-
-	ctx->count += len;
-
-	bits = ctx->count * 8;
-	mbl_offset = s390_crypto_shash_parmsize(ctx->func);
-	if (mbl_offset < 0)
-		return -EINVAL;
-
-	mbl_offset = mbl_offset / sizeof(u32);
-
-	/* set total msg bit length (mbl) in CPACF parmblock */
-	switch (ctx->func) {
-	case CPACF_KLMD_SHA_512:
-		/* The SHA512 parmblock has a 128-bit mbl field. */
-		if (ctx->count < len)
-			ctx->sha512.count_hi++;
-		ctx->sha512.count_hi <<= 3;
-		ctx->sha512.count_hi |= ctx->count >> 61;
-		mbl_offset += sizeof(u64) / sizeof(u32);
-		fallthrough;
-	case CPACF_KLMD_SHA_1:
-	case CPACF_KLMD_SHA_256:
-		memcpy(ctx->state + mbl_offset, &bits, sizeof(bits));
-		break;
-	case CPACF_KLMD_SHA3_224:
-	case CPACF_KLMD_SHA3_256:
-	case CPACF_KLMD_SHA3_384:
-	case CPACF_KLMD_SHA3_512:
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	fc = ctx->func;
-	fc |= test_facility(86) ? CPACF_KLMD_DUFOP : 0;
-	if (ctx->first_message_part)
-		fc |= CPACF_KLMD_NIP;
-	cpacf_klmd(fc, ctx->state, src, len);
-
-	/* copy digest to out */
-	memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm));
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(s390_sha_finup);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("s390 SHA cipher common functions");
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 48d3076b6053..3fd2423d3cf8 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2
	  Architecture: x86_64 using:
	  - AVX2 (Advanced Vector Extensions 2)
 
-config CRYPTO_POLYVAL_CLMUL_NI
-	tristate "Hash functions: POLYVAL (CLMUL-NI)"
-	depends on 64BIT
-	select CRYPTO_POLYVAL
-	help
-	  POLYVAL hash function for HCTR2
-
-	  Architecture: x86_64 using:
-	  - CLMUL-NI (carry-less multiplication new instructions)
-
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
 	depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2d30d5d36145..4a24dd38da50 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -52,9 +52,6 @@ aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 
-obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
-polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-
 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S
deleted file mode 100644
index a6ebe4e7dd2b..000000000000
--- a/arch/x86/crypto/polyval-clmulni_asm.S
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2021 Google LLC
- */
-/*
- * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
- * instructions. It works on 8 blocks at a time, by precomputing the first 8
- * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
- * allows us to split finite field multiplication into two steps.
- *
- * In the first step, we consider h^i, m_i as normal polynomials of degree less
- * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
- * is simply polynomial multiplication.
- *
- * In the second step, we compute the reduction of p(x) modulo the finite field
- * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
- *
- * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
- * multiplication is finite field multiplication. The advantage is that the
- * two-step process only requires 1 finite field reduction for every 8
- * polynomial multiplications. Further parallelism is gained by interleaving the
- * multiplications and polynomial reductions.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STRIDE_BLOCKS 8
-
-#define GSTAR %xmm7
-#define PL %xmm8
-#define PH %xmm9
-#define TMP_XMM %xmm11
-#define LO %xmm12
-#define HI %xmm13
-#define MI %xmm14
-#define SUM %xmm15
-
-#define KEY_POWERS %rdi
-#define MSG %rsi
-#define BLOCKS_LEFT %rdx
-#define ACCUMULATOR %rcx
-#define TMP %rax
-
-.section .rodata.cst16.gstar, "aM", @progbits, 16
-.align 16
-
-.Lgstar:
-	.quad 0xc200000000000000, 0xc200000000000000
-
-.text
-
-/*
- * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
- * count pointed to by MSG and KEY_POWERS.
- */
-.macro schoolbook1 count
-	.set i, 0
-	.rept (\count)
-		schoolbook1_iteration i 0
-		.set i, (i +1)
-	.endr
-.endm
-
-/*
- * Computes the product of two 128-bit polynomials at the memory locations
- * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
- * the 256-bit product into LO, MI, HI.
- *
- * Given:
- * X = [X_1 : X_0]
- * Y = [Y_1 : Y_0]
- *
- * We compute:
- * LO += X_0 * Y_0
- * MI += X_0 * Y_1 + X_1 * Y_0
- * HI += X_1 * Y_1
- *
- * Later, the 256-bit result can be extracted as:
- * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- * This step is done when computing the polynomial reduction for efficiency
- * reasons.
- *
- * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
- * extra multiplication of SUM and h^8.
- */
-.macro schoolbook1_iteration i xor_sum
-	movups (16*\i)(MSG), %xmm0
-	.if (\i == 0 && \xor_sum == 1)
-		pxor SUM, %xmm0
-	.endif
-	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
-	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
-	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
-	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
-	vpxor %xmm2, MI, MI
-	vpxor %xmm1, LO, LO
-	vpxor %xmm4, HI, HI
-	vpxor %xmm3, MI, MI
-.endm
-
-/*
- * Performs the same computation as schoolbook1_iteration, except we expect the
- * arguments to already be loaded into xmm0 and xmm1 and we set the result
- * registers LO, MI, and HI directly rather than XOR'ing into them.
- */
-.macro schoolbook1_noload
-	vpclmulqdq $0x01, %xmm0, %xmm1, MI
-	vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
-	vpclmulqdq $0x00, %xmm0, %xmm1, LO
-	vpclmulqdq $0x11, %xmm0, %xmm1, HI
-	vpxor %xmm2, MI, MI
-.endm
-
-/*
- * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
- * the result in PL, PH.
- * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- */
-.macro schoolbook2
-	vpslldq $8, MI, PL
-	vpsrldq $8, MI, PH
-	pxor LO, PL
-	pxor HI, PH
-.endm
-
-/*
- * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
- *
- * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
- * x^128 + x^127 + x^126 + x^121 + 1.
- *
- * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
- * product of two 128-bit polynomials in Montgomery form. We need to reduce it
- * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
- * of x^128, this product has two extra factors of x^128. To get it back into
- * Montgomery form, we need to remove one of these factors by dividing by x^128.
- *
- * To accomplish both of these goals, we add multiples of g(x) that cancel out
- * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
- * bits are zero, the polynomial division by x^128 can be done by right shifting.
- *
- * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
- * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
- * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
- * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
- * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
- * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
- *
- * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
- * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
- * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
- * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
- * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
- *
- * So our final computation is:
- * T = T_1 : T_0 = g*(x) * P_0
- * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
- * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
- *
- * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
- * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
- * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
- */
-.macro montgomery_reduction dest
-	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
-	pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
-	pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
-	pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
-	pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
-	vpxor TMP_XMM, PH, \dest
-.endm
-
-/*
- * Compute schoolbook multiplication for 8 blocks
- * m_0h^8 + ... + m_7h^1
- *
- * If reduce is set, also computes the montgomery reduction of the
- * previous full_stride call and XORs with the first message block.
- * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
- * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
- */
-.macro full_stride reduce
-	pxor LO, LO
-	pxor HI, HI
-	pxor MI, MI
-
-	schoolbook1_iteration 7 0
-	.if \reduce
-	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 6 0
-	.if \reduce
-	pshufd $0b01001110, TMP_XMM, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 5 0
-	.if \reduce
-	pxor PL, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 4 0
-	.if \reduce
-	pxor TMP_XMM, PH
-	.endif
-
-	schoolbook1_iteration 3 0
-	.if \reduce
-	pclmulqdq $0x11, GSTAR, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 2 0
-	.if \reduce
-	vpxor TMP_XMM, PH, SUM
-	.endif
-
-	schoolbook1_iteration 1 0
-
-	schoolbook1_iteration 0 1
-
-	addq $(8*16), MSG
-	schoolbook2
-.endm
-
-/*
- * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
- */
-.macro partial_stride
-	mov BLOCKS_LEFT, TMP
-	shlq $4, TMP
-	addq $(16*STRIDE_BLOCKS), KEY_POWERS
-	subq TMP, KEY_POWERS
-
-	movups (MSG), %xmm0
-	pxor SUM, %xmm0
-	movaps (KEY_POWERS), %xmm1
-	schoolbook1_noload
-	dec BLOCKS_LEFT
-	addq $16, MSG
-	addq $16, KEY_POWERS
-
-	test $4, BLOCKS_LEFT
-	jz .Lpartial4BlocksDone
-	schoolbook1 4
-	addq $(4*16), MSG
-	addq $(4*16), KEY_POWERS
-.Lpartial4BlocksDone:
-	test $2, BLOCKS_LEFT
-	jz .Lpartial2BlocksDone
-	schoolbook1 2
-	addq $(2*16), MSG
-	addq $(2*16), KEY_POWERS
-.Lpartial2BlocksDone:
-	test $1, BLOCKS_LEFT
-	jz .LpartialDone
-	schoolbook1 1
-.LpartialDone:
-	schoolbook2
-	montgomery_reduction SUM
-.endm
-
-/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
- *
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void clmul_polyval_mul(u8 *op1, const u8 *op2);
- */
-SYM_FUNC_START(clmul_polyval_mul)
-	FRAME_BEGIN
-	vmovdqa .Lgstar(%rip), GSTAR
-	movups (%rdi), %xmm0
-	movups (%rsi), %xmm1
-	schoolbook1_noload
-	schoolbook2
-	montgomery_reduction SUM
-	movups SUM, (%rdi)
-	FRAME_END
-	RET
-SYM_FUNC_END(clmul_polyval_mul)
-
-/*
- * Perform polynomial evaluation as specified by POLYVAL. This computes:
- * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
- * where n=nblocks, h is the hash key, and m_i are the message blocks.
- *
- * rdi - pointer to precomputed key powers h^8 ... h^1
- * rsi - pointer to message blocks
- * rdx - number of blocks to hash
- * rcx - pointer to the accumulator
- *
- * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- *			     const u8 *in, size_t nblocks, u8 *accumulator);
- */
-SYM_FUNC_START(clmul_polyval_update)
-	FRAME_BEGIN
-	vmovdqa .Lgstar(%rip), GSTAR
-	movups (ACCUMULATOR), SUM
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	js .LstrideLoopExit
-	full_stride 0
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	js .LstrideLoopExitReduce
-.LstrideLoop:
-	full_stride 1
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	jns .LstrideLoop
-.LstrideLoopExitReduce:
-	montgomery_reduction SUM
-.LstrideLoopExit:
-	add $STRIDE_BLOCKS, BLOCKS_LEFT
-	jz .LskipPartial
-	partial_stride
-.LskipPartial:
-	movups SUM, (ACCUMULATOR)
-	FRAME_END
-	RET
-SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
deleted file mode 100644
index 6b466867f91a..000000000000
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ /dev/null
@@ -1,180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using PCMULQDQ-NI
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication
- * accelerated by PCLMULQDQ-NI to implement the finite field
- * operations.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define POLYVAL_ALIGN 16
-#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
-#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
-#define NUM_KEY_POWERS 8
-
-struct polyval_tfm_ctx {
-	/*
-	 * These powers must be in the order h^8, ..., h^1.
-	 */
-	u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
-};
-
-struct polyval_desc_ctx {
-	u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
-
-static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
-{
-	return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
-}
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator)
-{
-	kernel_fpu_begin();
-	clmul_polyval_update(keys, in, nblocks, accumulator);
-	kernel_fpu_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
-	kernel_fpu_begin();
-	clmul_polyval_mul(op1, op2);
-	kernel_fpu_end();
-}
-
-static int polyval_x86_setkey(struct crypto_shash *tfm,
-			const u8 *key, unsigned int keylen)
-{
-	struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
-	int i;
-
-	if (keylen != POLYVAL_BLOCK_SIZE)
-		return -EINVAL;
-
-	memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
-	for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
-		memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
-		internal_polyval_mul(tctx->key_powers[i],
-				     tctx->key_powers[i+1]);
-	}
-
-	return 0;
-}
-
-static int polyval_x86_init(struct shash_desc *desc)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	memset(dctx, 0, sizeof(*dctx));
-
-	return 0;
-}
-
-static int polyval_x86_update(struct shash_desc *desc,
-			 const u8 *src, unsigned int srclen)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-	unsigned int nblocks;
-
-	do {
-		/* Allow rescheduling every 4K bytes. */
-		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
-		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
-		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
-		src += nblocks * POLYVAL_BLOCK_SIZE;
-	} while (srclen >= POLYVAL_BLOCK_SIZE);
-
-	return srclen;
-}
-
-static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
-			     unsigned int len, u8 *dst)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-
-	if (len) {
-		crypto_xor(dctx->buffer, src, len);
-		internal_polyval_mul(dctx->buffer,
-				     tctx->key_powers[NUM_KEY_POWERS-1]);
-	}
-
-	memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
-	return 0;
-}
-
-static struct shash_alg polyval_alg = {
-	.digestsize = POLYVAL_DIGEST_SIZE,
-	.init = polyval_x86_init,
-	.update = polyval_x86_update,
-	.finup = polyval_x86_finup,
-	.setkey = polyval_x86_setkey,
-	.descsize = sizeof(struct polyval_desc_ctx),
-	.base = {
-		.cra_name = "polyval",
-		.cra_driver_name = "polyval-clmulni",
-		.cra_priority = 200,
-		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize = POLYVAL_BLOCK_SIZE,
-		.cra_ctxsize = POLYVAL_CTX_SIZE,
-		.cra_module = THIS_MODULE,
-	},
-};
-
-__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
-	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init polyval_clmulni_mod_init(void)
-{
-	if (!x86_match_cpu(pcmul_cpu_id))
-		return -ENODEV;
-
-	if (!boot_cpu_has(X86_FEATURE_AVX))
-		return -ENODEV;
-
-	return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_clmulni_mod_exit(void)
-{
-	crypto_unregister_shash(&polyval_alg);
-}
-
-module_init(polyval_clmulni_mod_init);
-module_exit(polyval_clmulni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-clmulni");
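
Two constructions in the removed code are easier to see in scalar C. First, the deleted s390_sha_finup() stores the total message length in bits in the CPACF parameter block, and SHA-512's length field is 128 bits wide while the context only keeps a 64-bit byte count plus a 64-bit overflow word; multiplying by 8 therefore has to carry the top three bits of the low word into the high word, which is what the count_hi <<= 3; count_hi |= count >> 61 sequence does. A minimal user-space sketch of that split (helper name is illustrative, not a kernel API):

#include <stdint.h>
#include <stdio.h>

/* 128-bit byte count (hi:lo) -> 128-bit bit count, as in s390_sha_finup(). */
static void bytes_to_bits128(uint64_t count_hi, uint64_t count_lo,
                             uint64_t *bits_hi, uint64_t *bits_lo)
{
        *bits_hi = (count_hi << 3) | (count_lo >> 61);  /* carry the top 3 bits */
        *bits_lo = count_lo << 3;                       /* low 64 bits of count * 8 */
}

int main(void)
{
        uint64_t hi, lo;

        /* 2^61 bytes hashed: the bit count is exactly 2^64, so hi = 1, lo = 0. */
        bytes_to_bits128(0, 1ULL << 61, &hi, &lo);
        printf("mbl = %llx:%llx\n", (unsigned long long)hi, (unsigned long long)lo);
        return 0;
}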
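Second, the montgomery_reduction comment in the deleted polyval-clmulni_asm.S fully determines the arithmetic, so it can be cross-checked against a portable model. The sketch below (user-space C; all names illustrative, with clmul64() as a bit-loop stand-in for PCLMULQDQ, and byte-order/serialization questions left out) implements schoolbook step one plus the two folds T = g*(x)*P_0 and V = g*(x)*(P_1 + T_0) from the comment:

#include <stdint.h>
#include <stdio.h>

struct u128 { uint64_t lo, hi; };

/* g*(x): bits 64..127 of g(x) = x^128 + x^127 + x^126 + x^121 + 1 */
#define GSTAR 0xc200000000000000ULL

/* Portable carry-less 64x64 -> 128-bit multiply. */
static struct u128 clmul64(uint64_t a, uint64_t b)
{
        struct u128 r = { 0, 0 };

        for (int i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        r.lo ^= a << i;
                        if (i)
                                r.hi ^= a >> (64 - i);
                }
        }
        return r;
}

/* Reduce P3:P2:P1:P0 mod g(x) and divide by x^128, per the deleted macro:
 *   T = g*(x) * P_0
 *   V = g*(x) * (P_1 + T_0)
 *   result = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 */
static struct u128 montgomery_reduce(uint64_t p3, uint64_t p2,
                                     uint64_t p1, uint64_t p0)
{
        struct u128 t = clmul64(GSTAR, p0);
        struct u128 v = clmul64(GSTAR, p1 ^ t.lo);

        return (struct u128){ .lo = p2 ^ p0 ^ t.hi ^ v.lo,
                              .hi = p3 ^ p1 ^ t.lo ^ v.hi };
}

/* Montgomery multiplication x*y*x^-128 mod g(x): four carry-less multiplies
 * (schoolbook step one), then one reduction, as in clmul_polyval_mul(). */
static struct u128 polyval_mul(struct u128 x, struct u128 y)
{
        struct u128 lo = clmul64(x.lo, y.lo);
        struct u128 hi = clmul64(x.hi, y.hi);
        struct u128 m0 = clmul64(x.lo, y.hi);
        struct u128 m1 = clmul64(x.hi, y.lo);
        uint64_t mi_lo = m0.lo ^ m1.lo;
        uint64_t mi_hi = m0.hi ^ m1.hi;

        /* 256-bit product is [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]. */
        return montgomery_reduce(hi.hi, hi.lo ^ mi_hi, lo.hi ^ mi_lo, lo.lo);
}

int main(void)
{
        struct u128 a = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
        struct u128 b = { 0x2ULL, 0x0ULL };
        struct u128 ab = polyval_mul(a, b);
        struct u128 ba = polyval_mul(b, a);

        /* Field multiplication is commutative; both lines must match. */
        printf("%016llx%016llx\n", (unsigned long long)ab.hi, (unsigned long long)ab.lo);
        printf("%016llx%016llx\n", (unsigned long long)ba.hi, (unsigned long long)ba.lo);
        return 0;
}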
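Finally, the eight-block stride in clmul_polyval_update() is an optimization, not a different function: eight schoolbook multiplies share one deferred reduction, but the result equals a naive Horner fold over the same key powers. A short reference model, written to be appended to the sketch above (it reuses struct u128 and polyval_mul(); names again illustrative):

/* One-block-at-a-time reference for clmul_polyval_update(). Each step folds
 * a message block into the accumulator and multiplies by h, which evaluates
 *   h^n*acc + h^n*m_0 + ... + h^1*m_{n-1}
 * exactly as the 8-block stride does, just without the deferred reduction. */
static void polyval_update_ref(struct u128 *acc, struct u128 h,
                               const struct u128 *msg, size_t nblocks)
{
        for (size_t i = 0; i < nblocks; i++) {
                acc->lo ^= msg[i].lo;
                acc->hi ^= msg[i].hi;
                *acc = polyval_mul(*acc, h);
        }
}

/* Key-power table in the order the deleted polyval_x86_setkey() stores it:
 * h^8 in slot 0 down to h^1 in slot 7. */
static void precompute_key_powers(struct u128 powers[8], struct u128 h)
{
        powers[7] = h;                          /* h^1 in the last slot */
        for (int i = 6; i >= 0; i--)
                powers[i] = polyval_mul(powers[i + 1], h);
}

Precomputing the powers front-loads the per-key cost so that the hot loop touches only loads, carry-less multiplies and XORs, which is the design choice the 8-block stride exploits.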