Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/crypto/Kconfig                   16
-rw-r--r--  arch/arm/crypto/Makefile                   2
-rw-r--r--  arch/arm/crypto/blake2b-neon-core.S      347
-rw-r--r--  arch/arm/crypto/blake2b-neon-glue.c      104
-rw-r--r--  arch/arm64/configs/defconfig               2
-rw-r--r--  arch/arm64/crypto/Kconfig                 21
-rw-r--r--  arch/arm64/crypto/Makefile                 6
-rw-r--r--  arch/arm64/crypto/polyval-ce-core.S      361
-rw-r--r--  arch/arm64/crypto/polyval-ce-glue.c      158
-rw-r--r--  arch/arm64/crypto/sha3-ce-core.S         212
-rw-r--r--  arch/arm64/crypto/sha3-ce-glue.c         151
-rw-r--r--  arch/s390/configs/debug_defconfig          3
-rw-r--r--  arch/s390/configs/defconfig                3
-rw-r--r--  arch/s390/crypto/Kconfig                  20
-rw-r--r--  arch/s390/crypto/Makefile                  2
-rw-r--r--  arch/s390/crypto/sha.h                    51
-rw-r--r--  arch/s390/crypto/sha3_256_s390.c         157
-rw-r--r--  arch/s390/crypto/sha3_512_s390.c         157
-rw-r--r--  arch/s390/crypto/sha_common.c            117
-rw-r--r--  arch/x86/crypto/Kconfig                   10
-rw-r--r--  arch/x86/crypto/Makefile                   3
-rw-r--r--  arch/x86/crypto/polyval-clmulni_asm.S    321
-rw-r--r--  arch/x86/crypto/polyval-clmulni_glue.c   180
23 files changed, 3 insertions(+), 2401 deletions(-)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index c436eec22d86..f30d743df264 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -33,22 +33,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm using: - NEON (Advanced SIMD) extensions -config CRYPTO_BLAKE2B_NEON - tristate "Hash functions: BLAKE2b (NEON)" - depends on KERNEL_MODE_NEON - select CRYPTO_BLAKE2B - help - BLAKE2b cryptographic hash function (RFC 7693) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - - BLAKE2b digest algorithm optimized with ARM NEON instructions. - On ARM processors that have NEON support but not the ARMv8 - Crypto Extensions, typically this BLAKE2b implementation is - much faster than the SHA-2 family and slightly faster than - SHA-1. - config CRYPTO_AES_ARM tristate "Ciphers: AES" select CRYPTO_ALGAPI diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 6346a73effc0..86dd43313dbf 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o -obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o @@ -13,7 +12,6 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o aes-arm-y := aes-cipher-core.o aes-cipher-glue.o aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o -blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o diff --git a/arch/arm/crypto/blake2b-neon-core.S b/arch/arm/crypto/blake2b-neon-core.S deleted file mode 100644 index 0406a186377f..000000000000 --- a/arch/arm/crypto/blake2b-neon-core.S +++ /dev/null @@ -1,347 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * BLAKE2b digest algorithm, NEON accelerated - * - * Copyright 2020 Google LLC - * - * Author: Eric Biggers <ebiggers@google.com> - */ - -#include <linux/linkage.h> - - .text - .fpu neon - - // The arguments to blake2b_compress_neon() - STATE .req r0 - BLOCK .req r1 - NBLOCKS .req r2 - INC .req r3 - - // Pointers to the rotation tables - ROR24_TABLE .req r4 - ROR16_TABLE .req r5 - - // The original stack pointer - ORIG_SP .req r6 - - // NEON registers which contain the message words of the current block. - // M_0-M_3 are occasionally used for other purposes too. - M_0 .req d16 - M_1 .req d17 - M_2 .req d18 - M_3 .req d19 - M_4 .req d20 - M_5 .req d21 - M_6 .req d22 - M_7 .req d23 - M_8 .req d24 - M_9 .req d25 - M_10 .req d26 - M_11 .req d27 - M_12 .req d28 - M_13 .req d29 - M_14 .req d30 - M_15 .req d31 - - .align 4 - // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8 - // instruction. This is the most efficient way to implement these - // rotation amounts with NEON. (On Cortex-A53 it's the same speed as - // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.) -.Lror24_table: - .byte 3, 4, 5, 6, 7, 0, 1, 2 -.Lror16_table: - .byte 2, 3, 4, 5, 6, 7, 0, 1 - // The BLAKE2b initialization vector -.Lblake2b_IV: - .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b - .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 - .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f - .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 - -// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the -// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). 
The stack -// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9 -// (M_0-M_3), so that they can be reloaded if they are used as temporary -// registers. The macro arguments s0-s15 give the order in which the message -// words are used in this round. 'final' is 1 if this is the final round. -.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \ - s8, s9, s10, s11, s12, s13, s14, s15, final=0 - - // Mix the columns: - // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]), - // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]). - - // a += b + m[blake2b_sigma[r][2*i + 0]]; - vadd.u64 q0, q0, q2 - vadd.u64 q1, q1, q3 - vadd.u64 d0, d0, M_\s0 - vadd.u64 d1, d1, M_\s2 - vadd.u64 d2, d2, M_\s4 - vadd.u64 d3, d3, M_\s6 - - // d = ror64(d ^ a, 32); - veor q6, q6, q0 - veor q7, q7, q1 - vrev64.32 q6, q6 - vrev64.32 q7, q7 - - // c += d; - vadd.u64 q4, q4, q6 - vadd.u64 q5, q5, q7 - - // b = ror64(b ^ c, 24); - vld1.8 {M_0}, [ROR24_TABLE, :64] - veor q2, q2, q4 - veor q3, q3, q5 - vtbl.8 d4, {d4}, M_0 - vtbl.8 d5, {d5}, M_0 - vtbl.8 d6, {d6}, M_0 - vtbl.8 d7, {d7}, M_0 - - // a += b + m[blake2b_sigma[r][2*i + 1]]; - // - // M_0 got clobbered above, so we have to reload it if any of the four - // message words this step needs happens to be M_0. Otherwise we don't - // need to reload it here, as it will just get clobbered again below. -.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0 - vld1.8 {M_0}, [sp, :64] -.endif - vadd.u64 q0, q0, q2 - vadd.u64 q1, q1, q3 - vadd.u64 d0, d0, M_\s1 - vadd.u64 d1, d1, M_\s3 - vadd.u64 d2, d2, M_\s5 - vadd.u64 d3, d3, M_\s7 - - // d = ror64(d ^ a, 16); - vld1.8 {M_0}, [ROR16_TABLE, :64] - veor q6, q6, q0 - veor q7, q7, q1 - vtbl.8 d12, {d12}, M_0 - vtbl.8 d13, {d13}, M_0 - vtbl.8 d14, {d14}, M_0 - vtbl.8 d15, {d15}, M_0 - - // c += d; - vadd.u64 q4, q4, q6 - vadd.u64 q5, q5, q7 - - // b = ror64(b ^ c, 63); - // - // This rotation amount isn't a multiple of 8, so it has to be - // implemented using a pair of shifts, which requires temporary - // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards. - veor q8, q2, q4 - veor q9, q3, q5 - vshr.u64 q2, q8, #63 - vshr.u64 q3, q9, #63 - vsli.u64 q2, q8, #1 - vsli.u64 q3, q9, #1 - vld1.8 {q8-q9}, [sp, :256] - - // Mix the diagonals: - // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]), - // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]). - // - // There are two possible ways to do this: use 'vext' instructions to - // shift the rows of the matrix so that the diagonals become columns, - // and undo it afterwards; or just use 64-bit operations on 'd' - // registers instead of 128-bit operations on 'q' registers. We use the - // latter approach, as it performs much better on Cortex-A7. 
- - // a += b + m[blake2b_sigma[r][2*i + 0]]; - vadd.u64 d0, d0, d5 - vadd.u64 d1, d1, d6 - vadd.u64 d2, d2, d7 - vadd.u64 d3, d3, d4 - vadd.u64 d0, d0, M_\s8 - vadd.u64 d1, d1, M_\s10 - vadd.u64 d2, d2, M_\s12 - vadd.u64 d3, d3, M_\s14 - - // d = ror64(d ^ a, 32); - veor d15, d15, d0 - veor d12, d12, d1 - veor d13, d13, d2 - veor d14, d14, d3 - vrev64.32 d15, d15 - vrev64.32 d12, d12 - vrev64.32 d13, d13 - vrev64.32 d14, d14 - - // c += d; - vadd.u64 d10, d10, d15 - vadd.u64 d11, d11, d12 - vadd.u64 d8, d8, d13 - vadd.u64 d9, d9, d14 - - // b = ror64(b ^ c, 24); - vld1.8 {M_0}, [ROR24_TABLE, :64] - veor d5, d5, d10 - veor d6, d6, d11 - veor d7, d7, d8 - veor d4, d4, d9 - vtbl.8 d5, {d5}, M_0 - vtbl.8 d6, {d6}, M_0 - vtbl.8 d7, {d7}, M_0 - vtbl.8 d4, {d4}, M_0 - - // a += b + m[blake2b_sigma[r][2*i + 1]]; -.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0 - vld1.8 {M_0}, [sp, :64] -.endif - vadd.u64 d0, d0, d5 - vadd.u64 d1, d1, d6 - vadd.u64 d2, d2, d7 - vadd.u64 d3, d3, d4 - vadd.u64 d0, d0, M_\s9 - vadd.u64 d1, d1, M_\s11 - vadd.u64 d2, d2, M_\s13 - vadd.u64 d3, d3, M_\s15 - - // d = ror64(d ^ a, 16); - vld1.8 {M_0}, [ROR16_TABLE, :64] - veor d15, d15, d0 - veor d12, d12, d1 - veor d13, d13, d2 - veor d14, d14, d3 - vtbl.8 d12, {d12}, M_0 - vtbl.8 d13, {d13}, M_0 - vtbl.8 d14, {d14}, M_0 - vtbl.8 d15, {d15}, M_0 - - // c += d; - vadd.u64 d10, d10, d15 - vadd.u64 d11, d11, d12 - vadd.u64 d8, d8, d13 - vadd.u64 d9, d9, d14 - - // b = ror64(b ^ c, 63); - veor d16, d4, d9 - veor d17, d5, d10 - veor d18, d6, d11 - veor d19, d7, d8 - vshr.u64 q2, q8, #63 - vshr.u64 q3, q9, #63 - vsli.u64 q2, q8, #1 - vsli.u64 q3, q9, #1 - // Reloading q8-q9 can be skipped on the final round. -.if ! \final - vld1.8 {q8-q9}, [sp, :256] -.endif -.endm - -// -// void blake2b_compress_neon(struct blake2b_state *state, -// const u8 *block, size_t nblocks, u32 inc); -// -// Only the first three fields of struct blake2b_state are used: -// u64 h[8]; (inout) -// u64 t[2]; (inout) -// u64 f[2]; (in) -// - .align 5 -ENTRY(blake2b_compress_neon) - push {r4-r10} - - // Allocate a 32-byte stack buffer that is 32-byte aligned. - mov ORIG_SP, sp - sub ip, sp, #32 - bic ip, ip, #31 - mov sp, ip - - adr ROR24_TABLE, .Lror24_table - adr ROR16_TABLE, .Lror16_table - - mov ip, STATE - vld1.64 {q0-q1}, [ip]! // Load h[0..3] - vld1.64 {q2-q3}, [ip]! // Load h[4..7] -.Lnext_block: - adr r10, .Lblake2b_IV - vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1] - vld1.64 {q4-q5}, [r10]! // Load IV[0..3] - vmov r7, r8, d28 // Copy t[0] to (r7, r8) - vld1.64 {q6-q7}, [r10] // Load IV[4..7] - adds r7, r7, INC // Increment counter - bcs .Lslow_inc_ctr - vmov.i32 d28[0], r7 - vst1.64 {d28}, [ip] // Update t[0] -.Linc_ctr_done: - - // Load the next message block and finish initializing the state matrix - // 'v'. Fortunately, there are exactly enough NEON registers to fit the - // entire state matrix in q0-q7 and the entire message block in q8-15. - // - // However, _blake2b_round also needs some extra registers for rotates, - // so we have to spill some registers. It's better to spill the message - // registers than the state registers, as the message doesn't change. - // Therefore we store a copy of the first 32 bytes of the message block - // (q8-q9) in an aligned buffer on the stack so that they can be - // reloaded when needed. (We could just reload directly from the - // message buffer, but it's faster to use aligned loads.) - vld1.8 {q8-q9}, [BLOCK]! - veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1] - vld1.8 {q10-q11}, [BLOCK]! 
- veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1] - vld1.8 {q12-q13}, [BLOCK]! - vst1.8 {q8-q9}, [sp, :256] - mov ip, STATE - vld1.8 {q14-q15}, [BLOCK]! - - // Execute the rounds. Each round is provided the order in which it - // needs to use the message words. - _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 - _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 - _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 - _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 - _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 - _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 - _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 - _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 - _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 - _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \ - final=1 - - // Fold the final state matrix into the hash chaining value: - // - // for (i = 0; i < 8; i++) - // h[i] ^= v[i] ^ v[i + 8]; - // - vld1.64 {q8-q9}, [ip]! // Load old h[0..3] - veor q0, q0, q4 // v[0..1] ^= v[8..9] - veor q1, q1, q5 // v[2..3] ^= v[10..11] - vld1.64 {q10-q11}, [ip] // Load old h[4..7] - veor q2, q2, q6 // v[4..5] ^= v[12..13] - veor q3, q3, q7 // v[6..7] ^= v[14..15] - veor q0, q0, q8 // v[0..1] ^= h[0..1] - veor q1, q1, q9 // v[2..3] ^= h[2..3] - mov ip, STATE - subs NBLOCKS, NBLOCKS, #1 // nblocks-- - vst1.64 {q0-q1}, [ip]! // Store new h[0..3] - veor q2, q2, q10 // v[4..5] ^= h[4..5] - veor q3, q3, q11 // v[6..7] ^= h[6..7] - vst1.64 {q2-q3}, [ip]! // Store new h[4..7] - - // Advance to the next block, if there is one. - bne .Lnext_block // nblocks != 0? - - mov sp, ORIG_SP - pop {r4-r10} - mov pc, lr - -.Lslow_inc_ctr: - // Handle the case where the counter overflowed its low 32 bits, by - // carrying the overflow bit into the full 128-bit counter. 
- vmov r9, r10, d29 - adcs r8, r8, #0 - adcs r9, r9, #0 - adc r10, r10, #0 - vmov d28, r7, r8 - vmov d29, r9, r10 - vst1.64 {q14}, [ip] // Update t[0] and t[1] - b .Linc_ctr_done -ENDPROC(blake2b_compress_neon) diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c deleted file mode 100644 index 2ff443a91724..000000000000 --- a/arch/arm/crypto/blake2b-neon-glue.c +++ /dev/null @@ -1,104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * BLAKE2b digest algorithm, NEON accelerated - * - * Copyright 2020 Google LLC - */ - -#include <crypto/internal/blake2b.h> -#include <crypto/internal/hash.h> - -#include <linux/module.h> -#include <linux/sizes.h> - -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void blake2b_compress_neon(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc); - -static void blake2b_compress_arch(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc) -{ - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2B_BLOCK_SIZE); - - kernel_neon_begin(); - blake2b_compress_neon(state, block, blocks, inc); - kernel_neon_end(); - - nblocks -= blocks; - block += blocks * BLAKE2B_BLOCK_SIZE; - } while (nblocks); -} - -static int crypto_blake2b_update_neon(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch); -} - -static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out) -{ - return crypto_blake2b_finup(desc, in, inlen, out, - blake2b_compress_arch); -} - -#define BLAKE2B_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 200, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY | \ - CRYPTO_AHASH_ALG_BLOCK_ONLY | \ - CRYPTO_AHASH_ALG_FINAL_NONZERO, \ - .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2b_setkey, \ - .init = crypto_blake2b_init, \ - .update = crypto_blake2b_update_neon, \ - .finup = crypto_blake2b_finup_neon, \ - .descsize = sizeof(struct blake2b_state), \ - .statesize = BLAKE2B_STATE_SIZE, \ - } - -static struct shash_alg blake2b_neon_algs[] = { - BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE), - BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE), - BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE), - BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE), -}; - -static int __init blake2b_neon_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - - return crypto_register_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); -} - -static void __exit blake2b_neon_mod_exit(void) -{ - crypto_unregister_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); -} - -module_init(blake2b_neon_mod_init); -module_exit(blake2b_neon_mod_exit); - -MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated"); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); -MODULE_ALIAS_CRYPTO("blake2b-160"); -MODULE_ALIAS_CRYPTO("blake2b-160-neon"); -MODULE_ALIAS_CRYPTO("blake2b-256"); -MODULE_ALIAS_CRYPTO("blake2b-256-neon"); -MODULE_ALIAS_CRYPTO("blake2b-384"); -MODULE_ALIAS_CRYPTO("blake2b-384-neon"); -MODULE_ALIAS_CRYPTO("blake2b-512"); -MODULE_ALIAS_CRYPTO("blake2b-512-neon"); diff --git 
a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 1a48faad2473..997fa7cd9de5 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1783,10 +1783,10 @@ CONFIG_CRYPTO_CHACHA20=m CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_GHASH_ARM64_CE=y -CONFIG_CRYPTO_SHA3_ARM64=m CONFIG_CRYPTO_SM3_ARM64_CE=m CONFIG_CRYPTO_AES_ARM64_CE_BLK=y CONFIG_CRYPTO_AES_ARM64_BS=m diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 91f3093eee6a..bdd276a6e540 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -25,17 +25,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm64 using: - NEON (Advanced SIMD) extensions -config CRYPTO_SHA3_ARM64 - tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_SHA3 - help - SHA-3 secure hash algorithms (FIPS 202) - - Architecture: arm64 using: - - ARMv8.2 Crypto Extensions - config CRYPTO_SM3_NEON tristate "Hash functions: SM3 (NEON)" depends on KERNEL_MODE_NEON @@ -58,16 +47,6 @@ config CRYPTO_SM3_ARM64_CE Architecture: arm64 using: - ARMv8.2 Crypto Extensions -config CRYPTO_POLYVAL_ARM64_CE - tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_POLYVAL - help - POLYVAL hash function for HCTR2 - - Architecture: arm64 using: - - ARMv8 Crypto Extensions - config CRYPTO_AES_ARM64 tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS" select CRYPTO_AES diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index a8b2cdbe202c..1e330aa08d3f 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -5,9 +5,6 @@ # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> # -obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o -sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o - obj-$(CONFIG_CRYPTO_SM3_NEON) += sm3-neon.o sm3-neon-y := sm3-neon-glue.o sm3-neon-core.o @@ -32,9 +29,6 @@ sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o -obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o -polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o - obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm64/crypto/polyval-ce-core.S b/arch/arm64/crypto/polyval-ce-core.S deleted file mode 100644 index b5326540d2e3..000000000000 --- a/arch/arm64/crypto/polyval-ce-core.S +++ /dev/null @@ -1,361 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Implementation of POLYVAL using ARMv8 Crypto Extensions. - * - * Copyright 2021 Google LLC - */ -/* - * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions - * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8, - * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split - * finite field multiplication into two steps. - * - * In the first step, we consider h^i, m_i as normal polynomials of degree less - * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication - * is simply polynomial multiplication. - * - * In the second step, we compute the reduction of p(x) modulo the finite field - * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. - * - * This two step process is equivalent to computing h^8m_0 + ... 
+ h^1m_7 where - * multiplication is finite field multiplication. The advantage is that the - * two-step process only requires 1 finite field reduction for every 8 - * polynomial multiplications. Further parallelism is gained by interleaving the - * multiplications and polynomial reductions. - */ - -#include <linux/linkage.h> -#define STRIDE_BLOCKS 8 - -KEY_POWERS .req x0 -MSG .req x1 -BLOCKS_LEFT .req x2 -ACCUMULATOR .req x3 -KEY_START .req x10 -EXTRA_BYTES .req x11 -TMP .req x13 - -M0 .req v0 -M1 .req v1 -M2 .req v2 -M3 .req v3 -M4 .req v4 -M5 .req v5 -M6 .req v6 -M7 .req v7 -KEY8 .req v8 -KEY7 .req v9 -KEY6 .req v10 -KEY5 .req v11 -KEY4 .req v12 -KEY3 .req v13 -KEY2 .req v14 -KEY1 .req v15 -PL .req v16 -PH .req v17 -TMP_V .req v18 -LO .req v20 -MI .req v21 -HI .req v22 -SUM .req v23 -GSTAR .req v24 - - .text - - .arch armv8-a+crypto - .align 4 - -.Lgstar: - .quad 0xc200000000000000, 0xc200000000000000 - -/* - * Computes the product of two 128-bit polynomials in X and Y and XORs the - * components of the 256-bit product into LO, MI, HI. - * - * Given: - * X = [X_1 : X_0] - * Y = [Y_1 : Y_0] - * - * We compute: - * LO += X_0 * Y_0 - * MI += (X_0 + X_1) * (Y_0 + Y_1) - * HI += X_1 * Y_1 - * - * Later, the 256-bit result can be extracted as: - * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0] - * This step is done when computing the polynomial reduction for efficiency - * reasons. - * - * Karatsuba multiplication is used instead of Schoolbook multiplication because - * it was found to be slightly faster on ARM64 CPUs. - * - */ -.macro karatsuba1 X Y - X .req \X - Y .req \Y - ext v25.16b, X.16b, X.16b, #8 - ext v26.16b, Y.16b, Y.16b, #8 - eor v25.16b, v25.16b, X.16b - eor v26.16b, v26.16b, Y.16b - pmull2 v28.1q, X.2d, Y.2d - pmull v29.1q, X.1d, Y.1d - pmull v27.1q, v25.1d, v26.1d - eor HI.16b, HI.16b, v28.16b - eor LO.16b, LO.16b, v29.16b - eor MI.16b, MI.16b, v27.16b - .unreq X - .unreq Y -.endm - -/* - * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into - * them. - */ -.macro karatsuba1_store X Y - X .req \X - Y .req \Y - ext v25.16b, X.16b, X.16b, #8 - ext v26.16b, Y.16b, Y.16b, #8 - eor v25.16b, v25.16b, X.16b - eor v26.16b, v26.16b, Y.16b - pmull2 HI.1q, X.2d, Y.2d - pmull LO.1q, X.1d, Y.1d - pmull MI.1q, v25.1d, v26.1d - .unreq X - .unreq Y -.endm - -/* - * Computes the 256-bit polynomial represented by LO, HI, MI. Stores - * the result in PL, PH. - * [PH : PL] = - * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0] - */ -.macro karatsuba2 - // v4 = [HI_1 + MI_1 : HI_0 + MI_0] - eor v4.16b, HI.16b, MI.16b - // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0] - eor v4.16b, v4.16b, LO.16b - // v5 = [HI_0 : LO_1] - ext v5.16b, LO.16b, HI.16b, #8 - // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0] - eor v4.16b, v4.16b, v5.16b - // HI = [HI_0 : HI_1] - ext HI.16b, HI.16b, HI.16b, #8 - // LO = [LO_0 : LO_1] - ext LO.16b, LO.16b, LO.16b, #8 - // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1] - ext PH.16b, v4.16b, HI.16b, #8 - // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0] - ext PL.16b, LO.16b, v4.16b, #8 -.endm - -/* - * Computes the 128-bit reduction of PH : PL. Stores the result in dest. - * - * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = - * x^128 + x^127 + x^126 + x^121 + 1. - * - * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the - * product of two 128-bit polynomials in Montgomery form. We need to reduce it - * mod g(x). 
Also, since polynomials in Montgomery form have an "extra" factor - * of x^128, this product has two extra factors of x^128. To get it back into - * Montgomery form, we need to remove one of these factors by dividing by x^128. - * - * To accomplish both of these goals, we add multiples of g(x) that cancel out - * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low - * bits are zero, the polynomial division by x^128 can be done by right - * shifting. - * - * Since the only nonzero term in the low 64 bits of g(x) is the constant term, - * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can - * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + - * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to - * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T - * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. - * - * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits - * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 - * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * - * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : - * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). - * - * So our final computation is: - * T = T_1 : T_0 = g*(x) * P_0 - * V = V_1 : V_0 = g*(x) * (P_1 + T_0) - * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 - * - * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 - * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : - * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. - */ -.macro montgomery_reduction dest - DEST .req \dest - // TMP_V = T_1 : T_0 = P_0 * g*(x) - pmull TMP_V.1q, PL.1d, GSTAR.1d - // TMP_V = T_0 : T_1 - ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 - // TMP_V = P_1 + T_0 : P_0 + T_1 - eor TMP_V.16b, PL.16b, TMP_V.16b - // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 - eor PH.16b, PH.16b, TMP_V.16b - // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x) - pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d - eor DEST.16b, PH.16b, TMP_V.16b - .unreq DEST -.endm - -/* - * Compute Polyval on 8 blocks. - * - * If reduce is set, also computes the montgomery reduction of the - * previous full_stride call and XORs with the first message block. - * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. - * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. - * - * Sets PL, PH. - */ -.macro full_stride reduce - eor LO.16b, LO.16b, LO.16b - eor MI.16b, MI.16b, MI.16b - eor HI.16b, HI.16b, HI.16b - - ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 - ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64 - - karatsuba1 M7 KEY1 - .if \reduce - pmull TMP_V.1q, PL.1d, GSTAR.1d - .endif - - karatsuba1 M6 KEY2 - .if \reduce - ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 - .endif - - karatsuba1 M5 KEY3 - .if \reduce - eor TMP_V.16b, PL.16b, TMP_V.16b - .endif - - karatsuba1 M4 KEY4 - .if \reduce - eor PH.16b, PH.16b, TMP_V.16b - .endif - - karatsuba1 M3 KEY5 - .if \reduce - pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d - .endif - - karatsuba1 M2 KEY6 - .if \reduce - eor SUM.16b, PH.16b, TMP_V.16b - .endif - - karatsuba1 M1 KEY7 - eor M0.16b, M0.16b, SUM.16b - - karatsuba1 M0 KEY8 - karatsuba2 -.endm - -/* - * Handle any extra blocks after full_stride loop. 
- */ -.macro partial_stride - add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4) - sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4 - ld1 {KEY1.16b}, [KEY_POWERS], #16 - - ld1 {TMP_V.16b}, [MSG], #16 - eor SUM.16b, SUM.16b, TMP_V.16b - karatsuba1_store KEY1 SUM - sub BLOCKS_LEFT, BLOCKS_LEFT, #1 - - tst BLOCKS_LEFT, #4 - beq .Lpartial4BlocksDone - ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 - ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 - karatsuba1 M0 KEY8 - karatsuba1 M1 KEY7 - karatsuba1 M2 KEY6 - karatsuba1 M3 KEY5 -.Lpartial4BlocksDone: - tst BLOCKS_LEFT, #2 - beq .Lpartial2BlocksDone - ld1 {M0.16b, M1.16b}, [MSG], #32 - ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32 - karatsuba1 M0 KEY8 - karatsuba1 M1 KEY7 -.Lpartial2BlocksDone: - tst BLOCKS_LEFT, #1 - beq .LpartialDone - ld1 {M0.16b}, [MSG], #16 - ld1 {KEY8.16b}, [KEY_POWERS], #16 - karatsuba1 M0 KEY8 -.LpartialDone: - karatsuba2 - montgomery_reduction SUM -.endm - -/* - * Perform montgomery multiplication in GF(2^128) and store result in op1. - * - * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 - * If op1, op2 are in montgomery form, this computes the montgomery - * form of op1*op2. - * - * void pmull_polyval_mul(u8 *op1, const u8 *op2); - */ -SYM_FUNC_START(pmull_polyval_mul) - adr TMP, .Lgstar - ld1 {GSTAR.2d}, [TMP] - ld1 {v0.16b}, [x0] - ld1 {v1.16b}, [x1] - karatsuba1_store v0 v1 - karatsuba2 - montgomery_reduction SUM - st1 {SUM.16b}, [x0] - ret -SYM_FUNC_END(pmull_polyval_mul) - -/* - * Perform polynomial evaluation as specified by POLYVAL. This computes: - * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} - * where n=nblocks, h is the hash key, and m_i are the message blocks. - * - * x0 - pointer to precomputed key powers h^8 ... h^1 - * x1 - pointer to message blocks - * x2 - number of blocks to hash - * x3 - pointer to accumulator - * - * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, - * size_t nblocks, u8 *accumulator); - */ -SYM_FUNC_START(pmull_polyval_update) - adr TMP, .Lgstar - mov KEY_START, KEY_POWERS - ld1 {GSTAR.2d}, [TMP] - ld1 {SUM.16b}, [ACCUMULATOR] - subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS - blt .LstrideLoopExit - ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 - ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64 - full_stride 0 - subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS - blt .LstrideLoopExitReduce -.LstrideLoop: - full_stride 1 - subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS - bge .LstrideLoop -.LstrideLoopExitReduce: - montgomery_reduction SUM -.LstrideLoopExit: - adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS - beq .LskipPartial - partial_stride -.LskipPartial: - st1 {SUM.16b}, [ACCUMULATOR] - ret -SYM_FUNC_END(pmull_polyval_update) diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c deleted file mode 100644 index c4e653688ea0..000000000000 --- a/arch/arm64/crypto/polyval-ce-glue.c +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Glue code for POLYVAL using ARMv8 Crypto Extensions - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * Copyright 2021 Google LLC - */ - -/* - * Glue code based on ghash-clmulni-intel_glue.c. - * - * This implementation of POLYVAL uses montgomery multiplication accelerated by - * ARMv8 Crypto Extensions instructions to implement the finite field operations. 
- */ - -#include <asm/neon.h> -#include <crypto/internal/hash.h> -#include <crypto/polyval.h> -#include <crypto/utils.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#define NUM_KEY_POWERS 8 - -struct polyval_tfm_ctx { - /* - * These powers must be in the order h^8, ..., h^1. - */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; -}; - -struct polyval_desc_ctx { - u8 buffer[POLYVAL_BLOCK_SIZE]; -}; - -asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator); -asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); - -static void internal_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator) -{ - kernel_neon_begin(); - pmull_polyval_update(keys, in, nblocks, accumulator); - kernel_neon_end(); -} - -static void internal_polyval_mul(u8 *op1, const u8 *op2) -{ - kernel_neon_begin(); - pmull_polyval_mul(op1, op2); - kernel_neon_end(); -} - -static int polyval_arm64_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); - int i; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); - - for (i = NUM_KEY_POWERS-2; i >= 0; i--) { - memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); - internal_polyval_mul(tctx->key_powers[i], - tctx->key_powers[i+1]); - } - - return 0; -} - -static int polyval_arm64_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_arm64_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - unsigned int nblocks; - - do { - /* allow rescheduling every 4K bytes */ - nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; - internal_polyval_update(tctx, src, nblocks, dctx->buffer); - srclen -= nblocks * POLYVAL_BLOCK_SIZE; - src += nblocks * POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_arm64_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - - if (len) { - crypto_xor(dctx->buffer, src, len); - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); - - return 0; -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_arm64_init, - .update = polyval_arm64_update, - .finup = polyval_arm64_finup, - .setkey = polyval_arm64_setkey, - .descsize = sizeof(struct polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct polyval_tfm_ctx), - .cra_module = THIS_MODULE, - }, -}; - -static int __init polyval_ce_mod_init(void) -{ - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_ce_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_cpu_feature_match(PMULL, polyval_ce_mod_init) -module_exit(polyval_ce_mod_exit); - -MODULE_LICENSE("GPL"); 
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-ce"); diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S deleted file mode 100644 index 9c77313f5a60..000000000000 --- a/arch/arm64/crypto/sha3-ce-core.S +++ /dev/null @@ -1,212 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 - .set .Lv\b\().2d, \b - .set .Lv\b\().16b, \b - .endr - - /* - * ARMv8.2 Crypto Extensions instructions - */ - .macro eor3, rd, rn, rm, ra - .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) - .endm - - .macro rax1, rd, rn, rm - .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - .macro bcax, rd, rn, rm, ra - .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) - .endm - - .macro xar, rd, rn, rm, imm6 - .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16) - .endm - - /* - * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) - */ - .text -SYM_FUNC_START(sha3_ce_transform) - /* load state */ - add x8, x0, #32 - ld1 { v0.1d- v3.1d}, [x0] - ld1 { v4.1d- v7.1d}, [x8], #32 - ld1 { v8.1d-v11.1d}, [x8], #32 - ld1 {v12.1d-v15.1d}, [x8], #32 - ld1 {v16.1d-v19.1d}, [x8], #32 - ld1 {v20.1d-v23.1d}, [x8], #32 - ld1 {v24.1d}, [x8] - -0: sub w2, w2, #1 - mov w8, #24 - adr_l x9, .Lsha3_rcon - - /* load input */ - ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b-v31.8b}, [x1], #24 - eor v0.8b, v0.8b, v25.8b - eor v1.8b, v1.8b, v26.8b - eor v2.8b, v2.8b, v27.8b - eor v3.8b, v3.8b, v28.8b - eor v4.8b, v4.8b, v29.8b - eor v5.8b, v5.8b, v30.8b - eor v6.8b, v6.8b, v31.8b - - tbnz x3, #6, 2f // SHA3-512 - - ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b-v30.8b}, [x1], #16 - eor v7.8b, v7.8b, v25.8b - eor v8.8b, v8.8b, v26.8b - eor v9.8b, v9.8b, v27.8b - eor v10.8b, v10.8b, v28.8b - eor v11.8b, v11.8b, v29.8b - eor v12.8b, v12.8b, v30.8b - - tbnz x3, #4, 1f // SHA3-384 or SHA3-224 - - // SHA3-256 - ld1 {v25.8b-v28.8b}, [x1], #32 - eor v13.8b, v13.8b, v25.8b - eor v14.8b, v14.8b, v26.8b - eor v15.8b, v15.8b, v27.8b - eor v16.8b, v16.8b, v28.8b - b 3f - -1: tbz x3, #2, 3f // bit 2 cleared? 
SHA-384 - - // SHA3-224 - ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b}, [x1], #8 - eor v13.8b, v13.8b, v25.8b - eor v14.8b, v14.8b, v26.8b - eor v15.8b, v15.8b, v27.8b - eor v16.8b, v16.8b, v28.8b - eor v17.8b, v17.8b, v29.8b - b 3f - - // SHA3-512 -2: ld1 {v25.8b-v26.8b}, [x1], #16 - eor v7.8b, v7.8b, v25.8b - eor v8.8b, v8.8b, v26.8b - -3: sub w8, w8, #1 - - eor3 v29.16b, v4.16b, v9.16b, v14.16b - eor3 v26.16b, v1.16b, v6.16b, v11.16b - eor3 v28.16b, v3.16b, v8.16b, v13.16b - eor3 v25.16b, v0.16b, v5.16b, v10.16b - eor3 v27.16b, v2.16b, v7.16b, v12.16b - eor3 v29.16b, v29.16b, v19.16b, v24.16b - eor3 v26.16b, v26.16b, v16.16b, v21.16b - eor3 v28.16b, v28.16b, v18.16b, v23.16b - eor3 v25.16b, v25.16b, v15.16b, v20.16b - eor3 v27.16b, v27.16b, v17.16b, v22.16b - - rax1 v30.2d, v29.2d, v26.2d // bc[0] - rax1 v26.2d, v26.2d, v28.2d // bc[2] - rax1 v28.2d, v28.2d, v25.2d // bc[4] - rax1 v25.2d, v25.2d, v27.2d // bc[1] - rax1 v27.2d, v27.2d, v29.2d // bc[3] - - eor v0.16b, v0.16b, v30.16b - xar v29.2d, v1.2d, v25.2d, (64 - 1) - xar v1.2d, v6.2d, v25.2d, (64 - 44) - xar v6.2d, v9.2d, v28.2d, (64 - 20) - xar v9.2d, v22.2d, v26.2d, (64 - 61) - xar v22.2d, v14.2d, v28.2d, (64 - 39) - xar v14.2d, v20.2d, v30.2d, (64 - 18) - xar v31.2d, v2.2d, v26.2d, (64 - 62) - xar v2.2d, v12.2d, v26.2d, (64 - 43) - xar v12.2d, v13.2d, v27.2d, (64 - 25) - xar v13.2d, v19.2d, v28.2d, (64 - 8) - xar v19.2d, v23.2d, v27.2d, (64 - 56) - xar v23.2d, v15.2d, v30.2d, (64 - 41) - xar v15.2d, v4.2d, v28.2d, (64 - 27) - xar v28.2d, v24.2d, v28.2d, (64 - 14) - xar v24.2d, v21.2d, v25.2d, (64 - 2) - xar v8.2d, v8.2d, v27.2d, (64 - 55) - xar v4.2d, v16.2d, v25.2d, (64 - 45) - xar v16.2d, v5.2d, v30.2d, (64 - 36) - xar v5.2d, v3.2d, v27.2d, (64 - 28) - xar v27.2d, v18.2d, v27.2d, (64 - 21) - xar v3.2d, v17.2d, v26.2d, (64 - 15) - xar v25.2d, v11.2d, v25.2d, (64 - 10) - xar v26.2d, v7.2d, v26.2d, (64 - 6) - xar v30.2d, v10.2d, v30.2d, (64 - 3) - - bcax v20.16b, v31.16b, v22.16b, v8.16b - bcax v21.16b, v8.16b, v23.16b, v22.16b - bcax v22.16b, v22.16b, v24.16b, v23.16b - bcax v23.16b, v23.16b, v31.16b, v24.16b - bcax v24.16b, v24.16b, v8.16b, v31.16b - - ld1r {v31.2d}, [x9], #8 - - bcax v17.16b, v25.16b, v19.16b, v3.16b - bcax v18.16b, v3.16b, v15.16b, v19.16b - bcax v19.16b, v19.16b, v16.16b, v15.16b - bcax v15.16b, v15.16b, v25.16b, v16.16b - bcax v16.16b, v16.16b, v3.16b, v25.16b - - bcax v10.16b, v29.16b, v12.16b, v26.16b - bcax v11.16b, v26.16b, v13.16b, v12.16b - bcax v12.16b, v12.16b, v14.16b, v13.16b - bcax v13.16b, v13.16b, v29.16b, v14.16b - bcax v14.16b, v14.16b, v26.16b, v29.16b - - bcax v7.16b, v30.16b, v9.16b, v4.16b - bcax v8.16b, v4.16b, v5.16b, v9.16b - bcax v9.16b, v9.16b, v6.16b, v5.16b - bcax v5.16b, v5.16b, v30.16b, v6.16b - bcax v6.16b, v6.16b, v4.16b, v30.16b - - bcax v3.16b, v27.16b, v0.16b, v28.16b - bcax v4.16b, v28.16b, v1.16b, v0.16b - bcax v0.16b, v0.16b, v2.16b, v1.16b - bcax v1.16b, v1.16b, v27.16b, v2.16b - bcax v2.16b, v2.16b, v28.16b, v27.16b - - eor v0.16b, v0.16b, v31.16b - - cbnz w8, 3b - cond_yield 4f, x8, x9 - cbnz w2, 0b - - /* save state */ -4: st1 { v0.1d- v3.1d}, [x0], #32 - st1 { v4.1d- v7.1d}, [x0], #32 - st1 { v8.1d-v11.1d}, [x0], #32 - st1 {v12.1d-v15.1d}, [x0], #32 - st1 {v16.1d-v19.1d}, [x0], #32 - st1 {v20.1d-v23.1d}, [x0], #32 - st1 {v24.1d}, [x0] - mov w0, w2 - ret -SYM_FUNC_END(sha3_ce_transform) - - .section ".rodata", "a" - .align 8 -.Lsha3_rcon: - .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a - .quad 0x8000000080008000, 0x000000000000808b, 
0x0000000080000001 - .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a - .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a - .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089 - .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080 - .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081 - .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c deleted file mode 100644 index b4f1001046c9..000000000000 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/unaligned.h> - -MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-384"); -MODULE_ALIAS_CRYPTO("sha3-512"); - -asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks, - int md_len); - -static int sha3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - struct crypto_shash *tfm = desc->tfm; - unsigned int bs, ds; - int blocks; - - ds = crypto_shash_digestsize(tfm); - bs = crypto_shash_blocksize(tfm); - blocks = len / bs; - len -= blocks * bs; - do { - int rem; - - kernel_neon_begin(); - rem = sha3_ce_transform(sctx->st, data, blocks, ds); - kernel_neon_end(); - data += (blocks - rem) * bs; - blocks = rem; - } while (blocks); - return len; -} - -static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - struct crypto_shash *tfm = desc->tfm; - __le64 *digest = (__le64 *)out; - u8 block[SHA3_224_BLOCK_SIZE]; - unsigned int bs, ds; - int i; - - ds = crypto_shash_digestsize(tfm); - bs = crypto_shash_blocksize(tfm); - memcpy(block, src, len); - - block[len++] = 0x06; - memset(block + len, 0, bs - len); - block[bs - 1] |= 0x80; - - kernel_neon_begin(); - sha3_ce_transform(sctx->st, block, 1, ds); - kernel_neon_end(); - memzero_explicit(block , sizeof(block)); - - for (i = 0; i < ds / 8; i++) - put_unaligned_le64(sctx->st[i], digest++); - - if (ds & 4) - put_unaligned_le32(sctx->st[i], (__le32 *)digest); - - return 0; -} - -static struct shash_alg algs[] = { { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-224", - .base.cra_driver_name = "sha3-224-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_256_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - 
.base.cra_name = "sha3-256", - .base.cra_driver_name = "sha3-256-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-384", - .base.cra_driver_name = "sha3-384-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_384_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-512", - .base.cra_driver_name = "sha3-512-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -} }; - -static int __init sha3_neon_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha3_neon_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA3, sha3_neon_mod_init); -module_exit(sha3_neon_mod_fini); diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 8433f769f7e1..1df484ed6329 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -796,6 +796,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m @@ -809,8 +810,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 4414dabd04a6..df89105dd520 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -780,6 +780,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m @@ -794,8 +795,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig index 03f73fbd38b6..f838ca055f6d 100644 --- a/arch/s390/crypto/Kconfig +++ b/arch/s390/crypto/Kconfig @@ -2,26 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (s390)" -config CRYPTO_SHA3_256_S390 - tristate "Hash functions: SHA3-224 and SHA3-256" - select CRYPTO_HASH - help - SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. - -config CRYPTO_SHA3_512_S390 - tristate "Hash functions: SHA3-384 and SHA3-512" - select CRYPTO_HASH - help - SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. 
- config CRYPTO_GHASH_S390 tristate "Hash functions: GHASH" select CRYPTO_HASH diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 998f4b656b18..387a229e1038 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -3,8 +3,6 @@ # Cryptographic API # -obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h deleted file mode 100644 index b9cd9572dd35..000000000000 --- a/arch/s390/crypto/sha.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#ifndef _CRYPTO_ARCH_S390_SHA_H -#define _CRYPTO_ARCH_S390_SHA_H - -#include <crypto/hash.h> -#include <crypto/sha2.h> -#include <crypto/sha3.h> -#include <linux/build_bug.h> -#include <linux/types.h> - -/* must be big enough for the largest SHA variant */ -#define CPACF_MAX_PARMBLOCK_SIZE SHA3_STATE_SIZE -#define SHA_MAX_BLOCK_SIZE SHA3_224_BLOCK_SIZE - -struct s390_sha_ctx { - u64 count; /* message length in bytes */ - union { - u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)]; - struct { - u64 state[SHA512_DIGEST_SIZE / sizeof(u64)]; - u64 count_hi; - } sha512; - struct { - __le64 state[SHA3_STATE_SIZE / sizeof(u64)]; - } sha3; - }; - int func; /* KIMD function to use */ - bool first_message_part; -}; - -struct shash_desc; - -int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, - unsigned int len); -int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out); - -static inline void __check_s390_sha_ctx_size(void) -{ - BUILD_BUG_ON(S390_SHA_CTX_SIZE != sizeof(struct s390_sha_ctx)); -} - -#endif diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c deleted file mode 100644 index 03bb4f4bab70..000000000000 --- a/arch/s390/crypto/sha3_256_s390.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. - * - * s390 Version: - * Copyright IBM Corp. 
2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include <asm/cpacf.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include "sha.h" - -static int sha3_256_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->first_message_part = test_facility(86); - if (!sctx->first_message_part) - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_256_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u64 *u64; - } p = { .u8 = out }; - int i; - - if (sctx->first_message_part) { - memset(out, 0, SHA3_STATE_SIZE); - return 0; - } - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++); - return 0; -} - -static int sha3_256_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++)); - sctx->count = 0; - sctx->first_message_part = 0; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_224_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_256_import(desc, in); - sctx->func = CPACF_KIMD_SHA3_224; - return 0; -} - -static struct shash_alg sha3_256_alg = { - .digestsize = SHA3_256_DIGEST_SIZE, /* = 32 */ - .init = sha3_256_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_256_export, - .import = sha3_256_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-256", - .cra_driver_name = "sha3-256-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int sha3_224_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_256_init(desc); - sctx->func = CPACF_KIMD_SHA3_224; - return 0; -} - -static struct shash_alg sha3_224_alg = { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = sha3_224_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_256_export, /* same as for 256 */ - .import = sha3_224_import, /* function code different! 
*/ - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-224", - .cra_driver_name = "sha3-224-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha3_256_s390_init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_256)) - return -ENODEV; - - ret = crypto_register_shash(&sha3_256_alg); - if (ret < 0) - goto out; - - ret = crypto_register_shash(&sha3_224_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_256_alg); -out: - return ret; -} - -static void __exit sha3_256_s390_fini(void) -{ - crypto_unregister_shash(&sha3_224_alg); - crypto_unregister_shash(&sha3_256_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); -module_exit(sha3_256_s390_fini); - -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-256 and SHA3-224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c deleted file mode 100644 index a5c9690eecb1..000000000000 --- a/arch/s390/crypto/sha3_512_s390.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA512 and SHA384 Secure Hash Algorithm. - * - * Copyright IBM Corp. 2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include <asm/cpacf.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include "sha.h" - -static int sha3_512_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->first_message_part = test_facility(86); - if (!sctx->first_message_part) - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_512_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u64 *u64; - } p = { .u8 = out }; - int i; - - if (sctx->first_message_part) { - memset(out, 0, SHA3_STATE_SIZE); - return 0; - } - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++); - return 0; -} - -static int sha3_512_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++)); - sctx->count = 0; - sctx->first_message_part = 0; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_384_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_512_import(desc, in); - sctx->func = CPACF_KIMD_SHA3_384; - return 0; -} - -static struct shash_alg sha3_512_alg = { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = sha3_512_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_512_export, - .import = sha3_512_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-512", - .cra_driver_name = "sha3-512-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; 
- -MODULE_ALIAS_CRYPTO("sha3-512"); - -static int sha3_384_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_512_init(desc); - sctx->func = CPACF_KIMD_SHA3_384; - return 0; -} - -static struct shash_alg sha3_384_alg = { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = sha3_384_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_512_export, /* same as for 512 */ - .import = sha3_384_import, /* function code different! */ - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-384", - .cra_driver_name = "sha3-384-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_384_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_sha_ctx), - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha3-384"); - -static int __init init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_512)) - return -ENODEV; - ret = crypto_register_shash(&sha3_512_alg); - if (ret < 0) - goto out; - ret = crypto_register_shash(&sha3_384_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_512_alg); -out: - return ret; -} - -static void __exit fini(void) -{ - crypto_unregister_shash(&sha3_512_alg); - crypto_unregister_shash(&sha3_384_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-512 and SHA3-384 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c deleted file mode 100644 index d6f839618794..000000000000 --- a/arch/s390/crypto/sha_common.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 
- * Author(s): Jan Glauber (jang@de.ibm.com)
- */
-
-#include <crypto/internal/hash.h>
-#include <linux/export.h>
-#include <linux/module.h>
-#include <asm/cpacf.h>
-#include "sha.h"
-
-int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data,
-			   unsigned int len)
-{
-	unsigned int bsize = crypto_shash_blocksize(desc->tfm);
-	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
-	unsigned int n;
-	int fc;
-
-	fc = ctx->func;
-	if (ctx->first_message_part)
-		fc |= CPACF_KIMD_NIP;
-
-	/* process as many blocks as possible */
-	n = (len / bsize) * bsize;
-	ctx->count += n;
-	switch (ctx->func) {
-	case CPACF_KLMD_SHA_512:
-	case CPACF_KLMD_SHA3_384:
-		if (ctx->count < n)
-			ctx->sha512.count_hi++;
-		break;
-	}
-	cpacf_kimd(fc, ctx->state, data, n);
-	ctx->first_message_part = 0;
-	return len - n;
-}
-EXPORT_SYMBOL_GPL(s390_sha_update_blocks);
-
-static int s390_crypto_shash_parmsize(int func)
-{
-	switch (func) {
-	case CPACF_KLMD_SHA_1:
-		return 20;
-	case CPACF_KLMD_SHA_256:
-		return 32;
-	case CPACF_KLMD_SHA_512:
-		return 64;
-	case CPACF_KLMD_SHA3_224:
-	case CPACF_KLMD_SHA3_256:
-	case CPACF_KLMD_SHA3_384:
-	case CPACF_KLMD_SHA3_512:
-		return 200;
-	default:
-		return -EINVAL;
-	}
-}
-
-int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len,
-		   u8 *out)
-{
-	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
-	int mbl_offset, fc;
-	u64 bits;
-
-	ctx->count += len;
-
-	bits = ctx->count * 8;
-	mbl_offset = s390_crypto_shash_parmsize(ctx->func);
-	if (mbl_offset < 0)
-		return -EINVAL;
-
-	mbl_offset = mbl_offset / sizeof(u32);
-
-	/* set total msg bit length (mbl) in CPACF parmblock */
-	switch (ctx->func) {
-	case CPACF_KLMD_SHA_512:
-		/* The SHA512 parmblock has a 128-bit mbl field. */
-		if (ctx->count < len)
-			ctx->sha512.count_hi++;
-		ctx->sha512.count_hi <<= 3;
-		ctx->sha512.count_hi |= ctx->count >> 61;
-		mbl_offset += sizeof(u64) / sizeof(u32);
-		fallthrough;
-	case CPACF_KLMD_SHA_1:
-	case CPACF_KLMD_SHA_256:
-		memcpy(ctx->state + mbl_offset, &bits, sizeof(bits));
-		break;
-	case CPACF_KLMD_SHA3_224:
-	case CPACF_KLMD_SHA3_256:
-	case CPACF_KLMD_SHA3_384:
-	case CPACF_KLMD_SHA3_512:
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	fc = ctx->func;
-	fc |= test_facility(86) ? CPACF_KLMD_DUFOP : 0;
-	if (ctx->first_message_part)
-		fc |= CPACF_KLMD_NIP;
-	cpacf_klmd(fc, ctx->state, src, len);
-
-	/* copy digest to out */
-	memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm));
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(s390_sha_finup);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("s390 SHA cipher common functions");
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 48d3076b6053..3fd2423d3cf8 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2
	  Architecture: x86_64 using:
	  - AVX2 (Advanced Vector Extensions 2)
 
-config CRYPTO_POLYVAL_CLMUL_NI
-	tristate "Hash functions: POLYVAL (CLMUL-NI)"
-	depends on 64BIT
-	select CRYPTO_POLYVAL
-	help
-	  POLYVAL hash function for HCTR2
-
-	  Architecture: x86_64 using:
-	  - CLMUL-NI (carry-less multiplication new instructions)
-
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
 	depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2d30d5d36145..4a24dd38da50 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -52,9 +52,6 @@ aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 
-obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
-polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-
 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S
deleted file mode 100644
index a6ebe4e7dd2b..000000000000
--- a/arch/x86/crypto/polyval-clmulni_asm.S
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2021 Google LLC
- */
-/*
- * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
- * instructions. It works on 8 blocks at a time, by precomputing the first 8
- * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
- * allows us to split finite field multiplication into two steps.
- *
- * In the first step, we consider h^i, m_i as normal polynomials of degree less
- * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
- * is simply polynomial multiplication.
- *
- * In the second step, we compute the reduction of p(x) modulo the finite field
- * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
- *
- * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
- * multiplication is finite field multiplication. The advantage is that the
- * two-step process only requires 1 finite field reduction for every 8
- * polynomial multiplications. Further parallelism is gained by interleaving the
- * multiplications and polynomial reductions.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STRIDE_BLOCKS 8
-
-#define GSTAR %xmm7
-#define PL %xmm8
-#define PH %xmm9
-#define TMP_XMM %xmm11
-#define LO %xmm12
-#define HI %xmm13
-#define MI %xmm14
-#define SUM %xmm15
-
-#define KEY_POWERS %rdi
-#define MSG %rsi
-#define BLOCKS_LEFT %rdx
-#define ACCUMULATOR %rcx
-#define TMP %rax
-
-.section .rodata.cst16.gstar, "aM", @progbits, 16
-.align 16
-
-.Lgstar:
-	.quad 0xc200000000000000, 0xc200000000000000
-
-.text
-
-/*
- * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
- * count pointed to by MSG and KEY_POWERS.
- */
-.macro schoolbook1 count
-	.set i, 0
-	.rept (\count)
-		schoolbook1_iteration i 0
-		.set i, (i +1)
-	.endr
-.endm
-
-/*
- * Computes the product of two 128-bit polynomials at the memory locations
- * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
- * the 256-bit product into LO, MI, HI.
- *
- * Given:
- * X = [X_1 : X_0]
- * Y = [Y_1 : Y_0]
- *
- * We compute:
- * LO += X_0 * Y_0
- * MI += X_0 * Y_1 + X_1 * Y_0
- * HI += X_1 * Y_1
- *
- * Later, the 256-bit result can be extracted as:
- * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- * This step is done when computing the polynomial reduction for efficiency
- * reasons.
- *
- * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
- * extra multiplication of SUM and h^8.
- */
-.macro schoolbook1_iteration i xor_sum
-	movups (16*\i)(MSG), %xmm0
-	.if (\i == 0 && \xor_sum == 1)
-		pxor SUM, %xmm0
-	.endif
-	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
-	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
-	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
-	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
-	vpxor %xmm2, MI, MI
-	vpxor %xmm1, LO, LO
-	vpxor %xmm4, HI, HI
-	vpxor %xmm3, MI, MI
-.endm
-
-/*
- * Performs the same computation as schoolbook1_iteration, except we expect the
- * arguments to already be loaded into xmm0 and xmm1 and we set the result
- * registers LO, MI, and HI directly rather than XOR'ing into them.
- */
-.macro schoolbook1_noload
-	vpclmulqdq $0x01, %xmm0, %xmm1, MI
-	vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
-	vpclmulqdq $0x00, %xmm0, %xmm1, LO
-	vpclmulqdq $0x11, %xmm0, %xmm1, HI
-	vpxor %xmm2, MI, MI
-.endm
-
-/*
- * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
- * the result in PL, PH.
- * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- */
-.macro schoolbook2
-	vpslldq $8, MI, PL
-	vpsrldq $8, MI, PH
-	pxor LO, PL
-	pxor HI, PH
-.endm
-
-/*
- * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
- *
- * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
- * x^128 + x^127 + x^126 + x^121 + 1.
- *
- * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
- * product of two 128-bit polynomials in Montgomery form. We need to reduce it
- * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
- * of x^128, this product has two extra factors of x^128. To get it back into
- * Montgomery form, we need to remove one of these factors by dividing by x^128.
- *
- * To accomplish both of these goals, we add multiples of g(x) that cancel out
- * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
- * bits are zero, the polynomial division by x^128 can be done by right shifting.
- *
- * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
- * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
- * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
- * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
- * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
- * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
- *
- * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
- * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
- * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
- * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
- * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
- *
- * So our final computation is:
- * T = T_1 : T_0 = g*(x) * P_0
- * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
- * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
- *
- * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
- * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
- * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
- */
-.macro montgomery_reduction dest
-	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
-	pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
-	pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
-	pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
-	pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
-	vpxor TMP_XMM, PH, \dest
-.endm
-
-/*
- * Compute schoolbook multiplication for 8 blocks
- * m_0h^8 + ... + m_7h^1
- *
- * If reduce is set, also computes the montgomery reduction of the
- * previous full_stride call and XORs with the first message block.
- * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
- * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
- */
-.macro full_stride reduce
-	pxor LO, LO
-	pxor HI, HI
-	pxor MI, MI
-
-	schoolbook1_iteration 7 0
-	.if \reduce
-	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 6 0
-	.if \reduce
-	pshufd $0b01001110, TMP_XMM, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 5 0
-	.if \reduce
-	pxor PL, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 4 0
-	.if \reduce
-	pxor TMP_XMM, PH
-	.endif
-
-	schoolbook1_iteration 3 0
-	.if \reduce
-	pclmulqdq $0x11, GSTAR, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 2 0
-	.if \reduce
-	vpxor TMP_XMM, PH, SUM
-	.endif
-
-	schoolbook1_iteration 1 0
-
-	schoolbook1_iteration 0 1
-
-	addq $(8*16), MSG
-	schoolbook2
-.endm
-
-/*
- * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
- */
-.macro partial_stride
-	mov BLOCKS_LEFT, TMP
-	shlq $4, TMP
-	addq $(16*STRIDE_BLOCKS), KEY_POWERS
-	subq TMP, KEY_POWERS
-
-	movups (MSG), %xmm0
-	pxor SUM, %xmm0
-	movaps (KEY_POWERS), %xmm1
-	schoolbook1_noload
-	dec BLOCKS_LEFT
-	addq $16, MSG
-	addq $16, KEY_POWERS
-
-	test $4, BLOCKS_LEFT
-	jz .Lpartial4BlocksDone
-	schoolbook1 4
-	addq $(4*16), MSG
-	addq $(4*16), KEY_POWERS
-.Lpartial4BlocksDone:
-	test $2, BLOCKS_LEFT
-	jz .Lpartial2BlocksDone
-	schoolbook1 2
-	addq $(2*16), MSG
-	addq $(2*16), KEY_POWERS
-.Lpartial2BlocksDone:
-	test $1, BLOCKS_LEFT
-	jz .LpartialDone
-	schoolbook1 1
-.LpartialDone:
-	schoolbook2
-	montgomery_reduction SUM
-.endm
-
-/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
- *
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void clmul_polyval_mul(u8 *op1, const u8 *op2);
- */
-SYM_FUNC_START(clmul_polyval_mul)
-	FRAME_BEGIN
-	vmovdqa .Lgstar(%rip), GSTAR
-	movups (%rdi), %xmm0
-	movups (%rsi), %xmm1
-	schoolbook1_noload
-	schoolbook2
-	montgomery_reduction SUM
-	movups SUM, (%rdi)
-	FRAME_END
-	RET
-SYM_FUNC_END(clmul_polyval_mul)
-
-/*
- * Perform polynomial evaluation as specified by POLYVAL. This computes:
- * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
- * where n=nblocks, h is the hash key, and m_i are the message blocks.
- *
- * rdi - pointer to precomputed key powers h^8 ... h^1
- * rsi - pointer to message blocks
- * rdx - number of blocks to hash
- * rcx - pointer to the accumulator
- *
- * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- *			     const u8 *in, size_t nblocks, u8 *accumulator);
- */
-SYM_FUNC_START(clmul_polyval_update)
-	FRAME_BEGIN
-	vmovdqa .Lgstar(%rip), GSTAR
-	movups (ACCUMULATOR), SUM
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	js .LstrideLoopExit
-	full_stride 0
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	js .LstrideLoopExitReduce
-.LstrideLoop:
-	full_stride 1
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	jns .LstrideLoop
-.LstrideLoopExitReduce:
-	montgomery_reduction SUM
-.LstrideLoopExit:
-	add $STRIDE_BLOCKS, BLOCKS_LEFT
-	jz .LskipPartial
-	partial_stride
-.LskipPartial:
-	movups SUM, (ACCUMULATOR)
-	FRAME_END
-	RET
-SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
deleted file mode 100644
index 6b466867f91a..000000000000
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ /dev/null
@@ -1,180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using PCMULQDQ-NI
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication
- * accelerated by PCLMULQDQ-NI to implement the finite field
- * operations.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define POLYVAL_ALIGN 16
-#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
-#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
-#define NUM_KEY_POWERS 8
-
-struct polyval_tfm_ctx {
-	/*
-	 * These powers must be in the order h^8, ..., h^1.
-	 */
-	u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
-};
-
-struct polyval_desc_ctx {
-	u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
-
-static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
-{
-	return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
-}
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator)
-{
-	kernel_fpu_begin();
-	clmul_polyval_update(keys, in, nblocks, accumulator);
-	kernel_fpu_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
-	kernel_fpu_begin();
-	clmul_polyval_mul(op1, op2);
-	kernel_fpu_end();
-}
-
-static int polyval_x86_setkey(struct crypto_shash *tfm,
-			const u8 *key, unsigned int keylen)
-{
-	struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
-	int i;
-
-	if (keylen != POLYVAL_BLOCK_SIZE)
-		return -EINVAL;
-
-	memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
-	for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
-		memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
-		internal_polyval_mul(tctx->key_powers[i],
-				     tctx->key_powers[i+1]);
-	}
-
-	return 0;
-}
-
-static int polyval_x86_init(struct shash_desc *desc)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	memset(dctx, 0, sizeof(*dctx));
-
-	return 0;
-}
-
-static int polyval_x86_update(struct shash_desc *desc,
-			 const u8 *src, unsigned int srclen)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-	unsigned int nblocks;
-
-	do {
-		/* Allow rescheduling every 4K bytes. */
-		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
-		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
-		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
-		src += nblocks * POLYVAL_BLOCK_SIZE;
-	} while (srclen >= POLYVAL_BLOCK_SIZE);
-
-	return srclen;
-}
-
-static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
-			     unsigned int len, u8 *dst)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-
-	if (len) {
-		crypto_xor(dctx->buffer, src, len);
-		internal_polyval_mul(dctx->buffer,
-				     tctx->key_powers[NUM_KEY_POWERS-1]);
-	}
-
-	memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
-	return 0;
-}
-
-static struct shash_alg polyval_alg = {
-	.digestsize = POLYVAL_DIGEST_SIZE,
-	.init = polyval_x86_init,
-	.update = polyval_x86_update,
-	.finup = polyval_x86_finup,
-	.setkey = polyval_x86_setkey,
-	.descsize = sizeof(struct polyval_desc_ctx),
-	.base = {
-		.cra_name = "polyval",
-		.cra_driver_name = "polyval-clmulni",
-		.cra_priority = 200,
-		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize = POLYVAL_BLOCK_SIZE,
-		.cra_ctxsize = POLYVAL_CTX_SIZE,
-		.cra_module = THIS_MODULE,
-	},
-};
-
-__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
-	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init polyval_clmulni_mod_init(void)
-{
-	if (!x86_match_cpu(pcmul_cpu_id))
-		return -ENODEV;
-
-	if (!boot_cpu_has(X86_FEATURE_AVX))
-		return -ENODEV;
-
-	return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_clmulni_mod_exit(void)
-{
-	crypto_unregister_shash(&polyval_alg);
-}
-
-module_init(polyval_clmulni_mod_init);
-module_exit(polyval_clmulni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-clmulni");
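
Two constructions in the removed code are easier to see in scalar C. First, the deleted s390_sha_finup() stores the total message length in bits in the CPACF parameter block, and SHA-512's length field is 128 bits wide while the context only keeps a 64-bit byte count plus a 64-bit overflow word; multiplying by 8 therefore has to carry the top three bits of the low word into the high word, which is what the count_hi <<= 3; count_hi |= count >> 61 sequence does. A minimal user-space sketch of that split (helper name is illustrative, not a kernel API):

#include <stdint.h>
#include <stdio.h>

/* 128-bit byte count (hi:lo) -> 128-bit bit count, as in s390_sha_finup(). */
static void bytes_to_bits128(uint64_t count_hi, uint64_t count_lo,
                             uint64_t *bits_hi, uint64_t *bits_lo)
{
        *bits_hi = (count_hi << 3) | (count_lo >> 61);  /* carry the top 3 bits */
        *bits_lo = count_lo << 3;                       /* low 64 bits of count * 8 */
}

int main(void)
{
        uint64_t hi, lo;

        /* 2^61 bytes hashed: the bit count is exactly 2^64, so hi = 1, lo = 0. */
        bytes_to_bits128(0, 1ULL << 61, &hi, &lo);
        printf("mbl = %llx:%llx\n", (unsigned long long)hi, (unsigned long long)lo);
        return 0;
}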
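Second, the montgomery_reduction comment in the deleted polyval-clmulni_asm.S fully determines the arithmetic, so it can be cross-checked against a portable model. The sketch below (user-space C; all names illustrative, with clmul64() as a bit-loop stand-in for PCLMULQDQ, and byte-order/serialization questions left out) implements schoolbook step one plus the two folds T = g*(x)*P_0 and V = g*(x)*(P_1 + T_0) from the comment:

#include <stdint.h>
#include <stdio.h>

struct u128 { uint64_t lo, hi; };

/* g*(x): bits 64..127 of g(x) = x^128 + x^127 + x^126 + x^121 + 1 */
#define GSTAR 0xc200000000000000ULL

/* Portable carry-less 64x64 -> 128-bit multiply. */
static struct u128 clmul64(uint64_t a, uint64_t b)
{
        struct u128 r = { 0, 0 };

        for (int i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        r.lo ^= a << i;
                        if (i)
                                r.hi ^= a >> (64 - i);
                }
        }
        return r;
}

/* Reduce P3:P2:P1:P0 mod g(x) and divide by x^128, per the deleted macro:
 *   T = g*(x) * P_0
 *   V = g*(x) * (P_1 + T_0)
 *   result = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 */
static struct u128 montgomery_reduce(uint64_t p3, uint64_t p2,
                                     uint64_t p1, uint64_t p0)
{
        struct u128 t = clmul64(GSTAR, p0);
        struct u128 v = clmul64(GSTAR, p1 ^ t.lo);

        return (struct u128){ .lo = p2 ^ p0 ^ t.hi ^ v.lo,
                              .hi = p3 ^ p1 ^ t.lo ^ v.hi };
}

/* Montgomery multiplication x*y*x^-128 mod g(x): four carry-less multiplies
 * (schoolbook step one), then one reduction, as in clmul_polyval_mul(). */
static struct u128 polyval_mul(struct u128 x, struct u128 y)
{
        struct u128 lo = clmul64(x.lo, y.lo);
        struct u128 hi = clmul64(x.hi, y.hi);
        struct u128 m0 = clmul64(x.lo, y.hi);
        struct u128 m1 = clmul64(x.hi, y.lo);
        uint64_t mi_lo = m0.lo ^ m1.lo;
        uint64_t mi_hi = m0.hi ^ m1.hi;

        /* 256-bit product is [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]. */
        return montgomery_reduce(hi.hi, hi.lo ^ mi_hi, lo.hi ^ mi_lo, lo.lo);
}

int main(void)
{
        struct u128 a = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
        struct u128 b = { 0x2ULL, 0x0ULL };
        struct u128 ab = polyval_mul(a, b);
        struct u128 ba = polyval_mul(b, a);

        /* Field multiplication is commutative; both lines must match. */
        printf("%016llx%016llx\n", (unsigned long long)ab.hi, (unsigned long long)ab.lo);
        printf("%016llx%016llx\n", (unsigned long long)ba.hi, (unsigned long long)ba.lo);
        return 0;
}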
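Finally, the eight-block stride in clmul_polyval_update() is an optimization, not a different function: eight schoolbook multiplies share one deferred reduction, but the result equals a naive Horner fold over the same key powers. A short reference model, written to be appended to the sketch above (it reuses struct u128 and polyval_mul(); names again illustrative):

/* One-block-at-a-time reference for clmul_polyval_update(). Each step folds
 * a message block into the accumulator and multiplies by h, which evaluates
 *   h^n*acc + h^n*m_0 + ... + h^1*m_{n-1}
 * exactly as the 8-block stride does, just without the deferred reduction. */
static void polyval_update_ref(struct u128 *acc, struct u128 h,
                               const struct u128 *msg, size_t nblocks)
{
        for (size_t i = 0; i < nblocks; i++) {
                acc->lo ^= msg[i].lo;
                acc->hi ^= msg[i].hi;
                *acc = polyval_mul(*acc, h);
        }
}

/* Key-power table in the order the deleted polyval_x86_setkey() stores it:
 * h^8 in slot 0 down to h^1 in slot 7. */
static void precompute_key_powers(struct u128 powers[8], struct u128 h)
{
        powers[7] = h;                          /* h^1 in the last slot */
        for (int i = 6; i >= 0; i--)
                powers[i] = polyval_mul(powers[i + 1], h);
}

Precomputing the powers front-loads the per-key cost so that the hot loop touches only loads, carry-less multiplies and XORs, which is the design choice the 8-block stride exploits.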