| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742 |
- /* SPDX-License-Identifier: GPL-2.0-or-later */
- /*
- * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
- * as specified in rfc8998
- * https://datatracker.ietf.org/doc/html/rfc8998
- *
- * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
- */
- #include <linux/linkage.h>
- #include <linux/cfi_types.h>
- #include <asm/assembler.h>
- #include "sm4-ce-asm.h"
- .arch armv8-a+crypto
- .irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31 /* define symbol .Lv<N>.4s = N for each listed register number */
- .set .Lv\b\().4s, \b
- .endr
- .macro sm4e, vd, vn /* emit "sm4e vd, vn" as a raw opcode; operands must be v<N>.4s with N from the .irp list above */
- .inst 0xcec08400 | (.L\vn << 5) | .L\vd /* vn number goes in bits 5 and up, vd number in the low bits */
- .endm
- /* Register macros */
- /*
- * Used for both encryption and decryption:
- * RHASH - running GHASH accumulator, kept in rbit'ed form while in a register
- * RRCONST - reduction constant, loaded from .Lghash_rconst with ld1r
- * RZERO - all-zero vector; the ext-based shifts in the helper macros rely on it
- */
- #define RHASH v21
- #define RRCONST v22
- #define RZERO v23
- /* Helper macros. */
- /*
- * Carry-less 128x128 -> 256-bit multiply, built from four 64x64 pmull/pmull2
- * products (low*low, high*high and the two cross products).
- * input: m0, m1
- * output: r0:r1 (low 128-bits in r0, high in r1)
- * clobbers: T0, T1
- * requires: RZERO holds zero
- */
- #define PMUL_128x128(r0, r1, m0, m1, T0, T1) \
- ext T0.16b, m1.16b, m1.16b, #8; /* T0 = m1 with its 64-bit halves swapped */ \
- pmull r0.1q, m0.1d, m1.1d; /* m0.low * m1.low */ \
- pmull T1.1q, m0.1d, T0.1d; /* m0.low * m1.high */ \
- pmull2 T0.1q, m0.2d, T0.2d; /* m0.high * m1.low */ \
- pmull2 r1.1q, m0.2d, m1.2d; /* m0.high * m1.high */ \
- eor T0.16b, T0.16b, T1.16b; /* sum of the two cross products */ \
- ext T1.16b, RZERO.16b, T0.16b, #8; /* T1 = cross sum shifted left by 64 bits */ \
- ext T0.16b, T0.16b, RZERO.16b, #8; /* T0 = cross sum shifted right by 64 bits */ \
- eor r0.16b, r0.16b, T1.16b; \
- eor r1.16b, r1.16b, T0.16b;
- #define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1, \
- r2, r3, m2, m3, T2, T3, \
- r4, r5, m4, m5, T4, T5, \
- r6, r7, m6, m7, T6, T7) /* four independent PMUL_128x128 products, interleaved step by step; the four results are NOT summed here */ \
- ext T0.16b, m1.16b, m1.16b, #8; /* swap the 64-bit halves of the second operands */ \
- ext T2.16b, m3.16b, m3.16b, #8; \
- ext T4.16b, m5.16b, m5.16b, #8; \
- ext T6.16b, m7.16b, m7.16b, #8; \
- pmull r0.1q, m0.1d, m1.1d; /* low * low */ \
- pmull r2.1q, m2.1d, m3.1d; \
- pmull r4.1q, m4.1d, m5.1d; \
- pmull r6.1q, m6.1d, m7.1d; \
- pmull T1.1q, m0.1d, T0.1d; /* first cross product */ \
- pmull T3.1q, m2.1d, T2.1d; \
- pmull T5.1q, m4.1d, T4.1d; \
- pmull T7.1q, m6.1d, T6.1d; \
- pmull2 T0.1q, m0.2d, T0.2d; /* second cross product */ \
- pmull2 T2.1q, m2.2d, T2.2d; \
- pmull2 T4.1q, m4.2d, T4.2d; \
- pmull2 T6.1q, m6.2d, T6.2d; \
- pmull2 r1.1q, m0.2d, m1.2d; /* high * high; last read of m0/m2/m4/m6, so r1/r3/r5/r7 may alias them */ \
- pmull2 r3.1q, m2.2d, m3.2d; \
- pmull2 r5.1q, m4.2d, m5.2d; \
- pmull2 r7.1q, m6.2d, m7.2d; \
- eor T0.16b, T0.16b, T1.16b; /* sum the cross products */ \
- eor T2.16b, T2.16b, T3.16b; \
- eor T4.16b, T4.16b, T5.16b; \
- eor T6.16b, T6.16b, T7.16b; \
- ext T1.16b, RZERO.16b, T0.16b, #8; /* cross sum shifted left by 64 bits */ \
- ext T3.16b, RZERO.16b, T2.16b, #8; \
- ext T5.16b, RZERO.16b, T4.16b, #8; \
- ext T7.16b, RZERO.16b, T6.16b, #8; \
- ext T0.16b, T0.16b, RZERO.16b, #8; /* cross sum shifted right by 64 bits */ \
- ext T2.16b, T2.16b, RZERO.16b, #8; \
- ext T4.16b, T4.16b, RZERO.16b, #8; \
- ext T6.16b, T6.16b, RZERO.16b, #8; \
- eor r0.16b, r0.16b, T1.16b; /* fold into the low 128 bits */ \
- eor r2.16b, r2.16b, T3.16b; \
- eor r4.16b, r4.16b, T5.16b; \
- eor r6.16b, r6.16b, T7.16b; \
- eor r1.16b, r1.16b, T0.16b; /* fold into the high 128 bits */ \
- eor r3.16b, r3.16b, T2.16b; \
- eor r5.16b, r5.16b, T4.16b; \
- eor r7.16b, r7.16b, T6.16b;
- /*
- * Fold a 256-bit carry-less product down to 128 bits using rconst.
- * input: r0:r1 (low 128-bits in r0, high in r1)
- * output: a
- * clobbers: T0, T1, and also modifies r0 and r1
- * requires: RZERO holds zero; rconst holds the constant in both 64-bit lanes
- */
- #define REDUCTION(a, r0, r1, rconst, T0, T1) \
- pmull2 T0.1q, r1.2d, rconst.2d; /* r1.high * rconst */ \
- ext T1.16b, T0.16b, RZERO.16b, #8; /* T1 = that product shifted right by 64 bits */ \
- ext T0.16b, RZERO.16b, T0.16b, #8; /* T0 = that product shifted left by 64 bits */ \
- eor r1.16b, r1.16b, T1.16b; \
- eor r0.16b, r0.16b, T0.16b; \
- pmull T0.1q, r1.1d, rconst.1d; /* (updated) r1.low * rconst */ \
- eor a.16b, r0.16b, T0.16b;
- #define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1) /* SM4 on block b0 (in place), interleaved with the PMUL_128x128(r0, r1, m0, m1, T0, T1) instruction sequence */ \
- rev32 b0.16b, b0.16b; /* byte-swap each 32-bit word before the sm4e rounds */ \
- ext T0.16b, m1.16b, m1.16b, #8; \
- sm4e b0.4s, v24.4s; /* eight sm4e steps using v24-v31; presumably the round keys loaded by SM4_PREPARE (sm4-ce-asm.h) - confirm */ \
- pmull r0.1q, m0.1d, m1.1d; \
- sm4e b0.4s, v25.4s; \
- pmull T1.1q, m0.1d, T0.1d; \
- sm4e b0.4s, v26.4s; \
- pmull2 T0.1q, m0.2d, T0.2d; \
- sm4e b0.4s, v27.4s; \
- pmull2 r1.1q, m0.2d, m1.2d; \
- sm4e b0.4s, v28.4s; \
- eor T0.16b, T0.16b, T1.16b; \
- sm4e b0.4s, v29.4s; \
- ext T1.16b, RZERO.16b, T0.16b, #8; \
- sm4e b0.4s, v30.4s; \
- ext T0.16b, T0.16b, RZERO.16b, #8; \
- sm4e b0.4s, v31.4s; \
- eor r0.16b, r0.16b, T1.16b; \
- rev64 b0.4s, b0.4s; /* rev64 + ext #8 together reverse the order of the four 32-bit words */ \
- eor r1.16b, r1.16b, T0.16b; \
- ext b0.16b, b0.16b, b0.16b, #8; \
- rev32 b0.16b, b0.16b; /* byte-swap each 32-bit word again for the output */
- #define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2, \
- r0, r1, m0, m1, T0, T1, \
- r2, r3, m2, m3, T2, T3, \
- r4, r5, m4, m5, T4, T5) /* SM4 on b0-b2 (in place) interleaved with three 128x128 multiplies; the three products are summed into r0:r1 */ \
- rev32 b0.16b, b0.16b; /* byte-swap each 32-bit word before the sm4e rounds */ \
- rev32 b1.16b, b1.16b; \
- rev32 b2.16b, b2.16b; \
- ext T0.16b, m1.16b, m1.16b, #8; /* swap the 64-bit halves of the second operands */ \
- ext T2.16b, m3.16b, m3.16b, #8; \
- ext T4.16b, m5.16b, m5.16b, #8; \
- sm4e b0.4s, v24.4s; \
- sm4e b1.4s, v24.4s; \
- sm4e b2.4s, v24.4s; \
- pmull r0.1q, m0.1d, m1.1d; /* low * low */ \
- pmull r2.1q, m2.1d, m3.1d; \
- pmull r4.1q, m4.1d, m5.1d; \
- sm4e b0.4s, v25.4s; \
- sm4e b1.4s, v25.4s; \
- sm4e b2.4s, v25.4s; \
- pmull T1.1q, m0.1d, T0.1d; /* first cross product */ \
- pmull T3.1q, m2.1d, T2.1d; \
- pmull T5.1q, m4.1d, T4.1d; \
- sm4e b0.4s, v26.4s; \
- sm4e b1.4s, v26.4s; \
- sm4e b2.4s, v26.4s; \
- pmull2 T0.1q, m0.2d, T0.2d; /* second cross product */ \
- pmull2 T2.1q, m2.2d, T2.2d; \
- pmull2 T4.1q, m4.2d, T4.2d; \
- sm4e b0.4s, v27.4s; \
- sm4e b1.4s, v27.4s; \
- sm4e b2.4s, v27.4s; \
- pmull2 r1.1q, m0.2d, m1.2d; /* high * high; last read of m0/m2/m4, so r1/r3/r5 may alias them */ \
- pmull2 r3.1q, m2.2d, m3.2d; \
- pmull2 r5.1q, m4.2d, m5.2d; \
- sm4e b0.4s, v28.4s; \
- sm4e b1.4s, v28.4s; \
- sm4e b2.4s, v28.4s; \
- eor T0.16b, T0.16b, T1.16b; /* sum the cross products */ \
- eor T2.16b, T2.16b, T3.16b; \
- eor T4.16b, T4.16b, T5.16b; \
- sm4e b0.4s, v29.4s; \
- sm4e b1.4s, v29.4s; \
- sm4e b2.4s, v29.4s; \
- ext T1.16b, RZERO.16b, T0.16b, #8; /* cross sum shifted left by 64 bits */ \
- ext T3.16b, RZERO.16b, T2.16b, #8; \
- ext T5.16b, RZERO.16b, T4.16b, #8; \
- sm4e b0.4s, v30.4s; \
- sm4e b1.4s, v30.4s; \
- sm4e b2.4s, v30.4s; \
- ext T0.16b, T0.16b, RZERO.16b, #8; /* cross sum shifted right by 64 bits */ \
- ext T2.16b, T2.16b, RZERO.16b, #8; \
- ext T4.16b, T4.16b, RZERO.16b, #8; \
- sm4e b0.4s, v31.4s; \
- sm4e b1.4s, v31.4s; \
- sm4e b2.4s, v31.4s; \
- eor r0.16b, r0.16b, T1.16b; /* fold into the low 128 bits */ \
- eor r2.16b, r2.16b, T3.16b; \
- eor r4.16b, r4.16b, T5.16b; \
- rev64 b0.4s, b0.4s; /* rev64 + ext #8 together reverse the order of the four 32-bit words */ \
- rev64 b1.4s, b1.4s; \
- rev64 b2.4s, b2.4s; \
- eor r1.16b, r1.16b, T0.16b; /* fold into the high 128 bits */ \
- eor r3.16b, r3.16b, T2.16b; \
- eor r5.16b, r5.16b, T4.16b; \
- ext b0.16b, b0.16b, b0.16b, #8; \
- ext b1.16b, b1.16b, b1.16b, #8; \
- ext b2.16b, b2.16b, b2.16b, #8; \
- eor r0.16b, r0.16b, r2.16b; /* r0:r1 ^= r2:r3 */ \
- eor r1.16b, r1.16b, r3.16b; \
- rev32 b0.16b, b0.16b; /* byte-swap each 32-bit word again for the output */ \
- rev32 b1.16b, b1.16b; \
- rev32 b2.16b, b2.16b; \
- eor r0.16b, r0.16b, r4.16b; /* r0:r1 ^= r4:r5 */ \
- eor r1.16b, r1.16b, r5.16b;
- #define inc32_le128(vctr) /* vctr = current counter held in x8:x9, then add 1 to the low 32 bits of x9; clobbers w6 */ \
- mov vctr.d[1], x9; \
- add w6, w9, #1; /* 32-bit add, so the increment wraps without carrying into the upper bits */ \
- mov vctr.d[0], x8; \
- bfi x9, x6, #0, #32; /* write the incremented value back into bits 0-31 of x9 only */ \
- rev64 vctr.16b, vctr.16b; /* reverse the bytes of each 64-bit half of the counter block */
- #define GTAG_HASH_LENGTHS(vctr0, vlen) /* hash the lengths block at [x7] and produce the tag in RHASH; clobbers x6 and the low 32 bits of x9 */ \
- ld1 {vlen.16b}, [x7]; \
- /* construct CTR0 */ \
- /* the lower 32 bits of the initial counter block are always be32(1) */ \
- mov x6, #0x1; \
- bfi x9, x6, #0, #32; \
- mov vctr0.d[0], x8; \
- mov vctr0.d[1], x9; \
- rbit vlen.16b, vlen.16b; \
- rev64 vctr0.16b, vctr0.16b; \
- /* authtag = GCTR(CTR0, GHASH) */ \
- eor RHASH.16b, RHASH.16b, vlen.16b; \
- SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
- RTMP0, RTMP1); /* encrypt CTR0 while multiplying (RHASH ^ lengths) by RH1 */ \
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3); \
- rbit RHASH.16b, RHASH.16b; /* undo the rbit'ed representation */ \
- eor RHASH.16b, RHASH.16b, vctr0.16b; /* tag = GHASH ^ E(K, CTR0) */
- /* Register macros for encrypt and ghash */
- /*
- * RR1/RR3/RR5/RR7 (high product halves) can be the same as input v0-v3:
- * PMUL_128x128_4x writes them only in the instruction that reads the
- * corresponding input for the last time.
- * RR0/RR2/RR4/RR6 hold the low product halves, RTMP0-RTMP7 are scratch,
- * RH1-RH4 hold the four ghash table entries.
- */
- #define RR1 v0
- #define RR3 v1
- #define RR5 v2
- #define RR7 v3
- #define RR0 v4
- #define RR2 v5
- #define RR4 v6
- #define RR6 v7
- #define RTMP0 v8
- #define RTMP1 v9
- #define RTMP2 v10
- #define RTMP3 v11
- #define RTMP4 v12
- #define RTMP5 v13
- #define RTMP6 v14
- #define RTMP7 v15
- #define RH1 v16
- #define RH2 v17
- #define RH3 v18
- #define RH4 v19
- .align 3
- SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
- /*
- * Derive the hash key H by encrypting an all-zero block, then store
- * H^1, H^2, H^3 and H^4 (64 bytes, in rbit'ed form) to the ghash table.
- * input:
- * x0: round key array, CTX
- * x1: ghash table
- */
- SM4_PREPARE(x0) /* defined in sm4-ce-asm.h; presumably loads the round keys from x0 into v24-v31 - confirm */
- adr_l x2, .Lghash_rconst
- ld1r {RRCONST.2d}, [x2] /* replicate the reduction constant into both 64-bit lanes */
- eor RZERO.16b, RZERO.16b, RZERO.16b
- /* H = E(K, 0^128) */
- rev32 v0.16b, RZERO.16b /* v0 = 0 (byte-swapping zero is still zero) */
- SM4_CRYPT_BLK_BE(v0) /* defined in sm4-ce-asm.h */
- /* H ^ 1 (here and below, ^ denotes exponentiation) */
- rbit RH1.16b, v0.16b
- /* H ^ 2 = H * H */
- PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
- REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)
- /* H ^ 3 = H^2 * H */
- PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
- REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)
- /* H ^ 4 = H^2 * H^2 */
- PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
- REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)
- st1 {RH1.16b-RH4.16b}, [x1]
- ret
- SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
- .align 3
- SYM_FUNC_START(pmull_ghash_update)
- /*
- * Fold nblocks 16-byte blocks from src into the GHASH accumulator.
- * Blocks are consumed four at a time using H^4..H^1 while at least four
- * remain, then one at a time using H^1.
- * input:
- * x0: ghash table (four 16-byte entries, as stored by sm4_ce_pmull_ghash_setup)
- * x1: ghash result (16 bytes, read on entry and written back on exit)
- * x2: src
- * w3: nblocks (0 is accepted and leaves the ghash result unchanged)
- */
- ld1 {RH1.16b-RH4.16b}, [x0]
- ld1 {RHASH.16b}, [x1]
- rbit RHASH.16b, RHASH.16b
- adr_l x4, .Lghash_rconst
- ld1r {RRCONST.2d}, [x4]
- eor RZERO.16b, RZERO.16b, RZERO.16b
- /*
- * Nothing to hash. Without this check the code would fall into the 1x
- * loop, wrap w3 to 0xffffffff and read far past the end of src.
- */
- cbz w3, .Lghash_end
- .Lghash_loop_4x:
- cmp w3, #4
- blt .Lghash_loop_1x
- sub w3, w3, #4
- ld1 {v0.16b-v3.16b}, [x2], #64
- rbit v0.16b, v0.16b
- rbit v1.16b, v1.16b
- rbit v2.16b, v2.16b
- rbit v3.16b, v3.16b
- /*
- * (in0 ^ HASH) * H^4 => rr0:rr1
- * (in1) * H^3 => rr2:rr3
- * (in2) * H^2 => rr4:rr5
- * (in3) * H^1 => rr6:rr7
- */
- eor RHASH.16b, RHASH.16b, v0.16b
- PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
- RR2, RR3, v1, RH3, RTMP2, RTMP3,
- RR4, RR5, v2, RH2, RTMP4, RTMP5,
- RR6, RR7, v3, RH1, RTMP6, RTMP7)
- /* sum the four 256-bit products so that a single reduction suffices */
- eor RR0.16b, RR0.16b, RR2.16b
- eor RR1.16b, RR1.16b, RR3.16b
- eor RR0.16b, RR0.16b, RR4.16b
- eor RR1.16b, RR1.16b, RR5.16b
- eor RR0.16b, RR0.16b, RR6.16b
- eor RR1.16b, RR1.16b, RR7.16b
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
- cbz w3, .Lghash_end
- b .Lghash_loop_4x
- .Lghash_loop_1x:
- /* w3 is in 1..3 here: zero never reaches this label */
- sub w3, w3, #1
- ld1 {v0.16b}, [x2], #16
- rbit v0.16b, v0.16b
- eor RHASH.16b, RHASH.16b, v0.16b
- PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
- cbnz w3, .Lghash_loop_1x
- .Lghash_end:
- /* undo the rbit'ed representation before writing the result back */
- rbit RHASH.16b, RHASH.16b
- st1 {RHASH.2d}, [x1]
- ret
- SYM_FUNC_END(pmull_ghash_update)
- .align 3
- SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
- /*
- * Encrypt nbytes from src to dst by XORing with encrypted counter blocks
- * and fold the produced ciphertext into the GHASH accumulator.
- * If x7 is non-zero, the lengths block is hashed as well and the final tag
- * is written to the ghash result; the counter is not written back then.
- * If x7 is zero, the advanced counter and the running hash are written back.
- * input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: ctr (big endian, 128 bit)
- * w4: nbytes
- * x5: ghash result
- * x6: ghash table
- * x7: lengths (only for last block)
- */
- SM4_PREPARE(x0) /* defined in sm4-ce-asm.h; presumably loads the round keys from x0 into v24-v31 - confirm */
- ldp x8, x9, [x3]
- rev x8, x8 /* byte-swap both counter halves so x9 can be incremented with plain integer arithmetic */
- rev x9, x9
- ld1 {RH1.16b-RH4.16b}, [x6]
- ld1 {RHASH.16b}, [x5]
- rbit RHASH.16b, RHASH.16b
- adr_l x6, .Lghash_rconst /* x6 is free for reuse: the ghash table has been loaded */
- ld1r {RRCONST.2d}, [x6]
- eor RZERO.16b, RZERO.16b, RZERO.16b
- cbz w4, .Lgcm_enc_hash_len /* no data: go straight to the lengths/tag step */
- .Lgcm_enc_loop_4x:
- cmp w4, #(4 * 16)
- blt .Lgcm_enc_loop_1x
- sub w4, w4, #(4 * 16)
- /* construct CTRs */
- inc32_le128(v0) /* +0 */
- inc32_le128(v1) /* +1 */
- inc32_le128(v2) /* +2 */
- inc32_le128(v3) /* +3 */
- ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64
- SM4_CRYPT_BLK4(v0, v1, v2, v3) /* defined in sm4-ce-asm.h */
- eor v0.16b, v0.16b, RTMP0.16b
- eor v1.16b, v1.16b, RTMP1.16b
- eor v2.16b, v2.16b, RTMP2.16b
- eor v3.16b, v3.16b, RTMP3.16b
- st1 {v0.16b-v3.16b}, [x1], #64
- /* ghash update over the ciphertext just produced */
- rbit v0.16b, v0.16b
- rbit v1.16b, v1.16b
- rbit v2.16b, v2.16b
- rbit v3.16b, v3.16b
- /*
- * (in0 ^ HASH) * H^4 => rr0:rr1
- * (in1) * H^3 => rr2:rr3
- * (in2) * H^2 => rr4:rr5
- * (in3) * H^1 => rr6:rr7
- */
- eor RHASH.16b, RHASH.16b, v0.16b
- PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
- RR2, RR3, v1, RH3, RTMP2, RTMP3,
- RR4, RR5, v2, RH2, RTMP4, RTMP5,
- RR6, RR7, v3, RH1, RTMP6, RTMP7)
- eor RR0.16b, RR0.16b, RR2.16b /* sum the four products so one reduction suffices */
- eor RR1.16b, RR1.16b, RR3.16b
- eor RR0.16b, RR0.16b, RR4.16b
- eor RR1.16b, RR1.16b, RR5.16b
- eor RR0.16b, RR0.16b, RR6.16b
- eor RR1.16b, RR1.16b, RR7.16b
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
- cbz w4, .Lgcm_enc_hash_len
- b .Lgcm_enc_loop_4x
- .Lgcm_enc_loop_1x:
- cmp w4, #16
- blt .Lgcm_enc_tail
- sub w4, w4, #16
- /* construct CTRs */
- inc32_le128(v0)
- ld1 {RTMP0.16b}, [x2], #16
- SM4_CRYPT_BLK(v0) /* defined in sm4-ce-asm.h */
- eor v0.16b, v0.16b, RTMP0.16b
- st1 {v0.16b}, [x1], #16
- /* ghash update */
- rbit v0.16b, v0.16b
- eor RHASH.16b, RHASH.16b, v0.16b
- PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
- cbz w4, .Lgcm_enc_hash_len
- b .Lgcm_enc_loop_1x
- .Lgcm_enc_tail:
- /* partial final block: w4 is in 1..15 here */
- /* construct CTRs */
- inc32_le128(v0)
- SM4_CRYPT_BLK(v0)
- /* load permute table: the 16 bytes starting at table + 32 - nbytes */
- adr_l x0, .Lcts_permute_table /* x0 (round key pointer) is no longer needed */
- add x0, x0, #32
- sub x0, x0, w4, uxtw
- ld1 {v3.16b}, [x0]
- .Lgcm_enc_tail_loop:
- /* do encrypt */
- ldrb w0, [x2], #1 /* get 1 byte from input */
- umov w6, v0.b[0] /* get the next keystream byte (byte 0 of v0) */
- eor w6, w6, w0 /* w6 = CTR ^ input */
- strb w6, [x1], #1 /* store out byte */
- /* rotate v0 by one byte so the next keystream byte moves to byte 0 */
- ext v0.16b, v0.16b, v0.16b, #1
- /* collect the ciphertext byte in the top byte; earlier ones move down with each rotation */
- ins v0.b[15], w6
- subs w4, w4, #1
- bne .Lgcm_enc_tail_loop
- /* move the collected ciphertext bytes to the front and zero the rest (0xff indices yield 0) */
- tbl v0.16b, {v0.16b}, v3.16b
- /* ghash update */
- rbit v0.16b, v0.16b
- eor RHASH.16b, RHASH.16b, v0.16b
- PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
- .Lgcm_enc_hash_len:
- cbz x7, .Lgcm_enc_end /* no lengths block: this is not the final call */
- GTAG_HASH_LENGTHS(v1, v3) /* leaves the finished tag in RHASH, already converted back from rbit'ed form */
- b .Lgcm_enc_ret
- .Lgcm_enc_end:
- /* store new CTR */
- rev x8, x8
- rev x9, x9
- stp x8, x9, [x3]
- rbit RHASH.16b, RHASH.16b /* undo the rbit'ed representation before storing the running hash */
- .Lgcm_enc_ret:
- /* store new MAC */
- st1 {RHASH.2d}, [x5]
- ret
- SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
- #undef RR1 /* drop the encrypt-side register assignments before redefining them for decrypt */
- #undef RR3
- #undef RR5
- #undef RR7
- #undef RR0
- #undef RR2
- #undef RR4
- #undef RR6
- #undef RTMP0
- #undef RTMP1
- #undef RTMP2
- #undef RTMP3
- #undef RTMP4
- #undef RTMP5
- #undef RTMP6
- #undef RTMP7
- #undef RH1
- #undef RH2
- #undef RH3
- #undef RH4
- /* Register macros for decrypt */
- /*
- * v0-v2 for building CTRs, v3-v5 for saving inputs.
- * The decrypt path works on three blocks at a time, so only RR0-RR5,
- * RTMP0-RTMP5 and RH1-RH3 are defined here.
- */
- #define RR1 v6
- #define RR3 v7
- #define RR5 v8
- #define RR0 v9
- #define RR2 v10
- #define RR4 v11
- #define RTMP0 v12
- #define RTMP1 v13
- #define RTMP2 v14
- #define RTMP3 v15
- #define RTMP4 v16
- #define RTMP5 v17
- #define RH1 v18
- #define RH2 v19
- #define RH3 v20
- .align 3
- SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
- /*
- * Decrypt nbytes from src to dst by XORing with encrypted counter blocks
- * and fold the input (ciphertext) into the GHASH accumulator.
- * If x7 is non-zero, the lengths block is hashed as well and the final tag
- * is written to the ghash result; the counter is not written back then.
- * If x7 is zero, the advanced counter and the running hash are written back.
- * input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: ctr (big endian, 128 bit)
- * w4: nbytes
- * x5: ghash result
- * x6: ghash table
- * x7: lengths (only for last block)
- */
- SM4_PREPARE(x0) /* defined in sm4-ce-asm.h; presumably loads the round keys from x0 into v24-v31 - confirm */
- ldp x8, x9, [x3]
- rev x8, x8 /* byte-swap both counter halves so x9 can be incremented with plain integer arithmetic */
- rev x9, x9
- ld1 {RH1.16b-RH3.16b}, [x6] /* only the first three table entries are used on this path */
- ld1 {RHASH.16b}, [x5]
- rbit RHASH.16b, RHASH.16b
- adr_l x6, .Lghash_rconst /* x6 is free for reuse: the ghash table has been loaded */
- ld1r {RRCONST.2d}, [x6]
- eor RZERO.16b, RZERO.16b, RZERO.16b
- cbz w4, .Lgcm_dec_hash_len /* no data: go straight to the lengths/tag step */
- .Lgcm_dec_loop_3x:
- cmp w4, #(3 * 16)
- blt .Lgcm_dec_loop_1x
- sub w4, w4, #(3 * 16)
- ld1 {v3.16b-v5.16b}, [x2], #(3 * 16)
- /* construct CTRs */
- inc32_le128(v0) /* +0 */
- rbit v6.16b, v3.16b /* v6-v8 = rbit'ed ciphertext for GHASH; v6-v8 are also RR1/RR3/RR5 and get overwritten by the macro below */
- inc32_le128(v1) /* +1 */
- rbit v7.16b, v4.16b
- inc32_le128(v2) /* +2 */
- rbit v8.16b, v5.16b
- eor RHASH.16b, RHASH.16b, v6.16b
- /* decrypt & ghash update: (in0 ^ HASH) * H^3, in1 * H^2, in2 * H^1, summed into RR0:RR1 */
- SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
- RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
- RR2, RR3, v7, RH2, RTMP2, RTMP3,
- RR4, RR5, v8, RH1, RTMP4, RTMP5)
- eor v0.16b, v0.16b, v3.16b /* plaintext = keystream ^ saved ciphertext */
- eor v1.16b, v1.16b, v4.16b
- eor v2.16b, v2.16b, v5.16b
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
- st1 {v0.16b-v2.16b}, [x1], #(3 * 16)
- cbz w4, .Lgcm_dec_hash_len
- b .Lgcm_dec_loop_3x
- .Lgcm_dec_loop_1x:
- cmp w4, #16
- blt .Lgcm_dec_tail
- sub w4, w4, #16
- ld1 {v3.16b}, [x2], #16
- /* construct CTRs */
- inc32_le128(v0)
- rbit v6.16b, v3.16b
- eor RHASH.16b, RHASH.16b, v6.16b
- SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
- eor v0.16b, v0.16b, v3.16b
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
- st1 {v0.16b}, [x1], #16
- cbz w4, .Lgcm_dec_hash_len
- b .Lgcm_dec_loop_1x
- .Lgcm_dec_tail:
- /* partial final block: w4 is in 1..15 here */
- /* construct CTRs */
- inc32_le128(v0)
- SM4_CRYPT_BLK(v0) /* defined in sm4-ce-asm.h */
- /* load permute table: the 16 bytes starting at table + 32 - nbytes */
- adr_l x0, .Lcts_permute_table /* x0 (round key pointer) is no longer needed */
- add x0, x0, #32
- sub x0, x0, w4, uxtw
- ld1 {v3.16b}, [x0]
- .Lgcm_dec_tail_loop:
- /* do decrypt */
- ldrb w0, [x2], #1 /* get 1 byte from input */
- umov w6, v0.b[0] /* get the next keystream byte (byte 0 of v0) */
- eor w6, w6, w0 /* w6 = CTR ^ input */
- strb w6, [x1], #1 /* store out byte */
- /* rotate v0 by one byte so the next keystream byte moves to byte 0 */
- ext v0.16b, v0.16b, v0.16b, #1
- /* collect the INPUT (ciphertext) byte w0, not the output w6, in the top byte: GHASH covers the ciphertext */
- ins v0.b[15], w0
- subs w4, w4, #1
- bne .Lgcm_dec_tail_loop
- /* move the collected ciphertext bytes to the front and zero the rest (0xff indices yield 0) */
- tbl v0.16b, {v0.16b}, v3.16b
- /* ghash update */
- rbit v0.16b, v0.16b
- eor RHASH.16b, RHASH.16b, v0.16b
- PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
- REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
- .Lgcm_dec_hash_len:
- cbz x7, .Lgcm_dec_end /* no lengths block: this is not the final call */
- GTAG_HASH_LENGTHS(v1, v3) /* leaves the finished tag in RHASH, already converted back from rbit'ed form */
- b .Lgcm_dec_ret
- .Lgcm_dec_end:
- /* store new CTR */
- rev x8, x8
- rev x9, x9
- stp x8, x9, [x3]
- rbit RHASH.16b, RHASH.16b /* undo the rbit'ed representation before storing the running hash */
- .Lgcm_dec_ret:
- /* store new MAC */
- st1 {RHASH.2d}, [x5]
- ret
- SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
- .section ".rodata", "a"
- .align 4
- .Lcts_permute_table: /* tbl index source: 16 x 0xff, 0x0..0xf, 16 x 0xff; the tail code loads 16 bytes at offset 32 - nbytes */
- .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
- .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
- .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
- .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
- .Lghash_rconst: /* constant used by REDUCTION; 0x87 has bits 7, 2, 1 and 0 set (x^7 + x^2 + x + 1) */
- .quad 0x87
|