# This submission to NSS is to be made available under the terms of the
# Mozilla Public License, v. 2.0. You can obtain one at //mozilla.org/MPL/2.0/
# Copyright(c) 2021, Niels Möller and Mamone Tarsha

# Registers:

.set SP, 1
.set TOCP, 2

.macro VEC_LOAD_DATA VR, DATA, GPR
addis \GPR, 2, \DATA@got@ha
ld \GPR, \DATA@got@l(\GPR)
lvx \VR, 0, \GPR
.endm

.macro VEC_LOAD VR, GPR, IDX
lxvd2x \VR+32, \IDX, \GPR
vperm \VR, \VR, \VR, SWAP_MASK
.endm

.macro VEC_LOAD_INC VR, GPR, IDX
lxvd2x \VR+32, \IDX, \GPR
addi \IDX,\IDX,16
vperm \VR, \VR, \VR, SWAP_MASK
.endm

.macro VEC_STORE VR, GPR, IDX
vperm \VR, \VR, \VR, SWAP_MASK
stxvd2x \VR+32, \IDX, \GPR
.endm

# 0 < LEN < 16, pad the remaining bytes with zeros
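# VAL1 receives bytes 0-7 and VAL0 bytes 8-15 of DATA, each as a
# big-endian doubleword; TMP0-TMP2 are clobbered.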
.macro LOAD_LEN DATA, LEN, VAL1, VAL0, TMP0, TMP1, TMP2
li \TMP0, 0
li \VAL1, 0
li \VAL0, 0
andi. \TMP1, \LEN, 8
beq 1f
ldbrx \VAL1, 0, \DATA
li \TMP0, 8
1:
andi. \TMP1, \LEN, 7
beq 3f
li \TMP1, 56
2:
lbzx \TMP2, \TMP0, \DATA
sld \TMP2, \TMP2, \TMP1
subi \TMP1, \TMP1, 8
or \VAL0, \VAL0, \TMP2
addi \TMP0, \TMP0, 1
cmpld \TMP0, \LEN
bne 2b
andi. \TMP1, \LEN, 8
bne 3f
mr \VAL1, \VAL0
li \VAL0, 0
3:
.endm

# 0 < LEN < 16
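# Counterpart of LOAD_LEN: writes the leading LEN bytes of the
# big-endian doubleword pair VAL1:VAL0 to DATA.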
.macro STORE_LEN DATA, LEN, VAL1, VAL0, TMP0, TMP1, TMP2
andi. \TMP1, \LEN, 8
beq 1f
stdbrx \VAL1, 0, \DATA
li \TMP0, 8
b 2f
1:
li \TMP0, 0
mr \VAL0, \VAL1
2:
andi. \TMP1, \LEN, 7
beq 4f
li \TMP1, 56
3:
srd \TMP2, \VAL0, \TMP1
subi \TMP1, \TMP1, 8
stbx \TMP2, \TMP0, \DATA
addi \TMP0, \TMP0, 1
cmpld \TMP0, \LEN
bne 3b
4:
.endm

.text

################################################################################
# Generates the H table
# void ppc_aes_gcmINIT(uint8_t Htbl[16*8], uint32_t *KS, int NR);
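# Computes the hash key H by encrypting the all-zero block with the
# expanded key KS (NR = 10/12/14 rounds), then derives H, H^2, H^3 and
# H^4 and stores their halves (H1L/H1M ... H4L/H4M) into Htbl for use
# by ppc_aes_gcmHASH.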
.globl ppc_aes_gcmINIT
.type ppc_aes_gcmINIT,@function
.align 5
ppc_aes_gcmINIT:
addis TOCP,12,(.TOC.-ppc_aes_gcmINIT)@ha
addi TOCP,TOCP,(.TOC.-ppc_aes_gcmINIT)@l
.localentry ppc_aes_gcmINIT, .-ppc_aes_gcmINIT

.set Htbl, 3
.set KS, 4
.set NR, 5

.set ZERO, 19
.set MSB, 18
.set ONE, 17
.set SWAP_MASK, 0
.set POLY, 1
.set K, 2
.set H, 3
.set H2, 4
.set H3, 5
.set H4, 6
.set HP, 7
.set HS, 8
.set R, 9
.set F, 10
.set T, 11
.set H1M, 12
.set H1L, 13
.set H2M, 14
.set H2L, 15
.set H3M, 16
.set H3L, 17
.set H4M, 18
.set H4L, 19

VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 6
VEC_LOAD_DATA POLY, .Lpoly, 6

li 6, 0
VEC_LOAD_INC H, KS, 6
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
cmpwi NR, 10
beq .LH_done
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
cmpwi NR, 12
beq .LH_done
VEC_LOAD_INC K, KS, 6
vcipher H, H, K
VEC_LOAD_INC K, KS, 6
vcipher H, H, K

.LH_done:
VEC_LOAD K, KS, 6
vcipherlast H, H, K
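
# H now holds E_K(0^128).  Shift H left by one bit and xor in the
# polynomial constant if the top bit was set, the form needed by the
# vpmsumd-based multiplications below.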
vupkhsb MSB, H
vspltisb ONE, 1
vspltb MSB, MSB, 0
vsl H, H, ONE
vand MSB, MSB, POLY
vxor ZERO, ZERO, ZERO
vxor H, H, MSB
vsldoi POLY, ZERO, POLY, 8

vpmsumd HP, H, POLY
vsldoi HS, H, H, 8
vxor HP, HP, HS
vsldoi H1L, HP, HS, 8
vsldoi H1M, HS, HP, 8
vsldoi H1L, H1L, H1L, 8

# calculate H^2

vpmsumd F, H, H1L
vpmsumd R, H, H1M

vpmsumd T, F, POLY
vsldoi H2, F, F, 8
vxor R, R, T
vxor H2, H2, R

vpmsumd HP, H2, POLY
vsldoi HS, H2, H2, 8
vxor HP, HP, HS
vsldoi H2L, HP, HS, 8
vsldoi H2M, HS, HP, 8
vsldoi H2L, H2L, H2L, 8

# calculate H^3

vpmsumd F, H2, H1L
vpmsumd R, H2, H1M

vpmsumd T, F, POLY
vsldoi H3, F, F, 8
vxor R, R, T
vxor H3, H3, R

vpmsumd HP, H3, POLY
vsldoi HS, H3, H3, 8
vxor HP, HP, HS
vsldoi H3L, HP, HS, 8
vsldoi H3M, HS, HP, 8
vsldoi H3L, H3L, H3L, 8

# calculate H^4

vpmsumd F, H2, H2L
vpmsumd R, H2, H2M

vpmsumd T, F, POLY
vsldoi H4, F, F, 8
vxor R, R, T
vxor H4, H4, R

vpmsumd HP, H4, POLY
vsldoi HS, H4, H4, 8
vxor HP, HP, HS
vsldoi H4L, HP, HS, 8
vsldoi H4M, HS, HP, 8
vsldoi H4L, H4L, H4L, 8

li 8, 16*1
li 9, 16*2
li 10, 16*3
stxvd2x H1L+32, 0, Htbl
stxvd2x H1M+32, 8, Htbl
stxvd2x H2L+32, 9, Htbl
stxvd2x H2M+32, 10, Htbl
li 7, 16*4
li 8, 16*5
li 9, 16*6
li 10, 16*7
stxvd2x H3L+32, 7, Htbl
stxvd2x H3M+32, 8, Htbl
stxvd2x H4L+32, 9, Htbl
stxvd2x H4M+32, 10, Htbl

blr
.size ppc_aes_gcmINIT, . - ppc_aes_gcmINIT

################################################################################
# Authenticate only
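# Folds Alen bytes at AAD into the running GHASH digest at Tp, using the
# table generated by ppc_aes_gcmINIT; blocks are consumed 4, 2 and 1 at
# a time, with a final partial block padded with zeros.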
# void ppc_aes_gcmHASH(uint8_t Htbl[16*8], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
.globl ppc_aes_gcmHASH
.type ppc_aes_gcmHASH,@function
.align 5
ppc_aes_gcmHASH:
addis TOCP,12,(.TOC.-ppc_aes_gcmHASH)@ha
addi TOCP,TOCP,(.TOC.-ppc_aes_gcmHASH)@l
.localentry ppc_aes_gcmHASH, .-ppc_aes_gcmHASH

.set Htbl, 3
.set AAD, 4
.set Alen, 5
.set Tp, 6

.set SWAP_MASK, 0
.set POLY, 1
.set D, 2
.set C0, 3
.set C1, 4
.set C2, 5
.set C3, 6
.set T, 7
.set R, 8
.set F, 9
.set R2, 10
.set F2, 11
.set R3, 12
.set F3, 13
.set R4, 14
.set F4, 15
.set H1M, 16
.set H1L, 17
.set H2M, 18
.set H2L, 19
.set H3M, 28
.set H3L, 29
.set H4M, 30
.set H4L, 31

# store non-volatile vector registers
addi 7, SP, -16
stvx 31, 0, 7
addi 7, SP, -32
stvx 30, 0, 7
addi 7, SP, -48
stvx 29, 0, 7
addi 7, SP, -64
stvx 28, 0, 7

VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 7
VEC_LOAD_DATA POLY, .Lpoly_r, 7

VEC_LOAD D, Tp, 0

# --- process 4 blocks ---
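# The four blocks are multiplied by H^4, H^3, H^2 and H respectively and
# summed, so a single reduction folds all four into the digest D.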

srdi. 7, Alen, 6 # 4-blocks loop count
beq .L2x

mtctr 7 # set counter register

# load table elements
li 8, 1*16
li 9, 2*16
li 10, 3*16
lxvd2x H1L+32, 0, Htbl
lxvd2x H1M+32, 8, Htbl
lxvd2x H2L+32, 9, Htbl
lxvd2x H2M+32, 10, Htbl
li 7, 4*16
li 8, 5*16
li 9, 6*16
li 10, 7*16
lxvd2x H3L+32, 7, Htbl
lxvd2x H3M+32, 8, Htbl
lxvd2x H4L+32, 9, Htbl
lxvd2x H4M+32, 10, Htbl

li 8, 0x10
li 9, 0x20
li 10, 0x30
.align 5
.L4x_loop:
# load input
lxvd2x C0+32, 0, AAD
lxvd2x C1+32, 8, AAD
lxvd2x C2+32, 9, AAD
lxvd2x C3+32, 10, AAD

vperm C0, C0, C0, SWAP_MASK
vperm C1, C1, C1, SWAP_MASK
vperm C2, C2, C2, SWAP_MASK
vperm C3, C3, C3, SWAP_MASK

# digest combining
vxor C0, C0, D

# polynomial multiplication
vpmsumd F2, H3L, C1
vpmsumd R2, H3M, C1
vpmsumd F3, H2L, C2
vpmsumd R3, H2M, C2
vpmsumd F4, H1L, C3
vpmsumd R4, H1M, C3
vpmsumd F, H4L, C0
vpmsumd R, H4M, C0

# deferred recombination of partial products
vxor F3, F3, F4
vxor R3, R3, R4
vxor F, F, F2
vxor R, R, R2
vxor F, F, F3
vxor R, R, R3

# reduction
vpmsumd T, F, POLY
vsldoi D, F, F, 8
vxor R, R, T
vxor D, R, D

addi AAD, AAD, 0x40
bdnz .L4x_loop

clrldi Alen, Alen, 58
.L2x:
# --- process 2 blocks ---

srdi. 7, Alen, 5
beq .L1x

# load table elements
li 8, 1*16
li 9, 2*16
li 10, 3*16
lxvd2x H1L+32, 0, Htbl
lxvd2x H1M+32, 8, Htbl
lxvd2x H2L+32, 9, Htbl
lxvd2x H2M+32, 10, Htbl

# load input
li 10, 0x10
lxvd2x C0+32, 0, AAD
lxvd2x C1+32, 10, AAD

vperm C0, C0, C0, SWAP_MASK
vperm C1, C1, C1, SWAP_MASK

# previous digest combining
vxor C0, C0, D

# polynomial multiplication
vpmsumd F2, H1L, C1
vpmsumd R2, H1M, C1
vpmsumd F, H2L, C0
vpmsumd R, H2M, C0

# deferred recombination of partial products
vxor F, F, F2
vxor R, R, R2

# reduction
vpmsumd T, F, POLY
vsldoi D, F, F, 8
vxor R, R, T
vxor D, R, D

addi AAD, AAD, 0x20
clrldi Alen, Alen, 59
.L1x:
# --- process 1 block ---

srdi. 7, Alen, 4
beq .Ltail

# load table elements
li 8, 1*16
lxvd2x H1L+32, 0, Htbl
lxvd2x H1M+32, 8, Htbl

# load input
lxvd2x C0+32, 0, AAD

vperm C0, C0, C0, SWAP_MASK

# previous digest combining
vxor C0, C0, D

# polynomial multiplication
vpmsumd F, H1L, C0
vpmsumd R, H1M, C0

# reduction
vpmsumd T, F, POLY
vsldoi D, F, F, 8
vxor R, R, T
vxor D, R, D

addi AAD, AAD, 0x10
clrldi Alen, Alen, 60

.Ltail:
cmpldi Alen, 0
beq .Lh_done
# --- process the final partial block ---

# load table elements
li 8, 1*16
lxvd2x H1L+32, 0, Htbl
lxvd2x H1M+32, 8, Htbl

LOAD_LEN AAD, Alen, 10, 9, 3, 7, 8
mtvrd C0, 10
mtvrd C1, 9
xxmrghd C0+32, C0+32, C1+32

# previous digest combining
vxor C0, C0, D

# polynomial multiplication
vpmsumd F, H1L, C0
vpmsumd R, H1M, C0

# reduction
vpmsumd T, F, POLY
vsldoi D, F, F, 8
vxor R, R, T
vxor D, R, D
.Lh_done:
VEC_STORE D, Tp, 0

# restore non-volatile vector registers
addi 7, SP, -16
lvx 31, 0, 7
addi 7, SP, -32
lvx 30, 0, 7
addi 7, SP, -48
lvx 29, 0, 7
addi 7, SP, -64
lvx 28, 0, 7
blr
.size ppc_aes_gcmHASH, . - ppc_aes_gcmHASH

################################################################################
# Generates the final GCM tag
# void ppc_aes_gcmTAG(uint8_t Htbl[16*8], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
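# Folds the length block (Alen and Mlen converted to bit counts) into
# the digest at Tp, then xors the byte-swapped result with the 16-byte
# block at X0 to produce TAG.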
.globl ppc_aes_gcmTAG
.type ppc_aes_gcmTAG,@function
.align 5
ppc_aes_gcmTAG:
addis TOCP,12,(.TOC.-ppc_aes_gcmTAG)@ha
addi TOCP,TOCP,(.TOC.-ppc_aes_gcmTAG)@l
.localentry ppc_aes_gcmTAG, .-ppc_aes_gcmTAG

.set Htbl, 3
.set Tp, 4
.set Mlen, 5
.set Alen, 6
.set X0, 7
.set TAG, 8

.set SWAP_MASK, 0
.set POLY, 1
.set D, 2
.set C0, 3
.set C1, 4
.set T, 5
.set R, 6
.set F, 7
.set H1M, 8
.set H1L, 9
.set X, 10

VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 9
VEC_LOAD_DATA POLY, .Lpoly_r, 9

VEC_LOAD D, Tp, 0

# load table elements
li 9, 1*16
lxvd2x H1L+32, 0, Htbl
lxvd2x H1M+32, 9, Htbl

sldi Alen, Alen, 3
sldi Mlen, Mlen, 3
mtvrd C0, Alen
mtvrd C1, Mlen
xxmrghd C0+32, C0+32, C1+32

# previous digest combining
vxor C0, C0, D

# polynomial multiplication
vpmsumd F, H1L, C0
vpmsumd R, H1M, C0

# reduction
vpmsumd T, F, POLY
vsldoi D, F, F, 8
vxor R, R, T
vxor D, R, D

lxvd2x X+32, 0, X0
vperm D, D, D, SWAP_MASK
vxor X, X, D
stxvd2x X+32, 0, TAG

blr
.size ppc_aes_gcmTAG, . - ppc_aes_gcmTAG

################################################################################
# Crypt only
# void ppc_aes_gcmCRYPT(const uint8_t* PT, uint8_t* CT, uint64_t LEN, uint8_t *CTRP, uint32_t *KS, int NR);
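# CTR-mode encrypt/decrypt of LEN bytes from PT to CT with the expanded
# key KS (NR rounds), processing 8, 4, 2 and 1 block(s) at a time plus a
# final partial block; the counter block at CTRP is advanced accordingly.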
.globl ppc_aes_gcmCRYPT
.type ppc_aes_gcmCRYPT,@function
.align 5
ppc_aes_gcmCRYPT:
addis TOCP,12,(.TOC.-ppc_aes_gcmCRYPT)@ha
addi TOCP,TOCP,(.TOC.-ppc_aes_gcmCRYPT)@l
.localentry ppc_aes_gcmCRYPT, .-ppc_aes_gcmCRYPT

.set PT, 3
.set CT, 4
.set LEN, 5
.set CTRP, 6
.set KS, 7
.set NR, 8

.set SWAP_MASK, 0
.set K, 1
.set CTR, 2
.set CTR0, 3
.set CTR1, 4
.set CTR2, 5
.set CTR3, 6
.set CTR4, 7
.set CTR5, 8
.set CTR6, 9
.set CTR7, 10
.set ZERO, 11
.set I1, 12
.set I2, 13
.set I3, 14
.set I4, 15
.set I5, 16
.set I6, 17
.set I7, 18
.set I8, 19
.set IN0, 24
.set IN1, 25
.set IN2, 26
.set IN3, 27
.set IN4, 28
.set IN5, 29
.set IN6, 30
.set IN7, 31

.macro ROUND_8
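# one AES round applied to eight counter blocks with the next round key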
VEC_LOAD_INC K, KS, 10
vcipher CTR0, CTR0, K
vcipher CTR1, CTR1, K
vcipher CTR2, CTR2, K
vcipher CTR3, CTR3, K
vcipher CTR4, CTR4, K
vcipher CTR5, CTR5, K
vcipher CTR6, CTR6, K
vcipher CTR7, CTR7, K
.endm

.macro ROUND_4
VEC_LOAD_INC K, KS, 10
vcipher CTR0, CTR0, K
vcipher CTR1, CTR1, K
vcipher CTR2, CTR2, K
vcipher CTR3, CTR3, K
.endm

.macro ROUND_2
VEC_LOAD_INC K, KS, 10
vcipher CTR0, CTR0, K
vcipher CTR1, CTR1, K
.endm

.macro ROUND_1
VEC_LOAD_INC K, KS, 10
vcipher CTR0, CTR0, K
.endm

# store non-volatile general registers
std 31,-8(SP);
std 30,-16(SP);
std 29,-24(SP);
std 28,-32(SP);
std 27,-40(SP);
std 26,-48(SP);
std 25,-56(SP);

# store non-volatile vector registers
addi 9, SP, -80
stvx 31, 0, 9
addi 9, SP, -96
stvx 30, 0, 9
addi 9, SP, -112
stvx 29, 0, 9
addi 9, SP, -128
stvx 28, 0, 9
addi 9, SP, -144
stvx 27, 0, 9
addi 9, SP, -160
stvx 26, 0, 9
addi 9, SP, -176
stvx 25, 0, 9
addi 9, SP, -192
stvx 24, 0, 9

VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 9

vxor ZERO, ZERO, ZERO
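# I1..I8 hold the constants 1..8 in the low-order word; vadduwm adds
# them to CTR to produce the successive counter blocks.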
vspltisb I1, 1
vspltisb I2, 2
vspltisb I3, 3
vspltisb I4, 4
vspltisb I5, 5
vspltisb I6, 6
vspltisb I7, 7
vspltisb I8, 8
vsldoi I1, ZERO, I1, 1
vsldoi I2, ZERO, I2, 1
vsldoi I3, ZERO, I3, 1
vsldoi I4, ZERO, I4, 1
vsldoi I5, ZERO, I5, 1
vsldoi I6, ZERO, I6, 1
vsldoi I7, ZERO, I7, 1
vsldoi I8, ZERO, I8, 1

VEC_LOAD CTR, CTRP, 0

srdi. 9, LEN, 7
beq .Lctr_4x

mtctr 9

li 25, 0x10
li 26, 0x20
li 27, 0x30
li 28, 0x40
li 29, 0x50
li 30, 0x60
li 31, 0x70

.align 5
.L8x_loop:
li 10, 0
VEC_LOAD_INC K, KS, 10

vadduwm CTR1, CTR, I1
vadduwm CTR2, CTR, I2
vadduwm CTR3, CTR, I3
vadduwm CTR4, CTR, I4
vadduwm CTR5, CTR, I5
vadduwm CTR6, CTR, I6
vadduwm CTR7, CTR, I7

vxor CTR0, CTR, K
vxor CTR1, CTR1, K
vxor CTR2, CTR2, K
vxor CTR3, CTR3, K
vxor CTR4, CTR4, K
vxor CTR5, CTR5, K
vxor CTR6, CTR6, K
vxor CTR7, CTR7, K

ROUND_8
ROUND_8
ROUND_8
ROUND_8
ROUND_8
ROUND_8
ROUND_8
ROUND_8
ROUND_8
cmpwi NR, 10
beq .Llast_8
ROUND_8
ROUND_8
cmpwi NR, 12
beq .Llast_8
ROUND_8
ROUND_8

.Llast_8:
VEC_LOAD K, KS, 10
vcipherlast CTR0, CTR0, K
vcipherlast CTR1, CTR1, K
vcipherlast CTR2, CTR2, K
vcipherlast CTR3, CTR3, K
vcipherlast CTR4, CTR4, K
vcipherlast CTR5, CTR5, K
vcipherlast CTR6, CTR6, K
vcipherlast CTR7, CTR7, K

lxvd2x IN0+32, 0, PT
lxvd2x IN1+32, 25, PT
lxvd2x IN2+32, 26, PT
lxvd2x IN3+32, 27, PT
lxvd2x IN4+32, 28, PT
lxvd2x IN5+32, 29, PT
lxvd2x IN6+32, 30, PT
lxvd2x IN7+32, 31, PT

vperm CTR0, CTR0, CTR0, SWAP_MASK
vperm CTR1, CTR1, CTR1, SWAP_MASK
vperm CTR2, CTR2, CTR2, SWAP_MASK
vperm CTR3, CTR3, CTR3, SWAP_MASK
vperm CTR4, CTR4, CTR4, SWAP_MASK
vperm CTR5, CTR5, CTR5, SWAP_MASK
vperm CTR6, CTR6, CTR6, SWAP_MASK
vperm CTR7, CTR7, CTR7, SWAP_MASK

vxor IN0, IN0, CTR0
vxor IN1, IN1, CTR1
vxor IN2, IN2, CTR2
vxor IN3, IN3, CTR3
vxor IN4, IN4, CTR4
vxor IN5, IN5, CTR5
vxor IN6, IN6, CTR6
vxor IN7, IN7, CTR7

stxvd2x IN0+32, 0, CT
stxvd2x IN1+32, 25, CT
stxvd2x IN2+32, 26, CT
stxvd2x IN3+32, 27, CT
stxvd2x IN4+32, 28, CT
stxvd2x IN5+32, 29, CT
stxvd2x IN6+32, 30, CT
stxvd2x IN7+32, 31, CT

vadduwm CTR, CTR, I8
addi PT, PT, 0x80
addi CT, CT, 0x80
bdnz .L8x_loop

clrldi LEN, LEN, 57

.Lctr_4x:
srdi. 9, LEN, 6
beq .Lctr_2x

li 10, 0
li 29, 0x10
li 30, 0x20
li 31, 0x30

VEC_LOAD_INC K, KS, 10

vadduwm CTR1, CTR, I1
vadduwm CTR2, CTR, I2
vadduwm CTR3, CTR, I3

vxor CTR0, CTR, K
vxor CTR1, CTR1, K
vxor CTR2, CTR2, K
vxor CTR3, CTR3, K

ROUND_4
ROUND_4
ROUND_4
ROUND_4
ROUND_4
ROUND_4
ROUND_4
ROUND_4
ROUND_4
cmpwi NR, 10
beq .Llast_4
ROUND_4
ROUND_4
cmpwi NR, 12
beq .Llast_4
ROUND_4
ROUND_4

.Llast_4:
VEC_LOAD K, KS, 10
vcipherlast CTR0, CTR0, K
vcipherlast CTR1, CTR1, K
vcipherlast CTR2, CTR2, K
vcipherlast CTR3, CTR3, K

lxvd2x IN0+32, 0, PT
lxvd2x IN1+32, 29, PT
lxvd2x IN2+32, 30, PT
lxvd2x IN3+32, 31, PT

vperm CTR0, CTR0, CTR0, SWAP_MASK
vperm CTR1, CTR1, CTR1, SWAP_MASK
vperm CTR2, CTR2, CTR2, SWAP_MASK
vperm CTR3, CTR3, CTR3, SWAP_MASK

vxor IN0, IN0, CTR0
vxor IN1, IN1, CTR1
vxor IN2, IN2, CTR2
vxor IN3, IN3, CTR3

stxvd2x IN0+32, 0, CT
stxvd2x IN1+32, 29, CT
stxvd2x IN2+32, 30, CT
stxvd2x IN3+32, 31, CT

vadduwm CTR, CTR, I4
addi PT, PT, 0x40
addi CT, CT, 0x40

clrldi LEN, LEN, 58

.Lctr_2x:
srdi. 9, LEN, 5
beq .Lctr_1x

li 10, 0
li 31, 0x10

VEC_LOAD_INC K, KS, 10

vadduwm CTR1, CTR, I1

vxor CTR0, CTR, K
vxor CTR1, CTR1, K

ROUND_2
ROUND_2
ROUND_2
ROUND_2
ROUND_2
ROUND_2
ROUND_2
ROUND_2
ROUND_2
cmpwi NR, 10
beq .Llast_2
ROUND_2
ROUND_2
cmpwi NR, 12
beq .Llast_2
ROUND_2
ROUND_2

.Llast_2:
VEC_LOAD K, KS, 10
vcipherlast CTR0, CTR0, K
vcipherlast CTR1, CTR1, K

lxvd2x IN0+32, 0, PT
lxvd2x IN1+32, 31, PT

vperm CTR0, CTR0, CTR0, SWAP_MASK
vperm CTR1, CTR1, CTR1, SWAP_MASK

vxor IN0, IN0, CTR0
vxor IN1, IN1, CTR1

stxvd2x IN0+32, 0, CT
stxvd2x IN1+32, 31, CT

vadduwm CTR, CTR, I2
addi PT, PT, 0x20
addi CT, CT, 0x20

clrldi LEN, LEN, 59

.Lctr_1x:
srdi. 9, LEN, 4
beq .Lctr_tail

li 10, 0

VEC_LOAD_INC K, KS, 10
vxor CTR0, CTR, K

ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
cmpwi NR, 10
beq .Llast_1
ROUND_1
ROUND_1
cmpwi NR, 12
beq .Llast_1
ROUND_1
ROUND_1

.Llast_1:
VEC_LOAD K, KS, 10
vcipherlast CTR0, CTR0, K

lxvd2x IN0+32, 0, PT

vperm CTR0, CTR0, CTR0, SWAP_MASK

vxor IN0, IN0, CTR0

stxvd2x IN0+32, 0, CT

vadduwm CTR, CTR, I1
addi PT, PT, 0x10
addi CT, CT, 0x10

clrldi LEN, LEN, 60

.Lctr_tail:
cmpldi LEN, 0
beq .Lc_done

li 10, 0

VEC_LOAD_INC K, KS, 10
vxor CTR0, CTR, K

ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
ROUND_1
cmpwi NR, 10
beq .Llast_tail
ROUND_1
ROUND_1
cmpwi NR, 12
beq .Llast_tail
ROUND_1
ROUND_1

.Llast_tail:
VEC_LOAD K, KS, 10
vcipherlast CTR0, CTR0, K

LOAD_LEN PT, LEN, 10, 9, 29, 30, 31

vsldoi CTR1, CTR0, CTR0, 8
mfvrd 31, CTR0
mfvrd 30, CTR1

xor 10, 10, 31
xor 9, 9, 30

STORE_LEN CT, LEN, 10, 9, 29, 30, 31

vadduwm CTR, CTR, I1

.Lc_done:
VEC_STORE CTR, CTRP, 0

# restore non-volatile vector registers
addi 9, SP, -80
lvx 31, 0, 9
addi 9, SP, -96
lvx 30, 0, 9
addi 9, SP, -112
lvx 29, 0, 9
addi 9, SP, -128
lvx 28, 0, 9
addi 9, SP, -144
lvx 27, 0, 9
addi 9, SP, -160
lvx 26, 0, 9
addi 9, SP, -176
lvx 25, 0, 9
addi 9, SP, -192
lvx 24, 0, 9

# restore non-volatile general registers
ld 31,-8(SP);
ld 30,-16(SP);
ld 29,-24(SP);
ld 28,-32(SP);
ld 27,-40(SP);
ld 26,-48(SP);
ld 25,-56(SP);
blr
.size ppc_aes_gcmCRYPT, . - ppc_aes_gcmCRYPT

.data
.align 4
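# field polynomial constant used by ppc_aes_gcmINIT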
.Lpoly:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
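# reduction constant used by ppc_aes_gcmHASH and ppc_aes_gcmTAG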
.Lpoly_r:
.byte 0,0,0,0,0,0,0,0xc2,0,0,0,0,0,0,0,0
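# vperm mask that swaps the two doublewords of a vector (applied by
# VEC_LOAD/VEC_STORE around lxvd2x/stxvd2x)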
.Ldb_bswap_mask:
.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7