mirror of
https://github.com/Xevion/easy7zip.git
synced 2025-12-09 06:07:05 -06:00
Update to 7-Zip Version 21.02
This commit is contained in:
@@ -1,237 +1,734 @@
|
||||
; AesOpt.asm -- Intel's AES.
|
||||
; 2009-12-12 : Igor Pavlov : Public domain
|
||||
; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
|
||||
; 2021-03-10 : Igor Pavlov : Public domain
|
||||
|
||||
include 7zAsm.asm
|
||||
|
||||
ifdef ymm0
|
||||
use_vaes_256 equ 1
|
||||
ECHO "++ VAES 256"
|
||||
else
|
||||
ECHO "-- NO VAES 256"
|
||||
endif
|
||||
|
||||
ifdef x64
|
||||
ECHO "x86-64"
|
||||
else
|
||||
ECHO "x86"
|
||||
if (IS_CDECL gt 0)
|
||||
ECHO "ABI : CDECL"
|
||||
else
|
||||
ECHO "ABI : no CDECL : FASTCALL"
|
||||
endif
|
||||
endif
|
||||
|
||||
if (IS_LINUX gt 0)
|
||||
ECHO "ABI : LINUX"
|
||||
else
|
||||
ECHO "ABI : WINDOWS"
|
||||
endif
|
||||
|
||||
MY_ASM_START
|
||||
|
||||
ifndef x64
|
||||
.686
|
||||
.xmm
|
||||
endif
|
||||
|
||||
ifdef x64
|
||||
num equ r8
|
||||
else
|
||||
num equ [r4 + REG_SIZE * 4]
|
||||
|
||||
; MY_ALIGN EQU ALIGN(64)
|
||||
MY_ALIGN EQU
|
||||
|
||||
SEG_ALIGN EQU MY_ALIGN
|
||||
|
||||
MY_SEG_PROC macro name:req, numParams:req
|
||||
; seg_name equ @CatStr(_TEXT$, name)
|
||||
; seg_name SEGMENT SEG_ALIGN 'CODE'
|
||||
MY_PROC name, numParams
|
||||
endm
|
||||
|
||||
MY_SEG_ENDP macro
|
||||
; seg_name ENDS
|
||||
endm
|
||||
|
||||
|
||||
NUM_AES_KEYS_MAX equ 15
|
||||
|
||||
; the number of push operators in function PROLOG
|
||||
if (IS_LINUX eq 0) or (IS_X64 eq 0)
|
||||
num_regs_push equ 2
|
||||
stack_param_offset equ (REG_SIZE * (1 + num_regs_push))
|
||||
endif
|
||||
|
||||
rD equ r2
|
||||
rN equ r0
|
||||
ifdef x64
|
||||
num_param equ REG_ABI_PARAM_2
|
||||
else
|
||||
if (IS_CDECL gt 0)
|
||||
; size_t size
|
||||
; void * data
|
||||
; UInt32 * aes
|
||||
; ret-ip <- (r4)
|
||||
aes_OFFS equ (stack_param_offset)
|
||||
data_OFFS equ (REG_SIZE + aes_OFFS)
|
||||
size_OFFS equ (REG_SIZE + data_OFFS)
|
||||
num_param equ [r4 + size_OFFS]
|
||||
else
|
||||
num_param equ [r4 + stack_param_offset]
|
||||
endif
|
||||
endif
|
||||
|
||||
MY_PROLOG macro reg:req
|
||||
ifdef x64
|
||||
movdqa [r4 + 8], xmm6
|
||||
movdqa [r4 + 8 + 16], xmm7
|
||||
endif
|
||||
keys equ REG_PARAM_0 ; r1
|
||||
rD equ REG_PARAM_1 ; r2
|
||||
rN equ r0
|
||||
|
||||
push r3
|
||||
push r5
|
||||
push r6
|
||||
koffs_x equ x7
|
||||
koffs_r equ r7
|
||||
|
||||
mov rN, num
|
||||
mov x6, [r1 + 16]
|
||||
shl x6, 5
|
||||
ksize_x equ x6
|
||||
ksize_r equ r6
|
||||
|
||||
movdqa reg, [r1]
|
||||
add r1, 32
|
||||
endm
|
||||
keys2 equ r3
|
||||
|
||||
MY_EPILOG macro
|
||||
pop r6
|
||||
pop r5
|
||||
pop r3
|
||||
state equ xmm0
|
||||
key equ xmm0
|
||||
key_ymm equ ymm0
|
||||
key_ymm_n equ 0
|
||||
|
||||
ifdef x64
|
||||
movdqa xmm6, [r4 + 8]
|
||||
movdqa xmm7, [r4 + 8 + 16]
|
||||
endif
|
||||
ifdef x64
|
||||
ways = 11
|
||||
else
|
||||
ways = 4
|
||||
endif
|
||||
|
||||
MY_ENDP
|
||||
endm
|
||||
ways_start_reg equ 1
|
||||
|
||||
ways equ 4
|
||||
ways16 equ (ways * 16)
|
||||
iv equ @CatStr(xmm, %(ways_start_reg + ways))
|
||||
iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))
|
||||
|
||||
OP_W macro op, op2
|
||||
|
||||
WOP macro op, op2
|
||||
i = 0
|
||||
rept ways
|
||||
op @CatStr(xmm,%i), op2
|
||||
i = i + 1
|
||||
op @CatStr(xmm, %(ways_start_reg + i)), op2
|
||||
i = i + 1
|
||||
endm
|
||||
endm
|
||||
|
||||
LOAD_OP macro op:req, offs:req
|
||||
op xmm0, [r1 + r3 offs]
|
||||
endm
|
||||
|
||||
ifndef ABI_LINUX
|
||||
ifdef x64
|
||||
|
||||
; we use 32 bytes of home space in stack in WIN64-x64
|
||||
NUM_HOME_MM_REGS equ (32 / 16)
|
||||
; we preserve xmm registers starting from xmm6 in WIN64-x64
|
||||
MM_START_SAVE_REG equ 6
|
||||
|
||||
SAVE_XMM macro num_used_mm_regs:req
|
||||
num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
|
||||
if num_save_mm_regs GT 0
|
||||
num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
|
||||
; RSP is (16*x + 8) after entering the function in WIN64-x64
|
||||
stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)
|
||||
|
||||
LOAD_OP_W macro op:req, offs:req
|
||||
movdqa xmm7, [r1 + r3 offs]
|
||||
OP_W op, xmm7
|
||||
i = 0
|
||||
rept num_save_mm_regs
|
||||
|
||||
if i eq NUM_HOME_MM_REGS
|
||||
sub r4, stack_offset
|
||||
endif
|
||||
|
||||
if i lt NUM_HOME_MM_REGS
|
||||
movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
|
||||
else
|
||||
movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
|
||||
endif
|
||||
|
||||
i = i + 1
|
||||
endm
|
||||
endif
|
||||
endm
|
||||
|
||||
RESTORE_XMM macro num_used_mm_regs:req
|
||||
if num_save_mm_regs GT 0
|
||||
i = 0
|
||||
if num_save_mm_regs2 GT 0
|
||||
rept num_save_mm_regs2
|
||||
movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
|
||||
i = i + 1
|
||||
endm
|
||||
add r4, stack_offset
|
||||
endif
|
||||
|
||||
num_low_regs = num_save_mm_regs - i
|
||||
i = 0
|
||||
rept num_low_regs
|
||||
movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
|
||||
i = i + 1
|
||||
endm
|
||||
endif
|
||||
endm
|
||||
|
||||
endif ; x64
|
||||
endif ; ABI_LINUX
|
||||
|
||||
|
||||
MY_PROLOG macro num_used_mm_regs:req
|
||||
; num_regs_push: must be equal to the number of push operators
|
||||
; push r3
|
||||
; push r5
|
||||
if (IS_LINUX eq 0) or (IS_X64 eq 0)
|
||||
push r6
|
||||
push r7
|
||||
endif
|
||||
|
||||
mov rN, num_param ; don't move it; num_param can use stack pointer (r4)
|
||||
|
||||
if (IS_X64 eq 0)
|
||||
if (IS_CDECL gt 0)
|
||||
mov rD, [r4 + data_OFFS]
|
||||
mov keys, [r4 + aes_OFFS]
|
||||
endif
|
||||
elseif (IS_LINUX gt 0)
|
||||
MY_ABI_LINUX_TO_WIN_2
|
||||
endif
|
||||
|
||||
|
||||
ifndef ABI_LINUX
|
||||
ifdef x64
|
||||
SAVE_XMM num_used_mm_regs
|
||||
endif
|
||||
endif
|
||||
|
||||
mov ksize_x, [keys + 16]
|
||||
shl ksize_x, 5
|
||||
endm
|
||||
|
||||
|
||||
MY_EPILOG macro
|
||||
ifndef ABI_LINUX
|
||||
ifdef x64
|
||||
RESTORE_XMM num_save_mm_regs
|
||||
endif
|
||||
endif
|
||||
|
||||
if (IS_LINUX eq 0) or (IS_X64 eq 0)
|
||||
pop r7
|
||||
pop r6
|
||||
endif
|
||||
; pop r5
|
||||
; pop r3
|
||||
MY_ENDP
|
||||
endm
|
||||
|
||||
|
||||
OP_KEY macro op:req, offs:req
|
||||
op state, [keys + offs]
|
||||
endm
|
||||
|
||||
|
||||
WOP_KEY macro op:req, offs:req
|
||||
movdqa key, [keys + offs]
|
||||
WOP op, key
|
||||
endm
|
||||
|
||||
|
||||
; ---------- AES-CBC Decode ----------
|
||||
|
||||
CBC_DEC_UPDATE macro reg, offs
|
||||
pxor reg, xmm6
|
||||
movdqa xmm6, [rD + offs]
|
||||
movdqa [rD + offs], reg
|
||||
|
||||
XOR_WITH_DATA macro reg, _ppp_
|
||||
pxor reg, [rD + i * 16]
|
||||
endm
|
||||
|
||||
DECODE macro op:req
|
||||
op aesdec, +16
|
||||
@@:
|
||||
op aesdec, +0
|
||||
op aesdec, -16
|
||||
sub x3, 32
|
||||
jnz @B
|
||||
op aesdeclast, +0
|
||||
WRITE_TO_DATA macro reg, _ppp_
|
||||
movdqa [rD + i * 16], reg
|
||||
endm
|
||||
|
||||
MY_PROC AesCbc_Decode_Intel, 3
|
||||
MY_PROLOG xmm6
|
||||
|
||||
sub x6, 32
|
||||
; state0 equ @CatStr(xmm, %(ways_start_reg))
|
||||
|
||||
jmp check2
|
||||
key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1))
|
||||
key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
|
||||
|
||||
align 16
|
||||
nextBlocks2:
|
||||
mov x3, x6
|
||||
OP_W movdqa, [rD + i * 16]
|
||||
LOAD_OP_W pxor, +32
|
||||
DECODE LOAD_OP_W
|
||||
OP_W CBC_DEC_UPDATE, i * 16
|
||||
add rD, ways16
|
||||
check2:
|
||||
sub rN, ways
|
||||
jnc nextBlocks2
|
||||
key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2))
|
||||
key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
|
||||
key_last_ymm_n equ (ways_start_reg + ways + 2)
|
||||
|
||||
add rN, ways
|
||||
jmp check
|
||||
|
||||
nextBlock:
|
||||
mov x3, x6
|
||||
movdqa xmm1, [rD]
|
||||
LOAD_OP movdqa, +32
|
||||
pxor xmm0, xmm1
|
||||
DECODE LOAD_OP
|
||||
pxor xmm0, xmm6
|
||||
movdqa [rD], xmm0
|
||||
movdqa xmm6, xmm1
|
||||
add rD, 16
|
||||
check:
|
||||
sub rN, 1
|
||||
jnc nextBlock
|
||||
|
||||
movdqa [r1 - 32], xmm6
|
||||
MY_EPILOG
|
||||
NUM_CBC_REGS equ (ways_start_reg + ways + 3)
|
||||
|
||||
|
||||
; ---------- AES-CBC Encode ----------
|
||||
MY_SEG_PROC AesCbc_Decode_HW, 3
|
||||
|
||||
ENCODE macro op:req
|
||||
op aesenc, -16
|
||||
@@:
|
||||
op aesenc, +0
|
||||
op aesenc, +16
|
||||
add r3, 32
|
||||
jnz @B
|
||||
op aesenclast, +0
|
||||
endm
|
||||
AesCbc_Decode_HW_start::
|
||||
MY_PROLOG NUM_CBC_REGS
|
||||
|
||||
AesCbc_Decode_HW_start_2::
|
||||
movdqa iv, [keys]
|
||||
add keys, 32
|
||||
|
||||
MY_PROC AesCbc_Encode_Intel, 3
|
||||
MY_PROLOG xmm0
|
||||
movdqa key0, [keys + 1 * ksize_r]
|
||||
movdqa key_last, [keys]
|
||||
sub ksize_x, 16
|
||||
|
||||
add r1, r6
|
||||
neg r6
|
||||
add r6, 32
|
||||
jmp check2
|
||||
align 16
|
||||
nextBlocks2:
|
||||
WOP movdqa, [rD + i * 16]
|
||||
mov koffs_x, ksize_x
|
||||
; WOP_KEY pxor, ksize_r + 16
|
||||
WOP pxor, key0
|
||||
; align 16
|
||||
@@:
|
||||
WOP_KEY aesdec, 1 * koffs_r
|
||||
sub koffs_r, 16
|
||||
jnz @B
|
||||
; WOP_KEY aesdeclast, 0
|
||||
WOP aesdeclast, key_last
|
||||
|
||||
pxor @CatStr(xmm, %(ways_start_reg)), iv
|
||||
i = 1
|
||||
rept ways - 1
|
||||
pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
|
||||
i = i + 1
|
||||
endm
|
||||
movdqa iv, [rD + ways * 16 - 16]
|
||||
WOP WRITE_TO_DATA
|
||||
|
||||
jmp check_e
|
||||
add rD, ways * 16
|
||||
AesCbc_Decode_HW_start_3::
|
||||
check2:
|
||||
sub rN, ways
|
||||
jnc nextBlocks2
|
||||
add rN, ways
|
||||
|
||||
align 16
|
||||
nextBlock_e:
|
||||
mov r3, r6
|
||||
pxor xmm0, [rD]
|
||||
pxor xmm0, [r1 + r3 - 32]
|
||||
ENCODE LOAD_OP
|
||||
movdqa [rD], xmm0
|
||||
add rD, 16
|
||||
check_e:
|
||||
sub rN, 1
|
||||
jnc nextBlock_e
|
||||
sub ksize_x, 16
|
||||
|
||||
movdqa [r1 + r6 - 64], xmm0
|
||||
MY_EPILOG
|
||||
jmp check
|
||||
nextBlock:
|
||||
movdqa state, [rD]
|
||||
mov koffs_x, ksize_x
|
||||
; OP_KEY pxor, 1 * ksize_r + 32
|
||||
pxor state, key0
|
||||
; movdqa state0, [rD]
|
||||
; movdqa state, key0
|
||||
; pxor state, state0
|
||||
@@:
|
||||
OP_KEY aesdec, 1 * koffs_r + 16
|
||||
OP_KEY aesdec, 1 * koffs_r
|
||||
sub koffs_r, 32
|
||||
jnz @B
|
||||
OP_KEY aesdec, 16
|
||||
; OP_KEY aesdeclast, 0
|
||||
aesdeclast state, key_last
|
||||
|
||||
pxor state, iv
|
||||
movdqa iv, [rD]
|
||||
; movdqa iv, state0
|
||||
movdqa [rD], state
|
||||
|
||||
add rD, 16
|
||||
check:
|
||||
sub rN, 1
|
||||
jnc nextBlock
|
||||
|
||||
movdqa [keys - 32], iv
|
||||
MY_EPILOG
|
||||
|
||||
|
||||
; ---------- AES-CTR ----------
|
||||
|
||||
XOR_UPD_1 macro reg, offs
|
||||
pxor reg, [rD + offs]
|
||||
endm
|
||||
|
||||
XOR_UPD_2 macro reg, offs
|
||||
movdqa [rD + offs], reg
|
||||
endm
|
||||
; ---------- AVX ----------
|
||||
|
||||
MY_PROC AesCtr_Code_Intel, 3
|
||||
MY_PROLOG xmm6
|
||||
|
||||
mov r5, r4
|
||||
shr r5, 4
|
||||
dec r5
|
||||
shl r5, 4
|
||||
|
||||
mov DWORD PTR [r5], 1
|
||||
mov DWORD PTR [r5 + 4], 0
|
||||
mov DWORD PTR [r5 + 8], 0
|
||||
mov DWORD PTR [r5 + 12], 0
|
||||
|
||||
add r1, r6
|
||||
neg r6
|
||||
add r6, 32
|
||||
|
||||
jmp check2_c
|
||||
|
||||
align 16
|
||||
nextBlocks2_c:
|
||||
movdqa xmm7, [r5]
|
||||
|
||||
AVX__WOP_n macro op
|
||||
i = 0
|
||||
rept ways
|
||||
paddq xmm6, xmm7
|
||||
movdqa @CatStr(xmm,%i), xmm6
|
||||
i = i + 1
|
||||
op (ways_start_reg + i)
|
||||
i = i + 1
|
||||
endm
|
||||
endm
|
||||
|
||||
AVX__WOP macro op
|
||||
i = 0
|
||||
rept ways
|
||||
op @CatStr(ymm, %(ways_start_reg + i))
|
||||
i = i + 1
|
||||
endm
|
||||
endm
|
||||
|
||||
|
||||
AVX__WOP_KEY macro op:req, offs:req
|
||||
vmovdqa key_ymm, ymmword ptr [keys2 + offs]
|
||||
AVX__WOP_n op
|
||||
endm
|
||||
|
||||
|
||||
AVX__CBC_START macro reg
|
||||
; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
|
||||
vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i]
|
||||
endm
|
||||
|
||||
AVX__CBC_END macro reg
|
||||
if i eq 0
|
||||
vpxor reg, reg, iv_ymm
|
||||
else
|
||||
vpxor reg, reg, ymmword ptr [rD + i * 32 - 16]
|
||||
endif
|
||||
endm
|
||||
|
||||
|
||||
AVX__WRITE_TO_DATA macro reg
|
||||
vmovdqu ymmword ptr [rD + 32 * i], reg
|
||||
endm
|
||||
|
||||
AVX__XOR_WITH_DATA macro reg
|
||||
vpxor reg, reg, ymmword ptr [rD + 32 * i]
|
||||
endm
|
||||
|
||||
AVX__CTR_START macro reg
|
||||
vpaddq iv_ymm, iv_ymm, one_ymm
|
||||
; vpxor reg, iv_ymm, key_ymm
|
||||
vpxor reg, iv_ymm, key0_ymm
|
||||
endm
|
||||
|
||||
|
||||
MY_VAES_INSTR_2 macro cmd, dest, a1, a2
|
||||
db 0c4H
|
||||
db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
|
||||
db 5 + 8 * ((not (a1)) and 15)
|
||||
db cmd
|
||||
db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
|
||||
endm
|
||||
|
||||
MY_VAES_INSTR macro cmd, dest, a
|
||||
MY_VAES_INSTR_2 cmd, dest, dest, a
|
||||
endm
|
||||
|
||||
MY_vaesenc macro dest, a
|
||||
MY_VAES_INSTR 0dcH, dest, a
|
||||
endm
|
||||
MY_vaesenclast macro dest, a
|
||||
MY_VAES_INSTR 0ddH, dest, a
|
||||
endm
|
||||
MY_vaesdec macro dest, a
|
||||
MY_VAES_INSTR 0deH, dest, a
|
||||
endm
|
||||
MY_vaesdeclast macro dest, a
|
||||
MY_VAES_INSTR 0dfH, dest, a
|
||||
endm
|
||||
|
||||
|
||||
AVX__VAES_DEC macro reg
|
||||
MY_vaesdec reg, key_ymm_n
|
||||
endm
|
||||
|
||||
AVX__VAES_DEC_LAST_key_last macro reg
|
||||
; MY_vaesdeclast reg, key_ymm_n
|
||||
MY_vaesdeclast reg, key_last_ymm_n
|
||||
endm
|
||||
|
||||
AVX__VAES_ENC macro reg
|
||||
MY_vaesenc reg, key_ymm_n
|
||||
endm
|
||||
|
||||
AVX__VAES_ENC_LAST macro reg
|
||||
MY_vaesenclast reg, key_ymm_n
|
||||
endm
|
||||
|
||||
AVX__vinserti128_TO_HIGH macro dest, src
|
||||
vinserti128 dest, dest, src, 1
|
||||
endm
|
||||
|
||||
|
||||
MY_PROC AesCbc_Decode_HW_256, 3
|
||||
ifdef use_vaes_256
|
||||
MY_PROLOG NUM_CBC_REGS
|
||||
|
||||
cmp rN, ways * 2
|
||||
jb AesCbc_Decode_HW_start_2
|
||||
|
||||
vmovdqa iv, xmmword ptr [keys]
|
||||
add keys, 32
|
||||
|
||||
vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r]
|
||||
vbroadcasti128 key_last_ymm, xmmword ptr [keys]
|
||||
sub ksize_x, 16
|
||||
mov koffs_x, ksize_x
|
||||
add ksize_x, ksize_x
|
||||
|
||||
AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
|
||||
push keys2
|
||||
sub r4, AVX_STACK_SUB
|
||||
; sub r4, 32
|
||||
; sub r4, ksize_r
|
||||
; lea keys2, [r4 + 32]
|
||||
mov keys2, r4
|
||||
and keys2, -32
|
||||
broad:
|
||||
vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
|
||||
vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
|
||||
sub koffs_r, 16
|
||||
; jnc broad
|
||||
jnz broad
|
||||
|
||||
sub rN, ways * 2
|
||||
|
||||
align 16
|
||||
avx_cbcdec_nextBlock2:
|
||||
mov koffs_x, ksize_x
|
||||
; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
|
||||
AVX__WOP AVX__CBC_START
|
||||
@@:
|
||||
AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
|
||||
sub koffs_r, 32
|
||||
jnz @B
|
||||
; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
|
||||
AVX__WOP_n AVX__VAES_DEC_LAST_key_last
|
||||
|
||||
AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
|
||||
AVX__WOP AVX__CBC_END
|
||||
|
||||
vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
|
||||
AVX__WOP AVX__WRITE_TO_DATA
|
||||
|
||||
add rD, ways * 32
|
||||
sub rN, ways * 2
|
||||
jnc avx_cbcdec_nextBlock2
|
||||
add rN, ways * 2
|
||||
|
||||
shr ksize_x, 1
|
||||
|
||||
; lea r4, [r4 + 1 * ksize_r + 32]
|
||||
add r4, AVX_STACK_SUB
|
||||
pop keys2
|
||||
|
||||
vzeroupper
|
||||
jmp AesCbc_Decode_HW_start_3
|
||||
else
|
||||
jmp AesCbc_Decode_HW_start
|
||||
endif
|
||||
MY_ENDP
|
||||
MY_SEG_ENDP
|
||||
|
||||
|
||||
|
||||
|
||||
; ---------- AES-CBC Encode ----------
|
||||
|
||||
e0 equ xmm1
|
||||
|
||||
CENC_START_KEY equ 2
|
||||
CENC_NUM_REG_KEYS equ (3 * 2)
|
||||
; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
|
||||
|
||||
MY_SEG_PROC AesCbc_Encode_HW, 3
|
||||
MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)
|
||||
|
||||
movdqa state, [keys]
|
||||
add keys, 32
|
||||
|
||||
i = 0
|
||||
rept CENC_NUM_REG_KEYS
|
||||
movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
|
||||
i = i + 1
|
||||
endm
|
||||
|
||||
add keys, ksize_r
|
||||
neg ksize_r
|
||||
add ksize_r, (16 * CENC_NUM_REG_KEYS)
|
||||
; movdqa last_key, [keys]
|
||||
jmp check_e
|
||||
|
||||
align 16
|
||||
nextBlock_e:
|
||||
movdqa e0, [rD]
|
||||
mov koffs_r, ksize_r
|
||||
pxor e0, @CatStr(xmm, %(CENC_START_KEY))
|
||||
pxor state, e0
|
||||
|
||||
i = 1
|
||||
rept (CENC_NUM_REG_KEYS - 1)
|
||||
aesenc state, @CatStr(xmm, %(CENC_START_KEY + i))
|
||||
i = i + 1
|
||||
endm
|
||||
|
||||
mov r3, r6
|
||||
LOAD_OP_W pxor, -32
|
||||
ENCODE LOAD_OP_W
|
||||
OP_W XOR_UPD_1, i * 16
|
||||
OP_W XOR_UPD_2, i * 16
|
||||
add rD, ways16
|
||||
check2_c:
|
||||
sub rN, ways
|
||||
jnc nextBlocks2_c
|
||||
@@:
|
||||
OP_KEY aesenc, 1 * koffs_r
|
||||
OP_KEY aesenc, 1 * koffs_r + 16
|
||||
add koffs_r, 32
|
||||
jnz @B
|
||||
OP_KEY aesenclast, 0
|
||||
; aesenclast state, last_key
|
||||
|
||||
movdqa [rD], state
|
||||
add rD, 16
|
||||
check_e:
|
||||
sub rN, 1
|
||||
jnc nextBlock_e
|
||||
|
||||
add rN, ways
|
||||
jmp check_c
|
||||
; movdqa [keys - 32], state
|
||||
movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
|
||||
MY_EPILOG
|
||||
MY_SEG_ENDP
|
||||
|
||||
nextBlock_c:
|
||||
paddq xmm6, [r5]
|
||||
mov r3, r6
|
||||
movdqa xmm0, [r1 + r3 - 32]
|
||||
pxor xmm0, xmm6
|
||||
ENCODE LOAD_OP
|
||||
XOR_UPD_1 xmm0, 0
|
||||
XOR_UPD_2 xmm0, 0
|
||||
add rD, 16
|
||||
check_c:
|
||||
sub rN, 1
|
||||
jnc nextBlock_c
|
||||
|
||||
movdqa [r1 + r6 - 64], xmm6
|
||||
MY_EPILOG
|
||||
|
||||
; ---------- AES-CTR ----------
|
||||
|
||||
ifdef x64
|
||||
; ways = 11
|
||||
endif
|
||||
|
||||
|
||||
one equ @CatStr(xmm, %(ways_start_reg + ways + 1))
|
||||
one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
|
||||
key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2))
|
||||
key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
|
||||
NUM_CTR_REGS equ (ways_start_reg + ways + 3)
|
||||
|
||||
INIT_CTR macro reg, _ppp_
|
||||
paddq iv, one
|
||||
movdqa reg, iv
|
||||
endm
|
||||
|
||||
|
||||
MY_SEG_PROC AesCtr_Code_HW, 3
|
||||
Ctr_start::
|
||||
MY_PROLOG NUM_CTR_REGS
|
||||
|
||||
Ctr_start_2::
|
||||
movdqa iv, [keys]
|
||||
add keys, 32
|
||||
movdqa key0, [keys]
|
||||
|
||||
add keys, ksize_r
|
||||
neg ksize_r
|
||||
add ksize_r, 16
|
||||
|
||||
Ctr_start_3::
|
||||
mov koffs_x, 1
|
||||
movd one, koffs_x
|
||||
jmp check2_c
|
||||
|
||||
align 16
|
||||
nextBlocks2_c:
|
||||
WOP INIT_CTR, 0
|
||||
mov koffs_r, ksize_r
|
||||
; WOP_KEY pxor, 1 * koffs_r -16
|
||||
WOP pxor, key0
|
||||
@@:
|
||||
WOP_KEY aesenc, 1 * koffs_r
|
||||
add koffs_r, 16
|
||||
jnz @B
|
||||
WOP_KEY aesenclast, 0
|
||||
|
||||
WOP XOR_WITH_DATA
|
||||
WOP WRITE_TO_DATA
|
||||
add rD, ways * 16
|
||||
check2_c:
|
||||
sub rN, ways
|
||||
jnc nextBlocks2_c
|
||||
add rN, ways
|
||||
|
||||
sub keys, 16
|
||||
add ksize_r, 16
|
||||
|
||||
jmp check_c
|
||||
|
||||
; align 16
|
||||
nextBlock_c:
|
||||
paddq iv, one
|
||||
; movdqa state, [keys + 1 * koffs_r - 16]
|
||||
movdqa state, key0
|
||||
mov koffs_r, ksize_r
|
||||
pxor state, iv
|
||||
|
||||
@@:
|
||||
OP_KEY aesenc, 1 * koffs_r
|
||||
OP_KEY aesenc, 1 * koffs_r + 16
|
||||
add koffs_r, 32
|
||||
jnz @B
|
||||
OP_KEY aesenc, 0
|
||||
OP_KEY aesenclast, 16
|
||||
|
||||
pxor state, [rD]
|
||||
movdqa [rD], state
|
||||
add rD, 16
|
||||
check_c:
|
||||
sub rN, 1
|
||||
jnc nextBlock_c
|
||||
|
||||
; movdqa [keys - 32], iv
|
||||
movdqa [keys + 1 * ksize_r - 16 - 32], iv
|
||||
MY_EPILOG
|
||||
|
||||
|
||||
MY_PROC AesCtr_Code_HW_256, 3
|
||||
ifdef use_vaes_256
|
||||
MY_PROLOG NUM_CTR_REGS
|
||||
|
||||
cmp rN, ways * 2
|
||||
jb Ctr_start_2
|
||||
|
||||
vbroadcasti128 iv_ymm, xmmword ptr [keys]
|
||||
add keys, 32
|
||||
vbroadcasti128 key0_ymm, xmmword ptr [keys]
|
||||
mov koffs_x, 1
|
||||
vmovd one, koffs_x
|
||||
vpsubq iv_ymm, iv_ymm, one_ymm
|
||||
vpaddq one, one, one
|
||||
AVX__vinserti128_TO_HIGH one_ymm, one
|
||||
|
||||
add keys, ksize_r
|
||||
sub ksize_x, 16
|
||||
neg ksize_r
|
||||
mov koffs_r, ksize_r
|
||||
add ksize_r, ksize_r
|
||||
|
||||
AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
|
||||
push keys2
|
||||
lea keys2, [r4 - 32]
|
||||
sub r4, AVX_STACK_SUB
|
||||
and keys2, -32
|
||||
vbroadcasti128 key_ymm, xmmword ptr [keys]
|
||||
vmovdqa ymmword ptr [keys2], key_ymm
|
||||
@@:
|
||||
vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
|
||||
vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
|
||||
add koffs_r, 16
|
||||
jnz @B
|
||||
|
||||
sub rN, ways * 2
|
||||
|
||||
align 16
|
||||
avx_ctr_nextBlock2:
|
||||
mov koffs_r, ksize_r
|
||||
AVX__WOP AVX__CTR_START
|
||||
; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
|
||||
@@:
|
||||
AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
|
||||
add koffs_r, 32
|
||||
jnz @B
|
||||
AVX__WOP_KEY AVX__VAES_ENC_LAST, 0
|
||||
|
||||
AVX__WOP AVX__XOR_WITH_DATA
|
||||
AVX__WOP AVX__WRITE_TO_DATA
|
||||
|
||||
add rD, ways * 32
|
||||
sub rN, ways * 2
|
||||
jnc avx_ctr_nextBlock2
|
||||
add rN, ways * 2
|
||||
|
||||
vextracti128 iv, iv_ymm, 1
|
||||
sar ksize_r, 1
|
||||
|
||||
add r4, AVX_STACK_SUB
|
||||
pop keys2
|
||||
|
||||
vzeroupper
|
||||
jmp Ctr_start_3
|
||||
else
|
||||
jmp Ctr_start
|
||||
endif
|
||||
MY_ENDP
|
||||
MY_SEG_ENDP
|
||||
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user