mirror of
https://github.com/Xevion/easy7zip.git
synced 2025-12-08 12:07:03 -06:00
Update to 7-Zip Version 21.02
This commit is contained in:
181
Asm/arm64/7zAsm.S
Normal file
181
Asm/arm64/7zAsm.S
Normal file
@@ -0,0 +1,181 @@
|
||||
// 7zAsm.S -- ASM macros for arm64
|
||||
// 2021-04-25 : Igor Pavlov : Public domain
|
||||
|
||||
#define r0 x0
|
||||
#define r1 x1
|
||||
#define r2 x2
|
||||
#define r3 x3
|
||||
#define r4 x4
|
||||
#define r5 x5
|
||||
#define r6 x6
|
||||
#define r7 x7
|
||||
#define r8 x8
|
||||
#define r9 x9
|
||||
#define r10 x10
|
||||
#define r11 x11
|
||||
#define r12 x12
|
||||
#define r13 x13
|
||||
#define r14 x14
|
||||
#define r15 x15
|
||||
#define r16 x16
|
||||
#define r17 x17
|
||||
#define r18 x18
|
||||
#define r19 x19
|
||||
#define r20 x20
|
||||
#define r21 x21
|
||||
#define r22 x22
|
||||
#define r23 x23
|
||||
#define r24 x24
|
||||
#define r25 x25
|
||||
#define r26 x26
|
||||
#define r27 x27
|
||||
#define r28 x28
|
||||
#define r29 x29
|
||||
#define r30 x30
|
||||
|
||||
#define REG_ABI_PARAM_0 r0
|
||||
#define REG_ABI_PARAM_1 r1
|
||||
#define REG_ABI_PARAM_2 r2
|
||||
|
||||
|
||||
.macro p2_add reg:req, param:req
|
||||
add \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro p2_sub reg:req, param:req
|
||||
sub \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro p2_sub_s reg:req, param:req
|
||||
subs \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro p2_and reg:req, param:req
|
||||
and \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro xor reg:req, param:req
|
||||
eor \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro or reg:req, param:req
|
||||
orr \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro shl reg:req, param:req
|
||||
lsl \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro shr reg:req, param:req
|
||||
lsr \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro sar reg:req, param:req
|
||||
asr \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
.macro p1_neg reg:req
|
||||
neg \reg, \reg
|
||||
.endm
|
||||
|
||||
.macro dec reg:req
|
||||
sub \reg, \reg, 1
|
||||
.endm
|
||||
|
||||
.macro dec_s reg:req
|
||||
subs \reg, \reg, 1
|
||||
.endm
|
||||
|
||||
.macro inc reg:req
|
||||
add \reg, \reg, 1
|
||||
.endm
|
||||
|
||||
.macro inc_s reg:req
|
||||
adds \reg, \reg, 1
|
||||
.endm
|
||||
|
||||
|
||||
.macro imul reg:req, param:req
|
||||
mul \reg, \reg, \param
|
||||
.endm
|
||||
|
||||
/*
|
||||
arm64 and arm use reverted c flag after subs/cmp instructions:
|
||||
arm64-arm : x86
|
||||
b.lo / b.cc : jb / jc
|
||||
b.hs / b.cs : jae / jnc
|
||||
*/
|
||||
|
||||
.macro jmp lab:req
|
||||
b \lab
|
||||
.endm
|
||||
|
||||
.macro je lab:req
|
||||
b.eq \lab
|
||||
.endm
|
||||
|
||||
.macro jz lab:req
|
||||
b.eq \lab
|
||||
.endm
|
||||
|
||||
.macro jnz lab:req
|
||||
b.ne \lab
|
||||
.endm
|
||||
|
||||
.macro jne lab:req
|
||||
b.ne \lab
|
||||
.endm
|
||||
|
||||
.macro jb lab:req
|
||||
b.lo \lab
|
||||
.endm
|
||||
|
||||
.macro jbe lab:req
|
||||
b.ls \lab
|
||||
.endm
|
||||
|
||||
.macro ja lab:req
|
||||
b.hi \lab
|
||||
.endm
|
||||
|
||||
.macro jae lab:req
|
||||
b.hs \lab
|
||||
.endm
|
||||
|
||||
|
||||
.macro cmove dest:req, srcTrue:req
|
||||
csel \dest, \srcTrue, \dest, eq
|
||||
.endm
|
||||
|
||||
.macro cmovne dest:req, srcTrue:req
|
||||
csel \dest, \srcTrue, \dest, ne
|
||||
.endm
|
||||
|
||||
.macro cmovs dest:req, srcTrue:req
|
||||
csel \dest, \srcTrue, \dest, mi
|
||||
.endm
|
||||
|
||||
.macro cmovns dest:req, srcTrue:req
|
||||
csel \dest, \srcTrue, \dest, pl
|
||||
.endm
|
||||
|
||||
.macro cmovb dest:req, srcTrue:req
|
||||
csel \dest, \srcTrue, \dest, lo
|
||||
.endm
|
||||
|
||||
.macro cmovae dest:req, srcTrue:req
|
||||
csel \dest, \srcTrue, \dest, hs
|
||||
.endm
|
||||
|
||||
|
||||
.macro MY_ALIGN_16 macro
|
||||
.p2align 4,, (1 << 4) - 1
|
||||
.endm
|
||||
|
||||
.macro MY_ALIGN_32 macro
|
||||
.p2align 5,, (1 << 5) - 1
|
||||
.endm
|
||||
|
||||
.macro MY_ALIGN_64 macro
|
||||
.p2align 6,, (1 << 6) - 1
|
||||
.endm
|
||||
1487
Asm/arm64/LzmaDecOpt.S
Normal file
1487
Asm/arm64/LzmaDecOpt.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,32 @@
|
||||
; 7zAsm.asm -- ASM macros
|
||||
; 2018-02-03 : Igor Pavlov : Public domain
|
||||
; 2021-02-07 : Igor Pavlov : Public domain
|
||||
|
||||
ifdef RAX
|
||||
x64 equ 1
|
||||
endif
|
||||
|
||||
ifdef x64
|
||||
IS_X64 equ 1
|
||||
else
|
||||
IS_X64 equ 0
|
||||
endif
|
||||
|
||||
ifdef ABI_LINUX
|
||||
IS_LINUX equ 1
|
||||
else
|
||||
IS_LINUX equ 0
|
||||
endif
|
||||
|
||||
ifndef x64
|
||||
; Use ABI_CDECL for x86 (32-bit) only
|
||||
; if ABI_CDECL is not defined, we use fastcall abi
|
||||
ifdef ABI_CDECL
|
||||
IS_CDECL equ 1
|
||||
else
|
||||
IS_CDECL equ 0
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
MY_ASM_START macro
|
||||
ifdef x64
|
||||
@@ -14,8 +41,12 @@ endm
|
||||
MY_PROC macro name:req, numParams:req
|
||||
align 16
|
||||
proc_numParams = numParams
|
||||
ifdef x64
|
||||
if (IS_X64 gt 0)
|
||||
proc_name equ name
|
||||
elseif (IS_LINUX gt 0)
|
||||
proc_name equ name
|
||||
elseif (IS_CDECL gt 0)
|
||||
proc_name equ @CatStr(_,name)
|
||||
else
|
||||
proc_name equ @CatStr(@,name,@, %numParams * 4)
|
||||
endif
|
||||
@@ -23,18 +54,19 @@ MY_PROC macro name:req, numParams:req
|
||||
endm
|
||||
|
||||
MY_ENDP macro
|
||||
ifdef x64
|
||||
ret
|
||||
else
|
||||
if proc_numParams LT 3
|
||||
ret
|
||||
if (IS_X64 gt 0)
|
||||
ret
|
||||
elseif (IS_CDECL gt 0)
|
||||
ret
|
||||
elseif (proc_numParams LT 3)
|
||||
ret
|
||||
else
|
||||
ret (proc_numParams - 2) * 4
|
||||
ret (proc_numParams - 2) * 4
|
||||
endif
|
||||
endif
|
||||
proc_name ENDP
|
||||
endm
|
||||
|
||||
|
||||
ifdef x64
|
||||
REG_SIZE equ 8
|
||||
REG_LOGAR_SIZE equ 3
|
||||
@@ -103,6 +135,24 @@ else
|
||||
r7 equ x7
|
||||
endif
|
||||
|
||||
|
||||
ifdef x64
|
||||
ifdef ABI_LINUX
|
||||
|
||||
MY_PUSH_2_REGS macro
|
||||
push r3
|
||||
push r5
|
||||
endm
|
||||
|
||||
MY_POP_2_REGS macro
|
||||
pop r5
|
||||
pop r3
|
||||
endm
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
MY_PUSH_4_REGS macro
|
||||
push r3
|
||||
push r5
|
||||
@@ -118,30 +168,91 @@ MY_POP_4_REGS macro
|
||||
endm
|
||||
|
||||
|
||||
ifdef x64
|
||||
; for fastcall and for WIN-x64
|
||||
REG_PARAM_0_x equ x1
|
||||
REG_PARAM_0 equ r1
|
||||
REG_PARAM_1 equ r2
|
||||
|
||||
; for WIN64-x64 ABI:
|
||||
ifndef x64
|
||||
; for x86-fastcall
|
||||
|
||||
REG_PARAM_0 equ r1
|
||||
REG_PARAM_1 equ r2
|
||||
REG_ABI_PARAM_0_x equ REG_PARAM_0_x
|
||||
REG_ABI_PARAM_0 equ REG_PARAM_0
|
||||
REG_ABI_PARAM_1 equ REG_PARAM_1
|
||||
|
||||
else
|
||||
; x64
|
||||
|
||||
if (IS_LINUX eq 0)
|
||||
|
||||
; for WIN-x64:
|
||||
REG_PARAM_2 equ r8
|
||||
REG_PARAM_3 equ r9
|
||||
|
||||
MY_PUSH_PRESERVED_REGS macro
|
||||
MY_PUSH_4_REGS
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
REG_ABI_PARAM_0_x equ REG_PARAM_0_x
|
||||
REG_ABI_PARAM_0 equ REG_PARAM_0
|
||||
REG_ABI_PARAM_1 equ REG_PARAM_1
|
||||
REG_ABI_PARAM_2 equ REG_PARAM_2
|
||||
REG_ABI_PARAM_3 equ REG_PARAM_3
|
||||
|
||||
else
|
||||
; for LINUX-x64:
|
||||
REG_LINUX_PARAM_0_x equ x7
|
||||
REG_LINUX_PARAM_0 equ r7
|
||||
REG_LINUX_PARAM_1 equ r6
|
||||
REG_LINUX_PARAM_2 equ r2
|
||||
REG_LINUX_PARAM_3 equ r1
|
||||
|
||||
REG_ABI_PARAM_0_x equ REG_LINUX_PARAM_0_x
|
||||
REG_ABI_PARAM_0 equ REG_LINUX_PARAM_0
|
||||
REG_ABI_PARAM_1 equ REG_LINUX_PARAM_1
|
||||
REG_ABI_PARAM_2 equ REG_LINUX_PARAM_2
|
||||
REG_ABI_PARAM_3 equ REG_LINUX_PARAM_3
|
||||
|
||||
MY_ABI_LINUX_TO_WIN_2 macro
|
||||
mov r2, r6
|
||||
mov r1, r7
|
||||
endm
|
||||
|
||||
MY_ABI_LINUX_TO_WIN_3 macro
|
||||
mov r8, r2
|
||||
mov r2, r6
|
||||
mov r1, r7
|
||||
endm
|
||||
|
||||
MY_ABI_LINUX_TO_WIN_4 macro
|
||||
mov r9, r1
|
||||
mov r8, r2
|
||||
mov r2, r6
|
||||
mov r1, r7
|
||||
endm
|
||||
|
||||
endif ; IS_LINUX
|
||||
|
||||
|
||||
MY_PUSH_PRESERVED_ABI_REGS macro
|
||||
if (IS_LINUX gt 0)
|
||||
MY_PUSH_2_REGS
|
||||
else
|
||||
MY_PUSH_4_REGS
|
||||
endif
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
endm
|
||||
|
||||
|
||||
MY_POP_PRESERVED_REGS macro
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
MY_POP_4_REGS
|
||||
MY_POP_PRESERVED_ABI_REGS macro
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
if (IS_LINUX gt 0)
|
||||
MY_POP_2_REGS
|
||||
else
|
||||
MY_POP_4_REGS
|
||||
endif
|
||||
endm
|
||||
|
||||
endif
|
||||
endif ; x64
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
; 7zCrcOpt.asm -- CRC32 calculation : optimized version
|
||||
; 2009-12-12 : Igor Pavlov : Public domain
|
||||
; 2021-02-07 : Igor Pavlov : Public domain
|
||||
|
||||
include 7zAsm.asm
|
||||
|
||||
@@ -7,21 +7,28 @@ MY_ASM_START
|
||||
|
||||
rD equ r2
|
||||
rN equ r7
|
||||
rT equ r5
|
||||
|
||||
ifdef x64
|
||||
num_VAR equ r8
|
||||
table_VAR equ r9
|
||||
else
|
||||
data_size equ (REG_SIZE * 5)
|
||||
crc_table equ (REG_SIZE + data_size)
|
||||
num_VAR equ [r4 + data_size]
|
||||
table_VAR equ [r4 + crc_table]
|
||||
if (IS_CDECL gt 0)
|
||||
crc_OFFS equ (REG_SIZE * 5)
|
||||
data_OFFS equ (REG_SIZE + crc_OFFS)
|
||||
size_OFFS equ (REG_SIZE + data_OFFS)
|
||||
else
|
||||
size_OFFS equ (REG_SIZE * 5)
|
||||
endif
|
||||
table_OFFS equ (REG_SIZE + size_OFFS)
|
||||
num_VAR equ [r4 + size_OFFS]
|
||||
table_VAR equ [r4 + table_OFFS]
|
||||
endif
|
||||
|
||||
SRCDAT equ rN + rD + 4 *
|
||||
SRCDAT equ rD + rN * 1 + 4 *
|
||||
|
||||
CRC macro op:req, dest:req, src:req, t:req
|
||||
op dest, DWORD PTR [r5 + src * 4 + 0400h * t]
|
||||
op dest, DWORD PTR [rT + src * 4 + 0400h * t]
|
||||
endm
|
||||
|
||||
CRC_XOR macro dest:req, src:req, t:req
|
||||
@@ -43,11 +50,33 @@ CRC1b macro
|
||||
endm
|
||||
|
||||
MY_PROLOG macro crc_end:req
|
||||
MY_PUSH_4_REGS
|
||||
|
||||
ifdef x64
|
||||
if (IS_LINUX gt 0)
|
||||
MY_PUSH_2_REGS
|
||||
mov x0, REG_ABI_PARAM_0_x ; x0 = x7
|
||||
mov rT, REG_ABI_PARAM_3 ; r5 = r1
|
||||
mov rN, REG_ABI_PARAM_2 ; r7 = r2
|
||||
mov rD, REG_ABI_PARAM_1 ; r2 = r6
|
||||
else
|
||||
MY_PUSH_4_REGS
|
||||
mov x0, REG_ABI_PARAM_0_x ; x0 = x1
|
||||
mov rT, REG_ABI_PARAM_3 ; r5 = r9
|
||||
mov rN, REG_ABI_PARAM_2 ; r7 = r8
|
||||
; mov rD, REG_ABI_PARAM_1 ; r2 = r2
|
||||
endif
|
||||
else
|
||||
MY_PUSH_4_REGS
|
||||
if (IS_CDECL gt 0)
|
||||
mov x0, [r4 + crc_OFFS]
|
||||
mov rD, [r4 + data_OFFS]
|
||||
else
|
||||
mov x0, REG_ABI_PARAM_0_x
|
||||
endif
|
||||
mov rN, num_VAR
|
||||
mov rT, table_VAR
|
||||
endif
|
||||
|
||||
mov x0, x1
|
||||
mov rN, num_VAR
|
||||
mov r5, table_VAR
|
||||
test rN, rN
|
||||
jz crc_end
|
||||
@@:
|
||||
@@ -77,7 +106,11 @@ MY_EPILOG macro crc_end:req
|
||||
CRC1b
|
||||
jmp crc_end
|
||||
@@:
|
||||
MY_POP_4_REGS
|
||||
if (IS_X64 gt 0) and (IS_LINUX gt 0)
|
||||
MY_POP_2_REGS
|
||||
else
|
||||
MY_POP_4_REGS
|
||||
endif
|
||||
endm
|
||||
|
||||
MY_PROC CrcUpdateT8, 4
|
||||
|
||||
@@ -1,237 +1,734 @@
|
||||
; AesOpt.asm -- Intel's AES.
|
||||
; 2009-12-12 : Igor Pavlov : Public domain
|
||||
; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
|
||||
; 2021-03-10 : Igor Pavlov : Public domain
|
||||
|
||||
include 7zAsm.asm
|
||||
|
||||
ifdef ymm0
|
||||
use_vaes_256 equ 1
|
||||
ECHO "++ VAES 256"
|
||||
else
|
||||
ECHO "-- NO VAES 256"
|
||||
endif
|
||||
|
||||
ifdef x64
|
||||
ECHO "x86-64"
|
||||
else
|
||||
ECHO "x86"
|
||||
if (IS_CDECL gt 0)
|
||||
ECHO "ABI : CDECL"
|
||||
else
|
||||
ECHO "ABI : no CDECL : FASTCALL"
|
||||
endif
|
||||
endif
|
||||
|
||||
if (IS_LINUX gt 0)
|
||||
ECHO "ABI : LINUX"
|
||||
else
|
||||
ECHO "ABI : WINDOWS"
|
||||
endif
|
||||
|
||||
MY_ASM_START
|
||||
|
||||
ifndef x64
|
||||
.686
|
||||
.xmm
|
||||
endif
|
||||
|
||||
ifdef x64
|
||||
num equ r8
|
||||
else
|
||||
num equ [r4 + REG_SIZE * 4]
|
||||
|
||||
; MY_ALIGN EQU ALIGN(64)
|
||||
MY_ALIGN EQU
|
||||
|
||||
SEG_ALIGN EQU MY_ALIGN
|
||||
|
||||
MY_SEG_PROC macro name:req, numParams:req
|
||||
; seg_name equ @CatStr(_TEXT$, name)
|
||||
; seg_name SEGMENT SEG_ALIGN 'CODE'
|
||||
MY_PROC name, numParams
|
||||
endm
|
||||
|
||||
MY_SEG_ENDP macro
|
||||
; seg_name ENDS
|
||||
endm
|
||||
|
||||
|
||||
NUM_AES_KEYS_MAX equ 15
|
||||
|
||||
; the number of push operators in function PROLOG
|
||||
if (IS_LINUX eq 0) or (IS_X64 eq 0)
|
||||
num_regs_push equ 2
|
||||
stack_param_offset equ (REG_SIZE * (1 + num_regs_push))
|
||||
endif
|
||||
|
||||
rD equ r2
|
||||
rN equ r0
|
||||
ifdef x64
|
||||
num_param equ REG_ABI_PARAM_2
|
||||
else
|
||||
if (IS_CDECL gt 0)
|
||||
; size_t size
|
||||
; void * data
|
||||
; UInt32 * aes
|
||||
; ret-ip <- (r4)
|
||||
aes_OFFS equ (stack_param_offset)
|
||||
data_OFFS equ (REG_SIZE + aes_OFFS)
|
||||
size_OFFS equ (REG_SIZE + data_OFFS)
|
||||
num_param equ [r4 + size_OFFS]
|
||||
else
|
||||
num_param equ [r4 + stack_param_offset]
|
||||
endif
|
||||
endif
|
||||
|
||||
MY_PROLOG macro reg:req
|
||||
ifdef x64
|
||||
movdqa [r4 + 8], xmm6
|
||||
movdqa [r4 + 8 + 16], xmm7
|
||||
endif
|
||||
keys equ REG_PARAM_0 ; r1
|
||||
rD equ REG_PARAM_1 ; r2
|
||||
rN equ r0
|
||||
|
||||
push r3
|
||||
push r5
|
||||
push r6
|
||||
koffs_x equ x7
|
||||
koffs_r equ r7
|
||||
|
||||
mov rN, num
|
||||
mov x6, [r1 + 16]
|
||||
shl x6, 5
|
||||
ksize_x equ x6
|
||||
ksize_r equ r6
|
||||
|
||||
movdqa reg, [r1]
|
||||
add r1, 32
|
||||
endm
|
||||
keys2 equ r3
|
||||
|
||||
MY_EPILOG macro
|
||||
pop r6
|
||||
pop r5
|
||||
pop r3
|
||||
state equ xmm0
|
||||
key equ xmm0
|
||||
key_ymm equ ymm0
|
||||
key_ymm_n equ 0
|
||||
|
||||
ifdef x64
|
||||
movdqa xmm6, [r4 + 8]
|
||||
movdqa xmm7, [r4 + 8 + 16]
|
||||
endif
|
||||
ifdef x64
|
||||
ways = 11
|
||||
else
|
||||
ways = 4
|
||||
endif
|
||||
|
||||
MY_ENDP
|
||||
endm
|
||||
ways_start_reg equ 1
|
||||
|
||||
ways equ 4
|
||||
ways16 equ (ways * 16)
|
||||
iv equ @CatStr(xmm, %(ways_start_reg + ways))
|
||||
iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))
|
||||
|
||||
OP_W macro op, op2
|
||||
|
||||
WOP macro op, op2
|
||||
i = 0
|
||||
rept ways
|
||||
op @CatStr(xmm,%i), op2
|
||||
i = i + 1
|
||||
op @CatStr(xmm, %(ways_start_reg + i)), op2
|
||||
i = i + 1
|
||||
endm
|
||||
endm
|
||||
|
||||
LOAD_OP macro op:req, offs:req
|
||||
op xmm0, [r1 + r3 offs]
|
||||
endm
|
||||
|
||||
ifndef ABI_LINUX
|
||||
ifdef x64
|
||||
|
||||
; we use 32 bytes of home space in stack in WIN64-x64
|
||||
NUM_HOME_MM_REGS equ (32 / 16)
|
||||
; we preserve xmm registers starting from xmm6 in WIN64-x64
|
||||
MM_START_SAVE_REG equ 6
|
||||
|
||||
SAVE_XMM macro num_used_mm_regs:req
|
||||
num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
|
||||
if num_save_mm_regs GT 0
|
||||
num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
|
||||
; RSP is (16*x + 8) after entering the function in WIN64-x64
|
||||
stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)
|
||||
|
||||
LOAD_OP_W macro op:req, offs:req
|
||||
movdqa xmm7, [r1 + r3 offs]
|
||||
OP_W op, xmm7
|
||||
i = 0
|
||||
rept num_save_mm_regs
|
||||
|
||||
if i eq NUM_HOME_MM_REGS
|
||||
sub r4, stack_offset
|
||||
endif
|
||||
|
||||
if i lt NUM_HOME_MM_REGS
|
||||
movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
|
||||
else
|
||||
movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
|
||||
endif
|
||||
|
||||
i = i + 1
|
||||
endm
|
||||
endif
|
||||
endm
|
||||
|
||||
RESTORE_XMM macro num_used_mm_regs:req
|
||||
if num_save_mm_regs GT 0
|
||||
i = 0
|
||||
if num_save_mm_regs2 GT 0
|
||||
rept num_save_mm_regs2
|
||||
movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
|
||||
i = i + 1
|
||||
endm
|
||||
add r4, stack_offset
|
||||
endif
|
||||
|
||||
num_low_regs = num_save_mm_regs - i
|
||||
i = 0
|
||||
rept num_low_regs
|
||||
movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
|
||||
i = i + 1
|
||||
endm
|
||||
endif
|
||||
endm
|
||||
|
||||
endif ; x64
|
||||
endif ; ABI_LINUX
|
||||
|
||||
|
||||
MY_PROLOG macro num_used_mm_regs:req
|
||||
; num_regs_push: must be equal to the number of push operators
|
||||
; push r3
|
||||
; push r5
|
||||
if (IS_LINUX eq 0) or (IS_X64 eq 0)
|
||||
push r6
|
||||
push r7
|
||||
endif
|
||||
|
||||
mov rN, num_param ; don't move it; num_param can use stack pointer (r4)
|
||||
|
||||
if (IS_X64 eq 0)
|
||||
if (IS_CDECL gt 0)
|
||||
mov rD, [r4 + data_OFFS]
|
||||
mov keys, [r4 + aes_OFFS]
|
||||
endif
|
||||
elseif (IS_LINUX gt 0)
|
||||
MY_ABI_LINUX_TO_WIN_2
|
||||
endif
|
||||
|
||||
|
||||
ifndef ABI_LINUX
|
||||
ifdef x64
|
||||
SAVE_XMM num_used_mm_regs
|
||||
endif
|
||||
endif
|
||||
|
||||
mov ksize_x, [keys + 16]
|
||||
shl ksize_x, 5
|
||||
endm
|
||||
|
||||
|
||||
MY_EPILOG macro
|
||||
ifndef ABI_LINUX
|
||||
ifdef x64
|
||||
RESTORE_XMM num_save_mm_regs
|
||||
endif
|
||||
endif
|
||||
|
||||
if (IS_LINUX eq 0) or (IS_X64 eq 0)
|
||||
pop r7
|
||||
pop r6
|
||||
endif
|
||||
; pop r5
|
||||
; pop r3
|
||||
MY_ENDP
|
||||
endm
|
||||
|
||||
|
||||
OP_KEY macro op:req, offs:req
|
||||
op state, [keys + offs]
|
||||
endm
|
||||
|
||||
|
||||
WOP_KEY macro op:req, offs:req
|
||||
movdqa key, [keys + offs]
|
||||
WOP op, key
|
||||
endm
|
||||
|
||||
|
||||
; ---------- AES-CBC Decode ----------
|
||||
|
||||
CBC_DEC_UPDATE macro reg, offs
|
||||
pxor reg, xmm6
|
||||
movdqa xmm6, [rD + offs]
|
||||
movdqa [rD + offs], reg
|
||||
|
||||
XOR_WITH_DATA macro reg, _ppp_
|
||||
pxor reg, [rD + i * 16]
|
||||
endm
|
||||
|
||||
DECODE macro op:req
|
||||
op aesdec, +16
|
||||
@@:
|
||||
op aesdec, +0
|
||||
op aesdec, -16
|
||||
sub x3, 32
|
||||
jnz @B
|
||||
op aesdeclast, +0
|
||||
WRITE_TO_DATA macro reg, _ppp_
|
||||
movdqa [rD + i * 16], reg
|
||||
endm
|
||||
|
||||
MY_PROC AesCbc_Decode_Intel, 3
|
||||
MY_PROLOG xmm6
|
||||
|
||||
sub x6, 32
|
||||
; state0 equ @CatStr(xmm, %(ways_start_reg))
|
||||
|
||||
jmp check2
|
||||
key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1))
|
||||
key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
|
||||
|
||||
align 16
|
||||
nextBlocks2:
|
||||
mov x3, x6
|
||||
OP_W movdqa, [rD + i * 16]
|
||||
LOAD_OP_W pxor, +32
|
||||
DECODE LOAD_OP_W
|
||||
OP_W CBC_DEC_UPDATE, i * 16
|
||||
add rD, ways16
|
||||
check2:
|
||||
sub rN, ways
|
||||
jnc nextBlocks2
|
||||
key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2))
|
||||
key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
|
||||
key_last_ymm_n equ (ways_start_reg + ways + 2)
|
||||
|
||||
add rN, ways
|
||||
jmp check
|
||||
|
||||
nextBlock:
|
||||
mov x3, x6
|
||||
movdqa xmm1, [rD]
|
||||
LOAD_OP movdqa, +32
|
||||
pxor xmm0, xmm1
|
||||
DECODE LOAD_OP
|
||||
pxor xmm0, xmm6
|
||||
movdqa [rD], xmm0
|
||||
movdqa xmm6, xmm1
|
||||
add rD, 16
|
||||
check:
|
||||
sub rN, 1
|
||||
jnc nextBlock
|
||||
|
||||
movdqa [r1 - 32], xmm6
|
||||
MY_EPILOG
|
||||
NUM_CBC_REGS equ (ways_start_reg + ways + 3)
|
||||
|
||||
|
||||
; ---------- AES-CBC Encode ----------
|
||||
MY_SEG_PROC AesCbc_Decode_HW, 3
|
||||
|
||||
ENCODE macro op:req
|
||||
op aesenc, -16
|
||||
@@:
|
||||
op aesenc, +0
|
||||
op aesenc, +16
|
||||
add r3, 32
|
||||
jnz @B
|
||||
op aesenclast, +0
|
||||
endm
|
||||
AesCbc_Decode_HW_start::
|
||||
MY_PROLOG NUM_CBC_REGS
|
||||
|
||||
AesCbc_Decode_HW_start_2::
|
||||
movdqa iv, [keys]
|
||||
add keys, 32
|
||||
|
||||
MY_PROC AesCbc_Encode_Intel, 3
|
||||
MY_PROLOG xmm0
|
||||
movdqa key0, [keys + 1 * ksize_r]
|
||||
movdqa key_last, [keys]
|
||||
sub ksize_x, 16
|
||||
|
||||
add r1, r6
|
||||
neg r6
|
||||
add r6, 32
|
||||
jmp check2
|
||||
align 16
|
||||
nextBlocks2:
|
||||
WOP movdqa, [rD + i * 16]
|
||||
mov koffs_x, ksize_x
|
||||
; WOP_KEY pxor, ksize_r + 16
|
||||
WOP pxor, key0
|
||||
; align 16
|
||||
@@:
|
||||
WOP_KEY aesdec, 1 * koffs_r
|
||||
sub koffs_r, 16
|
||||
jnz @B
|
||||
; WOP_KEY aesdeclast, 0
|
||||
WOP aesdeclast, key_last
|
||||
|
||||
pxor @CatStr(xmm, %(ways_start_reg)), iv
|
||||
i = 1
|
||||
rept ways - 1
|
||||
pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
|
||||
i = i + 1
|
||||
endm
|
||||
movdqa iv, [rD + ways * 16 - 16]
|
||||
WOP WRITE_TO_DATA
|
||||
|
||||
jmp check_e
|
||||
add rD, ways * 16
|
||||
AesCbc_Decode_HW_start_3::
|
||||
check2:
|
||||
sub rN, ways
|
||||
jnc nextBlocks2
|
||||
add rN, ways
|
||||
|
||||
align 16
|
||||
nextBlock_e:
|
||||
mov r3, r6
|
||||
pxor xmm0, [rD]
|
||||
pxor xmm0, [r1 + r3 - 32]
|
||||
ENCODE LOAD_OP
|
||||
movdqa [rD], xmm0
|
||||
add rD, 16
|
||||
check_e:
|
||||
sub rN, 1
|
||||
jnc nextBlock_e
|
||||
sub ksize_x, 16
|
||||
|
||||
movdqa [r1 + r6 - 64], xmm0
|
||||
MY_EPILOG
|
||||
jmp check
|
||||
nextBlock:
|
||||
movdqa state, [rD]
|
||||
mov koffs_x, ksize_x
|
||||
; OP_KEY pxor, 1 * ksize_r + 32
|
||||
pxor state, key0
|
||||
; movdqa state0, [rD]
|
||||
; movdqa state, key0
|
||||
; pxor state, state0
|
||||
@@:
|
||||
OP_KEY aesdec, 1 * koffs_r + 16
|
||||
OP_KEY aesdec, 1 * koffs_r
|
||||
sub koffs_r, 32
|
||||
jnz @B
|
||||
OP_KEY aesdec, 16
|
||||
; OP_KEY aesdeclast, 0
|
||||
aesdeclast state, key_last
|
||||
|
||||
pxor state, iv
|
||||
movdqa iv, [rD]
|
||||
; movdqa iv, state0
|
||||
movdqa [rD], state
|
||||
|
||||
add rD, 16
|
||||
check:
|
||||
sub rN, 1
|
||||
jnc nextBlock
|
||||
|
||||
movdqa [keys - 32], iv
|
||||
MY_EPILOG
|
||||
|
||||
|
||||
; ---------- AES-CTR ----------
|
||||
|
||||
XOR_UPD_1 macro reg, offs
|
||||
pxor reg, [rD + offs]
|
||||
endm
|
||||
|
||||
XOR_UPD_2 macro reg, offs
|
||||
movdqa [rD + offs], reg
|
||||
endm
|
||||
; ---------- AVX ----------
|
||||
|
||||
MY_PROC AesCtr_Code_Intel, 3
|
||||
MY_PROLOG xmm6
|
||||
|
||||
mov r5, r4
|
||||
shr r5, 4
|
||||
dec r5
|
||||
shl r5, 4
|
||||
|
||||
mov DWORD PTR [r5], 1
|
||||
mov DWORD PTR [r5 + 4], 0
|
||||
mov DWORD PTR [r5 + 8], 0
|
||||
mov DWORD PTR [r5 + 12], 0
|
||||
|
||||
add r1, r6
|
||||
neg r6
|
||||
add r6, 32
|
||||
|
||||
jmp check2_c
|
||||
|
||||
align 16
|
||||
nextBlocks2_c:
|
||||
movdqa xmm7, [r5]
|
||||
|
||||
AVX__WOP_n macro op
|
||||
i = 0
|
||||
rept ways
|
||||
paddq xmm6, xmm7
|
||||
movdqa @CatStr(xmm,%i), xmm6
|
||||
i = i + 1
|
||||
op (ways_start_reg + i)
|
||||
i = i + 1
|
||||
endm
|
||||
endm
|
||||
|
||||
AVX__WOP macro op
|
||||
i = 0
|
||||
rept ways
|
||||
op @CatStr(ymm, %(ways_start_reg + i))
|
||||
i = i + 1
|
||||
endm
|
||||
endm
|
||||
|
||||
|
||||
AVX__WOP_KEY macro op:req, offs:req
|
||||
vmovdqa key_ymm, ymmword ptr [keys2 + offs]
|
||||
AVX__WOP_n op
|
||||
endm
|
||||
|
||||
|
||||
AVX__CBC_START macro reg
|
||||
; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
|
||||
vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i]
|
||||
endm
|
||||
|
||||
AVX__CBC_END macro reg
|
||||
if i eq 0
|
||||
vpxor reg, reg, iv_ymm
|
||||
else
|
||||
vpxor reg, reg, ymmword ptr [rD + i * 32 - 16]
|
||||
endif
|
||||
endm
|
||||
|
||||
|
||||
AVX__WRITE_TO_DATA macro reg
|
||||
vmovdqu ymmword ptr [rD + 32 * i], reg
|
||||
endm
|
||||
|
||||
AVX__XOR_WITH_DATA macro reg
|
||||
vpxor reg, reg, ymmword ptr [rD + 32 * i]
|
||||
endm
|
||||
|
||||
AVX__CTR_START macro reg
|
||||
vpaddq iv_ymm, iv_ymm, one_ymm
|
||||
; vpxor reg, iv_ymm, key_ymm
|
||||
vpxor reg, iv_ymm, key0_ymm
|
||||
endm
|
||||
|
||||
|
||||
MY_VAES_INSTR_2 macro cmd, dest, a1, a2
|
||||
db 0c4H
|
||||
db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
|
||||
db 5 + 8 * ((not (a1)) and 15)
|
||||
db cmd
|
||||
db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
|
||||
endm
|
||||
|
||||
MY_VAES_INSTR macro cmd, dest, a
|
||||
MY_VAES_INSTR_2 cmd, dest, dest, a
|
||||
endm
|
||||
|
||||
MY_vaesenc macro dest, a
|
||||
MY_VAES_INSTR 0dcH, dest, a
|
||||
endm
|
||||
MY_vaesenclast macro dest, a
|
||||
MY_VAES_INSTR 0ddH, dest, a
|
||||
endm
|
||||
MY_vaesdec macro dest, a
|
||||
MY_VAES_INSTR 0deH, dest, a
|
||||
endm
|
||||
MY_vaesdeclast macro dest, a
|
||||
MY_VAES_INSTR 0dfH, dest, a
|
||||
endm
|
||||
|
||||
|
||||
AVX__VAES_DEC macro reg
|
||||
MY_vaesdec reg, key_ymm_n
|
||||
endm
|
||||
|
||||
AVX__VAES_DEC_LAST_key_last macro reg
|
||||
; MY_vaesdeclast reg, key_ymm_n
|
||||
MY_vaesdeclast reg, key_last_ymm_n
|
||||
endm
|
||||
|
||||
AVX__VAES_ENC macro reg
|
||||
MY_vaesenc reg, key_ymm_n
|
||||
endm
|
||||
|
||||
AVX__VAES_ENC_LAST macro reg
|
||||
MY_vaesenclast reg, key_ymm_n
|
||||
endm
|
||||
|
||||
AVX__vinserti128_TO_HIGH macro dest, src
|
||||
vinserti128 dest, dest, src, 1
|
||||
endm
|
||||
|
||||
|
||||
MY_PROC AesCbc_Decode_HW_256, 3
|
||||
ifdef use_vaes_256
|
||||
MY_PROLOG NUM_CBC_REGS
|
||||
|
||||
cmp rN, ways * 2
|
||||
jb AesCbc_Decode_HW_start_2
|
||||
|
||||
vmovdqa iv, xmmword ptr [keys]
|
||||
add keys, 32
|
||||
|
||||
vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r]
|
||||
vbroadcasti128 key_last_ymm, xmmword ptr [keys]
|
||||
sub ksize_x, 16
|
||||
mov koffs_x, ksize_x
|
||||
add ksize_x, ksize_x
|
||||
|
||||
AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
|
||||
push keys2
|
||||
sub r4, AVX_STACK_SUB
|
||||
; sub r4, 32
|
||||
; sub r4, ksize_r
|
||||
; lea keys2, [r4 + 32]
|
||||
mov keys2, r4
|
||||
and keys2, -32
|
||||
broad:
|
||||
vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
|
||||
vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
|
||||
sub koffs_r, 16
|
||||
; jnc broad
|
||||
jnz broad
|
||||
|
||||
sub rN, ways * 2
|
||||
|
||||
align 16
|
||||
avx_cbcdec_nextBlock2:
|
||||
mov koffs_x, ksize_x
|
||||
; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
|
||||
AVX__WOP AVX__CBC_START
|
||||
@@:
|
||||
AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
|
||||
sub koffs_r, 32
|
||||
jnz @B
|
||||
; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
|
||||
AVX__WOP_n AVX__VAES_DEC_LAST_key_last
|
||||
|
||||
AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
|
||||
AVX__WOP AVX__CBC_END
|
||||
|
||||
vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
|
||||
AVX__WOP AVX__WRITE_TO_DATA
|
||||
|
||||
add rD, ways * 32
|
||||
sub rN, ways * 2
|
||||
jnc avx_cbcdec_nextBlock2
|
||||
add rN, ways * 2
|
||||
|
||||
shr ksize_x, 1
|
||||
|
||||
; lea r4, [r4 + 1 * ksize_r + 32]
|
||||
add r4, AVX_STACK_SUB
|
||||
pop keys2
|
||||
|
||||
vzeroupper
|
||||
jmp AesCbc_Decode_HW_start_3
|
||||
else
|
||||
jmp AesCbc_Decode_HW_start
|
||||
endif
|
||||
MY_ENDP
|
||||
MY_SEG_ENDP
|
||||
|
||||
|
||||
|
||||
|
||||
; ---------- AES-CBC Encode ----------
|
||||
|
||||
e0 equ xmm1
|
||||
|
||||
CENC_START_KEY equ 2
|
||||
CENC_NUM_REG_KEYS equ (3 * 2)
|
||||
; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
|
||||
|
||||
MY_SEG_PROC AesCbc_Encode_HW, 3
|
||||
MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)
|
||||
|
||||
movdqa state, [keys]
|
||||
add keys, 32
|
||||
|
||||
i = 0
|
||||
rept CENC_NUM_REG_KEYS
|
||||
movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
|
||||
i = i + 1
|
||||
endm
|
||||
|
||||
add keys, ksize_r
|
||||
neg ksize_r
|
||||
add ksize_r, (16 * CENC_NUM_REG_KEYS)
|
||||
; movdqa last_key, [keys]
|
||||
jmp check_e
|
||||
|
||||
align 16
|
||||
nextBlock_e:
|
||||
movdqa e0, [rD]
|
||||
mov koffs_r, ksize_r
|
||||
pxor e0, @CatStr(xmm, %(CENC_START_KEY))
|
||||
pxor state, e0
|
||||
|
||||
i = 1
|
||||
rept (CENC_NUM_REG_KEYS - 1)
|
||||
aesenc state, @CatStr(xmm, %(CENC_START_KEY + i))
|
||||
i = i + 1
|
||||
endm
|
||||
|
||||
mov r3, r6
|
||||
LOAD_OP_W pxor, -32
|
||||
ENCODE LOAD_OP_W
|
||||
OP_W XOR_UPD_1, i * 16
|
||||
OP_W XOR_UPD_2, i * 16
|
||||
add rD, ways16
|
||||
check2_c:
|
||||
sub rN, ways
|
||||
jnc nextBlocks2_c
|
||||
@@:
|
||||
OP_KEY aesenc, 1 * koffs_r
|
||||
OP_KEY aesenc, 1 * koffs_r + 16
|
||||
add koffs_r, 32
|
||||
jnz @B
|
||||
OP_KEY aesenclast, 0
|
||||
; aesenclast state, last_key
|
||||
|
||||
movdqa [rD], state
|
||||
add rD, 16
|
||||
check_e:
|
||||
sub rN, 1
|
||||
jnc nextBlock_e
|
||||
|
||||
add rN, ways
|
||||
jmp check_c
|
||||
; movdqa [keys - 32], state
|
||||
movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
|
||||
MY_EPILOG
|
||||
MY_SEG_ENDP
|
||||
|
||||
nextBlock_c:
|
||||
paddq xmm6, [r5]
|
||||
mov r3, r6
|
||||
movdqa xmm0, [r1 + r3 - 32]
|
||||
pxor xmm0, xmm6
|
||||
ENCODE LOAD_OP
|
||||
XOR_UPD_1 xmm0, 0
|
||||
XOR_UPD_2 xmm0, 0
|
||||
add rD, 16
|
||||
check_c:
|
||||
sub rN, 1
|
||||
jnc nextBlock_c
|
||||
|
||||
movdqa [r1 + r6 - 64], xmm6
|
||||
MY_EPILOG
|
||||
|
||||
; ---------- AES-CTR ----------
|
||||
|
||||
ifdef x64
|
||||
; ways = 11
|
||||
endif
|
||||
|
||||
|
||||
one equ @CatStr(xmm, %(ways_start_reg + ways + 1))
|
||||
one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
|
||||
key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2))
|
||||
key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
|
||||
NUM_CTR_REGS equ (ways_start_reg + ways + 3)
|
||||
|
||||
INIT_CTR macro reg, _ppp_
|
||||
paddq iv, one
|
||||
movdqa reg, iv
|
||||
endm
|
||||
|
||||
|
||||
MY_SEG_PROC AesCtr_Code_HW, 3
|
||||
Ctr_start::
|
||||
MY_PROLOG NUM_CTR_REGS
|
||||
|
||||
Ctr_start_2::
|
||||
movdqa iv, [keys]
|
||||
add keys, 32
|
||||
movdqa key0, [keys]
|
||||
|
||||
add keys, ksize_r
|
||||
neg ksize_r
|
||||
add ksize_r, 16
|
||||
|
||||
Ctr_start_3::
|
||||
mov koffs_x, 1
|
||||
movd one, koffs_x
|
||||
jmp check2_c
|
||||
|
||||
align 16
|
||||
nextBlocks2_c:
|
||||
WOP INIT_CTR, 0
|
||||
mov koffs_r, ksize_r
|
||||
; WOP_KEY pxor, 1 * koffs_r -16
|
||||
WOP pxor, key0
|
||||
@@:
|
||||
WOP_KEY aesenc, 1 * koffs_r
|
||||
add koffs_r, 16
|
||||
jnz @B
|
||||
WOP_KEY aesenclast, 0
|
||||
|
||||
WOP XOR_WITH_DATA
|
||||
WOP WRITE_TO_DATA
|
||||
add rD, ways * 16
|
||||
check2_c:
|
||||
sub rN, ways
|
||||
jnc nextBlocks2_c
|
||||
add rN, ways
|
||||
|
||||
sub keys, 16
|
||||
add ksize_r, 16
|
||||
|
||||
jmp check_c
|
||||
|
||||
; align 16
|
||||
nextBlock_c:
|
||||
paddq iv, one
|
||||
; movdqa state, [keys + 1 * koffs_r - 16]
|
||||
movdqa state, key0
|
||||
mov koffs_r, ksize_r
|
||||
pxor state, iv
|
||||
|
||||
@@:
|
||||
OP_KEY aesenc, 1 * koffs_r
|
||||
OP_KEY aesenc, 1 * koffs_r + 16
|
||||
add koffs_r, 32
|
||||
jnz @B
|
||||
OP_KEY aesenc, 0
|
||||
OP_KEY aesenclast, 16
|
||||
|
||||
pxor state, [rD]
|
||||
movdqa [rD], state
|
||||
add rD, 16
|
||||
check_c:
|
||||
sub rN, 1
|
||||
jnc nextBlock_c
|
||||
|
||||
; movdqa [keys - 32], iv
|
||||
movdqa [keys + 1 * ksize_r - 16 - 32], iv
|
||||
MY_EPILOG
|
||||
|
||||
|
||||
MY_PROC AesCtr_Code_HW_256, 3
|
||||
ifdef use_vaes_256
|
||||
MY_PROLOG NUM_CTR_REGS
|
||||
|
||||
cmp rN, ways * 2
|
||||
jb Ctr_start_2
|
||||
|
||||
vbroadcasti128 iv_ymm, xmmword ptr [keys]
|
||||
add keys, 32
|
||||
vbroadcasti128 key0_ymm, xmmword ptr [keys]
|
||||
mov koffs_x, 1
|
||||
vmovd one, koffs_x
|
||||
vpsubq iv_ymm, iv_ymm, one_ymm
|
||||
vpaddq one, one, one
|
||||
AVX__vinserti128_TO_HIGH one_ymm, one
|
||||
|
||||
add keys, ksize_r
|
||||
sub ksize_x, 16
|
||||
neg ksize_r
|
||||
mov koffs_r, ksize_r
|
||||
add ksize_r, ksize_r
|
||||
|
||||
AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
|
||||
push keys2
|
||||
lea keys2, [r4 - 32]
|
||||
sub r4, AVX_STACK_SUB
|
||||
and keys2, -32
|
||||
vbroadcasti128 key_ymm, xmmword ptr [keys]
|
||||
vmovdqa ymmword ptr [keys2], key_ymm
|
||||
@@:
|
||||
vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
|
||||
vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
|
||||
add koffs_r, 16
|
||||
jnz @B
|
||||
|
||||
sub rN, ways * 2
|
||||
|
||||
align 16
|
||||
avx_ctr_nextBlock2:
|
||||
mov koffs_r, ksize_r
|
||||
AVX__WOP AVX__CTR_START
|
||||
; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
|
||||
@@:
|
||||
AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
|
||||
add koffs_r, 32
|
||||
jnz @B
|
||||
AVX__WOP_KEY AVX__VAES_ENC_LAST, 0
|
||||
|
||||
AVX__WOP AVX__XOR_WITH_DATA
|
||||
AVX__WOP AVX__WRITE_TO_DATA
|
||||
|
||||
add rD, ways * 32
|
||||
sub rN, ways * 2
|
||||
jnc avx_ctr_nextBlock2
|
||||
add rN, ways * 2
|
||||
|
||||
vextracti128 iv, iv_ymm, 1
|
||||
sar ksize_r, 1
|
||||
|
||||
add r4, AVX_STACK_SUB
|
||||
pop keys2
|
||||
|
||||
vzeroupper
|
||||
jmp Ctr_start_3
|
||||
else
|
||||
jmp Ctr_start
|
||||
endif
|
||||
MY_ENDP
|
||||
MY_SEG_ENDP
|
||||
|
||||
end
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
|
||||
; 2018-02-06: Igor Pavlov : Public domain
|
||||
; 2021-02-23: Igor Pavlov : Public domain
|
||||
;
|
||||
; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
|
||||
; function for check at link time.
|
||||
@@ -62,6 +62,7 @@ PMULT equ (1 SHL PSHIFT)
|
||||
PMULT_HALF equ (1 SHL (PSHIFT - 1))
|
||||
PMULT_2 equ (1 SHL (PSHIFT + 1))
|
||||
|
||||
kMatchSpecLen_Error_Data equ (1 SHL 9)
|
||||
|
||||
; x0 range
|
||||
; x1 pbPos / (prob) TREE
|
||||
@@ -416,7 +417,7 @@ REV_1_VAR macro prob:req
|
||||
NORM_CALC prob
|
||||
|
||||
cmovae range, t0
|
||||
lea t0_R, [sym_R + sym2_R]
|
||||
lea t0_R, [sym_R + 1 * sym2_R]
|
||||
cmovae sym_R, t0_R
|
||||
mov t0, kBitModelOffset
|
||||
cmovb cod, t1
|
||||
@@ -583,7 +584,7 @@ IsMatchBranch_Pre macro reg
|
||||
mov pbPos, LOC pbMask
|
||||
and pbPos, processedPos
|
||||
shl pbPos, (kLenNumLowBits + 1 + PSHIFT)
|
||||
lea probs_state_R, [probs + state_R]
|
||||
lea probs_state_R, [probs + 1 * state_R]
|
||||
endm
|
||||
|
||||
|
||||
@@ -605,13 +606,13 @@ endm
|
||||
; RSP is (16x + 8) bytes aligned in WIN64-x64
|
||||
; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
|
||||
|
||||
PARAM_lzma equ REG_PARAM_0
|
||||
PARAM_limit equ REG_PARAM_1
|
||||
PARAM_bufLimit equ REG_PARAM_2
|
||||
PARAM_lzma equ REG_ABI_PARAM_0
|
||||
PARAM_limit equ REG_ABI_PARAM_1
|
||||
PARAM_bufLimit equ REG_ABI_PARAM_2
|
||||
|
||||
; MY_ALIGN_64
|
||||
MY_PROC LzmaDec_DecodeReal_3, 3
|
||||
MY_PUSH_PRESERVED_REGS
|
||||
MY_PUSH_PRESERVED_ABI_REGS
|
||||
|
||||
lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
|
||||
and r0, -128
|
||||
@@ -777,7 +778,7 @@ len8_loop:
|
||||
jb len8_loop
|
||||
|
||||
mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
|
||||
jmp len_mid_2
|
||||
jmp short len_mid_2 ; we use short here for MASM that doesn't optimize that code as another assembler programs
|
||||
|
||||
MY_ALIGN_32
|
||||
len_mid_0:
|
||||
@@ -890,11 +891,16 @@ decode_dist_end:
|
||||
|
||||
; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
|
||||
|
||||
mov t1, LOC rep0
|
||||
mov x1, LOC rep1
|
||||
mov x2, LOC rep2
|
||||
|
||||
mov t0, LOC checkDicSize
|
||||
test t0, t0
|
||||
cmove t0, processedPos
|
||||
cmp sym, t0
|
||||
jae end_of_payload
|
||||
; jmp end_of_payload ; for debug
|
||||
|
||||
; rep3 = rep2;
|
||||
; rep2 = rep1;
|
||||
@@ -902,15 +908,12 @@ decode_dist_end:
|
||||
; rep0 = distance + 1;
|
||||
|
||||
inc sym
|
||||
mov t0, LOC rep0
|
||||
mov t1, LOC rep1
|
||||
mov x1, LOC rep2
|
||||
mov LOC rep0, sym
|
||||
; mov sym, LOC remainLen
|
||||
mov sym, len_temp
|
||||
mov LOC rep1, t0
|
||||
mov LOC rep2, t1
|
||||
mov LOC rep3, x1
|
||||
mov LOC rep1, t1
|
||||
mov LOC rep2, x1
|
||||
mov LOC rep3, x2
|
||||
|
||||
; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
|
||||
cmp state, (kNumStates + kNumLitStates) * PMULT
|
||||
@@ -932,7 +935,7 @@ copy_match:
|
||||
; }
|
||||
mov cnt_R, LOC limit
|
||||
sub cnt_R, dicPos
|
||||
jz fin_ERROR
|
||||
jz fin_dicPos_LIMIT
|
||||
|
||||
; curLen = ((rem < len) ? (unsigned)rem : len);
|
||||
cmp cnt_R, sym_R
|
||||
@@ -1091,11 +1094,23 @@ IsRep0Short_label:
|
||||
sub t0_R, dic
|
||||
|
||||
sub probs, RepLenCoder * PMULT
|
||||
inc processedPos
|
||||
|
||||
; state = state < kNumLitStates ? 9 : 11;
|
||||
or state, 1 * PMULT
|
||||
|
||||
; the caller doesn't allow (dicPos >= limit) case for REP_SHORT
|
||||
; so we don't need the following (dicPos == limit) check here:
|
||||
; cmp dicPos, LOC limit
|
||||
; jae fin_dicPos_LIMIT_REP_SHORT
|
||||
|
||||
inc processedPos
|
||||
|
||||
IsMatchBranch_Pre
|
||||
|
||||
; xor sym, sym
|
||||
; sub t0_R, probBranch_R
|
||||
; cmovb sym_R, LOC dicBufSize
|
||||
; add t0_R, sym_R
|
||||
sub t0_R, probBranch_R
|
||||
jae @f
|
||||
add t0_R, LOC dicBufSize
|
||||
@@ -1210,15 +1225,45 @@ copy_match_cross:
|
||||
|
||||
|
||||
|
||||
fin_ERROR:
|
||||
; fin_dicPos_LIMIT_REP_SHORT:
|
||||
; mov sym, 1
|
||||
|
||||
fin_dicPos_LIMIT:
|
||||
mov LOC remainLen, sym
|
||||
jmp fin_OK
|
||||
; For more strict mode we can stop decoding with error
|
||||
; mov sym, 1
|
||||
; jmp fin
|
||||
|
||||
|
||||
fin_ERROR_MATCH_DIST:
|
||||
|
||||
; rep3 = rep2;
|
||||
; rep2 = rep1;
|
||||
; rep1 = rep0;
|
||||
; rep0 = distance + 1;
|
||||
|
||||
add len_temp, kMatchSpecLen_Error_Data
|
||||
mov LOC remainLen, len_temp
|
||||
; fin_ERROR_2:
|
||||
|
||||
mov LOC rep0, sym
|
||||
mov LOC rep1, t1
|
||||
mov LOC rep2, x1
|
||||
mov LOC rep3, x2
|
||||
|
||||
; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
|
||||
cmp state, (kNumStates + kNumLitStates) * PMULT
|
||||
mov state, kNumLitStates * PMULT
|
||||
mov t0, (kNumLitStates + 3) * PMULT
|
||||
cmovae state, t0
|
||||
|
||||
; jmp fin_OK
|
||||
mov sym, 1
|
||||
jmp fin
|
||||
|
||||
end_of_payload:
|
||||
cmp sym, 0FFFFFFFFh ; -1
|
||||
jne fin_ERROR
|
||||
inc sym
|
||||
jnz fin_ERROR_MATCH_DIST
|
||||
|
||||
mov LOC remainLen, kMatchSpecLenStart
|
||||
sub state, kNumStates * PMULT
|
||||
@@ -1250,7 +1295,7 @@ fin:
|
||||
|
||||
mov RSP, LOC Old_RSP
|
||||
|
||||
MY_POP_PRESERVED_REGS
|
||||
MY_POP_PRESERVED_ABI_REGS
|
||||
MY_ENDP
|
||||
|
||||
_TEXT$LZMADECOPT ENDS
|
||||
|
||||
263
Asm/x86/Sha1Opt.asm
Normal file
263
Asm/x86/Sha1Opt.asm
Normal file
@@ -0,0 +1,263 @@
|
||||
; Sha1Opt.asm -- SHA-1 optimized code for SHA-1 x86 hardware instructions
|
||||
; 2021-03-10 : Igor Pavlov : Public domain
|
||||
|
||||
include 7zAsm.asm
|
||||
|
||||
MY_ASM_START
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
CONST SEGMENT
|
||||
|
||||
align 16
|
||||
Reverse_Endian_Mask db 15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
CONST ENDS
|
||||
|
||||
; _TEXT$SHA1OPT SEGMENT 'CODE'
|
||||
|
||||
ifndef x64
|
||||
.686
|
||||
.xmm
|
||||
endif
|
||||
|
||||
ifdef x64
|
||||
rNum equ REG_ABI_PARAM_2
|
||||
if (IS_LINUX eq 0)
|
||||
LOCAL_SIZE equ (16 * 2)
|
||||
endif
|
||||
else
|
||||
rNum equ r0
|
||||
LOCAL_SIZE equ (16 * 1)
|
||||
endif
|
||||
|
||||
rState equ REG_ABI_PARAM_0
|
||||
rData equ REG_ABI_PARAM_1
|
||||
|
||||
|
||||
MY_sha1rnds4 macro a1, a2, imm
|
||||
db 0fH, 03aH, 0ccH, (0c0H + a1 * 8 + a2), imm
|
||||
endm
|
||||
|
||||
MY_SHA_INSTR macro cmd, a1, a2
|
||||
db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
|
||||
endm
|
||||
|
||||
cmd_sha1nexte equ 0c8H
|
||||
cmd_sha1msg1 equ 0c9H
|
||||
cmd_sha1msg2 equ 0caH
|
||||
|
||||
MY_sha1nexte macro a1, a2
|
||||
MY_SHA_INSTR cmd_sha1nexte, a1, a2
|
||||
endm
|
||||
|
||||
MY_sha1msg1 macro a1, a2
|
||||
MY_SHA_INSTR cmd_sha1msg1, a1, a2
|
||||
endm
|
||||
|
||||
MY_sha1msg2 macro a1, a2
|
||||
MY_SHA_INSTR cmd_sha1msg2, a1, a2
|
||||
endm
|
||||
|
||||
MY_PROLOG macro
|
||||
ifdef x64
|
||||
if (IS_LINUX eq 0)
|
||||
movdqa [r4 + 8], xmm6
|
||||
movdqa [r4 + 8 + 16], xmm7
|
||||
sub r4, LOCAL_SIZE + 8
|
||||
movdqa [r4 ], xmm8
|
||||
movdqa [r4 + 16], xmm9
|
||||
endif
|
||||
else ; x86
|
||||
if (IS_CDECL gt 0)
|
||||
mov rState, [r4 + REG_SIZE * 1]
|
||||
mov rData, [r4 + REG_SIZE * 2]
|
||||
mov rNum, [r4 + REG_SIZE * 3]
|
||||
else ; fastcall
|
||||
mov rNum, [r4 + REG_SIZE * 1]
|
||||
endif
|
||||
push r5
|
||||
mov r5, r4
|
||||
and r4, -16
|
||||
sub r4, LOCAL_SIZE
|
||||
endif
|
||||
endm
|
||||
|
||||
MY_EPILOG macro
|
||||
ifdef x64
|
||||
if (IS_LINUX eq 0)
|
||||
movdqa xmm8, [r4]
|
||||
movdqa xmm9, [r4 + 16]
|
||||
add r4, LOCAL_SIZE + 8
|
||||
movdqa xmm6, [r4 + 8]
|
||||
movdqa xmm7, [r4 + 8 + 16]
|
||||
endif
|
||||
else ; x86
|
||||
mov r4, r5
|
||||
pop r5
|
||||
endif
|
||||
MY_ENDP
|
||||
endm
|
||||
|
||||
|
||||
e0_N equ 0
|
||||
e1_N equ 1
|
||||
abcd_N equ 2
|
||||
e0_save_N equ 3
|
||||
w_regs equ 4
|
||||
|
||||
e0 equ @CatStr(xmm, %e0_N)
|
||||
e1 equ @CatStr(xmm, %e1_N)
|
||||
abcd equ @CatStr(xmm, %abcd_N)
|
||||
e0_save equ @CatStr(xmm, %e0_save_N)
|
||||
|
||||
|
||||
ifdef x64
|
||||
abcd_save equ xmm8
|
||||
mask2 equ xmm9
|
||||
else
|
||||
abcd_save equ [r4]
|
||||
mask2 equ e1
|
||||
endif
|
||||
|
||||
LOAD_MASK macro
|
||||
movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
|
||||
endm
|
||||
|
||||
LOAD_W macro k:req
|
||||
movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
|
||||
pshufb @CatStr(xmm, %(w_regs + k)), mask2
|
||||
endm
|
||||
|
||||
|
||||
; pre2 can be 2 or 3 (recommended)
|
||||
pre2 equ 3
|
||||
pre1 equ (pre2 + 1)
|
||||
|
||||
NUM_ROUNDS4 equ 20
|
||||
|
||||
RND4 macro k
|
||||
movdqa @CatStr(xmm, %(e0_N + ((k + 1) mod 2))), abcd
|
||||
MY_sha1rnds4 abcd_N, (e0_N + (k mod 2)), k / 5
|
||||
|
||||
nextM = (w_regs + ((k + 1) mod 4))
|
||||
|
||||
if (k EQ NUM_ROUNDS4 - 1)
|
||||
nextM = e0_save_N
|
||||
endif
|
||||
|
||||
MY_sha1nexte (e0_N + ((k + 1) mod 2)), nextM
|
||||
|
||||
if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2))
|
||||
pxor @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4)))
|
||||
endif
|
||||
|
||||
if (k GE (4 - pre1)) AND (k LT (NUM_ROUNDS4 - pre1))
|
||||
MY_sha1msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
|
||||
endif
|
||||
|
||||
if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2))
|
||||
MY_sha1msg2 (w_regs + ((k + pre2) mod 4)), (w_regs + ((k + pre2 - 1) mod 4))
|
||||
endif
|
||||
endm
|
||||
|
||||
|
||||
REVERSE_STATE macro
|
||||
; abcd ; dcba
|
||||
; e0 ; 000e
|
||||
pshufd abcd, abcd, 01bH ; abcd
|
||||
pshufd e0, e0, 01bH ; e000
|
||||
endm
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
MY_PROC Sha1_UpdateBlocks_HW, 3
|
||||
MY_PROLOG
|
||||
|
||||
cmp rNum, 0
|
||||
je end_c
|
||||
|
||||
movdqu abcd, [rState] ; dcba
|
||||
movd e0, dword ptr [rState + 16] ; 000e
|
||||
|
||||
REVERSE_STATE
|
||||
|
||||
ifdef x64
|
||||
LOAD_MASK
|
||||
endif
|
||||
|
||||
align 16
|
||||
nextBlock:
|
||||
movdqa abcd_save, abcd
|
||||
movdqa e0_save, e0
|
||||
|
||||
ifndef x64
|
||||
LOAD_MASK
|
||||
endif
|
||||
|
||||
LOAD_W 0
|
||||
LOAD_W 1
|
||||
LOAD_W 2
|
||||
LOAD_W 3
|
||||
|
||||
paddd e0, @CatStr(xmm, %(w_regs))
|
||||
k = 0
|
||||
rept NUM_ROUNDS4
|
||||
RND4 k
|
||||
k = k + 1
|
||||
endm
|
||||
|
||||
paddd abcd, abcd_save
|
||||
|
||||
|
||||
add rData, 64
|
||||
sub rNum, 1
|
||||
jnz nextBlock
|
||||
|
||||
REVERSE_STATE
|
||||
|
||||
movdqu [rState], abcd
|
||||
movd dword ptr [rState + 16], e0
|
||||
|
||||
end_c:
|
||||
MY_EPILOG
|
||||
|
||||
; _TEXT$SHA1OPT ENDS
|
||||
|
||||
end
|
||||
263
Asm/x86/Sha256Opt.asm
Normal file
263
Asm/x86/Sha256Opt.asm
Normal file
@@ -0,0 +1,263 @@
|
||||
; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions
|
||||
; 2021-03-10 : Igor Pavlov : Public domain
|
||||
|
||||
include 7zAsm.asm
|
||||
|
||||
MY_ASM_START
|
||||
|
||||
; .data
|
||||
; public K
|
||||
|
||||
; we can use external SHA256_K_ARRAY defined in Sha256.c
|
||||
; but we must guarantee that SHA256_K_ARRAY is aligned for 16-bytes
|
||||
|
||||
COMMENT @
|
||||
ifdef x64
|
||||
K_CONST equ SHA256_K_ARRAY
|
||||
else
|
||||
K_CONST equ _SHA256_K_ARRAY
|
||||
endif
|
||||
EXTRN K_CONST:xmmword
|
||||
@
|
||||
|
||||
CONST SEGMENT
|
||||
|
||||
align 16
|
||||
Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
|
||||
|
||||
; COMMENT @
|
||||
align 16
|
||||
K_CONST \
|
||||
DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
|
||||
DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
|
||||
DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
|
||||
DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
|
||||
DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
|
||||
DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
|
||||
DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
|
||||
DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
|
||||
DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
|
||||
DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
|
||||
DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
|
||||
DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
|
||||
DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
|
||||
DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
|
||||
DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
|
||||
DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
|
||||
; @
|
||||
|
||||
CONST ENDS
|
||||
|
||||
; _TEXT$SHA256OPT SEGMENT 'CODE'
|
||||
|
||||
ifndef x64
|
||||
.686
|
||||
.xmm
|
||||
endif
|
||||
|
||||
ifdef x64
|
||||
rNum equ REG_ABI_PARAM_2
|
||||
if (IS_LINUX eq 0)
|
||||
LOCAL_SIZE equ (16 * 2)
|
||||
endif
|
||||
else
|
||||
rNum equ r0
|
||||
LOCAL_SIZE equ (16 * 1)
|
||||
endif
|
||||
|
||||
rState equ REG_ABI_PARAM_0
|
||||
rData equ REG_ABI_PARAM_1
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
MY_SHA_INSTR macro cmd, a1, a2
|
||||
db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
|
||||
endm
|
||||
|
||||
cmd_sha256rnds2 equ 0cbH
|
||||
cmd_sha256msg1 equ 0ccH
|
||||
cmd_sha256msg2 equ 0cdH
|
||||
|
||||
MY_sha256rnds2 macro a1, a2
|
||||
MY_SHA_INSTR cmd_sha256rnds2, a1, a2
|
||||
endm
|
||||
|
||||
MY_sha256msg1 macro a1, a2
|
||||
MY_SHA_INSTR cmd_sha256msg1, a1, a2
|
||||
endm
|
||||
|
||||
MY_sha256msg2 macro a1, a2
|
||||
MY_SHA_INSTR cmd_sha256msg2, a1, a2
|
||||
endm
|
||||
|
||||
MY_PROLOG macro
|
||||
ifdef x64
|
||||
if (IS_LINUX eq 0)
|
||||
movdqa [r4 + 8], xmm6
|
||||
movdqa [r4 + 8 + 16], xmm7
|
||||
sub r4, LOCAL_SIZE + 8
|
||||
movdqa [r4 ], xmm8
|
||||
movdqa [r4 + 16], xmm9
|
||||
endif
|
||||
else ; x86
|
||||
if (IS_CDECL gt 0)
|
||||
mov rState, [r4 + REG_SIZE * 1]
|
||||
mov rData, [r4 + REG_SIZE * 2]
|
||||
mov rNum, [r4 + REG_SIZE * 3]
|
||||
else ; fastcall
|
||||
mov rNum, [r4 + REG_SIZE * 1]
|
||||
endif
|
||||
push r5
|
||||
mov r5, r4
|
||||
and r4, -16
|
||||
sub r4, LOCAL_SIZE
|
||||
endif
|
||||
endm
|
||||
|
||||
MY_EPILOG macro
|
||||
ifdef x64
|
||||
if (IS_LINUX eq 0)
|
||||
movdqa xmm8, [r4]
|
||||
movdqa xmm9, [r4 + 16]
|
||||
add r4, LOCAL_SIZE + 8
|
||||
movdqa xmm6, [r4 + 8]
|
||||
movdqa xmm7, [r4 + 8 + 16]
|
||||
endif
|
||||
else ; x86
|
||||
mov r4, r5
|
||||
pop r5
|
||||
endif
|
||||
MY_ENDP
|
||||
endm
|
||||
|
||||
|
||||
msg equ xmm0
|
||||
tmp equ xmm0
|
||||
state0_N equ 2
|
||||
state1_N equ 3
|
||||
w_regs equ 4
|
||||
|
||||
|
||||
state1_save equ xmm1
|
||||
state0 equ @CatStr(xmm, %state0_N)
|
||||
state1 equ @CatStr(xmm, %state1_N)
|
||||
|
||||
|
||||
ifdef x64
|
||||
state0_save equ xmm8
|
||||
mask2 equ xmm9
|
||||
else
|
||||
state0_save equ [r4]
|
||||
mask2 equ xmm0
|
||||
endif
|
||||
|
||||
LOAD_MASK macro
|
||||
movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
|
||||
endm
|
||||
|
||||
LOAD_W macro k:req
|
||||
movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
|
||||
pshufb @CatStr(xmm, %(w_regs + k)), mask2
|
||||
endm
|
||||
|
||||
|
||||
; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
|
||||
pre1 equ 3
|
||||
pre2 equ 2
|
||||
|
||||
|
||||
|
||||
RND4 macro k
|
||||
movdqa msg, xmmword ptr [K_CONST + (k) * 16]
|
||||
paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
|
||||
MY_sha256rnds2 state0_N, state1_N
|
||||
pshufd msg, msg, 0eH
|
||||
|
||||
if (k GE (4 - pre1)) AND (k LT (16 - pre1))
|
||||
; w4[0] = msg1(w4[-4], w4[-3])
|
||||
MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
|
||||
endif
|
||||
|
||||
MY_sha256rnds2 state1_N, state0_N
|
||||
|
||||
if (k GE (4 - pre2)) AND (k LT (16 - pre2))
|
||||
movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
|
||||
palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
|
||||
paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
|
||||
; w4[0] = msg2(w4[0], w4[-1])
|
||||
MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
|
||||
endif
|
||||
endm
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
REVERSE_STATE macro
|
||||
; state0 ; dcba
|
||||
; state1 ; hgfe
|
||||
pshufd tmp, state0, 01bH ; abcd
|
||||
pshufd state0, state1, 01bH ; efgh
|
||||
movdqa state1, state0 ; efgh
|
||||
punpcklqdq state0, tmp ; cdgh
|
||||
punpckhqdq state1, tmp ; abef
|
||||
endm
|
||||
|
||||
|
||||
MY_PROC Sha256_UpdateBlocks_HW, 3
|
||||
MY_PROLOG
|
||||
|
||||
cmp rNum, 0
|
||||
je end_c
|
||||
|
||||
movdqu state0, [rState] ; dcba
|
||||
movdqu state1, [rState + 16] ; hgfe
|
||||
|
||||
REVERSE_STATE
|
||||
|
||||
ifdef x64
|
||||
LOAD_MASK
|
||||
endif
|
||||
|
||||
align 16
|
||||
nextBlock:
|
||||
movdqa state0_save, state0
|
||||
movdqa state1_save, state1
|
||||
|
||||
ifndef x64
|
||||
LOAD_MASK
|
||||
endif
|
||||
|
||||
LOAD_W 0
|
||||
LOAD_W 1
|
||||
LOAD_W 2
|
||||
LOAD_W 3
|
||||
|
||||
|
||||
k = 0
|
||||
rept 16
|
||||
RND4 k
|
||||
k = k + 1
|
||||
endm
|
||||
|
||||
paddd state0, state0_save
|
||||
paddd state1, state1_save
|
||||
|
||||
add rData, 64
|
||||
sub rNum, 1
|
||||
jnz nextBlock
|
||||
|
||||
REVERSE_STATE
|
||||
|
||||
movdqu [rState], state0
|
||||
movdqu [rState + 16], state1
|
||||
|
||||
end_c:
|
||||
MY_EPILOG
|
||||
|
||||
; _TEXT$SHA256OPT ENDS
|
||||
|
||||
end
|
||||
@@ -1,5 +1,5 @@
|
||||
; XzCrc64Opt.asm -- CRC64 calculation : optimized version
|
||||
; 2011-06-28 : Igor Pavlov : Public domain
|
||||
; 2021-02-06 : Igor Pavlov : Public domain
|
||||
|
||||
include 7zAsm.asm
|
||||
|
||||
@@ -7,16 +7,15 @@ MY_ASM_START
|
||||
|
||||
ifdef x64
|
||||
|
||||
rD equ r9
|
||||
rN equ r10
|
||||
|
||||
num_VAR equ r8
|
||||
table_VAR equ r9
|
||||
|
||||
SRCDAT equ rN + rD
|
||||
rD equ r9
|
||||
rN equ r10
|
||||
rT equ r5
|
||||
num_VAR equ r8
|
||||
|
||||
SRCDAT4 equ dword ptr [rD + rN * 1]
|
||||
|
||||
CRC_XOR macro dest:req, src:req, t:req
|
||||
xor dest, QWORD PTR [r5 + src * 8 + 0800h * t]
|
||||
xor dest, QWORD PTR [rT + src * 8 + 0800h * t]
|
||||
endm
|
||||
|
||||
CRC1b macro
|
||||
@@ -30,12 +29,15 @@ CRC1b macro
|
||||
endm
|
||||
|
||||
MY_PROLOG macro crc_end:req
|
||||
ifdef ABI_LINUX
|
||||
MY_PUSH_2_REGS
|
||||
else
|
||||
MY_PUSH_4_REGS
|
||||
|
||||
mov r0, r1
|
||||
mov rN, num_VAR
|
||||
mov r5, table_VAR
|
||||
mov rD, r2
|
||||
endif
|
||||
mov r0, REG_ABI_PARAM_0
|
||||
mov rN, REG_ABI_PARAM_2
|
||||
mov rT, REG_ABI_PARAM_3
|
||||
mov rD, REG_ABI_PARAM_1
|
||||
test rN, rN
|
||||
jz crc_end
|
||||
@@:
|
||||
@@ -51,14 +53,14 @@ MY_PROLOG macro crc_end:req
|
||||
sub rN, 4
|
||||
and rN, NOT 3
|
||||
sub rD, rN
|
||||
mov x1, [SRCDAT]
|
||||
mov x1, SRCDAT4
|
||||
xor r0, r1
|
||||
add rN, 4
|
||||
endm
|
||||
|
||||
MY_EPILOG macro crc_end:req
|
||||
sub rN, 4
|
||||
mov x1, [SRCDAT]
|
||||
mov x1, SRCDAT4
|
||||
xor r0, r1
|
||||
mov rD, rN
|
||||
mov rN, num_VAR
|
||||
@@ -69,14 +71,18 @@ MY_EPILOG macro crc_end:req
|
||||
CRC1b
|
||||
jmp crc_end
|
||||
@@:
|
||||
ifdef ABI_LINUX
|
||||
MY_POP_2_REGS
|
||||
else
|
||||
MY_POP_4_REGS
|
||||
endif
|
||||
endm
|
||||
|
||||
MY_PROC XzCrc64UpdateT4, 4
|
||||
MY_PROLOG crc_end_4
|
||||
align 16
|
||||
main_loop_4:
|
||||
mov x1, [SRCDAT]
|
||||
mov x1, SRCDAT4
|
||||
movzx x2, x0_L
|
||||
movzx x3, x0_H
|
||||
shr r0, 16
|
||||
@@ -96,21 +102,43 @@ MY_PROC XzCrc64UpdateT4, 4
|
||||
MY_ENDP
|
||||
|
||||
else
|
||||
; x86 (32-bit)
|
||||
|
||||
rD equ r1
|
||||
rN equ r7
|
||||
rD equ r1
|
||||
rN equ r7
|
||||
rT equ r5
|
||||
|
||||
crc_val equ (REG_SIZE * 5)
|
||||
crc_table equ (8 + crc_val)
|
||||
table_VAR equ [r4 + crc_table]
|
||||
num_VAR equ table_VAR
|
||||
crc_OFFS equ (REG_SIZE * 5)
|
||||
|
||||
if (IS_CDECL gt 0) or (IS_LINUX gt 0)
|
||||
; cdecl or (GNU fastcall) stack:
|
||||
; (UInt32 *) table
|
||||
; size_t size
|
||||
; void * data
|
||||
; (UInt64) crc
|
||||
; ret-ip <-(r4)
|
||||
data_OFFS equ (8 + crc_OFFS)
|
||||
size_OFFS equ (REG_SIZE + data_OFFS)
|
||||
table_OFFS equ (REG_SIZE + size_OFFS)
|
||||
num_VAR equ [r4 + size_OFFS]
|
||||
table_VAR equ [r4 + table_OFFS]
|
||||
else
|
||||
; Windows fastcall:
|
||||
; r1 = data, r2 = size
|
||||
; stack:
|
||||
; (UInt32 *) table
|
||||
; (UInt64) crc
|
||||
; ret-ip <-(r4)
|
||||
table_OFFS equ (8 + crc_OFFS)
|
||||
table_VAR equ [r4 + table_OFFS]
|
||||
num_VAR equ table_VAR
|
||||
endif
|
||||
|
||||
SRCDAT equ rN + rD
|
||||
SRCDAT4 equ dword ptr [rD + rN * 1]
|
||||
|
||||
CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
|
||||
op0 dest0, DWORD PTR [r5 + src * 8 + 0800h * t]
|
||||
op1 dest1, DWORD PTR [r5 + src * 8 + 0800h * t + 4]
|
||||
op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t]
|
||||
op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4]
|
||||
endm
|
||||
|
||||
CRC_XOR macro dest0:req, dest1:req, src:req, t:req
|
||||
@@ -131,12 +159,18 @@ endm
|
||||
|
||||
MY_PROLOG macro crc_end:req
|
||||
MY_PUSH_4_REGS
|
||||
|
||||
mov rN, r2
|
||||
|
||||
mov x0, [r4 + crc_val]
|
||||
mov x2, [r4 + crc_val + 4]
|
||||
mov r5, table_VAR
|
||||
if (IS_CDECL gt 0) or (IS_LINUX gt 0)
|
||||
proc_numParams = proc_numParams + 2 ; for ABI_LINUX
|
||||
mov rN, [r4 + size_OFFS]
|
||||
mov rD, [r4 + data_OFFS]
|
||||
else
|
||||
mov rN, r2
|
||||
endif
|
||||
|
||||
mov x0, [r4 + crc_OFFS]
|
||||
mov x2, [r4 + crc_OFFS + 4]
|
||||
mov rT, table_VAR
|
||||
test rN, rN
|
||||
jz crc_end
|
||||
@@:
|
||||
@@ -154,13 +188,13 @@ MY_PROLOG macro crc_end:req
|
||||
sub rN, 4
|
||||
and rN, NOT 3
|
||||
sub rD, rN
|
||||
xor r0, [SRCDAT]
|
||||
xor r0, SRCDAT4
|
||||
add rN, 4
|
||||
endm
|
||||
|
||||
MY_EPILOG macro crc_end:req
|
||||
sub rN, 4
|
||||
xor r0, [SRCDAT]
|
||||
xor r0, SRCDAT4
|
||||
|
||||
mov rD, rN
|
||||
mov rN, num_VAR
|
||||
@@ -179,7 +213,7 @@ MY_PROC XzCrc64UpdateT4, 5
|
||||
movzx x6, x0_L
|
||||
align 16
|
||||
main_loop_4:
|
||||
mov r3, [SRCDAT]
|
||||
mov r3, SRCDAT4
|
||||
xor r3, r2
|
||||
|
||||
CRC xor, mov, r3, r2, r6, 3
|
||||
@@ -200,6 +234,6 @@ MY_PROC XzCrc64UpdateT4, 5
|
||||
MY_EPILOG crc_end_4
|
||||
MY_ENDP
|
||||
|
||||
endif
|
||||
endif ; ! x64
|
||||
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user