Update to 7-Zip Version 21.02

Tino Reichardt
2021-05-13 16:39:14 +02:00
parent 3724ecfedc
commit 48fa49f76c
620 changed files with 35032 additions and 10925 deletions

Asm/arm64/7zAsm.S (new file, 181 lines)

@@ -0,0 +1,181 @@
// 7zAsm.S -- ASM macros for arm64
// 2021-04-25 : Igor Pavlov : Public domain
#define r0 x0
#define r1 x1
#define r2 x2
#define r3 x3
#define r4 x4
#define r5 x5
#define r6 x6
#define r7 x7
#define r8 x8
#define r9 x9
#define r10 x10
#define r11 x11
#define r12 x12
#define r13 x13
#define r14 x14
#define r15 x15
#define r16 x16
#define r17 x17
#define r18 x18
#define r19 x19
#define r20 x20
#define r21 x21
#define r22 x22
#define r23 x23
#define r24 x24
#define r25 x25
#define r26 x26
#define r27 x27
#define r28 x28
#define r29 x29
#define r30 x30
#define REG_ABI_PARAM_0 r0
#define REG_ABI_PARAM_1 r1
#define REG_ABI_PARAM_2 r2
.macro p2_add reg:req, param:req
add \reg, \reg, \param
.endm
.macro p2_sub reg:req, param:req
sub \reg, \reg, \param
.endm
.macro p2_sub_s reg:req, param:req
subs \reg, \reg, \param
.endm
.macro p2_and reg:req, param:req
and \reg, \reg, \param
.endm
.macro xor reg:req, param:req
eor \reg, \reg, \param
.endm
.macro or reg:req, param:req
orr \reg, \reg, \param
.endm
.macro shl reg:req, param:req
lsl \reg, \reg, \param
.endm
.macro shr reg:req, param:req
lsr \reg, \reg, \param
.endm
.macro sar reg:req, param:req
asr \reg, \reg, \param
.endm
.macro p1_neg reg:req
neg \reg, \reg
.endm
.macro dec reg:req
sub \reg, \reg, 1
.endm
.macro dec_s reg:req
subs \reg, \reg, 1
.endm
.macro inc reg:req
add \reg, \reg, 1
.endm
.macro inc_s reg:req
adds \reg, \reg, 1
.endm
.macro imul reg:req, param:req
mul \reg, \reg, \param
.endm
/*
arm64 and arm use an inverted C (carry) flag after subs/cmp instructions:
arm64-arm : x86
b.lo / b.cc : jb / jc
b.hs / b.cs : jae / jnc
*/
.macro jmp lab:req
b \lab
.endm
.macro je lab:req
b.eq \lab
.endm
.macro jz lab:req
b.eq \lab
.endm
.macro jnz lab:req
b.ne \lab
.endm
.macro jne lab:req
b.ne \lab
.endm
.macro jb lab:req
b.lo \lab
.endm
.macro jbe lab:req
b.ls \lab
.endm
.macro ja lab:req
b.hi \lab
.endm
.macro jae lab:req
b.hs \lab
.endm
.macro cmove dest:req, srcTrue:req
csel \dest, \srcTrue, \dest, eq
.endm
.macro cmovne dest:req, srcTrue:req
csel \dest, \srcTrue, \dest, ne
.endm
.macro cmovs dest:req, srcTrue:req
csel \dest, \srcTrue, \dest, mi
.endm
.macro cmovns dest:req, srcTrue:req
csel \dest, \srcTrue, \dest, pl
.endm
.macro cmovb dest:req, srcTrue:req
csel \dest, \srcTrue, \dest, lo
.endm
.macro cmovae dest:req, srcTrue:req
csel \dest, \srcTrue, \dest, hs
.endm
.macro MY_ALIGN_16 macro
.p2align 4,, (1 << 4) - 1
.endm
.macro MY_ALIGN_32 macro
.p2align 5,, (1 << 5) - 1
.endm
.macro MY_ALIGN_64 macro
.p2align 6,, (1 << 6) - 1
.endm
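// A minimal usage sketch (hypothetical, not part of the file), assuming this
// header is included from a .S file run through the C preprocessor: x86-style
// mnemonics assemble directly, with the inverted-carry mapping handled by the
// jb/jae macros above.
    mov     r1, 8            // r1 is #defined to x1
countdown:                   // hypothetical label
    dec_s   r1               // expands to: subs x1, x1, 1
    jnz     countdown        // expands to: b.ne countdown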

Asm/arm64/LzmaDecOpt.S (new file, 1487 lines): diff suppressed because it is too large.

Asm/x86/7zAsm.asm

@@ -1,5 +1,32 @@
; 7zAsm.asm -- ASM macros
; 2018-02-03 : Igor Pavlov : Public domain
; 2021-02-07 : Igor Pavlov : Public domain
ifdef RAX
x64 equ 1
endif
ifdef x64
IS_X64 equ 1
else
IS_X64 equ 0
endif
ifdef ABI_LINUX
IS_LINUX equ 1
else
IS_LINUX equ 0
endif
ifndef x64
; Use ABI_CDECL for x86 (32-bit) only
; if ABI_CDECL is not defined, we use fastcall abi
ifdef ABI_CDECL
IS_CDECL equ 1
else
IS_CDECL equ 0
endif
endif
MY_ASM_START macro
ifdef x64
@@ -14,8 +41,12 @@ endm
MY_PROC macro name:req, numParams:req
align 16
proc_numParams = numParams
ifdef x64
if (IS_X64 gt 0)
proc_name equ name
elseif (IS_LINUX gt 0)
proc_name equ name
elseif (IS_CDECL gt 0)
proc_name equ @CatStr(_,name)
else
proc_name equ @CatStr(@,name,@, %numParams * 4)
endif
@@ -23,18 +54,19 @@ MY_PROC macro name:req, numParams:req
endm
MY_ENDP macro
ifdef x64
ret
else
if proc_numParams LT 3
ret
if (IS_X64 gt 0)
ret
elseif (IS_CDECL gt 0)
ret
elseif (proc_numParams LT 3)
ret
else
ret (proc_numParams - 2) * 4
ret (proc_numParams - 2) * 4
endif
endif
proc_name ENDP
endm
ifdef x64
REG_SIZE equ 8
REG_LOGAR_SIZE equ 3
@@ -103,6 +135,24 @@ else
r7 equ x7
endif
ifdef x64
ifdef ABI_LINUX
MY_PUSH_2_REGS macro
push r3
push r5
endm
MY_POP_2_REGS macro
pop r5
pop r3
endm
endif
endif
MY_PUSH_4_REGS macro
push r3
push r5
@@ -118,30 +168,91 @@ MY_POP_4_REGS macro
endm
ifdef x64
; for fastcall and for WIN-x64
REG_PARAM_0_x equ x1
REG_PARAM_0 equ r1
REG_PARAM_1 equ r2
; for WIN64-x64 ABI:
ifndef x64
; for x86-fastcall
REG_PARAM_0 equ r1
REG_PARAM_1 equ r2
REG_ABI_PARAM_0_x equ REG_PARAM_0_x
REG_ABI_PARAM_0 equ REG_PARAM_0
REG_ABI_PARAM_1 equ REG_PARAM_1
else
; x64
if (IS_LINUX eq 0)
; for WIN-x64:
REG_PARAM_2 equ r8
REG_PARAM_3 equ r9
MY_PUSH_PRESERVED_REGS macro
MY_PUSH_4_REGS
push r12
push r13
push r14
push r15
REG_ABI_PARAM_0_x equ REG_PARAM_0_x
REG_ABI_PARAM_0 equ REG_PARAM_0
REG_ABI_PARAM_1 equ REG_PARAM_1
REG_ABI_PARAM_2 equ REG_PARAM_2
REG_ABI_PARAM_3 equ REG_PARAM_3
else
; for LINUX-x64:
REG_LINUX_PARAM_0_x equ x7
REG_LINUX_PARAM_0 equ r7
REG_LINUX_PARAM_1 equ r6
REG_LINUX_PARAM_2 equ r2
REG_LINUX_PARAM_3 equ r1
REG_ABI_PARAM_0_x equ REG_LINUX_PARAM_0_x
REG_ABI_PARAM_0 equ REG_LINUX_PARAM_0
REG_ABI_PARAM_1 equ REG_LINUX_PARAM_1
REG_ABI_PARAM_2 equ REG_LINUX_PARAM_2
REG_ABI_PARAM_3 equ REG_LINUX_PARAM_3
MY_ABI_LINUX_TO_WIN_2 macro
mov r2, r6
mov r1, r7
endm
MY_ABI_LINUX_TO_WIN_3 macro
mov r8, r2
mov r2, r6
mov r1, r7
endm
MY_ABI_LINUX_TO_WIN_4 macro
mov r9, r1
mov r8, r2
mov r2, r6
mov r1, r7
endm
endif ; IS_LINUX
MY_PUSH_PRESERVED_ABI_REGS macro
if (IS_LINUX gt 0)
MY_PUSH_2_REGS
else
MY_PUSH_4_REGS
endif
push r12
push r13
push r14
push r15
endm
MY_POP_PRESERVED_REGS macro
pop r15
pop r14
pop r13
pop r12
MY_POP_4_REGS
MY_POP_PRESERVED_ABI_REGS macro
pop r15
pop r14
pop r13
pop r12
if (IS_LINUX gt 0)
MY_POP_2_REGS
else
MY_POP_4_REGS
endif
endm
endif
endif ; x64
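; A usage sketch (MyFunc3 is hypothetical, not from the commit): a procedure
; written against the WIN-x64 parameter registers can be built for LINUX-x64
; by remapping the System V registers on entry, while the matched push/pop
; macro pair keeps the per-ABI preserved sets balanced.
MY_PROC MyFunc3, 3
    MY_PUSH_PRESERVED_ABI_REGS
    if (IS_LINUX gt 0)
        MY_ABI_LINUX_TO_WIN_3   ; rdi/rsi/rdx -> r1 (rcx), r2 (rdx), r8
    endif
    ; ... body reads its three arguments from r1, r2 and r8 ...
    MY_POP_PRESERVED_ABI_REGS
MY_ENDP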

Asm/x86/7zCrcOpt.asm

@@ -1,5 +1,5 @@
; 7zCrcOpt.asm -- CRC32 calculation : optimized version
; 2009-12-12 : Igor Pavlov : Public domain
; 2021-02-07 : Igor Pavlov : Public domain
include 7zAsm.asm
@@ -7,21 +7,28 @@ MY_ASM_START
rD equ r2
rN equ r7
rT equ r5
ifdef x64
num_VAR equ r8
table_VAR equ r9
else
data_size equ (REG_SIZE * 5)
crc_table equ (REG_SIZE + data_size)
num_VAR equ [r4 + data_size]
table_VAR equ [r4 + crc_table]
if (IS_CDECL gt 0)
crc_OFFS equ (REG_SIZE * 5)
data_OFFS equ (REG_SIZE + crc_OFFS)
size_OFFS equ (REG_SIZE + data_OFFS)
else
size_OFFS equ (REG_SIZE * 5)
endif
table_OFFS equ (REG_SIZE + size_OFFS)
num_VAR equ [r4 + size_OFFS]
table_VAR equ [r4 + table_OFFS]
endif
SRCDAT equ rN + rD + 4 *
SRCDAT equ rD + rN * 1 + 4 *
CRC macro op:req, dest:req, src:req, t:req
op dest, DWORD PTR [r5 + src * 4 + 0400h * t]
op dest, DWORD PTR [rT + src * 4 + 0400h * t]
endm
CRC_XOR macro dest:req, src:req, t:req
@@ -43,11 +50,33 @@ CRC1b macro
endm
MY_PROLOG macro crc_end:req
MY_PUSH_4_REGS
ifdef x64
if (IS_LINUX gt 0)
MY_PUSH_2_REGS
mov x0, REG_ABI_PARAM_0_x ; x0 = x7
mov rT, REG_ABI_PARAM_3 ; r5 = r1
mov rN, REG_ABI_PARAM_2 ; r7 = r2
mov rD, REG_ABI_PARAM_1 ; r2 = r6
else
MY_PUSH_4_REGS
mov x0, REG_ABI_PARAM_0_x ; x0 = x1
mov rT, REG_ABI_PARAM_3 ; r5 = r9
mov rN, REG_ABI_PARAM_2 ; r7 = r8
; mov rD, REG_ABI_PARAM_1 ; r2 = r2
endif
else
MY_PUSH_4_REGS
if (IS_CDECL gt 0)
mov x0, [r4 + crc_OFFS]
mov rD, [r4 + data_OFFS]
else
mov x0, REG_ABI_PARAM_0_x
endif
mov rN, num_VAR
mov rT, table_VAR
endif
mov x0, x1
mov rN, num_VAR
mov r5, table_VAR
test rN, rN
jz crc_end
@@:
@@ -77,7 +106,11 @@ MY_EPILOG macro crc_end:req
CRC1b
jmp crc_end
@@:
MY_POP_4_REGS
if (IS_X64 gt 0) and (IS_LINUX gt 0)
MY_POP_2_REGS
else
MY_POP_4_REGS
endif
endm
MY_PROC CrcUpdateT8, 4
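; A layout sketch (assumption, derived from the CDECL offsets above;
; x86, REG_SIZE = 4, stack as seen after MY_PUSH_4_REGS):
;   [r4 + 0..15]   four saved registers
;   [r4 + 16]      ret-ip
;   [r4 + 20]      UInt32 crc            (crc_OFFS  = REG_SIZE * 5)
;   [r4 + 24]      const void * data     (data_OFFS)
;   [r4 + 28]      size_t size           (size_OFFS)
;   [r4 + 32]      const UInt32 * table  (table_OFFS)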

Asm/x86/AesOpt.asm

@@ -1,237 +1,734 @@
; AesOpt.asm -- Intel's AES.
; 2009-12-12 : Igor Pavlov : Public domain
; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
; 2021-03-10 : Igor Pavlov : Public domain
include 7zAsm.asm
ifdef ymm0
use_vaes_256 equ 1
ECHO "++ VAES 256"
else
ECHO "-- NO VAES 256"
endif
ifdef x64
ECHO "x86-64"
else
ECHO "x86"
if (IS_CDECL gt 0)
ECHO "ABI : CDECL"
else
ECHO "ABI : no CDECL : FASTCALL"
endif
endif
if (IS_LINUX gt 0)
ECHO "ABI : LINUX"
else
ECHO "ABI : WINDOWS"
endif
MY_ASM_START
ifndef x64
.686
.xmm
endif
ifdef x64
num equ r8
else
num equ [r4 + REG_SIZE * 4]
; MY_ALIGN EQU ALIGN(64)
MY_ALIGN EQU
SEG_ALIGN EQU MY_ALIGN
MY_SEG_PROC macro name:req, numParams:req
; seg_name equ @CatStr(_TEXT$, name)
; seg_name SEGMENT SEG_ALIGN 'CODE'
MY_PROC name, numParams
endm
MY_SEG_ENDP macro
; seg_name ENDS
endm
NUM_AES_KEYS_MAX equ 15
; the number of push operators in function PROLOG
if (IS_LINUX eq 0) or (IS_X64 eq 0)
num_regs_push equ 2
stack_param_offset equ (REG_SIZE * (1 + num_regs_push))
endif
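; worked example (an assumption for clarity): with num_regs_push = 2,
;   x86:       stack_param_offset = 4 * 3 = 12  (ret-ip + 2 pushes)
;   WIN64-x64: stack_param_offset = 8 * 3 = 24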
rD equ r2
rN equ r0
ifdef x64
num_param equ REG_ABI_PARAM_2
else
if (IS_CDECL gt 0)
; size_t size
; void * data
; UInt32 * aes
; ret-ip <- (r4)
aes_OFFS equ (stack_param_offset)
data_OFFS equ (REG_SIZE + aes_OFFS)
size_OFFS equ (REG_SIZE + data_OFFS)
num_param equ [r4 + size_OFFS]
else
num_param equ [r4 + stack_param_offset]
endif
endif
MY_PROLOG macro reg:req
ifdef x64
movdqa [r4 + 8], xmm6
movdqa [r4 + 8 + 16], xmm7
endif
keys equ REG_PARAM_0 ; r1
rD equ REG_PARAM_1 ; r2
rN equ r0
push r3
push r5
push r6
koffs_x equ x7
koffs_r equ r7
mov rN, num
mov x6, [r1 + 16]
shl x6, 5
ksize_x equ x6
ksize_r equ r6
movdqa reg, [r1]
add r1, 32
endm
keys2 equ r3
MY_EPILOG macro
pop r6
pop r5
pop r3
state equ xmm0
key equ xmm0
key_ymm equ ymm0
key_ymm_n equ 0
ifdef x64
movdqa xmm6, [r4 + 8]
movdqa xmm7, [r4 + 8 + 16]
endif
ifdef x64
ways = 11
else
ways = 4
endif
MY_ENDP
endm
ways_start_reg equ 1
ways equ 4
ways16 equ (ways * 16)
iv equ @CatStr(xmm, %(ways_start_reg + ways))
iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))
OP_W macro op, op2
WOP macro op, op2
i = 0
rept ways
op @CatStr(xmm,%i), op2
i = i + 1
op @CatStr(xmm, %(ways_start_reg + i)), op2
i = i + 1
endm
endm
LOAD_OP macro op:req, offs:req
op xmm0, [r1 + r3 offs]
endm
ifndef ABI_LINUX
ifdef x64
; we use 32 bytes of home space in stack in WIN64-x64
NUM_HOME_MM_REGS equ (32 / 16)
; we preserve xmm registers starting from xmm6 in WIN64-x64
MM_START_SAVE_REG equ 6
SAVE_XMM macro num_used_mm_regs:req
num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
if num_save_mm_regs GT 0
num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
; RSP is (16*x + 8) after entering the function in WIN64-x64
stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)
LOAD_OP_W macro op:req, offs:req
movdqa xmm7, [r1 + r3 offs]
OP_W op, xmm7
i = 0
rept num_save_mm_regs
if i eq NUM_HOME_MM_REGS
sub r4, stack_offset
endif
if i lt NUM_HOME_MM_REGS
movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
else
movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
endif
i = i + 1
endm
endif
endm
RESTORE_XMM macro num_used_mm_regs:req
if num_save_mm_regs GT 0
i = 0
if num_save_mm_regs2 GT 0
rept num_save_mm_regs2
movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
i = i + 1
endm
add r4, stack_offset
endif
num_low_regs = num_save_mm_regs - i
i = 0
rept num_low_regs
movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
i = i + 1
endm
endif
endm
endif ; x64
endif ; ABI_LINUX
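; A save-layout sketch (assumption): with num_used_mm_regs = 8, only xmm6/xmm7
; spill, and both fit in the 32-byte home space at [r4 + stack_param_offset];
; with 10 regs, xmm8/xmm9 go below r4 after sub r4, stack_offset, where the
; (stack_param_offset mod 16) = 8 term restores 16-byte alignment, since RSP
; was (16*x + 8) at entry and the two pushes keep that remainder.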
MY_PROLOG macro num_used_mm_regs:req
; num_regs_push: must be equal to the number of push operators
; push r3
; push r5
if (IS_LINUX eq 0) or (IS_X64 eq 0)
push r6
push r7
endif
mov rN, num_param ; don't move it; num_param can use stack pointer (r4)
if (IS_X64 eq 0)
if (IS_CDECL gt 0)
mov rD, [r4 + data_OFFS]
mov keys, [r4 + aes_OFFS]
endif
elseif (IS_LINUX gt 0)
MY_ABI_LINUX_TO_WIN_2
endif
ifndef ABI_LINUX
ifdef x64
SAVE_XMM num_used_mm_regs
endif
endif
mov ksize_x, [keys + 16]
shl ksize_x, 5
endm
MY_EPILOG macro
ifndef ABI_LINUX
ifdef x64
RESTORE_XMM num_save_mm_regs
endif
endif
if (IS_LINUX eq 0) or (IS_X64 eq 0)
pop r7
pop r6
endif
; pop r5
; pop r3
MY_ENDP
endm
OP_KEY macro op:req, offs:req
op state, [keys + offs]
endm
WOP_KEY macro op:req, offs:req
movdqa key, [keys + offs]
WOP op, key
endm
; ---------- AES-CBC Decode ----------
CBC_DEC_UPDATE macro reg, offs
pxor reg, xmm6
movdqa xmm6, [rD + offs]
movdqa [rD + offs], reg
XOR_WITH_DATA macro reg, _ppp_
pxor reg, [rD + i * 16]
endm
DECODE macro op:req
op aesdec, +16
@@:
op aesdec, +0
op aesdec, -16
sub x3, 32
jnz @B
op aesdeclast, +0
WRITE_TO_DATA macro reg, _ppp_
movdqa [rD + i * 16], reg
endm
MY_PROC AesCbc_Decode_Intel, 3
MY_PROLOG xmm6
sub x6, 32
; state0 equ @CatStr(xmm, %(ways_start_reg))
jmp check2
key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1))
key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
align 16
nextBlocks2:
mov x3, x6
OP_W movdqa, [rD + i * 16]
LOAD_OP_W pxor, +32
DECODE LOAD_OP_W
OP_W CBC_DEC_UPDATE, i * 16
add rD, ways16
check2:
sub rN, ways
jnc nextBlocks2
key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
key_last_ymm_n equ (ways_start_reg + ways + 2)
add rN, ways
jmp check
nextBlock:
mov x3, x6
movdqa xmm1, [rD]
LOAD_OP movdqa, +32
pxor xmm0, xmm1
DECODE LOAD_OP
pxor xmm0, xmm6
movdqa [rD], xmm0
movdqa xmm6, xmm1
add rD, 16
check:
sub rN, 1
jnc nextBlock
movdqa [r1 - 32], xmm6
MY_EPILOG
NUM_CBC_REGS equ (ways_start_reg + ways + 3)
; ---------- AES-CBC Encode ----------
MY_SEG_PROC AesCbc_Decode_HW, 3
ENCODE macro op:req
op aesenc, -16
@@:
op aesenc, +0
op aesenc, +16
add r3, 32
jnz @B
op aesenclast, +0
endm
AesCbc_Decode_HW_start::
MY_PROLOG NUM_CBC_REGS
AesCbc_Decode_HW_start_2::
movdqa iv, [keys]
add keys, 32
MY_PROC AesCbc_Encode_Intel, 3
MY_PROLOG xmm0
movdqa key0, [keys + 1 * ksize_r]
movdqa key_last, [keys]
sub ksize_x, 16
add r1, r6
neg r6
add r6, 32
jmp check2
align 16
nextBlocks2:
WOP movdqa, [rD + i * 16]
mov koffs_x, ksize_x
; WOP_KEY pxor, ksize_r + 16
WOP pxor, key0
; align 16
@@:
WOP_KEY aesdec, 1 * koffs_r
sub koffs_r, 16
jnz @B
; WOP_KEY aesdeclast, 0
WOP aesdeclast, key_last
pxor @CatStr(xmm, %(ways_start_reg)), iv
i = 1
rept ways - 1
pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
i = i + 1
endm
movdqa iv, [rD + ways * 16 - 16]
WOP WRITE_TO_DATA
jmp check_e
add rD, ways * 16
AesCbc_Decode_HW_start_3::
check2:
sub rN, ways
jnc nextBlocks2
add rN, ways
align 16
nextBlock_e:
mov r3, r6
pxor xmm0, [rD]
pxor xmm0, [r1 + r3 - 32]
ENCODE LOAD_OP
movdqa [rD], xmm0
add rD, 16
check_e:
sub rN, 1
jnc nextBlock_e
sub ksize_x, 16
movdqa [r1 + r6 - 64], xmm0
MY_EPILOG
jmp check
nextBlock:
movdqa state, [rD]
mov koffs_x, ksize_x
; OP_KEY pxor, 1 * ksize_r + 32
pxor state, key0
; movdqa state0, [rD]
; movdqa state, key0
; pxor state, state0
@@:
OP_KEY aesdec, 1 * koffs_r + 16
OP_KEY aesdec, 1 * koffs_r
sub koffs_r, 32
jnz @B
OP_KEY aesdec, 16
; OP_KEY aesdeclast, 0
aesdeclast state, key_last
pxor state, iv
movdqa iv, [rD]
; movdqa iv, state0
movdqa [rD], state
add rD, 16
check:
sub rN, 1
jnc nextBlock
movdqa [keys - 32], iv
MY_EPILOG
; ---------- AES-CTR ----------
XOR_UPD_1 macro reg, offs
pxor reg, [rD + offs]
endm
XOR_UPD_2 macro reg, offs
movdqa [rD + offs], reg
endm
; ---------- AVX ----------
MY_PROC AesCtr_Code_Intel, 3
MY_PROLOG xmm6
mov r5, r4
shr r5, 4
dec r5
shl r5, 4
mov DWORD PTR [r5], 1
mov DWORD PTR [r5 + 4], 0
mov DWORD PTR [r5 + 8], 0
mov DWORD PTR [r5 + 12], 0
add r1, r6
neg r6
add r6, 32
jmp check2_c
align 16
nextBlocks2_c:
movdqa xmm7, [r5]
AVX__WOP_n macro op
i = 0
rept ways
paddq xmm6, xmm7
movdqa @CatStr(xmm,%i), xmm6
i = i + 1
op (ways_start_reg + i)
i = i + 1
endm
endm
AVX__WOP macro op
i = 0
rept ways
op @CatStr(ymm, %(ways_start_reg + i))
i = i + 1
endm
endm
AVX__WOP_KEY macro op:req, offs:req
vmovdqa key_ymm, ymmword ptr [keys2 + offs]
AVX__WOP_n op
endm
AVX__CBC_START macro reg
; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i]
endm
AVX__CBC_END macro reg
if i eq 0
vpxor reg, reg, iv_ymm
else
vpxor reg, reg, ymmword ptr [rD + i * 32 - 16]
endif
endm
AVX__WRITE_TO_DATA macro reg
vmovdqu ymmword ptr [rD + 32 * i], reg
endm
AVX__XOR_WITH_DATA macro reg
vpxor reg, reg, ymmword ptr [rD + 32 * i]
endm
AVX__CTR_START macro reg
vpaddq iv_ymm, iv_ymm, one_ymm
; vpxor reg, iv_ymm, key_ymm
vpxor reg, iv_ymm, key0_ymm
endm
MY_VAES_INSTR_2 macro cmd, dest, a1, a2
db 0c4H
db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
db 5 + 8 * ((not (a1)) and 15)
db cmd
db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
endm
MY_VAES_INSTR macro cmd, dest, a
MY_VAES_INSTR_2 cmd, dest, dest, a
endm
MY_vaesenc macro dest, a
MY_VAES_INSTR 0dcH, dest, a
endm
MY_vaesenclast macro dest, a
MY_VAES_INSTR 0ddH, dest, a
endm
MY_vaesdec macro dest, a
MY_VAES_INSTR 0deH, dest, a
endm
MY_vaesdeclast macro dest, a
MY_VAES_INSTR 0dfH, dest, a
endm
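; An encoding walk-through (assumption, decoded from the macro above):
;   MY_vaesdec 1, 2  emits  0c4H 0e2H 075H 0deH 0caH  =  vaesdec ymm1, ymm1, ymm2
;     0c4H                        three-byte VEX escape
;     0e2H = 2 + 40H + 20H + 80H  inverted R/X/B, map select 0F38
;     075H = 5 + 8 * 0eH          W=0, vvvv = NOT 1, L=1 (256-bit), pp=66
;     0deH                        aesdec opcode
;     0caH = 0c0H + 8*1 + 2       modrm: dest ymm1, src ymm2
; (hand-emitted, presumably so the code builds with assemblers lacking VAES)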
AVX__VAES_DEC macro reg
MY_vaesdec reg, key_ymm_n
endm
AVX__VAES_DEC_LAST_key_last macro reg
; MY_vaesdeclast reg, key_ymm_n
MY_vaesdeclast reg, key_last_ymm_n
endm
AVX__VAES_ENC macro reg
MY_vaesenc reg, key_ymm_n
endm
AVX__VAES_ENC_LAST macro reg
MY_vaesenclast reg, key_ymm_n
endm
AVX__vinserti128_TO_HIGH macro dest, src
vinserti128 dest, dest, src, 1
endm
MY_PROC AesCbc_Decode_HW_256, 3
ifdef use_vaes_256
MY_PROLOG NUM_CBC_REGS
cmp rN, ways * 2
jb AesCbc_Decode_HW_start_2
vmovdqa iv, xmmword ptr [keys]
add keys, 32
vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r]
vbroadcasti128 key_last_ymm, xmmword ptr [keys]
sub ksize_x, 16
mov koffs_x, ksize_x
add ksize_x, ksize_x
AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
push keys2
sub r4, AVX_STACK_SUB
; sub r4, 32
; sub r4, ksize_r
; lea keys2, [r4 + 32]
mov keys2, r4
and keys2, -32
broad:
vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
sub koffs_r, 16
; jnc broad
jnz broad
sub rN, ways * 2
align 16
avx_cbcdec_nextBlock2:
mov koffs_x, ksize_x
; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
AVX__WOP AVX__CBC_START
@@:
AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
sub koffs_r, 32
jnz @B
; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
AVX__WOP_n AVX__VAES_DEC_LAST_key_last
AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
AVX__WOP AVX__CBC_END
vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
AVX__WOP AVX__WRITE_TO_DATA
add rD, ways * 32
sub rN, ways * 2
jnc avx_cbcdec_nextBlock2
add rN, ways * 2
shr ksize_x, 1
; lea r4, [r4 + 1 * ksize_r + 32]
add r4, AVX_STACK_SUB
pop keys2
vzeroupper
jmp AesCbc_Decode_HW_start_3
else
jmp AesCbc_Decode_HW_start
endif
MY_ENDP
MY_SEG_ENDP
; ---------- AES-CBC Encode ----------
e0 equ xmm1
CENC_START_KEY equ 2
CENC_NUM_REG_KEYS equ (3 * 2)
; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
MY_SEG_PROC AesCbc_Encode_HW, 3
MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)
movdqa state, [keys]
add keys, 32
i = 0
rept CENC_NUM_REG_KEYS
movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
i = i + 1
endm
add keys, ksize_r
neg ksize_r
add ksize_r, (16 * CENC_NUM_REG_KEYS)
; movdqa last_key, [keys]
jmp check_e
align 16
nextBlock_e:
movdqa e0, [rD]
mov koffs_r, ksize_r
pxor e0, @CatStr(xmm, %(CENC_START_KEY))
pxor state, e0
i = 1
rept (CENC_NUM_REG_KEYS - 1)
aesenc state, @CatStr(xmm, %(CENC_START_KEY + i))
i = i + 1
endm
mov r3, r6
LOAD_OP_W pxor, -32
ENCODE LOAD_OP_W
OP_W XOR_UPD_1, i * 16
OP_W XOR_UPD_2, i * 16
add rD, ways16
check2_c:
sub rN, ways
jnc nextBlocks2_c
@@:
OP_KEY aesenc, 1 * koffs_r
OP_KEY aesenc, 1 * koffs_r + 16
add koffs_r, 32
jnz @B
OP_KEY aesenclast, 0
; aesenclast state, last_key
movdqa [rD], state
add rD, 16
check_e:
sub rN, 1
jnc nextBlock_e
add rN, ways
jmp check_c
; movdqa [keys - 32], state
movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
MY_EPILOG
MY_SEG_ENDP
nextBlock_c:
paddq xmm6, [r5]
mov r3, r6
movdqa xmm0, [r1 + r3 - 32]
pxor xmm0, xmm6
ENCODE LOAD_OP
XOR_UPD_1 xmm0, 0
XOR_UPD_2 xmm0, 0
add rD, 16
check_c:
sub rN, 1
jnc nextBlock_c
movdqa [r1 + r6 - 64], xmm6
MY_EPILOG
; ---------- AES-CTR ----------
ifdef x64
; ways = 11
endif
one equ @CatStr(xmm, %(ways_start_reg + ways + 1))
one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
NUM_CTR_REGS equ (ways_start_reg + ways + 3)
INIT_CTR macro reg, _ppp_
paddq iv, one
movdqa reg, iv
endm
MY_SEG_PROC AesCtr_Code_HW, 3
Ctr_start::
MY_PROLOG NUM_CTR_REGS
Ctr_start_2::
movdqa iv, [keys]
add keys, 32
movdqa key0, [keys]
add keys, ksize_r
neg ksize_r
add ksize_r, 16
Ctr_start_3::
mov koffs_x, 1
movd one, koffs_x
jmp check2_c
align 16
nextBlocks2_c:
WOP INIT_CTR, 0
mov koffs_r, ksize_r
; WOP_KEY pxor, 1 * koffs_r -16
WOP pxor, key0
@@:
WOP_KEY aesenc, 1 * koffs_r
add koffs_r, 16
jnz @B
WOP_KEY aesenclast, 0
WOP XOR_WITH_DATA
WOP WRITE_TO_DATA
add rD, ways * 16
check2_c:
sub rN, ways
jnc nextBlocks2_c
add rN, ways
sub keys, 16
add ksize_r, 16
jmp check_c
; align 16
nextBlock_c:
paddq iv, one
; movdqa state, [keys + 1 * koffs_r - 16]
movdqa state, key0
mov koffs_r, ksize_r
pxor state, iv
@@:
OP_KEY aesenc, 1 * koffs_r
OP_KEY aesenc, 1 * koffs_r + 16
add koffs_r, 32
jnz @B
OP_KEY aesenc, 0
OP_KEY aesenclast, 16
pxor state, [rD]
movdqa [rD], state
add rD, 16
check_c:
sub rN, 1
jnc nextBlock_c
; movdqa [keys - 32], iv
movdqa [keys + 1 * ksize_r - 16 - 32], iv
MY_EPILOG
MY_PROC AesCtr_Code_HW_256, 3
ifdef use_vaes_256
MY_PROLOG NUM_CTR_REGS
cmp rN, ways * 2
jb Ctr_start_2
vbroadcasti128 iv_ymm, xmmword ptr [keys]
add keys, 32
vbroadcasti128 key0_ymm, xmmword ptr [keys]
mov koffs_x, 1
vmovd one, koffs_x
vpsubq iv_ymm, iv_ymm, one_ymm
vpaddq one, one, one
AVX__vinserti128_TO_HIGH one_ymm, one
add keys, ksize_r
sub ksize_x, 16
neg ksize_r
mov koffs_r, ksize_r
add ksize_r, ksize_r
AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
push keys2
lea keys2, [r4 - 32]
sub r4, AVX_STACK_SUB
and keys2, -32
vbroadcasti128 key_ymm, xmmword ptr [keys]
vmovdqa ymmword ptr [keys2], key_ymm
@@:
vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
add koffs_r, 16
jnz @B
sub rN, ways * 2
align 16
avx_ctr_nextBlock2:
mov koffs_r, ksize_r
AVX__WOP AVX__CTR_START
; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
@@:
AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
add koffs_r, 32
jnz @B
AVX__WOP_KEY AVX__VAES_ENC_LAST, 0
AVX__WOP AVX__XOR_WITH_DATA
AVX__WOP AVX__WRITE_TO_DATA
add rD, ways * 32
sub rN, ways * 2
jnc avx_ctr_nextBlock2
add rN, ways * 2
vextracti128 iv, iv_ymm, 1
sar ksize_r, 1
add r4, AVX_STACK_SUB
pop keys2
vzeroupper
jmp Ctr_start_3
else
jmp Ctr_start
endif
MY_ENDP
MY_SEG_ENDP
end

Asm/x86/LzmaDecOpt.asm

@@ -1,5 +1,5 @@
; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
; 2018-02-06: Igor Pavlov : Public domain
; 2021-02-23: Igor Pavlov : Public domain
;
; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
; function for check at link time.
@@ -62,6 +62,7 @@ PMULT equ (1 SHL PSHIFT)
PMULT_HALF equ (1 SHL (PSHIFT - 1))
PMULT_2 equ (1 SHL (PSHIFT + 1))
kMatchSpecLen_Error_Data equ (1 SHL 9)
; x0 range
; x1 pbPos / (prob) TREE
@@ -416,7 +417,7 @@ REV_1_VAR macro prob:req
NORM_CALC prob
cmovae range, t0
lea t0_R, [sym_R + sym2_R]
lea t0_R, [sym_R + 1 * sym2_R]
cmovae sym_R, t0_R
mov t0, kBitModelOffset
cmovb cod, t1
@@ -583,7 +584,7 @@ IsMatchBranch_Pre macro reg
mov pbPos, LOC pbMask
and pbPos, processedPos
shl pbPos, (kLenNumLowBits + 1 + PSHIFT)
lea probs_state_R, [probs + state_R]
lea probs_state_R, [probs + 1 * state_R]
endm
@@ -605,13 +606,13 @@ endm
; RSP is (16x + 8) bytes aligned in WIN64-x64
; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
PARAM_lzma equ REG_PARAM_0
PARAM_limit equ REG_PARAM_1
PARAM_bufLimit equ REG_PARAM_2
PARAM_lzma equ REG_ABI_PARAM_0
PARAM_limit equ REG_ABI_PARAM_1
PARAM_bufLimit equ REG_ABI_PARAM_2
; MY_ALIGN_64
MY_PROC LzmaDec_DecodeReal_3, 3
MY_PUSH_PRESERVED_REGS
MY_PUSH_PRESERVED_ABI_REGS
lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
and r0, -128
@@ -777,7 +778,7 @@ len8_loop:
jb len8_loop
mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
jmp len_mid_2
jmp short len_mid_2 ; explicit short is needed for MASM, which doesn't optimize this branch to the short form as other assemblers do
MY_ALIGN_32
len_mid_0:
@@ -890,11 +891,16 @@ decode_dist_end:
; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
mov t1, LOC rep0
mov x1, LOC rep1
mov x2, LOC rep2
mov t0, LOC checkDicSize
test t0, t0
cmove t0, processedPos
cmp sym, t0
jae end_of_payload
; jmp end_of_payload ; for debug
; rep3 = rep2;
; rep2 = rep1;
@@ -902,15 +908,12 @@ decode_dist_end:
; rep0 = distance + 1;
inc sym
mov t0, LOC rep0
mov t1, LOC rep1
mov x1, LOC rep2
mov LOC rep0, sym
; mov sym, LOC remainLen
mov sym, len_temp
mov LOC rep1, t0
mov LOC rep2, t1
mov LOC rep3, x1
mov LOC rep1, t1
mov LOC rep2, x1
mov LOC rep3, x2
; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
cmp state, (kNumStates + kNumLitStates) * PMULT
@@ -932,7 +935,7 @@ copy_match:
; }
mov cnt_R, LOC limit
sub cnt_R, dicPos
jz fin_ERROR
jz fin_dicPos_LIMIT
; curLen = ((rem < len) ? (unsigned)rem : len);
cmp cnt_R, sym_R
@@ -1091,11 +1094,23 @@ IsRep0Short_label:
sub t0_R, dic
sub probs, RepLenCoder * PMULT
inc processedPos
; state = state < kNumLitStates ? 9 : 11;
or state, 1 * PMULT
; the caller doesn't allow (dicPos >= limit) case for REP_SHORT
; so we don't need the following (dicPos == limit) check here:
; cmp dicPos, LOC limit
; jae fin_dicPos_LIMIT_REP_SHORT
inc processedPos
IsMatchBranch_Pre
; xor sym, sym
; sub t0_R, probBranch_R
; cmovb sym_R, LOC dicBufSize
; add t0_R, sym_R
sub t0_R, probBranch_R
jae @f
add t0_R, LOC dicBufSize
@@ -1210,15 +1225,45 @@ copy_match_cross:
fin_ERROR:
; fin_dicPos_LIMIT_REP_SHORT:
; mov sym, 1
fin_dicPos_LIMIT:
mov LOC remainLen, sym
jmp fin_OK
; For more strict mode we can stop decoding with error
; mov sym, 1
; jmp fin
fin_ERROR_MATCH_DIST:
; rep3 = rep2;
; rep2 = rep1;
; rep1 = rep0;
; rep0 = distance + 1;
add len_temp, kMatchSpecLen_Error_Data
mov LOC remainLen, len_temp
; fin_ERROR_2:
mov LOC rep0, sym
mov LOC rep1, t1
mov LOC rep2, x1
mov LOC rep3, x2
; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
cmp state, (kNumStates + kNumLitStates) * PMULT
mov state, kNumLitStates * PMULT
mov t0, (kNumLitStates + 3) * PMULT
cmovae state, t0
; jmp fin_OK
mov sym, 1
jmp fin
end_of_payload:
cmp sym, 0FFFFFFFFh ; -1
jne fin_ERROR
inc sym
jnz fin_ERROR_MATCH_DIST
mov LOC remainLen, kMatchSpecLenStart
sub state, kNumStates * PMULT
@@ -1250,7 +1295,7 @@ fin:
mov RSP, LOC Old_RSP
MY_POP_PRESERVED_REGS
MY_POP_PRESERVED_ABI_REGS
MY_ENDP
_TEXT$LZMADECOPT ENDS

Asm/x86/Sha1Opt.asm (new file, 263 lines)

@@ -0,0 +1,263 @@
; Sha1Opt.asm -- SHA-1 optimized code for SHA-1 x86 hardware instructions
; 2021-03-10 : Igor Pavlov : Public domain
include 7zAsm.asm
MY_ASM_START
CONST SEGMENT
align 16
Reverse_Endian_Mask db 15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0
CONST ENDS
; _TEXT$SHA1OPT SEGMENT 'CODE'
ifndef x64
.686
.xmm
endif
ifdef x64
rNum equ REG_ABI_PARAM_2
if (IS_LINUX eq 0)
LOCAL_SIZE equ (16 * 2)
endif
else
rNum equ r0
LOCAL_SIZE equ (16 * 1)
endif
rState equ REG_ABI_PARAM_0
rData equ REG_ABI_PARAM_1
MY_sha1rnds4 macro a1, a2, imm
db 0fH, 03aH, 0ccH, (0c0H + a1 * 8 + a2), imm
endm
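; An encoding sketch (assumption, decoded from the macro above):
;   MY_sha1rnds4 2, 0, 1  emits  0fH 03aH 0ccH 0d0H 01H
;   =  sha1rnds4 xmm2, xmm0, 1    (modrm 0d0H = 0c0H + a1 * 8 + a2)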
MY_SHA_INSTR macro cmd, a1, a2
db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
endm
cmd_sha1nexte equ 0c8H
cmd_sha1msg1 equ 0c9H
cmd_sha1msg2 equ 0caH
MY_sha1nexte macro a1, a2
MY_SHA_INSTR cmd_sha1nexte, a1, a2
endm
MY_sha1msg1 macro a1, a2
MY_SHA_INSTR cmd_sha1msg1, a1, a2
endm
MY_sha1msg2 macro a1, a2
MY_SHA_INSTR cmd_sha1msg2, a1, a2
endm
MY_PROLOG macro
ifdef x64
if (IS_LINUX eq 0)
movdqa [r4 + 8], xmm6
movdqa [r4 + 8 + 16], xmm7
sub r4, LOCAL_SIZE + 8
movdqa [r4 ], xmm8
movdqa [r4 + 16], xmm9
endif
else ; x86
if (IS_CDECL gt 0)
mov rState, [r4 + REG_SIZE * 1]
mov rData, [r4 + REG_SIZE * 2]
mov rNum, [r4 + REG_SIZE * 3]
else ; fastcall
mov rNum, [r4 + REG_SIZE * 1]
endif
push r5
mov r5, r4
and r4, -16
sub r4, LOCAL_SIZE
endif
endm
MY_EPILOG macro
ifdef x64
if (IS_LINUX eq 0)
movdqa xmm8, [r4]
movdqa xmm9, [r4 + 16]
add r4, LOCAL_SIZE + 8
movdqa xmm6, [r4 + 8]
movdqa xmm7, [r4 + 8 + 16]
endif
else ; x86
mov r4, r5
pop r5
endif
MY_ENDP
endm
e0_N equ 0
e1_N equ 1
abcd_N equ 2
e0_save_N equ 3
w_regs equ 4
e0 equ @CatStr(xmm, %e0_N)
e1 equ @CatStr(xmm, %e1_N)
abcd equ @CatStr(xmm, %abcd_N)
e0_save equ @CatStr(xmm, %e0_save_N)
ifdef x64
abcd_save equ xmm8
mask2 equ xmm9
else
abcd_save equ [r4]
mask2 equ e1
endif
LOAD_MASK macro
movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
endm
LOAD_W macro k:req
movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
pshufb @CatStr(xmm, %(w_regs + k)), mask2
endm
; pre2 can be 2 or 3 (recommended)
pre2 equ 3
pre1 equ (pre2 + 1)
NUM_ROUNDS4 equ 20
RND4 macro k
movdqa @CatStr(xmm, %(e0_N + ((k + 1) mod 2))), abcd
MY_sha1rnds4 abcd_N, (e0_N + (k mod 2)), k / 5
nextM = (w_regs + ((k + 1) mod 4))
if (k EQ NUM_ROUNDS4 - 1)
nextM = e0_save_N
endif
MY_sha1nexte (e0_N + ((k + 1) mod 2)), nextM
if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2))
pxor @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4)))
endif
if (k GE (4 - pre1)) AND (k LT (NUM_ROUNDS4 - pre1))
MY_sha1msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
endif
if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2))
MY_sha1msg2 (w_regs + ((k + pre2) mod 4)), (w_regs + ((k + pre2 - 1) mod 4))
endif
endm
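; A scheduling note (assumption): RND4 k runs rounds 4k .. 4k+3 of the 80,
; cycling the four message registers xmm4..xmm7; sha1msg1 is issued pre1 = 4
; iterations early and the pxor + sha1msg2 completion pre2 = 3 iterations
; early, so each message block is ready just before its rounds; on the final
; iteration sha1nexte reads e0_save, adding the block's starting E back in.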
REVERSE_STATE macro
; abcd ; dcba
; e0 ; 000e
pshufd abcd, abcd, 01bH ; abcd
pshufd e0, e0, 01bH ; e000
endm
MY_PROC Sha1_UpdateBlocks_HW, 3
MY_PROLOG
cmp rNum, 0
je end_c
movdqu abcd, [rState] ; dcba
movd e0, dword ptr [rState + 16] ; 000e
REVERSE_STATE
ifdef x64
LOAD_MASK
endif
align 16
nextBlock:
movdqa abcd_save, abcd
movdqa e0_save, e0
ifndef x64
LOAD_MASK
endif
LOAD_W 0
LOAD_W 1
LOAD_W 2
LOAD_W 3
paddd e0, @CatStr(xmm, %(w_regs))
k = 0
rept NUM_ROUNDS4
RND4 k
k = k + 1
endm
paddd abcd, abcd_save
add rData, 64
sub rNum, 1
jnz nextBlock
REVERSE_STATE
movdqu [rState], abcd
movd dword ptr [rState + 16], e0
end_c:
MY_EPILOG
; _TEXT$SHA1OPT ENDS
end

Asm/x86/Sha256Opt.asm (new file, 263 lines)

@@ -0,0 +1,263 @@
; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions
; 2021-03-10 : Igor Pavlov : Public domain
include 7zAsm.asm
MY_ASM_START
; .data
; public K
; we can use external SHA256_K_ARRAY defined in Sha256.c
; but we must guarantee that SHA256_K_ARRAY is aligned to 16 bytes
COMMENT @
ifdef x64
K_CONST equ SHA256_K_ARRAY
else
K_CONST equ _SHA256_K_ARRAY
endif
EXTRN K_CONST:xmmword
@
CONST SEGMENT
align 16
Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
; COMMENT @
align 16
K_CONST \
DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
; @
CONST ENDS
; _TEXT$SHA256OPT SEGMENT 'CODE'
ifndef x64
.686
.xmm
endif
ifdef x64
rNum equ REG_ABI_PARAM_2
if (IS_LINUX eq 0)
LOCAL_SIZE equ (16 * 2)
endif
else
rNum equ r0
LOCAL_SIZE equ (16 * 1)
endif
rState equ REG_ABI_PARAM_0
rData equ REG_ABI_PARAM_1
MY_SHA_INSTR macro cmd, a1, a2
db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
endm
cmd_sha256rnds2 equ 0cbH
cmd_sha256msg1 equ 0ccH
cmd_sha256msg2 equ 0cdH
MY_sha256rnds2 macro a1, a2
MY_SHA_INSTR cmd_sha256rnds2, a1, a2
endm
MY_sha256msg1 macro a1, a2
MY_SHA_INSTR cmd_sha256msg1, a1, a2
endm
MY_sha256msg2 macro a1, a2
MY_SHA_INSTR cmd_sha256msg2, a1, a2
endm
MY_PROLOG macro
ifdef x64
if (IS_LINUX eq 0)
movdqa [r4 + 8], xmm6
movdqa [r4 + 8 + 16], xmm7
sub r4, LOCAL_SIZE + 8
movdqa [r4 ], xmm8
movdqa [r4 + 16], xmm9
endif
else ; x86
if (IS_CDECL gt 0)
mov rState, [r4 + REG_SIZE * 1]
mov rData, [r4 + REG_SIZE * 2]
mov rNum, [r4 + REG_SIZE * 3]
else ; fastcall
mov rNum, [r4 + REG_SIZE * 1]
endif
push r5
mov r5, r4
and r4, -16
sub r4, LOCAL_SIZE
endif
endm
MY_EPILOG macro
ifdef x64
if (IS_LINUX eq 0)
movdqa xmm8, [r4]
movdqa xmm9, [r4 + 16]
add r4, LOCAL_SIZE + 8
movdqa xmm6, [r4 + 8]
movdqa xmm7, [r4 + 8 + 16]
endif
else ; x86
mov r4, r5
pop r5
endif
MY_ENDP
endm
msg equ xmm0
tmp equ xmm0
state0_N equ 2
state1_N equ 3
w_regs equ 4
state1_save equ xmm1
state0 equ @CatStr(xmm, %state0_N)
state1 equ @CatStr(xmm, %state1_N)
ifdef x64
state0_save equ xmm8
mask2 equ xmm9
else
state0_save equ [r4]
mask2 equ xmm0
endif
LOAD_MASK macro
movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
endm
LOAD_W macro k:req
movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
pshufb @CatStr(xmm, %(w_regs + k)), mask2
endm
; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
pre1 equ 3
pre2 equ 2
RND4 macro k
movdqa msg, xmmword ptr [K_CONST + (k) * 16]
paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
MY_sha256rnds2 state0_N, state1_N
pshufd msg, msg, 0eH
if (k GE (4 - pre1)) AND (k LT (16 - pre1))
; w4[0] = msg1(w4[-4], w4[-3])
MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
endif
MY_sha256rnds2 state1_N, state0_N
if (k GE (4 - pre2)) AND (k LT (16 - pre2))
movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
; w4[0] = msg2(w4[0], w4[-1])
MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
endif
endm
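; A scheduling note (assumption): each RND4 k adds K[4k..4k+3] to the current
; message block and runs four rounds via two sha256rnds2 (pshufd moves the
; upper constants into place for the second one); sha256msg1 starts the
; schedule for the block pre1 = 3 iterations ahead, and the palignr/paddd/
; sha256msg2 sequence completes it pre2 = 2 iterations ahead, per the rule above.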
REVERSE_STATE macro
; state0 ; dcba
; state1 ; hgfe
pshufd tmp, state0, 01bH ; abcd
pshufd state0, state1, 01bH ; efgh
movdqa state1, state0 ; efgh
punpcklqdq state0, tmp ; cdgh
punpckhqdq state1, tmp ; abef
endm
MY_PROC Sha256_UpdateBlocks_HW, 3
MY_PROLOG
cmp rNum, 0
je end_c
movdqu state0, [rState] ; dcba
movdqu state1, [rState + 16] ; hgfe
REVERSE_STATE
ifdef x64
LOAD_MASK
endif
align 16
nextBlock:
movdqa state0_save, state0
movdqa state1_save, state1
ifndef x64
LOAD_MASK
endif
LOAD_W 0
LOAD_W 1
LOAD_W 2
LOAD_W 3
k = 0
rept 16
RND4 k
k = k + 1
endm
paddd state0, state0_save
paddd state1, state1_save
add rData, 64
sub rNum, 1
jnz nextBlock
REVERSE_STATE
movdqu [rState], state0
movdqu [rState + 16], state1
end_c:
MY_EPILOG
; _TEXT$SHA256OPT ENDS
end

Asm/x86/XzCrc64Opt.asm

@@ -1,5 +1,5 @@
; XzCrc64Opt.asm -- CRC64 calculation : optimized version
; 2011-06-28 : Igor Pavlov : Public domain
; 2021-02-06 : Igor Pavlov : Public domain
include 7zAsm.asm
@@ -7,16 +7,15 @@ MY_ASM_START
ifdef x64
rD equ r9
rN equ r10
num_VAR equ r8
table_VAR equ r9
SRCDAT equ rN + rD
rD equ r9
rN equ r10
rT equ r5
num_VAR equ r8
SRCDAT4 equ dword ptr [rD + rN * 1]
CRC_XOR macro dest:req, src:req, t:req
xor dest, QWORD PTR [r5 + src * 8 + 0800h * t]
xor dest, QWORD PTR [rT + src * 8 + 0800h * t]
endm
CRC1b macro
@@ -30,12 +29,15 @@ CRC1b macro
endm
MY_PROLOG macro crc_end:req
ifdef ABI_LINUX
MY_PUSH_2_REGS
else
MY_PUSH_4_REGS
mov r0, r1
mov rN, num_VAR
mov r5, table_VAR
mov rD, r2
endif
mov r0, REG_ABI_PARAM_0
mov rN, REG_ABI_PARAM_2
mov rT, REG_ABI_PARAM_3
mov rD, REG_ABI_PARAM_1
test rN, rN
jz crc_end
@@:
@@ -51,14 +53,14 @@ MY_PROLOG macro crc_end:req
sub rN, 4
and rN, NOT 3
sub rD, rN
mov x1, [SRCDAT]
mov x1, SRCDAT4
xor r0, r1
add rN, 4
endm
MY_EPILOG macro crc_end:req
sub rN, 4
mov x1, [SRCDAT]
mov x1, SRCDAT4
xor r0, r1
mov rD, rN
mov rN, num_VAR
@@ -69,14 +71,18 @@ MY_EPILOG macro crc_end:req
CRC1b
jmp crc_end
@@:
ifdef ABI_LINUX
MY_POP_2_REGS
else
MY_POP_4_REGS
endif
endm
MY_PROC XzCrc64UpdateT4, 4
MY_PROLOG crc_end_4
align 16
main_loop_4:
mov x1, [SRCDAT]
mov x1, SRCDAT4
movzx x2, x0_L
movzx x3, x0_H
shr r0, 16
@@ -96,21 +102,43 @@ MY_PROC XzCrc64UpdateT4, 4
MY_ENDP
else
; x86 (32-bit)
rD equ r1
rN equ r7
rD equ r1
rN equ r7
rT equ r5
crc_val equ (REG_SIZE * 5)
crc_table equ (8 + crc_val)
table_VAR equ [r4 + crc_table]
num_VAR equ table_VAR
crc_OFFS equ (REG_SIZE * 5)
if (IS_CDECL gt 0) or (IS_LINUX gt 0)
; cdecl or (GNU fastcall) stack:
; (UInt32 *) table
; size_t size
; void * data
; (UInt64) crc
; ret-ip <-(r4)
data_OFFS equ (8 + crc_OFFS)
size_OFFS equ (REG_SIZE + data_OFFS)
table_OFFS equ (REG_SIZE + size_OFFS)
num_VAR equ [r4 + size_OFFS]
table_VAR equ [r4 + table_OFFS]
else
; Windows fastcall:
; r1 = data, r2 = size
; stack:
; (UInt32 *) table
; (UInt64) crc
; ret-ip <-(r4)
table_OFFS equ (8 + crc_OFFS)
table_VAR equ [r4 + table_OFFS]
num_VAR equ table_VAR
endif
SRCDAT equ rN + rD
SRCDAT4 equ dword ptr [rD + rN * 1]
CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
op0 dest0, DWORD PTR [r5 + src * 8 + 0800h * t]
op1 dest1, DWORD PTR [r5 + src * 8 + 0800h * t + 4]
op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t]
op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4]
endm
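; A note on the pair form (assumption): on x86 the running UInt64 CRC lives
; in a dest0:dest1 register pair, so each lookup reads the low and high
; DWORDs of the 8-byte table entry at [rT + src * 8 + 0800h * t] separately.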
CRC_XOR macro dest0:req, dest1:req, src:req, t:req
@@ -131,12 +159,18 @@ endm
MY_PROLOG macro crc_end:req
MY_PUSH_4_REGS
mov rN, r2
mov x0, [r4 + crc_val]
mov x2, [r4 + crc_val + 4]
mov r5, table_VAR
if (IS_CDECL gt 0) or (IS_LINUX gt 0)
proc_numParams = proc_numParams + 2 ; for ABI_LINUX
mov rN, [r4 + size_OFFS]
mov rD, [r4 + data_OFFS]
else
mov rN, r2
endif
mov x0, [r4 + crc_OFFS]
mov x2, [r4 + crc_OFFS + 4]
mov rT, table_VAR
test rN, rN
jz crc_end
@@:
@@ -154,13 +188,13 @@ MY_PROLOG macro crc_end:req
sub rN, 4
and rN, NOT 3
sub rD, rN
xor r0, [SRCDAT]
xor r0, SRCDAT4
add rN, 4
endm
MY_EPILOG macro crc_end:req
sub rN, 4
xor r0, [SRCDAT]
xor r0, SRCDAT4
mov rD, rN
mov rN, num_VAR
@@ -179,7 +213,7 @@ MY_PROC XzCrc64UpdateT4, 5
movzx x6, x0_L
align 16
main_loop_4:
mov r3, [SRCDAT]
mov r3, SRCDAT4
xor r3, r2
CRC xor, mov, r3, r2, r6, 3
@@ -200,6 +234,6 @@ MY_PROC XzCrc64UpdateT4, 5
MY_EPILOG crc_end_4
MY_ENDP
endif
endif ; ! x64
end