You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
4080 lines
80 KiB
ArmAsm
4080 lines
80 KiB
ArmAsm
// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.

//go:build !appengine && !noasm && gc && !noasm
|
|
|
|
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Decodes sequences in a loop. Each iteration writes one 24-byte record
// through the pointer loaded from 104(ctx): literal length at +0, match
// length at +8, offset at +16. The loop repeats until the counter at 96(ctx)
// goes negative after being decremented.
//
// Return value (ret+24(FP)):
//   0  success (history offsets and bit reader fields are written back)
//   1  match length is non-zero while the offset is zero
//   2  match length is above 0x20002
//   4  the counter at 128(ctx) went negative after subtracting a literal length
// The error paths return without writing the registers back to s / br.
//
// Register use inside the loop:
//   DX        bit container (32(br))
//   BX        number of bits already consumed from DX (byte at 40(br))
//   SI        count of unread input bytes (24(br))
//   (SP)/R14  input cursor = (br) + SI, walks downwards as bytes are consumed
//   DI        literal length state
//   R8        match length state
//   R9        offset state
//   R10       pointer to the current output record
//   R11-R13   the three offsets kept at 144/152/160(s), most recent in R11
//   AX, CX, R14, R15, BP  scratch
//
// Layout of a 64-bit state word, as used by the code below:
//   bits  0-7   number of bits to read for the next state
//   bits  8-15  number of extra bits to read for the value
//   bits 16-31  base added to the bits read to form the next table index
//   bits 32-63  baseline of the value
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
	// Load the bit reader fields and compute the input cursor.
	MOVQ br+8(FP), AX
	MOVQ 32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ 24(AX), SI
	MOVQ (AX), AX
	ADDQ SI, AX
	MOVQ AX, (SP)

	// Load the three decoder states and the output record pointer.
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 104(AX), R10

	// Load the three stored offsets.
	MOVQ s+0(FP), AX
	MOVQ 144(AX), R11
	MOVQ 152(AX), R12
	MOVQ 160(AX), R13

sequenceDecs_decode_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 bytes left): step the cursor back by the number of
	// whole bytes consumed and reload all 64 bits.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_amd64_fill_end

sequenceDecs_decode_amd64_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while input remains (SI > 0)
	// and at least one whole byte has been consumed (BX > 7).
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_amd64_fill_end
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_amd64_fill_byte_by_byte

sequenceDecs_decode_amd64_fill_end:
	// Update offset
	// value = baseline (state bits 32-63) + the next n bits of input,
	// where n is the extra-bit count in state bits 8-15.
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15 // drop the bits that were already consumed
	MOVB AH, CL // CL = extra-bit count
	SHRQ $0x20, AX // AX = baseline
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_of_update_zero
	ADDQ CX, BX // consume the extra bits
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_of_update_zero // past the 64-bit container: baseline only
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_of_update_zero
	NEGQ CX // shift count is taken from CL by the CPU, i.e. a right shift by 64-n
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	// Same extraction as for the offset, using the match length state.
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Fill bitreader to have enough for the remaining
	// Same refill logic as at the top of the loop.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_amd64_fill_2_end

sequenceDecs_decode_amd64_fill_2_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_amd64_fill_2_end
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_amd64_fill_2_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte

sequenceDecs_decode_amd64_fill_2_end:
	// Update literal length
	// Same extraction as above, using the literal length state.
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	// Save the input cursor, keep the offset's extra-bit count in AX for the
	// "Adjust offset" step, and skip the state update when the counter at
	// 96(ctx) is zero.
	MOVQ R14, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_amd64_skip_update

	// Update Literal Length State
	// index = state bits 16-31 + the next nbBits (state bits 0-7) input bits.
	MOVBQZX DI, R14
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX // bits consumed += nbBits
	ROLQ CL, R15 // rotate the wanted bits down to the low end
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP // BP = (1 << nbBits) - 1
	ANDQ BP, R15
	ADDQ R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_amd64_skip_update:
	// Adjust offset
	// CX = decoded offset, AX = number of extra offset bits.
	// More than one extra bit: the value is used as-is and pushed onto the
	// front of the R11/R12/R13 offset list.
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
	// Otherwise the value selects one of the stored offsets; it is bumped by
	// one when the literal length of this record is zero.
	CMPQ (R10), $0x00000000
	JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_amd64_adjust_offset_nonzero

sequenceDecs_decode_amd64_adjust_offset_maybezero:
	// Selector 0 with a non-zero literal length: reuse R11, list unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
	MOVQ R11, CX
	JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offset_nonzero:
	// Dispatch on the selector: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_amd64_adjust_zero
	JEQ sequenceDecs_decode_amd64_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_amd64_adjust_three
	JMP sequenceDecs_decode_amd64_adjust_two

sequenceDecs_decode_amd64_adjust_zero:
	MOVQ R11, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_one:
	MOVQ R12, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_two:
	MOVQ R13, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_amd64_adjust_test_temp_valid:
	// A selected offset of zero is replaced by 1.
	TESTQ AX, AX
	JNZ sequenceDecs_decode_amd64_adjust_temp_valid
	MOVQ $0x00000001, AX

sequenceDecs_decode_amd64_adjust_temp_valid:
	// Move the selected offset to the front. R13 keeps its value only when
	// the selector was 1 (in that case only R11 and R12 swap).
	CMPQ CX, $0x01
	CMOVQNE R12, R13
	MOVQ R11, R12
	MOVQ AX, R11
	MOVQ AX, CX

sequenceDecs_decode_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	// AX = match length, R14 = literal length, CX = final offset.
	MOVQ 8(R10), AX
	MOVQ (R10), R14
	LEAQ (AX)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP) // accumulate ml + ll at 256(s)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15) // take ll from the counter at 128(ctx)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decode_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_amd64_match_len_ofs_ok:
	// Advance to the next 24-byte record and loop while the counter at
	// 96(ctx) stays non-negative.
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decode_amd64_main_loop

	// Write the offsets and the bit reader fields back.
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): no label precedes this return and it follows a RET, so
	// nothing in this function can reach it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET
|
|
|
|
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Decodes sequences in a loop. Each iteration writes one 24-byte record
// through the pointer loaded from 104(ctx): literal length at +0, match
// length at +8, offset at +16. The loop repeats until the counter at 96(ctx)
// goes negative after being decremented.
//
// This variant refills the bit container once per iteration only; offset,
// match length and literal length are all extracted from that single fill.
// NOTE(review): presumably the caller selects it when the three extra-bit
// counts always fit in one fill (the "56" in the name) - confirm at the call
// site.
//
// Return value (ret+24(FP)):
//   0  success (history offsets and bit reader fields are written back)
//   1  match length is non-zero while the offset is zero
//   2  match length is above 0x20002
//   4  the counter at 128(ctx) went negative after subtracting a literal length
// The error paths return without writing the registers back to s / br.
//
// Register use inside the loop:
//   DX        bit container (32(br))
//   BX        number of bits already consumed from DX (byte at 40(br))
//   SI        count of unread input bytes (24(br))
//   (SP)/R14  input cursor = (br) + SI, walks downwards as bytes are consumed
//   DI        literal length state
//   R8        match length state
//   R9        offset state
//   R10       pointer to the current output record
//   R11-R13   the three offsets kept at 144/152/160(s), most recent in R11
//   AX, CX, R14, R15, BP  scratch
//
// Layout of a 64-bit state word, as used by the code below:
//   bits  0-7   number of bits to read for the next state
//   bits  8-15  number of extra bits to read for the value
//   bits 16-31  base added to the bits read to form the next table index
//   bits 32-63  baseline of the value
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
	// Load the bit reader fields and compute the input cursor.
	MOVQ br+8(FP), AX
	MOVQ 32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ 24(AX), SI
	MOVQ (AX), AX
	ADDQ SI, AX
	MOVQ AX, (SP)

	// Load the three decoder states and the output record pointer.
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 104(AX), R10

	// Load the three stored offsets.
	MOVQ s+0(FP), AX
	MOVQ 144(AX), R11
	MOVQ 152(AX), R12
	MOVQ 160(AX), R13

sequenceDecs_decode_56_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 bytes left): step the cursor back by the number of
	// whole bytes consumed and reload all 64 bits.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_56_amd64_fill_end

sequenceDecs_decode_56_amd64_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while input remains (SI > 0)
	// and at least one whole byte has been consumed (BX > 7).
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_56_amd64_fill_end
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_56_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte

sequenceDecs_decode_56_amd64_fill_end:
	// Update offset
	// value = baseline (state bits 32-63) + the next n bits of input,
	// where n is the extra-bit count in state bits 8-15.
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15 // drop the bits that were already consumed
	MOVB AH, CL // CL = extra-bit count
	SHRQ $0x20, AX // AX = baseline
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_of_update_zero
	ADDQ CX, BX // consume the extra bits
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_of_update_zero // past the 64-bit container: baseline only
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_of_update_zero
	NEGQ CX // shift count is taken from CL by the CPU, i.e. a right shift by 64-n
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	// Same extraction as for the offset, using the match length state.
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Update literal length
	// Same extraction as above, using the literal length state (no refill
	// in between in this variant).
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	// Save the input cursor, keep the offset's extra-bit count in AX for the
	// "Adjust offset" step, and skip the state update when the counter at
	// 96(ctx) is zero.
	MOVQ R14, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_56_amd64_skip_update

	// Update Literal Length State
	// index = state bits 16-31 + the next nbBits (state bits 0-7) input bits.
	MOVBQZX DI, R14
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX // bits consumed += nbBits
	ROLQ CL, R15 // rotate the wanted bits down to the low end
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP // BP = (1 << nbBits) - 1
	ANDQ BP, R15
	ADDQ R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_56_amd64_skip_update:
	// Adjust offset
	// CX = decoded offset, AX = number of extra offset bits.
	// More than one extra bit: the value is used as-is and pushed onto the
	// front of the R11/R12/R13 offset list.
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
	// Otherwise the value selects one of the stored offsets; it is bumped by
	// one when the literal length of this record is zero.
	CMPQ (R10), $0x00000000
	JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero

sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
	// Selector 0 with a non-zero literal length: reuse R11, list unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
	MOVQ R11, CX
	JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
	// Dispatch on the selector: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_56_amd64_adjust_zero
	JEQ sequenceDecs_decode_56_amd64_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_56_amd64_adjust_three
	JMP sequenceDecs_decode_56_amd64_adjust_two

sequenceDecs_decode_56_amd64_adjust_zero:
	MOVQ R11, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_one:
	MOVQ R12, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_two:
	MOVQ R13, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
	// A selected offset of zero is replaced by 1.
	TESTQ AX, AX
	JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
	MOVQ $0x00000001, AX

sequenceDecs_decode_56_amd64_adjust_temp_valid:
	// Move the selected offset to the front. R13 keeps its value only when
	// the selector was 1 (in that case only R11 and R12 swap).
	CMPQ CX, $0x01
	CMOVQNE R12, R13
	MOVQ R11, R12
	MOVQ AX, R11
	MOVQ AX, CX

sequenceDecs_decode_56_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	// AX = match length, R14 = literal length, CX = final offset.
	MOVQ 8(R10), AX
	MOVQ (R10), R14
	LEAQ (AX)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP) // accumulate ml + ll at 256(s)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15) // take ll from the counter at 128(ctx)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decode_56_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_56_amd64_match_len_ofs_ok:
	// Advance to the next 24-byte record and loop while the counter at
	// 96(ctx) stays non-negative.
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decode_56_amd64_main_loop

	// Write the offsets and the bit reader fields back.
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): no label precedes this return and it follows a RET, so
	// nothing in this function can reach it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET
|
|
|
|
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI/BMI2 version of the sequence decode loop. Each iteration writes one
// 24-byte record through the pointer loaded from 104(ctx): literal length at
// +0, match length at +8, offset at +16. The loop repeats until the counter
// at 96(ctx) goes negative after being decremented.
//
// Return value (ret+24(FP)):
//   0  success (history offsets and bit reader fields are written back)
//   1  match length is non-zero while the offset is zero
//   2  match length is above 0x20002
//   4  the counter at 128(ctx) went negative after subtracting a literal length
// The error paths return without writing the registers back to s / br.
//
// Register use inside the loop:
//   AX        bit container (32(br))
//   DX        number of bits already consumed from AX (byte at 40(br))
//   BX        count of unread input bytes (24(br))
//   (SP)/R13  input cursor = (br) + BX; R13 is reused as scratch after the
//             cursor has been saved back to (SP)
//   SI        literal length state
//   DI        match length state
//   R8        offset state
//   R9        pointer to the current output record
//   R10-R12   the three offsets kept at 144/152/160(s), most recent in R10
//   CX, R14, R15, BP  scratch
//
// Layout of a 64-bit state word, as used by the code below:
//   bits  0-7   number of bits to read for the next state
//   bits  8-15  number of extra bits to read for the value (BEXTR 0x0808)
//   bits 16-31  base added to the bits read to form the next table index
//               (BEXTR 0x1010)
//   bits 32-63  baseline of the value
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
	// Load the bit reader fields and compute the input cursor.
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX
	ADDQ BX, CX
	MOVQ CX, (SP)

	// Load the three decoder states and the output record pointer.
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 104(CX), R9

	// Load the three stored offsets.
	MOVQ s+0(FP), CX
	MOVQ 144(CX), R10
	MOVQ 152(CX), R11
	MOVQ 160(CX), R12

sequenceDecs_decode_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 bytes left): step the cursor back by the number of
	// whole bytes consumed and reload all 64 bits.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_bmi2_fill_end

sequenceDecs_decode_bmi2_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while input remains (BX > 0)
	// and at least one whole byte has been consumed (DX > 7).
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_bmi2_fill_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_bmi2_fill_byte_by_byte

sequenceDecs_decode_bmi2_fill_end:
	// Update offset
	// value = baseline (state bits 32-63) + the next n bits of input,
	// where n is the extra-bit count in state bits 8-15.
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R14 // R14 = extra-bit count
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15 // rotate the wanted bits down to the low end
	BZHIQ R14, R15, R15 // keep only the low n bits
	MOVQ CX, DX // bits consumed += n
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 16(R9)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 8(R9)

	// Fill bitreader to have enough for the remaining
	// Same refill logic as at the top of the loop.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_bmi2_fill_2_end

sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_bmi2_fill_2_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte

sequenceDecs_decode_bmi2_fill_2_end:
	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, (R9)

	// Fill bitreader for state updates
	// Save the input cursor, keep the offset's extra-bit count in R13 for the
	// "Adjust offset" step, and skip the state update when the counter at
	// 96(ctx) is zero.
	MOVQ R13, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_bmi2_skip_update

	// Read the state bits for all three states in one go: R14 is the low byte
	// of the sum of the three state words, i.e. the combined bit count, and
	// R15 receives that many input bits.
	LEAQ (SI)(DI*1), R14
	ADDQ R8, R14
	MOVBQZX R14, R14
	LEAQ (DX)(R14*1), CX
	MOVQ AX, R15
	MOVQ CX, DX
	ROLQ CL, R15
	BZHIQ R14, R15, R15

	// Update Offset State
	// Take the offset's bits from the low end of R15, then shift them out.
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	MOVQ $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_bmi2_skip_update:
	// Adjust offset
	// CX = decoded offset, R13 = number of extra offset bits.
	// More than one extra bit: the value is used as-is and pushed onto the
	// front of the R10/R11/R12 offset list.
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
	// Otherwise the value selects one of the stored offsets; it is bumped by
	// one when the literal length of this record is zero.
	CMPQ (R9), $0x00000000
	JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero

sequenceDecs_decode_bmi2_adjust_offset_maybezero:
	// Selector 0 with a non-zero literal length: reuse R10, list unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
	MOVQ R10, CX
	JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offset_nonzero:
	// Dispatch on the selector: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_bmi2_adjust_zero
	JEQ sequenceDecs_decode_bmi2_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_bmi2_adjust_three
	JMP sequenceDecs_decode_bmi2_adjust_two

sequenceDecs_decode_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_one:
	MOVQ R11, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_two:
	MOVQ R12, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_bmi2_adjust_test_temp_valid:
	// A selected offset of zero is replaced by 1.
	TESTQ R13, R13
	JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R13

sequenceDecs_decode_bmi2_adjust_temp_valid:
	// Move the selected offset to the front. R12 keeps its value only when
	// the selector was 1 (in that case only R10 and R11 swap).
	CMPQ CX, $0x01
	CMOVQNE R11, R12
	MOVQ R10, R11
	MOVQ R13, R10
	MOVQ R13, CX

sequenceDecs_decode_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	// R13 = match length, R14 = literal length, CX = final offset.
	MOVQ 8(R9), R13
	MOVQ (R9), R14
	LEAQ (R13)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP) // accumulate ml + ll at 256(s)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15) // take ll from the counter at 128(ctx)
	JS error_not_enough_literals
	CMPQ R13, $0x00020002
	JA sequenceDecs_decode_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_bmi2_match_len_ofs_ok:
	// Advance to the next 24-byte record and loop while the counter at
	// 96(ctx) stays non-negative.
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decode_bmi2_main_loop

	// Write the offsets and the bit reader fields back.
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): no label precedes this return and it follows a RET, so
	// nothing in this function can reach it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET
|
|
|
|
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI/BMI2 version of the sequence decode loop. Each iteration writes one
// 24-byte record through the pointer loaded from 104(ctx): literal length at
// +0, match length at +8, offset at +16. The loop repeats until the counter
// at 96(ctx) goes negative after being decremented.
//
// This variant refills the bit container once per iteration only; offset,
// match length and literal length are all extracted from that single fill.
// NOTE(review): presumably the caller selects it when the three extra-bit
// counts always fit in one fill (the "56" in the name) - confirm at the call
// site.
//
// Return value (ret+24(FP)):
//   0  success (history offsets and bit reader fields are written back)
//   1  match length is non-zero while the offset is zero
//   2  match length is above 0x20002
//   4  the counter at 128(ctx) went negative after subtracting a literal length
// The error paths return without writing the registers back to s / br.
//
// Register use inside the loop:
//   AX        bit container (32(br))
//   DX        number of bits already consumed from AX (byte at 40(br))
//   BX        count of unread input bytes (24(br))
//   (SP)/R13  input cursor = (br) + BX; R13 is reused as scratch after the
//             cursor has been saved back to (SP)
//   SI        literal length state
//   DI        match length state
//   R8        offset state
//   R9        pointer to the current output record
//   R10-R12   the three offsets kept at 144/152/160(s), most recent in R10
//   CX, R14, R15, BP  scratch
//
// Layout of a 64-bit state word, as used by the code below:
//   bits  0-7   number of bits to read for the next state
//   bits  8-15  number of extra bits to read for the value (BEXTR 0x0808)
//   bits 16-31  base added to the bits read to form the next table index
//               (BEXTR 0x1010)
//   bits 32-63  baseline of the value
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
	// Load the bit reader fields and compute the input cursor.
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX
	ADDQ BX, CX
	MOVQ CX, (SP)

	// Load the three decoder states and the output record pointer.
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 104(CX), R9

	// Load the three stored offsets.
	MOVQ s+0(FP), CX
	MOVQ 144(CX), R10
	MOVQ 152(CX), R11
	MOVQ 160(CX), R12

sequenceDecs_decode_56_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 bytes left): step the cursor back by the number of
	// whole bytes consumed and reload all 64 bits.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_56_bmi2_fill_end

sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while input remains (BX > 0)
	// and at least one whole byte has been consumed (DX > 7).
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_56_bmi2_fill_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_56_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte

sequenceDecs_decode_56_bmi2_fill_end:
	// Update offset
	// value = baseline (state bits 32-63) + the next n bits of input,
	// where n is the extra-bit count in state bits 8-15.
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R14 // R14 = extra-bit count
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15 // rotate the wanted bits down to the low end
	BZHIQ R14, R15, R15 // keep only the low n bits
	MOVQ CX, DX // bits consumed += n
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 16(R9)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 8(R9)

	// Update literal length
	// No refill in between in this variant.
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, (R9)

	// Fill bitreader for state updates
	// Save the input cursor, keep the offset's extra-bit count in R13 for the
	// "Adjust offset" step, and skip the state update when the counter at
	// 96(ctx) is zero.
	MOVQ R13, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_56_bmi2_skip_update

	// Read the state bits for all three states in one go: R14 is the low byte
	// of the sum of the three state words, i.e. the combined bit count, and
	// R15 receives that many input bits.
	LEAQ (SI)(DI*1), R14
	ADDQ R8, R14
	MOVBQZX R14, R14
	LEAQ (DX)(R14*1), CX
	MOVQ AX, R15
	MOVQ CX, DX
	ROLQ CL, R15
	BZHIQ R14, R15, R15

	// Update Offset State
	// Take the offset's bits from the low end of R15, then shift them out.
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	MOVQ $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_56_bmi2_skip_update:
	// Adjust offset
	// CX = decoded offset, R13 = number of extra offset bits.
	// More than one extra bit: the value is used as-is and pushed onto the
	// front of the R10/R11/R12 offset list.
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
	// Otherwise the value selects one of the stored offsets; it is bumped by
	// one when the literal length of this record is zero.
	CMPQ (R9), $0x00000000
	JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero

sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
	// Selector 0 with a non-zero literal length: reuse R10, list unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
	MOVQ R10, CX
	JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
	// Dispatch on the selector: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_56_bmi2_adjust_zero
	JEQ sequenceDecs_decode_56_bmi2_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_56_bmi2_adjust_three
	JMP sequenceDecs_decode_56_bmi2_adjust_two

sequenceDecs_decode_56_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_one:
	MOVQ R11, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_two:
	MOVQ R12, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
	// A selected offset of zero is replaced by 1.
	TESTQ R13, R13
	JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R13

sequenceDecs_decode_56_bmi2_adjust_temp_valid:
	// Move the selected offset to the front. R12 keeps its value only when
	// the selector was 1 (in that case only R10 and R11 swap).
	CMPQ CX, $0x01
	CMOVQNE R11, R12
	MOVQ R10, R11
	MOVQ R13, R10
	MOVQ R13, CX

sequenceDecs_decode_56_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	// R13 = match length, R14 = literal length, CX = final offset.
	MOVQ 8(R9), R13
	MOVQ (R9), R14
	LEAQ (R13)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP) // accumulate ml + ll at 256(s)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15) // take ll from the counter at 128(ctx)
	JS error_not_enough_literals
	CMPQ R13, $0x00020002
	JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
	// Advance to the next 24-byte record and loop while the counter at
	// 96(ctx) stays non-negative.
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decode_56_bmi2_main_loop

	// Write the offsets and the bit reader fields back.
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): no label precedes this return and it follows a RET, so
	// nothing in this function can reach it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET
|
|
|
|
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes decoded sequences: for each 24-byte record (literal length at +0,
// match length at +8, match offset at +16) it copies the literals to the
// output and then copies the match, taken from the history buffer, from the
// output written so far, or from both.
//
// Returns true when every sequence was executed, or when the sequence count
// at 8(ctx) is zero (in which case ctx is left untouched). Returns false when
// a match offset is larger than outPosition+len(hist) or larger than the
// window size; ctx is still updated so the failing sequence can be identified.
//
// ctx fields read:    0 seqs base, 8 seq count, 24 seqIndex, 32 out base,
//                     56 hist base, 64 hist length, 80 literals pointer,
//                     104 outPosition, 120 window size
// ctx fields written: 24 seqIndex, 104 outPosition,
//                     112 = literal bytes consumed (SI - 80(ctx))
//
// Register use:
//   AX   pointer to the current sequence record
//   CX   sequence count
//   DX   seqIndex
//   BX   output write pointer (out base + outPosition)
//   SI   literals read pointer
//   DI   outPosition
//   R8   window size
//   R9   end of the history buffer (hist base + hist length)
//   R10  hist length
//   R11  literal length, then scratch
//   R12  match offset
//   R13  match length
//   R14, R15, BP, X0  scratch
//
// The literal and non-overlapping match copies move 16 bytes per step, so
// they can read and write up to 15 bytes beyond the requested length.
// NOTE(review): presumably the caller guarantees that slack in the buffers -
// confirm at the call site.
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
	MOVQ ctx+0(FP), R10
	MOVQ 8(R10), CX
	TESTQ CX, CX
	JZ empty_seqs
	MOVQ (R10), AX
	MOVQ 24(R10), DX
	MOVQ 32(R10), BX
	MOVQ 80(R10), SI
	MOVQ 104(R10), DI
	MOVQ 120(R10), R8
	MOVQ 56(R10), R9
	MOVQ 64(R10), R10
	ADDQ R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	// R11 = literal length, R12 = match offset, R13 = match length.
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ check_offset
	XORQ R14, R14

copy_1:
	// 16 bytes per step until at least R11 bytes have been copied.
	MOVUPS (SI)(R14*1), X0
	MOVUPS X0, (BX)(R14*1)
	ADDQ $0x10, R14
	CMPQ R14, R11
	JB copy_1
	ADDQ R11, SI
	ADDQ R11, BX
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG error_match_off_too_big
	CMPQ R12, R8
	JG error_match_off_too_big

	// Copy match from history
	// R11 = offset - outPosition = how far the match reaches into history.
	// Zero or a borrow means the match lies entirely in the output.
	MOVQ R12, R11
	SUBQ DI, R11
	JLS copy_match
	MOVQ R9, R14
	SUBQ R11, R14 // R14 = source pointer inside the history buffer
	CMPQ R13, R11
	JG copy_all_from_history // match continues into the current output

	// The whole match (R13 bytes) comes from history.
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB copy_4_small

copy_4_loop:
	// 16 bytes per step, then one final 16-byte copy that ends exactly at
	// the end of the match and may overlap the previous step.
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R11
	JAE copy_4_loop
	LEAQ 16(R14)(R11*1), R14
	LEAQ 16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_4_end

copy_4_small:
	// Fewer than 16 bytes: exact-size copies using two possibly overlapping
	// moves. There is no branch for lengths below 3 here.
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	// Sequence done: account for the match and move to the next record.
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop
	JMP loop_finished

copy_all_from_history:
	// Copy the R11 bytes that remain in history; the rest of the match is
	// taken from the output at copy_match.
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ R11, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	// R13 = bytes of the match still to copy from the output.
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	// R11 = source = write pointer - match offset.
	MOVQ BX, R11
	SUBQ R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, DI
	MOVQ BX, R12
	ADDQ R13, BX

copy_2:
	// 16 bytes per step; always runs at least once.
	MOVUPS (R11), X0
	MOVUPS X0, (R12)
	ADDQ $0x10, R11
	ADDQ $0x10, R12
	SUBQ $0x10, R13
	JHI copy_2
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, DI

copy_slow_3:
	// Source and destination overlap, so copy byte by byte in order.
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET
|
|
|
|
// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
|
|
// Requires: SSE
|
|
//
// Walks 24-byte sequence records and, for each one, copies a literal run and
// then a match into the output buffer. Returns true (1) when all records were
// processed or there were none, and false (0) when a match offset fails the
// checks at check_offset.
//
// Register roles, as established by the loads below:
//   AX  = pointer to the current sequence record (advanced by 0x18 per record)
//   CX  = value loaded from 8(ctx), used as the record count / loop bound
//   DX  = current record index (loaded from 24(ctx), stored back on exit)
//   BX  = output write pointer (32(ctx) + 104(ctx))
//   SI  = literal source pointer (loaded from 80(ctx))
//   DI  = output position (loaded from 104(ctx), stored back on exit)
//   R8  = upper bound for the match offset (120(ctx))
//   R9  = 56(ctx) + 64(ctx), presumably the end of the history buffer - confirm
//   R10 = 64(ctx), presumably len(hist) - confirm
// Per record: R11 = (AX), R13 = 8(AX), R12 = 16(AX); they are used as literal
// length, match length and match offset respectively.
//
// Every copy in this variant is exact-length: lengths >= 16 use 16-byte chunks
// plus one overlapping 16-byte store that ends exactly at the last byte, and
// shorter lengths use size-specific moves.
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
|
|
MOVQ ctx+0(FP), R10
|
|
MOVQ 8(R10), CX
|
|
TESTQ CX, CX
|
|
JZ empty_seqs
|
|
MOVQ (R10), AX
|
|
MOVQ 24(R10), DX
|
|
MOVQ 32(R10), BX
|
|
MOVQ 80(R10), SI
|
|
MOVQ 104(R10), DI
|
|
MOVQ 120(R10), R8
|
|
MOVQ 56(R10), R9
|
|
// R10 is overwritten here; ctx is reloaded from the frame on exit.
MOVQ 64(R10), R10
|
|
ADDQ R10, R9
|
|
|
|
// seqsBase += 24 * seqIndex
|
|
LEAQ (DX)(DX*2), R11
|
|
SHLQ $0x03, R11
|
|
ADDQ R11, AX
|
|
|
|
// outBase += outPosition
|
|
ADDQ DI, BX
|
|
|
|
main_loop:
|
|
MOVQ (AX), R11
|
|
MOVQ 16(AX), R12
|
|
MOVQ 8(AX), R13
|
|
|
|
// Copy literals
|
|
TESTQ R11, R11
|
|
JZ check_offset
|
|
MOVQ R11, R14
|
|
// R14 = length - 16; a borrow means fewer than 16 bytes to copy.
SUBQ $0x10, R14
|
|
JB copy_1_small
|
|
|
|
copy_1_loop:
|
|
MOVUPS (SI), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, SI
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R14
|
|
JAE copy_1_loop
|
|
// R14 is now in [-16, -1]; 16+R14 moves both pointers to the exact end of
// the run, and the final 16-byte copy covers the (possibly overlapping) tail.
LEAQ 16(SI)(R14*1), SI
|
|
LEAQ 16(BX)(R14*1), BX
|
|
MOVUPS -16(SI), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_1_end
|
|
|
|
copy_1_small:
|
|
CMPQ R11, $0x03
|
|
JE copy_1_move_3
|
|
JB copy_1_move_1or2
|
|
CMPQ R11, $0x08
|
|
JB copy_1_move_4through7
|
|
JMP copy_1_move_8through16
|
|
|
|
// First and last byte; for a length of 1 both accesses hit the same byte.
copy_1_move_1or2:
|
|
MOVB (SI), R14
|
|
MOVB -1(SI)(R11*1), R15
|
|
MOVB R14, (BX)
|
|
MOVB R15, -1(BX)(R11*1)
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_3:
|
|
MOVW (SI), R14
|
|
MOVB 2(SI), R15
|
|
MOVW R14, (BX)
|
|
MOVB R15, 2(BX)
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
JMP copy_1_end
|
|
|
|
// First 4 and last 4 bytes; the two moves overlap for lengths below 8.
copy_1_move_4through7:
|
|
MOVL (SI), R14
|
|
MOVL -4(SI)(R11*1), R15
|
|
MOVL R14, (BX)
|
|
MOVL R15, -4(BX)(R11*1)
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
JMP copy_1_end
|
|
|
|
// First 8 and last 8 bytes; the two moves overlap for lengths below 16.
copy_1_move_8through16:
|
|
MOVQ (SI), R14
|
|
MOVQ -8(SI)(R11*1), R15
|
|
MOVQ R14, (BX)
|
|
MOVQ R15, -8(BX)(R11*1)
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
|
|
copy_1_end:
|
|
ADDQ R11, DI
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
|
|
check_offset:
|
|
// R11 = DI + R10 (output position plus the value loaded from 64(ctx)).
LEAQ (DI)(R10*1), R11
|
|
CMPQ R12, R11
|
|
JG error_match_off_too_big
|
|
CMPQ R12, R8
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ R12, R11
|
|
// R11 = mo - output position: the number of match bytes that lie before the
// start of the output. Zero or a borrow means the match is entirely in the
// output buffer.
SUBQ DI, R11
|
|
JLS copy_match
|
|
// R14 = R9 - R11: source address, R11 bytes before R9.
MOVQ R9, R14
|
|
SUBQ R11, R14
|
|
CMPQ R13, R11
|
|
JG copy_all_from_history
|
|
// The whole match (R13 bytes) comes from the R14 source.
MOVQ R13, R11
|
|
SUBQ $0x10, R11
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R11
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(R11*1), R14
|
|
LEAQ 16(BX)(R11*1), BX
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_4_end
|
|
|
|
// NOTE(review): unlike copy_1/copy_2/copy_5 there is no 1-or-2 byte case here;
// a length below 3 would fall into copy_4_move_4through7. Presumably match
// lengths reaching this point are always >= 3 - confirm against the caller.
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), R11
|
|
MOVB 2(R14), R12
|
|
MOVW R11, (BX)
|
|
MOVB R12, 2(BX)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), R11
|
|
MOVL -4(R14)(R13*1), R12
|
|
MOVL R11, (BX)
|
|
MOVL R12, -4(BX)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), R11
|
|
MOVQ -8(R14)(R13*1), R12
|
|
MOVQ R11, (BX)
|
|
MOVQ R12, -8(BX)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
|
|
// Match fully copied: advance to the next record, or finish.
copy_4_end:
|
|
ADDQ R13, DI
|
|
ADDQ $0x18, AX
|
|
INCQ DX
|
|
CMPQ DX, CX
|
|
JB main_loop
|
|
JMP loop_finished
|
|
|
|
// The match is longer than the R11 bytes available from the R14 source: copy
// those R11 bytes first, then continue at copy_match with the remainder.
copy_all_from_history:
|
|
MOVQ R11, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(BX)(R15*1), BX
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ R11, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ R11, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
// BP is used as a scratch register in the copy_5 small moves.
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(R11*1), BP
|
|
MOVB R15, (BX)
|
|
MOVB BP, -1(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (BX)
|
|
MOVB BP, 2(BX)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(R11*1), BP
|
|
MOVL R15, (BX)
|
|
MOVL BP, -4(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(R11*1), BP
|
|
MOVQ R15, (BX)
|
|
MOVQ BP, -8(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
|
|
// Account for the bytes just copied and reduce the remaining match length.
copy_5_end:
|
|
ADDQ R11, DI
|
|
SUBQ R11, R13
|
|
|
|
// Copy match from the current buffer
|
|
// NOTE(review): no check for R13 == 0 is visible on the paths into copy_match;
// a zero length would reach copy_2_move_1or2, which accesses offsets 0 and -1
// of both pointers. Presumably callers never produce that case - confirm.
copy_match:
|
|
// R11 = source address: R12 bytes behind the write pointer.
MOVQ BX, R11
|
|
SUBQ R12, R11
|
|
|
|
// ml <= mo
|
|
CMPQ R13, R12
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, DI
|
|
MOVQ R13, R12
|
|
SUBQ $0x10, R12
|
|
JB copy_2_small
|
|
|
|
copy_2_loop:
|
|
MOVUPS (R11), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, R11
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R12
|
|
JAE copy_2_loop
|
|
LEAQ 16(R11)(R12*1), R11
|
|
LEAQ 16(BX)(R12*1), BX
|
|
MOVUPS -16(R11), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_2_end
|
|
|
|
copy_2_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_2_move_3
|
|
JB copy_2_move_1or2
|
|
CMPQ R13, $0x08
|
|
JB copy_2_move_4through7
|
|
JMP copy_2_move_8through16
|
|
|
|
copy_2_move_1or2:
|
|
MOVB (R11), R12
|
|
MOVB -1(R11)(R13*1), R14
|
|
MOVB R12, (BX)
|
|
MOVB R14, -1(BX)(R13*1)
|
|
ADDQ R13, R11
|
|
ADDQ R13, BX
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_3:
|
|
MOVW (R11), R12
|
|
MOVB 2(R11), R14
|
|
MOVW R12, (BX)
|
|
MOVB R14, 2(BX)
|
|
ADDQ R13, R11
|
|
ADDQ R13, BX
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_4through7:
|
|
MOVL (R11), R12
|
|
MOVL -4(R11)(R13*1), R14
|
|
MOVL R12, (BX)
|
|
MOVL R14, -4(BX)(R13*1)
|
|
ADDQ R13, R11
|
|
ADDQ R13, BX
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_8through16:
|
|
MOVQ (R11), R12
|
|
MOVQ -8(R11)(R13*1), R14
|
|
MOVQ R12, (BX)
|
|
MOVQ R14, -8(BX)(R13*1)
|
|
ADDQ R13, R11
|
|
ADDQ R13, BX
|
|
|
|
copy_2_end:
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
|
|
copy_overlapping_match:
|
|
ADDQ R13, DI
|
|
|
|
// Byte-at-a-time so bytes written earlier in this loop can be read again as
// source when the ranges overlap.
copy_slow_3:
|
|
MOVB (R11), R12
|
|
MOVB R12, (BX)
|
|
INCQ R11
|
|
INCQ BX
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
// Advance to the next 24-byte record while DX < CX.
handle_loop:
|
|
ADDQ $0x18, AX
|
|
INCQ DX
|
|
CMPQ DX, CX
|
|
JB main_loop
|
|
|
|
loop_finished:
|
|
// Return value
|
|
MOVB $0x01, ret+8(FP)
|
|
|
|
// Update the context
|
|
// Stores the record index to 24(ctx), the output position to 104(ctx), and
// SI minus the pointer at 80(ctx) (bytes of literals consumed) to 112(ctx).
MOVQ ctx+0(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVQ DI, 104(AX)
|
|
SUBQ 80(AX), SI
|
|
MOVQ SI, 112(AX)
|
|
RET
|
|
|
|
error_match_off_too_big:
|
|
// Return value
|
|
MOVB $0x00, ret+8(FP)
|
|
|
|
// Update the context
|
|
// Same write-back as the success path, so the caller can see where
// processing stopped.
MOVQ ctx+0(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVQ DI, 104(AX)
|
|
SUBQ 80(AX), SI
|
|
MOVQ SI, 112(AX)
|
|
RET
|
|
|
|
// Nothing to do: return true without touching the context.
empty_seqs:
|
|
// Return value
|
|
MOVB $0x01, ret+8(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
|
|
// Requires: CMOV, SSE
|
|
//
// Decodes one sequence at a time from the bit reader and immediately executes
// it (literal copy followed by match copy) into the output buffer.
//
// Return codes, as stored to ret+24(FP) below:
//   0 success, 1 match length / offset mismatch, 2 match length too big,
//   3 match offset too big, 4 not enough literals, 5 not enough output space.
//
// Register roles, as established by the loads below:
//   DX  = bit reader 64-bit value (32(br))     BX = bits consumed (byte 40(br))
//   SI  = bit reader offset (24(br))           (SP) = input read pointer
//   DI, R8, R9 = state words loaded from 72/80/88(ctx); used as the literal
//                length, match length and offset states respectively
//   R10 = output write pointer                 R11 = literal source pointer
//   R12 = output position
// Stack slots: 8(SP) = mo, 16(SP) = ml, 24(SP) = ll, 32(SP) = past-end output
// pointer, 40(SP) = value from 184(ctx), 48(SP) = 176(ctx) + 184(ctx),
// 56(SP) = upper bound for the match offset (200(ctx)).
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
|
|
MOVQ br+8(FP), AX
|
|
MOVQ 32(AX), DX
|
|
MOVBQZX 40(AX), BX
|
|
MOVQ 24(AX), SI
|
|
MOVQ (AX), AX
|
|
ADDQ SI, AX
|
|
MOVQ AX, (SP)
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 72(AX), DI
|
|
MOVQ 80(AX), R8
|
|
MOVQ 88(AX), R9
|
|
// Zero the mo / ml / ll stack slots.
XORQ CX, CX
|
|
MOVQ CX, 8(SP)
|
|
MOVQ CX, 16(SP)
|
|
MOVQ CX, 24(SP)
|
|
MOVQ 112(AX), R10
|
|
MOVQ 128(AX), CX
|
|
MOVQ CX, 32(SP)
|
|
MOVQ 144(AX), R11
|
|
MOVQ 136(AX), R12
|
|
MOVQ 200(AX), CX
|
|
MOVQ CX, 56(SP)
|
|
MOVQ 176(AX), CX
|
|
MOVQ CX, 48(SP)
|
|
MOVQ 184(AX), AX
|
|
MOVQ AX, 40(SP)
|
|
// 48(SP) += 40(SP): presumably turns the history base into a pointer to the
// end of the history - confirm.
MOVQ 40(SP), AX
|
|
ADDQ AX, 48(SP)
|
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
|
|
ADDQ R10, 32(SP)
|
|
|
|
// outBase += outPosition
|
|
ADDQ R12, R10
|
|
|
|
sequenceDecs_decodeSync_amd64_main_loop:
|
|
MOVQ (SP), R13
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
|
|
// Fast path: step the read pointer back by BX/8 whole consumed bytes, reload
// all 64 bits, and keep only the sub-byte part of the consumed-bit count.
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R13
|
|
MOVQ (R13), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decodeSync_amd64_fill_end
|
|
|
|
// Slow path: shift in one byte at a time while input remains (SI > 0) and at
// least 8 bits have been consumed (BX > 7).
sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decodeSync_amd64_fill_end
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decodeSync_amd64_fill_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R13), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_end:
|
|
// Update offset
|
|
// State word layout as used here: byte 1 = number of extra bits to read,
// upper 32 bits = base value. R14 = DX << BX puts the unread bits at the top;
// shifting right by (64 - n) via NEGQ/SHRQ extracts the next n bits, which
// are added to the base. The extraction is skipped when n == 0, when
// BX + n > 64, or when n >= 64.
MOVQ R9, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_amd64_of_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_amd64_of_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_amd64_of_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_amd64_of_update_zero:
|
|
MOVQ AX, 8(SP)
|
|
|
|
// Update match length
|
|
// Same extraction as for the offset, driven by the state word in R8.
MOVQ R8, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_amd64_ml_update_zero:
|
|
MOVQ AX, 16(SP)
|
|
|
|
// Fill bitreader to have enough for the remaining
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R13
|
|
MOVQ (R13), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decodeSync_amd64_fill_2_end
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decodeSync_amd64_fill_2_end
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decodeSync_amd64_fill_2_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R13), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_2_end:
|
|
// Update literal length
|
|
// Same extraction again, driven by the state word in DI.
MOVQ DI, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_amd64_ll_update_zero:
|
|
MOVQ AX, 24(SP)
|
|
|
|
// Fill bitreader for state updates
|
|
// Save the read pointer, and keep byte 1 of the offset state (its extra-bit
// count) in AX for the offset adjustment further down, since R9 is replaced
// by the state update. The update is skipped when the counter at 96(ctx) is 0.
MOVQ R13, (SP)
|
|
MOVQ R9, AX
|
|
SHRQ $0x08, AX
|
|
MOVBQZX AL, AX
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decodeSync_amd64_skip_update
|
|
|
|
// Update Literal Length State
|
|
// Byte 0 of the state word = number of bits to read (R13); bits 16..31 = base
// of the next state index. ROLQ by (BX + n) rotates the next n unread bits
// into the low end of R14, where they are masked with (1 << n) - 1 and added
// to the base. The result indexes a table of 8-byte entries.
MOVBQZX DI, R13
|
|
SHRQ $0x10, DI
|
|
MOVWQZX DI, DI
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, DI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Match Length State
|
|
MOVBQZX R8, R13
|
|
SHRQ $0x10, R8
|
|
MOVWQZX R8, R8
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, R8
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Offset State
|
|
MOVBQZX R9, R13
|
|
SHRQ $0x10, R9
|
|
MOVWQZX R9, R9
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, R9
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R9*8), R9
|
|
|
|
sequenceDecs_decodeSync_amd64_skip_update:
|
|
// Adjust offset
|
|
// 144/152/160(s) hold three previous offsets, most recent first. When the
// offset used more than one extra bit (AX > 1) it is a new offset: the 16-byte
// load/store pair shifts the first two entries down by one slot and the new
// offset is stored at 144(s).
MOVQ s+0(FP), CX
|
|
MOVQ 8(SP), R13
|
|
CMPQ AX, $0x01
|
|
JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
|
|
MOVUPS 144(CX), X0
|
|
MOVQ R13, 144(CX)
|
|
MOVUPS X0, 152(CX)
|
|
JMP sequenceDecs_decodeSync_amd64_after_adjust
|
|
|
|
// Repeat-offset handling: with a zero literal length the code is bumped by 1.
sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
|
|
CMPQ 24(SP), $0x00000000
|
|
JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
|
|
INCQ R13
|
|
JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
|
|
|
|
// Code 0 reuses the most recent offset and leaves the history untouched.
sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
|
|
MOVQ 144(CX), R13
|
|
JMP sequenceDecs_decodeSync_amd64_after_adjust
|
|
|
|
// R14 = previous offset selected by the code; code 3 selects entry 0 minus 1.
// A result of zero is replaced by 1.
sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
|
|
MOVQ R13, AX
|
|
XORQ R14, R14
|
|
MOVQ $-1, R15
|
|
CMPQ R13, $0x03
|
|
CMOVQEQ R14, AX
|
|
CMOVQEQ R15, R14
|
|
ADDQ 144(CX)(AX*8), R14
|
|
JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
|
|
MOVQ $0x00000001, R14
|
|
|
|
// Rotate the history: entry 1 moves to entry 2 unless the code is 1, then
// entry 0 moves to entry 1 and the selected value becomes entry 0.
sequenceDecs_decodeSync_amd64_adjust_temp_valid:
|
|
CMPQ R13, $0x01
|
|
JZ sequenceDecs_decodeSync_amd64_adjust_skip
|
|
MOVQ 152(CX), AX
|
|
MOVQ AX, 160(CX)
|
|
|
|
sequenceDecs_decodeSync_amd64_adjust_skip:
|
|
MOVQ 144(CX), AX
|
|
MOVQ AX, 152(CX)
|
|
MOVQ R14, 144(CX)
|
|
MOVQ R14, R13
|
|
|
|
sequenceDecs_decodeSync_amd64_after_adjust:
|
|
MOVQ R13, 8(SP)
|
|
|
|
// Check values
|
|
// Adds ml + ll to the counter at 256(s) and subtracts ll from the counter at
// 104(ctx); a negative result there means the sequence asks for more literals
// than remain.
MOVQ 16(SP), AX
|
|
MOVQ 24(SP), CX
|
|
LEAQ (AX)(CX*1), R14
|
|
MOVQ s+0(FP), R15
|
|
ADDQ R14, 256(R15)
|
|
MOVQ ctx+16(FP), R14
|
|
SUBQ CX, 104(R14)
|
|
JS error_not_enough_literals
|
|
CMPQ AX, $0x00020002
|
|
JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
|
|
// A zero offset is only accepted together with a zero match length.
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
|
|
TESTQ AX, AX
|
|
JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
|
|
|
|
// From here on: AX = ll, CX = mo, R13 = ml.
sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
|
|
MOVQ 24(SP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ 16(SP), R13
|
|
|
|
// Check if we have enough space in s.out
|
|
LEAQ (AX)(R13*1), R14
|
|
ADDQ R10, R14
|
|
CMPQ R14, 32(SP)
|
|
JA error_not_enough_space
|
|
|
|
// Copy literals
|
|
TESTQ AX, AX
|
|
JZ check_offset
|
|
XORQ R14, R14
|
|
|
|
// Copies in whole 16-byte chunks, so up to 15 bytes beyond ll may be read and
// written; the pointers below advance by exactly ll. Presumably the buffers
// carry enough slack for this - confirm against the caller.
copy_1:
|
|
MOVUPS (R11)(R14*1), X0
|
|
MOVUPS X0, (R10)(R14*1)
|
|
ADDQ $0x10, R14
|
|
CMPQ R14, AX
|
|
JB copy_1
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
ADDQ AX, R12
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
|
|
check_offset:
|
|
MOVQ R12, AX
|
|
ADDQ 40(SP), AX
|
|
CMPQ CX, AX
|
|
JG error_match_off_too_big
|
|
CMPQ CX, 56(SP)
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
// AX = mo - output position: number of match bytes located before the start
// of the output. Zero or a borrow means the match is entirely in the output.
MOVQ CX, AX
|
|
SUBQ R12, AX
|
|
JLS copy_match
|
|
MOVQ 48(SP), R14
|
|
SUBQ AX, R14
|
|
CMPQ R13, AX
|
|
JG copy_all_from_history
|
|
// Whole match comes from the R14 source; exact-length copy of R13 bytes.
MOVQ R13, AX
|
|
SUBQ $0x10, AX
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, AX
|
|
JAE copy_4_loop
|
|
// AX is in [-16, -1] here; 16+AX lands both pointers on the exact end, and
// the last 16 bytes are copied with an overlapping move.
LEAQ 16(R14)(AX*1), R14
|
|
LEAQ 16(R10)(AX*1), R10
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_4_end
|
|
|
|
// NOTE(review): no 1-or-2 byte case here; lengths below 3 would fall into
// copy_4_move_4through7. Presumably ml is always >= 3 at this point - confirm.
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), AX
|
|
MOVB 2(R14), CL
|
|
MOVW AX, (R10)
|
|
MOVB CL, 2(R10)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), AX
|
|
MOVL -4(R14)(R13*1), CX
|
|
MOVL AX, (R10)
|
|
MOVL CX, -4(R10)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), AX
|
|
MOVQ -8(R14)(R13*1), CX
|
|
MOVQ AX, (R10)
|
|
MOVQ CX, -8(R10)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
|
|
copy_4_end:
|
|
ADDQ R13, R12
|
|
JMP handle_loop
|
|
// Unreachable: directly preceded by an unconditional jump and not a label.
JMP loop_finished
|
|
|
|
// Match is longer than the AX bytes available from the R14 source: copy those
// AX bytes exactly, then continue at copy_match with the remainder.
copy_all_from_history:
|
|
MOVQ AX, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(R10)(R15*1), R10
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ AX, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ AX, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
// BP is used as a scratch register in the copy_5 small moves.
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(AX*1), BP
|
|
MOVB R15, (R10)
|
|
MOVB BP, -1(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (R10)
|
|
MOVB BP, 2(R10)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(AX*1), BP
|
|
MOVL R15, (R10)
|
|
MOVL BP, -4(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(AX*1), BP
|
|
MOVQ R15, (R10)
|
|
MOVQ BP, -8(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
|
|
// Account for the bytes just copied and reduce the remaining match length.
copy_5_end:
|
|
ADDQ AX, R12
|
|
SUBQ AX, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
// AX = source address: mo bytes behind the write pointer.
MOVQ R10, AX
|
|
SUBQ CX, AX
|
|
|
|
// ml <= mo
|
|
CMPQ R13, CX
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
// The position and write pointer advance by exactly ml up front; CX keeps
// the old write pointer as the destination cursor for the chunked copy.
ADDQ R13, R12
|
|
MOVQ R10, CX
|
|
ADDQ R13, R10
|
|
|
|
// At least one 16-byte chunk is always copied, and the last chunk may extend
// up to 15 bytes past ml.
copy_2:
|
|
MOVUPS (AX), X0
|
|
MOVUPS X0, (CX)
|
|
ADDQ $0x10, AX
|
|
ADDQ $0x10, CX
|
|
SUBQ $0x10, R13
|
|
JHI copy_2
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
|
|
copy_overlapping_match:
|
|
ADDQ R13, R12
|
|
|
|
// Byte-at-a-time so bytes written earlier in this loop can be read again as
// source when the ranges overlap.
copy_slow_3:
|
|
MOVB (AX), CL
|
|
MOVB CL, (R10)
|
|
INCQ AX
|
|
INCQ R10
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
// Decrement the counter at 96(ctx) and loop while it stays non-negative.
handle_loop:
|
|
MOVQ ctx+16(FP), AX
|
|
DECQ 96(AX)
|
|
JNS sequenceDecs_decodeSync_amd64_main_loop
|
|
|
|
// Write the bit reader state back to br (value, bits consumed, offset).
loop_finished:
|
|
MOVQ br+8(FP), AX
|
|
MOVQ DX, 32(AX)
|
|
MOVB BL, 40(AX)
|
|
MOVQ SI, 24(AX)
|
|
|
|
// Update the context
|
|
// 136(ctx) = output position; 168(ctx) = literal pointer minus the pointer at
// 144(ctx), i.e. the number of literal bytes consumed.
MOVQ ctx+16(FP), AX
|
|
MOVQ R12, 136(AX)
|
|
MOVQ 144(AX), CX
|
|
SUBQ CX, R11
|
|
MOVQ R11, 168(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
|
|
MOVQ 16(SP), AX
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ AX, 216(CX)
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
error_match_off_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ CX, 224(AX)
|
|
MOVQ R12, 136(AX)
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough output space error
|
|
error_not_enough_space:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ R12, 136(AX)
|
|
MOVQ $0x00000005, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
|
|
// Requires: BMI, BMI2, CMOV, SSE
|
|
//
// Decodes one sequence at a time from the bit reader and immediately executes
// it (literal copy followed by match copy) into the output buffer, using
// BEXTR / BZHI / SHRX for the bit-field work.
//
// Return codes, as stored to ret+24(FP) below:
//   0 success, 1 match length / offset mismatch, 2 match length too big,
//   3 match offset too big, 4 not enough literals, 5 not enough output space.
//
// Register roles, as established by the loads below:
//   AX  = bit reader 64-bit value (32(br))     DX = bits consumed (byte 40(br))
//   BX  = bit reader offset (24(br))           (SP) = input read pointer
//   SI, DI, R8 = state words loaded from 72/80/88(ctx); used as the literal
//                length, match length and offset states respectively
//   R9  = output write pointer                 R10 = literal source pointer
//   R11 = output position
// Stack slots: 8(SP) = mo, 16(SP) = ml, 24(SP) = ll, 32(SP) = past-end output
// pointer, 40(SP) = value from 184(ctx), 48(SP) = 176(ctx) + 184(ctx),
// 56(SP) = upper bound for the match offset (200(ctx)).
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
|
|
MOVQ br+8(FP), CX
|
|
MOVQ 32(CX), AX
|
|
MOVBQZX 40(CX), DX
|
|
MOVQ 24(CX), BX
|
|
MOVQ (CX), CX
|
|
ADDQ BX, CX
|
|
MOVQ CX, (SP)
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 72(CX), SI
|
|
MOVQ 80(CX), DI
|
|
MOVQ 88(CX), R8
|
|
// Zero the mo / ml / ll stack slots.
XORQ R9, R9
|
|
MOVQ R9, 8(SP)
|
|
MOVQ R9, 16(SP)
|
|
MOVQ R9, 24(SP)
|
|
MOVQ 112(CX), R9
|
|
MOVQ 128(CX), R10
|
|
MOVQ R10, 32(SP)
|
|
MOVQ 144(CX), R10
|
|
MOVQ 136(CX), R11
|
|
MOVQ 200(CX), R12
|
|
MOVQ R12, 56(SP)
|
|
MOVQ 176(CX), R12
|
|
MOVQ R12, 48(SP)
|
|
MOVQ 184(CX), CX
|
|
MOVQ CX, 40(SP)
|
|
// 48(SP) += 40(SP): presumably turns the history base into a pointer to the
// end of the history - confirm.
MOVQ 40(SP), CX
|
|
ADDQ CX, 48(SP)
|
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
|
|
ADDQ R9, 32(SP)
|
|
|
|
// outBase += outPosition
|
|
ADDQ R11, R9
|
|
|
|
sequenceDecs_decodeSync_bmi2_main_loop:
|
|
MOVQ (SP), R12
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
|
|
// Fast path: step the read pointer back by DX/8 whole consumed bytes, reload
// all 64 bits, and keep only the sub-byte part of the consumed-bit count.
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R12
|
|
MOVQ (R12), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decodeSync_bmi2_fill_end
|
|
|
|
// Slow path: shift in one byte at a time while input remains (BX > 0) and at
// least 8 bits have been consumed (DX > 7).
sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decodeSync_bmi2_fill_end
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decodeSync_bmi2_fill_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R12
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R12), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_end:
|
|
// Update offset
|
|
// BEXTR control 0x0808 = start bit 8, length 8: R13 = byte 1 of the state
// word, the number of extra bits n. ROLQ by (DX + n) rotates the next n
// unread bits into the low end of R14, BZHI keeps just those n bits, and they
// are added to the base value held in the upper 32 bits of the state word.
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ R8, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 8(SP)
|
|
|
|
// Update match length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, DI, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ DI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 16(SP)
|
|
|
|
// Fill bitreader to have enough for the remaining
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R12
|
|
MOVQ (R12), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decodeSync_bmi2_fill_2_end
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decodeSync_bmi2_fill_2_end
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decodeSync_bmi2_fill_2_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R12
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R12), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_2_end:
|
|
// Update literal length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, SI, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ SI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 24(SP)
|
|
|
|
// Fill bitreader for state updates
|
|
// Save the read pointer, and keep byte 1 of the offset state (its extra-bit
// count) in R12 for the offset adjustment further down, since R8 is replaced
// by the state update. The update is skipped when the counter at 96(ctx) is 0.
MOVQ R12, (SP)
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R12
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decodeSync_bmi2_skip_update
|
|
// R13 = low byte of (SI + DI + R8): the combined bit count of the three
// states. All of those bits are fetched into R14 at once and then split per
// state below, with each state's low byte giving its share.
LEAQ (SI)(DI*1), R13
|
|
ADDQ R8, R13
|
|
MOVBQZX R13, R13
|
|
LEAQ (DX)(R13*1), CX
|
|
MOVQ AX, R14
|
|
MOVQ CX, DX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
|
|
// Update Offset State
|
|
// BZHI takes this state's bits from the low end of R14 and SHRX drops them.
// BEXTR control 0x1010 = start bit 16, length 16: the base of the next state
// index, to which the bits are added.
BZHIQ R8, R14, CX
|
|
SHRXQ R8, R14, R14
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, R8, R8
|
|
ADDQ CX, R8
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Match Length State
|
|
BZHIQ DI, R14, CX
|
|
SHRXQ DI, R14, R14
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, DI, DI
|
|
ADDQ CX, DI
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Literal Length State
|
|
BZHIQ SI, R14, CX
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, SI, SI
|
|
ADDQ CX, SI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(SI*8), SI
|
|
|
|
sequenceDecs_decodeSync_bmi2_skip_update:
|
|
// Adjust offset
|
|
// 144/152/160(s) hold three previous offsets, most recent first. When the
// offset used more than one extra bit (R12 > 1) it is a new offset: the
// 16-byte load/store pair shifts the first two entries down by one slot and
// the new offset is stored at 144(s).
MOVQ s+0(FP), CX
|
|
MOVQ 8(SP), R13
|
|
CMPQ R12, $0x01
|
|
JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
|
|
MOVUPS 144(CX), X0
|
|
MOVQ R13, 144(CX)
|
|
MOVUPS X0, 152(CX)
|
|
JMP sequenceDecs_decodeSync_bmi2_after_adjust
|
|
|
|
// Repeat-offset handling: with a zero literal length the code is bumped by 1.
sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
|
|
CMPQ 24(SP), $0x00000000
|
|
JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
|
|
INCQ R13
|
|
JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
|
|
|
|
// Code 0 reuses the most recent offset and leaves the history untouched.
sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
|
|
MOVQ 144(CX), R13
|
|
JMP sequenceDecs_decodeSync_bmi2_after_adjust
|
|
|
|
// R14 = previous offset selected by the code; code 3 selects entry 0 minus 1.
// A result of zero is replaced by 1.
sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
|
|
MOVQ R13, R12
|
|
XORQ R14, R14
|
|
MOVQ $-1, R15
|
|
CMPQ R13, $0x03
|
|
CMOVQEQ R14, R12
|
|
CMOVQEQ R15, R14
|
|
ADDQ 144(CX)(R12*8), R14
|
|
JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
|
|
MOVQ $0x00000001, R14
|
|
|
|
// Rotate the history: entry 1 moves to entry 2 unless the code is 1, then
// entry 0 moves to entry 1 and the selected value becomes entry 0.
sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
|
|
CMPQ R13, $0x01
|
|
JZ sequenceDecs_decodeSync_bmi2_adjust_skip
|
|
MOVQ 152(CX), R12
|
|
MOVQ R12, 160(CX)
|
|
|
|
sequenceDecs_decodeSync_bmi2_adjust_skip:
|
|
MOVQ 144(CX), R12
|
|
MOVQ R12, 152(CX)
|
|
MOVQ R14, 144(CX)
|
|
MOVQ R14, R13
|
|
|
|
sequenceDecs_decodeSync_bmi2_after_adjust:
|
|
MOVQ R13, 8(SP)
|
|
|
|
// Check values
|
|
// Adds ml + ll to the counter at 256(s) and subtracts ll from the counter at
// 104(ctx); a negative result there means the sequence asks for more literals
// than remain.
MOVQ 16(SP), CX
|
|
MOVQ 24(SP), R12
|
|
LEAQ (CX)(R12*1), R14
|
|
MOVQ s+0(FP), R15
|
|
ADDQ R14, 256(R15)
|
|
MOVQ ctx+16(FP), R14
|
|
SUBQ R12, 104(R14)
|
|
JS error_not_enough_literals
|
|
CMPQ CX, $0x00020002
|
|
JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
|
|
// A zero offset is only accepted together with a zero match length.
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
|
|
|
|
// From here on: CX = ll, R12 = mo, R13 = ml.
sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
|
|
MOVQ 24(SP), CX
|
|
MOVQ 8(SP), R12
|
|
MOVQ 16(SP), R13
|
|
|
|
// Check if we have enough space in s.out
|
|
LEAQ (CX)(R13*1), R14
|
|
ADDQ R9, R14
|
|
CMPQ R14, 32(SP)
|
|
JA error_not_enough_space
|
|
|
|
// Copy literals
|
|
TESTQ CX, CX
|
|
JZ check_offset
|
|
XORQ R14, R14
|
|
|
|
// Copies in whole 16-byte chunks, so up to 15 bytes beyond ll may be read and
// written; the pointers below advance by exactly ll. Presumably the buffers
// carry enough slack for this - confirm against the caller.
copy_1:
|
|
MOVUPS (R10)(R14*1), X0
|
|
MOVUPS X0, (R9)(R14*1)
|
|
ADDQ $0x10, R14
|
|
CMPQ R14, CX
|
|
JB copy_1
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
ADDQ CX, R11
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
|
|
check_offset:
|
|
MOVQ R11, CX
|
|
ADDQ 40(SP), CX
|
|
CMPQ R12, CX
|
|
JG error_match_off_too_big
|
|
CMPQ R12, 56(SP)
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
// CX = mo - output position: number of match bytes located before the start
// of the output. Zero or a borrow means the match is entirely in the output.
MOVQ R12, CX
|
|
SUBQ R11, CX
|
|
JLS copy_match
|
|
MOVQ 48(SP), R14
|
|
SUBQ CX, R14
|
|
CMPQ R13, CX
|
|
JG copy_all_from_history
|
|
// Whole match comes from the R14 source; exact-length copy of R13 bytes.
MOVQ R13, CX
|
|
SUBQ $0x10, CX
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, CX
|
|
JAE copy_4_loop
|
|
// CX is in [-16, -1] here; 16+CX lands both pointers on the exact end, and
// the last 16 bytes are copied with an overlapping move.
LEAQ 16(R14)(CX*1), R14
|
|
LEAQ 16(R9)(CX*1), R9
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_4_end
|
|
|
|
// NOTE(review): no 1-or-2 byte case here; lengths below 3 would fall into
// copy_4_move_4through7. Presumably ml is always >= 3 at this point - confirm.
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), CX
|
|
MOVB 2(R14), R12
|
|
MOVW CX, (R9)
|
|
MOVB R12, 2(R9)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), CX
|
|
MOVL -4(R14)(R13*1), R12
|
|
MOVL CX, (R9)
|
|
MOVL R12, -4(R9)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), CX
|
|
MOVQ -8(R14)(R13*1), R12
|
|
MOVQ CX, (R9)
|
|
MOVQ R12, -8(R9)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
|
|
copy_4_end:
|
|
ADDQ R13, R11
|
|
JMP handle_loop
|
|
// Unreachable: directly preceded by an unconditional jump and not a label.
JMP loop_finished
|
|
|
|
// Match is longer than the CX bytes available from the R14 source: copy those
// CX bytes exactly, then continue at copy_match with the remainder.
copy_all_from_history:
|
|
MOVQ CX, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(R9)(R15*1), R9
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ CX, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ CX, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
// BP is used as a scratch register in the copy_5 small moves.
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(CX*1), BP
|
|
MOVB R15, (R9)
|
|
MOVB BP, -1(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (R9)
|
|
MOVB BP, 2(R9)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(CX*1), BP
|
|
MOVL R15, (R9)
|
|
MOVL BP, -4(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(CX*1), BP
|
|
MOVQ R15, (R9)
|
|
MOVQ BP, -8(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
|
|
// Account for the bytes just copied and reduce the remaining match length.
copy_5_end:
|
|
ADDQ CX, R11
|
|
SUBQ CX, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
// CX = source address: mo bytes behind the write pointer.
MOVQ R9, CX
|
|
SUBQ R12, CX
|
|
|
|
// ml <= mo
|
|
CMPQ R13, R12
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
// The position and write pointer advance by exactly ml up front; R12 keeps
// the old write pointer as the destination cursor for the chunked copy.
ADDQ R13, R11
|
|
MOVQ R9, R12
|
|
ADDQ R13, R9
|
|
|
|
// At least one 16-byte chunk is always copied, and the last chunk may extend
// up to 15 bytes past ml.
copy_2:
|
|
MOVUPS (CX), X0
|
|
MOVUPS X0, (R12)
|
|
ADDQ $0x10, CX
|
|
ADDQ $0x10, R12
|
|
SUBQ $0x10, R13
|
|
JHI copy_2
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
|
|
copy_overlapping_match:
|
|
ADDQ R13, R11
|
|
|
|
// Byte-at-a-time so bytes written earlier in this loop can be read again as
// source when the ranges overlap.
copy_slow_3:
|
|
MOVB (CX), R12
|
|
MOVB R12, (R9)
|
|
INCQ CX
|
|
INCQ R9
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
// Decrement the counter at 96(ctx) and loop while it stays non-negative.
handle_loop:
|
|
MOVQ ctx+16(FP), CX
|
|
DECQ 96(CX)
|
|
JNS sequenceDecs_decodeSync_bmi2_main_loop
|
|
|
|
// Write the bit reader state back to br (value, bits consumed, offset).
loop_finished:
|
|
MOVQ br+8(FP), CX
|
|
MOVQ AX, 32(CX)
|
|
MOVB DL, 40(CX)
|
|
MOVQ BX, 24(CX)
|
|
|
|
// Update the context
|
|
// 136(ctx) = output position; 168(ctx) = literal pointer minus the pointer at
// 144(ctx), i.e. the number of literal bytes consumed.
MOVQ ctx+16(FP), AX
|
|
MOVQ R11, 136(AX)
|
|
MOVQ 144(AX), CX
|
|
SUBQ CX, R10
|
|
MOVQ R10, 168(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
|
|
MOVQ 16(SP), AX
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ AX, 216(CX)
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
error_match_off_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ CX, 224(AX)
|
|
MOVQ R11, 136(AX)
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough output space error
|
|
error_not_enough_space:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ R11, 136(AX)
|
|
MOVQ $0x00000005, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
|
|
// Requires: CMOV, SSE
|
|
// Decodes sequences in a loop and executes each one right away: the literals
// are copied to the output, then the match is copied from the history buffer
// and/or from the output written so far.
// Result: 0 on success, 1 = match length set but offset is zero, 2 = match too
// long, 3 = match offset too big, 4 = not enough literals, 5 = not enough
// output space.
//
// Frame: 64 bytes of locals, 32 bytes of arguments + result.
// Stack slots:
//    0(SP)  input read pointer ((br) + 24(br)); the fill code walks it backwards
//    8(SP)  match offset of the current sequence
//   16(SP)  match length of the current sequence
//   24(SP)  literal length of the current sequence
//   32(SP)  past-end pointer of the output buffer (112(ctx) + 128(ctx))
//   40(SP)  184(ctx), used as len(hist) by the offset check
//   48(SP)  176(ctx) + 184(ctx), i.e. the pointer just past the history bytes
//   56(SP)  200(ctx), used as s.windowSize by the offset check
// Registers kept live across the main loop:
//   DX        bit reader bit buffer (32(br))
//   BX        bits already consumed from the bit buffer (byte at 40(br))
//   SI        input offset / bytes still unread (24(br))
//   DI/R8/R9  literal-length / match-length / offset decoder state words
//   R10       output write pointer
//   R11       literals read pointer
//   R12       output position
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
|
|
// Load the bit reader state and compute the initial read pointer.
MOVQ br+8(FP), AX
|
|
MOVQ 32(AX), DX
|
|
MOVBQZX 40(AX), BX
|
|
MOVQ 24(AX), SI
|
|
MOVQ (AX), AX
|
|
ADDQ SI, AX
|
|
MOVQ AX, (SP)
|
|
// Load the three decoder state words from ctx.
MOVQ ctx+16(FP), AX
|
|
MOVQ 72(AX), DI
|
|
MOVQ 80(AX), R8
|
|
MOVQ 88(AX), R9
|
|
// Zero the per-sequence offset / match length / literal length slots.
XORQ CX, CX
|
|
MOVQ CX, 8(SP)
|
|
MOVQ CX, 16(SP)
|
|
MOVQ CX, 24(SP)
|
|
// R10 = output base pointer; 32(SP) = 128(ctx), turned into a pointer below.
MOVQ 112(AX), R10
|
|
MOVQ 128(AX), CX
|
|
MOVQ CX, 32(SP)
|
|
// R11 = literals read pointer, R12 = output position.
MOVQ 144(AX), R11
|
|
MOVQ 136(AX), R12
|
|
MOVQ 200(AX), CX
|
|
MOVQ CX, 56(SP)
|
|
MOVQ 176(AX), CX
|
|
MOVQ CX, 48(SP)
|
|
MOVQ 184(AX), AX
|
|
MOVQ AX, 40(SP)
|
|
// 48(SP) += 40(SP): advance the history pointer to just past the history bytes.
MOVQ 40(SP), AX
|
|
ADDQ AX, 48(SP)
|
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
|
|
ADDQ R10, 32(SP)
|
|
|
|
// outBase += outPosition
|
|
ADDQ R12, R10
|
|
|
|
// Top of the per-sequence loop. R13 = current input read pointer (from 0(SP)).
sequenceDecs_decodeSync_safe_amd64_main_loop:
|
|
MOVQ (SP), R13
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
// Fast path (SI >= 8, signed): step the read pointer back by the number of
// whole bytes already consumed (BX >> 3), reload the full 8-byte bit buffer
// from there, and keep only the sub-byte bit position in BX.
CMPQ SI, $0x08
|
|
JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R13
|
|
MOVQ (R13), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_end
|
|
|
|
// Slow path: while input remains (SI > 0) and at least one whole byte has been
// consumed (BX > 7), shift the bit buffer left by 8 and OR in the byte that
// precedes the read pointer.
sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_end
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R13), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_end:
|
|
// Update offset
|
|
MOVQ R9, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_of_update_zero:
|
|
MOVQ AX, 8(SP)
|
|
|
|
// Update match length
|
|
MOVQ R8, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
|
|
MOVQ AX, 16(SP)
|
|
|
|
// Fill bitreader to have enough for the remaining
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R13
|
|
MOVQ (R13), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R13), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
|
|
// Update literal length
|
|
MOVQ DI, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
|
|
MOVQ AX, 24(SP)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R13, (SP)
|
|
MOVQ R9, AX
|
|
SHRQ $0x08, AX
|
|
MOVBQZX AL, AX
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decodeSync_safe_amd64_skip_update
|
|
|
|
// Update Literal Length State
|
|
MOVBQZX DI, R13
|
|
SHRQ $0x10, DI
|
|
MOVWQZX DI, DI
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, DI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Match Length State
|
|
MOVBQZX R8, R13
|
|
SHRQ $0x10, R8
|
|
MOVWQZX R8, R8
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, R8
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Offset State
|
|
MOVBQZX R9, R13
|
|
SHRQ $0x10, R9
|
|
MOVWQZX R9, R9
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, R9
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R9*8), R9
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_skip_update:
|
|
// Adjust offset
|
|
MOVQ s+0(FP), CX
|
|
MOVQ 8(SP), R13
|
|
CMPQ AX, $0x01
|
|
JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
|
|
MOVUPS 144(CX), X0
|
|
MOVQ R13, 144(CX)
|
|
MOVUPS X0, 152(CX)
|
|
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
|
|
CMPQ 24(SP), $0x00000000
|
|
JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
|
|
INCQ R13
|
|
JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
|
|
MOVQ 144(CX), R13
|
|
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
|
|
MOVQ R13, AX
|
|
XORQ R14, R14
|
|
MOVQ $-1, R15
|
|
CMPQ R13, $0x03
|
|
CMOVQEQ R14, AX
|
|
CMOVQEQ R15, R14
|
|
ADDQ 144(CX)(AX*8), R14
|
|
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
|
|
MOVQ $0x00000001, R14
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
|
|
CMPQ R13, $0x01
|
|
JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
|
|
MOVQ 152(CX), AX
|
|
MOVQ AX, 160(CX)
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_skip:
|
|
MOVQ 144(CX), AX
|
|
MOVQ AX, 152(CX)
|
|
MOVQ R14, 144(CX)
|
|
MOVQ R14, R13
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_after_adjust:
|
|
MOVQ R13, 8(SP)
|
|
|
|
// Check values
|
|
// AX = match length, CX = literal length, R13 = match offset just stored to 8(SP).
MOVQ 16(SP), AX
|
|
MOVQ 24(SP), CX
|
|
// 256(s) += ml + ll.
// NOTE(review): presumably a running decoded-size counter on sequenceDecs - confirm the field.
LEAQ (AX)(CX*1), R14
|
|
MOVQ s+0(FP), R15
|
|
ADDQ R14, 256(R15)
|
|
// 104(ctx) -= ll; a negative result means the sequence asks for more literals
// than remain. Both counters above are updated before any validation, so they
// stay modified on the error paths too.
MOVQ ctx+16(FP), R14
|
|
SUBQ CX, 104(R14)
|
|
JS error_not_enough_literals
|
|
// Unsigned check ml <= 0x20002 (131074).
// NOTE(review): presumably the maximum match length allowed by the format - confirm.
CMPQ AX, $0x00020002
|
|
JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
|
|
// A zero offset is only acceptable together with a zero match length.
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
|
|
TESTQ AX, AX
|
|
JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
|
|
|
|
// From here on: AX = literal length, CX = match offset, R13 = match length.
sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
|
|
MOVQ 24(SP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ 16(SP), R13
|
|
|
|
// Check if we have enough space in s.out
|
|
// R14 = write pointer + ll + ml; it must not exceed the past-end pointer in
// 32(SP) (unsigned pointer comparison).
LEAQ (AX)(R13*1), R14
|
|
ADDQ R10, R14
|
|
CMPQ R14, 32(SP)
|
|
JA error_not_enough_space
|
|
|
|
// Copy literals
|
|
TESTQ AX, AX
|
|
JZ check_offset
|
|
MOVQ AX, R14
|
|
SUBQ $0x10, R14
|
|
JB copy_1_small
|
|
|
|
copy_1_loop:
|
|
MOVUPS (R11), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R11
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, R14
|
|
JAE copy_1_loop
|
|
LEAQ 16(R11)(R14*1), R11
|
|
LEAQ 16(R10)(R14*1), R10
|
|
MOVUPS -16(R11), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_1_end
|
|
|
|
copy_1_small:
|
|
CMPQ AX, $0x03
|
|
JE copy_1_move_3
|
|
JB copy_1_move_1or2
|
|
CMPQ AX, $0x08
|
|
JB copy_1_move_4through7
|
|
JMP copy_1_move_8through16
|
|
|
|
copy_1_move_1or2:
|
|
MOVB (R11), R14
|
|
MOVB -1(R11)(AX*1), R15
|
|
MOVB R14, (R10)
|
|
MOVB R15, -1(R10)(AX*1)
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_3:
|
|
MOVW (R11), R14
|
|
MOVB 2(R11), R15
|
|
MOVW R14, (R10)
|
|
MOVB R15, 2(R10)
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_4through7:
|
|
MOVL (R11), R14
|
|
MOVL -4(R11)(AX*1), R15
|
|
MOVL R14, (R10)
|
|
MOVL R15, -4(R10)(AX*1)
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_8through16:
|
|
MOVQ (R11), R14
|
|
MOVQ -8(R11)(AX*1), R15
|
|
MOVQ R14, (R10)
|
|
MOVQ R15, -8(R10)(AX*1)
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
|
|
copy_1_end:
|
|
ADDQ AX, R12
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
// CX = match offset, R12 = output position (t), 40(SP) = len(hist),
// 56(SP) = window size. Both comparisons are signed (JG).
|
|
check_offset:
|
|
MOVQ R12, AX
|
|
ADDQ 40(SP), AX
|
|
CMPQ CX, AX
|
|
JG error_match_off_too_big
|
|
CMPQ CX, 56(SP)
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ CX, AX
|
|
SUBQ R12, AX
|
|
JLS copy_match
|
|
MOVQ 48(SP), R14
|
|
SUBQ AX, R14
|
|
CMPQ R13, AX
|
|
JG copy_all_from_history
|
|
MOVQ R13, AX
|
|
SUBQ $0x10, AX
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, AX
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(AX*1), R14
|
|
LEAQ 16(R10)(AX*1), R10
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_4_end
|
|
|
|
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), AX
|
|
MOVB 2(R14), CL
|
|
MOVW AX, (R10)
|
|
MOVB CL, 2(R10)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), AX
|
|
MOVL -4(R14)(R13*1), CX
|
|
MOVL AX, (R10)
|
|
MOVL CX, -4(R10)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), AX
|
|
MOVQ -8(R14)(R13*1), CX
|
|
MOVQ AX, (R10)
|
|
MOVQ CX, -8(R10)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
|
|
// The whole match (R13 bytes) was copied from history: advance the output
// position and continue with the next sequence.
copy_4_end:
|
|
ADDQ R13, R12
|
|
JMP handle_loop
|
|
// NOTE(review): unreachable - it follows an unconditional JMP and carries no
// label. Harmless leftover of the generator.
JMP loop_finished
|
|
|
|
copy_all_from_history:
|
|
MOVQ AX, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(R10)(R15*1), R10
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ AX, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ AX, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(AX*1), BP
|
|
MOVB R15, (R10)
|
|
MOVB BP, -1(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (R10)
|
|
MOVB BP, 2(R10)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(AX*1), BP
|
|
MOVL R15, (R10)
|
|
MOVL BP, -4(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(AX*1), BP
|
|
MOVQ R15, (R10)
|
|
MOVQ BP, -8(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
|
|
copy_5_end:
|
|
ADDQ AX, R12
|
|
SUBQ AX, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
MOVQ R10, AX
|
|
SUBQ CX, AX
|
|
|
|
// ml <= mo
|
|
CMPQ R13, CX
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, R12
|
|
MOVQ R13, CX
|
|
SUBQ $0x10, CX
|
|
JB copy_2_small
|
|
|
|
copy_2_loop:
|
|
MOVUPS (AX), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, AX
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, CX
|
|
JAE copy_2_loop
|
|
LEAQ 16(AX)(CX*1), AX
|
|
LEAQ 16(R10)(CX*1), R10
|
|
MOVUPS -16(AX), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_2_end
|
|
|
|
copy_2_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_2_move_3
|
|
JB copy_2_move_1or2
|
|
CMPQ R13, $0x08
|
|
JB copy_2_move_4through7
|
|
JMP copy_2_move_8through16
|
|
|
|
copy_2_move_1or2:
|
|
MOVB (AX), CL
|
|
MOVB -1(AX)(R13*1), R14
|
|
MOVB CL, (R10)
|
|
MOVB R14, -1(R10)(R13*1)
|
|
ADDQ R13, AX
|
|
ADDQ R13, R10
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_3:
|
|
MOVW (AX), CX
|
|
MOVB 2(AX), R14
|
|
MOVW CX, (R10)
|
|
MOVB R14, 2(R10)
|
|
ADDQ R13, AX
|
|
ADDQ R13, R10
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_4through7:
|
|
MOVL (AX), CX
|
|
MOVL -4(AX)(R13*1), R14
|
|
MOVL CX, (R10)
|
|
MOVL R14, -4(R10)(R13*1)
|
|
ADDQ R13, AX
|
|
ADDQ R13, R10
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_8through16:
|
|
MOVQ (AX), CX
|
|
MOVQ -8(AX)(R13*1), R14
|
|
MOVQ CX, (R10)
|
|
MOVQ R14, -8(R10)(R13*1)
|
|
ADDQ R13, AX
|
|
ADDQ R13, R10
|
|
|
|
copy_2_end:
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
// Reached when match length (R13) > match offset, so source (AX = write pointer
// - offset) and destination (R10) ranges overlap. The copy goes one byte at a
// time, front to back, so bytes written by earlier iterations are read again
// by later ones. R13 >= 1 here, so the DECQ/JNZ loop terminates.
|
|
copy_overlapping_match:
|
|
// Account for the match in the output position up front.
ADDQ R13, R12
|
|
|
|
copy_slow_3:
|
|
MOVB (AX), CL
|
|
MOVB CL, (R10)
|
|
INCQ AX
|
|
INCQ R10
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
// End of one sequence: decrement the counter at 96(ctx) and run the main loop
// again while the result is still non-negative.
handle_loop:
|
|
MOVQ ctx+16(FP), AX
|
|
DECQ 96(AX)
|
|
JNS sequenceDecs_decodeSync_safe_amd64_main_loop
|
|
|
|
// All sequences done: write the bit reader state held in registers back to br
// (bit buffer, consumed-bit count as a single byte, input offset).
loop_finished:
|
|
MOVQ br+8(FP), AX
|
|
MOVQ DX, 32(AX)
|
|
MOVB BL, 40(AX)
|
|
MOVQ SI, 24(AX)
|
|
|
|
// Update the context
|
|
// 136(ctx) = output position; 168(ctx) = literals read pointer minus the
// literals base pointer at 144(ctx), i.e. the number of literal bytes consumed.
MOVQ ctx+16(FP), AX
|
|
MOVQ R12, 136(AX)
|
|
MOVQ 144(AX), CX
|
|
SUBQ CX, R11
|
|
MOVQ R11, 168(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Error exits. Each one records the offending values in ctx (208 = literal
// length, 216 = match length, 224 = match offset) and returns a distinct code.
// None of them writes the bit reader registers back to br; only codes 3 and 5
// also store the output position to 136(ctx).

// Return with match length error
// Code 1: match length is non-zero while the match offset is zero.
|
|
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
|
|
MOVQ 16(SP), AX
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ AX, 216(CX)
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
// Code 2: match length exceeded the limit checked in the main loop.
|
|
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
// Code 3: also saves the output position (R12), since literals may already
// have been copied for this sequence.
|
|
error_match_off_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ CX, 224(AX)
|
|
MOVQ R12, 136(AX)
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
// Code 4: reports the requested literal length.
|
|
error_not_enough_literals:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough output space error
// Code 5: reports literal length, match length and the output position (R12).
|
|
error_not_enough_space:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ R12, 136(AX)
|
|
MOVQ $0x00000005, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
|
|
// Requires: BMI, BMI2, CMOV, SSE
|
|
// Decodes sequences in a loop and executes each one right away: the literals
// are copied to the output, then the match is copied from the history buffer
// and/or from the output written so far. Bit-field extraction in this variant
// uses BEXTR/BZHI/SHRX.
// Result: 0 on success, 1 = match length set but offset is zero, 2 = match too
// long, 3 = match offset too big, 4 = not enough literals, 5 = not enough
// output space.
//
// Frame: 64 bytes of locals, 32 bytes of arguments + result.
// Stack slots:
//    0(SP)  input read pointer ((br) + 24(br)); the fill code walks it backwards
//    8(SP)  match offset of the current sequence
//   16(SP)  match length of the current sequence
//   24(SP)  literal length of the current sequence
//   32(SP)  past-end pointer of the output buffer (112(ctx) + 128(ctx))
//   40(SP)  184(ctx), used as len(hist) by the offset check
//   48(SP)  176(ctx) + 184(ctx), i.e. the pointer just past the history bytes
//   56(SP)  200(ctx), used as s.windowSize by the offset check
// Registers kept live across the main loop:
//   AX        bit reader bit buffer (32(br))
//   DX        bits already consumed from the bit buffer (byte at 40(br))
//   BX        input offset / bytes still unread (24(br))
//   SI/DI/R8  literal-length / match-length / offset decoder state words
//   R9        output write pointer
//   R10       literals read pointer
//   R11       output position
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
|
|
// Load the bit reader state and compute the initial read pointer.
MOVQ br+8(FP), CX
|
|
MOVQ 32(CX), AX
|
|
MOVBQZX 40(CX), DX
|
|
MOVQ 24(CX), BX
|
|
MOVQ (CX), CX
|
|
ADDQ BX, CX
|
|
MOVQ CX, (SP)
|
|
// Load the three decoder state words from ctx.
MOVQ ctx+16(FP), CX
|
|
MOVQ 72(CX), SI
|
|
MOVQ 80(CX), DI
|
|
MOVQ 88(CX), R8
|
|
// Zero the per-sequence offset / match length / literal length slots.
XORQ R9, R9
|
|
MOVQ R9, 8(SP)
|
|
MOVQ R9, 16(SP)
|
|
MOVQ R9, 24(SP)
|
|
// R9 = output base pointer; 32(SP) = 128(ctx), turned into a pointer below.
MOVQ 112(CX), R9
|
|
MOVQ 128(CX), R10
|
|
MOVQ R10, 32(SP)
|
|
// R10 = literals read pointer, R11 = output position.
MOVQ 144(CX), R10
|
|
MOVQ 136(CX), R11
|
|
MOVQ 200(CX), R12
|
|
MOVQ R12, 56(SP)
|
|
MOVQ 176(CX), R12
|
|
MOVQ R12, 48(SP)
|
|
MOVQ 184(CX), CX
|
|
MOVQ CX, 40(SP)
|
|
// 48(SP) += 40(SP): advance the history pointer to just past the history bytes.
MOVQ 40(SP), CX
|
|
ADDQ CX, 48(SP)
|
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
|
|
ADDQ R9, 32(SP)
|
|
|
|
// outBase += outPosition
|
|
ADDQ R11, R9
|
|
|
|
// Top of the per-sequence loop. R12 = current input read pointer (from 0(SP)).
sequenceDecs_decodeSync_safe_bmi2_main_loop:
|
|
MOVQ (SP), R12
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
// Fast path (BX >= 8, signed): step the read pointer back by the number of
// whole bytes already consumed (DX >> 3), reload the full 8-byte bit buffer
// from there, and keep only the sub-byte bit position in DX.
CMPQ BX, $0x08
|
|
JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R12
|
|
MOVQ (R12), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
|
|
|
|
// Slow path: while input remains (BX > 0) and at least one whole byte has been
// consumed (DX > 7), shift the bit buffer left by 8 and OR in the byte that
// precedes the read pointer.
sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R12
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R12), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_end:
|
|
// Update offset
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ R8, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 8(SP)
|
|
|
|
// Update match length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, DI, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ DI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 16(SP)
|
|
|
|
// Fill bitreader to have enough for the remaining
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R12
|
|
MOVQ (R12), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R12
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R12), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
|
|
// Update literal length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, SI, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ SI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 24(SP)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R12, (SP)
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R12
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
|
|
LEAQ (SI)(DI*1), R13
|
|
ADDQ R8, R13
|
|
MOVBQZX R13, R13
|
|
LEAQ (DX)(R13*1), CX
|
|
MOVQ AX, R14
|
|
MOVQ CX, DX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
|
|
// Update Offset State
|
|
BZHIQ R8, R14, CX
|
|
SHRXQ R8, R14, R14
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, R8, R8
|
|
ADDQ CX, R8
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Match Length State
|
|
BZHIQ DI, R14, CX
|
|
SHRXQ DI, R14, R14
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, DI, DI
|
|
ADDQ CX, DI
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Literal Length State
|
|
BZHIQ SI, R14, CX
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, SI, SI
|
|
ADDQ CX, SI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(SI*8), SI
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_skip_update:
|
|
// Adjust offset
|
|
MOVQ s+0(FP), CX
|
|
MOVQ 8(SP), R13
|
|
CMPQ R12, $0x01
|
|
JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
|
|
MOVUPS 144(CX), X0
|
|
MOVQ R13, 144(CX)
|
|
MOVUPS X0, 152(CX)
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
|
|
CMPQ 24(SP), $0x00000000
|
|
JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
|
|
INCQ R13
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
|
|
MOVQ 144(CX), R13
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
|
|
MOVQ R13, R12
|
|
XORQ R14, R14
|
|
MOVQ $-1, R15
|
|
CMPQ R13, $0x03
|
|
CMOVQEQ R14, R12
|
|
CMOVQEQ R15, R14
|
|
ADDQ 144(CX)(R12*8), R14
|
|
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
|
|
MOVQ $0x00000001, R14
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
|
|
CMPQ R13, $0x01
|
|
JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
|
|
MOVQ 152(CX), R12
|
|
MOVQ R12, 160(CX)
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
|
|
MOVQ 144(CX), R12
|
|
MOVQ R12, 152(CX)
|
|
MOVQ R14, 144(CX)
|
|
MOVQ R14, R13
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_after_adjust:
|
|
MOVQ R13, 8(SP)
|
|
|
|
// Check values
|
|
// CX = match length, R12 = literal length, R13 = match offset just stored to 8(SP).
MOVQ 16(SP), CX
|
|
MOVQ 24(SP), R12
|
|
// 256(s) += ml + ll.
// NOTE(review): presumably a running decoded-size counter on sequenceDecs - confirm the field.
LEAQ (CX)(R12*1), R14
|
|
MOVQ s+0(FP), R15
|
|
ADDQ R14, 256(R15)
|
|
// 104(ctx) -= ll; a negative result means the sequence asks for more literals
// than remain. Both counters above are updated before any validation, so they
// stay modified on the error paths too.
MOVQ ctx+16(FP), R14
|
|
SUBQ R12, 104(R14)
|
|
JS error_not_enough_literals
|
|
// Unsigned check ml <= 0x20002 (131074).
// NOTE(review): presumably the maximum match length allowed by the format - confirm.
CMPQ CX, $0x00020002
|
|
JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
|
|
// A zero offset is only acceptable together with a zero match length.
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
|
|
|
|
// From here on: CX = literal length, R12 = match offset, R13 = match length.
sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
|
|
MOVQ 24(SP), CX
|
|
MOVQ 8(SP), R12
|
|
MOVQ 16(SP), R13
|
|
|
|
// Check if we have enough space in s.out
|
|
// R14 = write pointer + ll + ml; it must not exceed the past-end pointer in
// 32(SP) (unsigned pointer comparison).
LEAQ (CX)(R13*1), R14
|
|
ADDQ R9, R14
|
|
CMPQ R14, 32(SP)
|
|
JA error_not_enough_space
|
|
|
|
// Copy literals
|
|
TESTQ CX, CX
|
|
JZ check_offset
|
|
MOVQ CX, R14
|
|
SUBQ $0x10, R14
|
|
JB copy_1_small
|
|
|
|
copy_1_loop:
|
|
MOVUPS (R10), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R10
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, R14
|
|
JAE copy_1_loop
|
|
LEAQ 16(R10)(R14*1), R10
|
|
LEAQ 16(R9)(R14*1), R9
|
|
MOVUPS -16(R10), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_1_end
|
|
|
|
copy_1_small:
|
|
CMPQ CX, $0x03
|
|
JE copy_1_move_3
|
|
JB copy_1_move_1or2
|
|
CMPQ CX, $0x08
|
|
JB copy_1_move_4through7
|
|
JMP copy_1_move_8through16
|
|
|
|
copy_1_move_1or2:
|
|
MOVB (R10), R14
|
|
MOVB -1(R10)(CX*1), R15
|
|
MOVB R14, (R9)
|
|
MOVB R15, -1(R9)(CX*1)
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_3:
|
|
MOVW (R10), R14
|
|
MOVB 2(R10), R15
|
|
MOVW R14, (R9)
|
|
MOVB R15, 2(R9)
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_4through7:
|
|
MOVL (R10), R14
|
|
MOVL -4(R10)(CX*1), R15
|
|
MOVL R14, (R9)
|
|
MOVL R15, -4(R9)(CX*1)
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_8through16:
|
|
MOVQ (R10), R14
|
|
MOVQ -8(R10)(CX*1), R15
|
|
MOVQ R14, (R9)
|
|
MOVQ R15, -8(R9)(CX*1)
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
|
|
copy_1_end:
|
|
ADDQ CX, R11
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
// R12 = match offset, R11 = output position (t), 40(SP) = len(hist),
// 56(SP) = window size. Both comparisons are signed (JG).
|
|
check_offset:
|
|
MOVQ R11, CX
|
|
ADDQ 40(SP), CX
|
|
CMPQ R12, CX
|
|
JG error_match_off_too_big
|
|
CMPQ R12, 56(SP)
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ R12, CX
|
|
SUBQ R11, CX
|
|
JLS copy_match
|
|
MOVQ 48(SP), R14
|
|
SUBQ CX, R14
|
|
CMPQ R13, CX
|
|
JG copy_all_from_history
|
|
MOVQ R13, CX
|
|
SUBQ $0x10, CX
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, CX
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(CX*1), R14
|
|
LEAQ 16(R9)(CX*1), R9
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_4_end
|
|
|
|
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), CX
|
|
MOVB 2(R14), R12
|
|
MOVW CX, (R9)
|
|
MOVB R12, 2(R9)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), CX
|
|
MOVL -4(R14)(R13*1), R12
|
|
MOVL CX, (R9)
|
|
MOVL R12, -4(R9)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), CX
|
|
MOVQ -8(R14)(R13*1), R12
|
|
MOVQ CX, (R9)
|
|
MOVQ R12, -8(R9)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
|
|
// The whole match (R13 bytes) was copied from history: advance the output
// position and continue with the next sequence.
copy_4_end:
|
|
ADDQ R13, R11
|
|
JMP handle_loop
|
|
// NOTE(review): unreachable - it follows an unconditional JMP and carries no
// label. Harmless leftover of the generator.
JMP loop_finished
|
|
|
|
copy_all_from_history:
|
|
MOVQ CX, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(R9)(R15*1), R9
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ CX, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ CX, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(CX*1), BP
|
|
MOVB R15, (R9)
|
|
MOVB BP, -1(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (R9)
|
|
MOVB BP, 2(R9)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(CX*1), BP
|
|
MOVL R15, (R9)
|
|
MOVL BP, -4(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(CX*1), BP
|
|
MOVQ R15, (R9)
|
|
MOVQ BP, -8(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
|
|
copy_5_end:
|
|
ADDQ CX, R11
|
|
SUBQ CX, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
MOVQ R9, CX
|
|
SUBQ R12, CX
|
|
|
|
// ml <= mo
|
|
CMPQ R13, R12
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, R11
|
|
MOVQ R13, R12
|
|
SUBQ $0x10, R12
|
|
JB copy_2_small
|
|
|
|
copy_2_loop:
|
|
MOVUPS (CX), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, CX
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, R12
|
|
JAE copy_2_loop
|
|
LEAQ 16(CX)(R12*1), CX
|
|
LEAQ 16(R9)(R12*1), R9
|
|
MOVUPS -16(CX), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_2_end
|
|
|
|
copy_2_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_2_move_3
|
|
JB copy_2_move_1or2
|
|
CMPQ R13, $0x08
|
|
JB copy_2_move_4through7
|
|
JMP copy_2_move_8through16
|
|
|
|
copy_2_move_1or2:
|
|
MOVB (CX), R12
|
|
MOVB -1(CX)(R13*1), R14
|
|
MOVB R12, (R9)
|
|
MOVB R14, -1(R9)(R13*1)
|
|
ADDQ R13, CX
|
|
ADDQ R13, R9
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_3:
|
|
MOVW (CX), R12
|
|
MOVB 2(CX), R14
|
|
MOVW R12, (R9)
|
|
MOVB R14, 2(R9)
|
|
ADDQ R13, CX
|
|
ADDQ R13, R9
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_4through7:
|
|
MOVL (CX), R12
|
|
MOVL -4(CX)(R13*1), R14
|
|
MOVL R12, (R9)
|
|
MOVL R14, -4(R9)(R13*1)
|
|
ADDQ R13, CX
|
|
ADDQ R13, R9
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_8through16:
|
|
MOVQ (CX), R12
|
|
MOVQ -8(CX)(R13*1), R14
|
|
MOVQ R12, (R9)
|
|
MOVQ R14, -8(R9)(R13*1)
|
|
ADDQ R13, CX
|
|
ADDQ R13, R9
|
|
|
|
copy_2_end:
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
// Reached when match length (R13) > match offset, so source (CX = write pointer
// - offset) and destination (R9) ranges overlap. The copy goes one byte at a
// time, front to back, so bytes written by earlier iterations are read again
// by later ones. R13 >= 1 here, so the DECQ/JNZ loop terminates.
|
|
copy_overlapping_match:
|
|
// Account for the match in the output position up front.
ADDQ R13, R11
|
|
|
|
copy_slow_3:
|
|
MOVB (CX), R12
|
|
MOVB R12, (R9)
|
|
INCQ CX
|
|
INCQ R9
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
// End of one sequence: decrement the counter at 96(ctx) and run the main loop
// again while the result is still non-negative.
handle_loop:
|
|
MOVQ ctx+16(FP), CX
|
|
DECQ 96(CX)
|
|
JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
|
|
|
|
// All sequences done: write the bit reader state held in registers back to br
// (bit buffer, consumed-bit count as a single byte, input offset).
loop_finished:
|
|
MOVQ br+8(FP), CX
|
|
MOVQ AX, 32(CX)
|
|
MOVB DL, 40(CX)
|
|
MOVQ BX, 24(CX)
|
|
|
|
// Update the context
|
|
// 136(ctx) = output position; 168(ctx) = literals read pointer minus the
// literals base pointer at 144(ctx), i.e. the number of literal bytes consumed.
MOVQ ctx+16(FP), AX
|
|
MOVQ R11, 136(AX)
|
|
MOVQ 144(AX), CX
|
|
SUBQ CX, R10
|
|
MOVQ R10, 168(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Error exits. Each one records the offending values in ctx (208 = literal
// length, 216 = match length, 224 = match offset) and returns a distinct code.
// None of them writes the bit reader registers back to br; only codes 3 and 5
// also store the output position to 136(ctx).

// Return with match length error
// Code 1: match length is non-zero while the match offset is zero.
|
|
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
|
|
MOVQ 16(SP), AX
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ AX, 216(CX)
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
// Code 2: match length exceeded the limit checked in the main loop.
|
|
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
// Code 3: also saves the output position (R11), since literals may already
// have been copied for this sequence.
|
|
error_match_off_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ CX, 224(AX)
|
|
MOVQ R11, 136(AX)
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
// Code 4: reports the requested literal length.
|
|
error_not_enough_literals:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough output space error
// Code 5: reports literal length, match length and the output position (R11).
|
|
error_not_enough_space:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ R11, 136(AX)
|
|
MOVQ $0x00000005, ret+24(FP)
|
|
RET
|