// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.

//go:build !appengine && !noasm && gc && !noasm

// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int |
|
// Requires: CMOV |
|
TEXT ·sequenceDecs_decode_amd64(SB), $8-32 |
|
MOVQ br+8(FP), CX |
|
MOVQ 24(CX), DX |
|
MOVBQZX 40(CX), BX |
|
MOVQ (CX), AX |
|
MOVQ 32(CX), SI |
|
ADDQ SI, AX |
|
MOVQ AX, (SP) |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 72(AX), DI |
|
MOVQ 80(AX), R8 |
|
MOVQ 88(AX), R9 |
|
MOVQ 104(AX), R10 |
|
MOVQ s+0(FP), AX |
|
MOVQ 144(AX), R11 |
|
MOVQ 152(AX), R12 |
|
MOVQ 160(AX), R13 |
|
|
|
sequenceDecs_decode_amd64_main_loop: |
|
MOVQ (SP), R14 |
|
|
|
// Fill bitreader to have enough for the offset and match length. |
|
CMPQ SI, $0x08 |
|
JL sequenceDecs_decode_amd64_fill_byte_by_byte |
|
MOVQ BX, AX |
|
SHRQ $0x03, AX |
|
SUBQ AX, R14 |
|
MOVQ (R14), DX |
|
SUBQ AX, SI |
|
ANDQ $0x07, BX |
|
JMP sequenceDecs_decode_amd64_fill_end |
|
|
|
sequenceDecs_decode_amd64_fill_byte_by_byte: |
|
CMPQ SI, $0x00 |
|
JLE sequenceDecs_decode_amd64_fill_check_overread |
|
CMPQ BX, $0x07 |
|
JLE sequenceDecs_decode_amd64_fill_end |
|
SHLQ $0x08, DX |
|
SUBQ $0x01, R14 |
|
SUBQ $0x01, SI |
|
SUBQ $0x08, BX |
|
MOVBQZX (R14), AX |
|
ORQ AX, DX |
|
JMP sequenceDecs_decode_amd64_fill_byte_by_byte |
|
|
|
sequenceDecs_decode_amd64_fill_check_overread: |
|
CMPQ BX, $0x40 |
|
JA error_overread |
|
|
|
sequenceDecs_decode_amd64_fill_end: |
|
// Update offset |
|
MOVQ R9, AX |
|
MOVQ BX, CX |
|
MOVQ DX, R15 |
|
SHLQ CL, R15 |
|
MOVB AH, CL |
|
SHRQ $0x20, AX |
|
TESTQ CX, CX |
|
JZ sequenceDecs_decode_amd64_of_update_zero |
|
ADDQ CX, BX |
|
CMPQ BX, $0x40 |
|
JA sequenceDecs_decode_amd64_of_update_zero |
|
CMPQ CX, $0x40 |
|
JAE sequenceDecs_decode_amd64_of_update_zero |
|
NEGQ CX |
|
SHRQ CL, R15 |
|
ADDQ R15, AX |
|
|
|
sequenceDecs_decode_amd64_of_update_zero: |
|
MOVQ AX, 16(R10) |
|
|
|
// Update match length |
|
MOVQ R8, AX |
|
MOVQ BX, CX |
|
MOVQ DX, R15 |
|
SHLQ CL, R15 |
|
MOVB AH, CL |
|
SHRQ $0x20, AX |
|
TESTQ CX, CX |
|
JZ sequenceDecs_decode_amd64_ml_update_zero |
|
ADDQ CX, BX |
|
CMPQ BX, $0x40 |
|
JA sequenceDecs_decode_amd64_ml_update_zero |
|
CMPQ CX, $0x40 |
|
JAE sequenceDecs_decode_amd64_ml_update_zero |
|
NEGQ CX |
|
SHRQ CL, R15 |
|
ADDQ R15, AX |
|
|
|
sequenceDecs_decode_amd64_ml_update_zero: |
|
MOVQ AX, 8(R10) |
|
|
|
// Fill bitreader to have enough for the remaining |
|
CMPQ SI, $0x08 |
|
JL sequenceDecs_decode_amd64_fill_2_byte_by_byte |
|
MOVQ BX, AX |
|
SHRQ $0x03, AX |
|
SUBQ AX, R14 |
|
MOVQ (R14), DX |
|
SUBQ AX, SI |
|
ANDQ $0x07, BX |
|
JMP sequenceDecs_decode_amd64_fill_2_end |
|
|
|
sequenceDecs_decode_amd64_fill_2_byte_by_byte: |
|
CMPQ SI, $0x00 |
|
JLE sequenceDecs_decode_amd64_fill_2_check_overread |
|
CMPQ BX, $0x07 |
|
JLE sequenceDecs_decode_amd64_fill_2_end |
|
SHLQ $0x08, DX |
|
SUBQ $0x01, R14 |
|
SUBQ $0x01, SI |
|
SUBQ $0x08, BX |
|
MOVBQZX (R14), AX |
|
ORQ AX, DX |
|
JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte |
|
|
|
sequenceDecs_decode_amd64_fill_2_check_overread: |
|
CMPQ BX, $0x40 |
|
JA error_overread |
|
|
|
sequenceDecs_decode_amd64_fill_2_end: |
|
// Update literal length |
|
MOVQ DI, AX |
|
MOVQ BX, CX |
|
MOVQ DX, R15 |
|
SHLQ CL, R15 |
|
MOVB AH, CL |
|
SHRQ $0x20, AX |
|
TESTQ CX, CX |
|
JZ sequenceDecs_decode_amd64_ll_update_zero |
|
ADDQ CX, BX |
|
CMPQ BX, $0x40 |
|
JA sequenceDecs_decode_amd64_ll_update_zero |
|
CMPQ CX, $0x40 |
|
JAE sequenceDecs_decode_amd64_ll_update_zero |
|
NEGQ CX |
|
SHRQ CL, R15 |
|
ADDQ R15, AX |
|
|
|
sequenceDecs_decode_amd64_ll_update_zero: |
|
MOVQ AX, (R10) |
|
|
|
// Fill bitreader for state updates |
|
MOVQ R14, (SP) |
|
MOVQ R9, AX |
|
SHRQ $0x08, AX |
|
MOVBQZX AL, AX |
|
MOVQ ctx+16(FP), CX |
|
CMPQ 96(CX), $0x00 |
|
JZ sequenceDecs_decode_amd64_skip_update |
|
|
|
// Update Literal Length State |
|
MOVBQZX DI, R14 |
|
SHRL $0x10, DI |
|
LEAQ (BX)(R14*1), CX |
|
MOVQ DX, R15 |
|
MOVQ CX, BX |
|
ROLQ CL, R15 |
|
MOVL $0x00000001, BP |
|
MOVB R14, CL |
|
SHLL CL, BP |
|
DECL BP |
|
ANDQ BP, R15 |
|
ADDQ R15, DI |
|
|
|
// Load ctx.llTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ (CX), CX |
|
MOVQ (CX)(DI*8), DI |
|
|
|
// Update Match Length State |
|
MOVBQZX R8, R14 |
|
SHRL $0x10, R8 |
|
LEAQ (BX)(R14*1), CX |
|
MOVQ DX, R15 |
|
MOVQ CX, BX |
|
ROLQ CL, R15 |
|
MOVL $0x00000001, BP |
|
MOVB R14, CL |
|
SHLL CL, BP |
|
DECL BP |
|
ANDQ BP, R15 |
|
ADDQ R15, R8 |
|
|
|
// Load ctx.mlTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ 24(CX), CX |
|
MOVQ (CX)(R8*8), R8 |
|
|
|
// Update Offset State |
|
MOVBQZX R9, R14 |
|
SHRL $0x10, R9 |
|
LEAQ (BX)(R14*1), CX |
|
MOVQ DX, R15 |
|
MOVQ CX, BX |
|
ROLQ CL, R15 |
|
MOVL $0x00000001, BP |
|
MOVB R14, CL |
|
SHLL CL, BP |
|
DECL BP |
|
ANDQ BP, R15 |
|
ADDQ R15, R9 |
|
|
|
// Load ctx.ofTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ 48(CX), CX |
|
MOVQ (CX)(R9*8), R9 |
|
|
|
sequenceDecs_decode_amd64_skip_update: |
|
// Adjust offset |
|
MOVQ 16(R10), CX |
|
CMPQ AX, $0x01 |
|
JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 |
|
MOVQ R12, R13 |
|
MOVQ R11, R12 |
|
MOVQ CX, R11 |
|
JMP sequenceDecs_decode_amd64_after_adjust |
|
|
|
sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: |
|
CMPQ (R10), $0x00000000 |
|
JNE sequenceDecs_decode_amd64_adjust_offset_maybezero |
|
INCQ CX |
|
JMP sequenceDecs_decode_amd64_adjust_offset_nonzero |
|
|
|
sequenceDecs_decode_amd64_adjust_offset_maybezero: |
|
TESTQ CX, CX |
|
JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero |
|
MOVQ R11, CX |
|
JMP sequenceDecs_decode_amd64_after_adjust |
|
|
|
sequenceDecs_decode_amd64_adjust_offset_nonzero: |
|
CMPQ CX, $0x01 |
|
JB sequenceDecs_decode_amd64_adjust_zero |
|
JEQ sequenceDecs_decode_amd64_adjust_one |
|
CMPQ CX, $0x02 |
|
JA sequenceDecs_decode_amd64_adjust_three |
|
JMP sequenceDecs_decode_amd64_adjust_two |
|
|
|
sequenceDecs_decode_amd64_adjust_zero: |
|
MOVQ R11, AX |
|
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid |
|
|
|
sequenceDecs_decode_amd64_adjust_one: |
|
MOVQ R12, AX |
|
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid |
|
|
|
sequenceDecs_decode_amd64_adjust_two: |
|
MOVQ R13, AX |
|
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid |
|
|
|
sequenceDecs_decode_amd64_adjust_three: |
|
LEAQ -1(R11), AX |
|
|
|
sequenceDecs_decode_amd64_adjust_test_temp_valid: |
|
TESTQ AX, AX |
|
JNZ sequenceDecs_decode_amd64_adjust_temp_valid |
|
MOVQ $0x00000001, AX |
|
|
|
sequenceDecs_decode_amd64_adjust_temp_valid: |
|
CMPQ CX, $0x01 |
|
CMOVQNE R12, R13 |
|
MOVQ R11, R12 |
|
MOVQ AX, R11 |
|
MOVQ AX, CX |
|
|
|
sequenceDecs_decode_amd64_after_adjust: |
|
MOVQ CX, 16(R10) |
|
|
|
// Check values |
|
MOVQ 8(R10), AX |
|
MOVQ (R10), R14 |
|
LEAQ (AX)(R14*1), R15 |
|
MOVQ s+0(FP), BP |
|
ADDQ R15, 256(BP) |
|
MOVQ ctx+16(FP), R15 |
|
SUBQ R14, 128(R15) |
|
JS error_not_enough_literals |
|
CMPQ AX, $0x00020002 |
|
JA sequenceDecs_decode_amd64_error_match_len_too_big |
|
TESTQ CX, CX |
|
JNZ sequenceDecs_decode_amd64_match_len_ofs_ok |
|
TESTQ AX, AX |
|
JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch |
|
|
|
sequenceDecs_decode_amd64_match_len_ofs_ok: |
|
ADDQ $0x18, R10 |
|
MOVQ ctx+16(FP), AX |
|
DECQ 96(AX) |
|
JNS sequenceDecs_decode_amd64_main_loop |
|
MOVQ s+0(FP), AX |
|
MOVQ R11, 144(AX) |
|
MOVQ R12, 152(AX) |
|
MOVQ R13, 160(AX) |
|
MOVQ br+8(FP), AX |
|
MOVQ DX, 24(AX) |
|
MOVB BL, 40(AX) |
|
MOVQ SI, 32(AX) |
|
|
|
// Return success |
|
MOVQ $0x00000000, ret+24(FP) |
|
RET |
|
|
|
// Return with match length error |
|
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch: |
|
MOVQ $0x00000001, ret+24(FP) |
|
RET |
|
|
|
// Return with match too long error |
|
sequenceDecs_decode_amd64_error_match_len_too_big: |
|
MOVQ $0x00000002, ret+24(FP) |
|
RET |
|
|
|
// Return with match offset too long error |
|
MOVQ $0x00000003, ret+24(FP) |
|
RET |
|
|
|
// Return with not enough literals error |
|
error_not_enough_literals: |
|
MOVQ $0x00000004, ret+24(FP) |
|
RET |
|
|
|
// Return with overread error |
|
error_overread: |
|
MOVQ $0x00000006, ret+24(FP) |
|
RET |
|
|
|
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Variant of sequenceDecs_decode_amd64 for sequences that need at most 56 bits
// per triple: of/ml/ll are all read after a single bitreader refill (no second
// fill between match length and literal length). Register roles match the
// base variant: DX bit buffer, BX bit count, SI bytes left, DI/R8/R9 states,
// R10 output record pointer, R11/R12/R13 offset history.
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
	MOVQ    br+8(FP), CX
	MOVQ    24(CX), DX
	MOVBQZX 40(CX), BX
	MOVQ    (CX), AX
	MOVQ    32(CX), SI
	ADDQ    SI, AX
	MOVQ    AX, (SP)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	MOVQ    104(AX), R10
	MOVQ    s+0(FP), AX
	MOVQ    144(AX), R11
	MOVQ    152(AX), R12
	MOVQ    160(AX), R13

sequenceDecs_decode_56_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decode_56_amd64_fill_end

sequenceDecs_decode_56_amd64_fill_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decode_56_amd64_fill_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decode_56_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R14
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R14), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte

sequenceDecs_decode_56_amd64_fill_check_overread:
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decode_56_amd64_fill_end:
	// Update offset
	MOVQ  R9, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decode_56_amd64_of_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_56_amd64_of_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_56_amd64_of_update_zero
	NEGQ  CX
	SHRQ  CL, R15
	ADDQ  R15, AX

sequenceDecs_decode_56_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	MOVQ  R8, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_56_amd64_ml_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
	NEGQ  CX
	SHRQ  CL, R15
	ADDQ  R15, AX

sequenceDecs_decode_56_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Update literal length
	MOVQ  DI, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_56_amd64_ll_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
	NEGQ  CX
	SHRQ  CL, R15
	ADDQ  R15, AX

sequenceDecs_decode_56_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	MOVQ    R14, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decode_56_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R14
	SHRL    $0x10, DI
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP
	ANDQ    BP, R15
	ADDQ    R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRL    $0x10, R8
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP
	ANDQ    BP, R15
	ADDQ    R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRL    $0x10, R9
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP
	ANDQ    BP, R15
	ADDQ    R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_56_amd64_skip_update:
	// Adjust offset
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP  sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
	CMPQ (R10), $0x00000000
	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero

sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
	MOVQ  R11, CX
	JMP   sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_56_amd64_adjust_zero
	JEQ  sequenceDecs_decode_56_amd64_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_56_amd64_adjust_three
	JMP  sequenceDecs_decode_56_amd64_adjust_two

sequenceDecs_decode_56_amd64_adjust_zero:
	MOVQ R11, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_one:
	MOVQ R12, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_two:
	MOVQ R13, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
	TESTQ AX, AX
	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
	MOVQ  $0x00000001, AX

sequenceDecs_decode_56_amd64_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R12, R13
	MOVQ    R11, R12
	MOVQ    AX, R11
	MOVQ    AX, CX

sequenceDecs_decode_56_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	MOVQ  8(R10), AX
	MOVQ  (R10), R14
	LEAQ  (AX)(R14*1), R15
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_56_amd64_match_len_ofs_ok:
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decode_56_amd64_main_loop
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 32(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): unreachable in this function (no label); emitted by the
	// generator for error-code parity with other variants.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
|
|
|
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI2 variant of sequenceDecs_decode_amd64: bit extraction is done with
// BEXTR/BZHI/SHRX instead of shift/mask sequences. Register map differs from
// the base variant: AX = bit buffer, DX = bit count, BX = bytes remaining,
// SI/DI/R8 = ll/ml/of states, R9 = output record pointer, R10/R11/R12 =
// offset history. Returns 0 on success or a nonzero error code.
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
	MOVQ    br+8(FP), BX
	MOVQ    24(BX), AX
	MOVBQZX 40(BX), DX
	MOVQ    (BX), CX
	MOVQ    32(BX), BX
	ADDQ    BX, CX
	MOVQ    CX, (SP)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	MOVQ    104(CX), R9
	MOVQ    s+0(FP), CX
	MOVQ    144(CX), R10
	MOVQ    152(CX), R11
	MOVQ    160(CX), R12

sequenceDecs_decode_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_bmi2_fill_end

sequenceDecs_decode_bmi2_fill_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_bmi2_fill_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte

sequenceDecs_decode_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decode_bmi2_fill_end:
	// Update offset
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 16(R9)

	// Update match length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 8(R9)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_bmi2_fill_2_end

sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_bmi2_fill_2_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_bmi2_fill_2_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte

sequenceDecs_decode_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decode_bmi2_fill_2_end:
	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, (R9)

	// Fill bitreader for state updates
	MOVQ   R13, (SP)
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ   ctx+16(FP), CX
	CMPQ   96(CX), $0x00
	JZ     sequenceDecs_decode_bmi2_skip_update
	LEAQ   (SI)(DI*1), R14
	ADDQ   R8, R14
	MOVBQZX R14, R14
	LEAQ   (DX)(R14*1), CX
	MOVQ   AX, R15
	MOVQ   CX, DX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15

	// Update Offset State
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	SHRL  $0x10, R8
	ADDQ  CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	SHRL  $0x10, DI
	ADDQ  CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	SHRL  $0x10, SI
	ADDQ  CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_bmi2_skip_update:
	// Adjust offset
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP  sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
	CMPQ (R9), $0x00000000
	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero

sequenceDecs_decode_bmi2_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
	MOVQ  R10, CX
	JMP   sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_bmi2_adjust_zero
	JEQ  sequenceDecs_decode_bmi2_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_bmi2_adjust_three
	JMP  sequenceDecs_decode_bmi2_adjust_two

sequenceDecs_decode_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_one:
	MOVQ R11, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_two:
	MOVQ R12, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_bmi2_adjust_test_temp_valid:
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
	MOVQ  $0x00000001, R13

sequenceDecs_decode_bmi2_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R11, R12
	MOVQ    R10, R11
	MOVQ    R13, R10
	MOVQ    R13, CX

sequenceDecs_decode_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	MOVQ  8(R9), R13
	MOVQ  (R9), R14
	LEAQ  (R13)(R14*1), R15
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  R13, $0x00020002
	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_bmi2_match_len_ofs_ok:
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decode_bmi2_main_loop
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 32(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): unreachable in this function (no label); emitted by the
	// generator for error-code parity with other variants.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
|
|
|
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI2 variant for sequences needing at most 56 bits per triple: of/ml/ll are
// all read after a single bitreader refill (no second fill). Register map as
// in sequenceDecs_decode_bmi2: AX bit buffer, DX bit count, BX bytes left,
// SI/DI/R8 = ll/ml/of states, R9 output pointer, R10/R11/R12 offset history.
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
	MOVQ    br+8(FP), BX
	MOVQ    24(BX), AX
	MOVBQZX 40(BX), DX
	MOVQ    (BX), CX
	MOVQ    32(BX), BX
	ADDQ    BX, CX
	MOVQ    CX, (SP)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	MOVQ    104(CX), R9
	MOVQ    s+0(FP), CX
	MOVQ    144(CX), R10
	MOVQ    152(CX), R11
	MOVQ    160(CX), R12

sequenceDecs_decode_56_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_56_bmi2_fill_end

sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_56_bmi2_fill_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_56_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte

sequenceDecs_decode_56_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decode_56_bmi2_fill_end:
	// Update offset
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 16(R9)

	// Update match length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 8(R9)

	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, (R9)

	// Fill bitreader for state updates
	MOVQ   R13, (SP)
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ   ctx+16(FP), CX
	CMPQ   96(CX), $0x00
	JZ     sequenceDecs_decode_56_bmi2_skip_update
	LEAQ   (SI)(DI*1), R14
	ADDQ   R8, R14
	MOVBQZX R14, R14
	LEAQ   (DX)(R14*1), CX
	MOVQ   AX, R15
	MOVQ   CX, DX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15

	// Update Offset State
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	SHRL  $0x10, R8
	ADDQ  CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	SHRL  $0x10, DI
	ADDQ  CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	SHRL  $0x10, SI
	ADDQ  CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_56_bmi2_skip_update:
	// Adjust offset
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP  sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
	CMPQ (R9), $0x00000000
	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero

sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
	MOVQ  R10, CX
	JMP   sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_56_bmi2_adjust_zero
	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_56_bmi2_adjust_three
	JMP  sequenceDecs_decode_56_bmi2_adjust_two

sequenceDecs_decode_56_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_one:
	MOVQ R11, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_two:
	MOVQ R12, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
	MOVQ  $0x00000001, R13

sequenceDecs_decode_56_bmi2_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R11, R12
	MOVQ    R10, R11
	MOVQ    R13, R10
	MOVQ    R13, CX

sequenceDecs_decode_56_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	MOVQ  8(R9), R13
	MOVQ  (R9), R14
	LEAQ  (R13)(R14*1), R15
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  R13, $0x00020002
	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decode_56_bmi2_main_loop
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 32(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): unreachable in this function (no label); emitted by the
	// generator for error-code parity with other variants.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
|
|
|
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes decoded sequences: for each 24-byte (ll, ml, mo) record it copies
// ll literal bytes and then ml match bytes (from history or from the output
// buffer, with a byte-by-byte path for overlapping matches). Returns true on
// success, false when a match offset exceeds the valid window; in both cases
// the sequence index, output position and literal position are written back
// to ctx. Register map: AX seq record ptr, CX seq count, DX seq index,
// BX out ptr, SI literals ptr, DI out position, R8 window size, R9 end of
// history, R10 history size.
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
	MOVQ  ctx+0(FP), R10
	MOVQ  8(R10), CX
	TESTQ CX, CX
	JZ    empty_seqs
	MOVQ  (R10), AX
	MOVQ  24(R10), DX
	MOVQ  32(R10), BX
	MOVQ  80(R10), SI
	MOVQ  104(R10), DI
	MOVQ  120(R10), R8
	MOVQ  56(R10), R9
	MOVQ  64(R10), R10
	ADDQ  R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	MOVUPS (SI)(R14*1), X0
	MOVUPS X0, (BX)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, R11
	JB     copy_1
	ADDQ   R11, SI
	ADDQ   R11, BX
	ADDQ   R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG   error_match_off_too_big
	CMPQ R12, R8
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ R12, R11
	SUBQ DI, R11
	JLS  copy_match
	MOVQ R9, R14
	SUBQ R11, R14
	CMPQ R13, R11
	JG   copy_all_from_history
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R11
	JAE    copy_4_loop
	LEAQ   16(R14)(R11*1), R14
	LEAQ   16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop
	JMP  loop_finished

copy_all_from_history:
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ R11, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	MOVQ BX, R11
	SUBQ R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, DI
	MOVQ BX, R12
	ADDQ R13, BX

copy_2:
	MOVUPS (R11), X0
	MOVUPS X0, (R12)
	ADDQ   $0x10, R11
	ADDQ   $0x10, R12
	SUBQ   $0x10, R13
	JHI    copy_2
	JMP    handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, DI

copy_slow_3:
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET
|
|
|
// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes already-decoded sequences: for each 24-byte sequence entry it
// copies the literal run and then the match (taken from the history buffer
// and/or earlier output) into the output buffer.  This is the "safe"
// variant: every copy writes exactly the requested number of bytes (the
// small-copy ladders below are width-exact), so it never stores past the
// end of the destination.
//
// Register allocation established by the prologue:
//   AX  = pointer to current sequence entry (advances by 24 per iteration)
//   CX  = number of sequences                     (ctx+8)
//   DX  = current sequence index                  (ctx+24)
//   BX  = output write pointer                    (outBase + outPosition)
//   SI  = literals read pointer                   (ctx+80)
//   DI  = current output position
//   R8  = window size limit for offsets           (ctx+120)
//   R9  = past-end pointer of the history buffer  (histBase + histLen)
//   R10 = history length                          (ctx+64)
//   R11/R12/R13 = litLen / matchOff / matchLen of the current sequence
//   R14, R15, BP = copy scratch
// NOTE(review): the ctx field names above are presumed from the offsets and
// the Go-side executeAsmContext layout — confirm against seqdec_amd64.go.
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
	MOVQ  ctx+0(FP), R10
	MOVQ  8(R10), CX                // CX = number of sequences
	TESTQ CX, CX
	JZ    empty_seqs                // nothing to do: report success
	MOVQ  (R10), AX                 // AX = seqs base
	MOVQ  24(R10), DX               // DX = starting sequence index
	MOVQ  32(R10), BX               // BX = output base
	MOVQ  80(R10), SI               // SI = literals pointer
	MOVQ  104(R10), DI              // DI = output position
	MOVQ  120(R10), R8              // R8 = window size
	MOVQ  56(R10), R9               // R9 = history base
	MOVQ  64(R10), R10              // R10 = history length
	ADDQ  R10, R9                   // R9 = past-end of history

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	// Load the current sequence: litLen, matchOff, matchLen.
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ    check_offset
	MOVQ  R11, R14
	SUBQ  $0x10, R14
	JB    copy_1_small              // fewer than 16 bytes: exact-width path

copy_1_loop:
	// 16 bytes at a time; the tail is handled with a final (possibly
	// overlapping with the previous store, but in-bounds) 16-byte copy.
	MOVUPS (SI), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, SI
	ADDQ   $0x10, BX
	SUBQ   $0x10, R14
	JAE    copy_1_loop
	LEAQ   16(SI)(R14*1), SI
	LEAQ   16(BX)(R14*1), BX
	MOVUPS -16(SI), X0
	MOVUPS X0, -16(BX)
	JMP    copy_1_end

copy_1_small:
	// Dispatch on exact length; each path writes exactly R11 bytes.
	CMPQ R11, $0x03
	JE   copy_1_move_3
	JB   copy_1_move_1or2
	CMPQ R11, $0x08
	JB   copy_1_move_4through7
	JMP  copy_1_move_8through16

copy_1_move_1or2:
	// Head byte + tail byte; identical for length 1 and 2.
	MOVB (SI), R14
	MOVB -1(SI)(R11*1), R15
	MOVB R14, (BX)
	MOVB R15, -1(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_3:
	MOVW (SI), R14
	MOVB 2(SI), R15
	MOVW R14, (BX)
	MOVB R15, 2(BX)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_4through7:
	// Overlapping head/tail dword pair covers 4..7 bytes exactly.
	MOVL (SI), R14
	MOVL -4(SI)(R11*1), R15
	MOVL R14, (BX)
	MOVL R15, -4(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_8through16:
	// Overlapping head/tail qword pair covers 8..16 bytes exactly.
	MOVQ (SI), R14
	MOVQ -8(SI)(R11*1), R15
	MOVQ R14, (BX)
	MOVQ R15, -8(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX

copy_1_end:
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	LEAQ (DI)(R10*1), R11           // R11 = output position + history length
	CMPQ R12, R11
	JG   error_match_off_too_big
	CMPQ R12, R8                    // offset may not exceed the window size
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ R12, R11
	SUBQ DI, R11                    // R11 = bytes of the match living in history
	JLS  copy_match                 // <= 0: match comes entirely from output
	MOVQ R9, R14
	SUBQ R11, R14                   // R14 = source pointer inside history
	CMPQ R13, R11
	JG   copy_all_from_history      // match spans history AND current output
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R11
	JAE    copy_4_loop
	LEAQ   16(R14)(R11*1), R14
	LEAQ   16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_4_end

copy_4_small:
	// No 1-or-2 case needed here: ml < 3 cannot reach this path.
	// NOTE(review): presumed guaranteed by the decoder (zstd minimum match
	// length is 3) — confirm.
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	// Whole match came from history: advance to the next sequence inline.
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop
	JMP  loop_finished

copy_all_from_history:
	// Copy the history-resident prefix (R11 bytes), then fall through to
	// copy_match for the remainder that lives in the current output.
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ R11, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	ADDQ R11, DI
	SUBQ R11, R13                   // remaining match length to copy from output

	// Copy match from the current buffer
copy_match:
	MOVQ BX, R11
	SUBQ R12, R11                   // R11 = match source = write ptr - offset

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match     // source and destination ranges overlap

	// Copy non-overlapping match (exact-width, like the literal copy above)
	ADDQ R13, DI
	MOVQ R13, R12
	SUBQ $0x10, R12
	JB   copy_2_small

copy_2_loop:
	MOVUPS (R11), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R11
	ADDQ   $0x10, BX
	SUBQ   $0x10, R12
	JAE    copy_2_loop
	LEAQ   16(R11)(R12*1), R11
	LEAQ   16(BX)(R12*1), BX
	MOVUPS -16(R11), X0
	MOVUPS X0, -16(BX)
	JMP    copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE   copy_2_move_3
	JB   copy_2_move_1or2
	CMPQ R13, $0x08
	JB   copy_2_move_4through7
	JMP  copy_2_move_8through16

copy_2_move_1or2:
	MOVB (R11), R12
	MOVB -1(R11)(R13*1), R14
	MOVB R12, (BX)
	MOVB R14, -1(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_3:
	MOVW (R11), R12
	MOVB 2(R11), R14
	MOVW R12, (BX)
	MOVB R14, 2(BX)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_4through7:
	MOVL (R11), R12
	MOVL -4(R11)(R13*1), R14
	MOVL R12, (BX)
	MOVL R14, -4(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_8through16:
	MOVQ (R11), R12
	MOVQ -8(R11)(R13*1), R14
	MOVQ R12, (BX)
	MOVQ R14, -8(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// Byte-by-byte: required for correctness when offset < length
	// (each output byte may be a source byte for later ones).
	ADDQ R13, DI

copy_slow_3:
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	// Advance to the next sequence.
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context: sequence index, output position, literals consumed.
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value: false — caller reports the offset error.
	MOVB $0x00, ret+8(FP)

	// Update the context (same bookkeeping as the success path).
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// No sequences to execute: trivially successful.
	MOVB $0x01, ret+8(FP)
	RET
|
|
|
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
//
// Fused decode+execute loop: reads one sequence (offset, match length,
// literal length) from the FSE bitstream, applies the repeat-offset rules,
// validates it, and immediately copies the literals and the match into
// s.out.  Returns 0 on success or a nonzero error code (see the labelled
// error exits at the bottom).
//
// Register / stack-slot allocation established by the prologue:
//   DX  = bit buffer (br.value)       BX = bits already consumed (br.bitsRead)
//   SI  = input bytes remaining       (SP) = input read pointer
//   DI / R8 / R9 = literal-length / match-length / offset FSE state
//   R10 = output write pointer        R11 = literals read pointer
//   R12 = output position
//   8(SP)/16(SP)/24(SP) = decoded mo / ml / ll of the current sequence
//   32(SP) = past-end pointer of s.out (capacity limit)
//   40(SP) = history length           48(SP) = past-end pointer of history
//   56(SP) = window size
// NOTE(review): state words appear packed as: bits 0-7 = FSE state bit
// count, bits 8-15 = extra value bits, bits 16-31 = next-state baseline,
// bits 32-63 = value baseline — presumed from usage; confirm against the
// table builder.
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
	MOVQ    br+8(FP), CX
	MOVQ    24(CX), DX
	MOVBQZX 40(CX), BX
	MOVQ    (CX), AX
	MOVQ    32(CX), SI
	ADDQ    SI, AX
	MOVQ    AX, (SP)                // read pointer = base + off (reads go backwards)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	XORQ    CX, CX
	MOVQ    CX, 8(SP)
	MOVQ    CX, 16(SP)
	MOVQ    CX, 24(SP)
	MOVQ    112(AX), R10
	MOVQ    128(AX), CX
	MOVQ    CX, 32(SP)
	MOVQ    144(AX), R11
	MOVQ    136(AX), R12
	MOVQ    200(AX), CX
	MOVQ    CX, 56(SP)
	MOVQ    176(AX), CX
	MOVQ    CX, 48(SP)
	MOVQ    184(AX), AX
	MOVQ    AX, 40(SP)
	MOVQ    40(SP), AX
	ADDQ    AX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)

	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_amd64_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
	// Fast path: step the pointer back by whole consumed bytes and reload
	// a full 8-byte window.
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_amd64_fill_end

sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
	// Near the start of the buffer: refill one byte at a time.
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_check_overread:
	// Input exhausted: consuming more than 64 bits means the stream lied.
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_amd64_fill_end:
	// Update offset: value = baseline(high 32 bits of state) + next
	// "extra bits" (count in byte 1 of the state) read from the stream.
	MOVQ  R9, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14                   // discard already-consumed bits
	MOVB  AH, CL                    // CL = number of extra bits
	SHRQ  $0x20, AX                 // AX = value baseline
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_amd64_of_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
	NEGQ  CX                        // shift count is taken mod 64:
	SHRQ  CL, R14                   // keeps the top CX bits of R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_amd64_of_update_zero:
	MOVQ AX, 8(SP)

	// Update match length (same extraction pattern as the offset above).
	MOVQ  R8, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
	NEGQ  CX
	SHRQ  CL, R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_amd64_ml_update_zero:
	MOVQ AX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_amd64_fill_2_end

sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_amd64_fill_2_end:
	// Update literal length (same extraction pattern again).
	MOVQ  DI, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
	NEGQ  CX
	SHRQ  CL, R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_amd64_ll_update_zero:
	MOVQ AX, 24(SP)

	// Fill bitreader for state updates
	MOVQ    R13, (SP)
	MOVQ    R9, AX                  // AX = offset state's extra-bit count,
	SHRQ    $0x08, AX               // used later by the repeat-offset logic
	MOVBQZX AL, AX
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00           // last sequence? then skip state updates
	JZ      sequenceDecs_decodeSync_amd64_skip_update

	// Update Literal Length State: next state = baseline(bits 16-31)
	// + nbBits(byte 0) fresh bits from the stream.
	MOVBQZX DI, R13
	SHRL    $0x10, DI
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15                     // R15 = mask of R13 low bits
	ANDQ    R15, R14
	ADDQ    R14, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R13
	SHRL    $0x10, R8
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15
	ANDQ    R15, R14
	ADDQ    R14, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R13
	SHRL    $0x10, R9
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15
	ANDQ    R15, R14
	ADDQ    R14, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_amd64_skip_update:
	// Adjust offset: apply the zstd repeat-offset rules.
	// AX = offset code bit count; > 1 means a literal offset value that
	// simply pushes the previous offsets down the 3-entry history.
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   AX, $0x01
	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0              // shift prevOffset[0..1] -> [1..2]
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_amd64_after_adjust

sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
	// Repeat-offset codes; ll == 0 shifts the meaning of codes by one.
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
	MOVQ  144(CX), R13              // code 0 with ll>0: reuse prevOffset[0]
	JMP   sequenceDecs_decodeSync_amd64_after_adjust

sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
	// temp = prevOffset[R13] for codes 1..2; code 3 means
	// prevOffset[0] - 1 (index forced to 0, minus-one via R14 = -1).
	MOVQ    R13, AX
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, AX
	CMOVQEQ R15, R14
	ADDQ    144(CX)(AX*8), R14
	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
	MOVQ    $0x00000001, R14        // temp == 0 is invalid; clamp to 1

sequenceDecs_decodeSync_amd64_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
	MOVQ 152(CX), AX                // rotate offset history
	MOVQ AX, 160(CX)

sequenceDecs_decodeSync_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_amd64_after_adjust:
	MOVQ R13, 8(SP)

	// Check values: account sizes, literal budget, max match length,
	// and the "zero offset requires zero match length" rule.
	MOVQ  16(SP), AX
	MOVQ  24(SP), CX
	LEAQ  (AX)(CX*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  CX, 104(R14)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
	// AX = ll, CX = mo, R13 = ml for the execute phase below.
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals (fast, rounds up to 16-byte chunks; the space check
	// above guarantees room in the destination).
	TESTQ AX, AX
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	MOVUPS (R11)(R14*1), X0
	MOVUPS X0, (R10)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, AX
	JB     copy_1
	ADDQ   AX, R11
	ADDQ   AX, R10
	ADDQ   AX, R12

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG   error_match_off_too_big
	CMPQ CX, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ CX, AX
	SUBQ R12, AX                    // AX = bytes of the match inside history
	JLS  copy_match                 // <= 0: entirely within current output
	MOVQ 48(SP), R14
	SUBQ AX, R14                    // R14 = source pointer inside history
	CMPQ R13, AX
	JG   copy_all_from_history      // match spans history AND output
	MOVQ R13, AX
	SUBQ $0x10, AX
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R10
	SUBQ   $0x10, AX
	JAE    copy_4_loop
	LEAQ   16(R14)(AX*1), R14
	LEAQ   16(R10)(AX*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), AX
	MOVB 2(R14), CL
	MOVW AX, (R10)
	MOVB CL, 2(R10)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), AX
	MOVL -4(R14)(R13*1), CX
	MOVL AX, (R10)
	MOVL CX, -4(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), AX
	MOVQ -8(R14)(R13*1), CX
	MOVQ AX, (R10)
	MOVQ CX, -8(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10

copy_4_end:
	ADDQ R13, R12
	JMP  handle_loop
	JMP  loop_finished              // unreachable; emitted by the generator

copy_all_from_history:
	// Copy the AX history-resident bytes, then fall through to copy_match
	// for the remainder that lives in the current output buffer.
	MOVQ AX, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R10
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(R10)(R15*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP    copy_5_end

copy_5_small:
	CMPQ AX, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ AX, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(AX*1), BP
	MOVB R15, (R10)
	MOVB BP, -1(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R10)
	MOVB BP, 2(R10)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(AX*1), BP
	MOVL R15, (R10)
	MOVL BP, -4(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(AX*1), BP
	MOVQ R15, (R10)
	MOVQ BP, -8(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10

copy_5_end:
	ADDQ AX, R12
	SUBQ AX, R13                    // remaining match length in output

	// Copy match from the current buffer
copy_match:
	MOVQ R10, AX
	SUBQ CX, AX                     // source = write pointer - offset

	// ml <= mo
	CMPQ R13, CX
	JA   copy_overlapping_match

	// Copy non-overlapping match: 16-byte chunks; may round up past ml,
	// which is safe because the s.out space check already passed.
	ADDQ R13, R12
	MOVQ R10, CX
	ADDQ R13, R10

copy_2:
	MOVUPS (AX), X0
	MOVUPS X0, (CX)
	ADDQ   $0x10, AX
	ADDQ   $0x10, CX
	SUBQ   $0x10, R13
	JHI    copy_2                   // JHI == JA (unsigned above)
	JMP    handle_loop

	// Copy overlapping match: byte-by-byte, required when offset < length.
copy_overlapping_match:
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)                     // one fewer sequence remaining
	JNS  sequenceDecs_decodeSync_amd64_main_loop

loop_finished:
	// Store the bit reader state back.
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 32(AX)

	// Update the context: output position and literals consumed.
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error (nonzero ml with zero offset)
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
|
|
|
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// BMI2 variant of sequenceDecs_decodeSync_amd64: same fused decode+execute
// loop and same return codes, but bit extraction uses BEXTR/BZHI/SHRX
// instead of the SHL/SHR sequences, and all three FSE states are advanced
// from one combined bit fetch.
//
// Register / stack-slot allocation established by the prologue:
//   AX  = bit buffer (br.value)       DX = bits already consumed (br.bitsRead)
//   BX  = input bytes remaining       (SP) = input read pointer
//   SI / DI / R8 = literal-length / match-length / offset FSE state
//   R9  = output write pointer        R10 = literals read pointer
//   R11 = output position
//   8(SP)/16(SP)/24(SP) = decoded mo / ml / ll of the current sequence
//   32(SP) = past-end pointer of s.out (capacity limit)
//   40(SP) = history length           48(SP) = past-end pointer of history
//   56(SP) = window size
// NOTE(review): state words appear packed as: bits 0-7 = FSE state bit
// count, bits 8-15 = extra value bits, bits 16-31 = next-state baseline,
// bits 32-63 = value baseline — presumed from usage; confirm against the
// table builder.
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
	MOVQ    br+8(FP), BX
	MOVQ    24(BX), AX
	MOVBQZX 40(BX), DX
	MOVQ    (BX), CX
	MOVQ    32(BX), BX
	ADDQ    BX, CX
	MOVQ    CX, (SP)                // read pointer = base + off (reads go backwards)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	XORQ    R9, R9
	MOVQ    R9, 8(SP)
	MOVQ    R9, 16(SP)
	MOVQ    R9, 24(SP)
	MOVQ    112(CX), R9
	MOVQ    128(CX), R10
	MOVQ    R10, 32(SP)
	MOVQ    144(CX), R10
	MOVQ    136(CX), R11
	MOVQ    200(CX), R12
	MOVQ    R12, 56(SP)
	MOVQ    176(CX), R12
	MOVQ    R12, 48(SP)
	MOVQ    184(CX), CX
	MOVQ    CX, 40(SP)
	MOVQ    40(SP), CX
	ADDQ    CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_bmi2_main_loop:
	MOVQ (SP), R12

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
	// Fast path: step back by whole consumed bytes, reload 8 bytes.
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decodeSync_bmi2_fill_end

sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_check_overread:
	// Input exhausted: consuming more than 64 bits means the stream lied.
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_bmi2_fill_end:
	// Update offset: BEXTR(0x0808) pulls byte 1 (the extra-bit count),
	// ROL+BZHI extract that many fresh bits, baseline is the high 32 bits.
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 8(SP)

	// Update match length (same extraction pattern).
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end

sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_bmi2_fill_2_end:
	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 24(SP)

	// Fill bitreader for state updates
	MOVQ    R12, (SP)
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R12             // R12 = offset extra-bit count, kept for
	MOVQ    ctx+16(FP), CX          // the repeat-offset logic below
	CMPQ    96(CX), $0x00           // last sequence? skip state updates
	JZ      sequenceDecs_decodeSync_bmi2_skip_update
	// Fetch the bits for all three state transitions in one go:
	// total = llBits + mlBits + ofBits (each in the low byte of its state).
	LEAQ    (SI)(DI*1), R13
	ADDQ    R8, R13
	MOVBQZX R13, R13
	LEAQ    (DX)(R13*1), CX
	MOVQ    AX, R14
	MOVQ    CX, DX
	ROLQ    CL, R14
	BZHIQ   R13, R14, R14

	// Update Offset State: next = baseline(bits 16-31) + low ofBits of R14.
	BZHIQ R8, R14, CX
	SHRXQ R8, R14, R14
	SHRL  $0x10, R8
	ADDQ  CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R14, CX
	SHRXQ DI, R14, R14
	SHRL  $0x10, DI
	ADDQ  CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State (last consumer; no SHRX needed)
	BZHIQ SI, R14, CX
	SHRL  $0x10, SI
	ADDQ  CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_bmi2_skip_update:
	// Adjust offset: apply the zstd repeat-offset rules.
	// R12 = offset code bit count; > 1 means a literal offset value that
	// pushes the previous offsets down the 3-entry history.
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   R12, $0x01
	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0              // shift prevOffset[0..1] -> [1..2]
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_bmi2_after_adjust

sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
	// Repeat-offset codes; ll == 0 shifts the meaning of codes by one.
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
	MOVQ  144(CX), R13              // code 0 with ll>0: reuse prevOffset[0]
	JMP   sequenceDecs_decodeSync_bmi2_after_adjust

sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
	// temp = prevOffset[R13] for codes 1..2; code 3 means
	// prevOffset[0] - 1 (index forced to 0, minus-one via R14 = -1).
	MOVQ    R13, R12
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	ADDQ    144(CX)(R12*8), R14
	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
	MOVQ    $0x00000001, R14        // temp == 0 is invalid; clamp to 1

sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
	MOVQ 152(CX), R12               // rotate offset history
	MOVQ R12, 160(CX)

sequenceDecs_decodeSync_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_bmi2_after_adjust:
	MOVQ R13, 8(SP)

	// Check values: account sizes, literal budget, max match length,
	// and the "zero offset requires zero match length" rule.
	MOVQ  16(SP), CX
	MOVQ  24(SP), R12
	LEAQ  (CX)(R12*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  R12, 104(R14)
	JS    error_not_enough_literals
	CMPQ  CX, $0x00020002
	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
	TESTQ CX, CX
	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
	// CX = ll, R12 = mo, R13 = ml for the execute phase below.
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals (fast, rounds up to 16-byte chunks; the space check
	// above guarantees room in the destination).
	TESTQ CX, CX
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	MOVUPS (R10)(R14*1), X0
	MOVUPS X0, (R9)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, CX
	JB     copy_1
	ADDQ   CX, R10
	ADDQ   CX, R9
	ADDQ   CX, R11

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG   error_match_off_too_big
	CMPQ R12, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ R12, CX
	SUBQ R11, CX                    // CX = bytes of the match inside history
	JLS  copy_match                 // <= 0: entirely within current output
	MOVQ 48(SP), R14
	SUBQ CX, R14                    // R14 = source pointer inside history
	CMPQ R13, CX
	JG   copy_all_from_history      // match spans history AND output
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R9
	SUBQ   $0x10, CX
	JAE    copy_4_loop
	LEAQ   16(R14)(CX*1), R14
	LEAQ   16(R9)(CX*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), CX
	MOVB 2(R14), R12
	MOVW CX, (R9)
	MOVB R12, 2(R9)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), CX
	MOVL -4(R14)(R13*1), R12
	MOVL CX, (R9)
	MOVL R12, -4(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), CX
	MOVQ -8(R14)(R13*1), R12
	MOVQ CX, (R9)
	MOVQ R12, -8(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9

copy_4_end:
	ADDQ R13, R11
	JMP  handle_loop
	JMP  loop_finished              // unreachable; emitted by the generator

copy_all_from_history:
	// Copy the CX history-resident bytes, then fall through to copy_match
	// for the remainder that lives in the current output buffer.
	MOVQ CX, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R9
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(R9)(R15*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP    copy_5_end

copy_5_small:
	CMPQ CX, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ CX, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(CX*1), BP
	MOVB R15, (R9)
	MOVB BP, -1(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R9)
	MOVB BP, 2(R9)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(CX*1), BP
	MOVL R15, (R9)
	MOVL BP, -4(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(CX*1), BP
	MOVQ R15, (R9)
	MOVQ BP, -8(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9

copy_5_end:
	ADDQ CX, R11
	SUBQ CX, R13                    // remaining match length in output

	// Copy match from the current buffer
copy_match:
	MOVQ R9, CX
	SUBQ R12, CX                    // source = write pointer - offset

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match: 16-byte chunks; may round up past ml,
	// which is safe because the s.out space check already passed.
	ADDQ R13, R11
	MOVQ R9, R12
	ADDQ R13, R9

copy_2:
	MOVUPS (CX), X0
	MOVUPS X0, (R12)
	ADDQ   $0x10, CX
	ADDQ   $0x10, R12
	SUBQ   $0x10, R13
	JHI    copy_2                   // JHI == JA (unsigned above)
	JMP    handle_loop

	// Copy overlapping match: byte-by-byte, required when offset < length.
copy_overlapping_match:
	ADDQ R13, R11

copy_slow_3:
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)                     // one fewer sequence remaining
	JNS  sequenceDecs_decodeSync_bmi2_main_loop

loop_finished:
	// Store the bit reader state back.
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 32(CX)

	// Update the context: output position and literals consumed.
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error (nonzero ml with zero offset)
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
|
|
|
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int |
|
// Requires: CMOV, SSE |
|
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 |
|
MOVQ br+8(FP), CX |
|
MOVQ 24(CX), DX |
|
MOVBQZX 40(CX), BX |
|
MOVQ (CX), AX |
|
MOVQ 32(CX), SI |
|
ADDQ SI, AX |
|
MOVQ AX, (SP) |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 72(AX), DI |
|
MOVQ 80(AX), R8 |
|
MOVQ 88(AX), R9 |
|
XORQ CX, CX |
|
MOVQ CX, 8(SP) |
|
MOVQ CX, 16(SP) |
|
MOVQ CX, 24(SP) |
|
MOVQ 112(AX), R10 |
|
MOVQ 128(AX), CX |
|
MOVQ CX, 32(SP) |
|
MOVQ 144(AX), R11 |
|
MOVQ 136(AX), R12 |
|
MOVQ 200(AX), CX |
|
MOVQ CX, 56(SP) |
|
MOVQ 176(AX), CX |
|
MOVQ CX, 48(SP) |
|
MOVQ 184(AX), AX |
|
MOVQ AX, 40(SP) |
|
MOVQ 40(SP), AX |
|
ADDQ AX, 48(SP) |
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer) |
|
ADDQ R10, 32(SP) |
|
|
|
// outBase += outPosition |
|
ADDQ R12, R10 |
|
|
|
sequenceDecs_decodeSync_safe_amd64_main_loop: |
|
MOVQ (SP), R13 |
|
|
|
// Fill bitreader to have enough for the offset and match length. |
|
CMPQ SI, $0x08 |
|
JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte |
|
MOVQ BX, AX |
|
SHRQ $0x03, AX |
|
SUBQ AX, R13 |
|
MOVQ (R13), DX |
|
SUBQ AX, SI |
|
ANDQ $0x07, BX |
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_end |
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: |
|
CMPQ SI, $0x00 |
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread |
|
CMPQ BX, $0x07 |
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_end |
|
SHLQ $0x08, DX |
|
SUBQ $0x01, R13 |
|
SUBQ $0x01, SI |
|
SUBQ $0x08, BX |
|
MOVBQZX (R13), AX |
|
ORQ AX, DX |
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte |
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_check_overread: |
|
CMPQ BX, $0x40 |
|
JA error_overread |
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_end: |
|
// Update offset |
|
MOVQ R9, AX |
|
MOVQ BX, CX |
|
MOVQ DX, R14 |
|
SHLQ CL, R14 |
|
MOVB AH, CL |
|
SHRQ $0x20, AX |
|
TESTQ CX, CX |
|
JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero |
|
ADDQ CX, BX |
|
CMPQ BX, $0x40 |
|
JA sequenceDecs_decodeSync_safe_amd64_of_update_zero |
|
CMPQ CX, $0x40 |
|
JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero |
|
NEGQ CX |
|
SHRQ CL, R14 |
|
ADDQ R14, AX |
|
|
|
sequenceDecs_decodeSync_safe_amd64_of_update_zero: |
|
MOVQ AX, 8(SP) |
|
|
|
// Update match length |
|
MOVQ R8, AX |
|
MOVQ BX, CX |
|
MOVQ DX, R14 |
|
SHLQ CL, R14 |
|
MOVB AH, CL |
|
SHRQ $0x20, AX |
|
TESTQ CX, CX |
|
JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero |
|
ADDQ CX, BX |
|
CMPQ BX, $0x40 |
|
JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero |
|
CMPQ CX, $0x40 |
|
JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero |
|
NEGQ CX |
|
SHRQ CL, R14 |
|
ADDQ R14, AX |
|
|
|
sequenceDecs_decodeSync_safe_amd64_ml_update_zero: |
|
MOVQ AX, 16(SP) |
|
|
|
// Fill bitreader to have enough for the remaining |
|
CMPQ SI, $0x08 |
|
JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte |
|
MOVQ BX, AX |
|
SHRQ $0x03, AX |
|
SUBQ AX, R13 |
|
MOVQ (R13), DX |
|
SUBQ AX, SI |
|
ANDQ $0x07, BX |
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end |
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: |
|
CMPQ SI, $0x00 |
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread |
|
CMPQ BX, $0x07 |
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end |
|
SHLQ $0x08, DX |
|
SUBQ $0x01, R13 |
|
SUBQ $0x01, SI |
|
SUBQ $0x08, BX |
|
MOVBQZX (R13), AX |
|
ORQ AX, DX |
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte |
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread: |
|
CMPQ BX, $0x40 |
|
JA error_overread |
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_2_end: |
|
// Update literal length |
|
MOVQ DI, AX |
|
MOVQ BX, CX |
|
MOVQ DX, R14 |
|
SHLQ CL, R14 |
|
MOVB AH, CL |
|
SHRQ $0x20, AX |
|
TESTQ CX, CX |
|
JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero |
|
ADDQ CX, BX |
|
CMPQ BX, $0x40 |
|
JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero |
|
CMPQ CX, $0x40 |
|
JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero |
|
NEGQ CX |
|
SHRQ CL, R14 |
|
ADDQ R14, AX |
|
|
|
sequenceDecs_decodeSync_safe_amd64_ll_update_zero: |
|
MOVQ AX, 24(SP) |
|
|
|
// Fill bitreader for state updates |
|
MOVQ R13, (SP) |
|
MOVQ R9, AX |
|
SHRQ $0x08, AX |
|
MOVBQZX AL, AX |
|
MOVQ ctx+16(FP), CX |
|
CMPQ 96(CX), $0x00 |
|
JZ sequenceDecs_decodeSync_safe_amd64_skip_update |
|
|
|
// Update Literal Length State |
|
MOVBQZX DI, R13 |
|
SHRL $0x10, DI |
|
LEAQ (BX)(R13*1), CX |
|
MOVQ DX, R14 |
|
MOVQ CX, BX |
|
ROLQ CL, R14 |
|
MOVL $0x00000001, R15 |
|
MOVB R13, CL |
|
SHLL CL, R15 |
|
DECL R15 |
|
ANDQ R15, R14 |
|
ADDQ R14, DI |
|
|
|
// Load ctx.llTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ (CX), CX |
|
MOVQ (CX)(DI*8), DI |
|
|
|
// Update Match Length State |
|
MOVBQZX R8, R13 |
|
SHRL $0x10, R8 |
|
LEAQ (BX)(R13*1), CX |
|
MOVQ DX, R14 |
|
MOVQ CX, BX |
|
ROLQ CL, R14 |
|
MOVL $0x00000001, R15 |
|
MOVB R13, CL |
|
SHLL CL, R15 |
|
DECL R15 |
|
ANDQ R15, R14 |
|
ADDQ R14, R8 |
|
|
|
// Load ctx.mlTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ 24(CX), CX |
|
MOVQ (CX)(R8*8), R8 |
|
|
|
// Update Offset State |
|
MOVBQZX R9, R13 |
|
SHRL $0x10, R9 |
|
LEAQ (BX)(R13*1), CX |
|
MOVQ DX, R14 |
|
MOVQ CX, BX |
|
ROLQ CL, R14 |
|
MOVL $0x00000001, R15 |
|
MOVB R13, CL |
|
SHLL CL, R15 |
|
DECL R15 |
|
ANDQ R15, R14 |
|
ADDQ R14, R9 |
|
|
|
// Load ctx.ofTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ 48(CX), CX |
|
MOVQ (CX)(R9*8), R9 |
|
|
|
sequenceDecs_decodeSync_safe_amd64_skip_update: |
|
// Adjust offset |
|
MOVQ s+0(FP), CX |
|
MOVQ 8(SP), R13 |
|
CMPQ AX, $0x01 |
|
JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 |
|
MOVUPS 144(CX), X0 |
|
MOVQ R13, 144(CX) |
|
MOVUPS X0, 152(CX) |
|
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust |
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: |
|
CMPQ 24(SP), $0x00000000 |
|
JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero |
|
INCQ R13 |
|
JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero |
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: |
|
TESTQ R13, R13 |
|
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero |
|
MOVQ 144(CX), R13 |
|
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust |
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: |
|
MOVQ R13, AX |
|
XORQ R14, R14 |
|
MOVQ $-1, R15 |
|
CMPQ R13, $0x03 |
|
CMOVQEQ R14, AX |
|
CMOVQEQ R15, R14 |
|
ADDQ 144(CX)(AX*8), R14 |
|
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid |
|
MOVQ $0x00000001, R14 |
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid: |
|
CMPQ R13, $0x01 |
|
JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip |
|
MOVQ 152(CX), AX |
|
MOVQ AX, 160(CX) |
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_skip: |
|
MOVQ 144(CX), AX |
|
MOVQ AX, 152(CX) |
|
MOVQ R14, 144(CX) |
|
MOVQ R14, R13 |
|
|
|
sequenceDecs_decodeSync_safe_amd64_after_adjust: |
|
MOVQ R13, 8(SP) |
|
|
|
// Check values |
|
MOVQ 16(SP), AX |
|
MOVQ 24(SP), CX |
|
LEAQ (AX)(CX*1), R14 |
|
MOVQ s+0(FP), R15 |
|
ADDQ R14, 256(R15) |
|
MOVQ ctx+16(FP), R14 |
|
SUBQ CX, 104(R14) |
|
JS error_not_enough_literals |
|
CMPQ AX, $0x00020002 |
|
JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big |
|
TESTQ R13, R13 |
|
JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok |
|
TESTQ AX, AX |
|
JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch |
|
|
|
sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: |
|
MOVQ 24(SP), AX |
|
MOVQ 8(SP), CX |
|
MOVQ 16(SP), R13 |
|
|
|
// Check if we have enough space in s.out |
|
LEAQ (AX)(R13*1), R14 |
|
ADDQ R10, R14 |
|
CMPQ R14, 32(SP) |
|
JA error_not_enough_space |
|
|
|
// Copy literals |
|
TESTQ AX, AX |
|
JZ check_offset |
|
MOVQ AX, R14 |
|
SUBQ $0x10, R14 |
|
JB copy_1_small |
|
|
|
copy_1_loop: |
|
MOVUPS (R11), X0 |
|
MOVUPS X0, (R10) |
|
ADDQ $0x10, R11 |
|
ADDQ $0x10, R10 |
|
SUBQ $0x10, R14 |
|
JAE copy_1_loop |
|
LEAQ 16(R11)(R14*1), R11 |
|
LEAQ 16(R10)(R14*1), R10 |
|
MOVUPS -16(R11), X0 |
|
MOVUPS X0, -16(R10) |
|
JMP copy_1_end |
|
|
|
copy_1_small: |
|
CMPQ AX, $0x03 |
|
JE copy_1_move_3 |
|
JB copy_1_move_1or2 |
|
CMPQ AX, $0x08 |
|
JB copy_1_move_4through7 |
|
JMP copy_1_move_8through16 |
|
|
|
copy_1_move_1or2: |
|
MOVB (R11), R14 |
|
MOVB -1(R11)(AX*1), R15 |
|
MOVB R14, (R10) |
|
MOVB R15, -1(R10)(AX*1) |
|
ADDQ AX, R11 |
|
ADDQ AX, R10 |
|
JMP copy_1_end |
|
|
|
copy_1_move_3: |
|
MOVW (R11), R14 |
|
MOVB 2(R11), R15 |
|
MOVW R14, (R10) |
|
MOVB R15, 2(R10) |
|
ADDQ AX, R11 |
|
ADDQ AX, R10 |
|
JMP copy_1_end |
|
|
|
copy_1_move_4through7: |
|
MOVL (R11), R14 |
|
MOVL -4(R11)(AX*1), R15 |
|
MOVL R14, (R10) |
|
MOVL R15, -4(R10)(AX*1) |
|
ADDQ AX, R11 |
|
ADDQ AX, R10 |
|
JMP copy_1_end |
|
|
|
copy_1_move_8through16: |
|
MOVQ (R11), R14 |
|
MOVQ -8(R11)(AX*1), R15 |
|
MOVQ R14, (R10) |
|
MOVQ R15, -8(R10)(AX*1) |
|
ADDQ AX, R11 |
|
ADDQ AX, R10 |
|
|
|
copy_1_end: |
|
ADDQ AX, R12 |
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) |
|
check_offset: |
|
MOVQ R12, AX |
|
ADDQ 40(SP), AX |
|
CMPQ CX, AX |
|
JG error_match_off_too_big |
|
CMPQ CX, 56(SP) |
|
JG error_match_off_too_big |
|
|
|
// Copy match from history |
|
MOVQ CX, AX |
|
SUBQ R12, AX |
|
JLS copy_match |
|
MOVQ 48(SP), R14 |
|
SUBQ AX, R14 |
|
CMPQ R13, AX |
|
JG copy_all_from_history |
|
MOVQ R13, AX |
|
SUBQ $0x10, AX |
|
JB copy_4_small |
|
|
|
copy_4_loop: |
|
MOVUPS (R14), X0 |
|
MOVUPS X0, (R10) |
|
ADDQ $0x10, R14 |
|
ADDQ $0x10, R10 |
|
SUBQ $0x10, AX |
|
JAE copy_4_loop |
|
LEAQ 16(R14)(AX*1), R14 |
|
LEAQ 16(R10)(AX*1), R10 |
|
MOVUPS -16(R14), X0 |
|
MOVUPS X0, -16(R10) |
|
JMP copy_4_end |
|
|
|
copy_4_small: |
|
CMPQ R13, $0x03 |
|
JE copy_4_move_3 |
|
CMPQ R13, $0x08 |
|
JB copy_4_move_4through7 |
|
JMP copy_4_move_8through16 |
|
|
|
copy_4_move_3: |
|
MOVW (R14), AX |
|
MOVB 2(R14), CL |
|
MOVW AX, (R10) |
|
MOVB CL, 2(R10) |
|
ADDQ R13, R14 |
|
ADDQ R13, R10 |
|
JMP copy_4_end |
|
|
|
copy_4_move_4through7: |
|
MOVL (R14), AX |
|
MOVL -4(R14)(R13*1), CX |
|
MOVL AX, (R10) |
|
MOVL CX, -4(R10)(R13*1) |
|
ADDQ R13, R14 |
|
ADDQ R13, R10 |
|
JMP copy_4_end |
|
|
|
copy_4_move_8through16: |
|
MOVQ (R14), AX |
|
MOVQ -8(R14)(R13*1), CX |
|
MOVQ AX, (R10) |
|
MOVQ CX, -8(R10)(R13*1) |
|
ADDQ R13, R14 |
|
ADDQ R13, R10 |
|
|
|
copy_4_end: |
|
ADDQ R13, R12 |
|
JMP handle_loop |
|
JMP loop_finished |
|
|
|
copy_all_from_history: |
|
MOVQ AX, R15 |
|
SUBQ $0x10, R15 |
|
JB copy_5_small |
|
|
|
copy_5_loop: |
|
MOVUPS (R14), X0 |
|
MOVUPS X0, (R10) |
|
ADDQ $0x10, R14 |
|
ADDQ $0x10, R10 |
|
SUBQ $0x10, R15 |
|
JAE copy_5_loop |
|
LEAQ 16(R14)(R15*1), R14 |
|
LEAQ 16(R10)(R15*1), R10 |
|
MOVUPS -16(R14), X0 |
|
MOVUPS X0, -16(R10) |
|
JMP copy_5_end |
|
|
|
copy_5_small: |
|
CMPQ AX, $0x03 |
|
JE copy_5_move_3 |
|
JB copy_5_move_1or2 |
|
CMPQ AX, $0x08 |
|
JB copy_5_move_4through7 |
|
JMP copy_5_move_8through16 |
|
|
|
copy_5_move_1or2: |
|
MOVB (R14), R15 |
|
MOVB -1(R14)(AX*1), BP |
|
MOVB R15, (R10) |
|
MOVB BP, -1(R10)(AX*1) |
|
ADDQ AX, R14 |
|
ADDQ AX, R10 |
|
JMP copy_5_end |
|
|
|
copy_5_move_3: |
|
MOVW (R14), R15 |
|
MOVB 2(R14), BP |
|
MOVW R15, (R10) |
|
MOVB BP, 2(R10) |
|
ADDQ AX, R14 |
|
ADDQ AX, R10 |
|
JMP copy_5_end |
|
|
|
copy_5_move_4through7: |
|
MOVL (R14), R15 |
|
MOVL -4(R14)(AX*1), BP |
|
MOVL R15, (R10) |
|
MOVL BP, -4(R10)(AX*1) |
|
ADDQ AX, R14 |
|
ADDQ AX, R10 |
|
JMP copy_5_end |
|
|
|
copy_5_move_8through16: |
|
MOVQ (R14), R15 |
|
MOVQ -8(R14)(AX*1), BP |
|
MOVQ R15, (R10) |
|
MOVQ BP, -8(R10)(AX*1) |
|
ADDQ AX, R14 |
|
ADDQ AX, R10 |
|
|
|
copy_5_end: |
|
ADDQ AX, R12 |
|
SUBQ AX, R13 |
|
|
|
// Copy match from the current buffer |
|
copy_match: |
|
MOVQ R10, AX |
|
SUBQ CX, AX |
|
|
|
// ml <= mo |
|
CMPQ R13, CX |
|
JA copy_overlapping_match |
|
|
|
// Copy non-overlapping match |
|
ADDQ R13, R12 |
|
MOVQ R13, CX |
|
SUBQ $0x10, CX |
|
JB copy_2_small |
|
|
|
copy_2_loop: |
|
MOVUPS (AX), X0 |
|
MOVUPS X0, (R10) |
|
ADDQ $0x10, AX |
|
ADDQ $0x10, R10 |
|
SUBQ $0x10, CX |
|
JAE copy_2_loop |
|
LEAQ 16(AX)(CX*1), AX |
|
LEAQ 16(R10)(CX*1), R10 |
|
MOVUPS -16(AX), X0 |
|
MOVUPS X0, -16(R10) |
|
JMP copy_2_end |
|
|
|
copy_2_small: |
|
CMPQ R13, $0x03 |
|
JE copy_2_move_3 |
|
JB copy_2_move_1or2 |
|
CMPQ R13, $0x08 |
|
JB copy_2_move_4through7 |
|
JMP copy_2_move_8through16 |
|
|
|
copy_2_move_1or2: |
|
MOVB (AX), CL |
|
MOVB -1(AX)(R13*1), R14 |
|
MOVB CL, (R10) |
|
MOVB R14, -1(R10)(R13*1) |
|
ADDQ R13, AX |
|
ADDQ R13, R10 |
|
JMP copy_2_end |
|
|
|
copy_2_move_3: |
|
MOVW (AX), CX |
|
MOVB 2(AX), R14 |
|
MOVW CX, (R10) |
|
MOVB R14, 2(R10) |
|
ADDQ R13, AX |
|
ADDQ R13, R10 |
|
JMP copy_2_end |
|
|
|
copy_2_move_4through7: |
|
MOVL (AX), CX |
|
MOVL -4(AX)(R13*1), R14 |
|
MOVL CX, (R10) |
|
MOVL R14, -4(R10)(R13*1) |
|
ADDQ R13, AX |
|
ADDQ R13, R10 |
|
JMP copy_2_end |
|
|
|
copy_2_move_8through16: |
|
MOVQ (AX), CX |
|
MOVQ -8(AX)(R13*1), R14 |
|
MOVQ CX, (R10) |
|
MOVQ R14, -8(R10)(R13*1) |
|
ADDQ R13, AX |
|
ADDQ R13, R10 |
|
|
|
copy_2_end: |
|
JMP handle_loop |
|
|
|
// Copy overlapping match |
|
copy_overlapping_match: |
|
ADDQ R13, R12 |
|
|
|
copy_slow_3: |
|
MOVB (AX), CL |
|
MOVB CL, (R10) |
|
INCQ AX |
|
INCQ R10 |
|
DECQ R13 |
|
JNZ copy_slow_3 |
|
|
|
handle_loop: |
|
MOVQ ctx+16(FP), AX |
|
DECQ 96(AX) |
|
JNS sequenceDecs_decodeSync_safe_amd64_main_loop |
|
|
|
loop_finished: |
|
MOVQ br+8(FP), AX |
|
MOVQ DX, 24(AX) |
|
MOVB BL, 40(AX) |
|
MOVQ SI, 32(AX) |
|
|
|
// Update the context |
|
MOVQ ctx+16(FP), AX |
|
MOVQ R12, 136(AX) |
|
MOVQ 144(AX), CX |
|
SUBQ CX, R11 |
|
MOVQ R11, 168(AX) |
|
|
|
// Return success |
|
MOVQ $0x00000000, ret+24(FP) |
|
RET |
|
|
|
// Return with match length error |
|
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch: |
|
MOVQ 16(SP), AX |
|
MOVQ ctx+16(FP), CX |
|
MOVQ AX, 216(CX) |
|
MOVQ $0x00000001, ret+24(FP) |
|
RET |
|
|
|
// Return with match too long error |
|
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big: |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 16(SP), CX |
|
MOVQ CX, 216(AX) |
|
MOVQ $0x00000002, ret+24(FP) |
|
RET |
|
|
|
// Return with match offset too long error |
|
error_match_off_too_big: |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 8(SP), CX |
|
MOVQ CX, 224(AX) |
|
MOVQ R12, 136(AX) |
|
MOVQ $0x00000003, ret+24(FP) |
|
RET |
|
|
|
// Return with not enough literals error |
|
error_not_enough_literals: |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 24(SP), CX |
|
MOVQ CX, 208(AX) |
|
MOVQ $0x00000004, ret+24(FP) |
|
RET |
|
|
|
// Return with overread error |
|
error_overread: |
|
MOVQ $0x00000006, ret+24(FP) |
|
RET |
|
|
|
// Return with not enough output space error |
|
error_not_enough_space: |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 24(SP), CX |
|
MOVQ CX, 208(AX) |
|
MOVQ 16(SP), CX |
|
MOVQ CX, 216(AX) |
|
MOVQ R12, 136(AX) |
|
MOVQ $0x00000005, ret+24(FP) |
|
RET |
|
|
|
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int |
|
// Requires: BMI, BMI2, CMOV, SSE |
|
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 |
|
MOVQ br+8(FP), BX |
|
MOVQ 24(BX), AX |
|
MOVBQZX 40(BX), DX |
|
MOVQ (BX), CX |
|
MOVQ 32(BX), BX |
|
ADDQ BX, CX |
|
MOVQ CX, (SP) |
|
MOVQ ctx+16(FP), CX |
|
MOVQ 72(CX), SI |
|
MOVQ 80(CX), DI |
|
MOVQ 88(CX), R8 |
|
XORQ R9, R9 |
|
MOVQ R9, 8(SP) |
|
MOVQ R9, 16(SP) |
|
MOVQ R9, 24(SP) |
|
MOVQ 112(CX), R9 |
|
MOVQ 128(CX), R10 |
|
MOVQ R10, 32(SP) |
|
MOVQ 144(CX), R10 |
|
MOVQ 136(CX), R11 |
|
MOVQ 200(CX), R12 |
|
MOVQ R12, 56(SP) |
|
MOVQ 176(CX), R12 |
|
MOVQ R12, 48(SP) |
|
MOVQ 184(CX), CX |
|
MOVQ CX, 40(SP) |
|
MOVQ 40(SP), CX |
|
ADDQ CX, 48(SP) |
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer) |
|
ADDQ R9, 32(SP) |
|
|
|
// outBase += outPosition |
|
ADDQ R11, R9 |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_main_loop: |
|
MOVQ (SP), R12 |
|
|
|
// Fill bitreader to have enough for the offset and match length. |
|
CMPQ BX, $0x08 |
|
JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte |
|
MOVQ DX, CX |
|
SHRQ $0x03, CX |
|
SUBQ CX, R12 |
|
MOVQ (R12), AX |
|
SUBQ CX, BX |
|
ANDQ $0x07, DX |
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_end |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte: |
|
CMPQ BX, $0x00 |
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread |
|
CMPQ DX, $0x07 |
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_end |
|
SHLQ $0x08, AX |
|
SUBQ $0x01, R12 |
|
SUBQ $0x01, BX |
|
SUBQ $0x08, DX |
|
MOVBQZX (R12), CX |
|
ORQ CX, AX |
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_check_overread: |
|
CMPQ DX, $0x40 |
|
JA error_overread |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_end: |
|
// Update offset |
|
MOVQ $0x00000808, CX |
|
BEXTRQ CX, R8, R13 |
|
MOVQ AX, R14 |
|
LEAQ (DX)(R13*1), CX |
|
ROLQ CL, R14 |
|
BZHIQ R13, R14, R14 |
|
MOVQ CX, DX |
|
MOVQ R8, CX |
|
SHRQ $0x20, CX |
|
ADDQ R14, CX |
|
MOVQ CX, 8(SP) |
|
|
|
// Update match length |
|
MOVQ $0x00000808, CX |
|
BEXTRQ CX, DI, R13 |
|
MOVQ AX, R14 |
|
LEAQ (DX)(R13*1), CX |
|
ROLQ CL, R14 |
|
BZHIQ R13, R14, R14 |
|
MOVQ CX, DX |
|
MOVQ DI, CX |
|
SHRQ $0x20, CX |
|
ADDQ R14, CX |
|
MOVQ CX, 16(SP) |
|
|
|
// Fill bitreader to have enough for the remaining |
|
CMPQ BX, $0x08 |
|
JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte |
|
MOVQ DX, CX |
|
SHRQ $0x03, CX |
|
SUBQ CX, R12 |
|
MOVQ (R12), AX |
|
SUBQ CX, BX |
|
ANDQ $0x07, DX |
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte: |
|
CMPQ BX, $0x00 |
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread |
|
CMPQ DX, $0x07 |
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end |
|
SHLQ $0x08, AX |
|
SUBQ $0x01, R12 |
|
SUBQ $0x01, BX |
|
SUBQ $0x08, DX |
|
MOVBQZX (R12), CX |
|
ORQ CX, AX |
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread: |
|
CMPQ DX, $0x40 |
|
JA error_overread |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_2_end: |
|
// Update literal length |
|
MOVQ $0x00000808, CX |
|
BEXTRQ CX, SI, R13 |
|
MOVQ AX, R14 |
|
LEAQ (DX)(R13*1), CX |
|
ROLQ CL, R14 |
|
BZHIQ R13, R14, R14 |
|
MOVQ CX, DX |
|
MOVQ SI, CX |
|
SHRQ $0x20, CX |
|
ADDQ R14, CX |
|
MOVQ CX, 24(SP) |
|
|
|
// Fill bitreader for state updates |
|
MOVQ R12, (SP) |
|
MOVQ $0x00000808, CX |
|
BEXTRQ CX, R8, R12 |
|
MOVQ ctx+16(FP), CX |
|
CMPQ 96(CX), $0x00 |
|
JZ sequenceDecs_decodeSync_safe_bmi2_skip_update |
|
LEAQ (SI)(DI*1), R13 |
|
ADDQ R8, R13 |
|
MOVBQZX R13, R13 |
|
LEAQ (DX)(R13*1), CX |
|
MOVQ AX, R14 |
|
MOVQ CX, DX |
|
ROLQ CL, R14 |
|
BZHIQ R13, R14, R14 |
|
|
|
// Update Offset State |
|
BZHIQ R8, R14, CX |
|
SHRXQ R8, R14, R14 |
|
SHRL $0x10, R8 |
|
ADDQ CX, R8 |
|
|
|
// Load ctx.ofTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ 48(CX), CX |
|
MOVQ (CX)(R8*8), R8 |
|
|
|
// Update Match Length State |
|
BZHIQ DI, R14, CX |
|
SHRXQ DI, R14, R14 |
|
SHRL $0x10, DI |
|
ADDQ CX, DI |
|
|
|
// Load ctx.mlTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ 24(CX), CX |
|
MOVQ (CX)(DI*8), DI |
|
|
|
// Update Literal Length State |
|
BZHIQ SI, R14, CX |
|
SHRL $0x10, SI |
|
ADDQ CX, SI |
|
|
|
// Load ctx.llTable |
|
MOVQ ctx+16(FP), CX |
|
MOVQ (CX), CX |
|
MOVQ (CX)(SI*8), SI |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_skip_update: |
|
// Adjust offset |
|
MOVQ s+0(FP), CX |
|
MOVQ 8(SP), R13 |
|
CMPQ R12, $0x01 |
|
JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0 |
|
MOVUPS 144(CX), X0 |
|
MOVQ R13, 144(CX) |
|
MOVUPS X0, 152(CX) |
|
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: |
|
CMPQ 24(SP), $0x00000000 |
|
JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero |
|
INCQ R13 |
|
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: |
|
TESTQ R13, R13 |
|
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero |
|
MOVQ 144(CX), R13 |
|
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: |
|
MOVQ R13, R12 |
|
XORQ R14, R14 |
|
MOVQ $-1, R15 |
|
CMPQ R13, $0x03 |
|
CMOVQEQ R14, R12 |
|
CMOVQEQ R15, R14 |
|
ADDQ 144(CX)(R12*8), R14 |
|
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid |
|
MOVQ $0x00000001, R14 |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid: |
|
CMPQ R13, $0x01 |
|
JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip |
|
MOVQ 152(CX), R12 |
|
MOVQ R12, 160(CX) |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_skip: |
|
MOVQ 144(CX), R12 |
|
MOVQ R12, 152(CX) |
|
MOVQ R14, 144(CX) |
|
MOVQ R14, R13 |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_after_adjust: |
|
MOVQ R13, 8(SP) |
|
|
|
// Check values |
|
MOVQ 16(SP), CX |
|
MOVQ 24(SP), R12 |
|
LEAQ (CX)(R12*1), R14 |
|
MOVQ s+0(FP), R15 |
|
ADDQ R14, 256(R15) |
|
MOVQ ctx+16(FP), R14 |
|
SUBQ R12, 104(R14) |
|
JS error_not_enough_literals |
|
CMPQ CX, $0x00020002 |
|
JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big |
|
TESTQ R13, R13 |
|
JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok |
|
TESTQ CX, CX |
|
JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch |
|
|
|
sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: |
|
MOVQ 24(SP), CX |
|
MOVQ 8(SP), R12 |
|
MOVQ 16(SP), R13 |
|
|
|
// Check if we have enough space in s.out |
|
LEAQ (CX)(R13*1), R14 |
|
ADDQ R9, R14 |
|
CMPQ R14, 32(SP) |
|
JA error_not_enough_space |
|
|
|
// Copy literals |
|
TESTQ CX, CX |
|
JZ check_offset |
|
MOVQ CX, R14 |
|
SUBQ $0x10, R14 |
|
JB copy_1_small |
|
|
|
copy_1_loop: |
|
MOVUPS (R10), X0 |
|
MOVUPS X0, (R9) |
|
ADDQ $0x10, R10 |
|
ADDQ $0x10, R9 |
|
SUBQ $0x10, R14 |
|
JAE copy_1_loop |
|
LEAQ 16(R10)(R14*1), R10 |
|
LEAQ 16(R9)(R14*1), R9 |
|
MOVUPS -16(R10), X0 |
|
MOVUPS X0, -16(R9) |
|
JMP copy_1_end |
|
|
|
copy_1_small: |
|
CMPQ CX, $0x03 |
|
JE copy_1_move_3 |
|
JB copy_1_move_1or2 |
|
CMPQ CX, $0x08 |
|
JB copy_1_move_4through7 |
|
JMP copy_1_move_8through16 |
|
|
|
copy_1_move_1or2: |
|
MOVB (R10), R14 |
|
MOVB -1(R10)(CX*1), R15 |
|
MOVB R14, (R9) |
|
MOVB R15, -1(R9)(CX*1) |
|
ADDQ CX, R10 |
|
ADDQ CX, R9 |
|
JMP copy_1_end |
|
|
|
copy_1_move_3: |
|
MOVW (R10), R14 |
|
MOVB 2(R10), R15 |
|
MOVW R14, (R9) |
|
MOVB R15, 2(R9) |
|
ADDQ CX, R10 |
|
ADDQ CX, R9 |
|
JMP copy_1_end |
|
|
|
copy_1_move_4through7: |
|
MOVL (R10), R14 |
|
MOVL -4(R10)(CX*1), R15 |
|
MOVL R14, (R9) |
|
MOVL R15, -4(R9)(CX*1) |
|
ADDQ CX, R10 |
|
ADDQ CX, R9 |
|
JMP copy_1_end |
|
|
|
copy_1_move_8through16: |
|
MOVQ (R10), R14 |
|
MOVQ -8(R10)(CX*1), R15 |
|
MOVQ R14, (R9) |
|
MOVQ R15, -8(R9)(CX*1) |
|
ADDQ CX, R10 |
|
ADDQ CX, R9 |
|
|
|
copy_1_end: |
|
ADDQ CX, R11 |
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) |
|
check_offset: |
|
MOVQ R11, CX |
|
ADDQ 40(SP), CX |
|
CMPQ R12, CX |
|
JG error_match_off_too_big |
|
CMPQ R12, 56(SP) |
|
JG error_match_off_too_big |
|
|
|
// Copy match from history |
|
MOVQ R12, CX |
|
SUBQ R11, CX |
|
JLS copy_match |
|
MOVQ 48(SP), R14 |
|
SUBQ CX, R14 |
|
CMPQ R13, CX |
|
JG copy_all_from_history |
|
MOVQ R13, CX |
|
SUBQ $0x10, CX |
|
JB copy_4_small |
|
|
|
copy_4_loop: |
|
MOVUPS (R14), X0 |
|
MOVUPS X0, (R9) |
|
ADDQ $0x10, R14 |
|
ADDQ $0x10, R9 |
|
SUBQ $0x10, CX |
|
JAE copy_4_loop |
|
LEAQ 16(R14)(CX*1), R14 |
|
LEAQ 16(R9)(CX*1), R9 |
|
MOVUPS -16(R14), X0 |
|
MOVUPS X0, -16(R9) |
|
JMP copy_4_end |
|
|
|
copy_4_small: |
|
CMPQ R13, $0x03 |
|
JE copy_4_move_3 |
|
CMPQ R13, $0x08 |
|
JB copy_4_move_4through7 |
|
JMP copy_4_move_8through16 |
|
|
|
copy_4_move_3: |
|
MOVW (R14), CX |
|
MOVB 2(R14), R12 |
|
MOVW CX, (R9) |
|
MOVB R12, 2(R9) |
|
ADDQ R13, R14 |
|
ADDQ R13, R9 |
|
JMP copy_4_end |
|
|
|
copy_4_move_4through7: |
|
MOVL (R14), CX |
|
MOVL -4(R14)(R13*1), R12 |
|
MOVL CX, (R9) |
|
MOVL R12, -4(R9)(R13*1) |
|
ADDQ R13, R14 |
|
ADDQ R13, R9 |
|
JMP copy_4_end |
|
|
|
copy_4_move_8through16: |
|
MOVQ (R14), CX |
|
MOVQ -8(R14)(R13*1), R12 |
|
MOVQ CX, (R9) |
|
MOVQ R12, -8(R9)(R13*1) |
|
ADDQ R13, R14 |
|
ADDQ R13, R9 |
|
|
|
copy_4_end: |
|
ADDQ R13, R11 |
|
JMP handle_loop |
|
JMP loop_finished |
|
|
|
copy_all_from_history: |
|
MOVQ CX, R15 |
|
SUBQ $0x10, R15 |
|
JB copy_5_small |
|
|
|
copy_5_loop: |
|
MOVUPS (R14), X0 |
|
MOVUPS X0, (R9) |
|
ADDQ $0x10, R14 |
|
ADDQ $0x10, R9 |
|
SUBQ $0x10, R15 |
|
JAE copy_5_loop |
|
LEAQ 16(R14)(R15*1), R14 |
|
LEAQ 16(R9)(R15*1), R9 |
|
MOVUPS -16(R14), X0 |
|
MOVUPS X0, -16(R9) |
|
JMP copy_5_end |
|
|
|
copy_5_small: |
|
CMPQ CX, $0x03 |
|
JE copy_5_move_3 |
|
JB copy_5_move_1or2 |
|
CMPQ CX, $0x08 |
|
JB copy_5_move_4through7 |
|
JMP copy_5_move_8through16 |
|
|
|
copy_5_move_1or2: |
|
MOVB (R14), R15 |
|
MOVB -1(R14)(CX*1), BP |
|
MOVB R15, (R9) |
|
MOVB BP, -1(R9)(CX*1) |
|
ADDQ CX, R14 |
|
ADDQ CX, R9 |
|
JMP copy_5_end |
|
|
|
copy_5_move_3: |
|
MOVW (R14), R15 |
|
MOVB 2(R14), BP |
|
MOVW R15, (R9) |
|
MOVB BP, 2(R9) |
|
ADDQ CX, R14 |
|
ADDQ CX, R9 |
|
JMP copy_5_end |
|
|
|
copy_5_move_4through7: |
|
MOVL (R14), R15 |
|
MOVL -4(R14)(CX*1), BP |
|
MOVL R15, (R9) |
|
MOVL BP, -4(R9)(CX*1) |
|
ADDQ CX, R14 |
|
ADDQ CX, R9 |
|
JMP copy_5_end |
|
|
|
copy_5_move_8through16: |
|
MOVQ (R14), R15 |
|
MOVQ -8(R14)(CX*1), BP |
|
MOVQ R15, (R9) |
|
MOVQ BP, -8(R9)(CX*1) |
|
ADDQ CX, R14 |
|
ADDQ CX, R9 |
|
|
|
copy_5_end: |
|
ADDQ CX, R11 |
|
SUBQ CX, R13 |
|
|
|
// Copy match from the current buffer |
|
copy_match: |
|
MOVQ R9, CX |
|
SUBQ R12, CX |
|
|
|
// ml <= mo |
|
CMPQ R13, R12 |
|
JA copy_overlapping_match |
|
|
|
// Copy non-overlapping match |
|
ADDQ R13, R11 |
|
MOVQ R13, R12 |
|
SUBQ $0x10, R12 |
|
JB copy_2_small |
|
|
|
copy_2_loop: |
|
MOVUPS (CX), X0 |
|
MOVUPS X0, (R9) |
|
ADDQ $0x10, CX |
|
ADDQ $0x10, R9 |
|
SUBQ $0x10, R12 |
|
JAE copy_2_loop |
|
LEAQ 16(CX)(R12*1), CX |
|
LEAQ 16(R9)(R12*1), R9 |
|
MOVUPS -16(CX), X0 |
|
MOVUPS X0, -16(R9) |
|
JMP copy_2_end |
|
|
|
copy_2_small: |
|
CMPQ R13, $0x03 |
|
JE copy_2_move_3 |
|
JB copy_2_move_1or2 |
|
CMPQ R13, $0x08 |
|
JB copy_2_move_4through7 |
|
JMP copy_2_move_8through16 |
|
|
|
copy_2_move_1or2: |
|
MOVB (CX), R12 |
|
MOVB -1(CX)(R13*1), R14 |
|
MOVB R12, (R9) |
|
MOVB R14, -1(R9)(R13*1) |
|
ADDQ R13, CX |
|
ADDQ R13, R9 |
|
JMP copy_2_end |
|
|
|
copy_2_move_3: |
|
MOVW (CX), R12 |
|
MOVB 2(CX), R14 |
|
MOVW R12, (R9) |
|
MOVB R14, 2(R9) |
|
ADDQ R13, CX |
|
ADDQ R13, R9 |
|
JMP copy_2_end |
|
|
|
copy_2_move_4through7: |
|
MOVL (CX), R12 |
|
MOVL -4(CX)(R13*1), R14 |
|
MOVL R12, (R9) |
|
MOVL R14, -4(R9)(R13*1) |
|
ADDQ R13, CX |
|
ADDQ R13, R9 |
|
JMP copy_2_end |
|
|
|
copy_2_move_8through16: |
|
MOVQ (CX), R12 |
|
MOVQ -8(CX)(R13*1), R14 |
|
MOVQ R12, (R9) |
|
MOVQ R14, -8(R9)(R13*1) |
|
ADDQ R13, CX |
|
ADDQ R13, R9 |
|
|
|
copy_2_end: |
|
JMP handle_loop |
|
|
|
// Copy overlapping match |
|
copy_overlapping_match: |
|
ADDQ R13, R11 |
|
|
|
copy_slow_3: |
|
MOVB (CX), R12 |
|
MOVB R12, (R9) |
|
INCQ CX |
|
INCQ R9 |
|
DECQ R13 |
|
JNZ copy_slow_3 |
|
|
|
handle_loop: |
|
MOVQ ctx+16(FP), CX |
|
DECQ 96(CX) |
|
JNS sequenceDecs_decodeSync_safe_bmi2_main_loop |
|
|
|
loop_finished: |
|
MOVQ br+8(FP), CX |
|
MOVQ AX, 24(CX) |
|
MOVB DL, 40(CX) |
|
MOVQ BX, 32(CX) |
|
|
|
// Update the context |
|
MOVQ ctx+16(FP), AX |
|
MOVQ R11, 136(AX) |
|
MOVQ 144(AX), CX |
|
SUBQ CX, R10 |
|
MOVQ R10, 168(AX) |
|
|
|
// Return success |
|
MOVQ $0x00000000, ret+24(FP) |
|
RET |
|
|
|
// Return with match length error |
|
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch: |
|
MOVQ 16(SP), AX |
|
MOVQ ctx+16(FP), CX |
|
MOVQ AX, 216(CX) |
|
MOVQ $0x00000001, ret+24(FP) |
|
RET |
|
|
|
// Return with match too long error |
|
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big: |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 16(SP), CX |
|
MOVQ CX, 216(AX) |
|
MOVQ $0x00000002, ret+24(FP) |
|
RET |
|
|
|
// Return with match offset too long error |
|
error_match_off_too_big: |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 8(SP), CX |
|
MOVQ CX, 224(AX) |
|
MOVQ R11, 136(AX) |
|
MOVQ $0x00000003, ret+24(FP) |
|
RET |
|
|
|
// Return with not enough literals error |
|
error_not_enough_literals: |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 24(SP), CX |
|
MOVQ CX, 208(AX) |
|
MOVQ $0x00000004, ret+24(FP) |
|
RET |
|
|
|
// Return with overread error |
|
error_overread: |
|
MOVQ $0x00000006, ret+24(FP) |
|
RET |
|
|
|
// Return with not enough output space error |
|
error_not_enough_space: |
|
MOVQ ctx+16(FP), AX |
|
MOVQ 24(SP), CX |
|
MOVQ CX, 208(AX) |
|
MOVQ 16(SP), CX |
|
MOVQ CX, 216(AX) |
|
MOVQ R11, 136(AX) |
|
MOVQ $0x00000005, ret+24(FP) |
|
RET
|
|
|