kusano fc6ab3
/*
kusano fc6ab3
;uInt longest_match_x64(
kusano fc6ab3
;    deflate_state *s,
kusano fc6ab3
;    IPos cur_match);                             // current match 
kusano fc6ab3
kusano fc6ab3
; gvmat64.S -- Asm portion of the optimized longest_match for 32 bits x86_64
kusano fc6ab3
;  (AMD64 on Athlon 64, Opteron, Phenom
kusano fc6ab3
;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
kusano fc6ab3
; this file is translation from gvmat64.asm to GCC 4.x (for Linux, Mac XCode)
kusano fc6ab3
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
kusano fc6ab3
;
kusano fc6ab3
; File written by Gilles Vollant, by converting to assembly the longest_match
kusano fc6ab3
;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
kusano fc6ab3
;  and by taking inspiration on asm686 with masm, optimised assembly code
kusano fc6ab3
;        from Brian Raiter, written 1998
kusano fc6ab3
;
kusano fc6ab3
;  This software is provided 'as-is', without any express or implied
kusano fc6ab3
;  warranty.  In no event will the authors be held liable for any damages
kusano fc6ab3
;  arising from the use of this software.
kusano fc6ab3
;
kusano fc6ab3
;  Permission is granted to anyone to use this software for any purpose,
kusano fc6ab3
;  including commercial applications, and to alter it and redistribute it
kusano fc6ab3
;  freely, subject to the following restrictions:
kusano fc6ab3
;
kusano fc6ab3
;  1. The origin of this software must not be misrepresented; you must not
kusano fc6ab3
;     claim that you wrote the original software. If you use this software
kusano fc6ab3
;     in a product, an acknowledgment in the product documentation would be
kusano fc6ab3
;     appreciated but is not required.
kusano fc6ab3
;  2. Altered source versions must be plainly marked as such, and must not be
kusano fc6ab3
;     misrepresented as being the original software
kusano fc6ab3
;  3. This notice may not be removed or altered from any source distribution.
kusano fc6ab3
;
kusano fc6ab3
;         http://www.zlib.net
kusano fc6ab3
;         http://www.winimage.com/zLibDll
kusano fc6ab3
;         http://www.muppetlabs.com/~breadbox/software/assembly.html
kusano fc6ab3
;
kusano fc6ab3
; to compile this file for zLib, I use option:
kusano fc6ab3
;   gcc -c -arch x86_64 gvmat64.S
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
;uInt longest_match(s, cur_match)
kusano fc6ab3
;    deflate_state *s;
kusano fc6ab3
;    IPos cur_match;                             // current match /
kusano fc6ab3
;
kusano fc6ab3
; with XCode for Mac, I had strange error with some jump on intel syntax
kusano fc6ab3
; this is why BEFORE_JMP and AFTER_JMP are used
kusano fc6ab3
 */
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//; BEFORE_JMP / AFTER_JMP bracket a jump so that it is assembled in AT&T
//; syntax mode and Intel (noprefix) mode is restored right afterwards. This is
//; the work-around for the XCode jump errors mentioned in the header comment.
#define BEFORE_JMP .att_syntax
kusano fc6ab3
#define AFTER_JMP .intel_syntax noprefix
kusano fc6ab3
kusano fc6ab3
//; Unless NO_UNDERLINE is defined, both entry point names are rewritten with a
//; leading underscore.
#ifndef NO_UNDERLINE
kusano fc6ab3
#	define	match_init	_match_init
kusano fc6ab3
#	define	longest_match	_longest_match
kusano fc6ab3
#endif
kusano fc6ab3
kusano fc6ab3
//; Everything below is written in Intel syntax without register prefixes,
//; except the jumps wrapped in BEFORE_JMP / AFTER_JMP.
.intel_syntax noprefix
kusano fc6ab3
kusano fc6ab3
//; Export both entry points (subject to the underscore renaming above).
.globl	match_init, longest_match
kusano fc6ab3
.text
kusano fc6ab3
//; Entry point of longest_match. Only macro definitions and comments follow
//; until the first instruction (the save of rbx).
longest_match:
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//; LocalVarsSize is subtracted in every slot address below, so all slots sit
//; at negative offsets from rsp (rsp-88 up to rsp-16). No instruction in this
//; file moves rsp, i.e. the slots live below the stack pointer.
//; NOTE(review): this relies on the 128-byte red zone of the System V AMD64
//; ABI for leaf functions; it is not valid on an ABI without a red zone.
#define LocalVarsSize 96
kusano fc6ab3
/*
kusano fc6ab3
; register used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
kusano fc6ab3
; free register :  r14,r15
kusano fc6ab3
; register can be saved : rsp
kusano fc6ab3
*/
kusano fc6ab3
kusano fc6ab3
//; Scratch slots: the packed (chain_length << 16) | wmask value and the
//; clamped nice_match value. Both are written and read as 32-bit values.
#define chainlenwmask     (rsp + 8 - LocalVarsSize)
kusano fc6ab3
#define nicematch         (rsp + 16 - LocalVarsSize)
kusano fc6ab3
kusano fc6ab3
//; Save slots for registers. save_rdi and save_rsi are only referenced by the
//; commented-out restores before the final ret. r14 and r15 are saved and
//; restored although no other instruction in this file touches them.
#define save_rdi        (rsp + 24 - LocalVarsSize)
kusano fc6ab3
#define save_rsi        (rsp + 32 - LocalVarsSize)
kusano fc6ab3
#define save_rbx        (rsp + 40 - LocalVarsSize)
kusano fc6ab3
#define save_rbp        (rsp + 48 - LocalVarsSize)
kusano fc6ab3
#define save_r12        (rsp + 56 - LocalVarsSize)
kusano fc6ab3
#define save_r13        (rsp + 64 - LocalVarsSize)
kusano fc6ab3
#define save_r14        (rsp + 72 - LocalVarsSize)
kusano fc6ab3
#define save_r15        (rsp + 80 - LocalVarsSize)
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
/*
kusano fc6ab3
;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
kusano fc6ab3
;  in the deflate_state structure since the asm code was first written
kusano fc6ab3
;  (if you compile with zlib 1.0.4 or older, remove the +4).
kusano fc6ab3
;  Note : these value are good with a 8 bytes boundary pack structure
kusano fc6ab3
*/
kusano fc6ab3
kusano fc6ab3
//; MAX_MATCH is the length reported when the compare loop runs to its end.
//; MIN_LOOKAHEAD (= 258 + 3 + 1 = 262) is subtracted from the window size
//; when the match-distance limit is computed.
#define    MAX_MATCH              258
kusano fc6ab3
#define    MIN_MATCH              3
kusano fc6ab3
#define    MIN_LOOKAHEAD          (MAX_MATCH+MIN_MATCH+1)
kusano fc6ab3
kusano fc6ab3
/*
kusano fc6ab3
;;; Offsets for fields in the deflate_state structure. These numbers
kusano fc6ab3
;;; are calculated from the definition of deflate_state, with the
kusano fc6ab3
;;; assumption that the compiler will dword-align the fields. (Thus,
kusano fc6ab3
;;; changing the definition of deflate_state could easily cause this
kusano fc6ab3
;;; program to crash horribly, without so much as a warning at
kusano fc6ab3
;;; compile time. Sigh.)
kusano fc6ab3
kusano fc6ab3
;  all the +zlib1222add offsets are due to the addition of fields
kusano fc6ab3
;  in zlib in the deflate_state structure since the asm code was first written
kusano fc6ab3
;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
kusano fc6ab3
;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
kusano fc6ab3
;  (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
kusano fc6ab3
*/
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
/* you can check the structure offset by running
kusano fc6ab3
kusano fc6ab3
#include <stdlib.h>
kusano fc6ab3
#include <stdio.h>
kusano fc6ab3
#include "deflate.h"
kusano fc6ab3
kusano fc6ab3
void print_depl()
kusano fc6ab3
{
kusano fc6ab3
deflate_state ds;
kusano fc6ab3
deflate_state *s=&ds;
kusano fc6ab3
printf("size pointer=%u\n",(int)sizeof(void*));
kusano fc6ab3
kusano fc6ab3
printf("#define dsWSize         %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
kusano fc6ab3
printf("#define dsWMask         %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
kusano fc6ab3
printf("#define dsWindow        %u\n",(int)(((char*)&(s->window))-((char*)s)));
kusano fc6ab3
printf("#define dsPrev          %u\n",(int)(((char*)&(s->prev))-((char*)s)));
kusano fc6ab3
printf("#define dsMatchLen      %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
kusano fc6ab3
printf("#define dsPrevMatch     %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
kusano fc6ab3
printf("#define dsStrStart      %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
kusano fc6ab3
printf("#define dsMatchStart    %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
kusano fc6ab3
printf("#define dsLookahead     %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
kusano fc6ab3
printf("#define dsPrevLen       %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
kusano fc6ab3
printf("#define dsMaxChainLen   %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
kusano fc6ab3
printf("#define dsGoodMatch     %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
kusano fc6ab3
printf("#define dsNiceMatch     %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
kusano fc6ab3
}
kusano fc6ab3
*/
kusano fc6ab3
kusano fc6ab3
//; Hard-coded byte offsets of deflate_state fields. They must match the
//; structure layout of the zlib build being linked; the print_depl() helper
//; in the comment above prints the values to use.
//; dsMatchLen and dsPrevMatch are defined but not used by any accessor below.
#define dsWSize          68
kusano fc6ab3
#define dsWMask          76
kusano fc6ab3
#define dsWindow         80
kusano fc6ab3
#define dsPrev           96
kusano fc6ab3
#define dsMatchLen       144
kusano fc6ab3
#define dsPrevMatch      148
kusano fc6ab3
#define dsStrStart       156
kusano fc6ab3
#define dsMatchStart     160
kusano fc6ab3
#define dsLookahead      164
kusano fc6ab3
#define dsPrevLen        168
kusano fc6ab3
#define dsMaxChainLen    172
kusano fc6ab3
#define dsGoodMatch      188
kusano fc6ab3
#define dsNiceMatch      192
kusano fc6ab3
kusano fc6ab3
//; Field accessors. All of them address through rcx, which longest_match
//; loads with its first argument (the deflate_state pointer) and never
//; overwrites afterwards. The operand size comes from the other operand of
//; the instruction that uses the accessor.
#define window_size      [ rcx + dsWSize]
kusano fc6ab3
#define WMask            [ rcx + dsWMask]
kusano fc6ab3
#define window_ad        [ rcx + dsWindow]
kusano fc6ab3
#define prev_ad          [ rcx + dsPrev]
kusano fc6ab3
#define strstart         [ rcx + dsStrStart]
kusano fc6ab3
#define match_start      [ rcx + dsMatchStart]
kusano fc6ab3
#define Lookahead        [ rcx + dsLookahead] //; 0ffffffffh on infozip
kusano fc6ab3
#define prev_length      [ rcx + dsPrevLen]
kusano fc6ab3
#define max_chain_length [ rcx + dsMaxChainLen]
kusano fc6ab3
#define good_match       [ rcx + dsGoodMatch]
kusano fc6ab3
#define nice_match       [ rcx + dsNiceMatch]
kusano fc6ab3
kusano fc6ab3
/*
kusano fc6ab3
; windows:
kusano fc6ab3
; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)
kusano fc6ab3
kusano fc6ab3
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
kusano fc6ab3
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
kusano fc6ab3
;
kusano fc6ab3
; All registers must be preserved across the call, except for
kusano fc6ab3
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
kusano fc6ab3
kusano fc6ab3
;
kusano fc6ab3
; gcc on macosx-linux:
kusano fc6ab3
; see http://www.x86-64.org/documentation/abi-0.99.pdf
kusano fc6ab3
; param 1 in rdi, param 2 in rsi
kusano fc6ab3
; rbx, rsp, rbp, r12 to r15 must be preserved
kusano fc6ab3
kusano fc6ab3
;;; Save registers that the compiler may be using. rsp is not adjusted:
kusano fc6ab3
;;; the save slots and locals are addressed at negative offsets from rsp.
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
;;; Retrieve the function arguments. r8d will hold cur_match
kusano fc6ab3
;;; throughout the entire function. rcx will hold the pointer to the
kusano fc6ab3
;;; deflate_state structure, both during the function's setup and inside
kusano fc6ab3
;;; the main loop.
kusano fc6ab3
kusano fc6ab3
; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
kusano fc6ab3
; mac: param 1 in rdi, param 2 rsi
kusano fc6ab3
; this clears the high 32 bits of r8, which can be garbage in both r8 and rdx
kusano fc6ab3
*/
kusano fc6ab3
        //; Save rbx and rbp into their slots below rsp; both are overwritten
        //; later (ebx = scan_end, ebp = limit).
        mov [save_rbx],rbx
kusano fc6ab3
        mov [save_rbp],rbp
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
        //; rcx = s (first argument, received in rdi). Every field accessor
        //; macro addresses through rcx.
        mov rcx,rdi
kusano fc6ab3
kusano fc6ab3
        //; r8d = cur_match (second argument, received in esi). The 32-bit
        //; move zeroes the upper half of r8.
        mov r8d,esi
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
        //; Save r12..r15. r12 and r13 are used below; r14 and r15 are only
        //; saved here and restored before ret.
        mov [save_r12],r12
kusano fc6ab3
        mov [save_r13],r13
kusano fc6ab3
        mov [save_r14],r14
kusano fc6ab3
        mov [save_r15],r15
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//;;; uInt wmask = s->w_mask;
kusano fc6ab3
//;;; unsigned chain_length = s->max_chain_length;
kusano fc6ab3
//;;; if (s->prev_length >= s->good_match) {
kusano fc6ab3
//;;;     chain_length >>= 2;
kusano fc6ab3
//;;; }
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
        //; edi = prev_length, esi = good_match, eax = wmask,
        //; ebx = chain_length. rdi and rsi are free for reuse because the
        //; arguments were already copied to rcx and r8d.
        mov edi, prev_length
kusano fc6ab3
        mov esi, good_match
kusano fc6ab3
        mov eax, WMask
kusano fc6ab3
        mov ebx, max_chain_length
kusano fc6ab3
        //; Skip the shift when prev_length < good_match.
        //; NOTE(review): jl is a signed comparison; it matches the C
        //; condition above only while both values are below 2^31.
        cmp edi, esi
kusano fc6ab3
        jl  LastMatchGood
kusano fc6ab3
        shr ebx, 2
kusano fc6ab3
LastMatchGood:
kusano fc6ab3
kusano fc6ab3
//;;; chainlen is decremented once beforehand so that the function can
kusano fc6ab3
//;;; use the sign flag instead of the zero flag for the exit test.
kusano fc6ab3
//;;; It is then shifted into the high word, to make room for the wmask
kusano fc6ab3
//;;; value, which it will always accompany.
kusano fc6ab3
kusano fc6ab3
        //; ebx = ((chain_length - 1) << 16) | wmask
        dec ebx
kusano fc6ab3
        shl ebx, 16
kusano fc6ab3
        or  ebx, eax
kusano fc6ab3
kusano fc6ab3
//;;; on zlib only
kusano fc6ab3
//;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
        mov eax, nice_match
kusano fc6ab3
        //; Spill the packed chain-length/wmask value; ebx is reused for
        //; scan_end further down.
        mov [chainlenwmask], ebx
kusano fc6ab3
        //; r10d = lookahead, replaced by nice_match when
        //; lookahead >= nice_match, i.e. r10d = the smaller of the two.
        //; NOTE(review): cmovnl tests the signed condition while the C
        //; reference casts to uInt; identical only for values below 2^31.
        mov r10d, Lookahead
kusano fc6ab3
        cmp r10d, eax
kusano fc6ab3
        cmovnl r10d, eax
kusano fc6ab3
        mov [nicematch],r10d
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//;;; register Bytef *scan = s->window + s->strstart;
kusano fc6ab3
        //; r10 = s->window (kept for the rest of the function),
        //; ebp = s->strstart (zero-extended into rbp), r13 = scan.
        mov r10, window_ad
kusano fc6ab3
        mov ebp, strstart
kusano fc6ab3
        lea r13, [r10 + rbp]
kusano fc6ab3
kusano fc6ab3
//;;; Determine how many bytes the scan ptr is off from being
kusano fc6ab3
//;;; dword-aligned.
kusano fc6ab3
kusano fc6ab3
         //; r9 keeps scan; r13 = (-scan) & 3, the number of bytes (0..3)
         //; from scan up to the next 4-byte boundary.
         mov r9,r13
kusano fc6ab3
         neg r13
kusano fc6ab3
         and r13,3
kusano fc6ab3
kusano fc6ab3
//;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
kusano fc6ab3
//;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
        //; eax = w_size - MIN_LOOKAHEAD (presumably MAX_DIST(s) of the C
        //; reference above -- confirm against deflate.h).
        mov eax, window_size
kusano fc6ab3
        sub eax, MIN_LOOKAHEAD
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
        //; edi = 0 is the replacement value for the cmov below. The xor
        //; must stay before the sub because it changes the flags.
        xor edi,edi
kusano fc6ab3
        //; ebp = strstart - eax; the flags of this sub feed the cmovng.
        sub ebp, eax
kusano fc6ab3
kusano fc6ab3
        //; r11d = best_len = s->prev_length. A mov leaves the flags intact.
        mov r11d, prev_length
kusano fc6ab3
kusano fc6ab3
        //; limit = 0 when the subtraction result was <= 0 (signed test).
        cmovng ebp,edi
kusano fc6ab3
kusano fc6ab3
//;;; int best_len = s->prev_length;
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//;;; Store the sum of s->window + best_len in rsi (r10 = window,
//;;; r11 = best_len, both loaded earlier).
kusano fc6ab3
kusano fc6ab3
       lea  rsi,[r10+r11]
kusano fc6ab3
kusano fc6ab3
//;;; register ush scan_start = *(ushf*)scan;
kusano fc6ab3
//;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
kusano fc6ab3
//;;; Posf *prev = s->prev;
kusano fc6ab3
kusano fc6ab3
        //; r12d = scan_start, ebx = scan_end (both zero-extended 16-bit
        //; loads through r9 = scan), rdi = prev.
        movzx r12d,word ptr [r9]
kusano fc6ab3
        movzx ebx, word ptr [r9 + r11 - 1]
kusano fc6ab3
kusano fc6ab3
        mov rdi, prev_ad
kusano fc6ab3
kusano fc6ab3
//;;; Jump into the main loop.
kusano fc6ab3
kusano fc6ab3
        //; edx = (chain_length << 16) | wmask, reloaded from its slot.
        mov edx, [chainlenwmask]
kusano fc6ab3
kusano fc6ab3
        //; First candidate: compare scan_end with the two bytes at
        //; window + cur_match + best_len - 1. On a hit continue with the
        //; scan_start test, otherwise fall through into LookupLoop1.
        cmp bx,word ptr [rsi + r8 - 1]
kusano fc6ab3
        jz  LookupLoopIsZero
kusano fc6ab3
				
kusano fc6ab3
						
kusano fc6ab3
						
kusano fc6ab3
//; Three unrolled copies of the hash-chain step. Each copy:
//;   1. masks cur_match (r8d) with edx, whose low 16 bits hold wmask,
//;   2. loads the next candidate, cur_match = prev[cur_match & wmask],
//;   3. leaves when cur_match <= limit (ebp), unsigned compare,
//;   4. subtracts 1 from the chain counter in the high word of edx and
//;      leaves when the result is negative,
//;   5. compares scan_end (bx) with the word at
//;      window + cur_match + best_len - 1 (rsi = window + best_len).
//; LoopEntry1, LookupLoop2, LoopEntry2, LookupLoop4 and LoopEntry4 are reached
//; by fall-through only; no jump in this file targets them.
LookupLoop1:
kusano fc6ab3
        and r8d, edx
kusano fc6ab3
kusano fc6ab3
        movzx   r8d, word ptr [rdi + r8*2]
kusano fc6ab3
        cmp r8d, ebp
kusano fc6ab3
        //; Unlike the other copies, this jbe is not wrapped in
        //; BEFORE_JMP / AFTER_JMP.
        jbe LeaveNow
kusano fc6ab3
		
kusano fc6ab3
		
kusano fc6ab3
		
kusano fc6ab3
        sub edx, 0x00010000
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        js  LeaveNow
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
LoopEntry1:
kusano fc6ab3
        cmp bx,word ptr [rsi + r8 - 1]
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jz  LookupLoopIsZero
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
//; Second unrolled copy.
LookupLoop2:
kusano fc6ab3
        and r8d, edx
kusano fc6ab3
kusano fc6ab3
        movzx   r8d, word ptr [rdi + r8*2]
kusano fc6ab3
        cmp r8d, ebp
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jbe LeaveNow
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
        sub edx, 0x00010000
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        js  LeaveNow
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
LoopEntry2:
kusano fc6ab3
        cmp bx,word ptr [rsi + r8 - 1]
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jz  LookupLoopIsZero
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
//; Third unrolled copy. Its closing test is inverted: a mismatch loops back to
//; LookupLoop1, a hit jumps to the scan_start test.
LookupLoop4:
kusano fc6ab3
        and r8d, edx
kusano fc6ab3
kusano fc6ab3
        movzx   r8d, word ptr [rdi + r8*2]
kusano fc6ab3
        cmp r8d, ebp
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jbe LeaveNow
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
        sub edx, 0x00010000
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        js  LeaveNow
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
LoopEntry4:
kusano fc6ab3
kusano fc6ab3
        cmp bx,word ptr [rsi + r8 - 1]
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jnz LookupLoop1
kusano fc6ab3
        jmp LookupLoopIsZero
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
/*
kusano fc6ab3
;;; do {
kusano fc6ab3
;;;     match = s->window + cur_match;
kusano fc6ab3
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
kusano fc6ab3
;;;         *(ushf*)match != scan_start) continue;
kusano fc6ab3
;;;     [...]
kusano fc6ab3
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
kusano fc6ab3
;;;          && --chain_length != 0);
kusano fc6ab3
;;;
kusano fc6ab3
;;; Here is the inner loop of the function. The function will spend the
kusano fc6ab3
;;; majority of its time in this loop, and majority of that time will
kusano fc6ab3
;;; be spent in the first ten instructions.
kusano fc6ab3
;;;
kusano fc6ab3
;;; Within this loop:
kusano fc6ab3
;;; ebx = scanend
kusano fc6ab3
;;; r8d = curmatch
kusano fc6ab3
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
kusano fc6ab3
;;; esi = windowbestlen - i.e., (window + bestlen)
kusano fc6ab3
;;; edi = prev
kusano fc6ab3
;;; ebp = limit
kusano fc6ab3
*/
kusano fc6ab3
//; Re-entry point used after a candidate has been measured (both jumps to
//; LookupLoop come from the length-handling code further down). The alignment
//; padding is never executed because the preceding instruction is an
//; unconditional jmp.
.balign 16
kusano fc6ab3
LookupLoop:
kusano fc6ab3
        //; cur_match = prev[cur_match & wmask]
        and r8d, edx
kusano fc6ab3
kusano fc6ab3
        movzx   r8d, word ptr [rdi + r8*2]
kusano fc6ab3
        //; Leave when cur_match <= limit (unsigned).
        cmp r8d, ebp
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jbe LeaveNow
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
        //; Decrement the chain counter held in the high word of edx; leave
        //; when it turns negative.
        sub edx, 0x00010000
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        js  LeaveNow
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
LoopEntry:
kusano fc6ab3
kusano fc6ab3
        //; scan_end test; a mismatch continues in the unrolled chain at
        //; LookupLoop1.
        cmp bx,word ptr [rsi + r8 - 1]
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jnz LookupLoop1
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
LookupLoopIsZero:
kusano fc6ab3
        //; scan_start test: r12w against the first two bytes of the
        //; candidate at window + cur_match.
        cmp     r12w, word ptr [r10 + r8]
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jnz LookupLoop1
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//;;; Store the current value of chainlen (edx is reused as the compare index
//;;; below and reloaded from this slot afterwards).
kusano fc6ab3
        mov [chainlenwmask], edx
kusano fc6ab3
/*
kusano fc6ab3
;;; Point edi to the string under scrutiny, and esi to the string we
kusano fc6ab3
;;; are hoping to match it up with. In actuality, esi and edi are
kusano fc6ab3
;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
kusano fc6ab3
;;; initialized to -(MAX_MATCH_8 - scanalign).
kusano fc6ab3
*/
kusano fc6ab3
        //; rsi = window + cur_match (start of the candidate string).
        lea rsi,[r8+r10]
kusano fc6ab3
        //; rdx = -0x108 (-264): negative index that counts up towards zero.
        mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8)
kusano fc6ab3
        //; Both pointers are advanced by r13 + 0x108, so [rsi + rdx] and
        //; [rdi + rdx] start at candidate + r13 and scan + r13, where r13 is
        //; the 0..3 byte alignment skip computed earlier.
        lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
kusano fc6ab3
        lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]
kusano fc6ab3
kusano fc6ab3
        //; Prefetch hints for the first bytes of both strings.
        prefetcht1 [rsi+rdx]
kusano fc6ab3
        prefetcht1 [rdi+rdx]
kusano fc6ab3
kusano fc6ab3
/*
kusano fc6ab3
;;; Test the strings for equality, 8 bytes at a time. At the end,
kusano fc6ab3
;;; adjust rdx so that it is offset to the exact byte that mismatched.
kusano fc6ab3
;;;
kusano fc6ab3
;;; We already know at this point that the first three bytes of the
kusano fc6ab3
;;; strings match each other, and they can be safely passed over before
kusano fc6ab3
;;; starting the compare loop. So what this code does is skip over 0-3
kusano fc6ab3
;;; bytes, as much as necessary in order to dword-align the edi
kusano fc6ab3
;;; pointer. (rsi will still be misaligned three times out of four.)
kusano fc6ab3
;;;
kusano fc6ab3
;;; It should be confessed that this loop usually does not represent
kusano fc6ab3
;;; much of the total running time. Replacing it with a more
kusano fc6ab3
;;; straightforward "rep cmpsb" would not drastically degrade
kusano fc6ab3
;;; performance.
kusano fc6ab3
*/
kusano fc6ab3
kusano fc6ab3
//; Compare 24 bytes per iteration as three 8-byte words. The XOR of two words
//; is non-zero exactly when they differ, and rax then holds the difference
//; bits for the exit code below. rdx starts at -264 and grows by 24, so it
//; reaches zero after 11 iterations (264 bytes) without a mismatch.
LoopCmps:
kusano fc6ab3
        mov rax, [rsi + rdx]
kusano fc6ab3
        xor rax, [rdi + rdx]
kusano fc6ab3
        jnz LeaveLoopCmps
kusano fc6ab3
kusano fc6ab3
        mov rax, [rsi + rdx + 8]
kusano fc6ab3
        xor rax, [rdi + rdx + 8]
kusano fc6ab3
        jnz LeaveLoopCmps8
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
        mov rax, [rsi + rdx + 8+8]
kusano fc6ab3
        xor rax, [rdi + rdx + 8+8]
kusano fc6ab3
        jnz LeaveLoopCmps16
kusano fc6ab3
kusano fc6ab3
        //; The flags of this add drive the jnz: loop until rdx reaches zero.
        add rdx,8+8+8
kusano fc6ab3
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jnz  LoopCmps
kusano fc6ab3
        jmp  LenMaximum
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
		
kusano fc6ab3
//; Mismatch exits. Falling through the two add instructions makes rdx index
//; the 8-byte word that differed (16, 8 or 0 bytes past the loop position).
LeaveLoopCmps16: add rdx,8
kusano fc6ab3
LeaveLoopCmps8: add rdx,8
kusano fc6ab3
LeaveLoopCmps:
kusano fc6ab3
kusano fc6ab3
        //; rax holds the non-zero XOR of the differing words. Narrow it down
        //; to the lowest non-zero 16-bit group: bytes 0-1 first ...
        test    eax, 0x0000FFFF
kusano fc6ab3
        jnz LenLower
kusano fc6ab3
kusano fc6ab3
        //; ... then bytes 2-3 ...
        test eax,0xffffffff
kusano fc6ab3
kusano fc6ab3
        jnz LenLower32
kusano fc6ab3
kusano fc6ab3
        //; ... otherwise the difference is in the upper dword: skip 4 bytes
        //; and repeat the 16-bit test on it.
        add rdx,4
kusano fc6ab3
        shr rax,32
kusano fc6ab3
        or ax,ax
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jnz LenLower
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
//; The low 16 bits are equal: move to the next 16-bit group.
LenLower32:
kusano fc6ab3
        shr eax,16
kusano fc6ab3
        add rdx,2
kusano fc6ab3
		
kusano fc6ab3
LenLower:		
kusano fc6ab3
        //; sub al,1 sets the carry flag only when al is zero, i.e. when the
        //; first byte of the pair matched; adc then adds that one byte so
        //; rdx is the offset of the first differing byte.
        sub al, 1
kusano fc6ab3
        adc rdx, 0
kusano fc6ab3
//;;; Calculate the length of the match. If it is longer than MAX_MATCH,
kusano fc6ab3
//;;; then automatically accept it as the best possible match and leave.
kusano fc6ab3
kusano fc6ab3
        //; rax = (rdi + rdx) - scan = number of matching bytes (r9 = scan).
        lea rax, [rdi + rdx]
kusano fc6ab3
        sub rax, r9
kusano fc6ab3
        //; The jump is taken for len >= MAX_MATCH (jge, signed).
        cmp eax, MAX_MATCH
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jge LenMaximum
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
/*
kusano fc6ab3
;;; If the length of the match is not longer than the best match we
kusano fc6ab3
;;; have so far, then forget it and return to the lookup loop.
kusano fc6ab3
;///////////////////////////////////
kusano fc6ab3
*/
kusano fc6ab3
        //; len (eax) > best_len (r11d), signed compare: record the match.
        cmp eax, r11d
kusano fc6ab3
        jg  LongerMatch
kusano fc6ab3
kusano fc6ab3
        //; Not longer: restore the registers the compare loop overwrote
        //; (rsi = window + best_len, rdi = prev, edx = chain/wmask) and
        //; continue walking the chain.
        lea rsi,[r10+r11]
kusano fc6ab3
kusano fc6ab3
        mov rdi, prev_ad
kusano fc6ab3
        mov edx, [chainlenwmask]
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jmp LookupLoop
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
/*
kusano fc6ab3
;;;         s->match_start = cur_match;
kusano fc6ab3
;;;         best_len = len;
kusano fc6ab3
;;;         if (len >= nice_match) break;
kusano fc6ab3
;;;         scan_end = *(ushf*)(scan+best_len-1);
kusano fc6ab3
*/
kusano fc6ab3
//; A longer match was found: best_len = len and s->match_start = cur_match.
LongerMatch:
kusano fc6ab3
        mov r11d, eax
kusano fc6ab3
        mov match_start, r8d
kusano fc6ab3
        //; Stop searching once len >= the clamped nice_match (signed jge).
        cmp eax, [nicematch]
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jge LeaveNow
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
        //; rsi = window + best_len for the new best_len.
        lea rsi,[r10+rax]
kusano fc6ab3
kusano fc6ab3
        //; ebx = new scan_end = *(ushf*)(scan + best_len - 1); rdi and edx
        //; are restored for the lookup loop.
        movzx   ebx, word ptr [r9 + rax - 1]
kusano fc6ab3
        mov rdi, prev_ad
kusano fc6ab3
        mov edx, [chainlenwmask]
kusano fc6ab3
		BEFORE_JMP
kusano fc6ab3
        jmp LookupLoop
kusano fc6ab3
		AFTER_JMP
kusano fc6ab3
kusano fc6ab3
//;;; Accept the current string, with the maximum possible length.
kusano fc6ab3
kusano fc6ab3
LenMaximum:
kusano fc6ab3
        //; best_len = MAX_MATCH, s->match_start = cur_match, then fall
        //; through to the common exit.
        mov r11d,MAX_MATCH
kusano fc6ab3
        mov match_start, r8d
kusano fc6ab3
kusano fc6ab3
//;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
kusano fc6ab3
//;;; return s->lookahead;
kusano fc6ab3
kusano fc6ab3
LeaveNow:
kusano fc6ab3
        //; eax = lookahead, replaced by best_len when best_len <= lookahead;
        //; eax is the return value.
        //; NOTE(review): cmovng is a signed test while the C reference
        //; casts to uInt; identical only for values below 2^31.
        mov eax, Lookahead
kusano fc6ab3
        cmp r11d, eax
kusano fc6ab3
        cmovng eax, r11d
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//;;; Restore the saved registers and return from whence we came (rsp was never
//;;; changed, so there is no stack adjustment to undo).
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//        mov rsi,[save_rsi]
kusano fc6ab3
//        mov rdi,[save_rdi]
kusano fc6ab3
        mov rbx,[save_rbx]
kusano fc6ab3
        mov rbp,[save_rbp]
kusano fc6ab3
        mov r12,[save_r12]
kusano fc6ab3
        mov r13,[save_r13]
kusano fc6ab3
        mov r14,[save_r14]
kusano fc6ab3
        mov r15,[save_r15]
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
        ret 0
kusano fc6ab3
//; please don't remove this string !
kusano fc6ab3
//; You can freely use gvmat64 in any free or commercial app
kusano fc6ab3
//; but it is far better not to remove the string from the binary!
kusano fc6ab3
 //   db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
//; match_init: exported alongside longest_match; it returns immediately
//; without touching any register or memory.
match_init:
kusano fc6ab3
  ret 0
kusano fc6ab3
kusano fc6ab3