;uInt longest_match_x64(
;    deflate_state *s,
;    IPos cur_match);                             /* current match */

; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
;  (AMD64 on Athlon 64, Opteron, Phenom
;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
;
; File written by Gilles Vollant, by converting to assembly the longest_match
;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
;
;  and by taking inspiration on asm686 with masm, optimised assembly code
;        from Brian Raiter, written 1998
;
;  This software is provided 'as-is', without any express or implied
;  warranty.  In no event will the authors be held liable for any damages
;  arising from the use of this software.
;
;  Permission is granted to anyone to use this software for any purpose,
;  including commercial applications, and to alter it and redistribute it
;  freely, subject to the following restrictions:
;
;  1. The origin of this software must not be misrepresented; you must not
;     claim that you wrote the original software. If you use this software
;     in a product, an acknowledgment in the product documentation would be
;     appreciated but is not required.
;  2. Altered source versions must be plainly marked as such, and must not be
;     misrepresented as being the original software
;  3. This notice may not be removed or altered from any source distribution.
;
;
;
;         http://www.zlib.net
;         http://www.winimage.com/zLibDll
;         http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; to compile this file for infozip Zip, I use option:
;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
;
; to compile this file for zLib, I use option:
;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
; Be careful to adapt zlib1222add below to your version of zLib
;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
;    value of zlib1222add later)
;
; This file compiles with Microsoft Macro Assembler (x64) for AMD64
;
;   ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
;
;   (you can get Windows WDK with ml64 for AMD64 from
;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;


;uInt longest_match(s, cur_match)
;    deflate_state *s;
;    IPos cur_match;                             /* current match */
.code
kusano fc6ab3
;-----------------------------------------------------------------------
; uInt longest_match(deflate_state *s, IPos cur_match)
;
; ABI:   Microsoft x64 (assembled with ml64).
; In:    zlib build    : rcx = s, edx = cur_match
;        INFOZIP build : ecx = cur_match; the state is read from the
;                        COMM globals declared below
; Out:   eax = length of the best match found (zlib build: never more
;        than s->lookahead).  match_start is written each time a longer
;        match is accepted.
; Saved and restored: rdi, rsi, rbx, rbp, r12, r13
; Scratch used:       rax, rdx, r8, r9, r10, r11 (rcx is only read)
; Stack: LocalVarsSize bytes are allocated in the prolog and released
;        in the epilog.  The procedure makes no calls.
;
; ALTERED SOURCE NOTE: this procedure differs from the original
; gvmat64.asm.  The original addressed its locals and register-save
; slots *below* rsp without moving rsp; the Microsoft x64 convention
; has no red zone, so that memory is volatile.  This version allocates
; a real frame (sub rsp / add rsp) and declares it with PROC FRAME
; unwind directives.  The matching algorithm itself is unchanged.
;-----------------------------------------------------------------------

;;; ---- stack frame layout (offsets from rsp after the prolog) ----------

 LocalVarsSize   equ 72                 ; keeps rsp 16-byte aligned after entry

 chainlenwmask   equ  rsp + 8           ; high word: current chain len
                                        ; low word: s->wmask
IFDEF INFOZIP
ELSE
 nicematch       equ  (rsp + 16)        ; a good enough match size
ENDIF

 SaveOfsRdi      equ  24
 SaveOfsRsi      equ  32
 SaveOfsRbx      equ  40
 SaveOfsRbp      equ  48
 SaveOfsR12      equ  56
 SaveOfsR13      equ  64

 save_rdi        equ  rsp + SaveOfsRdi
 save_rsi        equ  rsp + SaveOfsRsi
 save_rbx        equ  rsp + SaveOfsRbx
 save_rbp        equ  rsp + SaveOfsRbp
 save_r12        equ  rsp + SaveOfsR12
 save_r13        equ  rsp + SaveOfsR13

;;; ---- register roles once setup is complete ---------------------------
;;;   ebx / bx  scan_end   (two bytes at scan + best_len - 1)
;;;   edx       chainlenwmask, i.e. ((chainlen << 16) | wmask)
;;;   r8d       cur_match
;;;   r9        scan       (window + strstart)
;;;   r10       window
;;;   r11d      best_len
;;;   r12w      scan_start (first two bytes at scan)
;;;   r13       scanalign  ((-scan) & 3)
;;;   rsi       window + best_len
;;;   rdi       prev
;;;   ebp       limit
;;;   rcx       s (zlib build); never modified

    MAX_MATCH           equ     258
    MIN_MATCH           equ     3
    MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
    MAX_MATCH_8         equ     0108h   ; 264: span covered by the 8-byte compare loop

;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)
;;; Note: these values are good with an 8-byte boundary packed structure.

;  All the +zlib1222add offsets are due to the addition of fields
;  in zlib in the deflate_state structure since the asm code was first written
;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
;  (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").

IFDEF INFOZIP

_DATA   SEGMENT
COMM    window_size:DWORD
; WMask ; 7fff
COMM    window:BYTE:010040H
COMM    prev:WORD:08000H
; MatchLen : unused
; PrevMatch : unused
COMM    strstart:DWORD
COMM    match_start:DWORD
; Lookahead : ignore
COMM    prev_length:DWORD ; PrevLen
COMM    max_chain_length:DWORD
COMM    good_match:DWORD
COMM    nice_match:DWORD
prev_ad equ OFFSET prev
window_ad equ OFFSET window
nicematch equ nice_match
_DATA ENDS
WMask equ 07fffh

ELSE

  IFNDEF zlib1222add
    zlib1222add equ 8
  ENDIF
dsWSize         equ 56+zlib1222add+(zlib1222add/2)
dsWMask         equ 64+zlib1222add+(zlib1222add/2)
dsWindow        equ 72+zlib1222add
dsPrev          equ 88+zlib1222add
dsMatchLen      equ 128+zlib1222add
dsPrevMatch     equ 132+zlib1222add
dsStrStart      equ 140+zlib1222add
dsMatchStart    equ 144+zlib1222add
dsLookahead     equ 148+zlib1222add
dsPrevLen       equ 152+zlib1222add
dsMaxChainLen   equ 156+zlib1222add
dsGoodMatch     equ 172+zlib1222add
dsNiceMatch     equ 176+zlib1222add

window_size     equ [ rcx + dsWSize]
WMask           equ [ rcx + dsWMask]
window_ad       equ [ rcx + dsWindow]
prev_ad         equ [ rcx + dsPrev]
strstart        equ [ rcx + dsStrStart]
match_start     equ [ rcx + dsMatchStart]
Lookahead       equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
prev_length     equ [ rcx + dsPrevLen]
max_chain_length equ [ rcx + dsMaxChainLen]
good_match      equ [ rcx + dsGoodMatch]
nice_match      equ [ rcx + dsNiceMatch]
ENDIF

longest_match PROC FRAME

;;; Prolog: allocate the frame and save every non-volatile register this
;;; procedure overwrites.  The unwind directives describe exactly these
;;; stores so the frame can be unwound.

        sub     rsp, LocalVarsSize
        .allocstack LocalVarsSize
        mov     [save_rdi], rdi
        .savereg rdi, SaveOfsRdi
        mov     [save_rsi], rsi
        .savereg rsi, SaveOfsRsi
        mov     [save_rbx], rbx
        .savereg rbx, SaveOfsRbx
        mov     [save_rbp], rbp
        .savereg rbp, SaveOfsRbp
        mov     [save_r12], r12
        .savereg r12, SaveOfsR12
        mov     [save_r13], r13
        .savereg r13, SaveOfsR13
        .endprolog

;;; r8d holds cur_match for the whole function.  The 32-bit move also
;;; clears the high 32 bits of r8, which may be garbage in the argument
;;; register.

IFDEF INFOZIP
        mov     r8d, ecx
ELSE
        mov     r8d, edx
ENDIF

;;; uInt wmask = s->w_mask;
;;; unsigned chain_length = s->max_chain_length;
;;; if (s->prev_length >= s->good_match) {
;;;     chain_length >>= 2;
;;; }

        mov     edi, prev_length
        mov     esi, good_match
        mov     eax, WMask
        mov     ebx, max_chain_length
        cmp     edi, esi
        jl      LastMatchGood
        shr     ebx, 2
LastMatchGood:

;;; chainlen is decremented once beforehand so that the function can
;;; use the sign flag instead of the zero flag for the exit test.
;;; It is then shifted into the high word, to make room for the wmask
;;; value, which it will always accompany.

        dec     ebx
        shl     ebx, 16
        or      ebx, eax

;;; on zlib only
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;

IFDEF INFOZIP
        mov     [chainlenwmask], ebx
; on infozip nicematch is the nice_match global itself
ELSE
        mov     eax, nice_match
        mov     [chainlenwmask], ebx
        mov     r10d, Lookahead
        cmp     r10d, eax
        cmovnl  r10d, eax               ; r10d = min(lookahead, nice_match)
        mov     [nicematch], r10d
ENDIF

;;; register Bytef *scan = s->window + s->strstart;

        mov     r10, window_ad
        mov     ebp, strstart
        lea     r13, [r10 + rbp]

;;; Determine how many bytes the scan ptr is off from being
;;; dword-aligned: r9 = scan, r13 = (-scan) & 3.

        mov     r9, r13
        neg     r13
        and     r13, 3

;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;

IFDEF INFOZIP
        mov     eax, 07efah             ; MAX_DIST = 08000h - MIN_LOOKAHEAD (262)
ELSE
        mov     eax, window_size
        sub     eax, MIN_LOOKAHEAD
ENDIF
        xor     edi, edi
        sub     ebp, eax                ; flags from this sub feed the cmovng below

;;; int best_len = s->prev_length;

        mov     r11d, prev_length       ; mov leaves the flags untouched

        cmovng  ebp, edi                ; limit = 0 when strstart <= MAX_DIST

;;; rsi = window + best_len

        lea     rsi, [r10 + r11]

;;; register ush scan_start = *(ushf*)scan;
;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
;;; Posf *prev = s->prev;

        movzx   r12d, word ptr [r9]
        movzx   ebx, word ptr [r9 + r11 - 1]

        mov     rdi, prev_ad

;;; Jump into the main loop.

        mov     edx, [chainlenwmask]

        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

;;; do {
;;;     match = s->window + cur_match;
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
;;;         *(ushf*)match != scan_start) continue;
;;;     [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;;          && --chain_length != 0);
;;;
;;; Here is the inner loop of the function, unrolled.  Each copy steps
;;; to the next chain entry, leaves when cur_match <= limit or when the
;;; chain counter in the high word of edx goes negative, and otherwise
;;; tests the scan_end word of the candidate.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; r8d = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; rsi = windowbestlen - i.e., (window + bestlen)
;;; rdi = prev
;;; ebp = limit

LookupLoop1:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry1:
        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

LookupLoop2:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry2:
        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

LookupLoop4:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry4:

        cmp     bx, word ptr [rsi + r8 - 1]
        jnz     LookupLoop1
        jmp     LookupLoopIsZero

LookupLoop:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry:

        cmp     bx, word ptr [rsi + r8 - 1]
        jnz     LookupLoop1
LookupLoopIsZero:
        cmp     r12w, word ptr [r10 + r8]   ; scan_start vs first word of candidate
        jnz     LookupLoop1

;;; Store the current value of chainlen (edx is reused below).
        mov     [chainlenwmask], edx

;;; Point rdi to the string under scrutiny, and rsi to the string we
;;; are hoping to match it up with. In actuality, rsi and rdi are
;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
;;; initialized to -(MAX_MATCH_8), so [rsi+rdx] / [rdi+rdx] start at
;;; match + scanalign / scan + scanalign.

        lea     rsi, [r8 + r10]
        mov     rdx, -MAX_MATCH_8
        lea     rsi, [rsi + r13 + MAX_MATCH_8]
        lea     rdi, [r9 + r13 + MAX_MATCH_8]

        prefetcht1 [rsi+rdx]
        prefetcht1 [rdi+rdx]

;;; Test the strings for equality, 8 bytes at a time, three qwords per
;;; iteration. rdx counts up from -MAX_MATCH_8 towards zero; reaching
;;; zero means every compared byte was equal. On a mismatch, rax holds
;;; the XOR of the two qwords and rdx is advanced to that qword.
;;;
;;; The compare starts scanalign (0-3) bytes into the strings so that
;;; the rdi pointer is dword-aligned. (rsi will still be misaligned
;;; three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.

LoopCmps:
        mov     rax, [rsi + rdx]
        xor     rax, [rdi + rdx]
        jnz     LeaveLoopCmps

        mov     rax, [rsi + rdx + 8]
        xor     rax, [rdi + rdx + 8]
        jnz     LeaveLoopCmps8

        mov     rax, [rsi + rdx + 8+8]
        xor     rax, [rdi + rdx + 8+8]
        jnz     LeaveLoopCmps16

        add     rdx, 8+8+8

        jnz     short LoopCmps
        jmp     short LenMaximum
LeaveLoopCmps16:
        add     rdx, 8
LeaveLoopCmps8:
        add     rdx, 8
LeaveLoopCmps:

;;; Locate the first differing byte inside the qword: the lowest
;;; non-zero byte of rax. Narrow down to a 16-bit pair first, adding
;;; 4 and/or 2 to rdx as pairs are skipped.

        test    eax, 0000FFFFh
        jnz     LenLower                ; difference in bytes 0-1

        test    eax, 0ffffffffh

        jnz     LenLower32              ; difference in bytes 2-3

        add     rdx, 4
        shr     rax, 32
        or      ax, ax
        jnz     LenLower                ; difference in bytes 4-5

LenLower32:                             ; difference in the upper pair
        shr     eax, 16
        add     rdx, 2
LenLower:
        sub     al, 1                   ; CF = 1 when the low byte of the pair matched
        adc     rdx, 0                  ; ... in which case skip one more byte

;;; Calculate the length of the match. If it is longer than MAX_MATCH,
;;; then automatically accept it as the best possible match and leave.

        lea     rax, [rdi + rdx]
        sub     rax, r9
        cmp     eax, MAX_MATCH
        jge     LenMaximum

;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop, after
;;; reloading the registers the compare loop reused.

        cmp     eax, r11d
        jg      LongerMatch

        lea     rsi, [r10 + r11]

        mov     rdi, prev_ad
        mov     edx, [chainlenwmask]
        jmp     LookupLoop

;;;         s->match_start = cur_match;
;;;         best_len = len;
;;;         if (len >= nice_match) break;
;;;         scan_end = *(ushf*)(scan+best_len-1);

LongerMatch:
        mov     r11d, eax
        mov     match_start, r8d
        cmp     eax, [nicematch]
        jge     LeaveNow

        lea     rsi, [r10 + rax]

        movzx   ebx, word ptr [r9 + rax - 1]
        mov     rdi, prev_ad
        mov     edx, [chainlenwmask]
        jmp     LookupLoop

;;; Accept the current string, with the maximum possible length.

LenMaximum:
        mov     r11d, MAX_MATCH
        mov     match_start, r8d

;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
;;; return s->lookahead;

LeaveNow:
IFDEF INFOZIP
        mov     eax, r11d
ELSE
        mov     eax, Lookahead
        cmp     r11d, eax
        cmovng  eax, r11d
ENDIF

;;; Restore the saved registers, release the frame and return.

        mov     rsi, [save_rsi]
        mov     rdi, [save_rdi]
        mov     rbx, [save_rbx]
        mov     rbp, [save_rbp]
        mov     r12, [save_r12]
        mov     r13, [save_r13]

        add     rsp, LocalVarsSize
        ret 0
; please don't remove this string !
; Your can freely use gvmat64 in any free or commercial app
; but it is far better don't remove the string in the binary!
    db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
longest_match   ENDP
kusano fc6ab3
kusano fc6ab3
; void match_init(void)
; Does nothing: returns immediately without touching any register or
; memory. Presumably kept so that callers expecting a match_init entry
; point still link -- confirm against the C side.
match_init PROC
        ret 0
match_init ENDP
kusano fc6ab3
kusano fc6ab3
kusano fc6ab3
END