| |
| ; match.asm -- Pentium-Pro optimized version of longest_match() |
| ; |
| ; Updated for zlib 1.1.3 and converted to MASM 6.1x |
| ; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com> |
| ; and Chuck Walbourn <chuckw@kinesoft.com> |
| ; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro> |
| ; |
| ; This is free software; you can redistribute it and/or modify it |
| ; under the terms of the GNU General Public License. |
| |
| ; Based on match.S |
| ; Written for zlib 1.1.2 |
| ; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> |
| ; |
| ; Modified by Gilles Vollant (2005) for add gzhead and gzindex |
| |
| .686P |
| .MODEL FLAT |
| |
| ;=========================================================================== |
| ; EQUATES |
| ;=========================================================================== |
| |
| MAX_MATCH EQU 258 |
| MIN_MATCH EQU 3 |
| MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1) |
| MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7)) |
| |
| ;=========================================================================== |
| ; STRUCTURES |
| ;=========================================================================== |
| |
| ; This STRUCT assumes a 4-byte alignment |
| |
| DEFLATE_STATE STRUCT |
| ds_strm dd ? |
| ds_status dd ? |
| ds_pending_buf dd ? |
| ds_pending_buf_size dd ? |
| ds_pending_out dd ? |
| ds_pending dd ? |
| ds_wrap dd ? |
| ; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h) |
| ds_gzhead dd ? |
| ds_gzindex dd ? |
| ds_data_type db ? |
| ds_method db ? |
| db ? ; padding |
| db ? ; padding |
| ds_last_flush dd ? |
| ds_w_size dd ? ; used |
| ds_w_bits dd ? |
| ds_w_mask dd ? ; used |
| ds_window dd ? ; used |
| ds_window_size dd ? |
| ds_prev dd ? ; used |
| ds_head dd ? |
| ds_ins_h dd ? |
| ds_hash_size dd ? |
| ds_hash_bits dd ? |
| ds_hash_mask dd ? |
| ds_hash_shift dd ? |
| ds_block_start dd ? |
| ds_match_length dd ? ; used |
| ds_prev_match dd ? ; used |
| ds_match_available dd ? |
| ds_strstart dd ? ; used |
| ds_match_start dd ? ; used |
| ds_lookahead dd ? ; used |
| ds_prev_length dd ? ; used |
| ds_max_chain_length dd ? ; used |
| ds_max_laxy_match dd ? |
| ds_level dd ? |
| ds_strategy dd ? |
| ds_good_match dd ? ; used |
| ds_nice_match dd ? ; used |
| |
| ; Don't need anymore of the struct for match |
| DEFLATE_STATE ENDS |
| |
| ;=========================================================================== |
| ; CODE |
| ;=========================================================================== |
| _TEXT SEGMENT |
| |
| ;--------------------------------------------------------------------------- |
| ; match_init |
| ;--------------------------------------------------------------------------- |
| ALIGN 4 |
| PUBLIC _match_init |
| _match_init PROC |
| ; no initialization needed |
| ret |
| _match_init ENDP |
| |
| ;--------------------------------------------------------------------------- |
| ; uInt longest_match(deflate_state *deflatestate, IPos curmatch) |
| ;--------------------------------------------------------------------------- |
| ALIGN 4 |
| |
| PUBLIC _longest_match |
| _longest_match PROC |
| |
| ; Since this code uses EBP for a scratch register, the stack frame must |
| ; be manually constructed and referenced relative to the ESP register. |
| |
| ; Stack image |
| ; Variables |
| chainlenwmask = 0 ; high word: current chain len |
| ; low word: s->wmask |
| window = 4 ; local copy of s->window |
| windowbestlen = 8 ; s->window + bestlen |
| scanend = 12 ; last two bytes of string |
| scanstart = 16 ; first two bytes of string |
| scanalign = 20 ; dword-misalignment of string |
| nicematch = 24 ; a good enough match size |
| bestlen = 28 ; size of best match so far |
| scan = 32 ; ptr to string wanting match |
| varsize = 36 ; number of bytes (also offset to last saved register) |
| |
| ; Saved Registers (actually pushed into place) |
| ebx_save = 36 |
| edi_save = 40 |
| esi_save = 44 |
| ebp_save = 48 |
| |
| ; Parameters |
| retaddr = 52 |
| deflatestate = 56 |
| curmatch = 60 |
| |
| ; Save registers that the compiler may be using |
| push ebp |
| push edi |
| push esi |
| push ebx |
| |
| ; Allocate local variable space |
| sub esp,varsize |
| |
| ; Retrieve the function arguments. ecx will hold cur_match |
| ; throughout the entire function. edx will hold the pointer to the |
| ; deflate_state structure during the function's setup (before |
| ; entering the main loop). |
| |
| mov edx, [esp+deflatestate] |
| ASSUME edx:PTR DEFLATE_STATE |
| |
| mov ecx, [esp+curmatch] |
| |
| ; uInt wmask = s->w_mask; |
| ; unsigned chain_length = s->max_chain_length; |
| ; if (s->prev_length >= s->good_match) { |
| ; chain_length >>= 2; |
| ; } |
| |
| mov eax, [edx].ds_prev_length |
| mov ebx, [edx].ds_good_match |
| cmp eax, ebx |
| mov eax, [edx].ds_w_mask |
| mov ebx, [edx].ds_max_chain_length |
| jl SHORT LastMatchGood |
| shr ebx, 2 |
| LastMatchGood: |
| |
| ; chainlen is decremented once beforehand so that the function can |
| ; use the sign flag instead of the zero flag for the exit test. |
| ; It is then shifted into the high word, to make room for the wmask |
| ; value, which it will always accompany. |
| |
| dec ebx |
| shl ebx, 16 |
| or ebx, eax |
| mov [esp+chainlenwmask], ebx |
| |
| ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; |
| |
| mov eax, [edx].ds_nice_match |
| mov ebx, [edx].ds_lookahead |
| cmp ebx, eax |
| jl SHORT LookaheadLess |
| mov ebx, eax |
| LookaheadLess: |
| mov [esp+nicematch], ebx |
| |
| ;/* register Bytef *scan = s->window + s->strstart; */ |
| |
| mov esi, [edx].ds_window |
| mov [esp+window], esi |
| mov ebp, [edx].ds_strstart |
| lea edi, [esi+ebp] |
| mov [esp+scan],edi |
| |
| ;/* Determine how many bytes the scan ptr is off from being */ |
| ;/* dword-aligned. */ |
| |
| mov eax, edi |
| neg eax |
| and eax, 3 |
| mov [esp+scanalign], eax |
| |
| ;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ |
| ;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ |
| |
| mov eax, [edx].ds_w_size |
| sub eax, MIN_LOOKAHEAD |
| sub ebp, eax |
| jg SHORT LimitPositive |
| xor ebp, ebp |
| LimitPositive: |
| |
| ;/* int best_len = s->prev_length; */ |
| |
| mov eax, [edx].ds_prev_length |
| mov [esp+bestlen], eax |
| |
| ;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ |
| |
| add esi, eax |
| mov [esp+windowbestlen], esi |
| |
| ;/* register ush scan_start = *(ushf*)scan; */ |
| ;/* register ush scan_end = *(ushf*)(scan+best_len-1); */ |
| ;/* Posf *prev = s->prev; */ |
| |
| movzx ebx, WORD PTR[edi] |
| mov [esp+scanstart], ebx |
| movzx ebx, WORD PTR[eax+edi-1] |
| mov [esp+scanend], ebx |
| mov edi, [edx].ds_prev |
| |
| ;/* Jump into the main loop. */ |
| |
| mov edx, [esp+chainlenwmask] |
| jmp SHORT LoopEntry |
| |
| ;/* do { |
| ; * match = s->window + cur_match; |
| ; * if (*(ushf*)(match+best_len-1) != scan_end || |
| ; * *(ushf*)match != scan_start) continue; |
| ; * [...] |
| ; * } while ((cur_match = prev[cur_match & wmask]) > limit |
| ; * && --chain_length != 0); |
| ; * |
| ; * Here is the inner loop of the function. The function will spend the |
| ; * majority of its time in this loop, and majority of that time will |
| ; * be spent in the first ten instructions. |
| ; * |
| ; * Within this loop: |
| ; * %ebx = scanend |
| ; * %ecx = curmatch |
| ; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) |
| ; * %esi = windowbestlen - i.e., (window + bestlen) |
| ; * %edi = prev |
| ; * %ebp = limit |
| ; */ |
| |
| ALIGN 4 |
| LookupLoop: |
| and ecx, edx |
| movzx ecx, WORD PTR[edi+ecx*2] |
| cmp ecx, ebp |
| jbe LeaveNow |
| sub edx, 000010000H |
| js LeaveNow |
| |
| LoopEntry: |
| movzx eax, WORD PTR[esi+ecx-1] |
| cmp eax, ebx |
| jnz SHORT LookupLoop |
| |
| mov eax, [esp+window] |
| movzx eax, WORD PTR[eax+ecx] |
| cmp eax, [esp+scanstart] |
| jnz SHORT LookupLoop |
| |
| ;/* Store the current value of chainlen. */ |
| |
| mov [esp+chainlenwmask], edx |
| |
| ;/* Point %edi to the string under scrutiny, and %esi to the string we */ |
| ;/* are hoping to match it up with. In actuality, %esi and %edi are */ |
| ;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ |
| ;/* initialized to -(MAX_MATCH_8 - scanalign). */ |
| |
| mov esi, [esp+window] |
| mov edi, [esp+scan] |
| add esi, ecx |
| mov eax, [esp+scanalign] |
| mov edx, -MAX_MATCH_8 |
| lea edi, [edi+eax+MAX_MATCH_8] |
| lea esi, [esi+eax+MAX_MATCH_8] |
| |
| ;/* Test the strings for equality, 8 bytes at a time. At the end, |
| ; * adjust %edx so that it is offset to the exact byte that mismatched. |
| ; * |
| ; * We already know at this point that the first three bytes of the |
| ; * strings match each other, and they can be safely passed over before |
| ; * starting the compare loop. So what this code does is skip over 0-3 |
| ; * bytes, as much as necessary in order to dword-align the %edi |
| ; * pointer. (%esi will still be misaligned three times out of four.) |
| ; * |
| ; * It should be confessed that this loop usually does not represent |
| ; * much of the total running time. Replacing it with a more |
| ; * straightforward "rep cmpsb" would not drastically degrade |
| ; * performance. |
| ; */ |
| |
| LoopCmps: |
| mov eax, DWORD PTR[esi+edx] |
| xor eax, DWORD PTR[edi+edx] |
| jnz SHORT LeaveLoopCmps |
| |
| mov eax, DWORD PTR[esi+edx+4] |
| xor eax, DWORD PTR[edi+edx+4] |
| jnz SHORT LeaveLoopCmps4 |
| |
| add edx, 8 |
| jnz SHORT LoopCmps |
| jmp LenMaximum |
| ALIGN 4 |
| |
| LeaveLoopCmps4: |
| add edx, 4 |
| |
| LeaveLoopCmps: |
| test eax, 00000FFFFH |
| jnz SHORT LenLower |
| |
| add edx, 2 |
| shr eax, 16 |
| |
| LenLower: |
| sub al, 1 |
| adc edx, 0 |
| |
| ;/* Calculate the length of the match. If it is longer than MAX_MATCH, */ |
| ;/* then automatically accept it as the best possible match and leave. */ |
| |
| lea eax, [edi+edx] |
| mov edi, [esp+scan] |
| sub eax, edi |
| cmp eax, MAX_MATCH |
| jge SHORT LenMaximum |
| |
| ;/* If the length of the match is not longer than the best match we */ |
| ;/* have so far, then forget it and return to the lookup loop. */ |
| |
| mov edx, [esp+deflatestate] |
| mov ebx, [esp+bestlen] |
| cmp eax, ebx |
| jg SHORT LongerMatch |
| mov esi, [esp+windowbestlen] |
| mov edi, [edx].ds_prev |
| mov ebx, [esp+scanend] |
| mov edx, [esp+chainlenwmask] |
| jmp LookupLoop |
| ALIGN 4 |
| |
| ;/* s->match_start = cur_match; */ |
| ;/* best_len = len; */ |
| ;/* if (len >= nice_match) break; */ |
| ;/* scan_end = *(ushf*)(scan+best_len-1); */ |
| |
| LongerMatch: |
| mov ebx, [esp+nicematch] |
| mov [esp+bestlen], eax |
| mov [edx].ds_match_start, ecx |
| cmp eax, ebx |
| jge SHORT LeaveNow |
| mov esi, [esp+window] |
| add esi, eax |
| mov [esp+windowbestlen], esi |
| movzx ebx, WORD PTR[edi+eax-1] |
| mov edi, [edx].ds_prev |
| mov [esp+scanend], ebx |
| mov edx, [esp+chainlenwmask] |
| jmp LookupLoop |
| ALIGN 4 |
| |
| ;/* Accept the current string, with the maximum possible length. */ |
| |
| LenMaximum: |
| mov edx, [esp+deflatestate] |
| mov DWORD PTR[esp+bestlen], MAX_MATCH |
| mov [edx].ds_match_start, ecx |
| |
| ;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ |
| ;/* return s->lookahead; */ |
| |
| LeaveNow: |
| mov edx, [esp+deflatestate] |
| mov ebx, [esp+bestlen] |
| mov eax, [edx].ds_lookahead |
| cmp ebx, eax |
| jg SHORT LookaheadRet |
| mov eax, ebx |
| LookaheadRet: |
| |
| ; Restore the stack and return from whence we came. |
| |
| add esp, varsize |
| pop ebx |
| pop esi |
| pop edi |
| pop ebp |
| ret |
| |
| _longest_match ENDP |
| |
| _TEXT ENDS |
| END |