11/* ----------
22 * pg_lzcompress.c -
33 *
4- * $Header: /cvsroot/pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.13 2001/10/25 05:49:45 momjian Exp $
4+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.14 2001/11/17 06:09:30 tgl Exp $
55 *
66 *This is an implementation of LZ compression for PostgreSQL.
77 *It uses a simple history table and generates 2-3 byte tags
8989 *This limits the offset to 1-4095 (12 bits) and the length
9090 *to 3-18 (4 bits) because 3 is allways added to it. To emit
9191 *a tag of 2 bytes with a length of 2 only saves one control
92- *bit. But weloose one byte in the possible length of a tag.
92+ *bit. But welose one byte in the possible length of a tag.
9393 *
9494 *In the actual implementation, the 2 byte tag's length is
9595 *limited to 3-17, because the value 0xF in the length nibble
116116 *1K and 1M. For smaller items there's not that much chance of
117117 *redundancy in the character sequence (except for large areas
118118 *of identical bytes like trailing spaces) and for bigger ones
119- *the allocation of the history table is expensive (it needs
120- *8 times the size of the input!).
119+ *our 4K maximum look-back distance is too small.
121120 *
122121 *The compressor creates a table for 8192 lists of positions.
123122 *For each input position (except the last 3), a hash key is
124- *built from the 4 next input bytes and theposiiton remembered
123+ *built from the 4 next input bytes and theposition remembered
125124 *in the appropriate list. Thus, the table points to linked
126125 *lists of likely to be at least in the first 4 characters
127126 *matching strings. This is done on the fly while the input
128- *is compressed into the output area.
127+ *is compressed into the output area. Table entries are only
128+ *kept for the last 4096 input positions, since we cannot use
129+ *back-pointers larger than that anyway.
129130 *
130131 *For each byte in the input, it's hash key (built from this
131132 *byte and the next 3) is used to find the appropriate list
170171 *Jan Wieck
171172 * ----------
172173 */
173- #include <stdio.h>
174- #include <stdlib.h>
174+ #include "postgres.h"
175+
175176#include <unistd.h>
176177#include <fcntl.h>
177- #include <string.h>
178178#include <errno.h>
179179
180- #include "postgres.h"
181180#include "utils/pg_lzcompress.h"
182181
183182
184183/* ----------
185184 * Local definitions
186185 * ----------
187186 */
188- #define PGLZ_HISTORY_LISTS 8192
189- #define PGLZ_HISTORY_MASK 0x1fff
187+ #define PGLZ_HISTORY_LISTS 8192/* must be power of 2 */
188+ #define PGLZ_HISTORY_MASK (PGLZ_HISTORY_LISTS - 1)
190189#define PGLZ_HISTORY_SIZE 4096
191190#define PGLZ_MAX_MATCH 273
192191
195194 * PGLZ_HistEntry -
196195 *
197196 *Linked list for the backward history lookup
197+ *
198+ * All the entries sharing a hash key are linked in a doubly linked list.
199+ * This makes it easy to remove an entry when it's time to recycle it
200+ * (because it's more than 4K positions old).
198201 * ----------
199202 */
200203typedef struct PGLZ_HistEntry
201204{
202- struct PGLZ_HistEntry * next ;
205+ struct PGLZ_HistEntry * next ;/* links for my hash key's list */
203206struct PGLZ_HistEntry * prev ;
204- char * pos ;
207+ int hindex ;/* my current hash key */
208+ char * pos ;/* my input position */
205209}PGLZ_HistEntry ;
206210
207211
@@ -249,7 +253,7 @@ static PGLZ_Strategy strategy_never_data = {
249253PGLZ_Strategy * PGLZ_strategy_never = & strategy_never_data ;
250254
251255/* ----------
252- *Global arrays for history
256+ *Statically allocated work arrays for history
253257 * ----------
254258 */
255259static PGLZ_HistEntry * hist_start [PGLZ_HISTORY_LISTS ];
@@ -261,48 +265,55 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE];
261265 *
262266 *Computes the history table slot for the lookup by the next 4
263267 *characters in the input.
268+ *
269+ * NB: because we use the next 4 characters, we are not guaranteed to
270+ * find 3-character matches; they very possibly will be in the wrong
271+ * hash list. This seems an acceptable tradeoff for spreading out the
272+ * hash keys more.
264273 * ----------
265274 */
266- #if 1
267- #define pglz_hist_idx (_s ,_e ) (\
268- (((_e) - (_s)) < 4) ? 0 :\
269- ((((_s)[0] << 9) ^ ((_s)[1] << 6) ^\
270- ((_s)[2] << 3) ^ (_s)[3]) & (PGLZ_HISTORY_MASK))\
271- )
272- #else
273275#define pglz_hist_idx (_s ,_e ) (\
274- (((_e) - (_s)) < 2) ? 0 :\
275- ((((_s)[0] << 8) ^ (_s)[1]) & (PGLZ_HISTORY_MASK))\
276+ ((((_e) - (_s)) < 4) ? (int) (_s)[0] :\
277+ (((_s)[0] << 9) ^ ((_s)[1] << 6) ^\
278+ ((_s)[2] << 3) ^ (_s)[3])) & (PGLZ_HISTORY_MASK)\
276279)
277- #endif
278280
279281
280282/* ----------
281283 * pglz_hist_add -
282284 *
283285 *Adds a new entry to the history table.
286+ *
287+ * If _recycle is true, then we are recycling a previously used entry,
288+ * and must first delink it from its old hashcode's linked list.
289+ *
290+ * NOTE: beware of multiple evaluations of macro's arguments, and note that
291+ * _hn and _recycle are modified in the macro.
284292 * ----------
285293 */
286- #define pglz_hist_add (_hs ,_he ,_hn ,_s ,_e ) \
294+ #define pglz_hist_add (_hs ,_he ,_hn ,_recycle , _s ,_e ) \
287295do {\
288296int __hindex = pglz_hist_idx((_s),(_e));\
289- if ((_he)[(_hn)].prev == NULL) {\
290- (_hs)[__hindex] = (_he)[(_hn)].next;\
291- } else {\
292- (_he)[(_hn)].prev->next = (_he)[(_hn)].next;\
293- }\
294- if ((_he)[(_hn)].next != NULL) {\
295- (_he)[(_hn)].next->prev = (_he)[(_hn)].prev;\
296- }\
297- (_he)[(_hn)].next = (_hs)[__hindex];\
298- (_he)[(_hn)].prev = NULL;\
299- (_he)[(_hn)].pos = (_s);\
300- if ((_hs)[__hindex] != NULL) {\
301- (_hs)[__hindex]->prev = &((_he)[(_hn)]);\
297+ PGLZ_HistEntry **__myhsp = &(_hs)[__hindex];\
298+ PGLZ_HistEntry *__myhe = &(_he)[_hn];\
299+ if (_recycle) {\
300+ if (__myhe->prev == NULL)\
301+ (_hs)[__myhe->hindex] = __myhe->next;\
302+ else\
303+ __myhe->prev->next = __myhe->next;\
304+ if (__myhe->next != NULL)\
305+ __myhe->next->prev = __myhe->prev;\
302306}\
303- (_hs)[__hindex] = &((_he)[(_hn)]);\
307+ __myhe->next = *__myhsp;\
308+ __myhe->prev = NULL;\
309+ __myhe->hindex = __hindex;\
310+ __myhe->pos = (_s);\
311+ if (*__myhsp != NULL)\
312+ (*__myhsp)->prev = __myhe;\
313+ *__myhsp = __myhe;\
304314if (++(_hn) >= PGLZ_HISTORY_SIZE) {\
305315(_hn) = 0;\
316+ (_recycle) = true;\
306317}\
307318} while (0)
308319
@@ -382,27 +393,23 @@ pglz_find_match(PGLZ_HistEntry **hstart, char *input, char *end,
382393PGLZ_HistEntry * hent ;
383394int32 len = 0 ;
384395int32 off = 0 ;
385- int32 thislen ;
386- int32 thisoff ;
387- char * ip ;
388- char * hp ;
389396
390397/*
391398 * Traverse the linked history list until a good enough match is
392399 * found.
393400 */
394401hent = hstart [pglz_hist_idx (input ,end )];
395- while (hent && len < good_match )
402+ while (hent )
396403{
397- /*
398- * Be happy with lesser good matches the more entries we visited.
399- */
400- good_match -= ( good_match * good_drop ) / 100 ;
404+ char * ip = input ;
405+ char * hp = hent -> pos ;
406+ int32 thisoff ;
407+ int32 thislen ;
401408
402409/*
403410 * Stop if the offset does not fit into our tag anymore.
404411 */
405- thisoff = ( ip = input ) - ( hp = hent -> pos ) ;
412+ thisoff = ip - hp ;
406413if (thisoff >=0x0fff )
407414break ;
408415
@@ -411,27 +418,33 @@ pglz_find_match(PGLZ_HistEntry **hstart, char *input, char *end,
411418 * the best so far. And if we already have a match of 16 or more
412419 * bytes, it's worth the call overhead to use memcmp() to check if
413420 * this match is equal for the same size. After that we must
414- * fallback to character by charactercomparision to know the
421+ * fallback to character by charactercomparison to know the
415422 * exact position where the diff occured.
416423 */
424+ thislen = 0 ;
417425if (len >=16 )
418426{
419- if (memcmp (ip ,hp ,len )! =0 )
427+ if (memcmp (ip ,hp ,len )= =0 )
420428{
421- hent = hent -> next ;
422- continue ;
429+ thislen = len ;
430+ ip += len ;
431+ hp += len ;
432+ while (ip < end && * ip == * hp && thislen < PGLZ_MAX_MATCH )
433+ {
434+ thislen ++ ;
435+ ip ++ ;
436+ hp ++ ;
437+ }
423438}
424- thislen = len ;
425- ip += len ;
426- hp += len ;
427439}
428440else
429- thislen = 0 ;
430- while (ip < end && * ip == * hp && thislen < PGLZ_MAX_MATCH )
431441{
432- thislen ++ ;
433- ip ++ ;
434- hp ++ ;
442+ while (ip < end && * ip == * hp && thislen < PGLZ_MAX_MATCH )
443+ {
444+ thislen ++ ;
445+ ip ++ ;
446+ hp ++ ;
447+ }
435448}
436449
437450/*
@@ -447,6 +460,17 @@ pglz_find_match(PGLZ_HistEntry **hstart, char *input, char *end,
447460 * Advance to the next history entry
448461 */
449462hent = hent -> next ;
463+
464+ /*
465+ * Be happy with lesser good matches the more entries we visited.
466+ * But no point in doing calculation if we're at end of list.
467+ */
468+ if (hent )
469+ {
470+ if (len >=good_match )
471+ break ;
472+ good_match -= (good_match * good_drop ) /100 ;
473+ }
450474}
451475
452476/*
@@ -473,10 +497,10 @@ pglz_find_match(PGLZ_HistEntry **hstart, char *input, char *end,
473497int
474498pglz_compress (char * source ,int32 slen ,PGLZ_Header * dest ,PGLZ_Strategy * strategy )
475499{
476- int hist_next = 0 ;
477-
478500unsignedchar * bp = ((unsignedchar * )dest )+ sizeof (PGLZ_Header );
479501unsignedchar * bstart = bp ;
502+ int hist_next = 0 ;
503+ bool hist_recycle = false;
480504char * dp = source ;
481505char * dend = source + slen ;
482506unsignedchar ctrl_dummy = 0 ;
@@ -535,12 +559,11 @@ pglz_compress(char *source, int32 slen, PGLZ_Header *dest, PGLZ_Strategy *strate
535559good_drop = 100 ;
536560
537561/*
538- * Initialize the historytables. For inputs smaller than
539- *PGLZ_HISTORY_SIZE, we already have a big enough history table on
540- *the stack frame .
562+ * Initialize the historylists to empty. We do not need to zero
563+ *the hist_entries[] array; its entries are initialized as they
564+ *are used .
541565 */
542566memset ((void * )hist_start ,0 ,sizeof (hist_start ));
543- memset ((void * )hist_entries ,0 ,sizeof (hist_entries ));
544567
545568/*
546569 * Compute the maximum result size allowed by the strategy. If the
@@ -588,7 +611,9 @@ pglz_compress(char *source, int32 slen, PGLZ_Header *dest, PGLZ_Strategy *strate
588611pglz_out_tag (ctrlp ,ctrlb ,ctrl ,bp ,match_len ,match_off );
589612while (match_len -- )
590613{
591- pglz_hist_add (hist_start ,hist_entries ,hist_next ,dp ,dend );
614+ pglz_hist_add (hist_start ,hist_entries ,
615+ hist_next ,hist_recycle ,
616+ dp ,dend );
592617dp ++ ;/* Do not do this ++ in the line above!*/
593618/* The macro would do it four times - Jan.*/
594619}
@@ -599,7 +624,9 @@ pglz_compress(char *source, int32 slen, PGLZ_Header *dest, PGLZ_Strategy *strate
599624 * No match found. Copy one literal byte.
600625 */
601626pglz_out_literal (ctrlp ,ctrlb ,ctrl ,bp ,* dp );
602- pglz_hist_add (hist_start ,hist_entries ,hist_next ,dp ,dend );
627+ pglz_hist_add (hist_start ,hist_entries ,
628+ hist_next ,hist_recycle ,
629+ dp ,dend );
603630dp ++ ;/* Do not do this ++ in the line above!*/
604631/* The macro would do it four times - Jan.*/
605632}