1
1
/* ----------
2
2
* pg_lzcompress.c -
3
3
*
4
- * $Header: /cvsroot/pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.13 2001/10/25 05:49:45 momjian Exp $
4
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.14 2001/11/17 06:09:30 tgl Exp $
5
5
*
6
6
*This is an implementation of LZ compression for PostgreSQL.
7
7
*It uses a simple history table and generates 2-3 byte tags
89
89
*This limits the offset to 1-4095 (12 bits) and the length
90
90
*to 3-18 (4 bits) because 3 is allways added to it. To emit
91
91
*a tag of 2 bytes with a length of 2 only saves one control
92
- *bit. But weloose one byte in the possible length of a tag.
92
+ *bit. But welose one byte in the possible length of a tag.
93
93
*
94
94
*In the actual implementation, the 2 byte tag's length is
95
95
*limited to 3-17, because the value 0xF in the length nibble
116
116
*1K and 1M. For smaller items there's not that much chance of
117
117
*redundancy in the character sequence (except for large areas
118
118
*of identical bytes like trailing spaces) and for bigger ones
119
- *the allocation of the history table is expensive (it needs
120
- *8 times the size of the input!).
119
+ *our 4K maximum look-back distance is too small.
121
120
*
122
121
*The compressor creates a table for 8192 lists of positions.
123
122
*For each input position (except the last 3), a hash key is
124
- *built from the 4 next input bytes and theposiiton remembered
123
+ *built from the 4 next input bytes and theposition remembered
125
124
*in the appropriate list. Thus, the table points to linked
126
125
*lists of likely to be at least in the first 4 characters
127
126
*matching strings. This is done on the fly while the input
128
- *is compressed into the output area.
127
+ *is compressed into the output area. Table entries are only
128
+ *kept for the last 4096 input positions, since we cannot use
129
+ *back-pointers larger than that anyway.
129
130
*
130
131
*For each byte in the input, it's hash key (built from this
131
132
*byte and the next 3) is used to find the appropriate list
170
171
*Jan Wieck
171
172
* ----------
172
173
*/
173
- #include <stdio.h>
174
- #include <stdlib.h>
174
+ #include "postgres.h"
175
+
175
176
#include <unistd.h>
176
177
#include <fcntl.h>
177
- #include <string.h>
178
178
#include <errno.h>
179
179
180
- #include "postgres.h"
181
180
#include "utils/pg_lzcompress.h"
182
181
183
182
184
183
/* ----------
185
184
* Local definitions
186
185
* ----------
187
186
*/
188
- #define PGLZ_HISTORY_LISTS 8192
189
- #define PGLZ_HISTORY_MASK 0x1fff
187
+ #define PGLZ_HISTORY_LISTS 8192/* must be power of 2 */
188
+ #define PGLZ_HISTORY_MASK (PGLZ_HISTORY_LISTS - 1)
190
189
#define PGLZ_HISTORY_SIZE 4096
191
190
#define PGLZ_MAX_MATCH 273
192
191
195
194
* PGLZ_HistEntry -
196
195
*
197
196
*Linked list for the backward history lookup
197
+ *
198
+ * All the entries sharing a hash key are linked in a doubly linked list.
199
+ * This makes it easy to remove an entry when it's time to recycle it
200
+ * (because it's more than 4K positions old).
198
201
* ----------
199
202
*/
200
203
typedef struct PGLZ_HistEntry
201
204
{
202
- struct PGLZ_HistEntry * next ;
205
+ struct PGLZ_HistEntry * next ;/* links for my hash key's list */
203
206
struct PGLZ_HistEntry * prev ;
204
- char * pos ;
207
+ int hindex ;/* my current hash key */
208
+ char * pos ;/* my input position */
205
209
}PGLZ_HistEntry ;
206
210
207
211
@@ -249,7 +253,7 @@ static PGLZ_Strategy strategy_never_data = {
249
253
PGLZ_Strategy * PGLZ_strategy_never = & strategy_never_data ;
250
254
251
255
/* ----------
252
- *Global arrays for history
256
+ *Statically allocated work arrays for history
253
257
* ----------
254
258
*/
255
259
static PGLZ_HistEntry * hist_start [PGLZ_HISTORY_LISTS ];
@@ -261,48 +265,55 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE];
261
265
*
262
266
*Computes the history table slot for the lookup by the next 4
263
267
*characters in the input.
268
+ *
269
+ * NB: because we use the next 4 characters, we are not guaranteed to
270
+ * find 3-character matches; they very possibly will be in the wrong
271
+ * hash list. This seems an acceptable tradeoff for spreading out the
272
+ * hash keys more.
264
273
* ----------
265
274
*/
266
- #if 1
267
- #define pglz_hist_idx (_s ,_e ) (\
268
- (((_e) - (_s)) < 4) ? 0 :\
269
- ((((_s)[0] << 9) ^ ((_s)[1] << 6) ^\
270
- ((_s)[2] << 3) ^ (_s)[3]) & (PGLZ_HISTORY_MASK))\
271
- )
272
- #else
273
275
#define pglz_hist_idx (_s ,_e ) (\
274
- (((_e) - (_s)) < 2) ? 0 :\
275
- ((((_s)[0] << 8) ^ (_s)[1]) & (PGLZ_HISTORY_MASK))\
276
+ ((((_e) - (_s)) < 4) ? (int) (_s)[0] :\
277
+ (((_s)[0] << 9) ^ ((_s)[1] << 6) ^\
278
+ ((_s)[2] << 3) ^ (_s)[3])) & (PGLZ_HISTORY_MASK)\
276
279
)
277
- #endif
278
280
279
281
280
282
/* ----------
281
283
* pglz_hist_add -
282
284
*
283
285
*Adds a new entry to the history table.
286
+ *
287
+ * If _recycle is true, then we are recycling a previously used entry,
288
+ * and must first delink it from its old hashcode's linked list.
289
+ *
290
+ * NOTE: beware of multiple evaluations of macro's arguments, and note that
291
+ * _hn and _recycle are modified in the macro.
284
292
* ----------
285
293
*/
286
- #define pglz_hist_add (_hs ,_he ,_hn ,_s ,_e ) \
294
+ #define pglz_hist_add (_hs ,_he ,_hn ,_recycle , _s ,_e ) \
287
295
do {\
288
296
int __hindex = pglz_hist_idx((_s),(_e));\
289
- if ((_he)[(_hn)].prev == NULL) {\
290
- (_hs)[__hindex] = (_he)[(_hn)].next;\
291
- } else {\
292
- (_he)[(_hn)].prev->next = (_he)[(_hn)].next;\
293
- }\
294
- if ((_he)[(_hn)].next != NULL) {\
295
- (_he)[(_hn)].next->prev = (_he)[(_hn)].prev;\
296
- }\
297
- (_he)[(_hn)].next = (_hs)[__hindex];\
298
- (_he)[(_hn)].prev = NULL;\
299
- (_he)[(_hn)].pos = (_s);\
300
- if ((_hs)[__hindex] != NULL) {\
301
- (_hs)[__hindex]->prev = &((_he)[(_hn)]);\
297
+ PGLZ_HistEntry **__myhsp = &(_hs)[__hindex];\
298
+ PGLZ_HistEntry *__myhe = &(_he)[_hn];\
299
+ if (_recycle) {\
300
+ if (__myhe->prev == NULL)\
301
+ (_hs)[__myhe->hindex] = __myhe->next;\
302
+ else\
303
+ __myhe->prev->next = __myhe->next;\
304
+ if (__myhe->next != NULL)\
305
+ __myhe->next->prev = __myhe->prev;\
302
306
}\
303
- (_hs)[__hindex] = &((_he)[(_hn)]);\
307
+ __myhe->next = *__myhsp;\
308
+ __myhe->prev = NULL;\
309
+ __myhe->hindex = __hindex;\
310
+ __myhe->pos = (_s);\
311
+ if (*__myhsp != NULL)\
312
+ (*__myhsp)->prev = __myhe;\
313
+ *__myhsp = __myhe;\
304
314
if (++(_hn) >= PGLZ_HISTORY_SIZE) {\
305
315
(_hn) = 0;\
316
+ (_recycle) = true;\
306
317
}\
307
318
} while (0)
308
319
@@ -382,27 +393,23 @@ pglz_find_match(PGLZ_HistEntry **hstart, char *input, char *end,
382
393
PGLZ_HistEntry * hent ;
383
394
int32 len = 0 ;
384
395
int32 off = 0 ;
385
- int32 thislen ;
386
- int32 thisoff ;
387
- char * ip ;
388
- char * hp ;
389
396
390
397
/*
391
398
* Traverse the linked history list until a good enough match is
392
399
* found.
393
400
*/
394
401
hent = hstart [pglz_hist_idx (input ,end )];
395
- while (hent && len < good_match )
402
+ while (hent )
396
403
{
397
- /*
398
- * Be happy with lesser good matches the more entries we visited.
399
- */
400
- good_match -= ( good_match * good_drop ) / 100 ;
404
+ char * ip = input ;
405
+ char * hp = hent -> pos ;
406
+ int32 thisoff ;
407
+ int32 thislen ;
401
408
402
409
/*
403
410
* Stop if the offset does not fit into our tag anymore.
404
411
*/
405
- thisoff = ( ip = input ) - ( hp = hent -> pos ) ;
412
+ thisoff = ip - hp ;
406
413
if (thisoff >=0x0fff )
407
414
break ;
408
415
@@ -411,27 +418,33 @@ pglz_find_match(PGLZ_HistEntry **hstart, char *input, char *end,
411
418
* the best so far. And if we already have a match of 16 or more
412
419
* bytes, it's worth the call overhead to use memcmp() to check if
413
420
* this match is equal for the same size. After that we must
414
- * fallback to character by charactercomparision to know the
421
+ * fallback to character by charactercomparison to know the
415
422
* exact position where the diff occured.
416
423
*/
424
+ thislen = 0 ;
417
425
if (len >=16 )
418
426
{
419
- if (memcmp (ip ,hp ,len )! =0 )
427
+ if (memcmp (ip ,hp ,len )= =0 )
420
428
{
421
- hent = hent -> next ;
422
- continue ;
429
+ thislen = len ;
430
+ ip += len ;
431
+ hp += len ;
432
+ while (ip < end && * ip == * hp && thislen < PGLZ_MAX_MATCH )
433
+ {
434
+ thislen ++ ;
435
+ ip ++ ;
436
+ hp ++ ;
437
+ }
423
438
}
424
- thislen = len ;
425
- ip += len ;
426
- hp += len ;
427
439
}
428
440
else
429
- thislen = 0 ;
430
- while (ip < end && * ip == * hp && thislen < PGLZ_MAX_MATCH )
431
441
{
432
- thislen ++ ;
433
- ip ++ ;
434
- hp ++ ;
442
+ while (ip < end && * ip == * hp && thislen < PGLZ_MAX_MATCH )
443
+ {
444
+ thislen ++ ;
445
+ ip ++ ;
446
+ hp ++ ;
447
+ }
435
448
}
436
449
437
450
/*
@@ -447,6 +460,17 @@ pglz_find_match(PGLZ_HistEntry **hstart, char *input, char *end,
447
460
* Advance to the next history entry
448
461
*/
449
462
hent = hent -> next ;
463
+
464
+ /*
465
+ * Be happy with lesser good matches the more entries we visited.
466
+ * But no point in doing calculation if we're at end of list.
467
+ */
468
+ if (hent )
469
+ {
470
+ if (len >=good_match )
471
+ break ;
472
+ good_match -= (good_match * good_drop ) /100 ;
473
+ }
450
474
}
451
475
452
476
/*
@@ -473,10 +497,10 @@ pglz_find_match(PGLZ_HistEntry **hstart, char *input, char *end,
473
497
int
474
498
pglz_compress (char * source ,int32 slen ,PGLZ_Header * dest ,PGLZ_Strategy * strategy )
475
499
{
476
- int hist_next = 0 ;
477
-
478
500
unsignedchar * bp = ((unsignedchar * )dest )+ sizeof (PGLZ_Header );
479
501
unsignedchar * bstart = bp ;
502
+ int hist_next = 0 ;
503
+ bool hist_recycle = false;
480
504
char * dp = source ;
481
505
char * dend = source + slen ;
482
506
unsignedchar ctrl_dummy = 0 ;
@@ -535,12 +559,11 @@ pglz_compress(char *source, int32 slen, PGLZ_Header *dest, PGLZ_Strategy *strate
535
559
good_drop = 100 ;
536
560
537
561
/*
538
- * Initialize the historytables. For inputs smaller than
539
- *PGLZ_HISTORY_SIZE, we already have a big enough history table on
540
- *the stack frame .
562
+ * Initialize the historylists to empty. We do not need to zero
563
+ *the hist_entries[] array; its entries are initialized as they
564
+ *are used .
541
565
*/
542
566
memset ((void * )hist_start ,0 ,sizeof (hist_start ));
543
- memset ((void * )hist_entries ,0 ,sizeof (hist_entries ));
544
567
545
568
/*
546
569
* Compute the maximum result size allowed by the strategy. If the
@@ -588,7 +611,9 @@ pglz_compress(char *source, int32 slen, PGLZ_Header *dest, PGLZ_Strategy *strate
588
611
pglz_out_tag (ctrlp ,ctrlb ,ctrl ,bp ,match_len ,match_off );
589
612
while (match_len -- )
590
613
{
591
- pglz_hist_add (hist_start ,hist_entries ,hist_next ,dp ,dend );
614
+ pglz_hist_add (hist_start ,hist_entries ,
615
+ hist_next ,hist_recycle ,
616
+ dp ,dend );
592
617
dp ++ ;/* Do not do this ++ in the line above!*/
593
618
/* The macro would do it four times - Jan.*/
594
619
}
@@ -599,7 +624,9 @@ pglz_compress(char *source, int32 slen, PGLZ_Header *dest, PGLZ_Strategy *strate
599
624
* No match found. Copy one literal byte.
600
625
*/
601
626
pglz_out_literal (ctrlp ,ctrlb ,ctrl ,bp ,* dp );
602
- pglz_hist_add (hist_start ,hist_entries ,hist_next ,dp ,dend );
627
+ pglz_hist_add (hist_start ,hist_entries ,
628
+ hist_next ,hist_recycle ,
629
+ dp ,dend );
603
630
dp ++ ;/* Do not do this ++ in the line above!*/
604
631
/* The macro would do it four times - Jan.*/
605
632
}