|
| 1 | +/*------------------------------------------------------------------------- |
| 2 | + * |
| 3 | + * hyperloglog.c |
| 4 | + * HyperLogLog cardinality estimator |
| 5 | + * |
| 6 | + * Portions Copyright (c) 2014, PostgreSQL Global Development Group |
| 7 | + * |
| 8 | + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally |
| 9 | + * suited to estimating the cardinality of very large sets; in particular, we |
| 10 | + * have not attempted to further optimize the implementation as described in |
| 11 | + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic |
| 12 | + * Engineering of a State of The Art Cardinality Estimation Algorithm". |
| 13 | + * |
| 14 | + * A sparse representation of HyperLogLog state is used, with fixed space |
| 15 | + * overhead. |
| 16 | + * |
| 17 | + * The copyright terms of Ohno's original version (the MIT license) follow. |
| 18 | + * |
| 19 | + * IDENTIFICATION |
| 20 | + * src/backend/lib/hyperloglog.c |
| 21 | + * |
| 22 | + *------------------------------------------------------------------------- |
| 23 | + */ |
| 24 | + |
| 25 | +/* |
| 26 | + * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> |
| 27 | + * |
| 28 | + * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 29 | + * of this software and associated documentation files (the 'Software'), to |
| 30 | + * deal in the Software without restriction, including without limitation the |
| 31 | + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 32 | + * sell copies of the Software, and to permit persons to whom the Software is |
| 33 | + * furnished to do so, subject to the following conditions: |
| 34 | + * |
| 35 | + * The above copyright notice and this permission notice shall be included in |
| 36 | + * all copies or substantial portions of the Software. |
| 37 | + * |
| 38 | + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 39 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 40 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 41 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 42 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| 43 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| 44 | + * IN THE SOFTWARE. |
| 45 | + */ |
| 46 | + |
| 47 | +#include"postgres.h" |
| 48 | + |
| 49 | +#include<math.h> |
| 50 | + |
| 51 | +#include"lib/hyperloglog.h" |
| 52 | + |
| 53 | +#definePOW_2_32(4294967296.0) |
| 54 | +#defineNEG_POW_2_32(-4294967296.0) |
| 55 | + |
| 56 | +staticinlineuint8rho(uint32x,uint8b); |
| 57 | + |
| 58 | +/* |
| 59 | + * Initialize HyperLogLog track state |
| 60 | + * |
| 61 | + * bwidth is bit width (so register size will be 2 to the power of bwidth). |
| 62 | + * Must be between 4 and 16 inclusive. |
| 63 | + */ |
| 64 | +void |
| 65 | +initHyperLogLog(hyperLogLogState*cState,uint8bwidth) |
| 66 | +{ |
| 67 | +doublealpha; |
| 68 | + |
| 69 | +if (bwidth<4||bwidth>16) |
| 70 | +elog(ERROR,"bit width must be between 4 and 16 inclusive"); |
| 71 | + |
| 72 | +cState->registerWidth=bwidth; |
| 73 | +cState->nRegisters=1 <<bwidth; |
| 74 | +cState->arrSize=sizeof(uint8)*cState->nRegisters+1; |
| 75 | + |
| 76 | +/* |
| 77 | + * Initialize hashes array to zero, not negative infinity, per discussion |
| 78 | + * of the coupon collector problem in the HyperLogLog paper |
| 79 | + */ |
| 80 | +cState->hashesArr=palloc0(cState->arrSize); |
| 81 | + |
| 82 | +/* |
| 83 | + * "alpha" is a value that for each possible number of registers (m) is |
| 84 | + * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z |
| 85 | + * is "the indicator function" through which we finally compute E, |
| 86 | + * estimated cardinality). |
| 87 | + */ |
| 88 | +switch (cState->nRegisters) |
| 89 | +{ |
| 90 | +case16: |
| 91 | +alpha=0.673; |
| 92 | +break; |
| 93 | +case32: |
| 94 | +alpha=0.697; |
| 95 | +break; |
| 96 | +case64: |
| 97 | +alpha=0.709; |
| 98 | +break; |
| 99 | +default: |
| 100 | +alpha=0.7213 / (1.0+1.079 /cState->nRegisters); |
| 101 | +} |
| 102 | + |
| 103 | +/* |
| 104 | + * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog |
| 105 | + * estimate E |
| 106 | + */ |
| 107 | +cState->alphaMM=alpha*cState->nRegisters*cState->nRegisters; |
| 108 | +} |
| 109 | + |
| 110 | +/* |
| 111 | + * Adds element to the estimator, from caller-supplied hash. |
| 112 | + * |
| 113 | + * It is critical that the hash value passed be an actual hash value, typically |
| 114 | + * generated using hash_any(). The algorithm relies on a specific bit-pattern |
| 115 | + * observable in conjunction with stochastic averaging. There must be a |
| 116 | + * uniform distribution of bits in hash values for each distinct original value |
| 117 | + * observed. |
| 118 | + */ |
| 119 | +void |
| 120 | +addHyperLogLog(hyperLogLogState*cState,uint32hash) |
| 121 | +{ |
| 122 | +uint8count; |
| 123 | +uint32index; |
| 124 | + |
| 125 | +/* Use the first "k" (registerWidth) bits as a zero based index */ |
| 126 | +index=hash >> (BITS_PER_BYTE*sizeof(uint32)-cState->registerWidth); |
| 127 | + |
| 128 | +/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ |
| 129 | +count=rho(hash <<cState->registerWidth, |
| 130 | +BITS_PER_BYTE*sizeof(uint32)-cState->registerWidth); |
| 131 | + |
| 132 | +cState->hashesArr[index]=Max(count,cState->hashesArr[index]); |
| 133 | +} |
| 134 | + |
| 135 | +/* |
| 136 | + * Estimates cardinality, based on elements added so far |
| 137 | + */ |
| 138 | +double |
| 139 | +estimateHyperLogLog(hyperLogLogState*cState) |
| 140 | +{ |
| 141 | +doubleresult; |
| 142 | +doublesum=0.0; |
| 143 | +inti; |
| 144 | + |
| 145 | +for (i=0;i<cState->nRegisters;i++) |
| 146 | +{ |
| 147 | +sum+=1.0 /pow(2.0,cState->hashesArr[i]); |
| 148 | +} |
| 149 | + |
| 150 | +/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ |
| 151 | +result=cState->alphaMM /sum; |
| 152 | + |
| 153 | +if (result <= (5.0 /2.0)*cState->nRegisters) |
| 154 | +{ |
| 155 | +/* Small range correction */ |
| 156 | +intzero_count=0; |
| 157 | + |
| 158 | +for (i=0;i<cState->nRegisters;i++) |
| 159 | +{ |
| 160 | +if (cState->hashesArr[i]==0) |
| 161 | +zero_count++; |
| 162 | +} |
| 163 | + |
| 164 | +if (zero_count!=0) |
| 165 | +result=cState->nRegisters*log((double)cState->nRegisters / |
| 166 | +zero_count); |
| 167 | +} |
| 168 | +elseif (result> (1.0 /30.0)*POW_2_32) |
| 169 | +{ |
| 170 | +/* Large range correction */ |
| 171 | +result=NEG_POW_2_32*log(1.0- (result /POW_2_32)); |
| 172 | +} |
| 173 | + |
| 174 | +returnresult; |
| 175 | +} |
| 176 | + |
| 177 | +/* |
| 178 | + * Merges the estimate from one HyperLogLog state to another, returning the |
| 179 | + * estimate of their union. |
| 180 | + * |
| 181 | + * The number of registers in each must match. |
| 182 | + */ |
| 183 | +void |
| 184 | +mergeHyperLogLog(hyperLogLogState*cState,consthyperLogLogState*oState) |
| 185 | +{ |
| 186 | +intr; |
| 187 | + |
| 188 | +if (cState->nRegisters!=oState->nRegisters) |
| 189 | +elog(ERROR,"number of registers mismatch: %zu != %zu", |
| 190 | +cState->nRegisters,oState->nRegisters); |
| 191 | + |
| 192 | +for (r=0;r<cState->nRegisters;++r) |
| 193 | +{ |
| 194 | +cState->hashesArr[r]=Max(cState->hashesArr[r],oState->hashesArr[r]); |
| 195 | +} |
| 196 | +} |
| 197 | + |
| 198 | + |
| 199 | +/* |
| 200 | + * Worker for addHyperLogLog(). |
| 201 | + * |
| 202 | + * Calculates the position of the first set bit in first b bits of x argument |
| 203 | + * starting from the first, reading from most significant to least significant |
| 204 | + * bits. |
| 205 | + * |
| 206 | + * Example (when considering fist 10 bits of x): |
| 207 | + * |
| 208 | + * rho(x = 0b1000000000) returns 1 |
| 209 | + * rho(x = 0b0010000000) returns 3 |
| 210 | + * rho(x = 0b0000000000) returns b + 1 |
| 211 | + * |
| 212 | + * "The binary address determined by the first b bits of x" |
| 213 | + * |
| 214 | + * Return value "j" used to index bit pattern to watch. |
| 215 | + */ |
| 216 | +staticinlineuint8 |
| 217 | +rho(uint32x,uint8b) |
| 218 | +{ |
| 219 | +uint8j=1; |
| 220 | + |
| 221 | +while (j <=b&& !(x&0x80000000)) |
| 222 | +{ |
| 223 | +j++; |
| 224 | +x <<=1; |
| 225 | +} |
| 226 | + |
| 227 | +returnj; |
| 228 | +} |