Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * sampling.c
4 : * Relation block sampling routines.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/misc/sampling.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include <math.h>
19 :
20 : #include "utils/sampling.h"
21 :
22 :
23 : /*
24 : * BlockSampler_Init -- prepare for random sampling of blocknumbers
25 : *
26 : * BlockSampler provides algorithm for block level sampling of a relation
27 : * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28 : * It selects a random sample of samplesize blocks out of
29 : * the nblocks blocks in the table. If the table has less than
30 : * samplesize blocks, all blocks are selected.
31 : *
32 : * Since we know the total number of blocks in advance, we can use the
33 : * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34 : * algorithm.
35 : */
36 : void
37 217 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
38 : long randseed)
39 : {
40 217 : bs->N = nblocks; /* measured table size */
41 :
42 : /*
43 : * If we decide to reduce samplesize for tables that have less or not much
44 : * more than samplesize blocks, here is the place to do it.
45 : */
46 217 : bs->n = samplesize;
47 217 : bs->t = 0; /* blocks scanned so far */
48 217 : bs->m = 0; /* blocks selected so far */
49 :
50 217 : sampler_random_init_state(randseed, bs->randstate);
51 217 : }
52 :
53 : bool
54 10157 : BlockSampler_HasMore(BlockSampler bs)
55 : {
56 10157 : return (bs->t < bs->N) && (bs->m < bs->n);
57 : }
58 :
59 : BlockNumber
60 4970 : BlockSampler_Next(BlockSampler bs)
61 : {
62 4970 : BlockNumber K = bs->N - bs->t; /* remaining blocks */
63 4970 : int k = bs->n - bs->m; /* blocks still to sample */
64 : double p; /* probability to skip block */
65 : double V; /* random */
66 :
67 4970 : Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
68 :
69 4970 : if ((BlockNumber) k >= K)
70 : {
71 : /* need all the rest */
72 4970 : bs->m++;
73 4970 : return bs->t++;
74 : }
75 :
76 : /*----------
77 : * It is not obvious that this code matches Knuth's Algorithm S.
78 : * Knuth says to skip the current block with probability 1 - k/K.
79 : * If we are to skip, we should advance t (hence decrease K), and
80 : * repeat the same probabilistic test for the next block. The naive
81 : * implementation thus requires a sampler_random_fract() call for each
82 : * block number. But we can reduce this to one sampler_random_fract()
83 : * call per selected block, by noting that each time the while-test
84 : * succeeds, we can reinterpret V as a uniform random number in the range
85 : * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
86 : * the appropriate fraction of its former value, and our next loop
87 : * makes the appropriate probabilistic test.
88 : *
89 : * We have initially K > k > 0. If the loop reduces K to equal k,
90 : * the next while-test must fail since p will become exactly zero
91 : * (we assume there will not be roundoff error in the division).
92 : * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
93 : * to be doubly sure about roundoff error.) Therefore K cannot become
94 : * less than k, which means that we cannot fail to select enough blocks.
95 : *----------
96 : */
97 0 : V = sampler_random_fract(bs->randstate);
98 0 : p = 1.0 - (double) k / (double) K;
99 0 : while (V < p)
100 : {
101 : /* skip */
102 0 : bs->t++;
103 0 : K--; /* keep K == N - t */
104 :
105 : /* adjust p to be new cutoff point in reduced range */
106 0 : p *= 1.0 - (double) k / (double) K;
107 : }
108 :
109 : /* select */
110 0 : bs->m++;
111 0 : return bs->t++;
112 : }
113 :
114 : /*
115 : * These two routines embody Algorithm Z from "Random sampling with a
116 : * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
117 : * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
118 : * of the count S of records to skip before processing another record.
119 : * It is computed primarily based on t, the number of records already read.
120 : * The only extra state needed between calls is W, a random state variable.
121 : *
122 : * reservoir_init_selection_state computes the initial W value.
123 : *
124 : * Given that we've already read t records (t >= n), reservoir_get_next_S
125 : * determines the number of records to skip before the next record is
126 : * processed.
127 : */
128 : void
129 217 : reservoir_init_selection_state(ReservoirState rs, int n)
130 : {
131 : /*
132 : * Reservoir sampling is not used anywhere where it would need to return
133 : * repeatable results so we can initialize it randomly.
134 : */
135 217 : sampler_random_init_state(random(), rs->randstate);
136 :
137 : /* Initial value of W (for use when Algorithm Z is first applied) */
138 217 : rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
139 217 : }
140 :
141 : double
142 0 : reservoir_get_next_S(ReservoirState rs, double t, int n)
143 : {
144 : double S;
145 :
146 : /* The magic constant here is T from Vitter's paper */
147 0 : if (t <= (22.0 * n))
148 : {
149 : /* Process records using Algorithm X until t is large enough */
150 : double V,
151 : quot;
152 :
153 0 : V = sampler_random_fract(rs->randstate); /* Generate V */
154 0 : S = 0;
155 0 : t += 1;
156 : /* Note: "num" in Vitter's code is always equal to t - n */
157 0 : quot = (t - (double) n) / t;
158 : /* Find min S satisfying (4.1) */
159 0 : while (quot > V)
160 : {
161 0 : S += 1;
162 0 : t += 1;
163 0 : quot *= (t - (double) n) / t;
164 : }
165 : }
166 : else
167 : {
168 : /* Now apply Algorithm Z */
169 0 : double W = rs->W;
170 0 : double term = t - (double) n + 1;
171 :
172 : for (;;)
173 : {
174 : double numer,
175 : numer_lim,
176 : denom;
177 : double U,
178 : X,
179 : lhs,
180 : rhs,
181 : y,
182 : tmp;
183 :
184 : /* Generate U and X */
185 0 : U = sampler_random_fract(rs->randstate);
186 0 : X = t * (W - 1.0);
187 0 : S = floor(X); /* S is tentatively set to floor(X) */
188 : /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
189 0 : tmp = (t + 1) / term;
190 0 : lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
191 0 : rhs = (((t + X) / (term + S)) * term) / t;
192 0 : if (lhs <= rhs)
193 : {
194 0 : W = rhs / lhs;
195 0 : break;
196 : }
197 : /* Test if U <= f(S)/cg(X) */
198 0 : y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
199 0 : if ((double) n < S)
200 : {
201 0 : denom = t;
202 0 : numer_lim = term + S;
203 : }
204 : else
205 : {
206 0 : denom = t - (double) n + S;
207 0 : numer_lim = t + 1;
208 : }
209 0 : for (numer = t + S; numer >= numer_lim; numer -= 1)
210 : {
211 0 : y *= numer / denom;
212 0 : denom -= 1;
213 : }
214 0 : W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
215 0 : if (exp(log(y) / n) <= (t + X) / t)
216 0 : break;
217 0 : }
218 0 : rs->W = W;
219 : }
220 0 : return S;
221 : }
222 :
223 :
224 : /*----------
225 : * Random number generator used by sampling
226 : *----------
227 : */
228 : void
229 434 : sampler_random_init_state(long seed, SamplerRandomState randstate)
230 : {
231 434 : randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */
232 434 : randstate[1] = (unsigned short) seed;
233 434 : randstate[2] = (unsigned short) (seed >> 16);
234 434 : }
235 :
236 : /* Select a random value R uniformly distributed in (0 - 1) */
237 : double
238 217 : sampler_random_fract(SamplerRandomState randstate)
239 : {
240 : double res;
241 :
242 : /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
243 : do
244 : {
245 217 : res = pg_erand48(randstate);
246 217 : } while (res == 0.0);
247 217 : return res;
248 : }
249 :
250 :
251 : /*
252 : * Backwards-compatible API for block sampling
253 : *
254 : * This code is now deprecated, but since it's still in use by many FDWs,
255 : * we should keep it for awhile at least. The functionality is the same as
256 : * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
257 : * except that a common random state is used across all callers.
258 : */
259 : static ReservoirStateData oldrs;
260 :
261 : double
262 0 : anl_random_fract(void)
263 : {
264 : /* initialize if first time through */
265 0 : if (oldrs.randstate[0] == 0)
266 0 : sampler_random_init_state(random(), oldrs.randstate);
267 :
268 : /* and compute a random fraction */
269 0 : return sampler_random_fract(oldrs.randstate);
270 : }
271 :
272 : double
273 0 : anl_init_selection_state(int n)
274 : {
275 : /* initialize if first time through */
276 0 : if (oldrs.randstate[0] == 0)
277 0 : sampler_random_init_state(random(), oldrs.randstate);
278 :
279 : /* Initial value of W (for use when Algorithm Z is first applied) */
280 0 : return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
281 : }
282 :
283 : double
284 0 : anl_get_next_S(double t, int n, double *stateptr)
285 : {
286 : double result;
287 :
288 0 : oldrs.W = *stateptr;
289 0 : result = reservoir_get_next_S(&oldrs, t, n);
290 0 : *stateptr = oldrs.W;
291 0 : return result;
292 : }
|