Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ragetypes_typanalyze.c
4 : * Functions for gathering statistics from range columns
5 : *
6 : * For a range type column, histograms of lower and upper bounds, and
7 : * the fraction of NULL and empty ranges are collected.
8 : *
9 : * Both histograms have the same length, and they are combined into a
10 : * single array of ranges. This has the same shape as the histogram that
11 : * std_typanalyze would collect, but the values are different. Each range
12 : * in the array is a valid range, even though the lower and upper bounds
13 : * come from different tuples. In theory, the standard scalar selectivity
14 : * functions could be used with the combined histogram.
15 : *
16 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
17 : * Portions Copyright (c) 1994, Regents of the University of California
18 : *
19 : *
20 : * IDENTIFICATION
21 : * src/backend/utils/adt/rangetypes_typanalyze.c
22 : *
23 : *-------------------------------------------------------------------------
24 : */
25 : #include "postgres.h"
26 :
27 : #include "catalog/pg_operator.h"
28 : #include "commands/vacuum.h"
29 : #include "utils/builtins.h"
30 : #include "utils/lsyscache.h"
31 : #include "utils/rangetypes.h"
32 :
33 : static int float8_qsort_cmp(const void *a1, const void *a2);
34 : static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
35 : static void compute_range_stats(VacAttrStats *stats,
36 : AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
37 :
38 : /*
39 : * range_typanalyze -- typanalyze function for range columns
40 : */
41 : Datum
42 2 : range_typanalyze(PG_FUNCTION_ARGS)
43 : {
44 2 : VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
45 : TypeCacheEntry *typcache;
46 2 : Form_pg_attribute attr = stats->attr;
47 :
48 : /* Get information about range type; note column might be a domain */
49 2 : typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
50 :
51 2 : if (attr->attstattarget < 0)
52 2 : attr->attstattarget = default_statistics_target;
53 :
54 2 : stats->compute_stats = compute_range_stats;
55 2 : stats->extra_data = typcache;
56 : /* same as in std_typanalyze */
57 2 : stats->minrows = 300 * attr->attstattarget;
58 :
59 2 : PG_RETURN_BOOL(true);
60 : }
61 :
62 : /*
63 : * Comparison function for sorting float8s, used for range lengths.
64 : */
65 : static int
66 19222 : float8_qsort_cmp(const void *a1, const void *a2)
67 : {
68 19222 : const float8 *f1 = (const float8 *) a1;
69 19222 : const float8 *f2 = (const float8 *) a2;
70 :
71 19222 : if (*f1 < *f2)
72 8 : return -1;
73 19214 : else if (*f1 == *f2)
74 16808 : return 0;
75 : else
76 2406 : return 1;
77 : }
78 :
79 : /*
80 : * Comparison function for sorting RangeBounds.
81 : */
82 : static int
83 267566 : range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
84 : {
85 267566 : RangeBound *b1 = (RangeBound *) a1;
86 267566 : RangeBound *b2 = (RangeBound *) a2;
87 267566 : TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
88 :
89 267566 : return range_cmp_bounds(typcache, b1, b2);
90 : }
91 :
92 : /*
93 : * compute_range_stats() -- compute statistics for a range column
94 : */
95 : static void
96 2 : compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
97 : int samplerows, double totalrows)
98 : {
99 2 : TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
100 2 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
101 2 : int null_cnt = 0;
102 2 : int non_null_cnt = 0;
103 2 : int non_empty_cnt = 0;
104 2 : int empty_cnt = 0;
105 : int range_no;
106 : int slot_idx;
107 2 : int num_bins = stats->attr->attstattarget;
108 : int num_hist;
109 : float8 *lengths;
110 : RangeBound *lowers,
111 : *uppers;
112 2 : double total_width = 0;
113 :
114 : /* Allocate memory to hold range bounds and lengths of the sample ranges. */
115 2 : lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
116 2 : uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
117 2 : lengths = (float8 *) palloc(sizeof(float8) * samplerows);
118 :
119 : /* Loop over the sample ranges. */
120 12402 : for (range_no = 0; range_no < samplerows; range_no++)
121 : {
122 : Datum value;
123 : bool isnull,
124 : empty;
125 : RangeType *range;
126 : RangeBound lower,
127 : upper;
128 : float8 length;
129 :
130 12400 : vacuum_delay_point();
131 :
132 12400 : value = fetchfunc(stats, range_no, &isnull);
133 12400 : if (isnull)
134 : {
135 : /* range is null, just count that */
136 0 : null_cnt++;
137 0 : continue;
138 : }
139 :
140 : /*
141 : * XXX: should we ignore wide values, like std_typanalyze does, to
142 : * avoid bloating the statistics table?
143 : */
144 12400 : total_width += VARSIZE_ANY(DatumGetPointer(value));
145 :
146 : /* Get range and deserialize it for further analysis. */
147 12400 : range = DatumGetRangeType(value);
148 12400 : range_deserialize(typcache, range, &lower, &upper, &empty);
149 :
150 12400 : if (!empty)
151 : {
152 : /* Remember bounds and length for further usage in histograms */
153 10400 : lowers[non_empty_cnt] = lower;
154 10400 : uppers[non_empty_cnt] = upper;
155 :
156 10400 : if (lower.infinite || upper.infinite)
157 : {
158 : /* Length of any kind of an infinite range is infinite */
159 400 : length = get_float8_infinity();
160 : }
161 10000 : else if (has_subdiff)
162 : {
163 : /*
164 : * For an ordinary range, use subdiff function between upper
165 : * and lower bound values.
166 : */
167 10000 : length = DatumGetFloat8(FunctionCall2Coll(
168 : &typcache->rng_subdiff_finfo,
169 : typcache->rng_collation,
170 : upper.val, lower.val));
171 : }
172 : else
173 : {
174 : /* Use default value of 1.0 if no subdiff is available. */
175 0 : length = 1.0;
176 : }
177 10400 : lengths[non_empty_cnt] = length;
178 :
179 10400 : non_empty_cnt++;
180 : }
181 : else
182 2000 : empty_cnt++;
183 :
184 12400 : non_null_cnt++;
185 : }
186 :
187 2 : slot_idx = 0;
188 :
189 : /* We can only compute real stats if we found some non-null values. */
190 2 : if (non_null_cnt > 0)
191 : {
192 : Datum *bound_hist_values;
193 : Datum *length_hist_values;
194 : int pos,
195 : posfrac,
196 : delta,
197 : deltafrac,
198 : i;
199 : MemoryContext old_cxt;
200 : float4 *emptyfrac;
201 :
202 2 : stats->stats_valid = true;
203 : /* Do the simple null-frac and width stats */
204 2 : stats->stanullfrac = (double) null_cnt / (double) samplerows;
205 2 : stats->stawidth = total_width / (double) non_null_cnt;
206 :
207 : /* Estimate that non-null values are unique */
208 2 : stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
209 :
210 : /* Must copy the target values into anl_context */
211 2 : old_cxt = MemoryContextSwitchTo(stats->anl_context);
212 :
213 : /*
214 : * Generate a bounds histogram slot entry if there are at least two
215 : * values.
216 : */
217 2 : if (non_empty_cnt >= 2)
218 : {
219 : /* Sort bound values */
220 2 : qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
221 : range_bound_qsort_cmp, typcache);
222 2 : qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
223 : range_bound_qsort_cmp, typcache);
224 :
225 2 : num_hist = non_empty_cnt;
226 2 : if (num_hist > num_bins)
227 2 : num_hist = num_bins + 1;
228 :
229 2 : bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
230 :
231 : /*
232 : * The object of this loop is to construct ranges from first and
233 : * last entries in lowers[] and uppers[] along with evenly-spaced
234 : * values in between. So the i'th value is a range of lowers[(i *
235 : * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
236 : * (num_hist - 1)]. But computing that subscript directly risks
237 : * integer overflow when the stats target is more than a couple
238 : * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
239 : * at each step, tracking the integral and fractional parts of the
240 : * sum separately.
241 : */
242 2 : delta = (non_empty_cnt - 1) / (num_hist - 1);
243 2 : deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
244 2 : pos = posfrac = 0;
245 :
246 204 : for (i = 0; i < num_hist; i++)
247 : {
248 202 : bound_hist_values[i] = PointerGetDatum(range_serialize(
249 : typcache, &lowers[pos], &uppers[pos], false));
250 202 : pos += delta;
251 202 : posfrac += deltafrac;
252 202 : if (posfrac >= (num_hist - 1))
253 : {
254 : /* fractional part exceeds 1, carry to integer part */
255 198 : pos++;
256 198 : posfrac -= (num_hist - 1);
257 : }
258 : }
259 :
260 2 : stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
261 2 : stats->stavalues[slot_idx] = bound_hist_values;
262 2 : stats->numvalues[slot_idx] = num_hist;
263 2 : slot_idx++;
264 : }
265 :
266 : /*
267 : * Generate a length histogram slot entry if there are at least two
268 : * values.
269 : */
270 2 : if (non_empty_cnt >= 2)
271 : {
272 : /*
273 : * Ascending sort of range lengths for further filling of
274 : * histogram
275 : */
276 2 : qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
277 :
278 2 : num_hist = non_empty_cnt;
279 2 : if (num_hist > num_bins)
280 2 : num_hist = num_bins + 1;
281 :
282 2 : length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
283 :
284 : /*
285 : * The object of this loop is to copy the first and last lengths[]
286 : * entries along with evenly-spaced values in between. So the i'th
287 : * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
288 : * computing that subscript directly risks integer overflow when
289 : * the stats target is more than a couple thousand. Instead we
290 : * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
291 : * the integral and fractional parts of the sum separately.
292 : */
293 2 : delta = (non_empty_cnt - 1) / (num_hist - 1);
294 2 : deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
295 2 : pos = posfrac = 0;
296 :
297 204 : for (i = 0; i < num_hist; i++)
298 : {
299 202 : length_hist_values[i] = Float8GetDatum(lengths[pos]);
300 202 : pos += delta;
301 202 : posfrac += deltafrac;
302 202 : if (posfrac >= (num_hist - 1))
303 : {
304 : /* fractional part exceeds 1, carry to integer part */
305 198 : pos++;
306 198 : posfrac -= (num_hist - 1);
307 : }
308 : }
309 : }
310 : else
311 : {
312 : /*
313 : * Even when we don't create the histogram, store an empty array
314 : * to mean "no histogram". We can't just leave stavalues NULL,
315 : * because get_attstatsslot() errors if you ask for stavalues, and
316 : * it's NULL. We'll still store the empty fraction in stanumbers.
317 : */
318 0 : length_hist_values = palloc(0);
319 0 : num_hist = 0;
320 : }
321 2 : stats->staop[slot_idx] = Float8LessOperator;
322 2 : stats->stavalues[slot_idx] = length_hist_values;
323 2 : stats->numvalues[slot_idx] = num_hist;
324 2 : stats->statypid[slot_idx] = FLOAT8OID;
325 2 : stats->statyplen[slot_idx] = sizeof(float8);
326 : #ifdef USE_FLOAT8_BYVAL
327 : stats->statypbyval[slot_idx] = true;
328 : #else
329 2 : stats->statypbyval[slot_idx] = false;
330 : #endif
331 2 : stats->statypalign[slot_idx] = 'd';
332 :
333 : /* Store the fraction of empty ranges */
334 2 : emptyfrac = (float4 *) palloc(sizeof(float4));
335 2 : *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
336 2 : stats->stanumbers[slot_idx] = emptyfrac;
337 2 : stats->numnumbers[slot_idx] = 1;
338 :
339 2 : stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
340 2 : slot_idx++;
341 :
342 2 : MemoryContextSwitchTo(old_cxt);
343 : }
344 0 : else if (null_cnt > 0)
345 : {
346 : /* We found only nulls; assume the column is entirely null */
347 0 : stats->stats_valid = true;
348 0 : stats->stanullfrac = 1.0;
349 0 : stats->stawidth = 0; /* "unknown" */
350 0 : stats->stadistinct = 0.0; /* "unknown" */
351 : }
352 :
353 : /*
354 : * We don't need to bother cleaning up any of our temporary palloc's. The
355 : * hashtable should also go away, as it used a child memory context.
356 : */
357 2 : }
|