Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * rangetypes_selfuncs.c
4 : * Functions for selectivity estimation of range operators
5 : *
6 : * Estimates are based on histograms of lower and upper bounds, and the
7 : * fraction of empty ranges.
8 : *
9 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : *
13 : * IDENTIFICATION
14 : * src/backend/utils/adt/rangetypes_selfuncs.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 :
20 : #include "access/htup_details.h"
21 : #include "catalog/pg_operator.h"
22 : #include "catalog/pg_statistic.h"
23 : #include "catalog/pg_type.h"
24 : #include "utils/builtins.h"
25 : #include "utils/lsyscache.h"
26 : #include "utils/rangetypes.h"
27 : #include "utils/selfuncs.h"
28 : #include "utils/typcache.h"
29 :
30 : static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
31 : RangeType *constval, Oid operator);
32 : static double default_range_selectivity(Oid operator);
33 : static double calc_hist_selectivity(TypeCacheEntry *typcache,
34 : VariableStatData *vardata, RangeType *constval,
35 : Oid operator);
36 : static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
37 : RangeBound *constbound,
38 : RangeBound *hist, int hist_nvalues,
39 : bool equal);
40 : static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
41 : RangeBound *hist, int hist_length, bool equal);
42 : static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
43 : RangeBound *hist1, RangeBound *hist2);
44 : static float8 get_len_position(double value, double hist1, double hist2);
45 : static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
46 : RangeBound *bound2);
47 : static int length_hist_bsearch(Datum *length_hist_values,
48 : int length_hist_nvalues, double value, bool equal);
49 : static double calc_length_hist_frac(Datum *length_hist_values,
50 : int length_hist_nvalues, double length1, double length2, bool equal);
51 : static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
52 : RangeBound *lower, RangeBound *upper,
53 : RangeBound *hist_lower, int hist_nvalues,
54 : Datum *length_hist_values, int length_hist_nvalues);
55 : static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
56 : RangeBound *lower, RangeBound *upper,
57 : RangeBound *hist_lower, int hist_nvalues,
58 : Datum *length_hist_values, int length_hist_nvalues);
59 :
60 : /*
61 : * Returns a default selectivity estimate for given operator, when we don't
62 : * have statistics or cannot use them for some reason.
63 : */
64 : static double
65 105 : default_range_selectivity(Oid operator)
66 : {
67 105 : switch (operator)
68 : {
69 : case OID_RANGE_OVERLAP_OP:
70 10 : return 0.01;
71 :
72 : case OID_RANGE_CONTAINS_OP:
73 : case OID_RANGE_CONTAINED_OP:
74 21 : return 0.005;
75 :
76 : case OID_RANGE_CONTAINS_ELEM_OP:
77 : case OID_RANGE_ELEM_CONTAINED_OP:
78 :
79 : /*
80 : * "range @> elem" is more or less identical to a scalar
81 : * inequality "A >= b AND A <= c".
82 : */
83 12 : return DEFAULT_RANGE_INEQ_SEL;
84 :
85 : case OID_RANGE_LESS_OP:
86 : case OID_RANGE_LESS_EQUAL_OP:
87 : case OID_RANGE_GREATER_OP:
88 : case OID_RANGE_GREATER_EQUAL_OP:
89 : case OID_RANGE_LEFT_OP:
90 : case OID_RANGE_RIGHT_OP:
91 : case OID_RANGE_OVERLAPS_LEFT_OP:
92 : case OID_RANGE_OVERLAPS_RIGHT_OP:
93 : /* these are similar to regular scalar inequalities */
94 62 : return DEFAULT_INEQ_SEL;
95 :
96 : default:
97 : /* all range operators should be handled above, but just in case */
98 0 : return 0.01;
99 : }
100 : }
101 :
102 : /*
103 : * rangesel -- restriction selectivity for range operators
104 : */
105 : Datum
106 135 : rangesel(PG_FUNCTION_ARGS)
107 : {
108 135 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
109 135 : Oid operator = PG_GETARG_OID(1);
110 135 : List *args = (List *) PG_GETARG_POINTER(2);
111 135 : int varRelid = PG_GETARG_INT32(3);
112 : VariableStatData vardata;
113 : Node *other;
114 : bool varonleft;
115 : Selectivity selec;
116 135 : TypeCacheEntry *typcache = NULL;
117 135 : RangeType *constrange = NULL;
118 :
119 : /*
120 : * If expression is not (variable op something) or (something op
121 : * variable), then punt and return a default estimate.
122 : */
123 135 : if (!get_restriction_variable(root, args, varRelid,
124 : &vardata, &other, &varonleft))
125 0 : PG_RETURN_FLOAT8(default_range_selectivity(operator));
126 :
127 : /*
128 : * Can't do anything useful if the something is not a constant, either.
129 : */
130 135 : if (!IsA(other, Const))
131 : {
132 0 : ReleaseVariableStats(vardata);
133 0 : PG_RETURN_FLOAT8(default_range_selectivity(operator));
134 : }
135 :
136 : /*
137 : * All the range operators are strict, so we can cope with a NULL constant
138 : * right away.
139 : */
140 135 : if (((Const *) other)->constisnull)
141 : {
142 0 : ReleaseVariableStats(vardata);
143 0 : PG_RETURN_FLOAT8(0.0);
144 : }
145 :
146 : /*
147 : * If var is on the right, commute the operator, so that we can assume the
148 : * var is on the left in what follows.
149 : */
150 135 : if (!varonleft)
151 : {
152 : /* we have other Op var, commute to make var Op other */
153 1 : operator = get_commutator(operator);
154 1 : if (!operator)
155 : {
156 : /* Use default selectivity (should we raise an error instead?) */
157 0 : ReleaseVariableStats(vardata);
158 0 : PG_RETURN_FLOAT8(default_range_selectivity(operator));
159 : }
160 : }
161 :
162 : /*
163 : * OK, there's a Var and a Const we're dealing with here. We need the
164 : * Const to be of same range type as the column, else we can't do anything
165 : * useful. (Such cases will likely fail at runtime, but here we'd rather
166 : * just return a default estimate.)
167 : *
168 : * If the operator is "range @> element", the constant should be of the
169 : * element type of the range column. Convert it to a range that includes
170 : * only that single point, so that we don't need special handling for that
171 : * in what follows.
172 : */
173 135 : if (operator == OID_RANGE_CONTAINS_ELEM_OP)
174 : {
175 11 : typcache = range_get_typcache(fcinfo, vardata.vartype);
176 :
177 11 : if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
178 : {
179 : RangeBound lower,
180 : upper;
181 :
182 11 : lower.inclusive = true;
183 11 : lower.val = ((Const *) other)->constvalue;
184 11 : lower.infinite = false;
185 11 : lower.lower = true;
186 11 : upper.inclusive = true;
187 11 : upper.val = ((Const *) other)->constvalue;
188 11 : upper.infinite = false;
189 11 : upper.lower = false;
190 11 : constrange = range_serialize(typcache, &lower, &upper, false);
191 : }
192 : }
193 124 : else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
194 : {
195 : /*
196 : * Here, the Var is the elem, not the range. For now we just punt and
197 : * return the default estimate. In future we could disassemble the
198 : * range constant and apply scalarineqsel ...
199 : */
200 : }
201 123 : else if (((Const *) other)->consttype == vardata.vartype)
202 : {
203 : /* Both sides are the same range type */
204 123 : typcache = range_get_typcache(fcinfo, vardata.vartype);
205 :
206 123 : constrange = DatumGetRangeType(((Const *) other)->constvalue);
207 : }
208 :
209 : /*
210 : * If we got a valid constant on one side of the operator, proceed to
211 : * estimate using statistics. Otherwise punt and return a default constant
212 : * estimate. Note that calc_rangesel need not handle
213 : * OID_RANGE_ELEM_CONTAINED_OP.
214 : */
215 135 : if (constrange)
216 134 : selec = calc_rangesel(typcache, &vardata, constrange, operator);
217 : else
218 1 : selec = default_range_selectivity(operator);
219 :
220 135 : ReleaseVariableStats(vardata);
221 :
222 135 : CLAMP_PROBABILITY(selec);
223 :
224 135 : PG_RETURN_FLOAT8((float8) selec);
225 : }
226 :
227 : static double
228 134 : calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
229 : RangeType *constval, Oid operator)
230 : {
231 : double hist_selec;
232 : double selec;
233 : float4 empty_frac,
234 : null_frac;
235 :
236 : /*
237 : * First look up the fraction of NULLs and empty ranges from pg_statistic.
238 : */
239 134 : if (HeapTupleIsValid(vardata->statsTuple))
240 : {
241 : Form_pg_statistic stats;
242 : AttStatsSlot sslot;
243 :
244 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
245 0 : null_frac = stats->stanullfrac;
246 :
247 : /* Try to get fraction of empty ranges */
248 0 : if (get_attstatsslot(&sslot, vardata->statsTuple,
249 : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
250 : InvalidOid,
251 : ATTSTATSSLOT_NUMBERS))
252 : {
253 0 : if (sslot.nnumbers != 1)
254 0 : elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
255 0 : empty_frac = sslot.numbers[0];
256 0 : free_attstatsslot(&sslot);
257 : }
258 : else
259 : {
260 : /* No empty fraction statistic. Assume no empty ranges. */
261 0 : empty_frac = 0.0;
262 : }
263 : }
264 : else
265 : {
266 : /*
267 : * No stats are available. Follow through the calculations below
268 : * anyway, assuming no NULLs and no empty ranges. This still allows us
269 : * to give a better-than-nothing estimate based on whether the
270 : * constant is an empty range or not.
271 : */
272 134 : null_frac = 0.0;
273 134 : empty_frac = 0.0;
274 : }
275 :
276 134 : if (RangeIsEmpty(constval))
277 : {
278 : /*
279 : * An empty range matches all ranges, all empty ranges, or nothing,
280 : * depending on the operator
281 : */
282 30 : switch (operator)
283 : {
284 : /* these return false if either argument is empty */
285 : case OID_RANGE_OVERLAP_OP:
286 : case OID_RANGE_OVERLAPS_LEFT_OP:
287 : case OID_RANGE_OVERLAPS_RIGHT_OP:
288 : case OID_RANGE_LEFT_OP:
289 : case OID_RANGE_RIGHT_OP:
290 : /* nothing is less than an empty range */
291 : case OID_RANGE_LESS_OP:
292 1 : selec = 0.0;
293 1 : break;
294 :
295 : /* only empty ranges can be contained by an empty range */
296 : case OID_RANGE_CONTAINED_OP:
297 : /* only empty ranges are <= an empty range */
298 : case OID_RANGE_LESS_EQUAL_OP:
299 9 : selec = empty_frac;
300 9 : break;
301 :
302 : /* everything contains an empty range */
303 : case OID_RANGE_CONTAINS_OP:
304 : /* everything is >= an empty range */
305 : case OID_RANGE_GREATER_EQUAL_OP:
306 15 : selec = 1.0;
307 15 : break;
308 :
309 : /* all non-empty ranges are > an empty range */
310 : case OID_RANGE_GREATER_OP:
311 5 : selec = 1.0 - empty_frac;
312 5 : break;
313 :
314 : /* an element cannot be empty */
315 : case OID_RANGE_CONTAINS_ELEM_OP:
316 : default:
317 0 : elog(ERROR, "unexpected operator %u", operator);
318 : selec = 0.0; /* keep compiler quiet */
319 : break;
320 : }
321 : }
322 : else
323 : {
324 : /*
325 : * Calculate selectivity using bound histograms. If that fails for
326 : * some reason, e.g no histogram in pg_statistic, use the default
327 : * constant estimate for the fraction of non-empty values. This is
328 : * still somewhat better than just returning the default estimate,
329 : * because this still takes into account the fraction of empty and
330 : * NULL tuples, if we had statistics for them.
331 : */
332 104 : hist_selec = calc_hist_selectivity(typcache, vardata, constval,
333 : operator);
334 104 : if (hist_selec < 0.0)
335 104 : hist_selec = default_range_selectivity(operator);
336 :
337 : /*
338 : * Now merge the results for the empty ranges and histogram
339 : * calculations, realizing that the histogram covers only the
340 : * non-null, non-empty values.
341 : */
342 104 : if (operator == OID_RANGE_CONTAINED_OP)
343 : {
344 : /* empty is contained by anything non-empty */
345 10 : selec = (1.0 - empty_frac) * hist_selec + empty_frac;
346 : }
347 : else
348 : {
349 : /* with any other operator, empty Op non-empty matches nothing */
350 94 : selec = (1.0 - empty_frac) * hist_selec;
351 : }
352 : }
353 :
354 : /* all range operators are strict */
355 134 : selec *= (1.0 - null_frac);
356 :
357 : /* result should be in range, but make sure... */
358 134 : CLAMP_PROBABILITY(selec);
359 :
360 134 : return selec;
361 : }
362 :
363 : /*
364 : * Calculate range operator selectivity using histograms of range bounds.
365 : *
366 : * This estimate is for the portion of values that are not empty and not
367 : * NULL.
368 : */
369 : static double
370 104 : calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
371 : RangeType *constval, Oid operator)
372 : {
373 : AttStatsSlot hslot;
374 : AttStatsSlot lslot;
375 : int nhist;
376 : RangeBound *hist_lower;
377 : RangeBound *hist_upper;
378 : int i;
379 : RangeBound const_lower;
380 : RangeBound const_upper;
381 : bool empty;
382 : double hist_selec;
383 :
384 : /* Can't use the histogram with insecure range support functions */
385 104 : if (!statistic_proc_security_check(vardata,
386 : typcache->rng_cmp_proc_finfo.fn_oid))
387 0 : return -1;
388 208 : if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
389 104 : !statistic_proc_security_check(vardata,
390 : typcache->rng_subdiff_finfo.fn_oid))
391 0 : return -1;
392 :
393 : /* Try to get histogram of ranges */
394 104 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
395 0 : get_attstatsslot(&hslot, vardata->statsTuple,
396 : STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
397 : ATTSTATSSLOT_VALUES)))
398 104 : return -1.0;
399 :
400 : /*
401 : * Convert histogram of ranges into histograms of its lower and upper
402 : * bounds.
403 : */
404 0 : nhist = hslot.nvalues;
405 0 : hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
406 0 : hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
407 0 : for (i = 0; i < nhist; i++)
408 : {
409 0 : range_deserialize(typcache, DatumGetRangeType(hslot.values[i]),
410 0 : &hist_lower[i], &hist_upper[i], &empty);
411 : /* The histogram should not contain any empty ranges */
412 0 : if (empty)
413 0 : elog(ERROR, "bounds histogram contains an empty range");
414 : }
415 :
416 : /* @> and @< also need a histogram of range lengths */
417 0 : if (operator == OID_RANGE_CONTAINS_OP ||
418 : operator == OID_RANGE_CONTAINED_OP)
419 : {
420 0 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
421 0 : get_attstatsslot(&lslot, vardata->statsTuple,
422 : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
423 : InvalidOid,
424 : ATTSTATSSLOT_VALUES)))
425 : {
426 0 : free_attstatsslot(&hslot);
427 0 : return -1.0;
428 : }
429 :
430 : /* check that it's a histogram, not just a dummy entry */
431 0 : if (lslot.nvalues < 2)
432 : {
433 0 : free_attstatsslot(&lslot);
434 0 : free_attstatsslot(&hslot);
435 0 : return -1.0;
436 : }
437 : }
438 : else
439 0 : memset(&lslot, 0, sizeof(lslot));
440 :
441 : /* Extract the bounds of the constant value. */
442 0 : range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
443 0 : Assert(!empty);
444 :
445 : /*
446 : * Calculate selectivity comparing the lower or upper bound of the
447 : * constant with the histogram of lower or upper bounds.
448 : */
449 0 : switch (operator)
450 : {
451 : case OID_RANGE_LESS_OP:
452 :
453 : /*
454 : * The regular b-tree comparison operators (<, <=, >, >=) compare
455 : * the lower bounds first, and the upper bounds for values with
456 : * equal lower bounds. Estimate that by comparing the lower bounds
457 : * only. This gives a fairly accurate estimate assuming there
458 : * aren't many rows with a lower bound equal to the constant's
459 : * lower bound.
460 : */
461 0 : hist_selec =
462 : calc_hist_selectivity_scalar(typcache, &const_lower,
463 : hist_lower, nhist, false);
464 0 : break;
465 :
466 : case OID_RANGE_LESS_EQUAL_OP:
467 0 : hist_selec =
468 : calc_hist_selectivity_scalar(typcache, &const_lower,
469 : hist_lower, nhist, true);
470 0 : break;
471 :
472 : case OID_RANGE_GREATER_OP:
473 0 : hist_selec =
474 0 : 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
475 : hist_lower, nhist, false);
476 0 : break;
477 :
478 : case OID_RANGE_GREATER_EQUAL_OP:
479 0 : hist_selec =
480 0 : 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
481 : hist_lower, nhist, true);
482 0 : break;
483 :
484 : case OID_RANGE_LEFT_OP:
485 : /* var << const when upper(var) < lower(const) */
486 0 : hist_selec =
487 : calc_hist_selectivity_scalar(typcache, &const_lower,
488 : hist_upper, nhist, false);
489 0 : break;
490 :
491 : case OID_RANGE_RIGHT_OP:
492 : /* var >> const when lower(var) > upper(const) */
493 0 : hist_selec =
494 0 : 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
495 : hist_lower, nhist, true);
496 0 : break;
497 :
498 : case OID_RANGE_OVERLAPS_RIGHT_OP:
499 : /* compare lower bounds */
500 0 : hist_selec =
501 0 : 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
502 : hist_lower, nhist, false);
503 0 : break;
504 :
505 : case OID_RANGE_OVERLAPS_LEFT_OP:
506 : /* compare upper bounds */
507 0 : hist_selec =
508 : calc_hist_selectivity_scalar(typcache, &const_upper,
509 : hist_upper, nhist, true);
510 0 : break;
511 :
512 : case OID_RANGE_OVERLAP_OP:
513 : case OID_RANGE_CONTAINS_ELEM_OP:
514 :
515 : /*
516 : * A && B <=> NOT (A << B OR A >> B).
517 : *
518 : * Since A << B and A >> B are mutually exclusive events we can
519 : * sum their probabilities to find probability of (A << B OR A >>
520 : * B).
521 : *
522 : * "range @> elem" is equivalent to "range && [elem,elem]". The
523 : * caller already constructed the singular range from the element
524 : * constant, so just treat it the same as &&.
525 : */
526 0 : hist_selec =
527 : calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
528 : nhist, false);
529 0 : hist_selec +=
530 0 : (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
531 0 : nhist, true));
532 0 : hist_selec = 1.0 - hist_selec;
533 0 : break;
534 :
535 : case OID_RANGE_CONTAINS_OP:
536 0 : hist_selec =
537 0 : calc_hist_selectivity_contains(typcache, &const_lower,
538 : &const_upper, hist_lower, nhist,
539 : lslot.values, lslot.nvalues);
540 0 : break;
541 :
542 : case OID_RANGE_CONTAINED_OP:
543 0 : if (const_lower.infinite)
544 : {
545 : /*
546 : * Lower bound no longer matters. Just estimate the fraction
547 : * with an upper bound <= const upper bound
548 : */
549 0 : hist_selec =
550 : calc_hist_selectivity_scalar(typcache, &const_upper,
551 : hist_upper, nhist, true);
552 : }
553 0 : else if (const_upper.infinite)
554 : {
555 0 : hist_selec =
556 0 : 1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
557 : hist_lower, nhist, false);
558 : }
559 : else
560 : {
561 0 : hist_selec =
562 0 : calc_hist_selectivity_contained(typcache, &const_lower,
563 : &const_upper, hist_lower, nhist,
564 : lslot.values, lslot.nvalues);
565 : }
566 0 : break;
567 :
568 : default:
569 0 : elog(ERROR, "unknown range operator %u", operator);
570 : hist_selec = -1.0; /* keep compiler quiet */
571 : break;
572 : }
573 :
574 0 : free_attstatsslot(&lslot);
575 0 : free_attstatsslot(&hslot);
576 :
577 0 : return hist_selec;
578 : }
579 :
580 :
581 : /*
582 : * Look up the fraction of values less than (or equal, if 'equal' argument
583 : * is true) a given const in a histogram of range bounds.
584 : */
585 : static double
586 0 : calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
587 : RangeBound *hist, int hist_nvalues, bool equal)
588 : {
589 : Selectivity selec;
590 : int index;
591 :
592 : /*
593 : * Find the histogram bin the given constant falls into. Estimate
594 : * selectivity as the number of preceding whole bins.
595 : */
596 0 : index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
597 0 : selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
598 :
599 : /* Adjust using linear interpolation within the bin */
600 0 : if (index >= 0 && index < hist_nvalues - 1)
601 0 : selec += get_position(typcache, constbound, &hist[index],
602 0 : &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
603 :
604 0 : return selec;
605 : }
606 :
607 : /*
608 : * Binary search on an array of range bounds. Returns greatest index of range
609 : * bound in array which is less(less or equal) than given range bound. If all
610 : * range bounds in array are greater or equal(greater) than given range bound,
611 : * return -1. When "equal" flag is set conditions in brackets are used.
612 : *
613 : * This function is used in scalar operator selectivity estimation. Another
614 : * goal of this function is to find a histogram bin where to stop
615 : * interpolation of portion of bounds which are less or equal to given bound.
616 : */
617 : static int
618 0 : rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
619 : int hist_length, bool equal)
620 : {
621 0 : int lower = -1,
622 0 : upper = hist_length - 1,
623 : cmp,
624 : middle;
625 :
626 0 : while (lower < upper)
627 : {
628 0 : middle = (lower + upper + 1) / 2;
629 0 : cmp = range_cmp_bounds(typcache, &hist[middle], value);
630 :
631 0 : if (cmp < 0 || (equal && cmp == 0))
632 0 : lower = middle;
633 : else
634 0 : upper = middle - 1;
635 : }
636 0 : return lower;
637 : }
638 :
639 :
640 : /*
641 : * Binary search on length histogram. Returns greatest index of range length in
642 : * histogram which is less than (less than or equal) the given length value. If
643 : * all lengths in the histogram are greater than (greater than or equal) the
644 : * given length, returns -1.
645 : */
646 : static int
647 0 : length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
648 : double value, bool equal)
649 : {
650 0 : int lower = -1,
651 0 : upper = length_hist_nvalues - 1,
652 : middle;
653 :
654 0 : while (lower < upper)
655 : {
656 : double middleval;
657 :
658 0 : middle = (lower + upper + 1) / 2;
659 :
660 0 : middleval = DatumGetFloat8(length_hist_values[middle]);
661 0 : if (middleval < value || (equal && middleval <= value))
662 0 : lower = middle;
663 : else
664 0 : upper = middle - 1;
665 : }
666 0 : return lower;
667 : }
668 :
669 : /*
670 : * Get relative position of value in histogram bin in [0,1] range.
671 : */
672 : static float8
673 0 : get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
674 : RangeBound *hist2)
675 : {
676 0 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
677 : float8 position;
678 :
679 0 : if (!hist1->infinite && !hist2->infinite)
680 : {
681 : float8 bin_width;
682 :
683 : /*
684 : * Both bounds are finite. Assuming the subtype's comparison function
685 : * works sanely, the value must be finite, too, because it lies
686 : * somewhere between the bounds. If it doesn't, just return something.
687 : */
688 0 : if (value->infinite)
689 0 : return 0.5;
690 :
691 : /* Can't interpolate without subdiff function */
692 0 : if (!has_subdiff)
693 0 : return 0.5;
694 :
695 : /* Calculate relative position using subdiff function. */
696 0 : bin_width = DatumGetFloat8(FunctionCall2Coll(
697 : &typcache->rng_subdiff_finfo,
698 : typcache->rng_collation,
699 : hist2->val,
700 : hist1->val));
701 0 : if (bin_width <= 0.0)
702 0 : return 0.5; /* zero width bin */
703 :
704 0 : position = DatumGetFloat8(FunctionCall2Coll(
705 : &typcache->rng_subdiff_finfo,
706 : typcache->rng_collation,
707 : value->val,
708 : hist1->val))
709 : / bin_width;
710 :
711 : /* Relative position must be in [0,1] range */
712 0 : position = Max(position, 0.0);
713 0 : position = Min(position, 1.0);
714 0 : return position;
715 : }
716 0 : else if (hist1->infinite && !hist2->infinite)
717 : {
718 : /*
719 : * Lower bin boundary is -infinite, upper is finite. If the value is
720 : * -infinite, return 0.0 to indicate it's equal to the lower bound.
721 : * Otherwise return 1.0 to indicate it's infinitely far from the lower
722 : * bound.
723 : */
724 0 : return ((value->infinite && value->lower) ? 0.0 : 1.0);
725 : }
726 0 : else if (!hist1->infinite && hist2->infinite)
727 : {
728 : /* same as above, but in reverse */
729 0 : return ((value->infinite && !value->lower) ? 1.0 : 0.0);
730 : }
731 : else
732 : {
733 : /*
734 : * If both bin boundaries are infinite, they should be equal to each
735 : * other, and the value should also be infinite and equal to both
736 : * bounds. (But don't Assert that, to avoid crashing if a user creates
737 : * a datatype with a broken comparison function).
738 : *
739 : * Assume the value to lie in the middle of the infinite bounds.
740 : */
741 0 : return 0.5;
742 : }
743 : }
744 :
745 :
746 : /*
747 : * Get relative position of value in a length histogram bin in [0,1] range.
748 : */
749 : static double
750 0 : get_len_position(double value, double hist1, double hist2)
751 : {
752 0 : if (!is_infinite(hist1) && !is_infinite(hist2))
753 : {
754 : /*
755 : * Both bounds are finite. The value should be finite too, because it
756 : * lies somewhere between the bounds. If it doesn't, just return
757 : * something.
758 : */
759 0 : if (is_infinite(value))
760 0 : return 0.5;
761 :
762 0 : return 1.0 - (hist2 - value) / (hist2 - hist1);
763 : }
764 0 : else if (is_infinite(hist1) && !is_infinite(hist2))
765 : {
766 : /*
767 : * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
768 : * indicate the value is infinitely far from the lower bound.
769 : */
770 0 : return 1.0;
771 : }
772 0 : else if (is_infinite(hist1) && is_infinite(hist2))
773 : {
774 : /* same as above, but in reverse */
775 0 : return 0.0;
776 : }
777 : else
778 : {
779 : /*
780 : * If both bin boundaries are infinite, they should be equal to each
781 : * other, and the value should also be infinite and equal to both
782 : * bounds. (But don't Assert that, to avoid crashing unnecessarily if
783 : * the caller messes up)
784 : *
785 : * Assume the value to lie in the middle of the infinite bounds.
786 : */
787 0 : return 0.5;
788 : }
789 : }
790 :
791 : /*
792 : * Measure distance between two range bounds.
793 : */
794 : static float8
795 0 : get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
796 : {
797 0 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
798 :
799 0 : if (!bound1->infinite && !bound2->infinite)
800 : {
801 : /*
802 : * No bounds are infinite, use subdiff function or return default
803 : * value of 1.0 if no subdiff is available.
804 : */
805 0 : if (has_subdiff)
806 0 : return
807 0 : DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
808 : typcache->rng_collation,
809 : bound2->val,
810 : bound1->val));
811 : else
812 0 : return 1.0;
813 : }
814 0 : else if (bound1->infinite && bound2->infinite)
815 : {
816 : /* Both bounds are infinite */
817 0 : if (bound1->lower == bound2->lower)
818 0 : return 0.0;
819 : else
820 0 : return get_float8_infinity();
821 : }
822 : else
823 : {
824 : /* One bound is infinite, another is not */
825 0 : return get_float8_infinity();
826 : }
827 : }
828 :
829 : /*
830 : * Calculate the average of function P(x), in the interval [length1, length2],
831 : * where P(x) is the fraction of tuples with length < x (or length <= x if
832 : * 'equal' is true).
833 : */
834 : static double
835 0 : calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
836 : double length1, double length2, bool equal)
837 : {
838 : double frac;
839 : double A,
840 : B,
841 : PA,
842 : PB;
843 : double pos;
844 : int i;
845 : double area;
846 :
847 0 : Assert(length2 >= length1);
848 :
849 0 : if (length2 < 0.0)
850 0 : return 0.0; /* shouldn't happen, but doesn't hurt to check */
851 :
852 : /* All lengths in the table are <= infinite. */
853 0 : if (is_infinite(length2) && equal)
854 0 : return 1.0;
855 :
856 : /*----------
857 : * The average of a function between A and B can be calculated by the
858 : * formula:
859 : *
860 : * B
861 : * 1 /
862 : * ------- | P(x)dx
863 : * B - A /
864 : * A
865 : *
866 : * The geometrical interpretation of the integral is the area under the
867 : * graph of P(x). P(x) is defined by the length histogram. We calculate
868 : * the area in a piecewise fashion, iterating through the length histogram
869 : * bins. Each bin is a trapezoid:
870 : *
871 : * P(x2)
872 : * /|
873 : * / |
874 : * P(x1)/ |
875 : * | |
876 : * | |
877 : * ---+---+--
878 : * x1 x2
879 : *
880 : * where x1 and x2 are the boundaries of the current histogram, and P(x1)
881 : * and P(x1) are the cumulative fraction of tuples at the boundaries.
882 : *
883 : * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
884 : *
885 : * The first bin contains the lower bound passed by the caller, so we
886 : * use linear interpolation between the previous and next histogram bin
887 : * boundary to calculate P(x1). Likewise for the last bin: we use linear
888 : * interpolation to calculate P(x2). For the bins in between, x1 and x2
889 : * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
890 : * P(x1) = (bin index) / (number of bins)
891 : * P(x2) = (bin index + 1 / (number of bins)
892 : */
893 :
894 : /* First bin, the one that contains lower bound */
895 0 : i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
896 0 : if (i >= length_hist_nvalues - 1)
897 0 : return 1.0;
898 :
899 0 : if (i < 0)
900 : {
901 0 : i = 0;
902 0 : pos = 0.0;
903 : }
904 : else
905 : {
906 : /* interpolate length1's position in the bin */
907 0 : pos = get_len_position(length1,
908 0 : DatumGetFloat8(length_hist_values[i]),
909 0 : DatumGetFloat8(length_hist_values[i + 1]));
910 : }
911 0 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
912 0 : B = length1;
913 :
914 : /*
915 : * In the degenerate case that length1 == length2, simply return
916 : * P(length1). This is not merely an optimization: if length1 == length2,
917 : * we'd divide by zero later on.
918 : */
919 0 : if (length2 == length1)
920 0 : return PB;
921 :
922 : /*
923 : * Loop through all the bins, until we hit the last bin, the one that
924 : * contains the upper bound. (if lower and upper bounds are in the same
925 : * bin, this falls out immediately)
926 : */
927 0 : area = 0.0;
928 0 : for (; i < length_hist_nvalues - 1; i++)
929 : {
930 0 : double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
931 :
932 : /* check if we've reached the last bin */
933 0 : if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
934 : break;
935 :
936 : /* the upper bound of previous bin is the lower bound of this bin */
937 0 : A = B;
938 0 : PA = PB;
939 :
940 0 : B = bin_upper;
941 0 : PB = (double) i / (double) (length_hist_nvalues - 1);
942 :
943 : /*
944 : * Add the area of this trapezoid to the total. The point of the
945 : * if-check is to avoid NaN, in the corner case that PA == PB == 0,
946 : * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
947 : * 0) is zero, regardless of the width (B - A).
948 : */
949 0 : if (PA > 0 || PB > 0)
950 0 : area += 0.5 * (PB + PA) * (B - A);
951 : }
952 :
953 : /* Last bin */
954 0 : A = B;
955 0 : PA = PB;
956 :
957 0 : B = length2; /* last bin ends at the query upper bound */
958 0 : if (i >= length_hist_nvalues - 1)
959 0 : pos = 0.0;
960 : else
961 : {
962 0 : if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
963 0 : pos = 0.0;
964 : else
965 0 : pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
966 : }
967 0 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
968 :
969 0 : if (PA > 0 || PB > 0)
970 0 : area += 0.5 * (PB + PA) * (B - A);
971 :
972 : /*
973 : * Ok, we have calculated the area, ie. the integral. Divide by width to
974 : * get the requested average.
975 : *
976 : * Avoid NaN arising from infinite / infinite. This happens at least if
977 : * length2 is infinite. It's not clear what the correct value would be in
978 : * that case, so 0.5 seems as good as any value.
979 : */
980 0 : if (is_infinite(area) && is_infinite(length2))
981 0 : frac = 0.5;
982 : else
983 0 : frac = area / (length2 - length1);
984 :
985 0 : return frac;
986 : }
987 :
988 : /*
989 : * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
990 : * of ranges that fall within the constant lower and upper bounds. This uses
991 : * the histograms of range lower bounds and range lengths, on the assumption
992 : * that the range lengths are independent of the lower bounds.
993 : *
994 : * The caller has already checked that constant lower and upper bounds are
995 : * finite.
996 : */
997 : static double
998 0 : calc_hist_selectivity_contained(TypeCacheEntry *typcache,
999 : RangeBound *lower, RangeBound *upper,
1000 : RangeBound *hist_lower, int hist_nvalues,
1001 : Datum *length_hist_values, int length_hist_nvalues)
1002 : {
1003 : int i,
1004 : upper_index;
1005 : float8 prev_dist;
1006 : double bin_width;
1007 : double upper_bin_width;
1008 : double sum_frac;
1009 :
1010 : /*
1011 : * Begin by finding the bin containing the upper bound, in the lower bound
1012 : * histogram. Any range with a lower bound > constant upper bound can't
1013 : * match, ie. there are no matches in bins greater than upper_index.
1014 : */
1015 0 : upper->inclusive = !upper->inclusive;
1016 0 : upper->lower = true;
1017 0 : upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1018 : false);
1019 :
1020 : /*
1021 : * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1022 : * upper_index + 1) bin which is greater than upper bound of query range
1023 : * using linear interpolation of subdiff function.
1024 : */
1025 0 : if (upper_index >= 0 && upper_index < hist_nvalues - 1)
1026 0 : upper_bin_width = get_position(typcache, upper,
1027 0 : &hist_lower[upper_index],
1028 0 : &hist_lower[upper_index + 1]);
1029 : else
1030 0 : upper_bin_width = 0.0;
1031 :
1032 : /*
1033 : * In the loop, dist and prev_dist are the distance of the "current" bin's
1034 : * lower and upper bounds from the constant upper bound.
1035 : *
1036 : * bin_width represents the width of the current bin. Normally it is 1.0,
1037 : * meaning a full width bin, but can be less in the corner cases: start
1038 : * and end of the loop. We start with bin_width = upper_bin_width, because
1039 : * we begin at the bin containing the upper bound.
1040 : */
1041 0 : prev_dist = 0.0;
1042 0 : bin_width = upper_bin_width;
1043 :
1044 0 : sum_frac = 0.0;
1045 0 : for (i = upper_index; i >= 0; i--)
1046 : {
1047 : double dist;
1048 : double length_hist_frac;
1049 0 : bool final_bin = false;
1050 :
1051 : /*
1052 : * dist -- distance from upper bound of query range to lower bound of
1053 : * the current bin in the lower bound histogram. Or to the lower bound
1054 : * of the constant range, if this is the final bin, containing the
1055 : * constant lower bound.
1056 : */
1057 0 : if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1058 : {
1059 0 : dist = get_distance(typcache, lower, upper);
1060 :
1061 : /*
1062 : * Subtract from bin_width the portion of this bin that we want to
1063 : * ignore.
1064 : */
1065 0 : bin_width -= get_position(typcache, lower, &hist_lower[i],
1066 0 : &hist_lower[i + 1]);
1067 0 : if (bin_width < 0.0)
1068 0 : bin_width = 0.0;
1069 0 : final_bin = true;
1070 : }
1071 : else
1072 0 : dist = get_distance(typcache, &hist_lower[i], upper);
1073 :
1074 : /*
1075 : * Estimate the fraction of tuples in this bin that are narrow enough
1076 : * to not exceed the distance to the upper bound of the query range.
1077 : */
1078 0 : length_hist_frac = calc_length_hist_frac(length_hist_values,
1079 : length_hist_nvalues,
1080 : prev_dist, dist, true);
1081 :
1082 : /*
1083 : * Add the fraction of tuples in this bin, with a suitable length, to
1084 : * the total.
1085 : */
1086 0 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1087 :
1088 0 : if (final_bin)
1089 0 : break;
1090 :
1091 0 : bin_width = 1.0;
1092 0 : prev_dist = dist;
1093 : }
1094 :
1095 0 : return sum_frac;
1096 : }
1097 :
1098 : /*
1099 : * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1100 : * of ranges that contain the constant lower and upper bounds. This uses
1101 : * the histograms of range lower bounds and range lengths, on the assumption
1102 : * that the range lengths are independent of the lower bounds.
1103 : *
1104 : * Note, this is "var @> const", ie. estimate the fraction of ranges that
1105 : * contain the constant lower and upper bounds.
1106 : */
1107 : static double
1108 0 : calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1109 : RangeBound *lower, RangeBound *upper,
1110 : RangeBound *hist_lower, int hist_nvalues,
1111 : Datum *length_hist_values, int length_hist_nvalues)
1112 : {
1113 : int i,
1114 : lower_index;
1115 : double bin_width,
1116 : lower_bin_width;
1117 : double sum_frac;
1118 : float8 prev_dist;
1119 :
1120 : /* Find the bin containing the lower bound of query range. */
1121 0 : lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1122 : true);
1123 :
1124 : /*
1125 : * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1126 : * lower_index + 1) bin which is greater than lower bound of query range
1127 : * using linear interpolation of subdiff function.
1128 : */
1129 0 : if (lower_index >= 0 && lower_index < hist_nvalues - 1)
1130 0 : lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1131 0 : &hist_lower[lower_index + 1]);
1132 : else
1133 0 : lower_bin_width = 0.0;
1134 :
1135 : /*
1136 : * Loop through all the lower bound bins, smaller than the query lower
1137 : * bound. In the loop, dist and prev_dist are the distance of the
1138 : * "current" bin's lower and upper bounds from the constant upper bound.
1139 : * We begin from query lower bound, and walk backwards, so the first bin's
1140 : * upper bound is the query lower bound, and its distance to the query
1141 : * upper bound is the length of the query range.
1142 : *
1143 : * bin_width represents the width of the current bin. Normally it is 1.0,
1144 : * meaning a full width bin, except for the first bin, which is only
1145 : * counted up to the constant lower bound.
1146 : */
1147 0 : prev_dist = get_distance(typcache, lower, upper);
1148 0 : sum_frac = 0.0;
1149 0 : bin_width = lower_bin_width;
1150 0 : for (i = lower_index; i >= 0; i--)
1151 : {
1152 : float8 dist;
1153 : double length_hist_frac;
1154 :
1155 : /*
1156 : * dist -- distance from upper bound of query range to current value
1157 : * of lower bound histogram or lower bound of query range (if we've
1158 : * reach it).
1159 : */
1160 0 : dist = get_distance(typcache, &hist_lower[i], upper);
1161 :
1162 : /*
1163 : * Get average fraction of length histogram which covers intervals
1164 : * longer than (or equal to) distance to upper bound of query range.
1165 : */
1166 0 : length_hist_frac =
1167 0 : 1.0 - calc_length_hist_frac(length_hist_values,
1168 : length_hist_nvalues,
1169 : prev_dist, dist, false);
1170 :
1171 0 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1172 :
1173 0 : bin_width = 1.0;
1174 0 : prev_dist = dist;
1175 : }
1176 :
1177 0 : return sum_frac;
1178 : }
|