Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * network_selfuncs.c
4 : * Functions for selectivity estimation of inet/cidr operators
5 : *
6 : * This module provides estimators for the subnet inclusion and overlap
7 : * operators. Estimates are based on null fraction, most common values,
8 : * and histogram of inet/cidr columns.
9 : *
10 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
11 : * Portions Copyright (c) 1994, Regents of the University of California
12 : *
13 : *
14 : * IDENTIFICATION
15 : * src/backend/utils/adt/network_selfuncs.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include <math.h>
22 :
23 : #include "access/htup_details.h"
24 : #include "catalog/pg_operator.h"
25 : #include "catalog/pg_statistic.h"
26 : #include "utils/builtins.h"
27 : #include "utils/inet.h"
28 : #include "utils/lsyscache.h"
29 : #include "utils/selfuncs.h"
30 :
31 :
32 : /* Default selectivity for the inet overlap operator */
33 : #define DEFAULT_OVERLAP_SEL 0.01
34 :
35 : /* Default selectivity for the various inclusion operators */
36 : #define DEFAULT_INCLUSION_SEL 0.005
37 :
38 : /* Default selectivity for specified operator */
39 : #define DEFAULT_SEL(operator) \
40 : ((operator) == OID_INET_OVERLAP_OP ? \
41 : DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL)
42 :
43 : /* Maximum number of items to consider in join selectivity calculations */
44 : #define MAX_CONSIDERED_ELEMS 1024
45 :
46 : static Selectivity networkjoinsel_inner(Oid operator,
47 : VariableStatData *vardata1, VariableStatData *vardata2);
48 : static Selectivity networkjoinsel_semi(Oid operator,
49 : VariableStatData *vardata1, VariableStatData *vardata2);
50 : static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues);
51 : static Selectivity inet_hist_value_sel(Datum *values, int nvalues,
52 : Datum constvalue, int opr_codenum);
53 : static Selectivity inet_mcv_join_sel(Datum *mcv1_values,
54 : float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values,
55 : float4 *mcv2_numbers, int mcv2_nvalues, Oid operator);
56 : static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers,
57 : int mcv_nvalues, Datum *hist_values, int hist_nvalues,
58 : int opr_codenum);
59 : static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values,
60 : int hist1_nvalues,
61 : Datum *hist2_values, int hist2_nvalues,
62 : int opr_codenum);
63 : static Selectivity inet_semi_join_sel(Datum lhs_value,
64 : bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
65 : bool hist_exists, Datum *hist_values, int hist_nvalues,
66 : double hist_weight,
67 : FmgrInfo *proc, int opr_codenum);
68 : static int inet_opr_codenum(Oid operator);
69 : static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum);
70 : static int inet_masklen_inclusion_cmp(inet *left, inet *right,
71 : int opr_codenum);
72 : static int inet_hist_match_divider(inet *boundary, inet *query,
73 : int opr_codenum);
74 :
75 : /*
76 : * Selectivity estimation for the subnet inclusion/overlap operators
77 : */
78 : Datum
79 144 : networksel(PG_FUNCTION_ARGS)
80 : {
81 144 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
82 144 : Oid operator = PG_GETARG_OID(1);
83 144 : List *args = (List *) PG_GETARG_POINTER(2);
84 144 : int varRelid = PG_GETARG_INT32(3);
85 : VariableStatData vardata;
86 : Node *other;
87 : bool varonleft;
88 : Selectivity selec,
89 : mcv_selec,
90 : non_mcv_selec;
91 : Datum constvalue;
92 : Form_pg_statistic stats;
93 : AttStatsSlot hslot;
94 : double sumcommon,
95 : nullfrac;
96 : FmgrInfo proc;
97 :
98 : /*
99 : * If expression is not (variable op something) or (something op
100 : * variable), then punt and return a default estimate.
101 : */
102 144 : if (!get_restriction_variable(root, args, varRelid,
103 : &vardata, &other, &varonleft))
104 0 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
105 :
106 : /*
107 : * Can't do anything useful if the something is not a constant, either.
108 : */
109 144 : if (!IsA(other, Const))
110 : {
111 0 : ReleaseVariableStats(vardata);
112 0 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
113 : }
114 :
115 : /* All of the operators handled here are strict. */
116 144 : if (((Const *) other)->constisnull)
117 : {
118 0 : ReleaseVariableStats(vardata);
119 0 : PG_RETURN_FLOAT8(0.0);
120 : }
121 144 : constvalue = ((Const *) other)->constvalue;
122 :
123 : /* Otherwise, we need stats in order to produce a non-default estimate. */
124 144 : if (!HeapTupleIsValid(vardata.statsTuple))
125 : {
126 144 : ReleaseVariableStats(vardata);
127 144 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
128 : }
129 :
130 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
131 0 : nullfrac = stats->stanullfrac;
132 :
133 : /*
134 : * If we have most-common-values info, add up the fractions of the MCV
135 : * entries that satisfy MCV OP CONST. These fractions contribute directly
136 : * to the result selectivity. Also add up the total fraction represented
137 : * by MCV entries.
138 : */
139 0 : fmgr_info(get_opcode(operator), &proc);
140 0 : mcv_selec = mcv_selectivity(&vardata, &proc, constvalue, varonleft,
141 : &sumcommon);
142 :
143 : /*
144 : * If we have a histogram, use it to estimate the proportion of the
145 : * non-MCV population that satisfies the clause. If we don't, apply the
146 : * default selectivity to that population.
147 : */
148 0 : if (get_attstatsslot(&hslot, vardata.statsTuple,
149 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
150 : ATTSTATSSLOT_VALUES))
151 : {
152 0 : int opr_codenum = inet_opr_codenum(operator);
153 :
154 : /* Commute if needed, so we can consider histogram to be on the left */
155 0 : if (!varonleft)
156 0 : opr_codenum = -opr_codenum;
157 0 : non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues,
158 : constvalue, opr_codenum);
159 :
160 0 : free_attstatsslot(&hslot);
161 : }
162 : else
163 0 : non_mcv_selec = DEFAULT_SEL(operator);
164 :
165 : /* Combine selectivities for MCV and non-MCV populations */
166 0 : selec = mcv_selec + (1.0 - nullfrac - sumcommon) * non_mcv_selec;
167 :
168 : /* Result should be in range, but make sure... */
169 0 : CLAMP_PROBABILITY(selec);
170 :
171 0 : ReleaseVariableStats(vardata);
172 :
173 0 : PG_RETURN_FLOAT8(selec);
174 : }
175 :
176 : /*
177 : * Join selectivity estimation for the subnet inclusion/overlap operators
178 : *
179 : * This function has the same structure as eqjoinsel() in selfuncs.c.
180 : *
181 : * Throughout networkjoinsel and its subroutines, we have a performance issue
182 : * in that the amount of work to be done is O(N^2) in the length of the MCV
183 : * and histogram arrays. To keep the runtime from getting out of hand when
184 : * large statistics targets have been set, we arbitrarily limit the number of
185 : * values considered to 1024 (MAX_CONSIDERED_ELEMS). For the MCV arrays, this
186 : * is easy: just consider at most the first N elements. (Since the MCVs are
187 : * sorted by decreasing frequency, this correctly gets us the first N MCVs.)
188 : * For the histogram arrays, we decimate; that is consider only every k'th
189 : * element, where k is chosen so that no more than MAX_CONSIDERED_ELEMS
190 : * elements are considered. This should still give us a good random sample of
191 : * the non-MCV population. Decimation is done on-the-fly in the loops that
192 : * iterate over the histogram arrays.
193 : */
194 : Datum
195 0 : networkjoinsel(PG_FUNCTION_ARGS)
196 : {
197 0 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
198 0 : Oid operator = PG_GETARG_OID(1);
199 0 : List *args = (List *) PG_GETARG_POINTER(2);
200 : #ifdef NOT_USED
201 : JoinType jointype = (JoinType) PG_GETARG_INT16(3);
202 : #endif
203 0 : SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
204 : double selec;
205 : VariableStatData vardata1;
206 : VariableStatData vardata2;
207 : bool join_is_reversed;
208 :
209 0 : get_join_variables(root, args, sjinfo,
210 : &vardata1, &vardata2, &join_is_reversed);
211 :
212 0 : switch (sjinfo->jointype)
213 : {
214 : case JOIN_INNER:
215 : case JOIN_LEFT:
216 : case JOIN_FULL:
217 :
218 : /*
219 : * Selectivity for left/full join is not exactly the same as inner
220 : * join, but we neglect the difference, as eqjoinsel does.
221 : */
222 0 : selec = networkjoinsel_inner(operator, &vardata1, &vardata2);
223 0 : break;
224 : case JOIN_SEMI:
225 : case JOIN_ANTI:
226 : /* Here, it's important that we pass the outer var on the left. */
227 0 : if (!join_is_reversed)
228 0 : selec = networkjoinsel_semi(operator, &vardata1, &vardata2);
229 : else
230 0 : selec = networkjoinsel_semi(get_commutator(operator),
231 : &vardata2, &vardata1);
232 0 : break;
233 : default:
234 : /* other values not expected here */
235 0 : elog(ERROR, "unrecognized join type: %d",
236 : (int) sjinfo->jointype);
237 : selec = 0; /* keep compiler quiet */
238 : break;
239 : }
240 :
241 0 : ReleaseVariableStats(vardata1);
242 0 : ReleaseVariableStats(vardata2);
243 :
244 0 : CLAMP_PROBABILITY(selec);
245 :
246 0 : PG_RETURN_FLOAT8((float8) selec);
247 : }
248 :
249 : /*
250 : * Inner join selectivity estimation for subnet inclusion/overlap operators
251 : *
252 : * Calculates MCV vs MCV, MCV vs histogram and histogram vs histogram
253 : * selectivity for join using the subnet inclusion operators. Unlike the
254 : * join selectivity function for the equality operator, eqjoinsel_inner(),
255 : * one to one matching of the values is not enough. Network inclusion
256 : * operators are likely to match many to many, so we must check all pairs.
257 : * (Note: it might be possible to exploit understanding of the histogram's
258 : * btree ordering to reduce the work needed, but we don't currently try.)
259 : * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner().
260 : */
261 : static Selectivity
262 0 : networkjoinsel_inner(Oid operator,
263 : VariableStatData *vardata1, VariableStatData *vardata2)
264 : {
265 : Form_pg_statistic stats;
266 0 : double nullfrac1 = 0.0,
267 0 : nullfrac2 = 0.0;
268 0 : Selectivity selec = 0.0,
269 0 : sumcommon1 = 0.0,
270 0 : sumcommon2 = 0.0;
271 0 : bool mcv1_exists = false,
272 0 : mcv2_exists = false,
273 0 : hist1_exists = false,
274 0 : hist2_exists = false;
275 : int opr_codenum;
276 0 : int mcv1_length = 0,
277 0 : mcv2_length = 0;
278 : AttStatsSlot mcv1_slot;
279 : AttStatsSlot mcv2_slot;
280 : AttStatsSlot hist1_slot;
281 : AttStatsSlot hist2_slot;
282 :
283 0 : if (HeapTupleIsValid(vardata1->statsTuple))
284 : {
285 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
286 0 : nullfrac1 = stats->stanullfrac;
287 :
288 0 : mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
289 : STATISTIC_KIND_MCV, InvalidOid,
290 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
291 0 : hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
292 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
293 : ATTSTATSSLOT_VALUES);
294 : /* Arbitrarily limit number of MCVs considered */
295 0 : mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
296 0 : if (mcv1_exists)
297 0 : sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
298 : }
299 : else
300 : {
301 0 : memset(&mcv1_slot, 0, sizeof(mcv1_slot));
302 0 : memset(&hist1_slot, 0, sizeof(hist1_slot));
303 : }
304 :
305 0 : if (HeapTupleIsValid(vardata2->statsTuple))
306 : {
307 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
308 0 : nullfrac2 = stats->stanullfrac;
309 :
310 0 : mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
311 : STATISTIC_KIND_MCV, InvalidOid,
312 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
313 0 : hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
314 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
315 : ATTSTATSSLOT_VALUES);
316 : /* Arbitrarily limit number of MCVs considered */
317 0 : mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
318 0 : if (mcv2_exists)
319 0 : sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
320 : }
321 : else
322 : {
323 0 : memset(&mcv2_slot, 0, sizeof(mcv2_slot));
324 0 : memset(&hist2_slot, 0, sizeof(hist2_slot));
325 : }
326 :
327 0 : opr_codenum = inet_opr_codenum(operator);
328 :
329 : /*
330 : * Calculate selectivity for MCV vs MCV matches.
331 : */
332 0 : if (mcv1_exists && mcv2_exists)
333 0 : selec += inet_mcv_join_sel(mcv1_slot.values, mcv1_slot.numbers,
334 : mcv1_length,
335 : mcv2_slot.values, mcv2_slot.numbers,
336 : mcv2_length,
337 : operator);
338 :
339 : /*
340 : * Add in selectivities for MCV vs histogram matches, scaling according to
341 : * the fractions of the populations represented by the histograms. Note
342 : * that the second case needs to commute the operator.
343 : */
344 0 : if (mcv1_exists && hist2_exists)
345 0 : selec += (1.0 - nullfrac2 - sumcommon2) *
346 0 : inet_mcv_hist_sel(mcv1_slot.values, mcv1_slot.numbers, mcv1_length,
347 : hist2_slot.values, hist2_slot.nvalues,
348 0 : opr_codenum);
349 0 : if (mcv2_exists && hist1_exists)
350 0 : selec += (1.0 - nullfrac1 - sumcommon1) *
351 0 : inet_mcv_hist_sel(mcv2_slot.values, mcv2_slot.numbers, mcv2_length,
352 : hist1_slot.values, hist1_slot.nvalues,
353 0 : -opr_codenum);
354 :
355 : /*
356 : * Add in selectivity for histogram vs histogram matches, again scaling
357 : * appropriately.
358 : */
359 0 : if (hist1_exists && hist2_exists)
360 0 : selec += (1.0 - nullfrac1 - sumcommon1) *
361 : (1.0 - nullfrac2 - sumcommon2) *
362 0 : inet_hist_inclusion_join_sel(hist1_slot.values, hist1_slot.nvalues,
363 : hist2_slot.values, hist2_slot.nvalues,
364 0 : opr_codenum);
365 :
366 : /*
367 : * If useful statistics are not available then use the default estimate.
368 : * We can apply null fractions if known, though.
369 : */
370 0 : if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
371 0 : selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
372 :
373 : /* Release stats. */
374 0 : free_attstatsslot(&mcv1_slot);
375 0 : free_attstatsslot(&mcv2_slot);
376 0 : free_attstatsslot(&hist1_slot);
377 0 : free_attstatsslot(&hist2_slot);
378 :
379 0 : return selec;
380 : }
381 :
382 : /*
383 : * Semi join selectivity estimation for subnet inclusion/overlap operators
384 : *
385 : * Calculates MCV vs MCV, MCV vs histogram, histogram vs MCV, and histogram vs
386 : * histogram selectivity for semi/anti join cases.
387 : */
388 : static Selectivity
389 0 : networkjoinsel_semi(Oid operator,
390 : VariableStatData *vardata1, VariableStatData *vardata2)
391 : {
392 : Form_pg_statistic stats;
393 0 : Selectivity selec = 0.0,
394 0 : sumcommon1 = 0.0,
395 0 : sumcommon2 = 0.0;
396 0 : double nullfrac1 = 0.0,
397 0 : nullfrac2 = 0.0,
398 0 : hist2_weight = 0.0;
399 0 : bool mcv1_exists = false,
400 0 : mcv2_exists = false,
401 0 : hist1_exists = false,
402 0 : hist2_exists = false;
403 : int opr_codenum;
404 : FmgrInfo proc;
405 : int i,
406 0 : mcv1_length = 0,
407 0 : mcv2_length = 0;
408 : AttStatsSlot mcv1_slot;
409 : AttStatsSlot mcv2_slot;
410 : AttStatsSlot hist1_slot;
411 : AttStatsSlot hist2_slot;
412 :
413 0 : if (HeapTupleIsValid(vardata1->statsTuple))
414 : {
415 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
416 0 : nullfrac1 = stats->stanullfrac;
417 :
418 0 : mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
419 : STATISTIC_KIND_MCV, InvalidOid,
420 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
421 0 : hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
422 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
423 : ATTSTATSSLOT_VALUES);
424 : /* Arbitrarily limit number of MCVs considered */
425 0 : mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
426 0 : if (mcv1_exists)
427 0 : sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
428 : }
429 : else
430 : {
431 0 : memset(&mcv1_slot, 0, sizeof(mcv1_slot));
432 0 : memset(&hist1_slot, 0, sizeof(hist1_slot));
433 : }
434 :
435 0 : if (HeapTupleIsValid(vardata2->statsTuple))
436 : {
437 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
438 0 : nullfrac2 = stats->stanullfrac;
439 :
440 0 : mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
441 : STATISTIC_KIND_MCV, InvalidOid,
442 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
443 0 : hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
444 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
445 : ATTSTATSSLOT_VALUES);
446 : /* Arbitrarily limit number of MCVs considered */
447 0 : mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
448 0 : if (mcv2_exists)
449 0 : sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
450 : }
451 : else
452 : {
453 0 : memset(&mcv2_slot, 0, sizeof(mcv2_slot));
454 0 : memset(&hist2_slot, 0, sizeof(hist2_slot));
455 : }
456 :
457 0 : opr_codenum = inet_opr_codenum(operator);
458 0 : fmgr_info(get_opcode(operator), &proc);
459 :
460 : /* Estimate number of input rows represented by RHS histogram. */
461 0 : if (hist2_exists && vardata2->rel)
462 0 : hist2_weight = (1.0 - nullfrac2 - sumcommon2) * vardata2->rel->rows;
463 :
464 : /*
465 : * Consider each element of the LHS MCV list, matching it to whatever RHS
466 : * stats we have. Scale according to the known frequency of the MCV.
467 : */
468 0 : if (mcv1_exists && (mcv2_exists || hist2_exists))
469 : {
470 0 : for (i = 0; i < mcv1_length; i++)
471 : {
472 0 : selec += mcv1_slot.numbers[i] *
473 0 : inet_semi_join_sel(mcv1_slot.values[i],
474 : mcv2_exists, mcv2_slot.values, mcv2_length,
475 : hist2_exists,
476 : hist2_slot.values, hist2_slot.nvalues,
477 : hist2_weight,
478 0 : &proc, opr_codenum);
479 : }
480 : }
481 :
482 : /*
483 : * Consider each element of the LHS histogram, except for the first and
484 : * last elements, which we exclude on the grounds that they're outliers
485 : * and thus not very representative. Scale on the assumption that each
486 : * such histogram element represents an equal share of the LHS histogram
487 : * population (which is a bit bogus, because the members of its bucket may
488 : * not all act the same with respect to the join clause, but it's hard to
489 : * do better).
490 : *
491 : * If there are too many histogram elements, decimate to limit runtime.
492 : */
493 0 : if (hist1_exists && hist1_slot.nvalues > 2 && (mcv2_exists || hist2_exists))
494 : {
495 0 : double hist_selec_sum = 0.0;
496 : int k,
497 : n;
498 :
499 0 : k = (hist1_slot.nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
500 :
501 0 : n = 0;
502 0 : for (i = 1; i < hist1_slot.nvalues - 1; i += k)
503 : {
504 0 : hist_selec_sum +=
505 0 : inet_semi_join_sel(hist1_slot.values[i],
506 : mcv2_exists, mcv2_slot.values, mcv2_length,
507 : hist2_exists,
508 : hist2_slot.values, hist2_slot.nvalues,
509 : hist2_weight,
510 : &proc, opr_codenum);
511 0 : n++;
512 : }
513 :
514 0 : selec += (1.0 - nullfrac1 - sumcommon1) * hist_selec_sum / n;
515 : }
516 :
517 : /*
518 : * If useful statistics are not available then use the default estimate.
519 : * We can apply null fractions if known, though.
520 : */
521 0 : if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
522 0 : selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
523 :
524 : /* Release stats. */
525 0 : free_attstatsslot(&mcv1_slot);
526 0 : free_attstatsslot(&mcv2_slot);
527 0 : free_attstatsslot(&hist1_slot);
528 0 : free_attstatsslot(&hist2_slot);
529 :
530 0 : return selec;
531 : }
532 :
533 : /*
534 : * Compute the fraction of a relation's population that is represented
535 : * by the MCV list.
536 : */
537 : static Selectivity
538 0 : mcv_population(float4 *mcv_numbers, int mcv_nvalues)
539 : {
540 0 : Selectivity sumcommon = 0.0;
541 : int i;
542 :
543 0 : for (i = 0; i < mcv_nvalues; i++)
544 : {
545 0 : sumcommon += mcv_numbers[i];
546 : }
547 :
548 0 : return sumcommon;
549 : }
550 :
551 : /*
552 : * Inet histogram vs single value selectivity estimation
553 : *
554 : * Estimate the fraction of the histogram population that satisfies
555 : * "value OPR CONST". (The result needs to be scaled to reflect the
556 : * proportion of the total population represented by the histogram.)
557 : *
558 : * The histogram is originally for the inet btree comparison operators.
559 : * Only the common bits of the network part and the length of the network part
560 : * (masklen) are interesting for the subnet inclusion operators. Fortunately,
561 : * btree comparison treats the network part as the major sort key. Even so,
562 : * the length of the network part would not really be significant in the
563 : * histogram. This would lead to big mistakes for data sets with uneven
564 : * masklen distribution. To reduce this problem, comparisons with the left
565 : * and the right sides of the buckets are used together.
566 : *
567 : * Histogram bucket matches are calculated in two forms. If the constant
568 : * matches both bucket endpoints the bucket is considered as fully matched.
569 : * The second form is to match the bucket partially; we recognize this when
570 : * the constant matches just one endpoint, or the two endpoints fall on
571 : * opposite sides of the constant. (Note that when the constant matches an
572 : * interior histogram element, it gets credit for partial matches to the
573 : * buckets on both sides, while a match to a histogram endpoint gets credit
574 : * for only one partial match. This is desirable.)
575 : *
576 : * The divider in the partial bucket match is imagined as the distance
577 : * between the decisive bits and the common bits of the addresses. It will
578 : * be used as a power of two as it is the natural scale for the IP network
579 : * inclusion. This partial bucket match divider calculation is an empirical
580 : * formula and subject to change with more experiment.
581 : *
582 : * For a partial match, we try to calculate dividers for both of the
583 : * boundaries. If the address family of a boundary value does not match the
584 : * constant or comparison of the length of the network parts is not correct
585 : * for the operator, the divider for that boundary will not be taken into
586 : * account. If both of the dividers are valid, the greater one will be used
587 : * to minimize the mistake in buckets that have disparate masklens. This
588 : * calculation is unfair when dividers can be calculated for both of the
589 : * boundaries but they are far from each other; but it is not a common
590 : * situation as the boundaries are expected to share most of their significant
591 : * bits of their masklens. The mistake would be greater, if we would use the
592 : * minimum instead of the maximum, and we don't know a sensible way to combine
593 : * them.
594 : *
595 : * For partial match in buckets that have different address families on the
596 : * left and right sides, only the boundary with the same address family is
597 : * taken into consideration. This can cause more mistakes for these buckets
598 : * if the masklens of their boundaries are also disparate. But this can only
599 : * happen in one bucket, since only two address families exist. It seems a
600 : * better option than not considering these buckets at all.
601 : */
602 : static Selectivity
603 0 : inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue,
604 : int opr_codenum)
605 : {
606 0 : Selectivity match = 0.0;
607 : inet *query,
608 : *left,
609 : *right;
610 : int i,
611 : k,
612 : n;
613 : int left_order,
614 : right_order,
615 : left_divider,
616 : right_divider;
617 :
618 : /* guard against zero-divide below */
619 0 : if (nvalues <= 1)
620 0 : return 0.0;
621 :
622 : /* if there are too many histogram elements, decimate to limit runtime */
623 0 : k = (nvalues - 2) / MAX_CONSIDERED_ELEMS + 1;
624 :
625 0 : query = DatumGetInetPP(constvalue);
626 :
627 : /* "left" is the left boundary value of the current bucket ... */
628 0 : left = DatumGetInetPP(values[0]);
629 0 : left_order = inet_inclusion_cmp(left, query, opr_codenum);
630 :
631 0 : n = 0;
632 0 : for (i = k; i < nvalues; i += k)
633 : {
634 : /* ... and "right" is the right boundary value */
635 0 : right = DatumGetInetPP(values[i]);
636 0 : right_order = inet_inclusion_cmp(right, query, opr_codenum);
637 :
638 0 : if (left_order == 0 && right_order == 0)
639 : {
640 : /* The whole bucket matches, since both endpoints do. */
641 0 : match += 1.0;
642 : }
643 0 : else if ((left_order <= 0 && right_order >= 0) ||
644 0 : (left_order >= 0 && right_order <= 0))
645 : {
646 : /* Partial bucket match. */
647 0 : left_divider = inet_hist_match_divider(left, query, opr_codenum);
648 0 : right_divider = inet_hist_match_divider(right, query, opr_codenum);
649 :
650 0 : if (left_divider >= 0 || right_divider >= 0)
651 0 : match += 1.0 / pow(2.0, Max(left_divider, right_divider));
652 : }
653 :
654 : /* Shift the variables. */
655 0 : left = right;
656 0 : left_order = right_order;
657 :
658 : /* Count the number of buckets considered. */
659 0 : n++;
660 : }
661 :
662 0 : return match / n;
663 : }
664 :
665 : /*
666 : * Inet MCV vs MCV join selectivity estimation
667 : *
668 : * We simply add up the fractions of the populations that satisfy the clause.
669 : * The result is exact and does not need to be scaled further.
670 : */
671 : static Selectivity
672 0 : inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues,
673 : Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues,
674 : Oid operator)
675 : {
676 0 : Selectivity selec = 0.0;
677 : FmgrInfo proc;
678 : int i,
679 : j;
680 :
681 0 : fmgr_info(get_opcode(operator), &proc);
682 :
683 0 : for (i = 0; i < mcv1_nvalues; i++)
684 : {
685 0 : for (j = 0; j < mcv2_nvalues; j++)
686 0 : if (DatumGetBool(FunctionCall2(&proc,
687 : mcv1_values[i],
688 : mcv2_values[j])))
689 0 : selec += mcv1_numbers[i] * mcv2_numbers[j];
690 : }
691 0 : return selec;
692 : }
693 :
694 : /*
695 : * Inet MCV vs histogram join selectivity estimation
696 : *
697 : * For each MCV on the lefthand side, estimate the fraction of the righthand's
698 : * histogram population that satisfies the join clause, and add those up,
699 : * scaling by the MCV's frequency. The result still needs to be scaled
700 : * according to the fraction of the righthand's population represented by
701 : * the histogram.
702 : */
703 : static Selectivity
704 0 : inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues,
705 : Datum *hist_values, int hist_nvalues,
706 : int opr_codenum)
707 : {
708 0 : Selectivity selec = 0.0;
709 : int i;
710 :
711 : /*
712 : * We'll call inet_hist_value_selec with the histogram on the left, so we
713 : * must commute the operator.
714 : */
715 0 : opr_codenum = -opr_codenum;
716 :
717 0 : for (i = 0; i < mcv_nvalues; i++)
718 : {
719 0 : selec += mcv_numbers[i] *
720 0 : inet_hist_value_sel(hist_values, hist_nvalues, mcv_values[i],
721 0 : opr_codenum);
722 : }
723 0 : return selec;
724 : }
725 :
726 : /*
727 : * Inet histogram vs histogram join selectivity estimation
728 : *
729 : * Here, we take all values listed in the second histogram (except for the
730 : * first and last elements, which are excluded on the grounds of possibly
731 : * not being very representative) and treat them as a uniform sample of
732 : * the non-MCV population for that relation. For each one, we apply
733 : * inet_hist_value_selec to see what fraction of the first histogram
734 : * it matches.
735 : *
736 : * We could alternatively do this the other way around using the operator's
737 : * commutator. XXX would it be worthwhile to do it both ways and take the
738 : * average? That would at least avoid non-commutative estimation results.
739 : */
740 : static Selectivity
741 0 : inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues,
742 : Datum *hist2_values, int hist2_nvalues,
743 : int opr_codenum)
744 : {
745 0 : double match = 0.0;
746 : int i,
747 : k,
748 : n;
749 :
750 0 : if (hist2_nvalues <= 2)
751 0 : return 0.0; /* no interior histogram elements */
752 :
753 : /* if there are too many histogram elements, decimate to limit runtime */
754 0 : k = (hist2_nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
755 :
756 0 : n = 0;
757 0 : for (i = 1; i < hist2_nvalues - 1; i += k)
758 : {
759 0 : match += inet_hist_value_sel(hist1_values, hist1_nvalues,
760 0 : hist2_values[i], opr_codenum);
761 0 : n++;
762 : }
763 :
764 0 : return match / n;
765 : }
766 :
767 : /*
768 : * Inet semi join selectivity estimation for one value
769 : *
770 : * The function calculates the probability that there is at least one row
771 : * in the RHS table that satisfies the "lhs_value op column" condition.
772 : * It is used in semi join estimation to check a sample from the left hand
773 : * side table.
774 : *
775 : * The MCV and histogram from the right hand side table should be provided as
776 : * arguments with the lhs_value from the left hand side table for the join.
777 : * hist_weight is the total number of rows represented by the histogram.
778 : * For example, if the table has 1000 rows, and 10% of the rows are in the MCV
779 : * list, and another 10% are NULLs, hist_weight would be 800.
780 : *
781 : * First, the lhs_value will be matched to the most common values. If it
782 : * matches any of them, 1.0 will be returned, because then there is surely
783 : * a match.
784 : *
785 : * Otherwise, the histogram will be used to estimate the number of rows in
786 : * the second table that match the condition. If the estimate is greater
787 : * than 1.0, 1.0 will be returned, because it means there is a greater chance
788 : * that the lhs_value will match more than one row in the table. If it is
789 : * between 0.0 and 1.0, it will be returned as the probability.
790 : */
791 : static Selectivity
792 0 : inet_semi_join_sel(Datum lhs_value,
793 : bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
794 : bool hist_exists, Datum *hist_values, int hist_nvalues,
795 : double hist_weight,
796 : FmgrInfo *proc, int opr_codenum)
797 : {
798 0 : if (mcv_exists)
799 : {
800 : int i;
801 :
802 0 : for (i = 0; i < mcv_nvalues; i++)
803 : {
804 0 : if (DatumGetBool(FunctionCall2(proc,
805 : lhs_value,
806 : mcv_values[i])))
807 0 : return 1.0;
808 : }
809 : }
810 :
811 0 : if (hist_exists && hist_weight > 0)
812 : {
813 : Selectivity hist_selec;
814 :
815 : /* Commute operator, since we're passing lhs_value on the right */
816 0 : hist_selec = inet_hist_value_sel(hist_values, hist_nvalues,
817 : lhs_value, -opr_codenum);
818 :
819 0 : if (hist_selec > 0)
820 0 : return Min(1.0, hist_weight * hist_selec);
821 : }
822 :
823 0 : return 0.0;
824 : }
825 :
826 : /*
827 : * Assign useful code numbers for the subnet inclusion/overlap operators
828 : *
829 : * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend
830 : * on the exact codes assigned here; but many other places in this file
831 : * know that they can negate a code to obtain the code for the commutator
832 : * operator.
833 : */
834 : static int
835 0 : inet_opr_codenum(Oid operator)
836 : {
837 0 : switch (operator)
838 : {
839 : case OID_INET_SUP_OP:
840 0 : return -2;
841 : case OID_INET_SUPEQ_OP:
842 0 : return -1;
843 : case OID_INET_OVERLAP_OP:
844 0 : return 0;
845 : case OID_INET_SUBEQ_OP:
846 0 : return 1;
847 : case OID_INET_SUB_OP:
848 0 : return 2;
849 : default:
850 0 : elog(ERROR, "unrecognized operator %u for inet selectivity",
851 : operator);
852 : }
853 : return 0; /* unreached, but keep compiler quiet */
854 : }
855 :
856 : /*
857 : * Comparison function for the subnet inclusion/overlap operators
858 : *
859 : * If the comparison is okay for the specified inclusion operator, the return
860 : * value will be 0. Otherwise the return value will be less than or greater
861 : * than 0 as appropriate for the operator.
862 : *
863 : * Comparison is compatible with the basic comparison function for the inet
864 : * type. See network_cmp_internal() in network.c for the original. Basic
865 : * comparison operators are implemented with the network_cmp_internal()
866 : * function. It is possible to implement the subnet inclusion operators with
867 : * this function.
868 : *
869 : * Comparison is first on the common bits of the network part, then on the
870 : * length of the network part (masklen) as in the network_cmp_internal()
871 : * function. Only the first part is in this function. The second part is
872 : * separated to another function for reusability. The difference between the
873 : * second part and the original network_cmp_internal() is that the inclusion
874 : * operator is considered while comparing the lengths of the network parts.
875 : * See the inet_masklen_inclusion_cmp() function below.
876 : */
877 : static int
878 0 : inet_inclusion_cmp(inet *left, inet *right, int opr_codenum)
879 : {
880 0 : if (ip_family(left) == ip_family(right))
881 : {
882 : int order;
883 :
884 0 : order = bitncmp(ip_addr(left), ip_addr(right),
885 0 : Min(ip_bits(left), ip_bits(right)));
886 0 : if (order != 0)
887 0 : return order;
888 :
889 0 : return inet_masklen_inclusion_cmp(left, right, opr_codenum);
890 : }
891 :
892 0 : return ip_family(left) - ip_family(right);
893 : }
894 :
895 : /*
896 : * Masklen comparison function for the subnet inclusion/overlap operators
897 : *
898 : * Compares the lengths of the network parts of the inputs. If the comparison
899 : * is okay for the specified inclusion operator, the return value will be 0.
900 : * Otherwise the return value will be less than or greater than 0 as
901 : * appropriate for the operator.
902 : */
903 : static int
904 0 : inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum)
905 : {
906 : int order;
907 :
908 0 : order = (int) ip_bits(left) - (int) ip_bits(right);
909 :
910 : /*
911 : * Return 0 if the operator would accept this combination of masklens.
912 : * Note that opr_codenum zero (overlaps) will accept all cases.
913 : */
914 0 : if ((order > 0 && opr_codenum >= 0) ||
915 0 : (order == 0 && opr_codenum >= -1 && opr_codenum <= 1) ||
916 0 : (order < 0 && opr_codenum <= 0))
917 0 : return 0;
918 :
919 : /*
920 : * Otherwise, return a negative value for sup/supeq (notionally, the RHS
921 : * needs to have a larger masklen than it has, which would make it sort
922 : * later), or a positive value for sub/subeq (vice versa).
923 : */
924 0 : return opr_codenum;
925 : }
926 :
927 : /*
928 : * Inet histogram partial match divider calculation
929 : *
930 : * First the families and the lengths of the network parts are compared using
931 : * the subnet inclusion operator. If those are acceptable for the operator,
932 : * the divider will be calculated using the masklens and the common bits of
933 : * the addresses. -1 will be returned if it cannot be calculated.
934 : *
935 : * See commentary for inet_hist_value_sel() for some rationale for this.
936 : */
937 : static int
938 0 : inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum)
939 : {
940 0 : if (ip_family(boundary) == ip_family(query) &&
941 0 : inet_masklen_inclusion_cmp(boundary, query, opr_codenum) == 0)
942 : {
943 : int min_bits,
944 : decisive_bits;
945 :
946 0 : min_bits = Min(ip_bits(boundary), ip_bits(query));
947 :
948 : /*
949 : * Set decisive_bits to the masklen of the one that should contain the
950 : * other according to the operator.
951 : */
952 0 : if (opr_codenum < 0)
953 0 : decisive_bits = ip_bits(boundary);
954 0 : else if (opr_codenum > 0)
955 0 : decisive_bits = ip_bits(query);
956 : else
957 0 : decisive_bits = min_bits;
958 :
959 : /*
960 : * Now return the number of non-common decisive bits. (This will be
961 : * zero if the boundary and query in fact match, else positive.)
962 : */
963 0 : if (min_bits > 0)
964 0 : return decisive_bits - bitncommon(ip_addr(boundary),
965 0 : ip_addr(query),
966 : min_bits);
967 0 : return decisive_bits;
968 : }
969 :
970 0 : return -1;
971 : }
|