Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ts_selfuncs.c
4 : * Selectivity estimation functions for text search operators.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/ts_selfuncs.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/htup_details.h"
17 : #include "catalog/pg_statistic.h"
18 : #include "catalog/pg_type.h"
19 : #include "miscadmin.h"
20 : #include "nodes/nodes.h"
21 : #include "tsearch/ts_type.h"
22 : #include "utils/builtins.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/selfuncs.h"
25 : #include "utils/syscache.h"
26 :
27 :
28 : /*
29 : * The default text search selectivity is chosen to be small enough to
30 : * encourage indexscans for typical table densities. See selfuncs.h and
31 : * DEFAULT_EQ_SEL for details.
32 : */
33 : #define DEFAULT_TS_MATCH_SEL 0.005
34 :
35 : /* lookup table type for binary searching through MCELEMs */
36 : typedef struct
37 : {
38 : text *element;
39 : float4 frequency;
40 : } TextFreq;
41 :
42 : /* type of keys for bsearch'ing through an array of TextFreqs */
43 : typedef struct
44 : {
45 : char *lexeme;
46 : int length;
47 : } LexemeKey;
48 :
49 : static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
50 : static Selectivity mcelem_tsquery_selec(TSQuery query,
51 : Datum *mcelem, int nmcelem,
52 : float4 *numbers, int nnumbers);
53 : static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
54 : TextFreq *lookup, int length, float4 minfreq);
55 : static int compare_lexeme_textfreq(const void *e1, const void *e2);
56 :
57 : #define tsquery_opr_selec_no_stats(query) \
58 : tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
59 :
60 :
61 : /*
62 : * tsmatchsel -- Selectivity of "@@"
63 : *
64 : * restriction selectivity function for tsvector @@ tsquery and
65 : * tsquery @@ tsvector
66 : */
67 : Datum
68 57 : tsmatchsel(PG_FUNCTION_ARGS)
69 : {
70 57 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
71 :
72 : #ifdef NOT_USED
73 : Oid operator = PG_GETARG_OID(1);
74 : #endif
75 57 : List *args = (List *) PG_GETARG_POINTER(2);
76 57 : int varRelid = PG_GETARG_INT32(3);
77 : VariableStatData vardata;
78 : Node *other;
79 : bool varonleft;
80 : Selectivity selec;
81 :
82 : /*
83 : * If expression is not variable = something or something = variable, then
84 : * punt and return a default estimate.
85 : */
86 57 : if (!get_restriction_variable(root, args, varRelid,
87 : &vardata, &other, &varonleft))
88 0 : PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
89 :
90 : /*
91 : * Can't do anything useful if the something is not a constant, either.
92 : */
93 57 : if (!IsA(other, Const))
94 : {
95 0 : ReleaseVariableStats(vardata);
96 0 : PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
97 : }
98 :
99 : /*
100 : * The "@@" operator is strict, so we can cope with NULL right away
101 : */
102 57 : if (((Const *) other)->constisnull)
103 : {
104 0 : ReleaseVariableStats(vardata);
105 0 : PG_RETURN_FLOAT8(0.0);
106 : }
107 :
108 : /*
109 : * OK, there's a Var and a Const we're dealing with here. We need the
110 : * Const to be a TSQuery, else we can't do anything useful. We have to
111 : * check this because the Var might be the TSQuery not the TSVector.
112 : */
113 57 : if (((Const *) other)->consttype == TSQUERYOID)
114 : {
115 : /* tsvector @@ tsquery or the other way around */
116 57 : Assert(vardata.vartype == TSVECTOROID);
117 :
118 57 : selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
119 : }
120 : else
121 : {
122 : /* If we can't see the query structure, must punt */
123 0 : selec = DEFAULT_TS_MATCH_SEL;
124 : }
125 :
126 57 : ReleaseVariableStats(vardata);
127 :
128 57 : CLAMP_PROBABILITY(selec);
129 :
130 57 : PG_RETURN_FLOAT8((float8) selec);
131 : }
132 :
133 :
134 : /*
135 : * tsmatchjoinsel -- join selectivity of "@@"
136 : *
137 : * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
138 : */
139 : Datum
140 0 : tsmatchjoinsel(PG_FUNCTION_ARGS)
141 : {
142 : /* for the moment we just punt */
143 0 : PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
144 : }
145 :
146 :
147 : /*
148 : * @@ selectivity for tsvector var vs tsquery constant
149 : */
150 : static Selectivity
151 57 : tsquerysel(VariableStatData *vardata, Datum constval)
152 : {
153 : Selectivity selec;
154 : TSQuery query;
155 :
156 : /* The caller made sure the const is a TSQuery, so get it now */
157 57 : query = DatumGetTSQuery(constval);
158 :
159 : /* Empty query matches nothing */
160 57 : if (query->size == 0)
161 0 : return (Selectivity) 0.0;
162 :
163 57 : if (HeapTupleIsValid(vardata->statsTuple))
164 : {
165 : Form_pg_statistic stats;
166 : AttStatsSlot sslot;
167 :
168 51 : stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
169 :
170 : /* MCELEM will be an array of TEXT elements for a tsvector column */
171 51 : if (get_attstatsslot(&sslot, vardata->statsTuple,
172 : STATISTIC_KIND_MCELEM, InvalidOid,
173 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
174 : {
175 : /*
176 : * There is a most-common-elements slot for the tsvector Var, so
177 : * use that.
178 : */
179 51 : selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
180 : sslot.numbers, sslot.nnumbers);
181 51 : free_attstatsslot(&sslot);
182 : }
183 : else
184 : {
185 : /* No most-common-elements info, so do without */
186 0 : selec = tsquery_opr_selec_no_stats(query);
187 : }
188 :
189 : /*
190 : * MCE stats count only non-null rows, so adjust for null rows.
191 : */
192 51 : selec *= (1.0 - stats->stanullfrac);
193 : }
194 : else
195 : {
196 : /* No stats at all, so do without */
197 6 : selec = tsquery_opr_selec_no_stats(query);
198 : /* we assume no nulls here, so no stanullfrac correction */
199 : }
200 :
201 57 : return selec;
202 : }
203 :
204 : /*
205 : * Extract data from the pg_statistic arrays into useful format.
206 : */
207 : static Selectivity
208 51 : mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
209 : float4 *numbers, int nnumbers)
210 : {
211 : float4 minfreq;
212 : TextFreq *lookup;
213 : Selectivity selec;
214 : int i;
215 :
216 : /*
217 : * There should be two more Numbers than Values, because the last two
218 : * cells are taken for minimal and maximal frequency. Punt if not.
219 : *
220 : * (Note: the MCELEM statistics slot definition allows for a third extra
221 : * number containing the frequency of nulls, but we're not expecting that
222 : * to appear for a tsvector column.)
223 : */
224 51 : if (nnumbers != nmcelem + 2)
225 0 : return tsquery_opr_selec_no_stats(query);
226 :
227 : /*
228 : * Transpose the data into a single array so we can use bsearch().
229 : */
230 51 : lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
231 51051 : for (i = 0; i < nmcelem; i++)
232 : {
233 : /*
234 : * The text Datums came from an array, so it cannot be compressed or
235 : * stored out-of-line -- it's safe to use VARSIZE_ANY*.
236 : */
237 51000 : Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
238 51000 : lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
239 51000 : lookup[i].frequency = numbers[i];
240 : }
241 :
242 : /*
243 : * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
244 : * the one before the last cell of the Numbers array. See ts_typanalyze.c
245 : */
246 51 : minfreq = numbers[nnumbers - 2];
247 :
248 51 : selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
249 : nmcelem, minfreq);
250 :
251 51 : pfree(lookup);
252 :
253 51 : return selec;
254 : }
255 :
256 : /*
257 : * Traverse the tsquery in preorder, calculating selectivity as:
258 : *
259 : * selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
260 : *
261 : * selec(left_oper) + selec(right_oper) -
262 : * selec(left_oper) * selec(right_oper) in OR nodes,
263 : *
264 : * 1 - select(oper) in NOT nodes
265 : *
266 : * histogram-based estimation in prefix VAL nodes
267 : *
268 : * freq[val] in exact VAL nodes, if the value is in MCELEM
269 : * min(freq[MCELEM]) / 2 in VAL nodes, if it is not
270 : *
271 : * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
272 : * binary search for determining freq[MCELEM].
273 : *
274 : * If we don't have stats for the tsvector, we still use this logic,
275 : * except we use default estimates for VAL nodes. This case is signaled
276 : * by lookup == NULL.
277 : */
278 : static Selectivity
279 165 : tsquery_opr_selec(QueryItem *item, char *operand,
280 : TextFreq *lookup, int length, float4 minfreq)
281 : {
282 : Selectivity selec;
283 :
284 : /* since this function recurses, it could be driven to stack overflow */
285 165 : check_stack_depth();
286 :
287 165 : if (item->type == QI_VAL)
288 : {
289 109 : QueryOperand *oper = (QueryOperand *) item;
290 : LexemeKey key;
291 :
292 : /*
293 : * Prepare the key for bsearch().
294 : */
295 109 : key.lexeme = operand + oper->distance;
296 109 : key.length = oper->length;
297 :
298 109 : if (oper->prefix)
299 : {
300 : /* Prefix match, ie the query item is lexeme:* */
301 : Selectivity matched,
302 : allmces;
303 : int i,
304 : n_matched;
305 :
306 : /*
307 : * Our strategy is to scan through the MCELEM list and combine the
308 : * frequencies of the ones that match the prefix. We then
309 : * extrapolate the fraction of matching MCELEMs to the remaining
310 : * rows, assuming that the MCELEMs are representative of the whole
311 : * lexeme population in this respect. (Compare
312 : * histogram_selectivity().) Note that these are most common
313 : * elements not most common values, so they're not mutually
314 : * exclusive. We treat occurrences as independent events.
315 : *
316 : * This is only a good plan if we have a pretty fair number of
317 : * MCELEMs available; we set the threshold at 100. If no stats or
318 : * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
319 : */
320 13 : if (lookup == NULL || length < 100)
321 12 : return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
322 :
323 8 : matched = allmces = 0;
324 8 : n_matched = 0;
325 8008 : for (i = 0; i < length; i++)
326 : {
327 8000 : TextFreq *t = lookup + i;
328 8000 : int tlen = VARSIZE_ANY_EXHDR(t->element);
329 :
330 16000 : if (tlen >= key.length &&
331 8000 : strncmp(key.lexeme, VARDATA_ANY(t->element),
332 8000 : key.length) == 0)
333 : {
334 288 : matched += t->frequency - matched * t->frequency;
335 288 : n_matched++;
336 : }
337 8000 : allmces += t->frequency - allmces * t->frequency;
338 : }
339 :
340 : /* Clamp to ensure sanity in the face of roundoff error */
341 8 : CLAMP_PROBABILITY(matched);
342 8 : CLAMP_PROBABILITY(allmces);
343 :
344 8 : selec = matched + (1.0 - allmces) * ((double) n_matched / length);
345 :
346 : /*
347 : * In any case, never believe that a prefix match has selectivity
348 : * less than we would assign for a non-MCELEM lexeme. This
349 : * preserves the property that "word:*" should be estimated to
350 : * match at least as many rows as "word" would be.
351 : */
352 8 : selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
353 : }
354 : else
355 : {
356 : /* Regular exact lexeme match */
357 : TextFreq *searchres;
358 :
359 : /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
360 96 : if (lookup == NULL)
361 2 : return (Selectivity) DEFAULT_TS_MATCH_SEL;
362 :
363 94 : searchres = (TextFreq *) bsearch(&key, lookup, length,
364 : sizeof(TextFreq),
365 : compare_lexeme_textfreq);
366 :
367 94 : if (searchres)
368 : {
369 : /*
370 : * The element is in MCELEM. Return precise selectivity (or
371 : * at least as precise as ANALYZE could find out).
372 : */
373 78 : selec = searchres->frequency;
374 : }
375 : else
376 : {
377 : /*
378 : * The element is not in MCELEM. Punt, but assume that the
379 : * selectivity cannot be more than minfreq / 2.
380 : */
381 16 : selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
382 : }
383 : }
384 : }
385 : else
386 : {
387 : /* Current TSQuery node is an operator */
388 : Selectivity s1,
389 : s2;
390 :
391 56 : switch (item->qoperator.oper)
392 : {
393 : case OP_NOT:
394 4 : selec = 1.0 - tsquery_opr_selec(item + 1, operand,
395 : lookup, length, minfreq);
396 4 : break;
397 :
398 : case OP_PHRASE:
399 : case OP_AND:
400 25 : s1 = tsquery_opr_selec(item + 1, operand,
401 : lookup, length, minfreq);
402 25 : s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
403 : lookup, length, minfreq);
404 25 : selec = s1 * s2;
405 25 : break;
406 :
407 : case OP_OR:
408 27 : s1 = tsquery_opr_selec(item + 1, operand,
409 : lookup, length, minfreq);
410 27 : s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
411 : lookup, length, minfreq);
412 27 : selec = s1 + s2 - s1 * s2;
413 27 : break;
414 :
415 : default:
416 0 : elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
417 : selec = 0; /* keep compiler quiet */
418 : break;
419 : }
420 : }
421 :
422 : /* Clamp intermediate results to stay sane despite roundoff error */
423 158 : CLAMP_PROBABILITY(selec);
424 :
425 158 : return selec;
426 : }
427 :
428 : /*
429 : * bsearch() comparator for a lexeme (non-NULL terminated string with length)
430 : * and a TextFreq. Use length, then byte-for-byte comparison, because that's
431 : * how ANALYZE code sorted data before storing it in a statistic tuple.
432 : * See ts_typanalyze.c for details.
433 : */
434 : static int
435 740 : compare_lexeme_textfreq(const void *e1, const void *e2)
436 : {
437 740 : const LexemeKey *key = (const LexemeKey *) e1;
438 740 : const TextFreq *t = (const TextFreq *) e2;
439 : int len1,
440 : len2;
441 :
442 740 : len1 = key->length;
443 740 : len2 = VARSIZE_ANY_EXHDR(t->element);
444 :
445 : /* Compare lengths first, possibly avoiding a strncmp call */
446 740 : if (len1 > len2)
447 144 : return 1;
448 596 : else if (len1 < len2)
449 0 : return -1;
450 :
451 : /* Fall back on byte-for-byte comparison */
452 596 : return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
453 : }
|