Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtutils.c
4 : * Utility code for Postgres btree implementation.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtutils.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include <time.h>
19 :
20 : #include "access/nbtree.h"
21 : #include "access/reloptions.h"
22 : #include "access/relscan.h"
23 : #include "miscadmin.h"
24 : #include "utils/array.h"
25 : #include "utils/lsyscache.h"
26 : #include "utils/memutils.h"
27 : #include "utils/rel.h"
28 :
29 :
30 : typedef struct BTSortArrayContext
31 : {
32 : FmgrInfo flinfo;
33 : Oid collation;
34 : bool reverse;
35 : } BTSortArrayContext;
36 :
37 : static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
38 : StrategyNumber strat,
39 : Datum *elems, int nelems);
40 : static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
41 : bool reverse,
42 : Datum *elems, int nelems);
43 : static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
44 : static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
45 : ScanKey leftarg, ScanKey rightarg,
46 : bool *result);
47 : static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
48 : static void _bt_mark_scankey_required(ScanKey skey);
49 : static bool _bt_check_rowcompare(ScanKey skey,
50 : IndexTuple tuple, TupleDesc tupdesc,
51 : ScanDirection dir, bool *continuescan);
52 :
53 :
54 : /*
55 : * _bt_mkscankey
56 : * Build an insertion scan key that contains comparison data from itup
57 : * as well as comparator routines appropriate to the key datatypes.
58 : *
59 : * The result is intended for use with _bt_compare().
60 : */
61 : ScanKey
62 206144 : _bt_mkscankey(Relation rel, IndexTuple itup)
63 : {
64 : ScanKey skey;
65 : TupleDesc itupdesc;
66 : int natts;
67 : int16 *indoption;
68 : int i;
69 :
70 206144 : itupdesc = RelationGetDescr(rel);
71 206144 : natts = RelationGetNumberOfAttributes(rel);
72 206144 : indoption = rel->rd_indoption;
73 :
74 206144 : skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
75 :
76 638761 : for (i = 0; i < natts; i++)
77 : {
78 : FmgrInfo *procinfo;
79 : Datum arg;
80 : bool null;
81 : int flags;
82 :
83 : /*
84 : * We can use the cached (default) support procs since no cross-type
85 : * comparison can be needed.
86 : */
87 432617 : procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
88 432617 : arg = index_getattr(itup, i + 1, itupdesc, &null);
89 432617 : flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
90 865234 : ScanKeyEntryInitializeWithInfo(&skey[i],
91 : flags,
92 432617 : (AttrNumber) (i + 1),
93 : InvalidStrategy,
94 : InvalidOid,
95 432617 : rel->rd_indcollation[i],
96 : procinfo,
97 : arg);
98 : }
99 :
100 206144 : return skey;
101 : }
102 :
103 : /*
104 : * _bt_mkscankey_nodata
105 : * Build an insertion scan key that contains 3-way comparator routines
106 : * appropriate to the key datatypes, but no comparison data. The
107 : * comparison data ultimately used must match the key datatypes.
108 : *
109 : * The result cannot be used with _bt_compare(), unless comparison
110 : * data is first stored into the key entries. Currently this
111 : * routine is only called by nbtsort.c and tuplesort.c, which have
112 : * their own comparison routines.
113 : */
114 : ScanKey
115 2637 : _bt_mkscankey_nodata(Relation rel)
116 : {
117 : ScanKey skey;
118 : int natts;
119 : int16 *indoption;
120 : int i;
121 :
122 2637 : natts = RelationGetNumberOfAttributes(rel);
123 2637 : indoption = rel->rd_indoption;
124 :
125 2637 : skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
126 :
127 7064 : for (i = 0; i < natts; i++)
128 : {
129 : FmgrInfo *procinfo;
130 : int flags;
131 :
132 : /*
133 : * We can use the cached (default) support procs since no cross-type
134 : * comparison can be needed.
135 : */
136 4427 : procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
137 4427 : flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
138 8854 : ScanKeyEntryInitializeWithInfo(&skey[i],
139 : flags,
140 4427 : (AttrNumber) (i + 1),
141 : InvalidStrategy,
142 : InvalidOid,
143 4427 : rel->rd_indcollation[i],
144 : procinfo,
145 : (Datum) 0);
146 : }
147 :
148 2637 : return skey;
149 : }
150 :
151 : /*
152 : * free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata.
153 : */
154 : void
155 208614 : _bt_freeskey(ScanKey skey)
156 : {
157 208614 : pfree(skey);
158 208614 : }
159 :
160 : /*
161 : * free a retracement stack made by _bt_search.
162 : */
163 : void
164 594821 : _bt_freestack(BTStack stack)
165 : {
166 : BTStack ostack;
167 :
168 1666177 : while (stack != NULL)
169 : {
170 476535 : ostack = stack;
171 476535 : stack = stack->bts_parent;
172 476535 : pfree(ostack);
173 : }
174 594821 : }
175 :
176 :
177 : /*
178 : * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
179 : *
180 : * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
181 : * set up BTArrayKeyInfo info for each one that is an equality-type key.
182 : * Prepare modified scan keys in so->arrayKeyData, which will hold the current
183 : * array elements during each primitive indexscan operation. For inequality
184 : * array keys, it's sufficient to find the extreme element value and replace
185 : * the whole array with that scalar value.
186 : *
187 : * Note: the reason we need so->arrayKeyData, rather than just scribbling
188 : * on scan->keyData, is that callers are permitted to call btrescan without
189 : * supplying a new set of scankey data.
190 : */
191 : void
192 389949 : _bt_preprocess_array_keys(IndexScanDesc scan)
193 : {
194 389949 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
195 389949 : int numberOfKeys = scan->numberOfKeys;
196 389949 : int16 *indoption = scan->indexRelation->rd_indoption;
197 : int numArrayKeys;
198 : ScanKey cur;
199 : int i;
200 : MemoryContext oldContext;
201 :
202 : /* Quick check to see if there are any array keys */
203 389949 : numArrayKeys = 0;
204 1057944 : for (i = 0; i < numberOfKeys; i++)
205 : {
206 667995 : cur = &scan->keyData[i];
207 667995 : if (cur->sk_flags & SK_SEARCHARRAY)
208 : {
209 31 : numArrayKeys++;
210 31 : Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
211 : /* If any arrays are null as a whole, we can quit right now. */
212 31 : if (cur->sk_flags & SK_ISNULL)
213 : {
214 0 : so->numArrayKeys = -1;
215 0 : so->arrayKeyData = NULL;
216 0 : return;
217 : }
218 : }
219 : }
220 :
221 : /* Quit if nothing to do. */
222 389949 : if (numArrayKeys == 0)
223 : {
224 389918 : so->numArrayKeys = 0;
225 389918 : so->arrayKeyData = NULL;
226 389918 : return;
227 : }
228 :
229 : /*
230 : * Make a scan-lifespan context to hold array-associated data, or reset it
231 : * if we already have one from a previous rescan cycle.
232 : */
233 31 : if (so->arrayContext == NULL)
234 31 : so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
235 : "BTree array context",
236 : ALLOCSET_SMALL_SIZES);
237 : else
238 0 : MemoryContextReset(so->arrayContext);
239 :
240 31 : oldContext = MemoryContextSwitchTo(so->arrayContext);
241 :
242 : /* Create modifiable copy of scan->keyData in the workspace context */
243 31 : so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
244 62 : memcpy(so->arrayKeyData,
245 31 : scan->keyData,
246 31 : scan->numberOfKeys * sizeof(ScanKeyData));
247 :
248 : /* Allocate space for per-array data in the workspace context */
249 31 : so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
250 :
251 : /* Now process each array key */
252 31 : numArrayKeys = 0;
253 66 : for (i = 0; i < numberOfKeys; i++)
254 : {
255 : ArrayType *arrayval;
256 : int16 elmlen;
257 : bool elmbyval;
258 : char elmalign;
259 : int num_elems;
260 : Datum *elem_values;
261 : bool *elem_nulls;
262 : int num_nonnulls;
263 : int j;
264 :
265 35 : cur = &so->arrayKeyData[i];
266 35 : if (!(cur->sk_flags & SK_SEARCHARRAY))
267 8 : continue;
268 :
269 : /*
270 : * First, deconstruct the array into elements. Anything allocated
271 : * here (including a possibly detoasted array value) is in the
272 : * workspace context.
273 : */
274 31 : arrayval = DatumGetArrayTypeP(cur->sk_argument);
275 : /* We could cache this data, but not clear it's worth it */
276 31 : get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
277 : &elmlen, &elmbyval, &elmalign);
278 31 : deconstruct_array(arrayval,
279 : ARR_ELEMTYPE(arrayval),
280 : elmlen, elmbyval, elmalign,
281 : &elem_values, &elem_nulls, &num_elems);
282 :
283 : /*
284 : * Compress out any null elements. We can ignore them since we assume
285 : * all btree operators are strict.
286 : */
287 31 : num_nonnulls = 0;
288 138 : for (j = 0; j < num_elems; j++)
289 : {
290 107 : if (!elem_nulls[j])
291 107 : elem_values[num_nonnulls++] = elem_values[j];
292 : }
293 :
294 : /* We could pfree(elem_nulls) now, but not worth the cycles */
295 :
296 : /* If there's no non-nulls, the scan qual is unsatisfiable */
297 31 : if (num_nonnulls == 0)
298 : {
299 0 : numArrayKeys = -1;
300 0 : break;
301 : }
302 :
303 : /*
304 : * If the comparison operator is not equality, then the array qual
305 : * degenerates to a simple comparison against the smallest or largest
306 : * non-null array element, as appropriate.
307 : */
308 31 : switch (cur->sk_strategy)
309 : {
310 : case BTLessStrategyNumber:
311 : case BTLessEqualStrategyNumber:
312 0 : cur->sk_argument =
313 0 : _bt_find_extreme_element(scan, cur,
314 : BTGreaterStrategyNumber,
315 : elem_values, num_nonnulls);
316 0 : continue;
317 : case BTEqualStrategyNumber:
318 : /* proceed with rest of loop */
319 31 : break;
320 : case BTGreaterEqualStrategyNumber:
321 : case BTGreaterStrategyNumber:
322 0 : cur->sk_argument =
323 0 : _bt_find_extreme_element(scan, cur,
324 : BTLessStrategyNumber,
325 : elem_values, num_nonnulls);
326 0 : continue;
327 : default:
328 0 : elog(ERROR, "unrecognized StrategyNumber: %d",
329 : (int) cur->sk_strategy);
330 : break;
331 : }
332 :
333 : /*
334 : * Sort the non-null elements and eliminate any duplicates. We must
335 : * sort in the same ordering used by the index column, so that the
336 : * successive primitive indexscans produce data in index order.
337 : */
338 62 : num_elems = _bt_sort_array_elements(scan, cur,
339 31 : (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
340 : elem_values, num_nonnulls);
341 :
342 : /*
343 : * And set up the BTArrayKeyInfo data.
344 : */
345 31 : so->arrayKeys[numArrayKeys].scan_key = i;
346 31 : so->arrayKeys[numArrayKeys].num_elems = num_elems;
347 31 : so->arrayKeys[numArrayKeys].elem_values = elem_values;
348 31 : numArrayKeys++;
349 : }
350 :
351 31 : so->numArrayKeys = numArrayKeys;
352 :
353 31 : MemoryContextSwitchTo(oldContext);
354 : }
355 :
356 : /*
357 : * _bt_find_extreme_element() -- get least or greatest array element
358 : *
359 : * scan and skey identify the index column, whose opfamily determines the
360 : * comparison semantics. strat should be BTLessStrategyNumber to get the
361 : * least element, or BTGreaterStrategyNumber to get the greatest.
362 : */
363 : static Datum
364 0 : _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
365 : StrategyNumber strat,
366 : Datum *elems, int nelems)
367 : {
368 0 : Relation rel = scan->indexRelation;
369 : Oid elemtype,
370 : cmp_op;
371 : RegProcedure cmp_proc;
372 : FmgrInfo flinfo;
373 : Datum result;
374 : int i;
375 :
376 : /*
377 : * Determine the nominal datatype of the array elements. We have to
378 : * support the convention that sk_subtype == InvalidOid means the opclass
379 : * input type; this is a hack to simplify life for ScanKeyInit().
380 : */
381 0 : elemtype = skey->sk_subtype;
382 0 : if (elemtype == InvalidOid)
383 0 : elemtype = rel->rd_opcintype[skey->sk_attno - 1];
384 :
385 : /*
386 : * Look up the appropriate comparison operator in the opfamily.
387 : *
388 : * Note: it's possible that this would fail, if the opfamily is
389 : * incomplete, but it seems quite unlikely that an opfamily would omit
390 : * non-cross-type comparison operators for any datatype that it supports
391 : * at all.
392 : */
393 0 : cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
394 : elemtype,
395 : elemtype,
396 : strat);
397 0 : if (!OidIsValid(cmp_op))
398 0 : elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
399 : strat, elemtype, elemtype,
400 : rel->rd_opfamily[skey->sk_attno - 1]);
401 0 : cmp_proc = get_opcode(cmp_op);
402 0 : if (!RegProcedureIsValid(cmp_proc))
403 0 : elog(ERROR, "missing oprcode for operator %u", cmp_op);
404 :
405 0 : fmgr_info(cmp_proc, &flinfo);
406 :
407 0 : Assert(nelems > 0);
408 0 : result = elems[0];
409 0 : for (i = 1; i < nelems; i++)
410 : {
411 0 : if (DatumGetBool(FunctionCall2Coll(&flinfo,
412 : skey->sk_collation,
413 : elems[i],
414 : result)))
415 0 : result = elems[i];
416 : }
417 :
418 0 : return result;
419 : }
420 :
421 : /*
422 : * _bt_sort_array_elements() -- sort and de-dup array elements
423 : *
424 : * The array elements are sorted in-place, and the new number of elements
425 : * after duplicate removal is returned.
426 : *
427 : * scan and skey identify the index column, whose opfamily determines the
428 : * comparison semantics. If reverse is true, we sort in descending order.
429 : */
430 : static int
431 31 : _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
432 : bool reverse,
433 : Datum *elems, int nelems)
434 : {
435 31 : Relation rel = scan->indexRelation;
436 : Oid elemtype;
437 : RegProcedure cmp_proc;
438 : BTSortArrayContext cxt;
439 : int last_non_dup;
440 : int i;
441 :
442 31 : if (nelems <= 1)
443 0 : return nelems; /* no work to do */
444 :
445 : /*
446 : * Determine the nominal datatype of the array elements. We have to
447 : * support the convention that sk_subtype == InvalidOid means the opclass
448 : * input type; this is a hack to simplify life for ScanKeyInit().
449 : */
450 31 : elemtype = skey->sk_subtype;
451 31 : if (elemtype == InvalidOid)
452 0 : elemtype = rel->rd_opcintype[skey->sk_attno - 1];
453 :
454 : /*
455 : * Look up the appropriate comparison function in the opfamily.
456 : *
457 : * Note: it's possible that this would fail, if the opfamily is
458 : * incomplete, but it seems quite unlikely that an opfamily would omit
459 : * non-cross-type support functions for any datatype that it supports at
460 : * all.
461 : */
462 31 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
463 : elemtype,
464 : elemtype,
465 : BTORDER_PROC);
466 31 : if (!RegProcedureIsValid(cmp_proc))
467 0 : elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
468 : BTORDER_PROC, elemtype, elemtype,
469 : rel->rd_opfamily[skey->sk_attno - 1]);
470 :
471 : /* Sort the array elements */
472 31 : fmgr_info(cmp_proc, &cxt.flinfo);
473 31 : cxt.collation = skey->sk_collation;
474 31 : cxt.reverse = reverse;
475 31 : qsort_arg((void *) elems, nelems, sizeof(Datum),
476 : _bt_compare_array_elements, (void *) &cxt);
477 :
478 : /* Now scan the sorted elements and remove duplicates */
479 31 : last_non_dup = 0;
480 107 : for (i = 1; i < nelems; i++)
481 : {
482 : int32 compare;
483 :
484 76 : compare = DatumGetInt32(FunctionCall2Coll(&cxt.flinfo,
485 : cxt.collation,
486 : elems[last_non_dup],
487 : elems[i]));
488 76 : if (compare != 0)
489 76 : elems[++last_non_dup] = elems[i];
490 : }
491 :
492 31 : return last_non_dup + 1;
493 : }
494 :
495 : /*
496 : * qsort_arg comparator for sorting array elements
497 : */
498 : static int
499 127 : _bt_compare_array_elements(const void *a, const void *b, void *arg)
500 : {
501 127 : Datum da = *((const Datum *) a);
502 127 : Datum db = *((const Datum *) b);
503 127 : BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
504 : int32 compare;
505 :
506 127 : compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo,
507 : cxt->collation,
508 : da, db));
509 127 : if (cxt->reverse)
510 0 : compare = -compare;
511 127 : return compare;
512 : }
513 :
514 : /*
515 : * _bt_start_array_keys() -- Initialize array keys at start of a scan
516 : *
517 : * Set up the cur_elem counters and fill in the first sk_argument value for
518 : * each array scankey. We can't do this until we know the scan direction.
519 : */
520 : void
521 31 : _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
522 : {
523 31 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
524 : int i;
525 :
526 62 : for (i = 0; i < so->numArrayKeys; i++)
527 : {
528 31 : BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
529 31 : ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
530 :
531 31 : Assert(curArrayKey->num_elems > 0);
532 31 : if (ScanDirectionIsBackward(dir))
533 0 : curArrayKey->cur_elem = curArrayKey->num_elems - 1;
534 : else
535 31 : curArrayKey->cur_elem = 0;
536 31 : skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
537 : }
538 31 : }
539 :
540 : /*
541 : * _bt_advance_array_keys() -- Advance to next set of array elements
542 : *
543 : * Returns TRUE if there is another set of values to consider, FALSE if not.
544 : * On TRUE result, the scankeys are initialized with the next set of values.
545 : */
546 : bool
547 107 : _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
548 : {
549 107 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
550 107 : bool found = false;
551 : int i;
552 :
553 : /*
554 : * We must advance the last array key most quickly, since it will
555 : * correspond to the lowest-order index column among the available
556 : * qualifications. This is necessary to ensure correct ordering of output
557 : * when there are multiple array keys.
558 : */
559 138 : for (i = so->numArrayKeys - 1; i >= 0; i--)
560 : {
561 107 : BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
562 107 : ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
563 107 : int cur_elem = curArrayKey->cur_elem;
564 107 : int num_elems = curArrayKey->num_elems;
565 :
566 107 : if (ScanDirectionIsBackward(dir))
567 : {
568 0 : if (--cur_elem < 0)
569 : {
570 0 : cur_elem = num_elems - 1;
571 0 : found = false; /* need to advance next array key */
572 : }
573 : else
574 0 : found = true;
575 : }
576 : else
577 : {
578 107 : if (++cur_elem >= num_elems)
579 : {
580 31 : cur_elem = 0;
581 31 : found = false; /* need to advance next array key */
582 : }
583 : else
584 76 : found = true;
585 : }
586 :
587 107 : curArrayKey->cur_elem = cur_elem;
588 107 : skey->sk_argument = curArrayKey->elem_values[cur_elem];
589 107 : if (found)
590 76 : break;
591 : }
592 :
593 : /* advance parallel scan */
594 107 : if (scan->parallel_scan != NULL)
595 0 : _bt_parallel_advance_array_keys(scan);
596 :
597 107 : return found;
598 : }
599 :
600 : /*
601 : * _bt_mark_array_keys() -- Handle array keys during btmarkpos
602 : *
603 : * Save the current state of the array keys as the "mark" position.
604 : */
605 : void
606 0 : _bt_mark_array_keys(IndexScanDesc scan)
607 : {
608 0 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
609 : int i;
610 :
611 0 : for (i = 0; i < so->numArrayKeys; i++)
612 : {
613 0 : BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
614 :
615 0 : curArrayKey->mark_elem = curArrayKey->cur_elem;
616 : }
617 0 : }
618 :
619 : /*
620 : * _bt_restore_array_keys() -- Handle array keys during btrestrpos
621 : *
622 : * Restore the array keys to where they were when the mark was set.
623 : */
624 : void
625 0 : _bt_restore_array_keys(IndexScanDesc scan)
626 : {
627 0 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
628 0 : bool changed = false;
629 : int i;
630 :
631 : /* Restore each array key to its position when the mark was set */
632 0 : for (i = 0; i < so->numArrayKeys; i++)
633 : {
634 0 : BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
635 0 : ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
636 0 : int mark_elem = curArrayKey->mark_elem;
637 :
638 0 : if (curArrayKey->cur_elem != mark_elem)
639 : {
640 0 : curArrayKey->cur_elem = mark_elem;
641 0 : skey->sk_argument = curArrayKey->elem_values[mark_elem];
642 0 : changed = true;
643 : }
644 : }
645 :
646 : /*
647 : * If we changed any keys, we must redo _bt_preprocess_keys. That might
648 : * sound like overkill, but in cases with multiple keys per index column
649 : * it seems necessary to do the full set of pushups.
650 : */
651 0 : if (changed)
652 : {
653 0 : _bt_preprocess_keys(scan);
654 : /* The mark should have been set on a consistent set of keys... */
655 0 : Assert(so->qual_ok);
656 : }
657 0 : }
658 :
659 :
660 : /*
661 : * _bt_preprocess_keys() -- Preprocess scan keys
662 : *
663 : * The given search-type keys (in scan->keyData[] or so->arrayKeyData[])
664 : * are copied to so->keyData[] with possible transformation.
665 : * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
666 : * the number of output keys (possibly less, never greater).
667 : *
668 : * The output keys are marked with additional sk_flag bits beyond the
669 : * system-standard bits supplied by the caller. The DESC and NULLS_FIRST
670 : * indoption bits for the relevant index attribute are copied into the flags.
671 : * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
672 : * so that the index sorts in the desired direction.
673 : *
674 : * One key purpose of this routine is to discover which scan keys must be
675 : * satisfied to continue the scan. It also attempts to eliminate redundant
676 : * keys and detect contradictory keys. (If the index opfamily provides
677 : * incomplete sets of cross-type operators, we may fail to detect redundant
678 : * or contradictory keys, but we can survive that.)
679 : *
680 : * The output keys must be sorted by index attribute. Presently we expect
681 : * (but verify) that the input keys are already so sorted --- this is done
682 : * by match_clauses_to_index() in indxpath.c. Some reordering of the keys
683 : * within each attribute may be done as a byproduct of the processing here,
684 : * but no other code depends on that.
685 : *
686 : * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
687 : * if they must be satisfied in order to continue the scan forward or backward
688 : * respectively. _bt_checkkeys uses these flags. For example, if the quals
689 : * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
690 : * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
691 : * But once we reach tuples like (1,4,z) we can stop scanning because no
692 : * later tuples could match. This is reflected by marking the x and y keys,
693 : * but not the z key, with SK_BT_REQFWD. In general, the keys for leading
694 : * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
695 : * For the first attribute without an "=" key, any "<" and "<=" keys are
696 : * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
697 : * This can be seen to be correct by considering the above example. Note
698 : * in particular that if there are no keys for a given attribute, the keys for
699 : * subsequent attributes can never be required; for instance "WHERE y = 4"
700 : * requires a full-index scan.
701 : *
702 : * If possible, redundant keys are eliminated: we keep only the tightest
703 : * >/>= bound and the tightest </<= bound, and if there's an = key then
704 : * that's the only one returned. (So, we return either a single = key,
705 : * or one or two boundary-condition keys for each attr.) However, if we
706 : * cannot compare two keys for lack of a suitable cross-type operator,
707 : * we cannot eliminate either. If there are two such keys of the same
708 : * operator strategy, the second one is just pushed into the output array
709 : * without further processing here. We may also emit both >/>= or both
710 : * </<= keys if we can't compare them. The logic about required keys still
711 : * works if we don't eliminate redundant keys.
712 : *
713 : * Note that one reason we need direction-sensitive required-key flags is
714 : * precisely that we may not be able to eliminate redundant keys. Suppose
715 : * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
716 : * which key is more restrictive for lack of a suitable cross-type operator.
717 : * _bt_first will arbitrarily pick one of the keys to do the initial
718 : * positioning with. If it picks x > 4, then the x > 10 condition will fail
719 : * until we reach index entries > 10; but we can't stop the scan just because
720 : * x > 10 is failing. On the other hand, if we are scanning backwards, then
721 : * failure of either key is indeed enough to stop the scan. (In general, when
722 : * inequality keys are present, the initial-positioning code only promises to
723 : * position before the first possible match, not exactly at the first match,
724 : * for a forward scan; or after the last match for a backward scan.)
725 : *
726 : * As a byproduct of this work, we can detect contradictory quals such
727 : * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = FALSE,
728 : * indicating the scan need not be run at all since no tuples can match.
729 : * (In this case we do not bother completing the output key array!)
730 : * Again, missing cross-type operators might cause us to fail to prove the
731 : * quals contradictory when they really are, but the scan will work correctly.
732 : *
733 : * Row comparison keys are currently also treated without any smarts:
734 : * we just transfer them into the preprocessed array without any
735 : * editorialization. We can treat them the same as an ordinary inequality
736 : * comparison on the row's first index column, for the purposes of the logic
737 : * about required keys.
738 : *
739 : * Note: the reason we have to copy the preprocessed scan keys into private
740 : * storage is that we are modifying the array based on comparisons of the
741 : * key argument values, which could change on a rescan or after moving to
742 : * new elements of array keys. Therefore we can't overwrite the source data.
743 : */
744 : void
745 389698 : _bt_preprocess_keys(IndexScanDesc scan)
746 : {
747 389698 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
748 389698 : int numberOfKeys = scan->numberOfKeys;
749 389698 : int16 *indoption = scan->indexRelation->rd_indoption;
750 : int new_numberOfKeys;
751 : int numberOfEqualCols;
752 : ScanKey inkeys;
753 : ScanKey outkeys;
754 : ScanKey cur;
755 : ScanKey xform[BTMaxStrategyNumber];
756 : bool test_result;
757 : int i,
758 : j;
759 : AttrNumber attno;
760 :
761 : /* initialize result variables */
762 389698 : so->qual_ok = true;
763 389698 : so->numberOfKeys = 0;
764 :
765 389698 : if (numberOfKeys < 1)
766 155541 : return; /* done if qual-less scan */
767 :
768 : /*
769 : * Read so->arrayKeyData if array keys are present, else scan->keyData
770 : */
771 389354 : if (so->arrayKeyData != NULL)
772 107 : inkeys = so->arrayKeyData;
773 : else
774 389247 : inkeys = scan->keyData;
775 :
776 389354 : outkeys = so->keyData;
777 389354 : cur = &inkeys[0];
778 : /* we check that input keys are correctly ordered */
779 389354 : if (cur->sk_attno < 1)
780 0 : elog(ERROR, "btree index keys must be ordered by attribute");
781 :
782 : /* We can short-circuit most of the work if there's just one key */
783 389354 : if (numberOfKeys == 1)
784 : {
785 : /* Apply indoption to scankey (might change sk_strategy!) */
786 154849 : if (!_bt_fix_scankey_strategy(cur, indoption))
787 1 : so->qual_ok = false;
788 154849 : memcpy(outkeys, cur, sizeof(ScanKeyData));
789 154849 : so->numberOfKeys = 1;
790 : /* We can mark the qual as required if it's for first index col */
791 154849 : if (cur->sk_attno == 1)
792 154831 : _bt_mark_scankey_required(outkeys);
793 154849 : return;
794 : }
795 :
796 : /*
797 : * Otherwise, do the full set of pushups.
798 : */
799 234505 : new_numberOfKeys = 0;
800 234505 : numberOfEqualCols = 0;
801 :
802 : /*
803 : * Initialize for processing of keys for attr 1.
804 : *
805 : * xform[i] points to the currently best scan key of strategy type i+1; it
806 : * is NULL if we haven't yet found such a key for this attr.
807 : */
808 234505 : attno = 1;
809 234505 : memset(xform, 0, sizeof(xform));
810 :
811 : /*
812 : * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
813 : * handle after-last-key processing. Actual exit from the loop is at the
814 : * "break" statement below.
815 : */
816 747427 : for (i = 0;; cur++, i++)
817 : {
818 747427 : if (i < numberOfKeys)
819 : {
820 : /* Apply indoption to scankey (might change sk_strategy!) */
821 512922 : if (!_bt_fix_scankey_strategy(cur, indoption))
822 : {
823 : /* NULL can't be matched, so give up */
824 0 : so->qual_ok = false;
825 0 : return;
826 : }
827 : }
828 :
829 : /*
830 : * If we are at the end of the keys for a particular attr, finish up
831 : * processing and emit the cleaned-up keys.
832 : */
833 747427 : if (i == numberOfKeys || cur->sk_attno != attno)
834 : {
835 512836 : int priorNumberOfEqualCols = numberOfEqualCols;
836 :
837 : /* check input keys are correctly ordered */
838 512836 : if (i < numberOfKeys && cur->sk_attno < attno)
839 0 : elog(ERROR, "btree index keys must be ordered by attribute");
840 :
841 : /*
842 : * If = has been specified, all other keys can be eliminated as
843 : * redundant. If we have a case like key = 1 AND key > 2, we can
844 : * set qual_ok to false and abandon further processing.
845 : *
846 : * We also have to deal with the case of "key IS NULL", which is
847 : * unsatisfiable in combination with any other index condition. By
848 : * the time we get here, that's been classified as an equality
849 : * check, and we've rejected any combination of it with a regular
850 : * equality condition; but not with other types of conditions.
851 : */
852 512836 : if (xform[BTEqualStrategyNumber - 1])
853 : {
854 486162 : ScanKey eq = xform[BTEqualStrategyNumber - 1];
855 :
856 3403114 : for (j = BTMaxStrategyNumber; --j >= 0;)
857 : {
858 2430794 : ScanKey chk = xform[j];
859 :
860 2430794 : if (!chk || j == (BTEqualStrategyNumber - 1))
861 2430778 : continue;
862 :
863 16 : if (eq->sk_flags & SK_SEARCHNULL)
864 : {
865 : /* IS NULL is contradictory to anything else */
866 4 : so->qual_ok = false;
867 4 : return;
868 : }
869 :
870 12 : if (_bt_compare_scankey_args(scan, chk, eq, chk,
871 : &test_result))
872 : {
873 12 : if (!test_result)
874 : {
875 : /* keys proven mutually contradictory */
876 0 : so->qual_ok = false;
877 0 : return;
878 : }
879 : /* else discard the redundant non-equality key */
880 12 : xform[j] = NULL;
881 : }
882 : /* else, cannot determine redundancy, keep both keys */
883 : }
884 : /* track number of attrs for which we have "=" keys */
885 486158 : numberOfEqualCols++;
886 : }
887 :
888 : /* try to keep only one of <, <= */
889 512832 : if (xform[BTLessStrategyNumber - 1]
890 82 : && xform[BTLessEqualStrategyNumber - 1])
891 : {
892 0 : ScanKey lt = xform[BTLessStrategyNumber - 1];
893 0 : ScanKey le = xform[BTLessEqualStrategyNumber - 1];
894 :
895 0 : if (_bt_compare_scankey_args(scan, le, lt, le,
896 : &test_result))
897 : {
898 0 : if (test_result)
899 0 : xform[BTLessEqualStrategyNumber - 1] = NULL;
900 : else
901 0 : xform[BTLessStrategyNumber - 1] = NULL;
902 : }
903 : }
904 :
905 : /* try to keep only one of >, >= */
906 512832 : if (xform[BTGreaterStrategyNumber - 1]
907 26135 : && xform[BTGreaterEqualStrategyNumber - 1])
908 : {
909 0 : ScanKey gt = xform[BTGreaterStrategyNumber - 1];
910 0 : ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1];
911 :
912 0 : if (_bt_compare_scankey_args(scan, ge, gt, ge,
913 : &test_result))
914 : {
915 0 : if (test_result)
916 0 : xform[BTGreaterEqualStrategyNumber - 1] = NULL;
917 : else
918 0 : xform[BTGreaterStrategyNumber - 1] = NULL;
919 : }
920 : }
921 :
922 : /*
923 : * Emit the cleaned-up keys into the outkeys[] array, and then
924 : * mark them if they are required. They are required (possibly
925 : * only in one direction) if all attrs before this one had "=".
926 : */
927 3589824 : for (j = BTMaxStrategyNumber; --j >= 0;)
928 : {
929 2564160 : if (xform[j])
930 : {
931 512900 : ScanKey outkey = &outkeys[new_numberOfKeys++];
932 :
933 512900 : memcpy(outkey, xform[j], sizeof(ScanKeyData));
934 512900 : if (priorNumberOfEqualCols == attno - 1)
935 512847 : _bt_mark_scankey_required(outkey);
936 : }
937 : }
938 :
939 : /*
940 : * Exit loop here if done.
941 : */
942 512832 : if (i == numberOfKeys)
943 234501 : break;
944 :
945 : /* Re-initialize for new attno */
946 278331 : attno = cur->sk_attno;
947 278331 : memset(xform, 0, sizeof(xform));
948 : }
949 :
950 : /* check strategy this key's operator corresponds to */
951 512922 : j = cur->sk_strategy - 1;
952 :
953 : /* if row comparison, push it directly to the output array */
954 512922 : if (cur->sk_flags & SK_ROW_HEADER)
955 : {
956 0 : ScanKey outkey = &outkeys[new_numberOfKeys++];
957 :
958 0 : memcpy(outkey, cur, sizeof(ScanKeyData));
959 0 : if (numberOfEqualCols == attno - 1)
960 0 : _bt_mark_scankey_required(outkey);
961 :
962 : /*
963 : * We don't support RowCompare using equality; such a qual would
964 : * mess up the numberOfEqualCols tracking.
965 : */
966 0 : Assert(j != (BTEqualStrategyNumber - 1));
967 0 : continue;
968 : }
969 :
970 : /* have we seen one of these before? */
971 512922 : if (xform[j] == NULL)
972 : {
973 : /* nope, so remember this scankey */
974 512920 : xform[j] = cur;
975 : }
976 : else
977 : {
978 : /* yup, keep only the more restrictive key */
979 2 : if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
980 : &test_result))
981 : {
982 2 : if (test_result)
983 2 : xform[j] = cur;
984 0 : else if (j == (BTEqualStrategyNumber - 1))
985 : {
986 : /* key == a && key == b, but a != b */
987 0 : so->qual_ok = false;
988 0 : return;
989 : }
990 : /* else old key is more restrictive, keep it */
991 : }
992 : else
993 : {
994 : /*
995 : * We can't determine which key is more restrictive. Keep the
996 : * previous one in xform[j] and push this one directly to the
997 : * output array.
998 : */
999 0 : ScanKey outkey = &outkeys[new_numberOfKeys++];
1000 :
1001 0 : memcpy(outkey, cur, sizeof(ScanKeyData));
1002 0 : if (numberOfEqualCols == attno - 1)
1003 0 : _bt_mark_scankey_required(outkey);
1004 : }
1005 : }
1006 512922 : }
1007 :
1008 234501 : so->numberOfKeys = new_numberOfKeys;
1009 : }
1010 :
1011 : /*
1012 : * Compare two scankey values using a specified operator.
1013 : *
1014 : * The test we want to perform is logically "leftarg op rightarg", where
1015 : * leftarg and rightarg are the sk_argument values in those ScanKeys, and
1016 : * the comparison operator is the one in the op ScanKey. However, in
1017 : * cross-data-type situations we may need to look up the correct operator in
1018 : * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
1019 : * and amoplefttype/amoprighttype equal to the two argument datatypes.
1020 : *
1021 : * If the opfamily doesn't supply a complete set of cross-type operators we
1022 : * may not be able to make the comparison. If we can make the comparison
1023 : * we store the operator result in *result and return TRUE. We return FALSE
1024 : * if the comparison could not be made.
1025 : *
1026 : * Note: op always points at the same ScanKey as either leftarg or rightarg.
1027 : * Since we don't scribble on the scankeys, this aliasing should cause no
1028 : * trouble.
1029 : *
1030 : * Note: this routine needs to be insensitive to any DESC option applied
1031 : * to the index column. For example, "x < 4" is a tighter constraint than
1032 : * "x < 5" regardless of which way the index is sorted.
1033 : */
1034 : static bool
1035 14 : _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
1036 : ScanKey leftarg, ScanKey rightarg,
1037 : bool *result)
1038 : {
1039 14 : Relation rel = scan->indexRelation;
1040 : Oid lefttype,
1041 : righttype,
1042 : optype,
1043 : opcintype,
1044 : cmp_op;
1045 : StrategyNumber strat;
1046 :
1047 : /*
1048 : * First, deal with cases where one or both args are NULL. This should
1049 : * only happen when the scankeys represent IS NULL/NOT NULL conditions.
1050 : */
1051 14 : if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
1052 : {
1053 : bool leftnull,
1054 : rightnull;
1055 :
1056 4 : if (leftarg->sk_flags & SK_ISNULL)
1057 : {
1058 0 : Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
1059 0 : leftnull = true;
1060 : }
1061 : else
1062 4 : leftnull = false;
1063 4 : if (rightarg->sk_flags & SK_ISNULL)
1064 : {
1065 4 : Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
1066 4 : rightnull = true;
1067 : }
1068 : else
1069 0 : rightnull = false;
1070 :
1071 : /*
1072 : * We treat NULL as either greater than or less than all other values.
1073 : * Since true > false, the tests below work correctly for NULLS LAST
1074 : * logic. If the index is NULLS FIRST, we need to flip the strategy.
1075 : */
1076 4 : strat = op->sk_strategy;
1077 4 : if (op->sk_flags & SK_BT_NULLS_FIRST)
1078 0 : strat = BTCommuteStrategyNumber(strat);
1079 :
1080 4 : switch (strat)
1081 : {
1082 : case BTLessStrategyNumber:
1083 4 : *result = (leftnull < rightnull);
1084 4 : break;
1085 : case BTLessEqualStrategyNumber:
1086 0 : *result = (leftnull <= rightnull);
1087 0 : break;
1088 : case BTEqualStrategyNumber:
1089 0 : *result = (leftnull == rightnull);
1090 0 : break;
1091 : case BTGreaterEqualStrategyNumber:
1092 0 : *result = (leftnull >= rightnull);
1093 0 : break;
1094 : case BTGreaterStrategyNumber:
1095 0 : *result = (leftnull > rightnull);
1096 0 : break;
1097 : default:
1098 0 : elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
1099 : *result = false; /* keep compiler quiet */
1100 : break;
1101 : }
1102 4 : return true;
1103 : }
1104 :
1105 : /*
1106 : * The opfamily we need to worry about is identified by the index column.
1107 : */
1108 10 : Assert(leftarg->sk_attno == rightarg->sk_attno);
1109 :
1110 10 : opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
1111 :
1112 : /*
1113 : * Determine the actual datatypes of the ScanKey arguments. We have to
1114 : * support the convention that sk_subtype == InvalidOid means the opclass
1115 : * input type; this is a hack to simplify life for ScanKeyInit().
1116 : */
1117 10 : lefttype = leftarg->sk_subtype;
1118 10 : if (lefttype == InvalidOid)
1119 0 : lefttype = opcintype;
1120 10 : righttype = rightarg->sk_subtype;
1121 10 : if (righttype == InvalidOid)
1122 0 : righttype = opcintype;
1123 10 : optype = op->sk_subtype;
1124 10 : if (optype == InvalidOid)
1125 0 : optype = opcintype;
1126 :
1127 : /*
1128 : * If leftarg and rightarg match the types expected for the "op" scankey,
1129 : * we can use its already-looked-up comparison function.
1130 : */
1131 10 : if (lefttype == opcintype && righttype == optype)
1132 : {
1133 10 : *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
1134 : op->sk_collation,
1135 : leftarg->sk_argument,
1136 : rightarg->sk_argument));
1137 10 : return true;
1138 : }
1139 :
1140 : /*
1141 : * Otherwise, we need to go to the syscache to find the appropriate
1142 : * operator. (This cannot result in infinite recursion, since no
1143 : * indexscan initiated by syscache lookup will use cross-data-type
1144 : * operators.)
1145 : *
1146 : * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
1147 : * un-flip it to get the correct opfamily member.
1148 : */
1149 0 : strat = op->sk_strategy;
1150 0 : if (op->sk_flags & SK_BT_DESC)
1151 0 : strat = BTCommuteStrategyNumber(strat);
1152 :
1153 0 : cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
1154 : lefttype,
1155 : righttype,
1156 : strat);
1157 0 : if (OidIsValid(cmp_op))
1158 : {
1159 0 : RegProcedure cmp_proc = get_opcode(cmp_op);
1160 :
1161 0 : if (RegProcedureIsValid(cmp_proc))
1162 : {
1163 0 : *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
1164 : op->sk_collation,
1165 : leftarg->sk_argument,
1166 : rightarg->sk_argument));
1167 0 : return true;
1168 : }
1169 : }
1170 :
1171 : /* Can't make the comparison */
1172 0 : *result = false; /* suppress compiler warnings */
1173 0 : return false;
1174 : }
1175 :
1176 : /*
1177 : * Adjust a scankey's strategy and flags setting as needed for indoptions.
1178 : *
1179 : * We copy the appropriate indoption value into the scankey sk_flags
1180 : * (shifting to avoid clobbering system-defined flag bits). Also, if
1181 : * the DESC option is set, commute (flip) the operator strategy number.
1182 : *
1183 : * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
1184 : * the strategy field correctly for them.
1185 : *
1186 : * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
1187 : * NULL comparison value. Since all btree operators are assumed strict,
1188 : * a NULL means that the qual cannot be satisfied. We return TRUE if the
1189 : * comparison value isn't NULL, or FALSE if the scan should be abandoned.
1190 : *
1191 : * This function is applied to the *input* scankey structure; therefore
1192 : * on a rescan we will be looking at already-processed scankeys. Hence
1193 : * we have to be careful not to re-commute the strategy if we already did it.
1194 : * It's a bit ugly to modify the caller's copy of the scankey but in practice
1195 : * there shouldn't be any problem, since the index's indoptions are certainly
1196 : * not going to change while the scankey survives.
1197 : */
1198 : static bool
1199 667771 : _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
1200 : {
1201 : int addflags;
1202 :
1203 667771 : addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
1204 :
1205 : /*
1206 : * We treat all btree operators as strict (even if they're not so marked
1207 : * in pg_proc). This means that it is impossible for an operator condition
1208 : * with a NULL comparison constant to succeed, and we can reject it right
1209 : * away.
1210 : *
1211 : * However, we now also support "x IS NULL" clauses as search conditions,
1212 : * so in that case keep going. The planner has not filled in any
1213 : * particular strategy in this case, so set it to BTEqualStrategyNumber
1214 : * --- we can treat IS NULL as an equality operator for purposes of search
1215 : * strategy.
1216 : *
1217 : * Likewise, "x IS NOT NULL" is supported. We treat that as either "less
1218 : * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
1219 : * FIRST index.
1220 : *
1221 : * Note: someday we might have to fill in sk_collation from the index
1222 : * column's collation. At the moment this is a non-issue because we'll
1223 : * never actually call the comparison operator on a NULL.
1224 : */
1225 667771 : if (skey->sk_flags & SK_ISNULL)
1226 : {
1227 : /* SK_ISNULL shouldn't be set in a row header scankey */
1228 667 : Assert(!(skey->sk_flags & SK_ROW_HEADER));
1229 :
1230 : /* Set indoption flags in scankey (might be done already) */
1231 667 : skey->sk_flags |= addflags;
1232 :
1233 : /* Set correct strategy for IS NULL or NOT NULL search */
1234 667 : if (skey->sk_flags & SK_SEARCHNULL)
1235 : {
1236 20 : skey->sk_strategy = BTEqualStrategyNumber;
1237 20 : skey->sk_subtype = InvalidOid;
1238 20 : skey->sk_collation = InvalidOid;
1239 : }
1240 647 : else if (skey->sk_flags & SK_SEARCHNOTNULL)
1241 : {
1242 646 : if (skey->sk_flags & SK_BT_NULLS_FIRST)
1243 6 : skey->sk_strategy = BTGreaterStrategyNumber;
1244 : else
1245 640 : skey->sk_strategy = BTLessStrategyNumber;
1246 646 : skey->sk_subtype = InvalidOid;
1247 646 : skey->sk_collation = InvalidOid;
1248 : }
1249 : else
1250 : {
1251 : /* regular qual, so it cannot be satisfied */
1252 1 : return false;
1253 : }
1254 :
1255 : /* Needn't do the rest */
1256 666 : return true;
1257 : }
1258 :
1259 : /* Adjust strategy for DESC, if we didn't already */
1260 667104 : if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
1261 0 : skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
1262 667104 : skey->sk_flags |= addflags;
1263 :
1264 : /* If it's a row header, fix row member flags and strategies similarly */
1265 667104 : if (skey->sk_flags & SK_ROW_HEADER)
1266 : {
1267 2 : ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
1268 :
1269 : for (;;)
1270 : {
1271 4 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1272 4 : addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
1273 4 : if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
1274 0 : subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
1275 4 : subkey->sk_flags |= addflags;
1276 4 : if (subkey->sk_flags & SK_ROW_END)
1277 2 : break;
1278 2 : subkey++;
1279 2 : }
1280 : }
1281 :
1282 667104 : return true;
1283 : }
1284 :
1285 : /*
1286 : * Mark a scankey as "required to continue the scan".
1287 : *
1288 : * Depending on the operator type, the key may be required for both scan
1289 : * directions or just one. Also, if the key is a row comparison header,
1290 : * we have to mark its first subsidiary ScanKey as required. (Subsequent
1291 : * subsidiary ScanKeys are normally for lower-order columns, and thus
1292 : * cannot be required, since they're after the first non-equality scankey.)
1293 : *
1294 : * Note: when we set required-key flag bits in a subsidiary scankey, we are
1295 : * scribbling on a data structure belonging to the index AM's caller, not on
1296 : * our private copy. This should be OK because the marking will not change
1297 : * from scan to scan within a query, and so we'd just re-mark the same way
1298 : * anyway on a rescan. Something to keep an eye on though.
1299 : */
1300 : static void
1301 667678 : _bt_mark_scankey_required(ScanKey skey)
1302 : {
1303 : int addflags;
1304 :
1305 667678 : switch (skey->sk_strategy)
1306 : {
1307 : case BTLessStrategyNumber:
1308 : case BTLessEqualStrategyNumber:
1309 773 : addflags = SK_BT_REQFWD;
1310 773 : break;
1311 : case BTEqualStrategyNumber:
1312 640173 : addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
1313 640173 : break;
1314 : case BTGreaterEqualStrategyNumber:
1315 : case BTGreaterStrategyNumber:
1316 26732 : addflags = SK_BT_REQBKWD;
1317 26732 : break;
1318 : default:
1319 0 : elog(ERROR, "unrecognized StrategyNumber: %d",
1320 : (int) skey->sk_strategy);
1321 : addflags = 0; /* keep compiler quiet */
1322 : break;
1323 : }
1324 :
1325 667678 : skey->sk_flags |= addflags;
1326 :
1327 667678 : if (skey->sk_flags & SK_ROW_HEADER)
1328 : {
1329 2 : ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
1330 :
1331 : /* First subkey should be same column/operator as the header */
1332 2 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1333 2 : Assert(subkey->sk_attno == skey->sk_attno);
1334 2 : Assert(subkey->sk_strategy == skey->sk_strategy);
1335 2 : subkey->sk_flags |= addflags;
1336 : }
1337 667678 : }
1338 :
1339 : /*
1340 : * Test whether an indextuple satisfies all the scankey conditions.
1341 : *
1342 : * If so, return the address of the index tuple on the index page.
1343 : * If not, return NULL.
1344 : *
1345 : * If the tuple fails to pass the qual, we also determine whether there's
1346 : * any need to continue the scan beyond this tuple, and set *continuescan
1347 : * accordingly. See comments for _bt_preprocess_keys(), above, about how
1348 : * this is done.
1349 : *
1350 : * scan: index scan descriptor (containing a search-type scankey)
1351 : * page: buffer page containing index tuple
1352 : * offnum: offset number of index tuple (must be a valid item!)
1353 : * dir: direction we are scanning in
1354 : * continuescan: output parameter (will be set correctly in all cases)
1355 : *
1356 : * Caller must hold pin and lock on the index page.
1357 : */
1358 : IndexTuple
1359 2445616 : _bt_checkkeys(IndexScanDesc scan,
1360 : Page page, OffsetNumber offnum,
1361 : ScanDirection dir, bool *continuescan)
1362 : {
1363 2445616 : ItemId iid = PageGetItemId(page, offnum);
1364 : bool tuple_alive;
1365 : IndexTuple tuple;
1366 : TupleDesc tupdesc;
1367 : BTScanOpaque so;
1368 : int keysz;
1369 : int ikey;
1370 : ScanKey key;
1371 :
1372 2445616 : *continuescan = true; /* default assumption */
1373 :
1374 : /*
1375 : * If the scan specifies not to return killed tuples, then we treat a
1376 : * killed tuple as not passing the qual. Most of the time, it's a win to
1377 : * not bother examining the tuple's index keys, but just return
1378 : * immediately with continuescan = true to proceed to the next tuple.
1379 : * However, if this is the last tuple on the page, we should check the
1380 : * index keys to prevent uselessly advancing to the next page.
1381 : */
1382 2445616 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1383 : {
1384 : /* return immediately if there are more tuples on the page */
1385 62556 : if (ScanDirectionIsForward(dir))
1386 : {
1387 60227 : if (offnum < PageGetMaxOffsetNumber(page))
1388 59846 : return NULL;
1389 : }
1390 : else
1391 : {
1392 2329 : BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1393 :
1394 2329 : if (offnum > P_FIRSTDATAKEY(opaque))
1395 2315 : return NULL;
1396 : }
1397 :
1398 : /*
1399 : * OK, we want to check the keys so we can set continuescan correctly,
1400 : * but we'll return NULL even if the tuple passes the key tests.
1401 : */
1402 395 : tuple_alive = false;
1403 : }
1404 : else
1405 2383060 : tuple_alive = true;
1406 :
1407 2383455 : tuple = (IndexTuple) PageGetItem(page, iid);
1408 :
1409 2383455 : tupdesc = RelationGetDescr(scan->indexRelation);
1410 2383455 : so = (BTScanOpaque) scan->opaque;
1411 2383455 : keysz = so->numberOfKeys;
1412 :
1413 5082982 : for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
1414 : {
1415 : Datum datum;
1416 : bool isNull;
1417 : Datum test;
1418 :
1419 : /* row-comparison keys need special processing */
1420 3025031 : if (key->sk_flags & SK_ROW_HEADER)
1421 : {
1422 1027 : if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
1423 163694 : continue;
1424 326504 : return NULL;
1425 : }
1426 :
1427 3024004 : datum = index_getattr(tuple,
1428 : key->sk_attno,
1429 : tupdesc,
1430 : &isNull);
1431 :
1432 3024004 : if (key->sk_flags & SK_ISNULL)
1433 : {
1434 : /* Handle IS NULL/NOT NULL tests */
1435 171652 : if (key->sk_flags & SK_SEARCHNULL)
1436 : {
1437 8022 : if (isNull)
1438 20 : continue; /* tuple satisfies this qual */
1439 : }
1440 : else
1441 : {
1442 163630 : Assert(key->sk_flags & SK_SEARCHNOTNULL);
1443 163630 : if (!isNull)
1444 163620 : continue; /* tuple satisfies this qual */
1445 : }
1446 :
1447 : /*
1448 : * Tuple fails this qual. If it's a required qual for the current
1449 : * scan direction, then we can conclude no further tuples will
1450 : * pass, either.
1451 : */
1452 8012 : if ((key->sk_flags & SK_BT_REQFWD) &&
1453 : ScanDirectionIsForward(dir))
1454 4 : *continuescan = false;
1455 8008 : else if ((key->sk_flags & SK_BT_REQBKWD) &&
1456 : ScanDirectionIsBackward(dir))
1457 0 : *continuescan = false;
1458 :
1459 : /*
1460 : * In any case, this indextuple doesn't match the qual.
1461 : */
1462 8012 : return NULL;
1463 : }
1464 :
1465 2852352 : if (isNull)
1466 : {
1467 20 : if (key->sk_flags & SK_BT_NULLS_FIRST)
1468 : {
1469 : /*
1470 : * Since NULLs are sorted before non-NULLs, we know we have
1471 : * reached the lower limit of the range of values for this
1472 : * index attr. On a backward scan, we can stop if this qual
1473 : * is one of the "must match" subset. We can stop regardless
1474 : * of whether the qual is > or <, so long as it's required,
1475 : * because it's not possible for any future tuples to pass. On
1476 : * a forward scan, however, we must keep going, because we may
1477 : * have initially positioned to the start of the index.
1478 : */
1479 0 : if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1480 : ScanDirectionIsBackward(dir))
1481 0 : *continuescan = false;
1482 : }
1483 : else
1484 : {
1485 : /*
1486 : * Since NULLs are sorted after non-NULLs, we know we have
1487 : * reached the upper limit of the range of values for this
1488 : * index attr. On a forward scan, we can stop if this qual is
1489 : * one of the "must match" subset. We can stop regardless of
1490 : * whether the qual is > or <, so long as it's required,
1491 : * because it's not possible for any future tuples to pass. On
1492 : * a backward scan, however, we must keep going, because we
1493 : * may have initially positioned to the end of the index.
1494 : */
1495 20 : if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1496 : ScanDirectionIsForward(dir))
1497 12 : *continuescan = false;
1498 : }
1499 :
1500 : /*
1501 : * In any case, this indextuple doesn't match the qual.
1502 : */
1503 20 : return NULL;
1504 : }
1505 :
1506 2852332 : test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
1507 : datum, key->sk_argument);
1508 :
1509 2852332 : if (!DatumGetBool(test))
1510 : {
1511 : /*
1512 : * Tuple fails this qual. If it's a required qual for the current
1513 : * scan direction, then we can conclude no further tuples will
1514 : * pass, either.
1515 : *
1516 : * Note: because we stop the scan as soon as any required equality
1517 : * qual fails, it is critical that equality quals be used for the
1518 : * initial positioning in _bt_first() when they are available. See
1519 : * comments in _bt_first().
1520 : */
1521 316472 : if ((key->sk_flags & SK_BT_REQFWD) &&
1522 : ScanDirectionIsForward(dir))
1523 302830 : *continuescan = false;
1524 13642 : else if ((key->sk_flags & SK_BT_REQBKWD) &&
1525 : ScanDirectionIsBackward(dir))
1526 13 : *continuescan = false;
1527 :
1528 : /*
1529 : * In any case, this indextuple doesn't match the qual.
1530 : */
1531 316472 : return NULL;
1532 : }
1533 : }
1534 :
1535 : /* Check for failure due to it being a killed tuple. */
1536 2057951 : if (!tuple_alive)
1537 285 : return NULL;
1538 :
1539 : /* If we get here, the tuple passes all index quals. */
1540 2057666 : return tuple;
1541 : }
1542 :
1543 : /*
1544 : * Test whether an indextuple satisfies a row-comparison scan condition.
1545 : *
1546 : * Return true if so, false if not. If not, also clear *continuescan if
1547 : * it's not possible for any future tuples in the current scan direction
1548 : * to pass the qual.
1549 : *
1550 : * This is a subroutine for _bt_checkkeys, which see for more info.
1551 : */
1552 : static bool
1553 1027 : _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
1554 : ScanDirection dir, bool *continuescan)
1555 : {
1556 1027 : ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
1557 1027 : int32 cmpresult = 0;
1558 : bool result;
1559 :
1560 : /* First subkey should be same as the header says */
1561 1027 : Assert(subkey->sk_attno == skey->sk_attno);
1562 :
1563 : /* Loop over columns of the row condition */
1564 : for (;;)
1565 : {
1566 : Datum datum;
1567 : bool isNull;
1568 :
1569 2033 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1570 :
1571 2033 : datum = index_getattr(tuple,
1572 : subkey->sk_attno,
1573 : tupdesc,
1574 : &isNull);
1575 :
1576 2033 : if (isNull)
1577 : {
1578 1000 : if (subkey->sk_flags & SK_BT_NULLS_FIRST)
1579 : {
1580 : /*
1581 : * Since NULLs are sorted before non-NULLs, we know we have
1582 : * reached the lower limit of the range of values for this
1583 : * index attr. On a backward scan, we can stop if this qual
1584 : * is one of the "must match" subset. We can stop regardless
1585 : * of whether the qual is > or <, so long as it's required,
1586 : * because it's not possible for any future tuples to pass. On
1587 : * a forward scan, however, we must keep going, because we may
1588 : * have initially positioned to the start of the index.
1589 : */
1590 0 : if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1591 : ScanDirectionIsBackward(dir))
1592 0 : *continuescan = false;
1593 : }
1594 : else
1595 : {
1596 : /*
1597 : * Since NULLs are sorted after non-NULLs, we know we have
1598 : * reached the upper limit of the range of values for this
1599 : * index attr. On a forward scan, we can stop if this qual is
1600 : * one of the "must match" subset. We can stop regardless of
1601 : * whether the qual is > or <, so long as it's required,
1602 : * because it's not possible for any future tuples to pass. On
1603 : * a backward scan, however, we must keep going, because we
1604 : * may have initially positioned to the end of the index.
1605 : */
1606 1000 : if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1607 : ScanDirectionIsForward(dir))
1608 0 : *continuescan = false;
1609 : }
1610 :
1611 : /*
1612 : * In any case, this indextuple doesn't match the qual.
1613 : */
1614 2000 : return false;
1615 : }
1616 :
1617 1033 : if (subkey->sk_flags & SK_ISNULL)
1618 : {
1619 : /*
1620 : * Unlike the simple-scankey case, this isn't a disallowed case.
1621 : * But it can never match. If all the earlier row comparison
1622 : * columns are required for the scan direction, we can stop the
1623 : * scan, because there can't be another tuple that will succeed.
1624 : */
1625 0 : if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
1626 0 : subkey--;
1627 0 : if ((subkey->sk_flags & SK_BT_REQFWD) &&
1628 : ScanDirectionIsForward(dir))
1629 0 : *continuescan = false;
1630 0 : else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1631 : ScanDirectionIsBackward(dir))
1632 0 : *continuescan = false;
1633 0 : return false;
1634 : }
1635 :
1636 : /* Perform the test --- three-way comparison not bool operator */
1637 1033 : cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
1638 : subkey->sk_collation,
1639 : datum,
1640 : subkey->sk_argument));
1641 :
1642 1033 : if (subkey->sk_flags & SK_BT_DESC)
1643 0 : cmpresult = -cmpresult;
1644 :
1645 : /* Done comparing if unequal, else advance to next column */
1646 1033 : if (cmpresult != 0)
1647 54 : break;
1648 :
1649 1006 : if (subkey->sk_flags & SK_ROW_END)
1650 0 : break;
1651 1006 : subkey++;
1652 1006 : }
1653 :
1654 : /*
1655 : * At this point cmpresult indicates the overall result of the row
1656 : * comparison, and subkey points to the deciding column (or the last
1657 : * column if the result is "=").
1658 : */
1659 27 : switch (subkey->sk_strategy)
1660 : {
1661 : /* EQ and NE cases aren't allowed here */
1662 : case BTLessStrategyNumber:
1663 0 : result = (cmpresult < 0);
1664 0 : break;
1665 : case BTLessEqualStrategyNumber:
1666 0 : result = (cmpresult <= 0);
1667 0 : break;
1668 : case BTGreaterEqualStrategyNumber:
1669 25 : result = (cmpresult >= 0);
1670 25 : break;
1671 : case BTGreaterStrategyNumber:
1672 2 : result = (cmpresult > 0);
1673 2 : break;
1674 : default:
1675 0 : elog(ERROR, "unrecognized RowCompareType: %d",
1676 : (int) subkey->sk_strategy);
1677 : result = 0; /* keep compiler quiet */
1678 : break;
1679 : }
1680 :
1681 27 : if (!result)
1682 : {
1683 : /*
1684 : * Tuple fails this qual. If it's a required qual for the current
1685 : * scan direction, then we can conclude no further tuples will pass,
1686 : * either. Note we have to look at the deciding column, not
1687 : * necessarily the first or last column of the row condition.
1688 : */
1689 0 : if ((subkey->sk_flags & SK_BT_REQFWD) &&
1690 : ScanDirectionIsForward(dir))
1691 0 : *continuescan = false;
1692 0 : else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1693 : ScanDirectionIsBackward(dir))
1694 0 : *continuescan = false;
1695 : }
1696 :
1697 27 : return result;
1698 : }
1699 :
1700 : /*
1701 : * _bt_killitems - set LP_DEAD state for items an indexscan caller has
1702 : * told us were killed
1703 : *
1704 : * scan->opaque, referenced locally through so, contains information about the
1705 : * current page and killed tuples thereon (generally, this should only be
1706 : * called if so->numKilled > 0).
1707 : *
1708 : * The caller does not have a lock on the page and may or may not have the
1709 : * page pinned in a buffer. Note that read-lock is sufficient for setting
1710 : * LP_DEAD status (which is only a hint).
1711 : *
1712 : * We match items by heap TID before assuming they are the right ones to
1713 : * delete. We cope with cases where items have moved right due to insertions.
1714 : * If an item has moved off the current page due to a split, we'll fail to
1715 : * find it and do nothing (this is not an error case --- we assume the item
1716 : * will eventually get marked in a future indexscan).
1717 : *
1718 : * Note that if we hold a pin on the target page continuously from initially
1719 : * reading the items until applying this function, VACUUM cannot have deleted
1720 : * any items from the page, and so there is no need to search left from the
1721 : * recorded offset. (This observation also guarantees that the item is still
1722 : * the right one to delete, which might otherwise be questionable since heap
1723 : * TIDs can get recycled.) This holds true even if the page has been modified
1724 : * by inserts and page splits, so there is no need to consult the LSN.
1725 : *
1726 : * If the pin was released after reading the page, then we re-read it. If it
1727 : * has been modified since we read it (as determined by the LSN), we dare not
1728 : * flag any entries because it is possible that the old entry was vacuumed
1729 : * away and the TID was re-used by a completely different heap tuple.
1730 : */
1731 : void
1732 2638 : _bt_killitems(IndexScanDesc scan)
1733 : {
1734 2638 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1735 : Page page;
1736 : BTPageOpaque opaque;
1737 : OffsetNumber minoff;
1738 : OffsetNumber maxoff;
1739 : int i;
1740 2638 : int numKilled = so->numKilled;
1741 2638 : bool killedsomething = false;
1742 :
1743 2638 : Assert(BTScanPosIsValid(so->currPos));
1744 :
1745 : /*
1746 : * Always reset the scan state, so we don't look for same items on other
1747 : * pages.
1748 : */
1749 2638 : so->numKilled = 0;
1750 :
1751 2638 : if (BTScanPosIsPinned(so->currPos))
1752 : {
1753 : /*
1754 : * We have held the pin on this page since we read the index tuples,
1755 : * so all we need to do is lock it. The pin will have prevented
1756 : * re-use of any TID on the page, so there is no need to check the
1757 : * LSN.
1758 : */
1759 28 : LockBuffer(so->currPos.buf, BT_READ);
1760 :
1761 28 : page = BufferGetPage(so->currPos.buf);
1762 : }
1763 : else
1764 : {
1765 : Buffer buf;
1766 :
1767 : /* Attempt to re-read the buffer, getting pin and lock. */
1768 2610 : buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
1769 :
1770 : /* It might not exist anymore; in which case we can't hint it. */
1771 2610 : if (!BufferIsValid(buf))
1772 0 : return;
1773 :
1774 2610 : page = BufferGetPage(buf);
1775 2610 : if (PageGetLSN(page) == so->currPos.lsn)
1776 2601 : so->currPos.buf = buf;
1777 : else
1778 : {
1779 : /* Modified while not pinned means hinting is not safe. */
1780 9 : _bt_relbuf(scan->indexRelation, buf);
1781 9 : return;
1782 : }
1783 : }
1784 :
1785 2629 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1786 2629 : minoff = P_FIRSTDATAKEY(opaque);
1787 2629 : maxoff = PageGetMaxOffsetNumber(page);
1788 :
1789 8010 : for (i = 0; i < numKilled; i++)
1790 : {
1791 5381 : int itemIndex = so->killedItems[i];
1792 5381 : BTScanPosItem *kitem = &so->currPos.items[itemIndex];
1793 5381 : OffsetNumber offnum = kitem->indexOffset;
1794 :
1795 5381 : Assert(itemIndex >= so->currPos.firstItem &&
1796 : itemIndex <= so->currPos.lastItem);
1797 5381 : if (offnum < minoff)
1798 0 : continue; /* pure paranoia */
1799 10762 : while (offnum <= maxoff)
1800 : {
1801 5381 : ItemId iid = PageGetItemId(page, offnum);
1802 5381 : IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
1803 :
1804 5381 : if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
1805 : {
1806 : /* found the item */
1807 5381 : ItemIdMarkDead(iid);
1808 5381 : killedsomething = true;
1809 5381 : break; /* out of inner search loop */
1810 : }
1811 0 : offnum = OffsetNumberNext(offnum);
1812 : }
1813 : }
1814 :
1815 : /*
1816 : * Since this can be redone later if needed, mark as dirty hint.
1817 : *
1818 : * Whenever we mark anything LP_DEAD, we also set the page's
1819 : * BTP_HAS_GARBAGE flag, which is likewise just a hint.
1820 : */
1821 2629 : if (killedsomething)
1822 : {
1823 2629 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
1824 2629 : MarkBufferDirtyHint(so->currPos.buf, true);
1825 : }
1826 :
1827 2629 : LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1828 : }
1829 :
1830 :
1831 : /*
1832 : * The following routines manage a shared-memory area in which we track
1833 : * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
1834 : * operations. There is a single counter which increments each time we
1835 : * start a vacuum to assign it a cycle ID. Since multiple vacuums could
1836 : * be active concurrently, we have to track the cycle ID for each active
1837 : * vacuum; this requires at most MaxBackends entries (usually far fewer).
1838 : * We assume at most one vacuum can be active for a given index.
1839 : *
1840 : * Access to the shared memory area is controlled by BtreeVacuumLock.
1841 : * In principle we could use a separate lmgr locktag for each index,
1842 : * but a single LWLock is much cheaper, and given the short time that
1843 : * the lock is ever held, the concurrency hit should be minimal.
1844 : */
1845 :
1846 : typedef struct BTOneVacInfo
1847 : {
1848 : LockRelId relid; /* global identifier of an index */
1849 : BTCycleId cycleid; /* cycle ID for its active VACUUM */
1850 : } BTOneVacInfo;
1851 :
1852 : typedef struct BTVacInfo
1853 : {
1854 : BTCycleId cycle_ctr; /* cycle ID most recently assigned */
1855 : int num_vacuums; /* number of currently active VACUUMs */
1856 : int max_vacuums; /* allocated length of vacuums[] array */
1857 : BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
1858 : } BTVacInfo;
1859 :
1860 : static BTVacInfo *btvacinfo;
1861 :
1862 :
1863 : /*
1864 : * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
1865 : * or zero if there is no active VACUUM
1866 : *
1867 : * Note: for correct interlocking, the caller must already hold pin and
1868 : * exclusive lock on each buffer it will store the cycle ID into. This
1869 : * ensures that even if a VACUUM starts immediately afterwards, it cannot
1870 : * process those pages until the page split is complete.
1871 : */
1872 : BTCycleId
1873 869 : _bt_vacuum_cycleid(Relation rel)
1874 : {
1875 869 : BTCycleId result = 0;
1876 : int i;
1877 :
1878 : /* Share lock is enough since this is a read-only operation */
1879 869 : LWLockAcquire(BtreeVacuumLock, LW_SHARED);
1880 :
1881 869 : for (i = 0; i < btvacinfo->num_vacuums; i++)
1882 : {
1883 0 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1884 :
1885 0 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1886 0 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1887 : {
1888 0 : result = vac->cycleid;
1889 0 : break;
1890 : }
1891 : }
1892 :
1893 869 : LWLockRelease(BtreeVacuumLock);
1894 869 : return result;
1895 : }
1896 :
1897 : /*
1898 : * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
1899 : *
1900 : * Note: the caller must guarantee that it will eventually call
1901 : * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
1902 : * that this happens even in elog(FATAL) scenarios, the appropriate coding
1903 : * is not just a PG_TRY, but
1904 : * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
1905 : */
1906 : BTCycleId
1907 123 : _bt_start_vacuum(Relation rel)
1908 : {
1909 : BTCycleId result;
1910 : int i;
1911 : BTOneVacInfo *vac;
1912 :
1913 123 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1914 :
1915 : /*
1916 : * Assign the next cycle ID, being careful to avoid zero as well as the
1917 : * reserved high values.
1918 : */
1919 123 : result = ++(btvacinfo->cycle_ctr);
1920 123 : if (result == 0 || result > MAX_BT_CYCLE_ID)
1921 0 : result = btvacinfo->cycle_ctr = 1;
1922 :
1923 : /* Let's just make sure there's no entry already for this index */
1924 123 : for (i = 0; i < btvacinfo->num_vacuums; i++)
1925 : {
1926 0 : vac = &btvacinfo->vacuums[i];
1927 0 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1928 0 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1929 : {
1930 : /*
1931 : * Unlike most places in the backend, we have to explicitly
1932 : * release our LWLock before throwing an error. This is because
1933 : * we expect _bt_end_vacuum() to be called before transaction
1934 : * abort cleanup can run to release LWLocks.
1935 : */
1936 0 : LWLockRelease(BtreeVacuumLock);
1937 0 : elog(ERROR, "multiple active vacuums for index \"%s\"",
1938 : RelationGetRelationName(rel));
1939 : }
1940 : }
1941 :
1942 : /* OK, add an entry */
1943 123 : if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
1944 : {
1945 0 : LWLockRelease(BtreeVacuumLock);
1946 0 : elog(ERROR, "out of btvacinfo slots");
1947 : }
1948 123 : vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
1949 123 : vac->relid = rel->rd_lockInfo.lockRelId;
1950 123 : vac->cycleid = result;
1951 123 : btvacinfo->num_vacuums++;
1952 :
1953 123 : LWLockRelease(BtreeVacuumLock);
1954 123 : return result;
1955 : }
1956 :
1957 : /*
1958 : * _bt_end_vacuum --- mark a btree VACUUM operation as done
1959 : *
1960 : * Note: this is deliberately coded not to complain if no entry is found;
1961 : * this allows the caller to put PG_TRY around the start_vacuum operation.
1962 : */
1963 : void
1964 123 : _bt_end_vacuum(Relation rel)
1965 : {
1966 : int i;
1967 :
1968 123 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1969 :
1970 : /* Find the array entry */
1971 123 : for (i = 0; i < btvacinfo->num_vacuums; i++)
1972 : {
1973 123 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1974 :
1975 246 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1976 123 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1977 : {
1978 : /* Remove it by shifting down the last entry */
1979 123 : *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
1980 123 : btvacinfo->num_vacuums--;
1981 123 : break;
1982 : }
1983 : }
1984 :
1985 123 : LWLockRelease(BtreeVacuumLock);
1986 123 : }
1987 :
1988 : /*
1989 : * _bt_end_vacuum wrapped as an on_shmem_exit callback function
1990 : */
1991 : void
1992 0 : _bt_end_vacuum_callback(int code, Datum arg)
1993 : {
1994 0 : _bt_end_vacuum((Relation) DatumGetPointer(arg));
1995 0 : }
1996 :
1997 : /*
1998 : * BTreeShmemSize --- report amount of shared memory space needed
1999 : */
2000 : Size
2001 10 : BTreeShmemSize(void)
2002 : {
2003 : Size size;
2004 :
2005 10 : size = offsetof(BTVacInfo, vacuums);
2006 10 : size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
2007 10 : return size;
2008 : }
2009 :
2010 : /*
2011 : * BTreeShmemInit --- initialize this module's shared memory
2012 : */
2013 : void
2014 5 : BTreeShmemInit(void)
2015 : {
2016 : bool found;
2017 :
2018 5 : btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
2019 : BTreeShmemSize(),
2020 : &found);
2021 :
2022 5 : if (!IsUnderPostmaster)
2023 : {
2024 : /* Initialize shared memory area */
2025 5 : Assert(!found);
2026 :
2027 : /*
2028 : * It doesn't really matter what the cycle counter starts at, but
2029 : * having it always start the same doesn't seem good. Seed with
2030 : * low-order bits of time() instead.
2031 : */
2032 5 : btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
2033 :
2034 5 : btvacinfo->num_vacuums = 0;
2035 5 : btvacinfo->max_vacuums = MaxBackends;
2036 : }
2037 : else
2038 0 : Assert(found);
2039 5 : }
2040 :
2041 : bytea *
2042 19 : btoptions(Datum reloptions, bool validate)
2043 : {
2044 19 : return default_reloptions(reloptions, validate, RELOPT_KIND_BTREE);
2045 : }
2046 :
2047 : /*
2048 : * btproperty() -- Check boolean properties of indexes.
2049 : *
2050 : * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
2051 : * to call btcanreturn.
2052 : */
2053 : bool
2054 98 : btproperty(Oid index_oid, int attno,
2055 : IndexAMProperty prop, const char *propname,
2056 : bool *res, bool *isnull)
2057 : {
2058 98 : switch (prop)
2059 : {
2060 : case AMPROP_RETURNABLE:
2061 : /* answer only for columns, not AM or whole index */
2062 4 : if (attno == 0)
2063 2 : return false;
2064 : /* otherwise, btree can always return data */
2065 2 : *res = true;
2066 2 : return true;
2067 :
2068 : default:
2069 94 : return false; /* punt to generic code */
2070 : }
2071 : }
|