Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ginget.c
4 : * fetch tuples from a GIN scan.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gin/ginget.c
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/gin_private.h"
18 : #include "access/relscan.h"
19 : #include "miscadmin.h"
20 : #include "utils/datum.h"
21 : #include "utils/memutils.h"
22 :
23 : /* GUC parameter */
24 : int GinFuzzySearchLimit = 0;
25 :
26 : typedef struct pendingPosition
27 : {
28 : Buffer pendingBuffer;
29 : OffsetNumber firstOffset;
30 : OffsetNumber lastOffset;
31 : ItemPointerData item;
32 : bool *hasMatchKey;
33 : } pendingPosition;
34 :
35 :
36 : /*
37 : * Goes to the next page if current offset is outside of bounds
38 : */
39 : static bool
40 4142 : moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
41 : {
42 4142 : Page page = BufferGetPage(stack->buffer);
43 :
44 4142 : if (stack->off > PageGetMaxOffsetNumber(page))
45 : {
46 : /*
47 : * We scanned the whole page, so we should take right page
48 : */
49 38 : if (GinPageRightMost(page))
50 3 : return false; /* no more pages */
51 :
52 35 : stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
53 35 : stack->blkno = BufferGetBlockNumber(stack->buffer);
54 35 : stack->off = FirstOffsetNumber;
55 : }
56 :
57 4139 : return true;
58 : }
59 :
60 : /*
61 : * Scan all pages of a posting tree and save all its heap ItemPointers
62 : * in scanEntry->matchBitmap
63 : */
64 : static void
65 0 : scanPostingTree(Relation index, GinScanEntry scanEntry,
66 : BlockNumber rootPostingTree, Snapshot snapshot)
67 : {
68 : GinBtreeData btree;
69 : GinBtreeStack *stack;
70 : Buffer buffer;
71 : Page page;
72 :
73 : /* Descend to the leftmost leaf page */
74 0 : stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
75 0 : buffer = stack->buffer;
76 0 : IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
77 :
78 0 : freeGinBtreeStack(stack);
79 :
80 : /*
81 : * Loop iterates through all leaf pages of posting tree
82 : */
83 : for (;;)
84 : {
85 0 : page = BufferGetPage(buffer);
86 0 : if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
87 : {
88 0 : int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
89 :
90 0 : scanEntry->predictNumberResult += n;
91 : }
92 :
93 0 : if (GinPageRightMost(page))
94 0 : break; /* no more pages */
95 :
96 0 : buffer = ginStepRight(buffer, index, GIN_SHARE);
97 0 : }
98 :
99 0 : UnlockReleaseBuffer(buffer);
100 0 : }
101 :
102 : /*
103 : * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
104 : * match the search entry. This supports three different match modes:
105 : *
106 : * 1. Partial-match support: scan from current point until the
107 : * comparePartialFn says we're done.
108 : * 2. SEARCH_MODE_ALL: scan from current point (which should be first
109 : * key for the current attnum) until we hit null items or end of attnum
110 : * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
111 : * key for the current attnum) until we hit end of attnum
112 : *
113 : * Returns true if done, false if it's necessary to restart scan from scratch
114 : */
115 : static bool
116 7 : collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
117 : GinScanEntry scanEntry, Snapshot snapshot)
118 : {
119 : OffsetNumber attnum;
120 : Form_pg_attribute attr;
121 :
122 : /* Initialize empty bitmap result */
123 7 : scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
124 :
125 : /* Null query cannot partial-match anything */
126 9 : if (scanEntry->isPartialMatch &&
127 2 : scanEntry->queryCategory != GIN_CAT_NORM_KEY)
128 0 : return true;
129 :
130 : /* Locate tupdesc entry for key column (for attbyval/attlen data) */
131 7 : attnum = scanEntry->attnum;
132 7 : attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);
133 :
134 : for (;;)
135 : {
136 : Page page;
137 : IndexTuple itup;
138 : Datum idatum;
139 : GinNullCategory icategory;
140 :
141 : /*
142 : * stack->off points to the interested entry, buffer is already locked
143 : */
144 4142 : if (moveRightIfItNeeded(btree, stack) == false)
145 10 : return true;
146 :
147 4139 : page = BufferGetPage(stack->buffer);
148 4139 : TestForOldSnapshot(snapshot, btree->index, page);
149 4139 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
150 :
151 : /*
152 : * If tuple stores another attribute then stop scan
153 : */
154 4139 : if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
155 0 : return true;
156 :
157 : /* Safe to fetch attribute value */
158 4139 : idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
159 :
160 : /*
161 : * Check for appropriate scan stop conditions
162 : */
163 4139 : if (scanEntry->isPartialMatch)
164 : {
165 : int32 cmp;
166 :
167 : /*
168 : * In partial match, stop scan at any null (including
169 : * placeholders); partial matches never match nulls
170 : */
171 74 : if (icategory != GIN_CAT_NORM_KEY)
172 0 : return true;
173 :
174 : /*----------
175 : * Check of partial match.
176 : * case cmp == 0 => match
177 : * case cmp > 0 => not match and finish scan
178 : * case cmp < 0 => not match and continue scan
179 : *----------
180 : */
181 74 : cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
182 : btree->ginstate->supportCollation[attnum - 1],
183 : scanEntry->queryKey,
184 : idatum,
185 : UInt16GetDatum(scanEntry->strategy),
186 : PointerGetDatum(scanEntry->extra_data)));
187 :
188 74 : if (cmp > 0)
189 2 : return true;
190 72 : else if (cmp < 0)
191 : {
192 0 : stack->off++;
193 0 : continue;
194 : }
195 : }
196 4065 : else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
197 : {
198 : /*
199 : * In ALL mode, we are not interested in null items, so we can
200 : * stop if we get to a null-item placeholder (which will be the
201 : * last entry for a given attnum). We do want to include NULL_KEY
202 : * and EMPTY_ITEM entries, though.
203 : */
204 4065 : if (icategory == GIN_CAT_NULL_ITEM)
205 2 : return true;
206 : }
207 :
208 : /*
209 : * OK, we want to return the TIDs listed in this entry.
210 : */
211 4135 : if (GinIsPostingTree(itup))
212 : {
213 0 : BlockNumber rootPostingTree = GinGetPostingTree(itup);
214 :
215 : /*
216 : * We should unlock current page (but not unpin) during tree scan
217 : * to prevent deadlock with vacuum processes.
218 : *
219 : * We save current entry value (idatum) to be able to re-find our
220 : * tuple after re-locking
221 : */
222 0 : if (icategory == GIN_CAT_NORM_KEY)
223 0 : idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
224 :
225 0 : LockBuffer(stack->buffer, GIN_UNLOCK);
226 :
227 : /* Collect all the TIDs in this entry's posting tree */
228 0 : scanPostingTree(btree->index, scanEntry, rootPostingTree,
229 : snapshot);
230 :
231 : /*
232 : * We lock again the entry page and while it was unlocked insert
233 : * might have occurred, so we need to re-find our position.
234 : */
235 0 : LockBuffer(stack->buffer, GIN_SHARE);
236 0 : page = BufferGetPage(stack->buffer);
237 0 : if (!GinPageIsLeaf(page))
238 : {
239 : /*
240 : * Root page becomes non-leaf while we unlock it. We will
241 : * start again, this situation doesn't occur often - root can
242 : * became a non-leaf only once per life of index.
243 : */
244 0 : return false;
245 : }
246 :
247 : /* Search forward to re-find idatum */
248 : for (;;)
249 : {
250 : Datum newDatum;
251 : GinNullCategory newCategory;
252 :
253 0 : if (moveRightIfItNeeded(btree, stack) == false)
254 0 : elog(ERROR, "lost saved point in index"); /* must not happen !!! */
255 :
256 0 : page = BufferGetPage(stack->buffer);
257 0 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
258 :
259 0 : if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
260 0 : elog(ERROR, "lost saved point in index"); /* must not happen !!! */
261 0 : newDatum = gintuple_get_key(btree->ginstate, itup,
262 : &newCategory);
263 :
264 0 : if (ginCompareEntries(btree->ginstate, attnum,
265 : newDatum, newCategory,
266 : idatum, icategory) == 0)
267 0 : break; /* Found! */
268 :
269 0 : stack->off++;
270 0 : }
271 :
272 0 : if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
273 0 : pfree(DatumGetPointer(idatum));
274 : }
275 : else
276 : {
277 : ItemPointer ipd;
278 : int nipd;
279 :
280 4135 : ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
281 4135 : tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
282 4135 : scanEntry->predictNumberResult += GinGetNPosting(itup);
283 4135 : pfree(ipd);
284 : }
285 :
286 : /*
287 : * Done with this entry, go to the next
288 : */
289 4135 : stack->off++;
290 4135 : }
291 : }
292 :
293 : /*
294 : * Start* functions setup beginning state of searches: finds correct buffer and pins it.
295 : */
296 : static void
297 119 : startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
298 : {
299 : GinBtreeData btreeEntry;
300 : GinBtreeStack *stackEntry;
301 : Page page;
302 : bool needUnlock;
303 :
304 : restartScanEntry:
305 119 : entry->buffer = InvalidBuffer;
306 119 : ItemPointerSetMin(&entry->curItem);
307 119 : entry->offset = InvalidOffsetNumber;
308 119 : if (entry->list)
309 0 : pfree(entry->list);
310 119 : entry->list = NULL;
311 119 : entry->nlist = 0;
312 119 : entry->matchBitmap = NULL;
313 119 : entry->matchResult = NULL;
314 119 : entry->reduceResult = FALSE;
315 119 : entry->predictNumberResult = 0;
316 :
317 : /*
318 : * we should find entry, and begin scan of posting tree or just store
319 : * posting list in memory
320 : */
321 119 : ginPrepareEntryScan(&btreeEntry, entry->attnum,
322 119 : entry->queryKey, entry->queryCategory,
323 : ginstate);
324 119 : stackEntry = ginFindLeafPage(&btreeEntry, true, snapshot);
325 119 : page = BufferGetPage(stackEntry->buffer);
326 : /* ginFindLeafPage() will have already checked snapshot age. */
327 119 : needUnlock = TRUE;
328 :
329 119 : entry->isFinished = TRUE;
330 :
331 236 : if (entry->isPartialMatch ||
332 117 : entry->queryCategory == GIN_CAT_EMPTY_QUERY)
333 : {
334 : /*
335 : * btreeEntry.findItem locates the first item >= given search key.
336 : * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
337 : * because of the way the GIN_CAT_EMPTY_QUERY category code is
338 : * assigned.) We scan forward from there and collect all TIDs needed
339 : * for the entry type.
340 : */
341 7 : btreeEntry.findItem(&btreeEntry, stackEntry);
342 7 : if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
343 : == false)
344 : {
345 : /*
346 : * GIN tree was seriously restructured, so we will cleanup all
347 : * found data and rescan. See comments near 'return false' in
348 : * collectMatchBitmap()
349 : */
350 0 : if (entry->matchBitmap)
351 : {
352 0 : if (entry->matchIterator)
353 0 : tbm_end_iterate(entry->matchIterator);
354 0 : entry->matchIterator = NULL;
355 0 : tbm_free(entry->matchBitmap);
356 0 : entry->matchBitmap = NULL;
357 : }
358 0 : LockBuffer(stackEntry->buffer, GIN_UNLOCK);
359 0 : freeGinBtreeStack(stackEntry);
360 0 : goto restartScanEntry;
361 : }
362 :
363 14 : if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
364 : {
365 7 : entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
366 7 : entry->isFinished = FALSE;
367 : }
368 : }
369 112 : else if (btreeEntry.findItem(&btreeEntry, stackEntry))
370 : {
371 101 : IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
372 :
373 101 : if (GinIsPostingTree(itup))
374 : {
375 2 : BlockNumber rootPostingTree = GinGetPostingTree(itup);
376 : GinBtreeStack *stack;
377 : Page page;
378 : ItemPointerData minItem;
379 :
380 : /*
381 : * We should unlock entry page before touching posting tree to
382 : * prevent deadlocks with vacuum processes. Because entry is never
383 : * deleted from page and posting tree is never reduced to the
384 : * posting list, we can unlock page after getting BlockNumber of
385 : * root of posting tree.
386 : */
387 2 : LockBuffer(stackEntry->buffer, GIN_UNLOCK);
388 2 : needUnlock = FALSE;
389 :
390 2 : stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
391 : rootPostingTree, snapshot);
392 2 : entry->buffer = stack->buffer;
393 :
394 : /*
395 : * We keep buffer pinned because we need to prevent deletion of
396 : * page during scan. See GIN's vacuum implementation. RefCount is
397 : * increased to keep buffer pinned after freeGinBtreeStack() call.
398 : */
399 2 : IncrBufferRefCount(entry->buffer);
400 :
401 2 : page = BufferGetPage(entry->buffer);
402 :
403 : /*
404 : * Load the first page into memory.
405 : */
406 2 : ItemPointerSetMin(&minItem);
407 2 : entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
408 :
409 2 : entry->predictNumberResult = stack->predictNumber * entry->nlist;
410 :
411 2 : LockBuffer(entry->buffer, GIN_UNLOCK);
412 2 : freeGinBtreeStack(stack);
413 2 : entry->isFinished = FALSE;
414 : }
415 99 : else if (GinGetNPosting(itup) > 0)
416 : {
417 99 : entry->list = ginReadTuple(ginstate, entry->attnum, itup,
418 : &entry->nlist);
419 99 : entry->predictNumberResult = entry->nlist;
420 :
421 99 : entry->isFinished = FALSE;
422 : }
423 : }
424 :
425 119 : if (needUnlock)
426 117 : LockBuffer(stackEntry->buffer, GIN_UNLOCK);
427 119 : freeGinBtreeStack(stackEntry);
428 119 : }
429 :
430 : /*
431 : * Comparison function for scan entry indexes. Sorts by predictNumberResult,
432 : * least frequent items first.
433 : */
434 : static int
435 64 : entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
436 : {
437 64 : const GinScanKey key = (const GinScanKey) arg;
438 64 : int i1 = *(const int *) a1;
439 64 : int i2 = *(const int *) a2;
440 64 : uint32 n1 = key->scanEntry[i1]->predictNumberResult;
441 64 : uint32 n2 = key->scanEntry[i2]->predictNumberResult;
442 :
443 64 : if (n1 < n2)
444 17 : return -1;
445 47 : else if (n1 == n2)
446 10 : return 0;
447 : else
448 37 : return 1;
449 : }
450 :
451 : static void
452 74 : startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
453 : {
454 74 : MemoryContext oldCtx = CurrentMemoryContext;
455 : int i;
456 : int j;
457 : int *entryIndexes;
458 :
459 74 : ItemPointerSetMin(&key->curItem);
460 74 : key->curItemMatches = false;
461 74 : key->recheckCurItem = false;
462 74 : key->isFinished = false;
463 :
464 : /*
465 : * Divide the entries into two distinct sets: required and additional.
466 : * Additional entries are not enough for a match alone, without any items
467 : * from the required set, but are needed by the consistent function to
468 : * decide if an item matches. When scanning, we can skip over items from
469 : * additional entries that have no corresponding matches in any of the
470 : * required entries. That speeds up queries like "frequent & rare"
471 : * considerably, if the frequent term can be put in the additional set.
472 : *
473 : * There can be many legal ways to divide them entries into these two
474 : * sets. A conservative division is to just put everything in the required
475 : * set, but the more you can put in the additional set, the more you can
476 : * skip during the scan. To maximize skipping, we try to put as many
477 : * frequent items as possible into additional, and less frequent ones into
478 : * required. To do that, sort the entries by frequency
479 : * (predictNumberResult), and put entries into the required set in that
480 : * order, until the consistent function says that none of the remaining
481 : * entries can form a match, without any items from the required set. The
482 : * rest go to the additional set.
483 : */
484 74 : if (key->nentries > 1)
485 : {
486 31 : MemoryContextSwitchTo(so->tempCtx);
487 :
488 31 : entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
489 107 : for (i = 0; i < key->nentries; i++)
490 76 : entryIndexes[i] = i;
491 31 : qsort_arg(entryIndexes, key->nentries, sizeof(int),
492 : entryIndexByFrequencyCmp, key);
493 :
494 51 : for (i = 0; i < key->nentries - 1; i++)
495 : {
496 : /* Pass all entries <= i as FALSE, and the rest as MAYBE */
497 109 : for (j = 0; j <= i; j++)
498 67 : key->entryRes[entryIndexes[j]] = GIN_FALSE;
499 113 : for (j = i + 1; j < key->nentries; j++)
500 71 : key->entryRes[entryIndexes[j]] = GIN_MAYBE;
501 :
502 42 : if (key->triConsistentFn(key) == GIN_FALSE)
503 22 : break;
504 : }
505 : /* i is now the last required entry. */
506 :
507 31 : MemoryContextSwitchTo(so->keyCtx);
508 :
509 31 : key->nrequired = i + 1;
510 31 : key->nadditional = key->nentries - key->nrequired;
511 31 : key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
512 31 : key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
513 :
514 31 : j = 0;
515 82 : for (i = 0; i < key->nrequired; i++)
516 51 : key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
517 56 : for (i = 0; i < key->nadditional; i++)
518 25 : key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
519 :
520 : /* clean up after consistentFn calls (also frees entryIndexes) */
521 31 : MemoryContextReset(so->tempCtx);
522 : }
523 : else
524 : {
525 43 : MemoryContextSwitchTo(so->keyCtx);
526 :
527 43 : key->nrequired = 1;
528 43 : key->nadditional = 0;
529 43 : key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
530 43 : key->requiredEntries[0] = key->scanEntry[0];
531 : }
532 74 : MemoryContextSwitchTo(oldCtx);
533 74 : }
534 :
535 : static void
536 72 : startScan(IndexScanDesc scan)
537 : {
538 72 : GinScanOpaque so = (GinScanOpaque) scan->opaque;
539 72 : GinState *ginstate = &so->ginstate;
540 : uint32 i;
541 :
542 191 : for (i = 0; i < so->totalentries; i++)
543 119 : startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
544 :
545 72 : if (GinFuzzySearchLimit > 0)
546 : {
547 : /*
548 : * If all of keys more than threshold we will try to reduce result, we
549 : * hope (and only hope, for intersection operation of array our
550 : * supposition isn't true), that total result will not more than
551 : * minimal predictNumberResult.
552 : */
553 0 : bool reduce = true;
554 :
555 0 : for (i = 0; i < so->totalentries; i++)
556 : {
557 0 : if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
558 : {
559 0 : reduce = false;
560 0 : break;
561 : }
562 : }
563 0 : if (reduce)
564 : {
565 0 : for (i = 0; i < so->totalentries; i++)
566 : {
567 0 : so->entries[i]->predictNumberResult /= so->totalentries;
568 0 : so->entries[i]->reduceResult = TRUE;
569 : }
570 : }
571 : }
572 :
573 : /*
574 : * Now that we have the estimates for the entry frequencies, finish
575 : * initializing the scan keys.
576 : */
577 146 : for (i = 0; i < so->nkeys; i++)
578 74 : startScanKey(ginstate, so, so->keys + i);
579 72 : }
580 :
581 : /*
582 : * Load the next batch of item pointers from a posting tree.
583 : *
584 : * Note that we copy the page into GinScanEntry->list array and unlock it, but
585 : * keep it pinned to prevent interference with vacuum.
586 : */
587 : static void
588 4 : entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
589 : ItemPointerData advancePast, Snapshot snapshot)
590 : {
591 : Page page;
592 : int i;
593 : bool stepright;
594 :
595 4 : if (!BufferIsValid(entry->buffer))
596 : {
597 0 : entry->isFinished = true;
598 0 : return;
599 : }
600 :
601 : /*
602 : * We have two strategies for finding the correct page: step right from
603 : * the current page, or descend the tree again from the root. If
604 : * advancePast equals the current item, the next matching item should be
605 : * on the next page, so we step right. Otherwise, descend from root.
606 : */
607 4 : if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
608 : {
609 4 : stepright = true;
610 4 : LockBuffer(entry->buffer, GIN_SHARE);
611 : }
612 : else
613 : {
614 : GinBtreeStack *stack;
615 :
616 0 : ReleaseBuffer(entry->buffer);
617 :
618 : /*
619 : * Set the search key, and find the correct leaf page.
620 : */
621 0 : if (ItemPointerIsLossyPage(&advancePast))
622 : {
623 0 : ItemPointerSet(&entry->btree.itemptr,
624 : GinItemPointerGetBlockNumber(&advancePast) + 1,
625 : FirstOffsetNumber);
626 : }
627 : else
628 : {
629 0 : ItemPointerSet(&entry->btree.itemptr,
630 : GinItemPointerGetBlockNumber(&advancePast),
631 : OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
632 : }
633 0 : entry->btree.fullScan = false;
634 0 : stack = ginFindLeafPage(&entry->btree, true, snapshot);
635 :
636 : /* we don't need the stack, just the buffer. */
637 0 : entry->buffer = stack->buffer;
638 0 : IncrBufferRefCount(entry->buffer);
639 0 : freeGinBtreeStack(stack);
640 0 : stepright = false;
641 : }
642 :
643 4 : elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
644 : GinItemPointerGetBlockNumber(&advancePast),
645 : GinItemPointerGetOffsetNumber(&advancePast),
646 : !stepright);
647 :
648 4 : page = BufferGetPage(entry->buffer);
649 : for (;;)
650 : {
651 4 : entry->offset = InvalidOffsetNumber;
652 4 : if (entry->list)
653 : {
654 4 : pfree(entry->list);
655 4 : entry->list = NULL;
656 4 : entry->nlist = 0;
657 : }
658 :
659 4 : if (stepright)
660 : {
661 : /*
662 : * We've processed all the entries on this page. If it was the
663 : * last page in the tree, we're done.
664 : */
665 4 : if (GinPageRightMost(page))
666 : {
667 0 : UnlockReleaseBuffer(entry->buffer);
668 0 : entry->buffer = InvalidBuffer;
669 0 : entry->isFinished = TRUE;
670 0 : return;
671 : }
672 :
673 : /*
674 : * Step to next page, following the right link. then find the
675 : * first ItemPointer greater than advancePast.
676 : */
677 4 : entry->buffer = ginStepRight(entry->buffer,
678 : ginstate->index,
679 : GIN_SHARE);
680 4 : page = BufferGetPage(entry->buffer);
681 : }
682 4 : stepright = true;
683 :
684 4 : if (GinPageGetOpaque(page)->flags & GIN_DELETED)
685 0 : continue; /* page was deleted by concurrent vacuum */
686 :
687 : /*
688 : * The first item > advancePast might not be on this page, but
689 : * somewhere to the right, if the page was split, or a non-match from
690 : * another key in the query allowed us to skip some items from this
691 : * entry. Keep following the right-links until we re-find the correct
692 : * page.
693 : */
694 6 : if (!GinPageRightMost(page) &&
695 2 : ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
696 : {
697 : /*
698 : * the item we're looking is > the right bound of the page, so it
699 : * can't be on this page.
700 : */
701 0 : continue;
702 : }
703 :
704 4 : entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
705 :
706 4 : for (i = 0; i < entry->nlist; i++)
707 : {
708 4 : if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
709 : {
710 4 : entry->offset = i;
711 :
712 4 : if (GinPageRightMost(page))
713 : {
714 : /* after processing the copied items, we're done. */
715 2 : UnlockReleaseBuffer(entry->buffer);
716 2 : entry->buffer = InvalidBuffer;
717 : }
718 : else
719 2 : LockBuffer(entry->buffer, GIN_UNLOCK);
720 4 : return;
721 : }
722 : }
723 0 : }
724 : }
725 :
726 : #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
727 : #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
728 :
729 : /*
730 : * Sets entry->curItem to next heap item pointer > advancePast, for one entry
731 : * of one scan key, or sets entry->isFinished to TRUE if there are no more.
732 : *
733 : * Item pointers are returned in ascending order.
734 : *
735 : * Note: this can return a "lossy page" item pointer, indicating that the
736 : * entry potentially matches all items on that heap page. However, it is
737 : * not allowed to return both a lossy page pointer and exact (regular)
738 : * item pointers for the same page. (Doing so would break the key-combination
739 : * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
740 : * current implementation this is guaranteed by the behavior of tidbitmaps.
741 : */
742 : static void
743 49596 : entryGetItem(GinState *ginstate, GinScanEntry entry,
744 : ItemPointerData advancePast, Snapshot snapshot)
745 : {
746 49596 : Assert(!entry->isFinished);
747 :
748 49596 : Assert(!ItemPointerIsValid(&entry->curItem) ||
749 : ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
750 :
751 49596 : if (entry->matchBitmap)
752 : {
753 : /* A bitmap result */
754 3699 : BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
755 3699 : OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
756 3699 : bool gotitem = false;
757 :
758 : do
759 : {
760 : /*
761 : * If we've exhausted all items on this block, move to next block
762 : * in the bitmap.
763 : */
764 11306 : while (entry->matchResult == NULL ||
765 7600 : (entry->matchResult->ntuples >= 0 &&
766 7492 : entry->offset >= entry->matchResult->ntuples) ||
767 7384 : entry->matchResult->blockno < advancePastBlk ||
768 3692 : (ItemPointerIsLossyPage(&advancePast) &&
769 0 : entry->matchResult->blockno == advancePastBlk))
770 : {
771 115 : entry->matchResult = tbm_iterate(entry->matchIterator);
772 :
773 115 : if (entry->matchResult == NULL)
774 : {
775 7 : ItemPointerSetInvalid(&entry->curItem);
776 7 : tbm_end_iterate(entry->matchIterator);
777 7 : entry->matchIterator = NULL;
778 7 : entry->isFinished = TRUE;
779 7 : break;
780 : }
781 :
782 : /*
783 : * Reset counter to the beginning of entry->matchResult. Note:
784 : * entry->offset is still greater than matchResult->ntuples if
785 : * matchResult is lossy. So, on next call we will get next
786 : * result from TIDBitmap.
787 : */
788 108 : entry->offset = 0;
789 : }
790 3699 : if (entry->isFinished)
791 7 : break;
792 :
793 : /*
794 : * We're now on the first page after advancePast which has any
795 : * items on it. If it's a lossy result, return that.
796 : */
797 3692 : if (entry->matchResult->ntuples < 0)
798 : {
799 0 : ItemPointerSetLossyPage(&entry->curItem,
800 : entry->matchResult->blockno);
801 :
802 : /*
803 : * We might as well fall out of the loop; we could not
804 : * estimate number of results on this page to support correct
805 : * reducing of result even if it's enabled.
806 : */
807 0 : gotitem = true;
808 0 : break;
809 : }
810 :
811 : /*
812 : * Not a lossy page. Skip over any offsets <= advancePast, and
813 : * return that.
814 : */
815 3692 : if (entry->matchResult->blockno == advancePastBlk)
816 : {
817 : /*
818 : * First, do a quick check against the last offset on the
819 : * page. If that's > advancePast, so are all the other
820 : * offsets.
821 : */
822 3591 : if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
823 : {
824 0 : entry->offset = entry->matchResult->ntuples;
825 0 : continue;
826 : }
827 :
828 : /* Otherwise scan to find the first item > advancePast */
829 7182 : while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
830 0 : entry->offset++;
831 : }
832 :
833 3692 : ItemPointerSet(&entry->curItem,
834 : entry->matchResult->blockno,
835 : entry->matchResult->offsets[entry->offset]);
836 3692 : entry->offset++;
837 3692 : gotitem = true;
838 3692 : } while (!gotitem || (entry->reduceResult == TRUE && dropItem(entry)));
839 : }
840 45897 : else if (!BufferIsValid(entry->buffer))
841 : {
842 : /*
843 : * A posting list from an entry tuple, or the last page of a posting
844 : * tree.
845 : */
846 : do
847 : {
848 19002 : if (entry->offset >= entry->nlist)
849 : {
850 82 : ItemPointerSetInvalid(&entry->curItem);
851 82 : entry->isFinished = TRUE;
852 82 : break;
853 : }
854 :
855 18920 : entry->curItem = entry->list[entry->offset++];
856 18920 : } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
857 : /* XXX: shouldn't we apply the fuzzy search limit here? */
858 : }
859 : else
860 : {
861 : /* A posting tree */
862 : do
863 : {
864 : /* If we've processed the current batch, load more items */
865 56136 : while (entry->offset >= entry->nlist)
866 : {
867 4 : entryLoadMoreItems(ginstate, entry, advancePast, snapshot);
868 :
869 4 : if (entry->isFinished)
870 : {
871 0 : ItemPointerSetInvalid(&entry->curItem);
872 49596 : return;
873 : }
874 : }
875 :
876 28066 : entry->curItem = entry->list[entry->offset++];
877 :
878 56132 : } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
879 28066 : (entry->reduceResult == TRUE && dropItem(entry)));
880 : }
881 : }
882 :
883 : /*
884 : * Identify the "current" item among the input entry streams for this scan key
885 : * that is greater than advancePast, and test whether it passes the scan key
886 : * qual condition.
887 : *
888 : * The current item is the smallest curItem among the inputs. key->curItem
889 : * is set to that value. key->curItemMatches is set to indicate whether that
890 : * TID passes the consistentFn test. If so, key->recheckCurItem is set true
891 : * iff recheck is needed for this item pointer (including the case where the
892 : * item pointer is a lossy page pointer).
893 : *
894 : * If all entry streams are exhausted, sets key->isFinished to TRUE.
895 : *
896 : * Item pointers must be returned in ascending order.
897 : *
898 : * Note: this can return a "lossy page" item pointer, indicating that the
899 : * key potentially matches all items on that heap page. However, it is
900 : * not allowed to return both a lossy page pointer and exact (regular)
901 : * item pointers for the same page. (Doing so would break the key-combination
902 : * logic in scanGetItem.)
903 : */
904 : static void
905 48289 : keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
906 : ItemPointerData advancePast, Snapshot snapshot)
907 : {
908 : ItemPointerData minItem;
909 : ItemPointerData curPageLossy;
910 : uint32 i;
911 : bool haveLossyEntry;
912 : GinScanEntry entry;
913 : GinTernaryValue res;
914 : MemoryContext oldCtx;
915 : bool allFinished;
916 :
917 48289 : Assert(!key->isFinished);
918 :
919 : /*
920 : * We might have already tested this item; if so, no need to repeat work.
921 : * (Note: the ">" case can happen, if advancePast is exact but we
922 : * previously had to set curItem to a lossy-page pointer.)
923 : */
924 48289 : if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
925 76 : return;
926 :
927 : /*
928 : * Find the minimum item > advancePast among the active entry streams.
929 : *
930 : * Note: a lossy-page entry is encoded by a ItemPointer with max value for
931 : * offset (0xffff), so that it will sort after any exact entries for the
932 : * same page. So we'll prefer to return exact pointers not lossy
933 : * pointers, which is good.
934 : */
935 48287 : ItemPointerSetMax(&minItem);
936 48287 : allFinished = true;
937 98838 : for (i = 0; i < key->nrequired; i++)
938 : {
939 50551 : entry = key->requiredEntries[i];
940 :
941 50551 : if (entry->isFinished)
942 564 : continue;
943 :
944 : /*
945 : * Advance this stream if necessary.
946 : *
947 : * In particular, since entry->curItem was initialized with
948 : * ItemPointerSetMin, this ensures we fetch the first item for each
949 : * entry on the first call.
950 : */
951 49987 : if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
952 : {
953 48871 : entryGetItem(ginstate, entry, advancePast, snapshot);
954 48871 : if (entry->isFinished)
955 85 : continue;
956 : }
957 :
958 49902 : allFinished = false;
959 49902 : if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
960 48821 : minItem = entry->curItem;
961 : }
962 :
963 48287 : if (allFinished)
964 : {
965 : /* all entries are finished */
966 72 : key->isFinished = TRUE;
967 72 : return;
968 : }
969 :
970 : /*
971 : * Ok, we now know that there are no matches < minItem.
972 : *
973 : * If minItem is lossy, it means that there were no exact items on the
974 : * page among requiredEntries, because lossy pointers sort after exact
975 : * items. However, there might be exact items for the same page among
976 : * additionalEntries, so we mustn't advance past them.
977 : */
978 48215 : if (ItemPointerIsLossyPage(&minItem))
979 : {
980 0 : if (GinItemPointerGetBlockNumber(&advancePast) <
981 0 : GinItemPointerGetBlockNumber(&minItem))
982 : {
983 0 : ItemPointerSet(&advancePast,
984 : GinItemPointerGetBlockNumber(&minItem),
985 : InvalidOffsetNumber);
986 : }
987 : }
988 : else
989 : {
990 48215 : Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
991 48215 : ItemPointerSet(&advancePast,
992 : GinItemPointerGetBlockNumber(&minItem),
993 : OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
994 : }
995 :
996 : /*
997 : * We might not have loaded all the entry streams for this TID yet. We
998 : * could call the consistent function, passing MAYBE for those entries, to
999 : * see if it can decide if this TID matches based on the information we
1000 : * have. But if the consistent-function is expensive, and cannot in fact
1001 : * decide with partial information, that could be a big loss. So, load all
1002 : * the additional entries, before calling the consistent function.
1003 : */
1004 49368 : for (i = 0; i < key->nadditional; i++)
1005 : {
1006 1153 : entry = key->additionalEntries[i];
1007 :
1008 1153 : if (entry->isFinished)
1009 1 : continue;
1010 :
1011 1152 : if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1012 : {
1013 725 : entryGetItem(ginstate, entry, advancePast, snapshot);
1014 725 : if (entry->isFinished)
1015 4 : continue;
1016 : }
1017 :
1018 : /*
1019 : * Normally, none of the items in additionalEntries can have a curItem
1020 : * larger than minItem. But if minItem is a lossy page, then there
1021 : * might be exact items on the same page among additionalEntries.
1022 : */
1023 1148 : if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1024 : {
1025 0 : Assert(ItemPointerIsLossyPage(&minItem));
1026 0 : minItem = entry->curItem;
1027 : }
1028 : }
1029 :
1030 : /*
1031 : * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
1032 : * and perform consistentFn test.
1033 : *
1034 : * Lossy-page entries pose a problem, since we don't know the correct
1035 : * entryRes state to pass to the consistentFn, and we also don't know what
1036 : * its combining logic will be (could be AND, OR, or even NOT). If the
1037 : * logic is OR then the consistentFn might succeed for all items in the
1038 : * lossy page even when none of the other entries match.
1039 : *
1040 : * Our strategy is to call the tri-state consistent function, with the
1041 : * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
1042 : * returns FALSE, none of the lossy items alone are enough for a match, so
1043 : * we don't need to return a lossy-page pointer. Otherwise, return a
1044 : * lossy-page pointer to indicate that the whole heap page must be
1045 : * checked. (On subsequent calls, we'll do nothing until minItem is past
1046 : * the page altogether, thus ensuring that we never return both regular
1047 : * and lossy pointers for the same page.)
1048 : *
1049 : * An exception is that it doesn't matter what we pass for lossy pointers
1050 : * in "hidden" entries, because the consistentFn's result can't depend on
1051 : * them. We could pass them as MAYBE as well, but if we're using the
1052 : * "shim" implementation of a tri-state consistent function (see
1053 : * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
1054 : * them as TRUE.
1055 : *
1056 : * Note that only lossy-page entries pointing to the current item's page
1057 : * should trigger this processing; we might have future lossy pages in the
1058 : * entry array, but they aren't relevant yet.
1059 : */
1060 48215 : key->curItem = minItem;
1061 48215 : ItemPointerSetLossyPage(&curPageLossy,
1062 : GinItemPointerGetBlockNumber(&key->curItem));
1063 48215 : haveLossyEntry = false;
1064 99827 : for (i = 0; i < key->nentries; i++)
1065 : {
1066 51612 : entry = key->scanEntry[i];
1067 102662 : if (entry->isFinished == FALSE &&
1068 51050 : ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1069 : {
1070 0 : if (i < key->nuserentries)
1071 0 : key->entryRes[i] = GIN_MAYBE;
1072 : else
1073 0 : key->entryRes[i] = GIN_TRUE;
1074 0 : haveLossyEntry = true;
1075 : }
1076 : else
1077 51612 : key->entryRes[i] = GIN_FALSE;
1078 : }
1079 :
1080 : /* prepare for calling consistentFn in temp context */
1081 48215 : oldCtx = MemoryContextSwitchTo(tempCtx);
1082 :
1083 48215 : if (haveLossyEntry)
1084 : {
1085 : /* Have lossy-page entries, so see if whole page matches */
1086 0 : res = key->triConsistentFn(key);
1087 :
1088 0 : if (res == GIN_TRUE || res == GIN_MAYBE)
1089 : {
1090 : /* Yes, so clean up ... */
1091 0 : MemoryContextSwitchTo(oldCtx);
1092 0 : MemoryContextReset(tempCtx);
1093 :
1094 : /* and return lossy pointer for whole page */
1095 0 : key->curItem = curPageLossy;
1096 0 : key->curItemMatches = true;
1097 0 : key->recheckCurItem = true;
1098 0 : return;
1099 : }
1100 : }
1101 :
1102 : /*
1103 : * At this point we know that we don't need to return a lossy whole-page
1104 : * pointer, but we might have matches for individual exact item pointers,
1105 : * possibly in combination with a lossy pointer. Pass lossy pointers as
1106 : * MAYBE to the ternary consistent function, to let it decide if this
1107 : * tuple satisfies the overall key, even though we don't know if the lossy
1108 : * entries match.
1109 : *
1110 : * Prepare entryRes array to be passed to consistentFn.
1111 : */
1112 99827 : for (i = 0; i < key->nentries; i++)
1113 : {
1114 51612 : entry = key->scanEntry[i];
1115 51612 : if (entry->isFinished)
1116 562 : key->entryRes[i] = GIN_FALSE;
1117 : #if 0
1118 :
1119 : /*
1120 : * This case can't currently happen, because we loaded all the entries
1121 : * for this item earlier.
1122 : */
1123 : else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1124 : key->entryRes[i] = GIN_MAYBE;
1125 : #endif
1126 51050 : else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1127 0 : key->entryRes[i] = GIN_MAYBE;
1128 51050 : else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
1129 49084 : key->entryRes[i] = GIN_TRUE;
1130 : else
1131 1966 : key->entryRes[i] = GIN_FALSE;
1132 : }
1133 :
1134 48215 : res = key->triConsistentFn(key);
1135 :
1136 48215 : switch (res)
1137 : {
1138 : case GIN_TRUE:
1139 44834 : key->curItemMatches = true;
1140 : /* triConsistentFn set recheckCurItem */
1141 44834 : break;
1142 :
1143 : case GIN_FALSE:
1144 650 : key->curItemMatches = false;
1145 650 : break;
1146 :
1147 : case GIN_MAYBE:
1148 2731 : key->curItemMatches = true;
1149 2731 : key->recheckCurItem = true;
1150 2731 : break;
1151 :
1152 : default:
1153 :
1154 : /*
1155 : * the 'default' case shouldn't happen, but if the consistent
1156 : * function returns something bogus, this is the safe result
1157 : */
1158 0 : key->curItemMatches = true;
1159 0 : key->recheckCurItem = true;
1160 0 : break;
1161 : }
1162 :
1163 : /*
1164 : * We have a tuple, and we know if it matches or not. If it's a non-match,
1165 : * we could continue to find the next matching tuple, but let's break out
1166 : * and give scanGetItem a chance to advance the other keys. They might be
1167 : * able to skip past to a much higher TID, allowing us to save work.
1168 : */
1169 :
1170 : /* clean up after consistentFn calls */
1171 48215 : MemoryContextSwitchTo(oldCtx);
1172 48215 : MemoryContextReset(tempCtx);
1173 : }
1174 :
1175 : /*
1176 : * Get next heap item pointer (after advancePast) from scan.
1177 : * Returns true if anything found.
1178 : * On success, *item and *recheck are set.
1179 : *
1180 : * Note: this is very nearly the same logic as in keyGetItem(), except
1181 : * that we know the keys are to be combined with AND logic, whereas in
1182 : * keyGetItem() the combination logic is known only to the consistentFn.
1183 : */
1184 : static bool
1185 47621 : scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
1186 : ItemPointerData *item, bool *recheck)
1187 : {
1188 47621 : GinScanOpaque so = (GinScanOpaque) scan->opaque;
1189 : uint32 i;
1190 : bool match;
1191 :
1192 : /*----------
1193 : * Advance the scan keys in lock-step, until we find an item that matches
1194 : * all the keys. If any key reports isFinished, meaning its subset of the
1195 : * entries is exhausted, we can stop. Otherwise, set *item to the next
1196 : * matching item.
1197 : *
1198 : * This logic works only if a keyGetItem stream can never contain both
1199 : * exact and lossy pointers for the same page. Else we could have a
1200 : * case like
1201 : *
1202 : * stream 1 stream 2
1203 : * ... ...
1204 : * 42/6 42/7
1205 : * 50/1 42/0xffff
1206 : * ... ...
1207 : *
1208 : * We would conclude that 42/6 is not a match and advance stream 1,
1209 : * thus never detecting the match to the lossy pointer in stream 2.
1210 : * (keyGetItem has a similar problem versus entryGetItem.)
1211 : *----------
1212 : */
1213 : do
1214 : {
1215 48279 : ItemPointerSetMin(item);
1216 48279 : match = true;
1217 95846 : for (i = 0; i < so->nkeys && match; i++)
1218 : {
1219 48289 : GinScanKey key = so->keys + i;
1220 :
1221 : /* Fetch the next item for this key that is > advancePast. */
1222 48289 : keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
1223 : scan->xs_snapshot);
1224 :
1225 48289 : if (key->isFinished)
1226 72 : return false;
1227 :
1228 : /*
1229 : * If it's not a match, we can immediately conclude that nothing
1230 : * <= this item matches, without checking the rest of the keys.
1231 : */
1232 48217 : if (!key->curItemMatches)
1233 : {
1234 650 : advancePast = key->curItem;
1235 650 : match = false;
1236 650 : break;
1237 : }
1238 :
1239 : /*
1240 : * It's a match. We can conclude that nothing < matches, so the
1241 : * other key streams can skip to this item.
1242 : *
1243 : * Beware of lossy pointers, though; from a lossy pointer, we can
1244 : * only conclude that nothing smaller than this *block* matches.
1245 : */
1246 47567 : if (ItemPointerIsLossyPage(&key->curItem))
1247 : {
1248 0 : if (GinItemPointerGetBlockNumber(&advancePast) <
1249 0 : GinItemPointerGetBlockNumber(&key->curItem))
1250 : {
1251 0 : ItemPointerSet(&advancePast,
1252 : GinItemPointerGetBlockNumber(&key->curItem),
1253 : InvalidOffsetNumber);
1254 : }
1255 : }
1256 : else
1257 : {
1258 47567 : Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
1259 47567 : ItemPointerSet(&advancePast,
1260 : GinItemPointerGetBlockNumber(&key->curItem),
1261 : OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
1262 : }
1263 :
1264 : /*
1265 : * If this is the first key, remember this location as a potential
1266 : * match, and proceed to check the rest of the keys.
1267 : *
1268 : * Otherwise, check if this is the same item that we checked the
1269 : * previous keys for (or a lossy pointer for the same page). If
1270 : * not, loop back to check the previous keys for this item (we
1271 : * will check this key again too, but keyGetItem returns quickly
1272 : * for that)
1273 : */
1274 47567 : if (i == 0)
1275 : {
1276 47557 : *item = key->curItem;
1277 : }
1278 : else
1279 : {
1280 20 : if (ItemPointerIsLossyPage(&key->curItem) ||
1281 10 : ItemPointerIsLossyPage(item))
1282 : {
1283 0 : Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
1284 0 : match = (GinItemPointerGetBlockNumber(&key->curItem) ==
1285 0 : GinItemPointerGetBlockNumber(item));
1286 : }
1287 : else
1288 : {
1289 10 : Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
1290 10 : match = (ginCompareItemPointers(&key->curItem, item) == 0);
1291 : }
1292 : }
1293 : }
1294 48207 : } while (!match);
1295 :
1296 47549 : Assert(!ItemPointerIsMin(item));
1297 :
1298 : /*
1299 : * Now *item contains the first ItemPointer after previous result that
1300 : * satisfied all the keys for that exact TID, or a lossy reference to the
1301 : * same page.
1302 : *
1303 : * We must return recheck = true if any of the keys are marked recheck.
1304 : */
1305 47549 : *recheck = false;
1306 92369 : for (i = 0; i < so->nkeys; i++)
1307 : {
1308 47551 : GinScanKey key = so->keys + i;
1309 :
1310 47551 : if (key->recheckCurItem)
1311 : {
1312 2731 : *recheck = true;
1313 2731 : break;
1314 : }
1315 : }
1316 :
1317 47549 : return TRUE;
1318 : }
1319 :
1320 :
1321 : /*
1322 : * Functions for scanning the pending list
1323 : */
1324 :
1325 :
1326 : /*
1327 : * Get ItemPointer of next heap row to be checked from pending list.
1328 : * Returns false if there are no more. On pages with several heap rows
1329 : * it returns each row separately, on page with part of heap row returns
1330 : * per page data. pos->firstOffset and pos->lastOffset are set to identify
1331 : * the range of pending-list tuples belonging to this heap row.
1332 : *
1333 : * The pendingBuffer is presumed pinned and share-locked on entry, and is
1334 : * pinned and share-locked on success exit. On failure exit it's released.
1335 : */
1336 : static bool
1337 14 : scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
1338 : {
1339 : OffsetNumber maxoff;
1340 : Page page;
1341 : IndexTuple itup;
1342 :
1343 14 : ItemPointerSetInvalid(&pos->item);
1344 : for (;;)
1345 : {
1346 14 : page = BufferGetPage(pos->pendingBuffer);
1347 14 : TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1348 :
1349 14 : maxoff = PageGetMaxOffsetNumber(page);
1350 14 : if (pos->firstOffset > maxoff)
1351 : {
1352 4 : BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
1353 :
1354 4 : if (blkno == InvalidBlockNumber)
1355 : {
1356 4 : UnlockReleaseBuffer(pos->pendingBuffer);
1357 4 : pos->pendingBuffer = InvalidBuffer;
1358 :
1359 4 : return false;
1360 : }
1361 : else
1362 : {
1363 : /*
1364 : * Here we must prevent deletion of next page by insertcleanup
1365 : * process, which may be trying to obtain exclusive lock on
1366 : * current page. So, we lock next page before releasing the
1367 : * current one
1368 : */
1369 0 : Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno);
1370 :
1371 0 : LockBuffer(tmpbuf, GIN_SHARE);
1372 0 : UnlockReleaseBuffer(pos->pendingBuffer);
1373 :
1374 0 : pos->pendingBuffer = tmpbuf;
1375 0 : pos->firstOffset = FirstOffsetNumber;
1376 : }
1377 : }
1378 : else
1379 : {
1380 10 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
1381 10 : pos->item = itup->t_tid;
1382 10 : if (GinPageHasFullRow(page))
1383 : {
1384 : /*
1385 : * find itempointer to the next row
1386 : */
1387 18 : for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
1388 : {
1389 14 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
1390 14 : if (!ItemPointerEquals(&pos->item, &itup->t_tid))
1391 6 : break;
1392 : }
1393 : }
1394 : else
1395 : {
1396 : /*
1397 : * All itempointers are the same on this page
1398 : */
1399 0 : pos->lastOffset = maxoff + 1;
1400 : }
1401 :
1402 : /*
1403 : * Now pos->firstOffset points to the first tuple of current heap
1404 : * row, pos->lastOffset points to the first tuple of next heap row
1405 : * (or to the end of page)
1406 : */
1407 10 : break;
1408 : }
1409 0 : }
1410 :
1411 10 : return true;
1412 : }
1413 :
1414 : /*
1415 : * Scan pending-list page from current tuple (off) up till the first of:
1416 : * - match is found (then returns true)
1417 : * - no later match is possible
1418 : * - tuple's attribute number is not equal to entry's attrnum
1419 : * - reach end of page
1420 : *
1421 : * datum[]/category[]/datumExtracted[] arrays are used to cache the results
1422 : * of gintuple_get_key() on the current page.
1423 : */
1424 : static bool
1425 0 : matchPartialInPendingList(GinState *ginstate, Page page,
1426 : OffsetNumber off, OffsetNumber maxoff,
1427 : GinScanEntry entry,
1428 : Datum *datum, GinNullCategory *category,
1429 : bool *datumExtracted)
1430 : {
1431 : IndexTuple itup;
1432 : int32 cmp;
1433 :
1434 : /* Partial match to a null is not possible */
1435 0 : if (entry->queryCategory != GIN_CAT_NORM_KEY)
1436 0 : return false;
1437 :
1438 0 : while (off < maxoff)
1439 : {
1440 0 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
1441 :
1442 0 : if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
1443 0 : return false;
1444 :
1445 0 : if (datumExtracted[off - 1] == false)
1446 : {
1447 0 : datum[off - 1] = gintuple_get_key(ginstate, itup,
1448 0 : &category[off - 1]);
1449 0 : datumExtracted[off - 1] = true;
1450 : }
1451 :
1452 : /* Once we hit nulls, no further match is possible */
1453 0 : if (category[off - 1] != GIN_CAT_NORM_KEY)
1454 0 : return false;
1455 :
1456 : /*----------
1457 : * Check partial match.
1458 : * case cmp == 0 => match
1459 : * case cmp > 0 => not match and end scan (no later match possible)
1460 : * case cmp < 0 => not match and continue scan
1461 : *----------
1462 : */
1463 0 : cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
1464 : ginstate->supportCollation[entry->attnum - 1],
1465 : entry->queryKey,
1466 : datum[off - 1],
1467 : UInt16GetDatum(entry->strategy),
1468 : PointerGetDatum(entry->extra_data)));
1469 0 : if (cmp == 0)
1470 0 : return true;
1471 0 : else if (cmp > 0)
1472 0 : return false;
1473 :
1474 0 : off++;
1475 : }
1476 :
1477 0 : return false;
1478 : }
1479 :
1480 : /*
1481 : * Set up the entryRes array for each key by looking at
1482 : * every entry for current heap row in pending list.
1483 : *
1484 : * Returns true if each scan key has at least one entryRes match.
1485 : * This corresponds to the situations where the normal index search will
1486 : * try to apply the key's consistentFn. (A tuple not meeting that requirement
1487 : * cannot be returned by the normal search since no entry stream will
1488 : * source its TID.)
1489 : *
1490 : * The pendingBuffer is presumed pinned and share-locked on entry.
1491 : */
1492 : static bool
1493 10 : collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
1494 : {
1495 10 : GinScanOpaque so = (GinScanOpaque) scan->opaque;
1496 : OffsetNumber attrnum;
1497 : Page page;
1498 : IndexTuple itup;
1499 : int i,
1500 : j;
1501 :
1502 : /*
1503 : * Reset all entryRes and hasMatchKey flags
1504 : */
1505 20 : for (i = 0; i < so->nkeys; i++)
1506 : {
1507 10 : GinScanKey key = so->keys + i;
1508 :
1509 10 : memset(key->entryRes, GIN_FALSE, key->nentries);
1510 : }
1511 10 : memset(pos->hasMatchKey, FALSE, so->nkeys);
1512 :
1513 : /*
1514 : * Outer loop iterates over multiple pending-list pages when a single heap
1515 : * row has entries spanning those pages.
1516 : */
1517 : for (;;)
1518 : {
1519 : Datum datum[BLCKSZ / sizeof(IndexTupleData)];
1520 : GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
1521 : bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
1522 :
1523 10 : Assert(pos->lastOffset > pos->firstOffset);
1524 10 : memset(datumExtracted + pos->firstOffset - 1, 0,
1525 10 : sizeof(bool) * (pos->lastOffset - pos->firstOffset));
1526 :
1527 10 : page = BufferGetPage(pos->pendingBuffer);
1528 10 : TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1529 :
1530 20 : for (i = 0; i < so->nkeys; i++)
1531 : {
1532 10 : GinScanKey key = so->keys + i;
1533 :
1534 30 : for (j = 0; j < key->nentries; j++)
1535 : {
1536 20 : GinScanEntry entry = key->scanEntry[j];
1537 20 : OffsetNumber StopLow = pos->firstOffset,
1538 20 : StopHigh = pos->lastOffset,
1539 : StopMiddle;
1540 :
1541 : /* If already matched on earlier page, do no extra work */
1542 20 : if (key->entryRes[j])
1543 0 : continue;
1544 :
1545 : /*
1546 : * Interesting tuples are from pos->firstOffset to
1547 : * pos->lastOffset and they are ordered by (attnum, Datum) as
1548 : * it's done in entry tree. So we can use binary search to
1549 : * avoid linear scanning.
1550 : */
1551 60 : while (StopLow < StopHigh)
1552 : {
1553 : int res;
1554 :
1555 28 : StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1556 :
1557 28 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
1558 :
1559 28 : attrnum = gintuple_get_attrnum(&so->ginstate, itup);
1560 :
1561 28 : if (key->attnum < attrnum)
1562 : {
1563 0 : StopHigh = StopMiddle;
1564 0 : continue;
1565 : }
1566 28 : if (key->attnum > attrnum)
1567 : {
1568 0 : StopLow = StopMiddle + 1;
1569 0 : continue;
1570 : }
1571 :
1572 28 : if (datumExtracted[StopMiddle - 1] == false)
1573 : {
1574 36 : datum[StopMiddle - 1] =
1575 18 : gintuple_get_key(&so->ginstate, itup,
1576 18 : &category[StopMiddle - 1]);
1577 18 : datumExtracted[StopMiddle - 1] = true;
1578 : }
1579 :
1580 28 : if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
1581 : {
1582 : /* special behavior depending on searchMode */
1583 0 : if (entry->searchMode == GIN_SEARCH_MODE_ALL)
1584 : {
1585 : /* match anything except NULL_ITEM */
1586 0 : if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
1587 0 : res = -1;
1588 : else
1589 0 : res = 0;
1590 : }
1591 : else
1592 : {
1593 : /* match everything */
1594 0 : res = 0;
1595 : }
1596 : }
1597 : else
1598 : {
1599 112 : res = ginCompareEntries(&so->ginstate,
1600 28 : entry->attnum,
1601 : entry->queryKey,
1602 28 : entry->queryCategory,
1603 28 : datum[StopMiddle - 1],
1604 28 : category[StopMiddle - 1]);
1605 : }
1606 :
1607 28 : if (res == 0)
1608 : {
1609 : /*
1610 : * Found exact match (there can be only one, except in
1611 : * EMPTY_QUERY mode).
1612 : *
1613 : * If doing partial match, scan forward from here to
1614 : * end of page to check for matches.
1615 : *
1616 : * See comment above about tuple's ordering.
1617 : */
1618 8 : if (entry->isPartialMatch)
1619 0 : key->entryRes[j] =
1620 0 : matchPartialInPendingList(&so->ginstate,
1621 : page,
1622 : StopMiddle,
1623 0 : pos->lastOffset,
1624 : entry,
1625 : datum,
1626 : category,
1627 : datumExtracted);
1628 : else
1629 8 : key->entryRes[j] = true;
1630 :
1631 : /* done with binary search */
1632 8 : break;
1633 : }
1634 20 : else if (res < 0)
1635 16 : StopHigh = StopMiddle;
1636 : else
1637 4 : StopLow = StopMiddle + 1;
1638 : }
1639 :
1640 20 : if (StopLow >= StopHigh && entry->isPartialMatch)
1641 : {
1642 : /*
1643 : * No exact match on this page. If doing partial match,
1644 : * scan from the first tuple greater than target value to
1645 : * end of page. Note that since we don't remember whether
1646 : * the comparePartialFn told us to stop early on a
1647 : * previous page, we will uselessly apply comparePartialFn
1648 : * to the first tuple on each subsequent page.
1649 : */
1650 0 : key->entryRes[j] =
1651 0 : matchPartialInPendingList(&so->ginstate,
1652 : page,
1653 : StopHigh,
1654 0 : pos->lastOffset,
1655 : entry,
1656 : datum,
1657 : category,
1658 : datumExtracted);
1659 : }
1660 :
1661 20 : pos->hasMatchKey[i] |= key->entryRes[j];
1662 : }
1663 : }
1664 :
1665 : /* Advance firstOffset over the scanned tuples */
1666 10 : pos->firstOffset = pos->lastOffset;
1667 :
1668 10 : if (GinPageHasFullRow(page))
1669 : {
1670 : /*
1671 : * We have examined all pending entries for the current heap row.
1672 : * Break out of loop over pages.
1673 : */
1674 10 : break;
1675 : }
1676 : else
1677 : {
1678 : /*
1679 : * Advance to next page of pending entries for the current heap
1680 : * row. Complain if there isn't one.
1681 : */
1682 0 : ItemPointerData item = pos->item;
1683 :
1684 0 : if (scanGetCandidate(scan, pos) == false ||
1685 0 : !ItemPointerEquals(&pos->item, &item))
1686 0 : elog(ERROR, "could not find additional pending pages for same heap tuple");
1687 : }
1688 0 : }
1689 :
1690 : /*
1691 : * Now return "true" if all scan keys have at least one matching datum
1692 : */
1693 14 : for (i = 0; i < so->nkeys; i++)
1694 : {
1695 10 : if (pos->hasMatchKey[i] == false)
1696 6 : return false;
1697 : }
1698 :
1699 4 : return true;
1700 : }
1701 :
1702 : /*
1703 : * Collect all matched rows from pending list into bitmap
1704 : */
1705 : static void
1706 72 : scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
1707 : {
1708 72 : GinScanOpaque so = (GinScanOpaque) scan->opaque;
1709 : MemoryContext oldCtx;
1710 : bool recheck,
1711 : match;
1712 : int i;
1713 : pendingPosition pos;
1714 72 : Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
1715 : Page page;
1716 : BlockNumber blkno;
1717 :
1718 72 : *ntids = 0;
1719 :
1720 72 : LockBuffer(metabuffer, GIN_SHARE);
1721 72 : page = BufferGetPage(metabuffer);
1722 72 : TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1723 72 : blkno = GinPageGetMeta(page)->head;
1724 :
1725 : /*
1726 : * fetch head of list before unlocking metapage. head page must be pinned
1727 : * to prevent deletion by vacuum process
1728 : */
1729 72 : if (blkno == InvalidBlockNumber)
1730 : {
1731 : /* No pending list, so proceed with normal scan */
1732 68 : UnlockReleaseBuffer(metabuffer);
1733 140 : return;
1734 : }
1735 :
1736 4 : pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
1737 4 : LockBuffer(pos.pendingBuffer, GIN_SHARE);
1738 4 : pos.firstOffset = FirstOffsetNumber;
1739 4 : UnlockReleaseBuffer(metabuffer);
1740 4 : pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
1741 :
1742 : /*
1743 : * loop for each heap row. scanGetCandidate returns full row or row's
1744 : * tuples from first page.
1745 : */
1746 18 : while (scanGetCandidate(scan, &pos))
1747 : {
1748 : /*
1749 : * Check entries in tuple and set up entryRes array.
1750 : *
1751 : * If pending tuples belonging to the current heap row are spread
1752 : * across several pages, collectMatchesForHeapRow will read all of
1753 : * those pages.
1754 : */
1755 10 : if (!collectMatchesForHeapRow(scan, &pos))
1756 6 : continue;
1757 :
1758 : /*
1759 : * Matching of entries of one row is finished, so check row using
1760 : * consistent functions.
1761 : */
1762 4 : oldCtx = MemoryContextSwitchTo(so->tempCtx);
1763 4 : recheck = false;
1764 4 : match = true;
1765 :
1766 8 : for (i = 0; i < so->nkeys; i++)
1767 : {
1768 4 : GinScanKey key = so->keys + i;
1769 :
1770 4 : if (!key->boolConsistentFn(key))
1771 : {
1772 0 : match = false;
1773 0 : break;
1774 : }
1775 4 : recheck |= key->recheckCurItem;
1776 : }
1777 :
1778 4 : MemoryContextSwitchTo(oldCtx);
1779 4 : MemoryContextReset(so->tempCtx);
1780 :
1781 4 : if (match)
1782 : {
1783 4 : tbm_add_tuples(tbm, &pos.item, 1, recheck);
1784 4 : (*ntids)++;
1785 : }
1786 : }
1787 :
1788 4 : pfree(pos.hasMatchKey);
1789 : }
1790 :
1791 :
1792 : #define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes )
1793 :
1794 : int64
1795 74 : gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
1796 : {
1797 74 : GinScanOpaque so = (GinScanOpaque) scan->opaque;
1798 : int64 ntids;
1799 : ItemPointerData iptr;
1800 : bool recheck;
1801 :
1802 : /*
1803 : * Set up the scan keys, and check for unsatisfiable query.
1804 : */
1805 74 : ginFreeScanKeys(so); /* there should be no keys yet, but just to be
1806 : * sure */
1807 74 : ginNewScanKey(scan);
1808 :
1809 74 : if (GinIsVoidRes(scan))
1810 2 : return 0;
1811 :
1812 72 : ntids = 0;
1813 :
1814 : /*
1815 : * First, scan the pending list and collect any matching entries into the
1816 : * bitmap. After we scan a pending item, some other backend could post it
1817 : * into the main index, and so we might visit it a second time during the
1818 : * main scan. This is okay because we'll just re-set the same bit in the
1819 : * bitmap. (The possibility of duplicate visits is a major reason why GIN
1820 : * can't support the amgettuple API, however.) Note that it would not do
1821 : * to scan the main index before the pending list, since concurrent
1822 : * cleanup could then make us miss entries entirely.
1823 : */
1824 72 : scanPendingInsert(scan, tbm, &ntids);
1825 :
1826 : /*
1827 : * Now scan the main index.
1828 : */
1829 72 : startScan(scan);
1830 :
1831 72 : ItemPointerSetMin(&iptr);
1832 :
1833 : for (;;)
1834 : {
1835 47621 : CHECK_FOR_INTERRUPTS();
1836 :
1837 47621 : if (!scanGetItem(scan, iptr, &iptr, &recheck))
1838 72 : break;
1839 :
1840 47549 : if (ItemPointerIsLossyPage(&iptr))
1841 0 : tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
1842 : else
1843 47549 : tbm_add_tuples(tbm, &iptr, 1, recheck);
1844 47549 : ntids++;
1845 47549 : }
1846 :
1847 72 : return ntids;
1848 : }
|