Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hashsearch.c
4 : * search code for postgres hash tables
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hashsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/hash.h"
18 : #include "access/relscan.h"
19 : #include "miscadmin.h"
20 : #include "pgstat.h"
21 : #include "utils/rel.h"
22 :
23 :
24 : /*
25 : * _hash_next() -- Get the next item in a scan.
26 : *
27 : * On entry, we have a valid hashso_curpos in the scan, and a
28 : * pin and read lock on the page that contains that item.
29 : * We find the next item in the scan, if any.
30 : * On success exit, we have the page containing the next item
31 : * pinned and locked.
32 : */
33 : bool
34 16534 : _hash_next(IndexScanDesc scan, ScanDirection dir)
35 : {
36 16534 : Relation rel = scan->indexRelation;
37 16534 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
38 : Buffer buf;
39 : Page page;
40 : OffsetNumber offnum;
41 : ItemPointer current;
42 : IndexTuple itup;
43 :
44 : /* we still have the buffer pinned and read-locked */
45 16534 : buf = so->hashso_curbuf;
46 16534 : Assert(BufferIsValid(buf));
47 :
48 : /*
49 : * step to next valid tuple.
50 : */
51 16534 : if (!_hash_step(scan, &buf, dir))
52 21 : return false;
53 :
54 : /* if we're here, _hash_step found a valid tuple */
55 16513 : current = &(so->hashso_curpos);
56 16513 : offnum = ItemPointerGetOffsetNumber(current);
57 16513 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
58 16513 : page = BufferGetPage(buf);
59 16513 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
60 16513 : so->hashso_heappos = itup->t_tid;
61 :
62 16513 : return true;
63 : }
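
As context for how _hash_first and _hash_next divide the work, the sketch below condenses the calling convention used by the index-AM layer. It is modeled on hashgettuple() with the kill-tuple bookkeeping and locking details omitted; the helper name is hypothetical, and it assumes the PostgreSQL 10-era IndexScanDesc layout (xs_ctup):

/* Hypothetical sketch, not part of this file: drive one scan step. */
static bool
hash_scan_step_sketch(IndexScanDesc scan, ScanDirection dir)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		res;

	if (ItemPointerIsValid(&(so->hashso_curpos)))
		res = _hash_next(scan, dir);	/* resume an established scan */
	else
		res = _hash_first(scan, dir);	/* position on the first match */

	/* on success, both functions leave the heap TID in hashso_heappos */
	if (res)
		scan->xs_ctup.t_self = so->hashso_heappos;
	return res;
}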
64 :
65 : /*
66 : * Advance to the next page in a bucket, if any. If we are scanning the
67 : * bucket being populated during a split, then after the last page of that
68 : * bucket this function advances to the bucket being split.
69 : */
70 : static void
71 54 : _hash_readnext(IndexScanDesc scan,
72 : Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
73 : {
74 : BlockNumber blkno;
75 54 : Relation rel = scan->indexRelation;
76 54 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
77 54 : bool block_found = false;
78 :
79 54 : blkno = (*opaquep)->hasho_nextblkno;
80 :
81 : /*
82 : * Retain the pin on the primary bucket page until the end of the scan.
83 : * See the comments in _hash_first for why the pin is retained.
84 : */
85 54 : if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
86 25 : LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
87 : else
88 29 : _hash_relbuf(rel, *bufp);
89 :
90 54 : *bufp = InvalidBuffer;
91 : /* check for interrupts while we're not holding any buffer lock */
92 54 : CHECK_FOR_INTERRUPTS();
93 54 : if (BlockNumberIsValid(blkno))
94 : {
95 30 : *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
96 30 : block_found = true;
97 : }
98 24 : else if (so->hashso_buc_populated && !so->hashso_buc_split)
99 : {
100 : /*
101 : * End of bucket: scan the bucket being split if a split was in
102 : * progress at the start of the scan.
103 : */
104 0 : *bufp = so->hashso_split_bucket_buf;
105 :
106 : /*
107 : * The buffer for the bucket being split must be valid, as we acquired
108 : * a pin on it before the start of the scan and retain it until the end.
109 : */
110 0 : Assert(BufferIsValid(*bufp));
111 :
112 0 : LockBuffer(*bufp, BUFFER_LOCK_SHARE);
113 :
114 : /*
115 : * Setting hashso_buc_split to true indicates that we are now scanning
116 : * the bucket being split.
117 : */
118 0 : so->hashso_buc_split = true;
119 :
120 0 : block_found = true;
121 : }
122 :
123 54 : if (block_found)
124 : {
125 30 : *pagep = BufferGetPage(*bufp);
126 30 : TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
127 30 : *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
128 : }
129 54 : }
130 :
131 : /*
132 : * Advance to the previous page in a bucket, if any. If the current scan
133 : * started during a split, then after the first page of the bucket being
134 : * split this function advances to the bucket being populated.
135 : */
136 : static void
137 11 : _hash_readprev(IndexScanDesc scan,
138 : Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
139 : {
140 : BlockNumber blkno;
141 11 : Relation rel = scan->indexRelation;
142 11 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
143 : bool haveprevblk;
144 :
145 11 : blkno = (*opaquep)->hasho_prevblkno;
146 :
147 : /*
148 : * Retain the pin on the primary bucket page until the end of the scan.
149 : * See the comments in _hash_first for why the pin is retained.
150 : */
151 11 : if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
152 : {
153 1 : LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
154 1 : haveprevblk = false;
155 : }
156 : else
157 : {
158 10 : _hash_relbuf(rel, *bufp);
159 10 : haveprevblk = true;
160 : }
161 :
162 11 : *bufp = InvalidBuffer;
163 : /* check for interrupts while we're not holding any buffer lock */
164 11 : CHECK_FOR_INTERRUPTS();
165 :
166 11 : if (haveprevblk)
167 : {
168 10 : Assert(BlockNumberIsValid(blkno));
169 10 : *bufp = _hash_getbuf(rel, blkno, HASH_READ,
170 : LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
171 10 : *pagep = BufferGetPage(*bufp);
172 10 : TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
173 10 : *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
174 :
175 : /*
176 : * We always maintain the pin on the bucket page for the whole scan
177 : * operation, so release the additional pin we have acquired here.
178 : */
179 10 : if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
180 1 : _hash_dropbuf(rel, *bufp);
181 : }
182 1 : else if (so->hashso_buc_populated && so->hashso_buc_split)
183 : {
184 : /*
185 : * End of bucket: scan the bucket being populated if a split was in
186 : * progress at the start of the scan.
187 : */
188 0 : *bufp = so->hashso_bucket_buf;
189 :
190 : /*
191 : * The buffer for the bucket being populated must be valid, as we acquired
192 : * a pin on it before the start of the scan and retain it until the end.
193 : */
194 0 : Assert(BufferIsValid(*bufp));
195 :
196 0 : LockBuffer(*bufp, BUFFER_LOCK_SHARE);
197 0 : *pagep = BufferGetPage(*bufp);
198 0 : *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
199 :
200 : /* move to the end of bucket chain */
201 0 : while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
202 0 : _hash_readnext(scan, bufp, pagep, opaquep);
203 :
204 : /*
205 : * Setting hashso_buc_split to false indicates that we are now scanning
206 : * the bucket being populated.
207 : */
208 0 : so->hashso_buc_split = false;
209 : }
210 11 : }
211 :
212 : /*
213 : * _hash_first() -- Find the first item in a scan.
214 : *
215 : * Find the first item in the index that
216 : * satisfies the qualification associated with the scan descriptor. On
217 : * success, the page containing the current index tuple is read locked
218 : * and pinned, and the scan's opaque data entry is updated to
219 : * include the buffer.
220 : */
221 : bool
222 25 : _hash_first(IndexScanDesc scan, ScanDirection dir)
223 : {
224 25 : Relation rel = scan->indexRelation;
225 25 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
226 : ScanKey cur;
227 : uint32 hashkey;
228 : Bucket bucket;
229 : Buffer buf;
230 : Page page;
231 : HashPageOpaque opaque;
232 : IndexTuple itup;
233 : ItemPointer current;
234 : OffsetNumber offnum;
235 :
236 25 : pgstat_count_index_scan(rel);
237 :
238 25 : current = &(so->hashso_curpos);
239 25 : ItemPointerSetInvalid(current);
240 :
241 : /*
242 : * We do not support hash scans with no index qualification, because we
243 : * would have to read the whole index rather than just one bucket. That
244 : * creates a whole raft of problems, since we haven't got a practical way
245 : * to lock all the buckets against splits or compactions.
246 : */
247 25 : if (scan->numberOfKeys < 1)
248 0 : ereport(ERROR,
249 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
250 : errmsg("hash indexes do not support whole-index scans")));
251 :
252 : /* There may be more than one index qual, but we hash only the first */
253 25 : cur = &scan->keyData[0];
254 :
255 : /* We support only single-column hash indexes */
256 25 : Assert(cur->sk_attno == 1);
257 : /* And there's only one operator strategy, too */
258 25 : Assert(cur->sk_strategy == HTEqualStrategyNumber);
259 :
260 : /*
261 : * If the constant in the index qual is NULL, assume it cannot match any
262 : * items in the index.
263 : */
264 25 : if (cur->sk_flags & SK_ISNULL)
265 0 : return false;
266 :
267 : /*
268 : * Okay to compute the hash key. We want to do this before acquiring any
269 : * locks, in case a user-defined hash function happens to be slow.
270 : *
271 : * If the scankey operator is not a cross-type comparison, we can use
272 : * the cached hash function; otherwise we must look it up in the catalogs.
273 : *
274 : * We support the convention that sk_subtype == InvalidOid means the
275 : * opclass input type; this is a hack to simplify life for ScanKeyInit().
276 : */
277 25 : if (cur->sk_subtype == rel->rd_opcintype[0] ||
278 0 : cur->sk_subtype == InvalidOid)
279 25 : hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
280 : else
281 0 : hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
282 : cur->sk_subtype);
283 :
284 25 : so->hashso_sk_hash = hashkey;
285 :
286 25 : buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
287 25 : page = BufferGetPage(buf);
288 25 : TestForOldSnapshot(scan->xs_snapshot, rel, page);
289 25 : opaque = (HashPageOpaque) PageGetSpecialPointer(page);
290 25 : bucket = opaque->hasho_bucket;
291 :
292 25 : so->hashso_bucket_buf = buf;
293 :
294 : /*
295 : * If a bucket split is in progress, then while scanning the bucket being
296 : * populated, we need to skip tuples that were copied from bucket being
297 : * split. We also need to maintain a pin on the bucket being split to
298 : * ensure that split-cleanup work done by vacuum doesn't remove tuples
299 : * from it till this scan is done. We need to maintain a pin on the
300 : * bucket being populated to ensure that vacuum doesn't squeeze that
301 : * bucket till this scan is complete; otherwise, the ordering of tuples
302 : * can't be maintained during forward and backward scans. Here, we have
303 : * to be careful about locking order: first, acquire a lock on the
304 : * bucket being split; then release that lock, but not the pin; then
305 : * acquire a lock on the bucket being populated and re-verify whether
306 : * the bucket split is still in progress. Acquiring the lock on the
307 : * bucket being split first ensures that vacuum waits for this scan to
308 : * finish.
309 : */
310 25 : if (H_BUCKET_BEING_POPULATED(opaque))
311 : {
312 : BlockNumber old_blkno;
313 : Buffer old_buf;
314 :
315 0 : old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);
316 :
317 : /*
318 : * Release the lock on the new bucket and re-acquire it after acquiring
319 : * the lock on the old bucket.
320 : */
321 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
322 :
323 0 : old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);
324 0 : TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf));
325 :
326 : /*
327 : * Remember the buffer for the bucket being split, so we can use it
328 : * later in the scan.
329 : */
330 0 : so->hashso_split_bucket_buf = old_buf;
331 0 : LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
332 :
333 0 : LockBuffer(buf, BUFFER_LOCK_SHARE);
334 0 : page = BufferGetPage(buf);
335 0 : opaque = (HashPageOpaque) PageGetSpecialPointer(page);
336 0 : Assert(opaque->hasho_bucket == bucket);
337 :
338 0 : if (H_BUCKET_BEING_POPULATED(opaque))
339 0 : so->hashso_buc_populated = true;
340 : else
341 : {
342 0 : _hash_dropbuf(rel, so->hashso_split_bucket_buf);
343 0 : so->hashso_split_bucket_buf = InvalidBuffer;
344 : }
345 : }
346 :
347 : /* If a backwards scan is requested, move to the end of the chain */
348 25 : if (ScanDirectionIsBackward(dir))
349 : {
350 : /*
351 : * A backward scan that starts during a split needs to start from the
352 : * end of the bucket being split.
353 : */
354 13 : while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
355 1 : (so->hashso_buc_populated && !so->hashso_buc_split))
356 10 : _hash_readnext(scan, &buf, &page, &opaque);
357 : }
358 :
359 : /* Now find the first tuple satisfying the qualification */
360 25 : if (!_hash_step(scan, &buf, dir))
361 4 : return false;
362 :
363 : /* if we're here, _hash_step found a valid tuple */
364 21 : offnum = ItemPointerGetOffsetNumber(current);
365 21 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
366 21 : page = BufferGetPage(buf);
367 21 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
368 21 : so->hashso_heappos = itup->t_tid;
369 :
370 21 : return true;
371 : }
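
The hash-key computation at the top of _hash_first can be read in isolation. The following hypothetical helper restates it, showing how the sk_subtype == InvalidOid convention from ScanKeyInit() selects the cached hash function:

static uint32
scan_hashkey_sketch(Relation rel, ScanKey cur)
{
	/* InvalidOid means "same as the opclass input type" */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		return _hash_datum2hashkey(rel, cur->sk_argument);

	/* cross-type comparison: look up the hash function in the catalogs */
	return _hash_datum2hashkey_type(rel, cur->sk_argument, cur->sk_subtype);
}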
372 :
373 : /*
374 : * _hash_step() -- step to the next valid item in a scan in the bucket.
375 : *
376 : * If no valid record exists in the requested direction, return
377 : * false. Otherwise, return true and set the scan's hashso_curpos
378 : * to point at the tuple found.
379 : *
380 : * If the scan started during a split, we must skip the tuples that
381 : * were moved by the split while scanning the bucket being populated,
382 : * and then scan the bucket being split to pick up those tuples. This
383 : * ensures that a scan started while a split was in progress does not
384 : * miss any tuples.
385 : *
386 : * 'bufP' points to the current buffer, which is pinned and read-locked.
387 : * On success exit, we have pin and read-lock on whichever page
388 : * contains the right item; on failure, we have released all buffers.
389 : */
390 : bool
391 16559 : _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
392 : {
393 16559 : Relation rel = scan->indexRelation;
394 16559 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
395 : ItemPointer current;
396 : Buffer buf;
397 : Page page;
398 : HashPageOpaque opaque;
399 : OffsetNumber maxoff;
400 : OffsetNumber offnum;
401 : BlockNumber blkno;
402 : IndexTuple itup;
403 :
404 16559 : current = &(so->hashso_curpos);
405 :
406 16559 : buf = *bufP;
407 16559 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
408 16559 : page = BufferGetPage(buf);
409 16559 : opaque = (HashPageOpaque) PageGetSpecialPointer(page);
410 :
411 : /*
412 : * If _hash_step is called from _hash_first, current will not be valid, so
413 : * we can't dereference it. However, in that case, we presumably want to
414 : * start at the beginning/end of the page...
415 : */
416 16559 : maxoff = PageGetMaxOffsetNumber(page);
417 16559 : if (ItemPointerIsValid(current))
418 16534 : offnum = ItemPointerGetOffsetNumber(current);
419 : else
420 25 : offnum = InvalidOffsetNumber;
421 :
422 : /*
423 : * 'offnum' now points to the last tuple we examined (if any).
424 : *
425 : * Continue to step through tuples until either: 1) we reach the end
426 : * of the bucket chain, or 2) we find a valid tuple.
427 : */
428 : do
429 : {
430 16559 : switch (dir)
431 : {
432 : case ForwardScanDirection:
433 11058 : if (offnum != InvalidOffsetNumber)
434 11034 : offnum = OffsetNumberNext(offnum); /* move forward */
435 : else
436 : {
437 : /* new page, locate starting position by binary search */
438 24 : offnum = _hash_binsearch(page, so->hashso_sk_hash);
439 : }
440 :
441 : for (;;)
442 : {
443 : /*
444 : * check if we're still in the range of items with the
445 : * target hash key
446 : */
447 11078 : if (offnum <= maxoff)
448 : {
449 11056 : Assert(offnum >= FirstOffsetNumber);
450 11056 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
451 :
452 : /*
453 : * Skip tuples that were moved by a split
454 : * operation, if this scan started while the
455 : * split was in progress.
456 : */
457 11056 : if (so->hashso_buc_populated && !so->hashso_buc_split &&
458 0 : (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
459 : {
460 0 : offnum = OffsetNumberNext(offnum); /* move forward */
461 0 : continue;
462 : }
463 :
464 11056 : if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
465 11034 : break; /* yes, so exit for-loop */
466 : }
467 :
468 : /* Before leaving current page, deal with any killed items */
469 44 : if (so->numKilled > 0)
470 0 : _hash_kill_items(scan);
471 :
472 : /*
473 : * ran off the end of this page, try the next
474 : */
475 44 : _hash_readnext(scan, &buf, &page, &opaque);
476 44 : if (BufferIsValid(buf))
477 : {
478 20 : maxoff = PageGetMaxOffsetNumber(page);
479 20 : offnum = _hash_binsearch(page, so->hashso_sk_hash);
480 : }
481 : else
482 : {
483 24 : itup = NULL;
484 24 : break; /* exit for-loop */
485 : }
486 20 : }
487 11058 : break;
488 :
489 : case BackwardScanDirection:
490 5501 : if (offnum != InvalidOffsetNumber)
491 5500 : offnum = OffsetNumberPrev(offnum); /* move back */
492 : else
493 : {
494 : /* new page, locate starting position by binary search */
495 1 : offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
496 : }
497 :
498 : for (;;)
499 : {
500 : /*
501 : * check if we're still in the range of items with the
502 : * target hash key
503 : */
504 5511 : if (offnum >= FirstOffsetNumber)
505 : {
506 5500 : Assert(offnum <= maxoff);
507 5500 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
508 :
509 : /*
510 : * Skip tuples that were moved by a split
511 : * operation, if this scan started while the
512 : * split was in progress.
513 : */
514 5500 : if (so->hashso_buc_populated && !so->hashso_buc_split &&
515 0 : (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
516 : {
517 0 : offnum = OffsetNumberPrev(offnum); /* move back */
518 0 : continue;
519 : }
520 :
521 5500 : if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
522 5500 : break; /* yes, so exit for-loop */
523 : }
524 :
525 : /* Before leaving current page, deal with any killed items */
526 11 : if (so->numKilled > 0)
527 0 : _hash_kill_items(scan);
528 :
529 : /*
530 : * ran off the end of this page, try the next
531 : */
532 11 : _hash_readprev(scan, &buf, &page, &opaque);
533 11 : if (BufferIsValid(buf))
534 : {
535 10 : TestForOldSnapshot(scan->xs_snapshot, rel, page);
536 10 : maxoff = PageGetMaxOffsetNumber(page);
537 10 : offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
538 : }
539 : else
540 : {
541 1 : itup = NULL;
542 1 : break; /* exit for-loop */
543 : }
544 10 : }
545 5501 : break;
546 :
547 : default:
548 : /* NoMovementScanDirection */
549 : /* this should not be reached */
550 0 : itup = NULL;
551 0 : break;
552 : }
553 :
554 16559 : if (itup == NULL)
555 : {
556 : /*
557 : * We ran off the end of the bucket without finding a match.
558 : * Release the pins on the bucket buffers. Normally, such pins
559 : * are released at the end of the scan; however, a scrollable
560 : * cursor can reacquire the bucket lock and pin multiple times
561 : * within the same scan.
562 : */
563 25 : *bufP = so->hashso_curbuf = InvalidBuffer;
564 25 : ItemPointerSetInvalid(current);
565 25 : _hash_dropscanbuf(rel, so);
566 25 : return false;
567 : }
568 :
569 : /* check the tuple quals, loop around if not met */
570 16534 : } while (!_hash_checkqual(scan, itup));
571 :
572 : /* if we made it to here, we've found a valid tuple */
573 16534 : blkno = BufferGetBlockNumber(buf);
574 16534 : *bufP = so->hashso_curbuf = buf;
575 16534 : ItemPointerSet(current, blkno, offnum);
576 16534 : return true;
577 : }
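
To isolate the positioning logic from the page-hopping and split handling above, here is a hypothetical single-page probe in the forward direction. It relies on index pages being kept sorted by hash key, which is why _hash_binsearch lands on the first candidate offset:

static OffsetNumber
first_match_on_page_sketch(Page page, uint32 hashkey)
{
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	OffsetNumber offnum = _hash_binsearch(page, hashkey);

	if (offnum <= maxoff)
	{
		IndexTuple	itup = (IndexTuple)
			PageGetItem(page, PageGetItemId(page, offnum));

		/* tuples are hash-key ordered, so one equality test suffices */
		if (_hash_get_indextuple_hashkey(itup) == hashkey)
			return offnum;
	}
	return InvalidOffsetNumber;	/* no match; caller moves to the next page */
}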