Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hash.c
4 : * Implementation of Margo Seltzer's Hashing package for postgres.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hash.c
12 : *
13 : * NOTES
14 : * This file contains only the public interface routines.
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 :
19 : #include "postgres.h"
20 :
21 : #include "access/hash.h"
22 : #include "access/hash_xlog.h"
23 : #include "access/relscan.h"
24 : #include "catalog/index.h"
25 : #include "commands/vacuum.h"
26 : #include "miscadmin.h"
27 : #include "optimizer/plancat.h"
28 : #include "utils/builtins.h"
29 : #include "utils/index_selfuncs.h"
30 : #include "utils/rel.h"
31 :
32 :
33 :
34 : /* Working state for hashbuild and its callback */
35 : typedef struct
36 : {
37 : HSpool *spool; /* NULL if not using spooling */
38 : double indtuples; /* # tuples accepted into index */
39 : Relation heapRel; /* heap relation descriptor */
40 : } HashBuildState;
41 :
42 : static void hashbuildCallback(Relation index,
43 : HeapTuple htup,
44 : Datum *values,
45 : bool *isnull,
46 : bool tupleIsAlive,
47 : void *state);
48 :
49 :
50 : /*
51 : * Hash handler function: return IndexAmRoutine with access method parameters
52 : * and callbacks.
53 : */
54 : Datum
55 134 : hashhandler(PG_FUNCTION_ARGS)
56 : {
57 134 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
58 :
59 134 : amroutine->amstrategies = HTMaxStrategyNumber;
60 134 : amroutine->amsupport = HASHNProcs;
61 134 : amroutine->amcanorder = false;
62 134 : amroutine->amcanorderbyop = false;
63 134 : amroutine->amcanbackward = true;
64 134 : amroutine->amcanunique = false;
65 134 : amroutine->amcanmulticol = false;
66 134 : amroutine->amoptionalkey = false;
67 134 : amroutine->amsearcharray = false;
68 134 : amroutine->amsearchnulls = false;
69 134 : amroutine->amstorage = false;
70 134 : amroutine->amclusterable = false;
71 134 : amroutine->ampredlocks = false;
72 134 : amroutine->amcanparallel = false;
73 134 : amroutine->amkeytype = INT4OID;
74 :
75 134 : amroutine->ambuild = hashbuild;
76 134 : amroutine->ambuildempty = hashbuildempty;
77 134 : amroutine->aminsert = hashinsert;
78 134 : amroutine->ambulkdelete = hashbulkdelete;
79 134 : amroutine->amvacuumcleanup = hashvacuumcleanup;
80 134 : amroutine->amcanreturn = NULL;
81 134 : amroutine->amcostestimate = hashcostestimate;
82 134 : amroutine->amoptions = hashoptions;
83 134 : amroutine->amproperty = NULL;
84 134 : amroutine->amvalidate = hashvalidate;
85 134 : amroutine->ambeginscan = hashbeginscan;
86 134 : amroutine->amrescan = hashrescan;
87 134 : amroutine->amgettuple = hashgettuple;
88 134 : amroutine->amgetbitmap = hashgetbitmap;
89 134 : amroutine->amendscan = hashendscan;
90 134 : amroutine->ammarkpos = NULL;
91 134 : amroutine->amrestrpos = NULL;
92 134 : amroutine->amestimateparallelscan = NULL;
93 134 : amroutine->aminitparallelscan = NULL;
94 134 : amroutine->amparallelrescan = NULL;
95 :
96 134 : PG_RETURN_POINTER(amroutine);
97 : }
98 :
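/*
 * For context (added illustration, not part of the original file): core code
 * never calls hashhandler() directly.  The handler's OID is stored in
 * pg_am.amhandler and the IndexAmRoutine is obtained through the function
 * manager.  A condensed sketch of that lookup, closely following what
 * GetIndexAmRoutine() in amapi.c does (error checking omitted):
 */
static IndexAmRoutine *
get_hash_am_routine_sketch(Oid amhandler)
{
	/* invoke the handler (e.g. hashhandler) by OID; it returns a node */
	Datum		datum = OidFunctionCall0(amhandler);

	return (IndexAmRoutine *) DatumGetPointer(datum);
}
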
99 : /*
100 : * hashbuild() -- build a new hash index.
101 : */
102 : IndexBuildResult *
103 14 : hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
104 : {
105 : IndexBuildResult *result;
106 : BlockNumber relpages;
107 : double reltuples;
108 : double allvisfrac;
109 : uint32 num_buckets;
110 : long sort_threshold;
111 : HashBuildState buildstate;
112 :
113 : /*
114 : * We expect to be called exactly once for any index relation. If that's
115 : * not the case, big trouble's what we have.
116 : */
117 14 : if (RelationGetNumberOfBlocks(index) != 0)
118 0 : elog(ERROR, "index \"%s\" already contains data",
119 : RelationGetRelationName(index));
120 :
121 : /* Estimate the number of rows currently present in the table */
122 14 : estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
123 :
124 : /* Initialize the hash index metadata page and initial buckets */
125 14 : num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
126 :
127 : /*
128 : * If we just insert the tuples into the index in scan order, then
129 : * (assuming their hash codes are pretty random) there will be no locality
130 : * of access to the index, and if the index is bigger than available RAM
131 : * then we'll thrash horribly. To prevent that scenario, we can sort the
132 : * tuples by (expected) bucket number. However, such a sort is useless
133 : * overhead when the index does fit in RAM. We choose to sort if the
134 : * initial index size exceeds maintenance_work_mem, or the number of
135 : * buffers usable for the index, whichever is less. (Limiting by the
136 : * number of buffers should reduce thrashing between PG buffers and kernel
137 : * buffers, which seems useful even if no physical I/O results. Limiting
138 : * by maintenance_work_mem is useful to allow easy testing of the sort
139 : * code path, and may be useful to DBAs as an additional control knob.)
140 : *
141 : * NOTE: this test will need adjustment if a bucket is ever different from
142 : * one page. Also, "initial index size" accounting does not include the
143 : * metapage, nor the first bitmap page.
144 : */
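/*
 * Worked example (added for illustration, assuming stock defaults that a
 * given installation may override): maintenance_work_mem = 64MB with
 * BLCKSZ = 8192 allows 65536 * 1024 / 8192 = 8192 pages, and
 * shared_buffers = 128MB gives NBuffers = 16384 for a non-temporary index,
 * so sort_threshold = Min(8192, 16384) = 8192 and we spool/sort only when
 * the index starts out with at least 8192 buckets (about 64MB of primary
 * bucket pages).
 */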
145 14 : sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
146 14 : if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
147 13 : sort_threshold = Min(sort_threshold, NBuffers);
148 : else
149 1 : sort_threshold = Min(sort_threshold, NLocBuffer);
150 :
151 14 : if (num_buckets >= (uint32) sort_threshold)
152 1 : buildstate.spool = _h_spoolinit(heap, index, num_buckets);
153 : else
154 13 : buildstate.spool = NULL;
155 :
156 : /* prepare to build the index */
157 14 : buildstate.indtuples = 0;
158 14 : buildstate.heapRel = heap;
159 :
160 : /* do the heap scan */
161 14 : reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
162 : hashbuildCallback, (void *) &buildstate);
163 :
164 14 : if (buildstate.spool)
165 : {
166 : /* sort the tuples and insert them into the index */
167 1 : _h_indexbuild(buildstate.spool, buildstate.heapRel);
168 1 : _h_spooldestroy(buildstate.spool);
169 : }
170 :
171 : /*
172 : * Return statistics
173 : */
174 14 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
175 :
176 14 : result->heap_tuples = reltuples;
177 14 : result->index_tuples = buildstate.indtuples;
178 :
179 14 : return result;
180 : }
181 :
182 : /*
183 : * hashbuildempty() -- build an empty hash index in the initialization fork
184 : */
185 : void
186 1 : hashbuildempty(Relation index)
187 : {
188 1 : _hash_init(index, 0, INIT_FORKNUM);
189 1 : }
190 :
191 : /*
192 : * Per-tuple callback from IndexBuildHeapScan
193 : */
194 : static void
195 50543 : hashbuildCallback(Relation index,
196 : HeapTuple htup,
197 : Datum *values,
198 : bool *isnull,
199 : bool tupleIsAlive,
200 : void *state)
201 : {
202 50543 : HashBuildState *buildstate = (HashBuildState *) state;
203 : Datum index_values[1];
204 : bool index_isnull[1];
205 : IndexTuple itup;
206 :
207 : /* convert data to a hash key; on failure, do not insert anything */
208 50543 : if (!_hash_convert_tuple(index,
209 : values, isnull,
210 : index_values, index_isnull))
211 50543 : return;
212 :
213 : /* Either spool the tuple for sorting, or just put it into the index */
214 50543 : if (buildstate->spool)
215 10000 : _h_spool(buildstate->spool, &htup->t_self,
216 : index_values, index_isnull);
217 : else
218 : {
219 : /* form an index tuple and point it at the heap tuple */
220 40543 : itup = index_form_tuple(RelationGetDescr(index),
221 : index_values, index_isnull);
222 40543 : itup->t_tid = htup->t_self;
223 40543 : _hash_doinsert(index, itup, buildstate->heapRel);
224 40543 : pfree(itup);
225 : }
226 :
227 50543 : buildstate->indtuples += 1;
228 : }
229 :
230 : /*
231 : * hashinsert() -- insert an index tuple into a hash table.
232 : *
233 : * Hash on the heap tuple's key, form an index tuple with hash code.
234 : * Find the appropriate location for the new tuple, and put it there.
235 : */
236 : bool
237 30009 : hashinsert(Relation rel, Datum *values, bool *isnull,
238 : ItemPointer ht_ctid, Relation heapRel,
239 : IndexUniqueCheck checkUnique,
240 : IndexInfo *indexInfo)
241 : {
242 : Datum index_values[1];
243 : bool index_isnull[1];
244 : IndexTuple itup;
245 :
246 : /* convert data to a hash key; on failure, do not insert anything */
247 30009 : if (!_hash_convert_tuple(rel,
248 : values, isnull,
249 : index_values, index_isnull))
250 0 : return false;
251 :
252 : /* form an index tuple and point it at the heap tuple */
253 30009 : itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
254 30009 : itup->t_tid = *ht_ctid;
255 :
256 30009 : _hash_doinsert(rel, itup, heapRel);
257 :
258 30009 : pfree(itup);
259 :
260 30009 : return false;
261 : }
262 :
263 :
264 : /*
265 : * hashgettuple() -- Get the next tuple in the scan.
266 : */
267 : bool
268 16544 : hashgettuple(IndexScanDesc scan, ScanDirection dir)
269 : {
270 16544 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
271 16544 : Relation rel = scan->indexRelation;
272 : Buffer buf;
273 : Page page;
274 : OffsetNumber offnum;
275 : ItemPointer current;
276 : bool res;
277 :
278 : /* Hash indexes are always lossy since we store only the hash code */
279 16544 : scan->xs_recheck = true;
280 :
281 : /*
282 : * We hold a pin, but not a lock, on the current buffer while outside
283 : * the hash AM. Reacquire the read lock here.
284 : */
285 16544 : if (BufferIsValid(so->hashso_curbuf))
286 16520 : LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
287 :
288 : /*
289 : * If we've already initialized this scan, we can just advance it in the
290 : * appropriate direction. If we haven't done so yet, we call a routine to
291 : * get the first item in the scan.
292 : */
293 16544 : current = &(so->hashso_curpos);
294 16544 : if (ItemPointerIsValid(current))
295 : {
296 : /*
297 : * An insertion into the current index page could have happened while
298 : * we didn't have a read lock on it. Re-find our position by looking
299 : * for the TID we previously returned. (Because we hold a pin on the
300 : * primary bucket page, no deletions or splits could have occurred;
301 : * therefore we can expect that the TID still exists in the current
302 : * index page, at an offset >= where we were.)
303 : */
304 : OffsetNumber maxoffnum;
305 :
306 16520 : buf = so->hashso_curbuf;
307 16520 : Assert(BufferIsValid(buf));
308 16520 : page = BufferGetPage(buf);
309 :
310 : /*
311 : * We don't need to test for an old snapshot here, as the current buffer is
312 : * pinned, so vacuum can't clean the page.
313 : */
314 16520 : maxoffnum = PageGetMaxOffsetNumber(page);
315 33043 : for (offnum = ItemPointerGetOffsetNumber(current);
316 : offnum <= maxoffnum;
317 3 : offnum = OffsetNumberNext(offnum))
318 : {
319 : IndexTuple itup;
320 :
321 16523 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
322 16523 : if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
323 16520 : break;
324 : }
325 16520 : if (offnum > maxoffnum)
326 0 : elog(ERROR, "failed to re-find scan position within index \"%s\"",
327 : RelationGetRelationName(rel));
328 16520 : ItemPointerSetOffsetNumber(current, offnum);
329 :
330 : /*
331 : * Check to see if we should kill the previously-fetched tuple.
332 : */
333 16520 : if (scan->kill_prior_tuple)
334 : {
335 : /*
336 : * Yes, so remember it for later. (We'll deal with all such tuples
337 : * at once right after leaving the index page or at the end of the
338 : * scan.) If the caller reverses the indexscan direction, it is quite
339 : * possible that the same item will get entered multiple times; we
340 : * don't detect that, and instead just forget any excess
341 : * entries.
342 : */
343 0 : if (so->killedItems == NULL)
344 0 : so->killedItems = palloc(MaxIndexTuplesPerPage *
345 : sizeof(HashScanPosItem));
346 :
347 0 : if (so->numKilled < MaxIndexTuplesPerPage)
348 : {
349 0 : so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
350 0 : so->killedItems[so->numKilled].indexOffset =
351 0 : ItemPointerGetOffsetNumber(&(so->hashso_curpos));
352 0 : so->numKilled++;
353 : }
354 : }
355 :
356 : /*
357 : * Now continue the scan.
358 : */
359 16520 : res = _hash_next(scan, dir);
360 : }
361 : else
362 24 : res = _hash_first(scan, dir);
363 :
364 : /*
365 : * Skip killed tuples if asked to.
366 : */
367 16544 : if (scan->ignore_killed_tuples)
368 : {
369 33088 : while (res)
370 : {
371 16520 : offnum = ItemPointerGetOffsetNumber(current);
372 16520 : page = BufferGetPage(so->hashso_curbuf);
373 16520 : if (!ItemIdIsDead(PageGetItemId(page, offnum)))
374 16520 : break;
375 0 : res = _hash_next(scan, dir);
376 : }
377 : }
378 :
379 : /* Release read lock on current buffer, but keep it pinned */
380 16544 : if (BufferIsValid(so->hashso_curbuf))
381 16520 : LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
382 :
383 : /* Return current heap TID on success */
384 16544 : scan->xs_ctup.t_self = so->hashso_heappos;
385 :
386 16544 : return res;
387 : }
388 :
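/*
 * For context (added illustration, not part of the original file): the scan
 * entry points above are reached through the generic index AM layer in
 * indexam.c rather than being called directly.  A hypothetical sketch of
 * that calling pattern for a single-key scan, assuming the PG 10-era
 * index_getnext() interface; snapshot management and the xs_recheck
 * re-evaluation of quals against the heap tuple are omitted:
 */
static int64
hash_scan_count_sketch(Relation heap, Relation index, ScanKey key,
					   Snapshot snapshot)
{
	IndexScanDesc scan;
	int64		nmatches = 0;

	/* indexam.c dispatches to hashbeginscan and hashrescan */
	scan = index_beginscan(heap, index, snapshot, 1, 0);
	index_rescan(scan, key, 1, NULL, 0);

	/* each fetch goes through hashgettuple; lossy, so caller must recheck */
	while (index_getnext(scan, ForwardScanDirection) != NULL)
		nmatches++;

	index_endscan(scan);		/* dispatches to hashendscan */

	return nmatches;
}
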
389 :
390 : /*
391 : * hashgetbitmap() -- get all tuples at once
392 : */
393 : int64
394 1 : hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
395 : {
396 1 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
397 : bool res;
398 1 : int64 ntids = 0;
399 :
400 1 : res = _hash_first(scan, ForwardScanDirection);
401 :
402 16 : while (res)
403 : {
404 : bool add_tuple;
405 :
406 : /*
407 : * Skip killed tuples if asked to.
408 : */
409 14 : if (scan->ignore_killed_tuples)
410 : {
411 : Page page;
412 : OffsetNumber offnum;
413 :
414 14 : offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
415 14 : page = BufferGetPage(so->hashso_curbuf);
416 14 : add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
417 : }
418 : else
419 0 : add_tuple = true;
420 :
421 : /* Save tuple ID, and continue scanning */
422 14 : if (add_tuple)
423 : {
424 : /* Note we mark the tuple ID as requiring recheck */
425 14 : tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true);
426 14 : ntids++;
427 : }
428 :
429 14 : res = _hash_next(scan, ForwardScanDirection);
430 : }
431 :
432 1 : return ntids;
433 : }
434 :
435 :
436 : /*
437 : * hashbeginscan() -- start a scan on a hash index
438 : */
439 : IndexScanDesc
440 24 : hashbeginscan(Relation rel, int nkeys, int norderbys)
441 : {
442 : IndexScanDesc scan;
443 : HashScanOpaque so;
444 :
445 : /* no order by operators allowed */
446 24 : Assert(norderbys == 0);
447 :
448 24 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
449 :
450 24 : so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
451 24 : so->hashso_curbuf = InvalidBuffer;
452 24 : so->hashso_bucket_buf = InvalidBuffer;
453 24 : so->hashso_split_bucket_buf = InvalidBuffer;
454 : /* set position invalid (this will cause _hash_first call) */
455 24 : ItemPointerSetInvalid(&(so->hashso_curpos));
456 24 : ItemPointerSetInvalid(&(so->hashso_heappos));
457 :
458 24 : so->hashso_buc_populated = false;
459 24 : so->hashso_buc_split = false;
460 :
461 24 : so->killedItems = NULL;
462 24 : so->numKilled = 0;
463 :
464 24 : scan->opaque = so;
465 :
466 24 : return scan;
467 : }
468 :
469 : /*
470 : * hashrescan() -- rescan an index relation
471 : */
472 : void
473 25 : hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
474 : ScanKey orderbys, int norderbys)
475 : {
476 25 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
477 25 : Relation rel = scan->indexRelation;
478 :
479 : /*
480 : * Before leaving current page, deal with any killed items. Also, ensure
481 : * that we acquire lock on current page before calling _hash_kill_items.
482 : */
483 25 : if (so->numKilled > 0)
484 : {
485 0 : LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
486 0 : _hash_kill_items(scan);
487 0 : LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
488 : }
489 :
490 25 : _hash_dropscanbuf(rel, so);
491 :
492 : /* set position invalid (this will cause _hash_first call) */
493 25 : ItemPointerSetInvalid(&(so->hashso_curpos));
494 25 : ItemPointerSetInvalid(&(so->hashso_heappos));
495 :
496 : /* Update scan key, if a new one is given */
497 25 : if (scankey && scan->numberOfKeys > 0)
498 : {
499 25 : memmove(scan->keyData,
500 : scankey,
501 25 : scan->numberOfKeys * sizeof(ScanKeyData));
502 : }
503 :
504 25 : so->hashso_buc_populated = false;
505 25 : so->hashso_buc_split = false;
506 25 : }
507 :
508 : /*
509 : * hashendscan() -- close down a scan
510 : */
511 : void
512 24 : hashendscan(IndexScanDesc scan)
513 : {
514 24 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
515 24 : Relation rel = scan->indexRelation;
516 :
517 : /*
518 : * Before leaving current page, deal with any killed items. Also, ensure
519 : * that we acquire lock on current page before calling _hash_kill_items.
520 : */
521 24 : if (so->numKilled > 0)
522 : {
523 0 : LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
524 0 : _hash_kill_items(scan);
525 0 : LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
526 : }
527 :
528 24 : _hash_dropscanbuf(rel, so);
529 :
530 24 : if (so->killedItems != NULL)
531 0 : pfree(so->killedItems);
532 24 : pfree(so);
533 24 : scan->opaque = NULL;
534 24 : }
535 :
536 : /*
537 : * Bulk deletion of all index entries pointing to a set of heap tuples.
538 : * The set of target tuples is specified via a callback routine that tells
539 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
540 : *
541 : * This function also deletes the tuples that were moved by a split to
542 : * another bucket. (A sketch of the callback's shape follows the function.)
543 : *
544 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
545 : */
546 : IndexBulkDeleteResult *
547 1 : hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
548 : IndexBulkDeleteCallback callback, void *callback_state)
549 : {
550 1 : Relation rel = info->index;
551 : double tuples_removed;
552 : double num_index_tuples;
553 : double orig_ntuples;
554 : Bucket orig_maxbucket;
555 : Bucket cur_maxbucket;
556 : Bucket cur_bucket;
557 1 : Buffer metabuf = InvalidBuffer;
558 : HashMetaPage metap;
559 : HashMetaPage cachedmetap;
560 :
561 1 : tuples_removed = 0;
562 1 : num_index_tuples = 0;
563 :
564 : /*
565 : * We need a copy of the metapage so that we can use its hashm_spares[]
566 : * values to compute bucket page addresses, but a cached copy should be
567 : * good enough. (If not, we'll detect that further down and refresh the
568 : * cache as necessary.)
569 : */
570 1 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
571 1 : Assert(cachedmetap != NULL);
572 :
573 1 : orig_maxbucket = cachedmetap->hashm_maxbucket;
574 1 : orig_ntuples = cachedmetap->hashm_ntuples;
575 :
576 : /* Scan the buckets that we know exist */
577 1 : cur_bucket = 0;
578 1 : cur_maxbucket = orig_maxbucket;
579 :
580 : loop_top:
581 82 : while (cur_bucket <= cur_maxbucket)
582 : {
583 : BlockNumber bucket_blkno;
584 : BlockNumber blkno;
585 : Buffer bucket_buf;
586 : Buffer buf;
587 : HashPageOpaque bucket_opaque;
588 : Page page;
589 80 : bool split_cleanup = false;
590 :
591 : /* Get address of bucket's start page */
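/*
 * Illustrative note (added): BUCKET_TO_BLKNO() maps the bucket number to a
 * physical block roughly as bucket number + the overflow pages recorded in
 * hashm_spares[] for earlier split points + 1 for the metapage, which is
 * why the cached hashm_spares[] values obtained above are all we need to
 * locate every primary bucket page.
 */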
592 80 : bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
593 :
594 80 : blkno = bucket_blkno;
595 :
596 : /*
597 : * We need to acquire a cleanup lock on the primary bucket page to
598 : * wait out concurrent scans before deleting the dead tuples.
599 : */
600 80 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
601 80 : LockBufferForCleanup(buf);
602 80 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
603 :
604 80 : page = BufferGetPage(buf);
605 80 : bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
606 :
607 : /*
608 : * If the bucket contains tuples that were moved by a split, then we
609 : * need to delete them. We can't delete them while the split operation
610 : * on the bucket is unfinished, as those tuples are still needed by scans.
611 : */
612 160 : if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
613 80 : H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
614 : {
615 0 : split_cleanup = true;
616 :
617 : /*
618 : * This bucket might have been split since we last held a lock on
619 : * the metapage. If so, hashm_maxbucket, hashm_highmask and
620 : * hashm_lowmask might be old enough to cause us to fail to remove
621 : * tuples left behind by the most recent split. To prevent that,
622 : * now that the primary page of the target bucket has been locked
623 : * (and thus can't be further split), check whether we need to
624 : * update our cached metapage data.
625 : */
626 0 : Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
627 0 : if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
628 : {
629 0 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
630 0 : Assert(cachedmetap != NULL);
631 : }
632 : }
633 :
634 80 : bucket_buf = buf;
635 :
636 80 : hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
637 : cachedmetap->hashm_maxbucket,
638 : cachedmetap->hashm_highmask,
639 : cachedmetap->hashm_lowmask, &tuples_removed,
640 : &num_index_tuples, split_cleanup,
641 : callback, callback_state);
642 :
643 80 : _hash_dropbuf(rel, bucket_buf);
644 :
645 : /* Advance to next bucket */
646 80 : cur_bucket++;
647 : }
648 :
649 1 : if (BufferIsInvalid(metabuf))
650 0 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
651 :
652 : /* Write-lock metapage and check for split since we started */
653 1 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
654 1 : metap = HashPageGetMeta(BufferGetPage(metabuf));
655 :
656 1 : if (cur_maxbucket != metap->hashm_maxbucket)
657 : {
658 : /* There's been a split, so process the additional bucket(s) */
659 0 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
660 0 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
661 0 : Assert(cachedmetap != NULL);
662 0 : cur_maxbucket = cachedmetap->hashm_maxbucket;
663 0 : goto loop_top;
664 : }
665 :
666 : /* Okay, we're really done. Update tuple count in metapage. */
667 1 : START_CRIT_SECTION();
668 :
669 2 : if (orig_maxbucket == metap->hashm_maxbucket &&
670 1 : orig_ntuples == metap->hashm_ntuples)
671 : {
672 : /*
673 : * No one has split or inserted anything since start of scan, so
674 : * believe our count as gospel.
675 : */
676 1 : metap->hashm_ntuples = num_index_tuples;
677 : }
678 : else
679 : {
680 : /*
681 : * Otherwise, our count is untrustworthy since we may have
682 : * double-scanned tuples in split buckets. Proceed by dead-reckoning.
683 : * (Note: we still return estimated_count = false, because using this
684 : * count is better than not updating reltuples at all.)
685 : */
686 0 : if (metap->hashm_ntuples > tuples_removed)
687 0 : metap->hashm_ntuples -= tuples_removed;
688 : else
689 0 : metap->hashm_ntuples = 0;
690 0 : num_index_tuples = metap->hashm_ntuples;
691 : }
692 :
693 1 : MarkBufferDirty(metabuf);
694 :
695 : /* XLOG stuff */
696 1 : if (RelationNeedsWAL(rel))
697 : {
698 : xl_hash_update_meta_page xlrec;
699 : XLogRecPtr recptr;
700 :
701 1 : xlrec.ntuples = metap->hashm_ntuples;
702 :
703 1 : XLogBeginInsert();
704 1 : XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
705 :
706 1 : XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
707 :
708 1 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
709 1 : PageSetLSN(BufferGetPage(metabuf), recptr);
710 : }
711 :
712 1 : END_CRIT_SECTION();
713 :
714 1 : _hash_relbuf(rel, metabuf);
715 :
716 : /* return statistics */
717 1 : if (stats == NULL)
718 1 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
719 1 : stats->estimated_count = false;
720 1 : stats->num_index_tuples = num_index_tuples;
721 1 : stats->tuples_removed += tuples_removed;
722 : /* hashvacuumcleanup will fill in num_pages */
723 :
724 1 : return stats;
725 : }
726 :
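/*
 * For illustration (added, not part of the original file): the shape of the
 * deletion callback taken by hashbulkdelete() above.  This hypothetical
 * version assumes the caller tracks dead heap TIDs in a plain array; the
 * callback actually used by VACUUM is lazy_tid_reaped() in vacuumlazy.c,
 * which searches a sorted TID array instead.
 */
typedef struct DeadTidArraySketch
{
	ItemPointerData *dead_tids;	/* heap TIDs found to be dead */
	int			num_dead_tids;
} DeadTidArraySketch;

static bool
dead_tid_callback_sketch(ItemPointer itemptr, void *state)
{
	DeadTidArraySketch *dts = (DeadTidArraySketch *) state;
	int			i;

	for (i = 0; i < dts->num_dead_tids; i++)
	{
		/* returning true tells hashbucketcleanup to remove this entry */
		if (ItemPointerEquals(itemptr, &dts->dead_tids[i]))
			return true;
	}
	return false;
}
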
727 : /*
728 : * Post-VACUUM cleanup.
729 : *
730 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
731 : */
732 : IndexBulkDeleteResult *
733 5 : hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
734 : {
735 5 : Relation rel = info->index;
736 : BlockNumber num_pages;
737 :
738 : /* If hashbulkdelete wasn't called, return NULL signifying no change */
739 : /* Note: this covers the analyze_only case too */
740 5 : if (stats == NULL)
741 4 : return NULL;
742 :
743 : /* update statistics */
744 1 : num_pages = RelationGetNumberOfBlocks(rel);
745 1 : stats->num_pages = num_pages;
746 :
747 1 : return stats;
748 : }
749 :
750 : /*
751 : * Helper function to perform deletion of index entries from a bucket.
752 : *
753 : * This function expects that the caller has acquired a cleanup lock on the
754 : * primary bucket page, and will return with a write lock again held on the
755 : * primary bucket page. The lock won't necessarily be held continuously,
756 : * though, because we'll release it when visiting overflow pages.
757 : *
758 : * It would be very bad if this function cleaned a page while some other
759 : * backend was in the midst of scanning it, because hashgettuple assumes
760 : * that the next valid TID will be greater than or equal to the current
761 : * valid TID. There can't be any concurrent scans in progress when we first
762 : * enter this function because of the cleanup lock we hold on the primary
763 : * bucket page, but as soon as we release that lock, there might be. We
764 : * handle that by conspiring to prevent those scans from passing our cleanup
765 : * scan. To do that, we lock the next page in the bucket chain before
766 : * releasing the lock on the previous page. (This type of lock chaining is
767 : * not ideal, so we might want to look for a better solution at some point.)
768 : *
769 : * We need to retain a pin on the primary bucket to ensure that no concurrent
770 : * split can start.
771 : */
772 : void
773 153 : hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
774 : BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
775 : uint32 maxbucket, uint32 highmask, uint32 lowmask,
776 : double *tuples_removed, double *num_index_tuples,
777 : bool split_cleanup,
778 : IndexBulkDeleteCallback callback, void *callback_state)
779 : {
780 : BlockNumber blkno;
781 : Buffer buf;
782 153 : Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
783 153 : bool bucket_dirty = false;
784 :
785 153 : blkno = bucket_blkno;
786 153 : buf = bucket_buf;
787 :
788 153 : if (split_cleanup)
789 73 : new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
790 : lowmask, maxbucket);
791 :
792 : /* Scan each page in bucket */
793 : for (;;)
794 : {
795 : HashPageOpaque opaque;
796 : OffsetNumber offno;
797 : OffsetNumber maxoffno;
798 : Buffer next_buf;
799 : Page page;
800 : OffsetNumber deletable[MaxOffsetNumber];
801 215 : int ndeletable = 0;
802 215 : bool retain_pin = false;
803 215 : bool clear_dead_marking = false;
804 :
805 215 : vacuum_delay_point();
806 :
807 215 : page = BufferGetPage(buf);
808 215 : opaque = (HashPageOpaque) PageGetSpecialPointer(page);
809 :
810 : /* Scan each tuple in page */
811 215 : maxoffno = PageGetMaxOffsetNumber(page);
812 75123 : for (offno = FirstOffsetNumber;
813 : offno <= maxoffno;
814 74693 : offno = OffsetNumberNext(offno))
815 : {
816 : ItemPointer htup;
817 : IndexTuple itup;
818 : Bucket bucket;
819 74693 : bool kill_tuple = false;
820 :
821 74693 : itup = (IndexTuple) PageGetItem(page,
822 : PageGetItemId(page, offno));
823 74693 : htup = &(itup->t_tid);
824 :
825 : /*
826 : * To remove the dead tuples, we strictly want to rely on the results
827 : * of the callback function; see btvacuumpage for the detailed reason.
828 : */
829 74693 : if (callback && callback(htup, callback_state))
830 : {
831 5500 : kill_tuple = true;
832 11000 : if (tuples_removed)
833 5500 : *tuples_removed += 1;
834 : }
835 69193 : else if (split_cleanup)
836 : {
837 : /* delete the tuples that are moved by split. */
838 44193 : bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
839 : maxbucket,
840 : highmask,
841 : lowmask);
842 : /* mark the item for deletion */
843 44193 : if (bucket != cur_bucket)
844 : {
845 : /*
846 : * We expect tuples to belong to either the current bucket or
847 : * new_bucket. This is ensured because we don't allow
848 : * further splits of a bucket that contains garbage. See
849 : * comments in _hash_expandtable.
850 : */
851 16684 : Assert(bucket == new_bucket);
852 16684 : kill_tuple = true;
853 : }
854 : }
855 :
856 74693 : if (kill_tuple)
857 : {
858 : /* mark the item for deletion */
859 22184 : deletable[ndeletable++] = offno;
860 : }
861 : else
862 : {
863 : /* we're keeping it, so count it */
864 52509 : if (num_index_tuples)
865 25000 : *num_index_tuples += 1;
866 : }
867 : }
868 :
869 : /* retain the pin on primary bucket page till end of bucket scan */
870 215 : if (blkno == bucket_blkno)
871 153 : retain_pin = true;
872 : else
873 62 : retain_pin = false;
874 :
875 215 : blkno = opaque->hasho_nextblkno;
876 :
877 : /*
878 : * Apply deletions, advance to next page and write page if needed.
879 : */
880 215 : if (ndeletable > 0)
881 : {
882 : /* No ereport(ERROR) until changes are logged */
883 98 : START_CRIT_SECTION();
884 :
885 98 : PageIndexMultiDelete(page, deletable, ndeletable);
886 98 : bucket_dirty = true;
887 :
888 : /*
889 : * If vacuum removed any DEAD tuples from this index page, mark the
890 : * page as clean again by clearing the
891 : * LH_PAGE_HAS_DEAD_TUPLES flag.
892 : */
893 109 : if (tuples_removed && *tuples_removed > 0 &&
894 11 : H_HAS_DEAD_TUPLES(opaque))
895 : {
896 0 : opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
897 0 : clear_dead_marking = true;
898 : }
899 :
900 98 : MarkBufferDirty(buf);
901 :
902 : /* XLOG stuff */
903 98 : if (RelationNeedsWAL(rel))
904 : {
905 : xl_hash_delete xlrec;
906 : XLogRecPtr recptr;
907 :
908 98 : xlrec.clear_dead_marking = clear_dead_marking;
909 98 : xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
910 :
911 98 : XLogBeginInsert();
912 98 : XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
913 :
914 : /*
915 : * The bucket buffer needs to be registered to ensure that we can
916 : * acquire a cleanup lock on it during replay.
917 : */
918 98 : if (!xlrec.is_primary_bucket_page)
919 33 : XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
920 :
921 98 : XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
922 98 : XLogRegisterBufData(1, (char *) deletable,
923 98 : ndeletable * sizeof(OffsetNumber));
924 :
925 98 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
926 98 : PageSetLSN(BufferGetPage(buf), recptr);
927 : }
928 :
929 98 : END_CRIT_SECTION();
930 : }
931 :
932 : /* bail out if there are no more pages to scan. */
933 215 : if (!BlockNumberIsValid(blkno))
934 153 : break;
935 :
936 62 : next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
937 : LH_OVERFLOW_PAGE,
938 : bstrategy);
939 :
940 : /*
941 : * Release the lock on the previous page only after acquiring the lock
942 : * on the next page, so concurrent scans cannot pass our cleanup scan.
943 : */
944 62 : if (retain_pin)
945 13 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
946 : else
947 49 : _hash_relbuf(rel, buf);
948 :
949 62 : buf = next_buf;
950 62 : }
951 :
952 : /*
953 : * Lock the bucket page so we can clear the garbage flag and squeeze the
954 : * bucket. If the current buffer is the same as the bucket buffer, then
955 : * we already have a lock on the bucket page.
956 : */
957 153 : if (buf != bucket_buf)
958 : {
959 13 : _hash_relbuf(rel, buf);
960 13 : LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
961 : }
962 :
963 : /*
964 : * Clear the garbage flag from the bucket after deleting the tuples that
965 : * were moved by a split. We purposely clear the flag before squeezing the
966 : * bucket, so that after a restart, vacuum won't again try to delete the
967 : * moved-by-split tuples.
968 : */
969 153 : if (split_cleanup)
970 : {
971 : HashPageOpaque bucket_opaque;
972 : Page page;
973 :
974 73 : page = BufferGetPage(bucket_buf);
975 73 : bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
976 :
977 : /* No ereport(ERROR) until changes are logged */
978 73 : START_CRIT_SECTION();
979 :
980 73 : bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
981 73 : MarkBufferDirty(bucket_buf);
982 :
983 : /* XLOG stuff */
984 73 : if (RelationNeedsWAL(rel))
985 : {
986 : XLogRecPtr recptr;
987 :
988 73 : XLogBeginInsert();
989 73 : XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
990 :
991 73 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
992 73 : PageSetLSN(page, recptr);
993 : }
994 :
995 73 : END_CRIT_SECTION();
996 : }
997 :
998 : /*
999 : * If we have deleted anything, try to compact free space. To squeeze
1000 : * the bucket, we must hold a cleanup lock; otherwise the squeeze could
1001 : * disturb the ordering of tuples seen by a scan that started before it.
1002 : */
1003 153 : if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
1004 67 : _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
1005 : bstrategy);
1006 : else
1007 86 : LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
1008 153 : }
|