Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hashovfl.c
4 : * Overflow page management code for the Postgres hash access method
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hashovfl.c
12 : *
13 : * NOTES
14 : * Overflow pages look like ordinary relation pages.
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 :
20 : #include "access/hash.h"
21 : #include "access/hash_xlog.h"
22 : #include "miscadmin.h"
23 : #include "utils/rel.h"
24 :
25 :
26 : static uint32 _hash_firstfreebit(uint32 map);
27 :
28 :
29 : /*
30 : * Convert overflow page bit number (its index in the free-page bitmaps)
31 : * to block number within the index.
32 : */
33 : static BlockNumber
34 30 : bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
35 : {
36 30 : uint32 splitnum = metap->hashm_ovflpoint;
37 : uint32 i;
38 :
39 : /* Convert zero-based bitnumber to 1-based page number */
40 30 : ovflbitnum += 1;
41 :
42 : /* Determine the split number for this page (must be >= 1) */
43 190 : for (i = 1;
44 134 : i < splitnum && ovflbitnum > metap->hashm_spares[i];
45 130 : i++)
46 : /* loop */ ;
47 :
48 : /*
49 : * Convert to absolute page number by adding the number of bucket pages
50 : * that exist before this split point.
51 : */
52 30 : return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
53 : }
54 :
55 : /*
56 : * _hash_ovflblkno_to_bitno
57 : *
58 : * Convert overflow page block number to bit number for free-page bitmap.
59 : */
60 : uint32
61 30 : _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
62 : {
63 30 : uint32 splitnum = metap->hashm_ovflpoint;
64 : uint32 i;
65 : uint32 bitnum;
66 :
67 : /* Determine the split number containing this page */
68 160 : for (i = 1; i <= splitnum; i++)
69 : {
70 160 : if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
71 0 : break; /* oops */
72 160 : bitnum = ovflblkno - _hash_get_totalbuckets(i);
73 :
74 : /*
75 : * bitnum has to be greater than number of overflow page added in
76 : * previous split point. The overflow page at this splitnum (i) if any
77 : * should start from (_hash_get_totalbuckets(i) +
78 : * metap->hashm_spares[i - 1] + 1).
79 : */
80 320 : if (bitnum > metap->hashm_spares[i - 1] &&
81 160 : bitnum <= metap->hashm_spares[i])
82 30 : return bitnum - 1; /* -1 to convert 1-based to 0-based */
83 : }
84 :
85 0 : ereport(ERROR,
86 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
87 : errmsg("invalid overflow block number %u", ovflblkno)));
88 : return 0; /* keep compiler quiet */
89 : }
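/*
 * Editorial aside (not part of hashovfl.c): a minimal, self-contained sketch
 * of the bit-number <-> block-number arithmetic used by bitno_to_blkno() and
 * _hash_ovflblkno_to_bitno() above.  The spares[] and totalbuckets[] values
 * below are made up for illustration; the real code reads hashm_spares[] from
 * the metapage and computes bucket counts with _hash_get_totalbuckets().
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SPLITNUM 3

/* hypothetical: overflow/bitmap pages allocated at or before split point i */
static const uint32_t spares[SPLITNUM + 1] = {0, 1, 3, 6};
/* hypothetical: total bucket pages existing once split point i is complete */
static const uint32_t totalbuckets[SPLITNUM + 1] = {0, 2, 4, 8};

static uint32_t
toy_bitno_to_blkno(uint32_t bitnum)
{
    uint32_t    page = bitnum + 1;  /* 1-based page number within spares */
    uint32_t    i;

    /* find the split point whose spares range contains this page */
    for (i = 1; i < SPLITNUM && page > spares[i]; i++)
        ;
    return totalbuckets[i] + page;
}

static uint32_t
toy_blkno_to_bitno(uint32_t blkno)
{
    uint32_t    i;

    for (i = 1; i <= SPLITNUM; i++)
    {
        uint32_t    bitnum;

        if (blkno <= totalbuckets[i])
            break;              /* not an overflow page of this layout */
        bitnum = blkno - totalbuckets[i];
        if (bitnum > spares[i - 1] && bitnum <= spares[i])
            return bitnum - 1;  /* back to 0-based bit number */
    }
    return UINT32_MAX;          /* invalid block number */
}

int
main(void)
{
    uint32_t    bit;

    /* the two conversions are inverses for every allocated overflow bit */
    for (bit = 0; bit < spares[SPLITNUM]; bit++)
    {
        uint32_t    blkno = toy_bitno_to_blkno(bit);

        printf("bit %u -> block %u\n", (unsigned) bit, (unsigned) blkno);
        assert(toy_blkno_to_bitno(blkno) == bit);
    }
    return 0;
}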
90 :
91 : /*
92 : * _hash_addovflpage
93 : *
94 : * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
95 : *
96 : * On entry, the caller must hold a pin but no lock on 'buf'. Unless asked
97 : * to retain it, the pin is dropped before exiting (we assume the caller is
98 : * not interested in 'buf' anymore); the pin can be retained only for the
99 : * primary bucket page. The returned overflow page will be pinned and
100 : * write-locked; it is guaranteed to be empty.
101 : *
102 : * The caller must hold a pin, but no lock, on the metapage buffer.
103 : * That buffer is returned in the same state.
104 : *
105 : * NB: since this could be executed concurrently by multiple processes,
106 : * one should not assume that the returned overflow page will be the
107 : * immediate successor of the originally passed 'buf'. Additional overflow
108 : * pages might have been added to the bucket chain in between.
109 : */
110 : Buffer
111 30 : _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
112 : {
113 : Buffer ovflbuf;
114 : Page page;
115 : Page ovflpage;
116 : HashPageOpaque pageopaque;
117 : HashPageOpaque ovflopaque;
118 : HashMetaPage metap;
119 30 : Buffer mapbuf = InvalidBuffer;
120 30 : Buffer newmapbuf = InvalidBuffer;
121 : BlockNumber blkno;
122 : uint32 orig_firstfree;
123 : uint32 splitnum;
124 30 : uint32 *freep = NULL;
125 : uint32 max_ovflpg;
126 : uint32 bit;
127 : uint32 bitmap_page_bit;
128 : uint32 first_page;
129 : uint32 last_bit;
130 : uint32 last_page;
131 : uint32 i,
132 : j;
133 30 : bool page_found = false;
134 :
135 : /*
136 : * Write-lock the tail page. We must maintain the following locking order:
137 : * first acquire the lock on the tail page of the bucket, then the lock on
138 : * the meta page to find and lock a bitmap page (the meta page lock is
139 : * released once the bitmap page is found), and finally the lock on the new
140 : * overflow buffer. We need this locking order to avoid deadlock with
141 : * backends that are doing inserts.
142 : *
143 : * Note: We could have avoided locking many buffers here if we made two
144 : * WAL records for acquiring an overflow page (one to allocate an overflow
145 : * page and another to add it to the overflow bucket chain). However,
146 : * doing so can leak an overflow page if the system crashes after allocation.
147 : * Needless to say, it is better to have a single record from a
148 : * performance point of view as well.
149 : */
150 30 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
151 :
152 : /* probably redundant... */
153 30 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
154 :
155 : /* loop to find current tail page, in case someone else inserted too */
156 : for (;;)
157 : {
158 : BlockNumber nextblkno;
159 :
160 30 : page = BufferGetPage(buf);
161 30 : pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
162 30 : nextblkno = pageopaque->hasho_nextblkno;
163 :
164 30 : if (!BlockNumberIsValid(nextblkno))
165 30 : break;
166 :
167 : /* we assume we do not need to write the unmodified page */
168 0 : if (retain_pin)
169 : {
170 : /* pin will be retained only for the primary bucket page */
171 0 : Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
172 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
173 : }
174 : else
175 0 : _hash_relbuf(rel, buf);
176 :
177 0 : retain_pin = false;
178 :
179 0 : buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
180 0 : }
181 :
182 : /* Get exclusive lock on the meta page */
183 30 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
184 :
185 30 : _hash_checkpage(rel, metabuf, LH_META_PAGE);
186 30 : metap = HashPageGetMeta(BufferGetPage(metabuf));
187 :
188 : /* start search at hashm_firstfree */
189 30 : orig_firstfree = metap->hashm_firstfree;
190 30 : first_page = orig_firstfree >> BMPG_SHIFT(metap);
191 30 : bit = orig_firstfree & BMPG_MASK(metap);
192 30 : i = first_page;
193 30 : j = bit / BITS_PER_MAP;
194 30 : bit &= ~(BITS_PER_MAP - 1);
195 :
196 : /* outer loop iterates once per bitmap page */
197 : for (;;)
198 : {
199 : BlockNumber mapblkno;
200 : Page mappage;
201 : uint32 last_inpage;
202 :
203 : /* want to end search with the last existing overflow page */
204 52 : splitnum = metap->hashm_ovflpoint;
205 52 : max_ovflpg = metap->hashm_spares[splitnum] - 1;
206 52 : last_page = max_ovflpg >> BMPG_SHIFT(metap);
207 52 : last_bit = max_ovflpg & BMPG_MASK(metap);
208 :
209 52 : if (i > last_page)
210 22 : break;
211 :
212 30 : Assert(i < metap->hashm_nmaps);
213 30 : mapblkno = metap->hashm_mapp[i];
214 :
215 30 : if (i == last_page)
216 30 : last_inpage = last_bit;
217 : else
218 0 : last_inpage = BMPGSZ_BIT(metap) - 1;
219 :
220 : /* Release exclusive lock on metapage while reading bitmap page */
221 30 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
222 :
223 30 : mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
224 30 : mappage = BufferGetPage(mapbuf);
225 30 : freep = HashPageGetBitmap(mappage);
226 :
227 52 : for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
228 : {
229 30 : if (freep[j] != ALL_SET)
230 : {
231 8 : page_found = true;
232 :
233 : /* Reacquire exclusive lock on the meta page */
234 8 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
235 :
236 : /* convert bit to bit number within page */
237 8 : bit += _hash_firstfreebit(freep[j]);
238 8 : bitmap_page_bit = bit;
239 :
240 : /* convert bit to absolute bit number */
241 8 : bit += (i << BMPG_SHIFT(metap));
242 : /* Calculate address of the recycled overflow page */
243 8 : blkno = bitno_to_blkno(metap, bit);
244 :
245 : /* Fetch and init the recycled page */
246 8 : ovflbuf = _hash_getinitbuf(rel, blkno);
247 :
248 8 : goto found;
249 : }
250 : }
251 :
252 : /* No free space here, try to advance to next map page */
253 22 : _hash_relbuf(rel, mapbuf);
254 22 : mapbuf = InvalidBuffer;
255 22 : i++;
256 22 : j = 0; /* scan from start of next map page */
257 22 : bit = 0;
258 :
259 : /* Reacquire exclusive lock on the meta page */
260 22 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
261 22 : }
262 :
263 : /*
264 : * No free pages --- have to extend the relation to add an overflow page.
265 : * First, check to see if we have to add a new bitmap page too.
266 : */
267 22 : if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
268 : {
269 : /*
270 : * We create the new bitmap page with all pages marked "in use".
271 : * Actually two pages in the new bitmap's range will exist
272 : * immediately: the bitmap page itself, and the following page which
273 : * is the one we return to the caller. Both of these are correctly
274 : * marked "in use". Subsequent pages do not exist yet, but it is
275 : * convenient to pre-mark them as "in use" too.
276 : */
277 0 : bit = metap->hashm_spares[splitnum];
278 :
279 : /* metapage already has a write lock */
280 0 : if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
281 0 : ereport(ERROR,
282 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
283 : errmsg("out of overflow pages in hash index \"%s\"",
284 : RelationGetRelationName(rel))));
285 :
286 0 : newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
287 : }
288 : else
289 : {
290 : /*
291 : * Nothing to do here; since the page will be past the last used page,
292 : * we know its bitmap bit was preinitialized to "in use".
293 : */
294 : }
295 :
296 : /* Calculate address of the new overflow page */
297 44 : bit = BufferIsValid(newmapbuf) ?
298 22 : metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
299 22 : blkno = bitno_to_blkno(metap, bit);
300 :
301 : /*
302 : * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
303 : * relation length stays in sync with ours. XXX It's annoying to do this
304 : * with metapage write lock held; would be better to use a lock that
305 : * doesn't block incoming searches.
306 : *
307 : * It is okay to hold two buffer locks here (one on tail page of bucket
308 : * and other on new overflow page) since there cannot be anyone else
309 : * contending for access to ovflbuf.
310 : */
311 22 : ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
312 :
313 : found:
314 :
315 : /*
316 : * Do the update. No ereport(ERROR) until changes are logged. We want to
317 : * log the changes for the bitmap page and the overflow page together, to
318 : * avoid losing the newly added page if they were logged separately.
319 : */
320 30 : START_CRIT_SECTION();
321 :
322 30 : if (page_found)
323 : {
324 8 : Assert(BufferIsValid(mapbuf));
325 :
326 : /* mark page "in use" in the bitmap */
327 8 : SETBIT(freep, bitmap_page_bit);
328 8 : MarkBufferDirty(mapbuf);
329 : }
330 : else
331 : {
332 : /* update the count to indicate new overflow page is added */
333 22 : metap->hashm_spares[splitnum]++;
334 :
335 22 : if (BufferIsValid(newmapbuf))
336 : {
337 0 : _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
338 0 : MarkBufferDirty(newmapbuf);
339 :
340 : /* add the new bitmap page to the metapage's list of bitmaps */
341 0 : metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
342 0 : metap->hashm_nmaps++;
343 0 : metap->hashm_spares[splitnum]++;
344 0 : MarkBufferDirty(metabuf);
345 : }
346 :
347 : /*
348 : * For a new overflow page, we don't need to explicitly set the bit in
349 : * the bitmap page, as it is set to "in use" by default.
350 : */
351 : }
352 :
353 : /*
354 : * Adjust hashm_firstfree to avoid redundant searches. But don't risk
355 : * changing it if someone moved it while we were searching bitmap pages.
356 : */
357 30 : if (metap->hashm_firstfree == orig_firstfree)
358 : {
359 30 : metap->hashm_firstfree = bit + 1;
360 30 : MarkBufferDirty(metabuf);
361 : }
362 :
363 : /* initialize new overflow page */
364 30 : ovflpage = BufferGetPage(ovflbuf);
365 30 : ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
366 30 : ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
367 30 : ovflopaque->hasho_nextblkno = InvalidBlockNumber;
368 30 : ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
369 30 : ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
370 30 : ovflopaque->hasho_page_id = HASHO_PAGE_ID;
371 :
372 30 : MarkBufferDirty(ovflbuf);
373 :
374 : /* logically chain overflow page to previous page */
375 30 : pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
376 :
377 30 : MarkBufferDirty(buf);
378 :
379 : /* XLOG stuff */
380 30 : if (RelationNeedsWAL(rel))
381 : {
382 : XLogRecPtr recptr;
383 : xl_hash_add_ovfl_page xlrec;
384 :
385 30 : xlrec.bmpage_found = page_found;
386 30 : xlrec.bmsize = metap->hashm_bmsize;
387 :
388 30 : XLogBeginInsert();
389 30 : XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);
390 :
391 30 : XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
392 30 : XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));
393 :
394 30 : XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
395 :
396 30 : if (BufferIsValid(mapbuf))
397 : {
398 8 : XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
399 8 : XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
400 : }
401 :
402 30 : if (BufferIsValid(newmapbuf))
403 0 : XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);
404 :
405 30 : XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
406 30 : XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));
407 :
408 30 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);
409 :
410 30 : PageSetLSN(BufferGetPage(ovflbuf), recptr);
411 30 : PageSetLSN(BufferGetPage(buf), recptr);
412 :
413 30 : if (BufferIsValid(mapbuf))
414 8 : PageSetLSN(BufferGetPage(mapbuf), recptr);
415 :
416 30 : if (BufferIsValid(newmapbuf))
417 0 : PageSetLSN(BufferGetPage(newmapbuf), recptr);
418 :
419 30 : PageSetLSN(BufferGetPage(metabuf), recptr);
420 : }
421 :
422 30 : END_CRIT_SECTION();
423 :
424 30 : if (retain_pin)
425 10 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
426 : else
427 20 : _hash_relbuf(rel, buf);
428 :
429 30 : if (BufferIsValid(mapbuf))
430 8 : _hash_relbuf(rel, mapbuf);
431 :
432 30 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
433 :
434 30 : if (BufferIsValid(newmapbuf))
435 0 : _hash_relbuf(rel, newmapbuf);
436 :
437 30 : return ovflbuf;
438 : }
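/*
 * Editorial aside (not part of hashovfl.c): a sketch of how an overflow
 * page's bit number is decomposed into a bitmap page, a bitmap word, and a
 * bit within that word, mirroring the hashm_firstfree arithmetic in
 * _hash_addovflpage() above.  BMPG_BITS and BITS_PER_WORD are made-up
 * stand-ins for BMPGSZ_BIT(metap) and BITS_PER_MAP; the real code uses
 * shifts and masks because the bitmap size is a power of two, which is
 * equivalent to the division/modulo shown here.
 */
#include <stdint.h>
#include <stdio.h>

#define BMPG_BITS      (4096 * 8)  /* hypothetical bits per bitmap page */
#define BITS_PER_WORD  32          /* bits per uint32 bitmap word */

int
main(void)
{
    uint32_t    ovflbitno = 70000;  /* arbitrary example overflow bit number */

    uint32_t    bitmappage = ovflbitno / BMPG_BITS;     /* which bitmap page */
    uint32_t    bitinpage = ovflbitno % BMPG_BITS;      /* bit within that page */
    uint32_t    word = bitinpage / BITS_PER_WORD;       /* uint32 word to test */
    uint32_t    bitinword = bitinpage % BITS_PER_WORD;  /* bit within the word */

    printf("bit %u -> bitmap page %u, word %u, bit %u\n",
           (unsigned) ovflbitno, (unsigned) bitmappage,
           (unsigned) word, (unsigned) bitinword);

    /* reassemble the absolute bit number, as done after _hash_firstfreebit() */
    printf("reassembled: %u\n",
           (unsigned) ((bitmappage * BMPG_BITS) +
                       (word * BITS_PER_WORD) + bitinword));
    return 0;
}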
439 :
440 : /*
441 : * _hash_firstfreebit()
442 : *
443 : * Return the number of the first bit that is not set in the word 'map'.
444 : */
445 : static uint32
446 8 : _hash_firstfreebit(uint32 map)
447 : {
448 : uint32 i,
449 : mask;
450 :
451 8 : mask = 0x1;
452 117 : for (i = 0; i < BITS_PER_MAP; i++)
453 : {
454 117 : if (!(mask & map))
455 8 : return i;
456 109 : mask <<= 1;
457 : }
458 :
459 0 : elog(ERROR, "firstfreebit found no free bit");
460 :
461 : return 0; /* keep compiler quiet */
462 : }
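/*
 * Editorial aside (not part of hashovfl.c): _hash_firstfreebit() returns the
 * index of the lowest 0 bit, i.e. the number of trailing 1 bits in 'map'.
 * A loop-free equivalent, assuming a compiler that provides __builtin_ctz
 * (GCC/Clang) and a map that is not ALL_SET (the original raises an error in
 * that case):
 */
#include <assert.h>
#include <stdint.h>

static uint32_t
firstfreebit_ctz(uint32_t map)
{
    assert(map != UINT32_MAX);              /* original errors out for ALL_SET */
    return (uint32_t) __builtin_ctz(~map);  /* lowest 0 of map = lowest 1 of ~map */
}

int
main(void)
{
    assert(firstfreebit_ctz(0x00000000) == 0);
    assert(firstfreebit_ctz(0x00000007) == 3);  /* bits 0..2 set -> first free is 3 */
    return 0;
}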
463 :
464 : /*
465 : * _hash_freeovflpage() -
466 : *
467 : * Remove this overflow page from its bucket's chain, and mark the page as
468 : * free. On entry, ovflbuf is write-locked; it is released before exiting.
469 : *
470 : * Add the tuples (itups) to wbuf in this function. We could do that in the
471 : * caller as well, but the advantage of doing it here is that we can write
472 : * a single WAL record for the XLOG_HASH_SQUEEZE_PAGE operation. Adding the
473 : * tuples and removing the overflow page have to be done atomically;
474 : * otherwise, during replay on a standby, users might find duplicate records.
475 : *
476 : * Since this function is invoked in VACUUM, we provide an access strategy
477 : * parameter that controls fetches of the bucket pages.
478 : *
479 : * Returns the block number of the page that followed the given page
480 : * in the bucket, or InvalidBlockNumber if no following page.
481 : *
482 : * NB: the caller must not hold a lock on the metapage, nor on the page that
483 : * follows ovflbuf in the bucket chain. We don't acquire a lock on the page
484 : * preceding ovflbuf in the chain if it is the same as wbuf, because the
485 : * caller already holds a lock on it.
486 : */
487 : BlockNumber
488 30 : _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
489 : Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
490 : Size *tups_size, uint16 nitups,
491 : BufferAccessStrategy bstrategy)
492 : {
493 : HashMetaPage metap;
494 : Buffer metabuf;
495 : Buffer mapbuf;
496 : BlockNumber ovflblkno;
497 : BlockNumber prevblkno;
498 : BlockNumber blkno;
499 : BlockNumber nextblkno;
500 : BlockNumber writeblkno;
501 : HashPageOpaque ovflopaque;
502 : Page ovflpage;
503 : Page mappage;
504 : uint32 *freep;
505 : uint32 ovflbitno;
506 : int32 bitmappage,
507 : bitmapbit;
508 : Bucket bucket PG_USED_FOR_ASSERTS_ONLY;
509 30 : Buffer prevbuf = InvalidBuffer;
510 30 : Buffer nextbuf = InvalidBuffer;
511 30 : bool update_metap = false;
512 :
513 : /* Get information from the doomed page */
514 30 : _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
515 30 : ovflblkno = BufferGetBlockNumber(ovflbuf);
516 30 : ovflpage = BufferGetPage(ovflbuf);
517 30 : ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
518 30 : nextblkno = ovflopaque->hasho_nextblkno;
519 30 : prevblkno = ovflopaque->hasho_prevblkno;
520 30 : writeblkno = BufferGetBlockNumber(wbuf);
521 30 : bucket = ovflopaque->hasho_bucket;
522 :
523 : /*
524 : * Fix up the bucket chain. This is a doubly-linked list, so we must fix
525 : * up the bucket chain members behind and ahead of the overflow page being
526 : * deleted. Concurrency issues are avoided by using lock chaining as
527 : * described atop hashbucketcleanup.
528 : */
529 30 : if (BlockNumberIsValid(prevblkno))
530 : {
531 30 : if (prevblkno == writeblkno)
532 10 : prevbuf = wbuf;
533 : else
534 20 : prevbuf = _hash_getbuf_with_strategy(rel,
535 : prevblkno,
536 : HASH_WRITE,
537 : LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
538 : bstrategy);
539 : }
540 30 : if (BlockNumberIsValid(nextblkno))
541 0 : nextbuf = _hash_getbuf_with_strategy(rel,
542 : nextblkno,
543 : HASH_WRITE,
544 : LH_OVERFLOW_PAGE,
545 : bstrategy);
546 :
547 : /* Note: bstrategy is intentionally not used for metapage and bitmap */
548 :
549 : /* Read the metapage so we can determine which bitmap page to use */
550 30 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
551 30 : metap = HashPageGetMeta(BufferGetPage(metabuf));
552 :
553 : /* Identify which bit to clear */
554 30 : ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);
555 :
556 30 : bitmappage = ovflbitno >> BMPG_SHIFT(metap);
557 30 : bitmapbit = ovflbitno & BMPG_MASK(metap);
558 :
559 30 : if (bitmappage >= metap->hashm_nmaps)
560 0 : elog(ERROR, "invalid overflow bit number %u", ovflbitno);
561 30 : blkno = metap->hashm_mapp[bitmappage];
562 :
563 : /* Release metapage lock while we access the bitmap page */
564 30 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
565 :
566 : /* read the bitmap page to clear the bitmap bit */
567 30 : mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
568 30 : mappage = BufferGetPage(mapbuf);
569 30 : freep = HashPageGetBitmap(mappage);
570 30 : Assert(ISSET(freep, bitmapbit));
571 :
572 : /* Get write-lock on metapage to update firstfree */
573 30 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
574 :
575 : /* This operation needs to log multiple tuples; prepare WAL for that */
576 30 : if (RelationNeedsWAL(rel))
577 30 : XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);
578 :
579 30 : START_CRIT_SECTION();
580 :
581 : /*
582 : * we have to insert tuples on the "write" page, being careful to preserve
583 : * hashkey ordering. (If we insert many tuples into the same "write" page
584 : * it would be worth qsort'ing them).
585 : */
586 30 : if (nitups > 0)
587 : {
588 11 : _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
589 11 : MarkBufferDirty(wbuf);
590 : }
591 :
592 : /*
593 : * Reinitialize the freed overflow page. Just zeroing the page won't
594 : * work, because WAL replay routines expect pages to be initialized. See
595 : * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are
596 : * careful to make the special space valid here so that tools like
597 : * pageinspect won't get confused.
598 : */
599 30 : _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
600 :
601 30 : ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
602 :
603 30 : ovflopaque->hasho_prevblkno = InvalidBlockNumber;
604 30 : ovflopaque->hasho_nextblkno = InvalidBlockNumber;
605 30 : ovflopaque->hasho_bucket = -1;
606 30 : ovflopaque->hasho_flag = LH_UNUSED_PAGE;
607 30 : ovflopaque->hasho_page_id = HASHO_PAGE_ID;
608 :
609 30 : MarkBufferDirty(ovflbuf);
610 :
611 30 : if (BufferIsValid(prevbuf))
612 : {
613 30 : Page prevpage = BufferGetPage(prevbuf);
614 30 : HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);
615 :
616 30 : Assert(prevopaque->hasho_bucket == bucket);
617 30 : prevopaque->hasho_nextblkno = nextblkno;
618 30 : MarkBufferDirty(prevbuf);
619 : }
620 30 : if (BufferIsValid(nextbuf))
621 : {
622 0 : Page nextpage = BufferGetPage(nextbuf);
623 0 : HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);
624 :
625 0 : Assert(nextopaque->hasho_bucket == bucket);
626 0 : nextopaque->hasho_prevblkno = prevblkno;
627 0 : MarkBufferDirty(nextbuf);
628 : }
629 :
630 : /* Clear the bitmap bit to indicate that this overflow page is free */
631 30 : CLRBIT(freep, bitmapbit);
632 30 : MarkBufferDirty(mapbuf);
633 :
634 : /* if this is now the first free page, update hashm_firstfree */
635 30 : if (ovflbitno < metap->hashm_firstfree)
636 : {
637 15 : metap->hashm_firstfree = ovflbitno;
638 15 : update_metap = true;
639 15 : MarkBufferDirty(metabuf);
640 : }
641 :
642 : /* XLOG stuff */
643 30 : if (RelationNeedsWAL(rel))
644 : {
645 : xl_hash_squeeze_page xlrec;
646 : XLogRecPtr recptr;
647 : int i;
648 :
649 30 : xlrec.prevblkno = prevblkno;
650 30 : xlrec.nextblkno = nextblkno;
651 30 : xlrec.ntups = nitups;
652 30 : xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
653 30 : xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);
654 :
655 30 : XLogBeginInsert();
656 30 : XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);
657 :
658 : /*
659 : * bucket buffer needs to be registered to ensure that we can acquire
660 : * a cleanup lock on it during replay.
661 : */
662 30 : if (!xlrec.is_prim_bucket_same_wrt)
663 0 : XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
664 :
665 30 : XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
666 30 : if (xlrec.ntups > 0)
667 : {
668 11 : XLogRegisterBufData(1, (char *) itup_offsets,
669 11 : nitups * sizeof(OffsetNumber));
670 575 : for (i = 0; i < nitups; i++)
671 564 : XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
672 : }
673 :
674 30 : XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);
675 :
676 : /*
677 : * If prevpage and the write page (the page into which we are moving
678 : * tuples from the overflow page) are the same, there is no need to
679 : * register prevpage separately; during replay we can update the next
680 : * block directly in the write page.
681 : */
682 30 : if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
683 20 : XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);
684 :
685 30 : if (BufferIsValid(nextbuf))
686 0 : XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);
687 :
688 30 : XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
689 30 : XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));
690 :
691 30 : if (update_metap)
692 : {
693 15 : XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
694 15 : XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
695 : }
696 :
697 30 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);
698 :
699 30 : PageSetLSN(BufferGetPage(wbuf), recptr);
700 30 : PageSetLSN(BufferGetPage(ovflbuf), recptr);
701 :
702 30 : if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
703 20 : PageSetLSN(BufferGetPage(prevbuf), recptr);
704 30 : if (BufferIsValid(nextbuf))
705 0 : PageSetLSN(BufferGetPage(nextbuf), recptr);
706 :
707 30 : PageSetLSN(BufferGetPage(mapbuf), recptr);
708 :
709 30 : if (update_metap)
710 15 : PageSetLSN(BufferGetPage(metabuf), recptr);
711 : }
712 :
713 30 : END_CRIT_SECTION();
714 :
715 : /* release the previous page if it is not the same as the write page */
716 30 : if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
717 20 : _hash_relbuf(rel, prevbuf);
718 :
719 30 : if (BufferIsValid(ovflbuf))
720 30 : _hash_relbuf(rel, ovflbuf);
721 :
722 30 : if (BufferIsValid(nextbuf))
723 0 : _hash_relbuf(rel, nextbuf);
724 :
725 30 : _hash_relbuf(rel, mapbuf);
726 30 : _hash_relbuf(rel, metabuf);
727 :
728 30 : return nextblkno;
729 : }
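/*
 * Editorial aside (not part of hashovfl.c): the chain manipulation in
 * _hash_freeovflpage() above is an ordinary doubly-linked-list unlink, done
 * via the hasho_prevblkno/hasho_nextblkno fields of the neighbouring pages.
 * A minimal in-memory sketch of the same pointer fix-up, using array indexes
 * in place of block numbers (INVALID plays the role of InvalidBlockNumber):
 */
#include <assert.h>
#include <stdio.h>

#define INVALID (-1)

typedef struct ChainPage
{
    int         prev;
    int         next;
} ChainPage;

static void
unlink_page(ChainPage *pages, int victim)
{
    int         prev = pages[victim].prev;
    int         next = pages[victim].next;

    if (prev != INVALID)
        pages[prev].next = next;    /* mirrors prevopaque->hasho_nextblkno = nextblkno */
    if (next != INVALID)
        pages[next].prev = prev;    /* mirrors nextopaque->hasho_prevblkno = prevblkno */

    pages[victim].prev = INVALID;   /* freed page is reinitialized as unused */
    pages[victim].next = INVALID;
}

int
main(void)
{
    /* chain: 0 <-> 1 <-> 2 */
    ChainPage   pages[3] = {{INVALID, 1}, {0, 2}, {1, INVALID}};

    unlink_page(pages, 1);          /* remove the middle page */
    assert(pages[0].next == 2 && pages[2].prev == 0);
    printf("chain after unlink: 0 <-> 2\n");
    return 0;
}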
730 :
731 :
732 : /*
733 : * _hash_initbitmapbuffer()
734 : *
735 : * Initialize a new bitmap page. All bits in the new bitmap page are set to
736 : * "1", indicating "in use".
737 : */
738 : void
739 15 : _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
740 : {
741 : Page pg;
742 : HashPageOpaque op;
743 : uint32 *freep;
744 :
745 15 : pg = BufferGetPage(buf);
746 :
747 : /* initialize the page */
748 15 : if (initpage)
749 0 : _hash_pageinit(pg, BufferGetPageSize(buf));
750 :
751 : /* initialize the page's special space */
752 15 : op = (HashPageOpaque) PageGetSpecialPointer(pg);
753 15 : op->hasho_prevblkno = InvalidBlockNumber;
754 15 : op->hasho_nextblkno = InvalidBlockNumber;
755 15 : op->hasho_bucket = -1;
756 15 : op->hasho_flag = LH_BITMAP_PAGE;
757 15 : op->hasho_page_id = HASHO_PAGE_ID;
758 :
759 : /* set all of the bits to 1 */
760 15 : freep = HashPageGetBitmap(pg);
761 15 : MemSet(freep, 0xFF, bmsize);
762 :
763 : /*
764 : * Set pd_lower just past the end of the bitmap page data. We could even
765 : * set pd_lower equal to pd_upper, but this is more precise and makes the
766 : * page look compressible to xlog.c.
767 : */
768 15 : ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
769 15 : }
770 :
771 :
772 : /*
773 : * _hash_squeezebucket(rel, bucket)
774 : *
775 : * Try to squeeze the tuples onto pages occurring earlier in the
776 : * bucket chain in an attempt to free overflow pages. When we start
777 : * the "squeezing", the page from which we start taking tuples (the
778 : * "read" page) is the last bucket in the bucket chain and the page
779 : * "read" page) is the last page in the bucket chain and the page
780 : * first page in the bucket chain. The read page works backward and
781 : * the write page works forward; the procedure terminates when the
782 : * read page and write page are the same page.
783 : *
784 : * At completion of this procedure, it is guaranteed that all pages in
785 : * the bucket are nonempty, unless the bucket is totally empty (in
786 : * which case all overflow pages will be freed). The original implementation
787 : * required that to be true on entry as well, but it's a lot easier for
788 : * callers to leave empty overflow pages and let this routine clean them up.
789 : *
790 : * Caller must acquire cleanup lock on the primary page of the target
791 : * bucket to exclude any scans that are in progress, which could easily
792 : * be confused into returning the same tuple more than once or some tuples
793 : * not at all by the rearrangement we are performing here. To prevent
794 : * any concurrent scan from crossing the squeeze scan we use lock chaining
795 : * similar to hashbucketcleanup; see the comments atop hashbucketcleanup.
796 : *
797 : * We need to retain a pin on the primary bucket to ensure that no concurrent
798 : * split can start.
799 : *
800 : * Since this function is invoked in VACUUM, we provide an access strategy
801 : * parameter that controls fetches of the bucket pages.
802 : */
803 : void
804 67 : _hash_squeezebucket(Relation rel,
805 : Bucket bucket,
806 : BlockNumber bucket_blkno,
807 : Buffer bucket_buf,
808 : BufferAccessStrategy bstrategy)
809 : {
810 : BlockNumber wblkno;
811 : BlockNumber rblkno;
812 : Buffer wbuf;
813 : Buffer rbuf;
814 : Page wpage;
815 : Page rpage;
816 : HashPageOpaque wopaque;
817 : HashPageOpaque ropaque;
818 :
819 : /*
820 : * start squeezing into the primary bucket page.
821 : */
822 67 : wblkno = bucket_blkno;
823 67 : wbuf = bucket_buf;
824 67 : wpage = BufferGetPage(wbuf);
825 67 : wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
826 :
827 : /*
828 : * If there aren't any overflow pages, there's nothing to squeeze. The
829 : * caller is responsible for releasing the pin on the primary bucket page.
830 : */
831 67 : if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
832 : {
833 55 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
834 55 : return;
835 : }
836 :
837 : /*
838 : * Find the last page in the bucket chain by starting at the base bucket
839 : * page and working forward. Note: we assume that a hash bucket chain is
840 : * usually smaller than the buffer ring being used by VACUUM, else using
841 : * the access strategy here would be counterproductive.
842 : */
843 12 : rbuf = InvalidBuffer;
844 12 : ropaque = wopaque;
845 : do
846 : {
847 52 : rblkno = ropaque->hasho_nextblkno;
848 52 : if (rbuf != InvalidBuffer)
849 40 : _hash_relbuf(rel, rbuf);
850 52 : rbuf = _hash_getbuf_with_strategy(rel,
851 : rblkno,
852 : HASH_WRITE,
853 : LH_OVERFLOW_PAGE,
854 : bstrategy);
855 52 : rpage = BufferGetPage(rbuf);
856 52 : ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
857 52 : Assert(ropaque->hasho_bucket == bucket);
858 52 : } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
859 :
860 : /*
861 : * squeeze the tuples.
862 : */
863 : for (;;)
864 : {
865 : OffsetNumber roffnum;
866 : OffsetNumber maxroffnum;
867 : OffsetNumber deletable[MaxOffsetNumber];
868 : IndexTuple itups[MaxIndexTuplesPerPage];
869 : Size tups_size[MaxIndexTuplesPerPage];
870 : OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
871 32 : uint16 ndeletable = 0;
872 32 : uint16 nitups = 0;
873 32 : Size all_tups_size = 0;
874 : int i;
875 32 : bool retain_pin = false;
876 :
877 : readpage:
878 : /* Scan each tuple in "read" page */
879 32 : maxroffnum = PageGetMaxOffsetNumber(rpage);
880 727 : for (roffnum = FirstOffsetNumber;
881 : roffnum <= maxroffnum;
882 663 : roffnum = OffsetNumberNext(roffnum))
883 : {
884 : IndexTuple itup;
885 : Size itemsz;
886 :
887 : /* skip dead tuples */
888 665 : if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
889 0 : continue;
890 :
891 665 : itup = (IndexTuple) PageGetItem(rpage,
892 : PageGetItemId(rpage, roffnum));
893 665 : itemsz = IndexTupleDSize(*itup);
894 665 : itemsz = MAXALIGN(itemsz);
895 :
896 : /*
897 : * Walk up the bucket chain, looking for a page big enough for
898 : * this item and all other accumulated items. Exit if we reach
899 : * the read page.
900 : */
901 1350 : while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
902 : {
903 22 : Buffer next_wbuf = InvalidBuffer;
904 22 : bool tups_moved = false;
905 :
906 22 : Assert(!PageIsEmpty(wpage));
907 :
908 22 : if (wblkno == bucket_blkno)
909 2 : retain_pin = true;
910 :
911 22 : wblkno = wopaque->hasho_nextblkno;
912 22 : Assert(BlockNumberIsValid(wblkno));
913 :
914 : /* don't need to move to next page if we reached the read page */
915 22 : if (wblkno != rblkno)
916 20 : next_wbuf = _hash_getbuf_with_strategy(rel,
917 : wblkno,
918 : HASH_WRITE,
919 : LH_OVERFLOW_PAGE,
920 : bstrategy);
921 :
922 22 : if (nitups > 0)
923 : {
924 2 : Assert(nitups == ndeletable);
925 :
926 : /*
927 : * This operation needs to log multiple tuples; prepare
928 : * WAL for that.
929 : */
930 2 : if (RelationNeedsWAL(rel))
931 2 : XLogEnsureRecordSpace(0, 3 + nitups);
932 :
933 2 : START_CRIT_SECTION();
934 :
935 : /*
936 : * we have to insert tuples on the "write" page, being
937 : * careful to preserve hashkey ordering. (If we insert
938 : * many tuples into the same "write" page it would be
939 : * worth qsort'ing them).
940 : */
941 2 : _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
942 2 : MarkBufferDirty(wbuf);
943 :
944 : /* Delete tuples we already moved off read page */
945 2 : PageIndexMultiDelete(rpage, deletable, ndeletable);
946 2 : MarkBufferDirty(rbuf);
947 :
948 : /* XLOG stuff */
949 2 : if (RelationNeedsWAL(rel))
950 : {
951 : XLogRecPtr recptr;
952 : xl_hash_move_page_contents xlrec;
953 :
954 2 : xlrec.ntups = nitups;
955 2 : xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false;
956 :
957 2 : XLogBeginInsert();
958 2 : XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);
959 :
960 : /*
961 : * bucket buffer needs to be registered to ensure that
962 : * we can acquire a cleanup lock on it during replay.
963 : */
964 2 : if (!xlrec.is_prim_bucket_same_wrt)
965 2 : XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
966 :
967 2 : XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
968 2 : XLogRegisterBufData(1, (char *) itup_offsets,
969 2 : nitups * sizeof(OffsetNumber));
970 101 : for (i = 0; i < nitups; i++)
971 99 : XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
972 :
973 2 : XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
974 2 : XLogRegisterBufData(2, (char *) deletable,
975 2 : ndeletable * sizeof(OffsetNumber));
976 :
977 2 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);
978 :
979 2 : PageSetLSN(BufferGetPage(wbuf), recptr);
980 2 : PageSetLSN(BufferGetPage(rbuf), recptr);
981 : }
982 :
983 2 : END_CRIT_SECTION();
984 :
985 2 : tups_moved = true;
986 : }
987 :
988 : /*
989 : * release the lock on previous page after acquiring the lock
990 : * on next page
991 : */
992 22 : if (retain_pin)
993 2 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
994 : else
995 20 : _hash_relbuf(rel, wbuf);
996 :
997 : /* nothing more to do if we reached the read page */
998 22 : if (rblkno == wblkno)
999 : {
1000 2 : _hash_relbuf(rel, rbuf);
1001 2 : return;
1002 : }
1003 :
1004 20 : wbuf = next_wbuf;
1005 20 : wpage = BufferGetPage(wbuf);
1006 20 : wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
1007 20 : Assert(wopaque->hasho_bucket == bucket);
1008 20 : retain_pin = false;
1009 :
1010 : /* be tidy */
1011 20 : for (i = 0; i < nitups; i++)
1012 0 : pfree(itups[i]);
1013 20 : nitups = 0;
1014 20 : all_tups_size = 0;
1015 20 : ndeletable = 0;
1016 :
1017 : /*
1018 : * after moving the tuples, rpage would have been compacted,
1019 : * so we need to rescan it.
1020 : */
1021 20 : if (tups_moved)
1022 0 : goto readpage;
1023 : }
1024 :
1025 : /* remember tuple for deletion from "read" page */
1026 663 : deletable[ndeletable++] = roffnum;
1027 :
1028 : /*
1029 : * We need a copy of the index tuples, as they can be freed along with
1030 : * the overflow page, but we still need them to write the WAL record in
1031 : * _hash_freeovflpage.
1032 : */
1033 663 : itups[nitups] = CopyIndexTuple(itup);
1034 663 : tups_size[nitups++] = itemsz;
1035 663 : all_tups_size += itemsz;
1036 : }
1037 :
1038 : /*
1039 : * If we reach here, there are no live tuples on the "read" page ---
1040 : * it was empty when we got to it, or we moved them all. So we can
1041 : * just free the page without bothering with deleting tuples
1042 : * individually. Then advance to the previous "read" page.
1043 : *
1044 : * Tricky point here: if our read and write pages are adjacent in the
1045 : * bucket chain, our write lock on wbuf will conflict with
1046 : * _hash_freeovflpage's attempt to update the sibling links of the
1047 : * removed page. In that case, we don't need to lock it again.
1048 : */
1049 30 : rblkno = ropaque->hasho_prevblkno;
1050 30 : Assert(BlockNumberIsValid(rblkno));
1051 :
1052 : /* free this overflow page (releases rbuf) */
1053 30 : _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
1054 : tups_size, nitups, bstrategy);
1055 :
1056 : /* be tidy */
1057 594 : for (i = 0; i < nitups; i++)
1058 564 : pfree(itups[i]);
1059 :
1060 : /* are we freeing the page adjacent to wbuf? */
1061 30 : if (rblkno == wblkno)
1062 : {
1063 : /* retain the pin on primary bucket page till end of bucket scan */
1064 10 : if (wblkno == bucket_blkno)
1065 10 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
1066 : else
1067 0 : _hash_relbuf(rel, wbuf);
1068 10 : return;
1069 : }
1070 :
1071 20 : rbuf = _hash_getbuf_with_strategy(rel,
1072 : rblkno,
1073 : HASH_WRITE,
1074 : LH_OVERFLOW_PAGE,
1075 : bstrategy);
1076 20 : rpage = BufferGetPage(rbuf);
1077 20 : ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
1078 20 : Assert(ropaque->hasho_bucket == bucket);
1079 20 : }
1080 :
1081 : /* NOTREACHED */
1082 : }
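/*
 * Editorial aside (not part of hashovfl.c): a toy model of the squeeze
 * algorithm implemented by _hash_squeezebucket() above.  Pages are modelled
 * as fixed-capacity int arrays; the "read" cursor starts at the tail of the
 * chain and works backward while the "write" cursor starts at the head and
 * works forward, and fully drained tail pages are freed.  All locking, WAL
 * logging, and hashkey-ordering concerns of the real code are ignored.
 */
#include <stdio.h>

#define CAP     4       /* hypothetical tuples per page */
#define NPAGES  5

int
main(void)
{
    int         items[NPAGES][CAP];
    int         count[NPAGES] = {4, 1, 2, 0, 3};    /* tuples on each page */
    int         npages = NPAGES;
    int         w = 0;              /* write page: head of the chain */
    int         r = npages - 1;     /* read page: tail of the chain */
    int         i, j;

    for (i = 0; i < NPAGES; i++)            /* fill with dummy tuple values */
        for (j = 0; j < count[i]; j++)
            items[i][j] = i * 100 + j;

    while (w < r)
    {
        /* drain the read page into the earliest write page with free space */
        while (count[r] > 0 && w < r)
        {
            if (count[w] == CAP)
                w++;                        /* write page full: move forward */
            else
                items[w][count[w]++] = items[r][--count[r]];
        }
        if (w >= r)
            break;                          /* cursors met: stop squeezing */
        npages = r;                         /* read page is empty: free it */
        r--;
    }

    for (i = 0; i < npages; i++)
        printf("page %d: %d tuples\n", i, count[i]);
    return 0;
}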