Line data Source code
1 : /*
2 : * brin_pageops.c
3 : * Page-handling routines for BRIN indexes
4 : *
5 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/access/brin/brin_pageops.c
10 : */
11 : #include "postgres.h"
12 :
13 : #include "access/brin_pageops.h"
14 : #include "access/brin_page.h"
15 : #include "access/brin_revmap.h"
16 : #include "access/brin_xlog.h"
17 : #include "access/xloginsert.h"
18 : #include "miscadmin.h"
19 : #include "storage/bufmgr.h"
20 : #include "storage/freespace.h"
21 : #include "storage/lmgr.h"
22 : #include "storage/smgr.h"
23 : #include "utils/rel.h"
24 :
25 :
26 : /*
27 : * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page. We can tolerate
28 : * a single item per page, unlike other index AMs.
*
* The value is the block size minus the MAXALIGN'd page header plus one line
* pointer, minus the MAXALIGN'd BRIN special space, with the result rounded
* down to a MAXALIGN boundary.
29 : */
30 : #define BrinMaxItemSize \
31 : MAXALIGN_DOWN(BLCKSZ - \
32 : (MAXALIGN(SizeOfPageHeaderData + \
33 : sizeof(ItemIdData)) + \
34 : MAXALIGN(sizeof(BrinSpecialSpace))))
35 :
/* Forward declarations of the file-local helpers defined further down. */
36 : static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
37 : bool *extended);
38 : static Size br_page_get_freespace(Page page);
39 : static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
40 :
41 :
42 : /*
43 : * Update tuple origtup (size origsz), located in offset oldoff of buffer
44 : * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
45 : * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit.
46 : *
47 : * If samepage is true, only the old page is considered as a destination; if
48 : * it turns out to have no room, false is returned so the caller can retry.
* If samepage is false, a page able to hold the new tuple is obtained up
* front (possibly by extending the relation), but the old page is still
* preferred when it is found to have enough room after all.
49 : *
50 : * If the update is successful, return true; the revmap is updated to point to
51 : * the new tuple. If the update is not done for whatever reason, return false.
52 : * Caller may retry the update if this happens.
*
* newsz must be MAXALIGN'd. A new tuple larger than BrinMaxItemSize is
* rejected with an ERROR.
53 : */
54 : bool
55 218 : brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
56 : BrinRevmap *revmap, BlockNumber heapBlk,
57 : Buffer oldbuf, OffsetNumber oldoff,
58 : const BrinTuple *origtup, Size origsz,
59 : const BrinTuple *newtup, Size newsz,
60 : bool samepage)
61 : {
62 : Page oldpage;
63 : ItemId oldlp;
64 : BrinTuple *oldtup;
65 : Size oldsz;
66 : Buffer newbuf;
67 : bool extended;
68 :
69 218 : Assert(newsz == MAXALIGN(newsz));
70 :
71 : /* If the item is oversized, don't bother. */
72 218 : if (newsz > BrinMaxItemSize)
73 : {
74 0 : ereport(ERROR,
75 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
76 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
77 : newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
78 : return false; /* keep compiler quiet */
79 : }
80 :
81 : /* make sure the revmap is long enough to contain the entry we need */
82 218 : brinRevmapExtend(revmap, heapBlk);
83 :
84 218 : if (!samepage)
85 : {
/*
* brin_getinsertbuffer hands back the new buffer exclusively locked and,
* per its header comment, locks oldbuf as well; on an InvalidBuffer
* return it has left oldbuf unlocked.
*/
86 : /* need a page on which to put the item */
87 1 : newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
88 1 : if (!BufferIsValid(newbuf))
89 : {
90 0 : Assert(!extended);
91 0 : return false;
92 : }
93 :
94 : /*
95 : * Note: it's possible (though unlikely) that the returned newbuf is
96 : * the same as oldbuf, if brin_getinsertbuffer determined that the old
97 : * buffer does in fact have enough space.
98 : */
99 1 : if (newbuf == oldbuf)
100 : {
101 0 : Assert(!extended);
102 0 : newbuf = InvalidBuffer;
103 : }
104 : }
105 : else
106 : {
/* Same-page attempt: only the old buffer needs to be locked. */
107 217 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
108 217 : newbuf = InvalidBuffer;
109 217 : extended = false;
110 : }
/* From here on oldbuf is exclusively locked on every path. */
111 218 : oldpage = BufferGetPage(oldbuf);
112 218 : oldlp = PageGetItemId(oldpage, oldoff);
113 :
114 : /*
115 : * Check that the old tuple wasn't updated concurrently: it might have
116 : * moved someplace else entirely ...
117 : */
118 218 : if (!ItemIdIsNormal(oldlp))
119 : {
120 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
121 :
122 : /*
123 : * If this happens, and the new buffer was obtained by extending the
124 : * relation, then we need to ensure we don't leave it uninitialized or
125 : * forget about it.
126 : */
127 0 : if (BufferIsValid(newbuf))
128 : {
129 0 : if (extended)
130 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
131 0 : UnlockReleaseBuffer(newbuf);
132 0 : if (extended)
133 0 : FreeSpaceMapVacuum(idxrel);
134 : }
135 0 : return false;
136 : }
137 :
138 218 : oldsz = ItemIdGetLength(oldlp);
139 218 : oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
140 :
141 : /*
142 : * ... or it might have been updated in place to different contents.
143 : */
144 218 : if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
145 : {
146 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
/* Same cleanup of an unused new buffer as in the check above. */
147 0 : if (BufferIsValid(newbuf))
148 : {
149 0 : if (extended)
150 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
151 0 : UnlockReleaseBuffer(newbuf);
152 0 : if (extended)
153 0 : FreeSpaceMapVacuum(idxrel);
154 : }
155 0 : return false;
156 : }
157 :
158 : /*
159 : * Great, the old tuple is intact. We can proceed with the update.
160 : *
161 : * If there's enough room in the old page for the new tuple, replace it.
162 : *
163 : * Note that there might now be enough space on the page even though the
164 : * caller told us there isn't, if a concurrent update moved another tuple
165 : * elsewhere or replaced a tuple with a smaller one.
*
* A page flagged BRIN_EVACUATE_PAGE is never updated in place, so its
* tuples can only move to another page.
166 : */
167 436 : if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
168 218 : brin_can_do_samepage_update(oldbuf, origsz, newsz))
169 : {
170 217 : if (BufferIsValid(newbuf))
171 : {
172 : /* as above */
173 0 : if (extended)
174 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
175 0 : UnlockReleaseBuffer(newbuf);
176 : }
177 :
178 217 : START_CRIT_SECTION();
179 217 : if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz))
180 0 : elog(ERROR, "failed to replace BRIN tuple");
181 217 : MarkBufferDirty(oldbuf);
182 :
183 : /* XLOG stuff */
184 217 : if (RelationNeedsWAL(idxrel))
185 : {
186 : xl_brin_samepage_update xlrec;
187 : XLogRecPtr recptr;
188 217 : uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE;
189 :
190 217 : xlrec.offnum = oldoff;
191 :
192 217 : XLogBeginInsert();
193 217 : XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);
194 :
/* block 0 is the page being overwritten; the new tuple is its data */
195 217 : XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
196 217 : XLogRegisterBufData(0, (char *) newtup, newsz);
197 :
198 217 : recptr = XLogInsert(RM_BRIN_ID, info);
199 :
200 217 : PageSetLSN(oldpage, recptr);
201 : }
202 :
203 217 : END_CRIT_SECTION();
204 :
205 217 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
206 :
/*
* extended is only true here when an unused new page was initialized
* and recorded in the FSM above; vacuum the FSM now that no buffer
* lock is held.
*/
207 217 : if (extended)
208 0 : FreeSpaceMapVacuum(idxrel);
209 :
210 217 : return true;
211 : }
212 1 : else if (newbuf == InvalidBuffer)
213 : {
214 : /*
215 : * Not enough space, but caller said that there was. Tell them to
216 : * start over.
217 : */
218 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
219 0 : return false;
220 : }
221 : else
222 : {
223 : /*
224 : * Not enough free space on the oldpage. Put the new tuple on the new
225 : * page, and update the revmap.
226 : */
227 1 : Page newpage = BufferGetPage(newbuf);
228 : Buffer revmapbuf;
229 : ItemPointerData newtid;
230 : OffsetNumber newoff;
231 1 : BlockNumber newblk = InvalidBlockNumber;
232 1 : Size freespace = 0;
233 :
/* The revmap page is locked last, after both data pages. */
234 1 : revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
235 :
236 1 : START_CRIT_SECTION();
237 :
238 : /*
239 : * We need to initialize the page if it's newly obtained. Note we
240 : * will WAL-log the initialization as part of the update, so we don't
241 : * need to do that here.
242 : */
243 1 : if (extended)
244 1 : brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR);
245 :
/* Remove the old copy, then place the new tuple on the new page. */
246 1 : PageIndexTupleDeleteNoCompact(oldpage, oldoff);
247 1 : newoff = PageAddItem(newpage, (Item) newtup, newsz,
248 : InvalidOffsetNumber, false, false);
249 1 : if (newoff == InvalidOffsetNumber)
250 0 : elog(ERROR, "failed to add BRIN tuple to new page");
251 1 : MarkBufferDirty(oldbuf);
252 1 : MarkBufferDirty(newbuf);
253 :
/*
* Capture these while newbuf is still pinned and locked; it is released
* before the FSM is updated.
*/
254 : /* needed to update FSM below */
255 1 : if (extended)
256 : {
257 1 : newblk = BufferGetBlockNumber(newbuf);
258 1 : freespace = br_page_get_freespace(newpage);
259 : }
260 :
/* Point the revmap entry for heapBlk at the tuple's new location. */
261 1 : ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
262 1 : brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
263 1 : MarkBufferDirty(revmapbuf);
264 :
265 : /* XLOG stuff */
266 1 : if (RelationNeedsWAL(idxrel))
267 : {
268 : xl_brin_update xlrec;
269 : XLogRecPtr recptr;
270 : uint8 info;
271 :
/* flag the record so that the new page's initialization is logged too */
272 1 : info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
273 :
274 1 : xlrec.insert.offnum = newoff;
275 1 : xlrec.insert.heapBlk = heapBlk;
276 1 : xlrec.insert.pagesPerRange = pagesPerRange;
277 1 : xlrec.oldOffnum = oldoff;
278 :
279 1 : XLogBeginInsert();
280 :
281 : /* new page */
282 1 : XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);
283 :
284 1 : XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
285 1 : XLogRegisterBufData(0, (char *) newtup, newsz);
286 :
287 : /* revmap page */
288 1 : XLogRegisterBuffer(1, revmapbuf, 0);
289 :
290 : /* old page */
291 1 : XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);
292 :
293 1 : recptr = XLogInsert(RM_BRIN_ID, info);
294 :
/* all three modified pages carry the LSN of this single record */
295 1 : PageSetLSN(oldpage, recptr);
296 1 : PageSetLSN(newpage, recptr);
297 1 : PageSetLSN(BufferGetPage(revmapbuf), recptr);
298 : }
299 :
300 1 : END_CRIT_SECTION();
301 :
/*
* The revmap buffer and oldbuf are only unlocked, not released; newbuf
* was pinned on our behalf by brin_getinsertbuffer, so its pin is
* dropped as well.
*/
302 1 : LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
303 1 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
304 1 : UnlockReleaseBuffer(newbuf);
305 :
/* Record the remaining space of the freshly added page in the FSM. */
306 1 : if (extended)
307 : {
308 1 : Assert(BlockNumberIsValid(newblk));
309 1 : RecordPageWithFreeSpace(idxrel, newblk, freespace);
310 1 : FreeSpaceMapVacuum(idxrel);
311 : }
312 :
313 1 : return true;
314 : }
315 : }
316 :
317 : /*
318 : * Return whether brin_doupdate can do a samepage update.
*
* That is the case when the new tuple is no larger than the original one,
* or when the exact free space on the buffer's page covers the size
* increase. The subtraction is only evaluated when newsz > origsz (the ||
* short-circuits), so it cannot wrap around.
319 : */
320 : bool
321 436 : brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
322 : {
323 436 : return
324 452 : ((newsz <= origsz) ||
325 16 : PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
326 : }
327 :
328 : /*
329 : * Insert an index tuple into the index relation. The revmap is updated to
330 : * mark the range containing the given page as pointing to the inserted entry.
331 : * A WAL record is written.
332 : *
333 : * The buffer, if valid, is first checked for free space to insert the new
334 : * entry; if there isn't enough, a new buffer is obtained and pinned. No
335 : * buffer lock must be held on entry, no buffer lock is held on exit.
*
* *buffer is an in/out argument: on return it identifies the buffer the
* tuple was placed in. That buffer is unlocked but not released here.
*
* itemsz must be MAXALIGN'd. A tuple larger than BrinMaxItemSize is
* rejected with an ERROR.
336 : *
337 : * Return value is the offset number where the tuple was inserted.
338 : */
339 : OffsetNumber
340 154 : brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
341 : BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
342 : BrinTuple *tup, Size itemsz)
343 : {
344 : Page page;
345 : BlockNumber blk;
346 : OffsetNumber off;
347 : Buffer revmapbuf;
348 : ItemPointerData tid;
349 : bool extended;
350 :
351 154 : Assert(itemsz == MAXALIGN(itemsz));
352 :
353 : /* If the item is oversized, don't even bother. */
354 154 : if (itemsz > BrinMaxItemSize)
355 : {
356 0 : ereport(ERROR,
357 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
358 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
359 : itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
360 : return InvalidOffsetNumber; /* keep compiler quiet */
361 : }
362 :
363 : /* Make sure the revmap is long enough to contain the entry we need */
364 154 : brinRevmapExtend(revmap, heapBlk);
365 :
366 : /*
367 : * Acquire lock on buffer supplied by caller, if any. If it doesn't have
368 : * enough space, unpin it to obtain a new one below.
369 : */
370 154 : if (BufferIsValid(*buffer))
371 : {
372 : /*
373 : * It's possible that another backend (or ourselves!) extended the
374 : * revmap over the page we held a pin on, so we cannot assume that
375 : * it's still a regular page.
*
* br_page_get_freespace reports 0 for a page that is not regular or
* that is being evacuated, so such a page is rejected by the test
* below as well.
376 : */
377 143 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
378 143 : if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
379 : {
380 9 : UnlockReleaseBuffer(*buffer);
381 9 : *buffer = InvalidBuffer;
382 : }
383 : }
384 :
385 : /*
386 : * If we still don't have a usable buffer, have brin_getinsertbuffer
387 : * obtain one for us.
*
* The buffer comes back pinned and exclusively locked. No old buffer is
* passed, so the only InvalidBuffer return path visible in
* brin_getinsertbuffer (old page turned into a revmap page) should not
* apply; the loop retries regardless.
388 : */
389 154 : if (!BufferIsValid(*buffer))
390 : {
391 : do
392 20 : *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
393 20 : while (!BufferIsValid(*buffer));
394 : }
395 : else
396 134 : extended = false;
397 :
398 : /* Now obtain lock on revmap buffer */
399 154 : revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
400 :
401 154 : page = BufferGetPage(*buffer);
402 154 : blk = BufferGetBlockNumber(*buffer);
403 :
404 : /* Execute the actual insertion */
405 154 : START_CRIT_SECTION();
/* a page obtained by extending the relation must be initialized first */
406 154 : if (extended)
407 13 : brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR);
408 154 : off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
409 : false, false);
410 154 : if (off == InvalidOffsetNumber)
411 0 : elog(ERROR, "could not insert new index tuple to page");
412 154 : MarkBufferDirty(*buffer);
413 :
414 : BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
415 : blk, off, heapBlk));
416 :
/* Make the revmap entry for heapBlk point at the new tuple. */
417 154 : ItemPointerSet(&tid, blk, off);
418 154 : brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
419 154 : MarkBufferDirty(revmapbuf);
420 :
421 : /* XLOG stuff */
422 154 : if (RelationNeedsWAL(idxrel))
423 : {
424 : xl_brin_insert xlrec;
425 : XLogRecPtr recptr;
426 : uint8 info;
427 :
428 154 : info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
429 154 : xlrec.heapBlk = heapBlk;
430 154 : xlrec.pagesPerRange = pagesPerRange;
431 154 : xlrec.offnum = off;
432 :
433 154 : XLogBeginInsert();
434 154 : XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);
435 :
/* block 0: data page, with the tuple as its data; block 1: revmap page */
436 154 : XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
437 154 : XLogRegisterBufData(0, (char *) tup, itemsz);
438 :
439 154 : XLogRegisterBuffer(1, revmapbuf, 0);
440 :
441 154 : recptr = XLogInsert(RM_BRIN_ID, info);
442 :
443 154 : PageSetLSN(page, recptr);
444 154 : PageSetLSN(BufferGetPage(revmapbuf), recptr);
445 : }
446 :
447 154 : END_CRIT_SECTION();
448 :
449 : /* Tuple is firmly on buffer; we can release our locks */
450 154 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
451 154 : LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
452 :
/* done only after the relation was extended, once the locks are gone */
453 154 : if (extended)
454 13 : FreeSpaceMapVacuum(idxrel);
455 :
456 154 : return off;
457 : }
458 :
459 : /*
460 : * Initialize a page with the given type.
461 : *
* The page is set up as a BLCKSZ-sized page whose special space holds a
* BrinSpecialSpace, and the requested type is stored in it.
*
462 : * Caller is responsible for marking it dirty, as appropriate.
463 : */
464 : void
465 22 : brin_page_init(Page page, uint16 type)
466 : {
467 22 : PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
468 :
469 22 : BrinPageType(page) = type;
470 22 : }
471 :
472 : /*
473 : * Initialize a new BRIN index' metapage.
*
* The page is initialized as a BRIN_PAGETYPE_META page, and its contents
* area is filled with the magic number, the given version and the given
* pagesPerRange.
474 : */
475 : void
476 4 : brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
477 : {
478 : BrinMetaPageData *metadata;
479 :
480 4 : brin_page_init(page, BRIN_PAGETYPE_META);
481 :
482 4 : metadata = (BrinMetaPageData *) PageGetContents(page);
483 :
484 4 : metadata->brinMagic = BRIN_META_MAGIC;
485 4 : metadata->brinVersion = version;
486 4 : metadata->pagesPerRange = pagesPerRange;
487 :
488 : /*
489 : * Note we cheat here a little. 0 is not a valid revmap block number
490 : * (because it's the metapage buffer), but doing this enables the first
491 : * revmap page to be created when the index is.
492 : */
493 4 : metadata->lastRevmapPage = 0;
494 4 : }
495 :
496 : /*
497 : * Initiate page evacuation protocol.
498 : *
499 : * The page must be locked in exclusive mode by the caller.
500 : *
501 : * If the page is not yet initialized or empty, return false without doing
502 : * anything; it can be used for revmap without any further changes. If it
503 : * contains tuples, mark it for evacuation and return true.
*
* idxRel is not referenced by this function. No WAL record is written
* here; the flag change is only reported through MarkBufferDirtyHint.
504 : */
505 : bool
506 4 : brin_start_evacuating_page(Relation idxRel, Buffer buf)
507 : {
508 : OffsetNumber off;
509 : OffsetNumber maxoff;
510 : Page page;
511 :
512 4 : page = BufferGetPage(buf);
513 :
514 4 : if (PageIsNew(page))
515 4 : return false;
516 :
/* One used line pointer is enough to require evacuation. */
517 0 : maxoff = PageGetMaxOffsetNumber(page);
518 0 : for (off = FirstOffsetNumber; off <= maxoff; off++)
519 : {
520 : ItemId lp;
521 :
522 0 : lp = PageGetItemId(page, off);
523 0 : if (ItemIdIsUsed(lp))
524 : {
525 : /* prevent other backends from adding more stuff to this page */
526 0 : BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
527 0 : MarkBufferDirtyHint(buf, true);
528 :
529 0 : return true;
530 : }
531 : }
/* initialized, but holds no used items */
532 0 : return false;
533 : }
534 :
535 : /*
536 : * Move all tuples out of a page.
537 : *
538 : * The caller must hold lock on the page. The lock and pin are released.
*
* The page must already carry the BRIN_EVACUATE_PAGE flag (asserted below).
* Each used item is moved by calling brin_doupdate with the tuple as both
* old and new contents and samepage = false; since brin_doupdate refuses
* in-place updates on flagged pages, the tuple ends up on another page.
539 : */
540 : void
541 0 : brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
542 : BrinRevmap *revmap, Buffer buf)
543 : {
544 : OffsetNumber off;
545 : OffsetNumber maxoff;
546 : Page page;
/*
* NOTE(review): btup is never reassigned in this function, so whether the
* copy buffer is actually reused across iterations depends on what
* brin_copy_tuple does with its last two arguments -- confirm.
*/
547 0 : BrinTuple *btup = NULL;
548 0 : Size btupsz = 0;
549 :
550 0 : page = BufferGetPage(buf);
551 :
552 0 : Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);
553 :
554 0 : maxoff = PageGetMaxOffsetNumber(page);
555 0 : for (off = FirstOffsetNumber; off <= maxoff; off++)
556 : {
557 : BrinTuple *tup;
558 : Size sz;
559 : ItemId lp;
560 :
561 0 : CHECK_FOR_INTERRUPTS();
562 :
563 0 : lp = PageGetItemId(page, off);
564 0 : if (ItemIdIsUsed(lp))
565 : {
/*
* Work on a copy of the tuple: the buffer lock has to be dropped
* because brin_doupdate requires oldbuf to be unlocked on entry.
*/
566 0 : sz = ItemIdGetLength(lp);
567 0 : tup = (BrinTuple *) PageGetItem(page, lp);
568 0 : tup = brin_copy_tuple(tup, sz, btup, &btupsz);
569 :
570 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
571 :
/*
* On failure, step back so that the loop's increment revisits the
* same offset.
*/
572 0 : if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
573 : buf, off, tup, sz, tup, sz, false))
574 0 : off--; /* retry */
575 :
/* re-acquire the lock (share mode) before inspecting the page again */
576 0 : LockBuffer(buf, BUFFER_LOCK_SHARE);
577 :
578 : /* It's possible that someone extended the revmap over this page */
579 0 : if (!BRIN_IS_REGULAR_PAGE(page))
580 0 : break;
581 : }
582 : }
583 :
584 0 : UnlockReleaseBuffer(buf);
585 0 : }
586 :
587 : /*
588 : * Given a BRIN index page, initialize it if necessary, and record it into the
589 : * FSM if necessary. Return value is true if the FSM itself needs "vacuuming".
590 : * The main use for this is when, during vacuuming, an uninitialized page is
591 : * found, which could be the result of relation extension followed by a crash
592 : * before the page can be used.
*
* Meta and revmap pages are left alone (false is returned for them). For a
* regular page, the FSM entry is only changed when the measured free space
* is larger than what the FSM currently records.
*
* NOTE(review): the only buffer lock taken here is the exclusive one in the
* PageIsNew path; the page-type tests and the free space measurement run
* without this function locking the buffer -- confirm what callers are
* expected to hold.
593 : */
594 : bool
595 18 : brin_page_cleanup(Relation idxrel, Buffer buf)
596 : {
597 18 : Page page = BufferGetPage(buf);
598 : Size freespace;
599 :
600 : /*
601 : * If a page was left uninitialized, initialize it now; also record it in
602 : * FSM.
603 : *
604 : * Somebody else might be extending the relation concurrently. To avoid
605 : * re-initializing the page before they can grab the buffer lock, we
606 : * acquire the extension lock momentarily. Since they hold the extension
607 : * lock from before getting the page and after it's been initialized, we're
608 : * sure to see their initialization.
609 : */
610 18 : if (PageIsNew(page))
611 : {
612 0 : LockRelationForExtension(idxrel, ShareLock);
613 0 : UnlockRelationForExtension(idxrel, ShareLock);
614 :
/* Recheck under the buffer lock; it may have been initialized meanwhile. */
615 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
616 0 : if (PageIsNew(page))
617 : {
618 0 : brin_initialize_empty_new_buffer(idxrel, buf);
619 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
620 0 : return true;
621 : }
622 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
623 : }
624 :
625 : /* Nothing to be done for non-regular index pages */
626 33 : if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
627 15 : BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
628 6 : return false;
629 :
630 : /* Measure free space and record it */
631 12 : freespace = br_page_get_freespace(page);
632 12 : if (freespace > GetRecordedFreeSpace(idxrel, BufferGetBlockNumber(buf)))
633 : {
634 9 : RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf), freespace);
635 9 : return true;
636 : }
637 :
638 3 : return false;
639 : }
640 :
641 : /*
642 : * Return a pinned and exclusively locked buffer which can be used to insert an
643 : * index item of size itemsz (caller must ensure not to request sizes
644 : * impossible to fulfill). If oldbuf is a valid buffer, it is also locked (in
645 : * an order determined to avoid deadlocks.)
646 : *
647 : * If we find that the old page is no longer a regular index page (because
648 : * of a revmap extension), the old buffer is unlocked and we return
649 : * InvalidBuffer.
650 : *
651 : * If there's no existing page with enough free space to accommodate the new
652 : * item, the relation is extended. If this happens, *extended is set to true,
653 : * and it is the caller's responsibility to initialize the page (and WAL-log
654 : * that fact) prior to use.
655 : *
656 : * Note that in some corner cases it is possible for this routine to extend the
657 : * relation and then not return the buffer. It is this routine's
658 : * responsibility to WAL-log the page initialization and to record the page in
659 : * FSM if that happens. Such a buffer may later be reused by this routine.
*
* The returned buffer can be oldbuf itself, when the old page turns out to
* have enough room. *extended is always written: it is reset to false on
* entry.
660 : */
661 : static Buffer
662 21 : brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
663 : bool *extended)
664 : {
665 : BlockNumber oldblk;
666 : BlockNumber newblk;
667 : Page page;
668 : Size freespace;
669 :
670 : /* callers must have checked */
671 21 : Assert(itemsz <= BrinMaxItemSize);
672 :
673 21 : *extended = false;
674 :
675 21 : if (BufferIsValid(oldbuf))
676 1 : oldblk = BufferGetBlockNumber(oldbuf);
677 : else
678 20 : oldblk = InvalidBlockNumber;
679 :
680 : /*
681 : * Loop until we find a page with sufficient free space. By the time we
682 : * return to caller out of this loop, both buffers are valid and locked;
683 : * if we have to restart here, neither buffer is locked and buf is not a
684 : * pinned buffer.
*
* The first candidate is the relation's cached target block; the FSM is
* only consulted when there is none.
685 : */
686 21 : newblk = RelationGetTargetBlock(irel);
687 21 : if (newblk == InvalidBlockNumber)
688 6 : newblk = GetPageWithFreeSpace(irel, itemsz);
689 : for (;;)
690 : {
691 : Buffer buf;
692 31 : bool extensionLockHeld = false;
693 :
694 31 : CHECK_FOR_INTERRUPTS();
695 :
696 31 : if (newblk == InvalidBlockNumber)
697 : {
698 : /*
699 : * There's not enough free space in any existing index page,
700 : * according to the FSM: extend the relation to obtain a shiny new
701 : * page.
*
* The extension lock is skipped when RELATION_IS_LOCAL says so.
702 : */
703 14 : if (!RELATION_IS_LOCAL(irel))
704 : {
705 1 : LockRelationForExtension(irel, ExclusiveLock);
706 1 : extensionLockHeld = true;
707 : }
708 14 : buf = ReadBuffer(irel, P_NEW);
709 14 : newblk = BufferGetBlockNumber(buf);
710 14 : *extended = true;
711 :
712 : BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
713 : BufferGetBlockNumber(buf)));
714 : }
715 17 : else if (newblk == oldblk)
716 : {
717 : /*
718 : * There's an odd corner-case here where the FSM is out-of-date,
719 : * and gave us the old page.
*
* The caller's pin is reused; no additional pin is taken.
720 : */
721 1 : buf = oldbuf;
722 : }
723 : else
724 : {
725 16 : buf = ReadBuffer(irel, newblk);
726 : }
727 :
728 : /*
729 : * We lock the old buffer first, if it's earlier than the new one; but
730 : * before we do, we need to check that it hasn't been turned into a
731 : * revmap page concurrently; if we detect that it happened, give up
732 : * and tell caller to start over.
733 : */
734 31 : if (BufferIsValid(oldbuf) && oldblk < newblk)
735 : {
736 1 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
737 1 : if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
738 : {
739 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
740 :
741 : /*
742 : * It is possible that the new page was obtained from
743 : * extending the relation. In that case, we must be sure to
744 : * record it in the FSM before leaving, because otherwise the
745 : * space would be lost forever. However, we cannot let an
746 : * uninitialized page get in the FSM, so we need to initialize
747 : * it first.
748 : */
749 0 : if (*extended)
750 : {
/*
* NOTE(review): buf is pinned but not yet content-locked at this
* point (the LockBuffer call on buf comes further down) --
* confirm that initializing it here without the lock is
* intended.
*/
751 0 : brin_initialize_empty_new_buffer(irel, buf);
752 : /* shouldn't matter, but don't confuse caller */
753 0 : *extended = false;
754 : }
755 :
756 0 : if (extensionLockHeld)
757 0 : UnlockRelationForExtension(irel, ExclusiveLock);
758 :
/* only the pin needs dropping, since buf was never locked */
759 0 : ReleaseBuffer(buf);
760 0 : return InvalidBuffer;
761 : }
762 : }
763 :
764 31 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
765 :
/* the extension lock is kept until the new buffer is locked */
766 31 : if (extensionLockHeld)
767 1 : UnlockRelationForExtension(irel, ExclusiveLock);
768 :
769 31 : page = BufferGetPage(buf);
770 :
771 : /*
772 : * We have a new buffer to insert into. Check that the new page has
773 : * enough free space, and return it if it does; otherwise start over.
774 : * Note that we allow for the FSM to be out of date here, and in that
775 : * case we update it and move on.
776 : *
777 : * (br_page_get_freespace also checks that the FSM didn't hand us a
778 : * page that has since been repurposed for the revmap.)
*
* A page that was just added by extension has not been initialized yet
* (that is left to the caller), so it is not measured; it is taken to
* offer BrinMaxItemSize bytes.
779 : */
780 62 : freespace = *extended ?
781 31 : BrinMaxItemSize : br_page_get_freespace(page);
782 31 : if (freespace >= itemsz)
783 : {
784 21 : RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));
785 :
786 : /*
787 : * Since the target block specification can get lost on cache
788 : * invalidations, make sure we update the more permanent FSM with
789 : * data about it before going away.
790 : */
791 21 : if (*extended)
792 14 : RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
793 : freespace);
794 :
795 : /*
796 : * Lock the old buffer if not locked already. Note that in this
797 : * case we know for sure it's a regular page: it's later than the
798 : * new page we just got, which is not a revmap page, and revmap
799 : * pages are always consecutive.
800 : */
801 21 : if (BufferIsValid(oldbuf) && oldblk > newblk)
802 : {
803 0 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
804 0 : Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
805 : }
806 :
807 21 : return buf;
808 : }
809 :
810 : /* This page is no good. */
811 :
812 : /*
813 : * If an entirely new page does not contain enough free space for the
814 : * new item, then surely that item is oversized. Complain loudly; but
815 : * first make sure we initialize the page and record it as free, for
816 : * next time.
817 : */
818 10 : if (*extended)
819 : {
820 0 : brin_initialize_empty_new_buffer(irel, buf);
821 :
822 0 : ereport(ERROR,
823 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
824 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
825 : itemsz, freespace, RelationGetRelationName(irel))));
826 : return InvalidBuffer; /* keep compiler quiet */
827 : }
828 :
/*
* Undo this iteration's locking before trying another page. When buf is
* the old buffer itself, the pin belongs to the caller and is kept. The
* old buffer is locked at this point exactly when oldblk <= newblk:
* either it was locked first above, or it is buf.
*/
829 10 : if (newblk != oldblk)
830 9 : UnlockReleaseBuffer(buf);
831 10 : if (BufferIsValid(oldbuf) && oldblk <= newblk)
832 1 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
833 :
/* record the free space just measured for this page and ask the FSM again */
834 10 : newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
835 10 : }
836 : }
837 :
838 : /*
839 : * Initialize a page as an empty regular BRIN page, WAL-log this, and record
840 : * the page in FSM.
841 : *
842 : * There are several corner situations in which we extend the relation to
843 : * obtain a new page and later find that we cannot use it immediately. When
844 : * that happens, we don't want to let the page go unrecorded in FSM, because
845 : * there is no mechanism to get the space back and the index would bloat.
846 : * Also, because we would not WAL-log the action that would initialize the
847 : * page, the page would go uninitialized in a standby (or after recovery).
*
* This function neither locks, unlocks nor releases the buffer; that is
* left to the caller.
848 : */
849 : static void
850 0 : brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
851 : {
852 : Page page;
853 :
854 : BRIN_elog((DEBUG2,
855 : "brin_initialize_empty_new_buffer: initializing blank page %u",
856 : BufferGetBlockNumber(buffer)));
857 :
/* initialize, dirty and WAL-log the page within one critical section */
858 0 : START_CRIT_SECTION();
859 0 : page = BufferGetPage(buffer);
860 0 : brin_page_init(page, BRIN_PAGETYPE_REGULAR);
861 0 : MarkBufferDirty(buffer);
862 0 : log_newpage_buffer(buffer, true);
863 0 : END_CRIT_SECTION();
864 :
865 : /*
866 : * We update the FSM for this page, but this is not WAL-logged. This is
867 : * acceptable because VACUUM will scan the index and update the FSM with
868 : * pages whose FSM records were forgotten in a crash.
869 : */
870 0 : RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
871 : br_page_get_freespace(page));
872 0 : }
873 :
874 :
875 : /*
876 : * Return the amount of free space on a regular BRIN index page.
877 : *
878 : * If the page is not a regular page, or has been marked with the
879 : * BRIN_EVACUATE_PAGE flag, returns 0.
*
* Reporting 0 makes callers that compare the result against an item size
* treat such pages as full.
880 : */
881 : static Size
882 173 : br_page_get_freespace(Page page)
883 : {
884 346 : if (!BRIN_IS_REGULAR_PAGE(page) ||
885 173 : (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
886 0 : return 0;
887 : else
888 173 : return PageGetFreeSpace(page);
889 : }
|