Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtxlog.c
4 : * WAL replay logic for btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtxlog.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/bufmask.h"
18 : #include "access/heapam_xlog.h"
19 : #include "access/nbtree.h"
20 : #include "access/nbtxlog.h"
21 : #include "access/transam.h"
22 : #include "access/xlog.h"
23 : #include "access/xlogutils.h"
24 : #include "storage/procarray.h"
25 : #include "miscadmin.h"
26 :
27 : /*
28 : * _bt_restore_page -- re-enter all the index tuples on a page
29 : *
30 : * The page is freshly init'd, and *from (length len) is a copy of what
31 : * had been its upper part (pd_upper to pd_special). We assume that the
32 : * tuples had been added to the page in item-number order, and therefore
33 : * the one with highest item number appears first (lowest on the page).
34 : */
35 : static void
36 0 : _bt_restore_page(Page page, char *from, int len)
37 : {
38 : IndexTupleData itupdata;
39 : Size itemsz;
40 0 : char *end = from + len;
41 : Item items[MaxIndexTuplesPerPage];
42 : uint16 itemsizes[MaxIndexTuplesPerPage];
43 : int i;
44 : int nitems;
45 :
46 : /*
47 : * To get the items back in the original order, we add them to the page in
48 : * reverse. To figure out where one tuple ends and another begins, we
49 : * have to scan them in forward order first.
50 : */
51 0 : i = 0;
52 0 : while (from < end)
53 : {
54 : /* Need to copy tuple header due to alignment considerations */
55 0 : memcpy(&itupdata, from, sizeof(IndexTupleData));
56 0 : itemsz = IndexTupleDSize(itupdata);
57 0 : itemsz = MAXALIGN(itemsz);
58 :
59 0 : items[i] = (Item) from;
60 0 : itemsizes[i] = itemsz;
61 0 : i++;
62 :
63 0 : from += itemsz;
64 : }
65 0 : nitems = i;
66 :
67 0 : for (i = nitems - 1; i >= 0; i--)
68 : {
69 0 : if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
70 : false, false) == InvalidOffsetNumber)
71 0 : elog(PANIC, "_bt_restore_page: cannot add item to page");
72 0 : from += itemsz;
73 : }
74 0 : }
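/*
 * Worked example (editor's sketch, not part of the original file): suppose
 * the saved page data holds tuples A, B, C in that physical order.  Per the
 * comment above, A had the highest item number (3) and C the lowest (1).
 * The first loop only measures each tuple to find its boundaries; the second
 * loop then adds them in reverse array order, so C is added at offset 1,
 * B at offset 2, and A at offset 3, restoring the original item numbering.
 */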
75 :
76 : static void
77 0 : _bt_restore_meta(XLogReaderState *record, uint8 block_id)
78 : {
79 0 : XLogRecPtr lsn = record->EndRecPtr;
80 : Buffer metabuf;
81 : Page metapg;
82 : BTMetaPageData *md;
83 : BTPageOpaque pageop;
84 : xl_btree_metadata *xlrec;
85 : char *ptr;
86 : Size len;
87 :
88 0 : metabuf = XLogInitBufferForRedo(record, block_id);
89 0 : ptr = XLogRecGetBlockData(record, block_id, &len);
90 :
91 0 : Assert(len == sizeof(xl_btree_metadata));
92 0 : Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
93 0 : xlrec = (xl_btree_metadata *) ptr;
94 0 : metapg = BufferGetPage(metabuf);
95 :
96 0 : _bt_pageinit(metapg, BufferGetPageSize(metabuf));
97 :
98 0 : md = BTPageGetMeta(metapg);
99 0 : md->btm_magic = BTREE_MAGIC;
100 0 : md->btm_version = BTREE_VERSION;
101 0 : md->btm_root = xlrec->root;
102 0 : md->btm_level = xlrec->level;
103 0 : md->btm_fastroot = xlrec->fastroot;
104 0 : md->btm_fastlevel = xlrec->fastlevel;
105 :
106 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
107 0 : pageop->btpo_flags = BTP_META;
108 :
109 : /*
110 : * Set pd_lower just past the end of the metadata. This is not essential
111 : * but it makes the page look compressible to xlog.c.
112 : */
113 0 : ((PageHeader) metapg)->pd_lower =
114 0 : ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;
115 :
116 0 : PageSetLSN(metapg, lsn);
117 0 : MarkBufferDirty(metabuf);
118 0 : UnlockReleaseBuffer(metabuf);
119 0 : }
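/*
 * Editor's note: the metapage is rebuilt from scratch here rather than
 * patched -- XLogInitBufferForRedo() and _bt_pageinit() discard whatever was
 * on disk, and the fields are refilled from the xl_btree_metadata payload
 * (magic and version are compile-time constants).  Setting pd_lower above is
 * purely a compression hint; replay would be correct without it.
 */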
120 :
121 : /*
122 : * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
123 : *
124 : * This is a common subroutine of the redo functions of all the WAL record
125 : * types that can insert a downlink: insert, split, and newroot.
126 : */
127 : static void
128 0 : _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
129 : {
130 0 : XLogRecPtr lsn = record->EndRecPtr;
131 : Buffer buf;
132 :
133 0 : if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
134 : {
135 0 : Page page = (Page) BufferGetPage(buf);
136 0 : BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
137 :
138 0 : Assert((pageop->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0);
139 0 : pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
140 :
141 0 : PageSetLSN(page, lsn);
142 0 : MarkBufferDirty(buf);
143 : }
144 0 : if (BufferIsValid(buf))
145 0 : UnlockReleaseBuffer(buf);
146 0 : }
147 :
148 : static void
149 0 : btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
150 : {
151 0 : XLogRecPtr lsn = record->EndRecPtr;
152 0 : xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
153 : Buffer buffer;
154 : Page page;
155 :
156 : /*
157 : * Insertion to an internal page finishes an incomplete split at the child
158 : * level. Clear the incomplete-split flag in the child. Note: during
159 : * normal operation, the child and parent pages are locked at the same
160 : * time, so that clearing the flag and inserting the downlink appear
161 : * atomic to other backends. We don't bother with that during replay,
162 : * because readers don't care about the incomplete-split flag and there
163 : * cannot be updates happening.
164 : */
165 0 : if (!isleaf)
166 0 : _bt_clear_incomplete_split(record, 1);
167 0 : if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
168 : {
169 : Size datalen;
170 0 : char *datapos = XLogRecGetBlockData(record, 0, &datalen);
171 :
172 0 : page = BufferGetPage(buffer);
173 :
174 0 : if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
175 : false, false) == InvalidOffsetNumber)
176 0 : elog(PANIC, "btree_insert_redo: failed to add item");
177 :
178 0 : PageSetLSN(page, lsn);
179 0 : MarkBufferDirty(buffer);
180 : }
181 0 : if (BufferIsValid(buffer))
182 0 : UnlockReleaseBuffer(buffer);
183 :
184 : /*
185 : * Note: in normal operation, we'd update the metapage while still holding
186 : * lock on the page we inserted into. But during replay it's not
187 : * necessary to hold that lock, since no other index updates can be
188 : * happening concurrently, and readers will cope fine with following an
189 : * obsolete link from the metapage.
190 : */
191 0 : if (ismeta)
192 0 : _bt_restore_meta(record, 2);
193 0 : }
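/*
 * Summary (editor's sketch): insert redo touches up to three blocks, in this
 * order:
 *   block 1 -- child page whose incomplete-split flag is cleared (internal
 *              inserts only)
 *   block 0 -- the page receiving the new tuple, via PageAddItem()
 *   block 2 -- the metapage, rebuilt by _bt_restore_meta() (ismeta only)
 * Unlike normal operation, no lock coupling is needed, for the reasons given
 * in the comments above.
 */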
194 :
195 : static void
196 0 : btree_xlog_split(bool onleft, XLogReaderState *record)
197 : {
198 0 : XLogRecPtr lsn = record->EndRecPtr;
199 0 : xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
200 0 : bool isleaf = (xlrec->level == 0);
201 : Buffer lbuf;
202 : Buffer rbuf;
203 : Page rpage;
204 : BTPageOpaque ropaque;
205 : char *datapos;
206 : Size datalen;
207 0 : Item left_hikey = NULL;
208 0 : Size left_hikeysz = 0;
209 : BlockNumber leftsib;
210 : BlockNumber rightsib;
211 : BlockNumber rnext;
212 :
213 0 : XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib);
214 0 : XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib);
215 0 : if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext))
216 0 : rnext = P_NONE;
217 :
218 : /*
219 : * Clear the incomplete-split flag on the left sibling of the child page
220 : * that this downlink points to. (As in btree_xlog_insert, this can be
221 : * done before locking the other pages.)
222 : */
223 0 : if (!isleaf)
224 0 : _bt_clear_incomplete_split(record, 3);
225 :
226 : /* Reconstruct right (new) sibling page from scratch */
227 0 : rbuf = XLogInitBufferForRedo(record, 1);
228 0 : datapos = XLogRecGetBlockData(record, 1, &datalen);
229 0 : rpage = (Page) BufferGetPage(rbuf);
230 :
231 0 : _bt_pageinit(rpage, BufferGetPageSize(rbuf));
232 0 : ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
233 :
234 0 : ropaque->btpo_prev = leftsib;
235 0 : ropaque->btpo_next = rnext;
236 0 : ropaque->btpo.level = xlrec->level;
237 0 : ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
238 0 : ropaque->btpo_cycleid = 0;
239 :
240 0 : _bt_restore_page(rpage, datapos, datalen);
241 :
242 : /*
243 : * On leaf level, the high key of the left page is equal to the first key
244 : * on the right page.
245 : */
246 0 : if (isleaf)
247 : {
248 0 : ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
249 :
250 0 : left_hikey = PageGetItem(rpage, hiItemId);
251 0 : left_hikeysz = ItemIdGetLength(hiItemId);
252 : }
253 :
254 0 : PageSetLSN(rpage, lsn);
255 0 : MarkBufferDirty(rbuf);
256 :
257 : /* don't release the buffer yet; we touch right page's first item below */
258 :
259 : /* Now reconstruct left (original) sibling page */
260 0 : if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO)
261 : {
262 : /*
263 : * To retain the same physical order that the tuples had, we
264 : * initialize a temporary empty page for the left page and add all the
265 : * items to that in item number order. This mirrors how _bt_split()
266 : * works. It's not strictly required to retain the same physical
267 : * order, as long as the items are in the correct item number order,
268 : * but it helps debugging. See also _bt_restore_page(), which does
269 : * the same for the right page.
270 : */
271 0 : Page lpage = (Page) BufferGetPage(lbuf);
272 0 : BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
273 : OffsetNumber off;
274 0 : Item newitem = NULL;
275 0 : Size newitemsz = 0;
276 : Page newlpage;
277 : OffsetNumber leftoff;
278 :
279 0 : datapos = XLogRecGetBlockData(record, 0, &datalen);
280 :
281 0 : if (onleft)
282 : {
283 0 : newitem = (Item) datapos;
284 0 : newitemsz = MAXALIGN(IndexTupleSize(newitem));
285 0 : datapos += newitemsz;
286 0 : datalen -= newitemsz;
287 : }
288 :
289 : /* Extract left hikey and its size (assuming 16-bit alignment) */
290 0 : if (!isleaf)
291 : {
292 0 : left_hikey = (Item) datapos;
293 0 : left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
294 0 : datapos += left_hikeysz;
295 0 : datalen -= left_hikeysz;
296 : }
297 0 : Assert(datalen == 0);
298 :
299 0 : newlpage = PageGetTempPageCopySpecial(lpage);
300 :
301 : /* Set high key */
302 0 : leftoff = P_HIKEY;
303 0 : if (PageAddItem(newlpage, left_hikey, left_hikeysz,
304 : P_HIKEY, false, false) == InvalidOffsetNumber)
305 0 : elog(PANIC, "failed to add high key to left page after split");
306 0 : leftoff = OffsetNumberNext(leftoff);
307 :
308 0 : for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
309 : {
310 : ItemId itemid;
311 : Size itemsz;
312 : Item item;
313 :
314 : /* add the new item if it was inserted on left page */
315 0 : if (onleft && off == xlrec->newitemoff)
316 : {
317 0 : if (PageAddItem(newlpage, newitem, newitemsz, leftoff,
318 : false, false) == InvalidOffsetNumber)
319 0 : elog(ERROR, "failed to add new item to left page after split");
320 0 : leftoff = OffsetNumberNext(leftoff);
321 : }
322 :
323 0 : itemid = PageGetItemId(lpage, off);
324 0 : itemsz = ItemIdGetLength(itemid);
325 0 : item = PageGetItem(lpage, itemid);
326 0 : if (PageAddItem(newlpage, item, itemsz, leftoff,
327 : false, false) == InvalidOffsetNumber)
328 0 : elog(ERROR, "failed to add old item to left page after split");
329 0 : leftoff = OffsetNumberNext(leftoff);
330 : }
331 :
332 : /* cope with possibility that newitem goes at the end */
333 0 : if (onleft && off == xlrec->newitemoff)
334 : {
335 0 : if (PageAddItem(newlpage, newitem, newitemsz, leftoff,
336 : false, false) == InvalidOffsetNumber)
337 0 : elog(ERROR, "failed to add new item to left page after split");
338 0 : leftoff = OffsetNumberNext(leftoff);
339 : }
340 :
341 0 : PageRestoreTempPage(newlpage, lpage);
342 :
343 : /* Fix opaque fields */
344 0 : lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
345 0 : if (isleaf)
346 0 : lopaque->btpo_flags |= BTP_LEAF;
347 0 : lopaque->btpo_next = rightsib;
348 0 : lopaque->btpo_cycleid = 0;
349 :
350 0 : PageSetLSN(lpage, lsn);
351 0 : MarkBufferDirty(lbuf);
352 : }
353 :
354 : /* We no longer need the buffers */
355 0 : if (BufferIsValid(lbuf))
356 0 : UnlockReleaseBuffer(lbuf);
357 0 : UnlockReleaseBuffer(rbuf);
358 :
359 : /*
360 : * Fix left-link of the page to the right of the new right sibling.
361 : *
362 : * Note: in normal operation, we do this while still holding lock on the
363 : * two split pages. However, that's not necessary for correctness in WAL
364 : * replay, because no other index update can be in progress, and readers
365 : * will cope properly when following an obsolete left-link.
366 : */
367 0 : if (rnext != P_NONE)
368 : {
369 : Buffer buffer;
370 :
371 0 : if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
372 : {
373 0 : Page page = (Page) BufferGetPage(buffer);
374 0 : BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
375 :
376 0 : pageop->btpo_prev = rightsib;
377 :
378 0 : PageSetLSN(page, lsn);
379 0 : MarkBufferDirty(buffer);
380 : }
381 0 : if (BufferIsValid(buffer))
382 0 : UnlockReleaseBuffer(buffer);
383 : }
384 0 : }
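/*
 * Summary (editor's sketch) of split redo ordering:
 *   1. clear the incomplete-split flag on the child (block 3, non-leaf only)
 *   2. rebuild the new right sibling (block 1) from scratch with
 *      _bt_restore_page()
 *   3. rebuild the left sibling (block 0) on a temporary page -- high key,
 *      then retained items (and the new item, if it went left) in item
 *      number order -- and swap it in with PageRestoreTempPage()
 *   4. repoint btpo_prev of the old right neighbour (block 2) at the new page
 * When block 2 is absent the split page was rightmost on its level, so rnext
 * stays P_NONE and step 4 is skipped.
 */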
385 :
386 : static void
387 0 : btree_xlog_vacuum(XLogReaderState *record)
388 : {
389 0 : XLogRecPtr lsn = record->EndRecPtr;
390 : Buffer buffer;
391 : Page page;
392 : BTPageOpaque opaque;
393 : #ifdef UNUSED
394 : xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
395 :
396 : /*
397 : * This section of code is thought to be no longer needed, after analysis
398 : * of the calling paths. It is retained to allow the code to be reinstated
399 : * if a flaw is revealed in that thinking.
400 : *
401 : * If we are running non-MVCC scans using this index we need to do some
402 : * additional work to ensure correctness, which is known as a "pin scan"
403 : * described in more detail in the paragraphs below. We used to do the extra
404 : * work in all cases, whereas we now avoid that work in most cases. If
405 : * lastBlockVacuumed is set to InvalidBlockNumber then we skip the
406 : * additional work required for the pin scan.
407 : *
408 : * Avoiding this extra work is important since it requires us to touch
409 : * every page in the index, so is an O(N) operation. Worse, it is an
410 : * operation performed in the foreground during redo, so it delays
411 : * replication directly.
412 : *
413 : * If queries might be active then we need to ensure every leaf page is
414 : * unpinned between the lastBlockVacuumed and the current block, if there
415 : * are any. This prevents replay of the VACUUM from reaching the stage of
416 : * removing heap tuples while there could still be indexscans "in flight"
417 : * to those particular tuples for those scans which could be confused by
418 : * finding new tuples at the old TID locations (see nbtree/README).
419 : *
420 : * It might be worth checking if there are actually any backends running;
421 : * if not, we could just skip this.
422 : *
423 : * Since VACUUM can visit leaf pages out-of-order, it might issue records
424 : * with lastBlockVacuumed >= block; that's not an error, it just means
425 : * nothing to do now.
426 : *
427 : * Note: since we touch all pages in the range, we will lock non-leaf
428 : * pages, and also any empty (all-zero) pages that may be in the index. It
429 : * doesn't seem worth the complexity to avoid that. But it's important
430 : * that HotStandbyActiveInReplay() will not return true if the database
431 : * isn't yet consistent; so we need not fear reading still-corrupt blocks
432 : * here during crash recovery.
433 : */
434 : if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed))
435 : {
436 : RelFileNode thisrnode;
437 : BlockNumber thisblkno;
438 : BlockNumber blkno;
439 :
440 : XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno);
441 :
442 : for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++)
443 : {
444 : /*
445 : * We use RBM_NORMAL_NO_LOG mode because it's not an error
446 : * condition to see all-zero pages. The original btvacuumpage
447 : * scan would have skipped over all-zero pages, noting them in FSM
448 : * but not bothering to initialize them just yet; so we mustn't
449 : * throw an error here. (We could skip acquiring the cleanup lock
450 : * if PageIsNew, but it's probably not worth the cycles to test.)
451 : *
452 : * XXX we don't actually need to read the block, we just need to
453 : * confirm it is unpinned. If we had a special call into the
454 : * buffer manager we could optimise this so that if the block is
455 : * not in shared_buffers we confirm it as unpinned. Optimizing
456 : * this is now moot, since in most cases we avoid the scan.
457 : */
458 : buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno,
459 : RBM_NORMAL_NO_LOG);
460 : if (BufferIsValid(buffer))
461 : {
462 : LockBufferForCleanup(buffer);
463 : UnlockReleaseBuffer(buffer);
464 : }
465 : }
466 : }
467 : #endif
468 :
469 : /*
470 : * Like in btvacuumpage(), we need to take a cleanup lock on every leaf
471 : * page. See nbtree/README for details.
472 : */
473 0 : if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
474 : == BLK_NEEDS_REDO)
475 : {
476 : char *ptr;
477 : Size len;
478 :
479 0 : ptr = XLogRecGetBlockData(record, 0, &len);
480 :
481 0 : page = (Page) BufferGetPage(buffer);
482 :
483 0 : if (len > 0)
484 : {
485 : OffsetNumber *unused;
486 : OffsetNumber *unend;
487 :
488 0 : unused = (OffsetNumber *) ptr;
489 0 : unend = (OffsetNumber *) ((char *) ptr + len);
490 :
491 0 : if ((unend - unused) > 0)
492 0 : PageIndexMultiDelete(page, unused, unend - unused);
493 : }
494 :
495 : /*
496 : * Mark the page as not containing any LP_DEAD items --- see comments
497 : * in _bt_delitems_vacuum().
498 : */
499 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
500 0 : opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
501 :
502 0 : PageSetLSN(page, lsn);
503 0 : MarkBufferDirty(buffer);
504 : }
505 0 : if (BufferIsValid(buffer))
506 0 : UnlockReleaseBuffer(buffer);
507 0 : }
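/*
 * Editor's note: the live part of this function is small -- take a cleanup
 * lock on the leaf (block 0), apply PageIndexMultiDelete() to the offsets
 * carried in the block data, and clear BTP_HAS_GARBAGE.  The #ifdef UNUSED
 * section above retains the old "pin scan" logic purely as documentation of
 * why it is believed to be unnecessary.
 */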
508 :
509 : /*
510 : * Get the latestRemovedXid from the heap pages pointed at by the index
511 : * tuples being deleted. This puts the work for calculating latestRemovedXid
512 : * into the recovery path rather than the primary path.
513 : *
514 : * It's possible that this generates a fair amount of I/O, since an index
515 : * block may have hundreds of tuples being deleted. Repeat accesses to the
516 : * same heap blocks are common, though are not yet optimised.
517 : *
518 : * XXX optimise later with something like XLogPrefetchBuffer()
519 : */
520 : static TransactionId
521 0 : btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record)
522 : {
523 0 : xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
524 : OffsetNumber *unused;
525 : Buffer ibuffer,
526 : hbuffer;
527 : Page ipage,
528 : hpage;
529 : RelFileNode rnode;
530 : BlockNumber blkno;
531 : ItemId iitemid,
532 : hitemid;
533 : IndexTuple itup;
534 : HeapTupleHeader htuphdr;
535 : BlockNumber hblkno;
536 : OffsetNumber hoffnum;
537 0 : TransactionId latestRemovedXid = InvalidTransactionId;
538 : int i;
539 :
540 : /*
541 : * If there's nothing running on the standby we don't need to derive a
542 : * full latestRemovedXid value, so use a fast path out of here. This
543 : * returns InvalidTransactionId, and so will conflict with all HS
544 : * transactions; but since we just worked out that there are zero such
545 : * backends, that's OK.
546 : *
547 : * XXX There is a race condition here, which is that a new backend might
548 : * start just after we look. If so, it cannot need to conflict, but this
549 : * coding will result in throwing a conflict anyway.
550 : */
551 0 : if (CountDBBackends(InvalidOid) == 0)
552 0 : return latestRemovedXid;
553 :
554 : /*
555 : * In what follows, we have to examine the previous state of the index
556 : * page, as well as the heap page(s) it points to. This is only valid if
557 : * WAL replay has reached a consistent database state; which means that
558 : * the preceding check is not just an optimization, but is *necessary*. We
559 : * won't have let in any user sessions before we reach consistency.
560 : */
561 0 : if (!reachedConsistency)
562 0 : elog(PANIC, "btree_xlog_delete_get_latestRemovedXid: cannot operate with inconsistent data");
563 :
564 : /*
565 : * Get index page. If the DB is consistent, this should not fail, nor
566 : * should any of the heap page fetches below. If one does, we return
567 : * InvalidTransactionId to cancel all HS transactions. That's probably
568 : * overkill, but it's safe, and certainly better than panicking here.
569 : */
570 0 : XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
571 0 : ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
572 0 : if (!BufferIsValid(ibuffer))
573 0 : return InvalidTransactionId;
574 0 : LockBuffer(ibuffer, BT_READ);
575 0 : ipage = (Page) BufferGetPage(ibuffer);
576 :
577 : /*
578 : * Loop through the deleted index items to obtain the TransactionId from
579 : * the heap items they point to.
580 : */
581 0 : unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
582 :
583 0 : for (i = 0; i < xlrec->nitems; i++)
584 : {
585 : /*
586 : * Identify the index tuple about to be deleted
587 : */
588 0 : iitemid = PageGetItemId(ipage, unused[i]);
589 0 : itup = (IndexTuple) PageGetItem(ipage, iitemid);
590 :
591 : /*
592 : * Locate the heap page that the index tuple points at
593 : */
594 0 : hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
595 0 : hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
596 0 : if (!BufferIsValid(hbuffer))
597 : {
598 0 : UnlockReleaseBuffer(ibuffer);
599 0 : return InvalidTransactionId;
600 : }
601 0 : LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
602 0 : hpage = (Page) BufferGetPage(hbuffer);
603 :
604 : /*
605 : * Look up the heap tuple header that the index tuple points at by
606 : * using the heap node supplied with the xlrec. We can't use
607 : * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
608 : * Note that we are not looking at tuple data here, just headers.
609 : */
610 0 : hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
611 0 : hitemid = PageGetItemId(hpage, hoffnum);
612 :
613 : /*
614 : * Follow any redirections until we find something useful.
615 : */
616 0 : while (ItemIdIsRedirected(hitemid))
617 : {
618 0 : hoffnum = ItemIdGetRedirect(hitemid);
619 0 : hitemid = PageGetItemId(hpage, hoffnum);
620 0 : CHECK_FOR_INTERRUPTS();
621 : }
622 :
623 : /*
624 : * If the heap item has storage, then read the header and use that to
625 : * set latestRemovedXid.
626 : *
627 : * Some LP_DEAD items may not be accessible, so we ignore them.
628 : */
629 0 : if (ItemIdHasStorage(hitemid))
630 : {
631 0 : htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
632 :
633 0 : HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
634 : }
635 0 : else if (ItemIdIsDead(hitemid))
636 : {
637 : /*
638 : * Conjecture: if hitemid is dead then it had xids before the xids
639 : * marked on LP_NORMAL items. So we just ignore this item and move
640 : * on to the next, for the purposes of calculating
641 : * latestRemovedXid.
642 : */
643 : }
644 : else
645 0 : Assert(!ItemIdIsUsed(hitemid));
646 :
647 0 : UnlockReleaseBuffer(hbuffer);
648 : }
649 :
650 0 : UnlockReleaseBuffer(ibuffer);
651 :
652 : /*
653 : * If all heap tuples were LP_DEAD then we will be returning
654 : * InvalidTransactionId here, which avoids conflicts. This matches
655 : * existing logic which assumes that LP_DEAD tuples must already be older
656 : * than the latestRemovedXid on the cleanup record that set them as
657 : * LP_DEAD, hence must already have generated a conflict.
658 : */
659 0 : return latestRemovedXid;
660 : }
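/*
 * Editor's sketch of the lookup done above for each to-be-deleted offset:
 *   index item -> itup->t_tid gives the heap block and offset
 *   heap page  -> PageGetItemId(), following any LP_REDIRECT chain
 *   heap tuple -> HeapTupleHeaderAdvanceLatestRemovedXid() folds the tuple
 *                 header's xids into latestRemovedXid
 * LP_DEAD and unused line pointers contribute nothing, as explained in the
 * comments above.
 */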
661 :
662 : static void
663 0 : btree_xlog_delete(XLogReaderState *record)
664 : {
665 0 : XLogRecPtr lsn = record->EndRecPtr;
666 0 : xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
667 : Buffer buffer;
668 : Page page;
669 : BTPageOpaque opaque;
670 :
671 : /*
672 : * If we have any conflict processing to do, it must happen before we
673 : * update the page.
674 : *
675 : * Btree delete records can conflict with standby queries. You might
676 : * think that vacuum records would conflict as well, but we've handled
677 : * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
678 : * cleaned by the vacuum of the heap and so we can resolve any conflicts
679 : * just once when that arrives. After that we know that no conflicts
680 : * exist from individual btree vacuum records on that index.
681 : */
682 0 : if (InHotStandby)
683 : {
684 0 : TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record);
685 : RelFileNode rnode;
686 :
687 0 : XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
688 :
689 0 : ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
690 : }
691 :
692 : /*
693 : * We don't need to take a cleanup lock to apply these changes. See
694 : * nbtree/README for details.
695 : */
696 0 : if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
697 : {
698 0 : page = (Page) BufferGetPage(buffer);
699 :
700 0 : if (XLogRecGetDataLen(record) > SizeOfBtreeDelete)
701 : {
702 : OffsetNumber *unused;
703 :
704 0 : unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
705 :
706 0 : PageIndexMultiDelete(page, unused, xlrec->nitems);
707 : }
708 :
709 : /*
710 : * Mark the page as not containing any LP_DEAD items --- see comments
711 : * in _bt_delitems_delete().
712 : */
713 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
714 0 : opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
715 :
716 0 : PageSetLSN(page, lsn);
717 0 : MarkBufferDirty(buffer);
718 : }
719 0 : if (BufferIsValid(buffer))
720 0 : UnlockReleaseBuffer(buffer);
721 0 : }
722 :
723 : static void
724 0 : btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
725 : {
726 0 : XLogRecPtr lsn = record->EndRecPtr;
727 0 : xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
728 : Buffer buffer;
729 : Page page;
730 : BTPageOpaque pageop;
731 : IndexTupleData trunctuple;
732 :
733 : /*
734 : * In normal operation, we would lock all the pages this WAL record
735 : * touches before changing any of them. In WAL replay, it should be okay
736 : * to lock just one page at a time, since no concurrent index updates can
737 : * be happening, and readers should not care whether they arrive at the
738 : * target page or not (since it's surely empty).
739 : */
740 :
741 : /* parent page */
742 0 : if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
743 : {
744 : OffsetNumber poffset;
745 : ItemId itemid;
746 : IndexTuple itup;
747 : OffsetNumber nextoffset;
748 : BlockNumber rightsib;
749 :
750 0 : page = (Page) BufferGetPage(buffer);
751 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
752 :
753 0 : poffset = xlrec->poffset;
754 :
755 0 : nextoffset = OffsetNumberNext(poffset);
756 0 : itemid = PageGetItemId(page, nextoffset);
757 0 : itup = (IndexTuple) PageGetItem(page, itemid);
758 0 : rightsib = ItemPointerGetBlockNumber(&itup->t_tid);
759 :
760 0 : itemid = PageGetItemId(page, poffset);
761 0 : itup = (IndexTuple) PageGetItem(page, itemid);
762 0 : ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
763 0 : nextoffset = OffsetNumberNext(poffset);
764 0 : PageIndexTupleDelete(page, nextoffset);
765 :
766 0 : PageSetLSN(page, lsn);
767 0 : MarkBufferDirty(buffer);
768 : }
769 0 : if (BufferIsValid(buffer))
770 0 : UnlockReleaseBuffer(buffer);
771 :
772 : /* Rewrite the leaf page as a halfdead page */
773 0 : buffer = XLogInitBufferForRedo(record, 0);
774 0 : page = (Page) BufferGetPage(buffer);
775 :
776 0 : _bt_pageinit(page, BufferGetPageSize(buffer));
777 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
778 :
779 0 : pageop->btpo_prev = xlrec->leftblk;
780 0 : pageop->btpo_next = xlrec->rightblk;
781 0 : pageop->btpo.level = 0;
782 0 : pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
783 0 : pageop->btpo_cycleid = 0;
784 :
785 : /*
786 : * Construct a dummy hikey item that points to the next parent to be
787 : * deleted (if any).
788 : */
789 0 : MemSet(&trunctuple, 0, sizeof(IndexTupleData));
790 0 : trunctuple.t_info = sizeof(IndexTupleData);
791 0 : if (xlrec->topparent != InvalidBlockNumber)
792 0 : ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
793 : else
794 0 : ItemPointerSetInvalid(&trunctuple.t_tid);
795 0 : if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
796 : false, false) == InvalidOffsetNumber)
797 0 : elog(ERROR, "could not add dummy high key to half-dead page");
798 :
799 0 : PageSetLSN(page, lsn);
800 0 : MarkBufferDirty(buffer);
801 0 : UnlockReleaseBuffer(buffer);
802 0 : }
803 :
804 :
805 : static void
806 0 : btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
807 : {
808 0 : XLogRecPtr lsn = record->EndRecPtr;
809 0 : xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
810 : BlockNumber leftsib;
811 : BlockNumber rightsib;
812 : Buffer buffer;
813 : Page page;
814 : BTPageOpaque pageop;
815 :
816 0 : leftsib = xlrec->leftsib;
817 0 : rightsib = xlrec->rightsib;
818 :
819 : /*
820 : * In normal operation, we would lock all the pages this WAL record
821 : * touches before changing any of them. In WAL replay, it should be okay
822 : * to lock just one page at a time, since no concurrent index updates can
823 : * be happening, and readers should not care whether they arrive at the
824 : * target page or not (since it's surely empty).
825 : */
826 :
827 : /* Fix left-link of right sibling */
828 0 : if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
829 : {
830 0 : page = (Page) BufferGetPage(buffer);
831 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
832 0 : pageop->btpo_prev = leftsib;
833 :
834 0 : PageSetLSN(page, lsn);
835 0 : MarkBufferDirty(buffer);
836 : }
837 0 : if (BufferIsValid(buffer))
838 0 : UnlockReleaseBuffer(buffer);
839 :
840 : /* Fix right-link of left sibling, if any */
841 0 : if (leftsib != P_NONE)
842 : {
843 0 : if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
844 : {
845 0 : page = (Page) BufferGetPage(buffer);
846 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
847 0 : pageop->btpo_next = rightsib;
848 :
849 0 : PageSetLSN(page, lsn);
850 0 : MarkBufferDirty(buffer);
851 : }
852 0 : if (BufferIsValid(buffer))
853 0 : UnlockReleaseBuffer(buffer);
854 : }
855 :
856 : /* Rewrite target page as empty deleted page */
857 0 : buffer = XLogInitBufferForRedo(record, 0);
858 0 : page = (Page) BufferGetPage(buffer);
859 :
860 0 : _bt_pageinit(page, BufferGetPageSize(buffer));
861 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
862 :
863 0 : pageop->btpo_prev = leftsib;
864 0 : pageop->btpo_next = rightsib;
865 0 : pageop->btpo.xact = xlrec->btpo_xact;
866 0 : pageop->btpo_flags = BTP_DELETED;
867 0 : pageop->btpo_cycleid = 0;
868 :
869 0 : PageSetLSN(page, lsn);
870 0 : MarkBufferDirty(buffer);
871 0 : UnlockReleaseBuffer(buffer);
872 :
873 : /*
874 : * If we deleted a parent of the targeted leaf page, instead of the leaf
875 : * itself, update the leaf to point to the next remaining child in the
876 : * branch.
877 : */
878 0 : if (XLogRecHasBlockRef(record, 3))
879 : {
880 : /*
881 : * There is no real data on the page, so we just re-create it from
882 : * scratch using the information from the WAL record.
883 : */
884 : IndexTupleData trunctuple;
885 :
886 0 : buffer = XLogInitBufferForRedo(record, 3);
887 0 : page = (Page) BufferGetPage(buffer);
888 :
889 0 : _bt_pageinit(page, BufferGetPageSize(buffer));
890 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
891 :
892 0 : pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
893 0 : pageop->btpo_prev = xlrec->leafleftsib;
894 0 : pageop->btpo_next = xlrec->leafrightsib;
895 0 : pageop->btpo.level = 0;
896 0 : pageop->btpo_cycleid = 0;
897 :
898 : /* Add a dummy hikey item */
899 0 : MemSet(&trunctuple, 0, sizeof(IndexTupleData));
900 0 : trunctuple.t_info = sizeof(IndexTupleData);
901 0 : if (xlrec->topparent != InvalidBlockNumber)
902 0 : ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
903 : else
904 0 : ItemPointerSetInvalid(&trunctuple.t_tid);
905 0 : if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
906 : false, false) == InvalidOffsetNumber)
907 0 : elog(ERROR, "could not add dummy high key to half-dead page");
908 :
909 0 : PageSetLSN(page, lsn);
910 0 : MarkBufferDirty(buffer);
911 0 : UnlockReleaseBuffer(buffer);
912 : }
913 :
914 : /* Update metapage if needed */
915 0 : if (info == XLOG_BTREE_UNLINK_PAGE_META)
916 0 : _bt_restore_meta(record, 4);
917 0 : }
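/*
 * Summary (editor's sketch) of unlink redo, by block reference:
 *   block 2 -- right sibling: btpo_prev repointed past the removed page
 *   block 1 -- left sibling (if any): btpo_next repointed likewise
 *   block 0 -- the removed page itself, reinitialized as BTP_DELETED with
 *              btpo.xact recording when it may be recycled
 *   block 3 -- the to-be-deleted leaf, rebuilt as a half-dead stub when a
 *              parent page rather than the leaf itself was unlinked
 *   block 4 -- the metapage, restored for XLOG_BTREE_UNLINK_PAGE_META
 */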
918 :
919 : static void
920 0 : btree_xlog_newroot(XLogReaderState *record)
921 : {
922 0 : XLogRecPtr lsn = record->EndRecPtr;
923 0 : xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
924 : Buffer buffer;
925 : Page page;
926 : BTPageOpaque pageop;
927 : char *ptr;
928 : Size len;
929 :
930 0 : buffer = XLogInitBufferForRedo(record, 0);
931 0 : page = (Page) BufferGetPage(buffer);
932 :
933 0 : _bt_pageinit(page, BufferGetPageSize(buffer));
934 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
935 :
936 0 : pageop->btpo_flags = BTP_ROOT;
937 0 : pageop->btpo_prev = pageop->btpo_next = P_NONE;
938 0 : pageop->btpo.level = xlrec->level;
939 0 : if (xlrec->level == 0)
940 0 : pageop->btpo_flags |= BTP_LEAF;
941 0 : pageop->btpo_cycleid = 0;
942 :
943 0 : if (xlrec->level > 0)
944 : {
945 0 : ptr = XLogRecGetBlockData(record, 0, &len);
946 0 : _bt_restore_page(page, ptr, len);
947 :
948 : /* Clear the incomplete-split flag in left child */
949 0 : _bt_clear_incomplete_split(record, 1);
950 : }
951 :
952 0 : PageSetLSN(page, lsn);
953 0 : MarkBufferDirty(buffer);
954 0 : UnlockReleaseBuffer(buffer);
955 :
956 0 : _bt_restore_meta(record, 2);
957 0 : }
958 :
959 : static void
960 0 : btree_xlog_reuse_page(XLogReaderState *record)
961 : {
962 0 : xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
963 :
964 : /*
965 : * Btree reuse_page records exist to provide a conflict point when we
966 : * reuse pages in the index via the FSM. That's all they do though.
967 : *
968 : * latestRemovedXid was the page's btpo.xact. The btpo.xact <
969 : * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors the
970 : * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
971 : * Consequently, one XID value achieves the same exclusion effect on
972 : * master and standby.
973 : */
974 0 : if (InHotStandby)
975 : {
976 0 : ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
977 : xlrec->node);
978 : }
979 0 : }
980 :
981 :
982 : void
983 0 : btree_redo(XLogReaderState *record)
984 : {
985 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
986 :
987 0 : switch (info)
988 : {
989 : case XLOG_BTREE_INSERT_LEAF:
990 0 : btree_xlog_insert(true, false, record);
991 0 : break;
992 : case XLOG_BTREE_INSERT_UPPER:
993 0 : btree_xlog_insert(false, false, record);
994 0 : break;
995 : case XLOG_BTREE_INSERT_META:
996 0 : btree_xlog_insert(false, true, record);
997 0 : break;
998 : case XLOG_BTREE_SPLIT_L:
999 0 : btree_xlog_split(true, record);
1000 0 : break;
1001 : case XLOG_BTREE_SPLIT_R:
1002 0 : btree_xlog_split(false, record);
1003 0 : break;
1004 : case XLOG_BTREE_VACUUM:
1005 0 : btree_xlog_vacuum(record);
1006 0 : break;
1007 : case XLOG_BTREE_DELETE:
1008 0 : btree_xlog_delete(record);
1009 0 : break;
1010 : case XLOG_BTREE_MARK_PAGE_HALFDEAD:
1011 0 : btree_xlog_mark_page_halfdead(info, record);
1012 0 : break;
1013 : case XLOG_BTREE_UNLINK_PAGE:
1014 : case XLOG_BTREE_UNLINK_PAGE_META:
1015 0 : btree_xlog_unlink_page(info, record);
1016 0 : break;
1017 : case XLOG_BTREE_NEWROOT:
1018 0 : btree_xlog_newroot(record);
1019 0 : break;
1020 : case XLOG_BTREE_REUSE_PAGE:
1021 0 : btree_xlog_reuse_page(record);
1022 0 : break;
1023 : default:
1024 0 : elog(PANIC, "btree_redo: unknown op code %u", info);
1025 : }
1026 0 : }
1027 :
1028 : /*
1029 : * Mask a btree page before performing consistency checks on it.
1030 : */
1031 : void
1032 0 : btree_mask(char *pagedata, BlockNumber blkno)
1033 : {
1034 0 : Page page = (Page) pagedata;
1035 : BTPageOpaque maskopaq;
1036 :
1037 0 : mask_page_lsn(page);
1038 :
1039 0 : mask_page_hint_bits(page);
1040 0 : mask_unused_space(page);
1041 :
1042 0 : maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
1043 :
1044 0 : if (P_ISDELETED(maskopaq))
1045 : {
1046 : /*
1047 : * Mask page content on a DELETED page since it will be re-initialized
1048 : * during replay. See btree_xlog_unlink_page() for details.
1049 : */
1050 0 : mask_page_content(page);
1051 : }
1052 0 : else if (P_ISLEAF(maskopaq))
1053 : {
1054 : /*
1055 : * In btree leaf pages, it is possible to modify the LP_FLAGS without
1056 : * emitting any WAL record. Hence, mask the line pointer flags. See
1057 : * _bt_killitems(), _bt_check_unique() for details.
1058 : */
1059 0 : mask_lp_flags(page);
1060 : }
1061 :
1062 : /*
1063 : * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
1064 : * _bt_killitems(), _bt_check_unique() for details.
1065 : */
1066 0 : maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;
1067 :
1068 : /*
1069 : * During replay of a btree page split, we don't set the BTP_SPLIT_END
1070 : * flag on the right sibling, and we always initialize its btpo_cycleid
1071 : * to 0, unlike in normal operation. See btree_xlog_split() for details.
1072 : */
1073 0 : maskopaq->btpo_flags &= ~BTP_SPLIT_END;
1074 0 : maskopaq->btpo_cycleid = 0;
1075 0 : }
|