Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * generic_xlog.c
4 : * Implementation of generic xlog records.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * src/backend/access/transam/generic_xlog.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/bufmask.h"
17 : #include "access/generic_xlog.h"
18 : #include "access/xlogutils.h"
19 : #include "miscadmin.h"
20 : #include "utils/memutils.h"
21 :
22 : /*-------------------------------------------------------------------------
23 : * Internally, a delta between pages consists of a set of fragments. Each
24 : * fragment represents changes made in a given region of a page. A fragment
25 : * is made up as follows:
26 : *
27 : * - offset of page region (OffsetNumber)
28 : * - length of page region (OffsetNumber)
29 : * - data - the data to place into the region ('length' number of bytes)
30 : *
31 : * Unchanged regions of a page are not represented in its delta. As a result,
32 : * a delta can be more compact than the full page image. But having an
33 : * unchanged region between two fragments that is smaller than the fragment
34 : * header (offset+length) does not pay off in terms of the overall size of
35 : * the delta. For this reason, we merge adjacent fragments if the unchanged
36 : * region between them is <= MATCH_THRESHOLD bytes.
37 : *
38 : * We do not bother to merge fragments across the "lower" and "upper" parts
39 : * of a page; it's very seldom the case that pd_lower and pd_upper are within
40 : * MATCH_THRESHOLD bytes of each other, and handling that infrequent case
41 : * would complicate and slow down the delta-computation code unduly.
42 : * Therefore, the worst-case delta size includes two fragment headers plus
43 : * a full page's worth of data.
44 : *-------------------------------------------------------------------------
45 : */
/* Every fragment starts with two OffsetNumbers: region offset and length */
#define FRAGMENT_HEADER_SIZE	(sizeof(OffsetNumber) * 2)
/* Matching runs no longer than one fragment header are not worth splitting on */
#define MATCH_THRESHOLD			FRAGMENT_HEADER_SIZE
/* Worst case: a full page of data plus one header each for lower and upper part */
#define MAX_DELTA_SIZE			(2 * FRAGMENT_HEADER_SIZE + BLCKSZ)
49 :
50 : /* Struct of generic xlog data for single page */
51 : typedef struct
52 : {
53 : Buffer buffer; /* registered buffer */
54 : int flags; /* flags for this buffer */
55 : int deltaLen; /* space consumed in delta field */
56 : char *image; /* copy of page image for modification, do not
57 : * do it in-place to have aligned memory chunk */
58 : char delta[MAX_DELTA_SIZE]; /* delta between page images */
59 : } PageData;
60 :
61 : /* State of generic xlog record construction */
62 : struct GenericXLogState
63 : {
64 : /*
65 : * page's images. Should be first in this struct to have MAXALIGN'ed
66 : * images addresses, because some code working with pages directly aligns
67 : * addresses, not offsets from beginning of page
68 : */
69 : char images[MAX_GENERIC_XLOG_PAGES * BLCKSZ];
70 : PageData pages[MAX_GENERIC_XLOG_PAGES];
71 : bool isLogged;
72 : };
73 :
74 : static void writeFragment(PageData *pageData, OffsetNumber offset,
75 : OffsetNumber len, const char *data);
76 : static void computeRegionDelta(PageData *pageData,
77 : const char *curpage, const char *targetpage,
78 : int targetStart, int targetEnd,
79 : int validStart, int validEnd);
80 : static void computeDelta(PageData *pageData, Page curpage, Page targetpage);
81 : static void applyPageRedo(Page page, const char *delta, Size deltaSize);
82 :
83 :
84 : /*
85 : * Write next fragment into pageData's delta.
86 : *
87 : * The fragment has the given offset and length, and data points to the
88 : * actual data (of length length).
89 : */
90 : static void
91 0 : writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length,
92 : const char *data)
93 : {
94 0 : char *ptr = pageData->delta + pageData->deltaLen;
95 :
96 : /* Verify we have enough space */
97 0 : Assert(pageData->deltaLen + sizeof(offset) +
98 : sizeof(length) + length <= sizeof(pageData->delta));
99 :
100 : /* Write fragment data */
101 0 : memcpy(ptr, &offset, sizeof(offset));
102 0 : ptr += sizeof(offset);
103 0 : memcpy(ptr, &length, sizeof(length));
104 0 : ptr += sizeof(length);
105 0 : memcpy(ptr, data, length);
106 0 : ptr += length;
107 :
108 0 : pageData->deltaLen = ptr - pageData->delta;
109 0 : }
110 :
111 : /*
112 : * Compute the XLOG fragments needed to transform a region of curpage into the
113 : * corresponding region of targetpage, and append them to pageData's delta
114 : * field. The region to transform runs from targetStart to targetEnd-1.
115 : * Bytes in curpage outside the range validStart to validEnd-1 should be
116 : * considered invalid, and always overwritten with target data.
117 : *
118 : * This function is a hot spot, so it's worth being as tense as possible
119 : * about the data-matching loops.
120 : */
121 : static void
122 0 : computeRegionDelta(PageData *pageData,
123 : const char *curpage, const char *targetpage,
124 : int targetStart, int targetEnd,
125 : int validStart, int validEnd)
126 : {
127 : int i,
128 : loopEnd,
129 0 : fragmentBegin = -1,
130 0 : fragmentEnd = -1;
131 :
132 : /* Deal with any invalid start region by including it in first fragment */
133 0 : if (validStart > targetStart)
134 : {
135 0 : fragmentBegin = targetStart;
136 0 : targetStart = validStart;
137 : }
138 :
139 : /* We'll deal with any invalid end region after the main loop */
140 0 : loopEnd = Min(targetEnd, validEnd);
141 :
142 : /* Examine all the potentially matchable bytes */
143 0 : i = targetStart;
144 0 : while (i < loopEnd)
145 : {
146 0 : if (curpage[i] != targetpage[i])
147 : {
148 : /* On unmatched byte, start new fragment if not already in one */
149 0 : if (fragmentBegin < 0)
150 0 : fragmentBegin = i;
151 : /* Mark unmatched-data endpoint as uncertain */
152 0 : fragmentEnd = -1;
153 : /* Extend the fragment as far as possible in a tight loop */
154 0 : i++;
155 0 : while (i < loopEnd && curpage[i] != targetpage[i])
156 0 : i++;
157 0 : if (i >= loopEnd)
158 0 : break;
159 : }
160 :
161 : /* Found a matched byte, so remember end of unmatched fragment */
162 0 : fragmentEnd = i;
163 :
164 : /*
165 : * Extend the match as far as possible in a tight loop. (On typical
166 : * workloads, this inner loop is the bulk of this function's runtime.)
167 : */
168 0 : i++;
169 0 : while (i < loopEnd && curpage[i] == targetpage[i])
170 0 : i++;
171 :
172 : /*
173 : * There are several possible cases at this point:
174 : *
175 : * 1. We have no unwritten fragment (fragmentBegin < 0). There's
176 : * nothing to write; and it doesn't matter what fragmentEnd is.
177 : *
178 : * 2. We found more than MATCH_THRESHOLD consecutive matching bytes.
179 : * Dump out the unwritten fragment, stopping at fragmentEnd.
180 : *
181 : * 3. The match extends to loopEnd. We'll do nothing here, exit the
182 : * loop, and then dump the unwritten fragment, after merging it with
183 : * the invalid end region if any. If we don't so merge, fragmentEnd
184 : * establishes how much the final writeFragment call needs to write.
185 : *
186 : * 4. We found an unmatched byte before loopEnd. The loop will repeat
187 : * and will enter the unmatched-byte stanza above. So in this case
188 : * also, it doesn't matter what fragmentEnd is. The matched bytes
189 : * will get merged into the continuing unmatched fragment.
190 : *
191 : * Only in case 3 do we reach the bottom of the loop with a meaningful
192 : * fragmentEnd value, which is why it's OK that we unconditionally
193 : * assign "fragmentEnd = i" above.
194 : */
195 0 : if (fragmentBegin >= 0 && i - fragmentEnd > MATCH_THRESHOLD)
196 : {
197 0 : writeFragment(pageData, fragmentBegin,
198 : fragmentEnd - fragmentBegin,
199 : targetpage + fragmentBegin);
200 0 : fragmentBegin = -1;
201 0 : fragmentEnd = -1; /* not really necessary */
202 : }
203 : }
204 :
205 : /* Deal with any invalid end region by including it in final fragment */
206 0 : if (loopEnd < targetEnd)
207 : {
208 0 : if (fragmentBegin < 0)
209 0 : fragmentBegin = loopEnd;
210 0 : fragmentEnd = targetEnd;
211 : }
212 :
213 : /* Write final fragment if any */
214 0 : if (fragmentBegin >= 0)
215 : {
216 0 : if (fragmentEnd < 0)
217 0 : fragmentEnd = targetEnd;
218 0 : writeFragment(pageData, fragmentBegin,
219 : fragmentEnd - fragmentBegin,
220 : targetpage + fragmentBegin);
221 : }
222 0 : }
223 :
224 : /*
225 : * Compute the XLOG delta record needed to transform curpage into targetpage,
226 : * and store it in pageData's delta field.
227 : */
228 : static void
229 0 : computeDelta(PageData *pageData, Page curpage, Page targetpage)
230 : {
231 0 : int targetLower = ((PageHeader) targetpage)->pd_lower,
232 0 : targetUpper = ((PageHeader) targetpage)->pd_upper,
233 0 : curLower = ((PageHeader) curpage)->pd_lower,
234 0 : curUpper = ((PageHeader) curpage)->pd_upper;
235 :
236 0 : pageData->deltaLen = 0;
237 :
238 : /* Compute delta records for lower part of page ... */
239 0 : computeRegionDelta(pageData, curpage, targetpage,
240 : 0, targetLower,
241 : 0, curLower);
242 : /* ... and for upper part, ignoring what's between */
243 0 : computeRegionDelta(pageData, curpage, targetpage,
244 : targetUpper, BLCKSZ,
245 : curUpper, BLCKSZ);
246 :
247 : /*
248 : * If xlog debug is enabled, then check produced delta. Result of delta
249 : * application to curpage should be equivalent to targetpage.
250 : */
251 : #ifdef WAL_DEBUG
252 : if (XLOG_DEBUG)
253 : {
254 : char tmp[BLCKSZ];
255 :
256 : memcpy(tmp, curpage, BLCKSZ);
257 : applyPageRedo(tmp, pageData->delta, pageData->deltaLen);
258 : if (memcmp(tmp, targetpage, targetLower) != 0 ||
259 : memcmp(tmp + targetUpper, targetpage + targetUpper,
260 : BLCKSZ - targetUpper) != 0)
261 : elog(ERROR, "result of generic xlog apply does not match");
262 : }
263 : #endif
264 0 : }
265 :
266 : /*
267 : * Start new generic xlog record for modifications to specified relation.
268 : */
269 : GenericXLogState *
270 0 : GenericXLogStart(Relation relation)
271 : {
272 : GenericXLogState *state;
273 : int i;
274 :
275 0 : state = (GenericXLogState *) palloc(sizeof(GenericXLogState));
276 0 : state->isLogged = RelationNeedsWAL(relation);
277 :
278 0 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
279 : {
280 0 : state->pages[i].image = state->images + BLCKSZ * i;
281 0 : state->pages[i].buffer = InvalidBuffer;
282 : }
283 :
284 0 : return state;
285 : }
286 :
287 : /*
288 : * Register new buffer for generic xlog record.
289 : *
290 : * Returns pointer to the page's image in the GenericXLogState, which
291 : * is what the caller should modify.
292 : *
293 : * If the buffer is already registered, just return its existing entry.
294 : * (It's not very clear what to do with the flags in such a case, but
295 : * for now we stay with the original flags.)
296 : */
297 : Page
298 0 : GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags)
299 : {
300 : int block_id;
301 :
302 : /* Search array for existing entry or first unused slot */
303 0 : for (block_id = 0; block_id < MAX_GENERIC_XLOG_PAGES; block_id++)
304 : {
305 0 : PageData *page = &state->pages[block_id];
306 :
307 0 : if (BufferIsInvalid(page->buffer))
308 : {
309 : /* Empty slot, so use it (there cannot be a match later) */
310 0 : page->buffer = buffer;
311 0 : page->flags = flags;
312 0 : memcpy(page->image, BufferGetPage(buffer), BLCKSZ);
313 0 : return (Page) page->image;
314 : }
315 0 : else if (page->buffer == buffer)
316 : {
317 : /*
318 : * Buffer is already registered. Just return the image, which is
319 : * already prepared.
320 : */
321 0 : return (Page) page->image;
322 : }
323 : }
324 :
325 0 : elog(ERROR, "maximum number %d of generic xlog buffers is exceeded",
326 : MAX_GENERIC_XLOG_PAGES);
327 : /* keep compiler quiet */
328 : return NULL;
329 : }
330 :
331 : /*
332 : * Apply changes represented by GenericXLogState to the actual buffers,
333 : * and emit a generic xlog record.
334 : */
335 : XLogRecPtr
336 0 : GenericXLogFinish(GenericXLogState *state)
337 : {
338 : XLogRecPtr lsn;
339 : int i;
340 :
341 0 : if (state->isLogged)
342 : {
343 : /* Logged relation: make xlog record in critical section. */
344 0 : XLogBeginInsert();
345 :
346 0 : START_CRIT_SECTION();
347 :
348 0 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
349 : {
350 0 : PageData *pageData = &state->pages[i];
351 : Page page;
352 : PageHeader pageHeader;
353 :
354 0 : if (BufferIsInvalid(pageData->buffer))
355 0 : continue;
356 :
357 0 : page = BufferGetPage(pageData->buffer);
358 0 : pageHeader = (PageHeader) pageData->image;
359 :
360 0 : if (pageData->flags & GENERIC_XLOG_FULL_IMAGE)
361 : {
362 : /*
363 : * A full-page image does not require us to supply any xlog
364 : * data. Just apply the image, being careful to zero the
365 : * "hole" between pd_lower and pd_upper in order to avoid
366 : * divergence between actual page state and what replay would
367 : * produce.
368 : */
369 0 : memcpy(page, pageData->image, pageHeader->pd_lower);
370 0 : memset(page + pageHeader->pd_lower, 0,
371 0 : pageHeader->pd_upper - pageHeader->pd_lower);
372 0 : memcpy(page + pageHeader->pd_upper,
373 0 : pageData->image + pageHeader->pd_upper,
374 0 : BLCKSZ - pageHeader->pd_upper);
375 :
376 0 : XLogRegisterBuffer(i, pageData->buffer,
377 : REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
378 : }
379 : else
380 : {
381 : /*
382 : * In normal mode, calculate delta and write it as xlog data
383 : * associated with this page.
384 : */
385 0 : computeDelta(pageData, page, (Page) pageData->image);
386 :
387 : /* Apply the image, with zeroed "hole" as above */
388 0 : memcpy(page, pageData->image, pageHeader->pd_lower);
389 0 : memset(page + pageHeader->pd_lower, 0,
390 0 : pageHeader->pd_upper - pageHeader->pd_lower);
391 0 : memcpy(page + pageHeader->pd_upper,
392 0 : pageData->image + pageHeader->pd_upper,
393 0 : BLCKSZ - pageHeader->pd_upper);
394 :
395 0 : XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD);
396 0 : XLogRegisterBufData(i, pageData->delta, pageData->deltaLen);
397 : }
398 : }
399 :
400 : /* Insert xlog record */
401 0 : lsn = XLogInsert(RM_GENERIC_ID, 0);
402 :
403 : /* Set LSN and mark buffers dirty */
404 0 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
405 : {
406 0 : PageData *pageData = &state->pages[i];
407 :
408 0 : if (BufferIsInvalid(pageData->buffer))
409 0 : continue;
410 0 : PageSetLSN(BufferGetPage(pageData->buffer), lsn);
411 0 : MarkBufferDirty(pageData->buffer);
412 : }
413 0 : END_CRIT_SECTION();
414 : }
415 : else
416 : {
417 : /* Unlogged relation: skip xlog-related stuff */
418 0 : START_CRIT_SECTION();
419 0 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
420 : {
421 0 : PageData *pageData = &state->pages[i];
422 :
423 0 : if (BufferIsInvalid(pageData->buffer))
424 0 : continue;
425 0 : memcpy(BufferGetPage(pageData->buffer),
426 0 : pageData->image,
427 : BLCKSZ);
428 : /* We don't worry about zeroing the "hole" in this case */
429 0 : MarkBufferDirty(pageData->buffer);
430 : }
431 0 : END_CRIT_SECTION();
432 : /* We don't have a LSN to return, in this case */
433 0 : lsn = InvalidXLogRecPtr;
434 : }
435 :
436 0 : pfree(state);
437 :
438 0 : return lsn;
439 : }
440 :
441 : /*
442 : * Abort generic xlog record construction. No changes are applied to buffers.
443 : *
444 : * Note: caller is responsible for releasing locks/pins on buffers, if needed.
445 : */
446 : void
447 0 : GenericXLogAbort(GenericXLogState *state)
448 : {
449 0 : pfree(state);
450 0 : }
451 :
452 : /*
453 : * Apply delta to given page image.
454 : */
455 : static void
456 0 : applyPageRedo(Page page, const char *delta, Size deltaSize)
457 : {
458 0 : const char *ptr = delta;
459 0 : const char *end = delta + deltaSize;
460 :
461 0 : while (ptr < end)
462 : {
463 : OffsetNumber offset,
464 : length;
465 :
466 0 : memcpy(&offset, ptr, sizeof(offset));
467 0 : ptr += sizeof(offset);
468 0 : memcpy(&length, ptr, sizeof(length));
469 0 : ptr += sizeof(length);
470 :
471 0 : memcpy(page + offset, ptr, length);
472 :
473 0 : ptr += length;
474 : }
475 0 : }
476 :
477 : /*
478 : * Redo function for generic xlog record.
479 : */
480 : void
481 0 : generic_redo(XLogReaderState *record)
482 : {
483 0 : XLogRecPtr lsn = record->EndRecPtr;
484 : Buffer buffers[MAX_GENERIC_XLOG_PAGES];
485 : uint8 block_id;
486 :
487 : /* Protect limited size of buffers[] array */
488 0 : Assert(record->max_block_id < MAX_GENERIC_XLOG_PAGES);
489 :
490 : /* Iterate over blocks */
491 0 : for (block_id = 0; block_id <= record->max_block_id; block_id++)
492 : {
493 : XLogRedoAction action;
494 :
495 0 : if (!XLogRecHasBlockRef(record, block_id))
496 : {
497 0 : buffers[block_id] = InvalidBuffer;
498 0 : continue;
499 : }
500 :
501 0 : action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]);
502 :
503 : /* Apply redo to given block if needed */
504 0 : if (action == BLK_NEEDS_REDO)
505 : {
506 : Page page;
507 : PageHeader pageHeader;
508 : char *blockDelta;
509 : Size blockDeltaSize;
510 :
511 0 : page = BufferGetPage(buffers[block_id]);
512 0 : blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize);
513 0 : applyPageRedo(page, blockDelta, blockDeltaSize);
514 :
515 : /*
516 : * Since the delta contains no information about what's in the
517 : * "hole" between pd_lower and pd_upper, set that to zero to
518 : * ensure we produce the same page state that application of the
519 : * logged action by GenericXLogFinish did.
520 : */
521 0 : pageHeader = (PageHeader) page;
522 0 : memset(page + pageHeader->pd_lower, 0,
523 0 : pageHeader->pd_upper - pageHeader->pd_lower);
524 :
525 0 : PageSetLSN(page, lsn);
526 0 : MarkBufferDirty(buffers[block_id]);
527 : }
528 : }
529 :
530 : /* Changes are done: unlock and release all buffers */
531 0 : for (block_id = 0; block_id <= record->max_block_id; block_id++)
532 : {
533 0 : if (BufferIsValid(buffers[block_id]))
534 0 : UnlockReleaseBuffer(buffers[block_id]);
535 : }
536 0 : }
537 :
538 : /*
539 : * Mask a generic page before performing consistency checks on it.
540 : */
541 : void
542 0 : generic_mask(char *page, BlockNumber blkno)
543 : {
544 0 : mask_page_lsn(page);
545 :
546 0 : mask_unused_space(page);
547 0 : }
|