Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * bufpage.c
4 : * POSTGRES standard buffer page code.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/page/bufpage.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/htup_details.h"
18 : #include "access/itup.h"
19 : #include "access/xlog.h"
20 : #include "storage/checksum.h"
21 : #include "utils/memdebug.h"
22 : #include "utils/memutils.h"
23 :
24 :
25 : /* GUC variable */
26 : bool ignore_checksum_failure = false;
27 :
28 :
29 : /* ----------------------------------------------------------------
30 : * Page support functions
31 : * ----------------------------------------------------------------
32 : */
33 :
34 : /*
35 : * PageInit
36 : * Initializes the contents of a page.
37 : * Note that we don't calculate an initial checksum here; that's not done
38 : * until it's time to write.
39 : */
40 : void
41 19889 : PageInit(Page page, Size pageSize, Size specialSize)
42 : {
43 19889 : PageHeader p = (PageHeader) page;
44 :
45 19889 : specialSize = MAXALIGN(specialSize);
46 :
47 19889 : Assert(pageSize == BLCKSZ);
48 19889 : Assert(pageSize > specialSize + SizeOfPageHeaderData);
49 :
50 : /* Make sure all fields of page are zero, as well as unused space */
51 19889 : MemSet(p, 0, pageSize);
52 :
53 19889 : p->pd_flags = 0;
54 19889 : p->pd_lower = SizeOfPageHeaderData;
55 19889 : p->pd_upper = pageSize - specialSize;
56 19889 : p->pd_special = pageSize - specialSize;
57 19889 : PageSetPageSizeAndVersion(page, pageSize, PG_PAGE_LAYOUT_VERSION);
58 : /* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
59 19889 : }
60 :
61 :
62 : /*
63 : * PageIsVerified
64 : * Check that the page header and checksum (if any) appear valid.
65 : *
66 : * This is called when a page has just been read in from disk. The idea is
67 : * to cheaply detect trashed pages before we go nuts following bogus item
68 : * pointers, testing invalid transaction identifiers, etc.
69 : *
70 : * It turns out to be necessary to allow zeroed pages here too. Even though
71 : * this routine is *not* called when deliberately adding a page to a relation,
72 : * there are scenarios in which a zeroed page might be found in a table.
73 : * (Example: a backend extends a relation, then crashes before it can write
74 : * any WAL entry about the new page. The kernel will already have the
75 : * zeroed page in the file, and it will stay that way after restart.) So we
76 : * allow zeroed pages here, and are careful that the page access macros
77 : * treat such a page as empty and without free space. Eventually, VACUUM
78 : * will clean up such a page and make it usable.
79 : */
bool
PageIsVerified(Page page, BlockNumber blkno)
{
	PageHeader	p = (PageHeader) page;
	size_t	   *pagebytes;
	int			i;
	bool		checksum_failure = false;
	bool		header_sane = false;
	bool		all_zeroes = false;
	uint16		checksum = 0;

	/*
	 * Don't verify page data unless the page passes basic non-zero test
	 */
	if (!PageIsNew(page))
	{
		if (DataChecksumsEnabled())
		{
			checksum = pg_checksum_page((char *) page, blkno);

			if (checksum != p->pd_checksum)
				checksum_failure = true;
		}

		/*
		 * The following checks don't prove the header is correct, only that
		 * it looks sane enough to allow into the buffer pool. Later usage of
		 * the block can still reveal problems, which is why we offer the
		 * checksum option.
		 */
		if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
			p->pd_lower <= p->pd_upper &&
			p->pd_upper <= p->pd_special &&
			p->pd_special <= BLCKSZ &&
			p->pd_special == MAXALIGN(p->pd_special))
			header_sane = true;

		/* Common case: header looks sane and checksum (if any) matched */
		if (header_sane && !checksum_failure)
			return true;
	}

	/*
	 * Check all-zeroes case. Luckily BLCKSZ is guaranteed to always be a
	 * multiple of size_t - and it's much faster to compare memory using the
	 * native word size.
	 */
	StaticAssertStmt(BLCKSZ == (BLCKSZ / sizeof(size_t)) * sizeof(size_t),
					 "BLCKSZ has to be a multiple of sizeof(size_t)");

	all_zeroes = true;
	pagebytes = (size_t *) page;
	for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
	{
		if (pagebytes[i] != 0)
		{
			all_zeroes = false;
			break;
		}
	}

	/* An all-zeroes page is acceptable (see function header comment) */
	if (all_zeroes)
		return true;

	/*
	 * Throw a WARNING if the checksum fails, but only after we've checked for
	 * the all-zeroes case.
	 */
	if (checksum_failure)
	{
		ereport(WARNING,
				(ERRCODE_DATA_CORRUPTED,
				 errmsg("page verification failed, calculated checksum %u but expected %u",
						checksum, p->pd_checksum)));

		/* accept the page anyway if the user asked for it and header is OK */
		if (header_sane && ignore_checksum_failure)
			return true;
	}

	return false;
}
160 :
161 :
162 : /*
163 : * PageAddItemExtended
164 : *
165 : * Add an item to a page. Return value is the offset at which it was
166 : * inserted, or InvalidOffsetNumber if the item is not inserted for any
167 : * reason. A WARNING is issued indicating the reason for the refusal.
168 : *
169 : * offsetNumber must be either InvalidOffsetNumber to specify finding a
170 : * free item pointer, or a value between FirstOffsetNumber and one past
171 : * the last existing item, to specify using that particular item pointer.
172 : *
173 : * If offsetNumber is valid and flag PAI_OVERWRITE is set, we just store
174 : * the item at the specified offsetNumber, which must be either a
175 : * currently-unused item pointer, or one past the last existing item.
176 : *
177 : * If offsetNumber is valid and flag PAI_OVERWRITE is not set, insert
178 : * the item at the specified offsetNumber, moving existing items later
179 : * in the array to make room.
180 : *
181 : * If offsetNumber is not valid, then assign a slot by finding the first
182 : * one that is both unused and deallocated.
183 : *
184 : * If flag PAI_IS_HEAP is set, we enforce that there can't be more than
185 : * MaxHeapTuplesPerPage line pointers on the page.
186 : *
187 : * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!
188 : */
OffsetNumber
PageAddItemExtended(Page page,
					Item item,
					Size size,
					OffsetNumber offsetNumber,
					int flags)
{
	PageHeader	phdr = (PageHeader) page;
	Size		alignedSize;
	int			lower;
	int			upper;
	ItemId		itemId;
	OffsetNumber limit;
	bool		needshuffle = false;

	/*
	 * Be wary about corrupted page pointers
	 */
	if (phdr->pd_lower < SizeOfPageHeaderData ||
		phdr->pd_lower > phdr->pd_upper ||
		phdr->pd_upper > phdr->pd_special ||
		phdr->pd_special > BLCKSZ)
		ereport(PANIC,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
						phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));

	/*
	 * Select offsetNumber to place the new item at
	 */
	limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));

	/* was offsetNumber passed in? */
	if (OffsetNumberIsValid(offsetNumber))
	{
		/* yes, check it */
		if ((flags & PAI_OVERWRITE) != 0)
		{
			if (offsetNumber < limit)
			{
				itemId = PageGetItemId(phdr, offsetNumber);
				if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId))
				{
					elog(WARNING, "will not overwrite a used ItemId");
					return InvalidOffsetNumber;
				}
			}
		}
		else
		{
			if (offsetNumber < limit)
				needshuffle = true; /* need to move existing linp's */
		}
	}
	else
	{
		/* offsetNumber was not passed in, so find a free slot */
		/* if no free slot, we'll put it at limit (1st open slot) */
		if (PageHasFreeLinePointers(phdr))
		{
			/*
			 * Look for "recyclable" (unused) ItemId.  We check for no storage
			 * as well, just to be paranoid --- unused items should never have
			 * storage.
			 */
			for (offsetNumber = 1; offsetNumber < limit; offsetNumber++)
			{
				itemId = PageGetItemId(phdr, offsetNumber);
				if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId))
					break;
			}
			if (offsetNumber >= limit)
			{
				/* the hint is wrong, so reset it */
				PageClearHasFreeLinePointers(phdr);
			}
		}
		else
		{
			/* don't bother searching if hint says there's no free slot */
			offsetNumber = limit;
		}
	}

	/* Reject placing items beyond the first unused line pointer */
	if (offsetNumber > limit)
	{
		elog(WARNING, "specified item offset is too large");
		return InvalidOffsetNumber;
	}

	/* Reject placing items beyond heap boundary, if heap */
	if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage)
	{
		elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page");
		return InvalidOffsetNumber;
	}

	/*
	 * Compute new lower and upper pointers for page, see if it'll fit.
	 *
	 * Note: do arithmetic as signed ints, to avoid mistakes if, say,
	 * alignedSize > pd_upper.
	 */
	if (offsetNumber == limit || needshuffle)
		lower = phdr->pd_lower + sizeof(ItemIdData);
	else
		lower = phdr->pd_lower;

	alignedSize = MAXALIGN(size);

	upper = (int) phdr->pd_upper - (int) alignedSize;

	/* out of free space; note no WARNING is issued for this case */
	if (lower > upper)
		return InvalidOffsetNumber;

	/*
	 * OK to insert the item.  First, shuffle the existing pointers if needed.
	 */
	itemId = PageGetItemId(phdr, offsetNumber);

	/* open a gap in the line pointer array for the new entry */
	if (needshuffle)
		memmove(itemId + 1, itemId,
				(limit - offsetNumber) * sizeof(ItemIdData));

	/* set the item pointer */
	ItemIdSetNormal(itemId, upper, size);

	/*
	 * Items normally contain no uninitialized bytes.  Core bufpage consumers
	 * conform, but this is not a necessary coding rule; a new index AM could
	 * opt to depart from it.  However, data type input functions and other
	 * C-language functions that synthesize datums should initialize all
	 * bytes; datumIsEqual() relies on this.  Testing here, along with the
	 * similar check in printtup(), helps to catch such mistakes.
	 *
	 * Values of the "name" type retrieved via index-only scans may contain
	 * uninitialized bytes; see comment in btrescan().  Valgrind will report
	 * this as an error, but it is safe to ignore.
	 */
	VALGRIND_CHECK_MEM_IS_DEFINED(item, size);

	/* copy the item's data onto the page */
	memcpy((char *) page + upper, item, size);

	/* adjust page header */
	phdr->pd_lower = (LocationIndex) lower;
	phdr->pd_upper = (LocationIndex) upper;

	return offsetNumber;
}
340 :
341 :
342 : /*
343 : * PageGetTempPage
344 : * Get a temporary page in local memory for special processing.
345 : * The returned page is not initialized at all; caller must do that.
346 : */
347 : Page
348 897 : PageGetTempPage(Page page)
349 : {
350 : Size pageSize;
351 : Page temp;
352 :
353 897 : pageSize = PageGetPageSize(page);
354 897 : temp = (Page) palloc(pageSize);
355 :
356 897 : return temp;
357 : }
358 :
359 : /*
360 : * PageGetTempPageCopy
361 : * Get a temporary page in local memory for special processing.
362 : * The page is initialized by copying the contents of the given page.
363 : */
364 : Page
365 538 : PageGetTempPageCopy(Page page)
366 : {
367 : Size pageSize;
368 : Page temp;
369 :
370 538 : pageSize = PageGetPageSize(page);
371 538 : temp = (Page) palloc(pageSize);
372 :
373 538 : memcpy(temp, page, pageSize);
374 :
375 538 : return temp;
376 : }
377 :
378 : /*
379 : * PageGetTempPageCopySpecial
380 : * Get a temporary page in local memory for special processing.
381 : * The page is PageInit'd with the same special-space size as the
382 : * given page, and the special space is copied from the given page.
383 : */
384 : Page
385 863 : PageGetTempPageCopySpecial(Page page)
386 : {
387 : Size pageSize;
388 : Page temp;
389 :
390 863 : pageSize = PageGetPageSize(page);
391 863 : temp = (Page) palloc(pageSize);
392 :
393 863 : PageInit(temp, pageSize, PageGetSpecialSize(page));
394 2589 : memcpy(PageGetSpecialPointer(temp),
395 1726 : PageGetSpecialPointer(page),
396 863 : PageGetSpecialSize(page));
397 :
398 863 : return temp;
399 : }
400 :
401 : /*
402 : * PageRestoreTempPage
403 : * Copy temporary page back to permanent page after special processing
404 : * and release the temporary page.
405 : */
406 : void
407 1873 : PageRestoreTempPage(Page tempPage, Page oldPage)
408 : {
409 : Size pageSize;
410 :
411 1873 : pageSize = PageGetPageSize(tempPage);
412 1873 : memcpy((char *) oldPage, (char *) tempPage, pageSize);
413 :
414 1873 : pfree(tempPage);
415 1873 : }
416 :
417 : /*
418 : * sorting support for PageRepairFragmentation and PageIndexMultiDelete
419 : */
/* per-item work entry used while re-packing a page's tuples */
typedef struct itemIdSortData
{
	uint16		offsetindex;	/* linp array index */
	int16		itemoff;		/* page offset of item data */
	uint16		alignedlen;		/* MAXALIGN(item data len) */
} itemIdSortData;
typedef itemIdSortData *itemIdSort;
427 :
428 : static int
429 1496131 : itemoffcompare(const void *itemidp1, const void *itemidp2)
430 : {
431 : /* Sort in decreasing itemoff order */
432 2992262 : return ((itemIdSort) itemidp2)->itemoff -
433 1496131 : ((itemIdSort) itemidp1)->itemoff;
434 : }
435 :
436 : /*
437 : * After removing or marking some line pointers unused, move the tuples to
438 : * remove the gaps caused by the removed items.
439 : */
/*
 * After removing or marking some line pointers unused, move the tuples to
 * remove the gaps caused by the removed items.
 */
static void
compactify_tuples(itemIdSort itemidbase, int nitems, Page page)
{
	PageHeader	phdr = (PageHeader) page;
	Offset		upper;
	int			i;

	/* sort itemIdSortData array into decreasing itemoff order */
	qsort((char *) itemidbase, nitems, sizeof(itemIdSortData),
		  itemoffcompare);

	/*
	 * Walk tuples from highest page offset downward, packing each one
	 * against the previous; memmove is used because the old and new tuple
	 * locations can overlap.
	 */
	upper = phdr->pd_special;
	for (i = 0; i < nitems; i++)
	{
		itemIdSort	itemidptr = &itemidbase[i];
		ItemId		lp;

		lp = PageGetItemId(page, itemidptr->offsetindex + 1);
		upper -= itemidptr->alignedlen;
		memmove((char *) page + upper,
				(char *) page + itemidptr->itemoff,
				itemidptr->alignedlen);
		/* point the line pointer at the tuple's new location */
		lp->lp_off = upper;
	}

	/* everything from upper to pd_special is now solidly packed tuples */
	phdr->pd_upper = upper;
}
467 :
468 : /*
469 : * PageRepairFragmentation
470 : *
471 : * Frees fragmented space on a page.
472 : * It doesn't remove unused line pointers! Please don't change this.
473 : *
474 : * This routine is usable for heap pages only, but see PageIndexMultiDelete.
475 : *
476 : * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated.
477 : */
void
PageRepairFragmentation(Page page)
{
	Offset		pd_lower = ((PageHeader) page)->pd_lower;
	Offset		pd_upper = ((PageHeader) page)->pd_upper;
	Offset		pd_special = ((PageHeader) page)->pd_special;
	ItemId		lp;
	int			nline,
				nstorage,
				nunused;
	int			i;
	Size		totallen;

	/*
	 * It's worth the trouble to be more paranoid here than in most places,
	 * because we are about to reshuffle data in (what is usually) a shared
	 * disk buffer.  If we aren't careful then corrupted pointers, lengths,
	 * etc could cause us to clobber adjacent disk buffers, spreading the data
	 * loss further.  So, check everything.
	 */
	if (pd_lower < SizeOfPageHeaderData ||
		pd_lower > pd_upper ||
		pd_upper > pd_special ||
		pd_special > BLCKSZ ||
		pd_special != MAXALIGN(pd_special))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
						pd_lower, pd_upper, pd_special)));

	/*
	 * First pass: count items with storage, and normalize any unused line
	 * pointers (they are kept, per the function's contract).
	 */
	nline = PageGetMaxOffsetNumber(page);
	nunused = nstorage = 0;
	for (i = FirstOffsetNumber; i <= nline; i++)
	{
		lp = PageGetItemId(page, i);
		if (ItemIdIsUsed(lp))
		{
			if (ItemIdHasStorage(lp))
				nstorage++;
		}
		else
		{
			/* Unused entries should have lp_len = 0, but make sure */
			ItemIdSetUnused(lp);
			nunused++;
		}
	}

	if (nstorage == 0)
	{
		/* Page is completely empty, so just reset it quickly */
		((PageHeader) page)->pd_upper = pd_special;
	}
	else
	{
		/* Need to compact the page the hard way */
		itemIdSortData itemidbase[MaxHeapTuplesPerPage];
		itemIdSort	itemidptr = itemidbase;

		/*
		 * Second pass: collect the items that have storage, validating each
		 * one's offset and length before we move anything.
		 */
		totallen = 0;
		for (i = 0; i < nline; i++)
		{
			lp = PageGetItemId(page, i + 1);
			if (ItemIdHasStorage(lp))
			{
				itemidptr->offsetindex = i;
				itemidptr->itemoff = ItemIdGetOffset(lp);
				if (itemidptr->itemoff < (int) pd_upper ||
					itemidptr->itemoff >= (int) pd_special)
					ereport(ERROR,
							(errcode(ERRCODE_DATA_CORRUPTED),
							 errmsg("corrupted item pointer: %u",
									itemidptr->itemoff)));
				itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
				totallen += itemidptr->alignedlen;
				itemidptr++;
			}
		}

		if (totallen > (Size) (pd_special - pd_lower))
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("corrupted item lengths: total %u, available space %u",
							(unsigned int) totallen, pd_special - pd_lower)));

		compactify_tuples(itemidbase, nstorage, page);
	}

	/* Set hint bit for PageAddItem */
	if (nunused > 0)
		PageSetHasFreeLinePointers(page);
	else
		PageClearHasFreeLinePointers(page);
}
572 :
573 : /*
574 : * PageGetFreeSpace
575 : * Returns the size of the free (allocatable) space on a page,
576 : * reduced by the space needed for a new line pointer.
577 : *
578 : * Note: this should usually only be used on index pages. Use
579 : * PageGetHeapFreeSpace on heap pages.
580 : */
581 : Size
582 2060668 : PageGetFreeSpace(Page page)
583 : {
584 : int space;
585 :
586 : /*
587 : * Use signed arithmetic here so that we behave sensibly if pd_lower >
588 : * pd_upper.
589 : */
590 4121336 : space = (int) ((PageHeader) page)->pd_upper -
591 2060668 : (int) ((PageHeader) page)->pd_lower;
592 :
593 2060668 : if (space < (int) sizeof(ItemIdData))
594 165 : return 0;
595 2060503 : space -= sizeof(ItemIdData);
596 :
597 2060503 : return (Size) space;
598 : }
599 :
600 : /*
601 : * PageGetFreeSpaceForMultipleTuples
602 : * Returns the size of the free (allocatable) space on a page,
603 : * reduced by the space needed for multiple new line pointers.
604 : *
605 : * Note: this should usually only be used on index pages. Use
606 : * PageGetHeapFreeSpace on heap pages.
607 : */
608 : Size
609 17025 : PageGetFreeSpaceForMultipleTuples(Page page, int ntups)
610 : {
611 : int space;
612 :
613 : /*
614 : * Use signed arithmetic here so that we behave sensibly if pd_lower >
615 : * pd_upper.
616 : */
617 34050 : space = (int) ((PageHeader) page)->pd_upper -
618 17025 : (int) ((PageHeader) page)->pd_lower;
619 :
620 17025 : if (space < (int) (ntups * sizeof(ItemIdData)))
621 0 : return 0;
622 17025 : space -= ntups * sizeof(ItemIdData);
623 :
624 17025 : return (Size) space;
625 : }
626 :
627 : /*
628 : * PageGetExactFreeSpace
629 : * Returns the size of the free (allocatable) space on a page,
630 : * without any consideration for adding/removing line pointers.
631 : */
632 : Size
633 313136 : PageGetExactFreeSpace(Page page)
634 : {
635 : int space;
636 :
637 : /*
638 : * Use signed arithmetic here so that we behave sensibly if pd_lower >
639 : * pd_upper.
640 : */
641 626272 : space = (int) ((PageHeader) page)->pd_upper -
642 313136 : (int) ((PageHeader) page)->pd_lower;
643 :
644 313136 : if (space < 0)
645 0 : return 0;
646 :
647 313136 : return (Size) space;
648 : }
649 :
650 :
651 : /*
652 : * PageGetHeapFreeSpace
653 : * Returns the size of the free (allocatable) space on a page,
654 : * reduced by the space needed for a new line pointer.
655 : *
656 : * The difference between this and PageGetFreeSpace is that this will return
657 : * zero if there are already MaxHeapTuplesPerPage line pointers in the page
658 : * and none are free. We use this to enforce that no more than
659 : * MaxHeapTuplesPerPage line pointers are created on a heap page. (Although
660 : * no more tuples than that could fit anyway, in the presence of redirected
661 : * or dead line pointers it'd be possible to have too many line pointers.
662 : * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit
663 : * on the number of line pointers, we make this extra check.)
664 : */
Size
PageGetHeapFreeSpace(Page page)
{
	Size		space;

	space = PageGetFreeSpace(page);
	if (space > 0)
	{
		OffsetNumber offnum,
					nline;

		/*
		 * Are there already MaxHeapTuplesPerPage line pointers in the page?
		 */
		nline = PageGetMaxOffsetNumber(page);
		if (nline >= MaxHeapTuplesPerPage)
		{
			/* at the limit: only a recyclable line pointer allows insertion */
			if (PageHasFreeLinePointers((PageHeader) page))
			{
				/*
				 * Since this is just a hint, we must confirm that there is
				 * indeed a free line pointer
				 */
				for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
				{
					ItemId		lp = PageGetItemId(page, offnum);

					if (!ItemIdIsUsed(lp))
						break;
				}

				if (offnum > nline)
				{
					/*
					 * The hint is wrong, but we can't clear it here since we
					 * don't have the ability to mark the page dirty.
					 */
					space = 0;
				}
			}
			else
			{
				/*
				 * Although the hint might be wrong, PageAddItem will believe
				 * it anyway, so we must believe it too.
				 */
				space = 0;
			}
		}
	}
	return space;
}
717 :
718 :
719 : /*
720 : * PageIndexTupleDelete
721 : *
722 : * This routine does the work of removing a tuple from an index page.
723 : *
724 : * Unlike heap pages, we compact out the line pointer for the removed tuple.
725 : */
void
PageIndexTupleDelete(Page page, OffsetNumber offnum)
{
	PageHeader	phdr = (PageHeader) page;
	char	   *addr;
	ItemId		tup;
	Size		size;
	unsigned	offset;
	int			nbytes;
	int			offidx;
	int			nline;

	/*
	 * As with PageRepairFragmentation, paranoia seems justified.
	 */
	if (phdr->pd_lower < SizeOfPageHeaderData ||
		phdr->pd_lower > phdr->pd_upper ||
		phdr->pd_upper > phdr->pd_special ||
		phdr->pd_special > BLCKSZ ||
		phdr->pd_special != MAXALIGN(phdr->pd_special))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
						phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));

	nline = PageGetMaxOffsetNumber(page);
	if ((int) offnum <= 0 || (int) offnum > nline)
		elog(ERROR, "invalid index offnum: %u", offnum);

	/* change offset number to offset index */
	offidx = offnum - 1;

	tup = PageGetItemId(page, offnum);
	Assert(ItemIdHasStorage(tup));
	size = ItemIdGetLength(tup);
	offset = ItemIdGetOffset(tup);

	/* validate the victim's offset/length before moving any data */
	if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special ||
		offset != MAXALIGN(offset))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted item pointer: offset = %u, size = %u",
						offset, (unsigned int) size)));

	/* Amount of space to actually be deleted */
	size = MAXALIGN(size);

	/*
	 * First, we want to get rid of the pd_linp entry for the index tuple. We
	 * copy all subsequent linp's back one slot in the array. We don't use
	 * PageGetItemId, because we are manipulating the _array_, not individual
	 * linp's.
	 */
	nbytes = phdr->pd_lower -
		((char *) &phdr->pd_linp[offidx + 1] - (char *) phdr);

	if (nbytes > 0)
		memmove((char *) &(phdr->pd_linp[offidx]),
				(char *) &(phdr->pd_linp[offidx + 1]),
				nbytes);

	/*
	 * Now move everything between the old upper bound (beginning of tuple
	 * space) and the beginning of the deleted tuple forward, so that space in
	 * the middle of the page is left free.  If we've just deleted the tuple
	 * at the beginning of tuple space, then there's no need to do the copy.
	 */

	/* beginning of tuple space */
	addr = (char *) page + phdr->pd_upper;

	if (offset > phdr->pd_upper)
		memmove(addr + size, addr, offset - phdr->pd_upper);

	/* adjust free space boundary pointers */
	phdr->pd_upper += size;
	phdr->pd_lower -= sizeof(ItemIdData);

	/*
	 * Finally, we need to adjust the linp entries that remain.
	 *
	 * Anything that used to be before the deleted tuple's data was moved
	 * forward by the size of the deleted tuple.
	 */
	if (!PageIsEmpty(page))
	{
		int			i;

		nline--;				/* there's one less than when we started */
		for (i = 1; i <= nline; i++)
		{
			ItemId		ii = PageGetItemId(phdr, i);

			Assert(ItemIdHasStorage(ii));
			/* tuples that sat below the victim were shifted up by "size" */
			if (ItemIdGetOffset(ii) <= offset)
				ii->lp_off += size;
		}
	}
}
825 :
826 :
827 : /*
828 : * PageIndexMultiDelete
829 : *
830 : * This routine handles the case of deleting multiple tuples from an
831 : * index page at once. It is considerably faster than a loop around
832 : * PageIndexTupleDelete ... however, the caller *must* supply the array
833 : * of item numbers to be deleted in item number order!
834 : */
void
PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
{
	PageHeader	phdr = (PageHeader) page;
	Offset		pd_lower = phdr->pd_lower;
	Offset		pd_upper = phdr->pd_upper;
	Offset		pd_special = phdr->pd_special;
	itemIdSortData itemidbase[MaxIndexTuplesPerPage];
	ItemIdData	newitemids[MaxIndexTuplesPerPage];
	itemIdSort	itemidptr;
	ItemId		lp;
	int			nline,
				nused;
	Size		totallen;
	Size		size;
	unsigned	offset;
	int			nextitm;
	OffsetNumber offnum;

	Assert(nitems <= MaxIndexTuplesPerPage);

	/*
	 * If there aren't very many items to delete, then retail
	 * PageIndexTupleDelete is the best way.  Delete the items in reverse
	 * order so we don't have to think about adjusting item numbers for
	 * previous deletions.
	 *
	 * TODO: tune the magic number here
	 */
	if (nitems <= 2)
	{
		while (--nitems >= 0)
			PageIndexTupleDelete(page, itemnos[nitems]);
		return;
	}

	/*
	 * As with PageRepairFragmentation, paranoia seems justified.
	 */
	if (pd_lower < SizeOfPageHeaderData ||
		pd_lower > pd_upper ||
		pd_upper > pd_special ||
		pd_special > BLCKSZ ||
		pd_special != MAXALIGN(pd_special))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
						pd_lower, pd_upper, pd_special)));

	/*
	 * Scan the item pointer array and build a list of just the ones we are
	 * going to keep.  Notice we do not modify the page yet, since we are
	 * still validity-checking.
	 */
	nline = PageGetMaxOffsetNumber(page);
	itemidptr = itemidbase;
	totallen = 0;
	nused = 0;
	nextitm = 0;
	for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
	{
		lp = PageGetItemId(page, offnum);
		Assert(ItemIdHasStorage(lp));
		size = ItemIdGetLength(lp);
		offset = ItemIdGetOffset(lp);
		if (offset < pd_upper ||
			(offset + size) > pd_special ||
			offset != MAXALIGN(offset))
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("corrupted item pointer: offset = %u, length = %u",
							offset, (unsigned int) size)));

		/* itemnos[] must be sorted, so a single cursor (nextitm) suffices */
		if (nextitm < nitems && offnum == itemnos[nextitm])
		{
			/* skip item to be deleted */
			nextitm++;
		}
		else
		{
			itemidptr->offsetindex = nused; /* where it will go */
			itemidptr->itemoff = offset;
			itemidptr->alignedlen = MAXALIGN(size);
			totallen += itemidptr->alignedlen;
			newitemids[nused] = *lp;
			itemidptr++;
			nused++;
		}
	}

	/* this will catch invalid or out-of-order itemnos[] */
	if (nextitm != nitems)
		elog(ERROR, "incorrect index offsets supplied");

	if (totallen > (Size) (pd_special - pd_lower))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted item lengths: total %u, available space %u",
						(unsigned int) totallen, pd_special - pd_lower)));

	/*
	 * Looks good. Overwrite the line pointers with the copy, from which we've
	 * removed all the unused items.
	 */
	memcpy(phdr->pd_linp, newitemids, nused * sizeof(ItemIdData));
	phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData);

	/* and compactify the tuple data */
	compactify_tuples(itemidbase, nused, page);
}
945 :
946 :
947 : /*
948 : * PageIndexTupleDeleteNoCompact
949 : *
950 : * Remove the specified tuple from an index page, but set its line pointer
951 : * to "unused" instead of compacting it out, except that it can be removed
952 : * if it's the last line pointer on the page.
953 : *
954 : * This is used for index AMs that require that existing TIDs of live tuples
955 : * remain unchanged, and are willing to allow unused line pointers instead.
956 : */
957 : void
958 3 : PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum)
959 : {
960 3 : PageHeader phdr = (PageHeader) page;
961 : char *addr;
962 : ItemId tup;
963 : Size size;
964 : unsigned offset;
965 : int nline;
966 :
967 : /*
968 : * As with PageRepairFragmentation, paranoia seems justified.
969 : */
970 6 : if (phdr->pd_lower < SizeOfPageHeaderData ||
971 6 : phdr->pd_lower > phdr->pd_upper ||
972 6 : phdr->pd_upper > phdr->pd_special ||
973 6 : phdr->pd_special > BLCKSZ ||
974 3 : phdr->pd_special != MAXALIGN(phdr->pd_special))
975 0 : ereport(ERROR,
976 : (errcode(ERRCODE_DATA_CORRUPTED),
977 : errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
978 : phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
979 :
980 3 : nline = PageGetMaxOffsetNumber(page);
981 3 : if ((int) offnum <= 0 || (int) offnum > nline)
982 0 : elog(ERROR, "invalid index offnum: %u", offnum);
983 :
984 3 : tup = PageGetItemId(page, offnum);
985 3 : Assert(ItemIdHasStorage(tup));
986 3 : size = ItemIdGetLength(tup);
987 3 : offset = ItemIdGetOffset(tup);
988 :
989 6 : if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special ||
990 3 : offset != MAXALIGN(offset))
991 0 : ereport(ERROR,
992 : (errcode(ERRCODE_DATA_CORRUPTED),
993 : errmsg("corrupted item pointer: offset = %u, size = %u",
994 : offset, (unsigned int) size)));
995 :
996 : /* Amount of space to actually be deleted */
997 3 : size = MAXALIGN(size);
998 :
999 : /*
1000 : * Either set the item pointer to "unused", or zap it if it's the last
1001 : * one. (Note: it's possible that the next-to-last one(s) are already
1002 : * unused, but we do not trouble to try to compact them out if so.)
1003 : */
1004 3 : if ((int) offnum < nline)
1005 1 : ItemIdSetUnused(tup);
1006 : else
1007 : {
1008 2 : phdr->pd_lower -= sizeof(ItemIdData);
1009 2 : nline--; /* there's one less than when we started */
1010 : }
1011 :
1012 : /*
1013 : * Now move everything between the old upper bound (beginning of tuple
1014 : * space) and the beginning of the deleted tuple forward, so that space in
1015 : * the middle of the page is left free. If we've just deleted the tuple
1016 : * at the beginning of tuple space, then there's no need to do the copy.
1017 : */
1018 :
1019 : /* beginning of tuple space */
1020 3 : addr = (char *) page + phdr->pd_upper;
1021 :
1022 3 : if (offset > phdr->pd_upper)
1023 1 : memmove(addr + size, addr, offset - phdr->pd_upper);
1024 :
1025 : /* adjust free space boundary pointer */
1026 3 : phdr->pd_upper += size;
1027 :
1028 : /*
1029 : * Finally, we need to adjust the linp entries that remain.
1030 : *
1031 : * Anything that used to be before the deleted tuple's data was moved
1032 : * forward by the size of the deleted tuple.
1033 : */
1034 3 : if (!PageIsEmpty(page))
1035 : {
1036 : int i;
1037 :
1038 34 : for (i = 1; i <= nline; i++)
1039 : {
1040 31 : ItemId ii = PageGetItemId(phdr, i);
1041 :
1042 31 : if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset)
1043 9 : ii->lp_off += size;
1044 : }
1045 : }
1046 3 : }
1047 :
1048 :
1049 : /*
1050 : * PageIndexTupleOverwrite
1051 : *
1052 : * Replace a specified tuple on an index page.
1053 : *
1054 : * The new tuple is placed exactly where the old one had been, shifting
1055 : * other tuples' data up or down as needed to keep the page compacted.
1056 : * This is better than deleting and reinserting the tuple, because it
1057 : * avoids any data shifting when the tuple size doesn't change; and
1058 : * even when it does, we avoid moving the item pointers around.
1059 : * Conceivably this could also be of use to an index AM that cares about
1060 : * the physical order of tuples as well as their ItemId order.
1061 : *
1062 : * If there's insufficient space for the new tuple, return false. Other
1063 : * errors represent data-corruption problems, so we just elog.
1064 : */
1065 : bool
1066 51651 : PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
1067 : Item newtup, Size newsize)
1068 : {
1069 51651 : PageHeader phdr = (PageHeader) page;
1070 : ItemId tupid;
1071 : int oldsize;
1072 : unsigned offset;
1073 : Size alignednewsize;
1074 : int size_diff;
1075 : int itemcount;
1076 :
1077 : /*
1078 : * As with PageRepairFragmentation, paranoia seems justified.
1079 : */
1080 103302 : if (phdr->pd_lower < SizeOfPageHeaderData ||
1081 103302 : phdr->pd_lower > phdr->pd_upper ||
1082 103302 : phdr->pd_upper > phdr->pd_special ||
1083 103302 : phdr->pd_special > BLCKSZ ||
1084 51651 : phdr->pd_special != MAXALIGN(phdr->pd_special))
1085 0 : ereport(ERROR,
1086 : (errcode(ERRCODE_DATA_CORRUPTED),
1087 : errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
1088 : phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
1089 :
1090 51651 : itemcount = PageGetMaxOffsetNumber(page);
1091 51651 : if ((int) offnum <= 0 || (int) offnum > itemcount)
1092 0 : elog(ERROR, "invalid index offnum: %u", offnum);
1093 :
1094 51651 : tupid = PageGetItemId(page, offnum);
1095 51651 : Assert(ItemIdHasStorage(tupid));
1096 51651 : oldsize = ItemIdGetLength(tupid);
1097 51651 : offset = ItemIdGetOffset(tupid);
1098 :
1099 103302 : if (offset < phdr->pd_upper || (offset + oldsize) > phdr->pd_special ||
1100 51651 : offset != MAXALIGN(offset))
1101 0 : ereport(ERROR,
1102 : (errcode(ERRCODE_DATA_CORRUPTED),
1103 : errmsg("corrupted item pointer: offset = %u, size = %u",
1104 : offset, (unsigned int) oldsize)));
1105 :
1106 : /*
1107 : * Determine actual change in space requirement, check for page overflow.
1108 : */
1109 51651 : oldsize = MAXALIGN(oldsize);
1110 51651 : alignednewsize = MAXALIGN(newsize);
1111 51651 : if (alignednewsize > oldsize + (phdr->pd_upper - phdr->pd_lower))
1112 0 : return false;
1113 :
1114 : /*
1115 : * Relocate existing data and update line pointers, unless the new tuple
1116 : * is the same size as the old (after alignment), in which case there's
1117 : * nothing to do. Notice that what we have to relocate is data before the
1118 : * target tuple, not data after, so it's convenient to express size_diff
1119 : * as the amount by which the tuple's size is decreasing, making it the
1120 : * delta to add to pd_upper and affected line pointers.
1121 : */
1122 51651 : size_diff = oldsize - (int) alignednewsize;
1123 51651 : if (size_diff != 0)
1124 : {
1125 116 : char *addr = (char *) page + phdr->pd_upper;
1126 : int i;
1127 :
1128 : /* relocate all tuple data before the target tuple */
1129 116 : memmove(addr + size_diff, addr, offset - phdr->pd_upper);
1130 :
1131 : /* adjust free space boundary pointer */
1132 116 : phdr->pd_upper += size_diff;
1133 :
1134 : /* adjust affected line pointers too */
1135 1285 : for (i = FirstOffsetNumber; i <= itemcount; i++)
1136 : {
1137 1169 : ItemId ii = PageGetItemId(phdr, i);
1138 :
1139 : /* Allow items without storage; currently only BRIN needs that */
1140 1169 : if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset)
1141 675 : ii->lp_off += size_diff;
1142 : }
1143 : }
1144 :
1145 : /* Update the item's tuple length (other fields shouldn't change) */
1146 51651 : ItemIdSetNormal(tupid, offset + size_diff, newsize);
1147 :
1148 : /* Copy new tuple data onto page */
1149 51651 : memcpy(PageGetItem(page, tupid), newtup, newsize);
1150 :
1151 51651 : return true;
1152 : }
1153 :
1154 :
1155 : /*
1156 : * Set checksum for a page in shared buffers.
1157 : *
1158 : * If checksums are disabled, or if the page is not initialized, just return
1159 : * the input. Otherwise, we must make a copy of the page before calculating
1160 : * the checksum, to prevent concurrent modifications (e.g. setting hint bits)
1161 : * from making the final checksum invalid. It doesn't matter if we include or
1162 : * exclude hints during the copy, as long as we write a valid page and
1163 : * associated checksum.
1164 : *
1165 : * Returns a pointer to the block-sized data that needs to be written. Uses
1166 : * statically-allocated memory, so the caller must immediately write the
1167 : * returned page and not refer to it again.
1168 : */
1169 : char *
1170 8177 : PageSetChecksumCopy(Page page, BlockNumber blkno)
1171 : {
1172 : static char *pageCopy = NULL;
1173 :
1174 : /* If we don't need a checksum, just return the passed-in data */
1175 8177 : if (PageIsNew(page) || !DataChecksumsEnabled())
1176 8177 : return (char *) page;
1177 :
1178 : /*
1179 : * We allocate the copy space once and use it over on each subsequent
1180 : * call. The point of palloc'ing here, rather than having a static char
1181 : * array, is first to ensure adequate alignment for the checksumming code
1182 : * and second to avoid wasting space in processes that never call this.
1183 : */
1184 0 : if (pageCopy == NULL)
1185 0 : pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
1186 :
1187 0 : memcpy(pageCopy, (char *) page, BLCKSZ);
1188 0 : ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
1189 0 : return pageCopy;
1190 : }
1191 :
1192 : /*
1193 : * Set checksum for a page in private memory.
1194 : *
1195 : * This must only be used when we know that no other process can be modifying
1196 : * the page buffer.
1197 : */
1198 : void
1199 4522 : PageSetChecksumInplace(Page page, BlockNumber blkno)
1200 : {
1201 : /* If we don't need a checksum, just return */
1202 4522 : if (PageIsNew(page) || !DataChecksumsEnabled())
1203 9044 : return;
1204 :
1205 0 : ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno);
1206 : }
|