Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * relmapper.c
4 : * Catalog-to-filenode mapping
5 : *
6 : * For most tables, the physical file underlying the table is specified by
7 : * pg_class.relfilenode. However, that obviously won't work for pg_class
8 : * itself, nor for the other "nailed" catalogs for which we have to be able
9 : * to set up working Relation entries without access to pg_class. It also
10 : * does not work for shared catalogs, since there is no practical way to
11 : * update other databases' pg_class entries when relocating a shared catalog.
12 : * Therefore, for these special catalogs (henceforth referred to as "mapped
13 : * catalogs") we rely on a separately maintained file that shows the mapping
14 : * from catalog OIDs to filenode numbers. Each database has a map file for
15 : * its local mapped catalogs, and there is a separate map file for shared
16 : * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries.
17 : *
18 : * Relocation of a normal table is committed (ie, the new physical file becomes
19 : * authoritative) when the pg_class row update commits. For mapped catalogs,
20 : * the act of updating the map file is effectively commit of the relocation.
21 : * We postpone the file update till just before commit of the transaction
22 : * doing the rewrite, but there is necessarily a window between. Therefore
23 : * mapped catalogs can only be relocated by operations such as VACUUM FULL
24 : * and CLUSTER, which make no transactionally-significant changes: it must be
25 : * safe for the new file to replace the old, even if the transaction itself
26 : * aborts. An important factor here is that the indexes and toast table of
27 : * a mapped catalog must also be mapped, so that the rewrites/relocations of
28 : * all these files commit in a single map file update rather than being tied
29 : * to transaction commit.
30 : *
31 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
32 : * Portions Copyright (c) 1994, Regents of the University of California
33 : *
34 : *
35 : * IDENTIFICATION
36 : * src/backend/utils/cache/relmapper.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 : #include "postgres.h"
41 :
42 : #include <fcntl.h>
43 : #include <sys/stat.h>
44 : #include <unistd.h>
45 :
46 : #include "access/xact.h"
47 : #include "access/xlog.h"
48 : #include "access/xloginsert.h"
49 : #include "catalog/catalog.h"
50 : #include "catalog/pg_tablespace.h"
51 : #include "catalog/storage.h"
52 : #include "miscadmin.h"
53 : #include "pgstat.h"
54 : #include "storage/fd.h"
55 : #include "storage/lwlock.h"
56 : #include "utils/inval.h"
57 : #include "utils/relmapper.h"
58 :
59 :
60 : /*
61 : * The map file is critical data: we have no automatic method for recovering
62 : * from loss or corruption of it. We use a CRC so that we can detect
63 : * corruption. To minimize the risk of failed updates, the map file should
64 : * be kept to no more than one standard-size disk sector (ie 512 bytes),
65 : * and we use overwrite-in-place rather than playing renaming games.
66 : * The struct layout below is designed to occupy exactly 512 bytes, which
67 : * might make filesystem updates a bit more efficient.
68 : *
69 : * Entries in the mappings[] array are in no particular order. We could
70 : * speed searching by insisting on OID order, but it really shouldn't be
71 : * worth the trouble given the intended size of the mapping sets.
72 : */
73 : #define RELMAPPER_FILENAME "pg_filenode.map"
74 :
75 : #define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
76 :
77 : #define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */
78 :
79 : typedef struct RelMapping
80 : {
81 : Oid mapoid; /* OID of a catalog */
82 : Oid mapfilenode; /* its filenode number */
83 : } RelMapping;
84 :
85 : typedef struct RelMapFile
86 : {
87 : int32 magic; /* always RELMAPPER_FILEMAGIC */
88 : int32 num_mappings; /* number of valid RelMapping entries */
89 : RelMapping mappings[MAX_MAPPINGS];
90 : pg_crc32c crc; /* CRC of all above */
91 : int32 pad; /* to make the struct size be 512 exactly */
92 : } RelMapFile;
93 :
94 : /*
95 : * The currently known contents of the shared map file and our database's
96 : * local map file are stored here. These can be reloaded from disk
97 : * immediately whenever we receive an update sinval message.
98 : */
99 : static RelMapFile shared_map;
100 : static RelMapFile local_map;
101 :
102 : /*
103 : * We use the same RelMapFile data structure to track uncommitted local
104 : * changes in the mappings (but note the magic and crc fields are not made
105 : * valid in these variables). Currently, map updates are not allowed within
106 : * subtransactions, so one set of transaction-level changes is sufficient.
107 : *
108 : * The active_xxx variables contain updates that are valid in our transaction
109 : * and should be honored by RelationMapOidToFilenode. The pending_xxx
110 : * variables contain updates we have been told about that aren't active yet;
111 : * they will become active at the next CommandCounterIncrement. This setup
112 : * lets map updates act similarly to updates of pg_class rows, ie, they
113 : * become visible only at the next CommandCounterIncrement boundary.
114 : */
115 : static RelMapFile active_shared_updates;
116 : static RelMapFile active_local_updates;
117 : static RelMapFile pending_shared_updates;
118 : static RelMapFile pending_local_updates;
119 :
120 :
121 : /* non-export function prototypes */
122 : static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
123 : bool add_okay);
124 : static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
125 : bool add_okay);
126 : static void load_relmap_file(bool shared);
127 : static void write_relmap_file(bool shared, RelMapFile *newmap,
128 : bool write_wal, bool send_sinval, bool preserve_files,
129 : Oid dbid, Oid tsid, const char *dbpath);
130 : static void perform_relmap_update(bool shared, const RelMapFile *updates);
131 :
132 :
133 : /*
134 : * RelationMapOidToFilenode
135 : *
136 : * The raison d' etre ... given a relation OID, look up its filenode.
137 : *
138 : * Although shared and local relation OIDs should never overlap, the caller
139 : * always knows which we need --- so pass that information to avoid useless
140 : * searching.
141 : *
142 : * Returns InvalidOid if the OID is not known (which should never happen,
143 : * but the caller is in a better position to report a meaningful error).
144 : */
145 : Oid
146 18120 : RelationMapOidToFilenode(Oid relationId, bool shared)
147 : {
148 : const RelMapFile *map;
149 : int32 i;
150 :
151 : /* If there are active updates, believe those over the main maps */
152 18120 : if (shared)
153 : {
154 11423 : map = &active_shared_updates;
155 11435 : for (i = 0; i < map->num_mappings; i++)
156 : {
157 23 : if (relationId == map->mappings[i].mapoid)
158 11 : return map->mappings[i].mapfilenode;
159 : }
160 11412 : map = &shared_map;
161 200930 : for (i = 0; i < map->num_mappings; i++)
162 : {
163 200930 : if (relationId == map->mappings[i].mapoid)
164 11412 : return map->mappings[i].mapfilenode;
165 : }
166 : }
167 : else
168 : {
169 6697 : map = &active_local_updates;
170 6726 : for (i = 0; i < map->num_mappings; i++)
171 : {
172 50 : if (relationId == map->mappings[i].mapoid)
173 21 : return map->mappings[i].mapfilenode;
174 : }
175 6676 : map = &local_map;
176 49599 : for (i = 0; i < map->num_mappings; i++)
177 : {
178 49599 : if (relationId == map->mappings[i].mapoid)
179 6676 : return map->mappings[i].mapfilenode;
180 : }
181 : }
182 :
183 0 : return InvalidOid;
184 : }
185 :
186 : /*
187 : * RelationMapFilenodeToOid
188 : *
189 : * Do the reverse of the normal direction of mapping done in
190 : * RelationMapOidToFilenode.
191 : *
192 : * This is not supposed to be used during normal running but rather for
193 : * information purposes when looking at the filesystem or xlog.
194 : *
195 : * Returns InvalidOid if the OID is not known; this can easily happen if the
196 : * relfilenode doesn't pertain to a mapped relation.
197 : */
198 : Oid
199 50 : RelationMapFilenodeToOid(Oid filenode, bool shared)
200 : {
201 : const RelMapFile *map;
202 : int32 i;
203 :
204 : /* If there are active updates, believe those over the main maps */
205 50 : if (shared)
206 : {
207 35 : map = &active_shared_updates;
208 35 : for (i = 0; i < map->num_mappings; i++)
209 : {
210 0 : if (filenode == map->mappings[i].mapfilenode)
211 0 : return map->mappings[i].mapoid;
212 : }
213 35 : map = &shared_map;
214 630 : for (i = 0; i < map->num_mappings; i++)
215 : {
216 630 : if (filenode == map->mappings[i].mapfilenode)
217 35 : return map->mappings[i].mapoid;
218 : }
219 : }
220 : else
221 : {
222 15 : map = &active_local_updates;
223 15 : for (i = 0; i < map->num_mappings; i++)
224 : {
225 0 : if (filenode == map->mappings[i].mapfilenode)
226 0 : return map->mappings[i].mapoid;
227 : }
228 15 : map = &local_map;
229 120 : for (i = 0; i < map->num_mappings; i++)
230 : {
231 120 : if (filenode == map->mappings[i].mapfilenode)
232 15 : return map->mappings[i].mapoid;
233 : }
234 : }
235 :
236 0 : return InvalidOid;
237 : }
238 :
239 : /*
240 : * RelationMapUpdateMap
241 : *
242 : * Install a new relfilenode mapping for the specified relation.
243 : *
244 : * If immediate is true (or we're bootstrapping), the mapping is activated
245 : * immediately. Otherwise it is made pending until CommandCounterIncrement.
246 : */
247 : void
248 65 : RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
249 : bool immediate)
250 : {
251 : RelMapFile *map;
252 :
253 65 : if (IsBootstrapProcessingMode())
254 : {
255 : /*
256 : * In bootstrap mode, the mapping gets installed in permanent map.
257 : */
258 54 : if (shared)
259 35 : map = &shared_map;
260 : else
261 19 : map = &local_map;
262 : }
263 : else
264 : {
265 : /*
266 : * We don't currently support map changes within subtransactions. This
267 : * could be done with more bookkeeping infrastructure, but it doesn't
268 : * presently seem worth it.
269 : */
270 11 : if (GetCurrentTransactionNestLevel() > 1)
271 0 : elog(ERROR, "cannot change relation mapping within subtransaction");
272 :
273 11 : if (immediate)
274 : {
275 : /* Make it active, but only locally */
276 2 : if (shared)
277 0 : map = &active_shared_updates;
278 : else
279 2 : map = &active_local_updates;
280 : }
281 : else
282 : {
283 : /* Make it pending */
284 9 : if (shared)
285 3 : map = &pending_shared_updates;
286 : else
287 6 : map = &pending_local_updates;
288 : }
289 : }
290 65 : apply_map_update(map, relationId, fileNode, true);
291 65 : }
292 :
293 : /*
294 : * apply_map_update
295 : *
296 : * Insert a new mapping into the given map variable, replacing any existing
297 : * mapping for the same relation.
298 : *
299 : * In some cases the caller knows there must be an existing mapping; pass
300 : * add_okay = false to draw an error if not.
301 : */
302 : static void
303 81 : apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
304 : {
305 : int32 i;
306 :
307 : /* Replace any existing mapping */
308 871 : for (i = 0; i < map->num_mappings; i++)
309 : {
310 803 : if (relationId == map->mappings[i].mapoid)
311 : {
312 13 : map->mappings[i].mapfilenode = fileNode;
313 94 : return;
314 : }
315 : }
316 :
317 : /* Nope, need to add a new mapping */
318 68 : if (!add_okay)
319 0 : elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
320 : relationId);
321 68 : if (map->num_mappings >= MAX_MAPPINGS)
322 0 : elog(ERROR, "ran out of space in relation map");
323 68 : map->mappings[map->num_mappings].mapoid = relationId;
324 68 : map->mappings[map->num_mappings].mapfilenode = fileNode;
325 68 : map->num_mappings++;
326 : }
327 :
328 : /*
329 : * merge_map_updates
330 : *
331 : * Merge all the updates in the given pending-update map into the target map.
332 : * This is just a bulk form of apply_map_update.
333 : */
334 : static void
335 10 : merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
336 : {
337 : int32 i;
338 :
339 26 : for (i = 0; i < updates->num_mappings; i++)
340 : {
341 16 : apply_map_update(map,
342 : updates->mappings[i].mapoid,
343 : updates->mappings[i].mapfilenode,
344 : add_okay);
345 : }
346 10 : }
347 :
348 : /*
349 : * RelationMapRemoveMapping
350 : *
351 : * Remove a relation's entry in the map. This is only allowed for "active"
352 : * (but not committed) local mappings. We need it so we can back out the
353 : * entry for the transient target file when doing VACUUM FULL/CLUSTER on
354 : * a mapped relation.
355 : */
356 : void
357 2 : RelationMapRemoveMapping(Oid relationId)
358 : {
359 2 : RelMapFile *map = &active_local_updates;
360 : int32 i;
361 :
362 2 : for (i = 0; i < map->num_mappings; i++)
363 : {
364 2 : if (relationId == map->mappings[i].mapoid)
365 : {
366 : /* Found it, collapse it out */
367 2 : map->mappings[i] = map->mappings[map->num_mappings - 1];
368 2 : map->num_mappings--;
369 4 : return;
370 : }
371 : }
372 0 : elog(ERROR, "could not find temporary mapping for relation %u",
373 : relationId);
374 : }
375 :
376 : /*
377 : * RelationMapInvalidate
378 : *
379 : * This routine is invoked for SI cache flush messages. We must re-read
380 : * the indicated map file. However, we might receive a SI message in a
381 : * process that hasn't yet, and might never, load the mapping files;
382 : * for example the autovacuum launcher, which *must not* try to read
383 : * a local map since it is attached to no particular database.
384 : * So, re-read only if the map is valid now.
385 : */
386 : void
387 16 : RelationMapInvalidate(bool shared)
388 : {
389 16 : if (shared)
390 : {
391 9 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
392 9 : load_relmap_file(true);
393 : }
394 : else
395 : {
396 7 : if (local_map.magic == RELMAPPER_FILEMAGIC)
397 7 : load_relmap_file(false);
398 : }
399 16 : }
400 :
401 : /*
402 : * RelationMapInvalidateAll
403 : *
404 : * Reload all map files. This is used to recover from SI message buffer
405 : * overflow: we can't be sure if we missed an inval message.
406 : * Again, reload only currently-valid maps.
407 : */
408 : void
409 145 : RelationMapInvalidateAll(void)
410 : {
411 145 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
412 145 : load_relmap_file(true);
413 145 : if (local_map.magic == RELMAPPER_FILEMAGIC)
414 143 : load_relmap_file(false);
415 145 : }
416 :
417 : /*
418 : * AtCCI_RelationMap
419 : *
420 : * Activate any "pending" relation map updates at CommandCounterIncrement time.
421 : */
422 : void
423 22045 : AtCCI_RelationMap(void)
424 : {
425 22045 : if (pending_shared_updates.num_mappings != 0)
426 : {
427 3 : merge_map_updates(&active_shared_updates,
428 : &pending_shared_updates,
429 : true);
430 3 : pending_shared_updates.num_mappings = 0;
431 : }
432 22045 : if (pending_local_updates.num_mappings != 0)
433 : {
434 5 : merge_map_updates(&active_local_updates,
435 : &pending_local_updates,
436 : true);
437 5 : pending_local_updates.num_mappings = 0;
438 : }
439 22045 : }
440 :
441 : /*
442 : * AtEOXact_RelationMap
443 : *
444 : * Handle relation mapping at main-transaction commit or abort.
445 : *
446 : * During commit, this must be called as late as possible before the actual
447 : * transaction commit, so as to minimize the window where the transaction
448 : * could still roll back after committing map changes. Although nothing
449 : * critically bad happens in such a case, we still would prefer that it
450 : * not happen, since we'd possibly be losing useful updates to the relations'
451 : * pg_class row(s).
452 : *
453 : * During abort, we just have to throw away any pending map changes.
454 : * Normal post-abort cleanup will take care of fixing relcache entries.
455 : */
456 : void
457 26161 : AtEOXact_RelationMap(bool isCommit)
458 : {
459 26161 : if (isCommit)
460 : {
461 : /*
462 : * We should not get here with any "pending" updates. (We could
463 : * logically choose to treat such as committed, but in the current
464 : * code this should never happen.)
465 : */
466 22883 : Assert(pending_shared_updates.num_mappings == 0);
467 22883 : Assert(pending_local_updates.num_mappings == 0);
468 :
469 : /*
470 : * Write any active updates to the actual map files, then reset them.
471 : */
472 22883 : if (active_shared_updates.num_mappings != 0)
473 : {
474 1 : perform_relmap_update(true, &active_shared_updates);
475 1 : active_shared_updates.num_mappings = 0;
476 : }
477 22883 : if (active_local_updates.num_mappings != 0)
478 : {
479 1 : perform_relmap_update(false, &active_local_updates);
480 1 : active_local_updates.num_mappings = 0;
481 : }
482 : }
483 : else
484 : {
485 : /* Abort --- drop all local and pending updates */
486 3278 : active_shared_updates.num_mappings = 0;
487 3278 : active_local_updates.num_mappings = 0;
488 3278 : pending_shared_updates.num_mappings = 0;
489 3278 : pending_local_updates.num_mappings = 0;
490 : }
491 26161 : }
492 :
493 : /*
494 : * AtPrepare_RelationMap
495 : *
496 : * Handle relation mapping at PREPARE.
497 : *
498 : * Currently, we don't support preparing any transaction that changes the map.
499 : */
500 : void
501 6 : AtPrepare_RelationMap(void)
502 : {
503 12 : if (active_shared_updates.num_mappings != 0 ||
504 12 : active_local_updates.num_mappings != 0 ||
505 12 : pending_shared_updates.num_mappings != 0 ||
506 6 : pending_local_updates.num_mappings != 0)
507 0 : ereport(ERROR,
508 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
509 : errmsg("cannot PREPARE a transaction that modified relation mapping")));
510 6 : }
511 :
512 : /*
513 : * CheckPointRelationMap
514 : *
515 : * This is called during a checkpoint. It must ensure that any relation map
516 : * updates that were WAL-logged before the start of the checkpoint are
517 : * securely flushed to disk and will not need to be replayed later. This
518 : * seems unlikely to be a performance-critical issue, so we use a simple
519 : * method: we just take and release the RelationMappingLock. This ensures
520 : * that any already-logged map update is complete, because write_relmap_file
521 : * will fsync the map file before the lock is released.
522 : */
523 : void
524 11 : CheckPointRelationMap(void)
525 : {
526 11 : LWLockAcquire(RelationMappingLock, LW_SHARED);
527 11 : LWLockRelease(RelationMappingLock);
528 11 : }
529 :
530 : /*
531 : * RelationMapFinishBootstrap
532 : *
533 : * Write out the initial relation mapping files at the completion of
534 : * bootstrap. All the mapped files should have been made known to us
535 : * via RelationMapUpdateMap calls.
536 : */
537 : void
538 1 : RelationMapFinishBootstrap(void)
539 : {
540 1 : Assert(IsBootstrapProcessingMode());
541 :
542 : /* Shouldn't be anything "pending" ... */
543 1 : Assert(active_shared_updates.num_mappings == 0);
544 1 : Assert(active_local_updates.num_mappings == 0);
545 1 : Assert(pending_shared_updates.num_mappings == 0);
546 1 : Assert(pending_local_updates.num_mappings == 0);
547 :
548 : /* Write the files; no WAL or sinval needed */
549 1 : write_relmap_file(true, &shared_map, false, false, false,
550 : InvalidOid, GLOBALTABLESPACE_OID, NULL);
551 1 : write_relmap_file(false, &local_map, false, false, false,
552 : MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
553 1 : }
554 :
555 : /*
556 : * RelationMapInitialize
557 : *
558 : * This initializes the mapper module at process startup. We can't access the
559 : * database yet, so just make sure the maps are empty.
560 : */
561 : void
562 338 : RelationMapInitialize(void)
563 : {
564 : /* The static variables should initialize to zeroes, but let's be sure */
565 338 : shared_map.magic = 0; /* mark it not loaded */
566 338 : local_map.magic = 0;
567 338 : shared_map.num_mappings = 0;
568 338 : local_map.num_mappings = 0;
569 338 : active_shared_updates.num_mappings = 0;
570 338 : active_local_updates.num_mappings = 0;
571 338 : pending_shared_updates.num_mappings = 0;
572 338 : pending_local_updates.num_mappings = 0;
573 338 : }
574 :
575 : /*
576 : * RelationMapInitializePhase2
577 : *
578 : * This is called to prepare for access to pg_database during startup.
579 : * We should be able to read the shared map file now.
580 : */
581 : void
582 338 : RelationMapInitializePhase2(void)
583 : {
584 : /*
585 : * In bootstrap mode, the map file isn't there yet, so do nothing.
586 : */
587 338 : if (IsBootstrapProcessingMode())
588 339 : return;
589 :
590 : /*
591 : * Load the shared map file, die on error.
592 : */
593 337 : load_relmap_file(true);
594 : }
595 :
596 : /*
597 : * RelationMapInitializePhase3
598 : *
599 : * This is called as soon as we have determined MyDatabaseId and set up
600 : * DatabasePath. At this point we should be able to read the local map file.
601 : */
602 : void
603 336 : RelationMapInitializePhase3(void)
604 : {
605 : /*
606 : * In bootstrap mode, the map file isn't there yet, so do nothing.
607 : */
608 336 : if (IsBootstrapProcessingMode())
609 337 : return;
610 :
611 : /*
612 : * Load the local map file, die on error.
613 : */
614 335 : load_relmap_file(false);
615 : }
616 :
617 : /*
618 : * load_relmap_file -- load data from the shared or local map file
619 : *
620 : * Because the map file is essential for access to core system catalogs,
621 : * failure to read it is a fatal error.
622 : *
623 : * Note that the local case requires DatabasePath to be set up.
624 : */
625 : static void
626 978 : load_relmap_file(bool shared)
627 : {
628 : RelMapFile *map;
629 : char mapfilename[MAXPGPATH];
630 : pg_crc32c crc;
631 : int fd;
632 :
633 978 : if (shared)
634 : {
635 492 : snprintf(mapfilename, sizeof(mapfilename), "global/%s",
636 : RELMAPPER_FILENAME);
637 492 : map = &shared_map;
638 : }
639 : else
640 : {
641 486 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
642 : DatabasePath, RELMAPPER_FILENAME);
643 486 : map = &local_map;
644 : }
645 :
646 : /* Read data ... */
647 978 : fd = OpenTransientFile(mapfilename,
648 : O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
649 978 : if (fd < 0)
650 0 : ereport(FATAL,
651 : (errcode_for_file_access(),
652 : errmsg("could not open relation mapping file \"%s\": %m",
653 : mapfilename)));
654 :
655 : /*
656 : * Note: we could take RelationMappingLock in shared mode here, but it
657 : * seems unnecessary since our read() should be atomic against any
658 : * concurrent updater's write(). If the file is updated shortly after we
659 : * look, the sinval signaling mechanism will make us re-read it before we
660 : * are able to access any relation that's affected by the change.
661 : */
662 978 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_READ);
663 978 : if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile))
664 0 : ereport(FATAL,
665 : (errcode_for_file_access(),
666 : errmsg("could not read relation mapping file \"%s\": %m",
667 : mapfilename)));
668 978 : pgstat_report_wait_end();
669 :
670 978 : CloseTransientFile(fd);
671 :
672 : /* check for correct magic number, etc */
673 1956 : if (map->magic != RELMAPPER_FILEMAGIC ||
674 1956 : map->num_mappings < 0 ||
675 978 : map->num_mappings > MAX_MAPPINGS)
676 0 : ereport(FATAL,
677 : (errmsg("relation mapping file \"%s\" contains invalid data",
678 : mapfilename)));
679 :
680 : /* verify the CRC */
681 978 : INIT_CRC32C(crc);
682 978 : COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
683 978 : FIN_CRC32C(crc);
684 :
685 978 : if (!EQ_CRC32C(crc, map->crc))
686 0 : ereport(FATAL,
687 : (errmsg("relation mapping file \"%s\" contains incorrect checksum",
688 : mapfilename)));
689 978 : }
690 :
691 : /*
692 : * Write out a new shared or local map file with the given contents.
693 : *
694 : * The magic number and CRC are automatically updated in *newmap. On
695 : * success, we copy the data to the appropriate permanent static variable.
696 : *
697 : * If write_wal is TRUE then an appropriate WAL message is emitted.
698 : * (It will be false for bootstrap and WAL replay cases.)
699 : *
700 : * If send_sinval is TRUE then a SI invalidation message is sent.
701 : * (This should be true except in bootstrap case.)
702 : *
703 : * If preserve_files is TRUE then the storage manager is warned not to
704 : * delete the files listed in the map.
705 : *
706 : * Because this may be called during WAL replay when MyDatabaseId,
707 : * DatabasePath, etc aren't valid, we require the caller to pass in suitable
708 : * values. The caller is also responsible for being sure no concurrent
709 : * map update could be happening.
710 : */
711 : static void
712 4 : write_relmap_file(bool shared, RelMapFile *newmap,
713 : bool write_wal, bool send_sinval, bool preserve_files,
714 : Oid dbid, Oid tsid, const char *dbpath)
715 : {
716 : int fd;
717 : RelMapFile *realmap;
718 : char mapfilename[MAXPGPATH];
719 :
720 : /*
721 : * Fill in the overhead fields and update CRC.
722 : */
723 4 : newmap->magic = RELMAPPER_FILEMAGIC;
724 4 : if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
725 0 : elog(ERROR, "attempt to write bogus relation mapping");
726 :
727 4 : INIT_CRC32C(newmap->crc);
728 4 : COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
729 4 : FIN_CRC32C(newmap->crc);
730 :
731 : /*
732 : * Open the target file. We prefer to do this before entering the
733 : * critical section, so that an open() failure need not force PANIC.
734 : */
735 4 : if (shared)
736 : {
737 2 : snprintf(mapfilename, sizeof(mapfilename), "global/%s",
738 : RELMAPPER_FILENAME);
739 2 : realmap = &shared_map;
740 : }
741 : else
742 : {
743 2 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
744 : dbpath, RELMAPPER_FILENAME);
745 2 : realmap = &local_map;
746 : }
747 :
748 4 : fd = OpenTransientFile(mapfilename,
749 : O_WRONLY | O_CREAT | PG_BINARY,
750 : S_IRUSR | S_IWUSR);
751 4 : if (fd < 0)
752 0 : ereport(ERROR,
753 : (errcode_for_file_access(),
754 : errmsg("could not open relation mapping file \"%s\": %m",
755 : mapfilename)));
756 :
757 4 : if (write_wal)
758 : {
759 : xl_relmap_update xlrec;
760 : XLogRecPtr lsn;
761 :
762 : /* now errors are fatal ... */
763 2 : START_CRIT_SECTION();
764 :
765 2 : xlrec.dbid = dbid;
766 2 : xlrec.tsid = tsid;
767 2 : xlrec.nbytes = sizeof(RelMapFile);
768 :
769 2 : XLogBeginInsert();
770 2 : XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
771 2 : XLogRegisterData((char *) newmap, sizeof(RelMapFile));
772 :
773 2 : lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
774 :
775 : /* As always, WAL must hit the disk before the data update does */
776 2 : XLogFlush(lsn);
777 : }
778 :
779 4 : errno = 0;
780 4 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
781 4 : if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
782 : {
783 : /* if write didn't set errno, assume problem is no disk space */
784 0 : if (errno == 0)
785 0 : errno = ENOSPC;
786 0 : ereport(ERROR,
787 : (errcode_for_file_access(),
788 : errmsg("could not write to relation mapping file \"%s\": %m",
789 : mapfilename)));
790 : }
791 4 : pgstat_report_wait_end();
792 :
793 : /*
794 : * We choose to fsync the data to disk before considering the task done.
795 : * It would be possible to relax this if it turns out to be a performance
796 : * issue, but it would complicate checkpointing --- see notes for
797 : * CheckPointRelationMap.
798 : */
799 4 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
800 4 : if (pg_fsync(fd) != 0)
801 0 : ereport(ERROR,
802 : (errcode_for_file_access(),
803 : errmsg("could not fsync relation mapping file \"%s\": %m",
804 : mapfilename)));
805 4 : pgstat_report_wait_end();
806 :
807 4 : if (CloseTransientFile(fd))
808 0 : ereport(ERROR,
809 : (errcode_for_file_access(),
810 : errmsg("could not close relation mapping file \"%s\": %m",
811 : mapfilename)));
812 :
813 : /*
814 : * Now that the file is safely on disk, send sinval message to let other
815 : * backends know to re-read it. We must do this inside the critical
816 : * section: if for some reason we fail to send the message, we have to
817 : * force a database-wide PANIC. Otherwise other backends might continue
818 : * execution with stale mapping information, which would be catastrophic
819 : * as soon as others began to use the now-committed data.
820 : */
821 4 : if (send_sinval)
822 2 : CacheInvalidateRelmap(dbid);
823 :
824 : /*
825 : * Make sure that the files listed in the map are not deleted if the outer
826 : * transaction aborts. This had better be within the critical section
827 : * too: it's not likely to fail, but if it did, we'd arrive at transaction
828 : * abort with the files still vulnerable. PANICing will leave things in a
829 : * good state on-disk.
830 : *
831 : * Note: we're cheating a little bit here by assuming that mapped files
832 : * are either in pg_global or the database's default tablespace.
833 : */
834 4 : if (preserve_files)
835 : {
836 : int32 i;
837 :
838 52 : for (i = 0; i < newmap->num_mappings; i++)
839 : {
840 : RelFileNode rnode;
841 :
842 50 : rnode.spcNode = tsid;
843 50 : rnode.dbNode = dbid;
844 50 : rnode.relNode = newmap->mappings[i].mapfilenode;
845 50 : RelationPreserveStorage(rnode, false);
846 : }
847 : }
848 :
849 : /* Success, update permanent copy */
850 4 : memcpy(realmap, newmap, sizeof(RelMapFile));
851 :
852 : /* Critical section done */
853 4 : if (write_wal)
854 2 : END_CRIT_SECTION();
855 4 : }
856 :
857 : /*
858 : * Merge the specified updates into the appropriate "real" map,
859 : * and write out the changes. This function must be used for committing
860 : * updates during normal multiuser operation.
861 : */
862 : static void
863 2 : perform_relmap_update(bool shared, const RelMapFile *updates)
864 : {
865 : RelMapFile newmap;
866 :
867 : /*
868 : * Anyone updating a relation's mapping info should take exclusive lock on
869 : * that rel and hold it until commit. This ensures that there will not be
870 : * concurrent updates on the same mapping value; but there could easily be
871 : * concurrent updates on different values in the same file. We cover that
872 : * by acquiring the RelationMappingLock, re-reading the target file to
873 : * ensure it's up to date, applying the updates, and writing the data
874 : * before releasing RelationMappingLock.
875 : *
876 : * There is only one RelationMappingLock. In principle we could try to
877 : * have one per mapping file, but it seems unlikely to be worth the
878 : * trouble.
879 : */
880 2 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
881 :
882 : /* Be certain we see any other updates just made */
883 2 : load_relmap_file(shared);
884 :
885 : /* Prepare updated data in a local variable */
886 2 : if (shared)
887 1 : memcpy(&newmap, &shared_map, sizeof(RelMapFile));
888 : else
889 1 : memcpy(&newmap, &local_map, sizeof(RelMapFile));
890 :
891 : /*
892 : * Apply the updates to newmap. No new mappings should appear, unless
893 : * somebody is adding indexes to system catalogs.
894 : */
895 2 : merge_map_updates(&newmap, updates, allowSystemTableMods);
896 :
897 : /* Write out the updated map and do other necessary tasks */
898 2 : write_relmap_file(shared, &newmap, true, true, true,
899 : (shared ? InvalidOid : MyDatabaseId),
900 : (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
901 : DatabasePath);
902 :
903 : /* Now we can release the lock */
904 2 : LWLockRelease(RelationMappingLock);
905 2 : }
906 :
907 : /*
908 : * RELMAP resource manager's routines
909 : */
910 : void
911 0 : relmap_redo(XLogReaderState *record)
912 : {
913 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
914 :
915 : /* Backup blocks are not used in relmap records */
916 0 : Assert(!XLogRecHasAnyBlockRefs(record));
917 :
918 0 : if (info == XLOG_RELMAP_UPDATE)
919 : {
920 0 : xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
921 : RelMapFile newmap;
922 : char *dbpath;
923 :
924 0 : if (xlrec->nbytes != sizeof(RelMapFile))
925 0 : elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
926 : xlrec->nbytes);
927 0 : memcpy(&newmap, xlrec->data, sizeof(newmap));
928 :
929 : /* We need to construct the pathname for this database */
930 0 : dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
931 :
932 : /*
933 : * Write out the new map and send sinval, but of course don't write a
934 : * new WAL entry. There's no surrounding transaction to tell to
935 : * preserve files, either.
936 : *
937 : * There shouldn't be anyone else updating relmaps during WAL replay,
938 : * so we don't bother to take the RelationMappingLock. We would need
939 : * to do so if load_relmap_file needed to interlock against writers.
940 : */
941 0 : write_relmap_file((xlrec->dbid == InvalidOid), &newmap,
942 : false, true, false,
943 : xlrec->dbid, xlrec->tsid, dbpath);
944 :
945 0 : pfree(dbpath);
946 : }
947 : else
948 0 : elog(PANIC, "relmap_redo: unknown op code %u", info);
949 0 : }
|