Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * cluster.c
4 : * CLUSTER a table on an index. This is now also used for VACUUM FULL.
5 : *
6 : * There is hardly anything left of Paul Brown's original implementation...
7 : *
8 : *
9 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994-5, Regents of the University of California
11 : *
12 : *
13 : * IDENTIFICATION
14 : * src/backend/commands/cluster.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 :
20 : #include "access/amapi.h"
21 : #include "access/multixact.h"
22 : #include "access/relscan.h"
23 : #include "access/rewriteheap.h"
24 : #include "access/transam.h"
25 : #include "access/tuptoaster.h"
26 : #include "access/xact.h"
27 : #include "access/xlog.h"
28 : #include "catalog/pg_am.h"
29 : #include "catalog/catalog.h"
30 : #include "catalog/dependency.h"
31 : #include "catalog/heap.h"
32 : #include "catalog/index.h"
33 : #include "catalog/namespace.h"
34 : #include "catalog/objectaccess.h"
35 : #include "catalog/toasting.h"
36 : #include "commands/cluster.h"
37 : #include "commands/tablecmds.h"
38 : #include "commands/vacuum.h"
39 : #include "miscadmin.h"
40 : #include "optimizer/planner.h"
41 : #include "storage/bufmgr.h"
42 : #include "storage/lmgr.h"
43 : #include "storage/predicate.h"
44 : #include "storage/smgr.h"
45 : #include "utils/acl.h"
46 : #include "utils/fmgroids.h"
47 : #include "utils/inval.h"
48 : #include "utils/lsyscache.h"
49 : #include "utils/memutils.h"
50 : #include "utils/pg_rusage.h"
51 : #include "utils/relmapper.h"
52 : #include "utils/snapmgr.h"
53 : #include "utils/syscache.h"
54 : #include "utils/tqual.h"
55 : #include "utils/tuplesort.h"
56 :
57 :
58 : /*
59 : * This struct is used to pass around the information on tables to be
60 : * clustered. We need this so we can make a list of them when invoked without
61 : * a specific table/index pair.
62 : */
63 : typedef struct
64 : {
65 : Oid tableOid;
66 : Oid indexOid;
67 : } RelToCluster;
68 :
69 :
70 : static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
71 : static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
72 : bool verbose, bool *pSwapToastByContent,
73 : TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
74 : static List *get_tables_to_cluster(MemoryContext cluster_context);
75 : static void reform_and_rewrite_tuple(HeapTuple tuple,
76 : TupleDesc oldTupDesc, TupleDesc newTupDesc,
77 : Datum *values, bool *isnull,
78 : bool newRelHasOids, RewriteState rwstate);
79 :
80 :
81 : /*---------------------------------------------------------------------------
82 : * This cluster code allows for clustering multiple tables at once. Because
83 : * of this, we cannot just run everything on a single transaction, or we
84 : * would be forced to acquire exclusive locks on all the tables being
85 : * clustered, simultaneously --- very likely leading to deadlock.
86 : *
87 : * To solve this we follow a similar strategy to VACUUM code,
88 : * clustering each relation in a separate transaction. For this to work,
89 : * we need to:
90 : * - provide a separate memory context so that we can pass information in
91 : * a way that survives across transactions
92 : * - start a new transaction every time a new relation is clustered
93 : * - check for validity of the information on to-be-clustered relations,
94 : * as someone might have deleted a relation behind our back, or
95 : * clustered one on a different index
96 : * - end the transaction
97 : *
98 : * The single-relation case does not have any such overhead.
99 : *
100 : * We also allow a relation to be specified without index. In that case,
101 : * the indisclustered bit will be looked up, and an ERROR will be thrown
102 : * if there is no index with the bit set.
103 : *---------------------------------------------------------------------------
104 : */
105 : void
106 11 : cluster(ClusterStmt *stmt, bool isTopLevel)
107 : {
108 11 : if (stmt->relation != NULL)
109 : {
110 : /* This is the single-relation case. */
111 : Oid tableOid,
112 10 : indexOid = InvalidOid;
113 : Relation rel;
114 :
115 : /* Find, lock, and check permissions on the table */
116 10 : tableOid = RangeVarGetRelidExtended(stmt->relation,
117 : AccessExclusiveLock,
118 : false, false,
119 : RangeVarCallbackOwnsTable, NULL);
120 10 : rel = heap_open(tableOid, NoLock);
121 :
122 : /*
123 : * Reject clustering a remote temp table ... their local buffer
124 : * manager is not going to cope.
125 : */
126 10 : if (RELATION_IS_OTHER_TEMP(rel))
127 0 : ereport(ERROR,
128 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
129 : errmsg("cannot cluster temporary tables of other sessions")));
130 :
131 10 : if (stmt->indexname == NULL)
132 : {
133 : ListCell *index;
134 :
135 : /* We need to find the index that has indisclustered set. */
136 4 : foreach(index, RelationGetIndexList(rel))
137 : {
138 : HeapTuple idxtuple;
139 : Form_pg_index indexForm;
140 :
141 3 : indexOid = lfirst_oid(index);
142 3 : idxtuple = SearchSysCache1(INDEXRELID,
143 : ObjectIdGetDatum(indexOid));
144 3 : if (!HeapTupleIsValid(idxtuple))
145 0 : elog(ERROR, "cache lookup failed for index %u", indexOid);
146 3 : indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
147 3 : if (indexForm->indisclustered)
148 : {
149 2 : ReleaseSysCache(idxtuple);
150 2 : break;
151 : }
152 1 : ReleaseSysCache(idxtuple);
153 1 : indexOid = InvalidOid;
154 : }
155 :
156 3 : if (!OidIsValid(indexOid))
157 1 : ereport(ERROR,
158 : (errcode(ERRCODE_UNDEFINED_OBJECT),
159 : errmsg("there is no previously clustered index for table \"%s\"",
160 : stmt->relation->relname)));
161 : }
162 : else
163 : {
164 : /*
165 : * The index is expected to be in the same namespace as the
166 : * relation.
167 : */
168 7 : indexOid = get_relname_relid(stmt->indexname,
169 7 : rel->rd_rel->relnamespace);
170 7 : if (!OidIsValid(indexOid))
171 0 : ereport(ERROR,
172 : (errcode(ERRCODE_UNDEFINED_OBJECT),
173 : errmsg("index \"%s\" for table \"%s\" does not exist",
174 : stmt->indexname, stmt->relation->relname)));
175 : }
176 :
177 : /* close relation, keep lock till commit */
178 9 : heap_close(rel, NoLock);
179 :
180 : /* Do the job. */
181 9 : cluster_rel(tableOid, indexOid, false, stmt->verbose);
182 : }
183 : else
184 : {
185 : /*
186 : * This is the "multi relation" case. We need to cluster all tables
187 : * that have some index with indisclustered set.
188 : */
189 : MemoryContext cluster_context;
190 : List *rvs;
191 : ListCell *rv;
192 :
193 : /*
194 : * We cannot run this form of CLUSTER inside a user transaction block;
195 : * we'd be holding locks way too long.
196 : */
197 1 : PreventTransactionChain(isTopLevel, "CLUSTER");
198 :
199 : /*
200 : * Create special memory context for cross-transaction storage.
201 : *
202 : * Since it is a child of PortalContext, it will go away even in case
203 : * of error.
204 : */
205 1 : cluster_context = AllocSetContextCreate(PortalContext,
206 : "Cluster",
207 : ALLOCSET_DEFAULT_SIZES);
208 :
209 : /*
210 : * Build the list of relations to cluster. Note that this lives in
211 : * cluster_context.
212 : */
213 1 : rvs = get_tables_to_cluster(cluster_context);
214 :
215 : /* Commit to get out of starting transaction */
216 1 : PopActiveSnapshot();
217 1 : CommitTransactionCommand();
218 :
219 : /* Ok, now that we've got them all, cluster them one by one */
220 2 : foreach(rv, rvs)
221 : {
222 1 : RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
223 :
224 : /* Start a new transaction for each relation. */
225 1 : StartTransactionCommand();
226 : /* functions in indexes may want a snapshot set */
227 1 : PushActiveSnapshot(GetTransactionSnapshot());
228 : /* Do the job. */
229 1 : cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose);
230 1 : PopActiveSnapshot();
231 1 : CommitTransactionCommand();
232 : }
233 :
234 : /* Start a new transaction for the cleanup work. */
235 1 : StartTransactionCommand();
236 :
237 : /* Clean up working storage */
238 1 : MemoryContextDelete(cluster_context);
239 : }
240 10 : }
241 :
242 : /*
243 : * cluster_rel
244 : *
245 : * This clusters the table by creating a new, clustered table and
246 : * swapping the relfilenodes of the new table and the old table, so
247 : * the OID of the original table is preserved. Thus we do not lose
248 : * GRANT, inheritance nor references to this table (this was a bug
249 : * in releases through 7.3).
250 : *
251 : * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
252 : * the new table, it's better to create the indexes afterwards than to fill
253 : * them incrementally while we load the table.
254 : *
255 : * If indexOid is InvalidOid, the table will be rewritten in physical order
256 : * instead of index order. This is the new implementation of VACUUM FULL,
257 : * and error messages should refer to the operation as VACUUM not CLUSTER.
258 : */
259 : void
260 22 : cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose)
261 : {
262 : Relation OldHeap;
263 :
264 : /* Check for user-requested abort. */
265 22 : CHECK_FOR_INTERRUPTS();
266 :
267 : /*
268 : * We grab exclusive access to the target rel and index for the duration
269 : * of the transaction. (This is redundant for the single-transaction
270 : * case, since cluster() already did it.) The index lock is taken inside
271 : * check_index_is_clusterable.
272 : */
273 22 : OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
274 :
275 : /* If the table has gone away, we can skip processing it */
276 22 : if (!OldHeap)
277 0 : return;
278 :
279 : /*
280 : * Since we may open a new transaction for each relation, we have to check
281 : * that the relation still is what we think it is.
282 : *
283 : * If this is a single-transaction CLUSTER, we can skip these tests. We
284 : * *must* skip the one on indisclustered since it would reject an attempt
285 : * to cluster a not-previously-clustered index.
286 : */
287 22 : if (recheck)
288 : {
289 : HeapTuple tuple;
290 : Form_pg_index indexForm;
291 :
292 : /* Check that the user still owns the relation */
293 1 : if (!pg_class_ownercheck(tableOid, GetUserId()))
294 : {
295 0 : relation_close(OldHeap, AccessExclusiveLock);
296 0 : return;
297 : }
298 :
299 : /*
300 : * Silently skip a temp table for a remote session. Only doing this
301 : * check in the "recheck" case is appropriate (which currently means
302 : * somebody is executing a database-wide CLUSTER), because there is
303 : * another check in cluster() which will stop any attempt to cluster
304 : * remote temp tables by name. There is another check in cluster_rel
305 : * which is redundant, but we leave it for extra safety.
306 : */
307 1 : if (RELATION_IS_OTHER_TEMP(OldHeap))
308 : {
309 0 : relation_close(OldHeap, AccessExclusiveLock);
310 0 : return;
311 : }
312 :
313 1 : if (OidIsValid(indexOid))
314 : {
315 : /*
316 : * Check that the index still exists
317 : */
318 1 : if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
319 : {
320 0 : relation_close(OldHeap, AccessExclusiveLock);
321 0 : return;
322 : }
323 :
324 : /*
325 : * Check that the index is still the one with indisclustered set.
326 : */
327 1 : tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
328 1 : if (!HeapTupleIsValid(tuple)) /* probably can't happen */
329 : {
330 0 : relation_close(OldHeap, AccessExclusiveLock);
331 0 : return;
332 : }
333 1 : indexForm = (Form_pg_index) GETSTRUCT(tuple);
334 1 : if (!indexForm->indisclustered)
335 : {
336 0 : ReleaseSysCache(tuple);
337 0 : relation_close(OldHeap, AccessExclusiveLock);
338 0 : return;
339 : }
340 1 : ReleaseSysCache(tuple);
341 : }
342 : }
343 :
344 : /*
345 : * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
346 : * would work in most respects, but the index would only get marked as
347 : * indisclustered in the current database, leading to unexpected behavior
348 : * if CLUSTER were later invoked in another database.
349 : */
350 22 : if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
351 0 : ereport(ERROR,
352 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
353 : errmsg("cannot cluster a shared catalog")));
354 :
355 : /*
356 : * Don't process temp tables of other backends ... their local buffer
357 : * manager is not going to cope.
358 : */
359 22 : if (RELATION_IS_OTHER_TEMP(OldHeap))
360 : {
361 0 : if (OidIsValid(indexOid))
362 0 : ereport(ERROR,
363 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
364 : errmsg("cannot cluster temporary tables of other sessions")));
365 : else
366 0 : ereport(ERROR,
367 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
368 : errmsg("cannot vacuum temporary tables of other sessions")));
369 : }
370 :
371 : /*
372 : * Also check for active uses of the relation in the current transaction,
373 : * including open scans and pending AFTER trigger events.
374 : */
375 22 : CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
376 :
377 : /* Check heap and index are valid to cluster on */
378 22 : if (OidIsValid(indexOid))
379 10 : check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
380 :
381 : /*
382 : * Quietly ignore the request if this is a materialized view which has not
383 : * been populated from its query. No harm is done because there is no data
384 : * to deal with, and we don't want to throw an error if this is part of a
385 : * multi-relation request -- for example, CLUSTER was run on the entire
386 : * database.
387 : */
388 22 : if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
389 0 : !RelationIsPopulated(OldHeap))
390 : {
391 0 : relation_close(OldHeap, AccessExclusiveLock);
392 0 : return;
393 : }
394 :
395 : /*
396 : * All predicate locks on the tuples or pages are about to be made
397 : * invalid, because we move tuples around. Promote them to relation
398 : * locks. Predicate locks on indexes will be promoted when they are
399 : * reindexed.
400 : */
401 22 : TransferPredicateLocksToHeapRelation(OldHeap);
402 :
403 : /* rebuild_relation does all the dirty work */
404 22 : rebuild_relation(OldHeap, indexOid, verbose);
405 :
406 : /* NB: rebuild_relation does heap_close() on OldHeap */
407 : }
408 :
409 : /*
410 : * Verify that the specified heap and index are valid to cluster on
411 : *
412 : * Side effect: obtains lock on the index. The caller may
413 : * in some cases already have AccessExclusiveLock on the table, but
414 : * not in all cases so we can't rely on the table-level lock for
415 : * protection here.
416 : */
417 : void
418 13 : check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
419 : {
420 : Relation OldIndex;
421 :
422 13 : OldIndex = index_open(indexOid, lockmode);
423 :
424 : /*
425 : * Check that index is in fact an index on the given relation
426 : */
427 26 : if (OldIndex->rd_index == NULL ||
428 13 : OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
429 0 : ereport(ERROR,
430 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
431 : errmsg("\"%s\" is not an index for table \"%s\"",
432 : RelationGetRelationName(OldIndex),
433 : RelationGetRelationName(OldHeap))));
434 :
435 : /* Index AM must allow clustering */
436 13 : if (!OldIndex->rd_amroutine->amclusterable)
437 0 : ereport(ERROR,
438 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
439 : errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
440 : RelationGetRelationName(OldIndex))));
441 :
442 : /*
443 : * Disallow clustering on incomplete indexes (those that might not index
444 : * every row of the relation). We could relax this by making a separate
445 : * seqscan pass over the table to copy the missing rows, but that seems
446 : * expensive and tedious.
447 : */
448 13 : if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred))
449 0 : ereport(ERROR,
450 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
451 : errmsg("cannot cluster on partial index \"%s\"",
452 : RelationGetRelationName(OldIndex))));
453 :
454 : /*
455 : * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
456 : * it might well not contain entries for every heap row, or might not even
457 : * be internally consistent. (But note that we don't check indcheckxmin;
458 : * the worst consequence of following broken HOT chains would be that we
459 : * might put recently-dead tuples out-of-order in the new table, and there
460 : * is little harm in that.)
461 : */
462 13 : if (!IndexIsValid(OldIndex->rd_index))
463 0 : ereport(ERROR,
464 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
465 : errmsg("cannot cluster on invalid index \"%s\"",
466 : RelationGetRelationName(OldIndex))));
467 :
468 : /* Drop relcache refcnt on OldIndex, but keep lock */
469 13 : index_close(OldIndex, NoLock);
470 13 : }
471 :
472 : /*
473 : * mark_index_clustered: mark the specified index as the one clustered on
474 : *
475 : * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
476 : */
477 : void
478 15 : mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
479 : {
480 : HeapTuple indexTuple;
481 : Form_pg_index indexForm;
482 : Relation pg_index;
483 : ListCell *index;
484 :
485 : /*
486 : * If the index is already marked clustered, no need to do anything.
487 : */
488 15 : if (OidIsValid(indexOid))
489 : {
490 13 : indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
491 13 : if (!HeapTupleIsValid(indexTuple))
492 0 : elog(ERROR, "cache lookup failed for index %u", indexOid);
493 13 : indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
494 :
495 13 : if (indexForm->indisclustered)
496 : {
497 4 : ReleaseSysCache(indexTuple);
498 19 : return;
499 : }
500 :
501 9 : ReleaseSysCache(indexTuple);
502 : }
503 :
504 : /*
505 : * Check each index of the relation and set/clear the bit as needed.
506 : */
507 11 : pg_index = heap_open(IndexRelationId, RowExclusiveLock);
508 :
509 34 : foreach(index, RelationGetIndexList(rel))
510 : {
511 23 : Oid thisIndexOid = lfirst_oid(index);
512 :
513 23 : indexTuple = SearchSysCacheCopy1(INDEXRELID,
514 : ObjectIdGetDatum(thisIndexOid));
515 23 : if (!HeapTupleIsValid(indexTuple))
516 0 : elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
517 23 : indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
518 :
519 : /*
520 : * Unset the bit if set. We know it's wrong because we checked this
521 : * earlier.
522 : */
523 23 : if (indexForm->indisclustered)
524 : {
525 3 : indexForm->indisclustered = false;
526 3 : CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
527 : }
528 20 : else if (thisIndexOid == indexOid)
529 : {
530 : /* this was checked earlier, but let's be real sure */
531 9 : if (!IndexIsValid(indexForm))
532 0 : elog(ERROR, "cannot cluster on invalid index %u", indexOid);
533 9 : indexForm->indisclustered = true;
534 9 : CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
535 : }
536 :
537 23 : InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
538 : InvalidOid, is_internal);
539 :
540 23 : heap_freetuple(indexTuple);
541 : }
542 :
543 11 : heap_close(pg_index, RowExclusiveLock);
544 : }
545 :
546 : /*
547 : * rebuild_relation: rebuild an existing relation in index or physical order
548 : *
549 : * OldHeap: table to rebuild --- must be opened and exclusive-locked!
550 : * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
551 : *
552 : * NB: this routine closes OldHeap at the right time; caller should not.
553 : */
554 : static void
555 22 : rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
556 : {
557 22 : Oid tableOid = RelationGetRelid(OldHeap);
558 22 : Oid tableSpace = OldHeap->rd_rel->reltablespace;
559 : Oid OIDNewHeap;
560 : char relpersistence;
561 : bool is_system_catalog;
562 : bool swap_toast_by_content;
563 : TransactionId frozenXid;
564 : MultiXactId cutoffMulti;
565 :
566 : /* Mark the correct index as clustered */
567 22 : if (OidIsValid(indexOid))
568 10 : mark_index_clustered(OldHeap, indexOid, true);
569 :
570 : /* Remember info about rel before closing OldHeap */
571 22 : relpersistence = OldHeap->rd_rel->relpersistence;
572 22 : is_system_catalog = IsSystemRelation(OldHeap);
573 :
574 : /* Close relcache entry, but keep lock until transaction commit */
575 22 : heap_close(OldHeap, NoLock);
576 :
577 : /* Create the transient table that will receive the re-ordered data */
578 22 : OIDNewHeap = make_new_heap(tableOid, tableSpace,
579 : relpersistence,
580 : AccessExclusiveLock);
581 :
582 : /* Copy the heap data into the new table in the desired order */
583 22 : copy_heap_data(OIDNewHeap, tableOid, indexOid, verbose,
584 : &swap_toast_by_content, &frozenXid, &cutoffMulti);
585 :
586 : /*
587 : * Swap the physical files of the target and transient tables, then
588 : * rebuild the target's indexes and throw away the transient table.
589 : */
590 22 : finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
591 : swap_toast_by_content, false, true,
592 : frozenXid, cutoffMulti,
593 : relpersistence);
594 21 : }
595 :
596 :
597 : /*
598 : * Create the transient table that will be filled with new data during
599 : * CLUSTER, ALTER TABLE, and similar operations. The transient table
600 : * duplicates the logical structure of the OldHeap, but is placed in
601 : * NewTableSpace which might be different from OldHeap's. Also, it's built
602 : * with the specified persistence, which might differ from the original's.
603 : *
604 : * After this, the caller should load the new heap with transferred/modified
605 : * data, then call finish_heap_swap to complete the operation.
606 : */
607 : Oid
608 114 : make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
609 : LOCKMODE lockmode)
610 : {
611 : TupleDesc OldHeapDesc;
612 : char NewHeapName[NAMEDATALEN];
613 : Oid OIDNewHeap;
614 : Oid toastid;
615 : Relation OldHeap;
616 : HeapTuple tuple;
617 : Datum reloptions;
618 : bool isNull;
619 : Oid namespaceid;
620 :
621 114 : OldHeap = heap_open(OIDOldHeap, lockmode);
622 114 : OldHeapDesc = RelationGetDescr(OldHeap);
623 :
624 : /*
625 : * Note that the NewHeap will not receive any of the defaults or
626 : * constraints associated with the OldHeap; we don't need 'em, and there's
627 : * no reason to spend cycles inserting them into the catalogs only to
628 : * delete them.
629 : */
630 :
631 : /*
632 : * But we do want to use reloptions of the old heap for new heap.
633 : */
634 114 : tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
635 114 : if (!HeapTupleIsValid(tuple))
636 0 : elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
637 114 : reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
638 : &isNull);
639 114 : if (isNull)
640 114 : reloptions = (Datum) 0;
641 :
642 114 : if (relpersistence == RELPERSISTENCE_TEMP)
643 12 : namespaceid = LookupCreationNamespace("pg_temp");
644 : else
645 102 : namespaceid = RelationGetNamespace(OldHeap);
646 :
647 : /*
648 : * Create the new heap, using a temporary name in the same namespace as
649 : * the existing table. NOTE: there is some risk of collision with user
650 : * relnames. Working around this seems more trouble than it's worth; in
651 : * particular, we can't create the new heap in a different namespace from
652 : * the old, or we will have problems with the TEMP status of temp tables.
653 : *
654 : * Note: the new heap is not a shared relation, even if we are rebuilding
655 : * a shared rel. However, we do make the new heap mapped if the source is
656 : * mapped. This simplifies swap_relation_files, and is absolutely
657 : * necessary for rebuilding pg_class, for reasons explained there.
658 : */
659 114 : snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
660 :
661 228 : OIDNewHeap = heap_create_with_catalog(NewHeapName,
662 : namespaceid,
663 : NewTableSpace,
664 : InvalidOid,
665 : InvalidOid,
666 : InvalidOid,
667 114 : OldHeap->rd_rel->relowner,
668 : OldHeapDesc,
669 : NIL,
670 : RELKIND_RELATION,
671 : relpersistence,
672 : false,
673 114 : RelationIsMapped(OldHeap),
674 : true,
675 : 0,
676 : ONCOMMIT_NOOP,
677 : reloptions,
678 : false,
679 : true,
680 : true,
681 : NULL);
682 114 : Assert(OIDNewHeap != InvalidOid);
683 :
684 114 : ReleaseSysCache(tuple);
685 :
686 : /*
687 : * Advance command counter so that the newly-created relation's catalog
688 : * tuples will be visible to heap_open.
689 : */
690 114 : CommandCounterIncrement();
691 :
692 : /*
693 : * If necessary, create a TOAST table for the new relation.
694 : *
695 : * If the relation doesn't have a TOAST table already, we can't need one
696 : * for the new relation. The other way around is possible though: if some
697 : * wide columns have been dropped, NewHeapCreateToastTable can decide that
698 : * no TOAST table is needed for the new table.
699 : *
700 : * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
701 : * that the TOAST table will be visible for insertion.
702 : */
703 114 : toastid = OldHeap->rd_rel->reltoastrelid;
704 114 : if (OidIsValid(toastid))
705 : {
706 : /* keep the existing toast table's reloptions, if any */
707 53 : tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
708 53 : if (!HeapTupleIsValid(tuple))
709 0 : elog(ERROR, "cache lookup failed for relation %u", toastid);
710 53 : reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
711 : &isNull);
712 53 : if (isNull)
713 53 : reloptions = (Datum) 0;
714 :
715 53 : NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode);
716 :
717 53 : ReleaseSysCache(tuple);
718 : }
719 :
720 114 : heap_close(OldHeap, NoLock);
721 :
722 114 : return OIDNewHeap;
723 : }
724 :
725 : /*
726 : * Do the physical copying of heap data.
727 : *
728 : * There are three output parameters:
729 : * *pSwapToastByContent is set true if toast tables must be swapped by content.
730 : * *pFreezeXid receives the TransactionId used as freeze cutoff point.
731 : * *pCutoffMulti receives the MultiXactId used as a cutoff point.
732 : */
733 : static void
734 22 : copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
735 : bool *pSwapToastByContent, TransactionId *pFreezeXid,
736 : MultiXactId *pCutoffMulti)
737 : {
738 : Relation NewHeap,
739 : OldHeap,
740 : OldIndex;
741 : TupleDesc oldTupDesc;
742 : TupleDesc newTupDesc;
743 : int natts;
744 : Datum *values;
745 : bool *isnull;
746 : IndexScanDesc indexScan;
747 : HeapScanDesc heapScan;
748 : bool use_wal;
749 : bool is_system_catalog;
750 : TransactionId OldestXmin;
751 : TransactionId FreezeXid;
752 : MultiXactId MultiXactCutoff;
753 : RewriteState rwstate;
754 : bool use_sort;
755 : Tuplesortstate *tuplesort;
756 22 : double num_tuples = 0,
757 22 : tups_vacuumed = 0,
758 22 : tups_recently_dead = 0;
759 22 : int elevel = verbose ? INFO : DEBUG2;
760 : PGRUsage ru0;
761 :
762 22 : pg_rusage_init(&ru0);
763 :
764 : /*
765 : * Open the relations we need.
766 : */
767 22 : NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
768 22 : OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
769 22 : if (OidIsValid(OIDOldIndex))
770 10 : OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
771 : else
772 12 : OldIndex = NULL;
773 :
774 : /*
775 : * Their tuple descriptors should be exactly alike, but here we only need
776 : * assume that they have the same number of columns.
777 : */
778 22 : oldTupDesc = RelationGetDescr(OldHeap);
779 22 : newTupDesc = RelationGetDescr(NewHeap);
780 22 : Assert(newTupDesc->natts == oldTupDesc->natts);
781 :
782 : /* Preallocate values/isnull arrays */
783 22 : natts = newTupDesc->natts;
784 22 : values = (Datum *) palloc(natts * sizeof(Datum));
785 22 : isnull = (bool *) palloc(natts * sizeof(bool));
786 :
787 : /*
788 : * If the OldHeap has a toast table, get lock on the toast table to keep
789 : * it from being vacuumed. This is needed because autovacuum processes
790 : * toast tables independently of their main tables, with no lock on the
791 : * latter. If an autovacuum were to start on the toast table after we
792 : * compute our OldestXmin below, it would use a later OldestXmin, and then
793 : * possibly remove as DEAD toast tuples belonging to main tuples we think
794 : * are only RECENTLY_DEAD. Then we'd fail while trying to copy those
795 : * tuples.
796 : *
797 : * We don't need to open the toast relation here, just lock it. The lock
798 : * will be held till end of transaction.
799 : */
800 22 : if (OldHeap->rd_rel->reltoastrelid)
801 4 : LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
802 :
803 : /*
804 : * We need to log the copied data in WAL iff WAL archiving/streaming is
805 : * enabled AND it's a WAL-logged rel.
806 : */
807 22 : use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
808 :
809 : /* use_wal off requires smgr_targblock be initially invalid */
810 22 : Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
811 :
812 : /*
813 : * If both tables have TOAST tables, perform toast swap by content. It is
814 : * possible that the old table has a toast table but the new one doesn't,
815 : * if toastable columns have been dropped. In that case we have to do
816 : * swap by links. This is okay because swap by content is only essential
817 : * for system catalogs, and we don't support schema changes for them.
818 : */
819 22 : if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
820 : {
821 4 : *pSwapToastByContent = true;
822 :
823 : /*
824 : * When doing swap by content, any toast pointers written into NewHeap
825 : * must use the old toast table's OID, because that's where the toast
826 : * data will eventually be found. Set this up by setting rd_toastoid.
827 : * This also tells toast_save_datum() to preserve the toast value
828 : * OIDs, which we want so as not to invalidate toast pointers in
829 : * system catalog caches, and to avoid making multiple copies of a
830 : * single toast value.
831 : *
832 : * Note that we must hold NewHeap open until we are done writing data,
833 : * since the relcache will not guarantee to remember this setting once
834 : * the relation is closed. Also, this technique depends on the fact
835 : * that no one will try to read from the NewHeap until after we've
836 : * finished writing it and swapping the rels --- otherwise they could
837 : * follow the toast pointers to the wrong place. (It would actually
838 : * work for values copied over from the old toast table, but not for
839 : * any values that we toast which were previously not toasted.)
840 : */
841 4 : NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
842 : }
843 : else
844 18 : *pSwapToastByContent = false;
845 :
846 : /*
847 : * Compute xids used to freeze and weed out dead tuples and multixacts.
848 : * Since we're going to rewrite the whole table anyway, there's no reason
849 : * not to be aggressive about this.
850 : */
851 22 : vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
852 : &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
853 : NULL);
854 :
855 : /*
856 : * FreezeXid will become the table's new relfrozenxid, and that mustn't go
857 : * backwards, so take the max.
858 : */
859 22 : if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
860 8 : FreezeXid = OldHeap->rd_rel->relfrozenxid;
861 :
862 : /*
863 : * MultiXactCutoff, similarly, shouldn't go backwards either.
864 : */
865 22 : if (MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
866 0 : MultiXactCutoff = OldHeap->rd_rel->relminmxid;
867 :
868 : /* return selected values to caller */
869 22 : *pFreezeXid = FreezeXid;
870 22 : *pCutoffMulti = MultiXactCutoff;
871 :
872 : /* Remember if it's a system catalog */
873 22 : is_system_catalog = IsSystemRelation(OldHeap);
874 :
875 : /* Initialize the rewrite operation */
876 22 : rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
877 : MultiXactCutoff, use_wal);
878 :
879 : /*
880 : * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
881 : * the OldHeap. We know how to use a sort to duplicate the ordering of a
882 : * btree index, and will use seqscan-and-sort for that case if the planner
883 : * tells us it's cheaper. Otherwise, always indexscan if an index is
884 : * provided, else plain seqscan.
885 : */
886 22 : if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
887 10 : use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
888 : else
889 12 : use_sort = false;
890 :
891 : /* Set up sorting if wanted */
892 22 : if (use_sort)
893 5 : tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
894 : maintenance_work_mem, false);
895 : else
896 17 : tuplesort = NULL;
897 :
898 : /*
899 : * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
900 : * that still need to be copied, we scan with SnapshotAny and use
901 : * HeapTupleSatisfiesVacuum for the visibility test.
902 : */
903 22 : if (OldIndex != NULL && !use_sort)
904 : {
905 5 : heapScan = NULL;
906 5 : indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
907 5 : index_rescan(indexScan, NULL, 0, NULL, 0);
908 : }
909 : else
910 : {
911 17 : heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
912 17 : indexScan = NULL;
913 : }
914 :
915 : /* Log what we're doing */
916 22 : if (indexScan != NULL)
917 5 : ereport(elevel,
918 : (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
919 : get_namespace_name(RelationGetNamespace(OldHeap)),
920 : RelationGetRelationName(OldHeap),
921 : RelationGetRelationName(OldIndex))));
922 17 : else if (tuplesort != NULL)
923 5 : ereport(elevel,
924 : (errmsg("clustering \"%s.%s\" using sequential scan and sort",
925 : get_namespace_name(RelationGetNamespace(OldHeap)),
926 : RelationGetRelationName(OldHeap))));
927 : else
928 12 : ereport(elevel,
929 : (errmsg("vacuuming \"%s.%s\"",
930 : get_namespace_name(RelationGetNamespace(OldHeap)),
931 : RelationGetRelationName(OldHeap))));
932 :
933 : /*
934 : * Scan through the OldHeap, either in OldIndex order or sequentially;
935 : * copy each tuple into the NewHeap, or transiently to the tuplesort
936 : * module. Note that we don't bother sorting dead tuples (they won't get
937 : * to the new table anyway).
938 : */
939 : for (;;)
940 : {
941 : HeapTuple tuple;
942 : Buffer buf;
943 : bool isdead;
944 :
945 27156 : CHECK_FOR_INTERRUPTS();
946 :
947 27156 : if (indexScan != NULL)
948 : {
949 21 : tuple = index_getnext(indexScan, ForwardScanDirection);
950 21 : if (tuple == NULL)
951 5 : break;
952 :
953 : /* Since we used no scan keys, should never need to recheck */
954 16 : if (indexScan->xs_recheck)
955 0 : elog(ERROR, "CLUSTER does not support lossy index conditions");
956 :
957 16 : buf = indexScan->xs_cbuf;
958 : }
959 : else
960 : {
961 27135 : tuple = heap_getnext(heapScan, ForwardScanDirection);
962 27135 : if (tuple == NULL)
963 17 : break;
964 :
965 27118 : buf = heapScan->rs_cbuf;
966 : }
967 :
968 27134 : LockBuffer(buf, BUFFER_LOCK_SHARE);
969 :
970 27134 : switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
971 : {
972 : case HEAPTUPLE_DEAD:
973 : /* Definitely dead */
974 4373 : isdead = true;
975 4373 : break;
976 : case HEAPTUPLE_RECENTLY_DEAD:
977 2056 : tups_recently_dead += 1;
978 : /* fall through */
979 : case HEAPTUPLE_LIVE:
980 : /* Live or recently dead, must copy it */
981 22752 : isdead = false;
982 22752 : break;
983 : case HEAPTUPLE_INSERT_IN_PROGRESS:
984 :
985 : /*
986 : * Since we hold exclusive lock on the relation, normally the
987 : * only way to see this is if it was inserted earlier in our
988 : * own transaction. However, it can happen in system
989 : * catalogs, since we tend to release write lock before commit
990 : * there. Give a warning if neither case applies; but in any
991 : * case we had better copy it.
992 : */
993 7 : if (!is_system_catalog &&
994 3 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
995 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
996 : RelationGetRelationName(OldHeap));
997 : /* treat as live */
998 4 : isdead = false;
999 4 : break;
1000 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1001 :
1002 : /*
1003 : * Similar situation to INSERT_IN_PROGRESS case.
1004 : */
1005 10 : if (!is_system_catalog &&
1006 5 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
1007 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
1008 : RelationGetRelationName(OldHeap));
1009 : /* treat as recently dead */
1010 5 : tups_recently_dead += 1;
1011 5 : isdead = false;
1012 5 : break;
1013 : default:
1014 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1015 : isdead = false; /* keep compiler quiet */
1016 : break;
1017 : }
1018 :
1019 27134 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1020 :
1021 27134 : if (isdead)
1022 : {
1023 4373 : tups_vacuumed += 1;
1024 : /* heap rewrite module still needs to see it... */
1025 4373 : if (rewrite_heap_dead_tuple(rwstate, tuple))
1026 : {
1027 : /* A previous recently-dead tuple is now known dead */
1028 0 : tups_vacuumed += 1;
1029 0 : tups_recently_dead -= 1;
1030 : }
1031 4373 : continue;
1032 : }
1033 :
1034 22761 : num_tuples += 1;
1035 22761 : if (tuplesort != NULL)
1036 20042 : tuplesort_putheaptuple(tuplesort, tuple);
1037 : else
1038 2719 : reform_and_rewrite_tuple(tuple,
1039 : oldTupDesc, newTupDesc,
1040 : values, isnull,
1041 2719 : NewHeap->rd_rel->relhasoids, rwstate);
1042 27134 : }
1043 :
1044 22 : if (indexScan != NULL)
1045 5 : index_endscan(indexScan);
1046 22 : if (heapScan != NULL)
1047 17 : heap_endscan(heapScan);
1048 :
1049 : /*
1050 : * In scan-and-sort mode, complete the sort, then read out all live tuples
1051 : * from the tuplestore and write them to the new relation.
1052 : */
1053 22 : if (tuplesort != NULL)
1054 : {
1055 5 : tuplesort_performsort(tuplesort);
1056 :
1057 : for (;;)
1058 : {
1059 : HeapTuple tuple;
1060 :
1061 20047 : CHECK_FOR_INTERRUPTS();
1062 :
1063 20047 : tuple = tuplesort_getheaptuple(tuplesort, true);
1064 20047 : if (tuple == NULL)
1065 5 : break;
1066 :
1067 20042 : reform_and_rewrite_tuple(tuple,
1068 : oldTupDesc, newTupDesc,
1069 : values, isnull,
1070 20042 : NewHeap->rd_rel->relhasoids, rwstate);
1071 20042 : }
1072 :
1073 5 : tuplesort_end(tuplesort);
1074 : }
1075 :
1076 : /* Write out any remaining tuples, and fsync if needed */
1077 22 : end_heap_rewrite(rwstate);
1078 :
1079 : /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
1080 22 : NewHeap->rd_toastoid = InvalidOid;
1081 :
1082 : /* Log what we did */
1083 22 : ereport(elevel,
1084 : (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
1085 : RelationGetRelationName(OldHeap),
1086 : tups_vacuumed, num_tuples,
1087 : RelationGetNumberOfBlocks(OldHeap)),
1088 : errdetail("%.0f dead row versions cannot be removed yet.\n"
1089 : "%s.",
1090 : tups_recently_dead,
1091 : pg_rusage_show(&ru0))));
1092 :
1093 : /* Clean up */
1094 22 : pfree(values);
1095 22 : pfree(isnull);
1096 :
1097 22 : if (OldIndex != NULL)
1098 10 : index_close(OldIndex, NoLock);
1099 22 : heap_close(OldHeap, NoLock);
1100 22 : heap_close(NewHeap, NoLock);
1101 22 : }
1102 :
/*
 * Swap the physical files of two given relations.
 *
 * We swap the physical identity (reltablespace, relfilenode) while keeping the
 * same logical identities of the two relations.  relpersistence is also
 * swapped, which is critical since it determines where buffers live for each
 * relation.
 *
 * We can swap associated TOAST data in either of two ways: recursively swap
 * the physical content of the toast tables (and their indexes), or swap the
 * TOAST links in the given relations' pg_class entries.  The former is needed
 * to manage rewrites of shared catalogs (where we cannot change the pg_class
 * links) while the latter is the only way to handle cases in which a toast
 * table is added or removed altogether.
 *
 * Additionally, the first relation is marked with relfrozenxid set to
 * frozenXid.  It seems a bit ugly to have this here, but the caller would
 * have to do it anyway, so having it here saves a heap_update.  Note: in
 * the swap-toast-links case, we assume we don't need to change the toast
 * table's relfrozenxid: the new version of the toast table should already
 * have relfrozenxid set to RecentXmin, which is good enough.
 *
 * Lastly, if r2 and its toast table and toast index (if any) are mapped,
 * their OIDs are emitted into mapped_tables[].  This is hacky but beats
 * having to look the information up again later in finish_heap_swap.
 *
 * Parameters:
 *	r1, r2: OIDs of the two relations whose pg_class rows are fetched and
 *		swapped.  r1's row additionally receives frozenXid/cutoffMulti.
 *	target_is_pg_class: if true, the modified pg_class tuples are not
 *		written back; only relcache invalidations are queued for them.
 *	swap_toast_by_content: if true, toast tables (and their valid indexes)
 *		are swapped by recursing into this function; if false, the
 *		reltoastrelid links are exchanged and pg_depend is fixed to match.
 *	is_internal: passed through to the post-alter hook invoked for r1.
 *	frozenXid, cutoffMulti: new relfrozenxid and relminmxid for r1; not
 *		applied (and not asserted valid) when r1 is an index.
 *	mapped_tables: output array.  In the mapped-relation case r2 is stored
 *		into the next slot.  No bounds checking is done here, so the
 *		caller must supply an array with enough room.
 *
 * All failures are reported with elog(ERROR); nothing is returned.
 */
static void
swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
					bool swap_toast_by_content,
					bool is_internal,
					TransactionId frozenXid,
					MultiXactId cutoffMulti,
					Oid *mapped_tables)
{
	Relation	relRelation;
	HeapTuple	reltup1,
				reltup2;
	Form_pg_class relform1,
				relform2;
	Oid			relfilenode1,
				relfilenode2;
	Oid			swaptemp;
	char		swptmpchr;

	/* We need writable copies of both pg_class tuples. */
	relRelation = heap_open(RelationRelationId, RowExclusiveLock);

	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
	if (!HeapTupleIsValid(reltup1))
		elog(ERROR, "cache lookup failed for relation %u", r1);
	relform1 = (Form_pg_class) GETSTRUCT(reltup1);

	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
	if (!HeapTupleIsValid(reltup2))
		elog(ERROR, "cache lookup failed for relation %u", r2);
	relform2 = (Form_pg_class) GETSTRUCT(reltup2);

	/*
	 * Remember the relfilenodes as stored in pg_class.  An invalid (zero)
	 * value is what distinguishes the mapped-relation case handled in the
	 * else-branch below.
	 */
	relfilenode1 = relform1->relfilenode;
	relfilenode2 = relform2->relfilenode;

	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
	{
		/*
		 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
		 * relpersistence
		 */
		Assert(!target_is_pg_class);

		swaptemp = relform1->relfilenode;
		relform1->relfilenode = relform2->relfilenode;
		relform2->relfilenode = swaptemp;

		swaptemp = relform1->reltablespace;
		relform1->reltablespace = relform2->reltablespace;
		relform2->reltablespace = swaptemp;

		swptmpchr = relform1->relpersistence;
		relform1->relpersistence = relform2->relpersistence;
		relform2->relpersistence = swptmpchr;

		/* Also swap toast links, if we're swapping by links */
		if (!swap_toast_by_content)
		{
			swaptemp = relform1->reltoastrelid;
			relform1->reltoastrelid = relform2->reltoastrelid;
			relform2->reltoastrelid = swaptemp;
		}
	}
	else
	{
		/*
		 * Mapped-relation case.  Here we have to swap the relation mappings
		 * instead of modifying the pg_class columns.  Both must be mapped.
		 */
		if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
			elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
				 NameStr(relform1->relname));

		/*
		 * We can't change the tablespace nor persistence of a mapped rel, and
		 * we can't handle toast link swapping for one either, because we must
		 * not apply any critical changes to its pg_class row.  These cases
		 * should be prevented by upstream permissions tests, so these checks
		 * are non-user-facing emergency backstop.
		 */
		if (relform1->reltablespace != relform2->reltablespace)
			elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (relform1->relpersistence != relform2->relpersistence)
			elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (!swap_toast_by_content &&
			(relform1->reltoastrelid || relform2->reltoastrelid))
			elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
				 NameStr(relform1->relname));

		/*
		 * Fetch the mappings --- shouldn't fail, but be paranoid
		 */
		relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
		if (!OidIsValid(relfilenode1))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform1->relname), r1);
		relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
		if (!OidIsValid(relfilenode2))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform2->relname), r2);

		/*
		 * Send replacement mappings to relmapper.  Note these won't actually
		 * take effect until CommandCounterIncrement.
		 */
		RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
		RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);

		/*
		 * Pass OIDs of mapped r2 tables back to caller.  The post-increment
		 * advances our local copy of the pointer, so the recursive calls
		 * below (which are handed the advanced pointer) store their entries
		 * after this one.
		 */
		*mapped_tables++ = r2;
	}

	/*
	 * In the case of a shared catalog, these next few steps will only affect
	 * our own database's pg_class row; but that's okay, because they are all
	 * noncritical updates.  That's also an important fact for the case of a
	 * mapped catalog, because it's possible that we'll commit the map change
	 * and then fail to commit the pg_class update.
	 */

	/*
	 * set rel1's frozen Xid and minimum MultiXid
	 *
	 * Indexes are skipped.  Note that the toast-index recursion at the
	 * bottom of this function passes InvalidTransactionId and
	 * InvalidMultiXactId, which would not satisfy the assertions here
	 * (presumably those relations always have relkind RELKIND_INDEX).
	 */
	if (relform1->relkind != RELKIND_INDEX)
	{
		Assert(TransactionIdIsNormal(frozenXid));
		relform1->relfrozenxid = frozenXid;
		Assert(MultiXactIdIsValid(cutoffMulti));
		relform1->relminmxid = cutoffMulti;
	}

	/* swap size statistics too, since new rel has freshly-updated stats */
	{
		int32		swap_pages;
		float4		swap_tuples;
		int32		swap_allvisible;

		swap_pages = relform1->relpages;
		relform1->relpages = relform2->relpages;
		relform2->relpages = swap_pages;

		swap_tuples = relform1->reltuples;
		relform1->reltuples = relform2->reltuples;
		relform2->reltuples = swap_tuples;

		swap_allvisible = relform1->relallvisible;
		relform1->relallvisible = relform2->relallvisible;
		relform2->relallvisible = swap_allvisible;
	}

	/*
	 * Update the tuples in pg_class --- unless the target relation of the
	 * swap is pg_class itself.  In that case, there is zero point in making
	 * changes because we'd be updating the old data that we're about to throw
	 * away.  Because the real work being done here for a mapped relation is
	 * just to change the relation map settings, it's all right to not update
	 * the pg_class rows in this case.  The most important changes will
	 * instead be performed later, in finish_heap_swap() itself.
	 */
	if (!target_is_pg_class)
	{
		CatalogIndexState indstate;

		/* Open pg_class's indexes once and reuse them for both updates */
		indstate = CatalogOpenIndexes(relRelation);
		CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
								   indstate);
		CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
								   indstate);
		CatalogCloseIndexes(indstate);
	}
	else
	{
		/* no update ... but we do still need relcache inval */
		CacheInvalidateRelcacheByTuple(reltup1);
		CacheInvalidateRelcacheByTuple(reltup2);
	}

	/*
	 * Post alter hook for modified relations.  The change to r2 is always
	 * internal, but r1 depends on the invocation context.
	 */
	InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
								 InvalidOid, is_internal);
	InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
								 InvalidOid, true);

	/*
	 * If we have toast tables associated with the relations being swapped,
	 * deal with them too.
	 *
	 * Note that relform1/relform2 are the locally modified copies, so in
	 * the swap-by-links case the reltoastrelid values tested and used below
	 * are the already-exchanged ones.
	 */
	if (relform1->reltoastrelid || relform2->reltoastrelid)
	{
		if (swap_toast_by_content)
		{
			if (relform1->reltoastrelid && relform2->reltoastrelid)
			{
				/*
				 * Recursively swap the contents of the toast tables.  All
				 * flags and cutoffs are passed down unchanged.
				 */
				swap_relation_files(relform1->reltoastrelid,
									relform2->reltoastrelid,
									target_is_pg_class,
									swap_toast_by_content,
									is_internal,
									frozenXid,
									cutoffMulti,
									mapped_tables);
			}
			else
			{
				/* caller messed up */
				elog(ERROR, "cannot swap toast files by content when there's only one");
			}
		}
		else
		{
			/*
			 * We swapped the ownership links, so we need to change dependency
			 * data to match.
			 *
			 * NOTE: it is possible that only one table has a toast table.
			 *
			 * NOTE: at present, a TOAST table's only dependency is the one on
			 * its owning table.  If more are ever created, we'd need to use
			 * something more selective than deleteDependencyRecordsFor() to
			 * get rid of just the link we want.
			 */
			ObjectAddress baseobject,
						toastobject;
			long		count;

			/*
			 * We disallow this case for system catalogs, to avoid the
			 * possibility that the catalog we're rebuilding is one of the
			 * ones the dependency changes would change.  It's too late to be
			 * making any data changes to the target catalog.
			 */
			if (IsSystemClass(r1, relform1))
				elog(ERROR, "cannot swap toast files by links for system catalogs");

			/* Delete old dependencies */
			if (relform1->reltoastrelid)
			{
				count = deleteDependencyRecordsFor(RelationRelationId,
												   relform1->reltoastrelid,
												   false);
				if (count != 1)
					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
						 count);
			}
			if (relform2->reltoastrelid)
			{
				count = deleteDependencyRecordsFor(RelationRelationId,
												   relform2->reltoastrelid,
												   false);
				if (count != 1)
					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
						 count);
			}

			/* Register new dependencies */
			baseobject.classId = RelationRelationId;
			baseobject.objectSubId = 0;
			toastobject.classId = RelationRelationId;
			toastobject.objectSubId = 0;

			if (relform1->reltoastrelid)
			{
				/* toast table now linked from r1 depends on r1 */
				baseobject.objectId = r1;
				toastobject.objectId = relform1->reltoastrelid;
				recordDependencyOn(&toastobject, &baseobject,
								   DEPENDENCY_INTERNAL);
			}

			if (relform2->reltoastrelid)
			{
				/* toast table now linked from r2 depends on r2 */
				baseobject.objectId = r2;
				toastobject.objectId = relform2->reltoastrelid;
				recordDependencyOn(&toastobject, &baseobject,
								   DEPENDENCY_INTERNAL);
			}
		}
	}

	/*
	 * If we're swapping two toast tables by content, do the same for their
	 * valid index.  The swap can actually be safely done only if the relations
	 * have indexes.
	 *
	 * This branch is taken when r1 and r2 are themselves toast tables, i.e.
	 * in the recursive call made above for the swap-by-content case.
	 */
	if (swap_toast_by_content &&
		relform1->relkind == RELKIND_TOASTVALUE &&
		relform2->relkind == RELKIND_TOASTVALUE)
	{
		Oid			toastIndex1,
					toastIndex2;

		/* Get valid index for each relation */
		toastIndex1 = toast_get_valid_index(r1,
											AccessExclusiveLock);
		toastIndex2 = toast_get_valid_index(r2,
											AccessExclusiveLock);

		/* No freeze cutoffs are supplied for the index swap */
		swap_relation_files(toastIndex1,
							toastIndex2,
							target_is_pg_class,
							swap_toast_by_content,
							is_internal,
							InvalidTransactionId,
							InvalidMultiXactId,
							mapped_tables);
	}

	/* Clean up. */
	heap_freetuple(reltup1);
	heap_freetuple(reltup2);

	heap_close(relRelation, RowExclusiveLock);

	/*
	 * Close both relcache entries' smgr links.  We need this kluge because
	 * both links will be invalidated during upcoming CommandCounterIncrement.
	 * Whichever of the rels is the second to be cleared will have a dangling
	 * reference to the other's smgr entry.  Rather than trying to avoid this
	 * by ordering operations just so, it's easiest to close the links first.
	 * (Fortunately, since one of the entries is local in our transaction,
	 * it's sufficient to clear out our own relcache this way; the problem
	 * cannot arise for other backends when they see our update on the
	 * non-transient relation.)
	 *
	 * Caution: the placement of this step interacts with the decision to
	 * handle toast rels by recursion.  When we are trying to rebuild pg_class
	 * itself, the smgr close on pg_class must happen after all accesses in
	 * this function.
	 */
	RelationCloseSmgrByOid(r1);
	RelationCloseSmgrByOid(r2);
}
1463 :
1464 : /*
1465 : * Remove the transient table that was built by make_new_heap, and finish
1466 : * cleaning up (including rebuilding all indexes on the old heap).
1467 : */
1468 : void
1469 105 : finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1470 : bool is_system_catalog,
1471 : bool swap_toast_by_content,
1472 : bool check_constraints,
1473 : bool is_internal,
1474 : TransactionId frozenXid,
1475 : MultiXactId cutoffMulti,
1476 : char newrelpersistence)
1477 : {
1478 : ObjectAddress object;
1479 : Oid mapped_tables[4];
1480 : int reindex_flags;
1481 : int i;
1482 :
1483 : /* Zero out possible results from swapped_relation_files */
1484 105 : memset(mapped_tables, 0, sizeof(mapped_tables));
1485 :
1486 : /*
1487 : * Swap the contents of the heap relations (including any toast tables).
1488 : * Also set old heap's relfrozenxid to frozenXid.
1489 : */
1490 105 : swap_relation_files(OIDOldHeap, OIDNewHeap,
1491 : (OIDOldHeap == RelationRelationId),
1492 : swap_toast_by_content, is_internal,
1493 : frozenXid, cutoffMulti, mapped_tables);
1494 :
1495 : /*
1496 : * If it's a system catalog, queue an sinval message to flush all
1497 : * catcaches on the catalog when we reach CommandCounterIncrement.
1498 : */
1499 105 : if (is_system_catalog)
1500 3 : CacheInvalidateCatalog(OIDOldHeap);
1501 :
1502 : /*
1503 : * Rebuild each index on the relation (but not the toast table, which is
1504 : * all-new at this point). It is important to do this before the DROP
1505 : * step because if we are processing a system catalog that will be used
1506 : * during DROP, we want to have its indexes available. There is no
1507 : * advantage to the other order anyway because this is all transactional,
1508 : * so no chance to reclaim disk space before commit. We do not need a
1509 : * final CommandCounterIncrement() because reindex_relation does it.
1510 : *
1511 : * Note: because index_build is called via reindex_relation, it will never
1512 : * set indcheckxmin true for the indexes. This is OK even though in some
1513 : * sense we are building new indexes rather than rebuilding existing ones,
1514 : * because the new heap won't contain any HOT chains at all, let alone
1515 : * broken ones, so it can't be necessary to set indcheckxmin.
1516 : */
1517 105 : reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1518 105 : if (check_constraints)
1519 83 : reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1520 :
1521 : /*
1522 : * Ensure that the indexes have the same persistence as the parent
1523 : * relation.
1524 : */
1525 105 : if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1526 3 : reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1527 102 : else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1528 96 : reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1529 :
1530 105 : reindex_relation(OIDOldHeap, reindex_flags, 0);
1531 :
1532 : /*
1533 : * If the relation being rebuild is pg_class, swap_relation_files()
1534 : * couldn't update pg_class's own pg_class entry (check comments in
1535 : * swap_relation_files()), thus relfrozenxid was not updated. That's
1536 : * annoying because a potential reason for doing a VACUUM FULL is a
1537 : * imminent or actual anti-wraparound shutdown. So, now that we can
1538 : * access the new relation using it's indices, update relfrozenxid.
1539 : * pg_class doesn't have a toast relation, so we don't need to update the
1540 : * corresponding toast relation. Not that there's little point moving all
1541 : * relfrozenxid updates here since swap_relation_files() needs to write to
1542 : * pg_class for non-mapped relations anyway.
1543 : */
1544 102 : if (OIDOldHeap == RelationRelationId)
1545 : {
1546 : Relation relRelation;
1547 : HeapTuple reltup;
1548 : Form_pg_class relform;
1549 :
1550 1 : relRelation = heap_open(RelationRelationId, RowExclusiveLock);
1551 :
1552 1 : reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1553 1 : if (!HeapTupleIsValid(reltup))
1554 0 : elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1555 1 : relform = (Form_pg_class) GETSTRUCT(reltup);
1556 :
1557 1 : relform->relfrozenxid = frozenXid;
1558 1 : relform->relminmxid = cutoffMulti;
1559 :
1560 1 : CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1561 :
1562 1 : heap_close(relRelation, RowExclusiveLock);
1563 : }
1564 :
1565 : /* Destroy new heap with old filenode */
1566 102 : object.classId = RelationRelationId;
1567 102 : object.objectId = OIDNewHeap;
1568 102 : object.objectSubId = 0;
1569 :
1570 : /*
1571 : * The new relation is local to our transaction and we know nothing
1572 : * depends on it, so DROP_RESTRICT should be OK.
1573 : */
1574 102 : performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1575 :
1576 : /* performDeletion does CommandCounterIncrement at end */
1577 :
1578 : /*
1579 : * Now we must remove any relation mapping entries that we set up for the
1580 : * transient table, as well as its toast table and toast index if any. If
1581 : * we fail to do this before commit, the relmapper will complain about new
1582 : * permanent map entries being added post-bootstrap.
1583 : */
1584 104 : for (i = 0; OidIsValid(mapped_tables[i]); i++)
1585 2 : RelationMapRemoveMapping(mapped_tables[i]);
1586 :
1587 : /*
1588 : * At this point, everything is kosher except that, if we did toast swap
1589 : * by links, the toast table's name corresponds to the transient table.
1590 : * The name is irrelevant to the backend because it's referenced by OID,
1591 : * but users looking at the catalogs could be confused. Rename it to
1592 : * prevent this problem.
1593 : *
1594 : * Note no lock required on the relation, because we already hold an
1595 : * exclusive lock on it.
1596 : */
1597 102 : if (!swap_toast_by_content)
1598 : {
1599 : Relation newrel;
1600 :
1601 98 : newrel = heap_open(OIDOldHeap, NoLock);
1602 98 : if (OidIsValid(newrel->rd_rel->reltoastrelid))
1603 : {
1604 : Oid toastidx;
1605 : char NewToastName[NAMEDATALEN];
1606 :
1607 : /* Get the associated valid index to be renamed */
1608 42 : toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1609 : AccessShareLock);
1610 :
1611 : /* rename the toast table ... */
1612 42 : snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1613 : OIDOldHeap);
1614 42 : RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1615 : NewToastName, true);
1616 :
1617 : /* ... and its valid index too. */
1618 42 : snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1619 : OIDOldHeap);
1620 :
1621 42 : RenameRelationInternal(toastidx,
1622 : NewToastName, true);
1623 : }
1624 98 : relation_close(newrel, NoLock);
1625 : }
1626 102 : }
1627 :
1628 :
1629 : /*
1630 : * Get a list of tables that the current user owns and
1631 : * have indisclustered set. Return the list in a List * of rvsToCluster
1632 : * with the tableOid and the indexOid on which the table is already
1633 : * clustered.
1634 : */
1635 : static List *
1636 1 : get_tables_to_cluster(MemoryContext cluster_context)
1637 : {
1638 : Relation indRelation;
1639 : HeapScanDesc scan;
1640 : ScanKeyData entry;
1641 : HeapTuple indexTuple;
1642 : Form_pg_index index;
1643 : MemoryContext old_context;
1644 : RelToCluster *rvtc;
1645 1 : List *rvs = NIL;
1646 :
1647 : /*
1648 : * Get all indexes that have indisclustered set and are owned by
1649 : * appropriate user. System relations or nailed-in relations cannot ever
1650 : * have indisclustered set, because CLUSTER will refuse to set it when
1651 : * called with one of them as argument.
1652 : */
1653 1 : indRelation = heap_open(IndexRelationId, AccessShareLock);
1654 1 : ScanKeyInit(&entry,
1655 : Anum_pg_index_indisclustered,
1656 : BTEqualStrategyNumber, F_BOOLEQ,
1657 : BoolGetDatum(true));
1658 1 : scan = heap_beginscan_catalog(indRelation, 1, &entry);
1659 4 : while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1660 : {
1661 2 : index = (Form_pg_index) GETSTRUCT(indexTuple);
1662 :
1663 2 : if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1664 1 : continue;
1665 :
1666 : /*
1667 : * We have to build the list in a different memory context so it will
1668 : * survive the cross-transaction processing
1669 : */
1670 1 : old_context = MemoryContextSwitchTo(cluster_context);
1671 :
1672 1 : rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1673 1 : rvtc->tableOid = index->indrelid;
1674 1 : rvtc->indexOid = index->indexrelid;
1675 1 : rvs = lcons(rvtc, rvs);
1676 :
1677 1 : MemoryContextSwitchTo(old_context);
1678 : }
1679 1 : heap_endscan(scan);
1680 :
1681 1 : relation_close(indRelation, AccessShareLock);
1682 :
1683 1 : return rvs;
1684 : }
1685 :
1686 :
1687 : /*
1688 : * Reconstruct and rewrite the given tuple
1689 : *
1690 : * We cannot simply copy the tuple as-is, for several reasons:
1691 : *
1692 : * 1. We'd like to squeeze out the values of any dropped columns, both
1693 : * to save space and to ensure we have no corner-case failures. (It's
1694 : * possible for example that the new table hasn't got a TOAST table
1695 : * and so is unable to store any large values of dropped cols.)
1696 : *
1697 : * 2. The tuple might not even be legal for the new table; this is
1698 : * currently only known to happen as an after-effect of ALTER TABLE
1699 : * SET WITHOUT OIDS.
1700 : *
1701 : * So, we must reconstruct the tuple from component Datums.
1702 : */
1703 : static void
1704 22761 : reform_and_rewrite_tuple(HeapTuple tuple,
1705 : TupleDesc oldTupDesc, TupleDesc newTupDesc,
1706 : Datum *values, bool *isnull,
1707 : bool newRelHasOids, RewriteState rwstate)
1708 : {
1709 : HeapTuple copiedTuple;
1710 : int i;
1711 :
1712 22761 : heap_deform_tuple(tuple, oldTupDesc, values, isnull);
1713 :
1714 : /* Be sure to null out any dropped columns */
1715 366004 : for (i = 0; i < newTupDesc->natts; i++)
1716 : {
1717 343243 : if (TupleDescAttr(newTupDesc, i)->attisdropped)
1718 0 : isnull[i] = true;
1719 : }
1720 :
1721 22761 : copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
1722 :
1723 : /* Preserve OID, if any */
1724 22761 : if (newRelHasOids)
1725 645 : HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
1726 :
1727 : /* The heap rewrite module does the rest */
1728 22761 : rewrite_heap_tuple(rwstate, tuple, copiedTuple);
1729 :
1730 22761 : heap_freetuple(copiedTuple);
1731 22761 : }
|