Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dsa.c
4 : * Dynamic shared memory areas.
5 : *
6 : * This module provides dynamic shared memory areas which are built on top of
7 : * DSM segments. While dsm.c allows segments of shared memory to be
8 : * created and shared between backends, it isn't designed to deal with small
9 : * objects. A DSA area is a shared memory heap usually backed by one or more
10 : * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11 : * Alternatively, it can be created in pre-existing shared memory, including a
12 : * DSM segment, and then create extra DSM segments as required. Unlike the
13 : * regular system heap, it deals in pseudo-pointers which must be converted to
14 : * backend-local pointers before they are dereferenced. These pseudo-pointers
15 : * can however be shared with other backends, and can be used to construct
16 : * shared data structures.
17 : *
18 : * Each DSA area manages a set of DSM segments, adding new segments as
19 : * required and detaching them when they are no longer needed. Each segment
20 : * contains a number of 4KB pages, a free page manager for tracking
21 : * consecutive runs of free pages, and a page map for tracking the source of
22 : * objects allocated on each page. Allocation requests above 8KB are handled
23 : * by choosing a segment and finding consecutive free pages in its free page
24 : * manager. Allocation requests for smaller sizes are handled using pools of
25 : * objects of a selection of sizes. Each pool consists of a number of 16 page
26 : * (64KB) superblocks allocated in the same way as large objects. Allocation
27 : * of large objects and new superblocks is serialized by a single LWLock, but
28 : * allocation of small objects from pre-existing superblocks uses one LWLock
29 : * per pool. Currently there is one pool, and therefore one lock, per size
30 : * class. Per-core pools to increase concurrency and strategies for reducing
31 : * the resulting fragmentation are areas for future research. Each superblock
32 : * is managed with a 'span', which tracks the superblock's freelist. Free
33 : * requests are handled by looking in the page map to find which span an
34 : * address was allocated from, so that small objects can be returned to the
35 : * appropriate free list, and large object pages can be returned directly to
36 : * the free page map. When allocating, simple heuristics for selecting
37 : * segments and superblocks try to encourage occupied memory to be
38 : * concentrated, increasing the likelihood that whole superblocks can become
39 : * empty and be returned to the free page manager, and whole segments can
40 : * become empty and be returned to the operating system.
41 : *
42 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
43 : * Portions Copyright (c) 1994, Regents of the University of California
44 : *
45 : * IDENTIFICATION
46 : * src/backend/utils/mmgr/dsa.c
47 : *
48 : *-------------------------------------------------------------------------
49 : */
50 :
51 : #include "postgres.h"
52 :
53 : #include "port/atomics.h"
54 : #include "storage/dsm.h"
55 : #include "storage/ipc.h"
56 : #include "storage/lwlock.h"
57 : #include "storage/shmem.h"
58 : #include "utils/dsa.h"
59 : #include "utils/freepage.h"
60 : #include "utils/memutils.h"
61 :
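/*
 * A minimal usage sketch (illustrative only, compiled out under a
 * hypothetical DSA_EXAMPLES guard; the function name and tranche ID are
 * assumptions): create an area, allocate an object, convert the
 * pseudo-pointer to a backend-local pointer, and free it.
 */
#ifdef DSA_EXAMPLES
static void
dsa_usage_sketch(int my_tranche_id)
{
	dsa_area   *area;
	dsa_pointer dp;
	char	   *mem;

	area = dsa_create(my_tranche_id);

	/* Allocate 100 bytes; dp can be shared with other backends. */
	dp = dsa_allocate(area, 100);

	/* Convert to a backend-local address before dereferencing. */
	mem = (char *) dsa_get_address(area, dp);
	memset(mem, 0, 100);

	dsa_free(area, dp);
	dsa_detach(area);
}
#endif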
62 : /*
63 : * The size of the initial DSM segment that backs a dsa_area created by
64 : * dsa_create. After creating some number of segments of this size we'll
65 : * double this size, and so on. Larger segments may be created if necessary
66 : * to satisfy large requests.
67 : */
68 : #define DSA_INITIAL_SEGMENT_SIZE ((Size) (1 * 1024 * 1024))
69 :
70 : /*
71 : * How many segments to create before we double the segment size. If this is
72 : * low, then there is likely to be a lot of wasted space in the largest
73 : * segment. If it is high, then we risk running out of segment slots (see
74 : * dsm.c's limits on total number of segments), or limiting the total size
75 : * an area can manage when using small pointers.
76 : */
77 : #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4
78 :
79 : /*
80 : * The number of bits used to represent the offset part of a dsa_pointer.
81 : * This controls the maximum size of a segment, the maximum possible
82 : * allocation size and also the maximum number of segments per area.
83 : */
84 : #if SIZEOF_DSA_POINTER == 4
85 : #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
86 : #else
87 : #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
88 : #endif
89 :
90 : /*
91 : * The maximum number of DSM segments that an area can own, determined by
92 : * the number of bits remaining (but capped at 1024).
93 : */
94 : #define DSA_MAX_SEGMENTS \
95 : Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
96 :
97 : /* The bitmask for extracting the offset from a dsa_pointer. */
98 : #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
99 :
100 : /* The maximum size of a DSM segment. */
101 : #define DSA_MAX_SEGMENT_SIZE ((Size) 1 << DSA_OFFSET_WIDTH)
102 :
103 : /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
104 : #define DSA_PAGES_PER_SUPERBLOCK 16
105 :
106 : /*
107 : * A magic number used as a sanity check for following DSM segments belonging
108 : * to a DSA area (this number will be XORed with the area handle and
109 : * the segment index).
110 : */
111 : #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
112 :
113 : /* Build a dsa_pointer given a segment number and offset. */
114 : #define DSA_MAKE_POINTER(segment_number, offset) \
115 : (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
116 :
117 : /* Extract the segment number from a dsa_pointer. */
118 : #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
119 :
120 : /* Extract the offset from a dsa_pointer. */
121 : #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
122 :
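/*
 * Round-trip sketch for the macros above (illustrative only, compiled out
 * under a hypothetical DSA_EXAMPLES guard).  On 64-bit builds
 * DSA_OFFSET_WIDTH is 40, so segment 3 with offset 0x2000 encodes as
 * (3 << 40) | 0x2000.
 */
#ifdef DSA_EXAMPLES
static void
dsa_pointer_layout_sketch(void)
{
	dsa_pointer dp = DSA_MAKE_POINTER(3, 0x2000);

	Assert(DSA_EXTRACT_SEGMENT_NUMBER(dp) == 3);
	Assert(DSA_EXTRACT_OFFSET(dp) == 0x2000);
}
#endif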
123 : /* The type used for segment indexes (zero based). */
124 : typedef Size dsa_segment_index;
125 :
126 : /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
127 : #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
128 :
129 : /*
130 : * How many bins of segments do we have? The bins are used to categorize
131 : * segments by their largest contiguous run of free pages.
132 : */
133 : #define DSA_NUM_SEGMENT_BINS 16
134 :
135 : /*
136 : * What is the lowest bin that holds segments that *might* have n contiguous
137 : * free pages? There is no point in looking in segments in lower bins; they
138 : * definitely can't service a request for n free pages.
139 : */
140 : #define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1)
141 :
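/*
 * Worked example (illustrative): fls() returns the 1-based position of the
 * highest set bit, so bin i holds segments whose largest free run is in
 * [2^(i-1), 2^i - 1] pages.  Thus contiguous_pages_to_segment_bin(1) == 1,
 * (3) == 2, (16) == 5, and any run of 2^14 pages or more lands in the top
 * bin, 15.
 */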
142 : /* Macros for access to locks. */
143 : #define DSA_AREA_LOCK(area) (&area->control->lock)
144 : #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
145 :
146 : /*
147 : * The header for an individual segment. This lives at the start of each DSM
148 : * segment owned by a DSA area including the first segment (where it appears
149 : * as part of the dsa_area_control struct).
150 : */
151 : typedef struct
152 : {
153 : /* Sanity check magic value. */
154 : uint32 magic;
155 : /* Total number of pages in this segment (excluding metadata area). */
156 : Size usable_pages;
157 : /* Total size of this segment in bytes. */
158 : Size size;
159 :
160 : /*
161 : * Index of the segment that precedes this one in the same segment bin, or
162 : * DSA_SEGMENT_INDEX_NONE if this is the first one.
163 : */
164 : dsa_segment_index prev;
165 :
166 : /*
167 : * Index of the segment that follows this one in the same segment bin, or
168 : * DSA_SEGMENT_INDEX_NONE if this is the last one.
169 : */
170 : dsa_segment_index next;
171 : /* The index of the bin that contains this segment. */
172 : Size bin;
173 :
174 : /*
175 : * A flag raised to indicate that this segment is being returned to the
176 : * operating system and has been unpinned.
177 : */
178 : bool freed;
179 : } dsa_segment_header;
180 :
181 : /*
182 : * Metadata for one superblock.
183 : *
184 : * For most blocks, span objects are stored out-of-line; that is, the span
185 : * object is not stored within the block itself. But, as an exception, for a
186 : * "span of spans", the span object is stored "inline". The allocation is
187 : * always exactly one page, and the dsa_area_span object is located at
188 : * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
189 : * and the remaining fields are used just as they would be in an ordinary
190 : * block. We can't allocate spans out of ordinary superblocks because
191 : * creating an ordinary superblock requires us to be able to allocate a span
192 : * *first*. Doing it this way avoids that circularity.
193 : */
194 : typedef struct
195 : {
196 : dsa_pointer pool; /* Containing pool. */
197 : dsa_pointer prevspan; /* Previous span. */
198 : dsa_pointer nextspan; /* Next span. */
199 : dsa_pointer start; /* Starting address. */
200 : Size npages; /* Length of span in pages. */
201 : uint16 size_class; /* Size class. */
202 : uint16 ninitialized; /* Maximum number of objects ever allocated. */
203 : uint16 nallocatable; /* Number of objects currently allocatable. */
204 : uint16 firstfree; /* First object on free list. */
205 : uint16 nmax; /* Maximum number of objects ever possible. */
206 : uint16 fclass; /* Current fullness class. */
207 : } dsa_area_span;
208 :
209 : /*
210 : * Given a pointer to an object in a span, access the index of the next free
211 : * object in the same span (ie in the span's freelist) as an L-value.
212 : */
213 : #define NextFreeObjectIndex(object) (* (uint16 *) (object))
214 :
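/*
 * Freelist sketch (illustrative only, compiled out under a hypothetical
 * DSA_EXAMPLES guard): each free object's first two bytes store the index
 * of the next free object, so a span needs no separate freelist storage.
 * Here object 2 of a toy block of 16-byte objects is pushed in front of
 * object 0.
 */
#ifdef DSA_EXAMPLES
static void
span_freelist_sketch(void)
{
	char		block[64];		/* room for four 16-byte objects */
	uint16		firstfree = 0;	/* freelist head: object 0 */
	char	   *object = block + 2 * 16;	/* object 2 is being freed */

	NextFreeObjectIndex(block) = (uint16) -1;	/* object 0 ends the list */
	NextFreeObjectIndex(object) = firstfree;	/* object 2 -> object 0 */
	firstfree = 2;

	Assert(NextFreeObjectIndex(block + firstfree * 16) == 0);
}
#endif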
215 : /*
216 : * Small allocations are handled by dividing a single block of memory into
217 : * many small objects of equal size. The possible allocation sizes are
218 : * defined by the following array. Larger size classes are spaced more widely
219 : * than smaller size classes. We fudge the spacing for size classes >1kB to
220 : * avoid space wastage: based on the knowledge that we plan to allocate 64kB
221 : * blocks, we bump the maximum object size up to the largest multiple of
222 : * 8 bytes that still lets us fit the same number of objects into one block.
223 : *
224 : * NB: Because of this fudging, if we were ever to use differently-sized blocks
225 : * for small allocations, these size classes would need to be reworked to be
226 : * optimal for the new size.
227 : *
228 : * NB: The optimal spacing for size classes, as well as the size of the blocks
229 : * out of which small objects are allocated, is not a question that has one
230 : * right answer. Some allocators (such as tcmalloc) use more closely-spaced
231 : * size classes than we do here, while others (like aset.c) use more
232 : * widely-spaced classes. Spacing the classes more closely avoids wasting
233 : * memory within individual chunks, but also means a larger number of
234 : * potentially-unfilled blocks.
235 : */
236 : static const uint16 dsa_size_classes[] = {
237 : sizeof(dsa_area_span), 0, /* special size classes */
238 : 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
239 : 80, 96, 112, 128, /* 4 classes separated by 16 bytes */
240 : 160, 192, 224, 256, /* 4 classes separated by 32 bytes */
241 : 320, 384, 448, 512, /* 4 classes separated by 64 bytes */
242 : 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
243 : 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
244 : 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
245 : 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
246 : };
247 : #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
248 :
249 : /* Special size classes. */
250 : #define DSA_SCLASS_BLOCK_OF_SPANS 0
251 : #define DSA_SCLASS_SPAN_LARGE 1
252 :
253 : /*
254 : * The following lookup table is used to map the size of small objects
255 : * (less than 1kB) onto the corresponding size class. To use this table,
256 : * round the size of the object up to the next multiple of 8 bytes, and then
257 : * index into this array.
258 : */
259 : static char dsa_size_class_map[] = {
260 : 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
261 : 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
262 : 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
263 : 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
264 : 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
265 : 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
266 : 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
267 : 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
268 : };
269 : #define DSA_SIZE_CLASS_MAP_QUANTUM 8
270 :
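/*
 * Worked example (illustrative): a request for 50 bytes rounds up to the
 * next multiple of 8, giving map index ((50 + 7) / 8) - 1 = 6;
 * dsa_size_class_map[6] is 8, and dsa_size_classes[8] is 56, so the request
 * is served from the 56-byte pool.
 */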
271 : /*
272 : * Superblocks are binned by how full they are. Generally, each fullness
273 : * class corresponds to one quartile, but the block being used for
274 : * allocations is always at the head of the list for fullness class 1,
275 : * regardless of how full it really is.
276 : */
277 : #define DSA_FULLNESS_CLASSES 4
278 :
279 : /*
280 : * A dsa_area_pool represents a set of objects of a given size class.
281 : *
282 : * Perhaps there should be multiple pools for the same size class for
283 : * contention avoidance, but for now there is just one!
284 : */
285 : typedef struct
286 : {
287 : /* A lock protecting access to this pool. */
288 : LWLock lock;
289 : /* A set of linked lists of spans, arranged by fullness. */
290 : dsa_pointer spans[DSA_FULLNESS_CLASSES];
291 : /* Should we pad this out to a cacheline boundary? */
292 : } dsa_area_pool;
293 :
294 : /*
295 : * The control block for an area. This lives in shared memory, at the start of
296 : * the first DSM segment controlled by this area.
297 : */
298 : typedef struct
299 : {
300 : /* The segment header for the first segment. */
301 : dsa_segment_header segment_header;
302 : /* The handle for this area. */
303 : dsa_handle handle;
304 : /* The handles of the segments owned by this area. */
305 : dsm_handle segment_handles[DSA_MAX_SEGMENTS];
306 : /* Lists of segments, binned by maximum contiguous run of free pages. */
307 : dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
308 : /* The object pools for each size class. */
309 : dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
310 : /* The total size of all active segments. */
311 : Size total_segment_size;
312 : /* The maximum total size of backing storage we are allowed. */
313 : Size max_total_segment_size;
314 : /* Highest used segment index in the history of this area. */
315 : dsa_segment_index high_segment_index;
316 : /* The reference count for this area. */
317 : int refcnt;
318 : /* A flag indicating that this area has been pinned. */
319 : bool pinned;
320 : /* The number of times that segments have been freed. */
321 : Size freed_segment_counter;
322 : /* The LWLock tranche ID. */
323 : int lwlock_tranche_id;
324 : /* The general lock (protects everything except object pools). */
325 : LWLock lock;
326 : } dsa_area_control;
327 :
328 : /* Given a pointer to a pool, find a dsa_pointer. */
329 : #define DsaAreaPoolToDsaPointer(area, p) \
330 : DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
331 :
332 : /*
333 : * A dsa_segment_map is stored within the backend-private memory of each
334 : * individual backend. It holds the base address of the segment within that
335 : * backend, plus the addresses of key objects within the segment. Those
336 : * could instead be derived from the base address but it's handy to have them
337 : * around.
338 : */
339 : typedef struct
340 : {
341 : dsm_segment *segment; /* DSM segment */
342 : char *mapped_address; /* Address at which segment is mapped */
343 : dsa_segment_header *header; /* Header (same as mapped_address) */
344 : FreePageManager *fpm; /* Free page manager within segment. */
345 : dsa_pointer *pagemap; /* Page map within segment. */
346 : } dsa_segment_map;
347 :
348 : /*
349 : * Per-backend state for a storage area. Backends obtain one of these by
350 : * creating an area or attaching to an existing one using a handle. Each
351 : * process that needs to use an area uses its own object to track where the
352 : * segments are mapped.
353 : */
354 : struct dsa_area
355 : {
356 : /* Pointer to the control object in shared memory. */
357 : dsa_area_control *control;
358 :
359 : /* Has the mapping been pinned? */
360 : bool mapping_pinned;
361 :
362 : /*
363 : * This backend's array of segment maps, ordered by segment index
364 : * corresponding to control->segment_handles. Some of the area's segments
365 : * may not be mapped in this backend yet, and some slots may have been
366 : * freed and need to be detached; these operations happen on demand.
367 : */
368 : dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
369 :
370 : /* The highest segment index this backend has ever mapped. */
371 : dsa_segment_index high_segment_index;
372 :
373 : /* The last observed freed_segment_counter. */
374 : Size freed_segment_counter;
375 : };
376 :
377 : #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
378 : #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
379 :
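/*
 * Worked arithmetic (illustrative): DSA_SUPERBLOCK_SIZE is
 * 16 * FPM_PAGE_SIZE = 16 * 4096 = 65536 bytes, the 64kB superblock size
 * assumed by the size-class table above.
 */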
380 : /* Given a pointer to a segment_map, obtain a segment index number. */
381 : #define get_segment_index(area, segment_map_ptr) \
382 : (segment_map_ptr - &area->segment_maps[0])
383 :
384 : static void init_span(dsa_area *area, dsa_pointer span_pointer,
385 : dsa_area_pool *pool, dsa_pointer start, Size npages,
386 : uint16 size_class);
387 : static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
388 : int fromclass, int toclass);
389 : static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
390 : static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
391 : int size_class);
392 : static dsa_segment_map *get_segment_by_index(dsa_area *area,
393 : dsa_segment_index index);
394 : static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
395 : static void unlink_span(dsa_area *area, dsa_area_span *span);
396 : static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
397 : dsa_pointer span_pointer, int fclass);
398 : static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
399 : static dsa_segment_map *get_best_segment(dsa_area *area, Size npages);
400 : static dsa_segment_map *make_new_segment(dsa_area *area, Size requested_pages);
401 : static dsa_area *create_internal(void *place, size_t size,
402 : int tranche_id,
403 : dsm_handle control_handle,
404 : dsm_segment *control_segment);
405 : static dsa_area *attach_internal(void *place, dsm_segment *segment,
406 : dsa_handle handle);
407 : static void check_for_freed_segments(dsa_area *area);
408 :
409 : /*
410 : * Create a new shared area in a new DSM segment. Further DSM segments will
411 : * be allocated as required to extend the available space.
412 : *
413 : * We can't allocate a LWLock tranche_id within this function, because tranche
414 : * IDs are a scarce resource; there are only 64k available, using low numbers
415 : * when possible matters, and we have no provision for recycling them. So,
416 : * we require the caller to provide one.
417 : */
418 : dsa_area *
419 0 : dsa_create(int tranche_id)
420 : {
421 : dsm_segment *segment;
422 : dsa_area *area;
423 :
424 : /*
425 : * Create the DSM segment that will hold the shared control object and the
426 : * first segment of usable space.
427 : */
428 0 : segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
429 :
430 : /*
431 : * All segments backing this area are pinned, so that DSA can explicitly
432 : * control their lifetime (otherwise a newly created segment belonging to
433 : * this area might be freed when the only backend that happens to have it
434 : * mapped in ends, corrupting the area).
435 : */
436 0 : dsm_pin_segment(segment);
437 :
438 : /* Create a new DSA area with the control object in this segment. */
439 0 : area = create_internal(dsm_segment_address(segment),
440 : DSA_INITIAL_SEGMENT_SIZE,
441 : tranche_id,
442 : dsm_segment_handle(segment), segment);
443 :
444 : /* Clean up when the control segment detaches. */
445 0 : on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
446 0 : PointerGetDatum(dsm_segment_address(segment)));
447 :
448 0 : return area;
449 : }
450 :
451 : /*
452 : * Create a new shared area in an existing shared memory space, which may be
453 : * either DSM or Postmaster-initialized memory. DSM segments will be
454 : * allocated as required to extend the available space, though that can be
455 : * prevented with dsa_set_size_limit(area, size) using the same size provided
456 : * to dsa_create_in_place.
457 : *
458 : * Areas created in-place must eventually be released by the backend that
459 : * created them and all backends that attach to them. This can be done
460 : * explicitly with dsa_release_in_place, or, in the special case that 'place'
461 : * happens to be in a pre-existing DSM segment, by passing in a pointer to the
462 : * segment so that a detach hook can be registered with the containing DSM
463 : * segment.
464 : *
465 : * See dsa_create() for a note about the tranche arguments.
466 : */
467 : dsa_area *
468 17 : dsa_create_in_place(void *place, size_t size,
469 : int tranche_id, dsm_segment *segment)
470 : {
471 : dsa_area *area;
472 :
473 17 : area = create_internal(place, size, tranche_id,
474 : DSM_HANDLE_INVALID, NULL);
475 :
476 : /*
477 : * Clean up when the control segment detaches, if a containing DSM segment
478 : * was provided.
479 : */
480 17 : if (segment != NULL)
481 17 : on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
482 : PointerGetDatum(place));
483 :
484 17 : return area;
485 : }
486 :
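/*
 * In-place creation sketch (illustrative only, compiled out under a
 * hypothetical DSA_EXAMPLES guard; the segment size and function name are
 * assumptions): carve an area out of a caller-provided DSM segment and let
 * the registered detach hook release it automatically.
 */
#ifdef DSA_EXAMPLES
static dsa_area *
create_area_in_dsm_sketch(int tranche_id)
{
	Size		size = 1024 * 1024;
	dsm_segment *segment = dsm_create(size, 0);

	return dsa_create_in_place(dsm_segment_address(segment), size,
							   tranche_id, segment);
}
#endif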
487 : /*
488 : * Obtain a handle that can be passed to other processes so that they can
489 : * attach to the given area. Cannot be called for areas created with
490 : * dsa_create_in_place.
491 : */
492 : dsa_handle
493 0 : dsa_get_handle(dsa_area *area)
494 : {
495 0 : Assert(area->control->handle != DSM_HANDLE_INVALID);
496 0 : return area->control->handle;
497 : }
498 :
499 : /*
500 : * Attach to an area given a handle generated (possibly in another process) by
501 : * dsa_get_handle. The area must have been created with dsa_create (not
502 : * dsa_create_in_place).
503 : */
504 : dsa_area *
505 0 : dsa_attach(dsa_handle handle)
506 : {
507 : dsm_segment *segment;
508 : dsa_area *area;
509 :
510 : /*
511 : * An area handle is really a DSM segment handle for the first segment, so
512 : * we go ahead and attach to that.
513 : */
514 0 : segment = dsm_attach(handle);
515 0 : if (segment == NULL)
516 0 : ereport(ERROR,
517 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
518 : errmsg("could not attach to dynamic shared area")));
519 :
520 0 : area = attach_internal(dsm_segment_address(segment), segment, handle);
521 :
522 : /* Clean up when the control segment detaches. */
523 0 : on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
524 0 : PointerGetDatum(dsm_segment_address(segment)));
525 :
526 0 : return area;
527 : }
528 :
529 : /*
530 : * Attach to an area that was created with dsa_create_in_place. The caller
531 : * must somehow know the location in memory that was used when the area was
532 : * created, though it may be mapped at a different virtual address in this
533 : * process.
534 : *
535 : * See dsa_create_in_place for a note about releasing in-place areas, and the
536 : * optional 'segment' argument which can be provided to allow automatic
537 : * release if the containing memory happens to be a DSM segment.
538 : */
539 : dsa_area *
540 115 : dsa_attach_in_place(void *place, dsm_segment *segment)
541 : {
542 : dsa_area *area;
543 :
544 115 : area = attach_internal(place, NULL, DSM_HANDLE_INVALID);
545 :
546 : /*
547 : * Clean up when the control segment detaches, if a containing DSM segment
548 : * was provided.
549 : */
550 115 : if (segment != NULL)
551 115 : on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
552 : PointerGetDatum(place));
553 :
554 115 : return area;
555 : }
556 :
557 : /*
558 : * Release a DSA area that was produced by dsa_create_in_place or
559 : * dsa_attach_in_place. The 'segment' argument is ignored but provides an
560 : * interface suitable for on_dsm_detach, for the convenience of users who want
561 : * to create a DSA area inside an existing DSM segment and have it
562 : * automatically released when the containing DSM segment is detached.
563 : * 'place' should be the address of the place where the area was created.
564 : *
565 : * This callback is automatically registered for the DSM segment containing
566 : * the control object of in-place areas when a segment is provided to
567 : * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
568 : * with dsa_create.
569 : */
570 : void
571 132 : dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
572 : {
573 132 : dsa_release_in_place(DatumGetPointer(place));
574 132 : }
575 :
576 : /*
577 : * Release a DSA area that was produced by dsa_create_in_place or
578 : * dsa_attach_in_place. The 'code' argument is ignored but provides an
579 : * interface suitable for on_shmem_exit or before_shmem_exit, for the
580 : * convenience of users who want to create a DSA area inside shared memory
581 : * other than a DSM segment and have it automatically released at backend exit.
582 : * 'place' should be the address of the place where the area was created.
583 : */
584 : void
585 0 : dsa_on_shmem_exit_release_in_place(int code, Datum place)
586 : {
587 0 : dsa_release_in_place(DatumGetPointer(place));
588 0 : }
589 :
590 : /*
591 : * Release a DSA area that was produced by dsa_create_in_place or
592 : * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
593 : * callbacks so that this is managed automatically, because failure to release
594 : * an area created in-place leaks its segments permanently.
595 : *
596 : * This is also called automatically for areas produced by dsa_create or
597 : * dsa_attach as an implementation detail.
598 : */
599 : void
600 132 : dsa_release_in_place(void *place)
601 : {
602 132 : dsa_area_control *control = (dsa_area_control *) place;
603 : int i;
604 :
605 132 : LWLockAcquire(&control->lock, LW_EXCLUSIVE);
606 132 : Assert(control->segment_header.magic ==
607 : (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
608 132 : Assert(control->refcnt > 0);
609 132 : if (--control->refcnt == 0)
610 : {
611 36 : for (i = 0; i <= control->high_segment_index; ++i)
612 : {
613 : dsm_handle handle;
614 :
615 19 : handle = control->segment_handles[i];
616 19 : if (handle != DSM_HANDLE_INVALID)
617 2 : dsm_unpin_segment(handle);
618 : }
619 : }
620 132 : LWLockRelease(&control->lock);
621 132 : }
622 :
623 : /*
624 : * Keep a DSA area attached until end of session or explicit detach.
625 : *
626 : * By default, areas are owned by the current resource owner, which means they
627 : * are detached automatically when that scope ends.
628 : */
629 : void
630 0 : dsa_pin_mapping(dsa_area *area)
631 : {
632 : int i;
633 :
634 0 : Assert(!area->mapping_pinned);
635 0 : area->mapping_pinned = true;
636 :
637 0 : for (i = 0; i <= area->high_segment_index; ++i)
638 0 : if (area->segment_maps[i].segment != NULL)
639 0 : dsm_pin_mapping(area->segment_maps[i].segment);
640 0 : }
641 :
642 : /*
643 : * Allocate memory in this storage area. The return value is a dsa_pointer
644 : * that can be passed to other processes, and converted to a local pointer
645 : * with dsa_get_address. 'flags' is a bitmap which should be constructed
646 : * from the following values:
647 : *
648 : * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
649 : * will result in an ERROR.
650 : *
651 : * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
652 : * no memory is available or a size limit established by dsa_set_size_limit
653 : * would be exceeded. Otherwise, such allocations will result in an ERROR.
654 : *
655 : * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
656 : * contents of newly-allocated memory are indeterminate.
657 : *
658 : * These flags correspond to similarly named flags used by
659 : * MemoryContextAllocExtended(). See also the macros dsa_allocate and
660 : * dsa_allocate0 which expand to a call to this function with commonly used
661 : * flags.
662 : */
663 : dsa_pointer
664 58 : dsa_allocate_extended(dsa_area *area, Size size, int flags)
665 : {
666 : uint16 size_class;
667 : dsa_pointer start_pointer;
668 : dsa_segment_map *segment_map;
669 : dsa_pointer result;
670 :
671 58 : Assert(size > 0);
672 :
673 : /* Sanity check on huge individual allocation size. */
674 116 : if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
675 92 : ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
676 0 : elog(ERROR, "invalid DSA memory alloc request size %zu", size);
677 :
678 : /*
679 : * If bigger than the largest size class, just grab a run of pages from
680 : * the free page manager, instead of allocating an object from a pool.
681 : * There will still be a span, but it's a special class of span that
682 : * manages this whole allocation and simply gives all pages back to the
683 : * free page manager when dsa_free is called.
684 : */
685 58 : if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
686 : {
687 24 : Size npages = fpm_size_to_pages(size);
688 : Size first_page;
689 : dsa_pointer span_pointer;
690 24 : dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
691 :
692 : /* Obtain a span object. */
693 24 : span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
694 24 : if (!DsaPointerIsValid(span_pointer))
695 0 : return InvalidDsaPointer;
696 :
697 24 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
698 :
699 : /* Find a segment from which to allocate. */
700 24 : segment_map = get_best_segment(area, npages);
701 24 : if (segment_map == NULL)
702 0 : segment_map = make_new_segment(area, npages);
703 24 : if (segment_map == NULL)
704 : {
705 : /* Can't make any more segments: game over. */
706 0 : LWLockRelease(DSA_AREA_LOCK(area));
707 0 : dsa_free(area, span_pointer);
708 :
709 : /* Raise error unless asked not to. */
710 0 : if ((flags & DSA_ALLOC_NO_OOM) == 0)
711 0 : ereport(ERROR,
712 : (errcode(ERRCODE_OUT_OF_MEMORY),
713 : errmsg("out of memory"),
714 : errdetail("Failed on DSA request of size %zu.",
715 : size)));
716 0 : return InvalidDsaPointer;
717 : }
718 :
719 : /*
720 : * Ask the free page manager for a run of pages. This should always
721 : * succeed, since both get_best_segment and make_new_segment should
722 : * only return a non-NULL pointer if it actually contains enough
723 : * contiguous freespace. If it does fail, something in our backend
724 : * private state is out of whack, so use FATAL to kill the process.
725 : */
726 24 : if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
727 0 : elog(FATAL,
728 : "dsa_allocate could not find %zu free pages", npages);
729 24 : LWLockRelease(DSA_AREA_LOCK(area));
730 :
731 24 : start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
732 : first_page * FPM_PAGE_SIZE);
733 :
734 : /* Initialize span and pagemap. */
735 24 : LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
736 : LW_EXCLUSIVE);
737 24 : init_span(area, span_pointer, pool, start_pointer, npages,
738 : DSA_SCLASS_SPAN_LARGE);
739 24 : segment_map->pagemap[first_page] = span_pointer;
740 24 : LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
741 :
742 : /* Zero-initialize the memory if requested. */
743 24 : if ((flags & DSA_ALLOC_ZERO) != 0)
744 24 : memset(dsa_get_address(area, start_pointer), 0, size);
745 :
746 24 : return start_pointer;
747 : }
748 :
749 : /* Map allocation to a size class. */
750 34 : if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
751 : {
752 : int mapidx;
753 :
754 : /* For smaller sizes we have a lookup table... */
755 46 : mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
756 23 : DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
757 23 : size_class = dsa_size_class_map[mapidx];
758 : }
759 : else
760 : {
761 : uint16 min;
762 : uint16 max;
763 :
764 : /* ... and for the rest we search by binary chop. */
765 11 : min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
766 11 : max = lengthof(dsa_size_classes) - 1;
767 :
768 66 : while (min < max)
769 : {
770 44 : uint16 mid = (min + max) / 2;
771 44 : uint16 class_size = dsa_size_classes[mid];
772 :
773 44 : if (class_size < size)
774 12 : min = mid + 1;
775 : else
776 32 : max = mid;
777 : }
778 :
779 11 : size_class = min;
780 : }
781 34 : Assert(size <= dsa_size_classes[size_class]);
782 34 : Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
783 :
784 : /* Attempt to allocate an object from the appropriate pool. */
785 34 : result = alloc_object(area, size_class);
786 :
787 : /* Check for failure to allocate. */
788 34 : if (!DsaPointerIsValid(result))
789 : {
790 : /* Raise error unless asked not to. */
791 0 : if ((flags & DSA_ALLOC_NO_OOM) == 0)
792 : {
793 0 : ereport(ERROR,
794 : (errcode(ERRCODE_OUT_OF_MEMORY),
795 : errmsg("out of memory"),
796 : errdetail("Failed on DSA request of size %zu.", size)));
797 : }
798 0 : return InvalidDsaPointer;
799 : }
800 :
801 : /* Zero-initialize the memory if requested. */
802 34 : if ((flags & DSA_ALLOC_ZERO) != 0)
803 22 : memset(dsa_get_address(area, result), 0, size);
804 :
805 34 : return result;
806 : }
807 :
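/*
 * Flags sketch (illustrative only, compiled out under a hypothetical
 * DSA_EXAMPLES guard): request zeroed memory but tolerate allocation
 * failure instead of raising an ERROR.
 */
#ifdef DSA_EXAMPLES
static dsa_pointer
allocate_zeroed_no_oom_sketch(dsa_area *area, Size size)
{
	dsa_pointer dp;

	dp = dsa_allocate_extended(area, size, DSA_ALLOC_ZERO | DSA_ALLOC_NO_OOM);
	if (!DsaPointerIsValid(dp))
		elog(LOG, "DSA allocation of %zu bytes failed", size);
	return dp;
}
#endif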
808 : /*
809 : * Free memory obtained with dsa_allocate.
810 : */
811 : void
812 71 : dsa_free(dsa_area *area, dsa_pointer dp)
813 : {
814 : dsa_segment_map *segment_map;
815 : int pageno;
816 : dsa_pointer span_pointer;
817 : dsa_area_span *span;
818 : char *superblock;
819 : char *object;
820 : Size size;
821 : int size_class;
822 :
823 : /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
824 71 : check_for_freed_segments(area);
825 :
826 : /* Locate the object, span and pool. */
827 71 : segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
828 71 : pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
829 71 : span_pointer = segment_map->pagemap[pageno];
830 71 : span = dsa_get_address(area, span_pointer);
831 71 : superblock = dsa_get_address(area, span->start);
832 71 : object = dsa_get_address(area, dp);
833 71 : size_class = span->size_class;
834 71 : size = dsa_size_classes[size_class];
835 :
836 : /*
837 : * Special case for large objects that live in a special span: we return
838 : * those pages directly to the free page manager and free the span.
839 : */
840 71 : if (span->size_class == DSA_SCLASS_SPAN_LARGE)
841 : {
842 :
843 : #ifdef CLOBBER_FREED_MEMORY
844 22 : memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
845 : #endif
846 :
847 : /* Give pages back to free page manager. */
848 22 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
849 44 : FreePageManagerPut(segment_map->fpm,
850 22 : DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
851 : span->npages);
852 22 : LWLockRelease(DSA_AREA_LOCK(area));
853 : /* Unlink span. */
854 22 : LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
855 : LW_EXCLUSIVE);
856 22 : unlink_span(area, span);
857 22 : LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
858 : /* Free the span object so it can be reused. */
859 22 : dsa_free(area, span_pointer);
860 93 : return;
861 : }
862 :
863 : #ifdef CLOBBER_FREED_MEMORY
864 49 : memset(object, 0x7f, size);
865 : #endif
866 :
867 49 : LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
868 :
869 : /* Put the object on the span's freelist. */
870 49 : Assert(object >= superblock);
871 49 : Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
872 49 : Assert((object - superblock) % size == 0);
873 49 : NextFreeObjectIndex(object) = span->firstfree;
874 49 : span->firstfree = (object - superblock) / size;
875 49 : ++span->nallocatable;
876 :
877 : /*
878 : * See if the span needs to be moved to a different fullness class, or be
879 : * freed so its pages can be given back to the segment.
880 : */
881 49 : if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
882 : {
883 : /*
884 : * The block was completely full and is located in the
885 : * highest-numbered fullness class, which is never scanned for free
886 : * chunks. We must move it to the next-lower fullness class.
887 : */
888 0 : unlink_span(area, span);
889 0 : add_span_to_fullness_class(area, span, span_pointer,
890 : DSA_FULLNESS_CLASSES - 2);
891 :
892 : /*
893 : * If this is the only span, and there is no active span, then we
894 : * should probably move this span to fullness class 1. (Otherwise if
895 : * you allocate exactly all the objects in the only span, it moves to
896 : * class 3, then you free them all, it moves to 2, and then is given
897 : * back, leaving no active span).
898 : */
899 : }
900 67 : else if (span->nallocatable == span->nmax &&
901 36 : (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
902 : {
903 : /*
904 : * This entire block is free, and it's not the active block for this
905 : * size class. Return the memory to the free page manager. We don't
906 : * do this for the active block to prevent hysteresis: if we
907 : * repeatedly allocate and free the only chunk in the active block, it
908 : * will be very inefficient if we deallocate and reallocate the block
909 : * every time.
910 : */
911 0 : destroy_superblock(area, span_pointer);
912 : }
913 :
914 49 : LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
915 : }
916 :
917 : /*
918 : * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
919 : * memory allocated by the given area (possibly in another process) that
920 : * hasn't yet been freed. This may cause a segment to be mapped into the
921 : * current process if required, and may cause freed segments to be unmapped.
922 : */
923 : void *
924 965 : dsa_get_address(dsa_area *area, dsa_pointer dp)
925 : {
926 : dsa_segment_index index;
927 : Size offset;
928 :
929 : /* Convert InvalidDsaPointer to NULL. */
930 965 : if (!DsaPointerIsValid(dp))
931 20 : return NULL;
932 :
933 : /* Process any requests to detach from freed segments. */
934 945 : check_for_freed_segments(area);
935 :
936 : /* Break the dsa_pointer into its components. */
937 945 : index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
938 945 : offset = DSA_EXTRACT_OFFSET(dp);
939 945 : Assert(index < DSA_MAX_SEGMENTS);
940 :
941 : /* Check if we need to cause this segment to be mapped in. */
942 945 : if (unlikely(area->segment_maps[index].mapped_address == NULL))
943 : {
944 : /* Call for effect (we don't need the result). */
945 44 : get_segment_by_index(area, index);
946 : }
947 :
948 945 : return area->segment_maps[index].mapped_address + offset;
949 : }
950 :
951 : /*
952 : * Pin this area, so that it will continue to exist even if all backends
953 : * detach from it. In that case, the area can still be reattached to if a
954 : * handle has been recorded somewhere.
955 : */
956 : void
957 0 : dsa_pin(dsa_area *area)
958 : {
959 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
960 0 : if (area->control->pinned)
961 : {
962 0 : LWLockRelease(DSA_AREA_LOCK(area));
963 0 : elog(ERROR, "dsa_area already pinned");
964 : }
965 0 : area->control->pinned = true;
966 0 : ++area->control->refcnt;
967 0 : LWLockRelease(DSA_AREA_LOCK(area));
968 0 : }
969 :
970 : /*
971 : * Undo the effects of dsa_pin, so that the given area can be freed when no
972 : * backends are attached to it. May be called only if dsa_pin has been
973 : * called.
974 : */
975 : void
976 0 : dsa_unpin(dsa_area *area)
977 : {
978 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
979 0 : Assert(area->control->refcnt > 1);
980 0 : if (!area->control->pinned)
981 : {
982 0 : LWLockRelease(DSA_AREA_LOCK(area));
983 0 : elog(ERROR, "dsa_area not pinned");
984 : }
985 0 : area->control->pinned = false;
986 0 : --area->control->refcnt;
987 0 : LWLockRelease(DSA_AREA_LOCK(area));
988 0 : }
989 :
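/*
 * Pin-and-share sketch (illustrative only, compiled out under a
 * hypothetical DSA_EXAMPLES guard; publish_handle and lookup_handle are
 * hypothetical helpers standing in for whatever mechanism callers use to
 * share the handle): pin an area created with dsa_create so it survives all
 * backends detaching, then reattach from another backend by handle.
 */
#ifdef DSA_EXAMPLES
extern void publish_handle(dsa_handle handle);	/* hypothetical */
extern dsa_handle lookup_handle(void);	/* hypothetical */

static void
pin_and_share_sketch(dsa_area *area)
{
	dsa_pin(area);
	publish_handle(dsa_get_handle(area));
}

static dsa_area *
reattach_sketch(void)
{
	return dsa_attach(lookup_handle());
}
#endif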
990 : /*
991 : * Set the total size limit for this area. This limit is checked whenever new
992 : * segments need to be allocated from the operating system. If the new size
993 : * limit is already exceeded, this has no immediate effect.
994 : *
995 : * Note that the total virtual memory usage may be temporarily larger than
996 : * this limit when segments have been freed, but not yet detached by all
997 : * backends that have attached to them.
998 : */
999 : void
1000 0 : dsa_set_size_limit(dsa_area *area, Size limit)
1001 : {
1002 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1003 0 : area->control->max_total_segment_size = limit;
1004 0 : LWLockRelease(DSA_AREA_LOCK(area));
1005 0 : }
1006 :
1007 : /*
1008 : * Aggressively free all spare memory in the hope of returning DSM segments to
1009 : * the operating system.
1010 : */
1011 : void
1012 0 : dsa_trim(dsa_area *area)
1013 : {
1014 : int size_class;
1015 :
1016 : /*
1017 : * Trim in reverse pool order so we get to the spans-of-spans last, just
1018 : * in case any become entirely free while processing all the other pools.
1019 : */
1020 0 : for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1021 : {
1022 0 : dsa_area_pool *pool = &area->control->pools[size_class];
1023 : dsa_pointer span_pointer;
1024 :
1025 0 : if (size_class == DSA_SCLASS_SPAN_LARGE)
1026 : {
1027 : /* Large object frees give back segments aggressively already. */
1028 0 : continue;
1029 : }
1030 :
1031 : /*
1032 : * Search fullness class 1 only. That is where we expect to find an
1033 : * entirely empty superblock (entirely empty superblocks in other
1034 : * fullness classes are returned to the free page map by dsa_free).
1035 : */
1036 0 : LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1037 0 : span_pointer = pool->spans[1];
1038 0 : while (DsaPointerIsValid(span_pointer))
1039 : {
1040 0 : dsa_area_span *span = dsa_get_address(area, span_pointer);
1041 0 : dsa_pointer next = span->nextspan;
1042 :
1043 0 : if (span->nallocatable == span->nmax)
1044 0 : destroy_superblock(area, span_pointer);
1045 :
1046 0 : span_pointer = next;
1047 : }
1048 0 : LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1049 : }
1050 0 : }
1051 :
1052 : /*
1053 : * Print out debugging information about the internal state of the shared
1054 : * memory area.
1055 : */
1056 : void
1057 0 : dsa_dump(dsa_area *area)
1058 : {
1059 : Size i,
1060 : j;
1061 :
1062 : /*
1063 : * Note: This gives an inconsistent snapshot as it acquires and releases
1064 : * individual locks as it goes...
1065 : */
1066 :
1067 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1068 0 : fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1069 0 : fprintf(stderr, " max_total_segment_size: %zu\n",
1070 0 : area->control->max_total_segment_size);
1071 0 : fprintf(stderr, " total_segment_size: %zu\n",
1072 0 : area->control->total_segment_size);
1073 0 : fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1074 0 : fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1075 0 : fprintf(stderr, " segment bins:\n");
1076 0 : for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1077 : {
1078 0 : if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1079 : {
1080 : dsa_segment_index segment_index;
1081 :
1082 0 : fprintf(stderr,
1083 : " segment bin %zu (at least %d contiguous pages free):\n",
1084 0 : i, 1 << (i - 1));
1085 0 : segment_index = area->control->segment_bins[i];
1086 0 : while (segment_index != DSA_SEGMENT_INDEX_NONE)
1087 : {
1088 : dsa_segment_map *segment_map;
1089 :
1090 0 : segment_map =
1091 : get_segment_by_index(area, segment_index);
1092 :
1093 0 : fprintf(stderr,
1094 : " segment index %zu, usable_pages = %zu, "
1095 : "contiguous_pages = %zu, mapped at %p\n",
1096 : segment_index,
1097 0 : segment_map->header->usable_pages,
1098 0 : fpm_largest(segment_map->fpm),
1099 : segment_map->mapped_address);
1100 0 : segment_index = segment_map->header->next;
1101 : }
1102 : }
1103 : }
1104 0 : LWLockRelease(DSA_AREA_LOCK(area));
1105 :
1106 0 : fprintf(stderr, " pools:\n");
1107 0 : for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1108 : {
1109 0 : bool found = false;
1110 :
1111 0 : LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1112 0 : for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1113 0 : if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1114 0 : found = true;
1115 0 : if (found)
1116 : {
1117 0 : if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1118 0 : fprintf(stderr, " pool for blocks of span objects:\n");
1119 0 : else if (i == DSA_SCLASS_SPAN_LARGE)
1120 0 : fprintf(stderr, " pool for large object spans:\n");
1121 : else
1122 0 : fprintf(stderr,
1123 : " pool for size class %zu (object size %hu bytes):\n",
1124 0 : i, dsa_size_classes[i]);
1125 0 : for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1126 : {
1127 0 : if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1128 0 : fprintf(stderr, " fullness class %zu is empty\n", j);
1129 : else
1130 : {
1131 0 : dsa_pointer span_pointer = area->control->pools[i].spans[j];
1132 :
1133 0 : fprintf(stderr, " fullness class %zu:\n", j);
1134 0 : while (DsaPointerIsValid(span_pointer))
1135 : {
1136 : dsa_area_span *span;
1137 :
1138 0 : span = dsa_get_address(area, span_pointer);
1139 0 : fprintf(stderr,
1140 : " span descriptor at "
1141 : DSA_POINTER_FORMAT ", superblock at "
1142 : DSA_POINTER_FORMAT
1143 : ", pages = %zu, objects free = %hu/%hu\n",
1144 : span_pointer, span->start, span->npages,
1145 0 : span->nallocatable, span->nmax);
1146 0 : span_pointer = span->nextspan;
1147 : }
1148 : }
1149 : }
1150 : }
1151 0 : LWLockRelease(DSA_SCLASS_LOCK(area, i));
1152 : }
1153 0 : }
1154 :
1155 : /*
1156 : * Return the smallest size that you can successfully provide to
1157 : * dsa_create_in_place.
1158 : */
1159 : Size
1160 34 : dsa_minimum_size(void)
1161 : {
1162 : Size size;
1163 34 : int pages = 0;
1164 :
1165 34 : size = MAXALIGN(sizeof(dsa_area_control)) +
1166 : MAXALIGN(sizeof(FreePageManager));
1167 :
1168 : /* Figure out how many pages we need, including the page map... */
1169 102 : while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1170 : {
1171 34 : ++pages;
1172 34 : size += sizeof(dsa_pointer);
1173 : }
1174 :
1175 34 : return pages * FPM_PAGE_SIZE;
1176 : }
1177 :
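/*
 * Sizing sketch (illustrative only, compiled out under a hypothetical
 * DSA_EXAMPLES guard): when reserving space for an in-place area, round the
 * caller's budget up to at least the minimum.
 */
#ifdef DSA_EXAMPLES
static Size
choose_area_size_sketch(Size requested)
{
	return Max(requested, dsa_minimum_size());
}
#endif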
1178 : /*
1179 : * Workhorse function for dsa_create and dsa_create_in_place.
1180 : */
1181 : static dsa_area *
1182 17 : create_internal(void *place, size_t size,
1183 : int tranche_id,
1184 : dsm_handle control_handle,
1185 : dsm_segment *control_segment)
1186 : {
1187 : dsa_area_control *control;
1188 : dsa_area *area;
1189 : dsa_segment_map *segment_map;
1190 : Size usable_pages;
1191 : Size total_pages;
1192 : Size metadata_bytes;
1193 : int i;
1194 :
1195 : /* Sanity check on the space we have to work in. */
1196 17 : if (size < dsa_minimum_size())
1197 0 : elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1198 : dsa_minimum_size(), size);
1199 :
1200 : /* Now figure out how much space is usable */
1201 17 : total_pages = size / FPM_PAGE_SIZE;
1202 17 : metadata_bytes =
1203 : MAXALIGN(sizeof(dsa_area_control)) +
1204 17 : MAXALIGN(sizeof(FreePageManager)) +
1205 : total_pages * sizeof(dsa_pointer);
1206 : /* Add padding up to next page boundary. */
1207 17 : if (metadata_bytes % FPM_PAGE_SIZE != 0)
1208 17 : metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1209 17 : Assert(metadata_bytes <= size);
1210 17 : usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1211 :
1212 : /*
1213 : * Initialize the dsa_area_control object located at the start of the
1214 : * space.
1215 : */
1216 17 : control = (dsa_area_control *) place;
1217 17 : control->segment_header.magic =
1218 17 : DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1219 17 : control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1220 17 : control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1221 17 : control->segment_header.usable_pages = usable_pages;
1222 17 : control->segment_header.freed = false;
1223 17 : control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE;
1224 17 : control->handle = control_handle;
1225 17 : control->max_total_segment_size = (Size) -1;
1226 17 : control->total_segment_size = size;
1227 17 : memset(&control->segment_handles[0], 0,
1228 : sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
1229 17 : control->segment_handles[0] = control_handle;
1230 289 : for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1231 272 : control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1232 17 : control->high_segment_index = 0;
1233 17 : control->refcnt = 1;
1234 17 : control->freed_segment_counter = 0;
1235 17 : control->lwlock_tranche_id = tranche_id;
1236 :
1237 : /*
1238 : * Create the dsa_area object that this backend will use to access the
1239 : * area. Other backends will need to obtain their own dsa_area object by
1240 : * attaching.
1241 : */
1242 17 : area = palloc(sizeof(dsa_area));
1243 17 : area->control = control;
1244 17 : area->mapping_pinned = false;
1245 17 : memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1246 17 : area->high_segment_index = 0;
1247 17 : area->freed_segment_counter = 0;
1248 17 : LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1249 663 : for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1250 646 : LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1251 : control->lwlock_tranche_id);
1252 :
1253 : /* Set up the segment map for this process's mapping. */
1254 17 : segment_map = &area->segment_maps[0];
1255 17 : segment_map->segment = control_segment;
1256 17 : segment_map->mapped_address = place;
1257 17 : segment_map->header = (dsa_segment_header *) place;
1258 17 : segment_map->fpm = (FreePageManager *)
1259 17 : (segment_map->mapped_address +
1260 : MAXALIGN(sizeof(dsa_area_control)));
1261 17 : segment_map->pagemap = (dsa_pointer *)
1262 17 : (segment_map->mapped_address +
1263 : MAXALIGN(sizeof(dsa_area_control)) +
1264 : MAXALIGN(sizeof(FreePageManager)));
1265 :
1266 : /* Set up the free page map. */
1267 17 : FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1268 : /* There can be 0 usable pages if size is dsa_minimum_size(). */
1269 :
1270 17 : if (usable_pages > 0)
1271 0 : FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1272 : usable_pages);
1273 :
1274 : /* Put this segment into the appropriate bin. */
1275 17 : control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1276 17 : segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1277 :
1278 17 : return area;
1279 : }
1280 :
1281 : /*
1282 : * Workhorse function for dsa_attach and dsa_attach_in_place.
1283 : */
1284 : static dsa_area *
1285 115 : attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1286 : {
1287 : dsa_area_control *control;
1288 : dsa_area *area;
1289 : dsa_segment_map *segment_map;
1290 :
1291 115 : control = (dsa_area_control *) place;
1292 115 : Assert(control->handle == handle);
1293 115 : Assert(control->segment_handles[0] == handle);
1294 115 : Assert(control->segment_header.magic ==
1295 : (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1296 :
1297 : /* Build the backend-local area object. */
1298 115 : area = palloc(sizeof(dsa_area));
1299 115 : area->control = control;
1300 115 : area->mapping_pinned = false;
1301 115 : memset(&area->segment_maps[0], 0,
1302 : sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1303 115 : area->high_segment_index = 0;
1304 :
1305 : /* Set up the segment map for this process's mapping. */
1306 115 : segment_map = &area->segment_maps[0];
1307 115 : segment_map->segment = segment; /* NULL for in-place */
1308 115 : segment_map->mapped_address = place;
1309 115 : segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1310 115 : segment_map->fpm = (FreePageManager *)
1311 115 : (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1312 115 : segment_map->pagemap = (dsa_pointer *)
1313 115 : (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1314 : MAXALIGN(sizeof(FreePageManager)));
1315 :
1316 : /* Bump the reference count. */
1317 115 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1318 115 : if (control->refcnt == 0)
1319 : {
1320 : /* We can't attach to a DSA area that has already been destroyed. */
1321 0 : ereport(ERROR,
1322 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1323 : errmsg("could not attach to dynamic shared area")));
1324 : }
1325 115 : ++control->refcnt;
1326 115 : area->freed_segment_counter = area->control->freed_segment_counter;
1327 115 : LWLockRelease(DSA_AREA_LOCK(area));
1328 :
1329 115 : return area;
1330 : }
1331 :
1332 : /*
1333 : * Add a new span to fullness class 1 of the indicated pool.
1334 : */
1335 : static void
1336 31 : init_span(dsa_area *area,
1337 : dsa_pointer span_pointer,
1338 : dsa_area_pool *pool, dsa_pointer start, Size npages,
1339 : uint16 size_class)
1340 : {
1341 31 : dsa_area_span *span = dsa_get_address(area, span_pointer);
1342 31 : Size obsize = dsa_size_classes[size_class];
1343 :
1344 : /*
1345 : * The per-pool lock must be held because we manipulate the span list for
1346 : * this pool.
1347 : */
1348 31 : Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1349 :
1350 : /* Push this span onto the front of the span list for fullness class 1. */
1351 31 : if (DsaPointerIsValid(pool->spans[1]))
1352 : {
1353 13 : dsa_area_span *head = (dsa_area_span *)
1354 13 : dsa_get_address(area, pool->spans[1]);
1355 :
1356 13 : head->prevspan = span_pointer;
1357 : }
1358 31 : span->pool = DsaAreaPoolToDsaPointer(area, pool);
1359 31 : span->nextspan = pool->spans[1];
1360 31 : span->prevspan = InvalidDsaPointer;
1361 31 : pool->spans[1] = span_pointer;
1362 :
1363 31 : span->start = start;
1364 31 : span->npages = npages;
1365 31 : span->size_class = size_class;
1366 31 : span->ninitialized = 0;
1367 31 : if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1368 : {
1369 : /*
1370 : * A block-of-spans contains its own descriptor, so mark one object as
1371 : * initialized and reduce the count of allocatable objects by one.
1372 : * Doing this here has the side effect of also reducing nmax by one,
1373 : * which is important to make sure we free this object at the correct
1374 : * time.
1375 : */
1376 2 : span->ninitialized = 1;
1377 2 : span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1378 : }
1379 29 : else if (size_class != DSA_SCLASS_SPAN_LARGE)
1380 5 : span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1381 31 : span->firstfree = DSA_SPAN_NOTHING_FREE;
1382 31 : span->nmax = span->nallocatable;
1383 31 : span->fclass = 1;
1384 31 : }
1385 :
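/*
 * Worked example (illustrative): for the 8-byte size class a superblock
 * yields DSA_SUPERBLOCK_SIZE / 8 = 8192 allocatable objects, while a
 * block-of-spans is a single FPM_PAGE_SIZE page yielding
 * FPM_PAGE_SIZE / sizeof(dsa_area_span) - 1 objects, the first slot being
 * consumed by the block's own descriptor.
 */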
1386 : /*
1387 : * Transfer the first span in one fullness class to the head of another
1388 : * fullness class.
1389 : */
1390 : static bool
1391 14 : transfer_first_span(dsa_area *area,
1392 : dsa_area_pool *pool, int fromclass, int toclass)
1393 : {
1394 : dsa_pointer span_pointer;
1395 : dsa_area_span *span;
1396 : dsa_area_span *nextspan;
1397 :
1398 : /* Can't do it if source list is empty. */
1399 14 : span_pointer = pool->spans[fromclass];
1400 14 : if (!DsaPointerIsValid(span_pointer))
1401 14 : return false;
1402 :
1403 : /* Remove span from head of source list. */
1404 0 : span = dsa_get_address(area, span_pointer);
1405 0 : pool->spans[fromclass] = span->nextspan;
1406 0 : if (DsaPointerIsValid(span->nextspan))
1407 : {
1408 0 : nextspan = (dsa_area_span *)
1409 0 : dsa_get_address(area, span->nextspan);
1410 0 : nextspan->prevspan = InvalidDsaPointer;
1411 : }
1412 :
1413 : /* Add span to head of target list. */
1414 0 : span->nextspan = pool->spans[toclass];
1415 0 : pool->spans[toclass] = span_pointer;
1416 0 : if (DsaPointerIsValid(span->nextspan))
1417 : {
1418 0 : nextspan = (dsa_area_span *)
1419 0 : dsa_get_address(area, span->nextspan);
1420 0 : nextspan->prevspan = span_pointer;
1421 : }
1422 0 : span->fclass = toclass;
1423 :
1424 0 : return true;
1425 : }
1426 :
1427 : /*
1428 : * Allocate one object of the requested size class from the given area.
1429 : */
1430 : static inline dsa_pointer
1431 63 : alloc_object(dsa_area *area, int size_class)
1432 : {
1433 63 : dsa_area_pool *pool = &area->control->pools[size_class];
1434 : dsa_area_span *span;
1435 : dsa_pointer block;
1436 : dsa_pointer result;
1437 : char *object;
1438 : Size size;
1439 :
1440 : /*
1441 : * Even though ensure_active_superblock can in turn call alloc_object if
1442 : * it needs to allocate a new span, that's always from a different pool,
1443 : * and the order of lock acquisition is always the same, so it's OK that
1444 : * we hold this lock for the duration of this function.
1445 : */
1446 63 : Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1447 63 : LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1448 :
1449 : /*
1450 : * If there's no active superblock, we must successfully obtain one or
1451 : * fail the request.
1452 : */
1453 70 : if (!DsaPointerIsValid(pool->spans[1]) &&
1454 7 : !ensure_active_superblock(area, pool, size_class))
1455 : {
1456 0 : result = InvalidDsaPointer;
1457 : }
1458 : else
1459 : {
1460 : /*
1461 : * There should be a block in fullness class 1 at this point, and it
1462 : * should never be completely full. Thus we can either pop an object
1463 : * from the free list or, failing that, initialize a new object.
1464 : */
1465 63 : Assert(DsaPointerIsValid(pool->spans[1]));
1466 63 : span = (dsa_area_span *)
1467 63 : dsa_get_address(area, pool->spans[1]);
1468 63 : Assert(span->nallocatable > 0);
1469 63 : block = span->start;
1470 63 : Assert(size_class < DSA_NUM_SIZE_CLASSES);
1471 63 : size = dsa_size_classes[size_class];
1472 63 : if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1473 : {
1474 48 : result = block + span->firstfree * size;
1475 48 : object = dsa_get_address(area, result);
1476 48 : span->firstfree = NextFreeObjectIndex(object);
1477 : }
1478 : else
1479 : {
1480 15 : result = block + span->ninitialized * size;
1481 15 : ++span->ninitialized;
1482 : }
1483 63 : --span->nallocatable;
1484 :
1485 : /* If it's now full, move it to the highest-numbered fullness class. */
1486 63 : if (span->nallocatable == 0)
1487 0 : transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1488 : }
1489 :
1490 63 : Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1491 63 : LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1492 :
1493 63 : return result;
1494 : }
1495 :
1496 : /*
1497 : * Ensure that there is an active (i.e. fullness class 1) superblock, unless
1498 : * all existing superblocks are completely full and no more can be allocated.
1499 : *
1500 : * Fullness classes K of 0..N are loosely intended to represent blocks whose
1501 : * utilization percentage is at least K/N, but we only enforce this rigorously
1502 : * for the highest-numbered fullness class, which always contains exactly
1503 : * those blocks that are completely full. It's otherwise acceptable for a
1504 : * block to be in a higher-numbered fullness class than the one to which it
1505 : * logically belongs. In addition, the active block, which is always the
1506 : * first block in fullness class 1, is permitted to have a higher allocation
1507 : * percentage than would normally be allowable for that fullness class; we
1508 : * don't move it until it's completely full, and then it goes to the
1509 : * highest-numbered fullness class.
1510 : *
1511 : * It might seem odd that the active block is the head of fullness class 1
1512 : * rather than fullness class 0, but experience with other allocators has
1513 : * shown that it's usually better to allocate from a block that's moderately
1514 : * full rather than one that's nearly empty. Insofar as is reasonably
1515 : * possible, we want to avoid performing new allocations in a block that would
1516 : * otherwise become empty soon.
1517 : */
1518 : static bool
1519 7 : ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1520 : int size_class)
1521 : {
1522 : dsa_pointer span_pointer;
1523 : dsa_pointer start_pointer;
1524 7 : Size obsize = dsa_size_classes[size_class];
1525 : Size nmax;
1526 : int fclass;
1527 7 : Size npages = 1;
1528 : Size first_page;
1529 : Size i;
1530 : dsa_segment_map *segment_map;
1531 :
1532 7 : Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1533 :
1534 : /*
1535 : * Compute the number of objects that will fit in a block of this size
1536 : * class. A span-of-spans block is just a single page, and its first
1537 : * object isn't available for use because it describes the block-of-spans
1538 : * itself.
1539 : */
1540 7 : if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1541 2 : nmax = FPM_PAGE_SIZE / obsize - 1;
1542 : else
1543 5 : nmax = DSA_SUPERBLOCK_SIZE / obsize;
1544 :
1545 : /*
1546 : * If fullness class 1 is empty, try to find a span to put in it by
1547 : * scanning higher-numbered fullness classes (excluding the last one,
1548 : * whose blocks are certain to all be completely full).
1549 : */
1550 14 : for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1551 : {
1552 7 : span_pointer = pool->spans[fclass];
1553 :
1554 14 : while (DsaPointerIsValid(span_pointer))
1555 : {
1556 : int tfclass;
1557 : dsa_area_span *span;
1558 : dsa_area_span *nextspan;
1559 : dsa_area_span *prevspan;
1560 : dsa_pointer next_span_pointer;
1561 :
1562 0 : span = (dsa_area_span *)
1563 : dsa_get_address(area, span_pointer);
1564 0 : next_span_pointer = span->nextspan;
1565 :
1566 : /* Figure out what fullness class should contain this span. */
1567 0 : tfclass = (nmax - span->nallocatable)
1568 0 : * (DSA_FULLNESS_CLASSES - 1) / nmax;
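 : 
 : /*
 :  * E.g. (illustrative numbers, DSA_FULLNESS_CLASSES = 4): with
 :  * nmax = 512 and nallocatable = 500, tfclass = 12 * 3 / 512 = 0,
 :  * so a nearly-empty span sinks back to fullness class 0.
 :  */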
1569 :
1570 : /* Look up next span. */
1571 0 : if (DsaPointerIsValid(span->nextspan))
1572 0 : nextspan = (dsa_area_span *)
1573 0 : dsa_get_address(area, span->nextspan);
1574 : else
1575 0 : nextspan = NULL;
1576 :
1577 : /*
1578 : * If utilization has dropped enough that this now belongs in some
1579 : * other fullness class, move it there.
1580 : */
1581 0 : if (tfclass < fclass)
1582 : {
1583 : /* Remove from the current fullness class list. */
1584 0 : if (pool->spans[fclass] == span_pointer)
1585 : {
1586 : /* It was the head; remove it. */
1587 0 : Assert(!DsaPointerIsValid(span->prevspan));
1588 0 : pool->spans[fclass] = span->nextspan;
1589 0 : if (nextspan != NULL)
1590 0 : nextspan->prevspan = InvalidDsaPointer;
1591 : }
1592 : else
1593 : {
1594 : /* It was not the head. */
1595 0 : Assert(DsaPointerIsValid(span->prevspan));
1596 0 : prevspan = (dsa_area_span *)
1597 0 : dsa_get_address(area, span->prevspan);
1598 0 : prevspan->nextspan = span->nextspan;
1599 : }
1600 0 : if (nextspan != NULL)
1601 0 : nextspan->prevspan = span->prevspan;
1602 :
1603 : /* Push onto the head of the new fullness class list. */
1604 0 : span->nextspan = pool->spans[tfclass];
1605 0 : pool->spans[tfclass] = span_pointer;
1606 0 : span->prevspan = InvalidDsaPointer;
1607 0 : if (DsaPointerIsValid(span->nextspan))
1608 : {
1609 0 : nextspan = (dsa_area_span *)
1610 0 : dsa_get_address(area, span->nextspan);
1611 0 : nextspan->prevspan = span_pointer;
1612 : }
1613 0 : span->fclass = tfclass;
1614 : }
1615 :
1616 : /* Advance to next span on list. */
1617 0 : span_pointer = next_span_pointer;
1618 : }
1619 :
1620 : /* Stop now if we found a suitable block. */
1621 7 : if (DsaPointerIsValid(pool->spans[1]))
1622 0 : return true;
1623 : }
1624 :
1625 : /*
1626 : * If there are no blocks that properly belong in fullness class 1, pick
1627 : * one from some other fullness class and move it there anyway, so that we
1628 : * have an allocation target. Our last choice is to transfer a block
1629 : * that's almost empty (and might become completely empty soon if left
1630 : * alone), but even that is better than failing, which is what we must do
1631 : * if there are no blocks at all with free space.
1632 : */
1633 7 : Assert(!DsaPointerIsValid(pool->spans[1]));
1634 14 : for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1635 7 : if (transfer_first_span(area, pool, fclass, 1))
1636 0 : return true;
1637 14 : if (!DsaPointerIsValid(pool->spans[1]) &&
1638 7 : transfer_first_span(area, pool, 0, 1))
1639 0 : return true;
1640 :
1641 : /*
1642 : * We failed to find an existing span with free objects, so we need to
1643 : * allocate a new superblock and construct a new span to manage it.
1644 : *
1645 : * First, get a dsa_area_span object to describe the new superblock
1646 : * ... unless this allocation is for a dsa_area_span object, in which case
1647 : * that's surely not going to work. We handle that case by storing the
1648 : * span describing a block-of-spans inline.
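 : *
 : * A sketch of the bootstrap (no new logic, just the flow): a request
 : * for an ordinary size class calls alloc_object(area,
 : * DSA_SCLASS_BLOCK_OF_SPANS) to get a descriptor; if that pool is
 : * empty too, we re-enter this function with size_class ==
 : * DSA_SCLASS_BLOCK_OF_SPANS, skip this branch, and carve the
 : * descriptor out of the new block below, terminating the recursion.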
1649 : */
1650 7 : if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1651 : {
1652 5 : span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1653 5 : if (!DsaPointerIsValid(span_pointer))
1654 0 : return false;
1655 5 : npages = DSA_PAGES_PER_SUPERBLOCK;
1656 : }
1657 :
1658 : /* Find or create a segment and allocate the superblock. */
1659 7 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1660 7 : segment_map = get_best_segment(area, npages);
1661 7 : if (segment_map == NULL)
1662 : {
1663 2 : segment_map = make_new_segment(area, npages);
1664 2 : if (segment_map == NULL)
1665 : {
1666 0 : LWLockRelease(DSA_AREA_LOCK(area));
1667 0 : return false;
1668 : }
1669 : }
1670 7 : if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1671 : {
1672 0 : LWLockRelease(DSA_AREA_LOCK(area));
1673 0 : if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1674 0 : dsa_free(area, span_pointer);
1675 0 : return false;
1676 : }
1677 7 : LWLockRelease(DSA_AREA_LOCK(area));
1678 :
1679 : /* Compute the start of the superblock. */
1680 7 : start_pointer =
1681 7 : DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1682 : first_page * FPM_PAGE_SIZE);
1683 :
1684 : /*
1685 : * If this is a block-of-spans, carve the descriptor right out of the
1686 : * allocated space.
1687 : */
1688 7 : if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1689 : {
1690 : /*
1691 : * The span descriptor lives at the very start of the block, so the
1692 : * dsa_pointer we just built from the segment index and offset also
1693 : * addresses the span.
1693 : */
1694 2 : span_pointer = start_pointer;
1695 : }
1696 :
1697 : /* Initialize span and pagemap. */
1698 7 : init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1699 89 : for (i = 0; i < npages; ++i)
1700 82 : segment_map->pagemap[first_page + i] = span_pointer;
1701 :
1702 7 : return true;
1703 : }
1704 :
1705 : /*
1706 : * Return the segment map corresponding to a given segment index, mapping the
1707 : * segment in if necessary. For internal segment book-keeping, this is called
1708 : * with the area lock held. It is also called by dsa_free and dsa_get_address
1709 : * without any locking, relying on the fact they have a known live segment
1710 : * index and they always call check_for_freed_segments to ensures that any
1711 : * freed segment occupying the same slot is detached first.
1712 : */
1713 : static dsa_segment_map *
1714 144 : get_segment_by_index(dsa_area *area, dsa_segment_index index)
1715 : {
1716 144 : if (unlikely(area->segment_maps[index].mapped_address == NULL))
1717 : {
1718 : dsm_handle handle;
1719 : dsm_segment *segment;
1720 : dsa_segment_map *segment_map;
1721 :
1722 : /*
1723 : * If we get here via dsa_free or dsa_get_address, there must be at
1724 : * least one object allocated in the referenced segment. Otherwise,
1725 : * their caller has a double-free or access-after-free bug, which we
1726 : * have no hope of detecting. So we know it's safe to access this
1727 : * array slot without holding a lock; it won't change underneath us.
1728 : * Furthermore, we know that we can see the latest contents of the
1729 : * slot, as explained in check_for_freed_segments, which those
1730 : * functions call before arriving here.
1731 : */
1732 44 : handle = area->control->segment_handles[index];
1733 :
1734 : /* It's an error to try to access an unused slot. */
1735 44 : if (handle == DSM_HANDLE_INVALID)
1736 0 : elog(ERROR,
1737 : "dsa_area could not attach to a segment that has been freed");
1738 :
1739 44 : segment = dsm_attach(handle);
1740 44 : if (segment == NULL)
1741 0 : elog(ERROR, "dsa_area could not attach to segment");
1742 44 : if (area->mapping_pinned)
1743 0 : dsm_pin_mapping(segment);
1744 44 : segment_map = &area->segment_maps[index];
1745 44 : segment_map->segment = segment;
1746 44 : segment_map->mapped_address = dsm_segment_address(segment);
1747 44 : segment_map->header =
1748 44 : (dsa_segment_header *) segment_map->mapped_address;
1749 44 : segment_map->fpm = (FreePageManager *)
1750 44 : (segment_map->mapped_address +
1751 : MAXALIGN(sizeof(dsa_segment_header)));
1752 44 : segment_map->pagemap = (dsa_pointer *)
1753 44 : (segment_map->mapped_address +
1754 : MAXALIGN(sizeof(dsa_segment_header)) +
1755 : MAXALIGN(sizeof(FreePageManager)));
1756 :
1757 : /* Remember the highest index this backend has ever mapped. */
1758 44 : if (area->high_segment_index < index)
1759 44 : area->high_segment_index = index;
1760 :
1761 44 : Assert(segment_map->header->magic ==
1762 : (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1763 : }
1764 :
1765 144 : return &area->segment_maps[index];
1766 : }
1767 :
1768 : /*
1769 : * Return a superblock to the free page manager. If the underlying segment
1770 : * has become entirely free, then return it to the operating system.
1771 : *
1772 : * The appropriate pool lock must be held.
1773 : */
1774 : static void
1775 0 : destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1776 : {
1777 0 : dsa_area_span *span = dsa_get_address(area, span_pointer);
1778 0 : int size_class = span->size_class;
1779 : dsa_segment_map *segment_map;
1780 :
1781 0 : segment_map =
1782 0 : get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1783 :
1784 : /* Remove it from its fullness class list. */
1785 0 : unlink_span(area, span);
1786 :
1787 : /*
1788 : * Note: Here we acquire the area lock while we already hold a per-pool
1789 : * lock. We never hold the area lock and then take a pool lock, or we
1790 : * could deadlock.
1791 : */
1792 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1793 0 : FreePageManagerPut(segment_map->fpm,
1794 0 : DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1795 : span->npages);
1796 : /* Check if the segment is now entirely free. */
1797 0 : if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1798 : {
1799 0 : dsa_segment_index index = get_segment_index(area, segment_map);
1800 :
1801 : /* If it's not the segment with extra control data, free it. */
1802 0 : if (index != 0)
1803 : {
1804 : /*
1805 : * Give it back to the OS, and allow other backends to detect that
1806 : * they need to detach.
1807 : */
1808 0 : unlink_segment(area, segment_map);
1809 0 : segment_map->header->freed = true;
1810 0 : Assert(area->control->total_segment_size >=
1811 : segment_map->header->size);
1812 0 : area->control->total_segment_size -=
1813 0 : segment_map->header->size;
1814 0 : dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1815 0 : dsm_detach(segment_map->segment);
1816 0 : area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1817 0 : ++area->control->freed_segment_counter;
1818 0 : segment_map->segment = NULL;
1819 0 : segment_map->header = NULL;
1820 0 : segment_map->mapped_address = NULL;
1821 : }
1822 : }
1823 0 : LWLockRelease(DSA_AREA_LOCK(area));
1824 :
1825 : /*
1826 : * Span-of-spans blocks store the span which describes them within the
1827 : * block itself, so freeing the storage implicitly frees the descriptor as
1828 : * well. If this is a block of any other type, we must separately free the
1829 : * span object. This recursive call to dsa_free will acquire the
1830 : * span pool's lock. We can't deadlock because the acquisition order is
1831 : * always some other pool and then the span pool.
1832 : */
1833 0 : if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1834 0 : dsa_free(area, span_pointer);
1835 0 : }
1836 :
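 : /*
 :  * Remove a span from the doubly-linked list of its current fullness
 :  * class. The caller must hold the lock of the pool the span belongs to.
 :  */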
1837 : static void
1838 22 : unlink_span(dsa_area *area, dsa_area_span *span)
1839 : {
1840 22 : if (DsaPointerIsValid(span->nextspan))
1841 : {
1842 0 : dsa_area_span *next = dsa_get_address(area, span->nextspan);
1843 :
1844 0 : next->prevspan = span->prevspan;
1845 : }
1846 22 : if (DsaPointerIsValid(span->prevspan))
1847 : {
1848 13 : dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1849 :
1850 13 : prev->nextspan = span->nextspan;
1851 : }
1852 : else
1853 : {
1854 9 : dsa_area_pool *pool = dsa_get_address(area, span->pool);
1855 :
1856 9 : pool->spans[span->fclass] = span->nextspan;
1857 : }
1858 22 : }
1859 :
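 : /*
 :  * Push a span onto the head of the given fullness class list and update
 :  * the span's fclass to match. The caller must hold the pool's lock.
 :  */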
1860 : static void
1861 0 : add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1862 : dsa_pointer span_pointer,
1863 : int fclass)
1864 : {
1865 0 : dsa_area_pool *pool = dsa_get_address(area, span->pool);
1866 :
1867 0 : if (DsaPointerIsValid(pool->spans[fclass]))
1868 : {
1869 0 : dsa_area_span *head = dsa_get_address(area,
1870 : pool->spans[fclass]);
1871 :
1872 0 : head->prevspan = span_pointer;
1873 : }
1874 0 : span->prevspan = InvalidDsaPointer;
1875 0 : span->nextspan = pool->spans[fclass];
1876 0 : pool->spans[fclass] = span_pointer;
1877 0 : span->fclass = fclass;
1878 0 : }
1879 :
1880 : /*
1881 : * Detach from an area that was either created or attached to by this process.
1882 : */
1883 : void
1884 130 : dsa_detach(dsa_area *area)
1885 : {
1886 : int i;
1887 :
1888 : /* Detach from all segments. */
1889 306 : for (i = 0; i <= area->high_segment_index; ++i)
1890 176 : if (area->segment_maps[i].segment != NULL)
1891 46 : dsm_detach(area->segment_maps[i].segment);
1892 :
1893 : /*
1894 : * Note that 'detaching' (= detaching from DSM segments) doesn't include
1895 : * 'releasing' (= adjusting the reference count). It would be nice to
1896 : * combine these operations, but client code might never get around to
1897 : * calling dsa_detach because of an error path, and a detach hook on any
1898 : * particular segment is too late to detach other segments in the area
1899 : * without risking a 'leak' warning in the non-error path.
1900 : */
1901 :
1902 : /* Free the backend-local area object. */
1903 130 : pfree(area);
1904 130 : }
1905 :
1906 : /*
1907 : * Unlink a segment from the bin that contains it.
1908 : */
1909 : static void
1910 0 : unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1911 : {
1912 0 : if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1913 : {
1914 : dsa_segment_map *prev;
1915 :
1916 0 : prev = get_segment_by_index(area, segment_map->header->prev);
1917 0 : prev->header->next = segment_map->header->next;
1918 : }
1919 : else
1920 : {
1921 0 : Assert(area->control->segment_bins[segment_map->header->bin] ==
1922 : get_segment_index(area, segment_map));
1923 0 : area->control->segment_bins[segment_map->header->bin] =
1924 0 : segment_map->header->next;
1925 : }
1926 0 : if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1927 : {
1928 : dsa_segment_map *next;
1929 :
1930 0 : next = get_segment_by_index(area, segment_map->header->next);
1931 0 : next->header->prev = segment_map->header->prev;
1932 : }
1933 0 : }
1934 :
1935 : /*
1936 : * Find a segment that could satisfy a request for 'npages' of contiguous
1937 : * memory, or return NULL if none can be found. This may involve attaching to
1938 : * segments that weren't previously attached so that we can query their free
1939 : * pages map.
1940 : */
1941 : static dsa_segment_map *
1942 31 : get_best_segment(dsa_area *area, Size npages)
1943 : {
1944 : Size bin;
1945 :
1946 31 : Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
1947 :
1948 : /*
1949 : * Start searching from the first bin that *might* have enough contiguous
1950 : * pages.
1951 : */
1952 224 : for (bin = contiguous_pages_to_segment_bin(npages);
1953 : bin < DSA_NUM_SEGMENT_BINS;
1954 162 : ++bin)
1955 : {
1956 : /*
1957 : * The minimum number of contiguous free pages that any segment in this
1958 : * bin should have. We'll re-bin any segment we find with fewer.
1959 : */
1960 191 : Size threshold = (Size) 1 << (bin - 1);
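 : 
 : /*
 :  * E.g., assuming contiguous_pages_to_segment_bin() bins by the
 :  * position of the highest set bit: segments in bin 5 should each
 :  * have a largest free run of at least 1 << 4 = 16 pages, so
 :  * threshold is 16 for that bin.
 :  */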
1961 : dsa_segment_index segment_index;
1962 :
1963 : /* Search this bin for a segment with enough contiguous space. */
1964 191 : segment_index = area->control->segment_bins[bin];
1965 382 : while (segment_index != DSA_SEGMENT_INDEX_NONE)
1966 : {
1967 : dsa_segment_map *segment_map;
1968 : dsa_segment_index next_segment_index;
1969 : Size contiguous_pages;
1970 :
1971 29 : segment_map = get_segment_by_index(area, segment_index);
1972 29 : next_segment_index = segment_map->header->next;
1973 29 : contiguous_pages = fpm_largest(segment_map->fpm);
1974 :
1975 : /* Not enough for the request, still enough for this bin. */
1976 29 : if (contiguous_pages >= threshold && contiguous_pages < npages)
1977 : {
1978 0 : segment_index = next_segment_index;
1979 0 : continue;
1980 : }
1981 :
1982 : /* Re-bin it if it's no longer in the appropriate bin. */
1983 29 : if (contiguous_pages < threshold)
1984 : {
1985 : Size new_bin;
1986 :
1987 0 : new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
1988 :
1989 : /* Remove it from its current bin. */
1990 0 : unlink_segment(area, segment_map);
1991 :
1992 : /* Push it onto the front of its new bin. */
1993 0 : segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
1994 0 : segment_map->header->next =
1995 0 : area->control->segment_bins[new_bin];
1996 0 : segment_map->header->bin = new_bin;
1997 0 : area->control->segment_bins[new_bin] = segment_index;
1998 0 : if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1999 : {
2000 : dsa_segment_map *next;
2001 :
2002 0 : next = get_segment_by_index(area,
2003 0 : segment_map->header->next);
2004 0 : Assert(next->header->bin == new_bin);
2005 0 : next->header->prev = segment_index;
2006 : }
2007 :
2008 : /*
2009 : * But fall through to see if it's enough to satisfy this
2010 : * request anyway....
2011 : */
2012 : }
2013 :
2014 : /* Check if we are done. */
2015 29 : if (contiguous_pages >= npages)
2016 29 : return segment_map;
2017 :
2018 : /* Continue searching the same bin. */
2019 0 : segment_index = next_segment_index;
2020 : }
2021 : }
2022 :
2023 : /* Not found. */
2024 2 : return NULL;
2025 : }
2026 :
2027 : /*
2028 : * Create a new segment that can handle at least requested_pages. Returns
2029 : * NULL if the area's total size limit or the maximum allowed number of
2030 : * segments would be exceeded.
2031 : */
2032 : static dsa_segment_map *
2033 2 : make_new_segment(dsa_area *area, Size requested_pages)
2034 : {
2035 : dsa_segment_index new_index;
2036 : Size metadata_bytes;
2037 : Size total_size;
2038 : Size total_pages;
2039 : Size usable_pages;
2040 : dsa_segment_map *segment_map;
2041 : dsm_segment *segment;
2042 :
2043 2 : Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2044 :
2045 : /* Find a segment slot that is not in use (linearly for now). */
2046 2 : for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2047 : {
2048 2 : if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2049 2 : break;
2050 : }
2051 2 : if (new_index == DSA_MAX_SEGMENTS)
2052 0 : return NULL;
2053 :
2054 : /*
2055 : * If the total size limit is already exceeded, then we exit early and
2056 : * avoid arithmetic wraparound in the unsigned expressions below.
2057 : */
2058 4 : if (area->control->total_segment_size >=
2059 2 : area->control->max_total_segment_size)
2060 0 : return NULL;
2061 :
2062 : /*
2063 : * The size should be at least as big as requested, and at least big
2064 : * enough to follow a geometric series that approximately doubles the
2065 : * total storage each time we create a new segment. We use geometric
2066 : * growth because the underlying DSM system isn't designed for large
2067 : * numbers of segments (otherwise we might even consider just using one
2068 : * DSM segment for each large allocation and for each superblock, and then
2069 : * we wouldn't need to use FreePageManager).
2070 : *
2071 : * We decide on a total segment size first, so that we produce tidy
2072 : * power-of-two sized segments. This is a good property to have if we
2073 : * move to huge pages in the future. Then we work back to the number of
2074 : * pages we can fit.
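 : *
 : * For example (values are illustrative; the real constants are defined
 : * near the top of this file): with a 1MB initial segment and
 : * DSA_NUM_SEGMENTS_AT_EACH_SIZE = 4, segment slot new_index gets size
 : * 1MB << (new_index / 4): 1MB, 1MB, 1MB, 1MB, 2MB, 2MB, ..., capped by
 : * DSA_MAX_SEGMENT_SIZE and by the area's remaining size budget.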
2075 : */
2076 2 : total_size = DSA_INITIAL_SEGMENT_SIZE *
2077 2 : ((Size) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2078 2 : total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2079 2 : total_size = Min(total_size,
2080 : area->control->max_total_segment_size -
2081 : area->control->total_segment_size);
2082 :
2083 2 : total_pages = total_size / FPM_PAGE_SIZE;
2084 2 : metadata_bytes =
2085 : MAXALIGN(sizeof(dsa_segment_header)) +
2086 2 : MAXALIGN(sizeof(FreePageManager)) +
2087 : sizeof(dsa_pointer) * total_pages;
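 : 
 : /*
 :  * The resulting segment layout is, schematically:
 :  *
 :  *   [dsa_segment_header][FreePageManager][pagemap: one dsa_pointer
 :  *   per page][padding to a page boundary][usable pages]
 :  *
 :  * metadata_bytes covers everything before the first usable page.
 :  */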
2088 :
2089 : /* Add padding up to next page boundary. */
2090 2 : if (metadata_bytes % FPM_PAGE_SIZE != 0)
2091 2 : metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2092 2 : if (total_size <= metadata_bytes)
2093 0 : return NULL;
2094 2 : usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2095 2 : Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2096 :
2097 : /* See if that is enough... */
2098 2 : if (requested_pages > usable_pages)
2099 : {
2100 : /*
2101 : * We'll make an odd-sized segment, working forward from the requested
2102 : * number of pages.
2103 : */
2104 0 : usable_pages = requested_pages;
2105 0 : metadata_bytes =
2106 : MAXALIGN(sizeof(dsa_segment_header)) +
2107 0 : MAXALIGN(sizeof(FreePageManager)) +
2108 : usable_pages * sizeof(dsa_pointer);
2109 :
2110 : /* Add padding up to next page boundary. */
2111 0 : if (metadata_bytes % FPM_PAGE_SIZE != 0)
2112 0 : metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2113 0 : total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2114 :
2115 : /* Is that too large for dsa_pointer's addressing scheme? */
2116 0 : if (total_size > DSA_MAX_SEGMENT_SIZE)
2117 0 : return NULL;
2118 :
2119 : /* Would that exceed the limit? */
2120 0 : if (total_size > area->control->max_total_segment_size -
2121 0 : area->control->total_segment_size)
2122 0 : return NULL;
2123 : }
2124 :
2125 : /* Create the segment. */
2126 2 : segment = dsm_create(total_size, 0);
2127 2 : if (segment == NULL)
2128 0 : return NULL;
2129 2 : dsm_pin_segment(segment);
2130 2 : if (area->mapping_pinned)
2131 0 : dsm_pin_mapping(segment);
2132 :
2133 : /* Store the handle in shared memory to be found by index. */
2134 4 : area->control->segment_handles[new_index] =
2135 2 : dsm_segment_handle(segment);
2136 : /* Track the highest segment index in the history of the area. */
2137 2 : if (area->control->high_segment_index < new_index)
2138 2 : area->control->high_segment_index = new_index;
2139 : /* Track the highest segment index this backend has ever mapped. */
2140 2 : if (area->high_segment_index < new_index)
2141 2 : area->high_segment_index = new_index;
2142 : /* Track total size of all segments. */
2143 2 : area->control->total_segment_size += total_size;
2144 2 : Assert(area->control->total_segment_size <=
2145 : area->control->max_total_segment_size);
2146 :
2147 : /* Build a segment map for this segment in this backend. */
2148 2 : segment_map = &area->segment_maps[new_index];
2149 2 : segment_map->segment = segment;
2150 2 : segment_map->mapped_address = dsm_segment_address(segment);
2151 2 : segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2152 2 : segment_map->fpm = (FreePageManager *)
2153 2 : (segment_map->mapped_address +
2154 : MAXALIGN(sizeof(dsa_segment_header)));
2155 2 : segment_map->pagemap = (dsa_pointer *)
2156 2 : (segment_map->mapped_address +
2157 : MAXALIGN(sizeof(dsa_segment_header)) +
2158 : MAXALIGN(sizeof(FreePageManager)));
2159 :
2160 : /* Set up the free page map. */
2161 2 : FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2162 2 : FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2163 : usable_pages);
2164 :
2165 : /* Set up the segment header and put it in the appropriate bin. */
2166 4 : segment_map->header->magic =
2167 2 : DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2168 2 : segment_map->header->usable_pages = usable_pages;
2169 2 : segment_map->header->size = total_size;
2170 2 : segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2171 2 : segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2172 4 : segment_map->header->next =
2173 2 : area->control->segment_bins[segment_map->header->bin];
2174 2 : segment_map->header->freed = false;
2175 2 : area->control->segment_bins[segment_map->header->bin] = new_index;
2176 2 : if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2177 : {
2178 0 : dsa_segment_map *next =
2179 0 : get_segment_by_index(area, segment_map->header->next);
2180 :
2181 0 : Assert(next->header->bin == segment_map->header->bin);
2182 0 : next->header->prev = new_index;
2183 : }
2184 :
2185 2 : return segment_map;
2186 : }
2187 :
2188 : /*
2189 : * Check if any segments have been freed by destroy_superblock, so we can
2190 : * detach from them in this backend. This function is called by
2191 : * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2192 : * received can be resolved to the correct segment.
2193 : *
2194 : * The danger we want to defend against is that there could be an old segment
2195 : * mapped into a given slot in this backend, and the dsa_pointer they have
2196 : * might refer to some new segment in the same slot. So, before calling
2197 : * get_segment_by_index, those functions must be sure to process every
2198 : * detach instruction that had been generated by the time this process
2199 : * received the dsa_pointer.
2200 : */
2201 : static void
2202 1016 : check_for_freed_segments(dsa_area *area)
2203 : {
2204 : Size freed_segment_counter;
2205 :
2206 : /*
2207 : * Any other process that has freed a segment has incremented
2208 : * freed_segment_counter while holding an LWLock, and that must precede any
2209 : * backend creating a new segment in the same slot while holding an
2210 : * LWLock, and that must precede the creation of any dsa_pointer pointing
2211 : * into the new segment which might reach us here, and the caller must
2212 : * have sent the dsa_pointer to this process using appropriate memory
2213 : * synchronization (some kind of locking or atomic primitive or system
2214 : * call). So all we need to do on the reading side is ask for the load of
2215 : * freed_segment_counter to follow the caller's load of the dsa_pointer it
2216 : * has, and we can be sure to detect any segments that had been freed as
2217 : * of the time that the dsa_pointer reached this process.
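 : *
 : * Schematically, the ordering we rely on is:
 : *
 : *   freeing backend:  frees segment -> increments freed_segment_counter
 : *   any backend:      reuses the slot -> creates and sends a dsa_pointer
 : *   this backend:     loads the dsa_pointer -> pg_read_barrier() ->
 : *                     loads freed_segment_counter
 : *
 : * so any segment freed before the dsa_pointer was created is noticed.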
2218 : */
2219 1016 : pg_read_barrier();
2220 1016 : freed_segment_counter = area->control->freed_segment_counter;
2221 1016 : if (unlikely(area->freed_segment_counter != freed_segment_counter))
2222 : {
2223 : int i;
2224 :
2225 : /* Check all currently mapped segments to find what's been freed. */
2226 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2227 0 : for (i = 0; i <= area->high_segment_index; ++i)
2228 : {
2229 0 : if (area->segment_maps[i].header != NULL &&
2230 0 : area->segment_maps[i].header->freed)
2231 : {
2232 0 : dsm_detach(area->segment_maps[i].segment);
2233 0 : area->segment_maps[i].segment = NULL;
2234 0 : area->segment_maps[i].header = NULL;
2235 0 : area->segment_maps[i].mapped_address = NULL;
2236 : }
2237 : }
2238 0 : LWLockRelease(DSA_AREA_LOCK(area));
2239 0 : area->freed_segment_counter = freed_segment_counter;
2240 : }
2241 1016 : }
|