Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * gistbuild.c
4 : * build algorithm for GiST indexes implementation.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gist/gistbuild.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <math.h>
18 :
19 : #include "access/genam.h"
20 : #include "access/gist_private.h"
21 : #include "access/gistxlog.h"
22 : #include "access/xloginsert.h"
23 : #include "catalog/index.h"
24 : #include "miscadmin.h"
25 : #include "optimizer/cost.h"
26 : #include "storage/bufmgr.h"
27 : #include "storage/smgr.h"
28 : #include "utils/memutils.h"
29 : #include "utils/rel.h"
30 :
 31 : /* Interval, in index tuples, between checks of whether to switch to buffering build mode */
32 : #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
33 :
34 : /*
35 : * Number of tuples to process in the slow way before switching to buffering
36 : * mode, when buffering is explicitly turned on. Also, the number of tuples
37 : * to process between readjusting the buffer size parameter, while in
38 : * buffering mode.
39 : */
40 : #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
41 :
42 : typedef enum
43 : {
44 : GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
45 : * switch */
46 : GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
47 : * buffering build mode if the index grows too
48 : * big */
49 : GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
50 : * before switching to the buffering build
51 : * mode */
52 : GIST_BUFFERING_ACTIVE /* in buffering build mode */
53 : } GistBufferingMode;
54 :
55 : /* Working state for gistbuild and its callback */
56 : typedef struct
57 : {
58 : Relation indexrel;
59 : GISTSTATE *giststate;
60 :
61 : int64 indtuples; /* number of tuples indexed */
62 : int64 indtuplesSize; /* total size of all indexed tuples */
63 :
64 : Size freespace; /* amount of free space to leave on pages */
65 :
66 : /*
67 : * Extra data structures used during a buffering build. 'gfbb' contains
68 : * information related to managing the build buffers. 'parentMap' is a
69 : * lookup table of the parent of each internal page.
70 : */
71 : GISTBuildBuffers *gfbb;
72 : HTAB *parentMap;
73 :
74 : GistBufferingMode bufferingMode;
75 : } GISTBuildState;
76 :
77 : /* prototypes for private functions */
78 : static void gistInitBuffering(GISTBuildState *buildstate);
79 : static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
80 : static void gistBuildCallback(Relation index,
81 : HeapTuple htup,
82 : Datum *values,
83 : bool *isnull,
84 : bool tupleIsAlive,
85 : void *state);
86 : static void gistBufferingBuildInsert(GISTBuildState *buildstate,
87 : IndexTuple itup);
88 : static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
89 : BlockNumber startblkno, int startlevel);
90 : static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
91 : Buffer buffer, int level,
92 : IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
93 : BlockNumber parentblk, OffsetNumber downlinkoffnum);
94 : static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
95 : BlockNumber childblkno, int level,
96 : BlockNumber *parentblk,
97 : OffsetNumber *downlinkoffnum);
98 : static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
99 : static void gistEmptyAllBuffers(GISTBuildState *buildstate);
100 : static int gistGetMaxLevel(Relation index);
101 :
102 : static void gistInitParentMap(GISTBuildState *buildstate);
103 : static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
104 : BlockNumber parent);
105 : static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
106 : static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
107 :
108 : /*
109 : * Main entry point to GiST index build. Initially calls insert over and over,
110 : * but switches to more efficient buffering build algorithm after a certain
111 : * number of tuples (unless buffering mode is disabled).
112 : */
113 : IndexBuildResult *
114 24 : gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
115 : {
116 : IndexBuildResult *result;
117 : double reltuples;
118 : GISTBuildState buildstate;
119 : Buffer buffer;
120 : Page page;
121 24 : MemoryContext oldcxt = CurrentMemoryContext;
122 : int fillfactor;
123 :
124 24 : buildstate.indexrel = index;
125 24 : if (index->rd_options)
126 : {
127 : /* Get buffering mode from the options string */
128 0 : GiSTOptions *options = (GiSTOptions *) index->rd_options;
129 0 : char *bufferingMode = (char *) options + options->bufferingModeOffset;
130 :
131 0 : if (strcmp(bufferingMode, "on") == 0)
132 0 : buildstate.bufferingMode = GIST_BUFFERING_STATS;
133 0 : else if (strcmp(bufferingMode, "off") == 0)
134 0 : buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
135 : else
136 0 : buildstate.bufferingMode = GIST_BUFFERING_AUTO;
137 :
138 0 : fillfactor = options->fillfactor;
139 : }
140 : else
141 : {
142 : /*
143 : * By default, switch to buffering mode when the index grows too large
144 : * to fit in cache.
145 : */
146 24 : buildstate.bufferingMode = GIST_BUFFERING_AUTO;
147 24 : fillfactor = GIST_DEFAULT_FILLFACTOR;
148 : }
149 : /* Calculate target amount of free space to leave on pages */
150 24 : buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
151 :
152 : /*
153 : * We expect to be called exactly once for any index relation. If that's
154 : * not the case, big trouble's what we have.
155 : */
156 24 : if (RelationGetNumberOfBlocks(index) != 0)
157 0 : elog(ERROR, "index \"%s\" already contains data",
158 : RelationGetRelationName(index));
159 :
160 : /* no locking is needed */
161 24 : buildstate.giststate = initGISTstate(index);
162 :
163 : /*
164 : * Create a temporary memory context that is reset once for each tuple
165 : * processed. (Note: we don't bother to make this a child of the
166 : * giststate's scanCxt, so we have to delete it separately at the end.)
167 : */
168 24 : buildstate.giststate->tempCxt = createTempGistContext();
169 :
170 : /* initialize the root page */
171 24 : buffer = gistNewBuffer(index);
172 24 : Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
173 24 : page = BufferGetPage(buffer);
174 :
175 24 : START_CRIT_SECTION();
176 :
177 24 : GISTInitBuffer(buffer, F_LEAF);
178 :
179 24 : MarkBufferDirty(buffer);
180 :
181 24 : if (RelationNeedsWAL(index))
182 : {
183 : XLogRecPtr recptr;
184 :
185 21 : XLogBeginInsert();
186 21 : XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
187 :
188 21 : recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX);
189 21 : PageSetLSN(page, recptr);
190 : }
191 : else
192 3 : PageSetLSN(page, gistGetFakeLSN(heap));
193 :
194 24 : UnlockReleaseBuffer(buffer);
195 :
196 24 : END_CRIT_SECTION();
197 :
198 : /* build the index */
199 24 : buildstate.indtuples = 0;
200 24 : buildstate.indtuplesSize = 0;
201 :
202 : /*
203 : * Do the heap scan.
204 : */
205 24 : reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
206 : gistBuildCallback, (void *) &buildstate);
207 :
208 : /*
209 : * If buffering was used, flush out all the tuples that are still in the
210 : * buffers.
211 : */
212 24 : if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
213 : {
214 0 : elog(DEBUG1, "all tuples processed, emptying buffers");
215 0 : gistEmptyAllBuffers(&buildstate);
216 0 : gistFreeBuildBuffers(buildstate.gfbb);
217 : }
218 :
219 : /* okay, all heap tuples are indexed */
220 24 : MemoryContextSwitchTo(oldcxt);
221 24 : MemoryContextDelete(buildstate.giststate->tempCxt);
222 :
223 24 : freeGISTstate(buildstate.giststate);
224 :
225 : /*
226 : * Return statistics
227 : */
228 24 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
229 :
230 24 : result->heap_tuples = reltuples;
231 24 : result->index_tuples = (double) buildstate.indtuples;
232 :
233 24 : return result;
234 : }
235 :
236 : /*
237 : * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
238 : * and "auto" values.
239 : */
240 : void
241 0 : gistValidateBufferingOption(char *value)
242 : {
243 0 : if (value == NULL ||
244 0 : (strcmp(value, "on") != 0 &&
245 0 : strcmp(value, "off") != 0 &&
246 0 : strcmp(value, "auto") != 0))
247 : {
248 0 : ereport(ERROR,
249 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
250 : errmsg("invalid value for \"buffering\" option"),
251 : errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
252 : }
253 0 : }
254 :
255 : /*
256 : * Attempt to switch to buffering mode.
257 : *
258 : * If there is not enough memory for buffering build, sets bufferingMode
259 : * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
260 : * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
261 : * GIST_BUFFERING_ACTIVE.
262 : */
263 : static void
264 0 : gistInitBuffering(GISTBuildState *buildstate)
265 : {
266 0 : Relation index = buildstate->indexrel;
267 : int pagesPerBuffer;
268 : Size pageFreeSpace;
269 : Size itupAvgSize,
270 : itupMinSize;
271 : double avgIndexTuplesPerPage,
272 : maxIndexTuplesPerPage;
273 : int i;
274 : int levelStep;
275 :
276 : /* Calc space of index page which is available for index tuples */
277 0 : pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
278 : - sizeof(ItemIdData)
279 0 : - buildstate->freespace;
280 :
281 : /*
282 : * Calculate average size of already inserted index tuples using gathered
283 : * statistics.
284 : */
285 0 : itupAvgSize = (double) buildstate->indtuplesSize /
286 0 : (double) buildstate->indtuples;
287 :
288 : /*
289 : * Calculate minimal possible size of index tuple by index metadata.
290 : * Minimal possible size of varlena is VARHDRSZ.
291 : *
292 : * XXX: that's not actually true, as a short varlen can be just 2 bytes.
293 : * And we should take padding into account here.
294 : */
295 0 : itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
296 0 : for (i = 0; i < index->rd_att->natts; i++)
297 : {
298 0 : if (TupleDescAttr(index->rd_att, i)->attlen < 0)
299 0 : itupMinSize += VARHDRSZ;
300 : else
301 0 : itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
302 : }
303 :
304 : /* Calculate average and maximal number of index tuples which fit to page */
305 0 : avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
306 0 : maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
307 :
308 : /*
309 : * We need to calculate two parameters for the buffering algorithm:
310 : * levelStep and pagesPerBuffer.
311 : *
312 : * levelStep determines the size of subtree that we operate on, while
313 : * emptying a buffer. A higher value is better, as you need fewer buffer
314 : * emptying steps to build the index. However, if you set it too high, the
315 : * subtree doesn't fit in cache anymore, and you quickly lose the benefit
316 : * of the buffers.
317 : *
318 : * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
319 : * the number of tuples on page (ie. fanout), and M is the amount of
 320 : * internal memory available. Curiously, they don't explain *why* that
321 : * setting is optimal. We calculate it by taking the highest levelStep so
322 : * that a subtree still fits in cache. For a small B, our way of
323 : * calculating levelStep is very close to Arge et al's formula. For a
324 : * large B, our formula gives a value that is 2x higher.
325 : *
326 : * The average size (in pages) of a subtree of depth n can be calculated
327 : * as a geometric series:
328 : *
329 : * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
330 : *
331 : * where B is the average number of index tuples on page. The subtree is
332 : * cached in the shared buffer cache and the OS cache, so we choose
333 : * levelStep so that the subtree size is comfortably smaller than
334 : * effective_cache_size, with a safety factor of 4.
335 : *
336 : * The estimate on the average number of index tuples on page is based on
337 : * average tuple sizes observed before switching to buffered build, so the
 338 : * real subtree size can be somewhat larger. Also, it would be selfish to
339 : * gobble the whole cache for our index build. The safety factor of 4
340 : * should account for those effects.
341 : *
342 : * The other limiting factor for setting levelStep is that while
343 : * processing a subtree, we need to hold one page for each buffer at the
344 : * next lower buffered level. The max. number of buffers needed for that
345 : * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
346 : * hopefully maintenance_work_mem is set high enough that you're
347 : * constrained by effective_cache_size rather than maintenance_work_mem.
348 : *
349 : * XXX: the buffer hash table consumes a fair amount of memory too per
350 : * buffer, but that is not currently taken into account. That scales on
351 : * the total number of buffers used, ie. the index size and on levelStep.
352 : * Note that a higher levelStep *reduces* the amount of memory needed for
353 : * the hash table.
354 : */
355 0 : levelStep = 1;
356 : for (;;)
357 : {
358 : double subtreesize;
359 : double maxlowestlevelpages;
360 :
361 : /* size of an average subtree at this levelStep (in pages). */
362 0 : subtreesize =
363 0 : (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
364 : (1 - avgIndexTuplesPerPage);
365 :
366 : /* max number of pages at the lowest level of a subtree */
367 0 : maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
368 :
369 : /* subtree must fit in cache (with safety factor of 4) */
370 0 : if (subtreesize > effective_cache_size / 4)
371 0 : break;
372 :
373 : /* each node in the lowest level of a subtree has one page in memory */
374 0 : if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
375 0 : break;
376 :
377 : /* Good, we can handle this levelStep. See if we can go one higher. */
378 0 : levelStep++;
379 0 : }
380 :
381 : /*
382 : * We just reached an unacceptable value of levelStep in previous loop.
383 : * So, decrease levelStep to get last acceptable value.
384 : */
385 0 : levelStep--;
386 :
387 : /*
388 : * If there's not enough cache or maintenance_work_mem, fall back to plain
389 : * inserts.
390 : */
391 0 : if (levelStep <= 0)
392 : {
393 0 : elog(DEBUG1, "failed to switch to buffered GiST build");
394 0 : buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
395 0 : return;
396 : }
397 :
398 : /*
399 : * The second parameter to set is pagesPerBuffer, which determines the
400 : * size of each buffer. We adjust pagesPerBuffer also during the build,
401 : * which is why this calculation is in a separate function.
402 : */
403 0 : pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
404 :
405 : /* Initialize GISTBuildBuffers with these parameters */
406 0 : buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
407 : gistGetMaxLevel(index));
408 :
409 0 : gistInitParentMap(buildstate);
410 :
411 0 : buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
412 :
413 0 : elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
414 : levelStep, pagesPerBuffer);
415 : }
416 :
417 : /*
418 : * Calculate pagesPerBuffer parameter for the buffering algorithm.
419 : *
420 : * Buffer size is chosen so that assuming that tuples are distributed
421 : * randomly, emptying half a buffer fills on average one page in every buffer
422 : * at the next lower level.
423 : */
424 : static int
425 0 : calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
426 : {
427 : double pagesPerBuffer;
428 : double avgIndexTuplesPerPage;
429 : double itupAvgSize;
430 : Size pageFreeSpace;
431 :
432 : /* Calc space of index page which is available for index tuples */
433 0 : pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
434 : - sizeof(ItemIdData)
435 0 : - buildstate->freespace;
436 :
437 : /*
438 : * Calculate average size of already inserted index tuples using gathered
439 : * statistics.
440 : */
441 0 : itupAvgSize = (double) buildstate->indtuplesSize /
442 0 : (double) buildstate->indtuples;
443 :
444 0 : avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
445 :
446 : /*
447 : * Recalculate required size of buffers.
448 : */
449 0 : pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
450 :
451 0 : return (int) rint(pagesPerBuffer);
452 : }
453 :
454 : /*
455 : * Per-tuple callback from IndexBuildHeapScan.
456 : */
457 : static void
458 51274 : gistBuildCallback(Relation index,
459 : HeapTuple htup,
460 : Datum *values,
461 : bool *isnull,
462 : bool tupleIsAlive,
463 : void *state)
464 : {
465 51274 : GISTBuildState *buildstate = (GISTBuildState *) state;
466 : IndexTuple itup;
467 : MemoryContext oldCtx;
468 :
469 51274 : oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
470 :
471 : /* form an index tuple and point it at the heap tuple */
472 51274 : itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
473 51274 : itup->t_tid = htup->t_self;
474 :
475 51274 : if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
476 : {
477 : /* We have buffers, so use them. */
478 0 : gistBufferingBuildInsert(buildstate, itup);
479 : }
480 : else
481 : {
482 : /*
483 : * There's no buffers (yet). Since we already have the index relation
484 : * locked, we call gistdoinsert directly.
485 : */
486 51274 : gistdoinsert(index, itup, buildstate->freespace,
487 : buildstate->giststate);
488 : }
489 :
490 : /* Update tuple count and total size. */
491 51274 : buildstate->indtuples += 1;
492 51274 : buildstate->indtuplesSize += IndexTupleSize(itup);
493 :
494 51274 : MemoryContextSwitchTo(oldCtx);
495 51274 : MemoryContextReset(buildstate->giststate->tempCxt);
496 :
497 51274 : if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
498 0 : buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
499 : {
500 : /* Adjust the target buffer size now */
501 0 : buildstate->gfbb->pagesPerBuffer =
502 0 : calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
503 : }
504 :
505 : /*
506 : * In 'auto' mode, check if the index has grown too large to fit in cache,
507 : * and switch to buffering mode if it has.
508 : *
509 : * To avoid excessive calls to smgrnblocks(), only check this every
510 : * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples
511 : */
512 102548 : if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
513 51471 : buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
514 51471 : effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
515 51274 : (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
516 0 : buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
517 : {
518 : /*
519 : * Index doesn't fit in effective cache anymore. Try to switch to
520 : * buffering build mode.
521 : */
522 0 : gistInitBuffering(buildstate);
523 : }
524 51274 : }
525 :
526 : /*
527 : * Insert function for buffering index build.
528 : */
529 : static void
530 0 : gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
531 : {
532 : /* Insert the tuple to buffers. */
533 0 : gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
534 :
535 : /* If we filled up (half of a) buffer, process buffer emptying. */
536 0 : gistProcessEmptyingQueue(buildstate);
537 0 : }
538 :
539 : /*
540 : * Process an index tuple. Runs the tuple down the tree until we reach a leaf
541 : * page or node buffer, and inserts the tuple there. Returns true if we have
542 : * to stop buffer emptying process (because one of child buffers can't take
543 : * index tuples anymore).
544 : */
545 : static bool
546 0 : gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
547 : BlockNumber startblkno, int startlevel)
548 : {
549 0 : GISTSTATE *giststate = buildstate->giststate;
550 0 : GISTBuildBuffers *gfbb = buildstate->gfbb;
551 0 : Relation indexrel = buildstate->indexrel;
552 : BlockNumber childblkno;
553 : Buffer buffer;
554 0 : bool result = false;
555 : BlockNumber blkno;
556 : int level;
557 0 : OffsetNumber downlinkoffnum = InvalidOffsetNumber;
558 0 : BlockNumber parentblkno = InvalidBlockNumber;
559 :
560 0 : CHECK_FOR_INTERRUPTS();
561 :
562 : /*
563 : * Loop until we reach a leaf page (level == 0) or a level with buffers
564 : * (not including the level we start at, because we would otherwise make
565 : * no progress).
566 : */
567 0 : blkno = startblkno;
568 0 : level = startlevel;
569 : for (;;)
570 : {
571 : ItemId iid;
572 : IndexTuple idxtuple,
573 : newtup;
574 : Page page;
575 : OffsetNumber childoffnum;
576 :
577 : /* Have we reached a level with buffers? */
578 0 : if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
579 0 : break;
580 :
581 : /* Have we reached a leaf page? */
582 0 : if (level == 0)
583 0 : break;
584 :
585 : /*
586 : * Nope. Descend down to the next level then. Choose a child to
587 : * descend down to.
588 : */
589 :
590 0 : buffer = ReadBuffer(indexrel, blkno);
591 0 : LockBuffer(buffer, GIST_EXCLUSIVE);
592 :
593 0 : page = (Page) BufferGetPage(buffer);
594 0 : childoffnum = gistchoose(indexrel, page, itup, giststate);
595 0 : iid = PageGetItemId(page, childoffnum);
596 0 : idxtuple = (IndexTuple) PageGetItem(page, iid);
597 0 : childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
598 :
599 0 : if (level > 1)
600 0 : gistMemorizeParent(buildstate, childblkno, blkno);
601 :
602 : /*
603 : * Check that the key representing the target child node is consistent
604 : * with the key we're inserting. Update it if it's not.
605 : */
606 0 : newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
607 0 : if (newtup)
608 : {
609 0 : blkno = gistbufferinginserttuples(buildstate,
610 : buffer,
611 : level,
612 : &newtup,
613 : 1,
614 : childoffnum,
615 : InvalidBlockNumber,
616 : InvalidOffsetNumber);
617 : /* gistbufferinginserttuples() released the buffer */
618 : }
619 : else
620 0 : UnlockReleaseBuffer(buffer);
621 :
622 : /* Descend to the child */
623 0 : parentblkno = blkno;
624 0 : blkno = childblkno;
625 0 : downlinkoffnum = childoffnum;
626 0 : Assert(level > 0);
627 0 : level--;
628 0 : }
629 :
630 0 : if (LEVEL_HAS_BUFFERS(level, gfbb))
631 0 : {
632 : /*
633 : * We've reached level with buffers. Place the index tuple to the
634 : * buffer, and add the buffer to the emptying queue if it overflows.
635 : */
636 : GISTNodeBuffer *childNodeBuffer;
637 :
638 : /* Find the buffer or create a new one */
639 0 : childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
640 :
641 : /* Add index tuple to it */
642 0 : gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
643 :
644 0 : if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
645 0 : result = true;
646 : }
647 : else
648 : {
649 : /*
650 : * We've reached a leaf page. Place the tuple here.
651 : */
652 0 : Assert(level == 0);
653 0 : buffer = ReadBuffer(indexrel, blkno);
654 0 : LockBuffer(buffer, GIST_EXCLUSIVE);
655 0 : gistbufferinginserttuples(buildstate, buffer, level,
656 : &itup, 1, InvalidOffsetNumber,
657 : parentblkno, downlinkoffnum);
658 : /* gistbufferinginserttuples() released the buffer */
659 : }
660 :
661 0 : return result;
662 : }
663 :
664 : /*
665 : * Insert tuples to a given page.
666 : *
667 : * This is analogous with gistinserttuples() in the regular insertion code.
668 : *
669 : * Returns the block number of the page where the (first) new or updated tuple
670 : * was inserted. Usually that's the original page, but might be a sibling page
671 : * if the original page was split.
672 : *
673 : * Caller should hold a lock on 'buffer' on entry. This function will unlock
674 : * and unpin it.
675 : */
676 : static BlockNumber
677 0 : gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
678 : IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
679 : BlockNumber parentblk, OffsetNumber downlinkoffnum)
680 : {
681 0 : GISTBuildBuffers *gfbb = buildstate->gfbb;
682 : List *splitinfo;
683 : bool is_split;
684 0 : BlockNumber placed_to_blk = InvalidBlockNumber;
685 :
686 0 : is_split = gistplacetopage(buildstate->indexrel,
687 : buildstate->freespace,
688 : buildstate->giststate,
689 : buffer,
690 : itup, ntup, oldoffnum, &placed_to_blk,
691 : InvalidBuffer,
692 : &splitinfo,
693 : false);
694 :
695 : /*
696 : * If this is a root split, update the root path item kept in memory. This
697 : * ensures that all path stacks are always complete, including all parent
698 : * nodes up to the root. That simplifies the algorithm to re-find correct
699 : * parent.
700 : */
701 0 : if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
702 : {
703 0 : Page page = BufferGetPage(buffer);
704 : OffsetNumber off;
705 : OffsetNumber maxoff;
706 :
707 0 : Assert(level == gfbb->rootlevel);
708 0 : gfbb->rootlevel++;
709 :
710 0 : elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
711 :
712 : /*
713 : * All the downlinks on the old root page are now on one of the child
714 : * pages. Visit all the new child pages to memorize the parents of the
715 : * grandchildren.
716 : */
717 0 : if (gfbb->rootlevel > 1)
718 : {
719 0 : maxoff = PageGetMaxOffsetNumber(page);
720 0 : for (off = FirstOffsetNumber; off <= maxoff; off++)
721 : {
722 0 : ItemId iid = PageGetItemId(page, off);
723 0 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
724 0 : BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
725 0 : Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
726 :
727 0 : LockBuffer(childbuf, GIST_SHARE);
728 0 : gistMemorizeAllDownlinks(buildstate, childbuf);
729 0 : UnlockReleaseBuffer(childbuf);
730 :
731 : /*
732 : * Also remember that the parent of the new child page is the
733 : * root block.
734 : */
735 0 : gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
736 : }
737 : }
738 : }
739 :
740 0 : if (splitinfo)
741 : {
742 : /*
743 : * Insert the downlinks to the parent. This is analogous with
744 : * gistfinishsplit() in the regular insertion code, but the locking is
745 : * simpler, and we have to maintain the buffers on internal nodes and
746 : * the parent map.
747 : */
748 : IndexTuple *downlinks;
749 : int ndownlinks,
750 : i;
751 : Buffer parentBuffer;
752 : ListCell *lc;
753 :
754 : /* Parent may have changed since we memorized this path. */
755 0 : parentBuffer =
756 0 : gistBufferingFindCorrectParent(buildstate,
757 : BufferGetBlockNumber(buffer),
758 : level,
759 : &parentblk,
760 : &downlinkoffnum);
761 :
762 : /*
763 : * If there's a buffer associated with this page, that needs to be
764 : * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
765 : * downlinks in 'splitinfo', to make sure they're consistent not only
766 : * with the tuples already on the pages, but also the tuples in the
767 : * buffers that will eventually be inserted to them.
768 : */
769 0 : gistRelocateBuildBuffersOnSplit(gfbb,
770 : buildstate->giststate,
771 : buildstate->indexrel,
772 : level,
773 : buffer, splitinfo);
774 :
775 : /* Create an array of all the downlink tuples */
776 0 : ndownlinks = list_length(splitinfo);
777 0 : downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
778 0 : i = 0;
779 0 : foreach(lc, splitinfo)
780 : {
781 0 : GISTPageSplitInfo *splitinfo = lfirst(lc);
782 :
783 : /*
784 : * Remember the parent of each new child page in our parent map.
785 : * This assumes that the downlinks fit on the parent page. If the
786 : * parent page is split, too, when we recurse up to insert the
787 : * downlinks, the recursive gistbufferinginserttuples() call will
788 : * update the map again.
789 : */
790 0 : if (level > 0)
791 0 : gistMemorizeParent(buildstate,
792 : BufferGetBlockNumber(splitinfo->buf),
793 : BufferGetBlockNumber(parentBuffer));
794 :
795 : /*
796 : * Also update the parent map for all the downlinks that got moved
797 : * to a different page. (actually this also loops through the
798 : * downlinks that stayed on the original page, but it does no
799 : * harm).
800 : */
801 0 : if (level > 1)
802 0 : gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
803 :
804 : /*
805 : * Since there's no concurrent access, we can release the lower
806 : * level buffers immediately. This includes the original page.
807 : */
808 0 : UnlockReleaseBuffer(splitinfo->buf);
809 0 : downlinks[i++] = splitinfo->downlink;
810 : }
811 :
812 : /* Insert them into parent. */
813 0 : gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
814 : downlinks, ndownlinks, downlinkoffnum,
815 : InvalidBlockNumber, InvalidOffsetNumber);
816 :
817 0 : list_free_deep(splitinfo); /* we don't need this anymore */
818 : }
819 : else
820 0 : UnlockReleaseBuffer(buffer);
821 :
822 0 : return placed_to_blk;
823 : }
824 :
825 : /*
826 : * Find the downlink pointing to a child page.
827 : *
828 : * 'childblkno' indicates the child page to find the parent for. 'level' is
829 : * the level of the child. On entry, *parentblkno and *downlinkoffnum can
830 : * point to a location where the downlink used to be - we will check that
831 : * location first, and save some cycles if it hasn't moved. The function
832 : * returns a buffer containing the downlink, exclusively-locked, and
833 : * *parentblkno and *downlinkoffnum are set to the real location of the
834 : * downlink.
835 : *
836 : * If the child page is a leaf (level == 0), the caller must supply a correct
837 : * parentblkno. Otherwise we use the parent map hash table to find the parent
838 : * block.
839 : *
840 : * This function serves the same purpose as gistFindCorrectParent() during
841 : * normal index inserts, but this is simpler because we don't need to deal
842 : * with concurrent inserts.
843 : */
844 : static Buffer
845 0 : gistBufferingFindCorrectParent(GISTBuildState *buildstate,
846 : BlockNumber childblkno, int level,
847 : BlockNumber *parentblkno,
848 : OffsetNumber *downlinkoffnum)
849 : {
850 : BlockNumber parent;
851 : Buffer buffer;
852 : Page page;
853 : OffsetNumber maxoff;
854 : OffsetNumber off;
855 :
856 0 : if (level > 0)
857 0 : parent = gistGetParent(buildstate, childblkno);
858 : else
859 : {
860 : /*
861 : * For a leaf page, the caller must supply a correct parent block
862 : * number.
863 : */
864 0 : if (*parentblkno == InvalidBlockNumber)
865 0 : elog(ERROR, "no parent buffer provided of child %d", childblkno);
866 0 : parent = *parentblkno;
867 : }
868 :
869 0 : buffer = ReadBuffer(buildstate->indexrel, parent);
870 0 : page = BufferGetPage(buffer);
871 0 : LockBuffer(buffer, GIST_EXCLUSIVE);
872 0 : gistcheckpage(buildstate->indexrel, buffer);
873 0 : maxoff = PageGetMaxOffsetNumber(page);
874 :
875 : /* Check if it was not moved */
876 0 : if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
877 0 : *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
878 : {
879 0 : ItemId iid = PageGetItemId(page, *downlinkoffnum);
880 0 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
881 :
882 0 : if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
883 : {
884 : /* Still there */
885 0 : return buffer;
886 : }
887 : }
888 :
889 : /*
890 : * Downlink was not at the offset where it used to be. Scan the page to
891 : * find it. During normal gist insertions, it might've moved to another
892 : * page, to the right, but during a buffering build, we keep track of the
893 : * parent of each page in the lookup table so we should always know what
894 : * page it's on.
895 : */
896 0 : for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
897 : {
898 0 : ItemId iid = PageGetItemId(page, off);
899 0 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
900 :
901 0 : if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
902 : {
903 : /* yes!!, found it */
904 0 : *downlinkoffnum = off;
905 0 : return buffer;
906 : }
907 : }
908 :
909 0 : elog(ERROR, "failed to re-find parent for block %u", childblkno);
910 : return InvalidBuffer; /* keep compiler quiet */
911 : }
912 :
913 : /*
914 : * Process buffers emptying stack. Emptying of one buffer can cause emptying
915 : * of other buffers. This function iterates until this cascading emptying
916 : * process finished, e.g. until buffers emptying stack is empty.
917 : */
918 : static void
919 0 : gistProcessEmptyingQueue(GISTBuildState *buildstate)
920 : {
921 0 : GISTBuildBuffers *gfbb = buildstate->gfbb;
922 :
923 : /* Iterate while we have elements in buffers emptying stack. */
924 0 : while (gfbb->bufferEmptyingQueue != NIL)
925 : {
926 : GISTNodeBuffer *emptyingNodeBuffer;
927 :
928 : /* Get node buffer from emptying stack. */
929 0 : emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
930 0 : gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
931 0 : emptyingNodeBuffer->queuedForEmptying = false;
932 :
933 : /*
934 : * We are going to load last pages of buffers where emptying will be
935 : * to. So let's unload any previously loaded buffers.
936 : */
937 0 : gistUnloadNodeBuffers(gfbb);
938 :
939 : /*
940 : * Pop tuples from the buffer and run them down to the buffers at
941 : * lower level, or leaf pages. We continue until one of the lower
942 : * level buffers fills up, or this buffer runs empty.
943 : *
944 : * In Arge et al's paper, the buffer emptying is stopped after
945 : * processing 1/2 node buffer worth of tuples, to avoid overfilling
946 : * any of the lower level buffers. However, it's more efficient to
947 : * keep going until one of the lower level buffers actually fills up,
948 : * so that's what we do. This doesn't need to be exact, if a buffer
949 : * overfills by a few tuples, there's no harm done.
950 : */
951 : while (true)
952 : {
953 : IndexTuple itup;
954 :
955 : /* Get next index tuple from the buffer */
956 0 : if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
957 0 : break;
958 :
959 : /*
960 : * Run it down to the underlying node buffer or leaf page.
961 : *
962 : * Note: it's possible that the buffer we're emptying splits as a
963 : * result of this call. If that happens, our emptyingNodeBuffer
964 : * points to the left half of the split. After split, it's very
965 : * likely that the new left buffer is no longer over the half-full
966 : * threshold, but we might as well keep flushing tuples from it
967 : * until we fill a lower-level buffer.
968 : */
969 0 : if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
970 : {
971 : /*
972 : * A lower level buffer filled up. Stop emptying this buffer,
973 : * to avoid overflowing the lower level buffer.
974 : */
975 0 : break;
976 : }
977 :
978 : /* Free all the memory allocated during index tuple processing */
979 0 : MemoryContextReset(buildstate->giststate->tempCxt);
980 0 : }
981 : }
982 0 : }
983 :
/*
 * Empty all node buffers, from top to bottom. This is done at the end of
 * index build to flush all remaining tuples to the index.
 *
 * Note: This destroys the buffersOnLevels lists, so the buffers should not
 * be inserted to after this call.
 */
static void
gistEmptyAllBuffers(GISTBuildState *buildstate)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	MemoryContext oldCtx;
	int			i;

	/* Per-tuple work below runs in the short-lived temp context. */
	oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

	/*
	 * Iterate through the levels from top to bottom.
	 */
	for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
	{
		/*
		 * Empty all buffers on this level. Note that new buffers can pop up
		 * in the list during the processing, as a result of page splits, so a
		 * simple walk through the list won't work. We remove buffers from the
		 * list when we see them empty; a buffer can't become non-empty once
		 * it's been fully emptied.
		 */
		while (gfbb->buffersOnLevels[i] != NIL)
		{
			GISTNodeBuffer *nodeBuffer;

			nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);

			if (nodeBuffer->blocksCount != 0)
			{
				/*
				 * Add this buffer to the emptying queue, and proceed to empty
				 * the queue.
				 */
				if (!nodeBuffer->queuedForEmptying)
				{
					/*
					 * Build the queue's list cells in the long-lived
					 * gfbb->context, so they survive the tempCxt resets
					 * done per-tuple in gistProcessEmptyingQueue.
					 */
					MemoryContextSwitchTo(gfbb->context);
					nodeBuffer->queuedForEmptying = true;
					gfbb->bufferEmptyingQueue =
						lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
					MemoryContextSwitchTo(buildstate->giststate->tempCxt);
				}
				gistProcessEmptyingQueue(buildstate);
			}
			else
				/* Buffer is empty; drop it from this level's list. */
				gfbb->buffersOnLevels[i] =
					list_delete_first(gfbb->buffersOnLevels[i]);
		}
		elog(DEBUG2, "emptied all buffers at level %d", i);
	}
	MemoryContextSwitchTo(oldCtx);
}
1042 :
1043 : /*
1044 : * Get the depth of the GiST index.
1045 : */
1046 : static int
1047 0 : gistGetMaxLevel(Relation index)
1048 : {
1049 : int maxLevel;
1050 : BlockNumber blkno;
1051 :
1052 : /*
1053 : * Traverse down the tree, starting from the root, until we hit the leaf
1054 : * level.
1055 : */
1056 0 : maxLevel = 0;
1057 0 : blkno = GIST_ROOT_BLKNO;
1058 : while (true)
1059 : {
1060 : Buffer buffer;
1061 : Page page;
1062 : IndexTuple itup;
1063 :
1064 0 : buffer = ReadBuffer(index, blkno);
1065 :
1066 : /*
1067 : * There's no concurrent access during index build, so locking is just
1068 : * pro forma.
1069 : */
1070 0 : LockBuffer(buffer, GIST_SHARE);
1071 0 : page = (Page) BufferGetPage(buffer);
1072 :
1073 0 : if (GistPageIsLeaf(page))
1074 : {
1075 : /* We hit the bottom, so we're done. */
1076 0 : UnlockReleaseBuffer(buffer);
1077 0 : break;
1078 : }
1079 :
1080 : /*
1081 : * Pick the first downlink on the page, and follow it. It doesn't
1082 : * matter which downlink we choose, the tree has the same depth
1083 : * everywhere, so we just pick the first one.
1084 : */
1085 0 : itup = (IndexTuple) PageGetItem(page,
1086 : PageGetItemId(page, FirstOffsetNumber));
1087 0 : blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1088 0 : UnlockReleaseBuffer(buffer);
1089 :
1090 : /*
1091 : * We're going down on the tree. It means that there is yet one more
1092 : * level in the tree.
1093 : */
1094 0 : maxLevel++;
1095 0 : }
1096 0 : return maxLevel;
1097 : }
1098 :
1099 :
1100 : /*
1101 : * Routines for managing the parent map.
1102 : *
1103 : * Whenever a page is split, we need to insert the downlinks into the parent.
1104 : * We need to somehow find the parent page to do that. In normal insertions,
1105 : * we keep a stack of nodes visited when we descend the tree. However, in
1106 : * buffering build, we can start descending the tree from any internal node,
1107 : * when we empty a buffer by cascading tuples to its children. So we don't
1108 : * have a full stack up to the root available at that time.
1109 : *
1110 : * So instead, we maintain a hash table to track the parent of every internal
1111 : * page. We don't need to track the parents of leaf nodes, however. Whenever
1112 : * we insert to a leaf, we've just descended down from its parent, so we know
1113 : * its immediate parent already. This helps a lot to limit the memory used
1114 : * by this hash table.
1115 : *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to a new page at the split need
 * to be updated.
1120 : *
1121 : * We also update the parent map whenever we descend the tree. That might seem
1122 : * unnecessary, because we maintain the map whenever a downlink is moved or
1123 : * created, but it is needed because we switch to buffering mode after
1124 : * creating a tree with regular index inserts. Any pages created before
1125 : * switching to buffering mode will not be present in the parent map initially,
1126 : * but will be added there the first time we visit them.
1127 : */
1128 :
/* Entry in the parent map: records the parent page of one child page. */
typedef struct
{
	BlockNumber childblkno;		/* hash key */
	BlockNumber parentblkno;	/* block holding the downlink to childblkno */
} ParentMapEntry;
1134 :
1135 : static void
1136 0 : gistInitParentMap(GISTBuildState *buildstate)
1137 : {
1138 : HASHCTL hashCtl;
1139 :
1140 0 : hashCtl.keysize = sizeof(BlockNumber);
1141 0 : hashCtl.entrysize = sizeof(ParentMapEntry);
1142 0 : hashCtl.hcxt = CurrentMemoryContext;
1143 0 : buildstate->parentMap = hash_create("gistbuild parent map",
1144 : 1024,
1145 : &hashCtl,
1146 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1147 0 : }
1148 :
1149 : static void
1150 0 : gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
1151 : {
1152 : ParentMapEntry *entry;
1153 : bool found;
1154 :
1155 0 : entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1156 : (const void *) &child,
1157 : HASH_ENTER,
1158 : &found);
1159 0 : entry->parentblkno = parent;
1160 0 : }
1161 :
1162 : /*
1163 : * Scan all downlinks on a page, and memorize their parent.
1164 : */
1165 : static void
1166 0 : gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
1167 : {
1168 : OffsetNumber maxoff;
1169 : OffsetNumber off;
1170 0 : BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1171 0 : Page page = BufferGetPage(parentbuf);
1172 :
1173 0 : Assert(!GistPageIsLeaf(page));
1174 :
1175 0 : maxoff = PageGetMaxOffsetNumber(page);
1176 0 : for (off = FirstOffsetNumber; off <= maxoff; off++)
1177 : {
1178 0 : ItemId iid = PageGetItemId(page, off);
1179 0 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1180 0 : BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1181 :
1182 0 : gistMemorizeParent(buildstate, childblkno, parentblkno);
1183 : }
1184 0 : }
1185 :
1186 : static BlockNumber
1187 0 : gistGetParent(GISTBuildState *buildstate, BlockNumber child)
1188 : {
1189 : ParentMapEntry *entry;
1190 : bool found;
1191 :
1192 : /* Find node buffer in hash table */
1193 0 : entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1194 : (const void *) &child,
1195 : HASH_FIND,
1196 : &found);
1197 0 : if (!found)
1198 0 : elog(ERROR, "could not find parent of block %d in lookup table", child);
1199 :
1200 0 : return entry->parentblkno;
1201 : }
|