Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hashinsert.c
4 : * Item insertion in hash tables for Postgres.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hashinsert.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/hash.h"
19 : #include "access/hash_xlog.h"
20 : #include "access/heapam.h"
21 : #include "miscadmin.h"
22 : #include "utils/rel.h"
23 : #include "storage/lwlock.h"
24 : #include "storage/buf_internals.h"
25 :
26 : static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
27 : RelFileNode hnode);
28 :
/*
 * _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 *
 *		'rel' is the hash index; 'heapRel' is the indexed heap relation,
 *		needed only so that a microvacuum of LP_DEAD tuples can record the
 *		heap's RelFileNode in its WAL record.  On return, the tuple has been
 *		added to the appropriate bucket (or overflow) page, the metapage
 *		tuple count has been bumped, and the insert has been WAL-logged.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
	Buffer		buf = InvalidBuffer;
	Buffer		bucket_buf;		/* pinned primary bucket page */
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPage usedmetap = NULL;	/* cached metapage data used for bucket
									 * lookup */
	Page		metapage;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	OffsetNumber itup_off;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleDSize(*itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

restart_insert:

	/*
	 * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
	 * examine pd_pagesize_version, but that can't change so we can examine it
	 * without a lock.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
	metapage = BufferGetPage(metabuf);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize(metapage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds hash maximum %zu",
						itemsz, HashMaxItemSize(metapage)),
				 errhint("Values larger than a buffer page cannot be indexed.")));

	/* Lock the primary bucket page for the target bucket. */
	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
										  &usedmetap);
	Assert(usedmetap != NULL);

	/* remember the primary bucket buffer to release the pin on it at end. */
	bucket_buf = buf;

	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = pageopaque->hasho_bucket;

	/*
	 * If this bucket is in the process of being split, try to finish the
	 * split before inserting, because that might create room for the
	 * insertion to proceed without allocating an additional overflow page.
	 * It's only interesting to finish the split if we're trying to insert
	 * into the bucket from which we're removing tuples (the "old" bucket),
	 * not if we're trying to insert into the bucket into which tuples are
	 * being moved (the "new" bucket).
	 */
	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
	{
		/* release the lock on bucket buffer, before completing the split. */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		_hash_finish_split(rel, metabuf, buf, bucket,
						   usedmetap->hashm_maxbucket,
						   usedmetap->hashm_highmask,
						   usedmetap->hashm_lowmask);

		/* release the pin on old and meta buffer.  retry for insert. */
		_hash_dropbuf(rel, buf);
		_hash_dropbuf(rel, metabuf);
		goto restart_insert;
	}

	/*
	 * Do the insertion: walk the bucket chain until we find a page with
	 * enough free space for the (MAXALIGNed) tuple, extending the chain with
	 * a new overflow page if necessary.
	 */
	while (PageGetFreeSpace(page) < itemsz)
	{
		BlockNumber nextblkno;

		/*
		 * Check if current page has any DEAD tuples. If yes, delete these
		 * tuples and see if we can get a space for the new item to be
		 * inserted before moving to the next page in the bucket chain.
		 */
		if (H_HAS_DEAD_TUPLES(pageopaque))
		{

			if (IsBufferCleanupOK(buf))
			{
				_hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);

				if (PageGetFreeSpace(page) >= itemsz)
					break;		/* OK, now we have enough space */
			}
		}

		/*
		 * no space on this page; check for an overflow page
		 */
		nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.  we always
			 * release both the lock and pin if this is an overflow page, but
			 * only the lock if this is the primary bucket page, since the pin
			 * on the primary bucket must be retained throughout the scan.
			 */
			if (buf != bucket_buf)
				_hash_relbuf(rel, buf);
			else
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	/* Do the update.  No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/* found page with enough space, so add the item here */
	itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
	MarkBufferDirty(buf);

	/* metapage operations */
	metap = HashPageGetMeta(metapage);
	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_insert xlrec;
		XLogRecPtr	recptr;

		xlrec.offnum = itup_off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

		XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

		/* Both modified pages must carry the record's LSN. */
		PageSetLSN(BufferGetPage(buf), recptr);
		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* drop lock on metapage, but keep pin */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	/*
	 * Release the modified page and ensure to release the pin on primary
	 * page.
	 */
	_hash_relbuf(rel, buf);
	if (buf != bucket_buf)
		_hash_dropbuf(rel, bucket_buf);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}
253 :
254 : /*
255 : * _hash_pgaddtup() -- add a tuple to a particular page in the index.
256 : *
257 : * This routine adds the tuple to the page as requested; it does not write out
258 : * the page. It is an error to call pgaddtup() without pin and write lock on
259 : * the target buffer.
260 : *
261 : * Returns the offset number at which the tuple was inserted. This function
262 : * is responsible for preserving the condition that tuples in a hash index
263 : * page are sorted by hashkey value.
264 : */
265 : OffsetNumber
266 80552 : _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
267 : {
268 : OffsetNumber itup_off;
269 : Page page;
270 : uint32 hashkey;
271 :
272 80552 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
273 80552 : page = BufferGetPage(buf);
274 :
275 : /* Find where to insert the tuple (preserving page's hashkey ordering) */
276 80552 : hashkey = _hash_get_indextuple_hashkey(itup);
277 80552 : itup_off = _hash_binsearch(page, hashkey);
278 :
279 80552 : if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
280 : == InvalidOffsetNumber)
281 0 : elog(ERROR, "failed to add index item to \"%s\"",
282 : RelationGetRelationName(rel));
283 :
284 80552 : return itup_off;
285 : }
286 :
287 : /*
288 : * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
289 : * index.
290 : *
291 : * This routine has same requirements for locking and tuple ordering as
292 : * _hash_pgaddtup().
293 : *
294 : * Returns the offset number array at which the tuples were inserted.
295 : */
296 : void
297 97 : _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
298 : OffsetNumber *itup_offsets, uint16 nitups)
299 : {
300 : OffsetNumber itup_off;
301 : Page page;
302 : uint32 hashkey;
303 : int i;
304 :
305 97 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
306 97 : page = BufferGetPage(buf);
307 :
308 17444 : for (i = 0; i < nitups; i++)
309 : {
310 : Size itemsize;
311 :
312 17347 : itemsize = IndexTupleDSize(*itups[i]);
313 17347 : itemsize = MAXALIGN(itemsize);
314 :
315 : /* Find where to insert the tuple (preserving page's hashkey ordering) */
316 17347 : hashkey = _hash_get_indextuple_hashkey(itups[i]);
317 17347 : itup_off = _hash_binsearch(page, hashkey);
318 :
319 17347 : itup_offsets[i] = itup_off;
320 :
321 17347 : if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
322 : == InvalidOffsetNumber)
323 0 : elog(ERROR, "failed to add index item to \"%s\"",
324 : RelationGetRelationName(rel));
325 : }
326 97 : }
327 :
/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page.  We must acquire cleanup
 * lock on the page being modified before calling this function.
 *
 * 'hnode' identifies the indexed heap relation; it is stored in the WAL
 * record so a standby can resolve recovery conflicts for the removed
 * tuples.  If anything is deleted, the metapage tuple count is decremented
 * under an exclusive lock taken (and released) here.
 */

static void
_hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
					  RelFileNode hnode)
{
	OffsetNumber deletable[MaxOffsetNumber];
	int			ndeletable = 0;
	OffsetNumber offnum,
				maxoff;
	Page		page = BufferGetPage(buf);
	HashPageOpaque pageopaque;
	HashMetaPage metap;

	/* Scan each tuple in page to see if it is marked as LP_DEAD */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		/*
		 * Write-lock the meta page so that we can decrement tuple count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		PageIndexMultiDelete(page, deletable, ndeletable);

		/*
		 * Mark the page as not containing any LP_DEAD items. This is not
		 * certainly true (there might be some that have recently been marked,
		 * but weren't included in our target-item list), but it will almost
		 * always be true and it doesn't seem worth an additional page scan to
		 * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
		 * anyway.
		 */
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		metap = HashPageGetMeta(BufferGetPage(metabuf));
		metap->hashm_ntuples -= ndeletable;

		MarkBufferDirty(buf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			xl_hash_vacuum_one_page xlrec;
			XLogRecPtr	recptr;

			xlrec.hnode = hnode;
			xlrec.ntuples = ndeletable;

			XLogBeginInsert();
			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
			XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

			/*
			 * We need the target-offsets array whether or not we store the
			 * whole buffer, to allow us to find the latestRemovedXid on a
			 * standby server.
			 */
			XLogRegisterData((char *) deletable,
							 ndeletable * sizeof(OffsetNumber));

			XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

			/* Stamp both modified pages with the record's LSN. */
			PageSetLSN(BufferGetPage(buf), recptr);
			PageSetLSN(BufferGetPage(metabuf), recptr);
		}

		END_CRIT_SECTION();

		/*
		 * Releasing write lock on meta page as we have updated the tuple
		 * count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
	}
}
|