Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hashutil.c
4 : * Utility code for Postgres hash implementation.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hashutil.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/hash.h"
18 : #include "access/reloptions.h"
19 : #include "access/relscan.h"
20 : #include "utils/lsyscache.h"
21 : #include "utils/rel.h"
22 : #include "storage/buf_internals.h"
23 :
24 : #define CALC_NEW_BUCKET(old_bucket, lowmask) \
25 : old_bucket | (lowmask + 1)
26 :
/*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 *
 * Currently this always returns true: a hash index stores only the hash
 * value of the key, not the key itself, so the scan conditions cannot be
 * re-evaluated here.  hashgettuple is expected to have set the recheck
 * flag so the executor re-applies the quals against the heap tuple.
 * The #ifdef'd code below shows what a real check would look like if the
 * original key values were available.
 */
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
	/*
	 * Currently, we can't check any of the scan conditions since we do not
	 * have the original index entry value to supply to the sk_func. Always
	 * return true; we expect that hashgettuple already set the recheck flag
	 * to make the main indexscan code do it.
	 */
#ifdef NOT_USED
	TupleDesc	tupdesc = RelationGetDescr(scan->indexRelation);
	ScanKey		key = scan->keyData;
	int			scanKeySize = scan->numberOfKeys;

	while (scanKeySize > 0)
	{
		Datum		datum;
		bool		isNull;
		Datum		test;

		datum = index_getattr(itup,
							  key->sk_attno,
							  tupdesc,
							  &isNull);

		/* assume sk_func is strict */
		if (isNull)
			return false;
		if (key->sk_flags & SK_ISNULL)
			return false;

		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
								 datum, key->sk_argument);

		if (!DatumGetBool(test))
			return false;

		key++;
		scanKeySize--;
	}
#endif

	return true;
}
74 :
75 : /*
76 : * _hash_datum2hashkey -- given a Datum, call the index's hash procedure
77 : *
78 : * The Datum is assumed to be of the index's column type, so we can use the
79 : * "primary" hash procedure that's tracked for us by the generic index code.
80 : */
81 : uint32
82 80577 : _hash_datum2hashkey(Relation rel, Datum key)
83 : {
84 : FmgrInfo *procinfo;
85 : Oid collation;
86 :
87 : /* XXX assumes index has only one attribute */
88 80577 : procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC);
89 80577 : collation = rel->rd_indcollation[0];
90 :
91 80577 : return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
92 : }
93 :
94 : /*
95 : * _hash_datum2hashkey_type -- given a Datum of a specified type,
96 : * hash it in a fashion compatible with this index
97 : *
98 : * This is much more expensive than _hash_datum2hashkey, so use it only in
99 : * cross-type situations.
100 : */
101 : uint32
102 0 : _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
103 : {
104 : RegProcedure hash_proc;
105 : Oid collation;
106 :
107 : /* XXX assumes index has only one attribute */
108 0 : hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
109 : keytype,
110 : keytype,
111 : HASHSTANDARD_PROC);
112 0 : if (!RegProcedureIsValid(hash_proc))
113 0 : elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
114 : HASHSTANDARD_PROC, keytype, keytype,
115 : RelationGetRelationName(rel));
116 0 : collation = rel->rd_indcollation[0];
117 :
118 0 : return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
119 : }
120 :
121 : /*
122 : * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
123 : */
124 : Bucket
125 458518 : _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
126 : uint32 highmask, uint32 lowmask)
127 : {
128 : Bucket bucket;
129 :
130 458518 : bucket = hashkey & highmask;
131 458518 : if (bucket > maxbucket)
132 175145 : bucket = bucket & lowmask;
133 :
134 458518 : return bucket;
135 : }
136 :
137 : /*
138 : * _hash_log2 -- returns ceil(lg2(num))
139 : */
140 : uint32
141 79552 : _hash_log2(uint32 num)
142 : {
143 : uint32 i,
144 : limit;
145 :
146 79552 : limit = 1;
147 79552 : for (i = 0; limit < num; limit <<= 1, i++)
148 : ;
149 79552 : return i;
150 : }
151 :
/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *			bucket
 *
 * Early splitpoint groups (below HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
 * are allocated in a single phase each, so the spare index equals the
 * group number.  Later groups are allocated in multiple phases
 * (2^HASH_SPLITPOINT_PHASE_BITS per group), so we must also count the
 * phase within the group that num_bucket falls into.
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
	uint32		splitpoint_group;
	uint32		splitpoint_phases;

	splitpoint_group = _hash_log2(num_bucket);

	/* single-phase groups: one spare slot per group */
	if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return splitpoint_group;

	/* account for single-phase groups */
	splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* account for multi-phase groups before splitpoint_group */
	splitpoint_phases +=
		((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for phases within current group */
	splitpoint_phases +=
		(((num_bucket - 1) >>
		  (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
		 HASH_SPLITPOINT_PHASE_MASK);	/* to 0-based value. */

	return splitpoint_phases;
}
183 :
/*
 * _hash_get_totalbuckets -- returns total number of buckets allocated till
 *							the given splitpoint phase.
 *
 * This is the inverse of _hash_spareindex: given a spare index / phase
 * number, compute how many buckets exist once that phase's allocation is
 * complete.  Early (single-phase) splitpoints simply double the table;
 * later splitpoints add 1/2^HASH_SPLITPOINT_PHASE_BITS of the group's
 * buckets per phase.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
	uint32		splitpoint_group;
	uint32		total_buckets;
	uint32		phases_within_splitpoint_group;

	/* single-phase groups: phase n ends with 2^n buckets */
	if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return (1 << splitpoint_phase);

	/* get splitpoint's group */
	splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
	splitpoint_group +=
		((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for buckets before splitpoint_group */
	total_buckets = (1 << (splitpoint_group - 1));

	/* account for buckets within splitpoint_group */
	phases_within_splitpoint_group =
		(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
		  HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
	total_buckets +=
		(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
		 phases_within_splitpoint_group);

	return total_buckets;
}
217 :
/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
 * If flags is not zero, it is a bitwise OR of the acceptable page types
 * (values of hasho_flag & LH_PAGE_TYPE).
 *
 * Raises ERRCODE_INDEX_CORRUPTED (suggesting REINDEX) if any check fails.
 * When flags == LH_META_PAGE exactly, the metapage magic number and
 * version are verified as well.
 */
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
{
	Page		page = BufferGetPage(buf);

	/*
	 * ReadBuffer verifies that every newly-read page passes
	 * PageHeaderIsValid, which means it either contains a reasonably sane
	 * page header or is all-zero.  We have to defend against the all-zero
	 * case, however.
	 */
	if (PageIsNew(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
			 errmsg("index \"%s\" contains unexpected zero page at block %u",
					RelationGetRelationName(rel),
					BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/*
	 * Additionally check that the special area looks sane.
	 */
	if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains corrupted page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/* Verify the page is one of the caller-accepted page types. */
	if (flags)
	{
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		if ((opaque->hasho_flag & flags) == 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
				   errmsg("index \"%s\" contains corrupted page at block %u",
						  RelationGetRelationName(rel),
						  BufferGetBlockNumber(buf)),
					 errhint("Please REINDEX it.")));
	}

	/*
	 * When checking the metapage, also verify magic number and version.
	 */
	if (flags == LH_META_PAGE)
	{
		HashMetaPage metap = HashPageGetMeta(page);

		if (metap->hashm_magic != HASH_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a hash index",
							RelationGetRelationName(rel))));

		if (metap->hashm_version != HASH_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has wrong hash version",
							RelationGetRelationName(rel)),
					 errhint("Please REINDEX it.")));
	}
}
288 :
/*
 * hashoptions -- parse hash index reloptions; simple passthrough to the
 * generic reloptions machinery for RELOPT_KIND_HASH.
 */
bytea *
hashoptions(Datum reloptions, bool validate)
{
	return default_reloptions(reloptions, validate, RELOPT_KIND_HASH);
}
294 :
/*
 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
 *
 * Note: reads the first attribute directly as a uint32 rather than going
 * through index_getattr; this relies on the tuple data being suitably
 * aligned for a direct uint32 fetch.
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
	char	   *attp;

	/*
	 * We assume the hash key is the first attribute and can't be null, so
	 * this can be done crudely but very very cheaply ...
	 */
	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
	return *((uint32 *) attp);
}
310 :
311 : /*
312 : * _hash_convert_tuple - convert raw index data to hash key
313 : *
314 : * Inputs: values and isnull arrays for the user data column(s)
315 : * Outputs: values and isnull arrays for the index tuple, suitable for
316 : * passing to index_form_tuple().
317 : *
318 : * Returns true if successful, false if not (because there are null values).
319 : * On a false result, the given data need not be indexed.
320 : *
321 : * Note: callers know that the index-column arrays are always of length 1.
322 : * In principle, there could be more than one input column, though we do not
323 : * currently support that.
324 : */
325 : bool
326 80552 : _hash_convert_tuple(Relation index,
327 : Datum *user_values, bool *user_isnull,
328 : Datum *index_values, bool *index_isnull)
329 : {
330 : uint32 hashkey;
331 :
332 : /*
333 : * We do not insert null values into hash indexes. This is okay because
334 : * the only supported search operator is '=', and we assume it is strict.
335 : */
336 80552 : if (user_isnull[0])
337 0 : return false;
338 :
339 80552 : hashkey = _hash_datum2hashkey(index, user_values[0]);
340 80552 : index_values[0] = UInt32GetDatum(hashkey);
341 80552 : index_isnull[0] = false;
342 80552 : return true;
343 : }
344 :
345 : /*
346 : * _hash_binsearch - Return the offset number in the page where the
347 : * specified hash value should be sought or inserted.
348 : *
349 : * We use binary search, relying on the assumption that the existing entries
350 : * are ordered by hash key.
351 : *
352 : * Returns the offset of the first index entry having hashkey >= hash_value,
353 : * or the page's max offset plus one if hash_value is greater than all
354 : * existing hash keys in the page. This is the appropriate place to start
355 : * a search, or to insert a new item.
356 : */
357 : OffsetNumber
358 97599 : _hash_binsearch(Page page, uint32 hash_value)
359 : {
360 : OffsetNumber upper;
361 : OffsetNumber lower;
362 :
363 : /* Loop invariant: lower <= desired place <= upper */
364 97599 : upper = PageGetMaxOffsetNumber(page) + 1;
365 97599 : lower = FirstOffsetNumber;
366 :
367 867191 : while (upper > lower)
368 : {
369 : OffsetNumber off;
370 : IndexTuple itup;
371 : uint32 hashkey;
372 :
373 671993 : off = (upper + lower) / 2;
374 671993 : Assert(OffsetNumberIsValid(off));
375 :
376 671993 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
377 671993 : hashkey = _hash_get_indextuple_hashkey(itup);
378 671993 : if (hashkey < hash_value)
379 298954 : lower = off + 1;
380 : else
381 373039 : upper = off;
382 : }
383 :
384 97599 : return lower;
385 : }
386 :
387 : /*
388 : * _hash_binsearch_last
389 : *
390 : * Same as above, except that if there are multiple matching items in the
391 : * page, we return the offset of the last one instead of the first one,
392 : * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
393 : * This is handy for starting a new page in a backwards scan.
394 : */
395 : OffsetNumber
396 11 : _hash_binsearch_last(Page page, uint32 hash_value)
397 : {
398 : OffsetNumber upper;
399 : OffsetNumber lower;
400 :
401 : /* Loop invariant: lower <= desired place <= upper */
402 11 : upper = PageGetMaxOffsetNumber(page);
403 11 : lower = FirstOffsetNumber - 1;
404 :
405 121 : while (upper > lower)
406 : {
407 : IndexTuple itup;
408 : OffsetNumber off;
409 : uint32 hashkey;
410 :
411 99 : off = (upper + lower + 1) / 2;
412 99 : Assert(OffsetNumberIsValid(off));
413 :
414 99 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
415 99 : hashkey = _hash_get_indextuple_hashkey(itup);
416 99 : if (hashkey > hash_value)
417 0 : upper = off - 1;
418 : else
419 99 : lower = off;
420 : }
421 :
422 11 : return lower;
423 : }
424 :
/*
 * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
 *			from which current (new) bucket is being split.
 */
BlockNumber
_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
	Bucket		old_bucket;
	uint32		mask;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	/*
	 * To get the old bucket from the current bucket, we need a mask to modulo
	 * into lower half of table.  This mask is stored in meta page as
	 * hashm_lowmask, but here we can't rely on the same, because we need a
	 * value of lowmask that was prevalent at the time when bucket split was
	 * started.  Masking the most significant bit of new bucket would give us
	 * old bucket.
	 */
	/* fls() gives the 1-based position of the highest set bit */
	mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
	old_bucket = new_bucket & mask;

	/* must hold the metapage while using BUCKET_TO_BLKNO's metap fields */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	blkno = BUCKET_TO_BLKNO(metap, old_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}
458 :
/*
 * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
 *			that will be generated after split from old bucket.
 *
 * This is used to find the new bucket from old bucket based on current table
 * half.  It is mainly required to finish the incomplete splits where we are
 * sure that not more than one bucket could have split in progress from old
 * bucket.
 */
BlockNumber
_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
	Bucket		new_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	/* pin the metapage: lowmask/maxbucket/BUCKET_TO_BLKNO all read it */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
													metap->hashm_lowmask,
													metap->hashm_maxbucket);
	blkno = BUCKET_TO_BLKNO(metap, new_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}
488 :
489 : /*
490 : * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
491 : * generated after split from current (old) bucket.
492 : *
493 : * This is used to find the new bucket from old bucket. New bucket can be
494 : * obtained by OR'ing old bucket with most significant bit of current table
495 : * half (lowmask passed in this function can be used to identify msb of
496 : * current table half). There could be multiple buckets that could have
497 : * been split from current bucket. We need the first such bucket that exists.
498 : * Caller must ensure that no more than one split has happened from old
499 : * bucket.
500 : */
501 : Bucket
502 72 : _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
503 : uint32 lowmask, uint32 maxbucket)
504 : {
505 : Bucket new_bucket;
506 :
507 72 : new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
508 72 : if (new_bucket > maxbucket)
509 : {
510 0 : lowmask = lowmask >> 1;
511 0 : new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
512 : }
513 :
514 72 : return new_bucket;
515 : }
516 :
/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 */
void
_hash_kill_items(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	int			numKilled = so->numKilled;
	int			i;
	bool		killedsomething = false;

	Assert(so->numKilled > 0);
	Assert(so->killedItems != NULL);

	/*
	 * Always reset the scan state, so we don't look for same items on other
	 * pages.
	 */
	so->numKilled = 0;

	page = BufferGetPage(so->hashso_curbuf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < numKilled; i++)
	{
		/* start looking at the remembered offset; items may have moved */
		offnum = so->killedItems[i].indexOffset;

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			/* confirm identity via heap TID before marking dead */
			if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as dirty hint. Whenever
	 * we mark anything LP_DEAD, we also set the page's
	 * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
		MarkBufferDirtyHint(so->hashso_curbuf, true);
	}
}
|