LCOV - code coverage report
Current view: top level - src/backend/access/hash - hash.c
Test:      PostgreSQL
Date:      2017-09-29 15:12:54
Coverage:  Lines: 303 of 339 hit (89.4 %)    Functions: 13 of 13 hit (100.0 %)

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * hash.c
       4             :  *    Implementation of Margo Seltzer's Hashing package for postgres.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/hash/hash.c
      12             :  *
      13             :  * NOTES
      14             :  *    This file contains only the public interface routines.
      15             :  *
      16             :  *-------------------------------------------------------------------------
      17             :  */
      18             : 
      19             : #include "postgres.h"
      20             : 
      21             : #include "access/hash.h"
      22             : #include "access/hash_xlog.h"
      23             : #include "access/relscan.h"
      24             : #include "catalog/index.h"
      25             : #include "commands/vacuum.h"
      26             : #include "miscadmin.h"
      27             : #include "optimizer/plancat.h"
      28             : #include "utils/builtins.h"
      29             : #include "utils/index_selfuncs.h"
      30             : #include "utils/rel.h"
      31             :
      32             : 
      33             : 
      34             : /* Working state for hashbuild and its callback */
      35             : typedef struct
      36             : {
      37             :     HSpool     *spool;          /* NULL if not using spooling */
      38             :     double      indtuples;      /* # tuples accepted into index */
      39             :     Relation    heapRel;        /* heap relation descriptor */
      40             : } HashBuildState;
      41             : 
      42             : static void hashbuildCallback(Relation index,
      43             :                   HeapTuple htup,
      44             :                   Datum *values,
      45             :                   bool *isnull,
      46             :                   bool tupleIsAlive,
      47             :                   void *state);
      48             : 
      49             : 
      50             : /*
      51             :  * Hash handler function: return IndexAmRoutine with access method parameters
      52             :  * and callbacks.
      53             :  */
      54             : Datum
      55         134 : hashhandler(PG_FUNCTION_ARGS)
      56             : {
      57         134 :     IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
      58             : 
      59         134 :     amroutine->amstrategies = HTMaxStrategyNumber;
      60         134 :     amroutine->amsupport = HASHNProcs;
      61         134 :     amroutine->amcanorder = false;
      62         134 :     amroutine->amcanorderbyop = false;
      63         134 :     amroutine->amcanbackward = true;
      64         134 :     amroutine->amcanunique = false;
      65         134 :     amroutine->amcanmulticol = false;
      66         134 :     amroutine->amoptionalkey = false;
      67         134 :     amroutine->amsearcharray = false;
      68         134 :     amroutine->amsearchnulls = false;
      69         134 :     amroutine->amstorage = false;
      70         134 :     amroutine->amclusterable = false;
      71         134 :     amroutine->ampredlocks = false;
      72         134 :     amroutine->amcanparallel = false;
      73         134 :     amroutine->amkeytype = INT4OID;
      74             : 
      75         134 :     amroutine->ambuild = hashbuild;
      76         134 :     amroutine->ambuildempty = hashbuildempty;
      77         134 :     amroutine->aminsert = hashinsert;
      78         134 :     amroutine->ambulkdelete = hashbulkdelete;
      79         134 :     amroutine->amvacuumcleanup = hashvacuumcleanup;
      80         134 :     amroutine->amcanreturn = NULL;
      81         134 :     amroutine->amcostestimate = hashcostestimate;
      82         134 :     amroutine->amoptions = hashoptions;
      83         134 :     amroutine->amproperty = NULL;
      84         134 :     amroutine->amvalidate = hashvalidate;
      85         134 :     amroutine->ambeginscan = hashbeginscan;
      86         134 :     amroutine->amrescan = hashrescan;
      87         134 :     amroutine->amgettuple = hashgettuple;
      88         134 :     amroutine->amgetbitmap = hashgetbitmap;
      89         134 :     amroutine->amendscan = hashendscan;
      90         134 :     amroutine->ammarkpos = NULL;
      91         134 :     amroutine->amrestrpos = NULL;
      92         134 :     amroutine->amestimateparallelscan = NULL;
      93         134 :     amroutine->aminitparallelscan = NULL;
      94         134 :     amroutine->amparallelrescan = NULL;
      95             : 
      96         134 :     PG_RETURN_POINTER(amroutine);
      97             : }
      98             : 
      99             : /*
     100             :  *  hashbuild() -- build a new hash index.
     101             :  */
     102             : IndexBuildResult *
     103          14 : hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     104             : {
     105             :     IndexBuildResult *result;
     106             :     BlockNumber relpages;
     107             :     double      reltuples;
     108             :     double      allvisfrac;
     109             :     uint32      num_buckets;
     110             :     long        sort_threshold;
     111             :     HashBuildState buildstate;
     112             : 
     113             :     /*
     114             :      * We expect to be called exactly once for any index relation. If that's
     115             :      * not the case, we're in big trouble.
     116             :      */
     117          14 :     if (RelationGetNumberOfBlocks(index) != 0)
     118           0 :         elog(ERROR, "index \"%s\" already contains data",
     119             :              RelationGetRelationName(index));
     120             : 
     121             :     /* Estimate the number of rows currently present in the table */
     122          14 :     estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
     123             : 
     124             :     /* Initialize the hash index metadata page and initial buckets */
     125          14 :     num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
     126             : 
     127             :     /*
     128             :      * If we just insert the tuples into the index in scan order, then
     129             :      * (assuming their hash codes are pretty random) there will be no locality
     130             :      * of access to the index, and if the index is bigger than available RAM
     131             :      * then we'll thrash horribly.  To prevent that scenario, we can sort the
     132             :      * tuples by (expected) bucket number.  However, such a sort is useless
     133             :      * overhead when the index does fit in RAM.  We choose to sort if the
     134             :      * initial index size exceeds maintenance_work_mem, or the number of
     135             :      * buffers usable for the index, whichever is less.  (Limiting by the
     136             :      * number of buffers should reduce thrashing between PG buffers and kernel
     137             :      * buffers, which seems useful even if no physical I/O results.  Limiting
     138             :      * by maintenance_work_mem is useful to allow easy testing of the sort
     139             :      * code path, and may be useful to DBAs as an additional control knob.)
     140             :      *
     141             :      * NOTE: this test will need adjustment if a bucket is ever different from
     142             :      * one page.  Also, "initial index size" accounting does not include the
     143             :      * metapage, nor the first bitmap page.
     144             :      */
     145          14 :     sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
     146          14 :     if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
     147          13 :         sort_threshold = Min(sort_threshold, NBuffers);
     148             :     else
     149           1 :         sort_threshold = Min(sort_threshold, NLocBuffer);
     150             : 
     151          14 :     if (num_buckets >= (uint32) sort_threshold)
     152           1 :         buildstate.spool = _h_spoolinit(heap, index, num_buckets);
     153             :     else
     154          13 :         buildstate.spool = NULL;
     155             : 
     156             :     /* prepare to build the index */
     157          14 :     buildstate.indtuples = 0;
     158          14 :     buildstate.heapRel = heap;
     159             : 
     160             :     /* do the heap scan */
     161          14 :     reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
     162             :                                    hashbuildCallback, (void *) &buildstate);
     163             : 
     164          14 :     if (buildstate.spool)
     165             :     {
     166             :         /* sort the tuples and insert them into the index */
     167           1 :         _h_indexbuild(buildstate.spool, buildstate.heapRel);
     168           1 :         _h_spooldestroy(buildstate.spool);
     169             :     }
     170             : 
     171             :     /*
     172             :      * Return statistics
     173             :      */
     174          14 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     175             : 
     176          14 :     result->heap_tuples = reltuples;
     177          14 :     result->index_tuples = buildstate.indtuples;
     178             : 
     179          14 :     return result;
     180             : }
     181             : 
     182             : /*
     183             :  *  hashbuildempty() -- build an empty hash index in the initialization fork
     184             :  */
     185             : void
     186           1 : hashbuildempty(Relation index)
     187             : {
     188           1 :     _hash_init(index, 0, INIT_FORKNUM);
     189           1 : }
     190             : 
     191             : /*
     192             :  * Per-tuple callback from IndexBuildHeapScan
     193             :  */
     194             : static void
     195       50543 : hashbuildCallback(Relation index,
     196             :                   HeapTuple htup,
     197             :                   Datum *values,
     198             :                   bool *isnull,
     199             :                   bool tupleIsAlive,
     200             :                   void *state)
     201             : {
     202       50543 :     HashBuildState *buildstate = (HashBuildState *) state;
     203             :     Datum       index_values[1];
     204             :     bool        index_isnull[1];
     205             :     IndexTuple  itup;
     206             : 
     207             :     /* convert data to a hash key; on failure, do not insert anything */
     208       50543 :     if (!_hash_convert_tuple(index,
     209             :                              values, isnull,
     210             :                              index_values, index_isnull))
     211       50543 :         return;
     212             : 
     213             :     /* Either spool the tuple for sorting, or just put it into the index */
     214       50543 :     if (buildstate->spool)
     215       10000 :         _h_spool(buildstate->spool, &htup->t_self,
     216             :                  index_values, index_isnull);
     217             :     else
     218             :     {
     219             :         /* form an index tuple and point it at the heap tuple */
     220       40543 :         itup = index_form_tuple(RelationGetDescr(index),
     221             :                                 index_values, index_isnull);
     222       40543 :         itup->t_tid = htup->t_self;
     223       40543 :         _hash_doinsert(index, itup, buildstate->heapRel);
     224       40543 :         pfree(itup);
     225             :     }
     226             : 
     227       50543 :     buildstate->indtuples += 1;
     228             : }
     229             : 
     230             : /*
     231             :  *  hashinsert() -- insert an index tuple into a hash table.
     232             :  *
      233             :  *  Hash the heap tuple's key and form an index tuple carrying the hash
      234             :  *  code.  Find the appropriate location for the new tuple, and put it there.
     235             :  */
     236             : bool
     237       30009 : hashinsert(Relation rel, Datum *values, bool *isnull,
     238             :            ItemPointer ht_ctid, Relation heapRel,
     239             :            IndexUniqueCheck checkUnique,
     240             :            IndexInfo *indexInfo)
     241             : {
     242             :     Datum       index_values[1];
     243             :     bool        index_isnull[1];
     244             :     IndexTuple  itup;
     245             : 
     246             :     /* convert data to a hash key; on failure, do not insert anything */
     247       30009 :     if (!_hash_convert_tuple(rel,
     248             :                              values, isnull,
     249             :                              index_values, index_isnull))
     250           0 :         return false;
     251             : 
     252             :     /* form an index tuple and point it at the heap tuple */
     253       30009 :     itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
     254       30009 :     itup->t_tid = *ht_ctid;
     255             : 
     256       30009 :     _hash_doinsert(rel, itup, heapRel);
     257             : 
     258       30009 :     pfree(itup);
     259             : 
     260       30009 :     return false;
     261             : }
     262             : 
     263             : 
     264             : /*
     265             :  *  hashgettuple() -- Get the next tuple in the scan.
     266             :  */
     267             : bool
     268       16544 : hashgettuple(IndexScanDesc scan, ScanDirection dir)
     269             : {
     270       16544 :     HashScanOpaque so = (HashScanOpaque) scan->opaque;
     271       16544 :     Relation    rel = scan->indexRelation;
     272             :     Buffer      buf;
     273             :     Page        page;
     274             :     OffsetNumber offnum;
     275             :     ItemPointer current;
     276             :     bool        res;
     277             : 
     278             :     /* Hash indexes are always lossy since we store only the hash code */
     279       16544 :     scan->xs_recheck = true;
     280             : 
     281             :     /*
     282             :      * We hold a pin, but not a lock, on the current buffer while outside
     283             :      * the hash AM.  Reacquire the read lock here.
     284             :      */
     285       16544 :     if (BufferIsValid(so->hashso_curbuf))
     286       16520 :         LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
     287             : 
     288             :     /*
     289             :      * If we've already initialized this scan, we can just advance it in the
     290             :      * appropriate direction.  If we haven't done so yet, we call a routine to
     291             :      * get the first item in the scan.
     292             :      */
     293       16544 :     current = &(so->hashso_curpos);
     294       16544 :     if (ItemPointerIsValid(current))
     295             :     {
     296             :         /*
     297             :          * An insertion into the current index page could have happened while
     298             :          * we didn't have read lock on it.  Re-find our position by looking
     299             :          * for the TID we previously returned.  (Because we hold a pin on the
     300             :          * primary bucket page, no deletions or splits could have occurred;
     301             :          * therefore we can expect that the TID still exists in the current
     302             :          * index page, at an offset >= where we were.)
     303             :          */
     304             :         OffsetNumber maxoffnum;
     305             : 
     306       16520 :         buf = so->hashso_curbuf;
     307       16520 :         Assert(BufferIsValid(buf));
     308       16520 :         page = BufferGetPage(buf);
     309             : 
     310             :         /*
     311             :          * We don't need to test for an old snapshot here: the current
     312             :          * buffer is pinned, so vacuum can't clean the page.
     313             :          */
     314       16520 :         maxoffnum = PageGetMaxOffsetNumber(page);
     315       33043 :         for (offnum = ItemPointerGetOffsetNumber(current);
     316             :              offnum <= maxoffnum;
     317           3 :              offnum = OffsetNumberNext(offnum))
     318             :         {
     319             :             IndexTuple  itup;
     320             : 
     321       16523 :             itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     322       16523 :             if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
     323       16520 :                 break;
     324             :         }
     325       16520 :         if (offnum > maxoffnum)
     326           0 :             elog(ERROR, "failed to re-find scan position within index \"%s\"",
     327             :                  RelationGetRelationName(rel));
     328       16520 :         ItemPointerSetOffsetNumber(current, offnum);
     329             : 
     330             :         /*
     331             :          * Check to see if we should kill the previously-fetched tuple.
     332             :          */
     333       16520 :         if (scan->kill_prior_tuple)
     334             :         {
     335             :             /*
     336             :              * Yes, so remember it for later. (We'll deal with all such tuples
     337             :              * at once right after leaving the index page or at end of scan.)
     338             :              * If the caller reverses the indexscan direction, it is quite
     339             :              * possible that the same item might get entered multiple times.
     340             :              * We don't detect that; instead, we just forget any excess
     341             :              * entries.
     342             :              */
     343           0 :             if (so->killedItems == NULL)
     344           0 :                 so->killedItems = palloc(MaxIndexTuplesPerPage *
     345             :                                          sizeof(HashScanPosItem));
     346             : 
     347           0 :             if (so->numKilled < MaxIndexTuplesPerPage)
     348             :             {
     349           0 :                 so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
     350           0 :                 so->killedItems[so->numKilled].indexOffset =
     351           0 :                     ItemPointerGetOffsetNumber(&(so->hashso_curpos));
     352           0 :                 so->numKilled++;
     353             :             }
     354             :         }
     355             : 
     356             :         /*
     357             :          * Now continue the scan.
     358             :          */
     359       16520 :         res = _hash_next(scan, dir);
     360             :     }
     361             :     else
     362          24 :         res = _hash_first(scan, dir);
     363             : 
     364             :     /*
     365             :      * Skip killed tuples if asked to.
     366             :      */
     367       16544 :     if (scan->ignore_killed_tuples)
     368             :     {
     369       33088 :         while (res)
     370             :         {
     371       16520 :             offnum = ItemPointerGetOffsetNumber(current);
     372       16520 :             page = BufferGetPage(so->hashso_curbuf);
     373       16520 :             if (!ItemIdIsDead(PageGetItemId(page, offnum)))
     374       16520 :                 break;
     375           0 :             res = _hash_next(scan, dir);
     376             :         }
     377             :     }
     378             : 
     379             :     /* Release read lock on current buffer, but keep it pinned */
     380       16544 :     if (BufferIsValid(so->hashso_curbuf))
     381       16520 :         LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
     382             : 
     383             :     /* Return current heap TID on success */
     384       16544 :     scan->xs_ctup.t_self = so->hashso_heappos;
     385             : 
     386       16544 :     return res;
     387             : }
     388             : 
     389             : 
     390             : /*
     391             :  *  hashgetbitmap() -- get all tuples at once
     392             :  */
     393             : int64
     394           1 : hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     395             : {
     396           1 :     HashScanOpaque so = (HashScanOpaque) scan->opaque;
     397             :     bool        res;
     398           1 :     int64       ntids = 0;
     399             : 
     400           1 :     res = _hash_first(scan, ForwardScanDirection);
     401             : 
     402          16 :     while (res)
     403             :     {
     404             :         bool        add_tuple;
     405             : 
     406             :         /*
     407             :          * Skip killed tuples if asked to.
     408             :          */
     409          14 :         if (scan->ignore_killed_tuples)
     410             :         {
     411             :             Page        page;
     412             :             OffsetNumber offnum;
     413             : 
     414          14 :             offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
     415          14 :             page = BufferGetPage(so->hashso_curbuf);
     416          14 :             add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
     417             :         }
     418             :         else
     419           0 :             add_tuple = true;
     420             : 
     421             :         /* Save tuple ID, and continue scanning */
     422          14 :         if (add_tuple)
     423             :         {
     424             :             /* Note we mark the tuple ID as requiring recheck */
     425          14 :             tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true);
     426          14 :             ntids++;
     427             :         }
     428             : 
     429          14 :         res = _hash_next(scan, ForwardScanDirection);
     430             :     }
     431             : 
     432           1 :     return ntids;
     433             : }
     434             : 
     435             : 
     436             : /*
     437             :  *  hashbeginscan() -- start a scan on a hash index
     438             :  */
     439             : IndexScanDesc
     440          24 : hashbeginscan(Relation rel, int nkeys, int norderbys)
     441             : {
     442             :     IndexScanDesc scan;
     443             :     HashScanOpaque so;
     444             : 
     445             :     /* no order by operators allowed */
     446          24 :     Assert(norderbys == 0);
     447             : 
     448          24 :     scan = RelationGetIndexScan(rel, nkeys, norderbys);
     449             : 
     450          24 :     so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
     451          24 :     so->hashso_curbuf = InvalidBuffer;
     452          24 :     so->hashso_bucket_buf = InvalidBuffer;
     453          24 :     so->hashso_split_bucket_buf = InvalidBuffer;
     454             :     /* set position invalid (this will cause _hash_first call) */
     455          24 :     ItemPointerSetInvalid(&(so->hashso_curpos));
     456          24 :     ItemPointerSetInvalid(&(so->hashso_heappos));
     457             : 
     458          24 :     so->hashso_buc_populated = false;
     459          24 :     so->hashso_buc_split = false;
     460             : 
     461          24 :     so->killedItems = NULL;
     462          24 :     so->numKilled = 0;
     463             : 
     464          24 :     scan->opaque = so;
     465             : 
     466          24 :     return scan;
     467             : }
     468             : 
     469             : /*
     470             :  *  hashrescan() -- rescan an index relation
     471             :  */
     472             : void
     473          25 : hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
     474             :            ScanKey orderbys, int norderbys)
     475             : {
     476          25 :     HashScanOpaque so = (HashScanOpaque) scan->opaque;
     477          25 :     Relation    rel = scan->indexRelation;
     478             : 
     479             :     /*
     480             :      * Before leaving the current page, deal with any killed items.  We must
     481             :      * acquire a lock on the current page before calling _hash_kill_items.
     482             :      */
     483          25 :     if (so->numKilled > 0)
     484             :     {
     485           0 :         LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
     486           0 :         _hash_kill_items(scan);
     487           0 :         LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
     488             :     }
     489             : 
     490          25 :     _hash_dropscanbuf(rel, so);
     491             : 
     492             :     /* set position invalid (this will cause _hash_first call) */
     493          25 :     ItemPointerSetInvalid(&(so->hashso_curpos));
     494          25 :     ItemPointerSetInvalid(&(so->hashso_heappos));
     495             : 
     496             :     /* Update scan key, if a new one is given */
     497          25 :     if (scankey && scan->numberOfKeys > 0)
     498             :     {
     499          25 :         memmove(scan->keyData,
     500             :                 scankey,
     501          25 :                 scan->numberOfKeys * sizeof(ScanKeyData));
     502             :     }
     503             : 
     504          25 :     so->hashso_buc_populated = false;
     505          25 :     so->hashso_buc_split = false;
     506          25 : }
     507             : 
     508             : /*
     509             :  *  hashendscan() -- close down a scan
     510             :  */
     511             : void
     512          24 : hashendscan(IndexScanDesc scan)
     513             : {
     514          24 :     HashScanOpaque so = (HashScanOpaque) scan->opaque;
     515          24 :     Relation    rel = scan->indexRelation;
     516             : 
     517             :     /*
     518             :      * Before leaving the current page, deal with any killed items.  We must
     519             :      * acquire a lock on the current page before calling _hash_kill_items.
     520             :      */
     521          24 :     if (so->numKilled > 0)
     522             :     {
     523           0 :         LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
     524           0 :         _hash_kill_items(scan);
     525           0 :         LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
     526             :     }
     527             : 
     528          24 :     _hash_dropscanbuf(rel, so);
     529             : 
     530          24 :     if (so->killedItems != NULL)
     531           0 :         pfree(so->killedItems);
     532          24 :     pfree(so);
     533          24 :     scan->opaque = NULL;
     534          24 : }
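
Taken together, hashbeginscan/hashrescan/hashgettuple/hashendscan implement the
standard index-scan lifecycle.  A hedged caller-side sketch using the indexam.c
wrappers (simplified; heap/index/snapshot setup and error handling omitted):

    IndexScanDesc scan;
    HeapTuple     tup;
    ScanKeyData   skey;     /* assume initialized with ScanKeyInit() */

    scan = index_beginscan(heap, index, snapshot, 1, 0);    /* hashbeginscan */
    index_rescan(scan, &skey, 1, NULL, 0);                  /* hashrescan */
    while ((tup = index_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* hashgettuple set xs_recheck, so the quals must be re-evaluated
         * against the heap tuple */
    }
    index_endscan(scan);                                    /* hashendscan */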
     535             : 
     536             : /*
     537             :  * Bulk deletion of all index entries pointing to a set of heap tuples.
     538             :  * The set of target tuples is specified via a callback routine that tells
     539             :  * whether any given heap tuple (identified by ItemPointer) is being deleted.
     540             :  *
     541             :  * This function also deletes the tuples that were moved to another bucket
     542             :  * by a split.
     543             :  *
     544             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     545             :  */
     546             : IndexBulkDeleteResult *
     547           1 : hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     548             :                IndexBulkDeleteCallback callback, void *callback_state)
     549             : {
     550           1 :     Relation    rel = info->index;
     551             :     double      tuples_removed;
     552             :     double      num_index_tuples;
     553             :     double      orig_ntuples;
     554             :     Bucket      orig_maxbucket;
     555             :     Bucket      cur_maxbucket;
     556             :     Bucket      cur_bucket;
     557           1 :     Buffer      metabuf = InvalidBuffer;
     558             :     HashMetaPage metap;
     559             :     HashMetaPage cachedmetap;
     560             : 
     561           1 :     tuples_removed = 0;
     562           1 :     num_index_tuples = 0;
     563             : 
     564             :     /*
     565             :      * We need a copy of the metapage so that we can use its hashm_spares[]
     566             :      * values to compute bucket page addresses, but a cached copy should be
     567             :      * good enough.  (If not, we'll detect that further down and refresh the
     568             :      * cache as necessary.)
     569             :      */
     570           1 :     cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
     571           1 :     Assert(cachedmetap != NULL);
     572             : 
     573           1 :     orig_maxbucket = cachedmetap->hashm_maxbucket;
     574           1 :     orig_ntuples = cachedmetap->hashm_ntuples;
     575             : 
     576             :     /* Scan the buckets that we know exist */
     577           1 :     cur_bucket = 0;
     578           1 :     cur_maxbucket = orig_maxbucket;
     579             : 
     580             : loop_top:
     581          82 :     while (cur_bucket <= cur_maxbucket)
     582             :     {
     583             :         BlockNumber bucket_blkno;
     584             :         BlockNumber blkno;
     585             :         Buffer      bucket_buf;
     586             :         Buffer      buf;
     587             :         HashPageOpaque bucket_opaque;
     588             :         Page        page;
     589          80 :         bool        split_cleanup = false;
     590             : 
     591             :         /* Get address of bucket's start page */
     592          80 :         bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
     593             : 
     594          80 :         blkno = bucket_blkno;
     595             : 
     596             :         /*
     597             :          * We need to acquire a cleanup lock on the primary bucket page to
     598             :          * wait out concurrent scans before deleting the dead tuples.
     599             :          */
     600          80 :         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
     601          80 :         LockBufferForCleanup(buf);
     602          80 :         _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
     603             : 
     604          80 :         page = BufferGetPage(buf);
     605          80 :         bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
     606             : 
     607             :         /*
     608             :          * If the bucket contains tuples that were moved by a split, we need
     609             :          * to delete them.  We can't do so while the split operation on the
     610             :          * bucket is unfinished, since concurrent scans still need those tuples.
     611             :          */
     612         160 :         if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
     613          80 :             H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
     614             :         {
     615           0 :             split_cleanup = true;
     616             : 
     617             :             /*
     618             :              * This bucket might have been split since we last held a lock on
     619             :              * the metapage.  If so, hashm_maxbucket, hashm_highmask and
     620             :              * hashm_lowmask might be old enough to cause us to fail to remove
     621             :              * tuples left behind by the most recent split.  To prevent that,
     622             :              * now that the primary page of the target bucket has been locked
     623             :              * (and thus can't be further split), check whether we need to
     624             :              * update our cached metapage data.
     625             :              */
     626           0 :             Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
     627           0 :             if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
     628             :             {
     629           0 :                 cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
     630           0 :                 Assert(cachedmetap != NULL);
     631             :             }
     632             :         }
     633             : 
     634          80 :         bucket_buf = buf;
     635             : 
     636          80 :         hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
     637             :                           cachedmetap->hashm_maxbucket,
     638             :                           cachedmetap->hashm_highmask,
     639             :                           cachedmetap->hashm_lowmask, &tuples_removed,
     640             :                           &num_index_tuples, split_cleanup,
     641             :                           callback, callback_state);
     642             : 
     643          80 :         _hash_dropbuf(rel, bucket_buf);
     644             : 
     645             :         /* Advance to next bucket */
     646          80 :         cur_bucket++;
     647             :     }
     648             : 
     649           1 :     if (BufferIsInvalid(metabuf))
     650           0 :         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
     651             : 
     652             :     /* Write-lock metapage and check for split since we started */
     653           1 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     654           1 :     metap = HashPageGetMeta(BufferGetPage(metabuf));
     655             : 
     656           1 :     if (cur_maxbucket != metap->hashm_maxbucket)
     657             :     {
     658             :         /* There's been a split, so process the additional bucket(s) */
     659           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     660           0 :         cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
     661           0 :         Assert(cachedmetap != NULL);
     662           0 :         cur_maxbucket = cachedmetap->hashm_maxbucket;
     663           0 :         goto loop_top;
     664             :     }
     665             : 
     666             :     /* Okay, we're really done.  Update tuple count in metapage. */
     667           1 :     START_CRIT_SECTION();
     668             : 
     669           2 :     if (orig_maxbucket == metap->hashm_maxbucket &&
     670           1 :         orig_ntuples == metap->hashm_ntuples)
     671             :     {
     672             :         /*
     673             :          * No one has split or inserted anything since start of scan, so
     674             :          * believe our count as gospel.
     675             :          */
     676           1 :         metap->hashm_ntuples = num_index_tuples;
     677             :     }
     678             :     else
     679             :     {
     680             :         /*
     681             :          * Otherwise, our count is untrustworthy since we may have
     682             :          * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
     683             :          * (Note: we still return estimated_count = false, because using this
     684             :          * count is better than not updating reltuples at all.)
     685             :          */
     686           0 :         if (metap->hashm_ntuples > tuples_removed)
     687           0 :             metap->hashm_ntuples -= tuples_removed;
     688             :         else
     689           0 :             metap->hashm_ntuples = 0;
     690           0 :         num_index_tuples = metap->hashm_ntuples;
     691             :     }
     692             : 
     693           1 :     MarkBufferDirty(metabuf);
     694             : 
     695             :     /* XLOG stuff */
     696           1 :     if (RelationNeedsWAL(rel))
     697             :     {
     698             :         xl_hash_update_meta_page xlrec;
     699             :         XLogRecPtr  recptr;
     700             : 
     701           1 :         xlrec.ntuples = metap->hashm_ntuples;
     702             : 
     703           1 :         XLogBeginInsert();
     704           1 :         XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
     705             : 
     706           1 :         XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
     707             : 
     708           1 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
     709           1 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     710             :     }
     711             : 
     712           1 :     END_CRIT_SECTION();
     713             : 
     714           1 :     _hash_relbuf(rel, metabuf);
     715             : 
     716             :     /* return statistics */
     717           1 :     if (stats == NULL)
     718           1 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
     719           1 :     stats->estimated_count = false;
     720           1 :     stats->num_index_tuples = num_index_tuples;
     721           1 :     stats->tuples_removed += tuples_removed;
     722             :     /* hashvacuumcleanup will fill in num_pages */
     723             : 
     724           1 :     return stats;
     725             : }
     726             : 
     727             : /*
     728             :  * Post-VACUUM cleanup.
     729             :  *
     730             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     731             :  */
     732             : IndexBulkDeleteResult *
     733           5 : hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
     734             : {
     735           5 :     Relation    rel = info->index;
     736             :     BlockNumber num_pages;
     737             : 
     738             :     /* If hashbulkdelete wasn't called, return NULL signifying no change */
     739             :     /* Note: this covers the analyze_only case too */
     740           5 :     if (stats == NULL)
     741           4 :         return NULL;
     742             : 
     743             :     /* update statistics */
     744           1 :     num_pages = RelationGetNumberOfBlocks(rel);
     745           1 :     stats->num_pages = num_pages;
     746             : 
     747           1 :     return stats;
     748             : }
     749             : 
     750             : /*
     751             :  * Helper function to perform deletion of index entries from a bucket.
     752             :  *
     753             :  * This function expects that the caller has acquired a cleanup lock on the
     754             :  * primary bucket page, and will return with a write lock again held on the
     755             :  * primary bucket page.  The lock won't necessarily be held continuously,
     756             :  * though, because we'll release it when visiting overflow pages.
     757             :  *
     758             :  * It would be very bad if this function cleaned a page while some other
     759             :  * backend was in the midst of scanning it, because hashgettuple assumes
     760             :  * that the next valid TID will be greater than or equal to the current
     761             :  * valid TID.  There can't be any concurrent scans in progress when we first
     762             :  * enter this function because of the cleanup lock we hold on the primary
     763             :  * bucket page, but as soon as we release that lock, there might be.  We
     764             :  * handle that by conspiring to prevent those scans from passing our cleanup
     765             :  * scan.  To do that, we lock the next page in the bucket chain before
     766             :  * releasing the lock on the previous page.  (This type of lock chaining is
     767             :  * not ideal, so we might want to look for a better solution at some point.)
     768             :  *
     769             :  * We need to retain a pin on the primary bucket to ensure that no concurrent
     770             :  * split can start.
     771             :  */
     772             : void
     773         153 : hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
     774             :                   BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
     775             :                   uint32 maxbucket, uint32 highmask, uint32 lowmask,
     776             :                   double *tuples_removed, double *num_index_tuples,
     777             :                   bool split_cleanup,
     778             :                   IndexBulkDeleteCallback callback, void *callback_state)
     779             : {
     780             :     BlockNumber blkno;
     781             :     Buffer      buf;
     782         153 :     Bucket      new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
     783         153 :     bool        bucket_dirty = false;
     784             : 
     785         153 :     blkno = bucket_blkno;
     786         153 :     buf = bucket_buf;
     787             : 
     788         153 :     if (split_cleanup)
     789          73 :         new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
     790             :                                                         lowmask, maxbucket);
     791             : 
     792             :     /* Scan each page in bucket */
     793             :     for (;;)
     794             :     {
     795             :         HashPageOpaque opaque;
     796             :         OffsetNumber offno;
     797             :         OffsetNumber maxoffno;
     798             :         Buffer      next_buf;
     799             :         Page        page;
     800             :         OffsetNumber deletable[MaxOffsetNumber];
     801         215 :         int         ndeletable = 0;
     802         215 :         bool        retain_pin = false;
     803         215 :         bool        clear_dead_marking = false;
     804             : 
     805         215 :         vacuum_delay_point();
     806             : 
     807         215 :         page = BufferGetPage(buf);
     808         215 :         opaque = (HashPageOpaque) PageGetSpecialPointer(page);
     809             : 
     810             :         /* Scan each tuple in page */
     811         215 :         maxoffno = PageGetMaxOffsetNumber(page);
     812       75123 :         for (offno = FirstOffsetNumber;
     813             :              offno <= maxoffno;
     814       74693 :              offno = OffsetNumberNext(offno))
     815             :         {
     816             :             ItemPointer htup;
     817             :             IndexTuple  itup;
     818             :             Bucket      bucket;
     819       74693 :             bool        kill_tuple = false;
     820             : 
     821       74693 :             itup = (IndexTuple) PageGetItem(page,
     822             :                                             PageGetItemId(page, offno));
     823       74693 :             htup = &(itup->t_tid);
     824             : 
     825             :             /*
     826             :              * To remove the dead tuples, we strictly rely on the results of
     827             :              * the callback function; see btvacuumpage for the detailed reason.
     828             :              */
     829       74693 :             if (callback && callback(htup, callback_state))
     830             :             {
     831        5500 :                 kill_tuple = true;
     832       11000 :                 if (tuples_removed)
     833        5500 :                     *tuples_removed += 1;
     834             :             }
     835       69193 :             else if (split_cleanup)
     836             :             {
     837             :                 /* delete the tuples that are moved by split. */
     838       44193 :                 bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
     839             :                                               maxbucket,
     840             :                                               highmask,
     841             :                                               lowmask);
     842             :                 /* mark the item for deletion */
     843       44193 :                 if (bucket != cur_bucket)
     844             :                 {
     845             :                     /*
     846             :                      * We expect tuples to belong either to the current
     847             :                      * bucket or to new_bucket.  This is ensured because we
     848             :                      * don't allow further splits of a bucket that contains
     849             :                      * garbage.  See comments in _hash_expandtable.
     850             :                      */
     851       16684 :                     Assert(bucket == new_bucket);
     852       16684 :                     kill_tuple = true;
     853             :                 }
     854             :             }
     855             : 
     856       74693 :             if (kill_tuple)
     857             :             {
     858             :                 /* mark the item for deletion */
     859       22184 :                 deletable[ndeletable++] = offno;
     860             :             }
     861             :             else
     862             :             {
     863             :                 /* we're keeping it, so count it */
     864       52509 :                 if (num_index_tuples)
     865       25000 :                     *num_index_tuples += 1;
     866             :             }
     867             :         }
     868             : 
     869             :         /* retain the pin on primary bucket page till end of bucket scan */
     870         215 :         if (blkno == bucket_blkno)
     871         153 :             retain_pin = true;
     872             :         else
     873          62 :             retain_pin = false;
     874             : 
     875         215 :         blkno = opaque->hasho_nextblkno;
     876             : 
     877             :         /*
     878             :          * Apply deletions, advance to next page and write page if needed.
     879             :          */
     880         215 :         if (ndeletable > 0)
     881             :         {
     882             :             /* No ereport(ERROR) until changes are logged */
     883          98 :             START_CRIT_SECTION();
     884             : 
     885          98 :             PageIndexMultiDelete(page, deletable, ndeletable);
     886          98 :             bucket_dirty = true;
     887             : 
     888             :             /*
     889             :              * If vacuum removed the DEAD tuples from the index page, mark
     890             :              * the page as clean again by clearing the
     891             :              * LH_PAGE_HAS_DEAD_TUPLES flag.
     892             :              */
     893         109 :             if (tuples_removed && *tuples_removed > 0 &&
     894          11 :                 H_HAS_DEAD_TUPLES(opaque))
     895             :             {
     896           0 :                 opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
     897           0 :                 clear_dead_marking = true;
     898             :             }
     899             : 
     900          98 :             MarkBufferDirty(buf);
     901             : 
     902             :             /* XLOG stuff */
     903          98 :             if (RelationNeedsWAL(rel))
     904             :             {
     905             :                 xl_hash_delete xlrec;
     906             :                 XLogRecPtr  recptr;
     907             : 
     908          98 :                 xlrec.clear_dead_marking = clear_dead_marking;
     909          98 :                 xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
     910             : 
     911          98 :                 XLogBeginInsert();
     912          98 :                 XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
     913             : 
     914             :                 /*
     915             :                  * The bucket buffer needs to be registered to ensure that we
     916             :                  * can acquire a cleanup lock on it during replay.
     917             :                  */
     918          98 :                 if (!xlrec.is_primary_bucket_page)
     919          33 :                     XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
     920             : 
     921          98 :                 XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
     922          98 :                 XLogRegisterBufData(1, (char *) deletable,
     923          98 :                                     ndeletable * sizeof(OffsetNumber));
     924             : 
     925          98 :                 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
     926          98 :                 PageSetLSN(BufferGetPage(buf), recptr);
     927             :             }
     928             : 
     929          98 :             END_CRIT_SECTION();
     930             :         }
     931             : 
     932             :         /* bail out if there are no more pages to scan. */
     933         215 :         if (!BlockNumberIsValid(blkno))
     934         153 :             break;
     935             : 
     936          62 :         next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
     937             :                                               LH_OVERFLOW_PAGE,
     938             :                                               bstrategy);
     939             : 
     940             :         /*
     941             :          * Release the lock on the previous page only after acquiring the
     942             :          * lock on the next page.
     943             :          */
     944          62 :         if (retain_pin)
     945          13 :             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     946             :         else
     947          49 :             _hash_relbuf(rel, buf);
     948             : 
     949          62 :         buf = next_buf;
     950          62 :     }
     951             : 
     952             :     /*
     953             :      * Lock the bucket page so that we can clear the garbage flag and squeeze
     954             :      * the bucket.  If the current buffer is the same as the bucket buffer,
     955             :      * we already have a lock on the bucket page.
     956             :      */
     957         153 :     if (buf != bucket_buf)
     958             :     {
     959          13 :         _hash_relbuf(rel, buf);
     960          13 :         LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
     961             :     }
     962             : 
     963             :     /*
     964             :      * Clear the garbage flag from the bucket after deleting the tuples that
     965             :      * were moved by a split.  We purposely clear the flag before squeezing
     966             :      * the bucket, so that after a restart, vacuum won't again try to delete
     967             :      * the moved-by-split tuples.
     968             :      */
     969         153 :     if (split_cleanup)
     970             :     {
     971             :         HashPageOpaque bucket_opaque;
     972             :         Page        page;
     973             : 
     974          73 :         page = BufferGetPage(bucket_buf);
     975          73 :         bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
     976             : 
     977             :         /* No ereport(ERROR) until changes are logged */
     978          73 :         START_CRIT_SECTION();
     979             : 
     980          73 :         bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
     981          73 :         MarkBufferDirty(bucket_buf);
     982             : 
     983             :         /* XLOG stuff */
     984          73 :         if (RelationNeedsWAL(rel))
     985             :         {
     986             :             XLogRecPtr  recptr;
     987             : 
     988          73 :             XLogBeginInsert();
     989          73 :             XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
     990             : 
     991          73 :             recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
     992          73 :             PageSetLSN(page, recptr);
     993             :         }
     994             : 
     995          73 :         END_CRIT_SECTION();
     996             :     }
     997             : 
     998             :     /*
     999             :      * If we have deleted anything, try to compact free space.  We must hold
    1000             :      * a cleanup lock to squeeze the bucket; otherwise the squeeze could
    1001             :      * change the ordering of tuples seen by a scan that started before it.
    1002             :      */
    1003         153 :     if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
    1004          67 :         _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
    1005             :                             bstrategy);
    1006             :     else
    1007          86 :         LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
    1008         153 : }
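
The hand-over-hand lock chaining in the loop above generalizes to any walker of
a bucket chain that must not be overtaken by concurrent scans.  A minimal
hedged sketch, with a hypothetical walker name invented for illustration (the
initial buffer is assumed already locked):

    /* Acquire the next page's lock before releasing the current one,
     * so a reader following hasho_nextblkno can never pass us. */
    static void
    walk_bucket_chain(Relation rel, Buffer buf, BufferAccessStrategy bstrategy)
    {
        for (;;)
        {
            Page            page = BufferGetPage(buf);
            HashPageOpaque  opaque = (HashPageOpaque) PageGetSpecialPointer(page);
            BlockNumber     next = opaque->hasho_nextblkno;
            Buffer          next_buf;

            /* ... inspect or modify the page here ... */

            if (!BlockNumberIsValid(next))
                break;
            next_buf = _hash_getbuf_with_strategy(rel, next, HASH_WRITE,
                                                  LH_OVERFLOW_PAGE, bstrategy);
            _hash_relbuf(rel, buf);     /* release old only after new is held */
            buf = next_buf;
        }
        _hash_relbuf(rel, buf);
    }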

Generated by: LCOV version 1.11