LCOV - code coverage report
Current view: top level - src/backend/access/tablesample - bernoulli.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 57 57 100.0 %
Date: 2017-09-29 13:40:31 Functions: 5 5 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * bernoulli.c
       4             :  *    support routines for BERNOULLI tablesample method
       5             :  *
       6             :  * To ensure repeatability of samples, it is necessary that selection of a
       7             :  * given tuple be history-independent; otherwise syncscanning would break
       8             :  * repeatability, to say nothing of logically-irrelevant maintenance such
       9             :  * as physical extension or shortening of the relation.
      10             :  *
      11             :  * To achieve that, we proceed by hashing each candidate TID together with
      12             :  * the active seed, and then selecting it if the hash is less than the
      13             :  * cutoff value computed from the selection probability by BeginSampleScan.
      14             :  *
      15             :  *
      16             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
      17             :  * Portions Copyright (c) 1994, Regents of the University of California
      18             :  *
      19             :  * IDENTIFICATION
      20             :  *    src/backend/access/tablesample/bernoulli.c
      21             :  *
      22             :  *-------------------------------------------------------------------------
      23             :  */
      24             : 
      25             : #include "postgres.h"
      26             : 
      27             : #ifdef _MSC_VER
      28             : #include <float.h>                /* for _isnan */
      29             : #endif
      30             : #include <math.h>
      31             : 
      32             : #include "access/hash.h"
      33             : #include "access/tsmapi.h"
      34             : #include "catalog/pg_type.h"
      35             : #include "optimizer/clauses.h"
      36             : #include "optimizer/cost.h"
      37             : #include "utils/builtins.h"
      38             : 
      39             : 
      40             : /* Private state */
      41             : typedef struct
      42             : {
      43             :     uint64      cutoff;         /* select tuples with hash less than this */
      44             :     uint32      seed;           /* random seed */
      45             :     OffsetNumber lt;            /* last tuple returned from current block */
      46             : } BernoulliSamplerData;
      47             : 
      48             : 
      49             : static void bernoulli_samplescangetsamplesize(PlannerInfo *root,
      50             :                                   RelOptInfo *baserel,
      51             :                                   List *paramexprs,
      52             :                                   BlockNumber *pages,
      53             :                                   double *tuples);
      54             : static void bernoulli_initsamplescan(SampleScanState *node,
      55             :                          int eflags);
      56             : static void bernoulli_beginsamplescan(SampleScanState *node,
      57             :                           Datum *params,
      58             :                           int nparams,
      59             :                           uint32 seed);
      60             : static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node,
      61             :                           BlockNumber blockno,
      62             :                           OffsetNumber maxoffset);
      63             : 
      64             : 
      65             : /*
      66             :  * Create a TsmRoutine descriptor for the BERNOULLI method.
      67             :  */
      68             : Datum
      69          76 : tsm_bernoulli_handler(PG_FUNCTION_ARGS)
      70             : {
      71          76 :     TsmRoutine *tsm = makeNode(TsmRoutine);
      72             : 
      73          76 :     tsm->parameterTypes = list_make1_oid(FLOAT4OID);
      74          76 :     tsm->repeatable_across_queries = true;
      75          76 :     tsm->repeatable_across_scans = true;
      76          76 :     tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize;
      77          76 :     tsm->InitSampleScan = bernoulli_initsamplescan;
      78          76 :     tsm->BeginSampleScan = bernoulli_beginsamplescan;
      79          76 :     tsm->NextSampleBlock = NULL;
      80          76 :     tsm->NextSampleTuple = bernoulli_nextsampletuple;
      81          76 :     tsm->EndSampleScan = NULL;
      82             : 
      83          76 :     PG_RETURN_POINTER(tsm);
      84             : }
      85             : 
      86             : /*
      87             :  * Sample size estimation.
      88             :  */
      89             : static void
      90          20 : bernoulli_samplescangetsamplesize(PlannerInfo *root,
      91             :                                   RelOptInfo *baserel,
      92             :                                   List *paramexprs,
      93             :                                   BlockNumber *pages,
      94             :                                   double *tuples)
      95             : {
      96             :     Node       *pctnode;
      97             :     float4      samplefract;
      98             : 
      99             :     /* Try to extract an estimate for the sample percentage */
     100          20 :     pctnode = (Node *) linitial(paramexprs);
     101          20 :     pctnode = estimate_expression_value(root, pctnode);
     102             : 
     103          37 :     if (IsA(pctnode, Const) &&
     104          17 :         !((Const *) pctnode)->constisnull)
     105             :     {
     106          17 :         samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
     107          34 :         if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
     108          15 :             samplefract /= 100.0f;
     109             :         else
     110             :         {
     111             :             /* Default samplefract if the value is bogus */
     112           2 :             samplefract = 0.1f;
     113             :         }
     114             :     }
     115             :     else
     116             :     {
     117             :         /* Default samplefract if we didn't obtain a non-null Const */
     118           3 :         samplefract = 0.1f;
     119             :     }
     120             : 
     121             :     /* We'll visit all pages of the baserel */
     122          20 :     *pages = baserel->pages;
     123             : 
     124          20 :     *tuples = clamp_row_est(baserel->tuples * samplefract);
     125          20 : }
     126             : 
     127             : /*
     128             :  * Initialize during executor setup.
     129             :  */
     130             : static void
     131          20 : bernoulli_initsamplescan(SampleScanState *node, int eflags)
     132             : {
     133          20 :     node->tsm_state = palloc0(sizeof(BernoulliSamplerData));
     134          20 : }
     135             : 
     136             : /*
     137             :  * Examine parameters and prepare for a sample scan.
     138             :  */
     139             : static void
     140          15 : bernoulli_beginsamplescan(SampleScanState *node,
     141             :                           Datum *params,
     142             :                           int nparams,
     143             :                           uint32 seed)
     144             : {
     145          15 :     BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
     146          15 :     double      percent = DatumGetFloat4(params[0]);
     147             :     double      dcutoff;
     148             : 
     149          15 :     if (percent < 0 || percent > 100 || isnan(percent))
     150           2 :         ereport(ERROR,
     151             :                 (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
     152             :                  errmsg("sample percentage must be between 0 and 100")));
     153             : 
     154             :     /*
     155             :      * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
     156             :      * store that as a uint64, of course.  Note that this gives strictly
     157             :      * correct behavior at the limits of zero or one probability.
     158             :      */
     159          13 :     dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
     160          13 :     sampler->cutoff = (uint64) dcutoff;
     161          13 :     sampler->seed = seed;
     162          13 :     sampler->lt = InvalidOffsetNumber;
     163             : 
     164             :     /*
     165             :      * Use bulkread, since we're scanning all pages.  But pagemode visibility
     166             :      * checking is a win only at larger sampling fractions.  The 25% cutoff
     167             :      * here is based on very limited experimentation.
     168             :      */
     169          13 :     node->use_bulkread = true;
     170          13 :     node->use_pagemode = (percent >= 25);
     171          13 : }
     172             : 
     173             : /*
     174             :  * Select next sampled tuple in current block.
     175             :  *
     176             :  * It is OK here to return an offset without knowing if the tuple is visible
     177             :  * (or even exists).  The reason is that we do the coinflip for every tuple
     178             :  * offset in the table.  Since all tuples have the same probability of being
     179             :  * returned, it doesn't matter if we do extra coinflips for invisible tuples.
     180             :  *
     181             :  * When we reach end of the block, return InvalidOffsetNumber which tells
     182             :  * SampleScan to go to next block.
     183             :  */
     184             : static OffsetNumber
     185       21472 : bernoulli_nextsampletuple(SampleScanState *node,
     186             :                           BlockNumber blockno,
     187             :                           OffsetNumber maxoffset)
     188             : {
     189       21472 :     BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
     190       21472 :     OffsetNumber tupoffset = sampler->lt;
     191             :     uint32      hashinput[3];
     192             : 
     193             :     /* Advance to first/next tuple in block */
     194       21472 :     if (tupoffset == InvalidOffsetNumber)
     195        1398 :         tupoffset = FirstOffsetNumber;
     196             :     else
     197       20074 :         tupoffset++;
     198             : 
     199             :     /*
     200             :      * We compute the hash by applying hash_any to an array of 3 uint32's
     201             :      * containing the block, offset, and seed.  This is efficient to set up,
     202             :      * and with the current implementation of hash_any, it gives
     203             :      * machine-independent results, which is a nice property for regression
     204             :      * testing.
     205             :      *
     206             :      * These words in the hash input are the same throughout the block:
     207             :      */
     208       21472 :     hashinput[0] = blockno;
     209       21472 :     hashinput[2] = sampler->seed;
     210             : 
     211             :     /*
     212             :      * Loop over tuple offsets until finding suitable TID or reaching end of
     213             :      * block.
     214             :      */
     215       41506 :     for (; tupoffset <= maxoffset; tupoffset++)
     216             :     {
     217             :         uint32      hash;
     218             : 
     219       40108 :         hashinput[1] = tupoffset;
     220             : 
     221       40108 :         hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
     222             :                                        (int) sizeof(hashinput)));
     223       40108 :         if (hash < sampler->cutoff)
     224       20074 :             break;
     225             :     }
     226             : 
     227       21472 :     if (tupoffset > maxoffset)
     228        1398 :         tupoffset = InvalidOffsetNumber;
     229             : 
     230       21472 :     sampler->lt = tupoffset;
     231             : 
     232       21472 :     return tupoffset;
     233             : }

Generated by: LCOV version 1.11