LCOV - code coverage report
Current view: top level - src/backend/tsearch - ts_selfuncs.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 87 100 87.0 %
Date: 2017-09-29 15:12:54 Functions: 5 6 83.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * ts_selfuncs.c
       4             :  *    Selectivity estimation functions for text search operators.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/tsearch/ts_selfuncs.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : #include "postgres.h"
      15             : 
      16             : #include "access/htup_details.h"
      17             : #include "catalog/pg_statistic.h"
      18             : #include "catalog/pg_type.h"
      19             : #include "miscadmin.h"
      20             : #include "nodes/nodes.h"
      21             : #include "tsearch/ts_type.h"
      22             : #include "utils/builtins.h"
      23             : #include "utils/lsyscache.h"
      24             : #include "utils/selfuncs.h"
      25             : #include "utils/syscache.h"
      26             : 
      27             : 
      28             : /*
      29             :  * The default text search selectivity is chosen to be small enough to
      30             :  * encourage indexscans for typical table densities.  See selfuncs.h and
      31             :  * DEFAULT_EQ_SEL for details.
      32             :  */
      33             : #define DEFAULT_TS_MATCH_SEL 0.005
      34             : 
      35             : /* lookup table type for binary searching through MCELEMs */
      36             : typedef struct
      37             : {
      38             :     text       *element;
      39             :     float4      frequency;
      40             : } TextFreq;
      41             : 
      42             : /* type of keys for bsearch'ing through an array of TextFreqs */
      43             : typedef struct
      44             : {
      45             :     char       *lexeme;
      46             :     int         length;
      47             : } LexemeKey;
      48             : 
      49             : static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
      50             : static Selectivity mcelem_tsquery_selec(TSQuery query,
      51             :                      Datum *mcelem, int nmcelem,
      52             :                      float4 *numbers, int nnumbers);
      53             : static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
      54             :                   TextFreq *lookup, int length, float4 minfreq);
      55             : static int  compare_lexeme_textfreq(const void *e1, const void *e2);
      56             : 
      57             : #define tsquery_opr_selec_no_stats(query) \
      58             :     tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
      59             : 
      60             : 
      61             : /*
      62             :  *  tsmatchsel -- Selectivity of "@@"
      63             :  *
      64             :  * restriction selectivity function for tsvector @@ tsquery and
      65             :  * tsquery @@ tsvector
      66             :  */
      67             : Datum
      68          57 : tsmatchsel(PG_FUNCTION_ARGS)
      69             : {
      70          57 :     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
      71             : 
      72             : #ifdef NOT_USED
      73             :     Oid         operator = PG_GETARG_OID(1);
      74             : #endif
      75          57 :     List       *args = (List *) PG_GETARG_POINTER(2);
      76          57 :     int         varRelid = PG_GETARG_INT32(3);
      77             :     VariableStatData vardata;
      78             :     Node       *other;
      79             :     bool        varonleft;
      80             :     Selectivity selec;
      81             : 
      82             :     /*
      83             :      * If expression is not variable = something or something = variable, then
      84             :      * punt and return a default estimate.
      85             :      */
      86          57 :     if (!get_restriction_variable(root, args, varRelid,
      87             :                                   &vardata, &other, &varonleft))
      88           0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
      89             : 
      90             :     /*
      91             :      * Can't do anything useful if the something is not a constant, either.
      92             :      */
      93          57 :     if (!IsA(other, Const))
      94             :     {
      95           0 :         ReleaseVariableStats(vardata);
      96           0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
      97             :     }
      98             : 
      99             :     /*
     100             :      * The "@@" operator is strict, so we can cope with NULL right away
     101             :      */
     102          57 :     if (((Const *) other)->constisnull)
     103             :     {
     104           0 :         ReleaseVariableStats(vardata);
     105           0 :         PG_RETURN_FLOAT8(0.0);
     106             :     }
     107             : 
     108             :     /*
     109             :      * OK, there's a Var and a Const we're dealing with here.  We need the
     110             :      * Const to be a TSQuery, else we can't do anything useful.  We have to
     111             :      * check this because the Var might be the TSQuery not the TSVector.
     112             :      */
     113          57 :     if (((Const *) other)->consttype == TSQUERYOID)
     114             :     {
     115             :         /* tsvector @@ tsquery or the other way around */
     116          57 :         Assert(vardata.vartype == TSVECTOROID);
     117             : 
     118          57 :         selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
     119             :     }
     120             :     else
     121             :     {
     122             :         /* If we can't see the query structure, must punt */
     123           0 :         selec = DEFAULT_TS_MATCH_SEL;
     124             :     }
     125             : 
     126          57 :     ReleaseVariableStats(vardata);
     127             : 
     128          57 :     CLAMP_PROBABILITY(selec);
     129             : 
     130          57 :     PG_RETURN_FLOAT8((float8) selec);
     131             : }
     132             : 
     133             : 
     134             : /*
     135             :  *  tsmatchjoinsel -- join selectivity of "@@"
     136             :  *
     137             :  * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
     138             :  */
     139             : Datum
     140           0 : tsmatchjoinsel(PG_FUNCTION_ARGS)
     141             : {
     142             :     /* for the moment we just punt */
     143           0 :     PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
     144             : }
     145             : 
     146             : 
     147             : /*
     148             :  * @@ selectivity for tsvector var vs tsquery constant
     149             :  */
     150             : static Selectivity
     151          57 : tsquerysel(VariableStatData *vardata, Datum constval)
     152             : {
     153             :     Selectivity selec;
     154             :     TSQuery     query;
     155             : 
     156             :     /* The caller made sure the const is a TSQuery, so get it now */
     157          57 :     query = DatumGetTSQuery(constval);
     158             : 
     159             :     /* Empty query matches nothing */
     160          57 :     if (query->size == 0)
     161           0 :         return (Selectivity) 0.0;
     162             : 
     163          57 :     if (HeapTupleIsValid(vardata->statsTuple))
     164             :     {
     165             :         Form_pg_statistic stats;
     166             :         AttStatsSlot sslot;
     167             : 
     168          51 :         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
     169             : 
     170             :         /* MCELEM will be an array of TEXT elements for a tsvector column */
     171          51 :         if (get_attstatsslot(&sslot, vardata->statsTuple,
     172             :                              STATISTIC_KIND_MCELEM, InvalidOid,
     173             :                              ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
     174             :         {
     175             :             /*
     176             :              * There is a most-common-elements slot for the tsvector Var, so
     177             :              * use that.
     178             :              */
     179          51 :             selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
     180             :                                          sslot.numbers, sslot.nnumbers);
     181          51 :             free_attstatsslot(&sslot);
     182             :         }
     183             :         else
     184             :         {
     185             :             /* No most-common-elements info, so do without */
     186           0 :             selec = tsquery_opr_selec_no_stats(query);
     187             :         }
     188             : 
     189             :         /*
     190             :          * MCE stats count only non-null rows, so adjust for null rows.
     191             :          */
     192          51 :         selec *= (1.0 - stats->stanullfrac);
     193             :     }
     194             :     else
     195             :     {
     196             :         /* No stats at all, so do without */
     197           6 :         selec = tsquery_opr_selec_no_stats(query);
     198             :         /* we assume no nulls here, so no stanullfrac correction */
     199             :     }
     200             : 
     201          57 :     return selec;
     202             : }
     203             : 
     204             : /*
     205             :  * Extract data from the pg_statistic arrays into useful format.
     206             :  */
     207             : static Selectivity
     208          51 : mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
     209             :                      float4 *numbers, int nnumbers)
     210             : {
     211             :     float4      minfreq;
     212             :     TextFreq   *lookup;
     213             :     Selectivity selec;
     214             :     int         i;
     215             : 
     216             :     /*
     217             :      * There should be two more Numbers than Values, because the last two
     218             :      * cells are taken for minimal and maximal frequency.  Punt if not.
     219             :      *
     220             :      * (Note: the MCELEM statistics slot definition allows for a third extra
     221             :      * number containing the frequency of nulls, but we're not expecting that
     222             :      * to appear for a tsvector column.)
     223             :      */
     224          51 :     if (nnumbers != nmcelem + 2)
     225           0 :         return tsquery_opr_selec_no_stats(query);
     226             : 
     227             :     /*
     228             :      * Transpose the data into a single array so we can use bsearch().
     229             :      */
     230          51 :     lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
     231       51051 :     for (i = 0; i < nmcelem; i++)
     232             :     {
     233             :         /*
     234             :          * The text Datums came from an array, so it cannot be compressed or
     235             :          * stored out-of-line -- it's safe to use VARSIZE_ANY*.
     236             :          */
     237       51000 :         Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
     238       51000 :         lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
     239       51000 :         lookup[i].frequency = numbers[i];
     240             :     }
     241             : 
     242             :     /*
     243             :      * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
     244             :      * the one before the last cell of the Numbers array. See ts_typanalyze.c
     245             :      */
     246          51 :     minfreq = numbers[nnumbers - 2];
     247             : 
     248          51 :     selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
     249             :                               nmcelem, minfreq);
     250             : 
     251          51 :     pfree(lookup);
     252             : 
     253          51 :     return selec;
     254             : }
     255             : 
     256             : /*
     257             :  * Traverse the tsquery in preorder, calculating selectivity as:
     258             :  *
     259             :  *   selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
     260             :  *
     261             :  *   selec(left_oper) + selec(right_oper) -
     262             :  *      selec(left_oper) * selec(right_oper) in OR nodes,
     263             :  *
     264             :  *   1 - select(oper) in NOT nodes
     265             :  *
     266             :  *   histogram-based estimation in prefix VAL nodes
     267             :  *
     268             :  *   freq[val] in exact VAL nodes, if the value is in MCELEM
     269             :  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
     270             :  *
     271             :  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
     272             :  * binary search for determining freq[MCELEM].
     273             :  *
     274             :  * If we don't have stats for the tsvector, we still use this logic,
     275             :  * except we use default estimates for VAL nodes.  This case is signaled
     276             :  * by lookup == NULL.
     277             :  */
     278             : static Selectivity
     279         165 : tsquery_opr_selec(QueryItem *item, char *operand,
     280             :                   TextFreq *lookup, int length, float4 minfreq)
     281             : {
     282             :     Selectivity selec;
     283             : 
     284             :     /* since this function recurses, it could be driven to stack overflow */
     285         165 :     check_stack_depth();
     286             : 
     287         165 :     if (item->type == QI_VAL)
     288             :     {
     289         109 :         QueryOperand *oper = (QueryOperand *) item;
     290             :         LexemeKey   key;
     291             : 
     292             :         /*
     293             :          * Prepare the key for bsearch().
     294             :          */
     295         109 :         key.lexeme = operand + oper->distance;
     296         109 :         key.length = oper->length;
     297             : 
     298         109 :         if (oper->prefix)
     299             :         {
     300             :             /* Prefix match, ie the query item is lexeme:* */
     301             :             Selectivity matched,
     302             :                         allmces;
     303             :             int         i,
     304             :                         n_matched;
     305             : 
     306             :             /*
     307             :              * Our strategy is to scan through the MCELEM list and combine the
     308             :              * frequencies of the ones that match the prefix.  We then
     309             :              * extrapolate the fraction of matching MCELEMs to the remaining
     310             :              * rows, assuming that the MCELEMs are representative of the whole
     311             :              * lexeme population in this respect.  (Compare
     312             :              * histogram_selectivity().)  Note that these are most common
     313             :              * elements not most common values, so they're not mutually
     314             :              * exclusive.  We treat occurrences as independent events.
     315             :              *
     316             :              * This is only a good plan if we have a pretty fair number of
     317             :              * MCELEMs available; we set the threshold at 100.  If no stats or
     318             :              * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
     319             :              */
     320          13 :             if (lookup == NULL || length < 100)
     321          12 :                 return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
     322             : 
     323           8 :             matched = allmces = 0;
     324           8 :             n_matched = 0;
     325        8008 :             for (i = 0; i < length; i++)
     326             :             {
     327        8000 :                 TextFreq   *t = lookup + i;
     328        8000 :                 int         tlen = VARSIZE_ANY_EXHDR(t->element);
     329             : 
     330       16000 :                 if (tlen >= key.length &&
     331        8000 :                     strncmp(key.lexeme, VARDATA_ANY(t->element),
     332        8000 :                             key.length) == 0)
     333             :                 {
     334         288 :                     matched += t->frequency - matched * t->frequency;
     335         288 :                     n_matched++;
     336             :                 }
     337        8000 :                 allmces += t->frequency - allmces * t->frequency;
     338             :             }
     339             : 
     340             :             /* Clamp to ensure sanity in the face of roundoff error */
     341           8 :             CLAMP_PROBABILITY(matched);
     342           8 :             CLAMP_PROBABILITY(allmces);
     343             : 
     344           8 :             selec = matched + (1.0 - allmces) * ((double) n_matched / length);
     345             : 
     346             :             /*
     347             :              * In any case, never believe that a prefix match has selectivity
     348             :              * less than we would assign for a non-MCELEM lexeme.  This
     349             :              * preserves the property that "word:*" should be estimated to
     350             :              * match at least as many rows as "word" would be.
     351             :              */
     352           8 :             selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
     353             :         }
     354             :         else
     355             :         {
     356             :             /* Regular exact lexeme match */
     357             :             TextFreq   *searchres;
     358             : 
     359             :             /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
     360          96 :             if (lookup == NULL)
     361           2 :                 return (Selectivity) DEFAULT_TS_MATCH_SEL;
     362             : 
     363          94 :             searchres = (TextFreq *) bsearch(&key, lookup, length,
     364             :                                              sizeof(TextFreq),
     365             :                                              compare_lexeme_textfreq);
     366             : 
     367          94 :             if (searchres)
     368             :             {
     369             :                 /*
     370             :                  * The element is in MCELEM.  Return precise selectivity (or
     371             :                  * at least as precise as ANALYZE could find out).
     372             :                  */
     373          78 :                 selec = searchres->frequency;
     374             :             }
     375             :             else
     376             :             {
     377             :                 /*
     378             :                  * The element is not in MCELEM.  Punt, but assume that the
     379             :                  * selectivity cannot be more than minfreq / 2.
     380             :                  */
     381          16 :                 selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
     382             :             }
     383             :         }
     384             :     }
     385             :     else
     386             :     {
     387             :         /* Current TSQuery node is an operator */
     388             :         Selectivity s1,
     389             :                     s2;
     390             : 
     391          56 :         switch (item->qoperator.oper)
     392             :         {
     393             :             case OP_NOT:
     394           4 :                 selec = 1.0 - tsquery_opr_selec(item + 1, operand,
     395             :                                                 lookup, length, minfreq);
     396           4 :                 break;
     397             : 
     398             :             case OP_PHRASE:
     399             :             case OP_AND:
     400          25 :                 s1 = tsquery_opr_selec(item + 1, operand,
     401             :                                        lookup, length, minfreq);
     402          25 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
     403             :                                        lookup, length, minfreq);
     404          25 :                 selec = s1 * s2;
     405          25 :                 break;
     406             : 
     407             :             case OP_OR:
     408          27 :                 s1 = tsquery_opr_selec(item + 1, operand,
     409             :                                        lookup, length, minfreq);
     410          27 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
     411             :                                        lookup, length, minfreq);
     412          27 :                 selec = s1 + s2 - s1 * s2;
     413          27 :                 break;
     414             : 
     415             :             default:
     416           0 :                 elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
     417             :                 selec = 0;      /* keep compiler quiet */
     418             :                 break;
     419             :         }
     420             :     }
     421             : 
     422             :     /* Clamp intermediate results to stay sane despite roundoff error */
     423         158 :     CLAMP_PROBABILITY(selec);
     424             : 
     425         158 :     return selec;
     426             : }
     427             : 
     428             : /*
     429             :  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
     430             :  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
     431             :  * how ANALYZE code sorted data before storing it in a statistic tuple.
     432             :  * See ts_typanalyze.c for details.
     433             :  */
     434             : static int
     435         740 : compare_lexeme_textfreq(const void *e1, const void *e2)
     436             : {
     437         740 :     const LexemeKey *key = (const LexemeKey *) e1;
     438         740 :     const TextFreq *t = (const TextFreq *) e2;
     439             :     int         len1,
     440             :                 len2;
     441             : 
     442         740 :     len1 = key->length;
     443         740 :     len2 = VARSIZE_ANY_EXHDR(t->element);
     444             : 
     445             :     /* Compare lengths first, possibly avoiding a strncmp call */
     446         740 :     if (len1 > len2)
     447         144 :         return 1;
     448         596 :     else if (len1 < len2)
     449           0 :         return -1;
     450             : 
     451             :     /* Fall back on byte-for-byte comparison */
     452         596 :     return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
     453             : }

Generated by: LCOV version 1.11