LCOV - PostgreSQL - src/backend/utils/adt/tsvector.c

LCOV - code coverage report

Current view:	top level - src/backend/utils/adt - tsvector.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL	Lines:	167	243	68.7 %
Date:	2017-09-29 15:12:54	Functions:	6	9	66.7 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tsvector.c
       4             :  *    I/O functions for tsvector
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/utils/adt/tsvector.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include "libpq/pqformat.h"
      18             : #include "tsearch/ts_locale.h"
      19             : #include "tsearch/ts_utils.h"
      20             : #include "utils/builtins.h"
      21             : #include "utils/memutils.h"
      22             : 
      23             : typedef struct
      24             : {
      25             :     WordEntry   entry;          /* must be first! */
      26             :     WordEntryPos *pos;
      27             :     int         poslen;         /* number of elements in pos */
      28             : } WordEntryIN;
      29             : 
      30             : 
      31             : /* Compare two WordEntryPos values for qsort */
      32             : int
      33         124 : compareWordEntryPos(const void *a, const void *b)
      34             : {
      35         124 :     int         apos = WEP_GETPOS(*(const WordEntryPos *) a);
      36         124 :     int         bpos = WEP_GETPOS(*(const WordEntryPos *) b);
      37             : 
      38         124 :     if (apos == bpos)
      39           4 :         return 0;
      40         120 :     return (apos > bpos) ? 1 : -1;
      41             : }
      42             : 
      43             : /*
      44             :  * Removes duplicate pos entries. If there's two entries with same pos
      45             :  * but different weight, the higher weight is retained.
      46             :  *
      47             :  * Returns new length.
      48             :  */
      49             : static int
      50         226 : uniquePos(WordEntryPos *a, int l)
      51             : {
      52             :     WordEntryPos *ptr,
      53             :                *res;
      54             : 
      55         226 :     if (l <= 1)
      56         182 :         return l;
      57             : 
      58          44 :     qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
      59             : 
      60          44 :     res = a;
      61          44 :     ptr = a + 1;
      62         198 :     while (ptr - a < l)
      63             :     {
      64         110 :         if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
      65             :         {
      66         106 :             res++;
      67         106 :             *res = *ptr;
      68         212 :             if (res - a >= MAXNUMPOS - 1 ||
      69         106 :                 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
      70             :                 break;
      71             :         }
      72           4 :         else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
      73           1 :             WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
      74         110 :         ptr++;
      75             :     }
      76             : 
      77          44 :     return res + 1 - a;
      78             : }
      79             : 
      80             : /* Compare two WordEntryIN values for qsort */
      81             : static int
      82      178842 : compareentry(const void *va, const void *vb, void *arg)
      83             : {
      84      178842 :     const WordEntryIN *a = (const WordEntryIN *) va;
      85      178842 :     const WordEntryIN *b = (const WordEntryIN *) vb;
      86      178842 :     char       *BufferStr = (char *) arg;
      87             : 
      88      357684 :     return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
      89      357684 :                            &BufferStr[b->entry.pos], b->entry.len,
      90             :                            false);
      91             : }
      92             : 
      93             : /*
      94             :  * Sort an array of WordEntryIN, remove duplicates.
      95             :  * *outbuflen receives the amount of space needed for strings and positions.
      96             :  */
      97             : static int
      98         604 : uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
      99             : {
     100             :     int         buflen;
     101             :     WordEntryIN *ptr,
     102             :                *res;
     103             : 
     104         604 :     Assert(l >= 1);
     105             : 
     106         604 :     if (l > 1)
     107         596 :         qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
     108             :                   (void *) buf);
     109             : 
     110         604 :     buflen = 0;
     111         604 :     res = a;
     112         604 :     ptr = a + 1;
     113       30734 :     while (ptr - a < l)
     114             :     {
     115       58875 :         if (!(ptr->entry.len == res->entry.len &&
     116       29349 :               strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
     117       29349 :                       res->entry.len) == 0))
     118             :         {
     119             :             /* done accumulating data into *res, count space needed */
     120       28605 :             buflen += res->entry.len;
     121       28605 :             if (res->entry.haspos)
     122             :             {
     123         155 :                 res->poslen = uniquePos(res->pos, res->poslen);
     124         155 :                 buflen = SHORTALIGN(buflen);
     125         155 :                 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
     126             :             }
     127       28605 :             res++;
     128       57210 :             if (res != ptr)
     129       14823 :                 memcpy(res, ptr, sizeof(WordEntryIN));
     130             :         }
     131         921 :         else if (ptr->entry.haspos)
     132             :         {
     133           7 :             if (res->entry.haspos)
     134             :             {
     135             :                 /* append ptr's positions to res's positions */
     136           6 :                 int         newlen = ptr->poslen + res->poslen;
     137             : 
     138           6 :                 res->pos = (WordEntryPos *)
     139           6 :                     repalloc(res->pos, newlen * sizeof(WordEntryPos));
     140           6 :                 memcpy(&res->pos[res->poslen], ptr->pos,
     141           6 :                        ptr->poslen * sizeof(WordEntryPos));
     142           6 :                 res->poslen = newlen;
     143           6 :                 pfree(ptr->pos);
     144             :             }
     145             :             else
     146             :             {
     147             :                 /* just give ptr's positions to pos */
     148           1 :                 res->entry.haspos = 1;
     149           1 :                 res->pos = ptr->pos;
     150           1 :                 res->poslen = ptr->poslen;
     151             :             }
     152             :         }
     153       29526 :         ptr++;
     154             :     }
     155             : 
     156             :     /* count space needed for last item */
     157         604 :     buflen += res->entry.len;
     158         604 :     if (res->entry.haspos)
     159             :     {
     160          71 :         res->poslen = uniquePos(res->pos, res->poslen);
     161          71 :         buflen = SHORTALIGN(buflen);
     162          71 :         buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
     163             :     }
     164             : 
     165         604 :     *outbuflen = buflen;
     166         604 :     return res + 1 - a;
     167             : }
     168             : 
     169             : static int
     170           0 : WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
     171             : {
     172           0 :     return compareentry(a, b, buf);
     173             : }
     174             : 
     175             : 
     176             : Datum
     177         612 : tsvectorin(PG_FUNCTION_ARGS)
     178             : {
     179         612 :     char       *buf = PG_GETARG_CSTRING(0);
     180             :     TSVectorParseState state;
     181             :     WordEntryIN *arr;
     182             :     int         totallen;
     183             :     int         arrlen;         /* allocated size of arr */
     184             :     WordEntry  *inarr;
     185         612 :     int         len = 0;
     186             :     TSVector    in;
     187             :     int         i;
     188             :     char       *token;
     189             :     int         toklen;
     190             :     WordEntryPos *pos;
     191             :     int         poslen;
     192             :     char       *strbuf;
     193             :     int         stroff;
     194             : 
     195             :     /*
     196             :      * Tokens are appended to tmpbuf, cur is a pointer to the end of used
     197             :      * space in tmpbuf.
     198             :      */
     199             :     char       *tmpbuf;
     200             :     char       *cur;
     201         612 :     int         buflen = 256;   /* allocated size of tmpbuf */
     202             : 
     203         612 :     state = init_tsvector_parser(buf, false, false);
     204             : 
     205         612 :     arrlen = 64;
     206         612 :     arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
     207         612 :     cur = tmpbuf = (char *) palloc(buflen);
     208             : 
     209       31354 :     while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
     210             :     {
     211       30130 :         if (toklen >= MAXSTRLEN)
     212           0 :             ereport(ERROR,
     213             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     214             :                      errmsg("word is too long (%ld bytes, max %ld bytes)",
     215             :                             (long) toklen,
     216             :                             (long) (MAXSTRLEN - 1))));
     217             : 
     218       30130 :         if (cur - tmpbuf > MAXSTRPOS)
     219           0 :             ereport(ERROR,
     220             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     221             :                      errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
     222             :                             (long) (cur - tmpbuf), (long) MAXSTRPOS)));
     223             : 
     224             :         /*
     225             :          * Enlarge buffers if needed
     226             :          */
     227       30130 :         if (len >= arrlen)
     228             :         {
     229         219 :             arrlen *= 2;
     230         219 :             arr = (WordEntryIN *)
     231         219 :                 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
     232             :         }
     233       60260 :         while ((cur - tmpbuf) + toklen >= buflen)
     234             :         {
     235           0 :             int         dist = cur - tmpbuf;
     236             : 
     237           0 :             buflen *= 2;
     238           0 :             tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
     239           0 :             cur = tmpbuf + dist;
     240             :         }
     241       30130 :         arr[len].entry.len = toklen;
     242       30130 :         arr[len].entry.pos = cur - tmpbuf;
     243       30130 :         memcpy((void *) cur, (void *) token, toklen);
     244       30130 :         cur += toklen;
     245             : 
     246       30130 :         if (poslen != 0)
     247             :         {
     248         232 :             arr[len].entry.haspos = 1;
     249         232 :             arr[len].pos = pos;
     250         232 :             arr[len].poslen = poslen;
     251             :         }
     252             :         else
     253             :         {
     254       29898 :             arr[len].entry.haspos = 0;
     255       29898 :             arr[len].pos = NULL;
     256       29898 :             arr[len].poslen = 0;
     257             :         }
     258       30130 :         len++;
     259             :     }
     260             : 
     261         612 :     close_tsvector_parser(state);
     262             : 
     263         612 :     if (len > 0)
     264         604 :         len = uniqueentry(arr, len, tmpbuf, &buflen);
     265             :     else
     266           8 :         buflen = 0;
     267             : 
     268         612 :     if (buflen > MAXSTRPOS)
     269           0 :         ereport(ERROR,
     270             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     271             :                  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
     272             : 
     273         612 :     totallen = CALCDATASIZE(len, buflen);
     274         612 :     in = (TSVector) palloc0(totallen);
     275         612 :     SET_VARSIZE(in, totallen);
     276         612 :     in->size = len;
     277         612 :     inarr = ARRPTR(in);
     278         612 :     strbuf = STRPTR(in);
     279         612 :     stroff = 0;
     280       29821 :     for (i = 0; i < len; i++)
     281             :     {
     282       29209 :         memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
     283       29209 :         arr[i].entry.pos = stroff;
     284       29209 :         stroff += arr[i].entry.len;
     285       29209 :         if (arr[i].entry.haspos)
     286             :         {
     287         226 :             if (arr[i].poslen > 0xFFFF)
     288           0 :                 elog(ERROR, "positions array too long");
     289             : 
     290             :             /* Copy number of positions */
     291         226 :             stroff = SHORTALIGN(stroff);
     292         226 :             *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
     293         226 :             stroff += sizeof(uint16);
     294             : 
     295             :             /* Copy positions */
     296         226 :             memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
     297         226 :             stroff += arr[i].poslen * sizeof(WordEntryPos);
     298             : 
     299         226 :             pfree(arr[i].pos);
     300             :         }
     301       29209 :         inarr[i] = arr[i].entry;
     302             :     }
     303             : 
     304         612 :     Assert((strbuf + stroff - (char *) in) == totallen);
     305             : 
     306         612 :     PG_RETURN_TSVECTOR(in);
     307             : }
     308             : 
     309             : Datum
     310          71 : tsvectorout(PG_FUNCTION_ARGS)
     311             : {
     312          71 :     TSVector    out = PG_GETARG_TSVECTOR(0);
     313             :     char       *outbuf;
     314             :     int32       i,
     315          71 :                 lenbuf = 0,
     316             :                 pp;
     317          71 :     WordEntry  *ptr = ARRPTR(out);
     318             :     char       *curbegin,
     319             :                *curin,
     320             :                *curout;
     321             : 
     322          71 :     lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
     323         387 :     for (i = 0; i < out->size; i++)
     324             :     {
     325         316 :         lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
     326         316 :         if (ptr[i].haspos)
     327         252 :             lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
     328             :     }
     329             : 
     330          71 :     curout = outbuf = (char *) palloc(lenbuf);
     331         387 :     for (i = 0; i < out->size; i++)
     332             :     {
     333         316 :         curbegin = curin = STRPTR(out) + ptr->pos;
     334         316 :         if (i != 0)
     335         254 :             *curout++ = ' ';
     336         316 :         *curout++ = '\'';
     337        2308 :         while (curin - curbegin < ptr->len)
     338             :         {
     339        1676 :             int         len = pg_mblen(curin);
     340             : 
     341        1676 :             if (t_iseq(curin, '\''))
     342           4 :                 *curout++ = '\'';
     343        1672 :             else if (t_iseq(curin, '\\'))
     344          15 :                 *curout++ = '\\';
     345             : 
     346        5028 :             while (len--)
     347        1676 :                 *curout++ = *curin++;
     348             :         }
     349             : 
     350         316 :         *curout++ = '\'';
     351         316 :         if ((pp = POSDATALEN(out, ptr)) != 0)
     352             :         {
     353             :             WordEntryPos *wptr;
     354             : 
     355         252 :             *curout++ = ':';
     356         252 :             wptr = POSDATAPTR(out, ptr);
     357         846 :             while (pp)
     358             :             {
     359         342 :                 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
     360         342 :                 switch (WEP_GETWEIGHT(*wptr))
     361             :                 {
     362             :                     case 3:
     363          18 :                         *curout++ = 'A';
     364          18 :                         break;
     365             :                     case 2:
     366          10 :                         *curout++ = 'B';
     367          10 :                         break;
     368             :                     case 1:
     369          37 :                         *curout++ = 'C';
     370          37 :                         break;
     371             :                     case 0:
     372             :                     default:
     373         277 :                         break;
     374             :                 }
     375             : 
     376         342 :                 if (pp > 1)
     377          90 :                     *curout++ = ',';
     378         342 :                 pp--;
     379         342 :                 wptr++;
     380             :             }
     381             :         }
     382         316 :         ptr++;
     383             :     }
     384             : 
     385          71 :     *curout = '\0';
     386          71 :     PG_FREE_IF_COPY(out, 0);
     387          71 :     PG_RETURN_CSTRING(outbuf);
     388             : }
     389             : 
     390             : /*
     391             :  * Binary Input / Output functions. The binary format is as follows:
     392             :  *
     393             :  * uint32   number of lexemes
     394             :  *
     395             :  * for each lexeme:
     396             :  *      lexeme text in client encoding, null-terminated
     397             :  *      uint16  number of positions
     398             :  *      for each position:
     399             :  *          uint16 WordEntryPos
     400             :  */
     401             : 
     402             : Datum
     403           0 : tsvectorsend(PG_FUNCTION_ARGS)
     404             : {
     405           0 :     TSVector    vec = PG_GETARG_TSVECTOR(0);
     406             :     StringInfoData buf;
     407             :     int         i,
     408             :                 j;
     409           0 :     WordEntry  *weptr = ARRPTR(vec);
     410             : 
     411           0 :     pq_begintypsend(&buf);
     412             : 
     413           0 :     pq_sendint(&buf, vec->size, sizeof(int32));
     414           0 :     for (i = 0; i < vec->size; i++)
     415             :     {
     416             :         uint16      npos;
     417             : 
     418             :         /*
     419             :          * the strings in the TSVector array are not null-terminated, so we
     420             :          * have to send the null-terminator separately
     421             :          */
     422           0 :         pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
     423           0 :         pq_sendbyte(&buf, '\0');
     424             : 
     425           0 :         npos = POSDATALEN(vec, weptr);
     426           0 :         pq_sendint(&buf, npos, sizeof(uint16));
     427             : 
     428           0 :         if (npos > 0)
     429             :         {
     430           0 :             WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
     431             : 
     432           0 :             for (j = 0; j < npos; j++)
     433           0 :                 pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
     434             :         }
     435           0 :         weptr++;
     436             :     }
     437             : 
     438           0 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     439             : }
     440             : 
     441             : Datum
     442           0 : tsvectorrecv(PG_FUNCTION_ARGS)
     443             : {
     444           0 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     445             :     TSVector    vec;
     446             :     int         i;
     447             :     int32       nentries;
     448             :     int         datalen;        /* number of bytes used in the variable size
     449             :                                  * area after fixed size TSVector header and
     450             :                                  * WordEntries */
     451             :     Size        hdrlen;
     452             :     Size        len;            /* allocated size of vec */
     453           0 :     bool        needSort = false;
     454             : 
     455           0 :     nentries = pq_getmsgint(buf, sizeof(int32));
     456           0 :     if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
     457           0 :         elog(ERROR, "invalid size of tsvector");
     458             : 
     459           0 :     hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
     460             : 
     461           0 :     len = hdrlen * 2;           /* times two to make room for lexemes */
     462           0 :     vec = (TSVector) palloc0(len);
     463           0 :     vec->size = nentries;
     464             : 
     465           0 :     datalen = 0;
     466           0 :     for (i = 0; i < nentries; i++)
     467             :     {
     468             :         const char *lexeme;
     469             :         uint16      npos;
     470             :         size_t      lex_len;
     471             : 
     472           0 :         lexeme = pq_getmsgstring(buf);
     473           0 :         npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
     474             : 
     475             :         /* sanity checks */
     476             : 
     477           0 :         lex_len = strlen(lexeme);
     478           0 :         if (lex_len > MAXSTRLEN)
     479           0 :             elog(ERROR, "invalid tsvector: lexeme too long");
     480             : 
     481           0 :         if (datalen > MAXSTRPOS)
     482           0 :             elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
     483             : 
     484           0 :         if (npos > MAXNUMPOS)
     485           0 :             elog(ERROR, "unexpected number of tsvector positions");
     486             : 
     487             :         /*
     488             :          * Looks valid. Fill the WordEntry struct, and copy lexeme.
     489             :          *
     490             :          * But make sure the buffer is large enough first.
     491             :          */
     492           0 :         while (hdrlen + SHORTALIGN(datalen + lex_len) +
     493           0 :                (npos + 1) * sizeof(WordEntryPos) >= len)
     494             :         {
     495           0 :             len *= 2;
     496           0 :             vec = (TSVector) repalloc(vec, len);
     497             :         }
     498             : 
     499           0 :         vec->entries[i].haspos = (npos > 0) ? 1 : 0;
     500           0 :         vec->entries[i].len = lex_len;
     501           0 :         vec->entries[i].pos = datalen;
     502             : 
     503           0 :         memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
     504             : 
     505           0 :         datalen += lex_len;
     506             : 
     507           0 :         if (i > 0 && WordEntryCMP(&vec->entries[i],
     508           0 :                                   &vec->entries[i - 1],
     509           0 :                                   STRPTR(vec)) <= 0)
     510           0 :             needSort = true;
     511             : 
     512             :         /* Receive positions */
     513           0 :         if (npos > 0)
     514             :         {
     515             :             uint16      j;
     516             :             WordEntryPos *wepptr;
     517             : 
     518             :             /*
     519             :              * Pad to 2-byte alignment if necessary. Though we used palloc0
     520             :              * for the initial allocation, subsequent repalloc'd memory areas
     521             :              * are not initialized to zero.
     522             :              */
     523           0 :             if (datalen != SHORTALIGN(datalen))
     524             :             {
     525           0 :                 *(STRPTR(vec) + datalen) = '\0';
     526           0 :                 datalen = SHORTALIGN(datalen);
     527             :             }
     528             : 
     529           0 :             memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
     530             : 
     531           0 :             wepptr = POSDATAPTR(vec, &vec->entries[i]);
     532           0 :             for (j = 0; j < npos; j++)
     533             :             {
     534           0 :                 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
     535           0 :                 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
     536           0 :                     elog(ERROR, "position information is misordered");
     537             :             }
     538             : 
     539           0 :             datalen += (npos + 1) * sizeof(WordEntry);
     540             :         }
     541             :     }
     542             : 
     543           0 :     SET_VARSIZE(vec, hdrlen + datalen);
     544             : 
     545           0 :     if (needSort)
     546           0 :         qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
     547           0 :                   compareentry, (void *) STRPTR(vec));
     548             : 
     549           0 :     PG_RETURN_TSVECTOR(vec);
     550             : }

Generated by: LCOV version 1.11