Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ginpostinglist.c
4 : * routines for dealing with posting lists.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gin/ginpostinglist.c
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/gin_private.h"
18 :
19 : #ifdef USE_ASSERT_CHECKING
20 : #define CHECK_ENCODING_ROUNDTRIP
21 : #endif
22 :
23 : /*
24 : * For encoding purposes, item pointers are represented as 64-bit unsigned
25 : * integers. The lowest 11 bits represent the offset number, and the next
26 : * lowest 32 bits are the block number. That leaves 17 bits unused, i.e.
27 : * only 43 low bits are used.
28 : *
29 : * These 43-bit integers are encoded using varbyte encoding. In each byte,
30 : * the 7 low bits contain data, while the highest bit is a continuation bit.
31 : * When the continuation bit is set, the next byte is part of the same
32 : * integer, otherwise this is the last byte of this integer. 43 bits fit
33 : * conveniently in at most 6 bytes when varbyte encoded (the 6th byte does
34 : * not need a continuation bit, because we know the max size to be 43 bits):
35 : *
36 : * 0XXXXXXX
37 : * 1XXXXXXX 0XXXXYYY
38 : * 1XXXXXXX 1XXXXYYY 0YYYYYYY
39 : * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
40 : * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
41 : * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY YYYYYYYY
42 : *
43 : * X = bits used for offset number
44 : * Y = bits used for block number
45 : *
46 : * The bytes are in stored in little-endian order.
47 : *
48 : * An important property of this encoding is that removing an item from list
49 : * never increases the size of the resulting compressed posting list. Proof:
50 : *
51 : * Removing number is actually replacement of two numbers with their sum. We
52 : * have to prove that varbyte encoding of a sum can't be longer than varbyte
53 : * encoding of its summands. Sum of two numbers is at most one bit wider than
54 : * the larger of the summands. Widening a number by one bit enlarges its length
55 : * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
56 : * is at most one byte longer than varbyte encoding of larger summand. Lesser
57 : * summand is at least one byte, so the sum cannot take more space than the
58 : * summands, Q.E.D.
59 : *
60 : * This property greatly simplifies VACUUM, which can assume that posting
61 : * lists always fit on the same page after vacuuming. Note that even though
62 : * that holds for removing items from a posting list, you must also be
63 : * careful to not cause expansion e.g. when merging uncompressed items on the
64 : * page into the compressed lists, when vacuuming.
65 : */
66 :
67 : /*
68 : * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
69 : * integer, but you can't fit that many items on a page. 11 ought to be more
70 : * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
71 : * use the minimum number of bits, but that would require changing the on-disk
72 : * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
73 : */
74 : #define MaxHeapTuplesPerPageBits 11
75 :
76 : static inline uint64
77 2221966 : itemptr_to_uint64(const ItemPointer iptr)
78 : {
79 : uint64 val;
80 :
81 2221966 : Assert(ItemPointerIsValid(iptr));
82 2221966 : Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));
83 :
84 2221966 : val = GinItemPointerGetBlockNumber(iptr);
85 2221966 : val <<= MaxHeapTuplesPerPageBits;
86 2221966 : val |= GinItemPointerGetOffsetNumber(iptr);
87 :
88 2221966 : return val;
89 : }
90 :
91 : static inline void
92 4232353 : uint64_to_itemptr(uint64 val, ItemPointer iptr)
93 : {
94 4232353 : GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
95 4232353 : val = val >> MaxHeapTuplesPerPageBits;
96 4232353 : GinItemPointerSetBlockNumber(iptr, val);
97 :
98 4232353 : Assert(ItemPointerIsValid(iptr));
99 4232353 : }
100 :
101 : /*
102 : * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
103 : */
104 : static void
105 2105060 : encode_varbyte(uint64 val, unsigned char **ptr)
106 : {
107 2105060 : unsigned char *p = *ptr;
108 :
109 4264057 : while (val > 0x7F)
110 : {
111 53937 : *(p++) = 0x80 | (val & 0x7F);
112 53937 : val >>= 7;
113 : }
114 2105060 : *(p++) = (unsigned char) val;
115 :
116 2105060 : *ptr = p;
117 2105060 : }
118 :
119 : /*
120 : * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
121 : */
122 : static uint64
123 4232353 : decode_varbyte(unsigned char **ptr)
124 : {
125 : uint64 val;
126 4232353 : unsigned char *p = *ptr;
127 : uint64 c;
128 :
129 4232353 : c = *(p++);
130 4232353 : val = c & 0x7F;
131 4232353 : if (c & 0x80)
132 : {
133 95836 : c = *(p++);
134 95836 : val |= (c & 0x7F) << 7;
135 95836 : if (c & 0x80)
136 : {
137 10660 : c = *(p++);
138 10660 : val |= (c & 0x7F) << 14;
139 10660 : if (c & 0x80)
140 : {
141 0 : c = *(p++);
142 0 : val |= (c & 0x7F) << 21;
143 0 : if (c & 0x80)
144 : {
145 0 : c = *(p++);
146 0 : val |= (c & 0x7F) << 28;
147 0 : if (c & 0x80)
148 : {
149 0 : c = *(p++);
150 0 : val |= (c & 0x7F) << 35;
151 0 : if (c & 0x80)
152 : {
153 : /* last byte, no continuation bit */
154 0 : c = *(p++);
155 0 : val |= c << 42;
156 : }
157 : }
158 : }
159 : }
160 : }
161 : }
162 :
163 4232353 : *ptr = p;
164 :
165 4232353 : return val;
166 : }
167 :
168 : /*
169 : * Encode a posting list.
170 : *
171 : * The encoded list is returned in a palloc'd struct, which will be at most
172 : * 'maxsize' bytes in size. The number items in the returned segment is
173 : * returned in *nwritten. If it's not equal to nipd, not all the items fit
174 : * in 'maxsize', and only the first *nwritten were encoded.
175 : *
176 : * The allocated size of the returned struct is short-aligned, and the padding
177 : * byte at the end, if any, is zero.
178 : */
179 : GinPostingList *
180 42675 : ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
181 : int *nwritten)
182 : {
183 : uint64 prev;
184 42675 : int totalpacked = 0;
185 : int maxbytes;
186 : GinPostingList *result;
187 : unsigned char *ptr;
188 : unsigned char *endptr;
189 :
190 42675 : maxsize = SHORTALIGN_DOWN(maxsize);
191 :
192 42675 : result = palloc(maxsize);
193 :
194 42675 : maxbytes = maxsize - offsetof(GinPostingList, bytes);
195 42675 : Assert(maxbytes > 0);
196 :
197 : /* Store the first special item */
198 42675 : result->first = ipd[0];
199 :
200 42675 : prev = itemptr_to_uint64(&result->first);
201 :
202 42675 : ptr = result->bytes;
203 42675 : endptr = result->bytes + maxbytes;
204 2147546 : for (totalpacked = 1; totalpacked < nipd; totalpacked++)
205 : {
206 2105060 : uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
207 2105060 : uint64 delta = val - prev;
208 :
209 2105060 : Assert(val > prev);
210 :
211 2105060 : if (endptr - ptr >= 6)
212 2103920 : encode_varbyte(delta, &ptr);
213 : else
214 : {
215 : /*
216 : * There are less than 6 bytes left. Have to check if the next
217 : * item fits in that space before writing it out.
218 : */
219 : unsigned char buf[6];
220 1140 : unsigned char *p = buf;
221 :
222 1140 : encode_varbyte(delta, &p);
223 1140 : if (p - buf > (endptr - ptr))
224 189 : break; /* output is full */
225 :
226 951 : memcpy(ptr, buf, p - buf);
227 951 : ptr += (p - buf);
228 : }
229 2104871 : prev = val;
230 : }
231 42675 : result->nbytes = ptr - result->bytes;
232 :
233 : /*
234 : * If we wrote an odd number of bytes, zero out the padding byte at the
235 : * end.
236 : */
237 42675 : if (result->nbytes != SHORTALIGN(result->nbytes))
238 8214 : result->bytes[result->nbytes] = 0;
239 :
240 42675 : if (nwritten)
241 3528 : *nwritten = totalpacked;
242 :
243 42675 : Assert(SizeOfGinPostingList(result) <= maxsize);
244 :
245 : /*
246 : * Check that the encoded segment decodes back to the original items.
247 : */
248 : #if defined (CHECK_ENCODING_ROUNDTRIP)
249 : {
250 : int ndecoded;
251 42675 : ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
252 : int i;
253 :
254 42675 : Assert(ndecoded == totalpacked);
255 2190221 : for (i = 0; i < ndecoded; i++)
256 2147546 : Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0);
257 42675 : pfree(tmp);
258 : }
259 : #endif
260 :
261 42675 : return result;
262 : }
263 :
264 : /*
265 : * Decode a compressed posting list into an array of item pointers.
266 : * The number of items is returned in *ndecoded.
267 : */
268 : ItemPointer
269 74085 : ginPostingListDecode(GinPostingList *plist, int *ndecoded)
270 : {
271 74085 : return ginPostingListDecodeAllSegments(plist,
272 74085 : SizeOfGinPostingList(plist),
273 : ndecoded);
274 : }
275 :
276 : /*
277 : * Decode multiple posting list segments into an array of item pointers.
278 : * The number of items is returned in *ndecoded_out. The segments are stored
279 : * one after each other, with total size 'len' bytes.
280 : */
281 : ItemPointer
282 74091 : ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
283 : {
284 : ItemPointer result;
285 : int nallocated;
286 : uint64 val;
287 74091 : char *endseg = ((char *) segment) + len;
288 : int ndecoded;
289 : unsigned char *ptr;
290 : unsigned char *endptr;
291 :
292 : /*
293 : * Guess an initial size of the array.
294 : */
295 74091 : nallocated = segment->nbytes * 2 + 1;
296 74091 : result = palloc(nallocated * sizeof(ItemPointerData));
297 :
298 74091 : ndecoded = 0;
299 222413 : while ((char *) segment < endseg)
300 : {
301 : /* enlarge output array if needed */
302 74231 : if (ndecoded >= nallocated)
303 : {
304 0 : nallocated *= 2;
305 0 : result = repalloc(result, nallocated * sizeof(ItemPointerData));
306 : }
307 :
308 : /* copy the first item */
309 74231 : Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
310 74231 : Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
311 74231 : result[ndecoded] = segment->first;
312 74231 : ndecoded++;
313 :
314 74231 : val = itemptr_to_uint64(&segment->first);
315 74231 : ptr = segment->bytes;
316 74231 : endptr = segment->bytes + segment->nbytes;
317 4380815 : while (ptr < endptr)
318 : {
319 : /* enlarge output array if needed */
320 4232353 : if (ndecoded >= nallocated)
321 : {
322 24 : nallocated *= 2;
323 24 : result = repalloc(result, nallocated * sizeof(ItemPointerData));
324 : }
325 :
326 4232353 : val += decode_varbyte(&ptr);
327 :
328 4232353 : uint64_to_itemptr(val, &result[ndecoded]);
329 4232353 : ndecoded++;
330 : }
331 74231 : segment = GinNextPostingListSegment(segment);
332 : }
333 :
334 74091 : if (ndecoded_out)
335 74091 : *ndecoded_out = ndecoded;
336 74091 : return result;
337 : }
338 :
339 : /*
340 : * Add all item pointers from a bunch of posting lists to a TIDBitmap.
341 : */
342 : int
343 0 : ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len,
344 : TIDBitmap *tbm)
345 : {
346 : int ndecoded;
347 : ItemPointer items;
348 :
349 0 : items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
350 0 : tbm_add_tuples(tbm, items, ndecoded, false);
351 0 : pfree(items);
352 :
353 0 : return ndecoded;
354 : }
355 :
356 : /*
357 : * Merge two ordered arrays of itempointers, eliminating any duplicates.
358 : *
359 : * Returns a palloc'd array, and *nmerged is set to the number of items in
360 : * the result, after eliminating duplicates.
361 : */
362 : ItemPointer
363 6981 : ginMergeItemPointers(ItemPointerData *a, uint32 na,
364 : ItemPointerData *b, uint32 nb,
365 : int *nmerged)
366 : {
367 : ItemPointerData *dst;
368 :
369 6981 : dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
370 :
371 : /*
372 : * If the argument arrays don't overlap, we can just append them to each
373 : * other.
374 : */
375 6981 : if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
376 : {
377 3316 : memcpy(dst, a, na * sizeof(ItemPointerData));
378 3316 : memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
379 3316 : *nmerged = na + nb;
380 : }
381 3665 : else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
382 : {
383 3665 : memcpy(dst, b, nb * sizeof(ItemPointerData));
384 3665 : memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
385 3665 : *nmerged = na + nb;
386 : }
387 : else
388 : {
389 0 : ItemPointerData *dptr = dst;
390 0 : ItemPointerData *aptr = a;
391 0 : ItemPointerData *bptr = b;
392 :
393 0 : while (aptr - a < na && bptr - b < nb)
394 : {
395 0 : int cmp = ginCompareItemPointers(aptr, bptr);
396 :
397 0 : if (cmp > 0)
398 0 : *dptr++ = *bptr++;
399 0 : else if (cmp == 0)
400 : {
401 : /* only keep one copy of the identical items */
402 0 : *dptr++ = *bptr++;
403 0 : aptr++;
404 : }
405 : else
406 0 : *dptr++ = *aptr++;
407 : }
408 :
409 0 : while (aptr - a < na)
410 0 : *dptr++ = *aptr++;
411 :
412 0 : while (bptr - b < nb)
413 0 : *dptr++ = *bptr++;
414 :
415 0 : *nmerged = dptr - dst;
416 : }
417 :
418 6981 : return dst;
419 : }
|