Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * array_typanalyze.c
4 : * Functions for gathering statistics from array columns
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/array_typanalyze.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/tuptoaster.h"
18 : #include "catalog/pg_collation.h"
19 : #include "commands/vacuum.h"
20 : #include "utils/array.h"
21 : #include "utils/builtins.h"
22 : #include "utils/datum.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/typcache.h"
25 :
26 :
27 : /*
28 : * To avoid consuming too much memory, IO and CPU load during analysis, and/or
29 : * too much space in the resulting pg_statistic rows, we ignore arrays that
30 : * are wider than ARRAY_WIDTH_THRESHOLD (after detoasting!). Note that this
31 : * number is considerably more than the similar WIDTH_THRESHOLD limit used
32 : * in analyze.c's standard typanalyze code.
33 : */
34 : #define ARRAY_WIDTH_THRESHOLD 0x10000
35 :
36 : /* Extra data for compute_array_stats function */
37 : typedef struct
38 : {
39 : /* Information about array element type */
40 : Oid type_id; /* element type's OID */
41 : Oid eq_opr; /* default equality operator's OID */
42 : bool typbyval; /* physical properties of element type */
43 : int16 typlen;
44 : char typalign;
45 :
46 : /*
47 : * Lookup data for element type's comparison and hash functions (these are
48 : * in the type's typcache entry, which we expect to remain valid over the
49 : * lifespan of the ANALYZE run)
50 : */
51 : FmgrInfo *cmp;
52 : FmgrInfo *hash;
53 :
54 : /* Saved state from std_typanalyze() */
55 : AnalyzeAttrComputeStatsFunc std_compute_stats;
56 : void *std_extra_data;
57 : } ArrayAnalyzeExtraData;
58 :
59 : /*
60 : * While compute_array_stats is running, we keep a pointer to the extra data
61 : * here for use by assorted subroutines. compute_array_stats doesn't
62 : * currently need to be re-entrant, so avoiding this is not worth the extra
63 : * notational cruft that would be needed.
64 : */
65 : static ArrayAnalyzeExtraData *array_extra_data;
66 :
67 : /* A hash table entry for the Lossy Counting algorithm */
68 : typedef struct
69 : {
70 : Datum key; /* This is 'e' from the LC algorithm. */
71 : int frequency; /* This is 'f'. */
72 : int delta; /* And this is 'delta'. */
73 : int last_container; /* For de-duplication of array elements. */
74 : } TrackItem;
75 :
76 : /* A hash table entry for distinct-elements counts */
77 : typedef struct
78 : {
79 : int count; /* Count of distinct elements in an array */
80 : int frequency; /* Number of arrays seen with this count */
81 : } DECountItem;
82 :
83 : static void compute_array_stats(VacAttrStats *stats,
84 : AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
85 : static void prune_element_hashtable(HTAB *elements_tab, int b_current);
86 : static uint32 element_hash(const void *key, Size keysize);
87 : static int element_match(const void *key1, const void *key2, Size keysize);
88 : static int element_compare(const void *key1, const void *key2);
89 : static int trackitem_compare_frequencies_desc(const void *e1, const void *e2);
90 : static int trackitem_compare_element(const void *e1, const void *e2);
91 : static int countitem_compare_count(const void *e1, const void *e2);
92 :
93 :
94 : /*
95 : * array_typanalyze -- typanalyze function for array columns
96 : */
97 : Datum
98 61 : array_typanalyze(PG_FUNCTION_ARGS)
99 : {
100 61 : VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
101 : Oid element_typeid;
102 : TypeCacheEntry *typentry;
103 : ArrayAnalyzeExtraData *extra_data;
104 :
105 : /*
106 : * Call the standard typanalyze function. It may fail to find needed
107 : * operators, in which case we also can't do anything, so just fail.
108 : */
109 61 : if (!std_typanalyze(stats))
110 0 : PG_RETURN_BOOL(false);
111 :
112 : /*
113 : * Check attribute data type is a varlena array (or a domain over one).
114 : */
115 61 : element_typeid = get_base_element_type(stats->attrtypid);
116 61 : if (!OidIsValid(element_typeid))
117 0 : elog(ERROR, "array_typanalyze was invoked for non-array type %u",
118 : stats->attrtypid);
119 :
120 : /*
121 : * Gather information about the element type. If we fail to find
122 : * something, return leaving the state from std_typanalyze() in place.
123 : */
124 61 : typentry = lookup_type_cache(element_typeid,
125 : TYPECACHE_EQ_OPR |
126 : TYPECACHE_CMP_PROC_FINFO |
127 : TYPECACHE_HASH_PROC_FINFO);
128 :
129 122 : if (!OidIsValid(typentry->eq_opr) ||
130 104 : !OidIsValid(typentry->cmp_proc_finfo.fn_oid) ||
131 43 : !OidIsValid(typentry->hash_proc_finfo.fn_oid))
132 18 : PG_RETURN_BOOL(true);
133 :
134 : /* Store our findings for use by compute_array_stats() */
135 43 : extra_data = (ArrayAnalyzeExtraData *) palloc(sizeof(ArrayAnalyzeExtraData));
136 43 : extra_data->type_id = typentry->type_id;
137 43 : extra_data->eq_opr = typentry->eq_opr;
138 43 : extra_data->typbyval = typentry->typbyval;
139 43 : extra_data->typlen = typentry->typlen;
140 43 : extra_data->typalign = typentry->typalign;
141 43 : extra_data->cmp = &typentry->cmp_proc_finfo;
142 43 : extra_data->hash = &typentry->hash_proc_finfo;
143 :
144 : /* Save old compute_stats and extra_data for scalar statistics ... */
145 43 : extra_data->std_compute_stats = stats->compute_stats;
146 43 : extra_data->std_extra_data = stats->extra_data;
147 :
148 : /* ... and replace with our info */
149 43 : stats->compute_stats = compute_array_stats;
150 43 : stats->extra_data = extra_data;
151 :
152 : /*
153 : * Note we leave stats->minrows set as std_typanalyze set it. Should it
154 : * be increased for array analysis purposes?
155 : */
156 :
157 43 : PG_RETURN_BOOL(true);
158 : }
159 :
160 : /*
161 : * compute_array_stats() -- compute statistics for an array column
162 : *
163 : * This function computes statistics useful for determining selectivity of
164 : * the array operators <@, &&, and @>. It is invoked by ANALYZE via the
165 : * compute_stats hook after sample rows have been collected.
166 : *
167 : * We also invoke the standard compute_stats function, which will compute
168 : * "scalar" statistics relevant to the btree-style array comparison operators.
169 : * However, exact duplicates of an entire array may be rare despite many
170 : * arrays sharing individual elements. This especially afflicts long arrays,
171 : * which are also liable to lack all scalar statistics due to the low
172 : * WIDTH_THRESHOLD used in analyze.c. So, in addition to the standard stats,
173 : * we find the most common array elements and compute a histogram of distinct
174 : * element counts.
175 : *
176 : * The algorithm used is Lossy Counting, as proposed in the paper "Approximate
177 : * frequency counts over data streams" by G. S. Manku and R. Motwani, in
178 : * Proceedings of the 28th International Conference on Very Large Data Bases,
179 : * Hong Kong, China, August 2002, section 4.2. The paper is available at
180 : * http://www.vldb.org/conf/2002/S10P03.pdf
181 : *
182 : * The Lossy Counting (aka LC) algorithm goes like this:
183 : * Let s be the threshold frequency for an item (the minimum frequency we
184 : * are interested in) and epsilon the error margin for the frequency. Let D
185 : * be a set of triples (e, f, delta), where e is an element value, f is that
186 : * element's frequency (actually, its current occurrence count) and delta is
187 : * the maximum error in f. We start with D empty and process the elements in
188 : * batches of size w. (The batch size is also known as "bucket size" and is
189 : * equal to 1/epsilon.) Let the current batch number be b_current, starting
190 : * with 1. For each element e we either increment its f count, if it's
191 : * already in D, or insert a new triple into D with values (e, 1, b_current
192 : * - 1). After processing each batch we prune D, by removing from it all
193 : * elements with f + delta <= b_current. After the algorithm finishes we
194 : * suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
195 : * where N is the total number of elements in the input. We emit the
196 : * remaining elements with estimated frequency f/N. The LC paper proves
197 : * that this algorithm finds all elements with true frequency at least s,
198 : * and that no frequency is overestimated or is underestimated by more than
199 : * epsilon. Furthermore, given reasonable assumptions about the input
200 : * distribution, the required table size is no more than about 7 times w.
201 : *
202 : * In the absence of a principled basis for other particular values, we
203 : * follow ts_typanalyze() and use parameters s = 0.07/K, epsilon = s/10.
204 : * But we leave out the correction for stopwords, which do not apply to
205 : * arrays. These parameters give bucket width w = K/0.007 and maximum
206 : * expected hashtable size of about 1000 * K.
207 : *
208 : * Elements may repeat within an array. Since duplicates do not change the
209 : * behavior of <@, && or @>, we want to count each element only once per
210 : * array. Therefore, we store in the finished pg_statistic entry each
211 : * element's frequency as the fraction of all non-null rows that contain it.
212 : * We divide the raw counts by nonnull_cnt to get those figures.
213 : */
214 : static void
215 33 : compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
216 : int samplerows, double totalrows)
217 : {
218 : ArrayAnalyzeExtraData *extra_data;
219 : int num_mcelem;
220 33 : int null_cnt = 0;
221 33 : int null_elem_cnt = 0;
222 33 : int analyzed_rows = 0;
223 :
224 : /* This is D from the LC algorithm. */
225 : HTAB *elements_tab;
226 : HASHCTL elem_hash_ctl;
227 : HASH_SEQ_STATUS scan_status;
228 :
229 : /* This is the current bucket number from the LC algorithm */
230 : int b_current;
231 :
232 : /* This is 'w' from the LC algorithm */
233 : int bucket_width;
234 : int array_no;
235 : int64 element_no;
236 : TrackItem *item;
237 : int slot_idx;
238 : HTAB *count_tab;
239 : HASHCTL count_hash_ctl;
240 : DECountItem *count_item;
241 :
242 33 : extra_data = (ArrayAnalyzeExtraData *) stats->extra_data;
243 :
244 : /*
245 : * Invoke analyze.c's standard analysis function to create scalar-style
246 : * stats for the column. It will expect its own extra_data pointer, so
247 : * temporarily install that.
248 : */
249 33 : stats->extra_data = extra_data->std_extra_data;
250 33 : (*extra_data->std_compute_stats) (stats, fetchfunc, samplerows, totalrows);
251 33 : stats->extra_data = extra_data;
252 :
253 : /*
254 : * Set up static pointer for use by subroutines. We wait till here in
255 : * case std_compute_stats somehow recursively invokes us (probably not
256 : * possible, but ...)
257 : */
258 33 : array_extra_data = extra_data;
259 :
260 : /*
261 : * We want statistics_target * 10 elements in the MCELEM array. This
262 : * multiplier is pretty arbitrary, but is meant to reflect the fact that
263 : * the number of individual elements tracked in pg_statistic ought to be
264 : * more than the number of values for a simple scalar column.
265 : */
266 33 : num_mcelem = stats->attr->attstattarget * 10;
267 :
268 : /*
269 : * We set bucket width equal to num_mcelem / 0.007 as per the comment
270 : * above.
271 : */
272 33 : bucket_width = num_mcelem * 1000 / 7;
273 :
274 : /*
275 : * Create the hashtable. It will be in local memory, so we don't need to
276 : * worry about overflowing the initial size. Also we don't need to pay any
277 : * attention to locking and memory management.
278 : */
279 33 : MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl));
280 33 : elem_hash_ctl.keysize = sizeof(Datum);
281 33 : elem_hash_ctl.entrysize = sizeof(TrackItem);
282 33 : elem_hash_ctl.hash = element_hash;
283 33 : elem_hash_ctl.match = element_match;
284 33 : elem_hash_ctl.hcxt = CurrentMemoryContext;
285 33 : elements_tab = hash_create("Analyzed elements table",
286 : num_mcelem,
287 : &elem_hash_ctl,
288 : HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
289 :
290 : /* hashtable for array distinct elements counts */
291 33 : MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl));
292 33 : count_hash_ctl.keysize = sizeof(int);
293 33 : count_hash_ctl.entrysize = sizeof(DECountItem);
294 33 : count_hash_ctl.hcxt = CurrentMemoryContext;
295 33 : count_tab = hash_create("Array distinct element count table",
296 : 64,
297 : &count_hash_ctl,
298 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
299 :
300 : /* Initialize counters. */
301 33 : b_current = 1;
302 33 : element_no = 0;
303 :
304 : /* Loop over the arrays. */
305 29827 : for (array_no = 0; array_no < samplerows; array_no++)
306 : {
307 : Datum value;
308 : bool isnull;
309 : ArrayType *array;
310 : int num_elems;
311 : Datum *elem_values;
312 : bool *elem_nulls;
313 : bool null_present;
314 : int j;
315 29794 : int64 prev_element_no = element_no;
316 : int distinct_count;
317 : bool count_item_found;
318 :
319 29794 : vacuum_delay_point();
320 :
321 29794 : value = fetchfunc(stats, array_no, &isnull);
322 29794 : if (isnull)
323 : {
324 : /* array is null, just count that */
325 29035 : null_cnt++;
326 58070 : continue;
327 : }
328 :
329 : /* Skip too-large values. */
330 759 : if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD)
331 0 : continue;
332 : else
333 759 : analyzed_rows++;
334 :
335 : /*
336 : * Now detoast the array if needed, and deconstruct into datums.
337 : */
338 759 : array = DatumGetArrayTypeP(value);
339 :
340 759 : Assert(ARR_ELEMTYPE(array) == extra_data->type_id);
341 2277 : deconstruct_array(array,
342 : extra_data->type_id,
343 759 : extra_data->typlen,
344 759 : extra_data->typbyval,
345 759 : extra_data->typalign,
346 : &elem_values, &elem_nulls, &num_elems);
347 :
348 : /*
349 : * We loop through the elements in the array and add them to our
350 : * tracking hashtable.
351 : */
352 759 : null_present = false;
353 4543 : for (j = 0; j < num_elems; j++)
354 : {
355 : Datum elem_value;
356 : bool found;
357 :
358 : /* No null element processing other than flag setting here */
359 3784 : if (elem_nulls[j])
360 : {
361 4 : null_present = true;
362 573 : continue;
363 : }
364 :
365 : /* Lookup current element in hashtable, adding it if new */
366 3780 : elem_value = elem_values[j];
367 3780 : item = (TrackItem *) hash_search(elements_tab,
368 : (const void *) &elem_value,
369 : HASH_ENTER, &found);
370 :
371 3780 : if (found)
372 : {
373 : /* The element value is already on the tracking list */
374 :
375 : /*
376 : * The operators we assist ignore duplicate array elements, so
377 : * count a given distinct element only once per array.
378 : */
379 2851 : if (item->last_container == array_no)
380 565 : continue;
381 :
382 2286 : item->frequency++;
383 2286 : item->last_container = array_no;
384 : }
385 : else
386 : {
387 : /* Initialize new tracking list element */
388 :
389 : /*
390 : * If element type is pass-by-reference, we must copy it into
391 : * palloc'd space, so that we can release the array below. (We
392 : * do this so that the space needed for element values is
393 : * limited by the size of the hashtable; if we kept all the
394 : * array values around, it could be much more.)
395 : */
396 1858 : item->key = datumCopy(elem_value,
397 929 : extra_data->typbyval,
398 929 : extra_data->typlen);
399 :
400 929 : item->frequency = 1;
401 929 : item->delta = b_current - 1;
402 929 : item->last_container = array_no;
403 : }
404 :
405 : /* element_no is the number of elements processed (ie N) */
406 3215 : element_no++;
407 :
408 : /* We prune the D structure after processing each bucket */
409 3215 : if (element_no % bucket_width == 0)
410 : {
411 0 : prune_element_hashtable(elements_tab, b_current);
412 0 : b_current++;
413 : }
414 : }
415 :
416 : /* Count null element presence once per array. */
417 759 : if (null_present)
418 4 : null_elem_cnt++;
419 :
420 : /* Update frequency of the particular array distinct element count. */
421 759 : distinct_count = (int) (element_no - prev_element_no);
422 759 : count_item = (DECountItem *) hash_search(count_tab, &distinct_count,
423 : HASH_ENTER,
424 : &count_item_found);
425 :
426 759 : if (count_item_found)
427 679 : count_item->frequency++;
428 : else
429 80 : count_item->frequency = 1;
430 :
431 : /* Free memory allocated while detoasting. */
432 759 : if (PointerGetDatum(array) != value)
433 612 : pfree(array);
434 759 : pfree(elem_values);
435 759 : pfree(elem_nulls);
436 : }
437 :
438 : /* Skip pg_statistic slots occupied by standard statistics */
439 33 : slot_idx = 0;
440 96 : while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0)
441 30 : slot_idx++;
442 33 : if (slot_idx > STATISTIC_NUM_SLOTS - 2)
443 0 : elog(ERROR, "insufficient pg_statistic slots for array stats");
444 :
445 : /* We can only compute real stats if we found some non-null values. */
446 33 : if (analyzed_rows > 0)
447 : {
448 14 : int nonnull_cnt = analyzed_rows;
449 : int count_items_count;
450 : int i;
451 : TrackItem **sort_table;
452 : int track_len;
453 : int64 cutoff_freq;
454 : int64 minfreq,
455 : maxfreq;
456 :
457 : /*
458 : * We assume the standard stats code already took care of setting
459 : * stats_valid, stanullfrac, stawidth, stadistinct. We'd have to
460 : * re-compute those values if we wanted to not store the standard
461 : * stats.
462 : */
463 :
464 : /*
465 : * Construct an array of the interesting hashtable items, that is,
466 : * those meeting the cutoff frequency (s - epsilon)*N. Also identify
467 : * the minimum and maximum frequencies among these items.
468 : *
469 : * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
470 : * frequency is 9*N / bucket_width.
471 : */
472 14 : cutoff_freq = 9 * element_no / bucket_width;
473 :
474 14 : i = hash_get_num_entries(elements_tab); /* surely enough space */
475 14 : sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);
476 :
477 14 : hash_seq_init(&scan_status, elements_tab);
478 14 : track_len = 0;
479 14 : minfreq = element_no;
480 14 : maxfreq = 0;
481 957 : while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
482 : {
483 929 : if (item->frequency > cutoff_freq)
484 : {
485 929 : sort_table[track_len++] = item;
486 929 : minfreq = Min(minfreq, item->frequency);
487 929 : maxfreq = Max(maxfreq, item->frequency);
488 : }
489 : }
490 14 : Assert(track_len <= i);
491 :
492 : /* emit some statistics for debug purposes */
493 14 : elog(DEBUG3, "compute_array_stats: target # mces = %d, "
494 : "bucket width = %d, "
495 : "# elements = " INT64_FORMAT ", hashtable size = %d, "
496 : "usable entries = %d",
497 : num_mcelem, bucket_width, element_no, i, track_len);
498 :
499 : /*
500 : * If we obtained more elements than we really want, get rid of those
501 : * with least frequencies. The easiest way is to qsort the array into
502 : * descending frequency order and truncate the array.
503 : */
504 14 : if (num_mcelem < track_len)
505 : {
506 0 : qsort(sort_table, track_len, sizeof(TrackItem *),
507 : trackitem_compare_frequencies_desc);
508 : /* reset minfreq to the smallest frequency we're keeping */
509 0 : minfreq = sort_table[num_mcelem - 1]->frequency;
510 : }
511 : else
512 14 : num_mcelem = track_len;
513 :
514 : /* Generate MCELEM slot entry */
515 14 : if (num_mcelem > 0)
516 : {
517 : MemoryContext old_context;
518 : Datum *mcelem_values;
519 : float4 *mcelem_freqs;
520 :
521 : /*
522 : * We want to store statistics sorted on the element value using
523 : * the element type's default comparison function. This permits
524 : * fast binary searches in selectivity estimation functions.
525 : */
526 14 : qsort(sort_table, num_mcelem, sizeof(TrackItem *),
527 : trackitem_compare_element);
528 :
529 : /* Must copy the target values into anl_context */
530 14 : old_context = MemoryContextSwitchTo(stats->anl_context);
531 :
532 : /*
533 : * We sorted statistics on the element value, but we want to be
534 : * able to find the minimal and maximal frequencies without going
535 : * through all the values. We also want the frequency of null
536 : * elements. Store these three values at the end of mcelem_freqs.
537 : */
538 14 : mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
539 14 : mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4));
540 :
541 : /*
542 : * See comments above about use of nonnull_cnt as the divisor for
543 : * the final frequency estimates.
544 : */
545 943 : for (i = 0; i < num_mcelem; i++)
546 : {
547 929 : TrackItem *item = sort_table[i];
548 :
549 2787 : mcelem_values[i] = datumCopy(item->key,
550 929 : extra_data->typbyval,
551 929 : extra_data->typlen);
552 1858 : mcelem_freqs[i] = (double) item->frequency /
553 929 : (double) nonnull_cnt;
554 : }
555 14 : mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
556 14 : mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt;
557 14 : mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt;
558 :
559 14 : MemoryContextSwitchTo(old_context);
560 :
561 14 : stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM;
562 14 : stats->staop[slot_idx] = extra_data->eq_opr;
563 14 : stats->stanumbers[slot_idx] = mcelem_freqs;
564 : /* See above comment about extra stanumber entries */
565 14 : stats->numnumbers[slot_idx] = num_mcelem + 3;
566 14 : stats->stavalues[slot_idx] = mcelem_values;
567 14 : stats->numvalues[slot_idx] = num_mcelem;
568 : /* We are storing values of element type */
569 14 : stats->statypid[slot_idx] = extra_data->type_id;
570 14 : stats->statyplen[slot_idx] = extra_data->typlen;
571 14 : stats->statypbyval[slot_idx] = extra_data->typbyval;
572 14 : stats->statypalign[slot_idx] = extra_data->typalign;
573 14 : slot_idx++;
574 : }
575 :
576 : /* Generate DECHIST slot entry */
577 14 : count_items_count = hash_get_num_entries(count_tab);
578 14 : if (count_items_count > 0)
579 : {
580 14 : int num_hist = stats->attr->attstattarget;
581 : DECountItem **sorted_count_items;
582 : int j;
583 : int delta;
584 : int64 frac;
585 : float4 *hist;
586 :
587 : /* num_hist must be at least 2 for the loop below to work */
588 14 : num_hist = Max(num_hist, 2);
589 :
590 : /*
591 : * Create an array of DECountItem pointers, and sort them into
592 : * increasing count order.
593 : */
594 14 : sorted_count_items = (DECountItem **)
595 14 : palloc(sizeof(DECountItem *) * count_items_count);
596 14 : hash_seq_init(&scan_status, count_tab);
597 14 : j = 0;
598 108 : while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
599 : {
600 80 : sorted_count_items[j++] = count_item;
601 : }
602 14 : qsort(sorted_count_items, count_items_count,
603 : sizeof(DECountItem *), countitem_compare_count);
604 :
605 : /*
606 : * Prepare to fill stanumbers with the histogram, followed by the
607 : * average count. This array must be stored in anl_context.
608 : */
609 14 : hist = (float4 *)
610 14 : MemoryContextAlloc(stats->anl_context,
611 : sizeof(float4) * (num_hist + 1));
612 14 : hist[num_hist] = (double) element_no / (double) nonnull_cnt;
613 :
614 : /*----------
615 : * Construct the histogram of distinct-element counts (DECs).
616 : *
617 : * The object of this loop is to copy the min and max DECs to
618 : * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
619 : * in between (where "evenly-spaced" is with reference to the
620 : * whole input population of arrays). If we had a complete sorted
621 : * array of DECs, one per analyzed row, the i'th hist value would
622 : * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
623 : * (compare the histogram-making loop in compute_scalar_stats()).
624 : * But instead of that we have the sorted_count_items[] array,
625 : * which holds unique DEC values with their frequencies (that is,
626 : * a run-length-compressed version of the full array). So we
627 : * control advancing through sorted_count_items[] with the
628 : * variable "frac", which is defined as (x - y) * (num_hist - 1),
629 : * where x is the index in the notional DECs array corresponding
630 : * to the start of the next sorted_count_items[] element's run,
631 : * and y is the index in DECs from which we should take the next
632 : * histogram value. We have to advance whenever x <= y, that is
633 : * frac <= 0. The x component is the sum of the frequencies seen
634 : * so far (up through the current sorted_count_items[] element),
635 : * and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
636 : * per the subscript calculation above. (The subscript calculation
637 : * implies dropping any fractional part of y; in this formulation
638 : * that's handled by not advancing until frac reaches 1.)
639 : *
640 : * Even though frac has a bounded range, it could overflow int32
641 : * when working with very large statistics targets, so we do that
642 : * math in int64.
643 : *----------
644 : */
645 14 : delta = analyzed_rows - 1;
646 14 : j = 0; /* current index in sorted_count_items */
647 : /* Initialize frac for sorted_count_items[0]; y is initially 0 */
648 14 : frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1);
649 1414 : for (i = 0; i < num_hist; i++)
650 : {
651 2866 : while (frac <= 0)
652 : {
653 : /* Advance, and update x component of frac */
654 66 : j++;
655 66 : frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1);
656 : }
657 1400 : hist[i] = sorted_count_items[j]->count;
658 1400 : frac -= delta; /* update y for upcoming i increment */
659 : }
660 14 : Assert(j == count_items_count - 1);
661 :
662 14 : stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
663 14 : stats->staop[slot_idx] = extra_data->eq_opr;
664 14 : stats->stanumbers[slot_idx] = hist;
665 14 : stats->numnumbers[slot_idx] = num_hist + 1;
666 14 : slot_idx++;
667 : }
668 : }
669 :
670 : /*
671 : * We don't need to bother cleaning up any of our temporary palloc's. The
672 : * hashtable should also go away, as it used a child memory context.
673 : */
674 33 : }
675 :
676 : /*
677 : * A function to prune the D structure from the Lossy Counting algorithm.
678 : * Consult compute_tsvector_stats() for wider explanation.
679 : */
680 : static void
681 0 : prune_element_hashtable(HTAB *elements_tab, int b_current)
682 : {
683 : HASH_SEQ_STATUS scan_status;
684 : TrackItem *item;
685 :
686 0 : hash_seq_init(&scan_status, elements_tab);
687 0 : while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
688 : {
689 0 : if (item->frequency + item->delta <= b_current)
690 : {
691 0 : Datum value = item->key;
692 :
693 0 : if (hash_search(elements_tab, (const void *) &item->key,
694 : HASH_REMOVE, NULL) == NULL)
695 0 : elog(ERROR, "hash table corrupted");
696 : /* We should free memory if element is not passed by value */
697 0 : if (!array_extra_data->typbyval)
698 0 : pfree(DatumGetPointer(value));
699 : }
700 : }
701 0 : }
702 :
703 : /*
704 : * Hash function for elements.
705 : *
706 : * We use the element type's default hash opclass, and the default collation
707 : * if the type is collation-sensitive.
708 : */
709 : static uint32
710 3780 : element_hash(const void *key, Size keysize)
711 : {
712 3780 : Datum d = *((const Datum *) key);
713 : Datum h;
714 :
715 3780 : h = FunctionCall1Coll(array_extra_data->hash, DEFAULT_COLLATION_OID, d);
716 3780 : return DatumGetUInt32(h);
717 : }
718 :
719 : /*
720 : * Matching function for elements, to be used in hashtable lookups.
721 : */
722 : static int
723 2851 : element_match(const void *key1, const void *key2, Size keysize)
724 : {
725 : /* The keysize parameter is superfluous here */
726 2851 : return element_compare(key1, key2);
727 : }
728 :
729 : /*
730 : * Comparison function for elements.
731 : *
732 : * We use the element type's default btree opclass, and the default collation
733 : * if the type is collation-sensitive.
734 : *
735 : * XXX consider using SortSupport infrastructure
736 : */
737 : static int
738 9943 : element_compare(const void *key1, const void *key2)
739 : {
740 9943 : Datum d1 = *((const Datum *) key1);
741 9943 : Datum d2 = *((const Datum *) key2);
742 : Datum c;
743 :
744 9943 : c = FunctionCall2Coll(array_extra_data->cmp, DEFAULT_COLLATION_OID, d1, d2);
745 9943 : return DatumGetInt32(c);
746 : }
747 :
748 : /*
749 : * qsort() comparator for sorting TrackItems by frequencies (descending sort)
750 : */
751 : static int
752 0 : trackitem_compare_frequencies_desc(const void *e1, const void *e2)
753 : {
754 0 : const TrackItem *const *t1 = (const TrackItem *const *) e1;
755 0 : const TrackItem *const *t2 = (const TrackItem *const *) e2;
756 :
757 0 : return (*t2)->frequency - (*t1)->frequency;
758 : }
759 :
760 : /*
761 : * qsort() comparator for sorting TrackItems by element values
762 : */
763 : static int
764 7092 : trackitem_compare_element(const void *e1, const void *e2)
765 : {
766 7092 : const TrackItem *const *t1 = (const TrackItem *const *) e1;
767 7092 : const TrackItem *const *t2 = (const TrackItem *const *) e2;
768 :
769 7092 : return element_compare(&(*t1)->key, &(*t2)->key);
770 : }
771 :
772 : /*
773 : * qsort() comparator for sorting DECountItems by count
774 : */
775 : static int
776 204 : countitem_compare_count(const void *e1, const void *e2)
777 : {
778 204 : const DECountItem *const *t1 = (const DECountItem *const *) e1;
779 204 : const DECountItem *const *t2 = (const DECountItem *const *) e2;
780 :
781 204 : if ((*t1)->count < (*t2)->count)
782 93 : return -1;
783 111 : else if ((*t1)->count == (*t2)->count)
784 0 : return 0;
785 : else
786 111 : return 1;
787 : }
|