@@ -579,9 +579,9 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
579579{
580580int num_hist = stats -> attr -> attstattarget ;
581581DECountItem * * sorted_count_items ;
582- int count_item_index ;
582+ int j ;
583583int delta ;
584- int frac ;
584+ int64 frac ;
585585float4 * hist ;
586586
587587/* num_hist must be at least 2 for the loop below to work */
@@ -594,45 +594,70 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
594594sorted_count_items = (DECountItem * * )
595595palloc (sizeof (DECountItem * )* count_items_count );
596596hash_seq_init (& scan_status ,count_tab );
597- count_item_index = 0 ;
597+ j = 0 ;
598598while ((count_item = (DECountItem * )hash_seq_search (& scan_status ))!= NULL )
599599{
600- sorted_count_items [count_item_index ++ ]= count_item ;
600+ sorted_count_items [j ++ ]= count_item ;
601601}
602602qsort (sorted_count_items ,count_items_count ,
603603sizeof (DECountItem * ),countitem_compare_count );
604604
605605/*
606- *Fill stanumbers with the histogram, followed by the average
607- * count. This array must be stored in anl_context.
606+ *Prepare to fill stanumbers with the histogram, followed by the
607+ *average count. This array must be stored in anl_context.
608608 */
609609hist = (float4 * )
610610MemoryContextAlloc (stats -> anl_context ,
611611sizeof (float4 )* (num_hist + 1 ));
612612hist [num_hist ]= (double )element_no / (double )nonnull_cnt ;
613613
614- /*
615- * Construct the histogram.
614+ /*----------
615+ * Construct the histogram of distinct-element counts (DECs).
616+ *
617+ * The object of this loop is to copy the min and max DECs to
618+ * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
619+ * in between (where "evenly-spaced" is with reference to the
620+ * whole input population of arrays). If we had a complete sorted
621+ * array of DECs, one per analyzed row, the i'th hist value would
622+ * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
623+ * (compare the histogram-making loop in compute_scalar_stats()).
624+ * But instead of that we have the sorted_count_items[] array,
625+ * which holds unique DEC values with their frequencies (that is,
626+ * a run-length-compressed version of the full array). So we
627+ * control advancing through sorted_count_items[] with the
628+ * variable "frac", which is defined as (x - y) * (num_hist - 1),
629+ * where x is the index in the notional DECs array corresponding
630+ * to the start of the next sorted_count_items[] element's run,
631+ * and y is the index in DECs from which we should take the next
632+ * histogram value. We have to advance whenever x <= y, that is
633+ * frac <= 0. The x component is the sum of the frequencies seen
634+ * so far (up through the current sorted_count_items[] element),
635+ * and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
636+ * per the subscript calculation above. (The subscript calculation
637+ * implies dropping any fractional part of y; in this formulation
638+ * that's handled by not advancing until frac reaches 1.)
616639 *
617- * XXX this needs work: frac could overflow, and it's not clear
618- * how or why the code works. Even if it does work, it needs
619- * documented.
640+ * Even though frac has a bounded range, it could overflow int32
641+ * when working with very large statistics targets, so we do that
642+ * math in int64.
643+ *----------
620644 */
621645delta = analyzed_rows - 1 ;
622- count_item_index = 0 ;
623- frac = sorted_count_items [0 ]-> frequency * (num_hist - 1 );
646+ j = 0 ;/* current index in sorted_count_items */
647+ /* Initialize frac for sorted_count_items[0]; y is initially 0 */
648+ frac = (int64 )sorted_count_items [0 ]-> frequency * (num_hist - 1 );
624649for (i = 0 ;i < num_hist ;i ++ )
625650{
626651while (frac <=0 )
627652{
628- count_item_index ++ ;
629- Assert ( count_item_index < count_items_count ) ;
630- frac += sorted_count_items [count_item_index ]-> frequency * (num_hist - 1 );
653+ /* Advance, and update x component of frac */
654+ j ++ ;
655+ frac += ( int64 ) sorted_count_items [j ]-> frequency * (num_hist - 1 );
631656}
632- hist [i ]= sorted_count_items [count_item_index ]-> count ;
633- frac -= delta ;
657+ hist [i ]= sorted_count_items [j ]-> count ;
658+ frac -= delta ;/* update y for upcoming i increment */
634659}
635- Assert (count_item_index == count_items_count - 1 );
660+ Assert (j == count_items_count - 1 );
636661
637662stats -> stakind [slot_idx ]= STATISTIC_KIND_DECHIST ;
638663stats -> staop [slot_idx ]= extra_data -> eq_opr ;