From b6502396fff51f422ddae66c17e89b7fddeed957 Mon Sep 17 00:00:00 2001 From: Yoga Ramalingam Date: Fri, 14 Nov 2014 09:23:13 -0500 Subject: [PATCH] statsd histogram to support more than 1 second Summary: Problem: Collectd/Statsd supports configurable percentiles for timers but it limits the value to be 1 to 1000ms. If the timer value is more than 1000ms, it uses for min, max, average,... and drops it for percentile computation. Solution: Added support for increasing bin width when the value is above histogram's range. Test Plan: Tested by sending metrics within range and out-of-range (ie above 1000 ms) Reviewers: skhajamo, shalstea Reviewed By: skhajamo CC: arcyd Differential Revision: https://all.phab.dev.bloomberg.com/D156454 --- src/utils_latency.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 9 deletions(-) diff --git a/src/utils_latency.c b/src/utils_latency.c index 94da2112..cfe90e27 100644 --- a/src/utils_latency.c +++ b/src/utils_latency.c @@ -25,13 +25,18 @@ **/ #include "collectd.h" +#include "plugin.h" #include "utils_latency.h" #include "common.h" -#ifndef LATENCY_HISTOGRAM_SIZE -# define LATENCY_HISTOGRAM_SIZE 1000 +#include + +#ifndef HISTOGRAM_NUM_BINS +# define HISTOGRAM_NUM_BINS 1000 #endif +static const int HISTOGRAM_DEFAULT_BIN_WIDTH = 1; + struct latency_counter_s { cdtime_t start_time; @@ -42,9 +47,69 @@ struct latency_counter_s cdtime_t min; cdtime_t max; - int histogram[LATENCY_HISTOGRAM_SIZE]; + int bin_width; + int histogram[HISTOGRAM_NUM_BINS]; }; +/* +* Histogram represents the distribution of data, it has a list of "bins". +* Each bin represents an interval and has a count (frequency) of +* number of values fall within its interval. +* +* Histogram's range is determined by the number of bins and the bin width, +* There are 1000 bins and all bins have the same width of default 1 millisecond. +* When a value above this range is added, Histogram's range is increased by +* increasing the bin width (note that number of bins remains always at 1000). +* This operation of increasing bin width is little expensive as each bin need +* to be visited to update it's count. To reduce frequent change of bin width, +* new bin width will be the next nearest power of 2. Example: 2, 4, 8, 16, 32, +* 64, 128, 256, 512, 1024, 2048, 5086, ... +* +* So, if the required bin width is 300, then new bin width will be 512 as it is +* the next nearest power of 2. +* +*/ +void change_bin_width (latency_counter_t *lc, size_t val) /* {{{ */ +{ + int i=0; + /* This function is called because the new value is above histogram's range. + * First find the required bin width: + * requiredBinWidth = (value + 1) / numBins + * then get the next nearest power of 2 + * newBinWidth = 2^(ceil(log2(requiredBinWidth))) + */ + double required_bin_width = (double)(val + 1) / HISTOGRAM_NUM_BINS; + double required_bin_width_logbase2 = log(required_bin_width) / log(2.0); + int new_bin_width = (int)(pow(2.0, ceil( required_bin_width_logbase2))); + int old_bin_width = lc->bin_width; + lc->bin_width = new_bin_width; + + /* + * bin width has been increased, now iterate through all bins and move the + * old bin's count to new bin. + */ + if (lc->num > 0) // if the histogram has data then iterate else skip + { + double width_change_ratio = old_bin_width / new_bin_width; + for (i=0; ihistogram[new_bin] += lc->histogram[i]; + lc->histogram[i] = 0; + } + DEBUG("utils_latency: change_bin_width: fixed all bins"); + } + + DEBUG("utils_latency: change_bin_width: val-[%ld], oldBinWidth-[%d], " + "newBinWidth-[%d], required_bin_width-[%f], " + "required_bin_width_logbase2-[%f]", + val, old_bin_width, new_bin_width, required_bin_width, + required_bin_width_logbase2); + +} /* }}} void change_bin_width */ + latency_counter_t *latency_counter_create () /* {{{ */ { latency_counter_t *lc; @@ -54,6 +119,7 @@ latency_counter_t *latency_counter_create () /* {{{ */ return (NULL); latency_counter_reset (lc); + lc->bin_width = HISTOGRAM_DEFAULT_BIN_WIDTH; return (lc); } /* }}} latency_counter_t *latency_counter_create */ @@ -83,8 +149,19 @@ void latency_counter_add (latency_counter_t *lc, cdtime_t latency) /* {{{ */ * subtract one from the cdtime_t value so that exactly 1.0 ms get sorted * accordingly. */ latency_ms = (size_t) CDTIME_T_TO_MS (latency - 1); - if (latency_ms < STATIC_ARRAY_SIZE (lc->histogram)) - lc->histogram[latency_ms]++; + + int bin = (int)(latency_ms / lc->bin_width); + if (bin >= HISTOGRAM_NUM_BINS) + { + change_bin_width(lc, latency_ms); + bin = (int)(latency_ms / lc->bin_width); + if (bin >= HISTOGRAM_NUM_BINS) + { + ERROR("utils_latency: latency_counter_add: Invalid bin %d", bin); + return; + } + } + lc->histogram[bin]++; } /* }}} void latency_counter_add */ void latency_counter_reset (latency_counter_t *lc) /* {{{ */ @@ -92,7 +169,11 @@ void latency_counter_reset (latency_counter_t *lc) /* {{{ */ if (lc == NULL) return; + int bin_width = lc->bin_width; memset (lc, 0, sizeof (*lc)); + + /* preserve bin width */ + lc->bin_width = bin_width; lc->start_time = cdtime (); } /* }}} void latency_counter_reset */ @@ -153,7 +234,7 @@ cdtime_t latency_counter_get_percentile (latency_counter_t *lc, percent_upper = 0.0; percent_lower = 0.0; sum = 0; - for (i = 0; i < LATENCY_HISTOGRAM_SIZE; i++) + for (i = 0; i < HISTOGRAM_NUM_BINS; i++) { percent_lower = percent_upper; sum += lc->histogram[i]; @@ -166,14 +247,14 @@ cdtime_t latency_counter_get_percentile (latency_counter_t *lc, break; } - if (i >= LATENCY_HISTOGRAM_SIZE) + if (i >= HISTOGRAM_NUM_BINS) return (0); assert (percent_upper >= percent); assert (percent_lower < percent); - ms_upper = (double) (i + 1); - ms_lower = (double) i; + ms_upper = (double) ( (i + 1) * lc->bin_width ); + ms_lower = (double) ( i * lc->bin_width ); if (i == 0) return (MS_TO_CDTIME_T (ms_upper)); -- 2.11.0