From: Aurélien Reynaud Date: Wed, 1 Sep 2010 20:28:38 +0000 (+0200) Subject: lpar plugin: new attempt X-Git-Tag: collectd-5.0.0-beta0~27^2~15 X-Git-Url: https://git.verplant.org/?a=commitdiff_plain;h=3e375cba6873e0e5d47de4b16058cbd5ceb96184;p=collectd.git lpar plugin: new attempt Hello Florian, here is a new version of my lpar plugin. I tried to address the shortcomings of the previous attempt: - Minimum and maximum proc capacity have been removed, since they are static values - The plugin now uses the cpu type for every value, so the lpar_pcpu type is no longer needed - This also means rates no longer need to be computed in the plugin, so the code is IMHO much more elegant - There is a config option "ReportBySerial", as described in my previous email - We now use pool_busy_time directly instead of computing it from total and idle The patch is against the current 4.10 branch, rather than against ar/lpar, because it is more of a complete rewrite than just fixes. However, I could provide a patch against ar/lpar if you prefer. 
Regards, Aurélien Reynaud Signed-off-by: Florian Forster --- diff --git a/src/lpar.c b/src/lpar.c index 7998d796..2267e03c 100644 --- a/src/lpar.c +++ b/src/lpar.c @@ -22,72 +22,45 @@ #include "collectd.h" #include "common.h" #include "plugin.h" - #include #include -#include #include #ifndef XINTFRAC +# include # define XINTFRAC ((double)(_system_configuration.Xint) / \ (double)(_system_configuration.Xfrac)) #endif +#define HTIC2SEC(x) ((double)x * XINTFRAC / 1000000000.0) + +/* Max length of the type instance string */ +#define TYPE_INST_LEN (sizeof("pool--total") + 2*sizeof(int) + 1) static const char *config_keys[] = { - "CpuPoolStats" + "CpuPoolStats", + "ReportBySerial" }; static int config_keys_num = STATIC_ARRAY_SIZE (config_keys); -static int pool_stats = 0; +static int pool_stats = 0, + report_by_serial = 0; -/* As an LPAR can be moved transparently across physical systems - * through Live Partition Mobility (LPM), and the resources we are - * monitoring are tied to the underlying hardware, we need to keep - * track on which physical server we are currently on. This is done - * through the plugin instance which holds the chassis' serial. 
- */ static u_longlong_t last_time_base; -static u_longlong_t last_pcpu_user, - last_pcpu_sys, - last_pcpu_idle, - last_pcpu_wait; -static u_longlong_t last_pool_idle_time = 0; -static u_longlong_t last_idle_donated_purr = 0, - last_busy_donated_purr = 0, - last_busy_stolen_purr = 0, - last_idle_stolen_purr = 0; +static u_longlong_t ent_counter; static int donate_flag = 0; -/* Save the current values for the next iteration */ -static void save_last_values (perfstat_partition_total_t *lparstats) -{ - last_time_base = lparstats->timebase_last; - - last_pcpu_user = lparstats->puser; - last_pcpu_sys = lparstats->psys; - last_pcpu_idle = lparstats->pidle; - last_pcpu_wait = lparstats->pwait; - - if (donate_flag) - { - last_idle_donated_purr = lparstats->idle_donated_purr; - last_busy_donated_purr = lparstats->busy_donated_purr; - last_busy_stolen_purr = lparstats->busy_stolen_purr; - last_idle_stolen_purr = lparstats->idle_stolen_purr; - } - - last_pool_idle_time = lparstats->pool_idle_time; -} - static int lpar_config (const char *key, const char *value) { if (strcasecmp ("CpuPoolStats", key) == 0) { if (IS_TRUE (value)) pool_stats = 1; - else - pool_stats = 0; + } + else if (strcasecmp ("ReportBySerial", key) == 0) + { + if (IS_TRUE (value)) + report_by_serial = 1; } else { @@ -101,7 +74,7 @@ static int lpar_init (void) { perfstat_partition_total_t lparstats; - /* retrieve the initial metrics */ + /* Retrieve the initial metrics */ if (!perfstat_partition_total (NULL, &lparstats, sizeof (perfstat_partition_total_t), 1)) { @@ -114,133 +87,105 @@ static int lpar_init (void) donate_flag = 1; } - /* save the initial data */ - save_last_values (&lparstats); + if (pool_stats && !lparstats.type.b.pool_util_authority) + { + WARNING ("lpar plugin: this system does not have pool authority. 
" + "Disabling CPU pool statistics collection."); + pool_stats = 0; + } + + /* Initialize the fake counter for entitled capacity */ + last_time_base = lparstats.timebase_last; + ent_counter = 0; return (0); } /* int lpar_init */ -static void lpar_submit (const char *plugin_inst, const char *type_instance, double value) +static void lpar_submit (const char *type_instance, double value) { value_t values[1]; value_list_t vl = VALUE_LIST_INIT; - values[0].gauge = (gauge_t)value; + /* Although it appears as a double, value is really a (scaled) counter, + expressed in CPU x seconds. At high collection rates (< 1 min), its + integer part is very small and the resulting graphs get blocky. We regain + some precision by applying a x100 factor before casting it to a counter, + turning the final value into CPU units instead of CPUs. */ + values[0].counter = (counter_t)(value * 100.0 + 0.5); vl.values = values; vl.values_len = 1; - sstrncpy (vl.host, hostname_g, sizeof (vl.host)); + + /* An LPAR has the same serial number as the physical system it is currently + running on. It is a convenient way of tracking LPARs as they are moved + from chassis to chassis through Live Partition Mobility (LPM). 
*/ + if (report_by_serial) + { + struct utsname name; + if (uname (&name) != 0) + { + ERROR ("lpar plugin: uname failed."); + return; + } + sstrncpy (vl.host, name.machine, sizeof (vl.host)); + sstrncpy (vl.plugin_instance, hostname_g, sizeof (vl.plugin)); + } + else + { + sstrncpy (vl.host, hostname_g, sizeof (vl.host)); + } sstrncpy (vl.plugin, "lpar", sizeof (vl.plugin)); - sstrncpy (vl.plugin_instance, plugin_inst, sizeof (vl.plugin)); - sstrncpy (vl.type, "lpar_pcpu", sizeof (vl.type)); + sstrncpy (vl.type, "cpu", sizeof (vl.type)); sstrncpy (vl.type_instance, type_instance, sizeof (vl.type_instance)); plugin_dispatch_values (&vl); } -static int submit_counter (const char *plugin_instance, /* {{{ */ - const char *type, const char *type_instance, counter_t value) -{ - value_t values[1]; - value_list_t vl = VALUE_LIST_INIT; - - values[0].counter = value; - - vl.values = values; - vl.values_len = 1; - sstrncpy (vl.host, hostname_g, sizeof (vl.host)); - sstrncpy (vl.plugin, "lpar", sizeof (vl.plugin)); - sstrncpy (vl.plugin_instance, plugin_inst, sizeof (vl.plugin)); - sstrncpy (vl.type, type, sizeof (vl.type)); - sstrncpy (vl.type_instance, type_instance, sizeof (vl.type_instance)); - - return (plugin_dispatch_values (&vl)); -} /* }}} int submit_counter */ - static int lpar_read (void) { u_longlong_t delta_time_base; perfstat_partition_total_t lparstats; - struct utsname name; - char plugin_inst[DATA_MAX_NAME_LEN]; - _Bool have_donate = 0; - /* retrieve the current physical server's id and build the plugin - instance's name */ - if (uname (&name) != 0) - { - ERROR ("lpar plugin: uname failed."); - return (-1); - } - sstrncpy (plugin_inst, name.machine, sizeof (plugin_inst)); - - /* retrieve the current metrics */ - if (!perfstat_partition_total (/* name = */ NULL, /* "must be set to NULL" */ - &lparstats, sizeof (lparstats), - /* desired_number = */ 1 /* "must be set to 1" */)) + /* Retrieve the current metrics */ + if (!perfstat_partition_total (NULL, 
&lparstats, + sizeof (perfstat_partition_total_t), 1)) { ERROR ("lpar plugin: perfstat_partition_total failed."); return (-1); } - if (!lparstats.type.b.shared_enabled - && lparstats.type.b.donate_enabled) - have_donate = 1; - delta_time_base = lparstats.timebase_last - last_time_base; - if (delta_time_base == 0) - { - /* The system stats have not been updated since last time */ - return (0); - } - - submit_counter (plugin_inst, "cpu", "user", (counter_t) lparstats.puser); - submit_counter (plugin_inst, "cpu", "system", (counter_t) lparstats.psys); - submit_counter (plugin_inst, "cpu", "idle", (counter_t) lparstats.pidle); - submit_counter (plugin_inst, "cpu", "wait", (counter_t) lparstats.pwait); + last_time_base = lparstats.timebase_last; + + lpar_submit ("user", HTIC2SEC(lparstats.puser)); + lpar_submit ("sys", HTIC2SEC(lparstats.psys)); + lpar_submit ("wait", HTIC2SEC(lparstats.pwait)); + lpar_submit ("idle", HTIC2SEC(lparstats.pidle)); + /* Entitled capacity is reported as an absolute value instead of a counter, + so we fake one. It's also in CPU units, hence the division by 100 before + submission. */ + ent_counter += lparstats.entitled_proc_capacity * delta_time_base; + lpar_submit ("ent", HTIC2SEC(ent_counter) / 100.0); - /* FIXME: Use an appropriate GAUGE type here. 
*/ - lpar_submit (plugin_inst, "ent", (double)lparstats.entitled_proc_capacity / 100.0); - lpar_submit (plugin_inst, "max", (double)lparstats.max_proc_capacity / 100.0); - lpar_submit (plugin_inst, "min", (double)lparstats.min_proc_capacity / 100.0); - - if (have_donate) + if (donate_flag) { - dlt_idle_donated = lparstats.idle_donated_purr - last_idle_donated_purr; - dlt_busy_donated = lparstats.busy_donated_purr - last_busy_donated_purr; - dlt_idle_stolen = lparstats.idle_stolen_purr - last_idle_stolen_purr; - dlt_busy_stolen = lparstats.busy_stolen_purr - last_busy_stolen_purr; - - submit_counter (plugin_inst, "cpu", "donated-idle", (counter_t) lparstats.idle_donated_purr); - submit_counter (plugin_inst, "cpu", "donated-busy", (counter_t) lparstats.busy_donated_purr); - submit_counter (plugin_inst, "cpu", "stolen-idle", (counter_t) lparstats.idle_stolen_purr); - submit_counter (plugin_inst, "cpu", "stolen-busy", (counter_t) lparstats.busy_stolen_purr); + lpar_submit ("idle_donated", HTIC2SEC(lparstats.idle_donated_purr)); + lpar_submit ("busy_donated", HTIC2SEC(lparstats.busy_donated_purr)); + lpar_submit ("idle_stolen", HTIC2SEC(lparstats.idle_stolen_purr)); + lpar_submit ("busy_stolen", HTIC2SEC(lparstats.busy_stolen_purr)); } if (pool_stats) { - if (!lparstats.type.b.pool_util_authority) - { - WARNING ("lpar plugin: Pool utilization data is not available."); - } - else - { - u_longlong_t dlt_pit; - double total, idle; - char type[DATA_MAX_NAME_LEN]; + char typinst[TYPE_INST_LEN]; - /* FIXME: The pool id should probably be used as plugin instance. 
*/ - dlt_pit = lparstats.pool_idle_time - last_pool_idle_time; - total = (double)lparstats.phys_cpus_pool; - idle = (double)dlt_pit / XINTFRAC / (double)delta_time_base; - ssnprintf (type, sizeof(type), "pool-%X-total", lparstats.pool_id); - lpar_submit (plugin_inst, type, total); - ssnprintf (type, sizeof(type), "pool-%X-used", lparstats.pool_id); - lpar_submit (plugin_inst, type, total - idle); - } - } + /* Pool stats are in CPU x ns */ + ssnprintf (typinst, sizeof(typinst), "pool-%X-busy", lparstats.pool_id); + lpar_submit (typinst, (double)lparstats.pool_busy_time / 1000000000.0); - save_last_values (&lparstats); + ssnprintf (typinst, sizeof(typinst), "pool-%X-total", lparstats.pool_id); + lpar_submit (typinst, (double)lparstats.pool_max_time / 1000000000.0); + } return (0); } /* int lpar_read */ @@ -253,5 +198,5 @@ void module_register (void) plugin_register_read ("lpar", lpar_read); } /* void module_register */ -/* vim: set sw=8 sts=8 ts=8 noet : */ +/* vim: set sw=2 sts=2 ts=8 : */ diff --git a/src/types.db b/src/types.db index 962109f4..1b0020f6 100644 --- a/src/types.db +++ b/src/types.db @@ -88,7 +88,6 @@ irq value:COUNTER:U:65535 latency value:GAUGE:0:65535 links value:GAUGE:0:U load shortterm:GAUGE:0:100, midterm:GAUGE:0:100, longterm:GAUGE:0:100 -lpar_pcpu value:GAUGE:0:U memcached_command value:COUNTER:0:U memcached_connections value:GAUGE:0:U memcached_items value:GAUGE:0:U