2 * collectd - src/threshold.c
3 * Copyright (C) 2007-2010 Florian Forster
4 * Copyright (C) 2008-2009 Sebastian Harl
5 * Copyright (C) 2009 Andrés J. Díaz
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the
9 * Free Software Foundation; only version 2 of the License is applicable.
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 * Florian octo Forster <octo at collectd.org>
22 * Sebastian Harl <sh at tokkee.org>
23 * Andrés J. Díaz <ajdiaz at connectical.com>
30 #include "utils_avltree.h"
31 #include "utils_cache.h"
32 #include "utils_threshold.h"
35 * Threshold management
36 * ====================
37 * The following functions add, delete, search, etc. configured thresholds to
38 * the underlying AVL trees.
42 * int ut_threshold_add
44 * Adds a threshold configuration to the list of thresholds. The threshold_t
45 * structure is copied and may be destroyed after this call. Returns zero on
46 * success, non-zero otherwise.
/* Stores a copy of *th in the threshold list. The key is the identifier
 * built by format_name from host, plugin, plugin instance, type and type
 * instance. If threshold_get already finds an entry for these fields, the
 * copy is appended to the end of the chain linked through next; otherwise it
 * is inserted into threshold_tree under a heap copy of the key.
 * NOTE(review): some short lines (braces, declarations, returns) are not
 * visible in this listing, so cleanup and return paths are not shown. */
48 static int ut_threshold_add (const threshold_t *th)
50 char name[6 * DATA_MAX_NAME_LEN];
/* Build the identifier string that serves as the tree key. */
56 if (format_name (name, sizeof (name), th->host,
57 th->plugin, th->plugin_instance,
58 th->type, th->type_instance) != 0)
60 ERROR ("ut_threshold_add: format_name failed.");
/* Heap copy of the key; this is what gets passed to c_avl_insert below. */
64 name_copy = strdup (name);
65 if (name_copy == NULL)
67 ERROR ("ut_threshold_add: strdup failed.");
/* Heap copy of the threshold itself (plain byte copy of the struct). */
71 th_copy = malloc (sizeof (*th_copy));
75 ERROR ("ut_threshold_add: malloc failed.");
78 memcpy (th_copy, th, sizeof (threshold_t));
80 DEBUG ("ut_threshold_add: Adding entry `%s'", name);
/* The lookup and the insert/append both happen under threshold_lock. */
82 pthread_mutex_lock (&threshold_lock);
84 th_ptr = threshold_get (th->host, th->plugin, th->plugin_instance,
85 th->type, th->type_instance);
/* Advance to the last element of an existing chain, if there is one. */
87 while ((th_ptr != NULL) && (th_ptr->next != NULL))
88 th_ptr = th_ptr->next;
90 if (th_ptr == NULL) /* no such threshold yet */
92 status = c_avl_insert (threshold_tree, name_copy, th_copy);
94 else /* th_ptr points to the last threshold in the list */
96 th_ptr->next = th_copy;
97 /* name_copy isn't needed */
101 pthread_mutex_unlock (&threshold_lock);
105 ERROR ("ut_threshold_add: c_avl_insert (%s) failed.", name);
111 } /* }}} int ut_threshold_add */
116 * The following approximately two hundred functions are used to handle the
117 * configuration and fill the threshold list.
/* Handles the DataSource option of a Type block. The option must carry
 * exactly one string value; otherwise a warning is logged. The string is
 * copied into th->data_source with the field size passed as the limit. */
119 static int ut_config_type_datasource (threshold_t *th, oconfig_item_t *ci)
121 if ((ci->values_num != 1)
122 || (ci->values[0].type != OCONFIG_TYPE_STRING))
124 WARNING ("threshold values: The `DataSource' option needs exactly one "
129 sstrncpy (th->data_source, ci->values[0].value.string,
130 sizeof (th->data_source));
133 } /* int ut_config_type_datasource */
/* Handles the Instance option of a Type block. The option must carry exactly
 * one string value; otherwise a warning is logged. The string is copied into
 * th->type_instance with the field size passed as the limit. */
135 static int ut_config_type_instance (threshold_t *th, oconfig_item_t *ci)
137 if ((ci->values_num != 1)
138 || (ci->values[0].type != OCONFIG_TYPE_STRING))
140 WARNING ("threshold values: The `Instance' option needs exactly one "
145 sstrncpy (th->type_instance, ci->values[0].value.string,
146 sizeof (th->type_instance));
149 } /* int ut_config_type_instance */
/* Handles the WarningMax and FailureMax options (ut_config_type routes both
 * keys here). The option must carry exactly one numeric value; otherwise a
 * warning naming the option key is logged. */
151 static int ut_config_type_max (threshold_t *th, oconfig_item_t *ci)
153 if ((ci->values_num != 1)
154 || (ci->values[0].type != OCONFIG_TYPE_NUMBER))
156 WARNING ("threshold values: The `%s' option needs exactly one "
157 "number argument.", ci->key);
/* The key decides which upper bound receives the number. NOTE(review): the
 * line separating the two assignments is not visible here; presumably an
 * else, so that any other key sets failure_max. */
161 if (strcasecmp (ci->key, "WarningMax") == 0)
162 th->warning_max = ci->values[0].value.number;
164 th->failure_max = ci->values[0].value.number;
167 } /* int ut_config_type_max */
/* Handles the WarningMin and FailureMin options (ut_config_type routes both
 * keys here). The option must carry exactly one numeric value; otherwise a
 * warning naming the option key is logged. */
169 static int ut_config_type_min (threshold_t *th, oconfig_item_t *ci)
171 if ((ci->values_num != 1)
172 || (ci->values[0].type != OCONFIG_TYPE_NUMBER))
174 WARNING ("threshold values: The `%s' option needs exactly one "
175 "number argument.", ci->key);
/* The key decides which lower bound receives the number. NOTE(review): the
 * line separating the two assignments is not visible here; presumably an
 * else, so that any other key sets failure_min. */
179 if (strcasecmp (ci->key, "WarningMin") == 0)
180 th->warning_min = ci->values[0].value.number;
182 th->failure_min = ci->values[0].value.number;
185 } /* int ut_config_type_min */
/* Handles the Hits option of a Type block. The option must carry exactly one
 * numeric value; otherwise a warning is logged. The number is stored in
 * th->hits, which ut_report_state compares against the cached hit counter. */
187 static int ut_config_type_hits (threshold_t *th, oconfig_item_t *ci)
189 if ((ci->values_num != 1)
190 || (ci->values[0].type != OCONFIG_TYPE_NUMBER))
192 WARNING ("threshold values: The `%s' option needs exactly one "
193 "number argument.", ci->key);
197 th->hits = ci->values[0].value.number;
200 } /* int ut_config_type_hits */
/* Handles the Hysteresis option of a Type block. The option must carry
 * exactly one numeric value; otherwise a warning is logged. The number is
 * stored in th->hysteresis; ut_check_one_data_source only applies it when it
 * is greater than zero. */
202 static int ut_config_type_hysteresis (threshold_t *th, oconfig_item_t *ci)
204 if ((ci->values_num != 1)
205 || (ci->values[0].type != OCONFIG_TYPE_NUMBER))
207 WARNING ("threshold values: The `%s' option needs exactly one "
208 "number argument.", ci->key);
212 th->hysteresis = ci->values[0].value.number;
215 } /* int ut_config_type_hysteresis */
/* Handles one Type block. The block needs exactly one string argument (the
 * type name) and at least one child option. A local copy of *th_orig is
 * filled from the child options and then handed to ut_threshold_add, so the
 * caller's template is never modified.
 * NOTE(review): declarations, the handling of a failed option and the return
 * statements are not visible in this listing. */
217 static int ut_config_type (const threshold_t *th_orig, oconfig_item_t *ci)
223 if ((ci->values_num != 1)
224 || (ci->values[0].type != OCONFIG_TYPE_STRING))
226 WARNING ("threshold values: The `Type' block needs exactly one string "
231 if (ci->children_num < 1)
233 WARNING ("threshold values: The `Type' block needs at least one option.");
/* Start from the inherited host/plugin settings and set the type name. */
237 memcpy (&th, th_orig, sizeof (th));
238 sstrncpy (th.type, ci->values[0].value.string, sizeof (th.type));
/* NAN marks a bound as not configured; the checks in
 * ut_check_one_data_source skip bounds for which isnan is true. */
240 th.warning_min = NAN;
241 th.warning_max = NAN;
242 th.failure_min = NAN;
243 th.failure_max = NAN;
/* Plain assignment: any flags copied from th_orig are replaced here. */
246 th.flags = UT_FLAG_INTERESTING; /* interesting by default */
/* Dispatch every child option on its key, compared case-insensitively. */
248 for (i = 0; i < ci->children_num; i++)
250 oconfig_item_t *option = ci->children + i;
252 if (strcasecmp ("Instance", option->key) == 0)
253 status = ut_config_type_instance (&th, option);
254 else if (strcasecmp ("DataSource", option->key) == 0)
255 status = ut_config_type_datasource (&th, option);
256 else if ((strcasecmp ("WarningMax", option->key) == 0)
257 || (strcasecmp ("FailureMax", option->key) == 0))
258 status = ut_config_type_max (&th, option);
259 else if ((strcasecmp ("WarningMin", option->key) == 0)
260 || (strcasecmp ("FailureMin", option->key) == 0))
261 status = ut_config_type_min (&th, option);
262 else if (strcasecmp ("Interesting", option->key) == 0)
263 status = cf_util_get_flag (option, &th.flags, UT_FLAG_INTERESTING);
264 else if (strcasecmp ("Invert", option->key) == 0)
265 status = cf_util_get_flag (option, &th.flags, UT_FLAG_INVERT);
266 else if (strcasecmp ("Persist", option->key) == 0)
267 status = cf_util_get_flag (option, &th.flags, UT_FLAG_PERSIST);
268 else if (strcasecmp ("PersistOK", option->key) == 0)
269 status = cf_util_get_flag (option, &th.flags, UT_FLAG_PERSIST_OK);
270 else if (strcasecmp ("Percentage", option->key) == 0)
271 status = cf_util_get_flag (option, &th.flags, UT_FLAG_PERCENTAGE);
272 else if (strcasecmp ("Hits", option->key) == 0)
273 status = ut_config_type_hits (&th, option);
274 else if (strcasecmp ("Hysteresis", option->key) == 0)
275 status = ut_config_type_hysteresis (&th, option);
278 WARNING ("threshold values: Option `%s' not allowed inside a `Type' "
279 "block.", option->key);
/* Register the assembled threshold; ut_threshold_add stores its own copy. */
289 status = ut_threshold_add (&th);
293 } /* int ut_config_type */
/* Handles the Instance option of a Plugin block. The option must carry
 * exactly one string value; otherwise a warning is logged. The string is
 * copied into th->plugin_instance with the field size passed as the limit. */
295 static int ut_config_plugin_instance (threshold_t *th, oconfig_item_t *ci)
297 if ((ci->values_num != 1)
298 || (ci->values[0].type != OCONFIG_TYPE_STRING))
300 WARNING ("threshold values: The `Instance' option needs exactly one "
305 sstrncpy (th->plugin_instance, ci->values[0].value.string,
306 sizeof (th->plugin_instance));
309 } /* int ut_config_plugin_instance */
/* Handles one Plugin block. The block needs exactly one string argument (the
 * plugin name) and at least one child. A local copy of *th_orig receives the
 * plugin name and is passed on to the handlers of the nested Type blocks and
 * Instance options; any other key triggers a warning.
 * NOTE(review): declarations, status handling and returns are not visible in
 * this listing. */
311 static int ut_config_plugin (const threshold_t *th_orig, oconfig_item_t *ci)
317 if ((ci->values_num != 1)
318 || (ci->values[0].type != OCONFIG_TYPE_STRING))
320 WARNING ("threshold values: The `Plugin' block needs exactly one string "
325 if (ci->children_num < 1)
327 WARNING ("threshold values: The `Plugin' block needs at least one nested "
/* Work on a copy so the template passed in by the caller stays untouched. */
332 memcpy (&th, th_orig, sizeof (th));
333 sstrncpy (th.plugin, ci->values[0].value.string, sizeof (th.plugin));
/* Children are handled in configuration order, so an Instance option only
 * affects the Type blocks that are processed after it. */
335 for (i = 0; i < ci->children_num; i++)
337 oconfig_item_t *option = ci->children + i;
339 if (strcasecmp ("Type", option->key) == 0)
340 status = ut_config_type (&th, option);
341 else if (strcasecmp ("Instance", option->key) == 0)
342 status = ut_config_plugin_instance (&th, option);
345 WARNING ("threshold values: Option `%s' not allowed inside a `Plugin' "
346 "block.", option->key);
355 } /* int ut_config_plugin */
/* Handles one Host block. The block needs exactly one string argument (the
 * host name) and at least one child. A local copy of *th_orig receives the
 * host name and is passed on to the handlers of the nested Type and Plugin
 * blocks; any other key triggers a warning.
 * NOTE(review): declarations, status handling and returns are not visible in
 * this listing. */
357 static int ut_config_host (const threshold_t *th_orig, oconfig_item_t *ci)
363 if ((ci->values_num != 1)
364 || (ci->values[0].type != OCONFIG_TYPE_STRING))
366 WARNING ("threshold values: The `Host' block needs exactly one string "
371 if (ci->children_num < 1)
373 WARNING ("threshold values: The `Host' block needs at least one nested "
/* Work on a copy so the template passed in by the caller stays untouched. */
378 memcpy (&th, th_orig, sizeof (th));
379 sstrncpy (th.host, ci->values[0].value.string, sizeof (th.host));
381 for (i = 0; i < ci->children_num; i++)
383 oconfig_item_t *option = ci->children + i;
385 if (strcasecmp ("Type", option->key) == 0)
386 status = ut_config_type (&th, option);
387 else if (strcasecmp ("Plugin", option->key) == 0)
388 status = ut_config_plugin (&th, option);
391 WARNING ("threshold values: Option `%s' not allowed inside a `Host' "
392 "block.", option->key);
401 } /* int ut_config_host */
403 * End of the functions used to configure threshold values.
408 * int ut_report_state
410 * Checks if the `state' differs from the old state and creates a notification
/* Compares state with the state cached for this value list and, when a
 * report is due, builds a notification (severity, message text and meta
 * data taken from th and values[ds_index]) and dispatches it.
 * NOTE(review): the remaining parameters (ds_index and state are used below),
 * the local declarations, several else/return lines and the statements that
 * advance buf and bufsize are not visible in this listing. */
414 static int ut_report_state (const data_set_t *ds,
415 const value_list_t *vl,
416 const threshold_t *th,
417 const gauge_t *values,
429 /* Check if hits matched */
430 if ( (th->hits != 0) )
432 int hits = uc_get_hits(ds,vl);
433 /* STATE_OKAY resets hits unless PERSIST_OK flag is set. Hits resets if
434 * threshold is hit. */
435 if ( ( (state == STATE_OKAY) && ((th->flags & UT_FLAG_PERSIST_OK) == 0) ) || (hits > th->hits) )
437 DEBUG("ut_report_state: reset uc_get_hits = 0");
438 uc_set_hits(ds,vl,0); /* reset hit counter and notify */
440 DEBUG("ut_report_state: th->hits = %d, uc_get_hits = %d",th->hits,uc_get_hits(ds,vl));
441 (void) uc_inc_hits(ds,vl,1); /* increase hit counter */
444 } /* end check hits */
/* State stored in the cache by an earlier call for this value list. */
446 state_old = uc_get_state (ds, vl);
448 /* If the state didn't change, report if `persistent' is specified. If the
449 * state is `okay', then only report if `persist_ok` flag is set. */
450 if (state == state_old)
452 if ((th->flags & UT_FLAG_PERSIST) == 0)
454 else if ( (state == STATE_OKAY) && ((th->flags & UT_FLAG_PERSIST_OK) == 0) )
/* Only write to the cache when the state actually changed. */
458 if (state != state_old)
459 uc_set_state (ds, vl, state);
461 NOTIFICATION_INIT_VL (&n, vl);
464 bufsize = sizeof (n.message);
/* Map the threshold state onto a notification severity. The last assignment
 * is presumably the else branch (the else line is not visible). */
466 if (state == STATE_OKAY)
467 n.severity = NOTIF_OKAY;
468 else if (state == STATE_WARNING)
469 n.severity = NOTIF_WARNING;
471 n.severity = NOTIF_FAILURE;
/* Message prefix: host and plugin, then the plugin instance, the type and
 * the type instance; the instance parts only when they are non-empty. */
475 status = ssnprintf (buf, bufsize, "Host %s, plugin %s",
476 vl->host, vl->plugin);
480 if (vl->plugin_instance[0] != '\0')
482 status = ssnprintf (buf, bufsize, " (instance %s)",
483 vl->plugin_instance);
488 status = ssnprintf (buf, bufsize, " type %s", vl->type);
492 if (vl->type_instance[0] != '\0')
494 status = ssnprintf (buf, bufsize, " (instance %s)",
/* Attach the data source name, the current value and all four configured
 * bounds as notification meta data. */
500 plugin_notification_meta_add_string (&n, "DataSource",
501 ds->ds[ds_index].name);
502 plugin_notification_meta_add_double (&n, "CurrentValue", values[ds_index]);
503 plugin_notification_meta_add_double (&n, "WarningMin", th->warning_min);
504 plugin_notification_meta_add_double (&n, "WarningMax", th->warning_max);
505 plugin_notification_meta_add_double (&n, "FailureMin", th->failure_min);
506 plugin_notification_meta_add_double (&n, "FailureMax", th->failure_max);
508 /* Send an okay notification */
509 if (state == STATE_OKAY)
511 if (state_old == STATE_MISSING)
512 ssnprintf (buf, bufsize, ": Value is no longer missing.");
514 ssnprintf (buf, bufsize,
515 ": All data sources are within range again. "
516 "Current value of \"%s\" is %f.",
517 ds->ds[ds_index].name, values[ds_index]);
/* For the non-okay messages: use the failure bounds when the state is
 * STATE_ERROR and the warning bounds otherwise. */
524 min = (state == STATE_ERROR) ? th->failure_min : th->warning_min;
525 max = (state == STATE_ERROR) ? th->failure_max : th->warning_max;
/* Inverted threshold: with both bounds set the message names the region
 * between them; otherwise it names the single bound that is configured. */
527 if (th->flags & UT_FLAG_INVERT)
529 if (!isnan (min) && !isnan (max))
531 ssnprintf (buf, bufsize, ": Data source \"%s\" is currently "
532 "%f. That is within the %s region of %f%s and %f%s.",
533 ds->ds[ds_index].name, values[ds_index],
534 (state == STATE_ERROR) ? "failure" : "warning",
535 min, ((th->flags & UT_FLAG_PERCENTAGE) != 0) ? "%" : "",
536 max, ((th->flags & UT_FLAG_PERCENTAGE) != 0) ? "%" : "");
540 ssnprintf (buf, bufsize, ": Data source \"%s\" is currently "
541 "%f. That is %s the %s threshold of %f%s.",
542 ds->ds[ds_index].name, values[ds_index],
543 isnan (min) ? "below" : "above",
544 (state == STATE_ERROR) ? "failure" : "warning",
545 isnan (min) ? max : min,
546 ((th->flags & UT_FLAG_PERCENTAGE) != 0) ? "%" : "");
/* Percentage threshold: report the raw value together with its share of sum.
 * NOTE(review): the lines that accumulate sum are not visible here. */
549 else if (th->flags & UT_FLAG_PERCENTAGE)
556 for (i = 0; i < vl->values_len; i++)
558 if (isnan (values[i]))
567 value = 100.0 * values[ds_index] / sum;
569 ssnprintf (buf, bufsize, ": Data source \"%s\" is currently "
570 "%g (%.2f%%). That is %s the %s threshold of %.2f%%.",
571 ds->ds[ds_index].name, values[ds_index], value,
572 (value < min) ? "below" : "above",
573 (state == STATE_ERROR) ? "failure" : "warning",
574 (value < min) ? min : max);
576 else /* is not inverted */
578 ssnprintf (buf, bufsize, ": Data source \"%s\" is currently "
579 "%f. That is %s the %s threshold of %f.",
580 ds->ds[ds_index].name, values[ds_index],
581 (values[ds_index] < min) ? "below" : "above",
582 (state == STATE_ERROR) ? "failure" : "warning",
583 (values[ds_index] < min) ? min : max);
/* Hand the notification to the daemon, then release the meta data list that
 * was attached above. */
587 plugin_dispatch_notification (&n);
589 plugin_notification_meta_free (n.meta);
591 } /* }}} int ut_report_state */
594 * int ut_check_one_data_source
596 * Checks one data source against the given threshold configuration. If the
597 * `DataSource' option is set in the threshold, and the name does NOT match,
598 * `okay' is returned. If the threshold does match, its failure and warning
599 * min and max values are checked and `failure' or `warning' is returned if
/* Checks values[ds_index] against the failure and warning bounds of *th.
 * A bound that is NAN is skipped. When th->hysteresis is greater than zero,
 * the cached previous state selects by how much the failure or warning
 * bounds are shifted before the comparison.
 * NOTE(review): the remaining parameter line, the local declarations, the
 * statements guarded by the if lines and the case labels that select the
 * hysteresis amounts are not visible in this listing. */
603 static int ut_check_one_data_source (const data_set_t *ds,
604 const value_list_t __attribute__((unused)) *vl,
605 const threshold_t *th,
606 const gauge_t *values,
612 int prev_state = STATE_OKAY;
614 /* check if this threshold applies to this data source */
/* An empty th->data_source matches every data source; otherwise the name
 * must be equal (strcmp, so the comparison is case-sensitive). */
617 ds_name = ds->ds[ds_index].name;
618 if ((th->data_source[0] != 0)
619 && (strcmp (ds_name, th->data_source) != 0))
623 if ((th->flags & UT_FLAG_INVERT) != 0)
629 /* XXX: This is experimental code: not optimized, not fast, not reliable,
630 * and it probably does not work as you expect. Enjoy! :D */
631 if (th->hysteresis > 0)
633 prev_state = uc_get_state(ds,vl);
634 /* The purpose of hysteresis is eliminating flapping state when the value
635 * oscillates around the thresholds. In other words, what is important is
636 * the previous state; if the new value would trigger a transition, make
637 * sure that we artificially widen the range which is considered to apply
638 * for the previous state, and only trigger the notification if the value
639 * is outside of this expanded range.
641 * There is no hysteresis for the OKAY state.
643 gauge_t hysteresis_for_warning = 0, hysteresis_for_failure = 0;
647 hysteresis_for_failure = th->hysteresis;
650 hysteresis_for_warning = th->hysteresis;
653 /* do nothing -- the hysteresis only applies to the non-normal states */
/* With hysteresis: each minimum is raised and each maximum is lowered by the
 * amount selected above (zero when none was selected), so the condition
 * keeps matching until the value is more than that amount inside the bound. */
657 if ((!isnan (th->failure_min) && (th->failure_min + hysteresis_for_failure > values[ds_index]))
658 || (!isnan (th->failure_max) && (th->failure_max - hysteresis_for_failure < values[ds_index])))
661 if ((!isnan (th->warning_min) && (th->warning_min + hysteresis_for_warning > values[ds_index]))
662 || (!isnan (th->warning_max) && (th->warning_max - hysteresis_for_warning < values[ds_index])))
666 else { /* no hysteresis */
667 if ((!isnan (th->failure_min) && (th->failure_min > values[ds_index]))
668 || (!isnan (th->failure_max) && (th->failure_max < values[ds_index])))
671 if ((!isnan (th->warning_min) && (th->warning_min > values[ds_index]))
672 || (!isnan (th->warning_max) && (th->warning_max < values[ds_index])))
/* NOTE(review): the conditions guarding the two returns below are not
 * visible in this listing. */
677 return (STATE_ERROR);
680 return (STATE_WARNING);
683 } /* }}} int ut_check_one_data_source */
686 * int ut_check_one_threshold
688 * Checks all data sources of a value list against the given threshold, using
689 * the ut_check_one_data_source function above. Returns the worst status,
690 * which is `okay' if nothing has failed.
691 * Returns less than zero if the data set doesn't have any data sources.
/* Runs ut_check_one_data_source for every data source of ds against *th and
 * reports the index of the selected data source through ret_ds_index when
 * that pointer is not NULL. For thresholds flagged UT_FLAG_PERCENTAGE the
 * values are first converted to percentages of their sum.
 * NOTE(review): the remaining parameter line, the local declarations, the
 * accumulation of sum and num, the comparison that picks the worst status and
 * the return are not visible in this listing. */
693 static int ut_check_one_threshold (const data_set_t *ds,
694 const value_list_t *vl,
695 const threshold_t *th,
696 const gauge_t *values,
/* Local working copy (variable-length array sized by ds->ds_num); the
 * percentage conversion below writes to it instead of to values. */
702 gauge_t values_copy[ds->ds_num];
704 memcpy (values_copy, values, sizeof (values_copy));
706 if ((th->flags & UT_FLAG_PERCENTAGE) != 0)
713 WARNING ("ut_check_one_threshold: The %s type has only one data "
714 "source, but you have configured to check this as a percentage. "
715 "That doesn't make much sense, because the percentage will always "
716 "be 100%%!", ds->type);
719 /* Prepare `sum' and `num'. */
720 for (i = 0; i < ds->ds_num; i++)
721 if (!isnan (values[i]))
/* Without a usable sum every entry becomes NAN, so no percentage is
 * available for the checks below. */
727 if ((num == 0) /* All data sources are undefined. */
728 || (sum == 0.0)) /* Sum is zero, cannot calculate percentage. */
730 for (i = 0; i < ds->ds_num; i++)
731 values_copy[i] = NAN;
733 else /* We can actually calculate the percentage. */
735 for (i = 0; i < ds->ds_num; i++)
736 values_copy[i] = 100.0 * values[i] / sum;
738 } /* if (UT_FLAG_PERCENTAGE) */
/* The per-source check receives values_copy, i.e. percentages when the
 * conversion above ran and the unchanged values otherwise. */
740 for (i = 0; i < ds->ds_num; i++)
744 status = ut_check_one_data_source (ds, vl, th, values_copy, i);
750 } /* for (ds->ds_num) */
752 if (ret_ds_index != NULL)
753 *ret_ds_index = ds_index;
756 } /* }}} int ut_check_one_threshold */
759 * int ut_check_threshold
761 * Gets a list of matching thresholds and searches for the worst status by one
762 * of the thresholds. Then reports that status using the ut_report_state
764 * Returns zero on success and if no threshold has been configured. Returns
765 * less than zero on failure.
/* Write callback (registered through plugin_register_write in ut_config).
 * Looks up the threshold entry matching vl, fetches the current rates with
 * uc_get_rate, runs ut_check_one_threshold and remembers the highest status
 * together with its data source index, then passes that result to
 * ut_report_state. Failures of either helper are logged with ERROR.
 * NOTE(review): threshold_lock is held only around threshold_search; the
 * returned entry is used after the lock has been released.
 * NOTE(review): the loop over the threshold chain, the release of values and
 * the return statements are not visible in this listing. */
767 static int ut_check_threshold (const data_set_t *ds, const value_list_t *vl,
768 __attribute__((unused)) user_data_t *ud)
/* Running maximum of the statuses returned by ut_check_one_threshold; -1 and
 * NULL mean that nothing has been recorded yet. */
774 int worst_state = -1;
775 threshold_t *worst_th = NULL;
776 int worst_ds_index = -1;
/* threshold_tree is only created by ut_config. */
778 if (threshold_tree == NULL)
781 /* Is this lock really necessary? So far, thresholds are only inserted at
783 pthread_mutex_lock (&threshold_lock);
784 th = threshold_search (vl);
785 pthread_mutex_unlock (&threshold_lock);
789 DEBUG ("ut_check_threshold: Found matching threshold(s)");
791 values = uc_get_rate (ds, vl);
799 status = ut_check_one_threshold (ds, vl, th, values, &ds_index);
802 ERROR ("ut_check_threshold: ut_check_one_threshold failed.");
807 if (worst_state < status)
809 worst_state = status;
811 worst_ds_index = ds_index;
817 status = ut_report_state (ds, vl, worst_th, values,
818 worst_ds_index, worst_state);
821 ERROR ("ut_check_threshold: ut_report_state failed.");
829 } /* }}} int ut_check_threshold */
834 * This function is called whenever a value goes "missing".
/* Missing-value callback (registered through plugin_register_missing in
 * ut_config). If a threshold entry matches vl and has UT_FLAG_INTERESTING
 * set, a notification is dispatched that states how long the value has not
 * been updated.
 * NOTE(review): threshold_search is called here without taking
 * threshold_lock, unlike in ut_check_threshold; confirm this is intended.
 * NOTE(review): the declarations of th, n and now, the assignment of now and
 * the return statements are not visible in this listing. */
836 static int ut_missing (const value_list_t *vl,
837 __attribute__((unused)) user_data_t *ud)
840 cdtime_t missing_time;
841 char identifier[6 * DATA_MAX_NAME_LEN];
845 if (threshold_tree == NULL)
848 th = threshold_search (vl);
849 /* dispatch notifications for "interesting" values only */
850 if ((th == NULL) || ((th->flags & UT_FLAG_INTERESTING) == 0))
/* Age of the last update; presumably now holds the current time (TODO
 * confirm, its assignment is not visible). */
854 missing_time = now - vl->time;
855 FORMAT_VL (identifier, sizeof (identifier), vl);
857 NOTIFICATION_INIT_VL (&n, vl);
858 ssnprintf (n.message, sizeof (n.message),
859 "%s has not been updated for %.3f seconds.",
860 identifier, CDTIME_T_TO_DOUBLE (missing_time));
863 plugin_dispatch_notification (&n);
866 } /* }}} int ut_missing */
/* Configuration callback (registered in module_register). Creates
 * threshold_tree on first use with strcmp as the key comparison, then walks
 * the top-level children and passes Type, Plugin and Host blocks to their
 * handlers together with a template threshold; any other key triggers a
 * warning. The missing and write callbacks are registered once the tree goes
 * from empty to non-empty.
 * NOTE(review): most of the template initialiser, the status handling and the
 * return statements are not visible in this listing. */
868 static int ut_config (oconfig_item_t *ci)
/* NOTE(review): c_avl_size is called before the NULL check below, so this
 * relies on c_avl_size accepting a NULL tree; TODO confirm. */
872 int old_size = c_avl_size (threshold_tree);
874 if (threshold_tree == NULL)
876 threshold_tree = c_avl_create ((int (*) (const void *, const void *)) strcmp);
877 if (threshold_tree == NULL)
879 ERROR ("ut_config: c_avl_create failed.");
889 .flags = UT_FLAG_INTERESTING /* interesting by default */
892 for (i = 0; i < ci->children_num; i++)
894 oconfig_item_t *option = ci->children + i;
896 if (strcasecmp ("Type", option->key) == 0)
897 status = ut_config_type (&th, option);
898 else if (strcasecmp ("Plugin", option->key) == 0)
899 status = ut_config_plugin (&th, option);
900 else if (strcasecmp ("Host", option->key) == 0)
901 status = ut_config_host (&th, option);
904 WARNING ("threshold values: Option `%s' not allowed here.", option->key);
912 /* register callbacks if this is the first time we see a valid config */
913 if ((old_size == 0) && (c_avl_size (threshold_tree) > 0))
915 plugin_register_missing ("threshold", ut_missing,
916 /* user data = */ NULL);
917 plugin_register_write ("threshold", ut_check_threshold,
918 /* user data = */ NULL);
922 } /* }}} int ut_config */
/* Plugin entry point: registers ut_config as the complex-config callback for
 * the "threshold" plugin. The write and missing callbacks are registered
 * later, from ut_config. */
924 void module_register (void)
926 plugin_register_complex_config ("threshold", ut_config);
929 /* vim: set sw=2 ts=8 sts=2 tw=78 et fdm=marker : */