=back
+=head2 Plugin C<intel_pmu>
+
+The I<intel_pmu> plugin collects performance counters data on Intel CPUs using
+Linux perf interface. All events are reported on a per core basis.
+
+B<Synopsis:>
+
+ <Plugin intel_pmu>
+ ReportHardwareCacheEvents true
+ ReportKernelPMUEvents true
+ ReportSoftwareEvents true
+ EventList "/var/cache/pmu/GenuineIntel-6-2D-core.json"
+ HardwareEvents "L2_RQSTS.CODE_RD_HIT,L2_RQSTS.CODE_RD_MISS" "L2_RQSTS.ALL_CODE_RD"
+ </Plugin>
+
+B<Options:>
+
+=over 4
+
+=item B<ReportHardwareCacheEvents> B<false>|B<true>
+
+Enable or disable measuring of hardware CPU cache events:
+ - L1-dcache-loads
+ - L1-dcache-load-misses
+ - L1-dcache-stores
+ - L1-dcache-store-misses
+ - L1-dcache-prefetches
+ - L1-dcache-prefetch-misses
+ - L1-icache-loads
+ - L1-icache-load-misses
+ - L1-icache-prefetches
+ - L1-icache-prefetch-misses
+ - LLC-loads
+ - LLC-load-misses
+ - LLC-stores
+ - LLC-store-misses
+ - LLC-prefetches
+ - LLC-prefetch-misses
+ - dTLB-loads
+ - dTLB-load-misses
+ - dTLB-stores
+ - dTLB-store-misses
+ - dTLB-prefetches
+ - dTLB-prefetch-misses
+ - iTLB-loads
+ - iTLB-load-misses
+ - branch-loads
+ - branch-load-misses
+
+=item B<ReportKernelPMUEvents> B<false>|B<true>
+
+Enable or disable measuring of the following events:
+ - cpu-cycles
+ - instructions
+ - cache-references
+ - cache-misses
+ - branches
+ - branch-misses
+ - bus-cycles
+
+=item B<ReportSoftwareEvents> B<false>|B<true>
+
+Enable or disable measuring of software events provided by kernel:
+ - cpu-clock
+ - task-clock
+ - context-switches
+ - cpu-migrations
+ - page-faults
+ - minor-faults
+ - major-faults
+ - alignment-faults
+ - emulation-faults
+
+=item B<EventList> I<filename>
+
+JSON performance counter event list file name. To be able to monitor all Intel
+CPU specific events JSON event list file should be downloaded. Use the pmu-tools
+event_download.py script to download event list for current CPU.
+
+=item B<HardwareEvents> I<events>
+
+This field is a list of event names or groups of comma separated event names.
+This option requires B<EventList> option to be configured.
+
+=back
+
=head2 Plugin C<intel_rdt>
The I<intel_rdt> plugin collects information provided by monitoring features of
mcelog server is running. When the server is running, the plugin will tail the
specified logfile to retrieve machine check exception information and send a
notification with the details from the logfile. The plugin will use the mcelog
- client protocol to retrieve memory related machine check exceptions.
+ client protocol to retrieve memory related machine check exceptions. Note that
+ for memory exceptions, notifications are only sent when there is a change in
+ the number of corrected/uncorrected memory errors.
- =over 4
+ =head3 The Memory block
+
+ Note: these options cannot be used in conjunction with the logfile options, they are mutually
+ exclusive.
+
+ =over 3
=item B<McelogClientSocket> I<Path>
Connect to the mcelog client socket using the UNIX domain socket at I<Path>.
Defaults to B<"/var/run/mcelog-client">.
+ =item B<PersistentNotification> B<true>|B<false>
+ Override default configuration to only send notifications when sent when there
+ is a change in the number of corrected/uncorrected memory errors. When set to
+ true notifications will be sent for every read cycle. Default is false. Does
+ not affect the stats being dispatched.
+
+ =back
+
+ =over 4
+
=item B<McelogLogfile> I<Path>
- The mcelog file to parse. Defaults to B<"/var/log/mcelog">.
+ The mcelog file to parse. Defaults to B<"/var/log/mcelog">. Note: this option
+ cannot be used in conjunction with the memory block options, they are mutually
+ exclusive.
=back
Example usage:
C<c_rehash /path/to/certs/folder>
+=item B<ConnectTimeout> I<Milliseconds>
+
+The B<ConnectTimeout> option sets the connect timeout, in milliseconds.
+By default, the configured B<Interval> is used to set the timeout.
+
=back
=head2 Plugin C<olsrd>
When the C<rrdtool> plugin uses a cache (by setting B<CacheTimeout>, see below)
it writes all values for a certain RRD-file if the oldest value is older than
-(or equal to) the number of seconds specified. If some RRD-file is not updated
+(or equal to) the number of seconds specified by B<CacheTimeout>.
+That check happens on new values arriwal. If some RRD-file is not updated
anymore for some reason (the computer was shut down, the network is broken,
-etc.) some values may still be in the cache. If B<CacheFlush> is set, then the
-entire cache is searched for entries older than B<CacheTimeout> seconds and
-written to disk every I<Seconds> seconds. Since this is kind of expensive and
-does nothing under normal circumstances, this value should not be too small.
-900 seconds might be a good value, though setting this to 7200 seconds doesn't
-normally do much harm either.
+etc.) some values may still be in the cache. If B<CacheFlush> is set, then
+every I<Seconds> seconds the entire cache is searched for entries older than
+B<CacheTimeout> + B<RandomTimeout> seconds. The entries found are written to
+disk. Since scanning the entire cache is kind of expensive and does nothing
+under normal circumstances, this value should not be too small. 900 seconds
+might be a good value, though setting this to 7200 seconds doesn't normally
+do much harm either.
+
+Defaults to 10x B<CacheTimeout>.
+B<CacheFlush> must be larger than or equal to B<CacheTimeout>, otherwise the
+above default is used.
=item B<CacheTimeout> I<Seconds>
* collectd - src/mcelog.c
* MIT License
*
- * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* Krzysztof Matczak <krzysztofx.matczak@intel.com>
*/
- #include "common.h"
#include "collectd.h"
+ #include "common.h"
+ #include "utils_llist.h"
+
#include <poll.h>
#include <sys/socket.h>
#include <sys/un.h>
#define MCELOG_DIMM_NAME "DMI_NAME"
#define MCELOG_CORRECTED_ERR "corrected memory errors"
#define MCELOG_UNCORRECTED_ERR "uncorrected memory errors"
+ #define MCELOG_CORRECTED_ERR_TIMED "corrected memory timed errors"
+ #define MCELOG_UNCORRECTED_ERR_TIMED "uncorrected memory timed errors"
+ #define MCELOG_CORRECTED_ERR_TYPE_INS "corrected_memory_errors"
+ #define MCELOG_UNCORRECTED_ERR_TYPE_INS "uncorrected_memory_errors"
typedef struct mcelog_config_s {
char logfile[PATH_MAX]; /* mcelog logfile */
pthread_t tid; /* poll thread id */
+ llist_t *dimms_list; /* DIMMs list */
+ pthread_mutex_t dimms_lock; /* lock for dimms cache */
+ _Bool persist;
} mcelog_config_t;
typedef struct socket_adapter_s socket_adapter_t;
static int socket_reinit(socket_adapter_t *self);
static int socket_receive(socket_adapter_t *self, FILE **p_file);
- static mcelog_config_t g_mcelog_config = {.logfile = "/var/log/mcelog"};
+ static mcelog_config_t g_mcelog_config = {
+ .logfile = "/var/log/mcelog", .persist = 0,
+ };
static socket_adapter_t socket_adapter = {
.sock_fd = -1,
};
static _Bool mcelog_thread_running;
+ static _Bool mcelog_apply_defaults;
+
+ static void mcelog_free_dimms_list_records(llist_t *dimms_list) {
+
+ for (llentry_t *e = llist_head(dimms_list); e != NULL; e = e->next) {
+ sfree(e->key);
+ sfree(e->value);
+ }
+ }
+
+ /* Create or get dimm by dimm name/location */
+ static llentry_t *mcelog_dimm(const mcelog_memory_rec_t *rec,
+ llist_t *dimms_list) {
+
+ char dimm_name[DATA_MAX_NAME_LEN];
+
+ if (strlen(rec->dimm_name) > 0) {
+ ssnprintf(dimm_name, sizeof(dimm_name), "%s_%s", rec->location,
+ rec->dimm_name);
+ } else
+ sstrncpy(dimm_name, rec->location, sizeof(dimm_name));
+
+ llentry_t *dimm_le = llist_search(g_mcelog_config.dimms_list, dimm_name);
+
+ if (dimm_le != NULL)
+ return dimm_le;
+
+ /* allocate new linked list entry */
+ mcelog_memory_rec_t *dimm_mr = calloc(1, sizeof(*dimm_mr));
+ if (dimm_mr == NULL) {
+ ERROR(MCELOG_PLUGIN ": Error allocating dimm memory item");
+ return NULL;
+ }
+ char *p_name = strdup(dimm_name);
+ if (p_name == NULL) {
+ ERROR(MCELOG_PLUGIN ": strdup: error");
+ free(dimm_mr);
+ return NULL;
+ }
+
+ /* add new dimm */
+ dimm_le = llentry_create(p_name, dimm_mr);
+ if (dimm_le == NULL) {
+ ERROR(MCELOG_PLUGIN ": llentry_create(): error");
+ free(dimm_mr);
+ free(p_name);
+ return NULL;
+ }
+ pthread_mutex_lock(&g_mcelog_config.dimms_lock);
+ llist_append(g_mcelog_config.dimms_list, dimm_le);
+ pthread_mutex_unlock(&g_mcelog_config.dimms_lock);
+
+ return dimm_le;
+ }
+
+ static void mcelog_update_dimm_stats(llentry_t *dimm,
+ const mcelog_memory_rec_t *rec) {
+ pthread_mutex_lock(&g_mcelog_config.dimms_lock);
+ memcpy(dimm->value, rec, sizeof(mcelog_memory_rec_t));
+ pthread_mutex_unlock(&g_mcelog_config.dimms_lock);
+ }
static int mcelog_config(oconfig_item_t *ci) {
+ int use_logfile = 0, use_memory = 0;
for (int i = 0; i < ci->children_num; i++) {
oconfig_item_t *child = ci->children + i;
- if (strcasecmp("McelogClientSocket", child->key) == 0) {
- if (cf_util_get_string_buffer(child, socket_adapter.unix_sock.sun_path,
- sizeof(socket_adapter.unix_sock.sun_path)) <
- 0) {
- ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
+ if (strcasecmp("McelogLogfile", child->key) == 0) {
+ use_logfile = 1;
+ if (use_memory) {
+ ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\", Memory "
+ "option is already configured.",
child->key);
return -1;
}
- } else if (strcasecmp("McelogLogfile", child->key) == 0) {
if (cf_util_get_string_buffer(child, g_mcelog_config.logfile,
sizeof(g_mcelog_config.logfile)) < 0) {
ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
child->key);
return -1;
}
+ memset(socket_adapter.unix_sock.sun_path, 0,
+ sizeof(socket_adapter.unix_sock.sun_path));
+ } else if (strcasecmp("Memory", child->key) == 0) {
+ if (use_logfile) {
+ ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\", Logfile "
+ "option is already configured.",
+ child->key);
+ return -1;
+ }
+ use_memory = 1;
+ for (int j = 0; j < child->children_num; j++) {
+ oconfig_item_t *mem_child = child->children + j;
+ if (strcasecmp("McelogClientSocket", mem_child->key) == 0) {
+ if (cf_util_get_string_buffer(
+ mem_child, socket_adapter.unix_sock.sun_path,
+ sizeof(socket_adapter.unix_sock.sun_path)) < 0) {
+ ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
+ mem_child->key);
+ return -1;
+ }
+ } else if (strcasecmp("PersistentNotification", mem_child->key) == 0) {
+ if (cf_util_get_boolean(mem_child, &g_mcelog_config.persist) < 0) {
+ ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
+ mem_child->key);
+ return -1;
+ }
+ } else {
+ ERROR(MCELOG_PLUGIN ": Invalid Memory configuration option: \"%s\".",
+ mem_child->key);
+ return -1;
+ }
+ }
+ memset(g_mcelog_config.logfile, 0, sizeof(g_mcelog_config.logfile));
} else {
ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
child->key);
return -1;
}
}
+
+ if (!use_logfile && !use_memory)
+ mcelog_apply_defaults = 1;
+
return 0;
}
return ret;
}
- static int mcelog_prepare_notification(notification_t *n,
- const mcelog_memory_rec_t *mr) {
- if (n == NULL || mr == NULL)
- return -1;
+ static int mcelog_dispatch_mem_notifications(const mcelog_memory_rec_t *mr) {
+ notification_t n = {.severity = NOTIF_WARNING,
+ .time = cdtime(),
+ .plugin = MCELOG_PLUGIN,
+ .type = "errors"};
- if ((mr->location[0] != '\0') &&
- (plugin_notification_meta_add_string(n, MCELOG_SOCKET_STR, mr->location) <
- 0)) {
- ERROR(MCELOG_PLUGIN ": add memory location meta data failed");
- return -1;
- }
- if ((mr->dimm_name[0] != '\0') &&
- (plugin_notification_meta_add_string(n, MCELOG_DIMM_NAME, mr->dimm_name) <
- 0)) {
- ERROR(MCELOG_PLUGIN ": add DIMM name meta data failed");
- plugin_notification_meta_free(n->meta);
- return -1;
- }
- if (plugin_notification_meta_add_signed_int(n, MCELOG_CORRECTED_ERR,
- mr->corrected_err_total) < 0) {
- ERROR(MCELOG_PLUGIN ": add corrected errors meta data failed");
- plugin_notification_meta_free(n->meta);
- return -1;
- }
- if (plugin_notification_meta_add_signed_int(
- n, "corrected memory timed errors", mr->corrected_err_timed) < 0) {
- ERROR(MCELOG_PLUGIN ": add corrected timed errors meta data failed");
- plugin_notification_meta_free(n->meta);
+ int dispatch_corrected_notifs = 0, dispatch_uncorrected_notifs = 0;
+
+ if (mr == NULL)
return -1;
- }
- if ((mr->corrected_err_timed_period[0] != '\0') &&
- (plugin_notification_meta_add_string(n, "corrected errors time period",
- mr->corrected_err_timed_period) <
- 0)) {
- ERROR(MCELOG_PLUGIN ": add corrected errors period meta data failed");
- plugin_notification_meta_free(n->meta);
+
+ llentry_t *dimm = mcelog_dimm(mr, g_mcelog_config.dimms_list);
+ if (dimm == NULL) {
+ ERROR(MCELOG_PLUGIN
+ ": Error adding/getting dimm memory item to/from cache");
return -1;
}
- if (plugin_notification_meta_add_signed_int(n, MCELOG_UNCORRECTED_ERR,
- mr->uncorrected_err_total) < 0) {
- ERROR(MCELOG_PLUGIN ": add corrected errors meta data failed");
- plugin_notification_meta_free(n->meta);
- return -1;
+ mcelog_memory_rec_t *mr_old = dimm->value;
+ if (!g_mcelog_config.persist) {
+
+ if (mr_old->corrected_err_total != mr->corrected_err_total ||
+ mr_old->corrected_err_timed != mr->corrected_err_timed)
+ dispatch_corrected_notifs = 1;
+
+ if (mr_old->uncorrected_err_total != mr->uncorrected_err_total ||
+ mr_old->uncorrected_err_timed != mr->uncorrected_err_timed)
+ dispatch_uncorrected_notifs = 1;
+
+ if (!dispatch_corrected_notifs && !dispatch_uncorrected_notifs) {
+ DEBUG("%s: No new notifications to dispatch", MCELOG_PLUGIN);
+ return 0;
+ }
+ } else {
+ dispatch_corrected_notifs = 1;
+ dispatch_uncorrected_notifs = 1;
}
- if (plugin_notification_meta_add_signed_int(n,
- "uncorrected memory timed errors",
- mr->uncorrected_err_timed) < 0) {
- ERROR(MCELOG_PLUGIN ": add corrected timed errors meta data failed");
- plugin_notification_meta_free(n->meta);
- return -1;
+
+ sstrncpy(n.host, hostname_g, sizeof(n.host));
+
+ if (mr->dimm_name[0] != '\0')
+ ssnprintf(n.plugin_instance, sizeof(n.plugin_instance), "%s_%s",
+ mr->location, mr->dimm_name);
+ else
+ sstrncpy(n.plugin_instance, mr->location, sizeof(n.plugin_instance));
+
+ if (dispatch_corrected_notifs &&
+ (mr->corrected_err_total > 0 || mr->corrected_err_timed > 0)) {
+ /* Corrected Error Notifications */
+ plugin_notification_meta_add_signed_int(&n, MCELOG_CORRECTED_ERR,
+ mr->corrected_err_total);
+ plugin_notification_meta_add_signed_int(&n, MCELOG_CORRECTED_ERR_TIMED,
+ mr->corrected_err_timed);
+ ssnprintf(n.message, sizeof(n.message), MCELOG_CORRECTED_ERR);
+ sstrncpy(n.type_instance, MCELOG_CORRECTED_ERR_TYPE_INS,
+ sizeof(n.type_instance));
+ plugin_dispatch_notification(&n);
+ if (n.meta)
+ plugin_notification_meta_free(n.meta);
+ n.meta = NULL;
}
- if ((mr->uncorrected_err_timed_period[0] != '\0') &&
- (plugin_notification_meta_add_string(n, "uncorrected errors time period",
- mr->uncorrected_err_timed_period) <
- 0)) {
- ERROR(MCELOG_PLUGIN ": add corrected errors period meta data failed");
- plugin_notification_meta_free(n->meta);
- return -1;
+
+ if (dispatch_uncorrected_notifs &&
+ (mr->uncorrected_err_total > 0 || mr->uncorrected_err_timed > 0)) {
+ /* Uncorrected Error Notifications */
+ plugin_notification_meta_add_signed_int(&n, MCELOG_UNCORRECTED_ERR,
+ mr->uncorrected_err_total);
+ plugin_notification_meta_add_signed_int(&n, MCELOG_UNCORRECTED_ERR_TIMED,
+ mr->uncorrected_err_timed);
+ ssnprintf(n.message, sizeof(n.message), MCELOG_UNCORRECTED_ERR);
+ sstrncpy(n.type_instance, MCELOG_UNCORRECTED_ERR_TYPE_INS,
+ sizeof(n.type_instance));
+ n.severity = NOTIF_FAILURE;
+ plugin_dispatch_notification(&n);
+ if (n.meta)
+ plugin_notification_meta_free(n.meta);
+ n.meta = NULL;
}
return 0;
return -1;
}
+ llentry_t *dimm = mcelog_dimm(mr, g_mcelog_config.dimms_list);
+ if (dimm == NULL) {
+ ERROR(MCELOG_PLUGIN
+ ": Error adding/getting dimm memory item to/from cache");
+ return -1;
+ }
+
value_list_t vl = {
.values_len = 1,
.values = &(value_t){.derive = (derive_t)mr->corrected_err_total},
.time = cdtime(),
.plugin = MCELOG_PLUGIN,
.type = "errors",
- .type_instance = "corrected_memory_errors"};
+ .type_instance = MCELOG_CORRECTED_ERR_TYPE_INS};
+
+ mcelog_update_dimm_stats(dimm, mr);
if (mr->dimm_name[0] != '\0')
- ssnprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "%s_%s",
- mr->location, mr->dimm_name);
+ snprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "%s_%s",
+ mr->location, mr->dimm_name);
else
sstrncpy(vl.plugin_instance, mr->location, sizeof(vl.plugin_instance));
plugin_dispatch_values(&vl);
- ssnprintf(vl.type_instance, sizeof(vl.type_instance),
- "corrected_memory_errors_in_%s", mr->corrected_err_timed_period);
+ snprintf(vl.type_instance, sizeof(vl.type_instance),
+ "corrected_memory_errors_in_%s", mr->corrected_err_timed_period);
vl.values = &(value_t){.derive = (derive_t)mr->corrected_err_timed};
plugin_dispatch_values(&vl);
- sstrncpy(vl.type_instance, "uncorrected_memory_errors",
+ sstrncpy(vl.type_instance, MCELOG_UNCORRECTED_ERR_TYPE_INS,
sizeof(vl.type_instance));
vl.values = &(value_t){.derive = (derive_t)mr->uncorrected_err_total};
plugin_dispatch_values(&vl);
- ssnprintf(vl.type_instance, sizeof(vl.type_instance),
- "uncorrected_memory_errors_in_%s",
- mr->uncorrected_err_timed_period);
+ snprintf(vl.type_instance, sizeof(vl.type_instance),
+ "uncorrected_memory_errors_in_%s", mr->uncorrected_err_timed_period);
vl.values = &(value_t){.derive = (derive_t)mr->uncorrected_err_timed};
plugin_dispatch_values(&vl);
continue;
}
- notification_t n = {.severity = NOTIF_OKAY,
- .time = cdtime(),
- .message = "Got memory errors info.",
- .plugin = MCELOG_PLUGIN,
- .type_instance = "memory_erros"};
-
- if (mcelog_prepare_notification(&n, &memory_record) == 0)
- mcelog_dispatch_notification(&n);
+ if (mcelog_dispatch_mem_notifications(&memory_record) != 0)
+ ERROR(MCELOG_PLUGIN ": Failed to submit memory errors notification");
if (mcelog_submit(&memory_record) != 0)
ERROR(MCELOG_PLUGIN ": Failed to submit memory errors");
memset(&memory_record, 0, sizeof(memory_record));
}
static int mcelog_init(void) {
+ if (mcelog_apply_defaults) {
+ INFO(MCELOG_PLUGIN
+ ": No configuration selected defaulting to memory errors.");
+ memset(g_mcelog_config.logfile, 0, sizeof(g_mcelog_config.logfile));
+ }
+ g_mcelog_config.dimms_list = llist_create();
+ int err = pthread_mutex_init(&g_mcelog_config.dimms_lock, NULL);
+ if (err < 0) {
+ ERROR(MCELOG_PLUGIN ": plugin: failed to initialize cache lock");
+ return -1;
+ }
+
if (socket_adapter.reinit(&socket_adapter) != 0) {
ERROR(MCELOG_PLUGIN ": Cannot connect to client socket");
return -1;
}
- if (plugin_thread_create(&g_mcelog_config.tid, NULL, poll_worker, NULL,
- NULL) != 0) {
- ERROR(MCELOG_PLUGIN ": Error creating poll thread.");
- return -1;
+ if (strlen(socket_adapter.unix_sock.sun_path)) {
+ if (plugin_thread_create(&g_mcelog_config.tid, NULL, poll_worker, NULL,
+ NULL) != 0) {
+ ERROR(MCELOG_PLUGIN ": Error creating poll thread.");
+ return -1;
+ }
}
return 0;
}
ret = -1;
}
}
-
+ pthread_mutex_lock(&g_mcelog_config.dimms_lock);
+ mcelog_free_dimms_list_records(g_mcelog_config.dimms_list);
+ llist_destroy(g_mcelog_config.dimms_list);
+ g_mcelog_config.dimms_list = NULL;
+ pthread_mutex_unlock(&g_mcelog_config.dimms_lock);
+ pthread_mutex_destroy(&g_mcelog_config.dimms_lock);
ret = socket_adapter.close(&socket_adapter) || ret;
pthread_rwlock_destroy(&(socket_adapter.lock));
return -ret;