From 814678593c7f581a1428cf566333609384848827 Mon Sep 17 00:00:00 2001 From: Kamil Wiatrowski Date: Tue, 20 Feb 2018 12:28:28 +0000 Subject: [PATCH] pcie_errors: plugin to read PCIe errors The pcie plugin collects PCI Express errors from Device Status in Capability structure and from Advanced Error Reporting Extended Capability where available. At every read it polls config space of PCI Express devices and dispatches notification for every error that is found. Notif OK is sent after error is cleared. Change-Id: I559f4035df76ab2934969a3c46cd4e98b93aba9a Signed-off-by: Kamil Wiatrowski --- Makefile.am | 18 ++ README | 4 + configure.ac | 13 + src/collectd.conf.in | 7 + src/collectd.conf.pod | 47 +++ src/pcie_errors.c | 779 +++++++++++++++++++++++++++++++++++++++++++++++++ src/pcie_errors_test.c | 581 ++++++++++++++++++++++++++++++++++++ src/types.db | 1 + 8 files changed, 1450 insertions(+) create mode 100644 src/pcie_errors.c create mode 100644 src/pcie_errors_test.c diff --git a/Makefile.am b/Makefile.am index 64414fbd..effbdd43 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1381,6 +1381,24 @@ ovs_stats_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBYAJL_LDFLAGS) ovs_stats_la_LIBADD = $(BUILD_WITH_LIBYAJL_LIBS) endif +if BUILD_PLUGIN_PCIE_ERRORS +pkglib_LTLIBRARIES += pcie_errors.la +pcie_errors_la_SOURCES = src/pcie_errors.c +pcie_errors_la_CPPFLAGS = $(AM_CPPFLAGS) +pcie_errors_la_LDFLAGS = $(PLUGIN_LDFLAGS) + +test_plugin_pcie_errors_SOURCES = \ + src/pcie_errors_test.c \ + src/daemon/utils_llist.c \ + src/daemon/configfile.c \ + src/daemon/types_list.c +test_plugin_pcie_errors_CPPFLAGS = $(AM_CPPFLAGS) +test_plugin_pcie_errors_LDFLAGS = $(PLUGIN_LDFLAGS) +test_plugin_pcie_errors_LDADD = liboconfig.la libplugin_mock.la +check_PROGRAMS += test_plugin_pcie_errors +TESTS += test_plugin_pcie_errors +endif + if BUILD_PLUGIN_PERL pkglib_LTLIBRARIES += perl.la perl_la_SOURCES = src/perl.c diff --git a/README b/README index 2210b2b9..a111e84a 100644 --- a/README +++ 
b/README @@ -314,6 +314,10 @@ Features OVS documentation. + - pcie_errors + Read errors from PCI Express Device Status and AER extended capabilities. + + - perl The perl plugin implements a Perl-interpreter into collectd. You can write your own plugins in Perl and return arbitrary values using this diff --git a/configure.ac b/configure.ac index 1e31e218..7049975c 100644 --- a/configure.ac +++ b/configure.ac @@ -550,6 +550,12 @@ if test "x$ac_system" = "xLinux"; then AC_DEFINE([HAVE_CAPABILITY], [1], [Define to 1 if you have cap_get_proc() (-lcap).]) fi + # For pcie_errors plugin + AC_CHECK_HEADERS([linux/pci_regs.h], + [have_pci_regs_h="yes"], + [have_pci_regs_h="no (linux/pci_regs.h not found)"] + ) + else have_linux_raid_md_u_h="no" have_linux_wireless_h="no" @@ -6229,6 +6235,7 @@ plugin_nfs="no" plugin_numa="no" plugin_ovs_events="no" plugin_ovs_stats="no" +plugin_pcie_errors="no" plugin_perl="no" plugin_pinba="no" plugin_processes="no" @@ -6307,6 +6314,10 @@ if test "x$ac_system" = "xLinux"; then plugin_ovs_events="yes" plugin_ovs_stats="yes" fi + + if test "x$have_pci_regs_h" = "xyes"; then + plugin_pcie_errors="yes" + fi fi if test "x$ac_system" = "xOpenBSD"; then @@ -6684,6 +6695,7 @@ AC_PLUGIN([openvpn], [yes], [OpenVPN client stat AC_PLUGIN([oracle], [$with_oracle], [Oracle plugin]) AC_PLUGIN([ovs_events], [$plugin_ovs_events], [OVS events plugin]) AC_PLUGIN([ovs_stats], [$plugin_ovs_stats], [OVS statistics plugin]) +AC_PLUGIN([pcie_errors], [$plugin_pcie_errors], [PCIe errors plugin]) AC_PLUGIN([perl], [$plugin_perl], [Embed a Perl interpreter]) AC_PLUGIN([pf], [$have_net_pfvar_h], [BSD packet filter (PF) statistics]) # FIXME: Check for libevent, too. @@ -7105,6 +7117,7 @@ AC_MSG_RESULT([ openvpn . . . . . . . $enable_openvpn]) AC_MSG_RESULT([ oracle . . . . . . . $enable_oracle]) AC_MSG_RESULT([ ovs_events . . . . . $enable_ovs_events]) AC_MSG_RESULT([ ovs_stats . . . . . . $enable_ovs_stats]) +AC_MSG_RESULT([ pcie_errors . . . . . 
$enable_pcie_errors]) AC_MSG_RESULT([ perl . . . . . . . . $enable_perl]) AC_MSG_RESULT([ pf . . . . . . . . . $enable_pf]) AC_MSG_RESULT([ pinba . . . . . . . . $enable_pinba]) diff --git a/src/collectd.conf.in b/src/collectd.conf.in index c2aa9152..6a5dfbdf 100644 --- a/src/collectd.conf.in +++ b/src/collectd.conf.in @@ -173,6 +173,7 @@ #@BUILD_PLUGIN_ORACLE_TRUE@LoadPlugin oracle #@BUILD_PLUGIN_OVS_EVENTS_TRUE@LoadPlugin ovs_events #@BUILD_PLUGIN_OVS_STATS_TRUE@LoadPlugin ovs_stats +#@BUILD_PLUGIN_PCIE_ERRORS_TRUE@LoadPlugin pcie_errors #@BUILD_PLUGIN_PERL_TRUE@LoadPlugin perl #@BUILD_PLUGIN_PINBA_TRUE@LoadPlugin pinba #@BUILD_PLUGIN_PING_TRUE@LoadPlugin ping @@ -1130,6 +1131,12 @@ # Bridges "br0" "br_ext" #</Plugin> +#<Plugin pcie_errors> +# Source "sysfs" +# ReportMasked false +# PersistentNotifications false +#</Plugin> + #<Plugin perl> # IncludeDir "/my/include/path" # BaseName "Collectd::Plugins" diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 69a1b1e0..20605f56 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -6265,6 +6265,53 @@ Default: empty (monitor all bridges) =back +=head2 Plugin C<pcie_errors> + +The I<pcie_errors> plugin collects PCI Express errors from Device Status in Capability +structure and from Advanced Error Reporting Extended Capability where available. +At every read it polls config space of PCI Express devices and dispatches +notification for every error that is set. It checks for new errors at every read. +The device is indicated in plugin_instance according to format "domain:bus:dev.fn". +Errors are divided into categories indicated by type_instance: "correctable", and +for uncorrectable errors "non_fatal" or "fatal". +Fatal errors are reported as I<NOTIF_FAILURE> and all others as I<NOTIF_WARNING>. + +B<Synopsis:> + + <Plugin "pcie_errors"> + Source "sysfs" + AccessDir "/sys/bus/pci" + ReportMasked false + PersistentNotifications false + </Plugin> + +B<Options:> + +=over 4 + +=item B<Source> B<sysfs>|B<proc> + +Use B<sysfs> or B<proc> to read data from /sys or /proc. +The default value is B<sysfs>. + +=item B<AccessDir> I<dir> + +Directory used to access device config space.
It is optional and defaults to +/sys/bus/pci for B<sysfs> and to /proc/bus/pci for B<proc>. + +=item B<ReportMasked> B<false>|B<true> + +If true, the plugin will also report errors that are masked in the Error Mask register. +Such errors are not reported to the PCI Express Root Complex. Defaults to +B<false>. + +=item B<PersistentNotifications> B<false>|B<true> + +If false, the plugin will dispatch a notification only on set/clear of an error. +The ones already reported will be ignored. Defaults to B<false>. + +=back + =head2 Plugin C<perl> This plugin embeds a Perl-interpreter into collectd and provides an interface diff --git a/src/pcie_errors.c b/src/pcie_errors.c new file mode 100644 index 00000000..1f7837d2 --- /dev/null +++ b/src/pcie_errors.c @@ -0,0 +1,779 @@ +/** + * collectd - src/pcie_errors.c + * + * Copyright(c) 2018 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + * Authors: + * Kamil Wiatrowski + **/ + +#include "collectd.h" + +#include "common.h" +#include "utils_llist.h" + +#include + +#define PCIE_ERRORS_PLUGIN "pcie_errors" +#define PCIE_DEFAULT_PROCDIR "/proc/bus/pci" +#define PCIE_DEFAULT_SYSFSDIR "/sys/bus/pci" +#define PCIE_NAME_LEN 512 +#define PCIE_BUFF_SIZE 1024 + +#define PCIE_ERROR "pcie_error" +#define PCIE_SEV_CE "correctable" +#define PCIE_SEV_FATAL "fatal" +#define PCIE_SEV_NOFATAL "non_fatal" + +#define PCIE_DEV(x) (((x) >> 3) & 0x1f) +#define PCIE_FN(x) ((x)&0x07) + +#define PCIE_ECAP_OFFSET 0x100 /* ECAP always begin at offset 0x100 */ + +typedef struct pcie_config_s { + _Bool use_sysfs; + _Bool notif_masked; + _Bool persistent; + char access_dir[PATH_MAX]; + _Bool config_error; +} pcie_config_t; + +typedef struct pcie_device_s { + int fd; + int domain; + uint8_t bus; + uint8_t device; + uint8_t function; + int cap_exp; + int ecap_aer; + uint16_t device_status; + uint32_t correctable_errors; + uint32_t uncorrectable_errors; +} pcie_device_t; + +typedef struct pcie_fops_s { + int (*list_devices)(llist_t *dev_list); + int (*open)(pcie_device_t *dev); + void (*close)(pcie_device_t *dev); + int (*read)(pcie_device_t *dev, void *buff, int size, int pos); +} pcie_fops_t; + +typedef struct pcie_error_s { + int mask; + const char *desc; +} pcie_error_t; + +static llist_t *pcie_dev_list; +static pcie_config_t pcie_config = {.access_dir = "", .use_sysfs = 1}; +static pcie_fops_t pcie_fops; + +/* Device Error Status */ +static pcie_error_t pcie_base_errors[] = { + {PCI_EXP_DEVSTA_CED, "Correctable Error"}, + {PCI_EXP_DEVSTA_NFED, "Non-Fatal Error"}, + {PCI_EXP_DEVSTA_FED, "Fatal Error"}, + {PCI_EXP_DEVSTA_URD, "Unsupported Request"}}; +static const int pcie_base_errors_num = STATIC_ARRAY_SIZE(pcie_base_errors); + +/* Uncorrectable Error Status */ +static pcie_error_t pcie_aer_ues[] = { +#ifdef PCI_ERR_UNC_DLP + {PCI_ERR_UNC_DLP, "Data Link Protocol"}, +#endif +#ifdef PCI_ERR_UNC_SURPDN + {PCI_ERR_UNC_SURPDN, 
"Surprise Down"}, +#endif +#ifdef PCI_ERR_UNC_POISON_TLP + {PCI_ERR_UNC_POISON_TLP, "Poisoned TLP"}, +#endif +#ifdef PCI_ERR_UNC_FCP + {PCI_ERR_UNC_FCP, "Flow Control Protocol"}, +#endif +#ifdef PCI_ERR_UNC_COMP_TIME + {PCI_ERR_UNC_COMP_TIME, "Completion Timeout"}, +#endif +#ifdef PCI_ERR_UNC_COMP_ABORT + {PCI_ERR_UNC_COMP_ABORT, "Completer Abort"}, +#endif +#ifdef PCI_ERR_UNC_UNX_COMP + {PCI_ERR_UNC_UNX_COMP, "Unexpected Completion"}, +#endif +#ifdef PCI_ERR_UNC_RX_OVER + {PCI_ERR_UNC_RX_OVER, "Receiver Overflow"}, +#endif +#ifdef PCI_ERR_UNC_MALF_TLP + {PCI_ERR_UNC_MALF_TLP, "Malformed TLP"}, +#endif +#ifdef PCI_ERR_UNC_ECRC + {PCI_ERR_UNC_ECRC, "ECRC Error Status"}, +#endif +#ifdef PCI_ERR_UNC_UNSUP + {PCI_ERR_UNC_UNSUP, "Unsupported Request"}, +#endif +#ifdef PCI_ERR_UNC_ACSV + {PCI_ERR_UNC_ACSV, "ACS Violation"}, +#endif +#ifdef PCI_ERR_UNC_INTN + {PCI_ERR_UNC_INTN, "Internal"}, +#endif +#ifdef PCI_ERR_UNC_MCBTLP + {PCI_ERR_UNC_MCBTLP, "MC blocked TLP"}, +#endif +#ifdef PCI_ERR_UNC_ATOMEG + {PCI_ERR_UNC_ATOMEG, "Atomic egress blocked"}, +#endif +#ifdef PCI_ERR_UNC_TLPPRE + {PCI_ERR_UNC_TLPPRE, "TLP prefix blocked"}, +#endif +}; +static const int pcie_aer_ues_num = STATIC_ARRAY_SIZE(pcie_aer_ues); + +/* Correctable Error Status */ +static pcie_error_t pcie_aer_ces[] = { +#ifdef PCI_ERR_COR_RCVR + {PCI_ERR_COR_RCVR, "Receiver Error Status"}, +#endif +#ifdef PCI_ERR_COR_BAD_TLP + {PCI_ERR_COR_BAD_TLP, "Bad TLP Status"}, +#endif +#ifdef PCI_ERR_COR_BAD_DLLP + {PCI_ERR_COR_BAD_DLLP, "Bad DLLP Status"}, +#endif +#ifdef PCI_ERR_COR_REP_ROLL + {PCI_ERR_COR_REP_ROLL, "REPLAY_NUM Rollover"}, +#endif +#ifdef PCI_ERR_COR_REP_TIMER + {PCI_ERR_COR_REP_TIMER, "Replay Timer Timeout"}, +#endif +#ifdef PCI_ERR_COR_ADV_NFAT + {PCI_ERR_COR_ADV_NFAT, "Advisory Non-Fatal"}, +#endif +#ifdef PCI_ERR_COR_INTERNAL + {PCI_ERR_COR_INTERNAL, "Corrected Internal"}, +#endif +#ifdef PCI_ERR_COR_LOG_OVER + {PCI_ERR_COR_LOG_OVER, "Header Log Overflow"}, +#endif +}; +static const int 
pcie_aer_ces_num = STATIC_ARRAY_SIZE(pcie_aer_ces); + +static int pcie_add_device(llist_t *list, int domain, uint8_t bus, + uint8_t device, uint8_t fn) { + llentry_t *entry; + pcie_device_t *dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to allocate device"); + return -ENOMEM; + } + + dev->domain = domain; + dev->bus = bus; + dev->device = device; + dev->function = fn; + dev->cap_exp = -1; + dev->ecap_aer = -1; + entry = llentry_create(NULL, dev); + if (entry == NULL) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to create llentry"); + sfree(dev); + return -ENOMEM; + } + llist_append(list, entry); + + DEBUG(PCIE_ERRORS_PLUGIN ": pci device added to list: %04x:%02x:%02x.%d", + domain, bus, device, fn); + return 0; +} + +static void pcie_clear_list(llist_t *list) { + if (list == NULL) + return; + + for (llentry_t *e = llist_head(list); e != NULL; e = e->next) + sfree(e->value); + + llist_destroy(list); +} + +static int pcie_list_devices_proc(llist_t *dev_list) { + FILE *fd; + char file_name[PCIE_NAME_LEN]; + char buf[PCIE_BUFF_SIZE]; + unsigned int i = 0; + int ret = 0; + + if (dev_list == NULL) + return -EINVAL; + + snprintf(file_name, sizeof(file_name), "%s/devices", pcie_config.access_dir); + fd = fopen(file_name, "r"); + if (!fd) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Cannot open file %s to get devices list: %s", + file_name, sstrerror(errno, errbuf, sizeof(errbuf))); + return -ENOENT; + } + + while (fgets(buf, sizeof(buf), fd)) { + unsigned int slot; + uint8_t bus, dev, fn; + + if (sscanf(buf, "%x", &slot) != 1) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to read line %u from %s", i + 1, + file_name); + continue; + } + + bus = slot >> 8U; + dev = PCIE_DEV(slot); + fn = PCIE_FN(slot); + ret = pcie_add_device(dev_list, 0, bus, dev, fn); + if (ret) + break; + + ++i; + } + + fclose(fd); + return ret; +} + +static int pcie_list_devices_sysfs(llist_t *dev_list) { + DIR *dir; + struct dirent *item; + char 
dir_name[PCIE_NAME_LEN]; + int ret = 0; + + if (dev_list == NULL) + return -EINVAL; + + snprintf(dir_name, sizeof(dir_name), "%s/devices", pcie_config.access_dir); + dir = opendir(dir_name); + if (!dir) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Cannot open dir %s to get devices list: %s", + dir_name, sstrerror(errno, errbuf, sizeof(errbuf))); + return -ENOENT; + } + + while ((item = readdir(dir))) { + unsigned int dom, bus, dev; + int fn; + + /* Omit special non-device entries */ + if (item->d_name[0] == '.') + continue; + + if (sscanf(item->d_name, "%x:%x:%x.%d", &dom, &bus, &dev, &fn) != 4) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to parse entry %s", item->d_name); + continue; + } + + ret = pcie_add_device(dev_list, dom, bus, dev, fn); + if (ret) + break; + } + + closedir(dir); + return ret; +} + +static void pcie_close(pcie_device_t *dev) { + if (close(dev->fd) == -1) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Failed to close %04x:%02x:%02x.%d, fd=%d: %s", + dev->domain, dev->bus, dev->device, dev->function, dev->fd, + sstrerror(errno, errbuf, sizeof(errbuf))); + } + + dev->fd = -1; +} + +static int pcie_open(pcie_device_t *dev, const char *name) { + dev->fd = open(name, O_RDONLY); + if (dev->fd == -1) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Failed to open file %s: %s", name, + sstrerror(errno, errbuf, sizeof(errbuf))); + return -ENOENT; + } + + return 0; +} + +static int pcie_open_proc(pcie_device_t *dev) { + char file_name[PCIE_NAME_LEN]; + + snprintf(file_name, sizeof(file_name), "%s/%02x/%02x.%d", + pcie_config.access_dir, dev->bus, dev->device, dev->function); + + return pcie_open(dev, file_name); +} + +static int pcie_open_sysfs(pcie_device_t *dev) { + char file_name[PCIE_NAME_LEN]; + + snprintf(file_name, sizeof(file_name), "%s/devices/%04x:%02x:%02x.%d/config", + pcie_config.access_dir, dev->domain, dev->bus, dev->device, + dev->function); + + return pcie_open(dev, file_name); +} + +static int 
pcie_read(pcie_device_t *dev, void *buff, int size, int pos) { + int len = pread(dev->fd, buff, size, pos); + if (len == size) + return 0; + + if (len == -1) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Failed to read %04x:%02x:%02x.%d at pos %d: %s", + dev->domain, dev->bus, dev->device, dev->function, pos, + sstrerror(errno, errbuf, sizeof(errbuf))); + } else { + ERROR(PCIE_ERRORS_PLUGIN + ": %04x:%02x:%02x.%d Read only %d bytes, should be %d", + dev->domain, dev->bus, dev->device, dev->function, len, size); + } + return -1; +} + +static uint8_t pcie_read8(pcie_device_t *dev, int pos) { + uint8_t value; + if (pcie_fops.read(dev, &value, 1, pos)) + return 0; + return value; +} + +static uint16_t pcie_read16(pcie_device_t *dev, int pos) { + uint16_t value; + if (pcie_fops.read(dev, &value, 2, pos)) + return 0; + return value; +} + +static uint32_t pcie_read32(pcie_device_t *dev, int pos) { + uint32_t value; + if (pcie_fops.read(dev, &value, 4, pos)) + return 0; + return value; +} + +static void pcie_dispatch_notification(pcie_device_t *dev, notification_t *n, + const char *type, + const char *type_instance) { + sstrncpy(n->host, hostname_g, sizeof(n->host)); + snprintf(n->plugin_instance, sizeof(n->plugin_instance), "%04x:%02x:%02x.%d", + dev->domain, dev->bus, dev->device, dev->function); + sstrncpy(n->type, type, sizeof(n->type)); + sstrncpy(n->type_instance, type_instance, sizeof(n->type_instance)); + + plugin_dispatch_notification(n); +} + +/* Report errors found in AER Correctable Error Status register */ +static void pcie_dispatch_correctable_errors(pcie_device_t *dev, + uint32_t errors, uint32_t masked) { + for (int i = 0; i < pcie_aer_ces_num; i++) { + pcie_error_t *err = pcie_aer_ces + i; + notification_t n = {.severity = NOTIF_WARNING, + .time = cdtime(), + .plugin = PCIE_ERRORS_PLUGIN, + .meta = NULL}; + + /* If not specifically set by config option omit masked errors */ + if (!pcie_config.notif_masked && (err->mask & masked)) + 
continue; + + if (err->mask & errors) { + /* Error already reported, notify only if persistent is set */ + if (!pcie_config.persistent && (err->mask & dev->correctable_errors)) + continue; + + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s set", dev->domain, + dev->bus, dev->device, dev->function, err->desc); + snprintf(n.message, sizeof(n.message), "Correctable Error set: %s", + err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, PCIE_SEV_CE); + + } else if (err->mask & dev->correctable_errors) { + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s cleared", dev->domain, + dev->bus, dev->device, dev->function, err->desc); + + n.severity = NOTIF_OKAY; + snprintf(n.message, sizeof(n.message), "Correctable Error cleared: %s", + err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, PCIE_SEV_CE); + } + } +} + +/* Report errors found in AER Uncorrectable Error Status register */ +static void pcie_dispatch_uncorrectable_errors(pcie_device_t *dev, + uint32_t errors, uint32_t masked, + uint32_t severity) { + for (int i = 0; i < pcie_aer_ues_num; i++) { + pcie_error_t *err = pcie_aer_ues + i; + const char *type_instance = + (severity & err->mask) ? PCIE_SEV_FATAL : PCIE_SEV_NOFATAL; + notification_t n = { + .time = cdtime(), .plugin = PCIE_ERRORS_PLUGIN, .meta = NULL}; + + /* If not specifically set by config option omit masked errors */ + if (!pcie_config.notif_masked && (err->mask & masked)) + continue; + + if (err->mask & errors) { + /* Error already reported, notify only if persistent is set */ + if (!pcie_config.persistent && (err->mask & dev->uncorrectable_errors)) + continue; + + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s(%s) set", dev->domain, + dev->bus, dev->device, dev->function, err->desc, type_instance); + + n.severity = (severity & err->mask) ? 
NOTIF_FAILURE : NOTIF_WARNING; + snprintf(n.message, sizeof(n.message), "Uncorrectable(%s) Error set: %s", + type_instance, err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance); + + } else if (err->mask & dev->uncorrectable_errors) { + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s(%s) cleared", + dev->domain, dev->bus, dev->device, dev->function, err->desc, + type_instance); + + n.severity = NOTIF_OKAY; + snprintf(n.message, sizeof(n.message), + "Uncorrectable(%s) Error cleared: %s", type_instance, err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance); + } + } +} + +/* Find offset of PCI Express Capability Structure + * in PCI configuration space. + * Returns offset, -1 if not found. +**/ +static int pcie_find_cap_exp(pcie_device_t *dev) { + int pos = pcie_read8(dev, PCI_CAPABILITY_LIST) & ~3; + + while (pos) { + uint8_t id = pcie_read8(dev, pos + PCI_CAP_LIST_ID); + + if (id == 0xff) + break; + if (id == PCI_CAP_ID_EXP) + return pos; + + pos = pcie_read8(dev, pos + PCI_CAP_LIST_NEXT) & ~3; + } + + DEBUG(PCIE_ERRORS_PLUGIN ": Cannot find CAP EXP for %04x:%02x:%02x.%d", + dev->domain, dev->bus, dev->device, dev->function); + + return -1; +} + +/* Find offset of Advanced Error Reporting Capability. + * Returns AER offset, -1 if not found. 
+**/ +static int pcie_find_ecap_aer(pcie_device_t *dev) { + int pos = PCIE_ECAP_OFFSET; + uint32_t header = pcie_read32(dev, pos); + int id = PCI_EXT_CAP_ID(header); + int next = PCI_EXT_CAP_NEXT(header); + + if (!id && !next) + return -1; + + if (id == PCI_EXT_CAP_ID_ERR) + return pos; + + while (next) { + if (next <= PCIE_ECAP_OFFSET) + break; + + header = pcie_read32(dev, next); + id = PCI_EXT_CAP_ID(header); + + if (id == PCI_EXT_CAP_ID_ERR) + return next; + + next = PCI_EXT_CAP_NEXT(header); + } + + return -1; +} + +static void pcie_check_dev_status(pcie_device_t *dev, int pos) { + /* Read Device Status register with mask for errors only */ + uint16_t new_status = pcie_read16(dev, pos + PCI_EXP_DEVSTA) & 0xf; + + /* Check if anything new should be reported */ + if (!(pcie_config.persistent && new_status) && + (new_status == dev->device_status)) + return; + + /* Report errors found in Device Status register */ + for (int i = 0; i < pcie_base_errors_num; i++) { + pcie_error_t *err = pcie_base_errors + i; + const char *type_instance = (err->mask == PCI_EXP_DEVSTA_FED) + ? PCIE_SEV_FATAL + : (err->mask == PCI_EXP_DEVSTA_CED) + ? PCIE_SEV_CE + : PCIE_SEV_NOFATAL; + const int severity = + (err->mask == PCI_EXP_DEVSTA_FED) ? 
NOTIF_FAILURE : NOTIF_WARNING; + notification_t n = {.severity = severity, + .time = cdtime(), + .plugin = PCIE_ERRORS_PLUGIN, + .meta = NULL}; + + if (err->mask & new_status) { + /* Error already reported, notify only if persistent is set */ + if (!pcie_config.persistent && (err->mask & dev->device_status)) + continue; + + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s set", dev->domain, + dev->bus, dev->device, dev->function, err->desc); + snprintf(n.message, sizeof(n.message), "Device Status Error set: %s", + err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance); + + } else if (err->mask & dev->device_status) { + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s cleared", dev->domain, + dev->bus, dev->device, dev->function, err->desc); + n.severity = NOTIF_OKAY; + snprintf(n.message, sizeof(n.message), "Device Status Error cleared: %s", + err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance); + } + } + + dev->device_status = new_status; +} + +static void pcie_check_aer(pcie_device_t *dev, int pos) { + /* Check for AER uncorrectable errors */ + uint32_t errors = pcie_read32(dev, pos + PCI_ERR_UNCOR_STATUS); + + if ((pcie_config.persistent && errors) || + (errors != dev->uncorrectable_errors)) { + uint32_t masked = pcie_read32(dev, pos + PCI_ERR_UNCOR_MASK); + uint32_t severity = pcie_read32(dev, pos + PCI_ERR_UNCOR_SEVER); + pcie_dispatch_uncorrectable_errors(dev, errors, masked, severity); + } + dev->uncorrectable_errors = errors; + + /* Check for AER correctable errors */ + errors = pcie_read32(dev, pos + PCI_ERR_COR_STATUS); + if ((pcie_config.persistent && errors) || + (errors != dev->correctable_errors)) { + uint32_t masked = pcie_read32(dev, pos + PCI_ERR_COR_MASK); + pcie_dispatch_correctable_errors(dev, errors, masked); + } + dev->correctable_errors = errors; +} + +static int pcie_process_devices(llist_t *devs) { + int ret = 0; + if (devs == NULL) + return -1; + + for (llentry_t *e = llist_head(devs); e != 
NULL; e = e->next) { + pcie_device_t *dev = e->value; + + if (pcie_fops.open(dev) == 0) { + pcie_check_dev_status(dev, dev->cap_exp); + if (dev->ecap_aer != -1) + pcie_check_aer(dev, dev->ecap_aer); + + pcie_fops.close(dev); + } else { + notification_t n = {.severity = NOTIF_FAILURE, + .time = cdtime(), + .message = "Failed to read device status", + .plugin = PCIE_ERRORS_PLUGIN, + .meta = NULL}; + pcie_dispatch_notification(dev, &n, "", ""); + ret = -1; + } + } + + return ret; +} + +/* This function is to be called during init to filter out no pcie devices */ +static void pcie_preprocess_devices(llist_t *devs) { + llentry_t *e_next; + + if (devs == NULL) + return; + + for (llentry_t *e = llist_head(devs); e != NULL; e = e_next) { + pcie_device_t *dev = e->value; + _Bool del = 0; + + if (pcie_fops.open(dev) == 0) { + uint16_t status = pcie_read16(dev, PCI_STATUS); + if (status & PCI_STATUS_CAP_LIST) + dev->cap_exp = pcie_find_cap_exp(dev); + + /* Every PCIe device must have Capability Structure */ + if (dev->cap_exp == -1) { + DEBUG(PCIE_ERRORS_PLUGIN ": Not PCI Express device: %04x:%02x:%02x.%d", + dev->domain, dev->bus, dev->device, dev->function); + del = 1; + } else { + dev->ecap_aer = pcie_find_ecap_aer(dev); + if (dev->ecap_aer == -1) + INFO(PCIE_ERRORS_PLUGIN + ": Device is not AER capable: %04x:%02x:%02x.%d", + dev->domain, dev->bus, dev->device, dev->function); + } + + pcie_fops.close(dev); + } else { + ERROR(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: failed to open", + dev->domain, dev->bus, dev->device, dev->function); + del = 1; + } + + e_next = e->next; + if (del) { + sfree(dev); + llist_remove(devs, e); + llentry_destroy(e); + } + } +} + +static int pcie_plugin_read(__attribute__((unused)) user_data_t *ud) { + + if (pcie_process_devices(pcie_dev_list) < 0) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to read devices state"); + return -1; + } + return 0; +} + +static void pcie_access_config(void) { + /* Set functions for register access to + * use proc or sysfs 
depending on config. */ + if (pcie_config.use_sysfs) { + pcie_fops.list_devices = pcie_list_devices_sysfs; + pcie_fops.open = pcie_open_sysfs; + if (pcie_config.access_dir[0] == '\0') + sstrncpy(pcie_config.access_dir, PCIE_DEFAULT_SYSFSDIR, + sizeof(pcie_config.access_dir)); + } else { + /* use proc */ + pcie_fops.list_devices = pcie_list_devices_proc; + pcie_fops.open = pcie_open_proc; + if (pcie_config.access_dir[0] == '\0') + sstrncpy(pcie_config.access_dir, PCIE_DEFAULT_PROCDIR, + sizeof(pcie_config.access_dir)); + } + /* Common functions */ + pcie_fops.close = pcie_close; + pcie_fops.read = pcie_read; +} + +static int pcie_plugin_config(oconfig_item_t *ci) { + + for (int i = 0; i < ci->children_num; i++) { + oconfig_item_t *child = ci->children + i; + int status = 0; + + if (strcasecmp("Source", child->key) == 0) { + if ((child->values_num != 1) || + (child->values[0].type != OCONFIG_TYPE_STRING)) { + status = -1; + } else if (strcasecmp("proc", child->values[0].value.string) == 0) { + pcie_config.use_sysfs = 0; + } else if (strcasecmp("sysfs", child->values[0].value.string) != 0) { + ERROR(PCIE_ERRORS_PLUGIN ": Allowed sources are 'proc' or 'sysfs'."); + status = -1; + } + } else if (strcasecmp("AccessDir", child->key) == 0) { + status = cf_util_get_string_buffer(child, pcie_config.access_dir, + sizeof(pcie_config.access_dir)); + } else if (strcasecmp("ReportMasked", child->key) == 0) { + status = cf_util_get_boolean(child, &pcie_config.notif_masked); + } else if (strcasecmp("PersistentNotifications", child->key) == 0) { + status = cf_util_get_boolean(child, &pcie_config.persistent); + } else { + ERROR(PCIE_ERRORS_PLUGIN ": Invalid configuration option \"%s\".", + child->key); + pcie_config.config_error = 1; + break; + } + + if (status) { + ERROR(PCIE_ERRORS_PLUGIN ": Invalid configuration parameter \"%s\".", + child->key); + pcie_config.config_error = 1; + break; + } + } + + return 0; +} + +static int pcie_shutdown(void) { + pcie_clear_list(pcie_dev_list); 
+ pcie_dev_list = NULL; + + return 0; +} + +static int pcie_init(void) { + if (pcie_config.config_error) { + ERROR(PCIE_ERRORS_PLUGIN + ": Error in configuration, failed to init plugin."); + return -1; + } + + pcie_access_config(); + pcie_dev_list = llist_create(); + if (pcie_fops.list_devices(pcie_dev_list) != 0) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to find devices."); + pcie_shutdown(); + return -1; + } + pcie_preprocess_devices(pcie_dev_list); + if (llist_size(pcie_dev_list) == 0) { + /* No any PCI Express devices were found on the system */ + ERROR(PCIE_ERRORS_PLUGIN ": No PCIe devices found in %s", + pcie_config.access_dir); + pcie_shutdown(); + return -1; + } + + return 0; +} + +void module_register(void) { + plugin_register_init(PCIE_ERRORS_PLUGIN, pcie_init); + plugin_register_complex_config(PCIE_ERRORS_PLUGIN, pcie_plugin_config); + plugin_register_complex_read(NULL, PCIE_ERRORS_PLUGIN, pcie_plugin_read, 0, + NULL); + plugin_register_shutdown(PCIE_ERRORS_PLUGIN, pcie_shutdown); +} diff --git a/src/pcie_errors_test.c b/src/pcie_errors_test.c new file mode 100644 index 00000000..48b01b14 --- /dev/null +++ b/src/pcie_errors_test.c @@ -0,0 +1,581 @@ +/** + * collectd - src/pcie_errors_test.c + * + * Copyright(c) 2018 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Kamil Wiatrowski + **/ + +#include "pcie_errors.c" /* sic */ +#include "testing.h" + +#define TEST_DOMAIN 1 +#define TEST_BUS 5 +#define TEST_DEVICE 0xc +#define TEST_FUNCTION 2 +#define TEST_DEVICE_STR "0001:05:0c.2" + +#define G_BUFF_LEN 4 + +static notification_t last_notif; +static char g_buff[G_BUFF_LEN]; + +/* mock functions */ +int plugin_dispatch_notification(const notification_t *notif) { + last_notif = *notif; + return ENOTSUP; +} + +ssize_t pread(__attribute__((unused)) int fd, void *buf, size_t count, + __attribute__((unused)) off_t offset) { + if (count == 0 || count > G_BUFF_LEN) + return -1; + + memcpy(buf, g_buff, count); + return count; +} +/* end mock functions */ + +DEF_TEST(clear_dev_list) { + pcie_clear_list(NULL); + + llist_t *test_list = llist_create(); + CHECK_NOT_NULL(test_list); + + pcie_device_t *dev = calloc(1, sizeof(*dev)); + CHECK_NOT_NULL(dev); + + llentry_t *entry = llentry_create(NULL, dev); + CHECK_NOT_NULL(entry); + + llist_append(test_list, entry); + + for (llentry_t *e = llist_head(test_list); e != NULL; e = e->next) { + EXPECT_EQ_UINT64(dev, e->value); + } + + pcie_clear_list(test_list); + + return 0; +} + +DEF_TEST(add_to_list) { + llist_t *test_list = llist_create(); + CHECK_NOT_NULL(test_list); + + int ret = pcie_add_device(test_list, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, + TEST_FUNCTION); + EXPECT_EQ_INT(0, ret); + + llentry_t *e = llist_head(test_list); + CHECK_NOT_NULL(e); + OK(NULL == e->next); + + pcie_device_t *dev 
= e->value; + CHECK_NOT_NULL(dev); + EXPECT_EQ_INT(TEST_DOMAIN, dev->domain); + EXPECT_EQ_INT(TEST_BUS, dev->bus); + EXPECT_EQ_INT(TEST_DEVICE, dev->device); + EXPECT_EQ_INT(TEST_FUNCTION, dev->function); + EXPECT_EQ_INT(-1, dev->cap_exp); + EXPECT_EQ_INT(-1, dev->ecap_aer); + + pcie_clear_list(test_list); + + return 0; +} + +DEF_TEST(pcie_read) { + int ret; + pcie_device_t dev = {0}; + uint32_t val = 0; + g_buff[0] = 4; + g_buff[1] = 3; + g_buff[2] = 2; + g_buff[3] = 1; + + ret = pcie_read(&dev, &val, 1, 0); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(4, val); + + ret = pcie_read(&dev, &val, 2, 0); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0x304, val); + + ret = pcie_read(&dev, &val, 3, 0); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0x20304, val); + + ret = pcie_read(&dev, &val, 4, 0); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0x1020304, val); + + ret = pcie_read(&dev, &val, G_BUFF_LEN + 1, 0); + EXPECT_EQ_INT(-1, ret); + + pcie_fops.read = pcie_read; + + uint8_t val8 = pcie_read8(&dev, 0); + EXPECT_EQ_INT(4, val8); + + uint16_t val16 = pcie_read16(&dev, 0); + EXPECT_EQ_INT(0x304, val16); + + uint32_t val32 = pcie_read32(&dev, 0); + EXPECT_EQ_INT(0x1020304, val32); + + return 0; +} + +DEF_TEST(dispatch_notification) { + pcie_device_t dev = {0, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, TEST_FUNCTION, + 0, 0, 0, 0, 0}; + cdtime_t t = cdtime(); + notification_t n = { + .severity = 1, .time = t, .plugin = "pcie_errors_test", .meta = NULL}; + + pcie_dispatch_notification(&dev, &n, "test_type", "test_type_instance"); + EXPECT_EQ_INT(1, last_notif.severity); + EXPECT_EQ_UINT64(t, last_notif.time); + EXPECT_EQ_STR("pcie_errors_test", last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(hostname_g, last_notif.host); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR("test_type", last_notif.type); + EXPECT_EQ_STR("test_type_instance", last_notif.type_instance); + + return 0; +} + +DEF_TEST(access_config) { + pcie_config.use_sysfs = 0; + 
pcie_access_config(); + EXPECT_EQ_UINT64(pcie_list_devices_proc, pcie_fops.list_devices); + EXPECT_EQ_UINT64(pcie_open_proc, pcie_fops.open); + EXPECT_EQ_UINT64(pcie_close, pcie_fops.close); + EXPECT_EQ_UINT64(pcie_read, pcie_fops.read); + EXPECT_EQ_STR(PCIE_DEFAULT_PROCDIR, pcie_config.access_dir); + + sstrncpy(pcie_config.access_dir, "Test", sizeof(pcie_config.access_dir)); + pcie_access_config(); + EXPECT_EQ_STR("Test", pcie_config.access_dir); + + pcie_config.use_sysfs = 1; + pcie_access_config(); + EXPECT_EQ_UINT64(pcie_list_devices_sysfs, pcie_fops.list_devices); + EXPECT_EQ_UINT64(pcie_open_sysfs, pcie_fops.open); + EXPECT_EQ_UINT64(pcie_close, pcie_fops.close); + EXPECT_EQ_UINT64(pcie_read, pcie_fops.read); + EXPECT_EQ_STR("Test", pcie_config.access_dir); + + pcie_config.access_dir[0] = '\0'; + pcie_access_config(); + EXPECT_EQ_STR(PCIE_DEFAULT_SYSFSDIR, pcie_config.access_dir); + + return 0; +} + +DEF_TEST(plugin_config_fail) { + oconfig_item_t test_cfg_parent = {"pcie_errors", NULL, 0, NULL, NULL, 0}; + char value_buff[256] = "procs"; + char key_buff[256] = "Sources"; + oconfig_value_t test_cfg_value = {{value_buff}, OCONFIG_TYPE_STRING}; + oconfig_item_t test_cfg = { + key_buff, &test_cfg_value, 1, &test_cfg_parent, NULL, 0}; + + test_cfg_parent.children = &test_cfg; + test_cfg_parent.children_num = 1; + + int ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(1, pcie_config.config_error); + pcie_config.config_error = 0; + + sstrncpy(key_buff, "Source", sizeof(key_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(1, pcie_config.config_error); + pcie_config.config_error = 0; + + sstrncpy(value_buff, "proc", sizeof(value_buff)); + test_cfg_value.type = OCONFIG_TYPE_NUMBER; + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(1, pcie_config.config_error); + pcie_config.config_error = 0; + + sstrncpy(key_buff, "AccessDir", sizeof(key_buff)); + ret 
= pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(1, pcie_config.config_error); + pcie_config.config_error = 0; + + return 0; +} + +DEF_TEST(plugin_config) { + oconfig_item_t test_cfg_parent = {"pcie_errors", NULL, 0, NULL, NULL, 0}; + char value_buff[256] = "proc"; + char key_buff[256] = "source"; + oconfig_value_t test_cfg_value = {{value_buff}, OCONFIG_TYPE_STRING}; + oconfig_item_t test_cfg = { + key_buff, &test_cfg_value, 1, &test_cfg_parent, NULL, 0}; + + test_cfg_parent.children = &test_cfg; + test_cfg_parent.children_num = 1; + + pcie_config.use_sysfs = 1; + int ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0, pcie_config.config_error); + EXPECT_EQ_INT(0, pcie_config.use_sysfs); + + pcie_config.use_sysfs = 1; + sstrncpy(value_buff, "sysfs", sizeof(value_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0, pcie_config.config_error); + EXPECT_EQ_INT(1, pcie_config.use_sysfs); + + sstrncpy(key_buff, "AccessDir", sizeof(key_buff)); + sstrncpy(value_buff, "some/test/value", sizeof(value_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0, pcie_config.config_error); + EXPECT_EQ_STR("some/test/value", pcie_config.access_dir); + + memset(&test_cfg_value.value, 0, sizeof(test_cfg_value.value)); + test_cfg_value.value.boolean = 1; + test_cfg_value.type = OCONFIG_TYPE_BOOLEAN; + sstrncpy(key_buff, "ReportMasked", sizeof(key_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0, pcie_config.config_error); + EXPECT_EQ_INT(1, pcie_config.notif_masked); + + sstrncpy(key_buff, "PersistentNotifications", sizeof(key_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0, pcie_config.config_error); + EXPECT_EQ_INT(1, pcie_config.persistent); + + return 0; +} + +#define BAD_TLP_SET_MSG "Correctable Error set: Bad TLP Status" +#define 
BAD_TLP_CLEAR_MSG "Correctable Error cleared: Bad TLP Status" + +DEF_TEST(dispatch_correctable_errors) { + pcie_device_t dev = {0, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, TEST_FUNCTION, + 0, 0, 0, 0, 0}; + pcie_config.notif_masked = 0; + pcie_config.persistent = 0; + + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + dev.correctable_errors = PCI_ERR_COR_BAD_TLP; + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.persistent = 1; + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + PCI_ERR_COR_BAD_TLP); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.notif_masked = 1; + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + PCI_ERR_COR_BAD_TLP); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + 
EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + pcie_config.persistent = 0; + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + PCI_ERR_COR_BAD_TLP); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + dev.correctable_errors = 0; + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + PCI_ERR_COR_BAD_TLP); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + pcie_config.notif_masked = 0; + dev.correctable_errors = PCI_ERR_COR_BAD_TLP; + pcie_dispatch_correctable_errors(&dev, 0, ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_CLEAR_MSG, last_notif.message); + + return 0; +} + +#define FCP_NF_SET_MSG \ + "Uncorrectable(non_fatal) Error set: Flow Control Protocol" +#define FCP_F_SET_MSG "Uncorrectable(fatal) Error set: Flow Control Protocol" +#define FCP_NF_CLEAR_MSG \ + "Uncorrectable(non_fatal) 
Error cleared: Flow Control Protocol" +#define FCP_F_CLEAR_MSG \ + "Uncorrectable(fatal) Error cleared: Flow Control Protocol" + +DEF_TEST(dispatch_uncorrectable_errors) { + pcie_device_t dev = {0, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, TEST_FUNCTION, + 0, 0, 0, 0, 0}; + pcie_config.notif_masked = 0; + pcie_config.persistent = 0; + + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + ~(PCI_ERR_UNC_FCP)); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_NF_SET_MSG, last_notif.message); + + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + dev.uncorrectable_errors = PCI_ERR_UNC_FCP; + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.persistent = 1; + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_SET_MSG, last_notif.message); + + 
memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, PCI_ERR_UNC_FCP, + PCI_ERR_UNC_FCP); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.notif_masked = 1; + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, PCI_ERR_UNC_FCP, + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_SET_MSG, last_notif.message); + + pcie_config.persistent = 0; + dev.uncorrectable_errors = 0; + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_SET_MSG, last_notif.message); + + pcie_config.notif_masked = 0; + dev.uncorrectable_errors = PCI_ERR_UNC_FCP; + pcie_dispatch_uncorrectable_errors(&dev, 0, ~(PCI_ERR_UNC_FCP), + ~(PCI_ERR_UNC_FCP)); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_NF_CLEAR_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_uncorrectable_errors(&dev, 0, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, 
last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_CLEAR_MSG, last_notif.message); + + return 0; +} + +#define UR_SET_MSG "Device Status Error set: Unsupported Request" +#define UR_CLEAR_MSG "Device Status Error cleared: Unsupported Request" +#define FE_SET_MSG "Device Status Error set: Fatal Error" +#define FE_CLEAR_MSG "Device Status Error cleared: Fatal Error" + +DEF_TEST(device_status_errors) { + pcie_device_t dev = {0, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, TEST_FUNCTION, + 0, 0, 0, 0, 0}; + pcie_config.persistent = 0; + g_buff[0] = (PCI_EXP_DEVSTA_URD & 0xff); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(UR_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.persistent = 1; + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(UR_SET_MSG, last_notif.message); + + g_buff[0] = 0; + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, 
last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(UR_CLEAR_MSG, last_notif.message); + + pcie_config.persistent = 0; + dev.device_status = PCI_EXP_DEVSTA_URD; + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(UR_CLEAR_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + g_buff[0] = (PCI_EXP_DEVSTA_FED & 0xff); + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FE_SET_MSG, last_notif.message); + + g_buff[0] = 0; + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FE_CLEAR_MSG, last_notif.message); + + return 0; +} + +int main(void) { + RUN_TEST(clear_dev_list); + RUN_TEST(add_to_list); + RUN_TEST(pcie_read); + RUN_TEST(dispatch_notification); + + RUN_TEST(access_config); + RUN_TEST(plugin_config_fail); + RUN_TEST(plugin_config); + + RUN_TEST(dispatch_correctable_errors); + RUN_TEST(dispatch_uncorrectable_errors); + RUN_TEST(device_status_errors); + + END_TEST; +} 
diff --git a/src/types.db b/src/types.db index 1b1e6f0c..15a401cf 100644 --- a/src/types.db +++ b/src/types.db @@ -172,6 +172,7 @@ objects value:GAUGE:0:U operations value:DERIVE:0:U operations_per_second value:GAUGE:0:U packets value:DERIVE:0:U +pcie_error value:GAUGE:U:U pending_operations value:GAUGE:0:U percent value:GAUGE:0:100.1 percent_bytes value:GAUGE:0:100.1 -- 2.11.0