From: Florian Forster Date: Sat, 27 Oct 2018 08:53:12 +0000 (+0200) Subject: Merge pull request #2979 from rpv-tomsk/collectd-master X-Git-Url: https://git.octo.it/?a=commitdiff_plain;h=935aa663ec5a952c51c3b2cd36d6ca70377e3c61;hp=c6b9bda2432a1f46712b8b64aa2b5e45a4d9f8b5;p=collectd.git Merge pull request #2979 from rpv-tomsk/collectd-master swap plugin: Drop support for Linux 2.4. --- diff --git a/Makefile.am b/Makefile.am index f929ffc2..eaff0dd1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1014,6 +1014,13 @@ gps_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBGPS_LDFLAGS) gps_la_LIBADD = -lpthread $(BUILD_WITH_LIBGPS_LIBS) endif +if BUILD_PLUGIN_GPU_NVIDIA +pkglib_LTLIBRARIES += gpu_nvidia.la +gpu_nvidia_la_SOURCES = src/gpu_nvidia.c +gpu_nvidia_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_GPU_CUDA_LDFLAGS) +gpu_nvidia_la_LIBADD = $(BUILD_WITH_CUDA_LIBS) +endif + if BUILD_PLUGIN_GRPC pkglib_LTLIBRARIES += grpc.la grpc_la_SOURCES = src/grpc.cc diff --git a/README b/README index a5947038..87760548 100644 --- a/README +++ b/README @@ -135,6 +135,9 @@ Features - gps Monitor gps related data through gpsd. + - gpu_nvidia + Monitor NVIDIA GPU statistics available through NVML. + - hddtemp Hard disk temperatures using hddtempd. @@ -749,6 +752,10 @@ Prerequisites particular. + * CUDA (optional) + Used by the `gpu_nvidia' plugin + + * libatasmart (optional) Used by the `smart' plugin. 
diff --git a/configure.ac b/configure.ac index 86296d9f..3b6d10e8 100644 --- a/configure.ac +++ b/configure.ac @@ -2074,6 +2074,58 @@ if test "x$with_kvm_openfiles" = "xyes"; then with_libkvm="yes" fi +# --with-cuda {{{ +# only CUDA provides the nvml.h header +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda@<:@=PREFIX@:>@], [Path to cuda.])], + [ + if test "x$withval" = "xyes"; then + with_cuda="yes" + else if test "x$withval" = "xno"; then + with_cuda="no" + else + with_cuda="yes" + CUDA_CFLAGS="$CUDA_CFLAGS -I$withval/include" + CUDA_LDFLAGS="$CUDA_LDFLAGS -L$withval/lib" + fi; fi + ], + [ with_cuda="yes" + CUDA_CFLAGS="$CUDA_CFLAGS -I/opt/cuda/include" + CUDA_LDFLAGS="$CUDA_LDFLAGS -L/opt/cuda/lib64" + ] +) + +SAVE_CFLAGS="$CFLAGS" +SAVE_LDFLAGS="$LDFLAGS" +CFLAGS="$CFLAGS $CUDA_CFLAGS" +LDFLAGS="$LDFLAGS $CUDA_LDFLAGS" + +if test "x$with_cuda" = "xyes"; then + AC_CHECK_HEADERS([nvml.h], + [with_cuda="yes"], + [with_cuda="no (header file missing)"] + ) +fi + +if test "x$with_cuda" = "xpkgconfig"; then + AC_CHECK_HEADERS([nvml.h], + [], + [with_cuda="no (header file missing)"] + ) +fi + +if test "x$with_cuda" = "xyes"; then + BUILD_WITH_CUDA_CFLAGS="$CUDA_CFLAGS" + BUILD_WITH_CUDA_LDFLAGS="$CUDA_LDFLAGS" + BUILD_WITH_CUDA_LIBS="-lnvidia-ml" +fi + +AC_SUBST([BUILD_WITH_CUDA_CFLAGS]) +AC_SUBST([BUILD_WITH_CUDA_LDFLAGS]) +AC_SUBST([BUILD_WITH_CUDA_LIBS]) + +# }}} + # --with-libaquaero5 {{{ AC_ARG_WITH([libaquaero5], [AS_HELP_STRING([--with-libaquaero5@<:@=PREFIX@:>@], [Path to aquatools-ng source code.])], @@ -6362,6 +6414,7 @@ plugin_ethstat="no" plugin_fhcount="no" plugin_fscache="no" plugin_gps="no" +plugin_gpu_nvidia="no" plugin_grpc="no" plugin_hugepages="no" plugin_intel_pmu="no" @@ -6792,6 +6845,7 @@ AC_PLUGIN([filecount], [yes], [Count files in di AC_PLUGIN([fscache], [$plugin_fscache], [fscache statistics]) AC_PLUGIN([gmond], [$with_libganglia], [Ganglia plugin]) AC_PLUGIN([gps], [$plugin_gps], [GPS plugin]) +AC_PLUGIN([gpu_nvidia], [$with_cuda], [NVIDIA 
GPU plugin]) AC_PLUGIN([grpc], [$plugin_grpc], [gRPC plugin]) AC_PLUGIN([hddtemp], [yes], [Query hddtempd]) AC_PLUGIN([hugepages], [$plugin_hugepages], [Hugepages statistics]) @@ -7102,6 +7156,7 @@ AC_MSG_RESULT([ YACC . . . . . . . . $YACC]) AC_MSG_RESULT([ YFLAGS . . . . . . . $YFLAGS]) AC_MSG_RESULT() AC_MSG_RESULT([ Libraries:]) +AC_MSG_RESULT([ cuda . . . . . . . . $with_cuda]) AC_MSG_RESULT([ intel mic . . . . . . $with_mic]) AC_MSG_RESULT([ libaquaero5 . . . . . $with_libaquaero5]) AC_MSG_RESULT([ libatasmart . . . . . $with_libatasmart]) @@ -7216,6 +7271,7 @@ AC_MSG_RESULT([ filecount . . . . . . $enable_filecount]) AC_MSG_RESULT([ fscache . . . . . . . $enable_fscache]) AC_MSG_RESULT([ gmond . . . . . . . . $enable_gmond]) AC_MSG_RESULT([ gps . . . . . . . . . $enable_gps]) +AC_MSG_RESULT([ gpu_nvidia . . . . . $enable_gpu_nvidia]) AC_MSG_RESULT([ grpc . . . . . . . . $enable_grpc]) AC_MSG_RESULT([ hddtemp . . . . . . . $enable_hddtemp]) AC_MSG_RESULT([ hugepages . . . . . . $enable_hugepages]) diff --git a/src/collectd.conf.in b/src/collectd.conf.in index 94214759..b7c1b278 100644 --- a/src/collectd.conf.in +++ b/src/collectd.conf.in @@ -657,6 +657,12 @@ # PauseConnect 5 #</Plugin> +#<Plugin gpu_nvidia> +# GPUIndex 0 +# GPUIndex 2 +# IgnoreSelected false +#</Plugin> + #<Plugin grpc> # <Server "example.org" "50051"> # EnableSSL true diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index bb2aace5..d830573b 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -3210,6 +3210,30 @@ Pause to apply between attempts of connection to gpsd in seconds (default 5 sec) =back +=head2 Plugin C<gpu_nvidia> + +Efficiently collects various statistics from the system's NVIDIA GPUs using the +NVML library. Currently collected are fan speed, core temperature, percent +load, percent memory used, compute and memory frequencies, and power +consumption. + +=over 4 + +=item B<GPUIndex> + +If one or more of these options is specified, only GPUs at that index (as +determined by nvidia-utils through I<nvidia-smi>) have statistics collected. 
+If no instance of this option is specified, all GPUs are monitored. + +=item B<IgnoreSelected> + +If set to true, all detected GPUs B<except> the ones at indices specified by +B<GPUIndex> entries are collected. For greater clarity, setting IgnoreSelected +without any GPUIndex directives will result in B<no> statistics being +collected. + +=back + =head2 Plugin C<grpc> The I<grpc> plugin provides an RPC interface to submit values to or query diff --git a/src/gpu_nvidia.c b/src/gpu_nvidia.c new file mode 100644 index 00000000..812cfeb0 --- /dev/null +++ b/src/gpu_nvidia.c @@ -0,0 +1,215 @@ +/* +Copyright 2018 Evgeny Naumov + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "daemon/collectd.h" +#include "daemon/common.h" +#include "daemon/plugin.h" + +#include +#include +#include + +#define MAX_DEVNAME_LEN 256 +#define PLUGIN_NAME "gpu_nvidia" + +static nvmlReturn_t nv_status = NVML_SUCCESS; +static char *nv_errline = ""; + +#define TRY_CATCH(f, catch) \ + if ((nv_status = f) != NVML_SUCCESS) { \ + nv_errline = #f; \ + goto catch; \ + } + +#define TRY_CATCH_OPTIONAL(f, catch) \ + if ((nv_status = f) != NVML_SUCCESS && \ + nv_status != NVML_ERROR_NOT_SUPPORTED) { \ + nv_errline = #f; \ + goto catch; \ + } + +#define TRY(f) TRY_CATCH(f, catch) +#define TRYOPT(f) TRY_CATCH_OPTIONAL(f, catch) + +#define KEY_GPUINDEX "GPUIndex" +#define KEY_IGNORESELECTED "IgnoreSelected" + +static const char *config_keys[] = { + KEY_GPUINDEX, KEY_IGNORESELECTED, +}; +static const unsigned int n_config_keys = STATIC_ARRAY_SIZE(config_keys); + +// This is a bitflag, necessitating the (extremely conservative) assumption +// that there are no more than 64 GPUs on this system. 
+static uint64_t conf_match_mask = 0; +static bool conf_mask_is_exclude = 0; + +static int nvml_config(const char *key, const char *value) { + + if (strcasecmp(key, KEY_GPUINDEX) == 0) { + char *eptr; + unsigned long device_ix = strtoul(value, &eptr, 10); + if (eptr == value) { + ERROR(PLUGIN_NAME ": Failed to parse GPUIndex value \"%s\"", value); + return -1; + } + if (device_ix >= 64) { + ERROR(PLUGIN_NAME + ": At most 64 GPUs (0 <= GPUIndex < 64) are supported!"); + return -2; + } + conf_match_mask |= (1 << device_ix); + } else if (strcasecmp(key, KEY_IGNORESELECTED)) { + conf_mask_is_exclude = IS_TRUE(value); + } else { + ERROR(PLUGIN_NAME ": Unrecognized config option %s", key); + return -10; + } + + return 0; +} + +static int nvml_init(void) { + TRY(nvmlInit()); + return 0; + + catch : ERROR(PLUGIN_NAME ": NVML init failed with %d", nv_status); + return -1; +} + +static int nvml_shutdown(void) { + TRY(nvmlShutdown()) + return 0; + + catch : ERROR(PLUGIN_NAME ": NVML shutdown failed with %d", nv_status); + return -1; +} + +static void nvml_submit_gauge(const char *plugin_instance, const char *type, + const char *type_instance, gauge_t nvml) { + + value_list_t vl = VALUE_LIST_INIT; + + vl.values = &(value_t){.gauge = nvml}; + vl.values_len = 1; + + sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin)); + sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance)); + + sstrncpy(vl.type, type, sizeof(vl.type)); + + if (type_instance != NULL) { + sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance)); + } + + plugin_dispatch_values(&vl); +} + +static int nvml_read(void) { + + unsigned int device_count; + TRY_CATCH(nvmlDeviceGetCount(&device_count), catch_nocount); + + if (device_count > 64) { + device_count = 64; + } + + for (unsigned int ix = 0; ix < device_count; ix++) { + + unsigned int is_match = + ((1 << ix) & conf_match_mask) || (conf_match_mask == 0); + if (conf_mask_is_exclude == !!is_match) { + continue; + } + + nvmlDevice_t dev; 
+ TRY(nvmlDeviceGetHandleByIndex(ix, &dev)); + + char dev_name[MAX_DEVNAME_LEN + 1] = {0}; + TRY(nvmlDeviceGetName(dev, dev_name, sizeof(dev_name) - 1)); + + // Try to be as lenient as possible with the variety of devices that are + // out there, ignoring any NOT_SUPPORTED errors gently. + nvmlMemory_t meminfo; + TRYOPT(nvmlDeviceGetMemoryInfo(dev, &meminfo)) + if (nv_status == NVML_SUCCESS) { + nvml_submit_gauge(dev_name, "memory", "used", meminfo.used); + nvml_submit_gauge(dev_name, "memory", "free", meminfo.free); + } + + nvmlUtilization_t utilization; + TRYOPT(nvmlDeviceGetUtilizationRates(dev, &utilization)) + if (nv_status == NVML_SUCCESS) + nvml_submit_gauge(dev_name, "percent", "gpu_used", utilization.gpu); + + unsigned int fan_speed; + TRYOPT(nvmlDeviceGetFanSpeed(dev, &fan_speed)) + if (nv_status == NVML_SUCCESS) + nvml_submit_gauge(dev_name, "fanspeed", NULL, fan_speed); + + unsigned int core_temp; + TRYOPT(nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp)) + if (nv_status == NVML_SUCCESS) + nvml_submit_gauge(dev_name, "temperature", "core", core_temp); + + unsigned int sm_clk_mhz; + TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_SM, &sm_clk_mhz)) + if (nv_status == NVML_SUCCESS) + nvml_submit_gauge(dev_name, "frequency", "multiprocessor", + 1e6 * sm_clk_mhz); + + unsigned int mem_clk_mhz; + TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &mem_clk_mhz)) + if (nv_status == NVML_SUCCESS) + nvml_submit_gauge(dev_name, "frequency", "memory", 1e6 * mem_clk_mhz); + + unsigned int power_mW; + TRYOPT(nvmlDeviceGetPowerUsage(dev, &power_mW)) + if (nv_status == NVML_SUCCESS) + nvml_submit_gauge(dev_name, "power", NULL, 1e-3 * power_mW); + + continue; + + // Failures here indicate transient errors or removal of GPU. In either + // case it will either be resolved or the GPU will no longer be enumerated + // the next time round. 
+ catch : WARNING(PLUGIN_NAME + ": NVML call \"%s\" failed (%d) on dev at index %d!", + nv_errline, nv_status, ix); + continue; + } + + return 0; + +// Failures here indicate serious misconfiguration; we bail out totally. +catch_nocount: + ERROR(PLUGIN_NAME ": Failed to enumerate NVIDIA GPUs (\"%s\" returned %d)", + nv_errline, nv_status); + return -1; +} + +void module_register(void) { + plugin_register_init(PLUGIN_NAME, nvml_init); + plugin_register_config(PLUGIN_NAME, nvml_config, config_keys, n_config_keys); + plugin_register_read(PLUGIN_NAME, nvml_read); + plugin_register_shutdown(PLUGIN_NAME, nvml_shutdown); +}