From: Evgeny Naumov Date: Fri, 26 Oct 2018 22:13:27 +0000 (-0400) Subject: rename plugin to "gpu_nvidia" X-Git-Url: https://git.octo.it/?p=collectd.git;a=commitdiff_plain;h=504ebaee2c6ad13d63588dd04ec6bb8e07f6db85 rename plugin to "gpu_nvidia" --- diff --git a/Makefile.am b/Makefile.am index ebf59358..eaff0dd1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1014,11 +1014,11 @@ gps_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBGPS_LDFLAGS) gps_la_LIBADD = -lpthread $(BUILD_WITH_LIBGPS_LIBS) endif -if BUILD_PLUGIN_GPU_NVML -pkglib_LTLIBRARIES += gpu_nvml.la -gpu_nvml_la_SOURCES = src/gpu_nvml.c -gpu_nvml_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_GPU_CUDA_LDFLAGS) -gpu_nvml_la_LIBADD = $(BUILD_WITH_CUDA_LIBS) +if BUILD_PLUGIN_GPU_NVIDIA +pkglib_LTLIBRARIES += gpu_nvidia.la +gpu_nvidia_la_SOURCES = src/gpu_nvidia.c +gpu_nvidia_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_GPU_CUDA_LDFLAGS) +gpu_nvidia_la_LIBADD = $(BUILD_WITH_CUDA_LIBS) endif if BUILD_PLUGIN_GRPC diff --git a/README b/README index fb10a641..87760548 100644 --- a/README +++ b/README @@ -135,7 +135,7 @@ Features - gps Monitor gps related data through gpsd. - - gpu_nvml + - gpu_nvidia Monitor NVIDIA GPU statistics available through NVML. 
- hddtemp @@ -753,7 +753,7 @@ Prerequisites * CUDA (optional) - Used by the `gpu_nvml' plugin + Used by the `gpu_nvidia' plugin * libatasmart (optional) diff --git a/configure.ac b/configure.ac index ca869cde..2f73cbd6 100644 --- a/configure.ac +++ b/configure.ac @@ -6414,7 +6414,7 @@ plugin_ethstat="no" plugin_fhcount="no" plugin_fscache="no" plugin_gps="no" -plugin_gpu_nvml="no" +plugin_gpu_nvidia="no" plugin_grpc="no" plugin_hugepages="no" plugin_intel_pmu="no" @@ -6845,7 +6845,7 @@ AC_PLUGIN([filecount], [yes], [Count files in di AC_PLUGIN([fscache], [$plugin_fscache], [fscache statistics]) AC_PLUGIN([gmond], [$with_libganglia], [Ganglia plugin]) AC_PLUGIN([gps], [$plugin_gps], [GPS plugin]) -AC_PLUGIN([gpu_nvml], [$with_cuda], [NVIDIA GPU plugin]) +AC_PLUGIN([gpu_nvidia], [$with_cuda], [NVIDIA GPU plugin]) AC_PLUGIN([grpc], [$plugin_grpc], [gRPC plugin]) AC_PLUGIN([hddtemp], [yes], [Query hddtempd]) AC_PLUGIN([hugepages], [$plugin_hugepages], [Hugepages statistics]) @@ -7271,7 +7271,7 @@ AC_MSG_RESULT([ filecount . . . . . . $enable_filecount]) AC_MSG_RESULT([ fscache . . . . . . . $enable_fscache]) AC_MSG_RESULT([ gmond . . . . . . . . $enable_gmond]) AC_MSG_RESULT([ gps . . . . . . . . . $enable_gps]) -AC_MSG_RESULT([ gpu_nvml . . . . . . $enable_gpu_nvml]) +AC_MSG_RESULT([ gpu_nvidia . . . . . $enable_gpu_nvidia]) AC_MSG_RESULT([ grpc . . . . . . . . $enable_grpc]) AC_MSG_RESULT([ hddtemp . . . . . . . $enable_hddtemp]) AC_MSG_RESULT([ hugepages . . . . . . 
$enable_hugepages]) diff --git a/src/collectd.conf.in b/src/collectd.conf.in index 14afd119..b7c1b278 100644 --- a/src/collectd.conf.in +++ b/src/collectd.conf.in @@ -657,10 +657,10 @@ # PauseConnect 5 #</Plugin> -#<Plugin gpu_nvml> +#<Plugin gpu_nvidia> # GPUIndex 0 # GPUIndex 2 -# IgnoreSelected 0 +# IgnoreSelected false #</Plugin> #<Plugin grpc> diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index d2409206..cc288a61 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -3210,7 +3210,7 @@ Pause to apply between attempts of connection to gpsd in seconds (default 5 sec) =back -=head2 Plugin C<gpu_nvml> +=head2 Plugin C<gpu_nvidia> Efficiently collects various statistics from the system's NVIDIA GPUs using the NVML library. Currently collected are fan speed, core temperature, percent diff --git a/src/gpu_nvidia.c b/src/gpu_nvidia.c new file mode 100644 index 00000000..812cfeb0 --- /dev/null +++ b/src/gpu_nvidia.c @@ -0,0 +1,215 @@ +/* +Copyright 2018 Evgeny Naumov + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/
+
+#include "daemon/collectd.h"
+#include "daemon/common.h"
+#include "daemon/plugin.h"
+
+#include <nvml.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// Capacity (excluding the terminating NUL) of the buffer that receives the
+// device name in nvml_read().
+#define MAX_DEVNAME_LEN 256
+#define PLUGIN_NAME "gpu_nvidia"
+
+// Status of the most recent NVML call made through the TRY* macros, and the
+// stringified source text of the call that last failed (for log messages).
+// nv_errline only ever points at string literals, hence const.
+static nvmlReturn_t nv_status = NVML_SUCCESS;
+static const char *nv_errline = "";
+
+// Evaluate the NVML call `f`, storing its result in nv_status. On anything
+// other than NVML_SUCCESS, remember the call text in nv_errline and jump to
+// the label `catch`.
+// NOTE: this deliberately expands to a bare `if` statement rather than a
+// do { } while (0) block: several call sites omit the trailing semicolon and
+// depend on that expansion.
+#define TRY_CATCH(f, catch)                                                    \
+  if ((nv_status = f) != NVML_SUCCESS) {                                       \
+    nv_errline = #f;                                                           \
+    goto catch;                                                                \
+  }
+
+// Like TRY_CATCH, but NVML_ERROR_NOT_SUPPORTED is tolerated as well. In that
+// case no jump happens and nv_status keeps the NOT_SUPPORTED value, so the
+// caller must still test nv_status before using the output of `f`.
+#define TRY_CATCH_OPTIONAL(f, catch)                                           \
+  if ((nv_status = f) != NVML_SUCCESS &&                                       \
+      nv_status != NVML_ERROR_NOT_SUPPORTED) {                                 \
+    nv_errline = #f;                                                           \
+    goto catch;                                                                \
+  }
+
+// Shorthands that always jump to a label literally named `catch`.
+#define TRY(f) TRY_CATCH(f, catch)
+#define TRYOPT(f) TRY_CATCH_OPTIONAL(f, catch)
+
+#define KEY_GPUINDEX "GPUIndex"
+#define KEY_IGNORESELECTED "IgnoreSelected"
+
+// Configuration keys accepted by nvml_config().
+static const char *config_keys[] = {
+    KEY_GPUINDEX, KEY_IGNORESELECTED,
+};
+static const unsigned int n_config_keys = STATIC_ARRAY_SIZE(config_keys);
+
+// This is a bitflag, necessitating the (extremely conservative) assumption
+// that there are no more than 64 GPUs on this system.
+static uint64_t conf_match_mask = 0;
+static bool conf_mask_is_exclude = false;
+
+// Configuration callback (registered in module_register()).
+//   GPUIndex <n>        sets bit n of conf_match_mask; n must be in [0, 64).
+//   IgnoreSelected <b>  stores the parsed boolean in conf_mask_is_exclude.
+// Returns 0 on success, -1 if GPUIndex cannot be parsed, -2 if it is out of
+// range and -10 for an unknown key.
+static int nvml_config(const char *key, const char *value) {
+
+  if (strcasecmp(key, KEY_GPUINDEX) == 0) {
+    char *eptr;
+    unsigned long device_ix = strtoul(value, &eptr, 10);
+    if (eptr == value) {
+      ERROR(PLUGIN_NAME ": Failed to parse GPUIndex value \"%s\"", value);
+      return -1;
+    }
+    if (device_ix >= 64) {
+      ERROR(PLUGIN_NAME
+            ": At most 64 GPUs (0 <= GPUIndex < 64) are supported!");
+      return -2;
+    }
+    // Shift a 64-bit one: a plain `1` is an int, so shifting it by 31 or more
+    // is undefined and could never set the upper half of the mask.
+    conf_match_mask |= ((uint64_t)1 << device_ix);
+  } else if (strcasecmp(key, KEY_IGNORESELECTED) == 0) {
+    // strcasecmp() returns 0 on a match; without the comparison this branch
+    // fired for every key *except* IgnoreSelected.
+    conf_mask_is_exclude = IS_TRUE(value);
+  } else {
+    ERROR(PLUGIN_NAME ": Unrecognized config option %s", key);
+    return -10;
+  }
+
+  return 0;
+}
+
+// Init callback: initialises NVML. Returns 0 on success, -1 on failure.
+static int nvml_init(void) {
+  TRY(nvmlInit());
+  return 0;
+
+catch:
+  ERROR(PLUGIN_NAME ": NVML init failed with %d", nv_status);
+  return -1;
+}
+
+// Shutdown callback: shuts NVML down. Returns 0 on success, -1 on failure.
+static int nvml_shutdown(void) {
+  TRY(nvmlShutdown())
+  return 0;
+
+catch:
+  ERROR(PLUGIN_NAME ": NVML shutdown failed with %d", nv_status);
+  return -1;
+}
+
+// Dispatch a single gauge value for this plugin.
+//   plugin_instance  copied into vl.plugin_instance (the device name here)
+//   type             copied into vl.type
+//   type_instance    copied into vl.type_instance; may be NULL to leave it
+//                    at its initial value
+//   nvml             the value to submit
+static void nvml_submit_gauge(const char *plugin_instance, const char *type,
+                              const char *type_instance, gauge_t nvml) {
+
+  value_list_t vl = VALUE_LIST_INIT;
+
+  vl.values = &(value_t){.gauge = nvml};
+  vl.values_len = 1;
+
+  sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin));
+  sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance));
+
+  sstrncpy(vl.type, type, sizeof(vl.type));
+
+  if (type_instance != NULL) {
+    sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
+  }
+
+  plugin_dispatch_values(&vl);
+}
+
+// Read callback: enumerates the GPUs, applies the GPUIndex/IgnoreSelected
+// selection and submits one gauge per metric that NVML reports for each
+// selected device. A failing call on one device only skips that device (with
+// a warning). Returns 0 in that case, and -1 only if the devices cannot be
+// counted at all.
+static int nvml_read(void) {
+
+  unsigned int device_count;
+  TRY_CATCH(nvmlDeviceGetCount(&device_count), catch_nocount);
+
+  // The selection mask has 64 bits; devices beyond that are not visited.
+  if (device_count > 64) {
+    device_count = 64;
+  }
+
+  for (unsigned int ix = 0; ix < device_count; ix++) {
+
+    // An empty mask matches every device. The bit test must be done in 64
+    // bits: `1 << ix` on an int is undefined for ix >= 31.
+    bool is_match =
+        (((uint64_t)1 << ix) & conf_match_mask) || (conf_match_mask == 0);
+    // Skip matching devices when excluding, non-matching ones otherwise.
+    if (conf_mask_is_exclude == is_match) {
+      continue;
+    }
+
+    nvmlDevice_t dev;
+    TRY(nvmlDeviceGetHandleByIndex(ix, &dev));
+
+    char dev_name[MAX_DEVNAME_LEN + 1] = {0};
+    TRY(nvmlDeviceGetName(dev, dev_name, sizeof(dev_name) - 1));
+
+    // Try to be as lenient as possible with the variety of devices that are
+    // out there, ignoring any NOT_SUPPORTED errors gently.
+    nvmlMemory_t meminfo;
+    TRYOPT(nvmlDeviceGetMemoryInfo(dev, &meminfo))
+    if (nv_status == NVML_SUCCESS) {
+      nvml_submit_gauge(dev_name, "memory", "used", meminfo.used);
+      nvml_submit_gauge(dev_name, "memory", "free", meminfo.free);
+    }
+
+    nvmlUtilization_t utilization;
+    TRYOPT(nvmlDeviceGetUtilizationRates(dev, &utilization))
+    if (nv_status == NVML_SUCCESS)
+      nvml_submit_gauge(dev_name, "percent", "gpu_used", utilization.gpu);
+
+    unsigned int fan_speed;
+    TRYOPT(nvmlDeviceGetFanSpeed(dev, &fan_speed))
+    if (nv_status == NVML_SUCCESS)
+      nvml_submit_gauge(dev_name, "fanspeed", NULL, fan_speed);
+
+    unsigned int core_temp;
+    TRYOPT(nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp))
+    if (nv_status == NVML_SUCCESS)
+      nvml_submit_gauge(dev_name, "temperature", "core", core_temp);
+
+    // NOTE(review): the variable names indicate NVML reports clocks in MHz
+    // and power in mW; the factors below scale them by 1e6 and 1e-3.
+    unsigned int sm_clk_mhz;
+    TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_SM, &sm_clk_mhz))
+    if (nv_status == NVML_SUCCESS)
+      nvml_submit_gauge(dev_name, "frequency", "multiprocessor",
+                        1e6 * sm_clk_mhz);
+
+    unsigned int mem_clk_mhz;
+    TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &mem_clk_mhz))
+    if (nv_status == NVML_SUCCESS)
+      nvml_submit_gauge(dev_name, "frequency", "memory", 1e6 * mem_clk_mhz);
+
+    unsigned int power_mW;
+    TRYOPT(nvmlDeviceGetPowerUsage(dev, &power_mW))
+    if (nv_status == NVML_SUCCESS)
+      nvml_submit_gauge(dev_name, "power", NULL, 1e-3 * power_mW);
+
+    continue;
+
+  // Failures here indicate transient errors or removal of GPU. In either
+  // case it will either be resolved or the GPU will no longer be enumerated
+  // the next time round.
+  catch:
+    WARNING(PLUGIN_NAME
+            ": NVML call \"%s\" failed (%d) on dev at index %u!",
+            nv_errline, nv_status, ix);
+    continue;
+  }
+
+  return 0;
+
+// Failures here indicate serious misconfiguration; we bail out totally.
+catch_nocount:
+  ERROR(PLUGIN_NAME ": Failed to enumerate NVIDIA GPUs (\"%s\" returned %d)",
+        nv_errline, nv_status);
+  return -1;
+}
+
+// Plugin entry point: registers the init, config, read and shutdown
+// callbacks under PLUGIN_NAME.
+void module_register(void) {
+  plugin_register_init(PLUGIN_NAME, nvml_init);
+  plugin_register_config(PLUGIN_NAME, nvml_config, config_keys, n_config_keys);
+  plugin_register_read(PLUGIN_NAME, nvml_read);
+  plugin_register_shutdown(PLUGIN_NAME, nvml_shutdown);
+}
diff --git a/src/gpu_nvml.c b/src/gpu_nvml.c
deleted file mode 100644
index be188fea..00000000
--- a/src/gpu_nvml.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
-Copyright 2018 Evgeny Naumov
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-*/ - -#include "daemon/collectd.h" -#include "daemon/common.h" -#include "daemon/plugin.h" - -#include -#include -#include - -#define MAX_DEVNAME_LEN 256 -#define PLUGIN_NAME "gpu_nvml" - -static nvmlReturn_t nv_status = NVML_SUCCESS; -static char *nv_errline = ""; - -#define TRY_CATCH(f, catch) \ - if ((nv_status = f) != NVML_SUCCESS) { \ - nv_errline = #f; \ - goto catch; \ - } - -#define TRY_CATCH_OPTIONAL(f, catch) \ - if ((nv_status = f) != NVML_SUCCESS && \ - nv_status != NVML_ERROR_NOT_SUPPORTED) { \ - nv_errline = #f; \ - goto catch; \ - } - -#define TRY(f) TRY_CATCH(f, catch) -#define TRYOPT(f) TRY_CATCH_OPTIONAL(f, catch) - -#define KEY_GPUINDEX "GPUIndex" -#define KEY_IGNORESELECTED "IgnoreSelected" - -static const char *config_keys[] = { - KEY_GPUINDEX, KEY_IGNORESELECTED, -}; -static const unsigned int n_config_keys = STATIC_ARRAY_SIZE(config_keys); - -// This is a bitflag, necessitating the (extremely conservative) assumption -// that there are no more than 64 GPUs on this system. 
-static uint64_t conf_match_mask = 0; -static bool conf_mask_is_exclude = 0; - -static int nvml_config(const char *key, const char *value) { - - if (strcasecmp(key, KEY_GPUINDEX) == 0) { - char *eptr; - unsigned long device_ix = strtoul(value, &eptr, 10); - if (eptr == value) { - ERROR(PLUGIN_NAME ": Failed to parse GPUIndex value \"%s\"", value); - return -1; - } - if (device_ix >= 64) { - ERROR(PLUGIN_NAME - ": At most 64 GPUs (0 <= GPUIndex < 64) are supported!"); - return -2; - } - conf_match_mask |= (1 << device_ix); - } else if (strcasecmp(key, KEY_IGNORESELECTED)) { - conf_mask_is_exclude = IS_TRUE(value); - } else { - ERROR(PLUGIN_NAME ": Unrecognized config option %s", key); - return -10; - } - - return 0; -} - -static int nvml_init(void) { - TRY(nvmlInit()); - return 0; - - catch : ERROR(PLUGIN_NAME ": NVML init failed with %d", nv_status); - return -1; -} - -static int nvml_shutdown(void) { - TRY(nvmlShutdown()) - return 0; - - catch : ERROR(PLUGIN_NAME ": NVML shutdown failed with %d", nv_status); - return -1; -} - -static void nvml_submit_gauge(const char *plugin_instance, const char *type, - const char *type_instance, gauge_t nvml) { - - value_list_t vl = VALUE_LIST_INIT; - - vl.values = &(value_t){.gauge = nvml}; - vl.values_len = 1; - - sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin)); - sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance)); - - sstrncpy(vl.type, type, sizeof(vl.type)); - - if (type_instance != NULL) { - sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance)); - } - - plugin_dispatch_values(&vl); -} - -static int nvml_read(void) { - - unsigned int device_count; - TRY_CATCH(nvmlDeviceGetCount(&device_count), catch_nocount); - - if (device_count > 64) { - device_count = 64; - } - - for (unsigned int ix = 0; ix < device_count; ix++) { - - unsigned int is_match = - ((1 << ix) & conf_match_mask) || (conf_match_mask == 0); - if (conf_mask_is_exclude == !!is_match) { - continue; - } - - nvmlDevice_t dev; 
- TRY(nvmlDeviceGetHandleByIndex(ix, &dev)); - - char dev_name[MAX_DEVNAME_LEN + 1] = {0}; - TRY(nvmlDeviceGetName(dev, dev_name, sizeof(dev_name) - 1)); - - // Try to be as lenient as possible with the variety of devices that are - // out there, ignoring any NOT_SUPPORTED errors gently. - nvmlMemory_t meminfo; - TRYOPT(nvmlDeviceGetMemoryInfo(dev, &meminfo)) - if (nv_status == NVML_SUCCESS) { - nvml_submit_gauge(dev_name, "memory", "used", meminfo.used); - nvml_submit_gauge(dev_name, "memory", "free", meminfo.free); - } - - nvmlUtilization_t utilization; - TRYOPT(nvmlDeviceGetUtilizationRates(dev, &utilization)) - if (nv_status == NVML_SUCCESS) - nvml_submit_gauge(dev_name, "percent", "gpu_used", utilization.gpu); - - unsigned int fan_speed; - TRYOPT(nvmlDeviceGetFanSpeed(dev, &fan_speed)) - if (nv_status == NVML_SUCCESS) - nvml_submit_gauge(dev_name, "fanspeed", NULL, fan_speed); - - unsigned int core_temp; - TRYOPT(nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp)) - if (nv_status == NVML_SUCCESS) - nvml_submit_gauge(dev_name, "temperature", "core", core_temp); - - unsigned int sm_clk_mhz; - TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_SM, &sm_clk_mhz)) - if (nv_status == NVML_SUCCESS) - nvml_submit_gauge(dev_name, "frequency", "sm", 1e6 * sm_clk_mhz); - - unsigned int mem_clk_mhz; - TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &mem_clk_mhz)) - if (nv_status == NVML_SUCCESS) - nvml_submit_gauge(dev_name, "frequency", "mem", 1e6 * mem_clk_mhz); - - unsigned int power_mW; - TRYOPT(nvmlDeviceGetPowerUsage(dev, &power_mW)) - if (nv_status == NVML_SUCCESS) - nvml_submit_gauge(dev_name, "power", NULL, 1e-3 * power_mW); - - continue; - - // Failures here indicate transient errors or removal of GPU. In either - // case it will either be resolved or the GPU will no longer be enumerated - // the next time round. 
- catch : WARNING(PLUGIN_NAME - ": NVML call \"%s\" failed (%d) on dev at index %d!", - nv_errline, nv_status, ix); - continue; - } - - return 0; - -// Failures here indicate serious misconfiguration; we bail out totally. -catch_nocount: - ERROR(PLUGIN_NAME ": Failed to enumerate NVIDIA GPUs (\"%s\" returned %d)", - nv_errline, nv_status); - return -1; -} - -void module_register(void) { - plugin_register_init(PLUGIN_NAME, nvml_init); - plugin_register_config(PLUGIN_NAME, nvml_config, config_keys, n_config_keys); - plugin_register_read(PLUGIN_NAME, nvml_read); - plugin_register_shutdown(PLUGIN_NAME, nvml_shutdown); -}