1 #include "daemon/collectd.h"
2 #include "daemon/common.h"
3 #include "daemon/plugin.h"
// Length passed to nvmlDeviceGetName in nvml_read; the dev_name buffer there
// is declared one byte larger than this.
9 #define MAX_DEVNAME_LEN 256
// Plugin name used for every callback registration in module_register and
// copied into each dispatched value list by nvml_submit.
10 #define PLUGIN_NAME "gpu_nvml"
// Return code of the most recent NVML call made through the TRY* macros.
12 static nvmlReturn_t nv_status = NVML_SUCCESS;
// Text printed in log messages as the NVML call that failed. Where it is
// assigned is not visible in this excerpt (presumably inside the TRY* macro
// bodies -- confirm).
// NOTE(review): this is a non-const pointer to a string literal; `const char *`
// would be the safer type if nothing writes through it -- confirm.
13 static char *nv_errline = "";
// Error-handling helpers for NVML calls.
//
// TRY_CATCH(f, catch): evaluates `f`, stores its result in nv_status, and
//   enters the failure branch when the result is not NVML_SUCCESS.
// TRY_CATCH_OPTIONAL(f, catch): same, except that NVML_ERROR_NOT_SUPPORTED
//   does not enter the failure branch. nv_status still holds that code
//   afterwards, which is why nvml_read re-checks nv_status before submitting.
// The failure-branch bodies are not visible in this excerpt (presumably they
// record the call in nv_errline and jump to the `catch` label -- confirm).
//
// TRY / TRYOPT: shorthands that pass a label literally named `catch`; the
//   functions in this file that use them each define such a label.
// WRAPGAUGE(x): builds a value_t whose gauge member is x cast to gauge_t.
15 #define TRY_CATCH(f, catch) \
16 if ((nv_status = f) != NVML_SUCCESS) { \
21 #define TRY_CATCH_OPTIONAL(f, catch) \
22 if ((nv_status = f) != NVML_SUCCESS && \
23 nv_status != NVML_ERROR_NOT_SUPPORTED) { \
28 #define TRY(f) TRY_CATCH(f, catch)
29 #define TRYOPT(f) TRY_CATCH_OPTIONAL(f, catch)
31 #define WRAPGAUGE(x) ((value_t){.gauge = (gauge_t)(x)})
// Configuration keys accepted by this plugin. nvml_config compares the
// incoming key against these entries by index (0 = "GPUIndex",
// 1 = "IgnoreSelected"), so the order of the entries matters.
// n_config_keys is the entry count handed to plugin_register_config.
33 static const char *config_keys[] = {
34 "GPUIndex", "IgnoreSelected",
36 static const unsigned int n_config_keys = STATIC_ARRAY_SIZE(config_keys);
38 // This is a bitflag, necessitating the (extremely conservative) assumption
39 // that there are no more than 64 GPUs on this system.
// Bit N stands for GPU index N as supplied through a GPUIndex option;
// nvml_read treats an all-zero mask as "every device matches".
40 static uint64_t conf_match_mask = 0;
// Set by the IgnoreSelected handling in nvml_config. nvml_read compares it
// with the per-device match result to decide which devices are acted on.
41 static bool conf_mask_is_exclude = 0;
// Configuration callback (registered in module_register).
//
// key:   option name, matched case-insensitively against config_keys.
// value: option argument as a string.
//
// "GPUIndex" parses `value` as a base-10 index and sets the matching bit in
// conf_match_mask; "IgnoreSelected" sets conf_mask_is_exclude when `value` is
// true according to IS_TRUE. Validation of the parsed index, the handling of
// unknown keys and the return statements are not visible in this excerpt.
43 static int nvml_config(const char *key, const char *value) {
45 unsigned long device_ix;
48 if (strcasecmp(key, config_keys[0]) == 0) {
49 device_ix = strtoul(value, &eptr, 10);
// The mask is 64 bits wide, so the shifted constant has to be as well: a bare
// `1` is an int, and shifting a 32-bit int left by 31 or more is undefined.
// NOTE(review): the shift is still only defined for device_ix < 64 -- confirm
// that the lines not shown here reject larger values before this point.
56 conf_match_mask |= ((uint64_t)1 << device_ix);
// strcasecmp returns 0 on a match, so the result must be compared with 0.
// Testing the raw return value selected this branch for every key *except*
// "IgnoreSelected".
57 } else if (strcasecmp(key, config_keys[1]) == 0) {
59 IS_TRUE(value) { conf_mask_is_exclude = 1; }
// Init callback (registered in module_register).
// Only the failure label is visible in this excerpt: it logs the NVML status
// code held in nv_status. The NVML call that can jump here and the return
// statements are not shown.
67 static int nvml_init(void) {
71 catch : ERROR("NVML init failed with %d", nv_status);
// Shutdown callback (registered in module_register).
// Only the failure label is visible in this excerpt: it logs the NVML status
// code held in nv_status. The NVML call that can jump here and the return
// statements are not shown.
75 static int nvml_shutdown(void) {
79 catch : ERROR("NVML shutdown failed with %d", nv_status);
// Fills in a value list for one metric and dispatches it.
//
// plugin_instance: copied into vl.plugin_instance (nvml_read passes the
//                  device name).
// type:            copied into vl.type.
// type_instance:   copied into vl.type_instance; may be NULL, in which case
//                  that field is not written here.
// nvml:            the value to report; the lines that attach it to the value
//                  list are not visible in this excerpt.
83 static void nvml_submit(const char *plugin_instance, const char *type,
84 const char *type_instance, value_t nvml) {
86 value_list_t vl = VALUE_LIST_INIT;
// Every copy is bounded by the size of its destination field.
91 sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin));
92 sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance));
94 sstrncpy(vl.type, type, sizeof(vl.type));
// The type instance is optional; callers pass NULL for metrics without one.
96 if (type_instance != NULL) {
97 sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
100 plugin_dispatch_values(&vl);
// Read callback (registered in module_register).
//
// Asks NVML for the device count, then walks the device indices. For every
// index that passes the GPUIndex / IgnoreSelected filter it fetches the device
// handle and name and submits, with the device name as plugin instance:
//   percent/mem_used, percent/gpu_used, fanspeed, temperature/core,
//   frequency/sm, frequency/mem and power.
// A failing device-count query jumps to catch_nocount; a failing per-device
// call jumps to the `catch` label inside the loop. Return statements and the
// bodies of several branches are not visible in this excerpt.
103 static int nvml_read(void) {
105 unsigned int device_count;
106 TRY_CATCH(nvmlDeviceGetCount(&device_count), catch_nocount);
// The selection mask has 64 bits, so more devices than that cannot be
// addressed (the body of this branch is not visible here).
108 if (device_count > 64) {
112 for (int ix = 0; ix < device_count; ix++) {
// A device matches when its bit is set or when no GPUIndex was configured at
// all. The constant is widened to 64 bits before shifting: a bare `1` is an
// int, so `1 << ix` is undefined for ix >= 31 on a 32-bit int and could never
// test the upper half of the mask.
114 int is_match = (((uint64_t)1 << ix) & conf_match_mask) || (conf_match_mask == 0);
// `!!` normalises is_match to 0/1 so it can be compared with the bool flag.
115 if (conf_mask_is_exclude == !!is_match) {
120 TRY(nvmlDeviceGetHandleByIndex(ix, &dev));
// The buffer is one byte larger than the length handed to NVML.
122 char dev_name[MAX_DEVNAME_LEN + 1];
124 TRY(nvmlDeviceGetName(dev, dev_name, MAX_DEVNAME_LEN));
126 // Try to be as lenient as possible with the variety of devices that are
127 // out there, ignoring any NOT_SUPPORTED errors gently.
// TRYOPT leaves NVML_ERROR_NOT_SUPPORTED in nv_status without jumping, hence
// the explicit nv_status check before each submit below.
128 nvmlMemory_t meminfo;
129 TRYOPT(nvmlDeviceGetMemoryInfo(dev, &meminfo))
130 if (nv_status == NVML_SUCCESS) {
// NOTE(review): if meminfo.total can ever be 0 this division yields a
// non-finite value -- confirm whether a guard is wanted.
131 double pct_mem_used = 100. * (double)meminfo.used / meminfo.total;
132 nvml_submit(dev_name, "percent", "mem_used", WRAPGAUGE(pct_mem_used));
135 nvmlUtilization_t utilization;
136 TRYOPT(nvmlDeviceGetUtilizationRates(dev, &utilization))
137 if (nv_status == NVML_SUCCESS)
138 nvml_submit(dev_name, "percent", "gpu_used", WRAPGAUGE(utilization.gpu));
140 unsigned int fan_speed;
141 TRYOPT(nvmlDeviceGetFanSpeed(dev, &fan_speed))
142 if (nv_status == NVML_SUCCESS)
143 nvml_submit(dev_name, "fanspeed", NULL, WRAPGAUGE(fan_speed));
145 unsigned int core_temp;
146 TRYOPT(nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp))
147 if (nv_status == NVML_SUCCESS)
148 nvml_submit(dev_name, "temperature", "core", WRAPGAUGE(core_temp));
// Clock readings are scaled by 1e6 before submission (the variable names
// suggest MHz in, Hz out).
150 unsigned int sm_clk_mhz;
151 TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_SM, &sm_clk_mhz))
152 if (nv_status == NVML_SUCCESS)
153 nvml_submit(dev_name, "frequency", "sm", WRAPGAUGE(1e6 * sm_clk_mhz));
155 unsigned int mem_clk_mhz;
156 TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &mem_clk_mhz))
157 if (nv_status == NVML_SUCCESS)
158 nvml_submit(dev_name, "frequency", "mem", WRAPGAUGE(1e6 * mem_clk_mhz));
// Power is scaled by 1e-3 before submission (the variable name suggests
// milliwatts in, watts out).
160 unsigned int power_mW;
161 TRYOPT(nvmlDeviceGetPowerUsage(dev, &power_mW))
162 if (nv_status == NVML_SUCCESS)
163 nvml_submit(dev_name, "power", NULL, WRAPGAUGE(1e-3 * power_mW));
167 // Failures here indicate transient errors or removal of GPU. In either
168 // case it will either be resolved or the GPU will no longer be enumerated
169 // the next time round.
170 catch : WARNING("NVML call \"%s\" failed with code %d on dev at index %d!",
171 nv_errline, nv_status, ix);
177 // Failures here indicate serious misconfiguration; we bail out totally.
179 ERROR("Failed to enumerate NVIDIA GPUs (\"%s\" returned %d)", nv_errline,
// Plugin entry point: registers the init, config, read and shutdown callbacks
// of this file under PLUGIN_NAME. The config registration also passes the
// list of accepted keys and its length.
184 void module_register(void) {
185 plugin_register_init(PLUGIN_NAME, nvml_init);
186 plugin_register_config(PLUGIN_NAME, nvml_config, config_keys, n_config_keys);
187 plugin_register_read(PLUGIN_NAME, nvml_read);
188 plugin_register_shutdown(PLUGIN_NAME, nvml_shutdown);