- TRY(nvmlDeviceGetMemoryInfo(dev, &meminfo))
- TRY(nvmlDeviceGetUtilizationRates(dev, &utilization))
- TRY(nvmlDeviceGetFanSpeed(dev, &fan_speed))
- TRY(nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp))
+ // Be as lenient as possible with the variety of devices out there:
+ // quietly ignore NOT_SUPPORTED errors and skip just that one metric.
+ nvmlMemory_t meminfo;
+ TRYOPT(nvmlDeviceGetMemoryInfo(dev, &meminfo))
+ if (nv_status == NVML_SUCCESS) {
+ double pct_mem_used = 100. * (double)meminfo.used / meminfo.total;
+ nvml_submit(dev_name, "percent", "mem_used", WRAPGAUGE(pct_mem_used));
+ }
+
+ nvmlUtilization_t utilization;
+ TRYOPT(nvmlDeviceGetUtilizationRates(dev, &utilization))
+ if (nv_status == NVML_SUCCESS)
+ nvml_submit(dev_name, "percent", "gpu_used", WRAPGAUGE(utilization.gpu));
+
+ unsigned int fan_speed;
+ TRYOPT(nvmlDeviceGetFanSpeed(dev, &fan_speed))
+ if (nv_status == NVML_SUCCESS)
+ nvml_submit(dev_name, "fanspeed", NULL, WRAPGAUGE(fan_speed));
+
+ unsigned int core_temp;
+ TRYOPT(nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp))
+ if (nv_status == NVML_SUCCESS)
+ nvml_submit(dev_name, "temperature", "core", WRAPGAUGE(core_temp));
+
+ unsigned int sm_clk_mhz;
+ TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_SM, &sm_clk_mhz))
+ if (nv_status == NVML_SUCCESS)
+ nvml_submit(dev_name, "frequency", "sm", WRAPGAUGE(1e6 * sm_clk_mhz));
+
+ unsigned int mem_clk_mhz;
+ TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &mem_clk_mhz))
+ if (nv_status == NVML_SUCCESS)
+ nvml_submit(dev_name, "frequency", "mem", WRAPGAUGE(1e6 * mem_clk_mhz));