diff --git a/flake.nix b/flake.nix index 0759feb4..013aaa62 100644 --- a/flake.nix +++ b/flake.nix @@ -317,6 +317,10 @@ if isDarwin then "/Users/${device.user}" else "/home/${device.user}"; + # output = + # if isDarwin + # then self.darwinConfigurations."${device.name}" + # else self.nixosConfigurations."${device.name}"; }; nixos_devices = nixpkgs.lib.attrsets.filterAttrs (n: x: x.isNix) devices; diff --git a/nixos/mirai/services/dashboards/enhanced-overview.json b/nixos/mirai/services/dashboards/enhanced-overview.json new file mode 100644 index 00000000..61834d50 --- /dev/null +++ b/nixos/mirai/services/dashboards/enhanced-overview.json @@ -0,0 +1,912 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + 
"value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"tsuba-node\",mode=\"idle\"}[5m])) * 100)", + "legendFormat": "Tsuba CPU Usage", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"ryu-node\",mode=\"idle\"}[5m])) * 100)", + "legendFormat": "Ryu CPU Usage", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"mirai-node\",mode=\"idle\"}[5m])) * 100)", + "legendFormat": "Mirai CPU Usage", + "range": true, + "refId": "C" + } + ], + "title": "CPU Usage - All Devices", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(node_memory_MemTotal_bytes{job=\"tsuba-node\"} - node_memory_MemAvailable_bytes{job=\"tsuba-node\"}) / node_memory_MemTotal_bytes{job=\"tsuba-node\"} * 100", + "legendFormat": "Tsuba Memory Usage", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(node_memory_MemTotal_bytes{job=\"ryu-node\"} - node_memory_MemAvailable_bytes{job=\"ryu-node\"}) / node_memory_MemTotal_bytes{job=\"ryu-node\"} * 100", + "legendFormat": "Ryu Memory Usage", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(node_memory_MemTotal_bytes{job=\"mirai-node\"} - node_memory_MemAvailable_bytes{job=\"mirai-node\"}) / node_memory_MemTotal_bytes{job=\"mirai-node\"} * 100", + "legendFormat": "Mirai Memory Usage", + "range": true, + "refId": "C" + } + ], + "title": "Memory Usage - All Devices", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": 
"red", + "value": 10 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failed Services" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Active Services" + }, + "properties": [ + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 3, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "systemd_units{job=~\"tsuba-systemd|ryu-systemd|mirai-systemd\",state=\"active\"}", + "format": "table", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "systemd_units{job=~\"tsuba-systemd|ryu-systemd|mirai-systemd\",state=\"failed\"}", + "format": "table", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "SystemD Service Status", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "groupBy", + "options": { + "fields": { + "Value #A": { + "aggregations": ["sum"], + "operation": "aggregate" + }, + "Value #B": { + "aggregations": ["sum"], + "operation": "aggregate" + }, + "job": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": {}, + "renameByName": { + "Value #A (sum)": "Active Services", + "Value #B (sum)": "Failed Services", + "job": "Device" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "zfs_dataset_used_bytes{job=~\"tsuba-zfs|ryu-zfs|mirai-zfs\"} / zfs_dataset_available_bytes{job=~\"tsuba-zfs|ryu-zfs|mirai-zfs\"} * 100", + "legendFormat": "{{job}} - {{dataset}}", + "range": true, + "refId": "A" + } + ], + "title": "ZFS Pool Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 40 + }, + { + "color": "red", + "value": 60 + } + ] + } + }, + "overrides": [ + { + "matcher": { + 
"id": "byName", + "options": "Temperature" + }, + "properties": [ + { + "id": "unit", + "value": "celsius" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "gauge" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Health" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "FAIL" + }, + "1": { + "color": "green", + "index": 0, + "text": "OK" + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "smartctl_device_smart_healthy{job=~\"tsuba-smartctl|ryu-smartctl|mirai-smartctl\"}", + "format": "table", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "smartctl_device_temperature{job=~\"tsuba-smartctl|ryu-smartctl|mirai-smartctl\"}", + "format": "table", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Disk Health & Temperature", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "instance": true, + "Time": true + }, + "indexByName": {}, + "renameByName": { + "Value #A": "Health", + "Value #B": "Temperature", + "device": "Device", + "job": "System" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + 
"custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(nginx_http_requests_total{job=\"mirai-nginx\"}[5m])", + "legendFormat": "{{server}} - {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "Nginx Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + 
"stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "nginx_connections_active{job=\"mirai-nginx\"}", + "legendFormat": "Active Connections", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "nginx_connections_reading{job=\"mirai-nginx\"}", + "legendFormat": "Reading", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "nginx_connections_writing{job=\"mirai-nginx\"}", + "legendFormat": "Writing", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "nginx_connections_waiting{job=\"mirai-nginx\"}", + "legendFormat": "Waiting", + "range": true, + "refId": "D" + } + ], + "title": "Nginx Connections", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": ["monitoring", "overview", "enhanced"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Enhanced Infrastructure Overview", + "uid": "enhanced-overview", + "version": 0, + "weekStart": "" +} diff --git a/nixos/mirai/services/dashboards/mirai-monitoring.json 
b/nixos/mirai/services/dashboards/mirai-monitoring.json index 07f1faa4..cc5948cc 100644 --- a/nixos/mirai/services/dashboards/mirai-monitoring.json +++ b/nixos/mirai/services/dashboards/mirai-monitoring.json @@ -256,28 +256,6 @@ } }, "overrides": [ - { - "matcher": { - "id": "byName", - "options": "CPU %" - }, - "properties": [ - { - "id": "unit", - "value": "percent" - }, - { - "id": "custom.cellOptions", - "value": { - "type": "gauge" - } - }, - { - "id": "max", - "value": 100 - } - ] - }, { "matcher": { "id": "byName", @@ -289,6 +267,18 @@ "value": "bytes" } ] + }, + { + "matcher": { + "id": "byName", + "options": "Count" + }, + "properties": [ + { + "id": "unit", + "value": "short" + } + ] } ] }, @@ -304,16 +294,14 @@ "footer": { "countRows": false, "fields": "", - "reducer": [ - "sum" - ], + "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [ { "desc": true, - "displayName": "CPU %" + "displayName": "Memory" } ] }, @@ -325,7 +313,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "namedprocess_namegroup_cpu_seconds_total{job=\"mirai-process\"}", + "expr": "namedprocess_namegroup_memory_bytes{job=\"mirai-process\"} > 0", "format": "table", "legendFormat": "__auto", "range": true, @@ -337,7 +325,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "namedprocess_namegroup_memory_bytes{job=\"mirai-process\"}", + "expr": "namedprocess_namegroup_num_procs{job=\"mirai-process\"} > 0", "format": "table", "hide": false, "legendFormat": "__auto", @@ -357,12 +345,13 @@ "excludeByName": { "__name__": true, "instance": true, - "job": true + "job": true, + "Time": true }, "indexByName": {}, "renameByName": { - "Value #A": "CPU %", - "Value #B": "Memory", + "Value #A": "Memory", + "Value #B": "Count", "groupname": "Process Name" } } @@ -595,10 +584,7 @@ "refresh": "30s", "schemaVersion": 38, "style": "dark", - "tags": [ - "monitoring", - "mirai" - ], + "tags": ["monitoring", "mirai"], "templating": { "list": [] }, diff --git 
a/nixos/mirai/services/dashboards/ryu-monitoring.json b/nixos/mirai/services/dashboards/ryu-monitoring.json index fb04d967..5dc502a7 100644 --- a/nixos/mirai/services/dashboards/ryu-monitoring.json +++ b/nixos/mirai/services/dashboards/ryu-monitoring.json @@ -256,28 +256,6 @@ } }, "overrides": [ - { - "matcher": { - "id": "byName", - "options": "CPU %" - }, - "properties": [ - { - "id": "unit", - "value": "percent" - }, - { - "id": "custom.cellOptions", - "value": { - "type": "gauge" - } - }, - { - "id": "max", - "value": 100 - } - ] - }, { "matcher": { "id": "byName", @@ -289,6 +267,18 @@ "value": "bytes" } ] + }, + { + "matcher": { + "id": "byName", + "options": "Count" + }, + "properties": [ + { + "id": "unit", + "value": "short" + } + ] } ] }, @@ -304,16 +294,14 @@ "footer": { "countRows": false, "fields": "", - "reducer": [ - "sum" - ], + "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [ { "desc": true, - "displayName": "CPU %" + "displayName": "Memory" } ] }, @@ -325,7 +313,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "namedprocess_namegroup_cpu_seconds_total{job=\"ryu-process\"}", + "expr": "namedprocess_namegroup_memory_bytes{job=\"ryu-process\"} > 0", "format": "table", "legendFormat": "__auto", "range": true, @@ -337,7 +325,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "namedprocess_namegroup_memory_bytes{job=\"ryu-process\"}", + "expr": "namedprocess_namegroup_num_procs{job=\"ryu-process\"} > 0", "format": "table", "hide": false, "legendFormat": "__auto", @@ -357,12 +345,13 @@ "excludeByName": { "__name__": true, "instance": true, - "job": true + "job": true, + "Time": true }, "indexByName": {}, "renameByName": { - "Value #A": "CPU %", - "Value #B": "Memory", + "Value #A": "Memory", + "Value #B": "Count", "groupname": "Process Name" } } @@ -541,7 +530,7 @@ } ] }, - "unit": "bytes" + "unit": "binBps" }, "overrides": [] }, @@ -595,10 +584,7 @@ "refresh": "30s", "schemaVersion": 38, "style": "dark", - "tags": 
[ - "monitoring", - "ryu" - ], + "tags": ["monitoring", "ryu"], "templating": { "list": [] }, diff --git a/nixos/mirai/services/dashboards/systemd-monitoring.json b/nixos/mirai/services/dashboards/systemd-monitoring.json new file mode 100644 index 00000000..2fd30d2e --- /dev/null +++ b/nixos/mirai/services/dashboards/systemd-monitoring.json @@ -0,0 +1,616 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "FAILED" + }, + "1": { + "color": "green", + "index": 0, + "text": "ACTIVE" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.5 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Status" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Device" + } + ] + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "systemd_unit_state{job=~\"tsuba-systemd|ryu-systemd|mirai-systemd\",state=~\"active|failed\",name=~\".+\\.(service|timer)$\"}", + "format": "table", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "SystemD Services Status", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "instance": true, + "Time": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Status", + "job": "Device", + "name": "Service", + "state": "State" + } + } + }, + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "Device", + "Service", + "Status" + ] + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": 
"code", + "expr": "count by (job) (systemd_unit_state{job=~\"tsuba-systemd|ryu-systemd|mirai-systemd\",state=\"active\",name=~\".+\\.service$\"})", + "legendFormat": "{{job}} - Active Services", + "range": true, + "refId": "A" + } + ], + "title": "Active Services Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*Failed.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "count by (job) (systemd_unit_state{job=~\"tsuba-systemd|ryu-systemd|mirai-systemd\",state=\"failed\",name=~\".+\\.service$\"})", + "legendFormat": "{{job}} - Failed Services", + "range": true, + 
"refId": "A" + } + ], + "title": "Failed Services Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Restart Count" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 4, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Restart Count" + } + ] + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "systemd_unit_restart_total{job=~\"tsuba-systemd|ryu-systemd|mirai-systemd\",name=~\".+\\.service$\"} > 0", + "format": "table", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Service Restart Count", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "instance": true, + "Time": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Restart Count", + "job": "Device", + "name": "Service" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Last Trigger" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeAsIso" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "systemd_timer_last_trigger_seconds{job=~\"tsuba-systemd|ryu-systemd|mirai-systemd\",name=~\".+\\.timer$\"} * 1000", + "format": "table", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "SystemD Timers", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "instance": true, + "Time": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Last Trigger", + "job": "Device", + "name": "Timer" + } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": ["monitoring", "systemd", "services"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "SystemD Services Monitoring", + "uid": "systemd-monitoring", + "version": 0, + "weekStart": "" +} diff --git a/nixos/mirai/services/dashboards/tsuba-monitoring.json b/nixos/mirai/services/dashboards/tsuba-monitoring.json index 446d63e3..93293173 100644 --- a/nixos/mirai/services/dashboards/tsuba-monitoring.json 
+++ b/nixos/mirai/services/dashboards/tsuba-monitoring.json @@ -304,9 +304,7 @@ "footer": { "countRows": false, "fields": "", - "reducer": [ - "sum" - ], + "reducer": ["sum"], "show": false }, "showHeader": true, @@ -325,7 +323,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "namedprocess_namegroup_cpu_seconds_total{job=\"tsuba-process\"}", + "expr": "rate(namedprocess_namegroup_cpu_seconds_total{job=\"tsuba-process\"}[5m]) * 100", "format": "table", "legendFormat": "__auto", "range": true, @@ -337,12 +335,25 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "namedprocess_namegroup_memory_bytes{job=\"tsuba-process\"}", + "expr": "namedprocess_namegroup_memory_bytes{job=\"tsuba-process\"} > 0", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_num_procs{job=\"tsuba-process\"}", + "format": "table", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "C" } ], "title": "Tsuba Running Processes", @@ -363,6 +374,7 @@ "renameByName": { "Value #A": "CPU %", "Value #B": "Memory", + "Value #C": "Count", "groupname": "Process Name" } } @@ -595,10 +607,7 @@ "refresh": "30s", "schemaVersion": 38, "style": "dark", - "tags": [ - "monitoring", - "tsuba" - ], + "tags": ["monitoring", "tsuba"], "templating": { "list": [] }, diff --git a/nixos/mirai/services/default.nix b/nixos/mirai/services/default.nix index e9628cee..d14ebea2 100644 --- a/nixos/mirai/services/default.nix +++ b/nixos/mirai/services/default.nix @@ -8,7 +8,7 @@ ./flaresolverr.nix ./gitea.nix ./homepage.nix - # ./immich.nix + ./immich.nix ./llama.nix ./lldap.nix ./monitoring.nix diff --git a/nixos/mirai/services/monitoring.nix b/nixos/mirai/services/monitoring.nix index 83709f9f..2620ad7e 100644 --- a/nixos/mirai/services/monitoring.nix +++ b/nixos/mirai/services/monitoring.nix @@ -1,6 +1,8 @@ { config, pkgs, 
+ lib, + # devices, ... }: { sops.secrets = { @@ -29,68 +31,16 @@ port = 9090; listenAddress = "0.0.0.0"; - scrapeConfigs = [ - { - job_name = "tsuba-node"; - static_configs = [ - { - targets = ["tsuba:9100"]; - } - ]; - scrape_interval = "15s"; - scrape_timeout = "10s"; - } - { - job_name = "tsuba-process"; - static_configs = [ - { - targets = ["tsuba:9256"]; - } - ]; - scrape_interval = "15s"; - scrape_timeout = "10s"; - } - { - job_name = "ryu-node"; - static_configs = [ - { - targets = ["ryu:9100"]; - } - ]; - scrape_interval = "15s"; - scrape_timeout = "10s"; - } - { - job_name = "ryu-process"; - static_configs = [ - { - targets = ["ryu:9256"]; - } - ]; - scrape_interval = "15s"; - scrape_timeout = "10s"; - } - { - job_name = "mirai-node"; - static_configs = [ - { - targets = ["localhost:9100"]; - } - ]; - scrape_interval = "15s"; - scrape_timeout = "10s"; - } - { - job_name = "mirai-process"; - static_configs = [ - { - targets = ["localhost:9256"]; - } - ]; - scrape_interval = "15s"; - scrape_timeout = "10s"; - } - ]; + scrapeConfigs = []; + # ++ (lib.mapAttrsToList (name: cfg: { + # job_name = "mirai-" + name; + # static_configs = [ + # { + # targets = [("localhost:" + (builtins.toString cfg.port))]; + # } + # ]; + # }) + # (config.services.prometheus.exporters)); retentionTime = "30d"; @@ -101,6 +51,22 @@ }; prometheus.exporters = { + ping = { + enable = true; + settings = { + targets = [ + "1.1.1.1" + "ryu" + "tsuba" + "shiro" + ]; + ping = { + interval = "5s"; + timeout = "5s"; + }; + }; + openFirewall = true; + }; node = { enable = true; enabledCollectors = [ @@ -115,7 +81,7 @@ "uname" "vmstat" ]; - port = 9100; + openFirewall = true; }; process = { enable = true; @@ -125,6 +91,11 @@ cmdline = [".*"]; } ]; + openFirewall = true; + }; + systemd = { + enable = true; + openFirewall = true; }; }; @@ -262,16 +233,12 @@ "C /var/lib/grafana/dashboards/ryu-monitoring.json 0644 grafana grafana - ${./dashboards/ryu-monitoring.json}" "C 
/var/lib/grafana/dashboards/mirai-monitoring.json 0644 grafana grafana - ${./dashboards/mirai-monitoring.json}" "C /var/lib/grafana/dashboards/overview-monitoring.json 0644 grafana grafana - ${./dashboards/overview-monitoring.json}" + "C /var/lib/grafana/dashboards/enhanced-overview.json 0644 grafana grafana - ${./dashboards/enhanced-overview.json}" + "C /var/lib/grafana/dashboards/systemd-monitoring.json 0644 grafana grafana - ${./dashboards/systemd-monitoring.json}" ]; # Open firewall ports networking.firewall = { - allowedTCPPorts = [ - 3000 # Grafana - 9090 # Prometheus - 9100 # Node exporter - 9256 # Process exporter - ]; # Allow Tailscale traffic for metrics scraping trustedInterfaces = ["tailscale0"]; }; diff --git a/nixos/ryu/services/monitoring.nix b/nixos/ryu/services/monitoring.nix index 9d8e1863..4fe6fcca 100644 --- a/nixos/ryu/services/monitoring.nix +++ b/nixos/ryu/services/monitoring.nix @@ -2,6 +2,10 @@ services = { prometheus = { exporters = { + systemd = { + enable = true; + }; + nvidia-gpu.enable = true; node = { enable = true; enabledCollectors = [ @@ -16,7 +20,6 @@ "uname" "vmstat" ]; - port = 9100; }; process = { enable = true; @@ -30,12 +33,4 @@ }; }; }; - - # Open firewall ports for Prometheus exporters - networking.firewall = { - allowedTCPPorts = [ - 9100 # node exporter - 9256 # process exporter - ]; - }; } diff --git a/nixos/tsuba/services/monitoring.nix b/nixos/tsuba/services/monitoring.nix index 9d8e1863..7e765c6f 100644 --- a/nixos/tsuba/services/monitoring.nix +++ b/nixos/tsuba/services/monitoring.nix @@ -2,6 +2,10 @@ services = { prometheus = { exporters = { + systemd = { + enable = true; + port = 9558; + }; node = { enable = true; enabledCollectors = [ @@ -36,6 +40,9 @@ allowedTCPPorts = [ 9100 # node exporter 9256 # process exporter + 9558 # systemd exporter + 9134 # zfs exporter + 9633 # smartctl exporter ]; }; }