diff --git a/nixos/ryu/services/caddy.nix b/nixos/ryu/services/caddy.nix index be7ced3a..3ded0398 100644 --- a/nixos/ryu/services/caddy.nix +++ b/nixos/ryu/services/caddy.nix @@ -14,6 +14,11 @@ services = { caddy = { enable = true; + globalConfig = '' + servers { + metrics + } + ''; extraConfig = '' (cloudflare) { tls { diff --git a/nixos/ryu/services/monitoring.nix b/nixos/ryu/services/monitoring.nix index 4fe6fcca..6c90459b 100644 --- a/nixos/ryu/services/monitoring.nix +++ b/nixos/ryu/services/monitoring.nix @@ -1,11 +1,28 @@ -{...}: { +{pkgs, ...}: let + # Port configurations + ports = { + # System exporters + node = 9100; + systemd = 9558; + process = 9256; + nvidiagpu = 9835; + + # Infrastructure exporters + cadvisor = 8080; + caddy = 2019; + }; +in { services = { prometheus = { exporters = { systemd = { enable = true; + port = ports.systemd; + }; + nvidia-gpu = { + enable = true; + port = ports.nvidiagpu; }; - nvidia-gpu.enable = true; node = { enable = true; enabledCollectors = [ @@ -19,7 +36,10 @@ "time" "uname" "vmstat" + "diskstats" + "cpu" ]; + port = ports.node; }; process = { enable = true; @@ -33,4 +53,34 @@ }; }; }; + + # Docker cAdvisor for container metrics + virtualisation.oci-containers.containers.cadvisor = { + image = "gcr.io/cadvisor/cadvisor:v0.49.1"; + ports = ["${toString ports.cadvisor}:8080"]; + volumes = [ + "/:/rootfs:ro" + "/var/run:/var/run:ro" + "/sys:/sys:ro" + "/var/lib/docker/:/var/lib/docker:ro" + "/dev/disk/:/dev/disk:ro" + ]; + extraOptions = [ + "--privileged" + "--device=/dev/kmsg" + ]; + }; + + # Open firewall ports for Prometheus exporters + networking.firewall = { + # Allow from Tailscale network + interfaces."tailscale0".allowedTCPPorts = [ + ports.node + ports.systemd + ports.process + ports.nvidiagpu + ports.cadvisor + ports.caddy + ]; + }; } diff --git a/nixos/tako/services/caddy.nix b/nixos/tako/services/caddy.nix index ec9add9e..3c01c063 100644 --- a/nixos/tako/services/caddy.nix +++ b/nixos/tako/services/caddy.nix 
@@ -2,6 +2,11 @@ services = { caddy = { enable = true; + globalConfig = '' + servers { + metrics + } + ''; extraConfig = '' (auth) { forward_auth localhost:5555 { diff --git a/nixos/tako/services/default.nix b/nixos/tako/services/default.nix index 171c9501..192a9631 100644 --- a/nixos/tako/services/default.nix +++ b/nixos/tako/services/default.nix @@ -3,7 +3,7 @@ ./games # ./headscale.nix ./llms.nix - # ./monitoring.nix + ./monitoring.nix # ./paperless.nix ./navidrome.nix ./shitpost.nix diff --git a/nixos/tako/services/gitea.nix b/nixos/tako/services/gitea.nix index 7161ff11..e7310ac6 100644 --- a/nixos/tako/services/gitea.nix +++ b/nixos/tako/services/gitea.nix @@ -63,6 +63,10 @@ # LFS_START_SERVER = true; LFS_ALLOW_PURE_SSH = true; }; + metrics = { + ENABLED = true; + TOKEN = ""; + }; oauth2_client = { ENABLE_AUTO_REGISTRATION = true; ACCOUNT_LINKING = "auto"; diff --git a/nixos/tako/services/homepage.nix b/nixos/tako/services/homepage.nix index 6ce78c21..9b9af59a 100644 --- a/nixos/tako/services/homepage.nix +++ b/nixos/tako/services/homepage.nix @@ -87,6 +87,14 @@ siteMonitor = "https://git.darksailor.dev"; }; } + { + "Grafana" = { + icon = "grafana.png"; + description = "Grafana Monitoring & Metrics"; + href = "https://grafana.darksailor.dev"; + siteMonitor = "https://grafana.darksailor.dev"; + }; + } { "Nextcloud" = { icon = "nextcloud.png"; diff --git a/nixos/tako/services/monitoring.nix b/nixos/tako/services/monitoring.nix new file mode 100644 index 00000000..54f98b81 --- /dev/null +++ b/nixos/tako/services/monitoring.nix @@ -0,0 +1,452 @@ +{ + config, + pkgs, + lib, + ... 
+}: let + # Port configurations + ports = { + grafana = 3001; # Changed from 3000 to avoid clash with Gitea + prometheus = 9090; + + # System exporters + node = 9100; + systemd = 9558; + process = 9256; + + # Infrastructure exporters + postgres = 9187; + redis = 9121; + cadvisor = 8080; + + # Application exporters + caddy = 2019; + }; +in { + # Grafana configuration with Authelia integration + services.grafana = { + enable = true; + settings = { + server = { + http_addr = "127.0.0.1"; + http_port = ports.grafana; + domain = "grafana.darksailor.dev"; + root_url = "https://grafana.darksailor.dev"; + }; + + # Disable Grafana's own auth since we use Authelia + auth.disable_login_form = true; + "auth.basic".enabled = false; + "auth.anonymous".enabled = false; + "auth.proxy" = { + enabled = true; + header_name = "REMOTE-USER"; + header_property = "username"; + auto_sign_up = true; + }; + + users = { + allow_sign_up = false; + auto_assign_org = true; + auto_assign_org_role = "Admin"; + }; + + security = { + disable_gravatar = true; + cookie_secure = true; + }; + + analytics = { + reporting_enabled = false; + check_for_updates = false; + }; + }; + + provision = { + enable = true; + datasources.settings.datasources = [ + { + name = "Prometheus"; + type = "prometheus"; + access = "proxy"; + url = "http://localhost:${toString ports.prometheus}"; + isDefault = true; + jsonData = { + timeInterval = "30s"; + }; + } + ]; + + # Provision popular community dashboards + dashboards.path = let + # Define dashboard files with proper hashes + nodeExporterFull = pkgs.fetchurl { + url = "https://grafana.com/api/dashboards/1860/revisions/37/download"; + sha256 = "0qza4j8lywrj08bqbww52dgh2p2b9rkhq5p313g72i57lrlkacfl"; + }; + nvidiaDashboard = pkgs.fetchurl { + url = "https://grafana.com/api/dashboards/14574/revisions/9/download"; + sha256 = "170ijap5i99sapkxlf3k0lnvwmb6g9jkk7q66nwjwswkj2a7rqbr"; + }; + postgresqlDashboard = pkgs.fetchurl { + url = 
"https://grafana.com/api/dashboards/9628/revisions/7/download"; + sha256 = "0xmk68kqb9b8aspjj2f8wxv2mxiqk9k3xs0yal4szmzbv65c6k66"; + }; + redisDashboard = pkgs.fetchurl { + url = "https://grafana.com/api/dashboards/11835/revisions/1/download"; + sha256 = "15lbn4i8j5hiypl4dsg0d72jgrgjwpagkf5kcwx66gyps17jcrxx"; + }; + dockerDashboard = pkgs.fetchurl { + url = "https://grafana.com/api/dashboards/193/revisions/1/download"; + sha256 = "1lxbbl91fh0yfh8x53205b7nw5ivghlpfb0m308z2p6fzvz2iq2m"; + }; + caddyDashboard = pkgs.fetchurl { + url = "https://grafana.com/api/dashboards/14280/revisions/1/download"; + sha256 = "0j3q68cq1nj8gcxkqz5h1kn1ds5kgq4jlkw73xp6yc88mbm5nyh4"; + }; + in + pkgs.runCommand "grafana-dashboards" {} '' + mkdir -p $out + cp ${nodeExporterFull} $out/node-exporter-full.json + cp ${nvidiaDashboard} $out/nvidia-gpu.json + cp ${postgresqlDashboard} $out/postgresql.json + cp ${redisDashboard} $out/redis.json + cp ${dockerDashboard} $out/docker-cadvisor.json + cp ${caddyDashboard} $out/caddy.json + ''; + }; + }; + + # Caddy virtual host for Grafana with Authelia + services.caddy.virtualHosts."grafana.darksailor.dev".extraConfig = '' + import auth + reverse_proxy localhost:${toString ports.grafana} + ''; + + # Central Prometheus server + services.prometheus = { + enable = true; + port = ports.prometheus; + + # Retention settings (90 days) + retentionTime = "90d"; + + # Global scrape config + globalConfig = { + scrape_interval = "30s"; + evaluation_interval = "30s"; + }; + + # System exporters for tako + exporters = { + node = { + enable = true; + port = ports.node; + enabledCollectors = [ + "systemd" + "textfile" + "filesystem" + "loadavg" + "meminfo" + "netdev" + "netstat" + "stat" + "time" + "uname" + "vmstat" + "diskstats" + "cpu" + ]; + }; + + systemd = { + enable = true; + port = ports.systemd; + }; + + process = { + enable = true; + settings.process_names = [ + { + name = "{{.Comm}}"; + cmdline = [".*"]; + } + ]; + }; + + postgres = { + enable = true; + 
port = ports.postgres; + runAsLocalSuperUser = true; + }; + + redis = { + enable = true; + port = ports.redis; + }; + }; + + # Scrape configurations for all targets + scrapeConfigs = [ + # System metrics - tako (local) + { + job_name = "tako-system"; + static_configs = [ + { + targets = [ + "localhost:${toString ports.node}" + "localhost:${toString ports.systemd}" + "localhost:${toString ports.process}" + ]; + labels = { + instance = "tako"; + machine = "tako"; + role = "server"; + }; + } + ]; + } + + # Infrastructure - tako + { + job_name = "tako-infrastructure"; + static_configs = [ + { + targets = [ + "localhost:${toString ports.postgres}" + "localhost:${toString ports.redis}" + "localhost:${toString ports.cadvisor}" + ]; + labels = { + instance = "tako"; + machine = "tako"; + }; + } + ]; + } + + # Caddy metrics - tako + { + job_name = "tako-caddy"; + static_configs = [ + { + targets = ["localhost:${toString ports.caddy}"]; + labels = { + instance = "tako"; + machine = "tako"; + service = "caddy"; + }; + } + ]; + } + + # Application metrics - tako + { + job_name = "tako-applications"; + static_configs = [ + { + targets = [ + "localhost:3000" # gitea + "localhost:5555" # authelia (if metrics enabled) + ]; + labels = { + instance = "tako"; + machine = "tako"; + }; + } + ]; + } + + # System metrics - tsuba (remote via Tailscale) + { + job_name = "tsuba-system"; + static_configs = [ + { + targets = [ + "tsuba:9100" + "tsuba:9558" + "tsuba:9256" + ]; + labels = { + instance = "tsuba"; + machine = "tsuba"; + role = "server"; + }; + } + ]; + } + + # Infrastructure - tsuba + { + job_name = "tsuba-infrastructure"; + static_configs = [ + { + targets = [ + "tsuba:8080" # cadvisor + "tsuba:2019" # caddy + ]; + labels = { + instance = "tsuba"; + machine = "tsuba"; + }; + } + ]; + } + + # Media services - tsuba + { + job_name = "tsuba-media"; + static_configs = [ + { + targets = [ + "tsuba:8096" # jellyfin (built-in /metrics endpoint) + "tsuba:8123" # homeassistant (configure 
prometheus integration) + "tsuba:9617" # pihole-exporter + ]; + labels = { + instance = "tsuba"; + machine = "tsuba"; + }; + } + ]; + metrics_path = "/metrics"; + relabel_configs = [ + { + source_labels = ["__address__"]; + regex = "tsuba:8096"; + target_label = "__metrics_path__"; + replacement = "/metrics"; + } + { + source_labels = ["__address__"]; + regex = "tsuba:8123"; + target_label = "__metrics_path__"; + replacement = "/api/prometheus"; + } + ]; + } + + # Servarr stack - tsuba (exportarr) + { + job_name = "tsuba-servarr"; + static_configs = [ + { + targets = [ + "tsuba:9707" # sonarr + "tsuba:9708" # radarr + "tsuba:9709" # lidarr + "tsuba:9710" # bazarr + ]; + labels = { + instance = "tsuba"; + machine = "tsuba"; + stack = "servarr"; + }; + } + ]; + } + + # Deluge - tsuba + { + job_name = "tsuba-deluge"; + static_configs = [ + { + targets = ["tsuba:9354"]; + labels = { + instance = "tsuba"; + machine = "tsuba"; + service = "deluge"; + }; + } + ]; + } + + # System metrics - ryu (remote via Tailscale) + { + job_name = "ryu-system"; + static_configs = [ + { + targets = [ + "ryu:9100" + "ryu:9558" + "ryu:9256" + "ryu:9835" # nvidia-gpu + ]; + labels = { + instance = "ryu"; + machine = "ryu"; + role = "desktop"; + }; + } + ]; + } + + # Infrastructure - ryu + { + job_name = "ryu-infrastructure"; + static_configs = [ + { + targets = [ + "ryu:8080" # cadvisor + "ryu:2019" # caddy + ]; + labels = { + instance = "ryu"; + machine = "ryu"; + }; + } + ]; + } + ]; + }; + + # Docker cAdvisor for container metrics + virtualisation.oci-containers.containers.cadvisor = { + image = "gcr.io/cadvisor/cadvisor:v0.49.1"; + ports = ["127.0.0.1:${toString ports.cadvisor}:8080"]; + volumes = [ + "/:/rootfs:ro" + "/var/run:/var/run:ro" + "/sys:/sys:ro" + "/var/lib/docker/:/var/lib/docker:ro" + "/dev/disk/:/dev/disk:ro" + ]; + extraOptions = [ + "--privileged" + "--device=/dev/kmsg" + ]; + }; + + # Link dashboard files from Nix store to Grafana's expected location + # 
systemd.tmpfiles.rules = let + # dashboardPath = config.services.grafana.provision.dashboards.path; + # in [ + # "L+ /var/lib/grafana/dashboards/node-exporter-full.json - - - - ${dashboardPath}/node-exporter-full.json" + # "L+ /var/lib/grafana/dashboards/nvidia-gpu.json - - - - ${dashboardPath}/nvidia-gpu.json" + # "L+ /var/lib/grafana/dashboards/postgresql.json - - - - ${dashboardPath}/postgresql.json" + # "L+ /var/lib/grafana/dashboards/redis.json - - - - ${dashboardPath}/redis.json" + # "L+ /var/lib/grafana/dashboards/docker-cadvisor.json - - - - ${dashboardPath}/docker-cadvisor.json" + # "L+ /var/lib/grafana/dashboards/caddy.json - - - - ${dashboardPath}/caddy.json" + # ]; + + # Open firewall ports for Prometheus to scrape exporters + networking.firewall = { + allowedTCPPorts = [ + ports.node + ports.systemd + ports.process + ]; + + # Allow Prometheus and Grafana access from Tailscale network + interfaces."tailscale0".allowedTCPPorts = [ + ports.prometheus + ports.grafana + ports.node + ports.systemd + ports.process + ports.postgres + ports.redis + ports.cadvisor + ]; + }; +} diff --git a/nixos/tsuba/services/caddy.nix b/nixos/tsuba/services/caddy.nix index 8efdb4f6..2cf5cc40 100644 --- a/nixos/tsuba/services/caddy.nix +++ b/nixos/tsuba/services/caddy.nix @@ -14,6 +14,11 @@ services = { caddy = { enable = true; + globalConfig = '' + servers { + metrics + } + ''; extraConfig = '' (cloudflare) { tls { diff --git a/nixos/tsuba/services/monitoring.nix b/nixos/tsuba/services/monitoring.nix index 7e765c6f..92b78133 100644 --- a/nixos/tsuba/services/monitoring.nix +++ b/nixos/tsuba/services/monitoring.nix @@ -1,10 +1,35 @@ -{...}: { +{pkgs, ...}: let + # Port configurations + ports = { + # System exporters + node = 9100; + systemd = 9558; + process = 9256; + + # Infrastructure exporters + cadvisor = 8080; + caddy = 2019; + + # Media exporters + jellyfin = 9220; + pihole = 9617; + + # Servarr exporters (via exportarr) + sonarr = 9707; + radarr = 9708; + lidarr = 9709; 
+ bazarr = 9710; + + # Torrent + deluge = 9354; + }; +in { services = { prometheus = { exporters = { systemd = { enable = true; - port = 9558; + port = ports.systemd; }; node = { enable = true; @@ -19,8 +44,10 @@ "time" "uname" "vmstat" + "diskstats" + "cpu" ]; - port = 9100; + port = ports.node; }; process = { enable = true; @@ -35,14 +62,128 @@ }; }; + # Docker cAdvisor for container metrics + virtualisation.oci-containers.containers.cadvisor = { + image = "gcr.io/cadvisor/cadvisor:v0.49.1"; + ports = ["${toString ports.cadvisor}:8080"]; + volumes = [ + "/:/rootfs:ro" + "/var/run:/var/run:ro" + "/sys:/sys:ro" + "/var/lib/docker/:/var/lib/docker:ro" + "/dev/disk/:/dev/disk:ro" + ]; + extraOptions = [ + "--privileged" + "--device=/dev/kmsg" + ]; + }; + + # Jellyfin - use built-in metrics endpoint at http://localhost:8096/metrics + # No separate exporter needed - Prometheus will scrape directly + + # Home Assistant - has built-in Prometheus integration + # Configure in Home Assistant configuration.yaml: + # prometheus: + # namespace: homeassistant + + # Pi-hole exporter + systemd.services.pihole-exporter = { + description = "Pi-hole Prometheus Exporter"; + wantedBy = ["multi-user.target"]; + after = ["network.target"]; + serviceConfig = { + Type = "simple"; + DynamicUser = true; + ExecStart = "${pkgs.prometheus-pihole-exporter}/bin/pihole-exporter -pihole_hostname localhost -pihole_port 8053 -port ${toString ports.pihole}"; + Restart = "on-failure"; + }; + }; + + # Exportarr for Sonarr + systemd.services.exportarr-sonarr = { + description = "Exportarr Prometheus Exporter for Sonarr"; + wantedBy = ["multi-user.target"]; + after = ["network.target"]; + serviceConfig = { + Type = "simple"; + DynamicUser = true; + ExecStart = "${pkgs.exportarr}/bin/exportarr sonarr --port ${toString ports.sonarr} --url http://localhost:8989"; + Restart = "on-failure"; + }; + }; + + # Exportarr for Radarr + systemd.services.exportarr-radarr = { + description = "Exportarr Prometheus 
Exporter for Radarr"; + wantedBy = ["multi-user.target"]; + after = ["network.target"]; + serviceConfig = { + Type = "simple"; + DynamicUser = true; + ExecStart = "${pkgs.exportarr}/bin/exportarr radarr --port ${toString ports.radarr} --url http://localhost:7878"; + Restart = "on-failure"; + }; + }; + + # Exportarr for Lidarr + systemd.services.exportarr-lidarr = { + description = "Exportarr Prometheus Exporter for Lidarr"; + wantedBy = ["multi-user.target"]; + after = ["network.target"]; + serviceConfig = { + Type = "simple"; + DynamicUser = true; + ExecStart = "${pkgs.exportarr}/bin/exportarr lidarr --port ${toString ports.lidarr} --url http://localhost:8686"; + Restart = "on-failure"; + }; + }; + + # Exportarr for Bazarr + systemd.services.exportarr-bazarr = { + description = "Exportarr Prometheus Exporter for Bazarr"; + wantedBy = ["multi-user.target"]; + after = ["network.target"]; + serviceConfig = { + Type = "simple"; + DynamicUser = true; + ExecStart = "${pkgs.exportarr}/bin/exportarr bazarr --port ${toString ports.bazarr} --url http://localhost:6767"; + Restart = "on-failure"; + }; + }; + + # Deluge exporter + systemd.services.deluge-exporter = { + description = "Deluge Prometheus Exporter"; + wantedBy = ["multi-user.target"]; + after = ["network.target"]; + serviceConfig = { + Type = "simple"; + DynamicUser = true; + ExecStart = "${pkgs.prometheus-deluge-exporter}/bin/deluge_exporter"; + Restart = "on-failure"; + }; + }; + + # Samba exporter - using a simple script to expose smbstatus metrics + # For now, we'll skip this and can add later if needed + # Open firewall ports for Prometheus exporters networking.firewall = { - allowedTCPPorts = [ - 9100 # node exporter - 9256 # process exporter - 9558 # systemd exporter - 9134 # zfs exporter - 9633 # smartctl exporter + # Allow from Tailscale network + interfaces."tailscale0".allowedTCPPorts = [ + ports.node + ports.systemd + ports.process + ports.cadvisor + 
ports.caddy + ports.jellyfin + ports.pihole + ports.sonarr + ports.radarr + ports.lidarr + ports.bazarr + ports.deluge ]; }; }