From d45bbef50951097124cd1bf547578443d5bf9049 Mon Sep 17 00:00:00 2001 From: tsvetkov Date: Fri, 27 Feb 2026 01:28:08 +0000 Subject: [PATCH] add the ansible playbooks for the monitoring stack --- README.md | 13 ++ ansible/inventory.yml | 44 +++++ ansible/kubernetes-playbook.yml | 117 +++++++++++ ansible/playbook.yml | 37 ++++ ansible/roles/common/tasks/main.yml | 29 +++ .../files/dashboards/home-infra-overview.json | 187 ++++++++++++++++++ ansible/roles/grafana/handlers/main.yml | 5 + ansible/roles/grafana/tasks/main.yml | 77 ++++++++ .../roles/grafana/templates/dashboards.yml.j2 | 13 ++ .../grafana/templates/datasources.yml.j2 | 34 ++++ ansible/roles/prometheus/handlers/main.yml | 9 + ansible/roles/prometheus/tasks/main.yml | 82 ++++++++ .../templates/prometheus.service.j2 | 30 +++ .../prometheus/templates/prometheus.yml.j2 | 68 +++++++ ansible/roles/promtail/handlers/main.yml | 9 + ansible/roles/promtail/tasks/main.yml | 78 ++++++++ .../promtail/templates/promtail.service.j2 | 24 +++ .../roles/promtail/templates/promtail.yml.j2 | 56 ++++++ ansible/roles/snmp_exporter/handlers/main.yml | 9 + ansible/roles/snmp_exporter/tasks/main.yml | 71 +++++++ .../templates/snmp_exporter.service.j2 | 25 +++ 21 files changed, 1017 insertions(+) create mode 100644 README.md create mode 100644 ansible/inventory.yml create mode 100644 ansible/kubernetes-playbook.yml create mode 100644 ansible/playbook.yml create mode 100644 ansible/roles/common/tasks/main.yml create mode 100644 ansible/roles/grafana/files/dashboards/home-infra-overview.json create mode 100644 ansible/roles/grafana/handlers/main.yml create mode 100644 ansible/roles/grafana/tasks/main.yml create mode 100644 ansible/roles/grafana/templates/dashboards.yml.j2 create mode 100644 ansible/roles/grafana/templates/datasources.yml.j2 create mode 100644 ansible/roles/prometheus/handlers/main.yml create mode 100644 ansible/roles/prometheus/tasks/main.yml create mode 100644 
ansible/roles/prometheus/templates/prometheus.service.j2 create mode 100644 ansible/roles/prometheus/templates/prometheus.yml.j2 create mode 100644 ansible/roles/promtail/handlers/main.yml create mode 100644 ansible/roles/promtail/tasks/main.yml create mode 100644 ansible/roles/promtail/templates/promtail.service.j2 create mode 100644 ansible/roles/promtail/templates/promtail.yml.j2 create mode 100644 ansible/roles/snmp_exporter/handlers/main.yml create mode 100644 ansible/roles/snmp_exporter/tasks/main.yml create mode 100644 ansible/roles/snmp_exporter/templates/snmp_exporter.service.j2 diff --git a/README.md b/README.md new file mode 100644 index 0000000..e368083 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +The ansible playbook found here is used to deploy a monitoring stack for a homelab. + +Detailed instructions about the project can be found in its [own repository](https://git.96-fromsofia.net/k8s/monitoring-stack) + +- playbook.yml deploys the monitoring stack outside of the talos cluster +- kubernetes-playbook.yml deploys the monitoring stack inside of the talos cluster + +To run a playbook: +``` +ansible-playbook -i inventory.yml playbook.yml +``` + +Both playbooks can be run and both stacks are designed to coexist. This is so the monitoring of the network layer and physical hosts is still active and visible even when the talos cluster itself is not running. 
diff --git a/ansible/inventory.yml b/ansible/inventory.yml
new file mode 100644
index 0000000..248f57f
--- /dev/null
+++ b/ansible/inventory.yml
@@ -0,0 +1,44 @@
+all:
+  children:
+    monitoring:
+      hosts:
+        rpi:
+          ansible_host: 192.168.1.100  # Change to your RPi IP
+          ansible_user: pi  # Change if different
+          ansible_become: true
+
+          # Configuration variables
+          prometheus_version: "2.48.0"
+          promtail_version: "2.9.2"
+          grafana_version: "10.2.2"
+          snmp_exporter_version: "0.24.1"
+
+          # Loki endpoint (in Talos cluster)
+          loki_url: "http://192.168.1.200:30100"  # Change to your Talos node IP
+
+          # Prometheus cluster endpoint (for Grafana datasource)
+          prometheus_cluster_url: "http://192.168.1.200:30090"  # Change to your Talos node IP
+
+          # Network device IPs for SNMP
+          snmp_targets:
+            - name: "router"
+              ip: "192.168.1.1"
+              module: "if_mib"
+            - name: "modem"
+              ip: "192.168.1.2"
+              module: "if_mib"
+
+          # Targets with node_exporter
+          node_exporter_targets:
+            - name: "proxmox"
+              ip: "192.168.1.10"
+              port: 9100
+            - name: "nfs"
+              ip: "192.168.1.11"
+              port: 9100
+
+          # Proxmox PVE exporter target
+          proxmox_targets:
+            - name: "proxmox"
+              ip: "192.168.1.10"
+              port: 9221
diff --git a/ansible/kubernetes-playbook.yml b/ansible/kubernetes-playbook.yml
new file mode 100644
index 0000000..fec3de2
--- /dev/null
+++ b/ansible/kubernetes-playbook.yml
@@ -0,0 +1,117 @@
+---
+# Deploy monitoring stack to Talos cluster via Ansible
+#
+# Prerequisites:
+#   - kubectl configured with access to your Talos cluster
+#   - kubernetes.core collection installed: ansible-galaxy collection install kubernetes.core
+#
+# Usage:
+#   ansible-playbook -i inventory.yml kubernetes-playbook.yml
+#
+# Or with a specific kubeconfig:
+#   ansible-playbook -i inventory.yml kubernetes-playbook.yml -e kubeconfig_path=~/.kube/talos-config
+
+- name: Deploy monitoring stack to Kubernetes
+  hosts: localhost
+  connection: local
+  gather_facts: false
+
+  vars:
+    kubeconfig_path: "{{ lookup('env', 'KUBECONFIG') | default('~/.kube/config', true) }}"
+    manifests_dir: "{{ playbook_dir }}/kubernetes"
+
+  tasks:
+    - name: Create monitoring namespace
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        state: present
+        src: "{{ manifests_dir }}/namespace.yaml"
+
+    - name: Deploy Prometheus
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        state: present
+        src: "{{ item }}"
+      loop:
+        - "{{ manifests_dir }}/prometheus/rbac.yaml"
+        - "{{ manifests_dir }}/prometheus/configmap.yaml"
+        - "{{ manifests_dir }}/prometheus/deployment.yaml"
+        - "{{ manifests_dir }}/prometheus/service.yaml"
+
+    - name: Wait for Prometheus to be ready
+      kubernetes.core.k8s_info:
+        kubeconfig: "{{ kubeconfig_path }}"
+        kind: Deployment
+        name: prometheus
+        namespace: monitoring
+      register: prometheus_deployment
+      until: prometheus_deployment.resources[0].status.readyReplicas | default(0) >= 1
+      retries: 30
+      delay: 10
+
+    - name: Deploy Loki
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        state: present
+        src: "{{ item }}"
+      loop:
+        - "{{ manifests_dir }}/loki/configmap.yaml"
+        - "{{ manifests_dir }}/loki/deployment.yaml"
+        - "{{ manifests_dir }}/loki/service.yaml"
+
+    - name: Wait for Loki to be ready
+      kubernetes.core.k8s_info:
+        kubeconfig: "{{ kubeconfig_path }}"
+        kind: Deployment
+        name: loki
+        namespace: monitoring
+      register: loki_deployment
+      until: loki_deployment.resources[0].status.readyReplicas | default(0) >= 1
+      retries: 30
+      delay: 10
+
+    - name: Deploy Promtail
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        state: present
+        src: "{{ item }}"
+      loop:
+        - "{{ manifests_dir }}/promtail/rbac.yaml"
+        - "{{ manifests_dir }}/promtail/configmap.yaml"
+        - "{{ manifests_dir }}/promtail/daemonset.yaml"
+
+    - name: Deploy Node Exporter
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        state: present
+        src: "{{ manifests_dir }}/node-exporter/daemonset.yaml"
+
+    - name: Deploy Kube State Metrics
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        state: present
+        src: "{{ item }}"
+      loop:
+        - "{{ manifests_dir }}/kube-state-metrics/rbac.yaml"
+        - "{{ manifests_dir }}/kube-state-metrics/deployment.yaml"
+
+    - name: Get cluster node IPs
+      kubernetes.core.k8s_info:
+        kubeconfig: "{{ kubeconfig_path }}"
+        kind: Node
+      register: cluster_nodes
+
+    - name: Display access information
+      ansible.builtin.debug:
+        msg:
+          - "Monitoring stack deployed successfully!"
+          - ""
+          - "Prometheus: http://<node-ip>:30090"
+          - "Loki: http://<node-ip>:30100"
+          - ""
+          - "Node IPs:"
+          - "{{ cluster_nodes.resources | map(attribute='status.addresses') | flatten | selectattr('type', 'equalto', 'InternalIP') | map(attribute='address') | list }}"
+          - ""
+          - "Update your RPi inventory.yml with one of these IPs for:"
+          - " loki_url: http://<node-ip>:30100"
+          - " prometheus_cluster_url: http://<node-ip>:30090"
diff --git a/ansible/playbook.yml b/ansible/playbook.yml
new file mode 100644
index 0000000..b6c5d68
--- /dev/null
+++ b/ansible/playbook.yml
@@ -0,0 +1,37 @@
+---
+- name: Deploy monitoring stack on RPi
+  hosts: monitoring
+  become: true
+
+  vars:
+    # Architecture detection for ARM
+    arch_map:
+      armv7l: "armv7"
+      aarch64: "arm64"
+      x86_64: "amd64"
+
+  pre_tasks:
+    - name: Gather architecture
+      ansible.builtin.set_fact:
+        go_arch: "{{ arch_map[ansible_architecture] | default('arm64') }}"
+
+    - name: Update apt cache
+      ansible.builtin.apt:
+        update_cache: true
+        cache_valid_time: 3600
+      when: ansible_os_family == "Debian"
+
+  roles:
+    - common
+    - prometheus
+    - snmp_exporter
+    - promtail
+    - grafana
+
+  post_tasks:
+    - name: Display access information
+      ansible.builtin.debug:
+        msg:
+          - "Grafana: http://{{ ansible_host }}:3000 (admin/admin)"
+          - "Prometheus: http://{{ ansible_host }}:9090"
+          - "Syslog listener: {{ ansible_host }}:514 (UDP)"
diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml
new file mode 100644
index 0000000..5591e01
--- /dev/null
+++ b/ansible/roles/common/tasks/main.yml
@@ -0,0 +1,29 @@
+---
+- 
name: Install common dependencies + ansible.builtin.apt: + name: + - curl + - tar + - gzip + - ca-certificates + state: present + +- name: Create monitoring user + ansible.builtin.user: + name: monitoring + system: true + shell: /usr/sbin/nologin + home: /var/lib/monitoring + create_home: false + +- name: Create common directories + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: monitoring + group: monitoring + mode: "0755" + loop: + - /etc/monitoring + - /var/lib/monitoring + - /var/log/monitoring diff --git a/ansible/roles/grafana/files/dashboards/home-infra-overview.json b/ansible/roles/grafana/files/dashboards/home-infra-overview.json new file mode 100644 index 0000000..ccf73b5 --- /dev/null +++ b/ansible/roles/grafana/files/dashboards/home-infra-overview.json @@ -0,0 +1,187 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "title": "Infrastructure Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus-infra" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } + ], + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Targets Up", + "type": "stat", + "targets": [{ "expr": "count(up == 1)", "refId": "A" }] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus-infra" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": 
{ "0": { "color": "green", "text": "0" } }, "type": "value" } + ], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Targets Down", + "type": "stat", + "targets": [{ "expr": "count(up == 0) or vector(0)", "refId": "A" }] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Log Lines (1h)", + "type": "stat", + "targets": [{ "expr": "sum(count_over_time({job=~\".+\"}[1h]))", "refId": "A" }] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "fieldConfig": { + "defaults": { + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 10 }, { "color": "red", "value": 50 }] } + } + }, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Error Logs (1h)", + "type": "stat", + "targets": [{ "expr": "sum(count_over_time({severity=~\"err|error|crit|alert|emerg\"}[1h])) or vector(0)", "refId": "A" }] + }, + { + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 10, + "title": "Network Devices", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus-infra" }, + "fieldConfig": { + "defaults": { + "custom": { "lineWidth": 1, "fillOpacity": 10 }, + 
"unit": "bps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 11, + "options": { "legend": { "displayMode": "list", "placement": "bottom" } }, + "title": "Network Interface Traffic", + "type": "timeseries", + "targets": [ + { "expr": "rate(ifHCInOctets{job=\"snmp\"}[5m]) * 8", "legendFormat": "{{device}} - {{ifDescr}} In", "refId": "A" }, + { "expr": "rate(ifHCOutOctets{job=\"snmp\"}[5m]) * 8", "legendFormat": "{{device}} - {{ifDescr}} Out", "refId": "B" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus-infra" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": { "1": { "color": "green", "text": "Up" }, "2": { "color": "red", "text": "Down" } }, "type": "value" } + ] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 12, + "options": { "showHeader": true }, + "title": "Interface Status", + "type": "table", + "targets": [{ "expr": "ifOperStatus{job=\"snmp\"}", "format": "table", "instant": true, "refId": "A" }], + "transformations": [ + { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true }, "renameByName": { "device": "Device", "ifDescr": "Interface", "Value": "Status" } } } + ] + }, + { + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 20, + "title": "Proxmox / VMs", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus-infra" }, + "fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } } + }, + "gridPos": { "h": 6, "w": 8, "x": 0, "y": 15 }, + "id": 21, + "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, + "title": "CPU Usage", + "type": "gauge", + "targets": [{ "expr": "1 - 
avg(rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"proxmox.*|nfs.*\"}[5m])) by (instance)", "legendFormat": "{{instance}}", "refId": "A" }] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus-infra" }, + "fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } } + }, + "gridPos": { "h": 6, "w": 8, "x": 8, "y": 15 }, + "id": 22, + "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, + "title": "Memory Usage", + "type": "gauge", + "targets": [{ "expr": "1 - (node_memory_MemAvailable_bytes{instance=~\"proxmox.*|nfs.*\"} / node_memory_MemTotal_bytes{instance=~\"proxmox.*|nfs.*\"})", "legendFormat": "{{instance}}", "refId": "A" }] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus-infra" }, + "fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } } + }, + "gridPos": { "h": 6, "w": 8, "x": 16, "y": 15 }, + "id": 23, + "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, + "title": "Disk Usage", + "type": "gauge", + "targets": [{ "expr": "1 - (node_filesystem_avail_bytes{instance=~\"proxmox.*|nfs.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"proxmox.*|nfs.*\",mountpoint=\"/\"})", "legendFormat": "{{instance}}", "refId": "A" }] + }, + { + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, + "id": 30, + "title": "Recent Logs", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 22 }, 
+ "id": 31, + "options": { "showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending" }, + "title": "All Logs", + "type": "logs", + "targets": [{ "expr": "{job=~\".+\"}", "refId": "A" }] + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": ["home-infra"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Home Infrastructure Overview", + "uid": "home-infra-overview", + "version": 1 +} diff --git a/ansible/roles/grafana/handlers/main.yml b/ansible/roles/grafana/handlers/main.yml new file mode 100644 index 0000000..c7d3ee4 --- /dev/null +++ b/ansible/roles/grafana/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: Restart grafana + ansible.builtin.systemd: + name: grafana-server + state: restarted diff --git a/ansible/roles/grafana/tasks/main.yml b/ansible/roles/grafana/tasks/main.yml new file mode 100644 index 0000000..424eaca --- /dev/null +++ b/ansible/roles/grafana/tasks/main.yml @@ -0,0 +1,77 @@ +--- +- name: Add Grafana APT key + ansible.builtin.get_url: + url: https://apt.grafana.com/gpg.key + dest: /etc/apt/keyrings/grafana.asc + mode: "0644" + +- name: Add Grafana APT repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/keyrings/grafana.asc] https://apt.grafana.com stable main" + state: present + filename: grafana + +- name: Install Grafana + ansible.builtin.apt: + name: grafana + state: present + update_cache: true + notify: Restart grafana + +- name: Create Grafana provisioning directories + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: grafana + group: grafana + mode: "0755" + loop: + - /etc/grafana/provisioning/datasources + - /etc/grafana/provisioning/dashboards + - /var/lib/grafana/dashboards + +- name: Deploy Grafana datasources + ansible.builtin.template: + src: datasources.yml.j2 + dest: 
/etc/grafana/provisioning/datasources/datasources.yml
+    owner: grafana
+    group: grafana
+    mode: "0640"
+  notify: Restart grafana
+
+- name: Deploy Grafana dashboard provisioning
+  ansible.builtin.template:
+    src: dashboards.yml.j2
+    dest: /etc/grafana/provisioning/dashboards/dashboards.yml
+    owner: grafana
+    group: grafana
+    mode: "0640"
+  notify: Restart grafana
+
+- name: Deploy default dashboards
+  ansible.builtin.copy:
+    src: "{{ item }}"
+    dest: /var/lib/grafana/dashboards/
+    owner: grafana
+    group: grafana
+    mode: "0644"
+  loop: "{{ lookup('fileglob', 'files/dashboards/*.json', wantlist=True) }}"
+  notify: Restart grafana
+  ignore_errors: true  # OK if no dashboards yet
+
+- name: Configure Grafana
+  ansible.builtin.lineinfile:
+    path: /etc/grafana/grafana.ini
+    regexp: "{{ item.regexp }}"
+    line: "{{ item.line }}"
+    state: present
+  loop:
+    - { regexp: '^;?http_port', line: 'http_port = 3000' }
+    - { regexp: '^;?http_addr', line: 'http_addr = 0.0.0.0' }
+  notify: Restart grafana
+
+- name: Enable and start Grafana
+  ansible.builtin.systemd:
+    name: grafana-server
+    enabled: true
+    state: started
diff --git a/ansible/roles/grafana/templates/dashboards.yml.j2 b/ansible/roles/grafana/templates/dashboards.yml.j2
new file mode 100644
index 0000000..534818e
--- /dev/null
+++ b/ansible/roles/grafana/templates/dashboards.yml.j2
@@ -0,0 +1,13 @@
+apiVersion: 1
+
+providers:
+  - name: 'Home Infrastructure'
+    orgId: 1
+    folder: 'Home Infra'
+    folderUid: 'home-infra'
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 30
+    allowUiUpdates: true
+    options:
+      path: /var/lib/grafana/dashboards
diff --git a/ansible/roles/grafana/templates/datasources.yml.j2 b/ansible/roles/grafana/templates/datasources.yml.j2
new file mode 100644
index 0000000..6bae865
--- /dev/null
+++ b/ansible/roles/grafana/templates/datasources.yml.j2
@@ -0,0 +1,39 @@
+apiVersion: 1
+
+datasources:
+  # The uid values are pinned because home-infra-overview.json selects its
+  # datasources by uid ("prometheus-infra" and "loki").
+  # Local Prometheus (RPi - infrastructure metrics)
+  - name: Prometheus-Infra
+    uid: prometheus-infra
+    type: prometheus
+    access: proxy
+    url: http://localhost:9090
+    isDefault: true
+    editable: false
+    jsonData:
+      timeInterval: "30s"
+      httpMethod: POST
+
+  # Cluster Prometheus (Talos - Kubernetes metrics)
+  - name: Prometheus-Cluster
+    uid: prometheus-cluster
+    type: prometheus
+    access: proxy
+    url: "{{ prometheus_cluster_url }}"
+    isDefault: false
+    editable: false
+    jsonData:
+      timeInterval: "30s"
+      httpMethod: POST
+
+  # Loki (Talos cluster - centralized logs)
+  - name: Loki
+    uid: loki
+    type: loki
+    access: proxy
+    url: "{{ loki_url }}"
+    isDefault: false
+    editable: false
+    jsonData:
+      maxLines: 1000
diff --git a/ansible/roles/prometheus/handlers/main.yml b/ansible/roles/prometheus/handlers/main.yml
new file mode 100644
index 0000000..2d9626a
--- /dev/null
+++ b/ansible/roles/prometheus/handlers/main.yml
@@ -0,0 +1,9 @@
+---
+- name: Reload systemd
+  ansible.builtin.systemd:
+    daemon_reload: true
+
+- name: Restart prometheus
+  ansible.builtin.systemd:
+    name: prometheus
+    state: restarted
diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml
new file mode 100644
index 0000000..3764061
--- /dev/null
+++ b/ansible/roles/prometheus/tasks/main.yml
@@ -0,0 +1,82 @@
+---
+- name: Create Prometheus directories
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    owner: monitoring
+    group: monitoring
+    mode: "0755"
+  loop:
+    - /etc/prometheus
+    - /var/lib/prometheus
+
+- name: Check if Prometheus is installed
+  ansible.builtin.stat:
+    path: /usr/local/bin/prometheus
+  register: prometheus_binary
+
+- name: Get installed Prometheus version
+  ansible.builtin.command: /usr/local/bin/prometheus --version
+  register: prometheus_installed_version
+  changed_when: false
+  failed_when: false
+  when: prometheus_binary.stat.exists
+
+- name: Download Prometheus
+  ansible.builtin.get_url:
+    url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz"
+    dest: "/tmp/prometheus-{{ 
prometheus_version }}.tar.gz" + mode: "0644" + when: not prometheus_binary.stat.exists or prometheus_version not in (prometheus_installed_version.stdout | default('')) + +- name: Extract Prometheus + ansible.builtin.unarchive: + src: "/tmp/prometheus-{{ prometheus_version }}.tar.gz" + dest: /tmp + remote_src: true + when: not prometheus_binary.stat.exists or prometheus_version not in (prometheus_installed_version.stdout | default('')) + +- name: Install Prometheus binaries + ansible.builtin.copy: + src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/{{ item }}" + dest: "/usr/local/bin/{{ item }}" + mode: "0755" + remote_src: true + loop: + - prometheus + - promtool + notify: Restart prometheus + when: not prometheus_binary.stat.exists or prometheus_version not in (prometheus_installed_version.stdout | default('')) + +- name: Deploy Prometheus configuration + ansible.builtin.template: + src: prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml + owner: monitoring + group: monitoring + mode: "0644" + notify: Restart prometheus + +- name: Deploy Prometheus systemd service + ansible.builtin.template: + src: prometheus.service.j2 + dest: /etc/systemd/system/prometheus.service + mode: "0644" + notify: + - Reload systemd + - Restart prometheus + +- name: Enable and start Prometheus + ansible.builtin.systemd: + name: prometheus + enabled: true + state: started + daemon_reload: true + +- name: Clean up downloaded files + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - "/tmp/prometheus-{{ prometheus_version }}.tar.gz" + - "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}" diff --git a/ansible/roles/prometheus/templates/prometheus.service.j2 b/ansible/roles/prometheus/templates/prometheus.service.j2 new file mode 100644 index 0000000..bef070e --- /dev/null +++ b/ansible/roles/prometheus/templates/prometheus.service.j2 @@ -0,0 +1,30 @@ +[Unit] +Description=Prometheus Monitoring System 
+Documentation=https://prometheus.io/docs/
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Type=simple
+User=monitoring
+Group=monitoring
+ExecReload=/bin/kill -HUP $MAINPID
+ExecStart=/usr/local/bin/prometheus \
+  --config.file=/etc/prometheus/prometheus.yml \
+  --storage.tsdb.path=/var/lib/prometheus \
+  --storage.tsdb.retention.time=15d \
+  --web.listen-address=0.0.0.0:9090 \
+  --web.enable-lifecycle \
+  --log.level=info
+
+SyslogIdentifier=prometheus
+Restart=always
+RestartSec=5
+
+# Hardening
+NoNewPrivileges=true
+ProtectSystem=full
+ProtectHome=true
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/prometheus/templates/prometheus.yml.j2 b/ansible/roles/prometheus/templates/prometheus.yml.j2
new file mode 100644
index 0000000..58abd95
--- /dev/null
+++ b/ansible/roles/prometheus/templates/prometheus.yml.j2
@@ -0,0 +1,69 @@
+global:
+  scrape_interval: 30s
+  evaluation_interval: 30s
+  external_labels:
+    monitor: 'home-infra'
+    source: 'rpi'
+
+scrape_configs:
+  # Self-monitoring
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+        labels:
+          instance: 'rpi-prometheus'
+
+  # SNMP targets (network devices)
+{% if snmp_targets is defined and snmp_targets | length > 0 %}
+  - job_name: 'snmp'
+    scrape_interval: 60s
+    scrape_timeout: 30s
+    # Each target carries its SNMP module as a __param_module label, which
+    # Prometheus sends as the `module` query parameter (defaults to if_mib).
+    static_configs:
+{% for target in snmp_targets %}
+      - targets: ['{{ target.ip }}']
+        labels:
+          device: '{{ target.name }}'
+          __param_module: '{{ target.module | default("if_mib") }}'
+{% endfor %}
+    metrics_path: /snmp
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: localhost:9116
+{% endif %}
+
+  # Node exporter targets (VMs with node_exporter)
+{% if node_exporter_targets is defined and node_exporter_targets | length > 0 %}
+  - job_name: 'node'
+    static_configs:
+{% for target in node_exporter_targets %}
+      - targets: ['{{ target.ip }}:{{ target.port | default(9100) }}']
+        labels:
+          instance: '{{ target.name }}'
+{% endfor %}
+{% endif %}
+
+  # Proxmox PVE exporter
+{% if proxmox_targets is defined and proxmox_targets | length > 0 %}
+  - job_name: 'proxmox'
+    scrape_interval: 60s
+    static_configs:
+{% for target in proxmox_targets %}
+      - targets: ['{{ target.ip }}:{{ target.port | default(9221) }}']
+        labels:
+          instance: '{{ target.name }}'
+{% endfor %}
+    metrics_path: /pve
+    params:
+      module: [default]
+{% endif %}
+
+  # SNMP exporter self-metrics
+  - job_name: 'snmp-exporter'
+    static_configs:
+      - targets: ['localhost:9116']
diff --git a/ansible/roles/promtail/handlers/main.yml b/ansible/roles/promtail/handlers/main.yml
new file mode 100644
index 0000000..a1a4c8c
--- /dev/null
+++ b/ansible/roles/promtail/handlers/main.yml
@@ -0,0 +1,9 @@
+---
+- name: Reload systemd
+  ansible.builtin.systemd:
+    daemon_reload: true
+
+- name: Restart promtail
+  ansible.builtin.systemd:
+    name: promtail
+    state: restarted
diff --git a/ansible/roles/promtail/tasks/main.yml b/ansible/roles/promtail/tasks/main.yml
new file mode 100644
index 0000000..41ef567
--- /dev/null
+++ b/ansible/roles/promtail/tasks/main.yml
@@ -0,0 +1,78 @@
+---
+- name: Create Promtail directories
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    owner: monitoring
+    group: monitoring
+    mode: "0755"
+  loop:
+    - /etc/promtail
+    - /var/lib/promtail
+
+- name: Check if Promtail is installed
+  ansible.builtin.stat:
+    path: /usr/local/bin/promtail
+  register: promtail_binary
+
+- name: Download Promtail
+  ansible.builtin.get_url:
+    url: "https://github.com/grafana/loki/releases/download/v{{ promtail_version }}/promtail-linux-{{ go_arch }}.zip"
+    dest: "/tmp/promtail-{{ promtail_version }}.zip"
+    mode: "0644"
+  when: not promtail_binary.stat.exists
+
+- name: Install unzip
+  ansible.builtin.apt:
+    name: unzip
+    state: present
+  when: ansible_os_family == "Debian"
+
+- name: Extract Promtail
+  
ansible.builtin.unarchive: + src: "/tmp/promtail-{{ promtail_version }}.zip" + dest: /tmp + remote_src: true + when: not promtail_binary.stat.exists + +- name: Install Promtail binary + ansible.builtin.copy: + src: /tmp/promtail-linux-{{ go_arch }} + dest: /usr/local/bin/promtail + mode: "0755" + remote_src: true + notify: Restart promtail + when: not promtail_binary.stat.exists + +- name: Deploy Promtail configuration + ansible.builtin.template: + src: promtail.yml.j2 + dest: /etc/promtail/promtail.yml + owner: monitoring + group: monitoring + mode: "0644" + notify: Restart promtail + +- name: Deploy Promtail systemd service + ansible.builtin.template: + src: promtail.service.j2 + dest: /etc/systemd/system/promtail.service + mode: "0644" + notify: + - Reload systemd + - Restart promtail + +- name: Enable and start Promtail + ansible.builtin.systemd: + name: promtail + enabled: true + state: started + daemon_reload: true + +- name: Clean up downloaded files + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - "/tmp/promtail-{{ promtail_version }}.zip" + - "/tmp/promtail-linux-{{ go_arch }}" diff --git a/ansible/roles/promtail/templates/promtail.service.j2 b/ansible/roles/promtail/templates/promtail.service.j2 new file mode 100644 index 0000000..9b63b8a --- /dev/null +++ b/ansible/roles/promtail/templates/promtail.service.j2 @@ -0,0 +1,24 @@ +[Unit] +Description=Promtail Log Collector +Documentation=https://grafana.com/docs/loki/latest/clients/promtail/ +Wants=network-online.target +After=network-online.target + +[Service] +Type=simple +User=root +Group=root +ExecStart=/usr/local/bin/promtail \ + -config.file=/etc/promtail/promtail.yml \ + -config.expand-env=true + +SyslogIdentifier=promtail +Restart=always +RestartSec=5 + +# Need root for syslog port 514 and journal access +# Can use CAP_NET_BIND_SERVICE instead if preferred +AmbientCapabilities=CAP_NET_BIND_SERVICE + +[Install] +WantedBy=multi-user.target diff --git 
a/ansible/roles/promtail/templates/promtail.yml.j2 b/ansible/roles/promtail/templates/promtail.yml.j2
new file mode 100644
index 0000000..1dff82c
--- /dev/null
+++ b/ansible/roles/promtail/templates/promtail.yml.j2
@@ -0,0 +1,57 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /var/lib/promtail/positions.yaml
+
+clients:
+  - url: "{{ loki_url }}/loki/api/v1/push"
+    tenant_id: home-infra
+    batchwait: 1s
+    batchsize: 1048576
+    timeout: 10s
+
+scrape_configs:
+  # Syslog listener for network devices
+  - job_name: syslog
+    syslog:
+      listen_address: 0.0.0.0:514
+      listen_protocol: udp
+      idle_timeout: 60s
+      label_structured_data: true
+      labels:
+        job: syslog
+        source: network-devices
+    relabel_configs:
+      - source_labels: ['__syslog_message_hostname']
+        target_label: 'host'
+      - source_labels: ['__syslog_message_severity']
+        target_label: 'severity'
+      - source_labels: ['__syslog_message_facility']
+        target_label: 'facility'
+      - source_labels: ['__syslog_message_app_name']
+        target_label: 'app'
+    pipeline_stages:
+      - match:
+          selector: '{job="syslog"}'
+          stages:
+            # Extract common patterns from network device logs
+            # The named capture group must match the key in the labels stage.
+            - regex:
+                expression: '(?P<src_ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
+            - labels:
+                src_ip:
+
+  # Local system journal (RPi logs)
+  - job_name: journal
+    journal:
+      max_age: 12h
+      labels:
+        job: systemd-journal
+        host: rpi
+    relabel_configs:
+      - source_labels: ['__journal__systemd_unit']
+        target_label: 'unit'
+      - source_labels: ['__journal_priority_keyword']
+        target_label: 'severity'
diff --git a/ansible/roles/snmp_exporter/handlers/main.yml b/ansible/roles/snmp_exporter/handlers/main.yml
new file mode 100644
index 0000000..1fb3b76
--- /dev/null
+++ b/ansible/roles/snmp_exporter/handlers/main.yml
@@ -0,0 +1,9 @@
+---
+- name: Reload systemd
+  ansible.builtin.systemd:
+    daemon_reload: true
+
+- name: Restart snmp_exporter
+  ansible.builtin.systemd:
+    name: snmp_exporter
+    state: restarted
diff --git 
a/ansible/roles/snmp_exporter/tasks/main.yml b/ansible/roles/snmp_exporter/tasks/main.yml
new file mode 100644
index 0000000..7524b63
--- /dev/null
+++ b/ansible/roles/snmp_exporter/tasks/main.yml
@@ -0,0 +1,71 @@
+---
+- name: Create SNMP exporter directory
+  ansible.builtin.file:
+    path: /etc/snmp_exporter
+    state: directory
+    owner: monitoring
+    group: monitoring
+    mode: "0755"
+
+- name: Check if SNMP exporter is installed
+  ansible.builtin.stat:
+    path: /usr/local/bin/snmp_exporter
+  register: snmp_exporter_binary  # gates the download/extract/install tasks below
+
+- name: Download SNMP exporter
+  ansible.builtin.get_url:
+    url: "https://github.com/prometheus/snmp_exporter/releases/download/v{{ snmp_exporter_version }}/snmp_exporter-{{ snmp_exporter_version }}.linux-{{ go_arch }}.tar.gz"
+    dest: "/tmp/snmp_exporter-{{ snmp_exporter_version }}.tar.gz"
+    mode: "0644"
+  when: not snmp_exporter_binary.stat.exists  # skipped once the binary is present
+
+- name: Extract SNMP exporter
+  ansible.builtin.unarchive:
+    src: "/tmp/snmp_exporter-{{ snmp_exporter_version }}.tar.gz"
+    dest: /tmp
+    remote_src: true  # the archive is read from the managed host, not the controller
+  when: not snmp_exporter_binary.stat.exists
+
+- name: Install SNMP exporter binary
+  ansible.builtin.copy:
+    src: "/tmp/snmp_exporter-{{ snmp_exporter_version }}.linux-{{ go_arch }}/snmp_exporter"
+    dest: /usr/local/bin/snmp_exporter
+    mode: "0755"
+    remote_src: true  # copy between two paths on the managed host
+  notify: Restart snmp_exporter  # handler fires only if this task reports a change
+  when: not snmp_exporter_binary.stat.exists
+
+- name: Install default SNMP exporter config
+  ansible.builtin.copy:
+    src: "/tmp/snmp_exporter-{{ snmp_exporter_version }}.linux-{{ go_arch }}/snmp.yml"
+    dest: /etc/snmp_exporter/snmp.yml
+    owner: monitoring
+    group: monitoring
+    mode: "0644"
+    remote_src: true
+  notify: Restart snmp_exporter
+  when: not snmp_exporter_binary.stat.exists  # same gate as the binary: config is only copied on first install
+
+- name: Deploy SNMP exporter systemd service
+  ansible.builtin.template:
+    src: snmp_exporter.service.j2
+    dest: /etc/systemd/system/snmp_exporter.service
+    mode: "0644"
+  notify:  # a changed unit file triggers both handlers listed below
+    - Reload systemd
+    - Restart snmp_exporter
+
+- name: Enable and start SNMP exporter
+  ansible.builtin.systemd:
+    name: snmp_exporter
+    enabled: true
+    state: started  # starts the unit if stopped; restarts are left to the handler
+    daemon_reload: true  # re-read unit files before acting on the service
+
+- name: Clean up downloaded files
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: absent  # delete each path in the loop if it is present
+  loop:
+    - "/tmp/snmp_exporter-{{ snmp_exporter_version }}.tar.gz"
+    - "/tmp/snmp_exporter-{{ snmp_exporter_version }}.linux-{{ go_arch }}"
diff --git a/ansible/roles/snmp_exporter/templates/snmp_exporter.service.j2 b/ansible/roles/snmp_exporter/templates/snmp_exporter.service.j2
new file mode 100644
index 0000000..722f7c8
--- /dev/null
+++ b/ansible/roles/snmp_exporter/templates/snmp_exporter.service.j2
@@ -0,0 +1,25 @@
+[Unit]
+Description=SNMP Exporter for Prometheus
+Documentation=https://github.com/prometheus/snmp_exporter
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Type=simple
+User=monitoring
+Group=monitoring
+ExecStart=/usr/local/bin/snmp_exporter \
+    --config.file=/etc/snmp_exporter/snmp.yml \
+    --web.listen-address=0.0.0.0:9116 \
+    --log.level=info
+
+SyslogIdentifier=snmp_exporter
+Restart=always
+RestartSec=5
+
+NoNewPrivileges=true
+ProtectSystem=full
+ProtectHome=true
+
+[Install]
+WantedBy=multi-user.target