add the ansible playbooks for the monitoring stack

This commit is contained in:
tsvetkov
2026-02-27 01:28:08 +00:00
commit d45bbef509
21 changed files with 1017 additions and 0 deletions

View File

@@ -0,0 +1,187 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"title": "Infrastructure Overview",
"type": "row"
},
{
  "datasource": { "type": "prometheus", "uid": "prometheus-infra" },
  "fieldConfig": {
    "defaults": {
      "mappings": [],
      "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
    }
  },
  "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
  "id": 2,
  "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
  "title": "Targets Up",
  "type": "stat",
  "targets": [{ "expr": "count(up == 1) or vector(0)", "refId": "A" }]
},
{
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "green", "text": "0" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"id": 3,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Targets Down",
"type": "stat",
"targets": [{ "expr": "count(up == 0) or vector(0)", "refId": "A" }]
},
{
  "datasource": { "type": "loki", "uid": "loki" },
  "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
  "id": 4,
  "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
  "title": "Log Lines (1h)",
  "type": "stat",
  "targets": [{ "expr": "sum(count_over_time({job=~\".+\"}[1h]))", "refId": "A" }]
},
{
  "datasource": { "type": "loki", "uid": "loki" },
  "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
  "id": 5,
  "fieldConfig": {
    "defaults": {
      "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 10 }, { "color": "red", "value": 50 }] }
    }
  },
  "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
  "title": "Error Logs (1h)",
  "type": "stat",
  "targets": [{ "expr": "sum(count_over_time({severity=~\"err|error|crit|alert|emerg\"}[1h])) or vector(0)", "refId": "A" }]
},
{
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 10,
"title": "Network Devices",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
"fieldConfig": {
"defaults": {
"custom": { "lineWidth": 1, "fillOpacity": 10 },
"unit": "bps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 11,
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
"title": "Network Interface Traffic",
"type": "timeseries",
"targets": [
{ "expr": "rate(ifHCInOctets{job=\"snmp\"}[5m]) * 8", "legendFormat": "{{device}} - {{ifDescr}} In", "refId": "A" },
{ "expr": "rate(ifHCOutOctets{job=\"snmp\"}[5m]) * 8", "legendFormat": "{{device}} - {{ifDescr}} Out", "refId": "B" }
]
},
{
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "1": { "color": "green", "text": "Up" }, "2": { "color": "red", "text": "Down" } }, "type": "value" }
]
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 12,
"options": { "showHeader": true },
"title": "Interface Status",
"type": "table",
"targets": [{ "expr": "ifOperStatus{job=\"snmp\"}", "format": "table", "instant": true, "refId": "A" }],
"transformations": [
{ "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true }, "renameByName": { "device": "Device", "ifDescr": "Interface", "Value": "Status" } } }
]
},
{
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 20,
"title": "Proxmox / VMs",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
"fieldConfig": {
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
},
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 15 },
"id": 21,
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
"title": "CPU Usage",
"type": "gauge",
"targets": [{ "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"proxmox.*|nfs.*\"}[5m])) by (instance)", "legendFormat": "{{instance}}", "refId": "A" }]
},
{
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
"fieldConfig": {
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
},
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 15 },
"id": 22,
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
"title": "Memory Usage",
"type": "gauge",
"targets": [{ "expr": "1 - (node_memory_MemAvailable_bytes{instance=~\"proxmox.*|nfs.*\"} / node_memory_MemTotal_bytes{instance=~\"proxmox.*|nfs.*\"})", "legendFormat": "{{instance}}", "refId": "A" }]
},
{
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
"fieldConfig": {
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
},
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 15 },
"id": 23,
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
"title": "Disk Usage",
"type": "gauge",
"targets": [{ "expr": "1 - (node_filesystem_avail_bytes{instance=~\"proxmox.*|nfs.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"proxmox.*|nfs.*\",mountpoint=\"/\"})", "legendFormat": "{{instance}}", "refId": "A" }]
},
{
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
"id": 30,
"title": "Recent Logs",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "loki" },
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 22 },
"id": 31,
"options": { "showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending" },
"title": "All Logs",
"type": "logs",
"targets": [{ "expr": "{job=~\".+\"}", "refId": "A" }]
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["home-infra"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "",
"title": "Home Infrastructure Overview",
"uid": "home-infra-overview",
"version": 1
}

View File

@@ -0,0 +1,5 @@
---
# Handler: restarts the grafana-server systemd unit. Tasks trigger it with
# `notify: Restart grafana`.
- name: Restart grafana
  ansible.builtin.systemd:
    state: restarted
    name: grafana-server

View File

@@ -0,0 +1,77 @@
---
# /etc/apt/keyrings is not present by default on older Debian/Ubuntu
# releases, and get_url does not create parent directories, so make sure the
# directory exists before downloading the key into it.
- name: Ensure APT keyrings directory exists
  ansible.builtin.file:
    path: /etc/apt/keyrings
    state: directory
    mode: "0755"

# Download the Grafana repository signing key to the path that the
# repository entry references through signed-by.
- name: Add Grafana APT key
  ansible.builtin.get_url:
    url: https://apt.grafana.com/gpg.key
    dest: /etc/apt/keyrings/grafana.asc
    mode: "0644"
# Register the Grafana "stable main" APT source in a sources.list.d file named
# "grafana", verified against the key at /etc/apt/keyrings/grafana.asc.
- name: Add Grafana APT repository
  ansible.builtin.apt_repository:
    filename: grafana
    state: present
    repo: >-
      deb [signed-by=/etc/apt/keyrings/grafana.asc]
      https://apt.grafana.com stable main
# Install the grafana package after refreshing the APT cache. A change here
# notifies the "Restart grafana" handler.
- name: Install Grafana
  notify: Restart grafana
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: grafana
# Create the provisioning directories and the dashboard storage directory,
# all owned by grafana:grafana with mode 0755.
- name: Create Grafana provisioning directories
  loop:
    - /etc/grafana/provisioning/datasources
    - /etc/grafana/provisioning/dashboards
    - /var/lib/grafana/dashboards
  ansible.builtin.file:
    state: directory
    path: "{{ item }}"
    mode: "0755"
    owner: grafana
    group: grafana
# Render the datasource provisioning template into Grafana's provisioning
# directory. The file is readable only by the grafana user and group (0640).
- name: Deploy Grafana datasources
  notify: Restart grafana
  ansible.builtin.template:
    dest: /etc/grafana/provisioning/datasources/datasources.yml
    src: datasources.yml.j2
    mode: "0640"
    owner: grafana
    group: grafana
# Render the dashboard-provider template into Grafana's provisioning
# directory. The file is readable only by the grafana user and group (0640).
- name: Deploy Grafana dashboard provisioning
  notify: Restart grafana
  ansible.builtin.template:
    dest: /etc/grafana/provisioning/dashboards/dashboards.yml
    src: dashboards.yml.j2
    mode: "0640"
    owner: grafana
    group: grafana
# Copy every bundled dashboard JSON file into /var/lib/grafana/dashboards/.
# The fileglob lookup yields an empty list when nothing matches, so the task
# is simply skipped while no dashboards exist yet. Errors are deliberately
# not ignored, so a genuine copy failure (permissions, disk full, ...) fails
# the play instead of being silently swallowed.
- name: Deploy default dashboards
  ansible.builtin.copy:
    src: "{{ item }}"
    dest: /var/lib/grafana/dashboards/
    owner: grafana
    group: grafana
    mode: "0644"
  loop: "{{ query('fileglob', 'files/dashboards/*.json') }}"
  notify: Restart grafana
# Set the HTTP listen port and address in grafana.ini. Each regexp matches the
# key whether it is still commented out (leading ';') or already set.
# insertafter is only honoured when the regexp matches nothing: the line is
# then added right after the [server] header instead of at the end of the
# file, where it would fall into whichever INI section happens to be last.
- name: Configure Grafana
  ansible.builtin.lineinfile:
    path: /etc/grafana/grafana.ini
    regexp: "{{ item.regexp }}"
    line: "{{ item.line }}"
    insertafter: '^\[server\]'
    state: present
  loop:
    - regexp: '^;?http_port'
      line: 'http_port = 3000'
    - regexp: '^;?http_addr'
      line: 'http_addr = 0.0.0.0'
  notify: Restart grafana
# Make sure the grafana-server unit is enabled and currently started.
- name: Enable and start Grafana
  ansible.builtin.systemd:
    state: started
    enabled: true
    name: grafana-server

View File

@@ -0,0 +1,13 @@
# Grafana dashboard provisioning: a single file-based provider that loads
# dashboards from /var/lib/grafana/dashboards into the "Home Infra" folder.
apiVersion: 1

providers:
  - name: "Home Infrastructure"
    type: file
    orgId: 1
    folder: "Home Infra"
    folderUid: "home-infra"
    # Deletion is not disabled and UI updates are allowed.
    disableDeletion: false
    allowUiUpdates: true
    # Interval, in seconds, between checks of the path below.
    updateIntervalSeconds: 30
    options:
      path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,34 @@
# Grafana datasource provisioning.
#
# Every datasource pins an explicit uid. Provisioned dashboards reference
# datasources by uid (e.g. "prometheus-infra" and "loki"); without a fixed uid
# Grafana generates a random one and those panel references cannot resolve.
# Templated URLs are quoted so the rendered file stays valid YAML whatever the
# variable expands to.
apiVersion: 1

datasources:
  # Local Prometheus (RPi - infrastructure metrics)
  - name: Prometheus-Infra
    uid: prometheus-infra
    type: prometheus
    access: proxy
    url: http://localhost:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: "30s"
      httpMethod: POST

  # Cluster Prometheus (Talos - Kubernetes metrics)
  - name: Prometheus-Cluster
    uid: prometheus-cluster
    type: prometheus
    access: proxy
    url: "{{ prometheus_cluster_url }}"
    isDefault: false
    editable: false
    jsonData:
      timeInterval: "30s"
      httpMethod: POST

  # Loki (Talos cluster - centralized logs)
  - name: Loki
    uid: loki
    type: loki
    access: proxy
    url: "{{ loki_url }}"
    isDefault: false
    editable: false
    jsonData:
      maxLines: 1000