add the ansible playbooks for the monitoring stack
This commit is contained in:
187
ansible/roles/grafana/files/dashboards/home-infra-overview.json
Normal file
187
ansible/roles/grafana/files/dashboards/home-infra-overview.json
Normal file
@@ -0,0 +1,187 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"title": "Infrastructure Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Targets Up",
|
||||
"type": "stat",
|
||||
"targets": [{ "expr": "count(up == 1)", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "green", "text": "0" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
|
||||
"id": 3,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Targets Down",
|
||||
"type": "stat",
|
||||
"targets": [{ "expr": "count(up == 0) or vector(0)", "refId": "A" }]
|
||||
},
|
||||
{
  "datasource": { "type": "loki", "uid": "loki" },
  "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
  "id": 4,
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "auto",
    "orientation": "auto",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "auto"
  },
  "title": "Log Lines (1h)",
  "type": "stat",
  "targets": [{ "expr": "sum(count_over_time({job=~\".+\"}[1h]))", "refId": "A" }]
},
|
||||
{
  "datasource": { "type": "loki", "uid": "loki" },
  "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
  "id": 5,
  "fieldConfig": {
    "defaults": {
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 10 },
          { "color": "red", "value": 50 }
        ]
      }
    }
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "auto",
    "orientation": "auto",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "auto"
  },
  "title": "Error Logs (1h)",
  "type": "stat",
  "targets": [{ "expr": "sum(count_over_time({severity=~\"err|error|crit|alert|emerg\"}[1h])) or vector(0)", "refId": "A" }]
},
|
||||
{
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
||||
"id": 10,
|
||||
"title": "Network Devices",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "lineWidth": 1, "fillOpacity": 10 },
|
||||
"unit": "bps"
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
||||
"id": 11,
|
||||
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
|
||||
"title": "Network Interface Traffic",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{ "expr": "rate(ifHCInOctets{job=\"snmp\"}[5m]) * 8", "legendFormat": "{{device}} - {{ifDescr}} In", "refId": "A" },
|
||||
{ "expr": "rate(ifHCOutOctets{job=\"snmp\"}[5m]) * 8", "legendFormat": "{{device}} - {{ifDescr}} Out", "refId": "B" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "1": { "color": "green", "text": "Up" }, "2": { "color": "red", "text": "Down" } }, "type": "value" }
|
||||
]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
||||
"id": 12,
|
||||
"options": { "showHeader": true },
|
||||
"title": "Interface Status",
|
||||
"type": "table",
|
||||
"targets": [{ "expr": "ifOperStatus{job=\"snmp\"}", "format": "table", "instant": true, "refId": "A" }],
|
||||
"transformations": [
|
||||
{ "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true }, "renameByName": { "device": "Device", "ifDescr": "Interface", "Value": "Status" } } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 20,
|
||||
"title": "Proxmox / VMs",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 15 },
|
||||
"id": 21,
|
||||
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
|
||||
"title": "CPU Usage",
|
||||
"type": "gauge",
|
||||
"targets": [{ "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"proxmox.*|nfs.*\"}[5m])) by (instance)", "legendFormat": "{{instance}}", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 15 },
|
||||
"id": 22,
|
||||
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
|
||||
"title": "Memory Usage",
|
||||
"type": "gauge",
|
||||
"targets": [{ "expr": "1 - (node_memory_MemAvailable_bytes{instance=~\"proxmox.*|nfs.*\"} / node_memory_MemTotal_bytes{instance=~\"proxmox.*|nfs.*\"})", "legendFormat": "{{instance}}", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 15 },
|
||||
"id": 23,
|
||||
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
|
||||
"title": "Disk Usage",
|
||||
"type": "gauge",
|
||||
"targets": [{ "expr": "1 - (node_filesystem_avail_bytes{instance=~\"proxmox.*|nfs.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"proxmox.*|nfs.*\",mountpoint=\"/\"})", "legendFormat": "{{instance}}", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
|
||||
"id": 30,
|
||||
"title": "Recent Logs",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 22 },
|
||||
"id": 31,
|
||||
"options": { "showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending" },
|
||||
"title": "All Logs",
|
||||
"type": "logs",
|
||||
"targets": [{ "expr": "{job=~\".+\"}", "refId": "A" }]
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["home-infra"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Home Infrastructure Overview",
|
||||
"uid": "home-infra-overview",
|
||||
"version": 1
|
||||
}
|
||||
5
ansible/roles/grafana/handlers/main.yml
Normal file
5
ansible/roles/grafana/handlers/main.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
---
|
||||
# Handler: restarts the grafana-server systemd unit.
# Tasks trigger it with `notify: Restart grafana`, so the name must not change.
- name: Restart grafana
  ansible.builtin.systemd:
    state: restarted
    name: grafana-server
|
||||
77
ansible/roles/grafana/tasks/main.yml
Normal file
77
ansible/roles/grafana/tasks/main.yml
Normal file
@@ -0,0 +1,77 @@
|
||||
---
|
||||
# get_url does not create missing parent directories, and /etc/apt/keyrings
# is not present on every Debian-family release, so create it up front.
- name: Ensure APT keyrings directory exists
  ansible.builtin.file:
    path: /etc/apt/keyrings
    state: directory
    owner: root
    group: root
    mode: "0755"

# Download Grafana's ASCII-armored signing key to the path that the
# repository entry references through its signed-by option.
- name: Add Grafana APT key
  ansible.builtin.get_url:
    url: https://apt.grafana.com/gpg.key
    dest: /etc/apt/keyrings/grafana.asc
    mode: "0644"
|
||||
|
||||
# Register the Grafana "stable main" APT source, verified against the key
# stored at /etc/apt/keyrings/grafana.asc.
- name: Add Grafana APT repository
  ansible.builtin.apt_repository:
    filename: grafana
    state: present
    # Folded scalar: renders as one line with single spaces between parts.
    repo: >-
      deb [signed-by=/etc/apt/keyrings/grafana.asc]
      https://apt.grafana.com stable main
|
||||
|
||||
# Install the grafana package after refreshing the APT cache; a change here
# queues the "Restart grafana" handler.
- name: Install Grafana
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: grafana
  notify: Restart grafana
|
||||
|
||||
# Make sure the provisioning directories and the dashboard drop directory
# exist, owned by grafana:grafana with mode 0755.
- name: Create Grafana provisioning directories
  ansible.builtin.file:
    state: directory
    path: "{{ grafana_dir }}"
    mode: "0755"
    owner: grafana
    group: grafana
  loop:
    - /etc/grafana/provisioning/datasources
    - /etc/grafana/provisioning/dashboards
    - /var/lib/grafana/dashboards
  loop_control:
    loop_var: grafana_dir
|
||||
|
||||
# Render the datasource provisioning file from datasources.yml.j2.
# Mode 0640 keeps it unreadable to "other" users; a change queues a restart.
- name: Deploy Grafana datasources
  ansible.builtin.template:
    dest: /etc/grafana/provisioning/datasources/datasources.yml
    src: datasources.yml.j2
    mode: "0640"
    owner: grafana
    group: grafana
  notify: Restart grafana
|
||||
|
||||
# Render the dashboard-provider provisioning file from dashboards.yml.j2;
# a change queues a restart.
- name: Deploy Grafana dashboard provisioning
  ansible.builtin.template:
    dest: /etc/grafana/provisioning/dashboards/dashboards.yml
    src: dashboards.yml.j2
    mode: "0640"
    owner: grafana
    group: grafana
  notify: Restart grafana
|
||||
|
||||
# Copy every bundled dashboard JSON into /var/lib/grafana/dashboards/.
# fileglob returns an empty list when nothing matches, so the loop is simply
# skipped when no dashboards exist yet. Errors are deliberately NOT ignored,
# so a failed copy (permissions, disk space, ...) stops the play instead of
# being silently swallowed.
- name: Deploy default dashboards
  ansible.builtin.copy:
    src: "{{ item }}"
    dest: /var/lib/grafana/dashboards/
    owner: grafana
    group: grafana
    mode: "0644"
  # query() always returns a list; equivalent to lookup(..., wantlist=True).
  loop: "{{ query('fileglob', 'files/dashboards/*.json') }}"
  notify: Restart grafana
|
||||
|
||||
# Pin the HTTP listener settings in grafana.ini.
# Each regexp matches the key whether or not it is commented out with ";",
# and is anchored on "=" so it cannot match a longer key sharing the prefix.
- name: Configure Grafana
  ansible.builtin.lineinfile:
    path: /etc/grafana/grafana.ini
    regexp: "{{ item.regexp }}"
    line: "{{ item.line }}"
    # Only used when regexp finds no match: add the key under [server]
    # instead of appending at EOF, where it would fall under the last section.
    insertafter: '^\[server\]'
    state: present
  loop:
    - regexp: '^;?\s*http_port\s*='
      line: 'http_port = 3000'
    - regexp: '^;?\s*http_addr\s*='
      line: 'http_addr = 0.0.0.0'
  notify: Restart grafana
|
||||
|
||||
# Ensure the grafana-server unit is enabled at boot and currently running.
- name: Enable and start Grafana
  ansible.builtin.systemd:
    state: started
    enabled: true
    name: grafana-server
|
||||
13
ansible/roles/grafana/templates/dashboards.yml.j2
Normal file
13
ansible/roles/grafana/templates/dashboards.yml.j2
Normal file
@@ -0,0 +1,13 @@
|
||||
# Grafana dashboard provisioning: one file-based provider that loads the
# dashboards found under /var/lib/grafana/dashboards into the
# "Home Infra" folder of organisation 1.
apiVersion: 1

providers:
  - name: "Home Infrastructure"
    type: file
    orgId: 1
    folder: "Home Infra"
    folderUid: "home-infra"
    options:
      path: /var/lib/grafana/dashboards
    # Interval, in seconds, at which the path is re-scanned for changes.
    updateIntervalSeconds: 30
    allowUiUpdates: true
    disableDeletion: false
|
||||
34
ansible/roles/grafana/templates/datasources.yml.j2
Normal file
34
ansible/roles/grafana/templates/datasources.yml.j2
Normal file
@@ -0,0 +1,34 @@
|
||||
# Grafana datasource provisioning.
# The uid values are pinned because the bundled dashboard references its
# datasources by uid ("prometheus-infra" and "loki"); without an explicit uid
# Grafana generates one and those panels cannot resolve their datasource.
apiVersion: 1

datasources:
  # Local Prometheus (RPi - infrastructure metrics)
  - name: Prometheus-Infra
    uid: prometheus-infra
    type: prometheus
    access: proxy
    url: http://localhost:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: "30s"
      httpMethod: POST

  # Cluster Prometheus (Talos - Kubernetes metrics)
  - name: Prometheus-Cluster
    type: prometheus
    access: proxy
    # Quoted so an empty or oddly shaped value still renders as a YAML string.
    url: "{{ prometheus_cluster_url }}"
    isDefault: false
    editable: false
    jsonData:
      timeInterval: "30s"
      httpMethod: POST

  # Loki (Talos cluster - centralized logs)
  - name: Loki
    uid: loki
    type: loki
    access: proxy
    url: "{{ loki_url }}"
    isDefault: false
    editable: false
    jsonData:
      maxLines: 1000
|
||||
Reference in New Issue
Block a user