add the ansible playbooks for the monitoring stack
This commit is contained in:
13
README.md
Normal file
13
README.md
Normal file
@@ -0,0 +1,13 @@
|
||||
The ansible playbook found here is used to deploy a monitoring stack for a homelab.
|
||||
|
||||
Detailed instructions about the project can be found in its [own repository](https://git.96-fromsofia.net/k8s/monitoring-stack).
|
||||
|
||||
- playbook.yml deploys the monitoring stack outside of the Talos cluster
|
||||
- kubernetes-playbook.yml deploys the monitoring stack inside of the Talos cluster
|
||||
|
||||
To run a playbook:
|
||||
```
|
||||
ansible-playbook -i inventory.yml playbook.yml
|
||||
```
|
||||
|
||||
Both playbooks can be run, and the two stacks are designed to coexist. This keeps the monitoring of the network layer and physical hosts active and visible even when the Talos cluster itself is not running.
|
||||
44
ansible/inventory.yml
Normal file
44
ansible/inventory.yml
Normal file
@@ -0,0 +1,44 @@
|
||||
all:
|
||||
children:
|
||||
monitoring:
|
||||
hosts:
|
||||
rpi:
|
||||
ansible_host: 192.168.1.100 # Change to your RPi IP
|
||||
ansible_user: pi # Change if different
|
||||
ansible_become: true
|
||||
|
||||
# Configuration variables
|
||||
prometheus_version: "2.48.0"
|
||||
promtail_version: "2.9.2"
|
||||
grafana_version: "10.2.2"
|
||||
snmp_exporter_version: "0.24.1"
|
||||
|
||||
# Loki endpoint (in Talos cluster)
|
||||
loki_url: "http://192.168.1.200:30100" # Change to your Talos node IP
|
||||
|
||||
# Prometheus cluster endpoint (for Grafana datasource)
|
||||
prometheus_cluster_url: "http://192.168.1.200:30090" # Change to your Talos node IP
|
||||
|
||||
# Network device IPs for SNMP
|
||||
snmp_targets:
|
||||
- name: "router"
|
||||
ip: "192.168.1.1"
|
||||
module: "if_mib"
|
||||
- name: "modem"
|
||||
ip: "192.168.1.2"
|
||||
module: "if_mib"
|
||||
|
||||
# Targets with node_exporter
|
||||
node_exporter_targets:
|
||||
- name: "proxmox"
|
||||
ip: "192.168.1.10"
|
||||
port: 9100
|
||||
- name: "nfs"
|
||||
ip: "192.168.1.11"
|
||||
port: 9100
|
||||
|
||||
# Proxmox PVE exporter target
|
||||
proxmox_targets:
|
||||
- name: "proxmox"
|
||||
ip: "192.168.1.10"
|
||||
port: 9221
|
||||
117
ansible/kubernetes-playbook.yml
Normal file
117
ansible/kubernetes-playbook.yml
Normal file
@@ -0,0 +1,117 @@
|
||||
---
|
||||
# Deploy monitoring stack to Talos cluster via Ansible
|
||||
#
|
||||
# Prerequisites:
|
||||
# - kubectl configured with access to your Talos cluster
|
||||
# - kubernetes.core collection installed: ansible-galaxy collection install kubernetes.core
|
||||
#
|
||||
# Usage:
|
||||
# ansible-playbook -i inventory.yml kubernetes-playbook.yml
|
||||
#
|
||||
# Or with a specific kubeconfig:
|
||||
# ansible-playbook -i inventory.yml kubernetes-playbook.yml -e kubeconfig_path=~/.kube/talos-config
|
||||
|
||||
- name: Deploy monitoring stack to Kubernetes
|
||||
hosts: localhost
|
||||
connection: local
|
||||
gather_facts: false
|
||||
|
||||
vars:
|
||||
kubeconfig_path: "{{ lookup('env', 'KUBECONFIG') | default('~/.kube/config', true) }}"
|
||||
manifests_dir: "{{ playbook_dir }}/kubernetes"
|
||||
|
||||
tasks:
|
||||
- name: Create monitoring namespace
|
||||
kubernetes.core.k8s:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
state: present
|
||||
src: "{{ manifests_dir }}/namespace.yaml"
|
||||
|
||||
- name: Deploy Prometheus
|
||||
kubernetes.core.k8s:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
state: present
|
||||
src: "{{ item }}"
|
||||
loop:
|
||||
- "{{ manifests_dir }}/prometheus/rbac.yaml"
|
||||
- "{{ manifests_dir }}/prometheus/configmap.yaml"
|
||||
- "{{ manifests_dir }}/prometheus/deployment.yaml"
|
||||
- "{{ manifests_dir }}/prometheus/service.yaml"
|
||||
|
||||
- name: Wait for Prometheus to be ready
|
||||
kubernetes.core.k8s_info:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
kind: Deployment
|
||||
name: prometheus
|
||||
namespace: monitoring
|
||||
register: prometheus_deployment
|
||||
until: prometheus_deployment.resources[0].status.readyReplicas | default(0) >= 1
|
||||
retries: 30
|
||||
delay: 10
|
||||
|
||||
- name: Deploy Loki
|
||||
kubernetes.core.k8s:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
state: present
|
||||
src: "{{ item }}"
|
||||
loop:
|
||||
- "{{ manifests_dir }}/loki/configmap.yaml"
|
||||
- "{{ manifests_dir }}/loki/deployment.yaml"
|
||||
- "{{ manifests_dir }}/loki/service.yaml"
|
||||
|
||||
- name: Wait for Loki to be ready
|
||||
kubernetes.core.k8s_info:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
kind: Deployment
|
||||
name: loki
|
||||
namespace: monitoring
|
||||
register: loki_deployment
|
||||
until: loki_deployment.resources[0].status.readyReplicas | default(0) >= 1
|
||||
retries: 30
|
||||
delay: 10
|
||||
|
||||
- name: Deploy Promtail
|
||||
kubernetes.core.k8s:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
state: present
|
||||
src: "{{ item }}"
|
||||
loop:
|
||||
- "{{ manifests_dir }}/promtail/rbac.yaml"
|
||||
- "{{ manifests_dir }}/promtail/configmap.yaml"
|
||||
- "{{ manifests_dir }}/promtail/daemonset.yaml"
|
||||
|
||||
- name: Deploy Node Exporter
|
||||
kubernetes.core.k8s:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
state: present
|
||||
src: "{{ manifests_dir }}/node-exporter/daemonset.yaml"
|
||||
|
||||
- name: Deploy Kube State Metrics
|
||||
kubernetes.core.k8s:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
state: present
|
||||
src: "{{ item }}"
|
||||
loop:
|
||||
- "{{ manifests_dir }}/kube-state-metrics/rbac.yaml"
|
||||
- "{{ manifests_dir }}/kube-state-metrics/deployment.yaml"
|
||||
|
||||
- name: Get cluster node IPs
|
||||
kubernetes.core.k8s_info:
|
||||
kubeconfig: "{{ kubeconfig_path }}"
|
||||
kind: Node
|
||||
register: cluster_nodes
|
||||
|
||||
- name: Display access information
|
||||
ansible.builtin.debug:
|
||||
msg:
|
||||
- "Monitoring stack deployed successfully!"
|
||||
- ""
|
||||
- "Prometheus: http://<node-ip>:30090"
|
||||
- "Loki: http://<node-ip>:30100"
|
||||
- ""
|
||||
- "Node IPs:"
|
||||
- "{{ cluster_nodes.resources | map(attribute='status.addresses') | flatten | selectattr('type', 'equalto', 'InternalIP') | map(attribute='address') | list }}"
|
||||
- ""
|
||||
- "Update your RPi inventory.yml with one of these IPs for:"
|
||||
- " loki_url: http://<node-ip>:30100"
|
||||
- " prometheus_cluster_url: http://<node-ip>:30090"
|
||||
37
ansible/playbook.yml
Normal file
37
ansible/playbook.yml
Normal file
@@ -0,0 +1,37 @@
|
||||
---
|
||||
- name: Deploy monitoring stack on RPi
|
||||
hosts: monitoring
|
||||
become: true
|
||||
|
||||
vars:
|
||||
# Architecture detection for ARM
|
||||
arch_map:
|
||||
armv7l: "armv7"
|
||||
aarch64: "arm64"
|
||||
x86_64: "amd64"
|
||||
|
||||
pre_tasks:
|
||||
- name: Gather architecture
|
||||
ansible.builtin.set_fact:
|
||||
go_arch: "{{ arch_map[ansible_architecture] | default('arm64') }}"
|
||||
|
||||
- name: Update apt cache
|
||||
ansible.builtin.apt:
|
||||
update_cache: true
|
||||
cache_valid_time: 3600
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
roles:
|
||||
- common
|
||||
- prometheus
|
||||
- snmp_exporter
|
||||
- promtail
|
||||
- grafana
|
||||
|
||||
post_tasks:
|
||||
- name: Display access information
|
||||
ansible.builtin.debug:
|
||||
msg:
|
||||
- "Grafana: http://{{ ansible_host }}:3000 (admin/admin)"
|
||||
- "Prometheus: http://{{ ansible_host }}:9090"
|
||||
- "Syslog listener: {{ ansible_host }}:514 (UDP)"
|
||||
29
ansible/roles/common/tasks/main.yml
Normal file
29
ansible/roles/common/tasks/main.yml
Normal file
@@ -0,0 +1,29 @@
|
||||
---
|
||||
- name: Install common dependencies
|
||||
ansible.builtin.apt:
|
||||
name:
|
||||
- curl
|
||||
- tar
|
||||
- gzip
|
||||
- ca-certificates
|
||||
state: present
|
||||
|
||||
- name: Create monitoring user
|
||||
ansible.builtin.user:
|
||||
name: monitoring
|
||||
system: true
|
||||
shell: /usr/sbin/nologin
|
||||
home: /var/lib/monitoring
|
||||
create_home: false
|
||||
|
||||
- name: Create common directories
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
owner: monitoring
|
||||
group: monitoring
|
||||
mode: "0755"
|
||||
loop:
|
||||
- /etc/monitoring
|
||||
- /var/lib/monitoring
|
||||
- /var/log/monitoring
|
||||
187
ansible/roles/grafana/files/dashboards/home-infra-overview.json
Normal file
187
ansible/roles/grafana/files/dashboards/home-infra-overview.json
Normal file
@@ -0,0 +1,187 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"title": "Infrastructure Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Targets Up",
|
||||
"type": "stat",
|
||||
"targets": [{ "expr": "count(up == 1)", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "green", "text": "0" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
|
||||
"id": 3,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Targets Down",
|
||||
"type": "stat",
|
||||
"targets": [{ "expr": "count(up == 0) or vector(0)", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
|
||||
"id": 4,
|
||||
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Log Lines (1h)",
|
||||
"type": "stat",
|
||||
"targets": [{ "expr": "sum(count_over_time({job=~\".+\"}[1h]))", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
|
||||
"id": 5,
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 10 }, { "color": "red", "value": 50 }] }
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Error Logs (1h)",
|
||||
"type": "stat",
|
||||
"targets": [{ "expr": "sum(count_over_time({severity=~\"err|error|crit|alert|emerg\"}[1h])) or vector(0)", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
||||
"id": 10,
|
||||
"title": "Network Devices",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "lineWidth": 1, "fillOpacity": 10 },
|
||||
"unit": "bps"
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
||||
"id": 11,
|
||||
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
|
||||
"title": "Network Interface Traffic",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{ "expr": "rate(ifHCInOctets{job=\"snmp\"}[5m]) * 8", "legendFormat": "{{device}} - {{ifDescr}} In", "refId": "A" },
|
||||
{ "expr": "rate(ifHCOutOctets{job=\"snmp\"}[5m]) * 8", "legendFormat": "{{device}} - {{ifDescr}} Out", "refId": "B" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "1": { "color": "green", "text": "Up" }, "2": { "color": "red", "text": "Down" } }, "type": "value" }
|
||||
]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
||||
"id": 12,
|
||||
"options": { "showHeader": true },
|
||||
"title": "Interface Status",
|
||||
"type": "table",
|
||||
"targets": [{ "expr": "ifOperStatus{job=\"snmp\"}", "format": "table", "instant": true, "refId": "A" }],
|
||||
"transformations": [
|
||||
{ "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true }, "renameByName": { "device": "Device", "ifDescr": "Interface", "Value": "Status" } } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 20,
|
||||
"title": "Proxmox / VMs",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 15 },
|
||||
"id": 21,
|
||||
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
|
||||
"title": "CPU Usage",
|
||||
"type": "gauge",
|
||||
"targets": [{ "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"proxmox.*|nfs.*\"}[5m])) by (instance)", "legendFormat": "{{instance}}", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 15 },
|
||||
"id": 22,
|
||||
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
|
||||
"title": "Memory Usage",
|
||||
"type": "gauge",
|
||||
"targets": [{ "expr": "1 - (node_memory_MemAvailable_bytes{instance=~\"proxmox.*|nfs.*\"} / node_memory_MemTotal_bytes{instance=~\"proxmox.*|nfs.*\"})", "legendFormat": "{{instance}}", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus-infra" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 15 },
|
||||
"id": 23,
|
||||
"options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true },
|
||||
"title": "Disk Usage",
|
||||
"type": "gauge",
|
||||
"targets": [{ "expr": "1 - (node_filesystem_avail_bytes{instance=~\"proxmox.*|nfs.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"proxmox.*|nfs.*\",mountpoint=\"/\"})", "legendFormat": "{{instance}}", "refId": "A" }]
|
||||
},
|
||||
{
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
|
||||
"id": 30,
|
||||
"title": "Recent Logs",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 22 },
|
||||
"id": 31,
|
||||
"options": { "showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending" },
|
||||
"title": "All Logs",
|
||||
"type": "logs",
|
||||
"targets": [{ "expr": "{job=~\".+\"}", "refId": "A" }]
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["home-infra"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Home Infrastructure Overview",
|
||||
"uid": "home-infra-overview",
|
||||
"version": 1
|
||||
}
|
||||
5
ansible/roles/grafana/handlers/main.yml
Normal file
5
ansible/roles/grafana/handlers/main.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
---
|
||||
- name: Restart grafana
|
||||
ansible.builtin.systemd:
|
||||
name: grafana-server
|
||||
state: restarted
|
||||
77
ansible/roles/grafana/tasks/main.yml
Normal file
77
ansible/roles/grafana/tasks/main.yml
Normal file
@@ -0,0 +1,77 @@
|
||||
---
|
||||
# get_url does not create missing parent directories, and /etc/apt/keyrings
# is not guaranteed to exist on every Debian-family release, so create it
# before downloading the key into it.
- name: Ensure APT keyrings directory exists
  ansible.builtin.file:
    path: /etc/apt/keyrings
    state: directory
    mode: "0755"

# Download the Grafana repository signing key to the keyrings directory.
- name: Add Grafana APT key
  ansible.builtin.get_url:
    url: https://apt.grafana.com/gpg.key
    dest: /etc/apt/keyrings/grafana.asc
    mode: "0644"
|
||||
|
||||
- name: Add Grafana APT repository
|
||||
ansible.builtin.apt_repository:
|
||||
repo: "deb [signed-by=/etc/apt/keyrings/grafana.asc] https://apt.grafana.com stable main"
|
||||
state: present
|
||||
filename: grafana
|
||||
|
||||
- name: Install Grafana
|
||||
ansible.builtin.apt:
|
||||
name: grafana
|
||||
state: present
|
||||
update_cache: true
|
||||
notify: Restart grafana
|
||||
|
||||
- name: Create Grafana provisioning directories
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
owner: grafana
|
||||
group: grafana
|
||||
mode: "0755"
|
||||
loop:
|
||||
- /etc/grafana/provisioning/datasources
|
||||
- /etc/grafana/provisioning/dashboards
|
||||
- /var/lib/grafana/dashboards
|
||||
|
||||
- name: Deploy Grafana datasources
|
||||
ansible.builtin.template:
|
||||
src: datasources.yml.j2
|
||||
dest: /etc/grafana/provisioning/datasources/datasources.yml
|
||||
owner: grafana
|
||||
group: grafana
|
||||
mode: "0640"
|
||||
notify: Restart grafana
|
||||
|
||||
- name: Deploy Grafana dashboard provisioning
|
||||
ansible.builtin.template:
|
||||
src: dashboards.yml.j2
|
||||
dest: /etc/grafana/provisioning/dashboards/dashboards.yml
|
||||
owner: grafana
|
||||
group: grafana
|
||||
mode: "0640"
|
||||
notify: Restart grafana
|
||||
|
||||
- name: Deploy default dashboards
|
||||
ansible.builtin.copy:
|
||||
src: "{{ item }}"
|
||||
dest: /var/lib/grafana/dashboards/
|
||||
owner: grafana
|
||||
group: grafana
|
||||
mode: "0644"
|
||||
loop: "{{ lookup('fileglob', 'files/dashboards/*.json', wantlist=True) }}"
|
||||
notify: Restart grafana
|
||||
ignore_errors: true # OK if no dashboards yet
|
||||
|
||||
- name: Configure Grafana
|
||||
ansible.builtin.lineinfile:
|
||||
path: /etc/grafana/grafana.ini
|
||||
regexp: "{{ item.regexp }}"
|
||||
line: "{{ item.line }}"
|
||||
state: present
|
||||
loop:
|
||||
- { regexp: '^;?http_port', line: 'http_port = 3000' }
|
||||
- { regexp: '^;?http_addr', line: 'http_addr = 0.0.0.0' }
|
||||
notify: Restart grafana
|
||||
|
||||
- name: Enable and start Grafana
|
||||
ansible.builtin.systemd:
|
||||
name: grafana-server
|
||||
enabled: true
|
||||
state: started
|
||||
13
ansible/roles/grafana/templates/dashboards.yml.j2
Normal file
13
ansible/roles/grafana/templates/dashboards.yml.j2
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Home Infrastructure'
|
||||
orgId: 1
|
||||
folder: 'Home Infra'
|
||||
folderUid: 'home-infra'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 30
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
34
ansible/roles/grafana/templates/datasources.yml.j2
Normal file
34
ansible/roles/grafana/templates/datasources.yml.j2
Normal file
@@ -0,0 +1,34 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
# Local Prometheus (RPi - infrastructure metrics)
|
||||
- name: Prometheus-Infra
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://localhost:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
jsonData:
|
||||
timeInterval: "30s"
|
||||
httpMethod: POST
|
||||
|
||||
# Cluster Prometheus (Talos - Kubernetes metrics)
|
||||
- name: Prometheus-Cluster
|
||||
type: prometheus
|
||||
access: proxy
|
||||
# Quoted so the rendered value is always parsed as a YAML string.
url: "{{ prometheus_cluster_url }}"
|
||||
isDefault: false
|
||||
editable: false
|
||||
jsonData:
|
||||
timeInterval: "30s"
|
||||
httpMethod: POST
|
||||
|
||||
# Loki (Talos cluster - centralized logs)
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
# Quoted so the rendered value is always parsed as a YAML string.
url: "{{ loki_url }}"
|
||||
isDefault: false
|
||||
editable: false
|
||||
jsonData:
|
||||
maxLines: 1000
|
||||
9
ansible/roles/prometheus/handlers/main.yml
Normal file
9
ansible/roles/prometheus/handlers/main.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
---
|
||||
- name: Reload systemd
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: true
|
||||
|
||||
- name: Restart prometheus
|
||||
ansible.builtin.systemd:
|
||||
name: prometheus
|
||||
state: restarted
|
||||
82
ansible/roles/prometheus/tasks/main.yml
Normal file
82
ansible/roles/prometheus/tasks/main.yml
Normal file
@@ -0,0 +1,82 @@
|
||||
---
|
||||
- name: Create Prometheus directories
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
owner: monitoring
|
||||
group: monitoring
|
||||
mode: "0755"
|
||||
loop:
|
||||
- /etc/prometheus
|
||||
- /var/lib/prometheus
|
||||
|
||||
- name: Check if Prometheus is installed
|
||||
ansible.builtin.stat:
|
||||
path: /usr/local/bin/prometheus
|
||||
register: prometheus_binary
|
||||
|
||||
- name: Get installed Prometheus version
|
||||
ansible.builtin.command: /usr/local/bin/prometheus --version
|
||||
register: prometheus_installed_version
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: prometheus_binary.stat.exists
|
||||
|
||||
- name: Download Prometheus
|
||||
ansible.builtin.get_url:
|
||||
url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz"
|
||||
dest: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
|
||||
mode: "0644"
|
||||
when: not prometheus_binary.stat.exists or prometheus_version not in (prometheus_installed_version.stdout | default(''))
|
||||
|
||||
- name: Extract Prometheus
|
||||
ansible.builtin.unarchive:
|
||||
src: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
|
||||
dest: /tmp
|
||||
remote_src: true
|
||||
when: not prometheus_binary.stat.exists or prometheus_version not in (prometheus_installed_version.stdout | default(''))
|
||||
|
||||
- name: Install Prometheus binaries
|
||||
ansible.builtin.copy:
|
||||
src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/{{ item }}"
|
||||
dest: "/usr/local/bin/{{ item }}"
|
||||
mode: "0755"
|
||||
remote_src: true
|
||||
loop:
|
||||
- prometheus
|
||||
- promtool
|
||||
notify: Restart prometheus
|
||||
when: not prometheus_binary.stat.exists or prometheus_version not in (prometheus_installed_version.stdout | default(''))
|
||||
|
||||
- name: Deploy Prometheus configuration
|
||||
ansible.builtin.template:
|
||||
src: prometheus.yml.j2
|
||||
dest: /etc/prometheus/prometheus.yml
|
||||
owner: monitoring
|
||||
group: monitoring
|
||||
mode: "0644"
|
||||
notify: Restart prometheus
|
||||
|
||||
- name: Deploy Prometheus systemd service
|
||||
ansible.builtin.template:
|
||||
src: prometheus.service.j2
|
||||
dest: /etc/systemd/system/prometheus.service
|
||||
mode: "0644"
|
||||
notify:
|
||||
- Reload systemd
|
||||
- Restart prometheus
|
||||
|
||||
- name: Enable and start Prometheus
|
||||
ansible.builtin.systemd:
|
||||
name: prometheus
|
||||
enabled: true
|
||||
state: started
|
||||
daemon_reload: true
|
||||
|
||||
- name: Clean up downloaded files
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
|
||||
- "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}"
|
||||
30
ansible/roles/prometheus/templates/prometheus.service.j2
Normal file
30
ansible/roles/prometheus/templates/prometheus.service.j2
Normal file
@@ -0,0 +1,30 @@
|
||||
[Unit]
|
||||
Description=Prometheus Monitoring System
|
||||
Documentation=https://prometheus.io/docs/
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=monitoring
|
||||
Group=monitoring
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
ExecStart=/usr/local/bin/prometheus \
|
||||
--config.file=/etc/prometheus/prometheus.yml \
|
||||
--storage.tsdb.path=/var/lib/prometheus \
|
||||
--storage.tsdb.retention.time=15d \
|
||||
--web.listen-address=0.0.0.0:9090 \
|
||||
--web.enable-lifecycle \
|
||||
--log.level=info
|
||||
|
||||
SyslogIdentifier=prometheus
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
|
||||
# Hardening
|
||||
NoNewPrivileges=true
|
||||
ProtectSystem=full
|
||||
ProtectHome=true
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
68
ansible/roles/prometheus/templates/prometheus.yml.j2
Normal file
68
ansible/roles/prometheus/templates/prometheus.yml.j2
Normal file
@@ -0,0 +1,68 @@
|
||||
global:
|
||||
scrape_interval: 30s
|
||||
evaluation_interval: 30s
|
||||
external_labels:
|
||||
monitor: 'home-infra'
|
||||
source: 'rpi'
|
||||
|
||||
scrape_configs:
|
||||
# Self-monitoring
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
labels:
|
||||
instance: 'rpi-prometheus'
|
||||
|
||||
# SNMP targets (network devices)
|
||||
{% if snmp_targets is defined and snmp_targets | length > 0 %}
  # Scrapes network devices through the local snmp_exporter.
  # One static_config entry is rendered per item of `snmp_targets`; each item
  # supplies `ip`, `name` and, optionally, `module` (falls back to if_mib).
  - job_name: 'snmp'
    scrape_interval: 60s
    scrape_timeout: 30s
    static_configs:
{% for target in snmp_targets %}
      - targets: ['{{ target.ip }}']
        labels:
          device: '{{ target.name }}'
          # Prometheus sends every __param_<name> label as the URL parameter
          # <name>, so this selects the exporter module for this device only.
          # No job-level `params.module` is set, because it would replace
          # this per-target value.
          __param_module: '{{ target.module | default("if_mib") }}'
{% endfor %}
    metrics_path: /snmp
    relabel_configs:
      # Hand the device address to the exporter as ?target=<device>.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the device address as the `instance` label on scraped series.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual HTTP request to the exporter, not to the device.
      - target_label: __address__
        replacement: localhost:9116
{% endif %}
|
||||
|
||||
# Node exporter targets (VMs with node_exporter)
|
||||
{% if node_exporter_targets is defined and node_exporter_targets | length > 0 %}
|
||||
- job_name: 'node'
|
||||
static_configs:
|
||||
{% for target in node_exporter_targets %}
|
||||
- targets: ['{{ target.ip }}:{{ target.port | default(9100) }}']
|
||||
labels:
|
||||
instance: '{{ target.name }}'
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
# Proxmox PVE exporter
|
||||
{% if proxmox_targets is defined and proxmox_targets | length > 0 %}
|
||||
- job_name: 'proxmox'
|
||||
scrape_interval: 60s
|
||||
static_configs:
|
||||
{% for target in proxmox_targets %}
|
||||
- targets: ['{{ target.ip }}:{{ target.port | default(9221) }}']
|
||||
labels:
|
||||
instance: '{{ target.name }}'
|
||||
{% endfor %}
|
||||
metrics_path: /pve
|
||||
params:
|
||||
module: [default]
|
||||
{% endif %}
|
||||
|
||||
# SNMP exporter self-metrics
|
||||
- job_name: 'snmp-exporter'
|
||||
static_configs:
|
||||
- targets: ['localhost:9116']
|
||||
9
ansible/roles/promtail/handlers/main.yml
Normal file
9
ansible/roles/promtail/handlers/main.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
---
|
||||
- name: Reload systemd
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: true
|
||||
|
||||
- name: Restart promtail
|
||||
ansible.builtin.systemd:
|
||||
name: promtail
|
||||
state: restarted
|
||||
78
ansible/roles/promtail/tasks/main.yml
Normal file
78
ansible/roles/promtail/tasks/main.yml
Normal file
@@ -0,0 +1,78 @@
|
||||
---
|
||||
- name: Create Promtail directories
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
owner: monitoring
|
||||
group: monitoring
|
||||
mode: "0755"
|
||||
loop:
|
||||
- /etc/promtail
|
||||
- /var/lib/promtail
|
||||
|
||||
# Install or upgrade Promtail.
# The download/extract/install steps run when the binary is missing OR when
# the installed binary does not report `promtail_version`, so changing the
# version in the inventory results in an upgrade on the next run.

- name: Check if Promtail is installed
  ansible.builtin.stat:
    path: /usr/local/bin/promtail
  register: promtail_binary

# Read-only probe: never reports a change and never fails the play.
- name: Get installed Promtail version
  ansible.builtin.command: /usr/local/bin/promtail --version
  register: promtail_installed_version
  changed_when: false
  failed_when: false
  when: promtail_binary.stat.exists

- name: Download Promtail
  ansible.builtin.get_url:
    url: "https://github.com/grafana/loki/releases/download/v{{ promtail_version }}/promtail-linux-{{ go_arch }}.zip"
    dest: "/tmp/promtail-{{ promtail_version }}.zip"
    mode: "0644"
  # `default('')` covers the case where the version probe was skipped.
  when: not promtail_binary.stat.exists or promtail_version not in (promtail_installed_version.stdout | default(''))

# unarchive needs the unzip binary on the target to unpack .zip files.
- name: Install unzip
  ansible.builtin.apt:
    name: unzip
    state: present
  when: ansible_os_family == "Debian"

- name: Extract Promtail
  ansible.builtin.unarchive:
    src: "/tmp/promtail-{{ promtail_version }}.zip"
    dest: /tmp
    remote_src: true
  when: not promtail_binary.stat.exists or promtail_version not in (promtail_installed_version.stdout | default(''))

- name: Install Promtail binary
  ansible.builtin.copy:
    src: "/tmp/promtail-linux-{{ go_arch }}"
    dest: /usr/local/bin/promtail
    mode: "0755"
    remote_src: true
  notify: Restart promtail
  when: not promtail_binary.stat.exists or promtail_version not in (promtail_installed_version.stdout | default(''))
|
||||
|
||||
- name: Deploy Promtail configuration
|
||||
ansible.builtin.template:
|
||||
src: promtail.yml.j2
|
||||
dest: /etc/promtail/promtail.yml
|
||||
owner: monitoring
|
||||
group: monitoring
|
||||
mode: "0644"
|
||||
notify: Restart promtail
|
||||
|
||||
- name: Deploy Promtail systemd service
|
||||
ansible.builtin.template:
|
||||
src: promtail.service.j2
|
||||
dest: /etc/systemd/system/promtail.service
|
||||
mode: "0644"
|
||||
notify:
|
||||
- Reload systemd
|
||||
- Restart promtail
|
||||
|
||||
- name: Enable and start Promtail
|
||||
ansible.builtin.systemd:
|
||||
name: promtail
|
||||
enabled: true
|
||||
state: started
|
||||
daemon_reload: true
|
||||
|
||||
- name: Clean up downloaded files
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- "/tmp/promtail-{{ promtail_version }}.zip"
|
||||
- "/tmp/promtail-linux-{{ go_arch }}"
|
||||
24
ansible/roles/promtail/templates/promtail.service.j2
Normal file
24
ansible/roles/promtail/templates/promtail.service.j2
Normal file
@@ -0,0 +1,24 @@
|
||||
[Unit]
|
||||
Description=Promtail Log Collector
|
||||
Documentation=https://grafana.com/docs/loki/latest/clients/promtail/
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=root
|
||||
Group=root
|
||||
ExecStart=/usr/local/bin/promtail \
|
||||
-config.file=/etc/promtail/promtail.yml \
|
||||
-config.expand-env=true
|
||||
|
||||
SyslogIdentifier=promtail
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
|
||||
# Need root for syslog port 514 and journal access
|
||||
# Can use CAP_NET_BIND_SERVICE instead if preferred
|
||||
AmbientCapabilities=CAP_NET_BIND_SERVICE
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
56
ansible/roles/promtail/templates/promtail.yml.j2
Normal file
56
ansible/roles/promtail/templates/promtail.yml.j2
Normal file
@@ -0,0 +1,56 @@
|
||||
# Promtail configuration (Jinja2 template: promtail.yml.j2).

# Listener ports for Promtail's own HTTP and gRPC servers.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

# File in which Promtail stores its read positions.
positions:
  filename: /var/lib/promtail/positions.yaml

# Loki push endpoint; the base URL comes from the loki_url variable.
clients:
  # Quoted so the rendered value is parsed as one string even if loki_url
  # contains YAML-significant sequences such as " #" or ": ".
  - url: "{{ loki_url }}/loki/api/v1/push"
    tenant_id: home-infra
    batchwait: 1s
    batchsize: 1048576
    timeout: 10s
|
||||
|
||||
# Log sources: a UDP syslog listener for network devices and the local
# systemd journal.
scrape_configs:
  # Syslog listener for network devices
  - job_name: syslog
    syslog:
      listen_address: 0.0.0.0:514
      listen_protocol: udp
      idle_timeout: 60s
      label_structured_data: true
      labels:
        job: syslog
        source: network-devices
    # Promote syslog message metadata to queryable labels.
    relabel_configs:
      - source_labels: ['__syslog_message_hostname']
        target_label: 'host'
      - source_labels: ['__syslog_message_severity']
        target_label: 'severity'
      - source_labels: ['__syslog_message_facility']
        target_label: 'facility'
      - source_labels: ['__syslog_message_app_name']
        target_label: 'app'
    pipeline_stages:
      - match:
          selector: '{job="syslog"}'
          stages:
            # Extract common patterns from network device logs
            - regex:
                expression: '(?P<src_ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            # NOTE(review): src_ip becomes a label with one value per distinct
            # address seen in log lines; this may create many streams in
            # Loki — confirm this cardinality is acceptable.
            - labels:
                src_ip:

  # Local system journal (logs of the host Promtail runs on)
  - job_name: journal
    journal:
      max_age: 12h
      labels:
        job: systemd-journal
        # Taken from the Ansible inventory name instead of a hard-coded
        # value, so the label follows the host the template is rendered for.
        host: "{{ inventory_hostname }}"
    relabel_configs:
      - source_labels: ['__journal__systemd_unit']
        target_label: 'unit'
      - source_labels: ['__journal_priority_keyword']
        target_label: 'severity'
|
||||
9
ansible/roles/snmp_exporter/handlers/main.yml
Normal file
9
ansible/roles/snmp_exporter/handlers/main.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
---
# Handlers for the snmp_exporter role.

# Ask systemd to re-read its unit files.
- name: Reload systemd
  ansible.builtin.systemd:
    daemon_reload: true

# Restart the snmp_exporter service.
- name: Restart snmp_exporter
  ansible.builtin.systemd:
    state: restarted
    name: snmp_exporter
|
||||
71
ansible/roles/snmp_exporter/tasks/main.yml
Normal file
71
ansible/roles/snmp_exporter/tasks/main.yml
Normal file
@@ -0,0 +1,71 @@
|
||||
---
# Tasks for the snmp_exporter role.

# Ensure the configuration directory exists and belongs to the monitoring
# user/group.
- name: Create SNMP exporter directory
  ansible.builtin.file:
    state: directory
    path: /etc/snmp_exporter
    mode: "0755"
    group: monitoring
    owner: monitoring
|
||||
|
||||
# Record whether the binary is already present; the download, extract and
# install tasks below are gated on this result.
- name: Check if SNMP exporter is installed
  register: snmp_exporter_binary
  ansible.builtin.stat:
    path: /usr/local/bin/snmp_exporter
|
||||
|
||||
# Fetch the release tarball for the configured version and architecture
# into /tmp, only when no binary was found.
- name: Download SNMP exporter
  when: not snmp_exporter_binary.stat.exists
  ansible.builtin.get_url:
    dest: "/tmp/snmp_exporter-{{ snmp_exporter_version }}.tar.gz"
    url: "https://github.com/prometheus/snmp_exporter/releases/download/v{{ snmp_exporter_version }}/snmp_exporter-{{ snmp_exporter_version }}.linux-{{ go_arch }}.tar.gz"
    mode: "0644"
|
||||
|
||||
# Unpack the downloaded tarball under /tmp on the managed host
# (remote_src), only when no binary was found.
- name: Extract SNMP exporter
  when: not snmp_exporter_binary.stat.exists
  ansible.builtin.unarchive:
    remote_src: true
    dest: /tmp
    src: "/tmp/snmp_exporter-{{ snmp_exporter_version }}.tar.gz"
|
||||
|
||||
# Copy the extracted binary into /usr/local/bin (source path is on the
# managed host) and notify the restart handler.
- name: Install SNMP exporter binary
  when: not snmp_exporter_binary.stat.exists
  notify:
    - Restart snmp_exporter
  ansible.builtin.copy:
    remote_src: true
    dest: /usr/local/bin/snmp_exporter
    src: "/tmp/snmp_exporter-{{ snmp_exporter_version }}.linux-{{ go_arch }}/snmp_exporter"
    mode: "0755"
|
||||
|
||||
# Copy the snmp.yml shipped in the release archive to /etc/snmp_exporter.
# Gated on the same "binary missing" check as the binary install.
- name: Install default SNMP exporter config
  when: not snmp_exporter_binary.stat.exists
  notify:
    - Restart snmp_exporter
  ansible.builtin.copy:
    remote_src: true
    dest: /etc/snmp_exporter/snmp.yml
    src: "/tmp/snmp_exporter-{{ snmp_exporter_version }}.linux-{{ go_arch }}/snmp.yml"
    mode: "0644"
    group: monitoring
    owner: monitoring
|
||||
|
||||
# Install the snmp_exporter.service unit from its template. When the unit
# file changes, "Reload systemd" and "Restart snmp_exporter" are notified.
- name: Deploy SNMP exporter systemd service
  notify:
    - Reload systemd
    - Restart snmp_exporter
  ansible.builtin.template:
    dest: /etc/systemd/system/snmp_exporter.service
    src: snmp_exporter.service.j2
    mode: "0644"
|
||||
|
||||
# Enable the snmp_exporter unit and make sure it is started; daemon_reload
# is requested as part of the same module call.
- name: Enable and start SNMP exporter
  ansible.builtin.systemd:
    daemon_reload: true
    name: snmp_exporter
    state: started
    enabled: true
|
||||
|
||||
# Remove the downloaded tarball and the extracted release directory
# from /tmp.
- name: Clean up downloaded files
  loop:
    - "/tmp/snmp_exporter-{{ snmp_exporter_version }}.tar.gz"
    - "/tmp/snmp_exporter-{{ snmp_exporter_version }}.linux-{{ go_arch }}"
  ansible.builtin.file:
    state: absent
    path: "{{ item }}"
|
||||
@@ -0,0 +1,25 @@
|
||||
# systemd unit for the Prometheus SNMP exporter.
[Unit]
Description=SNMP Exporter for Prometheus
Documentation=https://github.com/prometheus/snmp_exporter
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
# Runs as the unprivileged monitoring account.
Group=monitoring
User=monitoring
ExecStart=/usr/local/bin/snmp_exporter --config.file=/etc/snmp_exporter/snmp.yml --web.listen-address=0.0.0.0:9116 --log.level=info

SyslogIdentifier=snmp_exporter
RestartSec=5
Restart=always

# Sandboxing options applied to the service process.
ProtectHome=true
ProtectSystem=full
NoNewPrivileges=true

[Install]
WantedBy=multi-user.target
|
||||
Reference in New Issue
Block a user