commit 5a06798d5c57220188ff0ae1361ac91f69deb7ae Author: tsvetkov Date: Fri Feb 27 01:18:34 2026 +0000 Add the monitoring stack diff --git a/README.md b/README.md new file mode 100644 index 0000000..047c2a3 --- /dev/null +++ b/README.md @@ -0,0 +1,80 @@ +# Home Infrastructure Monitoring Stack + +## Overview + +This is intended to monitor a homelab environment consisting of: +- Proxmox +- Kubernetes running on Proxmox +- Home router / Firewall via SNMP + +As this is using Prometheus, it means if your router, IoT device or any other data source you want to monitor has a probe - it is absolutely achievable. + +## Hardware + +Utilising what is lying around, in my case this is an old model 3 RPi and 4 VMs running a Talos cluster. +This setup is heavily tied to my own infra but a majority of what you will find here is easily adaptable. + +## Component Summary + +| Component | Location | Purpose | +|-----------|----------|---------| +| Grafana | RPi | Single UI for all metrics and logs | +| Prometheus (infra) | RPi | Scrapes network devices, Proxmox, NFS VM | +| Prometheus (cluster) | Talos | Scrapes Kubernetes workloads and nodes | +| Loki | Talos | Centralized log storage | +| Promtail (syslog) | RPi | Receives syslog from network devices, forwards to Loki | +| Promtail (k8s) | Talos | Collects container and Talos logs | +| SNMP Exporter | RPi | Translates SNMP to Prometheus metrics | +| Node Exporter | Talos (DaemonSet) | Host-level metrics for Talos nodes | +| Kube State Metrics | Talos | Kubernetes object metrics | + +## Directory Structure + +**NOTE:** The ansible directory can be downloaded from its own [repository](). 
+ +``` +monitoring-stack/ +├── README.md # This file +├── ansible/ # RPi setup +│ ├── inventory.yml +│ ├── playbook.yml +│ └── roles/ +│ ├── common/ +│ ├── prometheus/ +│ ├── promtail/ +│ └── grafana/ +└── kubernetes/ # Talos cluster manifests + ├── namespace.yaml + ├── prometheus/ + ├── loki/ + ├── promtail/ + ├── node-exporter/ + └── kube-state-metrics/ +``` + +## Deployment Order + +1. **RPi Setup** (Ansible) + ```bash + cd ansible + ansible-playbook -i inventory.yml playbook.yml + ``` + +2. **Talos Cluster** (kubectl/Ansible) + ```bash + kubectl apply -f kubernetes/namespace.yaml + kubectl apply -f kubernetes/prometheus/ + kubectl apply -f kubernetes/loki/ + kubectl apply -f kubernetes/promtail/ + kubectl apply -f kubernetes/node-exporter/ + kubectl apply -f kubernetes/kube-state-metrics/ + ``` + +3. **Configure Network Devices** + - Point syslog to RPi IP:514 (UDP) + - Enable SNMP on devices + +4. **Add Data Sources in Grafana** + - Prometheus (local): `http://localhost:9090` + - Prometheus (cluster): `http://<node-ip>:30090` + - Loki (cluster): `http://<node-ip>:30100` diff --git a/kubernetes/kube-state-metrics/deployment.yaml b/kubernetes/kube-state-metrics/deployment.yaml new file mode 100644 index 0000000..4b05b35 --- /dev/null +++ b/kubernetes/kube-state-metrics/deployment.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring + labels: + app: kube-state-metrics +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + spec: + serviceAccountName: kube-state-metrics + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: kube-state-metrics + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.10.1 + args: + - "--port=8080" + - "--telemetry-port=8081" + ports: + - name: http-metrics + containerPort: 
8080 + protocol: TCP + - name: telemetry + containerPort: 8081 + protocol: TCP + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "100m" + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 +--- +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: monitoring + labels: + app: kube-state-metrics + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" +spec: + type: ClusterIP + ports: + - name: http-metrics + port: 8080 + targetPort: http-metrics + protocol: TCP + - name: telemetry + port: 8081 + targetPort: telemetry + protocol: TCP + selector: + app: kube-state-metrics diff --git a/kubernetes/kube-state-metrics/rbac.yaml b/kubernetes/kube-state-metrics/rbac.yaml new file mode 100644 index 0000000..5c37b8d --- /dev/null +++ b/kubernetes/kube-state-metrics/rbac.yaml @@ -0,0 +1,92 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: + - apiGroups: [""] + resources: + - configmaps + - secrets + - nodes + - pods + - services + - serviceaccounts + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + - apiGroups: ["authentication.k8s.io"] + resources: + - tokenreviews + verbs: ["create"] + - apiGroups: ["authorization.k8s.io"] + resources: + - 
subjectaccessreviews + verbs: ["create"] + - apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + - apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + - volumeattachments + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + - ingresses + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/kubernetes/loki/configmap.yaml b/kubernetes/loki/configmap.yaml new file mode 100644 index 0000000..d05a34d --- /dev/null +++ b/kubernetes/loki/configmap.yaml @@ -0,0 +1,67 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: loki-config + namespace: monitoring +data: + loki.yaml: | + auth_enabled: false + + server: + http_listen_port: 3100 + grpc_listen_port: 9096 + log_level: info + + common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + + query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + + schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + + ruler: + alertmanager_url: http://localhost:9093 + + limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h + 
ingestion_rate_mb: 4 + ingestion_burst_size_mb: 6 + max_streams_per_user: 10000 + max_line_size: 256kb + + # Compactor for retention + compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + delete_request_store: filesystem + + # 7 day retention + table_manager: + retention_deletes_enabled: true + retention_period: 168h diff --git a/kubernetes/loki/deployment.yaml b/kubernetes/loki/deployment.yaml new file mode 100644 index 0000000..7610f42 --- /dev/null +++ b/kubernetes/loki/deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loki + namespace: monitoring + labels: + app: loki +spec: + replicas: 1 + selector: + matchLabels: + app: loki + template: + metadata: + labels: + app: loki + spec: + securityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + containers: + - name: loki + image: grafana/loki:2.9.2 + args: + - "-config.file=/etc/loki/loki.yaml" + ports: + - name: http + containerPort: 3100 + protocol: TCP + - name: grpc + containerPort: 9096 + protocol: TCP + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "300m" + volumeMounts: + - name: config + mountPath: /etc/loki + - name: storage + mountPath: /loki + livenessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 45 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 45 + periodSeconds: 10 + volumes: + - name: config + configMap: + name: loki-config + - name: storage + emptyDir: {} + # For production, replace emptyDir with PVC: + # - name: storage + # persistentVolumeClaim: + # claimName: loki-storage diff --git a/kubernetes/loki/service.yaml b/kubernetes/loki/service.yaml new file mode 100644 index 0000000..8f09f15 --- /dev/null +++ b/kubernetes/loki/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service 
+metadata: + name: loki + namespace: monitoring + labels: + app: loki +spec: + type: NodePort + ports: + - name: http + port: 3100 + targetPort: 3100 + nodePort: 30100 + protocol: TCP + selector: + app: loki diff --git a/kubernetes/namespace.yaml b/kubernetes/namespace.yaml new file mode 100644 index 0000000..90d12ef --- /dev/null +++ b/kubernetes/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring + labels: + name: monitoring diff --git a/kubernetes/node-exporter/daemonset.yaml b/kubernetes/node-exporter/daemonset.yaml new file mode 100644 index 0000000..e7eeb7e --- /dev/null +++ b/kubernetes/node-exporter/daemonset.yaml @@ -0,0 +1,115 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-exporter + namespace: monitoring +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: monitoring + labels: + app: node-exporter +spec: + selector: + matchLabels: + app: node-exporter + template: + metadata: + labels: + app: node-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9100" + spec: + serviceAccountName: node-exporter + hostNetwork: true + hostPID: true + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + containers: + - name: node-exporter + image: prom/node-exporter:v1.7.0 + args: + - "--path.procfs=/host/proc" + - "--path.sysfs=/host/sys" + - "--path.rootfs=/host/root" + - "--web.listen-address=:9100" + - "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)" + - "--collector.netclass.ignored-devices=^(veth.*|cali.*|docker.*|flannel.*)$" + ports: + - name: metrics + containerPort: 9100 + hostPort: 9100 + protocol: TCP + resources: + requests: + memory: "20Mi" + cpu: "10m" + limits: + memory: "50Mi" + cpu: "100m" + securityContext: + runAsNonRoot: true + runAsUser: 65534 
+ volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + - name: root + mountPath: /host/root + readOnly: true + mountPropagation: HostToContainer + livenessProbe: + httpGet: + path: / + port: metrics + initialDelaySeconds: 5 + periodSeconds: 15 + readinessProbe: + httpGet: + path: / + port: metrics + initialDelaySeconds: 5 + periodSeconds: 15 + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + - name: root + hostPath: + path: / +--- +apiVersion: v1 +kind: Service +metadata: + name: node-exporter + namespace: monitoring + labels: + app: node-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9100" +spec: + type: ClusterIP + clusterIP: None + ports: + - name: metrics + port: 9100 + targetPort: 9100 + protocol: TCP + selector: + app: node-exporter diff --git a/kubernetes/prometheus/configmap.yaml b/kubernetes/prometheus/configmap.yaml new file mode 100644 index 0000000..ce9db23 --- /dev/null +++ b/kubernetes/prometheus/configmap.yaml @@ -0,0 +1,143 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: monitoring +data: + prometheus.yml: | + global: + scrape_interval: 30s + evaluation_interval: 30s + external_labels: + monitor: 'talos-cluster' + cluster: 'home' + + scrape_configs: + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Kubernetes API server + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + # Kubernetes nodes 
(kubelet) + - job_name: 'kubernetes-nodes' + kubernetes_sd_configs: + - role: node + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + + # Kubernetes nodes (cAdvisor) + - job_name: 'kubernetes-cadvisor' + kubernetes_sd_configs: + - role: node + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + + # Kubernetes service endpoints + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + + # Kubernetes pods + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + # Node exporter (DaemonSet) + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + action: keep + regex: node-exporter + - source_labels: [__meta_kubernetes_endpoint_node_name] + target_label: node + + # Kube State Metrics + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.monitoring.svc.cluster.local:8080'] diff --git a/kubernetes/prometheus/deployment.yaml b/kubernetes/prometheus/deployment.yaml new file mode 100644 index 0000000..f05814c --- 
/dev/null +++ b/kubernetes/prometheus/deployment.yaml @@ -0,0 +1,73 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: monitoring + labels: + app: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + spec: + serviceAccountName: prometheus + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: prometheus + image: prom/prometheus:v2.48.0 + args: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--storage.tsdb.retention.time=15d" + - "--web.listen-address=0.0.0.0:9090" + - "--web.enable-lifecycle" + - "--web.enable-admin-api" + ports: + - name: http + containerPort: 9090 + protocol: TCP + resources: + requests: + memory: "512Mi" + cpu: "200m" + limits: + memory: "1Gi" + cpu: "500m" + volumeMounts: + - name: config + mountPath: /etc/prometheus + - name: storage + mountPath: /prometheus + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 30 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: prometheus-config + - name: storage + emptyDir: {} + # For production, replace emptyDir with PVC: + # - name: storage + # persistentVolumeClaim: + # claimName: prometheus-storage diff --git a/kubernetes/prometheus/rbac.yaml b/kubernetes/prometheus/rbac.yaml new file mode 100644 index 0000000..f211576 --- /dev/null +++ b/kubernetes/prometheus/rbac.yaml @@ -0,0 +1,40 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - 
endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics", "/metrics/cadvisor"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: monitoring diff --git a/kubernetes/prometheus/service.yaml b/kubernetes/prometheus/service.yaml new file mode 100644 index 0000000..dc5aa78 --- /dev/null +++ b/kubernetes/prometheus/service.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring + labels: + app: prometheus + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" +spec: + type: NodePort + ports: + - name: http + port: 9090 + targetPort: 9090 + nodePort: 30090 + protocol: TCP + selector: + app: prometheus diff --git a/kubernetes/promtail/configmap.yaml b/kubernetes/promtail/configmap.yaml new file mode 100644 index 0000000..60911a4 --- /dev/null +++ b/kubernetes/promtail/configmap.yaml @@ -0,0 +1,85 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: promtail-config + namespace: monitoring +data: + promtail.yaml: | + server: + http_listen_port: 3101 + grpc_listen_port: 0 + + positions: + filename: /run/promtail/positions.yaml + + clients: + - url: http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push + tenant_id: talos-cluster + + scrape_configs: + # Container logs from /var/log/pods + - job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + pipeline_stages: + - cri: {} + relabel_configs: + # Only scrape pods with promtail.io/scrape annotation (or all by default) + - source_labels: + - __meta_kubernetes_pod_annotation_promtail_io_scrape + action: drop + regex: false + + # Use pod name as instance + - source_labels: + - 
__meta_kubernetes_pod_name + target_label: instance + + # Namespace label + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + + # Pod name label + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + + # Container name label + - source_labels: + - __meta_kubernetes_pod_container_name + target_label: container + + # Node name label + - source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + + # App label (from pod labels) + - source_labels: + - __meta_kubernetes_pod_label_app + target_label: app + + # App.kubernetes.io/name label + - source_labels: + - __meta_kubernetes_pod_label_app_kubernetes_io_name + target_label: app + regex: (.+) + action: replace + + # Set path to container log file + - source_labels: + - __meta_kubernetes_pod_uid + - __meta_kubernetes_pod_container_name + target_label: __path__ + separator: / + replacement: /var/log/pods/*$1/*.log + + # Talos system logs (if mounted) + - job_name: talos-system + static_configs: + - targets: + - localhost + labels: + job: talos-system + __path__: /var/log/containers/*.log diff --git a/kubernetes/promtail/daemonset.yaml b/kubernetes/promtail/daemonset.yaml new file mode 100644 index 0000000..3d21759 --- /dev/null +++ b/kubernetes/promtail/daemonset.yaml @@ -0,0 +1,93 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: promtail + namespace: monitoring + labels: + app: promtail +spec: + selector: + matchLabels: + app: promtail + template: + metadata: + labels: + app: promtail + spec: + serviceAccountName: promtail + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + containers: + - name: promtail + image: grafana/promtail:2.9.2 + args: + - "-config.file=/etc/promtail/promtail.yaml" + env: + - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + ports: + - name: http + containerPort: 3101 + protocol: 
TCP + resources: + requests: + memory: "50Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "100m" + securityContext: + readOnlyRootFilesystem: true + runAsUser: 0 + runAsGroup: 0 + volumeMounts: + - name: config + mountPath: /etc/promtail + - name: run + mountPath: /run/promtail + # Mount pod logs + - name: pods + mountPath: /var/log/pods + readOnly: true + # Mount container logs (for CRI-O / containerd) + - name: containers + mountPath: /var/log/containers + readOnly: true + # Machine-id for consistent instance identification + - name: machine-id + mountPath: /etc/machine-id + readOnly: true + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + volumes: + - name: config + configMap: + name: promtail-config + - name: run + emptyDir: {} + - name: pods + hostPath: + path: /var/log/pods + - name: containers + hostPath: + path: /var/log/containers + - name: machine-id + hostPath: + path: /etc/machine-id diff --git a/kubernetes/promtail/rbac.yaml b/kubernetes/promtail/rbac.yaml new file mode 100644 index 0000000..05334d4 --- /dev/null +++ b/kubernetes/promtail/rbac.yaml @@ -0,0 +1,33 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: promtail + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: promtail +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: promtail +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: promtail +subjects: + - kind: ServiceAccount + name: promtail + namespace: monitoring