Add observability stack: ServiceMonitors, Tempo, OTel API instrumentation, dashboards
- Add ServiceMonitors for Traefik, ArgoCD, and Longhorn
- Enable cert-manager ServiceMonitor via helm values
- Deploy Grafana Tempo for distributed tracing (single-binary, Longhorn PVC)
- Add Tempo datasource with trace-to-logs and trace-to-metrics correlation
- Instrument API with OpenTelemetry SDK (Prometheus metrics + OTLP traces)
- Replace console.log with pino structured logging + pino-http middleware
- Add Grafana dashboards for Traefik, API overview, and PostgreSQL (CNPG)
This commit is contained in:
@@ -0,0 +1,88 @@
|
||||
---
# Grafana dashboard "API Overview" (uid: api-overview), shipped as a ConfigMap
# in the observability namespace.
# NOTE(review): the grafana_dashboard label is presumably what the Grafana
# dashboard sidecar selects on -- confirm against the Grafana Helm values.
#
# Every query filters on service_name="api" and uses the Prometheus spelling of
# the stable OpenTelemetry semantic-convention names (dots become underscores,
# duration metrics carry the "_seconds" unit suffix):
#   - http.server.request.duration  -> http_server_request_duration_seconds_*
#   - http.response.status_code     -> http_response_status_code
#   - db.client.operation.duration  -> db_client_operation_duration_seconds_*
# NOTE(review): the unit suffix depends on how the API exports metrics to
# Prometheus -- verify the exact series names against the API's /metrics output.
apiVersion: v1
kind: ConfigMap
metadata:
  name: api-overview-dashboard
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  api-overview.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "graphTooltip": 1,
      "id": null,
      "links": [],
      "panels": [
        {
          "title": "HTTP Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"api\"}[5m])) by (http_route)",
              "legendFormat": "{{ http_route }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "HTTP Error Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"api\", http_response_status_code=~\"5..\"}[5m])) by (http_route)",
              "legendFormat": "{{ http_route }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "HTTP Request Duration (p50 / p95 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p99"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "DB Query Duration",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(db_client_operation_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le, db_operation_name))",
              "legendFormat": "p95 {{ db_operation_name }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["homelab", "api", "otel"],
      "templating": { "list": [] },
      "time": { "from": "now-6h", "to": "now" },
      "title": "API Overview",
      "uid": "api-overview"
    }
|
||||
@@ -0,0 +1,81 @@
|
||||
---
# Grafana dashboard "PostgreSQL (CloudNativePG)" (uid: postgres-cnpg), shipped
# as a ConfigMap in the observability namespace.
# NOTE(review): the grafana_dashboard label is presumably what the Grafana
# dashboard sidecar selects on -- confirm against the Grafana Helm values.
#
# Panels read the cnpg_* series exposed by the CloudNativePG exporter. The
# per-database panels are pinned to datname="homelab".
# cnpg_pg_replication_lag is documented by CloudNativePG as "replication lag
# behind primary in seconds", so that panel is labelled and scaled in seconds.
apiVersion: v1
kind: ConfigMap
metadata:
  name: postgres-dashboard
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  postgres.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "graphTooltip": 1,
      "id": null,
      "links": [],
      "panels": [
        {
          "title": "Active Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "cnpg_backends_total",
              "legendFormat": "{{ pod }}"
            }
          ]
        },
        {
          "title": "Transactions per Second",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "rate(cnpg_pg_stat_database_xact_commit{datname=\"homelab\"}[5m])",
              "legendFormat": "commits {{ pod }}"
            },
            {
              "expr": "rate(cnpg_pg_stat_database_xact_rollback{datname=\"homelab\"}[5m])",
              "legendFormat": "rollbacks {{ pod }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "ops" }
          }
        },
        {
          "title": "Replication Lag (seconds)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "targets": [
            {
              "expr": "cnpg_pg_replication_lag",
              "legendFormat": "{{ pod }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "Database Size",
          "type": "stat",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "targets": [
            {
              "expr": "cnpg_pg_database_size_bytes{datname=\"homelab\"}",
              "legendFormat": "{{ pod }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "bytes" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["homelab", "postgres", "cnpg"],
      "templating": { "list": [] },
      "time": { "from": "now-6h", "to": "now" },
      "title": "PostgreSQL (CloudNativePG)",
      "uid": "postgres-cnpg"
    }
|
||||
@@ -0,0 +1,89 @@
|
||||
---
# Grafana dashboard "Traefik" (uid: traefik), shipped as a ConfigMap in the
# observability namespace.
# NOTE(review): the grafana_dashboard label is presumably what the Grafana
# dashboard sidecar selects on -- confirm against the Grafana Helm values.
#
# Four panels on a 2x2 grid, all built from traefik_entrypoint_* series:
# request rate per entrypoint, 4xx/5xx rates, latency quantiles
# (p50 / p95 / p99) and open connections per entrypoint.
apiVersion: v1
kind: ConfigMap
metadata:
  name: traefik-dashboard
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  traefik.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "graphTooltip": 1,
      "id": null,
      "links": [],
      "panels": [
        {
          "title": "Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "sum(rate(traefik_entrypoint_requests_total[5m])) by (entrypoint)",
              "legendFormat": "{{ entrypoint }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "Error Rate (4xx/5xx)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "sum(rate(traefik_entrypoint_requests_total{code=~\"4..\"}[5m]))",
              "legendFormat": "4xx"
            },
            {
              "expr": "sum(rate(traefik_entrypoint_requests_total{code=~\"5..\"}[5m]))",
              "legendFormat": "5xx"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "Request Duration (p50 / p95 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p99"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "Open Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "targets": [
            {
              "expr": "sum(traefik_entrypoint_open_connections) by (entrypoint)",
              "legendFormat": "{{ entrypoint }}"
            }
          ]
        }
      ],
      "schemaVersion": 39,
      "tags": ["homelab", "traefik", "ingress"],
      "templating": { "list": [] },
      "time": { "from": "now-6h", "to": "now" },
      "title": "Traefik",
      "uid": "traefik"
    }
|
||||
@@ -15,3 +15,22 @@ data:
|
||||
url: http://loki.observability.svc:3100
|
||||
jsonData:
|
||||
maxLines: 1000
|
||||
derivedFields:
|
||||
- datasourceUid: tempo
|
||||
matcherRegex: '"traceID":"(\w+)"'
|
||||
name: TraceID
|
||||
url: "$${__value.raw}"
|
||||
- name: Tempo
|
||||
type: tempo
|
||||
uid: tempo
|
||||
access: proxy
|
||||
url: http://tempo.observability.svc:3100
|
||||
jsonData:
|
||||
tracesToLogs:
|
||||
datasourceUid: loki
|
||||
filterByTraceID: true
|
||||
filterBySpanID: false
|
||||
tracesToMetrics:
|
||||
datasourceUid: prometheus
|
||||
serviceMap:
|
||||
datasourceUid: prometheus
|
||||
|
||||
@@ -2,5 +2,9 @@ apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- application.yaml
|
||||
- grafana-admin-sealed.yaml
|
||||
- grafana-datasources.yaml
|
||||
- dashboards/cluster-overview.yaml
|
||||
- dashboards/traefik.yaml
|
||||
- dashboards/api-overview.yaml
|
||||
- dashboards/postgres.yaml
|
||||
|
||||
@@ -4,3 +4,4 @@ resources:
|
||||
- kube-prometheus-stack/
|
||||
- loki/
|
||||
- promtail/
|
||||
- tempo/
|
||||
|
||||
41
infra/kubernetes/observability/tempo/application.yaml
Normal file
41
infra/kubernetes/observability/tempo/application.yaml
Normal file
@@ -0,0 +1,41 @@
|
||||
---
# Argo CD Application that installs the "tempo" Helm chart (version 1.12.0)
# from the Grafana chart repository into the observability namespace.
#
# NOTE(review): the nesting of the Helm values below was reconstructed on the
# assumption that retention and resources sit under `tempo` and persistence
# sits at the chart root, as in the grafana/tempo chart's values.yaml --
# confirm against the chart version in use.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: tempo
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://grafana.github.io/helm-charts
    chart: tempo
    targetRevision: 1.12.0
    helm:
      valuesObject:
        tempo:
          # OTLP ingest on all interfaces: gRPC on 4317, HTTP on 4318.
          receivers:
            otlp:
              protocols:
                grpc:
                  endpoint: '0.0.0.0:4317'
                http:
                  endpoint: '0.0.0.0:4318'
          # 168h = 7 days.
          retention: 168h
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            # Only a memory limit is set; no CPU limit is configured.
            limits:
              memory: 1Gi
        # 10Gi volume from the longhorn storage class.
        persistence:
          enabled: true
          storageClassName: longhorn
          size: 10Gi
  destination:
    server: https://kubernetes.default.svc
    namespace: observability
  syncPolicy:
    # Automated sync with pruning and self-healing enabled.
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
|
||||
4
infra/kubernetes/observability/tempo/kustomization.yaml
Normal file
4
infra/kubernetes/observability/tempo/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
---
# Kustomize entry point for the Tempo component; its only resource is the
# Application manifest in application.yaml.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - application.yaml
|
||||
Reference in New Issue
Block a user