Add observability stack: ServiceMonitors, Tempo, OTel API instrumentation, dashboards

- Add ServiceMonitors for Traefik, ArgoCD, and Longhorn
- Enable cert-manager ServiceMonitor via helm values
- Deploy Grafana Tempo for distributed tracing (single-binary, Longhorn PVC)
- Add Tempo datasource with trace-to-logs and trace-to-metrics correlation
- Instrument API with OpenTelemetry SDK (Prometheus metrics + OTLP traces)
- Replace console.log with pino structured logging + pino-http middleware
- Add Grafana dashboards for Traefik, API overview, and PostgreSQL (CNPG)
This commit is contained in:
Julia McGhee
2026-03-20 21:00:48 +00:00
parent 8a23d5d5f6
commit 051c957347
23 changed files with 2259 additions and 11 deletions

View File

@@ -21,7 +21,12 @@ spec:
image: ghcr.io/lazorgurl/homelab-api:latest image: ghcr.io/lazorgurl/homelab-api:latest
ports: ports:
- containerPort: 4000 - containerPort: 4000
name: http
- containerPort: 9464
name: metrics
env: env:
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: http://tempo.observability.svc:4318
- name: DATABASE_URL - name: DATABASE_URL
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:

View File

@@ -6,3 +6,4 @@ resources:
- ingress.yaml - ingress.yaml
- ghcr-pull-secret-sealed.yaml - ghcr-pull-secret-sealed.yaml
- api-secrets-sealed.yaml - api-secrets-sealed.yaml
- servicemonitor.yaml

View File

@@ -8,5 +8,10 @@ spec:
- port: 80 - port: 80
targetPort: 4000 targetPort: 4000
protocol: TCP protocol: TCP
name: http
- port: 9464
targetPort: 9464
protocol: TCP
name: metrics
selector: selector:
app: api app: api

View File

@@ -0,0 +1,14 @@
# ServiceMonitor "api": selects Services labelled app=api and scrapes the
# port named "metrics" at /metrics every 30 seconds.
# NOTE(review): no metadata.namespace is set here; presumably the enclosing
# kustomization supplies it. Confirm.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: api
labels:
app: api
spec:
# Service label selector (not a pod selector).
selector:
matchLabels:
app: api
endpoints:
# "metrics" is a Service port *name*, not a number.
- port: metrics
interval: 30s
path: /metrics

View File

@@ -10,13 +10,22 @@
"test": "echo \"no tests yet\"" "test": "echo \"no tests yet\""
}, },
"dependencies": { "dependencies": {
"@opentelemetry/auto-instrumentations-node": "^0.71.0",
"@opentelemetry/exporter-prometheus": "^0.213.0",
"@opentelemetry/exporter-trace-otlp-http": "^0.213.0",
"@opentelemetry/resources": "^2.6.0",
"@opentelemetry/sdk-node": "^0.213.0",
"@opentelemetry/semantic-conventions": "^1.40.0",
"cors": "^2.8.5",
"express": "^4.21.0", "express": "^4.21.0",
"cors": "^2.8.5" "pino": "^10.3.1",
"pino-http": "^11.0.0"
}, },
"devDependencies": { "devDependencies": {
"@types/express": "^5.0.0",
"@types/cors": "^2.8.17", "@types/cors": "^2.8.17",
"@types/express": "^5.0.0",
"@types/node": "^22.10.0", "@types/node": "^22.10.0",
"@types/pino-http": "^6.1.0",
"tsup": "^8.3.0", "tsup": "^8.3.0",
"tsx": "^4.19.0", "tsx": "^4.19.0",
"typescript": "^5.7.0" "typescript": "^5.7.0"

View File

@@ -1,11 +1,17 @@
import "./instrumentation";
import express from "express"; import express from "express";
import cors from "cors"; import cors from "cors";
import pino from "pino";
import pinoHttp from "pino-http";
const logger = pino({ name: "api" });
const app = express(); const app = express();
const port = process.env.PORT || 4000; const port = process.env.PORT || 4000;
app.use(cors()); app.use(cors());
app.use(express.json()); app.use(express.json());
app.use(pinoHttp({ logger }));
app.get("/health", (_req, res) => { app.get("/health", (_req, res) => {
res.json({ status: "ok", timestamp: new Date().toISOString() }); res.json({ status: "ok", timestamp: new Date().toISOString() });
@@ -16,5 +22,5 @@ app.get("/api", (_req, res) => {
}); });
app.listen(port, () => { app.listen(port, () => {
console.log(`API server running on port ${port}`); logger.info(`API server running on port ${port}`);
}); });

View File

@@ -0,0 +1,36 @@
import { NodeSDK } from "@opentelemetry/sdk-node";
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
import { PrometheusExporter } from "@opentelemetry/exporter-prometheus";
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
import { resourceFromAttributes } from "@opentelemetry/resources";
import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
/**
 * OpenTelemetry bootstrap for the API process.
 *
 * Starts a NodeSDK that serves Prometheus metrics on port 9464 and pushes
 * traces over OTLP/HTTP. This module is imported purely for its side effects.
 */

// Serves the Prometheus scrape endpoint on :9464.
const prometheusExporter = new PrometheusExporter({ port: 9464 });

/** Base OTLP/HTTP endpoint used when OTEL_EXPORTER_OTLP_ENDPOINT is unset or blank. */
const DEFAULT_OTLP_ENDPOINT = "http://tempo.observability.svc:4318";

/** Signal-specific path that OTLP/HTTP trace receivers listen on. */
const OTLP_TRACES_PATH = "/v1/traces";

/**
 * Builds the full traces URL from a base OTLP endpoint.
 *
 * OTEL_EXPORTER_OTLP_ENDPOINT is a *base* URL (e.g. `http://host:4318`), but
 * the exporter's `url` option is used as given, with no signal path appended.
 * Passing the env value straight through therefore posts traces to the wrong
 * path. This helper appends `/v1/traces` unless it is already present.
 *
 * @param endpoint Base endpoint, or a full traces URL; undefined/blank falls
 *   back to {@link DEFAULT_OTLP_ENDPOINT}.
 * @returns The URL to hand to the trace exporter, always ending in `/v1/traces`.
 */
function resolveTraceUrl(endpoint: string | undefined): string {
  const trimmed = endpoint?.trim();
  // `??` would let an empty string through, so test truthiness explicitly.
  const base = (trimmed ? trimmed : DEFAULT_OTLP_ENDPOINT).replace(/\/+$/, "");
  return base.endsWith(OTLP_TRACES_PATH) ? base : `${base}${OTLP_TRACES_PATH}`;
}

const traceExporter = new OTLPTraceExporter({
  url: resolveTraceUrl(process.env.OTEL_EXPORTER_OTLP_ENDPOINT),
});

const sdk = new NodeSDK({
  resource: resourceFromAttributes({
    [ATTR_SERVICE_NAME]: "api",
  }),
  metricReader: prometheusExporter,
  traceExporter,
  instrumentations: [
    getNodeAutoInstrumentations({
      // Filesystem instrumentation is explicitly switched off.
      "@opentelemetry/instrumentation-fs": { enabled: false },
    }),
  ],
});

sdk.start();

// On SIGTERM, shut the SDK down before exiting; exit code 1 signals that
// shutdown itself failed.
process.on("SIGTERM", () => {
  sdk.shutdown().then(
    () => process.exit(0),
    () => process.exit(1),
  );
});

View File

@@ -8,6 +8,7 @@ resources:
- appsets/platform.yaml - appsets/platform.yaml
- appsets/apps.yaml - appsets/apps.yaml
- appsets/previews.yaml - appsets/previews.yaml
- servicemonitor.yaml
patches: patches:
- target: - target:
kind: ConfigMap kind: ConfigMap

View File

@@ -0,0 +1,44 @@
# ServiceMonitors for the Argo CD components. Each scrapes the Service port
# named "metrics" every 30 seconds.
#
# The selectors target the dedicated metrics Services. This assumes the
# upstream Argo CD install manifests, where:
#   - API server metrics are exposed by Service "argocd-server-metrics"
#   - application-controller metrics are exposed by Service "argocd-metrics"
#   - repo-server metrics are exposed on the "argocd-repo-server" Service itself
# The plain "argocd-server" Service has no port named "metrics", so selecting
# it yields no scrape targets.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: argocd-server
  namespace: argocd
  labels:
    app.kubernetes.io/part-of: argocd
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: argocd-server-metrics
  endpoints:
    - port: metrics
      interval: 30s
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: argocd-repo-server
  namespace: argocd
  labels:
    app.kubernetes.io/part-of: argocd
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: argocd-repo-server
  endpoints:
    - port: metrics
      interval: 30s
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: argocd-application-controller
  namespace: argocd
  labels:
    app.kubernetes.io/part-of: argocd
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: argocd-metrics
  endpoints:
    - port: metrics
      interval: 30s

View File

@@ -0,0 +1,88 @@
# Grafana dashboard "API Overview", shipped as a ConfigMap carrying the
# grafana_dashboard: "1" label.
#
# Panels: HTTP request rate, HTTP 5xx rate, HTTP latency percentiles and
# p95 DB query duration, all filtered on service_name="api".
#
# Naming notes for the queries below:
#   - http_server_request_duration_seconds is the stable-semconv HTTP metric,
#     whose status attribute is http.response.status_code, i.e. the
#     http_response_status_code label (not the legacy http_status_code).
#   - The DB duration histogram is recorded in seconds, so its exported name
#     carries the _seconds unit suffix just like the HTTP histogram does.
# NOTE(review): the service_name="api" matcher assumes the resource attribute
# is present as a label on the scraped series. Confirm against Prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  name: api-overview-dashboard
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  api-overview.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "graphTooltip": 1,
      "id": null,
      "links": [],
      "panels": [
        {
          "title": "HTTP Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"api\"}[5m])) by (http_route)",
              "legendFormat": "{{ http_route }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "HTTP Error Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"api\", http_response_status_code=~\"5..\"}[5m])) by (http_route)",
              "legendFormat": "{{ http_route }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "HTTP Request Duration (p50 / p95 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p99"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "DB Query Duration",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(db_client_operation_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le, db_operation_name))",
              "legendFormat": "p95 {{ db_operation_name }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["homelab", "api", "otel"],
      "templating": { "list": [] },
      "time": { "from": "now-6h", "to": "now" },
      "title": "API Overview",
      "uid": "api-overview"
    }

View File

@@ -0,0 +1,81 @@
# Grafana dashboard "PostgreSQL (CloudNativePG)", shipped as a ConfigMap
# carrying the grafana_dashboard: "1" label.
#
# Panels: active backends, commit/rollback rate for the "homelab" database,
# replication lag and database size.
#
# cnpg_pg_replication_lag comes from CloudNativePG's default pg_replication
# query, which reports lag behind the primary in *seconds*. The panel is
# therefore titled and unit-formatted as seconds rather than bytes.
apiVersion: v1
kind: ConfigMap
metadata:
  name: postgres-dashboard
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  postgres.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "graphTooltip": 1,
      "id": null,
      "links": [],
      "panels": [
        {
          "title": "Active Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "cnpg_backends_total",
              "legendFormat": "{{ pod }}"
            }
          ]
        },
        {
          "title": "Transactions per Second",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "rate(cnpg_pg_stat_database_xact_commit{datname=\"homelab\"}[5m])",
              "legendFormat": "commits {{ pod }}"
            },
            {
              "expr": "rate(cnpg_pg_stat_database_xact_rollback{datname=\"homelab\"}[5m])",
              "legendFormat": "rollbacks {{ pod }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "ops" }
          }
        },
        {
          "title": "Replication Lag (seconds)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "targets": [
            {
              "expr": "cnpg_pg_replication_lag",
              "legendFormat": "{{ pod }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "Database Size",
          "type": "stat",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "targets": [
            {
              "expr": "cnpg_pg_database_size_bytes{datname=\"homelab\"}",
              "legendFormat": "{{ pod }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "bytes" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["homelab", "postgres", "cnpg"],
      "templating": { "list": [] },
      "time": { "from": "now-6h", "to": "now" },
      "title": "PostgreSQL (CloudNativePG)",
      "uid": "postgres-cnpg"
    }

View File

@@ -0,0 +1,89 @@
# Grafana dashboard "Traefik", shipped as a ConfigMap in the observability
# namespace.
#
# Panels: per-entrypoint request rate, aggregate 4xx and 5xx rates, request
# latency percentiles (p50/p95/p99) and per-entrypoint open connections.
# All rate() queries use a 5m window; the default time range is the last 6h.
#
# NOTE(review): traefik_entrypoint_open_connections looks like the Traefik v2
# metric name; v3 appears to expose traefik_open_connections instead. Confirm
# against the deployed Traefik version.
apiVersion: v1
kind: ConfigMap
metadata:
name: traefik-dashboard
namespace: observability
labels:
# presumably the label Grafana's dashboard loader watches for; confirm.
grafana_dashboard: "1"
data:
traefik.json: |
{
"annotations": { "list": [] },
"editable": true,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"title": "Request Rate",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"targets": [
{
"expr": "sum(rate(traefik_entrypoint_requests_total[5m])) by (entrypoint)",
"legendFormat": "{{ entrypoint }}"
}
],
"fieldConfig": {
"defaults": { "unit": "reqps" }
}
},
{
"title": "Error Rate (4xx/5xx)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"targets": [
{
"expr": "sum(rate(traefik_entrypoint_requests_total{code=~\"4..\"}[5m]))",
"legendFormat": "4xx"
},
{
"expr": "sum(rate(traefik_entrypoint_requests_total{code=~\"5..\"}[5m]))",
"legendFormat": "5xx"
}
],
"fieldConfig": {
"defaults": { "unit": "reqps" }
}
},
{
"title": "Request Duration (p50 / p95 / p99)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.99, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p99"
}
],
"fieldConfig": {
"defaults": { "unit": "s" }
}
},
{
"title": "Open Connections",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
"targets": [
{
"expr": "sum(traefik_entrypoint_open_connections) by (entrypoint)",
"legendFormat": "{{ entrypoint }}"
}
]
}
],
"schemaVersion": 39,
"tags": ["homelab", "traefik", "ingress"],
"templating": { "list": [] },
"time": { "from": "now-6h", "to": "now" },
"title": "Traefik",
"uid": "traefik"
}

View File

@@ -15,3 +15,22 @@ data:
url: http://loki.observability.svc:3100 url: http://loki.observability.svc:3100
jsonData: jsonData:
maxLines: 1000 maxLines: 1000
# Log -> trace link: extract the trace id from the JSON log line and open it
# in the Tempo datasource. The OpenTelemetry pino instrumentation injects the
# id under the key "trace_id", so that is the key the regex must match.
derivedFields:
- datasourceUid: tempo
matcherRegex: '"trace_id":"(\w+)"'
name: TraceID
url: "$${__value.raw}"
# Tempo datasource; uid "tempo" is what the Loki derived field refers to.
- name: Tempo
type: tempo
uid: tempo
access: proxy
url: http://tempo.observability.svc:3100
jsonData:
# Trace -> logs correlation against the Loki datasource, filtered by trace id.
tracesToLogs:
datasourceUid: loki
filterByTraceID: true
filterBySpanID: false
# Trace -> metrics and service-map queries run against Prometheus.
tracesToMetrics:
datasourceUid: prometheus
serviceMap:
datasourceUid: prometheus

View File

@@ -2,5 +2,9 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:
- application.yaml - application.yaml
- grafana-admin-sealed.yaml
- grafana-datasources.yaml - grafana-datasources.yaml
- dashboards/cluster-overview.yaml - dashboards/cluster-overview.yaml
- dashboards/traefik.yaml
- dashboards/api-overview.yaml
- dashboards/postgres.yaml

View File

@@ -4,3 +4,4 @@ resources:
- kube-prometheus-stack/ - kube-prometheus-stack/
- loki/ - loki/
- promtail/ - promtail/
- tempo/

View File

@@ -0,0 +1,41 @@
# Argo CD Application that installs the "tempo" Helm chart (version 1.12.0)
# from the Grafana chart repository into the "observability" namespace.
# NOTE(review): indentation is flattened in this view, so whether `resources`
# and `persistence` nest under `tempo` or sit at the top level of the values
# cannot be confirmed from here. Check against the chart's values schema.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: tempo
namespace: argocd
spec:
project: default
source:
repoURL: https://grafana.github.io/helm-charts
chart: tempo
targetRevision: 1.12.0
helm:
valuesObject:
tempo:
# OTLP receivers: gRPC on 4317, HTTP on 4318, bound on all interfaces.
receivers:
otlp:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
http:
endpoint: "0.0.0.0:4318"
# 168h = 7 days.
retention: 168h
# Only a memory limit is set; there is no CPU limit.
resources:
requests:
memory: 256Mi
cpu: 100m
limits:
memory: 1Gi
# 10Gi volume on the "longhorn" storage class.
persistence:
enabled: true
storageClassName: longhorn
size: 10Gi
destination:
server: https://kubernetes.default.svc
namespace: observability
syncPolicy:
# Automated sync with pruning and self-heal enabled.
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true

View File

@@ -0,0 +1,4 @@
# Kustomization for the Tempo component; its only resource is application.yaml.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- application.yaml

View File

@@ -15,6 +15,9 @@ spec:
valuesObject: valuesObject:
crds: crds:
enabled: true enabled: true
prometheus:
servicemonitor:
enabled: true
destination: destination:
server: https://kubernetes.default.svc server: https://kubernetes.default.svc
namespace: cert-manager namespace: cert-manager

View File

@@ -3,3 +3,4 @@ kind: Kustomization
resources: resources:
- application.yaml - application.yaml
- namespace.yaml - namespace.yaml
- servicemonitor.yaml

View File

@@ -0,0 +1,15 @@
# ServiceMonitor "longhorn" in the longhorn-system namespace: selects Services
# labelled app=longhorn-manager and scrapes the port named "manager" at
# /metrics every 30 seconds.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: longhorn
namespace: longhorn-system
labels:
app: longhorn
spec:
# Service label selector; note it differs from this object's own app label.
selector:
matchLabels:
app: longhorn-manager
endpoints:
# "manager" is a Service port *name*, not a number.
- port: manager
interval: 30s
path: /metrics

View File

@@ -8,3 +8,4 @@ resources:
- ingressroute-grafana.yaml - ingressroute-grafana.yaml
- ingressroute-longhorn.yaml - ingressroute-longhorn.yaml
- certificate-internal.yaml - certificate-internal.yaml
- servicemonitor.yaml

View File

@@ -0,0 +1,15 @@
# ServiceMonitor "traefik" in the kube-system namespace: selects Services
# labelled app.kubernetes.io/name=traefik and scrapes the port named
# "metrics" at /metrics every 30 seconds.
# NOTE(review): this only yields targets if the selected Service actually
# exposes a port named "metrics". Confirm the Traefik Service does.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: traefik
namespace: kube-system
labels:
app: traefik
spec:
selector:
matchLabels:
app.kubernetes.io/name: traefik
endpoints:
- port: metrics
interval: 30s
path: /metrics

1781
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff