- Add ServiceMonitors for Traefik, ArgoCD, and Longhorn - Enable cert-manager ServiceMonitor via helm values - Deploy Grafana Tempo for distributed tracing (single-binary, Longhorn PVC) - Add Tempo datasource with trace-to-logs and trace-to-metrics correlation - Instrument API with OpenTelemetry SDK (Prometheus metrics + OTLP traces) - Replace console.log with pino structured logging + pino-http middleware - Add Grafana dashboards for Traefik, API overview, and PostgreSQL (CNPG)
89 lines
2.8 KiB
YAML
89 lines
2.8 KiB
YAML
# Grafana dashboard "API Overview", shipped as a ConfigMap in the
# observability namespace.
# NOTE(review): the grafana_dashboard: "1" label is presumably what the Grafana
# dashboard sidecar selects on — confirm against the Grafana Helm values.
#
# Metric and label names in the queries follow the OpenTelemetry semantic
# conventions as translated to Prometheus naming (dots become underscores and
# the unit is appended to the metric name):
#   http.server.request.duration -> http_server_request_duration_seconds
#   db.client.operation.duration -> db_client_operation_duration_seconds
#   http.response.status_code    -> http_response_status_code
# NOTE(review): verify these against the series the API actually exposes.
apiVersion: v1
kind: ConfigMap
metadata:
  name: api-overview-dashboard
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  api-overview.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "graphTooltip": 1,
      "id": null,
      "links": [],
      "panels": [
        {
          "title": "HTTP Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"api\"}[5m])) by (http_route)",
              "legendFormat": "{{ http_route }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "HTTP Error Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"api\", http_response_status_code=~\"5..\"}[5m])) by (http_route)",
              "legendFormat": "{{ http_route }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "HTTP Request Duration (p50 / p95 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le))",
              "legendFormat": "p99"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "DB Query Duration",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(db_client_operation_duration_seconds_bucket{service_name=\"api\"}[5m])) by (le, db_operation_name))",
              "legendFormat": "p95 {{ db_operation_name }}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["homelab", "api", "otel"],
      "templating": { "list": [] },
      "time": { "from": "now-6h", "to": "now" },
      "title": "API Overview",
      "uid": "api-overview"
    }