Shardlyn Metrics Reference
Overview
Shardlyn exposes metrics in Prometheus format for monitoring and alerting. Both the Control Plane and Agent expose separate metrics endpoints.
Related
- Observability Guide — Setting up monitoring and alerts
- Architecture — Metrics collection flow
- Connecting Nodes — Setting up your infrastructure
Endpoints
| Component | Endpoint | Default Port |
|---|---|---|
| Control Plane | /metrics | 9100 |
| Agent | /metrics | 9101 |
Control Plane Metrics
HTTP Metrics
# HELP shardlyn_http_requests_total Total number of HTTP requests
# TYPE shardlyn_http_requests_total counter
shardlyn_http_requests_total{method="GET",path="/v1/nodes",status="200"} 150
shardlyn_http_requests_total{method="POST",path="/v1/instances",status="201"} 25
shardlyn_http_requests_total{method="POST",path="/v1/auth/login",status="401"} 3
# HELP shardlyn_http_request_duration_seconds HTTP request duration in seconds
# TYPE shardlyn_http_request_duration_seconds histogram
shardlyn_http_request_duration_seconds_bucket{method="GET",path="/v1/nodes",le="0.01"} 140
shardlyn_http_request_duration_seconds_bucket{method="GET",path="/v1/nodes",le="0.05"} 148
shardlyn_http_request_duration_seconds_bucket{method="GET",path="/v1/nodes",le="0.1"} 150
shardlyn_http_request_duration_seconds_sum{method="GET",path="/v1/nodes"} 1.25
shardlyn_http_request_duration_seconds_count{method="GET",path="/v1/nodes"} 150
# HELP shardlyn_http_requests_in_flight Current number of HTTP requests being processed
# TYPE shardlyn_http_requests_in_flight gauge
shardlyn_http_requests_in_flight 5
Database Metrics
# HELP shardlyn_db_connections_open Number of open database connections
# TYPE shardlyn_db_connections_open gauge
shardlyn_db_connections_open 5
# HELP shardlyn_db_connections_in_use Number of database connections in use
# TYPE shardlyn_db_connections_in_use gauge
shardlyn_db_connections_in_use 2
# HELP shardlyn_db_query_duration_seconds Database query duration
# TYPE shardlyn_db_query_duration_seconds histogram
shardlyn_db_query_duration_seconds_bucket{query="get_node",le="0.001"} 100
shardlyn_db_query_duration_seconds_bucket{query="get_node",le="0.01"} 145
shardlyn_db_query_duration_seconds_bucket{query="get_node",le="0.1"} 150
Node Metrics
# HELP shardlyn_nodes_total Total number of registered nodes
# TYPE shardlyn_nodes_total gauge
shardlyn_nodes_total 5
# HELP shardlyn_nodes_by_status Number of nodes by status
# TYPE shardlyn_nodes_by_status gauge
shardlyn_nodes_by_status{status="healthy"} 4
shardlyn_nodes_by_status{status="unhealthy"} 1
shardlyn_nodes_by_status{status="offline"} 0
# HELP shardlyn_node_last_heartbeat_seconds Time since last heartbeat
# TYPE shardlyn_node_last_heartbeat_seconds gauge
shardlyn_node_last_heartbeat_seconds{node_id="abc123",node_name="node-1"} 5
shardlyn_node_last_heartbeat_seconds{node_id="def456",node_name="node-2"} 8
Instance Metrics
# HELP shardlyn_instances_total Total number of instances
# TYPE shardlyn_instances_total gauge
shardlyn_instances_total 25
# HELP shardlyn_instances_by_state Number of instances by state
# TYPE shardlyn_instances_by_state gauge
shardlyn_instances_by_state{state="running"} 20
shardlyn_instances_by_state{state="stopped"} 3
shardlyn_instances_by_state{state="creating"} 1
shardlyn_instances_by_state{state="error"} 1
# HELP shardlyn_instances_by_workload Number of instances per workload
# TYPE shardlyn_instances_by_workload gauge
shardlyn_instances_by_workload{workload_id="w1",workload_name="minecraft-vanilla"} 10
shardlyn_instances_by_workload{workload_id="w2",workload_name="cs2-server"} 8
shardlyn_instances_by_workload{workload_id="w3",workload_name="valheim-server"} 7
Reconciliation Metrics
# HELP shardlyn_reconcile_operations_total Total reconciliation operations
# TYPE shardlyn_reconcile_operations_total counter
shardlyn_reconcile_operations_total{action="create"} 50
shardlyn_reconcile_operations_total{action="start"} 30
shardlyn_reconcile_operations_total{action="stop"} 20
shardlyn_reconcile_operations_total{action="remove"} 10
# HELP shardlyn_reconcile_errors_total Reconciliation errors
# TYPE shardlyn_reconcile_errors_total counter
shardlyn_reconcile_errors_total{action="create",reason="image_pull_failed"} 2
shardlyn_reconcile_errors_total{action="start",reason="container_not_found"} 1
# HELP shardlyn_reconcile_duration_seconds Time to process reconciliation
# TYPE shardlyn_reconcile_duration_seconds histogram
shardlyn_reconcile_duration_seconds_bucket{le="0.01"} 90
shardlyn_reconcile_duration_seconds_bucket{le="0.1"} 98
shardlyn_reconcile_duration_seconds_bucket{le="1"} 100
Provisioning Metrics
# HELP shardlyn_provision_requests_total Total provision requests
# TYPE shardlyn_provision_requests_total counter
shardlyn_provision_requests_total{provider="hetzner",status="applied"} 10
shardlyn_provision_requests_total{provider="hetzner",status="failed"} 1
# HELP shardlyn_provision_duration_seconds Time to complete provisioning
# TYPE shardlyn_provision_duration_seconds histogram
shardlyn_provision_duration_seconds_bucket{provider="hetzner",le="60"} 5
shardlyn_provision_duration_seconds_bucket{provider="hetzner",le="120"} 9
shardlyn_provision_duration_seconds_bucket{provider="hetzner",le="300"} 10
WebSocket Metrics
# HELP shardlyn_websocket_connections_total Total WebSocket connections
# TYPE shardlyn_websocket_connections_total counter
shardlyn_websocket_connections_total{type="logs"} 100
shardlyn_websocket_connections_total{type="console"} 50
# HELP shardlyn_websocket_connections_active Active WebSocket connections
# TYPE shardlyn_websocket_connections_active gauge
shardlyn_websocket_connections_active{type="logs"} 5
shardlyn_websocket_connections_active{type="console"} 2
# HELP shardlyn_websocket_messages_total Total WebSocket messages
# TYPE shardlyn_websocket_messages_total counter
shardlyn_websocket_messages_total{type="logs",direction="out"} 50000
shardlyn_websocket_messages_total{type="console",direction="in"} 1000
shardlyn_websocket_messages_total{type="console",direction="out"} 5000
Agent Metrics
System Metrics
# HELP shardlyn_agent_cpu_percent CPU usage percentage
# TYPE shardlyn_agent_cpu_percent gauge
shardlyn_agent_cpu_percent 25.5
# HELP shardlyn_agent_memory_bytes Memory usage in bytes
# TYPE shardlyn_agent_memory_bytes gauge
shardlyn_agent_memory_bytes{type="total"} 8589934592
shardlyn_agent_memory_bytes{type="used"} 4294967296
shardlyn_agent_memory_bytes{type="available"} 4294967296
# HELP shardlyn_agent_disk_bytes Disk usage in bytes
# TYPE shardlyn_agent_disk_bytes gauge
shardlyn_agent_disk_bytes{path="/var/lib/shardlyn",type="total"} 107374182400
shardlyn_agent_disk_bytes{path="/var/lib/shardlyn",type="used"} 32212254720
shardlyn_agent_disk_bytes{path="/var/lib/shardlyn",type="free"} 75161927680
Heartbeat Metrics
# HELP shardlyn_agent_heartbeat_total Total heartbeats sent
# TYPE shardlyn_agent_heartbeat_total counter
shardlyn_agent_heartbeat_total{status="success"} 1000
shardlyn_agent_heartbeat_total{status="failed"} 5
# HELP shardlyn_agent_heartbeat_duration_seconds Heartbeat request duration
# TYPE shardlyn_agent_heartbeat_duration_seconds histogram
shardlyn_agent_heartbeat_duration_seconds_bucket{le="0.1"} 950
shardlyn_agent_heartbeat_duration_seconds_bucket{le="0.5"} 995
shardlyn_agent_heartbeat_duration_seconds_bucket{le="1"} 1000
# HELP shardlyn_agent_last_heartbeat_timestamp Unix timestamp of last heartbeat
# TYPE shardlyn_agent_last_heartbeat_timestamp gauge
shardlyn_agent_last_heartbeat_timestamp 1705312800
Container Metrics
# HELP shardlyn_agent_containers_total Number of managed containers
# TYPE shardlyn_agent_containers_total gauge
shardlyn_agent_containers_total 10
# HELP shardlyn_agent_containers_by_state Containers by state
# TYPE shardlyn_agent_containers_by_state gauge
shardlyn_agent_containers_by_state{state="running"} 8
shardlyn_agent_containers_by_state{state="stopped"} 2
# HELP shardlyn_agent_container_operations_total Container operations
# TYPE shardlyn_agent_container_operations_total counter
shardlyn_agent_container_operations_total{operation="create",status="success"} 15
shardlyn_agent_container_operations_total{operation="create",status="failed"} 1
shardlyn_agent_container_operations_total{operation="start",status="success"} 20
shardlyn_agent_container_operations_total{operation="stop",status="success"} 10
shardlyn_agent_container_operations_total{operation="remove",status="success"} 5
# HELP shardlyn_agent_container_operation_duration_seconds Container operation duration
# TYPE shardlyn_agent_container_operation_duration_seconds histogram
shardlyn_agent_container_operation_duration_seconds_bucket{operation="create",le="10"} 10
shardlyn_agent_container_operation_duration_seconds_bucket{operation="create",le="60"} 14
shardlyn_agent_container_operation_duration_seconds_bucket{operation="create",le="300"} 15
Docker Metrics
# HELP shardlyn_agent_docker_api_calls_total Docker API calls
# TYPE shardlyn_agent_docker_api_calls_total counter
shardlyn_agent_docker_api_calls_total{method="ContainerList"} 1000
shardlyn_agent_docker_api_calls_total{method="ContainerCreate"} 15
shardlyn_agent_docker_api_calls_total{method="ContainerStart"} 20
shardlyn_agent_docker_api_calls_total{method="ImagePull"} 10
# HELP shardlyn_agent_docker_api_duration_seconds Docker API call duration
# TYPE shardlyn_agent_docker_api_duration_seconds histogram
shardlyn_agent_docker_api_duration_seconds_bucket{method="ContainerList",le="0.1"} 950
shardlyn_agent_docker_api_duration_seconds_bucket{method="ImagePull",le="60"} 8
Prometheus Configuration
scrape_configs
yaml
# Prometheus scrape jobs for the Shardlyn components.
# Both components serve metrics at the default /metrics path.
scrape_configs:
  # Control Plane metrics endpoint (default port 9100).
  - job_name: 'shardlyn-controlplane'
    static_configs:
      - targets: ['controlplane:9100']
    scrape_interval: 15s

  # Single Agent metrics endpoint (default port 9101).
  - job_name: 'shardlyn-agent'
    static_configs:
      - targets: ['agent:9101']
    scrape_interval: 15s

  # For multiple agents, use service discovery or file_sd
  - job_name: 'shardlyn-agents'
    file_sd_configs:
      - files:
          # Quoted so the leading path and the `*` glob stay a plain string.
          - '/etc/prometheus/agents/*.json'
Recording Rules
yaml
# Recording rules that precompute commonly queried Shardlyn expressions.
groups:
  - name: shardlyn_recording
    rules:
      # Request rate per endpoint
      - record: shardlyn:http_requests:rate5m
        expr: rate(shardlyn_http_requests_total[5m])

      # Request latency p99
      - record: shardlyn:http_request_duration:p99
        expr: histogram_quantile(0.99, rate(shardlyn_http_request_duration_seconds_bucket[5m]))

      # Instance availability (ratio 0-1 of running instances to all instances).
      # The left side carries a `state` label that the total does not have;
      # `ignoring(state)` is required or one-to-one matching finds no pairs
      # and the rule produces no samples.
      - record: shardlyn:instances:availability
        expr: shardlyn_instances_by_state{state="running"} / ignoring(state) shardlyn_instances_total

      # Node health percentage (0-100).
      # `ignoring(status)` for the same reason: only the left side has `status`.
      - record: shardlyn:nodes:healthy_percent
        expr: shardlyn_nodes_by_status{status="healthy"} / ignoring(status) shardlyn_nodes_total * 100
Alerting Rules
yaml
# Alerting rules for the Shardlyn Control Plane and Agent metrics.
# Ratio-based alerts keep their 0-1 thresholds and render the value with
# `humanizePercentage`, so the description shows e.g. "92%" rather than the
# raw ratio rounded to "1%".
groups:
  - name: shardlyn_alerts
    rules:
      # High error rate
      - alert: ShardlynHighErrorRate
        expr: rate(shardlyn_http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High HTTP error rate"
          description: "Error rate is {{ $value | printf \"%.2f\" }} req/s"

      # Node unhealthy
      - alert: ShardlynNodeUnhealthy
        expr: shardlyn_nodes_by_status{status="unhealthy"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Node is unhealthy"
          description: "{{ $value }} nodes are unhealthy"

      # Node offline
      - alert: ShardlynNodeOffline
        expr: shardlyn_node_last_heartbeat_seconds > 300
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node is offline"
          description: "Node {{ $labels.node_name }} hasn't sent heartbeat for {{ $value }}s"

      # Instance errors
      - alert: ShardlynInstanceErrors
        expr: shardlyn_instances_by_state{state="error"} > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instances in error state"
          description: "{{ $value }} instances are in error state"

      # Database connection pool exhausted
      # $value is a 0-1 ratio, so it is formatted with humanizePercentage.
      - alert: ShardlynDatabasePoolExhausted
        expr: shardlyn_db_connections_in_use / shardlyn_db_connections_open > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: "{{ $value | humanizePercentage }} of connections in use"

      # Agent heartbeat failures
      - alert: ShardlynAgentHeartbeatFailures
        expr: rate(shardlyn_agent_heartbeat_total{status="failed"}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Agent experiencing heartbeat failures"
          description: "Agent heartbeat failure rate: {{ $value | printf \"%.2f\" }}/s"

      # High disk usage
      # Both operands differ only in the `type` label; `ignoring(type)` lets
      # the used/total series for the same path match one-to-one.
      - alert: ShardlynAgentDiskHigh
        expr: shardlyn_agent_disk_bytes{type="used"} / ignoring(type) shardlyn_agent_disk_bytes{type="total"} > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Agent disk usage high"
          description: "Disk usage at {{ $value | humanizePercentage }}"

      # High memory usage
      # `ignoring(type)` is needed for the same reason as the disk alert.
      - alert: ShardlynAgentMemoryHigh
        expr: shardlyn_agent_memory_bytes{type="used"} / ignoring(type) shardlyn_agent_memory_bytes{type="total"} > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Agent memory usage high"
          description: "Memory usage at {{ $value | humanizePercentage }}"
Grafana Dashboards
Control Plane Dashboard
Key panels:
- Request rate (by endpoint)
- Request latency (p50, p95, p99)
- Error rate
- Active WebSocket connections
- Database connection pool
- Nodes by status
- Instances by state
Agent Dashboard
Key panels:
- CPU usage
- Memory usage
- Disk usage
- Heartbeat latency
- Container operations
- Managed containers by state
System Overview Dashboard
Key panels:
- Total nodes (healthy/unhealthy)
- Total instances (running/stopped/error)
- Provisioning status
- Recent errors
- Resource utilization heatmap