Skip to content

Shardlyn Metrics Reference

Overview

Shardlyn exposes metrics in Prometheus format for monitoring and alerting. Both the Control Plane and Agent expose separate metrics endpoints.

Related

Endpoints

| Component | Endpoint | Default Port |
| --- | --- | --- |
| Control Plane | /metrics | 9100 |
| Agent | /metrics | 9101 |

Control Plane Metrics

HTTP Metrics

# HELP shardlyn_http_requests_total Total number of HTTP requests
# TYPE shardlyn_http_requests_total counter
shardlyn_http_requests_total{method="GET",path="/v1/nodes",status="200"} 150
shardlyn_http_requests_total{method="POST",path="/v1/instances",status="201"} 25
shardlyn_http_requests_total{method="POST",path="/v1/auth/login",status="401"} 3

# HELP shardlyn_http_request_duration_seconds HTTP request duration in seconds
# TYPE shardlyn_http_request_duration_seconds histogram
shardlyn_http_request_duration_seconds_bucket{method="GET",path="/v1/nodes",le="0.01"} 140
shardlyn_http_request_duration_seconds_bucket{method="GET",path="/v1/nodes",le="0.05"} 148
shardlyn_http_request_duration_seconds_bucket{method="GET",path="/v1/nodes",le="0.1"} 150
shardlyn_http_request_duration_seconds_sum{method="GET",path="/v1/nodes"} 1.25
shardlyn_http_request_duration_seconds_count{method="GET",path="/v1/nodes"} 150

# HELP shardlyn_http_requests_in_flight Current number of HTTP requests being processed
# TYPE shardlyn_http_requests_in_flight gauge
shardlyn_http_requests_in_flight 5

Database Metrics

# HELP shardlyn_db_connections_open Number of open database connections
# TYPE shardlyn_db_connections_open gauge
shardlyn_db_connections_open 5

# HELP shardlyn_db_connections_in_use Number of database connections in use
# TYPE shardlyn_db_connections_in_use gauge
shardlyn_db_connections_in_use 2

# HELP shardlyn_db_query_duration_seconds Database query duration
# TYPE shardlyn_db_query_duration_seconds histogram
shardlyn_db_query_duration_seconds_bucket{query="get_node",le="0.001"} 100
shardlyn_db_query_duration_seconds_bucket{query="get_node",le="0.01"} 145
shardlyn_db_query_duration_seconds_bucket{query="get_node",le="0.1"} 150

Node Metrics

# HELP shardlyn_nodes_total Total number of registered nodes
# TYPE shardlyn_nodes_total gauge
shardlyn_nodes_total 5

# HELP shardlyn_nodes_by_status Number of nodes by status
# TYPE shardlyn_nodes_by_status gauge
shardlyn_nodes_by_status{status="healthy"} 4
shardlyn_nodes_by_status{status="unhealthy"} 1
shardlyn_nodes_by_status{status="offline"} 0

# HELP shardlyn_node_last_heartbeat_seconds Time since last heartbeat
# TYPE shardlyn_node_last_heartbeat_seconds gauge
shardlyn_node_last_heartbeat_seconds{node_id="abc123",node_name="node-1"} 5
shardlyn_node_last_heartbeat_seconds{node_id="def456",node_name="node-2"} 8

Instance Metrics

# HELP shardlyn_instances_total Total number of instances
# TYPE shardlyn_instances_total gauge
shardlyn_instances_total 25

# HELP shardlyn_instances_by_state Number of instances by state
# TYPE shardlyn_instances_by_state gauge
shardlyn_instances_by_state{state="running"} 20
shardlyn_instances_by_state{state="stopped"} 3
shardlyn_instances_by_state{state="creating"} 1
shardlyn_instances_by_state{state="error"} 1

# HELP shardlyn_instances_by_workload Number of instances per workload
# TYPE shardlyn_instances_by_workload gauge
shardlyn_instances_by_workload{workload_id="w1",workload_name="minecraft-vanilla"} 10
shardlyn_instances_by_workload{workload_id="w2",workload_name="cs2-server"} 8
shardlyn_instances_by_workload{workload_id="w3",workload_name="valheim-server"} 7

Reconciliation Metrics

# HELP shardlyn_reconcile_operations_total Total reconciliation operations
# TYPE shardlyn_reconcile_operations_total counter
shardlyn_reconcile_operations_total{action="create"} 50
shardlyn_reconcile_operations_total{action="start"} 30
shardlyn_reconcile_operations_total{action="stop"} 20
shardlyn_reconcile_operations_total{action="remove"} 10

# HELP shardlyn_reconcile_errors_total Reconciliation errors
# TYPE shardlyn_reconcile_errors_total counter
shardlyn_reconcile_errors_total{action="create",reason="image_pull_failed"} 2
shardlyn_reconcile_errors_total{action="start",reason="container_not_found"} 1

# HELP shardlyn_reconcile_duration_seconds Time to process reconciliation
# TYPE shardlyn_reconcile_duration_seconds histogram
shardlyn_reconcile_duration_seconds_bucket{le="0.01"} 90
shardlyn_reconcile_duration_seconds_bucket{le="0.1"} 98
shardlyn_reconcile_duration_seconds_bucket{le="1"} 100

Provisioning Metrics

# HELP shardlyn_provision_requests_total Total provision requests
# TYPE shardlyn_provision_requests_total counter
shardlyn_provision_requests_total{provider="hetzner",status="applied"} 10
shardlyn_provision_requests_total{provider="hetzner",status="failed"} 1

# HELP shardlyn_provision_duration_seconds Time to complete provisioning
# TYPE shardlyn_provision_duration_seconds histogram
shardlyn_provision_duration_seconds_bucket{provider="hetzner",le="60"} 5
shardlyn_provision_duration_seconds_bucket{provider="hetzner",le="120"} 9
shardlyn_provision_duration_seconds_bucket{provider="hetzner",le="300"} 10

WebSocket Metrics

# HELP shardlyn_websocket_connections_total Total WebSocket connections
# TYPE shardlyn_websocket_connections_total counter
shardlyn_websocket_connections_total{type="logs"} 100
shardlyn_websocket_connections_total{type="console"} 50

# HELP shardlyn_websocket_connections_active Active WebSocket connections
# TYPE shardlyn_websocket_connections_active gauge
shardlyn_websocket_connections_active{type="logs"} 5
shardlyn_websocket_connections_active{type="console"} 2

# HELP shardlyn_websocket_messages_total Total WebSocket messages
# TYPE shardlyn_websocket_messages_total counter
shardlyn_websocket_messages_total{type="logs",direction="out"} 50000
shardlyn_websocket_messages_total{type="console",direction="in"} 1000
shardlyn_websocket_messages_total{type="console",direction="out"} 5000

Agent Metrics

System Metrics

# HELP shardlyn_agent_cpu_percent CPU usage percentage
# TYPE shardlyn_agent_cpu_percent gauge
shardlyn_agent_cpu_percent 25.5

# HELP shardlyn_agent_memory_bytes Memory usage in bytes
# TYPE shardlyn_agent_memory_bytes gauge
shardlyn_agent_memory_bytes{type="total"} 8589934592
shardlyn_agent_memory_bytes{type="used"} 4294967296
shardlyn_agent_memory_bytes{type="available"} 4294967296

# HELP shardlyn_agent_disk_bytes Disk usage in bytes
# TYPE shardlyn_agent_disk_bytes gauge
shardlyn_agent_disk_bytes{path="/var/lib/shardlyn",type="total"} 107374182400
shardlyn_agent_disk_bytes{path="/var/lib/shardlyn",type="used"} 32212254720
shardlyn_agent_disk_bytes{path="/var/lib/shardlyn",type="free"} 75161927680

Heartbeat Metrics

# HELP shardlyn_agent_heartbeat_total Total heartbeats sent
# TYPE shardlyn_agent_heartbeat_total counter
shardlyn_agent_heartbeat_total{status="success"} 1000
shardlyn_agent_heartbeat_total{status="failed"} 5

# HELP shardlyn_agent_heartbeat_duration_seconds Heartbeat request duration
# TYPE shardlyn_agent_heartbeat_duration_seconds histogram
shardlyn_agent_heartbeat_duration_seconds_bucket{le="0.1"} 950
shardlyn_agent_heartbeat_duration_seconds_bucket{le="0.5"} 995
shardlyn_agent_heartbeat_duration_seconds_bucket{le="1"} 1000

# HELP shardlyn_agent_last_heartbeat_timestamp Unix timestamp of last heartbeat
# TYPE shardlyn_agent_last_heartbeat_timestamp gauge
shardlyn_agent_last_heartbeat_timestamp 1705312800

Container Metrics

# HELP shardlyn_agent_containers_total Number of managed containers
# TYPE shardlyn_agent_containers_total gauge
shardlyn_agent_containers_total 10

# HELP shardlyn_agent_containers_by_state Containers by state
# TYPE shardlyn_agent_containers_by_state gauge
shardlyn_agent_containers_by_state{state="running"} 8
shardlyn_agent_containers_by_state{state="stopped"} 2

# HELP shardlyn_agent_container_operations_total Container operations
# TYPE shardlyn_agent_container_operations_total counter
shardlyn_agent_container_operations_total{operation="create",status="success"} 15
shardlyn_agent_container_operations_total{operation="create",status="failed"} 1
shardlyn_agent_container_operations_total{operation="start",status="success"} 20
shardlyn_agent_container_operations_total{operation="stop",status="success"} 10
shardlyn_agent_container_operations_total{operation="remove",status="success"} 5

# HELP shardlyn_agent_container_operation_duration_seconds Container operation duration
# TYPE shardlyn_agent_container_operation_duration_seconds histogram
shardlyn_agent_container_operation_duration_seconds_bucket{operation="create",le="10"} 10
shardlyn_agent_container_operation_duration_seconds_bucket{operation="create",le="60"} 14
shardlyn_agent_container_operation_duration_seconds_bucket{operation="create",le="300"} 15

Docker Metrics

# HELP shardlyn_agent_docker_api_calls_total Docker API calls
# TYPE shardlyn_agent_docker_api_calls_total counter
shardlyn_agent_docker_api_calls_total{method="ContainerList"} 1000
shardlyn_agent_docker_api_calls_total{method="ContainerCreate"} 15
shardlyn_agent_docker_api_calls_total{method="ContainerStart"} 20
shardlyn_agent_docker_api_calls_total{method="ImagePull"} 10

# HELP shardlyn_agent_docker_api_duration_seconds Docker API call duration
# TYPE shardlyn_agent_docker_api_duration_seconds histogram
shardlyn_agent_docker_api_duration_seconds_bucket{method="ContainerList",le="0.1"} 950
shardlyn_agent_docker_api_duration_seconds_bucket{method="ImagePull",le="60"} 8

Prometheus Configuration

scrape_configs

yaml
scrape_configs:
  # Control Plane metrics endpoint (default port 9100).
  - job_name: "shardlyn-controlplane"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "controlplane:9100"

  # Single Agent metrics endpoint (default port 9101).
  - job_name: "shardlyn-agent"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "agent:9101"

  # Fleets of agents: prefer service discovery or file_sd over a static list.
  # Targets are read from every JSON file matching the glob below.
  - job_name: "shardlyn-agents"
    file_sd_configs:
      - files:
          - "/etc/prometheus/agents/*.json"

Recording Rules

yaml
groups:
  - name: shardlyn_recording
    rules:
      # Per-second request rate over the last 5 minutes, one series per
      # method/path/status label combination.
      - record: shardlyn:http_requests:rate5m
        expr: rate(shardlyn_http_requests_total[5m])

      # Request latency p99, estimated from the histogram buckets
      # (one result per method/path label combination).
      - record: shardlyn:http_request_duration:p99
        expr: histogram_quantile(0.99, rate(shardlyn_http_request_duration_seconds_bucket[5m]))

      # Instance availability: fraction (0-1) of instances in the "running" state.
      # ignoring(state) is required: the numerator carries a `state` label that
      # shardlyn_instances_total does not have, so a plain one-to-one division
      # finds no matching series and records nothing.
      - record: shardlyn:instances:availability
        expr: shardlyn_instances_by_state{state="running"} / ignoring(state) shardlyn_instances_total

      # Node health percentage (0-100).
      # ignoring(status) is required for the same reason: shardlyn_nodes_total
      # has no `status` label to match against.
      - record: shardlyn:nodes:healthy_percent
        expr: shardlyn_nodes_by_status{status="healthy"} / ignoring(status) shardlyn_nodes_total * 100

Alerting Rules

yaml
groups:
  - name: shardlyn_alerts
    rules:
      # High error rate: any method/path/status series returning 5xx responses
      # at more than 0.1 req/s, sustained for 5 minutes.
      - alert: ShardlynHighErrorRate
        expr: rate(shardlyn_http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High HTTP error rate"
          description: "Error rate is {{ $value | printf \"%.2f\" }} req/s"

      # Node unhealthy: at least one node reported in the "unhealthy" status.
      - alert: ShardlynNodeUnhealthy
        expr: shardlyn_nodes_by_status{status="unhealthy"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Node is unhealthy"
          description: "{{ $value }} nodes are unhealthy"

      # Node offline: more than 300s since the node's last heartbeat.
      - alert: ShardlynNodeOffline
        expr: shardlyn_node_last_heartbeat_seconds > 300
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node is offline"
          description: "Node {{ $labels.node_name }} hasn't sent heartbeat for {{ $value }}s"

      # Instance errors: at least one instance in the "error" state.
      - alert: ShardlynInstanceErrors
        expr: shardlyn_instances_by_state{state="error"} > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instances in error state"
          description: "{{ $value }} instances are in error state"

      # Database connection pool exhausted: more than 90% of open connections in use.
      # $value is a 0-1 ratio, so it is rendered with humanizePercentage;
      # printf "%.0f" followed by "%" would print 0.93 as "1%".
      - alert: ShardlynDatabasePoolExhausted
        expr: shardlyn_db_connections_in_use / shardlyn_db_connections_open > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: "{{ $value | humanizePercentage }} of connections in use"

      # Agent heartbeat failures: failed heartbeats above 0.1/s for 5 minutes.
      - alert: ShardlynAgentHeartbeatFailures
        expr: rate(shardlyn_agent_heartbeat_total{status="failed"}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Agent experiencing heartbeat failures"
          description: "Agent heartbeat failure rate: {{ $value | printf \"%.2f\" }}/s"

      # High disk usage: used/total above 85% for 10 minutes.
      # ignoring(type) is required: the two sides differ in their `type` label
      # ("used" vs "total"), so without it no series match and the alert
      # can never fire.
      - alert: ShardlynAgentDiskHigh
        expr: shardlyn_agent_disk_bytes{type="used"} / ignoring(type) shardlyn_agent_disk_bytes{type="total"} > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Agent disk usage high"
          description: "Disk usage at {{ $value | humanizePercentage }}"

      # High memory usage: used/total above 90% for 5 minutes.
      # ignoring(type) is required for the same reason as the disk alert.
      - alert: ShardlynAgentMemoryHigh
        expr: shardlyn_agent_memory_bytes{type="used"} / ignoring(type) shardlyn_agent_memory_bytes{type="total"} > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Agent memory usage high"
          description: "Memory usage at {{ $value | humanizePercentage }}"

Grafana Dashboards

Control Plane Dashboard

Key panels:

  1. Request rate (by endpoint)
  2. Request latency (p50, p95, p99)
  3. Error rate
  4. Active WebSocket connections
  5. Database connection pool
  6. Nodes by status
  7. Instances by state

Agent Dashboard

Key panels:

  1. CPU usage
  2. Memory usage
  3. Disk usage
  4. Heartbeat latency
  5. Container operations
  6. Managed containers by state

System Overview Dashboard

Key panels:

  1. Total nodes (healthy/unhealthy)
  2. Total instances (running/stopped/error)
  3. Provisioning status
  4. Recent errors
  5. Resource utilization heatmap

Built for teams that want control of their own infrastructure.