Zum Inhalt

📊 Monitoring (Prometheus + Grafana)

Typ: Prometheus Metrics + Grafana Dashboards
Auth: API Key / Basic Auth
Status: ✅ Open Source Stack


Übersicht

Monitoring-Stack für:

- API Health & Performance
- Server-Metriken (CPU, RAM, Disk)
- Uptime-Überwachung
- Alerting (E-Mail, Slack)


Stack-Komponenten

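| Tool | Funktion | Port |
|------|----------|------|
| Prometheus | Metrics Collection | 9090 |
| Grafana | Dashboards | 3000 (im Docker-Compose-Setup auf Host-Port 3001 veröffentlicht) |
| Alertmanager | Alerting | 9093 |
| Node Exporter | Host Metrics | 9100 |
| UptimeRobot | External Uptime | Cloud |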

API Endpoints

| Methode | Endpunkt | Beschreibung | Cache TTL |
|---------|----------|--------------|-----------|
| GET | /api/monitoring/health | Health Check | - |
| GET | /api/monitoring/metrics | Prometheus Metrics | - |
| GET | /api/monitoring/status | System Status | 30s |
| GET | /api/monitoring/uptime | Uptime History | 5min |
| GET | /api/monitoring/alerts | Active Alerts | 1min |

Prometheus Metrics

import { Registry, Counter, Histogram, Gauge } from 'prom-client';

// Registry instance that the metrics in this section are attached to
const register = new Registry();

/**
 * Counter of HTTP requests, labelled by request method, path and
 * response status.
 */
export const httpRequestsTotal = new Counter({
  registers: [register],
  labelNames: ['method', 'path', 'status'],
  help: 'Total HTTP requests',
  name: 'http_requests_total',
});

/**
 * Histogram of HTTP request durations in seconds, labelled by request
 * method and path.
 *
 * Bucket boundaries (seconds):
 * - `2` matches the 2 s threshold used by the `SlowResponses` alert rule, so
 *   the p95 estimate around that threshold is not interpolated across the
 *   whole 1 s – 5 s range.
 * - `10` keeps requests slower than 5 s from all collapsing into `+Inf`,
 *   where `histogram_quantile` can only report the highest finite bound.
 */
export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'path'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10],
  registers: [register]
});

/**
 * Gauge for the cache hit rate; according to its help text the value is
 * expressed as a percentage.
 */
export const cacheHitRate = new Gauge({
  registers: [register],
  help: 'Cache hit rate percentage',
  name: 'cache_hit_rate',
});

/**
 * Gauge for the number of currently active connections.
 */
export const activeConnections = new Gauge({
  registers: [register],
  help: 'Number of active connections',
  name: 'active_connections',
});

// Middleware: records one counter increment and one latency observation per
// request, once the response has been fully sent ('finish' event).
app.use((req, res, next) => {
  // Monotonic clock: unlike Date.now(), it cannot jump when the wall clock is
  // adjusted, so durations can never come out negative or inflated.
  const startedAt = process.hrtime.bigint();

  res.on('finish', () => {
    // Nanoseconds -> seconds, matching the histogram's unit.
    const durationSeconds = Number(process.hrtime.bigint() - startedAt) / 1e9;

    // Label with the matched route pattern only. Falling back to the raw
    // request path would create a new time series for every distinct URL a
    // client (or scanner) requests, i.e. unbounded label cardinality, so
    // requests without a matched route share one fixed label value.
    const routePath = req.route?.path;
    const path = routePath == null ? 'unmatched' : String(routePath);
    const method = req.method;

    httpRequestsTotal.inc({ method, path, status: res.statusCode });
    httpRequestDuration.observe({ method, path }, durationSeconds);
  });

  next();
});

// Metrics Endpoint: serialises every metric on `register` in the Prometheus
// exposition format.
app.get('/metrics', async (_req, res) => {
  try {
    // Collect first so that headers are only set once serialisation succeeded.
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (error: unknown) {
    // Without this catch a rejection from register.metrics() would surface as
    // an unhandled promise rejection (Express 4 does not handle rejections
    // from async handlers) and the scrape would hang until it times out.
    console.error('Failed to collect Prometheus metrics', error);
    res.status(500).end('Failed to collect metrics');
  }
});

Prometheus Config

# prometheus.yml
# Defaults applied to every job: scrape targets and evaluate rules every 15s.
global:
  evaluation_interval: 15s
  scrape_interval: 15s

# Alertmanager instance that fired alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# Alerting rules are loaded from this file.
rule_files:
  - "alerts.yml"

scrape_configs:
  # API service; metrics are read from /metrics on port 3000.
  - job_name: "contract-api"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "api:3000"

  # Node Exporter on port 9100.
  - job_name: "node-exporter"
    static_configs:
      - targets:
          - "node-exporter:9100"

  # SonicJS on port 8787; no metrics_path is set for this job.
  - job_name: "sonicjs"
    static_configs:
      - targets:
          - "sonicjs:8787"

Alert Rules

# alerts.yml
groups:
  - name: api-alerts
    rules:
      # Fires when more than 10% of a job's requests returned a 5xx status
      # over the last 5 minutes, sustained for 5 minutes.
      # Both sides are aggregated per job: comparing the raw per-series rate
      # against 0.1 would test an absolute requests/second value for each
      # single method/path/status combination, so errors spread over several
      # paths could stay below the threshold and never alert.
      - alert: HighErrorRate
        expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (job) (rate(http_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"

      # Fires when the estimated 95th percentile request duration of a series
      # stays above 2 seconds for 5 minutes.
      - alert: SlowResponses
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "95th percentile response time > 2s"

      # Fires when a scrape target has been unreachable (up == 0) for 1 minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"

Grafana Dashboard

{
  "title": "Contract API Dashboard",
  "panels": [
    {
      "title": "Requests/sec",
      "type": "graph",
      "targets": [{
        "expr": "rate(http_requests_total[1m])"
      }]
    },
    {
      "title": "Response Time (p95)",
      "type": "graph",
      "targets": [{
        "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"
      }]
    },
    {
      "title": "Error Rate",
      "type": "stat",
      "targets": [{
        "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100"
      }]
    },
    {
      "title": "Cache Hit Rate",
      "type": "gauge",
      "targets": [{
        "expr": "cache_hit_rate"
      }]
    }
  ]
}

UptimeRobot Integration

const UPTIMEROBOT_API = 'https://api.uptimerobot.com/v2';

/**
 * Sends a JSON POST request to an UptimeRobot v2 endpoint and returns the
 * parsed response body.
 *
 * The API key is read from `process.env.UPTIMEROBOT_API_KEY` and placed first
 * in the request body, followed by `params`.
 *
 * @param endpoint - Endpoint name appended to the API base URL, e.g. `getMonitors`.
 * @param params - Endpoint-specific body fields.
 * @returns The parsed JSON response body.
 * @throws Error if the API key is missing or empty, or if the HTTP status is not 2xx.
 */
async function postToUptimeRobot(endpoint: string, params: Record<string, unknown>) {
  const apiKey = process.env.UPTIMEROBOT_API_KEY;
  if (!apiKey) {
    // JSON.stringify would silently drop an undefined api_key and send an
    // unauthenticated request; fail fast with a clear message instead.
    throw new Error('UPTIMEROBOT_API_KEY is not set');
  }

  const response = await fetch(`${UPTIMEROBOT_API}/${endpoint}`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ api_key: apiKey, ...params })
  });

  if (!response.ok) {
    // Surface transport-level failures instead of trying to parse an error page.
    throw new Error(
      `UptimeRobot ${endpoint} failed: HTTP ${response.status} ${response.statusText}`
    );
  }

  // NOTE(review): the API presumably also reports failures inside a 2xx body
  // (a `stat` field) — callers should still inspect the result; confirm.
  return response.json();
}

/**
 * Fetches all monitors, including their logs (`logs: 1`), from UptimeRobot.
 *
 * @returns The parsed `getMonitors` response body.
 * @throws Error if the API key is missing or the request fails with a non-2xx status.
 */
async function getUptimeStatus() {
  return postToUptimeRobot('getMonitors', {
    format: 'json',
    logs: 1
  });
}

/**
 * Creates a new HTTP(s) monitor that is checked every 5 minutes.
 *
 * @param url - URL to monitor.
 * @param name - Display name of the monitor (`friendly_name`).
 * @returns The parsed `newMonitor` response body.
 * @throws Error if the API key is missing or the request fails with a non-2xx status.
 */
async function createMonitor(url: string, name: string) {
  return postToUptimeRobot('newMonitor', {
    friendly_name: name,
    url,
    type: 1,  // HTTP(s)
    interval: 300  // 5 minutes
  });
}

Docker Compose

# docker-compose.monitoring.yml
services:
  # Prometheus: scrape config and alert rules are mounted from the current
  # directory; collected data is kept in a named volume.
  prometheus:
    image: 'prom/prometheus:latest'
    volumes:
      - './prometheus.yml:/etc/prometheus/prometheus.yml'
      - './alerts.yml:/etc/prometheus/alerts.yml'
      - 'prometheus_data:/prometheus'
    ports:
      - '9090:9090'

  # Grafana: container port 3000 is published on host port 3001; the admin
  # password is taken from the GRAFANA_PASSWORD variable.
  grafana:
    image: 'grafana/grafana:latest'
    environment:
      GF_SECURITY_ADMIN_PASSWORD: '${GRAFANA_PASSWORD}'
    volumes:
      - 'grafana_data:/var/lib/grafana'
    ports:
      - '3001:3000'

  # Alertmanager: configuration is mounted from ./alertmanager.yml.
  alertmanager:
    image: 'prom/alertmanager:latest'
    volumes:
      - './alertmanager.yml:/etc/alertmanager/alertmanager.yml'
    ports:
      - '9093:9093'

  # Node Exporter.
  # NOTE(review): no host paths are mounted here, so the exporter presumably
  # only sees the container's view of the system — confirm.
  node-exporter:
    image: 'prom/node-exporter:latest'
    ports:
      - '9100:9100'

# Named volumes that keep Prometheus and Grafana data across container restarts.
volumes:
  prometheus_data: {}
  grafana_data: {}

Umgebungsvariablen

# Prometheus
# Base URL of the Prometheus server (port 9090).
PROMETHEUS_URL="http://prometheus:9090"

# Grafana
# Base URL of Grafana on its container port 3000.
GRAFANA_URL="http://grafana:3000"
# Passed to Grafana as GF_SECURITY_ADMIN_PASSWORD by the Docker Compose file.
# "secret" is a placeholder value — replace it before deploying.
GRAFANA_PASSWORD="secret"

# UptimeRobot
# API key sent as `api_key` in UptimeRobot requests; empty by default.
UPTIMEROBOT_API_KEY=""

# Alerting
# NOTE(review): presumably consumed by the Alertmanager configuration, which
# is not shown here — confirm.
ALERTMANAGER_SLACK_WEBHOOK=""
ALERTMANAGER_EMAIL="alerts@example.com"

Prometheus + Grafana • UptimeRobot • Alerting