Skip to content

📊 Observability

"You can't fix what you can't see."
Observability là khả năng hiểu internal state của system từ external outputs.

🔭 Three Pillars of Observability

┌─────────────────────────────────────────────────────────────┐
│                    OBSERVABILITY                            │
├─────────────────┬──────────────────┬───────────────────────┤
│     LOGS        │     METRICS      │       TRACES          │
│                 │                  │                       │
│  What happened  │  How is system   │  How do requests      │
│  at a point     │  performing?     │  flow through         │
│  in time?       │                  │  services?            │
│                 │                  │                       │
│  "User X got    │  "p95 latency    │  "Request took 50ms   │
│   error Y"      │   is 200ms"      │   across 5 services"  │
└─────────────────┴──────────────────┴───────────────────────┘

📝 Structured Logging

⚔️ Tradeoff: Logging Libraries

| Library | Performance | Features | When to Use |
|---|---|---|---|
| slog (stdlib) | ★★★★☆ | Standard, flexible | Go 1.21+, new projects |
| zerolog | ★★★★★ | Zero alloc, fast | High-performance |
| zap | ★★★★★ | Rich features | Uber ecosystem |
| logrus | ★★★☆☆ | Hooks, formatters | Legacy (not recommended) |

slog (Go 1.21+ Standard Library)

go
import "log/slog"

// main walks through the core log/slog calls: installing a JSON handler as
// the process-wide default, then emitting records with typed attributes.
// ctx and err are assumed to be provided by the surrounding program.
func main() {
    // Production setup: JSON records on stdout, minimum level Info.
    opts := &slog.HandlerOptions{Level: slog.LevelInfo}
    handler := slog.NewJSONHandler(os.Stdout, opts)
    logger := slog.New(handler)
    slog.SetDefault(logger)

    // Plain structured record: message plus typed key/value attributes.
    slog.Info(
        "server started",
        slog.String("addr", ":8080"),
        slog.Int("port", 8080),
    )

    // Context-aware variant: the context is handed to the handler.
    slog.InfoContext(
        ctx,
        "request processed",
        slog.String("method", "GET"),
        slog.String("path", "/api/users"),
        slog.Duration("latency", 42*time.Millisecond),
    )

    // Error record carrying the failing query and the error value.
    slog.Error(
        "database query failed",
        slog.String("query", "SELECT * FROM users"),
        slog.Any("error", err),
    )
}

// Output (JSON):
// {"time":"2024-01-15T10:30:00Z","level":"INFO","msg":"server started","addr":":8080","port":8080}

zerolog (Zero Allocation)

go
import "github.com/rs/zerolog"

// main demonstrates zerolog's fluent API: a timestamped root logger, events
// built by chaining typed field setters, and a derived sub-logger that
// carries request-scoped fields. err, requestID and userAgent are assumed to
// be provided by the surrounding program.
func main() {
    // Serialize time fields as Unix timestamps.
    zerolog.TimeFieldFormat = zerolog.TimeFormatUnix

    // Root logger: writes to stdout and stamps every event with a time.
    root := zerolog.New(os.Stdout)
    logger := root.With().Timestamp().Logger()

    // Info event with two fields.
    startup := logger.Info().Str("service", "user-api").Int("port", 8080)
    startup.Msg("server started")

    // Error event: Err attaches the error value.
    failure := logger.Error().Err(err).Str("user_id", "123")
    failure.Msg("failed to create user")

    // Child logger: every event it emits includes the request fields.
    requestFields := logger.With().
        Str("request_id", requestID).
        Str("user_agent", userAgent)
    requestLogger := requestFields.Logger()

    requestLogger.Info().Msg("request received")
}

zap (Uber's High-Performance Logger)

go
import "go.uber.org/zap"

// main demonstrates zap: the strongly-typed Logger, the more convenient
// SugaredLogger, and a child logger that carries fixed fields.
func main() {
    // Production config. zap.Must panics if construction fails, instead of
    // discarding the error and continuing with a nil logger that would
    // crash on first use.
    logger := zap.Must(zap.NewProduction())
    // Flush buffered entries on exit. This is best-effort: there is nothing
    // useful to do with a Sync error while the process is terminating, so it
    // is discarded explicitly rather than silently.
    defer func() { _ = logger.Sync() }()

    // Structured logging with typed fields.
    logger.Info("server started",
        zap.String("addr", ":8080"),
        zap.Int("port", 8080),
    )

    // Sugared logger (more convenient, slightly slower): loosely-typed
    // alternating key/value pairs.
    sugar := logger.Sugar()
    sugar.Infow("request processed",
        "method", "GET",
        "path", "/api/users",
        "latency_ms", 42,
    )

    // Child logger: the given fields are attached to every entry it writes.
    userLogger := logger.With(
        zap.String("user_id", "123"),
        zap.String("session_id", "abc"),
    )
    userLogger.Info("user action", zap.String("action", "login"))
}

📈 Metrics with Prometheus

Basic Counter & Histogram

go
import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// HTTP-level metrics, registered through promauto at package
// initialization.
var (
    // requestsTotal is a counter (monotonically increasing) of HTTP
    // requests, labelled by method, path and status.
    requestsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "http_requests_total",
            Help: "Total number of HTTP requests",
        },
        []string{"method", "path", "status"},
    )

    // requestDuration is a histogram (distribution of observed values) of
    // request durations in seconds, labelled by method and path.
    requestDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "http_request_duration_seconds",
            Help: "HTTP request duration in seconds",
            Buckets: []float64{
                0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, // below one second
                1, 2.5, 5, 10, // one second and above
            },
        },
        []string{"method", "path"},
    )

    // activeConnections is a gauge: unlike a counter it can go up or down.
    activeConnections = promauto.NewGauge(
        prometheus.GaugeOpts{
            Name: "active_connections",
            Help: "Number of active connections",
        },
    )
)

// Middleware to record metrics
// MetricsMiddleware returns a handler that delegates to next and then
// records one requestsTotal increment and one requestDuration observation
// for the request.
//
// NOTE(review): the "path" label is the raw r.URL.Path, so every distinct
// URL (for example /users/1, /users/2, ...) creates its own time series.
// Consider labelling with the matched route pattern instead.
func MetricsMiddleware(next http.Handler) http.Handler {
    instrumented := func(w http.ResponseWriter, r *http.Request) {
        begin := time.Now()

        // Wrap the writer so the status can be read after the handler has
        // run; the wrapper starts at 200.
        rec := &responseWriter{ResponseWriter: w, status: 200}
        next.ServeHTTP(rec, r)

        // Measure first, then publish both metrics.
        elapsed := time.Since(begin).Seconds()
        method, path := r.Method, r.URL.Path
        statusLabel := fmt.Sprintf("%d", rec.status)

        requestsTotal.WithLabelValues(method, path, statusLabel).Inc()
        requestDuration.WithLabelValues(method, path).Observe(elapsed)
    }
    return http.HandlerFunc(instrumented)
}

// main registers the Prometheus scrape endpoint on the default mux.
func main() {
    // promhttp.Handler serves the metrics; mount it at /metrics.
    metricsHandler := promhttp.Handler()
    http.Handle("/metrics", metricsHandler)
}

Business Metrics

go
// Business-level metrics, registered through promauto at package
// initialization.
var (
    // usersCreated counts created users; UserService.Create increments it.
    usersCreated = promauto.NewCounter(
        prometheus.CounterOpts{
            Name: "users_created_total",
            Help: "Total number of users created",
        },
    )

    // ordersProcessed counts processed orders, labelled by status and
    // payment method.
    ordersProcessed = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "orders_processed_total",
            Help: "Total orders processed",
        },
        []string{"status", "payment_method"},
    )

    // cacheHitRatio is a gauge of the hit ratio, one series per cache name.
    cacheHitRatio = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "cache_hit_ratio",
            Help: "Cache hit ratio",
        },
        []string{"cache_name"},
    )
)

// Create shows where a business metric is recorded: once the (elided)
// creation logic has run, the usersCreated counter is incremented and nil is
// returned.
func (s *UserService) Create(ctx context.Context, user *User) error {
    // ... business logic ...
    
    // Count the created user.
    usersCreated.Inc()
    return nil
}

🔗 Distributed Tracing with OpenTelemetry

Setup

go
import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
    "go.opentelemetry.io/otel/sdk/trace"
)

// initTracer builds an OTLP/gRPC span exporter, wraps it in a batching
// TracerProvider tagged with this service's name and version, and installs
// that provider globally. The exporter's construction error is returned
// as-is; on success the provider is returned so the caller can shut it down.
func initTracer() (*trace.TracerProvider, error) {
    // Exporter: OTLP endpoint on localhost:4317 over an insecure connection.
    exporterOpts := []otlptracegrpc.Option{
        otlptracegrpc.WithEndpoint("localhost:4317"),
        otlptracegrpc.WithInsecure(),
    }
    exporter, err := otlptracegrpc.New(context.Background(), exporterOpts...)
    if err != nil {
        return nil, err
    }

    // Resource attributes identifying this service on every span.
    res := resource.NewWithAttributes(
        semconv.SchemaURL,
        semconv.ServiceName("user-service"),
        semconv.ServiceVersion("1.0.0"),
    )

    provider := trace.NewTracerProvider(
        trace.WithBatcher(exporter),
        trace.WithResource(res),
    )

    otel.SetTracerProvider(provider)
    return provider, nil
}

Creating Spans

go
import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
)

// tracer is the package-level tracer, obtained from the global OpenTelemetry
// provider under the instrumentation name "user-service". The service and
// repository methods use it to start their spans.
var tracer = otel.Tracer("user-service")

// GetUser loads the user with the given id inside a "UserService.GetUser"
// span. The id is attached as a span attribute up front; on a repository
// error the error is recorded on the span, the span status is set to Error
// and the error is returned; on success the user's email is attached as well.
func (s *UserService) GetUser(ctx context.Context, id int64) (*User, error) {
    spanCtx, span := tracer.Start(ctx, "UserService.GetUser")
    defer span.End()

    span.SetAttributes(attribute.Int64("user.id", id))

    // Pass the span's context down so the repository span becomes a child.
    found, lookupErr := s.repo.GetByID(spanCtx, id)
    if lookupErr != nil {
        span.RecordError(lookupErr)
        span.SetStatus(codes.Error, lookupErr.Error())
        return nil, lookupErr
    }

    span.SetAttributes(attribute.String("user.email", found.Email))
    return found, nil
}

// GetByID starts a "UserRepository.GetByID" span from ctx, tags it with the
// database system and statement, and returns the user produced by the
// (elided) database query.
func (r *UserRepository) GetByID(ctx context.Context, id int64) (*User, error) {
    // Started from the caller's ctx, so this span nests under the caller's.
    ctx, span := tracer.Start(ctx, "UserRepository.GetByID")
    defer span.End()
    
    // Describe the database call on the span.
    span.SetAttributes(
        attribute.String("db.system", "postgresql"),
        attribute.String("db.statement", "SELECT * FROM users WHERE id = $1"),
    )
    
    // ... database query ...
    
    return user, nil
}

HTTP Middleware for Tracing

go
import (
    "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)

// main wraps the application router with otelhttp instrumentation and serves
// it on :8080.
func main() {
    // Trace every request under the operation name "server", recording
    // read and write message events on the span.
    handler := otelhttp.NewHandler(router, "server",
        otelhttp.WithMessageEvents(otelhttp.ReadEvents, otelhttp.WriteEvents),
    )

    // ListenAndServe only returns on failure (for example when the port is
    // already in use). Surface that error instead of dropping it, so the
    // process does not exit silently with status 0.
    if err := http.ListenAndServe(":8080", handler); err != nil {
        panic(err)
    }
}

🏥 Health Checks

Liveness vs Readiness

go
// HealthChecker serves the liveness and readiness endpoints. It holds the
// dependencies that the readiness probe pings.
type HealthChecker struct {
    db    *sql.DB       // pinged by Readiness via PingContext
    redis *redis.Client // pinged by Readiness via Ping
}

// Liveness: Is the process running?
// Liveness answers "is the process running?". It checks no dependencies and
// always replies 200 with the body "OK".
func (h *HealthChecker) Liveness(w http.ResponseWriter, r *http.Request) {
    body := []byte("OK")

    w.WriteHeader(http.StatusOK)
    w.Write(body)
}

// Readiness: Can the service handle traffic?
// Readiness answers "can the service handle traffic?". It pings the database
// and then Redis under one shared 5-second deadline derived from the request
// context. The JSON reply has an overall "status" and a per-dependency
// "checks" map; the HTTP status is 200 when both checks pass and 503
// otherwise.
func (h *HealthChecker) Readiness(w http.ResponseWriter, r *http.Request) {
    ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
    defer cancel()

    results := make(map[string]string)
    allHealthy := true

    // report records one dependency's outcome and clears the overall flag
    // on failure.
    report := func(name string, err error) {
        if err == nil {
            results[name] = "healthy"
            return
        }
        results[name] = fmt.Sprintf("unhealthy: %v", err)
        allHealthy = false
    }

    report("database", h.db.PingContext(ctx))
    report("redis", h.redis.Ping(ctx).Err())

    code, overall := http.StatusOK, "healthy"
    if !allHealthy {
        code, overall = http.StatusServiceUnavailable, "unhealthy"
    }

    w.Header().Set("Content-Type", "application/json")
    w.WriteHeader(code)
    json.NewEncoder(w).Encode(map[string]any{
        "status": overall,
        "checks": results,
    })
}

💻 Engineering Example: Observable Microservice

go
package main

import (
    "context"
    "log/slog"
    "net/http"
    "os"
    "os/signal"
    "syscall"
    "time"

    "github.com/go-chi/chi/v5"
    "github.com/prometheus/client_golang/prometheus/promhttp"
    "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
    "go.opentelemetry.io/otel"
)

// main wires the three observability pillars into one HTTP service:
// structured JSON logging, OpenTelemetry tracing, and Prometheus metrics. It
// then serves until either a termination signal arrives or the listener
// fails, and shuts down gracefully.
//
// The process exits with status 1 if the tracer cannot be initialized, if
// the listener fails, or if either shutdown step reports an error.
func main() {
    // Initialize logging: JSON to stdout at Info level, installed as the
    // process-wide default.
    logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
        Level: slog.LevelInfo,
    }))
    slog.SetDefault(logger)

    // Initialize tracing. The provider is shut down explicitly at the end of
    // main (not via defer) so that its error can be checked and so that
    // os.Exit does not skip it.
    tp, err := initTracer()
    if err != nil {
        slog.Error("failed to init tracer", slog.Any("error", err))
        os.Exit(1)
    }

    // Router with observability
    r := chi.NewRouter()

    // Observability middleware: tracing first, then logging and metrics.
    r.Use(func(next http.Handler) http.Handler {
        return otelhttp.NewHandler(next, "server")
    })
    r.Use(loggingMiddleware)
    r.Use(metricsMiddleware)

    // Health endpoints (no auth required)
    r.Get("/health", healthHandler)
    r.Get("/ready", readyHandler)
    r.Handle("/metrics", promhttp.Handler())

    // API routes
    r.Route("/api/v1", func(r chi.Router) {
        r.Get("/users/{id}", getUserHandler)
    })

    // Explicit timeouts: a zero-valued http.Server never times out, which
    // lets slow or stalled clients hold connections open indefinitely.
    srv := &http.Server{
        Addr:              ":8080",
        Handler:           r,
        ReadHeaderTimeout: 5 * time.Second,
        ReadTimeout:       15 * time.Second,
        WriteTimeout:      30 * time.Second,
        IdleTimeout:       60 * time.Second,
    }

    // Serve in the background. A listener failure (for example the port is
    // already in use) is reported back to main so the process terminates
    // instead of waiting forever for a signal. The channel is buffered so
    // the goroutine never blocks on send.
    serverErr := make(chan error, 1)
    go func() {
        slog.Info("server starting", slog.String("addr", srv.Addr))
        if err := srv.ListenAndServe(); err != http.ErrServerClosed {
            serverErr <- err
        }
    }()

    // Wait for a termination signal or a server failure.
    quit := make(chan os.Signal, 1)
    signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)

    exitCode := 0
    select {
    case <-quit:
        slog.Info("shutting down...")
    case err := <-serverErr:
        slog.Error("server error", slog.Any("error", err))
        exitCode = 1
    }

    // Graceful shutdown: drain in-flight requests, then shut down the
    // tracer provider, both within one 30-second budget.
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    if err := srv.Shutdown(ctx); err != nil {
        slog.Error("server shutdown failed", slog.Any("error", err))
        exitCode = 1
    }
    if err := tp.Shutdown(ctx); err != nil {
        slog.Error("tracer shutdown failed", slog.Any("error", err))
        exitCode = 1
    }
    cancel()

    if exitCode != 0 {
        os.Exit(exitCode)
    }
}

// loggingMiddleware returns a handler that delegates to next and then writes
// one Info-level "request" record with the method, path, response status,
// latency and remote address.
func loggingMiddleware(next http.Handler) http.Handler {
    logged := func(w http.ResponseWriter, r *http.Request) {
        begin := time.Now()

        // Wrap the writer so the status can be read after the handler has
        // run; the wrapper starts at 200.
        rec := &responseWriter{ResponseWriter: w, status: 200}
        next.ServeHTTP(rec, r)

        slog.Info(
            "request",
            slog.String("method", r.Method),
            slog.String("path", r.URL.Path),
            slog.Int("status", rec.status),
            slog.Duration("latency", time.Since(begin)),
            slog.String("remote_addr", r.RemoteAddr),
        )
    }
    return http.HandlerFunc(logged)
}

Ship-to-Prod Checklist

Logging

  • [ ] Structured logging (JSON in production)
  • [ ] Log levels configured appropriately
  • [ ] Correlation IDs in all log entries
  • [ ] No sensitive data in logs
  • [ ] Log rotation configured

Metrics

  • [ ] RED metrics: Rate, Errors, Duration
  • [ ] /metrics endpoint exposed
  • [ ] Business metrics alongside technical
  • [ ] Grafana dashboards created
  • [ ] Alerts configured for critical metrics

Tracing

  • [ ] OpenTelemetry initialized
  • [ ] Spans for all significant operations
  • [ ] Context propagation across services
  • [ ] Sampling configured for high traffic

Health Checks

  • [ ] /health (liveness) endpoint
  • [ ] /ready (readiness) endpoint
  • [ ] Dependency checks in readiness
  • [ ] Kubernetes probes configured

📊 Summary

| Pillar | Tool | Purpose |
|---|---|---|
| Logs | slog/zap/zerolog | Event recording |
| Metrics | Prometheus | System state |
| Traces | OpenTelemetry | Request flow |
| Health | Custom endpoints | Load balancer |

➡️ Tiếp theo

Observability nắm vững rồi! Tiếp theo: Deployment - Docker, Kubernetes, và CI/CD.