Giao diện
📊 Observability
"You can't fix what you can't see."
Observability là khả năng hiểu internal state của system từ external outputs.
🔭 Three Pillars of Observability
┌─────────────────────────────────────────────────────────────┐
│ OBSERVABILITY │
├─────────────────┬──────────────────┬───────────────────────┤
│ LOGS │ METRICS │ TRACES │
│ │ │ │
│ What happened │ How is system │ How do requests │
│ at a point │ performing? │ flow through │
│ in time? │ │ services? │
│ │ │ │
│ "User X got │ "95th latency │ "Request took 50ms │
│ error Y" │ is 200ms" │ across 5 services" │
└─────────────────┴──────────────────┴───────────────────────┘

📝 Structured Logging
⚔️ Tradeoff: Logging Libraries
| Library | Performance | Features | When to Use |
|---|---|---|---|
| slog (stdlib) | ★★★★☆ | Standard, flexible | Go 1.21+, new projects |
| zerolog | ★★★★★ | Zero alloc, fast | High-performance |
| zap | ★★★★★ | Rich features | Uber ecosystem |
| logrus | ★★★☆☆ | Hooks, formatters | Legacy (not recommended) |
slog (Go 1.21+ Standard Library)
go
import "log/slog"
func main() {
	// JSON handler for production: one machine-parseable object per line.
	logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
		Level: slog.LevelInfo, // records below INFO (e.g. DEBUG) are dropped
	}))
	// Route the package-level slog.Info/Error helpers through this logger.
	slog.SetDefault(logger)

	// Structured logging: typed key/value attributes instead of format strings.
	slog.Info("server started",
		slog.String("addr", ":8080"),
		slog.Int("port", 8080),
	)

	// With context — context-aware handlers can pull trace/request IDs from it.
	// (snippet: ctx is assumed to be in scope)
	slog.InfoContext(ctx, "request processed",
		slog.String("method", "GET"),
		slog.String("path", "/api/users"),
		slog.Duration("latency", 42*time.Millisecond),
	)

	// Error with details. (snippet: err is assumed to be in scope)
	slog.Error("database query failed",
		slog.String("query", "SELECT * FROM users"),
		slog.Any("error", err),
	)
}
// Output (JSON):
// {"time":"2024-01-15T10:30:00Z","level":"INFO","msg":"server started","addr":":8080","port":8080}zerolog (Zero Allocation)
go
import "github.com/rs/zerolog"
func main() {
	// Configure: Unix timestamps are cheaper to encode than RFC3339 strings.
	zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
	logger := zerolog.New(os.Stdout).With().Timestamp().Logger()

	// Structured logging: fluent builder API; nothing is written until the
	// terminal Msg(...) call.
	logger.Info().
		Str("service", "user-api").
		Int("port", 8080).
		Msg("server started")

	// With error. (snippet: err is assumed to be in scope)
	logger.Error().
		Err(err).
		Str("user_id", "123").
		Msg("failed to create user")

	// Sub-logger with context: fields attached once, stamped on every record.
	// (snippet: requestID/userAgent assumed to be in scope)
	requestLogger := logger.With().
		Str("request_id", requestID).
		Str("user_agent", userAgent).
		Logger()
	requestLogger.Info().Msg("request received")
}

zap (Uber's High-Performance Logger)
go
import "go.uber.org/zap"
func main() {
	// Production config: JSON encoder, INFO level, sampling enabled.
	// NOTE(review): the construction error is discarded — snippet only.
	logger, _ := zap.NewProduction()
	// Sync flushes any buffered entries before the process exits.
	defer logger.Sync()

	// Structured logging with strongly-typed fields (no reflection).
	logger.Info("server started",
		zap.String("addr", ":8080"),
		zap.Int("port", 8080),
	)

	// Sugared logger: loosely-typed convenience API (slightly slower).
	sugar := logger.Sugar()
	sugar.Infow("request processed",
		"method", "GET",
		"path", "/api/users",
		"latency_ms", 42,
	)

	// With fields: a child logger that stamps these fields on every entry.
	userLogger := logger.With(
		zap.String("user_id", "123"),
		zap.String("session_id", "abc"),
	)
	userLogger.Info("user action", zap.String("action", "login"))
}

📈 Metrics with Prometheus
Basic Counter & Histogram
go
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
var (
	// Counter: monotonically increasing; per-second rates are derived at
	// query time, e.g. rate(http_requests_total[5m]).
	requestsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "http_requests_total",
			Help: "Total number of HTTP requests",
		},
		// NOTE(review): a raw "path" label assumes a bounded set of paths —
		// parameterized URLs would explode time-series cardinality; confirm
		// or switch to the route pattern.
		[]string{"method", "path", "status"},
	)

	// Histogram: distribution of observed values; buckets span 5ms–10s so
	// both fast and slow requests land in a meaningful bucket.
	requestDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "http_request_duration_seconds",
			Help: "HTTP request duration in seconds",
			Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
		},
		[]string{"method", "path"},
	)

	// Gauge: can go up or down — a current value, not a cumulative total.
	activeConnections = promauto.NewGauge(
		prometheus.GaugeOpts{
			Name: "active_connections",
			Help: "Number of active connections",
		},
	)
)
// Middleware to record metrics
// MetricsMiddleware wraps next and records RED metrics for every request:
// request count (with status label) and latency distribution.
func MetricsMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()

		// Capture the status code the handler writes; default to 200.
		rec := &responseWriter{ResponseWriter: w, status: 200}
		next.ServeHTTP(rec, r)

		// NOTE(review): r.URL.Path as a label can explode cardinality on
		// parameterized routes — confirm paths are bounded.
		status := fmt.Sprintf("%d", rec.status)
		requestsTotal.WithLabelValues(r.Method, r.URL.Path, status).Inc()
		requestDuration.WithLabelValues(r.Method, r.URL.Path).Observe(time.Since(began).Seconds())
	})
}
func main() {
	// Expose the Prometheus scrape endpoint; promhttp serves the default
	// registry, into which promauto registered the collectors above.
	http.Handle("/metrics", promhttp.Handler())
}

Business Metrics
go
var (
	// usersCreated counts successful user creations (business-level metric).
	usersCreated = promauto.NewCounter(prometheus.CounterOpts{
		Name: "users_created_total",
		Help: "Total number of users created",
	})

	// ordersProcessed is partitioned by outcome and payment method so
	// failure rates per payment provider can be queried directly.
	ordersProcessed = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "orders_processed_total",
			Help: "Total orders processed",
		},
		[]string{"status", "payment_method"},
	)

	// cacheHitRatio is a gauge (a ratio moves up and down), one series per cache.
	cacheHitRatio = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "cache_hit_ratio",
			Help: "Cache hit ratio",
		},
		[]string{"cache_name"},
	)
)
// Create persists a new user and records the business counter.
func (s *UserService) Create(ctx context.Context, user *User) error {
	// ... business logic ...
	usersCreated.Inc() // increment only after the create succeeded
	return nil
}

🔗 Distributed Tracing with OpenTelemetry
Setup
go
import (
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/sdk/trace"
)
// initTracer configures the global OpenTelemetry tracer provider, exporting
// spans over OTLP/gRPC to a local collector. The caller must Shutdown the
// returned provider on exit to flush buffered spans.
func initTracer() (*trace.TracerProvider, error) {
	exporter, err := otlptracegrpc.New(context.Background(),
		otlptracegrpc.WithEndpoint("localhost:4317"), // default OTLP gRPC port
		otlptracegrpc.WithInsecure(),                 // no TLS — local collector only
	)
	if err != nil {
		return nil, err
	}
	tp := trace.NewTracerProvider(
		// Batcher buffers spans and exports them asynchronously.
		trace.WithBatcher(exporter),
		// Resource attributes identify this service in the tracing backend.
		trace.WithResource(resource.NewWithAttributes(
			semconv.SchemaURL,
			semconv.ServiceName("user-service"),
			semconv.ServiceVersion("1.0.0"),
		)),
	)
	otel.SetTracerProvider(tp)
	return tp, nil
}

Creating Spans
go
import (
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
)
// tracer is this package's named tracer; the name identifies the
// instrumentation scope in exported spans.
var tracer = otel.Tracer("user-service")
// GetUser fetches a user by ID inside its own span; the repository call
// below creates a child span through the propagated ctx.
func (s *UserService) GetUser(ctx context.Context, id int64) (*User, error) {
	ctx, span := tracer.Start(ctx, "UserService.GetUser")
	defer span.End()

	// Add attributes for filtering/search in the tracing backend.
	span.SetAttributes(
		attribute.Int64("user.id", id),
	)

	// Call repository (creates child span via the same ctx).
	user, err := s.repo.GetByID(ctx, id)
	if err != nil {
		// Record the error AND set the status — most backends treat only
		// status=Error as a failed span; RecordError alone is just an event.
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		return nil, err
	}

	// NOTE(review): user.email is PII stored in the tracing backend —
	// confirm this is acceptable under your data-retention policy.
	span.SetAttributes(
		attribute.String("user.email", user.Email),
	)
	return user, nil
}
// GetByID loads one user row, annotated with semantic db.* attributes so
// trace backends can render this span as a database call.
func (r *UserRepository) GetByID(ctx context.Context, id int64) (*User, error) {
	ctx, span := tracer.Start(ctx, "UserRepository.GetByID")
	defer span.End()
	span.SetAttributes(
		attribute.String("db.system", "postgresql"),
		attribute.String("db.statement", "SELECT * FROM users WHERE id = $1"),
	)
	// ... database query ...
	// (snippet: `user` is assumed to be populated by the elided query above)
	return user, nil
}

HTTP Middleware for Tracing
go
import (
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)
func main() {
	// Wrap the whole router so every request gets a server span; message
	// events additionally record body read/write events on the span.
	// (snippet: router is assumed to be in scope)
	handler := otelhttp.NewHandler(router, "server",
		otelhttp.WithMessageEvents(otelhttp.ReadEvents, otelhttp.WriteEvents),
	)
	// NOTE(review): ListenAndServe's error is discarded — snippet only.
	http.ListenAndServe(":8080", handler)
}

🏥 Health Checks
Liveness vs Readiness
go
// HealthChecker probes the service's hard dependencies for readiness checks.
type HealthChecker struct {
	db    *sql.DB       // primary datastore
	redis *redis.Client // cache / session store
}
// Liveness: Is the process running?
// Liveness answers "is the process running?". It deliberately checks no
// dependencies: a failing liveness probe triggers a restart, and restarting
// cannot fix a broken downstream.
func (h *HealthChecker) Liveness(w http.ResponseWriter, r *http.Request) {
	w.WriteHeader(http.StatusOK)
	w.Write([]byte("OK"))
}
// Readiness: Can the service handle traffic?
// Readiness answers "can the service handle traffic?" by pinging each hard
// dependency; a failing readiness probe removes the instance from load
// balancing without restarting it.
func (h *HealthChecker) Readiness(w http.ResponseWriter, r *http.Request) {
	// Bound total check time so a hung dependency can't stall the probe.
	ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
	defer cancel()

	checks := make(map[string]string)
	healthy := true

	// Check database
	if err := h.db.PingContext(ctx); err != nil {
		checks["database"] = fmt.Sprintf("unhealthy: %v", err)
		healthy = false
	} else {
		checks["database"] = "healthy"
	}

	// Check Redis
	if err := h.redis.Ping(ctx).Err(); err != nil {
		checks["redis"] = fmt.Sprintf("unhealthy: %v", err)
		healthy = false
	} else {
		checks["redis"] = "healthy"
	}

	// 503 tells the load balancer / kubelet to stop routing traffic here.
	status := http.StatusOK
	if !healthy {
		status = http.StatusServiceUnavailable
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	json.NewEncoder(w).Encode(map[string]interface{}{
		"status": map[bool]string{true: "healthy", false: "unhealthy"}[healthy],
		"checks": checks,
	})
}

💻 Engineering Example: Observable Microservice
go
package main
import (
	"context"
	"errors"
	"log/slog"
	"net/http"
	"os"
	"os/signal"
	"syscall"
	"time"

	"github.com/go-chi/chi/v5"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
	"go.opentelemetry.io/otel"
)
// main wires up the three observability pillars (structured logs, Prometheus
// metrics, OpenTelemetry traces) plus health endpoints, then serves until a
// shutdown signal arrives and drains gracefully.
func main() {
	// Initialize structured JSON logging first so every later failure is
	// recorded in machine-parseable form.
	logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
		Level: slog.LevelInfo,
	}))
	slog.SetDefault(logger)

	// Initialize tracing; without a provider, spans are silent no-ops.
	tp, err := initTracer()
	if err != nil {
		slog.Error("failed to init tracer", slog.Any("error", err))
		os.Exit(1)
	}
	// Flush buffered spans on exit; an ignored error here would silently
	// drop the final batch of traces.
	defer func() {
		if err := tp.Shutdown(context.Background()); err != nil {
			slog.Error("tracer shutdown failed", slog.Any("error", err))
		}
	}()

	// Router with observability middleware. Tracing is outermost so the
	// logging and metrics layers run inside the server span.
	r := chi.NewRouter()
	r.Use(func(next http.Handler) http.Handler {
		return otelhttp.NewHandler(next, "server")
	})
	r.Use(loggingMiddleware)
	r.Use(metricsMiddleware)

	// Health endpoints (no auth required) — probed by the orchestrator.
	r.Get("/health", healthHandler)
	r.Get("/ready", readyHandler)
	r.Handle("/metrics", promhttp.Handler())

	// API routes.
	r.Route("/api/v1", func(r chi.Router) {
		r.Get("/users/{id}", getUserHandler)
	})

	srv := &http.Server{
		Addr:    ":8080",
		Handler: r,
	}

	// Serve in the background so main can block on shutdown signals.
	go func() {
		slog.Info("server starting", slog.String("addr", ":8080"))
		// ErrServerClosed is the expected result of a graceful Shutdown;
		// errors.Is also matches wrapped errors, unlike plain !=.
		if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
			slog.Error("server error", slog.Any("error", err))
		}
	}()

	// Graceful shutdown: wait for SIGINT/SIGTERM, then drain in-flight
	// requests for up to 30 seconds.
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
	<-quit
	slog.Info("shutting down...")

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	if err := srv.Shutdown(ctx); err != nil {
		slog.Error("graceful shutdown failed", slog.Any("error", err))
	}
}
// loggingMiddleware emits one structured access-log line per request with
// method, path, status, latency, and client address.
func loggingMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()
		// Wrap the writer to capture the status code the handler sets;
		// default to 200 because handlers that never call WriteHeader
		// implicitly return 200.
		ww := &responseWriter{ResponseWriter: w, status: 200}
		next.ServeHTTP(ww, r)
		slog.Info("request",
			slog.String("method", r.Method),
			slog.String("path", r.URL.Path),
			slog.Int("status", ww.status),
			slog.Duration("latency", time.Since(start)),
			slog.String("remote_addr", r.RemoteAddr),
		)
	})
}

✅ Ship-to-Prod Checklist
Logging
- [ ] Structured logging (JSON in production)
- [ ] Log levels configured appropriately
- [ ] Correlation IDs in all log entries
- [ ] No sensitive data in logs
- [ ] Log rotation configured
Metrics
- [ ] RED metrics: Rate, Errors, Duration
- [ ] /metrics endpoint exposed
- [ ] Business metrics alongside technical
- [ ] Grafana dashboards created
- [ ] Alerts configured for critical metrics
Tracing
- [ ] OpenTelemetry initialized
- [ ] Spans for all significant operations
- [ ] Context propagation across services
- [ ] Sampling configured for high traffic
Health Checks
- [ ] /health (liveness) endpoint
- [ ] /ready (readiness) endpoint
- [ ] Dependency checks in readiness
- [ ] Kubernetes probes configured
📊 Summary
| Pillar | Tool | Purpose |
|---|---|---|
| Logs | slog/zap/zerolog | Event recording |
| Metrics | Prometheus | System state |
| Traces | OpenTelemetry | Request flow |
| Health | Custom endpoints | Load balancer |
➡️ Tiếp theo
Observability nắm vững rồi! Tiếp theo: Deployment - Docker, Kubernetes, và CI/CD.