Skip to content

Performance Tuning

"Don't guess, measure."
Performance optimization phải dựa trên data, không phải intuition.

🔬 Profiling Workflow

The Golden Rule

1. Write correct code first
2. Measure with benchmarks
3. Profile to find bottlenecks
4. Optimize the bottleneck
5. Measure again
6. Repeat

CPU Profiling

go
import (
    "log"
    "os"
    "runtime/pprof"
)

func main() {
    // Write a CPU profile covering the whole process lifetime.
    // Fix: the original ignored both errors; a bad path or full disk
    // silently produced no profile.
    f, err := os.Create("cpu.prof")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    if err := pprof.StartCPUProfile(f); err != nil {
        log.Fatal(err)
    }
    defer pprof.StopCPUProfile() // runs before f.Close (LIFO), so the file is complete

    // Your code here...
    doWork()
}

// Alternative: via test
// $ go test -cpuprofile=cpu.prof -bench=.
// $ go tool pprof cpu.prof

Memory Profiling

go
func main() {
    doWork()

    // Snapshot the heap profile after the workload has run.
    // Fix: the original ignored the errors from os.Create and
    // WriteHeapProfile, so a failed snapshot went unnoticed.
    f, err := os.Create("mem.prof")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    runtime.GC() // force a collection so the profile reflects live objects only
    if err := pprof.WriteHeapProfile(f); err != nil {
        log.Fatal(err)
    }
}

// Analysis
// $ go tool pprof mem.prof
// (pprof) top10
// (pprof) list funcName

HTTP pprof Server

go
import (
    "net/http"
    _ "net/http/pprof"  // Register /debug/pprof/ handlers
)

func main() {
    // Serve /debug/pprof/ on a loopback-only port in the background.
    // ListenAndServe blocks until it fails, so the returned error is
    // logged when (and only when) the server stops.
    go func() {
        err := http.ListenAndServe("localhost:6060", nil)
        log.Println(err)
    }()

    // Main app...
}

// Access profiles:
// http://localhost:6060/debug/pprof/
// http://localhost:6060/debug/pprof/heap
// http://localhost:6060/debug/pprof/goroutine

// Live analysis
// $ go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30

📊 Understanding pprof Output

Common Commands

bash
$ go tool pprof cpu.prof

(pprof) top10         # Top 10 by self time
(pprof) top10 -cum    # Top 10 by cumulative time
(pprof) list funcName # Source view with time annotation
(pprof) web           # Visual graph (requires graphviz)
(pprof) svg           # Save SVG file

Reading the Output

      flat  flat%   sum%        cum   cum%
     3.00s 30.00% 30.00%      7.50s 75.00%  main.processItem
     2.00s 20.00% 50.00%      2.00s 20.00%  runtime.memmove
     1.50s 15.00% 65.00%      5.00s 50.00%  encoding/json.Marshal
| Column | Meaning                                       |
| ------ | --------------------------------------------- |
| flat   | Time spent in this function only              |
| flat%  | flat as a percentage of total time            |
| sum%   | Running total of flat% down the table         |
| cum    | Time in this function plus its callees        |
| cum%   | cum as a percentage of total time             |

💾 Memory Optimization

Allocation Reduction Strategies

go
// ❌ BAD: Allocations in loop
// Each append may outgrow the backing array, forcing a reallocation
// and a full copy of everything written so far.
func processBad(items []string) []byte {
    var out []byte
    for _, s := range items {
        out = append(out, []byte(s)...) // may realloc-and-copy every pass
        out = append(out, '\n')
    }
    return out
}

// ✅ GOOD: Pre-allocate
// Two passes: the first computes the exact output size, the second
// fills a buffer that was allocated exactly once.
func processGood(items []string) []byte {
    total := 0
    for _, s := range items {
        total += len(s) + 1 // payload bytes plus the trailing newline
    }

    out := make([]byte, 0, total) // single allocation for the whole result
    for _, s := range items {
        out = append(out, s...)
        out = append(out, '\n')
    }
    return out
}

// Benchmark: processGood is 5-10x faster for large inputs

// Benchmark: processGood is 5-10x faster for large inputs

sync.Pool for Reusable Objects

go
// bufferPool recycles scratch buffers so a hot request path does not
// allocate a fresh bytes.Buffer per call.
var bufferPool = sync.Pool{
    New: func() interface{} {
        return new(bytes.Buffer)
    },
}

// processRequest appends "-processed" to data using a pooled buffer.
// The result is copied out before the buffer is reset and returned to
// the pool, so callers never alias pooled memory.
func processRequest(data []byte) []byte {
    buf := bufferPool.Get().(*bytes.Buffer)
    defer func() {
        buf.Reset()
        bufferPool.Put(buf)
    }()

    buf.Write(data)
    buf.WriteString("-processed")

    // Detach the result from the pooled buffer's backing array.
    return append(make([]byte, 0, buf.Len()), buf.Bytes()...)
}

// Pool is especially effective for:
// - Temporary buffers
// - JSON encoders/decoders
// - Regular expression objects
// - Database connection wrappers

// Pool is especially effective for:
// - Temporary buffers
// - JSON encoders/decoders
// - Regular expression objects
// - Database connection wrappers

Avoiding String ↔ []byte Conversions

go
// ❌ BAD: Multiple allocations
// Round-tripping string -> []byte -> string copies the data twice.
func processBad(s string) string {
    raw := []byte(s) // copy #1: string to byte slice
    raw = bytes.ToUpper(raw)
    return string(raw) // copy #2: byte slice back to string
}

// ✅ GOOD: Use strings package
// strings.ToUpper works on the string directly — no manual
// []byte round-trip in user code.
func processGood(s string) string {
    upper := strings.ToUpper(s)
    return upper
}

// For JSON with io.Reader
// ❌ ioutil.ReadAll + json.Unmarshal
// ✅ json.NewDecoder(reader).Decode()

🔧 Compiler Optimizations

Inlining

go
// Small functions are inlined automatically
// Check with: go build -gcflags="-m"

//go:noinline  // Prevent inlining (for benchmarking)
// smallFunc doubles x; bodies this small are normally inlined
// automatically, which is exactly why the pragma above exists.
func smallFunc(x int) int {
    return x + x
}

// Functions that are too complex won't inline
// - More than ~80 nodes in AST
// - Contains loops
// - Contains defer
// - Contains go

Bounds Check Elimination

go
// ❌ Bounds checked on each access
// The indexed form is deliberate: every data[idx] carries a runtime
// bounds check the compiler cannot prove away here.
func sumBad(data []int) int {
    total := 0
    for idx := 0; idx < len(data); idx++ {
        total += data[idx] // bounds check on every iteration
    }
    return total
}

// ✅ Compiler can eliminate bounds checks
// range iteration can never go out of bounds, so no checks are emitted.
func sumGood(data []int) int {
    total := 0
    for _, n := range data {
        total += n
    }
    return total
}

// ✅ Manual BCE hint
// sumWithBCE sums data using a manual bounds-check-elimination hint:
// touching the last element once lets the compiler prove every data[i]
// in the loop is in range.
func sumWithBCE(data []int) int {
    // Fix: the bare hint `_ = data[len(data)-1]` panics on an empty or
    // nil slice; sumBad/sumGood return 0 for that case, so match them.
    if len(data) == 0 {
        return 0
    }
    sum := 0
    _ = data[len(data)-1]  // Hint: bounds are ok
    for i := 0; i < len(data); i++ {
        sum += data[i]  // BCE applies
    }
    return sum
}

// Check BCE: go build -gcflags="-d=ssa/check_bce/debug=1"

Escape Analysis

go
// Check what escapes to heap:
// $ go build -gcflags="-m" ./...

// ❌ Escapes to heap
// createUser returns the address of a local, so escape analysis must
// move u to the heap (`go build -gcflags=-m` reports it). The exact
// shape of this function is the point of the example — do not
// restructure it.
func createUser() *User {
    u := User{Name: "Alice"}
    return &u  // Escapes!
}

// ✅ Stack allocation (caller provides memory)
// initUser fills in a User the caller already owns; nothing here needs
// to escape, so the caller's value can stay on its stack.
func initUser(u *User) {
    u.Name = "Alice"
}

func main() {
    var u User  // Stack allocated: u never escapes this function
    initUser(&u)
}

Concurrency Optimization

Reducing Lock Contention

go
// ❌ Single lock = contention
// CacheBad guards one map with one RWMutex, so every goroutine
// serializes on the same lock word regardless of which key it touches.
// NOTE(review): contains a mutex — must not be copied; pass *CacheBad.
type CacheBad struct {
    mu    sync.RWMutex
    items map[string]string
}

// ✅ Sharded cache = reduced contention
// CacheGood spreads keys over 256 independently locked shards, so
// goroutines working on different shards never contend.
// NOTE(review): no constructor is shown — shard maps start nil. Reads
// on a nil map are safe (always a miss), but any Set-style method must
// initialize shard.items under the shard lock first.
type CacheGood struct {
    shards [256]cacheShard
}

// cacheShard is one lock domain: a map plus the RWMutex guarding it.
type cacheShard struct {
    mu    sync.RWMutex
    items map[string]string
}

// getShard hashes key with 32-bit FNV-1a and selects a shard.
// Fix: the original called fnv.New32a() per lookup, allocating a
// hasher and dispatching through the hash.Hash32 interface on a hot
// path; the same hash is computed inline here with zero allocations.
func (c *CacheGood) getShard(key string) *cacheShard {
    const (
        offset32 = 2166136261 // FNV-1a 32-bit offset basis
        prime32  = 16777619   // FNV-1a 32-bit prime
    )
    h := uint32(offset32)
    for i := 0; i < len(key); i++ {
        h = (h ^ uint32(key[i])) * prime32
    }
    return &c.shards[h%256]
}

// Get returns the value for key from its shard under a read lock.
// A zero-value CacheGood is safe for Get (nil maps always miss).
func (c *CacheGood) Get(key string) (string, bool) {
    shard := c.getShard(key)
    shard.mu.RLock()
    defer shard.mu.RUnlock()
    val, ok := shard.items[key]
    return val, ok
}

sync.Map for Specific Cases

go
// sync.Map is optimized for:
// 1. Key is written once, read many times
// 2. Multiple goroutines read/write disjoint sets of keys

// cache is the process-wide sync.Map backing get/set below.
var cache sync.Map

// get looks key up in the global cache; a miss yields ("", false).
func get(key string) (string, bool) {
    v, ok := cache.Load(key)
    if ok {
        return v.(string), true
    }
    return "", false
}

// set stores value under key in the global cache.
func set(key, value string) {
    cache.Store(key, value)
}

// For other patterns, sharded map with RWMutex is often faster

💻 Engineering Example: High-Throughput Pipeline

go
package pipeline

import (
    "bytes"
    "sync"
)

// Optimized pipeline with pooling and batching

var (
    // bufferPool hands out scratch buffers pre-grown to 4 KiB so most
    // payloads never force a reallocation.
    bufferPool = sync.Pool{
        New: func() interface{} {
            return bytes.NewBuffer(make([]byte, 0, 4096))
        },
    }

    // recordPool recycles Record values for stages that populate them.
    recordPool = sync.Pool{
        New: func() interface{} {
            return &Record{}
        },
    }
)

// Record is one unit of pipeline work: an ID plus its payload bytes.
type Record struct {
    ID   int64
    Data []byte
}

// Reset clears the record for reuse. Data is truncated in place so the
// backing array (and its capacity) survives the trip through the pool.
func (r *Record) Reset() {
    r.Data = r.Data[:0]
    r.ID = 0
}

// Pipeline fans work out to a fixed set of batching workers connected
// by buffered channels.
type Pipeline struct {
    workers   int
    batchSize int
    input     chan []byte
    output    chan []byte
}

// NewPipeline builds a Pipeline whose channels are buffered at twice
// the worker count, so producers and consumers rarely block.
func NewPipeline(workers, batchSize int) *Pipeline {
    p := &Pipeline{
        workers:   workers,
        batchSize: batchSize,
    }
    p.input = make(chan []byte, workers*2)
    p.output = make(chan []byte, workers*2)
    return p
}

// Start launches the worker goroutines. They run until p.input is
// closed; no handle is retained, so callers cannot wait on them
// directly.
func (p *Pipeline) Start() {
    for n := p.workers; n > 0; n-- {
        go p.worker()
    }
}

// worker drains p.input, accumulating items into a fixed-capacity
// batch and flushing whenever the batch fills. Batching amortizes the
// per-item processing overhead.
func (p *Pipeline) worker() {
    pending := make([][]byte, 0, p.batchSize)

    for item := range p.input {
        pending = append(pending, item)
        if len(pending) < p.batchSize {
            continue
        }
        p.processBatch(pending)
        pending = pending[:0] // keep the backing array, drop the contents
    }

    // Input channel closed: flush whatever is left over.
    if len(pending) > 0 {
        p.processBatch(pending)
    }
}

// processBatch transforms each payload in batch (here: appending
// "-processed") and sends an independent copy downstream on p.output.
//
// Fix: the original fetched a *Record from recordPool, Reset it, and
// immediately Put it back without ever using it — pure pool churn and
// contention on every item. The dead Get/Reset/Put is removed;
// recordPool stays available for stages that actually fill Records.
func (p *Pipeline) processBatch(batch [][]byte) {
    for _, data := range batch {
        buf := bufferPool.Get().(*bytes.Buffer)
        buf.Reset() // a pooled buffer may still hold a previous payload

        // ... processing logic ...
        buf.Write(data)
        buf.WriteString("-processed")

        // Copy out before releasing the buffer so the result never
        // aliases pooled memory.
        result := make([]byte, buf.Len())
        copy(result, buf.Bytes())

        bufferPool.Put(buf)

        p.output <- result
    }
}

// Benchmark comparison:
// Without pooling: 50,000 req/s, 500MB heap
// With pooling:    200,000 req/s, 50MB heap (4x throughput, 10x less memory)

Ship-to-Prod Checklist

Profiling

  • [ ] Baseline benchmarks established
  • [ ] CPU profile analyzed for hot paths
  • [ ] Memory profile checked for leaks
  • [ ] Goroutine profile for leak detection

Memory

  • [ ] Pre-allocate slices with known capacity
  • [ ] sync.Pool for frequently allocated objects
  • [ ] Avoid string ↔ []byte in hot paths
  • [ ] Escape analysis checked for critical paths

Concurrency

  • [ ] Lock contention measured and reduced
  • [ ] Batching for high-throughput pipelines
  • [ ] Channel buffer sizes tuned
  • [ ] GOMAXPROCS appropriate for workload

Compiler

  • [ ] Inlining checked for critical functions
  • [ ] Bounds checks eliminated where safe
  • [ ] Build tags used for optimized builds

📊 Summary

| Optimization   | When to Apply                |
| -------------- | ---------------------------- |
| Pre-allocation | Known slice sizes            |
| sync.Pool      | Frequent temp objects        |
| Sharded locks  | High read/write concurrency  |
| Inlining       | Small, hot functions         |
| BCE hints      | Critical loops               |
| Batching       | High-throughput streams      |

➡️ Tiếp theo

Performance nắm vững rồi! Tiếp theo: Advanced Concurrency - Complex concurrency patterns.