Giao diện
⚡ Performance Tuning
"Don't guess, measure."
Performance optimization phải dựa trên data, không phải intuition.
🔬 Profiling Workflow
The Golden Rule
1. Write correct code first
2. Measure with benchmarks
3. Profile to find bottlenecks
4. Optimize the bottleneck
5. Measure again
6. Repeat
CPU Profiling
go
import (
	"log"
	"os"
	"runtime/pprof"
)

// main demonstrates programmatic CPU profiling: the profile covers
// everything between StartCPUProfile and StopCPUProfile.
func main() {
	// Create the CPU profile file; fail fast if the path is not writable.
	f, err := os.Create("cpu.prof")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	// StartCPUProfile returns an error if profiling is already active.
	if err := pprof.StartCPUProfile(f); err != nil {
		log.Fatal(err)
	}
	defer pprof.StopCPUProfile()
	// Your code here...
	doWork()
}
// Alternative: via test
// $ go test -cpuprofile=cpu.prof -bench=.
// $ go tool pprof cpu.prof
Memory Profiling
go
// main demonstrates a heap snapshot taken after the workload finishes,
// so the profile reflects steady-state allocations.
func main() {
	doWork()
	// Snapshot heap profile.
	f, err := os.Create("mem.prof")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	runtime.GC() // force a collection so heap statistics are up to date
	if err := pprof.WriteHeapProfile(f); err != nil {
		log.Fatal(err)
	}
}
// Analysis
// $ go tool pprof mem.prof
// (pprof) top10
// (pprof) list funcName
HTTP pprof Server
go
import (
	"log"
	"net/http"

	_ "net/http/pprof" // Register /debug/pprof/ handlers on DefaultServeMux
)

// main runs the pprof debug server on a side goroutine so profiles can be
// pulled from a live process. Bind to localhost only: the pprof endpoints
// must never be exposed publicly.
func main() {
	go func() {
		log.Println(http.ListenAndServe("localhost:6060", nil))
	}()
	// Main app...
}
// Access profiles:
// http://localhost:6060/debug/pprof/
// http://localhost:6060/debug/pprof/heap
// http://localhost:6060/debug/pprof/goroutine
// Live analysis
// $ go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30
📊 Understanding pprof Output
Common Commands
bash
$ go tool pprof cpu.prof
(pprof) top10 # Top 10 by self time
(pprof) top10 -cum # Top 10 by cumulative time
(pprof) list funcName # Source view with time annotation
(pprof) web # Visual graph (requires graphviz)
(pprof) svg # Save SVG file
Reading the Output
flat flat% sum% cum cum%
3.00s 30.00% 30.00% 7.50s 75.00% main.processItem
2.00s 20.00% 50.00% 2.00s 20.00% runtime.memmove
1.50s 15.00% 65.00% 5.00s 50.00% encoding/json.Marshal
| Column | Meaning |
|---|---|
| flat | Time in this function only |
| flat% | Percentage of total |
| sum% | Cumulative percentage |
| cum | Time in function + callees |
| cum% | Cumulative percentage |
💾 Memory Optimization
Allocation Reduction Strategies
go
// ❌ BAD: Allocations in loop
// processBad joins items with trailing newlines. The result slice starts
// with zero capacity, so append must repeatedly grow and copy the backing
// array — this is the allocation-heavy version kept for comparison.
func processBad(items []string) []byte {
	var out []byte
	for _, s := range items {
		out = append(out, []byte(s)...) // fresh conversion + possible regrow each pass
		out = append(out, '\n')
	}
	return out
}
// ✅ GOOD: Pre-allocate
// processGood produces the same output as processBad, but sizes the result
// up front so the whole build costs a single allocation.
func processGood(items []string) []byte {
	// First pass: compute the exact output length.
	total := 0
	for _, s := range items {
		total += len(s) + 1 // item bytes plus its newline
	}
	// Second pass: fill the pre-sized buffer; append never reallocates.
	out := make([]byte, 0, total)
	for _, s := range items {
		out = append(out, s...)
		out = append(out, '\n')
	}
	return out
}
// Benchmark: processGood is 5-10x faster for large inputs
sync.Pool for Reusable Objects
go
// bufferPool hands out reusable *bytes.Buffer values so hot request paths
// do not allocate a fresh buffer per call.
var bufferPool = sync.Pool{
	New: func() interface{} { return new(bytes.Buffer) },
}

// processRequest appends the "-processed" marker to data using a pooled
// buffer for the intermediate work. The returned slice is a private copy,
// so it stays valid after the buffer goes back to the pool.
func processRequest(data []byte) []byte {
	buf := bufferPool.Get().(*bytes.Buffer)
	defer func() {
		buf.Reset()
		bufferPool.Put(buf)
	}()

	buf.Write(data)
	buf.WriteString("-processed")

	// Copy out before the deferred Put runs: buf.Bytes() aliases pooled memory.
	out := append([]byte(nil), buf.Bytes()...)
	return out
}
// Pool is especially effective for:
// - Temporary buffers
// - JSON encoders/decoders
// - Regular expression objects
// - Database connection wrappers
Avoiding String ↔ []byte Conversions
go
// ❌ BAD: Multiple allocations
// processBad upper-cases s the wasteful way: one allocation converting the
// string to []byte, and a second converting the result back.
func processBad(s string) string {
	raw := []byte(s) // allocation 1: string -> []byte copy
	raw = bytes.ToUpper(raw)
	return string(raw) // allocation 2: []byte -> string copy
}
// ✅ GOOD: Use strings package
// processGood upper-cases s with strings.ToUpper, which operates on the
// string directly and avoids the []byte round trip.
func processGood(s string) string {
	upper := strings.ToUpper(s)
	return upper
}
// For JSON with io.Reader
// ❌ ioutil.ReadAll + json.Unmarshal
// ✅ json.NewDecoder(reader).Decode()
🔧 Compiler Optimizations
Inlining
go
// Small functions are inlined automatically
// Check with: go build -gcflags="-m"
// smallFunc doubles x. A function this small is normally inlined; the
// directive below suppresses that so the call shows up as a real call in
// benchmarks and profiles.
//
// NOTE: a //go: compiler directive should be the entire comment line —
// trailing text on the same line (as the original had) risks the directive
// not being recognized by the toolchain.
//go:noinline
func smallFunc(x int) int {
	return x * 2
}
// Functions that are too complex won't inline
// - More than ~80 nodes in AST
// - Contains loops
// - Contains defer
// - Contains go
Bounds Check Elimination
go
// ❌ Bounds checked on each access
// sumBad totals data with explicit indexing; each data[i] access carries a
// bounds check the compiler cannot always eliminate in this form.
func sumBad(data []int) int {
	total := 0
	i := 0
	for i < len(data) {
		total += data[i] // bounds check on every access
		i++
	}
	return total
}
// ✅ Compiler can eliminate bounds checks
// sumGood totals data with a range loop; range yields elements directly,
// so no bounds check is needed at all.
func sumGood(data []int) int {
	total := 0
	for _, n := range data {
		total += n
	}
	return total
}
// ✅ Manual BCE hint
// sumWithBCE totals data with an indexed loop, using an explicit one-time
// bounds probe so the compiler can drop the per-iteration checks.
//
// The original version panicked on an empty slice: data[len(data)-1] with
// len(data) == 0 indexes at -1. Guard that case first.
func sumWithBCE(data []int) int {
	if len(data) == 0 {
		return 0
	}
	sum := 0
	_ = data[len(data)-1] // hint: proves every index below is in bounds
	for i := 0; i < len(data); i++ {
		sum += data[i] // BCE applies
	}
	return sum
}
// Check BCE: go build -gcflags="-d=ssa/check_bce/debug=1"
Escape Analysis
go
// Check what escapes to heap:
// $ go build -gcflags="-m" ./...
// ❌ Escapes to heap
// createUser returns a pointer to a freshly built User. Because the
// pointer outlives this frame, escape analysis moves the value to the
// heap (verify with -gcflags="-m").
func createUser() *User {
	return &User{Name: "Alice"} // escapes: address is returned to the caller
}
// ✅ Stack allocation (caller provides memory)
// initUser fills in a caller-provided User. Because the callee writes
// through the pointer instead of returning one, the value itself can stay
// in the caller's frame (confirm with -gcflags="-m").
func initUser(u *User) {
	u.Name = "Alice"
}
func main() {
	var u User // stack-allocated: its address never escapes main
	initUser(&u)
}
⚡ Concurrency Optimization
Reducing Lock Contention
go
// ❌ Single lock = contention
// CacheBad guards the whole map with one lock: every access, for any key,
// contends on the same mutex.
type CacheBad struct {
	mu    sync.RWMutex
	items map[string]string
}

// CacheGood spreads keys over independent shards so operations on
// unrelated keys do not contend on the same lock.
type CacheGood struct {
	shards [256]cacheShard
}

// cacheShard is one independently locked slice of the key space.
type cacheShard struct {
	mu    sync.RWMutex
	items map[string]string
}

// getShard maps key to its owning shard via an FNV-1a hash of the key bytes.
func (c *CacheGood) getShard(key string) *cacheShard {
	h := fnv.New32a()
	h.Write([]byte(key))
	// Derive the shard count from the array length (the original hard-coded
	// 256 here) so the modulus can never drift from the declared size.
	return &c.shards[h.Sum32()%uint32(len(c.shards))]
}
// Get returns the cached value for key, taking only the owning shard's
// read lock so lookups on other shards proceed in parallel.
func (c *CacheGood) Get(key string) (string, bool) {
	shard := c.getShard(key)
	shard.mu.RLock()
	defer shard.mu.RUnlock()
	// Reading a nil map is safe and simply misses, so an uninitialized
	// shard yields ("", false).
	val, ok := shard.items[key]
	return val, ok
}
sync.Map for Specific Cases
go
// sync.Map is optimized for:
// 1. Key is written once, read many times
// 2. Multiple goroutines read/write disjoint sets of keys
// cache holds the shared entries; sync.Map suits the write-once /
// read-many pattern described above without a global lock.
var cache sync.Map

// get looks up key and reports whether it was present.
func get(key string) (string, bool) {
	if val, ok := cache.Load(key); ok {
		return val.(string), true
	}
	return "", false
}

// set stores value under key, overwriting any previous entry.
func set(key, value string) {
	cache.Store(key, value)
}
// For other patterns, sharded map with RWMutex is often faster
💻 Engineering Example: High-Throughput Pipeline
go
package pipeline
import (
"bytes"
"sync"
)
// Optimized pipeline with pooling and batching
// Shared pools for the pipeline's scratch objects. Reusing these keeps the
// steady-state allocation rate near zero under load.
var (
	// bufferPool hands out byte buffers pre-grown to 4 KiB capacity.
	bufferPool = sync.Pool{
		New: func() interface{} {
			return bytes.NewBuffer(make([]byte, 0, 4096))
		},
	}
	// recordPool recycles Record values between batches.
	recordPool = sync.Pool{
		New: func() interface{} {
			return &Record{}
		},
	}
)
// Record is a pooled unit of work flowing through the pipeline.
type Record struct {
	ID   int64
	Data []byte
}

// Reset clears the record for reuse. Data is truncated rather than nilled
// so the next user can append into the existing backing array.
func (r *Record) Reset() {
	r.Data = r.Data[:0]
	r.ID = 0
}
// Pipeline fans work out to a fixed set of batching workers.
type Pipeline struct {
	workers   int
	batchSize int
	input     chan []byte
	output    chan []byte
}

// NewPipeline builds a pipeline with the given worker count and per-worker
// batch size. Channels are buffered at twice the worker count so producers
// rarely block on a momentarily busy worker.
func NewPipeline(workers, batchSize int) *Pipeline {
	p := &Pipeline{
		workers:   workers,
		batchSize: batchSize,
	}
	p.input = make(chan []byte, workers*2)
	p.output = make(chan []byte, workers*2)
	return p
}
// Start launches the configured number of worker goroutines. There is no
// shutdown/wait mechanism here; workers exit when p.input is closed.
func (p *Pipeline) Start() {
	for n := 0; n < p.workers; n++ {
		go p.worker()
	}
}
// worker drains p.input, accumulating items into a batch and flushing a
// full batch in a single processBatch call to amortize per-item overhead.
func (p *Pipeline) worker() {
	pending := make([][]byte, 0, p.batchSize)
	for item := range p.input {
		pending = append(pending, item)
		if len(pending) < p.batchSize {
			continue
		}
		p.processBatch(pending)
		pending = pending[:0] // keep capacity, drop contents
	}
	// Input channel closed: flush whatever is left over.
	if len(pending) > 0 {
		p.processBatch(pending)
	}
}
// processBatch transforms each payload in batch and emits the result on
// p.output, borrowing scratch objects from the package pools.
func (p *Pipeline) processBatch(batch [][]byte) {
	for _, payload := range batch {
		// Borrow pooled scratch objects and clear any stale state.
		buf := bufferPool.Get().(*bytes.Buffer)
		rec := recordPool.Get().(*Record)
		buf.Reset()
		rec.Reset()

		// ... processing logic ...
		buf.Write(payload)
		buf.WriteString("-processed")

		// Copy out before the buffer returns to the pool: buf.Bytes()
		// aliases pooled memory.
		out := append([]byte(nil), buf.Bytes()...)

		bufferPool.Put(buf)
		recordPool.Put(rec)
		p.output <- out
	}
}
// Benchmark comparison:
// Without pooling: 50,000 req/s, 500MB heap
// With pooling: 200,000 req/s, 50MB heap (4x throughput, 10x less memory)
✅ Ship-to-Prod Checklist
Profiling
- [ ] Baseline benchmarks established
- [ ] CPU profile analyzed for hot paths
- [ ] Memory profile checked for leaks
- [ ] Goroutine profile for leak detection
Memory
- [ ] Pre-allocate slices with known capacity
- [ ] sync.Pool for frequently allocated objects
- [ ] Avoid string ↔ []byte in hot paths
- [ ] Escape analysis checked for critical paths
Concurrency
- [ ] Lock contention measured and reduced
- [ ] Batching for high-throughput pipelines
- [ ] Channel buffer sizes tuned
- [ ] GOMAXPROCS appropriate for workload
Compiler
- [ ] Inlining checked for critical functions
- [ ] Bounds checks eliminated where safe
- [ ] Build tags used for optimized builds
📊 Summary
| Optimization | When to Apply |
|---|---|
| Pre-allocation | Known slice sizes |
| sync.Pool | Frequent temp objects |
| Sharded locks | High read/write concurrency |
| Inlining | Small, hot functions |
| BCE hints | Critical loops |
| Batching | High-throughput streams |
➡️ Tiếp theo
Performance nắm vững rồi! Tiếp theo: Advanced Concurrency - Complex concurrency patterns.