Giao diện
Python Memory Model Intermediate
Hiểu cách Python quản lý bộ nhớ = Debug memory issues như pro
Learning Outcomes
Sau khi hoàn thành trang này, bạn sẽ:
- 🎯 Hiểu reference counting và cách Python track objects
- 🎯 Nắm vững garbage collection (generational GC)
- 🎯 Phân biệt identity vs equality (`is` vs `==`)
- 🎯 Phát hiện và fix memory leaks trong production
Reference Counting
Cách Python Track Objects
Mỗi object trong Python có một reference count - số lượng references trỏ đến nó.
python
import sys
# Tạo object
a = [1, 2, 3]
print(sys.getrefcount(a)) # 2 (a + tham số của getrefcount)
# Thêm reference
b = a
print(sys.getrefcount(a)) # 3
# Xóa reference
del b
print(sys.getrefcount(a)) # 2
# Khi refcount = 0, object bị deallocate
del a # [1, 2, 3] được giải phóng
Reference Count Tăng Khi
python
import sys
obj = {"key": "value"}
initial = sys.getrefcount(obj)
# 1. Gán cho biến mới
ref1 = obj
print(sys.getrefcount(obj) - initial) # +1
# 2. Thêm vào container
my_list = [obj]
print(sys.getrefcount(obj) - initial) # +2
# 3. Truyền vào function
def func(x):
print(sys.getrefcount(obj) - initial) # +3 (trong function)
func(obj)
# 4. Tạo iterator
for item in [obj]:
    print(sys.getrefcount(obj) - initial) # +3 (iterator holds ref)
Reference Count Giảm Khi
python
import sys
obj = [1, 2, 3]
# 1. del statement
ref = obj
del ref # refcount -= 1
# 2. Reassignment
ref = obj
ref = None # refcount -= 1
# 3. Object ra khỏi scope
def func():
local_ref = obj
# Khi func() return, local_ref bị xóa
# 4. Container bị xóa
my_list = [obj]
del my_list # refcount -= 1
Circular References Problem
python
import gc
# ❌ PROBLEM: objects that point at each other form a reference cycle
class Node:
    """Singly-linked node used to demonstrate a circular reference."""

    def __init__(self, value):
        self.value = value
        self.next = None  # may later point back at another Node, creating a cycle
# Tạo circular reference
a = Node(1)
b = Node(2)
a.next = b
b.next = a # Circular!
# Xóa references
del a
del b
# Objects vẫn tồn tại vì refcount > 0 (trỏ lẫn nhau)
# Garbage collector sẽ dọn dẹp
gc.collect() # Force garbage collection
Garbage Collection (Generational GC)
Tại Sao Cần GC?
Reference counting không xử lý được circular references. Python dùng generational garbage collector để giải quyết.
3 Generations
┌─────────────────────────────────────────────────────────┐
│ Generation 2 │
│ (Long-lived objects) │
│ Collected least frequently │
├─────────────────────────────────────────────────────────┤
│ Generation 1 │
│ (Medium-lived objects) │
│ Collected occasionally │
├─────────────────────────────────────────────────────────┤
│ Generation 0 │
│ (New objects) │
│ Collected most frequently │
└─────────────────────────────────────────────────────────┘
Hypothesis: Hầu hết objects chết trẻ (die young). Objects sống sót qua nhiều collections có xu hướng sống lâu.
GC Thresholds
python
import gc
# Xem thresholds hiện tại
print(gc.get_threshold()) # (700, 10, 10)
# Gen 0: collect sau 700 allocations
# Gen 1: collect sau 10 Gen 0 collections
# Gen 2: collect sau 10 Gen 1 collections
# Tùy chỉnh thresholds
gc.set_threshold(1000, 15, 15)
# Xem statistics
print(gc.get_stats())
# [{'collections': 85, 'collected': 1234, 'uncollectable': 0}, ...]
Manual GC Control
python
import gc
# Disable GC (cẩn thận!)
gc.disable()
# Enable GC
gc.enable()
# Force collection
collected = gc.collect() # Returns number of unreachable objects
print(f"Collected {collected} objects")
# Collect specific generation
gc.collect(0) # Only generation 0
gc.collect(1) # Generation 0 and 1
gc.collect(2) # All generations (default)
Debug GC
python
import gc
# Enable debug flags
gc.set_debug(gc.DEBUG_LEAK) # Print info about leaks
gc.set_debug(gc.DEBUG_STATS) # Print collection statistics
# Find objects
gc.get_objects() # All tracked objects
gc.get_referrers(obj) # Objects that reference obj
gc.get_referents(obj) # Objects that obj references
# Check if object is tracked
gc.is_tracked(obj) # True if GC tracks this object
Object Identity vs Equality
is vs ==
python
# == checks VALUE equality (calls __eq__)
# is checks IDENTITY (same object in memory)
a = [1, 2, 3]
b = [1, 2, 3]
c = a
print(a == b) # True - same values
print(a is b) # False - different objects
print(a == c) # True - same values
print(a is c) # True - same object
# Check with id()
print(id(a)) # 140234567890
print(id(b)) # 140234567891 (different)
print(id(c)) # 140234567890 (same as a)
CPython Integer Caching
python
# CPython caches small integers (-5 to 256)
a = 256
b = 256
print(a is b) # True - cached!
a = 257
b = 257
print(a is b) # False - not cached
# ⚠️ NEVER rely on this behavior!
# It's implementation-specific and can change
String Interning
python
# Python interns some strings
a = "hello"
b = "hello"
print(a is b) # True - interned
a = "hello world"
b = "hello world"
print(a is b) # May be True or False!
# Force interning
import sys
a = sys.intern("hello world")
b = sys.intern("hello world")
print(a is b) # True - explicitly interned
When to Use is
python
# ✅ CORRECT: Use `is` for singletons
if value is None:
pass
if value is True:
pass
if value is False:
pass
# ✅ CORRECT: Use `is` for sentinel values
MISSING = object()
def get_value(key, default=MISSING):
value = cache.get(key, MISSING)
if value is MISSING:
raise KeyError(key)
return value
# ❌ WRONG: Use `is` for value comparison
if x is 0: # Don't do this!
pass
if x is "hello": # Don't do this!
    pass
Memory Leak Patterns
Pattern 1: Circular References với del
python
import gc
# ❌ LEAK: __del__ combined with a circular reference
class Node:
    """Tree node whose parent/child links form a reference cycle."""

    def __init__(self, value):
        self.value = value
        self.parent = None
        self.children = []

    def add_child(self, child):
        """Append *child* and set its back-pointer — this creates the cycle."""
        self.children.append(child)
        child.parent = self  # circular: parent <-> child

    def __del__(self):
        # With a cycle, plain refcounting never reaches zero; only the GC
        # can reclaim these objects, and finalizer ordering there is undefined.
        print(f"Deleting {self.value}")
# Tạo circular reference
root = Node("root")
child = Node("child")
root.add_child(child)
del root
del child
# __del__ có thể không được gọi do circular reference!
# ✅ FIX: hold the parent link through a weak reference
import weakref

class Node:
    """Tree node whose parent pointer is weak, so no strong cycle is created."""

    def __init__(self, value):
        self.value = value
        self._parent = None  # weakref.ref to the parent, or None
        self.children = []

    @property
    def parent(self):
        # Dereference the weakref; yields None once the parent is collected.
        return self._parent() if self._parent else None

    @parent.setter
    def parent(self, node):
        self._parent = weakref.ref(node) if node else None

    def add_child(self, child):
        self.children.append(child)
        child.parent = self
Pattern 2: Caching Không Giới Hạn
python
# ❌ LEAK: module-level cache grows without bound
cache = {}

def get_user(user_id):
    """Return the user for *user_id*, caching every result forever."""
    if user_id not in cache:
        # NOTE: entries are never evicted, so memory grows with each new id.
        cache[user_id] = fetch_from_db(user_id)
    return cache[user_id]
# ✅ FIX 1: LRU cache with a size limit
from functools import lru_cache

@lru_cache(maxsize=1000)
def get_user(user_id):
    """Return the user for *user_id*; at most 1000 results stay cached."""
    return fetch_from_db(user_id)
# ✅ FIX 2: WeakValueDictionary
from weakref import WeakValueDictionary

cache = WeakValueDictionary()

def get_user(user_id):
    """Return the user for *user_id*, cached only while it is strongly referenced elsewhere."""
    user = cache.get(user_id)
    if user is None:
        user = fetch_from_db(user_id)
        cache[user_id] = user
    return user
# Objects tự động bị xóa khi không còn strong reference
Pattern 3: Event Handlers Không Cleanup
python
# ❌ LEAK: handlers keep strong references to their objects
class Button:
    """Minimal event source; registered handlers are held forever."""

    def __init__(self):
        self.handlers = []

    def on_click(self, handler):
        # Strong reference: the handler (and anything it closes over,
        # e.g. a bound method's instance) can never be collected.
        self.handlers.append(handler)
class Window:
    """Subscribes to a button, creating a Window <-> Button reference cycle."""

    def __init__(self, button):
        self.button = button
        # Window holds the button; the button's handler list holds the bound
        # method, and the bound method holds this Window -> cycle.
        button.on_click(self.handle_click)

    def handle_click(self):
        print("Clicked!")
# ✅ FIX: Weak references cho handlers
import weakref
class Button:
def __init__(self):
self.handlers = []
def on_click(self, handler):
# Store weak reference to bound method's object
if hasattr(handler, '__self__'):
ref = weakref.WeakMethod(handler)
else:
ref = weakref.ref(handler)
self.handlers.append(ref)
def click(self):
# Clean up dead references
self.handlers = [h for h in self.handlers if h() is not None]
for handler_ref in self.handlers:
handler = handler_ref()
if handler:
                handler()
Pattern 4: Global State
python
# ❌ LEAK: module-level list grows forever
processed_items = []

def process(item):
    """Process *item* and record the result in a never-cleaned global list."""
    result = do_something(item)
    processed_items.append(result)  # never removed!
    return result
# ✅ FIX: bounded collection (or explicit cleanup)
from collections import deque

processed_items = deque(maxlen=1000)  # oldest entries are evicted automatically

def process(item):
    """Process *item*, keeping only the 1000 most recent results."""
    result = do_something(item)
    processed_items.append(result)
    return result
Memory Profiling
sys.getsizeof
python
import sys
# Basic size
print(sys.getsizeof([])) # 56 bytes (empty list)
print(sys.getsizeof([1, 2, 3])) # 88 bytes
print(sys.getsizeof({})) # 64 bytes (empty dict)
print(sys.getsizeof("hello")) # 54 bytes
# ⚠️ getsizeof không tính nested objects!
nested = [[1, 2], [3, 4]]
print(sys.getsizeof(nested)) # Chỉ tính outer list, không tính inner lists
Deep Size Calculation
python
import sys
from collections.abc import Mapping, Iterable
def deep_getsizeof(obj, seen=None):
    """Return the total size in bytes of *obj* and all nested objects.

    *seen* collects ids of objects already measured, so shared or cyclic
    references are counted only once (and recursion terminates on cycles).
    """
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0  # already counted (shared or cyclic reference)
    seen.add(obj_id)
    size = sys.getsizeof(obj)  # shallow size of this object only
    if isinstance(obj, dict):
        size += sum(deep_getsizeof(k, seen) + deep_getsizeof(v, seen)
                    for k, v in obj.items())
    elif isinstance(obj, (list, tuple, set, frozenset)):
        size += sum(deep_getsizeof(i, seen) for i in obj)
    elif hasattr(obj, '__dict__'):
        # Arbitrary object: measure its attribute dict recursively.
        size += deep_getsizeof(obj.__dict__, seen)
    return size
# Usage
data = {"users": [{"name": "Alice"}, {"name": "Bob"}]}
print(f"Deep size: {deep_getsizeof(data)} bytes")
tracemalloc
python
import tracemalloc
# Start tracing
tracemalloc.start()
# Your code here
data = [i ** 2 for i in range(100000)]
# Get current memory usage
current, peak = tracemalloc.get_traced_memory()
print(f"Current: {current / 1024:.2f} KB")
print(f"Peak: {peak / 1024:.2f} KB")
# Get top memory consumers
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')
print("\nTop 10 memory consumers:")
for stat in top_stats[:10]:
print(stat)
# Stop tracing
tracemalloc.stop()
memory_profiler (Third-party)
python
# pip install memory_profiler
from memory_profiler import profile

@profile  # prints a line-by-line memory report when the function runs
def memory_hungry_function():
    """Allocate two large lists to demonstrate memory_profiler output."""
    a = [i for i in range(1000000)]
    b = [i ** 2 for i in range(1000000)]
    del a  # releases the first list's memory before returning
    return b
# Run: python -m memory_profiler script.py
Production Pitfalls
Pitfall 1: del Không Được Gọi
python
# ❌ PROBLEM: __del__ is not guaranteed to be called
class Resource:
    def __init__(self):
        # open_resource() is defined elsewhere in the project
        self.handle = open_resource()

    def __del__(self):
        # May never run: interpreter shutdown, reference cycles, etc.
        self.handle.close()
# ✅ FIX: context manager — cleanup runs deterministically
class Resource:
    """Resource wrapper whose cleanup is guaranteed by the with-statement."""

    def __init__(self):
        self.handle = open_resource()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.handle.close()  # guaranteed, even if the with-body raises
with Resource() as r:
use(r)
# Cleanup guaranteed
Pitfall 2: Large Object Không Được Release
python
# ❌ PROBLEM: the large buffer stays alive for the whole call
def process_file(path):
    data = path.read_bytes()  # loads the entire file into memory
    result = analyze(data)
    # `data` is still referenced here, so it cannot be freed until return
    return result
# ✅ FIX: drop the reference explicitly
def process_file(path):
    data = path.read_bytes()
    result = analyze(data)
    del data  # release the buffer as soon as it is no longer needed
    return result
# ✅ BETTER: Streaming
def process_file(path):
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(8192), b''):
            process_chunk(chunk)
Pitfall 3: Closure Giữ Reference
python
# ❌ PROBLEM: the closure keeps the whole large object alive
def create_processor(large_data):
    """Return a membership tester; *large_data* stays referenced by the closure."""

    def process(x):
        # large_data is captured in the closure cell and cannot be freed
        return x in large_data

    return process
data = load_huge_dataset() # 1GB
processor = create_processor(data)
del data # Không giải phóng! processor vẫn giữ reference
# ✅ FIX: copy only what the closure actually needs
def create_processor(large_data):
    """Return a membership tester that keeps just a set, not *large_data*."""
    lookup_set = set(large_data)  # the closure captures only this set

    def process(x):
        return x in lookup_set
    return process
Cross-links
- Data Structures - Memory layout của list, dict, set
- Performance - Memory (Phase 3) - Memory optimization techniques
- GIL & Threading - Memory trong multi-threaded context
Bảng Tóm tắt
python
# === REFERENCE COUNTING ===
import sys
sys.getrefcount(obj) # Số references đến obj
# === GARBAGE COLLECTION ===
import gc
gc.collect() # Force collection
gc.get_threshold() # (700, 10, 10)
gc.set_debug(gc.DEBUG_LEAK)
# === IDENTITY VS EQUALITY ===
a == b # Value equality (__eq__)
a is b # Identity (same object)
id(a) # Memory address
# === WEAK REFERENCES ===
import weakref
ref = weakref.ref(obj)
obj = ref() # Dereference
# === MEMORY PROFILING ===
import tracemalloc
tracemalloc.start()
current, peak = tracemalloc.get_traced_memory()
# === BEST PRACTICES ===
# 1. Dùng context managers cho resources
# 2. Tránh circular references
# 3. Dùng weakref cho caches
# 4. Explicit del cho large objects
# 5. Streaming thay vì load all