Giao diện
collections & itertools Standard Library
Đừng reinvent the wheel - stdlib đã có sẵn data structures tối ưu
Learning Outcomes
Sau khi hoàn thành trang này, bạn sẽ:
- 🎯 Sử dụng defaultdict, Counter, deque đúng use case
- 🎯 Hiểu khi nào dùng namedtuple vs dataclass
- 🎯 Master itertools recipes cho data processing
- 🎯 Viết code memory-efficient với generators
- 🎯 Tránh các Production Pitfalls phổ biến
collections Module
defaultdict - Auto-Initialize Keys
python
from collections import defaultdict
# ❌ Cách cũ: Check key existence
word_count = {}
for word in words:
if word not in word_count:
word_count[word] = 0
word_count[word] += 1
# ✅ defaultdict: Auto-initialize
word_count = defaultdict(int) # Default value: 0
for word in words:
word_count[word] += 1
# Các factory functions phổ biến
defaultdict(int) # Default: 0
defaultdict(list) # Default: []
defaultdict(set) # Default: set()
defaultdict(str) # Default: ""
defaultdict(lambda: "N/A") # Custom defaultdefaultdict Patterns
python
from collections import defaultdict
# Pattern 1: Grouping
users_by_role = defaultdict(list)
for user in users:
users_by_role[user.role].append(user)
# Pattern 2: Counting with conditions
error_counts = defaultdict(int)
for log in logs:
if log.level == "ERROR":
error_counts[log.service] += 1
# Pattern 3: Nested defaultdict
# Tạo nested dict tự động
def nested_dict():
return defaultdict(nested_dict)
data = nested_dict()
data["users"]["alice"]["settings"]["theme"] = "dark"
# Không cần tạo intermediate dicts!
# Pattern 4: Set operations
tags_by_post = defaultdict(set)
for post, tag in post_tags:
tags_by_post[post].add(tag)⚠️ DEFAULTDICT GOTCHA
defaultdict tạo key mới khi access key không tồn tại. Điều này có thể gây side effects không mong muốn.
python
d = defaultdict(list)
if d["missing"]: # Tạo key "missing" với value []!
pass
# ✅ Dùng .get() để tránh side effect
if d.get("missing"):
passCounter - Đếm Elements
python
from collections import Counter
# Đếm từ string/list
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
counter = Counter(words)
# Counter({'apple': 3, 'banana': 2, 'cherry': 1})
# Đếm characters
char_count = Counter("mississippi")
# Counter({'i': 4, 's': 4, 'p': 2, 'm': 1})
# Từ dict
counter = Counter({"a": 4, "b": 2})Counter Methods
python
from collections import Counter
counter = Counter(["a", "b", "a", "c", "a", "b"])
# Most common elements
counter.most_common(2) # [('a', 3), ('b', 2)]
# Total count
counter.total() # 6 (Python 3.10+)
sum(counter.values()) # 6 (older Python)
# Elements iterator (repeats each element)
list(counter.elements()) # ['a', 'a', 'a', 'b', 'b', 'c']
# Update counts
counter.update(["a", "d"]) # Add counts
counter.subtract(["a"]) # Subtract counts
# Arithmetic operations
c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)
c1 + c2 # Counter({'a': 4, 'b': 3})
c1 - c2 # Counter({'a': 2}) - chỉ giữ positive
c1 & c2 # Counter({'a': 1, 'b': 1}) - min
c1 | c2 # Counter({'a': 3, 'b': 2}) - maxCounter Use Cases
python
from collections import Counter
# Use case 1: Find duplicates
def find_duplicates(items):
return [item for item, count in Counter(items).items() if count > 1]
# Use case 2: Anagram check
def is_anagram(s1: str, s2: str) -> bool:
return Counter(s1.lower()) == Counter(s2.lower())
# Use case 3: Top N items
def top_n_words(text: str, n: int = 10) -> list[tuple[str, int]]:
words = text.lower().split()
return Counter(words).most_common(n)
# Use case 4: Histogram
def print_histogram(data: list[str]) -> None:
counter = Counter(data)
for item, count in counter.most_common():
print(f"{item}: {'█' * count}")deque - Double-Ended Queue
python
from collections import deque
# Tạo deque
d = deque([1, 2, 3])
d = deque(maxlen=3) # Fixed size (auto-discard oldest)
# Operations ở cả hai đầu - O(1)
d.append(4) # Add right
d.appendleft(0) # Add left
d.pop() # Remove right
d.popleft() # Remove left
# Extend
d.extend([4, 5]) # Add multiple right
d.extendleft([0, -1]) # Add multiple left (reversed order!)
# Rotate
d.rotate(1) # Rotate right
d.rotate(-1) # Rotate leftdeque vs list
| Operation | list | deque |
|---|---|---|
append(x) | O(1) | O(1) |
pop() | O(1) | O(1) |
insert(0, x) | O(n) | O(1) |
pop(0) | O(n) | O(1) |
[i] access | O(1) | O(n) |
💡 KHI NÀO DÙNG DEQUE?
- Queue/Stack operations: append/pop ở cả hai đầu
- Sliding window: maxlen tự động discard
- BFS algorithms: popleft() cho FIFO
KHÔNG dùng deque khi: Cần random access d[i] thường xuyên
deque Patterns
python
from collections import deque
# Pattern 1: Sliding window (last N items)
recent_logs = deque(maxlen=100)
for log in log_stream:
recent_logs.append(log)
# Tự động giữ 100 logs gần nhất
# Pattern 2: BFS traversal
def bfs(graph, start):
visited = {start}
queue = deque([start])
while queue:
node = queue.popleft() # O(1) - FIFO
for neighbor in graph[node]:
if neighbor not in visited:
visited.add(neighbor)
queue.append(neighbor)
return visited
# Pattern 3: Moving average
class MovingAverage:
def __init__(self, window_size: int):
self.window = deque(maxlen=window_size)
def add(self, value: float) -> float:
self.window.append(value)
return sum(self.window) / len(self.window)
# Pattern 4: Undo/Redo stack
class UndoManager:
def __init__(self, max_history: int = 50):
self.undo_stack = deque(maxlen=max_history)
self.redo_stack = deque(maxlen=max_history)
def do(self, action):
self.undo_stack.append(action)
self.redo_stack.clear()
def undo(self):
if self.undo_stack:
action = self.undo_stack.pop()
self.redo_stack.append(action)
return action.reverse()namedtuple vs dataclass
python
from collections import namedtuple
from dataclasses import dataclass
from typing import NamedTuple
# === NAMEDTUPLE ===
# Cách 1: Function call
Point = namedtuple("Point", ["x", "y"])
p = Point(1, 2)
# Cách 2: typing.NamedTuple (recommended)
class Point(NamedTuple):
x: float
y: float
def distance(self) -> float:
return (self.x**2 + self.y**2) ** 0.5
# === DATACLASS ===
@dataclass
class Point:
x: float
y: float
def distance(self) -> float:
return (self.x**2 + self.y**2) ** 0.5So Sánh Chi Tiết
| Feature | namedtuple | dataclass |
|---|---|---|
| Immutable | ✅ Yes | ❌ No (default) |
| Hashable | ✅ Yes | ❌ No (default) |
| Memory | Nhỏ hơn | Lớn hơn |
| Default values | ✅ Yes | ✅ Yes |
| Methods | ✅ Yes | ✅ Yes |
| Inheritance | ⚠️ Limited | ✅ Full |
__slots__ | ✅ Built-in | ❌ Manual |
| Mutable fields | ❌ No | ✅ Yes |
| Post-init | ❌ No | ✅ __post_init__ |
Khi Nào Dùng Gì?
python
# ✅ NAMEDTUPLE: Immutable data, dict keys, lightweight
class Coordinate(NamedTuple):
lat: float
lon: float
locations = {Coordinate(10.0, 106.0): "HCMC"} # Hashable!
# ✅ DATACLASS: Mutable data, complex logic, OOP
@dataclass
class User:
name: str
email: str
age: int = 0
def __post_init__(self):
self.email = self.email.lower()
# ✅ FROZEN DATACLASS: Best of both worlds
@dataclass(frozen=True)
class Config:
host: str
port: int = 8080
config = Config("localhost")
# config.port = 9000 # FrozenInstanceError!itertools Module
Infinite Iterators
python
from itertools import count, cycle, repeat
# count: Đếm vô hạn
for i in count(10, 2): # 10, 12, 14, 16, ...
if i > 20:
break
print(i)
# cycle: Lặp vô hạn
colors = cycle(["red", "green", "blue"])
for _ in range(5):
print(next(colors)) # red, green, blue, red, green
# repeat: Lặp lại element
list(repeat("x", 3)) # ['x', 'x', 'x']Combinatoric Iterators
python
from itertools import product, permutations, combinations, combinations_with_replacement
# product: Cartesian product
list(product("AB", "12"))
# [('A', '1'), ('A', '2'), ('B', '1'), ('B', '2')]
list(product(range(2), repeat=3))
# [(0,0,0), (0,0,1), (0,1,0), (0,1,1), (1,0,0), (1,0,1), (1,1,0), (1,1,1)]
# permutations: Hoán vị
list(permutations("ABC", 2))
# [('A','B'), ('A','C'), ('B','A'), ('B','C'), ('C','A'), ('C','B')]
# combinations: Tổ hợp (không lặp)
list(combinations("ABC", 2))
# [('A', 'B'), ('A', 'C'), ('B', 'C')]
# combinations_with_replacement: Tổ hợp (có lặp)
list(combinations_with_replacement("AB", 2))
# [('A', 'A'), ('A', 'B'), ('B', 'B')]Terminating Iterators
python
from itertools import (
chain, compress, dropwhile, takewhile,
filterfalse, groupby, islice, starmap, zip_longest
)
# chain: Nối iterables
list(chain([1, 2], [3, 4], [5])) # [1, 2, 3, 4, 5]
list(chain.from_iterable([[1, 2], [3, 4]])) # [1, 2, 3, 4]
# compress: Filter by selector
list(compress("ABCDE", [1, 0, 1, 0, 1])) # ['A', 'C', 'E']
# dropwhile/takewhile: Conditional slicing
list(dropwhile(lambda x: x < 5, [1, 4, 6, 4, 1])) # [6, 4, 1]
list(takewhile(lambda x: x < 5, [1, 4, 6, 4, 1])) # [1, 4]
# filterfalse: Inverse of filter
list(filterfalse(lambda x: x % 2, range(10))) # [0, 2, 4, 6, 8]
# islice: Slice iterator
list(islice(range(100), 5, 10)) # [5, 6, 7, 8, 9]
# starmap: map với unpacking
list(starmap(pow, [(2, 3), (3, 2)])) # [8, 9]
# zip_longest: zip với fill value
list(zip_longest("AB", "xyz", fillvalue="-"))
# [('A', 'x'), ('B', 'y'), ('-', 'z')]groupby - Nhóm Consecutive Elements
python
from itertools import groupby
from operator import itemgetter
# ⚠️ QUAN TRỌNG: Data phải được SORT trước!
data = [
{"name": "Alice", "dept": "Engineering"},
{"name": "Bob", "dept": "Engineering"},
{"name": "Charlie", "dept": "Sales"},
{"name": "David", "dept": "Sales"},
]
# Sort by grouping key first!
data.sort(key=itemgetter("dept"))
# Group
for dept, members in groupby(data, key=itemgetter("dept")):
print(f"{dept}: {[m['name'] for m in members]}")
# Engineering: ['Alice', 'Bob']
# Sales: ['Charlie', 'David']🚨 GROUPBY TRAP
groupby chỉ nhóm consecutive elements. Nếu data chưa sort, kết quả sẽ sai!
python
data = [1, 1, 2, 1, 1] # Không sorted
list((k, list(g)) for k, g in groupby(data))
# [(1, [1, 1]), (2, [2]), (1, [1, 1])] # 3 groups, không phải 2!itertools Recipes
Chunking
python
from itertools import islice
def chunked(iterable, n):
"""Chia iterable thành chunks của n elements."""
it = iter(iterable)
while chunk := list(islice(it, n)):
yield chunk
list(chunked(range(10), 3))
# [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
# Python 3.12+: itertools.batched
from itertools import batched
list(batched(range(10), 3))
# [(0, 1, 2), (3, 4, 5), (6, 7, 8), (9,)]Sliding Window
python
from collections import deque
from itertools import islice
def sliding_window(iterable, n):
"""Sliding window của size n."""
it = iter(iterable)
window = deque(islice(it, n), maxlen=n)
if len(window) == n:
yield tuple(window)
for x in it:
window.append(x)
yield tuple(window)
list(sliding_window(range(5), 3))
# [(0, 1, 2), (1, 2, 3), (2, 3, 4)]
# Python 3.10+: itertools.pairwise (window size 2)
from itertools import pairwise
list(pairwise(range(5)))
# [(0, 1), (1, 2), (2, 3), (3, 4)]Flatten Nested Iterables
python
from itertools import chain
def flatten(nested):
"""Flatten một level."""
return chain.from_iterable(nested)
list(flatten([[1, 2], [3, 4], [5]]))
# [1, 2, 3, 4, 5]
# Deep flatten (recursive)
def deep_flatten(nested):
for item in nested:
if isinstance(item, (list, tuple)):
yield from deep_flatten(item)
else:
yield item
list(deep_flatten([[1, [2, 3]], [4, [5, [6]]]]))
# [1, 2, 3, 4, 5, 6]First/Last N Elements
python
from itertools import islice
from collections import deque
def first_n(iterable, n):
"""Lấy n elements đầu tiên."""
return list(islice(iterable, n))
def last_n(iterable, n):
"""Lấy n elements cuối cùng."""
return list(deque(iterable, maxlen=n))
# Usage
first_n(range(100), 5) # [0, 1, 2, 3, 4]
last_n(range(100), 5) # [95, 96, 97, 98, 99]Unique Elements (Preserve Order)
python
def unique_everseen(iterable, key=None):
"""Unique elements, giữ thứ tự xuất hiện đầu tiên."""
seen = set()
seen_add = seen.add
if key is None:
for element in iterable:
if element not in seen:
seen_add(element)
yield element
else:
for element in iterable:
k = key(element)
if k not in seen:
seen_add(k)
yield element
list(unique_everseen([1, 2, 1, 3, 2, 4]))
# [1, 2, 3, 4]
# Case-insensitive unique
list(unique_everseen(["A", "b", "a", "B"], key=str.lower))
# ['A', 'b']Data Processing Patterns
Pattern 1: Pipeline Processing
python
from itertools import filterfalse, islice
from typing import Iterator
def pipeline(data: Iterator[dict]) -> Iterator[dict]:
"""
Data processing pipeline:
1. Filter invalid records
2. Transform fields
3. Take first N
"""
# Step 1: Filter
valid = filter(lambda x: x.get("status") == "active", data)
# Step 2: Transform
transformed = (
{**record, "name": record["name"].upper()}
for record in valid
)
# Step 3: Limit
return islice(transformed, 100)
# Usage - lazy evaluation!
results = pipeline(huge_data_stream)
for record in results:
process(record)Pattern 2: Parallel Iteration
python
from itertools import zip_longest
def merge_sorted(*iterables, key=None):
"""Merge multiple sorted iterables."""
import heapq
return heapq.merge(*iterables, key=key)
# Merge sorted files
file1 = (int(line) for line in open("sorted1.txt"))
file2 = (int(line) for line in open("sorted2.txt"))
for num in merge_sorted(file1, file2):
print(num)Pattern 3: Accumulate with Custom Function
python
from itertools import accumulate
from operator import mul
# Running sum
list(accumulate([1, 2, 3, 4])) # [1, 3, 6, 10]
# Running product
list(accumulate([1, 2, 3, 4], mul)) # [1, 2, 6, 24]
# Running max
list(accumulate([3, 1, 4, 1, 5], max)) # [3, 3, 4, 4, 5]
# Custom: Running balance
transactions = [100, -20, 50, -30]
list(accumulate(transactions, initial=1000))
# [1000, 1100, 1080, 1130, 1100]Production Pitfalls
Pitfall 1: Exhausted Iterators
python
from itertools import islice
# ❌ BUG: Iterator exhausted sau lần dùng đầu
data = (x for x in range(10))
first_5 = list(islice(data, 5)) # [0, 1, 2, 3, 4]
next_5 = list(islice(data, 5)) # [5, 6, 7, 8, 9] - OK
remaining = list(data) # [] - Empty!
# ✅ SỬA: Dùng itertools.tee hoặc list
from itertools import tee
data = (x for x in range(10))
data1, data2 = tee(data, 2) # Tạo 2 independent iterators
# ⚠️ Cẩn thận: tee lưu data trong memory!Pitfall 2: groupby Without Sorting
python
from itertools import groupby
# ❌ BUG: Data chưa sort
data = [("a", 1), ("b", 2), ("a", 3)]
result = {k: list(g) for k, g in groupby(data, key=lambda x: x[0])}
# {'a': [('a', 1)], 'b': [('b', 2)], 'a': [('a', 3)]}
# Key 'a' xuất hiện 2 lần!
# ✅ SỬA: Sort trước
data.sort(key=lambda x: x[0])
result = {k: list(g) for k, g in groupby(data, key=lambda x: x[0])}
# {'a': [('a', 1), ('a', 3)], 'b': [('b', 2)]}Pitfall 3: defaultdict Side Effects
python
from collections import defaultdict
# ❌ BUG: Checking key creates it
d = defaultdict(list)
if d["missing"]: # Creates empty list!
pass
print(dict(d)) # {'missing': []}
# ✅ SỬA: Use .get() or 'in' check
if d.get("missing"):
pass
# Or
if "missing" in d and d["missing"]:
passPitfall 4: Counter Negative Counts
python
from collections import Counter
# ❌ SURPRISE: Counter allows negative counts
c = Counter(a=3, b=1)
c.subtract({"a": 5})
print(c) # Counter({'b': 1, 'a': -2})
# ✅ Để loại bỏ zero/negative
c = +c # Unary plus removes non-positive
print(c) # Counter({'b': 1})Pitfall 5: deque Thread Safety
python
from collections import deque
import threading
# ❌ BUG: deque operations không atomic
d = deque()
def producer():
for i in range(1000):
d.append(i)
def consumer():
while True:
if d: # Check và pop không atomic!
item = d.popleft() # Có thể raise IndexError
# ✅ SỬA: Dùng queue.Queue cho thread-safe
from queue import Queue
q = Queue()
q.put(item)
item = q.get() # Thread-safe, blockingBest Practices Summary
python
from collections import defaultdict, Counter, deque
from itertools import chain, groupby, islice
from typing import NamedTuple
# === COLLECTIONS ===
# ✅ defaultdict cho grouping/counting
groups = defaultdict(list)
for item in items:
groups[item.category].append(item)
# ✅ Counter cho frequency analysis
word_freq = Counter(words)
top_10 = word_freq.most_common(10)
# ✅ deque cho queue operations
queue = deque(maxlen=100) # Auto-discard oldest
queue.append(item)
item = queue.popleft()
# ✅ NamedTuple cho immutable records
class Point(NamedTuple):
x: float
y: float
# === ITERTOOLS ===
# ✅ chain để nối iterables
all_items = chain(list1, list2, list3)
# ✅ islice để limit
first_100 = islice(huge_iterator, 100)
# ✅ groupby với sorted data
data.sort(key=key_func)
for key, group in groupby(data, key=key_func):
process(key, list(group))
# === PATTERNS ===
# ✅ Generator pipelines cho memory efficiency
def process_pipeline(data):
filtered = (x for x in data if x.valid)
transformed = (transform(x) for x in filtered)
return islice(transformed, 1000)Cross-links
- Cấu Trúc Dữ Liệu - List, dict, set fundamentals
- Generators - Iterator protocol deep dive
- Memory Model - Memory-efficient patterns
- functools - Functional programming tools