Skip to content

collections & itertools Standard Library

Đừng reinvent the wheel - stdlib đã có sẵn data structures tối ưu

Learning Outcomes

Sau khi hoàn thành trang này, bạn sẽ:

  • 🎯 Sử dụng defaultdict, Counter, deque đúng use case
  • 🎯 Hiểu khi nào dùng namedtuple vs dataclass
  • 🎯 Master itertools recipes cho data processing
  • 🎯 Viết code memory-efficient với generators
  • 🎯 Tránh các Production Pitfalls phổ biến

collections Module

defaultdict - Auto-Initialize Keys

python
from collections import defaultdict

# ❌ Cách cũ: Check key existence
word_count = {}
for word in words:
    if word not in word_count:
        word_count[word] = 0
    word_count[word] += 1

# ✅ defaultdict: Auto-initialize
word_count = defaultdict(int)  # Default value: 0
for word in words:
    word_count[word] += 1

# Các factory functions phổ biến
defaultdict(int)      # Default: 0
defaultdict(list)     # Default: []
defaultdict(set)      # Default: set()
defaultdict(str)      # Default: ""
defaultdict(lambda: "N/A")  # Custom default

defaultdict Patterns

python
from collections import defaultdict

# Pattern 1: Grouping
users_by_role = defaultdict(list)
for user in users:
    users_by_role[user.role].append(user)

# Pattern 2: Counting with conditions
error_counts = defaultdict(int)
for log in logs:
    if log.level == "ERROR":
        error_counts[log.service] += 1

# Pattern 3: Nested defaultdict
# Tạo nested dict tự động
def nested_dict():
    return defaultdict(nested_dict)

data = nested_dict()
data["users"]["alice"]["settings"]["theme"] = "dark"
# Không cần tạo intermediate dicts!

# Pattern 4: Set operations
tags_by_post = defaultdict(set)
for post, tag in post_tags:
    tags_by_post[post].add(tag)

⚠️ DEFAULTDICT GOTCHA

defaultdict tạo key mới khi access key không tồn tại. Điều này có thể gây side effects không mong muốn.

python
d = defaultdict(list)
if d["missing"]:  # Tạo key "missing" với value []!
    pass

# ✅ Dùng .get() để tránh side effect
if d.get("missing"):
    pass

Counter - Đếm Elements

python
from collections import Counter

# Đếm từ string/list
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
counter = Counter(words)
# Counter({'apple': 3, 'banana': 2, 'cherry': 1})

# Đếm characters
char_count = Counter("mississippi")
# Counter({'i': 4, 's': 4, 'p': 2, 'm': 1})

# Từ dict
counter = Counter({"a": 4, "b": 2})

Counter Methods

python
from collections import Counter

counter = Counter(["a", "b", "a", "c", "a", "b"])

# Most common elements
counter.most_common(2)  # [('a', 3), ('b', 2)]

# Total count
counter.total()  # 6 (Python 3.10+)
sum(counter.values())  # 6 (older Python)

# Elements iterator (repeats each element)
list(counter.elements())  # ['a', 'a', 'a', 'b', 'b', 'c']

# Update counts
counter.update(["a", "d"])  # Add counts
counter.subtract(["a"])     # Subtract counts

# Arithmetic operations
c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)

c1 + c2  # Counter({'a': 4, 'b': 3})
c1 - c2  # Counter({'a': 2}) - chỉ giữ positive
c1 & c2  # Counter({'a': 1, 'b': 1}) - min
c1 | c2  # Counter({'a': 3, 'b': 2}) - max

Counter Use Cases

python
from collections import Counter

# Use case 1: Find duplicates
def find_duplicates(items):
    return [item for item, count in Counter(items).items() if count > 1]

# Use case 2: Anagram check
def is_anagram(s1: str, s2: str) -> bool:
    return Counter(s1.lower()) == Counter(s2.lower())

# Use case 3: Top N items
def top_n_words(text: str, n: int = 10) -> list[tuple[str, int]]:
    words = text.lower().split()
    return Counter(words).most_common(n)

# Use case 4: Histogram
def print_histogram(data: list[str]) -> None:
    counter = Counter(data)
    for item, count in counter.most_common():
        print(f"{item}: {'' * count}")

deque - Double-Ended Queue

python
from collections import deque

# Tạo deque
d = deque([1, 2, 3])
d = deque(maxlen=3)  # Fixed size (auto-discard oldest)

# Operations ở cả hai đầu - O(1)
d.append(4)       # Add right
d.appendleft(0)   # Add left
d.pop()           # Remove right
d.popleft()       # Remove left

# Extend
d.extend([4, 5])       # Add multiple right
d.extendleft([0, -1])  # Add multiple left (reversed order!)

# Rotate
d.rotate(1)   # Rotate right
d.rotate(-1)  # Rotate left

deque vs list

Operationlistdeque
append(x)O(1)O(1)
pop()O(1)O(1)
insert(0, x)O(n)O(1)
pop(0)O(n)O(1)
[i] accessO(1)O(n)

💡 KHI NÀO DÙNG DEQUE?

  • Queue/Stack operations: append/pop ở cả hai đầu
  • Sliding window: maxlen tự động discard
  • BFS algorithms: popleft() cho FIFO

KHÔNG dùng deque khi: Cần random access d[i] thường xuyên

deque Patterns

python
from collections import deque

# Pattern 1: Sliding window (last N items)
recent_logs = deque(maxlen=100)
for log in log_stream:
    recent_logs.append(log)
    # Tự động giữ 100 logs gần nhất

# Pattern 2: BFS traversal
def bfs(graph, start):
    visited = {start}
    queue = deque([start])
    
    while queue:
        node = queue.popleft()  # O(1) - FIFO
        for neighbor in graph[node]:
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)
    
    return visited

# Pattern 3: Moving average
class MovingAverage:
    def __init__(self, window_size: int):
        self.window = deque(maxlen=window_size)
    
    def add(self, value: float) -> float:
        self.window.append(value)
        return sum(self.window) / len(self.window)

# Pattern 4: Undo/Redo stack
class UndoManager:
    def __init__(self, max_history: int = 50):
        self.undo_stack = deque(maxlen=max_history)
        self.redo_stack = deque(maxlen=max_history)
    
    def do(self, action):
        self.undo_stack.append(action)
        self.redo_stack.clear()
    
    def undo(self):
        if self.undo_stack:
            action = self.undo_stack.pop()
            self.redo_stack.append(action)
            return action.reverse()

namedtuple vs dataclass

python
from collections import namedtuple
from dataclasses import dataclass
from typing import NamedTuple

# === NAMEDTUPLE ===
# Cách 1: Function call
Point = namedtuple("Point", ["x", "y"])
p = Point(1, 2)

# Cách 2: typing.NamedTuple (recommended)
class Point(NamedTuple):
    x: float
    y: float
    
    def distance(self) -> float:
        return (self.x**2 + self.y**2) ** 0.5

# === DATACLASS ===
@dataclass
class Point:
    x: float
    y: float
    
    def distance(self) -> float:
        return (self.x**2 + self.y**2) ** 0.5

So Sánh Chi Tiết

Featurenamedtupledataclass
Immutable✅ Yes❌ No (default)
Hashable✅ Yes❌ No (default)
MemoryNhỏ hơnLớn hơn
Default values✅ Yes✅ Yes
Methods✅ Yes✅ Yes
Inheritance⚠️ Limited✅ Full
__slots__✅ Built-in❌ Manual
Mutable fields❌ No✅ Yes
Post-init❌ No__post_init__

Khi Nào Dùng Gì?

python
# ✅ NAMEDTUPLE: Immutable data, dict keys, lightweight
class Coordinate(NamedTuple):
    lat: float
    lon: float

locations = {Coordinate(10.0, 106.0): "HCMC"}  # Hashable!

# ✅ DATACLASS: Mutable data, complex logic, OOP
@dataclass
class User:
    name: str
    email: str
    age: int = 0
    
    def __post_init__(self):
        self.email = self.email.lower()

# ✅ FROZEN DATACLASS: Best of both worlds
@dataclass(frozen=True)
class Config:
    host: str
    port: int = 8080

config = Config("localhost")
# config.port = 9000  # FrozenInstanceError!

itertools Module

Infinite Iterators

python
from itertools import count, cycle, repeat

# count: Đếm vô hạn
for i in count(10, 2):  # 10, 12, 14, 16, ...
    if i > 20:
        break
    print(i)

# cycle: Lặp vô hạn
colors = cycle(["red", "green", "blue"])
for _ in range(5):
    print(next(colors))  # red, green, blue, red, green

# repeat: Lặp lại element
list(repeat("x", 3))  # ['x', 'x', 'x']

Combinatoric Iterators

python
from itertools import product, permutations, combinations, combinations_with_replacement

# product: Cartesian product
list(product("AB", "12"))
# [('A', '1'), ('A', '2'), ('B', '1'), ('B', '2')]

list(product(range(2), repeat=3))
# [(0,0,0), (0,0,1), (0,1,0), (0,1,1), (1,0,0), (1,0,1), (1,1,0), (1,1,1)]

# permutations: Hoán vị
list(permutations("ABC", 2))
# [('A','B'), ('A','C'), ('B','A'), ('B','C'), ('C','A'), ('C','B')]

# combinations: Tổ hợp (không lặp)
list(combinations("ABC", 2))
# [('A', 'B'), ('A', 'C'), ('B', 'C')]

# combinations_with_replacement: Tổ hợp (có lặp)
list(combinations_with_replacement("AB", 2))
# [('A', 'A'), ('A', 'B'), ('B', 'B')]

Terminating Iterators

python
from itertools import (
    chain, compress, dropwhile, takewhile,
    filterfalse, groupby, islice, starmap, zip_longest
)

# chain: Nối iterables
list(chain([1, 2], [3, 4], [5]))  # [1, 2, 3, 4, 5]
list(chain.from_iterable([[1, 2], [3, 4]]))  # [1, 2, 3, 4]

# compress: Filter by selector
list(compress("ABCDE", [1, 0, 1, 0, 1]))  # ['A', 'C', 'E']

# dropwhile/takewhile: Conditional slicing
list(dropwhile(lambda x: x < 5, [1, 4, 6, 4, 1]))  # [6, 4, 1]
list(takewhile(lambda x: x < 5, [1, 4, 6, 4, 1]))  # [1, 4]

# filterfalse: Inverse of filter
list(filterfalse(lambda x: x % 2, range(10)))  # [0, 2, 4, 6, 8]

# islice: Slice iterator
list(islice(range(100), 5, 10))  # [5, 6, 7, 8, 9]

# starmap: map với unpacking
list(starmap(pow, [(2, 3), (3, 2)]))  # [8, 9]

# zip_longest: zip với fill value
list(zip_longest("AB", "xyz", fillvalue="-"))
# [('A', 'x'), ('B', 'y'), ('-', 'z')]

groupby - Nhóm Consecutive Elements

python
from itertools import groupby
from operator import itemgetter

# ⚠️ QUAN TRỌNG: Data phải được SORT trước!
data = [
    {"name": "Alice", "dept": "Engineering"},
    {"name": "Bob", "dept": "Engineering"},
    {"name": "Charlie", "dept": "Sales"},
    {"name": "David", "dept": "Sales"},
]

# Sort by grouping key first!
data.sort(key=itemgetter("dept"))

# Group
for dept, members in groupby(data, key=itemgetter("dept")):
    print(f"{dept}: {[m['name'] for m in members]}")
# Engineering: ['Alice', 'Bob']
# Sales: ['Charlie', 'David']

🚨 GROUPBY TRAP

groupby chỉ nhóm consecutive elements. Nếu data chưa sort, kết quả sẽ sai!

python
data = [1, 1, 2, 1, 1]  # Không sorted
list((k, list(g)) for k, g in groupby(data))
# [(1, [1, 1]), (2, [2]), (1, [1, 1])]  # 3 groups, không phải 2!

itertools Recipes

Chunking

python
from itertools import islice

def chunked(iterable, n):
    """Chia iterable thành chunks của n elements."""
    it = iter(iterable)
    while chunk := list(islice(it, n)):
        yield chunk

list(chunked(range(10), 3))
# [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

# Python 3.12+: itertools.batched
from itertools import batched
list(batched(range(10), 3))
# [(0, 1, 2), (3, 4, 5), (6, 7, 8), (9,)]

Sliding Window

python
from collections import deque
from itertools import islice

def sliding_window(iterable, n):
    """Sliding window của size n."""
    it = iter(iterable)
    window = deque(islice(it, n), maxlen=n)
    if len(window) == n:
        yield tuple(window)
    for x in it:
        window.append(x)
        yield tuple(window)

list(sliding_window(range(5), 3))
# [(0, 1, 2), (1, 2, 3), (2, 3, 4)]

# Python 3.10+: itertools.pairwise (window size 2)
from itertools import pairwise
list(pairwise(range(5)))
# [(0, 1), (1, 2), (2, 3), (3, 4)]

Flatten Nested Iterables

python
from itertools import chain

def flatten(nested):
    """Flatten một level."""
    return chain.from_iterable(nested)

list(flatten([[1, 2], [3, 4], [5]]))
# [1, 2, 3, 4, 5]

# Deep flatten (recursive)
def deep_flatten(nested):
    for item in nested:
        if isinstance(item, (list, tuple)):
            yield from deep_flatten(item)
        else:
            yield item

list(deep_flatten([[1, [2, 3]], [4, [5, [6]]]]))
# [1, 2, 3, 4, 5, 6]

First/Last N Elements

python
from itertools import islice
from collections import deque

def first_n(iterable, n):
    """Lấy n elements đầu tiên."""
    return list(islice(iterable, n))

def last_n(iterable, n):
    """Lấy n elements cuối cùng."""
    return list(deque(iterable, maxlen=n))

# Usage
first_n(range(100), 5)  # [0, 1, 2, 3, 4]
last_n(range(100), 5)   # [95, 96, 97, 98, 99]

Unique Elements (Preserve Order)

python
def unique_everseen(iterable, key=None):
    """Unique elements, giữ thứ tự xuất hiện đầu tiên."""
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in iterable:
            if element not in seen:
                seen_add(element)
                yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element

list(unique_everseen([1, 2, 1, 3, 2, 4]))
# [1, 2, 3, 4]

# Case-insensitive unique
list(unique_everseen(["A", "b", "a", "B"], key=str.lower))
# ['A', 'b']

Data Processing Patterns

Pattern 1: Pipeline Processing

python
from itertools import filterfalse, islice
from typing import Iterator

def pipeline(data: Iterator[dict]) -> Iterator[dict]:
    """
    Data processing pipeline:
    1. Filter invalid records
    2. Transform fields
    3. Take first N
    """
    # Step 1: Filter
    valid = filter(lambda x: x.get("status") == "active", data)
    
    # Step 2: Transform
    transformed = (
        {**record, "name": record["name"].upper()}
        for record in valid
    )
    
    # Step 3: Limit
    return islice(transformed, 100)

# Usage - lazy evaluation!
results = pipeline(huge_data_stream)
for record in results:
    process(record)

Pattern 2: Parallel Iteration

python
from itertools import zip_longest

def merge_sorted(*iterables, key=None):
    """Merge multiple sorted iterables."""
    import heapq
    return heapq.merge(*iterables, key=key)

# Merge sorted files
file1 = (int(line) for line in open("sorted1.txt"))
file2 = (int(line) for line in open("sorted2.txt"))
for num in merge_sorted(file1, file2):
    print(num)

Pattern 3: Accumulate with Custom Function

python
from itertools import accumulate
from operator import mul

# Running sum
list(accumulate([1, 2, 3, 4]))  # [1, 3, 6, 10]

# Running product
list(accumulate([1, 2, 3, 4], mul))  # [1, 2, 6, 24]

# Running max
list(accumulate([3, 1, 4, 1, 5], max))  # [3, 3, 4, 4, 5]

# Custom: Running balance
transactions = [100, -20, 50, -30]
list(accumulate(transactions, initial=1000))
# [1000, 1100, 1080, 1130, 1100]

Production Pitfalls

Pitfall 1: Exhausted Iterators

python
from itertools import islice

# ❌ BUG: Iterator exhausted sau lần dùng đầu
data = (x for x in range(10))
first_5 = list(islice(data, 5))   # [0, 1, 2, 3, 4]
next_5 = list(islice(data, 5))    # [5, 6, 7, 8, 9] - OK
remaining = list(data)             # [] - Empty!

# ✅ SỬA: Dùng itertools.tee hoặc list
from itertools import tee

data = (x for x in range(10))
data1, data2 = tee(data, 2)  # Tạo 2 independent iterators
# ⚠️ Cẩn thận: tee lưu data trong memory!

Pitfall 2: groupby Without Sorting

python
from itertools import groupby

# ❌ BUG: Data chưa sort
data = [("a", 1), ("b", 2), ("a", 3)]
result = {k: list(g) for k, g in groupby(data, key=lambda x: x[0])}
# {'a': [('a', 1)], 'b': [('b', 2)], 'a': [('a', 3)]}
# Key 'a' xuất hiện 2 lần!

# ✅ SỬA: Sort trước
data.sort(key=lambda x: x[0])
result = {k: list(g) for k, g in groupby(data, key=lambda x: x[0])}
# {'a': [('a', 1), ('a', 3)], 'b': [('b', 2)]}

Pitfall 3: defaultdict Side Effects

python
from collections import defaultdict

# ❌ BUG: Checking key creates it
d = defaultdict(list)
if d["missing"]:  # Creates empty list!
    pass
print(dict(d))  # {'missing': []}

# ✅ SỬA: Use .get() or 'in' check
if d.get("missing"):
    pass
# Or
if "missing" in d and d["missing"]:
    pass

Pitfall 4: Counter Negative Counts

python
from collections import Counter

# ❌ SURPRISE: Counter allows negative counts
c = Counter(a=3, b=1)
c.subtract({"a": 5})
print(c)  # Counter({'b': 1, 'a': -2})

# ✅ Để loại bỏ zero/negative
c = +c  # Unary plus removes non-positive
print(c)  # Counter({'b': 1})

Pitfall 5: deque Thread Safety

python
from collections import deque
import threading

# ❌ BUG: deque operations không atomic
d = deque()

def producer():
    for i in range(1000):
        d.append(i)

def consumer():
    while True:
        if d:  # Check và pop không atomic!
            item = d.popleft()  # Có thể raise IndexError

# ✅ SỬA: Dùng queue.Queue cho thread-safe
from queue import Queue

q = Queue()
q.put(item)
item = q.get()  # Thread-safe, blocking

Best Practices Summary

python
from collections import defaultdict, Counter, deque
from itertools import chain, groupby, islice
from typing import NamedTuple

# === COLLECTIONS ===

# ✅ defaultdict cho grouping/counting
groups = defaultdict(list)
for item in items:
    groups[item.category].append(item)

# ✅ Counter cho frequency analysis
word_freq = Counter(words)
top_10 = word_freq.most_common(10)

# ✅ deque cho queue operations
queue = deque(maxlen=100)  # Auto-discard oldest
queue.append(item)
item = queue.popleft()

# ✅ NamedTuple cho immutable records
class Point(NamedTuple):
    x: float
    y: float


# === ITERTOOLS ===

# ✅ chain để nối iterables
all_items = chain(list1, list2, list3)

# ✅ islice để limit
first_100 = islice(huge_iterator, 100)

# ✅ groupby với sorted data
data.sort(key=key_func)
for key, group in groupby(data, key=key_func):
    process(key, list(group))


# === PATTERNS ===

# ✅ Generator pipelines cho memory efficiency
def process_pipeline(data):
    filtered = (x for x in data if x.valid)
    transformed = (transform(x) for x in filtered)
    return islice(transformed, 1000)