Skip to content

dataclasses & attrs Standard Library

Viết data classes ngắn gọn, type-safe - không cần boilerplate

Learning Outcomes

Sau khi hoàn thành trang này, bạn sẽ:

  • 🎯 Sử dụng @dataclass decorator để tạo data classes nhanh chóng
  • 🎯 Hiểu field() và các options như default_factory, compare, hash
  • 🎯 Implement __post_init__ cho validation và computed fields
  • 🎯 So sánh dataclass vs attrs vs Pydantic để chọn đúng tool
  • 🎯 Tránh các Production Pitfalls phổ biến

@dataclass Cơ bản

Vấn đề: Boilerplate Code

python
# ❌ Cách cũ: Quá nhiều boilerplate
class User:
    """Hand-written value object showing the boilerplate @dataclass removes.

    Instances compare and hash by the (name, email, age) triple.
    """

    def __init__(self, name: str, email: str, age: int = 0):
        self.name, self.email, self.age = name, email, age

    def _key(self):
        # Single source of truth for equality and hashing.
        return (self.name, self.email, self.age)

    def __repr__(self):
        return f"User(name={self.name!r}, email={self.email!r}, age={self.age!r})"

    def __eq__(self, other):
        if isinstance(other, User):
            return self._key() == other._key()
        # Let Python try the reflected comparison for foreign types.
        return NotImplemented

    def __hash__(self):
        return hash(self._key())

Giải pháp: @dataclass

python
from dataclasses import dataclass

# ✅ Với dataclass: Gọn gàng, tự động generate methods
@dataclass
class User:
    name: str
    email: str
    age: int = 0

# Tự động có:
# - __init__(self, name, email, age=0)
# - __repr__(self) → "User(name='...', email='...', age=0)"
# - __eq__(self, other) → So sánh theo giá trị

user = User("HPN", "hpn@example.com", 30)
print(user)  # User(name='HPN', email='hpn@example.com', age=30)

user1 = User("HPN", "hpn@example.com")
user2 = User("HPN", "hpn@example.com")
user1 == user2  # True (value equality)

@dataclass Parameters

python
from dataclasses import dataclass

@dataclass(
    init=True,           # Generate __init__
    repr=True,           # Generate __repr__
    eq=True,             # Generate __eq__
    order=False,         # Generate __lt__, __le__, __gt__, __ge__
    unsafe_hash=False,   # Generate __hash__ (cẩn thận với mutable!)
    frozen=False,        # Immutable (như tuple)
    match_args=True,     # Pattern matching support (3.10+)
    kw_only=False,       # All fields keyword-only (3.10+)
    slots=False,         # Use __slots__ for memory (3.10+)
)
class Config:
    host: str
    port: int = 8080

frozen=True - Immutable Dataclass

python
from dataclasses import dataclass

@dataclass(frozen=True)
class Point:
    x: float
    y: float

p = Point(1.0, 2.0)
# p.x = 3.0  # FrozenInstanceError!

# ✅ Có thể dùng làm dict key (hashable)
points = {Point(0, 0): "origin", Point(1, 1): "diagonal"}

order=True - Comparison Methods

python
from dataclasses import dataclass

@dataclass(order=True)
class Version:
    major: int
    minor: int
    patch: int = 0

v1 = Version(1, 0, 0)
v2 = Version(2, 0, 0)
v3 = Version(1, 5, 0)

v1 < v2   # True
v1 < v3   # True (so sánh tuple: (1,0,0) < (1,5,0))
sorted([v2, v1, v3])  # [Version(1,0,0), Version(1,5,0), Version(2,0,0)]

slots=True - Memory Optimization (Python 3.10+)

python
from dataclasses import dataclass
import sys

@dataclass
class UserNoSlots:
    name: str
    age: int

@dataclass(slots=True)
class UserWithSlots:
    name: str
    age: int

# Memory comparison
no_slots = UserNoSlots("HPN", 30)
with_slots = UserWithSlots("HPN", 30)

sys.getsizeof(no_slots)        # ~48 bytes
sys.getsizeof(no_slots.__dict__)  # ~104 bytes
# Total: ~152 bytes

sys.getsizeof(with_slots)      # ~56 bytes
# Tiết kiệm ~63% memory!

kw_only=True - Keyword-Only Arguments (Python 3.10+)

python
from dataclasses import dataclass

@dataclass(kw_only=True)
class Config:
    host: str
    port: int
    debug: bool = False

# Phải dùng keyword arguments
config = Config(host="localhost", port=8080)
# config = Config("localhost", 8080)  # TypeError!

field() - Fine-grained Control

python
from dataclasses import dataclass, field
from typing import List

@dataclass
class User:
    name: str
    email: str

    # WRONG: a bare mutable default. @dataclass rejects this at class-definition time:
    # tags: List[str] = []  # ValueError: mutable default <class 'list'> ... use default_factory

    # CORRECT: default_factory builds a fresh list for every instance
    tags: List[str] = field(default_factory=list)

    # Field that does not appear in __init__
    id: int = field(init=False, default=0)

    # Field that does not appear in __repr__
    _cache: dict = field(default_factory=dict, repr=False)

    # Field that is ignored by comparisons
    created_at: str = field(default="", compare=False)

field() Parameters

python
from dataclasses import dataclass, field

@dataclass
class Example:
    # default: Giá trị mặc định (immutable only!)
    name: str = field(default="Unknown")
    
    # default_factory: Factory function cho mutable defaults
    items: list = field(default_factory=list)
    
    # init: Có xuất hiện trong __init__ không
    computed: int = field(init=False, default=0)
    
    # repr: Có xuất hiện trong __repr__ không
    internal: str = field(default="", repr=False)
    
    # compare: Có dùng trong __eq__ và ordering không
    metadata: dict = field(default_factory=dict, compare=False)
    
    # hash: Có dùng trong __hash__ không (None = theo compare)
    id: int = field(default=0, hash=True)
    
    # kw_only: Field này phải là keyword argument (3.10+)
    debug: bool = field(default=False, kw_only=True)

Pattern: Computed Fields

python
from dataclasses import dataclass, field

@dataclass
class Rectangle:
    """Rectangle whose area and perimeter are derived from its sides."""

    width: float
    height: float

    # Derived values: excluded from __init__ and filled in by __post_init__.
    area: float = field(init=False)
    perimeter: float = field(init=False)

    def __post_init__(self):
        w, h = self.width, self.height
        self.area = w * h
        self.perimeter = 2 * (w + h)

# Pass floats: with int arguments the results are ints and print as 50 / 30.
rect = Rectangle(10.0, 5.0)
print(rect.area)       # 50.0
print(rect.perimeter)  # 30.0

__post_init__ - Post-Initialization

__post_init__ được gọi sau __init__, dùng cho validation và computed fields.

Basic Usage

python
from dataclasses import dataclass

@dataclass
class User:
    """User record that validates and normalizes itself after construction."""

    name: str
    email: str
    age: int = 0

    def __post_init__(self) -> None:
        # Reject invalid input before touching any field.
        if self.age < 0:
            raise ValueError("Age cannot be negative")

        # Store the email lower-cased and without surrounding whitespace.
        normalized_email = self.email.lower().strip()
        self.email = normalized_email

        # Store the name without surrounding whitespace.
        trimmed_name = self.name.strip()
        self.name = trimmed_name

user = User("  HPN  ", "  HPN@Example.COM  ", 30)
print(user.name)   # "HPN"
print(user.email)  # "hpn@example.com"

# User("Test", "test@test.com", -1)  # ValueError!

InitVar - Init-Only Variables

python
from dataclasses import dataclass, field, InitVar

@dataclass
class User:
    name: str
    email: str

    # InitVar: accepted by __init__ and forwarded to __post_init__, not stored as a field
    password: InitVar[str]

    # Derived from password in __post_init__
    password_hash: str = field(init=False)

    def __post_init__(self, password: str):
        # password is only available here, as an argument
        # NOTE(review): a single unsalted SHA-256 is enough to demonstrate InitVar,
        # but it is not suitable for storing real passwords; use a salted KDF such
        # as hashlib.scrypt or hashlib.pbkdf2_hmac in production code.
        import hashlib
        self.password_hash = hashlib.sha256(password.encode()).hexdigest()

user = User("HPN", "hpn@test.com", "secret123")
print(user.password_hash)  # 64-character hex SHA-256 digest of "secret123"
# user.password  # AttributeError: the InitVar is not stored on the instance

Pattern: Dependency Injection

python
from dataclasses import dataclass, field, InitVar
from typing import Optional

@dataclass
class Service:
    """Service with an optionally injected logger.

    ``logger`` is an init-only argument. The resolved logger is stored on the
    private ``_logger`` field, which is excluded from ``__init__`` and
    ``__repr__``.
    """

    name: str

    # Optional dependency injection
    logger: InitVar[Optional["Logger"]] = None
    _logger: "Logger" = field(init=False, repr=False)

    def __post_init__(self, logger: Optional["Logger"]):
        # Compare against None explicitly: `logger or DefaultLogger()` would
        # silently discard an injected logger that happens to be falsy
        # (for example one that defines __len__ or __bool__).
        self._logger = logger if logger is not None else DefaultLogger()

Inheritance với Dataclass

Basic Inheritance

python
from dataclasses import dataclass

@dataclass
class Person:
    name: str
    age: int

@dataclass
class Employee(Person):
    employee_id: str
    department: str = "Engineering"

emp = Employee("HPN", 30, "E001")
print(emp)  # Employee(name='HPN', age=30, employee_id='E001', department='Engineering')

Field Order với Defaults

python
from dataclasses import dataclass, field

# ❌ PROBLEM: Parent có default, child không có
@dataclass
class Base:
    name: str = "default"

# @dataclass
# class Child(Base):
#     id: int  # TypeError: non-default argument follows default argument

# ✅ SOLUTION 1: Child cũng có default
@dataclass
class Child(Base):
    id: int = 0

# ✅ SOLUTION 2: Dùng field(kw_only=True) (Python 3.10+)
@dataclass
class Base:
    name: str = "default"

@dataclass
class Child(Base):
    id: int = field(kw_only=True)

child = Child(id=1)  # name="default", id=1

Override __post_init__

python
from dataclasses import dataclass

@dataclass
class Base:
    name: str
    
    def __post_init__(self):
        self.name = self.name.upper()

@dataclass
class Child(Base):
    """Extends Base with an age that must not be negative."""

    age: int

    def __post_init__(self):
        super().__post_init__()  # Run the parent's __post_init__ first
        if self.age < 0:
            # 0 is accepted by this check, so the message says "negative"
            # rather than claiming the value must be positive.
            raise ValueError("Age cannot be negative")

child = Child("hpn", 30)
print(child.name)  # "HPN" (từ parent's __post_init__)

dataclass vs namedtuple

python
from dataclasses import dataclass
from typing import NamedTuple

# === NAMEDTUPLE ===
class PointNT(NamedTuple):
    x: float
    y: float

# === DATACLASS ===
@dataclass
class PointDC:
    x: float
    y: float

So sánh Chi tiết

| Feature | namedtuple | dataclass |
|---|---|---|
| Immutable | ✅ Always | ❌ Default (frozen=True để immutable) |
| Hashable | ✅ Always | ❌ Default (frozen=True để hashable) |
| Memory | Nhỏ hơn | Lớn hơn (slots=True để tối ưu) |
| Tuple unpacking | x, y = point | ❌ Không |
| Index access | point[0] | ❌ Không |
| Default values | ✅ Yes | ✅ Yes |
| Mutable fields | ❌ No | ✅ Yes |
| __post_init__ | ❌ No | ✅ Yes |
| Inheritance | ⚠️ Limited | ✅ Full |

Khi nào dùng gì?

python
# ✅ NAMEDTUPLE: Immutable, lightweight, dict keys
class Coordinate(NamedTuple):
    lat: float
    lon: float

locations = {Coordinate(10.0, 106.0): "HCMC"}  # Hashable!
lat, lon = Coordinate(10.0, 106.0)  # Unpacking!

# ✅ DATACLASS: Mutable, complex logic, validation
@dataclass
class User:
    name: str
    email: str
    
    def __post_init__(self):
        self.email = self.email.lower()

# ✅ FROZEN DATACLASS: Best of both (nhưng không unpack được)
@dataclass(frozen=True)
class Config:
    host: str
    port: int = 8080

attrs Library

attrs là thư viện third-party mạnh hơn dataclass và ra đời trước khi dataclasses được đưa vào standard library (Python 3.7).

Installation

bash
pip install attrs

Basic Usage

python
import attrs

@attrs.define
class User:
    name: str
    email: str
    age: int = 0

# Tương đương @dataclass(slots=True, eq=True, ...)
user = User("HPN", "hpn@test.com", 30)

attrs Validators

python
import attrs
from attrs import validators

@attrs.define
class User:
    name: str = attrs.field(validator=validators.instance_of(str))
    email: str = attrs.field(validator=[
        validators.instance_of(str),
        validators.matches_re(r'^[\w\.-]+@[\w\.-]+\.\w+$')
    ])
    age: int = attrs.field(validator=[
        validators.instance_of(int),
        validators.ge(0),  # >= 0
        validators.le(150)  # <= 150
    ])

# Validation tự động khi tạo instance
user = User("HPN", "hpn@test.com", 30)  # OK
# User("HPN", "invalid-email", 30)  # ValueError!
# User("HPN", "hpn@test.com", -1)   # ValueError!

attrs Converters

python
import attrs

@attrs.define
class Config:
    host: str = attrs.field(converter=str.lower)
    port: int = attrs.field(converter=int)
    debug: bool = attrs.field(converter=bool, default=False)

config = Config("LOCALHOST", "8080", 1)
print(config.host)   # "localhost"
print(config.port)   # 8080 (int)
print(config.debug)  # True

attrs vs dataclass

| Feature | dataclass | attrs |
|---|---|---|
| Built-in | ✅ Python 3.7+ | ❌ pip install |
| Validators | ❌ Manual | ✅ Built-in |
| Converters | ❌ Manual | ✅ Built-in |
| slots | ✅ 3.10+ | ✅ Default |
| Performance | Good | Better |
| Ecosystem | Standard | Rich (cattrs, etc.) |

Pydantic Comparison

Pydantic là thư viện validation mạnh nhất, đặc biệt cho API/JSON data.

Installation

bash
pip install "pydantic[email]"  # the [email] extra installs email-validator, which EmailStr requires

Basic Usage

python
from pydantic import BaseModel, EmailStr, Field
from typing import Optional

class User(BaseModel):
    name: str = Field(..., min_length=1, max_length=100)
    email: EmailStr
    age: int = Field(default=0, ge=0, le=150)
    bio: Optional[str] = None

# Validation tự động
user = User(name="HPN", email="hpn@test.com", age=30)

# Type coercion
user = User(name="HPN", email="hpn@test.com", age="30")  # age: int = 30

# Validation error
# User(name="", email="invalid", age=-1)  # ValidationError!

Pydantic JSON Serialization

python
from pydantic import BaseModel, field_serializer
from datetime import datetime

class Event(BaseModel):
    """Event whose timestamp is emitted as an ISO 8601 string in JSON output."""

    name: str
    timestamp: datetime

    # This snippet uses the Pydantic v2 API (model_dump / model_validate).
    # The v1-style `class Config: json_encoders = {...}` is deprecated in v2,
    # so the JSON encoding is declared with a field serializer instead.
    # when_used="json" keeps model_dump() returning a datetime object.
    @field_serializer("timestamp", when_used="json")
    def serialize_timestamp(self, value: datetime) -> str:
        return value.isoformat()

event = Event(name="Deploy", timestamp=datetime.now())

# Serialize to dict/JSON
event.model_dump()       # {'name': 'Deploy', 'timestamp': datetime(...)}
event.model_dump_json()  # '{"name": "Deploy", "timestamp": "2024-..."}'

# Parse from dict/JSON
Event.model_validate({"name": "Test", "timestamp": "2024-01-01T00:00:00"})
Event.model_validate_json('{"name": "Test", "timestamp": "2024-01-01T00:00:00"}')

So sánh Tổng hợp

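| Feature | dataclass | attrs | Pydantic |
|---|---|---|---|
| Built-in | ✅ | ❌ | ❌ |
| Validation | ❌ Manual | ✅ Basic | ✅ Advanced |
| Type coercion | ❌ | ⚠️ Converters | ✅ |
| JSON support | ❌ Manual (asdict + json) | ⚠️ cattrs | ✅ Built-in |
| Performance | Fast | Faster | Slower* |
| Use case | Simple data | Complex data | API/Config |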

*Pydantic v2 đã cải thiện performance đáng kể

Khi nào dùng gì?

python
# ✅ DATACLASS: Internal data structures, simple DTOs
@dataclass
class Point:
    x: float
    y: float

# ✅ ATTRS: Complex validation, performance-critical
@attrs.define
class User:
    name: str = attrs.field(validator=validators.instance_of(str))
    age: int = attrs.field(validator=validators.ge(0))

# ✅ PYDANTIC: API models, config parsing, JSON handling
class APIResponse(BaseModel):
    status: str
    data: dict
    timestamp: datetime

Advanced Patterns

Pattern 1: Factory Methods

python
from dataclasses import dataclass
from typing import Self  # Python 3.11+

@dataclass
class User:
    name: str
    email: str
    role: str = "user"
    
    @classmethod
    def admin(cls, name: str, email: str) -> Self:
        """Factory method cho admin user."""
        return cls(name=name, email=email, role="admin")
    
    @classmethod
    def from_dict(cls, data: dict) -> Self:
        """Factory method từ dictionary."""
        return cls(**data)

admin = User.admin("HPN", "hpn@test.com")
user = User.from_dict({"name": "Test", "email": "test@test.com"})

Pattern 2: Immutable with Replace

python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class Config:
    host: str
    port: int
    debug: bool = False

config = Config("localhost", 8080)

# Tạo copy với một số fields thay đổi
dev_config = replace(config, debug=True)
prod_config = replace(config, host="prod.example.com", port=443)

print(dev_config)   # Config(host='localhost', port=8080, debug=True)
print(prod_config)  # Config(host='prod.example.com', port=443, debug=False)

Pattern 3: Serialization

python
from dataclasses import dataclass, asdict, astuple
import json

@dataclass
class User:
    name: str
    email: str
    age: int = 0

user = User("HPN", "hpn@test.com", 30)

# Convert to dict
user_dict = asdict(user)
# {'name': 'HPN', 'email': 'hpn@test.com', 'age': 30}

# Convert to tuple
user_tuple = astuple(user)
# ('HPN', 'hpn@test.com', 30)

# JSON serialization
json_str = json.dumps(asdict(user))
# '{"name": "HPN", "email": "hpn@test.com", "age": 30}'

# JSON deserialization
data = json.loads(json_str)
user_restored = User(**data)

Pattern 4: Nested Dataclasses

python
from dataclasses import dataclass, field, asdict
from typing import List

@dataclass
class Address:
    street: str
    city: str
    country: str = "Vietnam"

@dataclass
class User:
    name: str
    email: str
    addresses: List[Address] = field(default_factory=list)

user = User(
    name="HPN",
    email="hpn@test.com",
    addresses=[
        Address("123 Main St", "HCMC"),
        Address("456 Side St", "Hanoi")
    ]
)

# asdict handles nested dataclasses
user_dict = asdict(user)
# {
#     'name': 'HPN',
#     'email': 'hpn@test.com',
#     'addresses': [
#         {'street': '123 Main St', 'city': 'HCMC', 'country': 'Vietnam'},
#         {'street': '456 Side St', 'city': 'Hanoi', 'country': 'Vietnam'}
#     ]
# }

Production Pitfalls

Pitfall 1: Mutable Default Values

python
from dataclasses import dataclass, field

# BUG: a bare mutable default. @dataclass refuses list/dict/set defaults
# (and, from Python 3.11, any unhashable default) at class-definition time:
#
# @dataclass
# class User:
#     name: str
#     tags: list = []  # ValueError: mutable default <class 'list'> for field tags
#                      # is not allowed: use default_factory
#
# A mutable default that is hashable (e.g. an instance of your own class) is NOT
# detected and would be shared by every instance, so use default_factory there too.

# FIX: default_factory is called once per instance
@dataclass
class User:
    name: str
    tags: list = field(default_factory=list)

u1 = User("A")
u2 = User("B")
u1.tags.append("admin")
print(u2.tags)  # [] - each instance owns its own list

Pitfall 2: Hashability với Mutable Fields

python
from dataclasses import dataclass, field

# BUG: with the defaults eq=True and frozen=False, @dataclass sets __hash__ to None,
# so instances cannot be used in sets or as dict keys
@dataclass
class User:
    name: str
    tags: list = None

user = User("HPN", [])
# {user}  # TypeError: unhashable type: 'User'

# FIX 1: frozen=True (hashable, but instances can no longer be modified)
@dataclass(frozen=True)
class User:
    name: str

# FIX 2: unsafe_hash=True (careful: mutating a hashed field changes the hash).
# hash=False keeps the mutable list out of __hash__; it still takes part in __eq__.
@dataclass(unsafe_hash=True)
class User:
    name: str
    tags: list = field(default_factory=list, hash=False)

Pitfall 3: __post_init__ với Inheritance

python
from dataclasses import dataclass

@dataclass
class Base:
    name: str
    
    def __post_init__(self):
        print("Base __post_init__")

@dataclass
class Child(Base):
    age: int
    
    def __post_init__(self):
        # ❌ BUG: Quên gọi super()
        print("Child __post_init__")

child = Child("HPN", 30)
# Chỉ in "Child __post_init__"
# Base's __post_init__ không được gọi!

# ✅ FIX: Gọi super().__post_init__()
@dataclass
class Child(Base):
    age: int
    
    def __post_init__(self):
        super().__post_init__()
        print("Child __post_init__")

Pitfall 4: slots với Inheritance

python
from dataclasses import dataclass

# ❌ BUG: Parent không có slots, child có slots
@dataclass
class Base:
    name: str

@dataclass(slots=True)
class Child(Base):
    age: int

child = Child("HPN", 30)
child.dynamic = "value"  # Vẫn hoạt động! (từ Base's __dict__)

# ✅ FIX: Cả parent và child đều có slots
@dataclass(slots=True)
class Base:
    name: str

@dataclass(slots=True)
class Child(Base):
    age: int

Pitfall 5: asdict với Non-Serializable Fields

python
from dataclasses import dataclass, asdict, field
from datetime import datetime
import json

@dataclass
class Event:
    name: str
    timestamp: datetime

event = Event("Deploy", datetime.now())

# ❌ BUG: datetime không JSON serializable
# json.dumps(asdict(event))  # TypeError!

# ✅ FIX: Custom dict factory
def custom_asdict(obj):
    """Convert a dataclass instance to a JSON-friendly dict.

    Works like ``dataclasses.asdict``, but every ``datetime`` found anywhere
    in the result is replaced by its ISO 8601 string. This includes values
    inside nested dataclasses, dicts, lists and tuples, not only the
    top-level fields.

    Args:
        obj: A dataclass instance.

    Returns:
        A plain dict that ``json.dumps`` can serialize as long as the only
        non-JSON values in ``obj`` are datetimes.
    """
    def convert(o):
        if isinstance(o, datetime):
            return o.isoformat()
        if isinstance(o, dict):
            return {k: convert(v) for k, v in o.items()}
        if isinstance(o, list):
            return [convert(v) for v in o]
        if isinstance(o, tuple):
            items = [convert(v) for v in o]
            # Named tuples take positional fields; plain tuples take one iterable.
            return type(o)(*items) if hasattr(o, "_fields") else type(o)(items)
        return o

    # asdict() has already turned nested dataclasses into dicts, so a single
    # recursive pass over its result reaches every value.
    return convert(asdict(obj))

json.dumps(custom_asdict(event))  # OK!

Quick Reference

python
from dataclasses import dataclass, field, asdict, astuple, replace

# === BASIC ===
@dataclass
class User:
    name: str
    age: int = 0

# === IMMUTABLE ===
@dataclass(frozen=True)
class Config:
    host: str
    port: int

# === MEMORY OPTIMIZED (3.10+) ===
@dataclass(slots=True)
class Point:
    x: float
    y: float

# === FIELD OPTIONS ===
@dataclass
class Example:
    """One field per commonly used field() option.

    Every field carries a default: a field without one may not follow a field
    that has one, and an init=False field without a default would be left
    unset and break __repr__.
    """

    items: list = field(default_factory=list)                    # Mutable default
    computed: int = field(init=False, default=0)                 # Not in __init__
    internal: str = field(default="", repr=False)                # Not in __repr__
    metadata: dict = field(default_factory=dict, compare=False)  # Not in __eq__

# === POST INIT ===
@dataclass
class Validated:
    """Holds an integer that is checked right after construction."""

    value: int

    def __post_init__(self):
        # Zero passes this check, so the message says "non-negative"
        # instead of "positive".
        if self.value < 0:
            raise ValueError("Must be non-negative")

# === UTILITIES ===
asdict(obj)           # Convert to dict
astuple(obj)          # Convert to tuple
replace(obj, **kw)    # Copy with changes