Skip to content

pathlib & File I/O Standard Library

Quên os.path đi - pathlib là cách Pythonic để làm việc với filesystem

Learning Outcomes

Sau khi hoàn thành trang này, bạn sẽ:

  • 🎯 Sử dụng Path objects thay vì string manipulation
  • 🎯 Thực hiện file I/O an toàn với context managers
  • 🎯 Tìm files với glob patterns hiệu quả
  • 🎯 Migrate từ os.path → pathlib trong codebase cũ
  • 🎯 Tránh các Production Pitfalls khi làm việc với filesystem

Path Objects - Cơ Bản

Tạo Path Object

python
from pathlib import Path

# Cách 1: Từ string
path = Path("/home/user/documents")

# Cách 2: Current directory
cwd = Path.cwd()

# Cách 3: Home directory
home = Path.home()

# Cách 4: Từ __file__ (script location)
script_dir = Path(__file__).parent
config_path = script_dir / "config.yaml"

Path Concatenation với / Operator

python
from pathlib import Path

# ✅ PYTHONIC: Dùng / operator
base = Path("/var/log")
app_log = base / "myapp" / "app.log"
# PosixPath('/var/log/myapp/app.log')

# ❌ CŨ: String concatenation
import os
app_log = os.path.join("/var/log", "myapp", "app.log")

💡 TẠI SAO / OPERATOR?

Path overload / operator (__truediv__) để join paths. Đây là design pattern "operator overloading" - code đọc tự nhiên như đường dẫn thật.

Path Properties

python
from pathlib import Path

path = Path("/home/user/documents/report.pdf")

# Các thuộc tính hữu ích
path.name        # 'report.pdf' - filename
path.stem        # 'report' - filename without extension
path.suffix      # '.pdf' - extension
path.suffixes    # ['.pdf'] - all extensions (e.g., ['.tar', '.gz'])
path.parent      # PosixPath('/home/user/documents')
path.parents     # Sequence of parent directories
path.parts       # ('/', 'home', 'user', 'documents', 'report.pdf')
path.anchor      # '/' (root on Unix, 'C:\\' on Windows)

# Kiểm tra path
path.is_absolute()  # True
path.is_relative_to("/home")  # True (Python 3.9+)

Path Manipulation

python
from pathlib import Path

path = Path("/home/user/documents/report.pdf")

# Thay đổi extension
new_path = path.with_suffix(".docx")
# PosixPath('/home/user/documents/report.docx')

# Thay đổi filename
new_path = path.with_name("summary.pdf")
# PosixPath('/home/user/documents/summary.pdf')

# Thay đổi stem (giữ extension)
new_path = path.with_stem("final_report")  # Python 3.9+
# PosixPath('/home/user/documents/final_report.pdf')

# Resolve symlinks và relative paths
absolute = Path("./config").resolve()
# PosixPath('/current/working/dir/config')

File I/O với pathlib

Đọc File

python
from pathlib import Path

config_path = Path("config.yaml")

# ✅ Đọc toàn bộ text
content = config_path.read_text(encoding="utf-8")

# ✅ Đọc binary
data = config_path.read_bytes()

# ✅ Read line by line (memory efficient).
# Open inside a `with` block so the handle is closed deterministically;
# iterating over a bare config_path.open(...) keeps the file open until the
# file object happens to be garbage collected.
with config_path.open(encoding="utf-8") as f:
    for line in f:
        process(line.strip())

# ✅ Với context manager (recommended cho files lớn)
with config_path.open("r", encoding="utf-8") as f:
    for line in f:
        process(line)

Ghi File

python
from pathlib import Path

output = Path("output.txt")

# ✅ Ghi text (overwrite)
output.write_text("Hello, World!", encoding="utf-8")

# ✅ Ghi binary
output.write_bytes(b"\x00\x01\x02")

# ✅ Append mode
with output.open("a", encoding="utf-8") as f:
    f.write("\nNew line")

# ✅ Tạo parent directories nếu chưa có
output_dir = Path("logs/2024/01")
output_dir.mkdir(parents=True, exist_ok=True)
(output_dir / "app.log").write_text("Log entry")

JSON/YAML I/O Pattern

python
from pathlib import Path
import json

config_path = Path("config.json")

# ✅ Đọc JSON
config = json.loads(config_path.read_text(encoding="utf-8"))

# ✅ Ghi JSON (pretty print)
config_path.write_text(
    json.dumps(config, indent=2, ensure_ascii=False),
    encoding="utf-8"
)

# ✅ YAML (với PyYAML)
import yaml

yaml_path = Path("config.yaml")
config = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
# allow_unicode=True makes yaml.dump emit raw non-ASCII characters, so the
# write must pin UTF-8 as well; without it the platform default encoding is
# used and the file may not round-trip through the UTF-8 read above.
yaml_path.write_text(yaml.dump(config, allow_unicode=True), encoding="utf-8")

Glob Patterns - Tìm Files

Basic Glob

python
from pathlib import Path

project = Path("./src")

# Tìm tất cả .py files trong directory
py_files = list(project.glob("*.py"))

# Tìm recursive (tất cả subdirectories)
all_py = list(project.glob("**/*.py"))

# Tìm với pattern phức tạp
tests = list(project.glob("**/test_*.py"))
configs = list(project.glob("**/*.{json,yaml,yml}"))  # Không hỗ trợ!

⚠️ GLOB LIMITATIONS

Path.glob() và Path.rglob() không hỗ trợ {a,b} brace expansion. Hãy gọi glob nhiều lần (ví dụ gộp kết quả bằng itertools.chain), hoặc lọc kết quả theo suffix / fnmatch cho các pattern phức tạp.

Advanced Glob Patterns

python
from pathlib import Path

src = Path("./src")

# Pattern matching
src.glob("*.py")           # Chỉ trong src/
src.glob("**/*.py")        # Recursive
src.glob("**/[!_]*.py")    # Không bắt đầu bằng _
src.glob("module_?.py")    # Single character wildcard

# Kết hợp nhiều patterns
from itertools import chain

all_configs = chain(
    src.glob("**/*.json"),
    src.glob("**/*.yaml"),
    src.glob("**/*.toml"),
)

# Hoặc dùng rglob (recursive glob shortcut)
all_py = src.rglob("*.py")  # Tương đương glob("**/*.py")

Filtering Results

python
from pathlib import Path

project = Path(".")

# Lọc chỉ files (không directories)
files = [p for p in project.rglob("*") if p.is_file()]

# Lọc theo size
large_files = [
    p for p in project.rglob("*")
    if p.is_file() and p.stat().st_size > 1_000_000  # > 1MB
]

# Lọc theo modification time
from datetime import datetime, timedelta

cutoff = datetime.now() - timedelta(days=7)
recent = [
    p for p in project.rglob("*.log")
    if p.is_file() and datetime.fromtimestamp(p.stat().st_mtime) > cutoff
]

os.path → pathlib Migration

Bảng Chuyển Đổi

| os.path | pathlib | Ghi chú |
|---|---|---|
| os.path.join(a, b) | Path(a) / b | Dùng / operator |
| os.path.dirname(p) | Path(p).parent | Trả về Path object |
| os.path.basename(p) | Path(p).name | Filename |
| os.path.splitext(p) | Path(p).stem, .suffix | Tách riêng |
| os.path.exists(p) | Path(p).exists() | Method call |
| os.path.isfile(p) | Path(p).is_file() | Method call |
| os.path.isdir(p) | Path(p).is_dir() | Method call |
| os.path.abspath(p) | Path(p).resolve() | Resolve symlinks |
| os.path.expanduser(p) | Path(p).expanduser() | Expand ~ |
| os.getcwd() | Path.cwd() | Class method |
| os.listdir(p) | Path(p).iterdir() | Returns iterator |
| os.walk(p) | Path(p).rglob("*") | Recursive |
| glob.glob(pattern) | Path(".").glob(pattern) | Built-in |

Migration Example

python
# ❌ CŨ: os.path style
import os
import glob

def find_configs_old(base_dir: str) -> list[str]:
    """Return the paths of all non-empty config files under ``base_dir``.

    Legacy ``os.walk`` / ``os.path`` implementation, kept as the "before"
    side of the migration example. It accepts the same extensions as the
    pathlib-based ``find_configs`` (``.json``, ``.yaml``, ``.yml``) so the
    two versions return the same set of files.

    Args:
        base_dir: Directory to search recursively.

    Returns:
        Paths (as strings) of matching files whose size is greater than zero.
    """
    config_suffixes = ('.json', '.yaml', '.yml')
    configs = []
    # The directory list from os.walk is not needed, hence the underscore name.
    for root, _dirs, files in os.walk(base_dir):
        for f in files:
            # str.endswith accepts a tuple, so one call covers every suffix.
            if f.endswith(config_suffixes):
                full_path = os.path.join(root, f)
                if os.path.getsize(full_path) > 0:
                    configs.append(full_path)
    return configs

# ✅ MỚI: pathlib style
from pathlib import Path

def find_configs(base_dir: str | Path) -> list[Path]:
    base = Path(base_dir)
    return [
        p for p in base.rglob("*")
        if p.is_file()
        and p.suffix in {".json", ".yaml", ".yml"}
        and p.stat().st_size > 0
    ]

File Operations

Kiểm Tra và Tạo

python
from pathlib import Path

path = Path("data/output")

# Create the directory (and any missing parents) if needed.
# exist_ok=True already makes this safe to repeat, so a separate exists()
# check beforehand would only add a check-then-act race. If the path exists
# but is not a directory, mkdir raises FileExistsError instead of silently
# continuing.
path.mkdir(parents=True, exist_ok=True)

# Kiểm tra loại
path.is_file()      # True nếu là file
path.is_dir()       # True nếu là directory
path.is_symlink()   # True nếu là symbolic link
path.is_mount()     # True nếu là mount point

# Tạo file rỗng (touch)
Path("marker.txt").touch(exist_ok=True)

Copy, Move, Delete

python
from pathlib import Path
import shutil

src = Path("source.txt")
dst = Path("backup/source.txt")

# Copy file
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dst)  # Giữ metadata

# Copy directory
shutil.copytree(Path("src"), Path("src_backup"))

# Move/Rename
src.rename(dst)  # Move hoặc rename

# Delete file
Path("temp.txt").unlink(missing_ok=True)  # Python 3.8+

# Delete directory (empty)
Path("empty_dir").rmdir()

# Delete directory (recursive)
shutil.rmtree(Path("dir_with_contents"))

🚨 CẢNH BÁO

shutil.rmtree() xóa TOÀN BỘ directory tree không hỏi. Luôn double-check path trước khi gọi!

File Metadata

python
from pathlib import Path
from datetime import datetime

path = Path("document.pdf")
stat = path.stat()

# Size
size_bytes = stat.st_size
size_mb = stat.st_size / (1024 * 1024)

# Timestamps
# NOTE: st_ctime is the creation time only on Windows. On Unix it is the time
# of the last metadata change (chmod, rename, ...), not the creation time.
created = datetime.fromtimestamp(stat.st_ctime)
modified = datetime.fromtimestamp(stat.st_mtime)
accessed = datetime.fromtimestamp(stat.st_atime)

# Permissions (Unix)
import os  # required for os.access below; this snippet did not import it

mode = stat.st_mode
is_readable = path.is_file() and os.access(path, os.R_OK)

# Owner (Unix)
import pwd
owner = pwd.getpwuid(stat.st_uid).pw_name

Temporary Files

tempfile Integration

python
from pathlib import Path
import tempfile

# Temporary file that survives close(): delete=False disables auto-deletion,
# so the file must be removed manually once it is no longer needed.
with tempfile.NamedTemporaryFile(
    mode="w", suffix=".json", delete=False, encoding="utf-8"
) as f:
    temp_path = Path(f.name)
    f.write('{"key": "value"}')

try:
    # Use temp_path after the writer has been closed
    print(temp_path.read_text(encoding="utf-8"))
finally:
    # Manual cleanup, performed even if the code using the file raises
    temp_path.unlink()

# Temporary directory
with tempfile.TemporaryDirectory() as tmpdir:
    tmp = Path(tmpdir)
    (tmp / "data.txt").write_text("temporary data", encoding="utf-8")
    # The directory and its contents are removed when the context exits
Pattern: Safe File Write

python
from pathlib import Path
import tempfile
import shutil

def safe_write(path: Path, content: str) -> None:
    """Replace ``path`` with ``content`` (UTF-8) via a temp file and rename.

    The content is written to a temporary file in the same directory as the
    target and then renamed over it. If writing fails part-way, the existing
    target is left untouched and the temporary file is removed.

    Args:
        path: Destination file. Its parent directory must already exist.
        content: Text to store.

    Raises:
        OSError: If the temporary file cannot be created, written or renamed.
    """
    path = Path(path)

    # Create the temp file next to the target so the final rename stays on
    # one filesystem, which is what makes it atomic.
    fd, tmp_path = tempfile.mkstemp(
        dir=path.parent,
        prefix=f".{path.name}.",
        suffix=".tmp",
    )
    tmp = Path(tmp_path)

    try:
        # mkstemp returns an already-open descriptor that the caller owns.
        # Wrapping it with open() ensures it is closed when the block exits;
        # writing through a second handle would leak one descriptor per call.
        with open(fd, "w", encoding="utf-8") as handle:
            handle.write(content)
        tmp.replace(path)  # Atomic rename over the target
    except BaseException:
        # Also covers KeyboardInterrupt, so no stray temp file is left behind.
        tmp.unlink(missing_ok=True)
        raise

Cross-Platform Considerations

Windows vs Unix Paths

python
from pathlib import Path, PurePosixPath, PureWindowsPath

# Path() tự động chọn đúng class
path = Path("data/file.txt")
# Windows: WindowsPath('data\\file.txt')
# Unix: PosixPath('data/file.txt')

# Force specific path type (cho testing/parsing)
posix = PurePosixPath("/home/user/file.txt")
windows = PureWindowsPath("C:\\Users\\user\\file.txt")

# Parse a Windows-style path read from config, regardless of the host OS.
# Path() only understands the host's own flavour: on POSIX the backslashes
# would be kept as literal characters of one long file name.
config_path = "C:\\Users\\data\\config.yaml"
path = PureWindowsPath(config_path)
# PureWindowsPath('C:/Users/data/config.yaml')
# Pure paths do no I/O; wrap in Path(...) only when actually running on Windows.

Path Normalization

python
from pathlib import Path

# Resolve relative paths và symlinks
path = Path("./data/../data/./file.txt")
normalized = path.resolve()
# PosixPath('/absolute/path/to/data/file.txt')

# Lexical normalization only (symlinks are NOT resolved)
import os.path
from pathlib import PurePath

# PurePath drops "." segments but keeps "..", because collapsing ".." could
# point somewhere else when a symlink is involved.
lexical = PurePath("./data/../data/./file.txt")
# PurePosixPath('data/../data/file.txt')

# os.path.normpath collapses ".." purely by string manipulation.
normalized = PurePath(os.path.normpath("./data/../data/./file.txt"))
# PurePosixPath('data/file.txt')

Production Pitfalls

Pitfall 1: Encoding Issues

python
from pathlib import Path

# ❌ BUG: Không specify encoding
content = Path("data.txt").read_text()  # Dùng system default!

# ✅ SỬA: Luôn specify encoding
content = Path("data.txt").read_text(encoding="utf-8")

# ✅ Handle encoding errors
content = Path("data.txt").read_text(
    encoding="utf-8",
    errors="replace"  # Thay thế invalid chars
)

Pitfall 2: Race Conditions

python
from pathlib import Path

path = Path("data.txt")

# ❌ BUG: TOCTOU (Time-of-check to time-of-use)
if path.exists():
    content = path.read_text()  # File có thể bị xóa giữa check và read!

# ✅ SỬA: EAFP (Easier to Ask Forgiveness than Permission)
try:
    content = path.read_text(encoding="utf-8")
except FileNotFoundError:
    content = ""

Pitfall 3: Path Injection

python
from pathlib import Path

# ❌ BUG: User input không sanitize
def get_user_file(username: str) -> Path:
    """Build the profile path for ``username`` by plain string concatenation.

    Deliberately unsafe example: ``username`` is used as-is, so a value
    containing ``..`` segments yields a path that points outside ``data/``.
    """
    relative = "data/" + username + "/profile.json"
    return Path(relative)

# Attacker: username = "../../../etc"
# Result: data/../../../etc/profile.json → resolves to a path outside data/

# ✅ SỬA: Validate path stays within base
def get_user_file_safe(username: str, base: Path) -> Path:
    """Return the resolved profile path for ``username``, confined to ``base``.

    Args:
        username: Untrusted directory name appended below ``base``.
        base: Directory that the result must stay inside.

    Returns:
        The fully resolved ``<base>/<username>/profile.json`` path.

    Raises:
        ValueError: If the resolved path is not located under the resolved
            ``base``.
    """
    root = base.resolve()
    candidate = root.joinpath(username, "profile.json").resolve()

    # Compare resolved paths so ".." segments and symlinks are accounted for.
    if candidate.is_relative_to(root):
        return candidate

    raise ValueError("Path traversal detected!")

Pitfall 4: Symlink Attacks

python
from pathlib import Path
import os

# ❌ BUG: Follow symlinks blindly
def delete_user_data(user_dir: Path) -> None:
    """Unlink every entry found recursively under ``user_dir``.

    Deliberately unsafe example: nothing is checked before ``unlink()`` is
    called, neither whether the entry is a symlink nor whether it still lies
    within ``user_dir``.
    """
    entries = user_dir.rglob("*")
    for entry in entries:
        entry.unlink()

# ✅ SỬA: Check symlinks
def delete_user_data_safe(user_dir: Path) -> None:
    """Delete the files under ``user_dir`` without following symlinks.

    Symlinks are removed themselves (their targets are not touched). Any
    other entry is unlinked only when it resolves to a location inside the
    resolved ``user_dir`` and is a regular file; directories are left in
    place.
    """
    root = user_dir.resolve()

    for entry in root.rglob("*"):
        if entry.is_symlink():
            # unlink() on a symlink removes the link, not what it points to.
            entry.unlink()
        elif entry.resolve().is_relative_to(root) and entry.is_file():
            entry.unlink()

Pitfall 5: Large File Memory

python
from pathlib import Path

# ❌ BUG: Load entire file vào memory
huge_file = Path("10gb_log.txt")
content = huge_file.read_text()  # 💥 MemoryError!

# ✅ SỬA: Stream processing
def process_large_file(path: Path) -> int:
    """Count the lines of a UTF-8 text file that contain ``"ERROR"``.

    The file is streamed line by line, so it is never held in memory as a
    whole.
    """
    with path.open("r", encoding="utf-8") as stream:
        return sum(1 for row in stream if "ERROR" in row)

# ✅ SỬA: Chunked reading cho binary
def hash_large_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is read in binary mode in 8 KB blocks, so only one block is held
    in memory at a time.
    """
    import hashlib

    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

Pitfall 6: Glob Performance

python
from pathlib import Path

# ❌ CHẬM: Glob toàn bộ rồi filter
all_files = list(Path(".").rglob("*"))  # Load tất cả vào memory
py_files = [f for f in all_files if f.suffix == ".py"]

# ✅ NHANH: Glob specific pattern
py_files = list(Path(".").rglob("*.py"))

# ✅ NHANH HƠN: Generator (lazy evaluation)
def find_large_py_files(base: Path, min_size: int = 10000):
    """Lazily yield ``*.py`` paths under ``base`` larger than ``min_size`` bytes.

    Args:
        base: Directory to search recursively.
        min_size: Size threshold in bytes; only strictly larger entries are
            yielded.
    """
    yield from (
        candidate
        for candidate in base.rglob("*.py")
        if candidate.stat().st_size > min_size
    )

Best Practices Summary

python
from pathlib import Path

# === PATH CREATION ===
# ✅ Dùng Path objects
config = Path("config") / "app.yaml"

# ✅ Resolve relative paths
absolute = Path("./data").resolve()

# ✅ Type hints
def load_config(path: Path | str) -> dict:
    return json.loads(Path(path).read_text(encoding="utf-8"))


# === FILE I/O ===
# ✅ Luôn specify encoding
content = path.read_text(encoding="utf-8")

# ✅ Context manager cho files lớn
with path.open("r", encoding="utf-8") as f:
    for line in f:
        process(line)

# ✅ Atomic writes cho data quan trọng
# (Xem safe_write pattern ở trên)


# === DIRECTORY OPERATIONS ===
# ✅ Tạo với parents và exist_ok
Path("logs/2024").mkdir(parents=True, exist_ok=True)

# ✅ Dùng rglob cho recursive search
for py_file in src.rglob("*.py"):
    lint(py_file)


# === SECURITY ===
# ✅ Validate user input paths.
# Resolve both sides before comparing so ".." segments and symlinks cannot
# slip past the check. ValueError is a built-in exception; SecurityError is
# not defined anywhere in this snippet and would raise NameError instead.
if not target.resolve().is_relative_to(Path(base_dir).resolve()):
    raise ValueError("Path traversal!")

# ✅ EAFP over LBYL
try:
    content = path.read_text(encoding="utf-8")
except FileNotFoundError:
    content = default_content