Giao diện
pathlib & File I/O Standard Library
Quên os.path đi - pathlib là cách Pythonic để làm việc với filesystem
Learning Outcomes
Sau khi hoàn thành trang này, bạn sẽ:
- 🎯 Sử dụng Path objects thay vì string manipulation
- 🎯 Thực hiện file I/O an toàn với context managers
- 🎯 Tìm files với glob patterns hiệu quả
- 🎯 Migrate từ os.path → pathlib trong codebase cũ
- 🎯 Tránh các Production Pitfalls khi làm việc với filesystem
Path Objects - Cơ Bản
Tạo Path Object
python
from pathlib import Path
# Cách 1: Từ string
path = Path("/home/user/documents")
# Cách 2: Current directory
cwd = Path.cwd()
# Cách 3: Home directory
home = Path.home()
# Cách 4: Từ __file__ (script location)
script_dir = Path(__file__).parent
config_path = script_dir / "config.yaml"
Path Concatenation với / Operator
python
from pathlib import Path
# ✅ PYTHONIC: Dùng / operator
base = Path("/var/log")
app_log = base / "myapp" / "app.log"
# PosixPath('/var/log/myapp/app.log')
# ❌ CŨ: String concatenation
import os
app_log = os.path.join("/var/log", "myapp", "app.log")
💡 TẠI SAO / OPERATOR?
Path overload / operator (__truediv__) để join paths. Đây là design pattern "operator overloading" - code đọc tự nhiên như đường dẫn thật.
Path Properties
python
from pathlib import Path
path = Path("/home/user/documents/report.pdf")
# Các thuộc tính hữu ích
path.name # 'report.pdf' - filename
path.stem # 'report' - filename without extension
path.suffix # '.pdf' - extension
path.suffixes # ['.pdf'] - all extensions (e.g., ['.tar', '.gz'])
path.parent # PosixPath('/home/user/documents')
path.parents # Sequence of parent directories
path.parts # ('/', 'home', 'user', 'documents', 'report.pdf')
path.anchor # '/' (root on Unix, 'C:\\' on Windows)
# Kiểm tra path
path.is_absolute() # True
path.is_relative_to("/home") # True (Python 3.9+)
Path Manipulation
python
from pathlib import Path
path = Path("/home/user/documents/report.pdf")
# Thay đổi extension
new_path = path.with_suffix(".docx")
# PosixPath('/home/user/documents/report.docx')
# Thay đổi filename
new_path = path.with_name("summary.pdf")
# PosixPath('/home/user/documents/summary.pdf')
# Thay đổi stem (giữ extension)
new_path = path.with_stem("final_report") # Python 3.9+
# PosixPath('/home/user/documents/final_report.pdf')
# Resolve symlinks và relative paths
absolute = Path("./config").resolve()
# PosixPath('/current/working/dir/config')
File I/O với pathlib
Đọc File
python
from pathlib import Path
config_path = Path("config.yaml")
# ✅ Đọc toàn bộ text
content = config_path.read_text(encoding="utf-8")
# ✅ Đọc binary
data = config_path.read_bytes()
# ⚠️ Đọc từng dòng (memory efficient) - lưu ý: file handle không được đóng tường minh, nên ưu tiên context manager bên dưới
for line in config_path.open(encoding="utf-8"):
process(line.strip())
# ✅ Với context manager (recommended cho files lớn)
with config_path.open("r", encoding="utf-8") as f:
for line in f:
process(line)
Ghi File
python
from pathlib import Path
output = Path("output.txt")
# ✅ Ghi text (overwrite)
output.write_text("Hello, World!", encoding="utf-8")
# ✅ Ghi binary
output.write_bytes(b"\x00\x01\x02")
# ✅ Append mode
with output.open("a", encoding="utf-8") as f:
f.write("\nNew line")
# ✅ Tạo parent directories nếu chưa có
output_dir = Path("logs/2024/01")
output_dir.mkdir(parents=True, exist_ok=True)
(output_dir / "app.log").write_text("Log entry")
JSON/YAML I/O Pattern
python
from pathlib import Path
import json
config_path = Path("config.json")
# ✅ Đọc JSON
config = json.loads(config_path.read_text(encoding="utf-8"))
# ✅ Ghi JSON (pretty print)
config_path.write_text(
json.dumps(config, indent=2, ensure_ascii=False),
encoding="utf-8"
)
# ✅ YAML (với PyYAML)
import yaml
yaml_path = Path("config.yaml")
config = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
yaml_path.write_text(yaml.dump(config, allow_unicode=True), encoding="utf-8")
Glob Patterns - Tìm Files
Basic Glob
python
from pathlib import Path
project = Path("./src")
# Tìm tất cả .py files trong directory
py_files = list(project.glob("*.py"))
# Tìm recursive (tất cả subdirectories)
all_py = list(project.glob("**/*.py"))
# Tìm với pattern phức tạp
tests = list(project.glob("**/test_*.py"))
configs = list(project.glob("**/*.{json,yaml,yml}")) # Không hỗ trợ!
⚠️ GLOB LIMITATIONS
pathlib.glob() không hỗ trợ {a,b} brace expansion. Dùng multiple globs hoặc fnmatch cho patterns phức tạp.
Advanced Glob Patterns
python
from pathlib import Path
src = Path("./src")
# Pattern matching
src.glob("*.py") # Chỉ trong src/
src.glob("**/*.py") # Recursive
src.glob("**/[!_]*.py") # Không bắt đầu bằng _
src.glob("module_?.py") # Single character wildcard
# Kết hợp nhiều patterns
from itertools import chain
all_configs = chain(
src.glob("**/*.json"),
src.glob("**/*.yaml"),
src.glob("**/*.toml"),
)
# Hoặc dùng rglob (recursive glob shortcut)
all_py = src.rglob("*.py") # Tương đương glob("**/*.py")
Filtering Results
python
from pathlib import Path
project = Path(".")
# Lọc chỉ files (không directories)
files = [p for p in project.rglob("*") if p.is_file()]
# Lọc theo size
large_files = [
p for p in project.rglob("*")
if p.is_file() and p.stat().st_size > 1_000_000 # > 1MB
]
# Lọc theo modification time
from datetime import datetime, timedelta
cutoff = datetime.now() - timedelta(days=7)
recent = [
p for p in project.rglob("*.log")
if p.is_file() and datetime.fromtimestamp(p.stat().st_mtime) > cutoff
]
os.path → pathlib Migration
Bảng Chuyển Đổi
| os.path | pathlib | Ghi chú |
|---|---|---|
| os.path.join(a, b) | Path(a) / b | Dùng / operator |
| os.path.dirname(p) | Path(p).parent | Trả về Path object |
| os.path.basename(p) | Path(p).name | Filename |
| os.path.splitext(p) | Path(p).stem, .suffix | Tách riêng |
| os.path.exists(p) | Path(p).exists() | Method call |
| os.path.isfile(p) | Path(p).is_file() | Method call |
| os.path.isdir(p) | Path(p).is_dir() | Method call |
| os.path.abspath(p) | Path(p).resolve() | resolve() còn resolve symlinks (giống os.path.realpath); Path(p).absolute() thì không |
| os.path.expanduser(p) | Path(p).expanduser() | Expand ~ |
| os.getcwd() | Path.cwd() | Class method |
| os.listdir(p) | Path(p).iterdir() | Returns iterator |
| os.walk(p) | Path(p).rglob("*") | Recursive |
| glob.glob(pattern) | Path(".").glob(pattern) | Built-in |
Migration Example
python
# ❌ CŨ: os.path style
import os
import glob
def find_configs_old(base_dir: str) -> list[str]:
    """Collect non-empty config files under ``base_dir`` using the legacy ``os`` API.

    Deliberately written in the old ``os.walk`` / ``os.path`` style: this is the
    "before" half of the migration example.

    Args:
        base_dir: Directory to search recursively.

    Returns:
        Full paths (as strings) of files whose name ends in ``.json``, ``.yaml``
        or ``.yml`` and whose size is greater than zero.
    """
    configs = []
    # os.walk always yields (root, dirs, files); the directory list is not needed here.
    for root, _dirs, files in os.walk(base_dir):
        for f in files:
            # '.yml' is accepted too, so this matches the suffix set of the
            # pathlib version (find_configs) it is being compared with.
            if f.endswith(('.json', '.yaml', '.yml')):
                full_path = os.path.join(root, f)
                if os.path.getsize(full_path) > 0:
                    configs.append(full_path)
    return configs
# ✅ MỚI: pathlib style
from pathlib import Path
def find_configs(base_dir: str | Path) -> list[Path]:
base = Path(base_dir)
return [
p for p in base.rglob("*")
if p.is_file()
and p.suffix in {".json", ".yaml", ".yml"}
and p.stat().st_size > 0
]File Operations
Kiểm Tra và Tạo
python
from pathlib import Path
path = Path("data/output")
# Kiểm tra tồn tại
if not path.exists():
path.mkdir(parents=True, exist_ok=True)
# Kiểm tra loại
path.is_file() # True nếu là file
path.is_dir() # True nếu là directory
path.is_symlink() # True nếu là symbolic link
path.is_mount() # True nếu là mount point
# Tạo file rỗng (touch)
Path("marker.txt").touch(exist_ok=True)
Copy, Move, Delete
python
from pathlib import Path
import shutil
src = Path("source.txt")
dst = Path("backup/source.txt")
# Copy file
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dst) # Giữ metadata
# Copy directory
shutil.copytree(Path("src"), Path("src_backup"))
# Move/Rename
src.rename(dst) # Move hoặc rename
# Delete file
Path("temp.txt").unlink(missing_ok=True) # Python 3.8+
# Delete directory (empty)
Path("empty_dir").rmdir()
# Delete directory (recursive)
shutil.rmtree(Path("dir_with_contents"))
🚨 CẢNH BÁO
shutil.rmtree() xóa TOÀN BỘ directory tree không hỏi. Luôn double-check path trước khi gọi!
File Metadata
python
from pathlib import Path
from datetime import datetime
path = Path("document.pdf")
stat = path.stat()
# Size
size_bytes = stat.st_size
size_mb = stat.st_size / (1024 * 1024)
# Timestamps (lưu ý: st_ctime là thời điểm tạo trên Windows, nhưng là thời điểm thay đổi metadata trên Unix)
created = datetime.fromtimestamp(stat.st_ctime)
modified = datetime.fromtimestamp(stat.st_mtime)
accessed = datetime.fromtimestamp(stat.st_atime)
# Permissions (Unix) - os.access bên dưới cần `import os`
mode = stat.st_mode
is_readable = path.is_file() and os.access(path, os.R_OK)
# Owner (Unix)
import pwd
owner = pwd.getpwuid(stat.st_uid).pw_name
Temporary Files
tempfile Integration
python
from pathlib import Path
import tempfile
# Temporary file (delete=False nên KHÔNG tự xóa - phải cleanup thủ công bên dưới)
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
temp_path = Path(f.name)
f.write('{"key": "value"}')
# Sử dụng temp_path
print(temp_path.read_text())
# Cleanup manual
temp_path.unlink()
# Temporary directory
with tempfile.TemporaryDirectory() as tmpdir:
tmp = Path(tmpdir)
(tmp / "data.txt").write_text("temporary data")
# Directory tự động xóa khi exit context
Pattern: Safe File Write
python
from pathlib import Path
import tempfile
import shutil
def safe_write(path: Path, content: str) -> None:
    """Atomically write ``content`` to ``path`` as UTF-8 text.

    The text is first written to a temporary file created in the same
    directory as ``path`` and then moved over the target with
    ``Path.replace``, so a crash part-way through never leaves a
    half-written target file.

    Args:
        path: Destination file; its parent directory must already exist.
        content: Text to write.

    Raises:
        OSError: If the temporary file cannot be created, written or renamed.
            The temporary file is removed before the error propagates.
    """
    import os  # local import so the snippet does not depend on a file-level `import os`

    path = Path(path)
    # Same directory as the target, so the final rename stays on one filesystem.
    fd, tmp_path = tempfile.mkstemp(
        dir=path.parent,
        prefix=f".{path.name}.",
        suffix=".tmp",
    )
    tmp = Path(tmp_path)
    try:
        # mkstemp returns an already-open OS-level descriptor that the caller
        # must close. Wrapping it with os.fdopen lets the `with` block close it;
        # writing via tmp.write_text() instead would leak `fd` on every call.
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(content)
        tmp.replace(path)  # rename over the target
    except BaseException:
        # Also clean up on KeyboardInterrupt/SystemExit, then re-raise unchanged.
        tmp.unlink(missing_ok=True)
        raise
raiseCross-Platform Considerations
Windows vs Unix Paths
python
from pathlib import Path, PurePosixPath, PureWindowsPath
# Path() tự động chọn đúng class
path = Path("data/file.txt")
# Windows: WindowsPath('data\\file.txt')
# Unix: PosixPath('data/file.txt')
# Force specific path type (cho testing/parsing)
posix = PurePosixPath("/home/user/file.txt")
windows = PureWindowsPath("C:\\Users\\user\\file.txt")
# Convert Windows path trong config
config_path = "C:\\Users\\data\\config.yaml"
path = Path(config_path) # Chỉ parse đúng trên Windows; trên Unix hãy dùng PureWindowsPath(config_path)
Path Normalization
python
from pathlib import Path
# Resolve relative paths và symlinks
path = Path("./data/../data/./file.txt")
normalized = path.resolve()
# PosixPath('/absolute/path/to/data/file.txt')
# Chỉ normalize (không resolve symlinks)
from pathlib import PurePath
normalized = PurePath("./data/../data/./file.txt")
# Chỉ các thành phần "." bị loại bỏ ('data/../data/file.txt'); ".." vẫn giữ nguyên - dùng os.path.normpath nếu cần
Production Pitfalls
Pitfall 1: Encoding Issues
python
from pathlib import Path
# ❌ BUG: Không specify encoding
content = Path("data.txt").read_text() # Dùng system default!
# ✅ SỬA: Luôn specify encoding
content = Path("data.txt").read_text(encoding="utf-8")
# ✅ Handle encoding errors
content = Path("data.txt").read_text(
encoding="utf-8",
errors="replace" # Thay thế invalid chars
)
Pitfall 2: Race Conditions
python
from pathlib import Path
path = Path("data.txt")
# ❌ BUG: TOCTOU (Time-of-check to time-of-use)
if path.exists():
content = path.read_text() # File có thể bị xóa giữa check và read!
# ✅ SỬA: EAFP (Easier to Ask Forgiveness than Permission)
try:
content = path.read_text(encoding="utf-8")
except FileNotFoundError:
content = ""
Pitfall 3: Path Injection
python
from pathlib import Path
# ❌ BUG: User input không sanitize
def get_user_file(username: str) -> Path:
    """Build the profile path for ``username`` by plain string interpolation.

    Deliberately unsafe teaching example: ``username`` is inserted into the
    path as-is, with no validation of its contents.
    """
    profile = Path(f"data/{username}/profile.json")
    return profile
# Attacker: username = "../../../etc/passwd"
# Result: data/../../../etc/passwd/profile.json → thoát ra ngoài thư mục data/
# ✅ SỬA: Validate path stays within base
def get_user_file_safe(username: str, base: Path) -> Path:
    """Return the resolved profile path for ``username``, confined to ``base``.

    Args:
        username: Path component supplied by the caller.
        base: Directory the result must stay inside.

    Returns:
        The resolved ``<base>/<username>/profile.json`` path.

    Raises:
        ValueError: If the resolved path is not located under the resolved ``base``.
    """
    root = base.resolve()
    candidate = root.joinpath(username, "profile.json").resolve()
    # Both sides are resolved, so ".." segments are collapsed before comparing.
    if candidate.is_relative_to(root):
        return candidate
    raise ValueError("Path traversal detected!")
return targetPitfall 4: Symlink Attacks
python
from pathlib import Path
import os
# ❌ BUG: Follow symlinks blindly
def delete_user_data(user_dir: Path) -> None:
    """Call ``unlink()`` on every path found recursively under ``user_dir``.

    Deliberately naive teaching example: there is no symlink check, no
    containment check and no file/directory distinction before unlinking.
    """
    entries = user_dir.rglob("*")
    for entry in entries:
        entry.unlink()
# ✅ SỬA: Check symlinks
def delete_user_data_safe(user_dir: Path) -> None:
    """Remove symlinks and regular files found recursively under ``user_dir``.

    Symlinks are unlinked as links. Any other entry is removed only when its
    resolved location is still inside the resolved ``user_dir`` and it is a
    regular file. Directories are never removed by this function.
    """
    root = user_dir.resolve()
    for entry in root.rglob("*"):
        if entry.is_symlink():
            # unlink() on a symlink removes the link itself.
            removable = True
        else:
            removable = entry.resolve().is_relative_to(root) and entry.is_file()
        if removable:
            entry.unlink()
f.unlink()Pitfall 5: Large File Memory
python
from pathlib import Path
# ❌ BUG: Load entire file vào memory
huge_file = Path("10gb_log.txt")
content = huge_file.read_text() # 💥 MemoryError!
# ✅ SỬA: Stream processing
def process_large_file(path: Path) -> int:
    """Count the lines of ``path`` that contain the substring ``"ERROR"``.

    The file is opened as UTF-8 text and consumed line by line, so it is
    never read into memory as a whole.
    """
    with path.open("r", encoding="utf-8") as stream:
        return sum(1 for line in stream if "ERROR" in line)
# ✅ SỬA: Chunked reading cho binary
def hash_large_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is read in binary mode in 8192-byte blocks, so only one block
    is held in memory at a time.
    """
    import hashlib

    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()
return hasher.hexdigest()Pitfall 6: Glob Performance
python
from pathlib import Path
# ❌ CHẬM: Glob toàn bộ rồi filter
all_files = list(Path(".").rglob("*")) # Load tất cả vào memory
py_files = [f for f in all_files if f.suffix == ".py"]
# ✅ NHANH: Glob specific pattern
py_files = list(Path(".").rglob("*.py"))
# ✅ NHANH HƠN: Generator (lazy evaluation)
def find_large_py_files(base: Path, min_size: int = 10000):
    """Lazily yield ``*.py`` paths under ``base`` larger than ``min_size`` bytes.

    Args:
        base: Directory to search recursively.
        min_size: Size threshold in bytes; only files strictly larger are yielded.

    Yields:
        Matching ``Path`` objects, produced one at a time as the search proceeds.
    """
    yield from (
        candidate
        for candidate in base.rglob("*.py")
        if candidate.stat().st_size > min_size
    )
yield pathBest Practices Summary
python
from pathlib import Path
# === PATH CREATION ===
# ✅ Dùng Path objects
config = Path("config") / "app.yaml"
# ✅ Resolve relative paths
absolute = Path("./data").resolve()
# ✅ Type hints
def load_config(path: Path | str) -> dict:
return json.loads(Path(path).read_text(encoding="utf-8"))
# === FILE I/O ===
# ✅ Luôn specify encoding
content = path.read_text(encoding="utf-8")
# ✅ Context manager cho files lớn
with path.open("r", encoding="utf-8") as f:
for line in f:
process(line)
# ✅ Atomic writes cho data quan trọng
# (Xem safe_write pattern ở trên)
# === DIRECTORY OPERATIONS ===
# ✅ Tạo với parents và exist_ok
Path("logs/2024").mkdir(parents=True, exist_ok=True)
# ✅ Dùng rglob cho recursive search
for py_file in src.rglob("*.py"):
lint(py_file)
# === SECURITY ===
# ✅ Validate user input paths
if not target.resolve().is_relative_to(base_dir):
raise SecurityError("Path traversal!")
# ✅ EAFP over LBYL
try:
content = path.read_text(encoding="utf-8")
except FileNotFoundError:
content = default_content
Cross-links
- Context Managers - with statement cho file handling
- Logging & Debugging - Log file management
- Testing Architecture - Test fixtures với temp files