Giao diện
C Extensions & Cython Advanced
Khi Python không đủ nhanh - Escape hatch sang native code
Learning Outcomes
Sau khi hoàn thành trang này, bạn sẽ:
- 🎯 Biết khi nào cần C extensions (và khi nào không)
- 🎯 Viết Cython code để tăng tốc Python
- 🎯 Sử dụng ctypes và cffi để gọi C libraries
- 🎯 Hiểu NumPy C API basics
- 🎯 Tránh các Production Pitfalls với native code
Khi Nào Cần C Extensions?
Decision Tree
┌─────────────────────────────────────────────────────────┐
│ Is Python fast enough? │
│ │ │
│ ┌─────────┴─────────┐ │
│ ▼ ▼ │
│ YES NO │
│ │ │ │
│ ▼ ▼ │
│ STOP HERE! Is it CPU-bound? │
│ │ │
│ ┌─────────────┴─────────────┐ │
│ ▼ ▼ │
│ YES NO │
│ │ │ │
│ ▼ ▼ │
│ Can NumPy help? Fix I/O first │
│ │ (async, batch) │
│ ┌─────────┴─────────┐ │
│ ▼ ▼ │
│ YES NO │
│ │ │ │
│ ▼ ▼ │
│ Use NumPy! Consider C extension │
└─────────────────────────────────────────────────────────┘
✅ Good Use Cases for C Extensions
python
# 1. Tight numerical loops
def compute_slow(data: list[float]) -> float:
    """Return the sum of ``x**2 + x**3`` over every value in ``data``.

    Written as a plain Python loop on purpose: over millions of elements
    this is the kind of CPU-bound hot loop that justifies native code.
    An empty list yields ``0.0``.
    """
    total = 0.0
    for x in data:
        total += x ** 2 + x ** 3  # Millions of iterations
    return total
# 2. Image/video processing
def process_pixels(image: bytes) -> bytes:
    """Placeholder for per-pixel work over millions of pixels.

    The body is intentionally unimplemented, so a call currently returns
    ``None`` rather than the annotated ``bytes``.
    """
    # Per-pixel operations on millions of pixels
    pass
# 3. Cryptography / compression
def encrypt(data: bytes, key: bytes) -> bytes:
    """Placeholder for bit-level encryption of a large payload.

    The body is intentionally unimplemented, so a call currently returns
    ``None`` rather than the annotated ``bytes``.
    """
    # Bit manipulation on large data
    pass
# 4. Calling existing C libraries
# - OpenSSL, SQLite, libpng, etc.
❌ Bad Use Cases for C Extensions
python
# 1. I/O-bound code
def fetch_data():
    """Fetch ``url`` with ``requests`` and return the decoded JSON body.

    NOTE(review): ``requests`` and ``url`` are not defined in this snippet;
    presumably the surrounding module provides them -- confirm.
    """
    response = requests.get(url)  # Network is bottleneck
    return response.json()
# 2. Already using NumPy efficiently
def matrix_ops(a: np.ndarray, b: np.ndarray):
    """Return the matrix product ``a @ b``."""
    return a @ b  # Already calls optimized BLAS
# 3. Simple code that runs rarely
def parse_config():
    """Placeholder for one-off startup configuration parsing.

    The body is intentionally unimplemented and returns ``None``.
    """
    # Runs once at startup - who cares if it's 10ms?
    pass
# 4. Code that's hard to profile
# If you can't measure it, don't optimize it!
Cython - Python with C Speed
What is Cython?
Cython = Python + C type declarations → Compiled to C → Native speed
┌─────────────────────────────────────────────────────────┐
│ Python Code (.py) │
│ │ │
│ ▼ │
│ Cython Code (.pyx) ← Add type declarations │
│ │ │
│ ▼ │
│ C Code (.c) ← Cython compiler │
│ │ │
│ ▼ │
│ Shared Library (.so/.pyd) ← C compiler │
│ │ │
│ ▼ │
│ Import in Python! │
└─────────────────────────────────────────────────────────┘
Installation
bash
pip install cython
Basic Example
python
# compute.pyx
def compute_sum(data):
"""Pure Python - no speedup yet."""
total = 0.0
for x in data:
total += x ** 2
return totalpython
# compute_typed.pyx
def compute_sum_typed(list data):
"""With type declarations - 10-100x faster!"""
cdef double total = 0.0
cdef double x
cdef int i
cdef int n = len(data)
for i in range(n):
x = data[i]
total += x ** 2
return totalBuild Setup
python
# setup.py
from setuptools import setup
from Cython.Build import cythonize
setup(
ext_modules=cythonize("compute.pyx"),
)bash
# Build
python setup.py build_ext --inplace
# Or with pyproject.toml (modern)
pip install .
Cython Type Declarations
python
# compute.pyx
# C types
cdef int i
cdef double x
cdef float y
cdef long n
cdef char* s
# Python types with C speed
cdef list my_list
cdef dict my_dict
cdef str my_str
# Typed memoryviews (for NumPy arrays)
cdef double[:] arr_1d
cdef double[:, :] arr_2d
cdef double[:, :, :] arr_3d
# Function declarations
cdef double square(double x):
    """C function - not callable from Python."""
    return x * x
cpdef double square_public(double x):
    """Hybrid - callable from both C and Python."""
    return x * x
def square_python(x):
"""Pure Python function."""
return x * xCython with NumPy
python
# fast_numpy.pyx
import numpy as np
cimport numpy as np
cimport cython
# Disable bounds checking for speed
@cython.boundscheck(False)
@cython.wraparound(False)
def fast_sum(np.ndarray[np.float64_t, ndim=1] arr):
    """Return the sum of a 1-D float64 NumPy array.

    Uses the ``np.ndarray[...]`` buffer syntax (not a typed memoryview).
    The decorators disable bounds checking and negative-index wraparound,
    so the loop must stay within ``0 <= i < n``.
    """
    # Py_ssize_t matches the type of arr.shape[0]; a C int would overflow
    # for arrays with more than 2**31 - 1 elements.
    cdef Py_ssize_t n = arr.shape[0]
    cdef double total = 0.0
    cdef Py_ssize_t i
    for i in range(n):
        total += arr[i]
    return total
# Even faster with memoryviews
@cython.boundscheck(False)
@cython.wraparound(False)
def faster_sum(double[:] arr):
"""Memoryview syntax - cleaner and fast."""
cdef int n = arr.shape[0]
cdef double total = 0.0
cdef int i
for i in range(n):
total += arr[i]
return totalCython Compiler Directives
python
# At file level
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# Or as decorators
import cython
@cython.boundscheck(False) # No array bounds checking
@cython.wraparound(False) # No negative indexing
@cython.cdivision(True) # C-style division (no ZeroDivisionError)
@cython.nonecheck(False) # No None checking
def fast_function(double[:] arr):
passProfiling Cython Code
bash
# Generate annotated HTML
cython -a compute.pyx
# Yellow = Python interaction (slow)
# White = Pure C (fast)
ctypes - Call C from Python
Basic Usage
python
import ctypes
# Load shared library
# Linux: .so, macOS: .dylib, Windows: .dll
lib = ctypes.CDLL('./mylib.so')
# Define function signature
lib.add.argtypes = [ctypes.c_int, ctypes.c_int]
lib.add.restype = ctypes.c_int
# Call function
result = lib.add(5, 3)
print(result) # 8C Types Mapping
python
import ctypes
# Basic types
ctypes.c_int # int
ctypes.c_long # long
ctypes.c_float # float
ctypes.c_double # double
ctypes.c_char # char
ctypes.c_char_p # char* (string)
ctypes.c_void_p # void*
ctypes.c_bool # bool
# Arrays
IntArray5 = ctypes.c_int * 5
arr = IntArray5(1, 2, 3, 4, 5)
# Pointers
ctypes.POINTER(ctypes.c_int) # int*
ctypes.byref(x) # &x (address of)Structures
python
import ctypes
# C struct
# struct Point {
# double x;
# double y;
# };
class Point(ctypes.Structure):
    """ctypes mirror of the C ``struct Point { double x; double y; }``."""

    # Field order and types must match the C declaration.
    _fields_ = [
        ('x', ctypes.c_double),
        ('y', ctypes.c_double),
    ]
# Usage
p = Point(1.0, 2.0)
print(p.x, p.y)
# Pass to C function
lib.process_point.argtypes = [ctypes.POINTER(Point)]
lib.process_point(ctypes.byref(p))Callbacks
python
import ctypes
# C function type: int (*callback)(int, int)
CALLBACK = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_int)
def py_callback(a, b):
    """Return ``a + b``; this is the Python side of the C callback."""
    return a + b
# Convert Python function to C callback
c_callback = CALLBACK(py_callback)
# Pass to C function
lib.register_callback(c_callback)Working with NumPy
python
import ctypes
import numpy as np
# Get pointer to NumPy array data
arr = np.array([1.0, 2.0, 3.0], dtype=np.float64)
ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
# Pass to C function
lib.process_array.argtypes = [
ctypes.POINTER(ctypes.c_double),
ctypes.c_int
]
lib.process_array(ptr, len(arr))cffi - Modern C FFI
Why cffi over ctypes?
| Feature | ctypes | cffi |
|---|---|---|
| Syntax | Python-like | C-like |
| Performance | Good | Better |
| Complex types | Manual | Automatic |
| Error messages | Poor | Good |
| PyPy support | Limited | Excellent |
Installation
bash
pip install cffi
ABI Mode (Simple)
python
from cffi import FFI
ffi = FFI()
# Declare C functions
ffi.cdef("""
int add(int a, int b);
double sqrt(double x);
""")
# Load library
lib = ffi.dlopen('./mylib.so')
# Or standard library - kept in its own name so `lib` still refers to
# mylib, which is where `add` is defined
libc = ffi.dlopen(None)  # libc
# Call functions
result = lib.add(5, 3)
print(result) # 8API Mode (Compiled)
python
# build_mymodule.py
from cffi import FFI
ffi = FFI()
# C declarations
ffi.cdef("""
typedef struct {
double x;
double y;
} Point;
double distance(Point* p1, Point* p2);
""")
# C source code
ffi.set_source("_mymodule", """
#include <math.h>
typedef struct {
double x;
double y;
} Point;
double distance(Point* p1, Point* p2) {
double dx = p2->x - p1->x;
double dy = p2->y - p1->y;
return sqrt(dx*dx + dy*dy);
}
""")
if __name__ == "__main__":
ffi.compile(verbose=True)bash
python build_mymodule.pypython
# Usage
from _mymodule import ffi, lib
p1 = ffi.new("Point*", {'x': 0.0, 'y': 0.0})
p2 = ffi.new("Point*", {'x': 3.0, 'y': 4.0})
dist = lib.distance(p1, p2)
print(dist) # 5.0cffi with NumPy
python
from cffi import FFI
import numpy as np
ffi = FFI()
ffi.cdef("""
void process_array(double* data, int n);
""")
lib = ffi.dlopen('./mylib.so')
# NumPy array to cffi pointer
arr = np.array([1.0, 2.0, 3.0], dtype=np.float64)
ptr = ffi.cast("double*", arr.ctypes.data)
lib.process_array(ptr, len(arr))NumPy C API Basics
When to Use NumPy C API
- Writing NumPy ufuncs
- Integrating with existing C code
- Maximum performance for array operations
Simple Example with Cython
python
# numpy_ext.pyx
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
np.import_array() # Required!
def euclidean_distance(
np.ndarray[np.float64_t, ndim=1] a,
np.ndarray[np.float64_t, ndim=1] b
):
"""Compute Euclidean distance between two vectors."""
cdef int n = a.shape[0]
cdef double total = 0.0
cdef int i
for i in range(n):
total += (a[i] - b[i]) ** 2
return sqrt(total)NumPy Array Flags
python
import numpy as np
arr = np.array([[1, 2], [3, 4]])
# Check memory layout
print(arr.flags)
# C_CONTIGUOUS : True (row-major)
# F_CONTIGUOUS : False (column-major)
# OWNDATA : True
# WRITEABLE : True
# ALIGNED : True
# Ensure contiguous for C code
arr_c = np.ascontiguousarray(arr)
arr_f = np.asfortranarray(arr)Performance Comparison
Benchmark: Sum of Squares
python
import numpy as np
import timeit
# Test data
data = np.random.random(1_000_000)
data_list = data.tolist()
# Pure Python
def python_sum(data):
    """Return the sum of squares of ``data`` using a pure-Python generator."""
    return sum(x ** 2 for x in data)
# NumPy
def numpy_sum(data):
    """Return the sum of squares of ``data`` as one vectorized NumPy expression."""
    return np.sum(data ** 2)
# Cython (after compilation)
# from compute_typed import compute_sum_typed
# Results (typical):
# Python: ~500ms
# NumPy: ~2ms (250x faster)
# Cython: ~1ms (500x faster)
When Each Approach Wins
┌─────────────────────────────────────────────────────────┐
│ Approach │ Best For │
├────────────────┼───────────────────────────────────────┤
│ Pure Python │ Prototyping, I/O-bound code │
│ NumPy │ Array operations, linear algebra │
│ Cython │ Custom loops, mixed Python/C │
│ ctypes/cffi │ Calling existing C libraries │
│ Pure C ext │ Maximum control, complex C code │
└─────────────────────────────────────────────────────────┘
Production Pitfalls
Pitfall 1: GIL Not Released
python
# ❌ PROBLEM: Cython holds GIL by default
def slow_cython(data):
    """Call ``process`` on each element of ``data`` while holding the GIL."""
    cdef int i
    for i in range(len(data)):
        # GIL held - blocks other threads!
        process(data[i])
# ✅ FIX: Release GIL for pure C code
from cython.parallel import prange
def fast_cython(double[:] data):
cdef int i
cdef int n = data.shape[0]
with nogil: # Release GIL
for i in prange(n): # Parallel loop
# Only C operations here!
data[i] = data[i] ** 2Pitfall 2: Memory Management
python
# ❌ PROBLEM: Memory leak with ctypes
import ctypes
lib = ctypes.CDLL('./mylib.so')
lib.create_buffer.restype = ctypes.c_void_p
ptr = lib.create_buffer(1000) # Allocates memory
# ... use ptr ...
# Memory leaked! No automatic cleanup
# ✅ FIX: Always free allocated memory
lib.free_buffer.argtypes = [ctypes.c_void_p]
ptr = lib.create_buffer(1000)
try:
# ... use ptr ...
finally:
lib.free_buffer(ptr)Pitfall 3: Type Mismatch
python
# ❌ PROBLEM: Without argtypes, ctypes guesses the C types
import ctypes
lib = ctypes.CDLL('./mylib.so')
# C function is `double process(double)`, but a Python int is passed as a
# C int and the result is read back as a C int (the default restype).
# (A bare Python float would not even get this far: without argtypes,
# ctypes raises ArgumentError because it cannot convert it.)
result = lib.process(3)  # Undefined behavior!
# ✅ FIX: Always declare argtypes and restype
lib.process.argtypes = [ctypes.c_double]
lib.process.restype = ctypes.c_double
result = lib.process(3)  # Correct - converted to 3.0
# lib.process("3")  # ctypes.ArgumentError - caught early!Pitfall 4: NumPy Array Lifetime
python
# ❌ PROBLEM: Array garbage collected while C uses it
import ctypes
import numpy as np
def get_pointer():
    """Return a ``double*`` into an array created inside this function.

    Shown as an anti-pattern: once the function returns, no name refers to
    ``arr``, so the caller holds a pointer without holding the array.
    NOTE(review): NumPy documents that the object returned by ``data_as``
    keeps a reference to the array, whereas the raw ``arr.ctypes.data``
    address does not -- confirm which case this pitfall should illustrate.
    """
    arr = np.array([1.0, 2.0, 3.0])
    return arr.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
ptr = get_pointer() # arr is garbage collected!
# ptr now points to freed memory!
# ✅ FIX: Keep array alive
def process_array(arr):
    """Pass ``arr``'s data pointer and length to ``lib.process``.

    The ``arr`` parameter is a live reference for the whole call, so the
    buffer behind ``ptr`` cannot be collected while ``lib.process`` runs.
    """
    ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
    lib.process(ptr, len(arr))
# arr stays alive until function returns
Pitfall 5: Platform Differences
python
# ❌ PROBLEM: Hardcoded library path
lib = ctypes.CDLL('./mylib.so') # Fails on Windows!
# ✅ FIX: Platform-aware loading
import ctypes
import sys
if sys.platform == 'win32':
lib = ctypes.CDLL('./mylib.dll')
elif sys.platform == 'darwin':
lib = ctypes.CDLL('./mylib.dylib')
else:
lib = ctypes.CDLL('./mylib.so')
# Or use ctypes.util.find_library
from ctypes.util import find_library
lib_path = find_library('mylib')Quick Reference
python
# === Cython ===
# compute.pyx
cdef double x # C variable
cpdef double func(x): # Callable from Python and C
cdef double cfunc(x): # C only
# Build
# python setup.py build_ext --inplace
# === ctypes ===
import ctypes
lib = ctypes.CDLL('./mylib.so')
lib.func.argtypes = [ctypes.c_int]
lib.func.restype = ctypes.c_int
result = lib.func(42)
# === cffi ===
from cffi import FFI
ffi = FFI()
ffi.cdef("int add(int a, int b);")
lib = ffi.dlopen('./mylib.so')
result = lib.add(5, 3)
# === NumPy + ctypes ===
arr = np.array([1.0, 2.0], dtype=np.float64)
ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_double))Cross-links
- Prerequisites: Profiling - Find bottlenecks first
- Previous: Memory Optimization - Memory efficiency
- Next: NumPy Internals - Vectorization patterns
- Related: GIL & Threading - GIL release in C extensions