Skip to content

C Extensions & Cython Advanced

Khi Python không đủ nhanh - Escape hatch sang native code

Learning Outcomes

Sau khi hoàn thành trang này, bạn sẽ:

  • 🎯 Biết khi nào cần C extensions (và khi nào không)
  • 🎯 Viết Cython code để tăng tốc Python
  • 🎯 Sử dụng ctypes và cffi để gọi C libraries
  • 🎯 Hiểu NumPy C API basics
  • 🎯 Tránh các Production Pitfalls với native code

Khi Nào Cần C Extensions?

Decision Tree

┌─────────────────────────────────────────────────────────┐
│              Is Python fast enough?                     │
│                        │                                │
│              ┌─────────┴─────────┐                      │
│              ▼                   ▼                      │
│            YES                  NO                      │
│              │                   │                      │
│              ▼                   ▼                      │
│         STOP HERE!        Is it CPU-bound?             │
│                                  │                      │
│                    ┌─────────────┴─────────────┐        │
│                    ▼                           ▼        │
│                   YES                         NO        │
│                    │                           │        │
│                    ▼                           ▼        │
│           Can NumPy help?              Fix I/O first   │
│                    │                   (async, batch)   │
│          ┌─────────┴─────────┐                         │
│          ▼                   ▼                         │
│         YES                 NO                         │
│          │                   │                         │
│          ▼                   ▼                         │
│     Use NumPy!        Consider C extension             │
└─────────────────────────────────────────────────────────┘

Good Use Cases for C Extensions

python
# 1. Tight numerical loops
def compute_slow(data: list[float]) -> float:
    """Return the sum of ``x**2 + x**3`` over every value in *data*.

    Written as a plain interpreted loop on purpose: with millions of
    elements this is the kind of tight numeric loop that benefits from
    being moved to native code.
    """
    running = 0.0
    for value in data:
        term = value ** 2 + value ** 3  # Millions of iterations
        running = running + term
    return running

# 2. Image/video processing
def process_pixels(image: bytes) -> bytes:
    """Placeholder for per-pixel operations on millions of pixels.

    The body is intentionally unimplemented; calling it returns None.
    """
    return None

# 3. Cryptography / compression
def encrypt(data: bytes, key: bytes) -> bytes:
    """Placeholder for bit manipulation on large data.

    The body is intentionally unimplemented; calling it returns None.
    """
    return None

# 4. Calling existing C libraries
# - OpenSSL, SQLite, libpng, etc.

Bad Use Cases for C Extensions

python
# 1. I/O-bound code
def fetch_data():
    """Fetch ``url`` over HTTP and return the decoded JSON body.

    Example of I/O-bound work: the time is spent waiting on the network,
    so compiling this to native code would not make it faster.
    """
    reply = requests.get(url)  # Network is bottleneck
    payload = reply.json()
    return payload

# 2. Already using NumPy efficiently
def matrix_ops(a: np.ndarray, b: np.ndarray):
    """Return the matrix product of *a* and *b*.

    NumPy already dispatches this to an optimized BLAS routine, so there
    is nothing left for a hand-written C extension to speed up.
    """
    product = np.matmul(a, b)  # Already calls optimized BLAS
    return product

# 3. Simple code that runs rarely
def parse_config():
    """Placeholder for configuration parsing that runs once at startup.

    Code that runs this rarely is not worth optimizing - who cares if
    it's 10ms? The body is intentionally unimplemented and returns None.
    """
    return None

# 4. Code that's hard to profile
# If you can't measure it, don't optimize it!

Cython - Python with C Speed

What is Cython?

Cython = Python + C type declarations → Compiled to C → Native speed

┌─────────────────────────────────────────────────────────┐
│  Python Code (.py)                                      │
│       │                                                 │
│       ▼                                                 │
│  Cython Code (.pyx)  ←  Add type declarations          │
│       │                                                 │
│       ▼                                                 │
│  C Code (.c)         ←  Cython compiler                │
│       │                                                 │
│       ▼                                                 │
│  Shared Library (.so/.pyd)  ←  C compiler              │
│       │                                                 │
│       ▼                                                 │
│  Import in Python!                                      │
└─────────────────────────────────────────────────────────┘

Installation

bash
pip install cython

Basic Example

python
# compute.pyx
def compute_sum(data):
    """Return the sum of squares of *data* as a float.

    Untyped on purpose: compiled as-is, every operation still goes
    through Python objects, so there is no speedup yet.
    """
    acc = 0.0
    for value in data:
        acc = acc + value ** 2
    return acc
python
# compute_typed.pyx
def compute_sum_typed(list data):
    """Return the sum of squares of a list of numbers.

    The accumulator, loop index and current element are declared as C
    variables so the arithmetic and loop counter compile to plain C.
    Each element is still read from the Python list and converted to a
    C double, so a non-numeric element raises TypeError.

    Args:
        data: Python list of values convertible to float.

    Returns:
        The sum of ``x ** 2`` over all elements, as a float.
    """
    cdef double total = 0.0
    cdef double x
    # Py_ssize_t is the C type Python uses for lengths and indices; a plain
    # C int is narrower on 64-bit platforms and cannot hold every length.
    cdef Py_ssize_t i
    cdef Py_ssize_t n = len(data)

    for i in range(n):
        x = data[i]
        total += x ** 2

    return total

Build Setup

python
# setup.py
from setuptools import setup
from Cython.Build import cythonize

# cythonize() translates compute.pyx to C and returns the extension modules
# that setuptools compiles when the build_ext command runs.
setup(
    ext_modules=cythonize("compute.pyx"),
)
bash
# Build
python setup.py build_ext --inplace

# Or with pyproject.toml (modern)
pip install .

Cython Type Declarations

python
# compute.pyx

# C types
cdef int i
cdef double x
cdef float y
cdef long n
cdef char* s

# Python types with C speed
cdef list my_list
cdef dict my_dict
cdef str my_str

# Typed memoryviews (for NumPy arrays)
cdef double[:] arr_1d
cdef double[:, :] arr_2d
cdef double[:, :, :] arr_3d

# Function declarations
cdef double square(double x):
    """Return x multiplied by itself.

    Declared with ``cdef``: a C-level function that other Cython code can
    call, but that is not callable from Python.
    """
    return x * x

cpdef double square_public(double x):
    """Return x multiplied by itself.

    Declared with ``cpdef``: a hybrid function callable from both C-level
    Cython code and from Python.
    """
    cdef double squared = x * x
    return squared

def square_python(x):
    """Return x multiplied by itself; an ordinary untyped Python function."""
    squared = x * x
    return squared

Cython with NumPy

python
# fast_numpy.pyx
import numpy as np
cimport numpy as np
cimport cython

# Disable bounds checking for speed
@cython.boundscheck(False)
@cython.wraparound(False)
def fast_sum(np.ndarray[np.float64_t, ndim=1] arr):
    """Sum a 1-D float64 array using the typed ndarray buffer syntax.

    Bounds checking and negative-index wraparound are disabled by the
    decorators, so every index used in the loop must already be valid.

    Args:
        arr: One-dimensional NumPy array of float64 values.

    Returns:
        The sum of all elements, as a float.
    """
    # Py_ssize_t matches the width of array shapes; a C int is narrower on
    # 64-bit platforms and would truncate the length of a very large array.
    cdef Py_ssize_t n = arr.shape[0]
    cdef double total = 0.0
    cdef Py_ssize_t i

    for i in range(n):
        total += arr[i]

    return total

# Even faster with memoryviews
@cython.boundscheck(False)
@cython.wraparound(False)
def faster_sum(double[:] arr):
    """Sum a 1-D buffer of doubles through a typed memoryview.

    Bounds checking and negative-index wraparound are disabled by the
    decorators, so every index used in the loop must already be valid.

    Args:
        arr: One-dimensional buffer of C doubles (for example a float64
            NumPy array).

    Returns:
        The sum of all elements, as a float.
    """
    # Py_ssize_t matches the width of memoryview shapes; a C int is narrower
    # on 64-bit platforms and would truncate the length of a very large array.
    cdef Py_ssize_t n = arr.shape[0]
    cdef double total = 0.0
    cdef Py_ssize_t i

    for i in range(n):
        total += arr[i]

    return total

Cython Compiler Directives

python
# At file level
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True

# Or as decorators
import cython

@cython.boundscheck(False)  # No array bounds checking
@cython.wraparound(False)   # No negative indexing
@cython.cdivision(True)     # C-style division (no ZeroDivisionError)
@cython.nonecheck(False)    # No None checking
def fast_function(double[:] arr):
    pass

Profiling Cython Code

bash
# Generate annotated HTML
cython -a compute.pyx

# Yellow = Python interaction (slow)
# White = Pure C (fast)

ctypes - Call C from Python

Basic Usage

python
import ctypes

# Load shared library
# Linux: .so, macOS: .dylib, Windows: .dll
lib = ctypes.CDLL('./mylib.so')

# Define function signature
lib.add.argtypes = [ctypes.c_int, ctypes.c_int]
lib.add.restype = ctypes.c_int

# Call function
result = lib.add(5, 3)
print(result)  # 8

C Types Mapping

python
import ctypes

# Basic types
ctypes.c_int       # int
ctypes.c_long      # long
ctypes.c_float     # float
ctypes.c_double    # double
ctypes.c_char      # char
ctypes.c_char_p    # char* (string)
ctypes.c_void_p    # void*
ctypes.c_bool      # bool

# Arrays
IntArray5 = ctypes.c_int * 5
arr = IntArray5(1, 2, 3, 4, 5)

# Pointers
ctypes.POINTER(ctypes.c_int)  # int*
ctypes.byref(x)               # &x (address of)

Structures

python
import ctypes

# C struct
# struct Point {
#     double x;
#     double y;
# };

class Point(ctypes.Structure):
    """ctypes mirror of the C ``struct Point``: two doubles, ``x`` then ``y``."""

    # Field order and types must match the C declaration.
    _fields_ = [("x", ctypes.c_double), ("y", ctypes.c_double)]

# Usage
p = Point(1.0, 2.0)
print(p.x, p.y)

# Pass to C function
lib.process_point.argtypes = [ctypes.POINTER(Point)]
lib.process_point(ctypes.byref(p))

Callbacks

python
import ctypes

# C function type: int (*callback)(int, int)
CALLBACK = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_int)

def py_callback(a, b):
    """Return the sum of the two arguments; the Python side of the C callback."""
    total = a + b
    return total

# Convert Python function to C callback.
# Keep a reference to c_callback for as long as C code may call it: ctypes
# does not hold one itself, and a garbage-collected callback object crashes
# the program when the C side invokes it.
c_callback = CALLBACK(py_callback)

# Pass to C function
lib.register_callback(c_callback)

Working with NumPy

python
import ctypes
import numpy as np

# Get pointer to NumPy array data
arr = np.array([1.0, 2.0, 3.0], dtype=np.float64)
ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_double))

# Pass to C function
lib.process_array.argtypes = [
    ctypes.POINTER(ctypes.c_double),
    ctypes.c_int
]
lib.process_array(ptr, len(arr))

cffi - Modern C FFI

Why cffi over ctypes?

Feature        | ctypes      | cffi
Syntax         | Python-like | C-like
Performance    | Good        | Better
Complex types  | Manual      | Automatic
Error messages | Poor        | Good
PyPy support   | Limited     | Excellent

Installation

bash
pip install cffi

ABI Mode (Simple)

python
from cffi import FFI

ffi = FFI()

# Declare C functions
ffi.cdef("""
    int add(int a, int b);
    double sqrt(double x);
""")

# Load our own shared library
lib = ffi.dlopen('./mylib.so')
# The standard C library gets its own handle. Rebinding `lib` here would make
# the lib.add() call below look up `add` in libc, where it does not exist.
libc = ffi.dlopen(None)  # libc

# Call functions
result = lib.add(5, 3)
print(result)  # 8

API Mode (Compiled)

python
# build_mymodule.py
from cffi import FFI

ffi = FFI()

# C declarations
ffi.cdef("""
    typedef struct {
        double x;
        double y;
    } Point;
    
    double distance(Point* p1, Point* p2);
""")

# C source code
ffi.set_source("_mymodule", """
    #include <math.h>
    
    typedef struct {
        double x;
        double y;
    } Point;
    
    double distance(Point* p1, Point* p2) {
        double dx = p2->x - p1->x;
        double dy = p2->y - p1->y;
        return sqrt(dx*dx + dy*dy);
    }
""")

if __name__ == "__main__":
    ffi.compile(verbose=True)
bash
python build_mymodule.py
python
# Usage
from _mymodule import ffi, lib

p1 = ffi.new("Point*", {'x': 0.0, 'y': 0.0})
p2 = ffi.new("Point*", {'x': 3.0, 'y': 4.0})

dist = lib.distance(p1, p2)
print(dist)  # 5.0

cffi with NumPy

python
from cffi import FFI
import numpy as np

ffi = FFI()
ffi.cdef("""
    void process_array(double* data, int n);
""")
lib = ffi.dlopen('./mylib.so')

# NumPy array to cffi pointer
arr = np.array([1.0, 2.0, 3.0], dtype=np.float64)
# arr.ctypes.data is the buffer address as a plain integer, and ffi.cast()
# only reinterprets that number as a double*; nothing here keeps arr alive,
# so arr must outlive every use of ptr.
ptr = ffi.cast("double*", arr.ctypes.data)

lib.process_array(ptr, len(arr))

NumPy C API Basics

When to Use NumPy C API

  • Writing NumPy ufuncs
  • Integrating with existing C code
  • Maximum performance for array operations

Simple Example with Cython

python
# numpy_ext.pyx
import numpy as np
cimport numpy as np
from libc.math cimport sqrt

np.import_array()  # Required!

def euclidean_distance(
    np.ndarray[np.float64_t, ndim=1] a,
    np.ndarray[np.float64_t, ndim=1] b
):
    """Compute Euclidean distance between two vectors.

    Args:
        a: One-dimensional float64 array.
        b: One-dimensional float64 array with the same length as ``a``.

    Returns:
        The square root of the summed squared element differences.

    Raises:
        ValueError: If ``a`` and ``b`` have different lengths.
    """
    # Py_ssize_t matches the width of array shapes; a C int is narrower on
    # 64-bit platforms and would truncate the length of a very large array.
    cdef Py_ssize_t n = a.shape[0]
    cdef double total = 0.0
    cdef Py_ssize_t i

    # The loop only walks len(a) elements, so a longer b would otherwise be
    # silently truncated instead of being reported as a caller error.
    if b.shape[0] != n:
        raise ValueError("a and b must have the same length")

    for i in range(n):
        total += (a[i] - b[i]) ** 2

    return sqrt(total)

NumPy Array Flags

python
import numpy as np

arr = np.array([[1, 2], [3, 4]])

# Check memory layout
print(arr.flags)
# C_CONTIGUOUS : True   (row-major)
# F_CONTIGUOUS : False  (column-major)
# OWNDATA : True
# WRITEABLE : True
# ALIGNED : True

# Ensure contiguous for C code
arr_c = np.ascontiguousarray(arr)
arr_f = np.asfortranarray(arr)

Performance Comparison

Benchmark: Sum of Squares

python
import numpy as np
import timeit

# Test data
data = np.random.random(1_000_000)
data_list = data.tolist()

# Pure Python
def python_sum(data):
    """Return the sum of squares of *data*, computed entirely in the interpreter."""
    squares = map(lambda value: value ** 2, data)
    return sum(squares)

# NumPy
def numpy_sum(data):
    """Return the sum of squares of *data* using vectorized NumPy operations."""
    squared = data ** 2
    return np.sum(squared)

# Cython (after compilation)
# from compute import cython_sum

# Results (typical):
# Python:  ~500ms
# NumPy:   ~2ms   (250x faster)
# Cython:  ~1ms   (500x faster)

When Each Approach Wins

┌─────────────────────────────────────────────────────────┐
│  Approach      │  Best For                             │
├────────────────┼───────────────────────────────────────┤
│  Pure Python   │  Prototyping, I/O-bound code         │
│  NumPy         │  Array operations, linear algebra    │
│  Cython        │  Custom loops, mixed Python/C        │
│  ctypes/cffi   │  Calling existing C libraries        │
│  Pure C ext    │  Maximum control, complex C code     │
└─────────────────────────────────────────────────────────┘

Production Pitfalls

Pitfall 1: GIL Not Released

python
# ❌ PROBLEM: Cython holds GIL by default
def slow_cython(data):
    """Call process() on each element of data in an index loop.

    This is an ordinary ``def`` function with no ``nogil`` section, so the
    GIL is held for the whole loop.
    """
    cdef int i
    for i in range(len(data)):
        # GIL held - blocks other threads!
        process(data[i])

# ✅ FIX: Release GIL for pure C code
from cython.parallel import prange

def fast_cython(double[:] data):
    """Square every element of *data* in place with the GIL released.

    The loop body touches only C values, which is what allows it to run
    inside a ``nogil`` block. ``prange`` distributes iterations across
    threads only when the module is compiled with OpenMP enabled; without
    it the loop still runs, serially.

    Args:
        data: Writable one-dimensional buffer of C doubles; modified in place.
    """
    # Py_ssize_t matches the width of memoryview shapes; a C int is narrower
    # on 64-bit platforms and would truncate the length of a very large array.
    cdef Py_ssize_t i
    cdef Py_ssize_t n = data.shape[0]

    with nogil:  # Release GIL
        for i in prange(n):  # Parallel loop
            # Only C operations here!
            data[i] = data[i] ** 2

Pitfall 2: Memory Management

python
# ❌ PROBLEM: Memory leak with ctypes
import ctypes

lib = ctypes.CDLL('./mylib.so')
lib.create_buffer.restype = ctypes.c_void_p

ptr = lib.create_buffer(1000)  # Allocates memory
# ... use ptr ...
# Memory leaked! No automatic cleanup

# ✅ FIX: Always free allocated memory
lib.free_buffer.argtypes = [ctypes.c_void_p]

ptr = lib.create_buffer(1000)
try:
    # ... use ptr ...
    pass  # placeholder: a try block needs at least one statement to parse
finally:
    # Runs whether or not the code above raises, so the buffer is always freed.
    lib.free_buffer(ptr)

Pitfall 3: Type Mismatch

python
# ❌ PROBLEM: No declared signature, so ctypes has to guess argument types
import ctypes

lib = ctypes.CDLL('./mylib.so')
# C function expects int, but we pass float.
# Without argtypes ctypes can only pass None, ints, bytes and str directly:
# a float is rejected with ctypes.ArgumentError, while ints are passed as
# C int and the result is read back as C int, whatever the real C signature is.
result = lib.process(3.14)  # ctypes.ArgumentError: cannot convert a float

# ✅ FIX: Always declare argtypes
lib.process.argtypes = [ctypes.c_int]
lib.process.restype = ctypes.c_int

result = lib.process(3)  # Correct
# lib.process(3.14)  # ctypes.ArgumentError - caught early!

Pitfall 4: NumPy Array Lifetime

python
# ❌ PROBLEM: Array garbage collected while C uses it
import ctypes
import numpy as np

def get_pointer():
    """Return a C double pointer to a freshly created local array."""
    # arr is a local; once this function returns, only the returned pointer
    # object can still refer to its buffer.
    arr = np.array([1.0, 2.0, 3.0])
    return arr.ctypes.data_as(ctypes.POINTER(ctypes.c_double))

ptr = get_pointer()  # the local name arr no longer exists here
# NOTE(review): NumPy's ndarray.ctypes documentation says the object returned
# by data_as() keeps a reference to the array, whereas the raw integer address
# arr.ctypes.data does not - so the dangling pointer described here applies to
# the raw-address form. Confirm against the NumPy version in use.

# ✅ FIX: Keep array alive
def process_array(arr):
    """Pass *arr*'s data pointer and length to the C ``process`` routine.

    The caller's reference to *arr* keeps the buffer alive for the whole
    call, so the pointer stays valid while C code uses it.
    """
    double_ptr_type = ctypes.POINTER(ctypes.c_double)
    data_ptr = arr.ctypes.data_as(double_ptr_type)
    # arr stays alive until function returns
    lib.process(data_ptr, len(arr))

Pitfall 5: Platform Differences

python
# ❌ PROBLEM: Hardcoded library path
lib = ctypes.CDLL('./mylib.so')  # Fails on Windows!

# ✅ FIX: Platform-aware loading
import ctypes
import sys

# Pick the shared-library file name for the running platform; anything that is
# neither Windows nor macOS falls back to the .so name.
lib = ctypes.CDLL(
    {
        'win32': './mylib.dll',
        'darwin': './mylib.dylib',
    }.get(sys.platform, './mylib.so')
)

# Or use ctypes.util.find_library, which looks the library up by name the way
# the platform's linker would. It returns None when nothing is found, so check
# the result before passing it to ctypes.CDLL.
from ctypes.util import find_library
lib_path = find_library('mylib')

Quick Reference

python
# === Cython ===
# compute.pyx
cdef double x           # C variable
cpdef double func(x):   # Callable from Python and C
cdef double cfunc(x):   # C only

# Build
# python setup.py build_ext --inplace

# === ctypes ===
import ctypes
lib = ctypes.CDLL('./mylib.so')
lib.func.argtypes = [ctypes.c_int]
lib.func.restype = ctypes.c_int
result = lib.func(42)

# === cffi ===
from cffi import FFI
ffi = FFI()
ffi.cdef("int add(int a, int b);")
lib = ffi.dlopen('./mylib.so')
result = lib.add(5, 3)

# === NumPy + ctypes ===
arr = np.array([1.0, 2.0], dtype=np.float64)
ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_double))