Giao diện
NumPy Internals Advanced
NumPy = Python's secret weapon cho numerical computing - Hiểu internals = Viết code nhanh hơn 100x
Learning Outcomes
Sau khi hoàn thành trang này, bạn sẽ:
- 🎯 Hiểu ndarray memory layout (contiguous, strides)
- 🎯 Áp dụng vectorization patterns để tránh Python loops
- 🎯 Master broadcasting rules cho array operations
- 🎯 Tránh các anti-patterns làm chậm NumPy code
- 🎯 Biết khi nào NumPy không phải là giải pháp tốt nhất
ndarray Memory Layout
Array Structure
┌─────────────────────────────────────────────────────────┐
│ ndarray Object │
├─────────────────────────────────────────────────────────┤
│ data │ Pointer to actual data buffer │
│ dtype │ Data type (float64, int32, etc.) │
│ shape │ Dimensions (3, 4) = 3 rows, 4 cols │
│ strides │ Bytes to jump for each dimension │
│ flags │ C_CONTIGUOUS, F_CONTIGUOUS, etc. │
└─────────────────────────────────────────────────────────┘

python
import numpy as np
arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
print(f"Shape: {arr.shape}") # (2, 3)
print(f"Dtype: {arr.dtype}") # float64
print(f"Strides: {arr.strides}") # (24, 8) - bytes per dimension
print(f"Size: {arr.size}") # 6 elements
print(f"Itemsize: {arr.itemsize}") # 8 bytes per element
print(f"Nbytes: {arr.nbytes}") # 48 bytes total

Strides Explained
python
import numpy as np
# 2D array (2 rows, 3 cols) of float64 (8 bytes each)
arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
# Memory layout (C-order, row-major):
# [1.0][2.0][3.0][4.0][5.0][6.0]
# ^ ^
# | |
# row 0 row 1
print(arr.strides) # (24, 8)
# - To move to next row: jump 24 bytes (3 elements × 8 bytes)
# - To move to next column: jump 8 bytes (1 element × 8 bytes)
# Access arr[1, 2]:
# offset = 1 * 24 + 2 * 8 = 40 bytes from start

C-order vs Fortran-order
python
import numpy as np
# C-order (row-major) - default
arr_c = np.array([[1, 2, 3], [4, 5, 6]], order='C')
print(f"C-order strides: {arr_c.strides}") # (24, 8)
# Memory: [1, 2, 3, 4, 5, 6]
# row 0 row 1
# Fortran-order (column-major)
arr_f = np.array([[1, 2, 3], [4, 5, 6]], order='F')
print(f"F-order strides: {arr_f.strides}") # (8, 16)
# Memory: [1, 4, 2, 5, 3, 6]
# col0 col1 col2
# Check contiguity
print(arr_c.flags['C_CONTIGUOUS']) # True
print(arr_f.flags['F_CONTIGUOUS']) # True

Views vs Copies
python
import numpy as np
arr = np.array([1, 2, 3, 4, 5, 6])
# View - shares memory (fast, no copy)
view = arr[::2] # [1, 3, 5]
view[0] = 100
print(arr) # [100, 2, 3, 4, 5, 6] - original changed!
# Copy - new memory (slower, independent)
copy = arr[::2].copy()
copy[0] = 200
print(arr) # [100, 2, 3, 4, 5, 6] - original unchanged
# Check if view or copy
print(view.base is arr) # True - view
print(copy.base is None) # True - copy (owns data)

Non-contiguous Arrays
python
import numpy as np
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Slicing can create non-contiguous views
col = arr[:, 1] # [2, 5, 8]
print(col.flags['C_CONTIGUOUS']) # False!
print(col.strides) # (24,) - jumping over elements
# Non-contiguous arrays are slower for some operations
# Fix: make contiguous copy
col_contig = np.ascontiguousarray(col)
print(col_contig.flags['C_CONTIGUOUS']) # True

Vectorization Patterns
Why Vectorization?
python
import numpy as np
import timeit
n = 1_000_000
data = np.random.random(n)
# ❌ Python loop - SLOW
def python_loop(data):
result = np.empty_like(data)
for i in range(len(data)):
result[i] = data[i] ** 2
return result
# ✅ Vectorized - FAST
def vectorized(data):
return data ** 2
# Benchmark
t_loop = timeit.timeit(lambda: python_loop(data), number=10)
t_vec = timeit.timeit(lambda: vectorized(data), number=10)
print(f"Python loop: {t_loop:.3f}s")
print(f"Vectorized: {t_vec:.3f}s")
print(f"Speedup: {t_loop/t_vec:.0f}x")
# Typical: 50-100x speedup!

Pattern 1: Element-wise Operations
python
import numpy as np
a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])
# ❌ Python loop
result = []
for i in range(len(a)):
result.append(a[i] + b[i])
# ✅ Vectorized
result = a + b # [11, 22, 33, 44, 55]
# Works for all arithmetic operations
result = a * b # Element-wise multiply
result = a / b # Element-wise divide
result = a ** 2 # Element-wise power
result = np.sqrt(a) # Element-wise sqrt
result = np.sin(a) # Element-wise sin

Pattern 2: Conditional Operations
python
import numpy as np
data = np.array([1, -2, 3, -4, 5])
# ❌ Python loop
result = []
for x in data:
if x > 0:
result.append(x)
else:
result.append(0)
# ✅ Vectorized with np.where
result = np.where(data > 0, data, 0) # [1, 0, 3, 0, 5]
# ✅ Boolean indexing
positives = data[data > 0] # [1, 3, 5]
# ✅ np.clip for bounds
clipped = np.clip(data, 0, 10) # [1, 0, 3, 0, 5]

Pattern 3: Aggregations
python
import numpy as np
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# ❌ Python loop
total = 0
for row in data:
for val in row:
total += val
# ✅ Vectorized
total = np.sum(data) # 45
row_sums = np.sum(data, axis=1) # [6, 15, 24]
col_sums = np.sum(data, axis=0) # [12, 15, 18]
# Other aggregations
np.mean(data) # Average
np.std(data) # Standard deviation
np.min(data) # Minimum
np.max(data) # Maximum
np.prod(data) # Product
np.argmax(data) # Index of max
np.cumsum(data) # Cumulative sum

Pattern 4: Matrix Operations
python
import numpy as np
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
# Matrix multiplication
C = A @ B # Preferred (Python 3.5+)
C = np.dot(A, B) # Equivalent
C = np.matmul(A, B) # Equivalent
# Element-wise multiply (NOT matrix multiply!)
C = A * B
# Transpose
A_T = A.T
# Inverse
A_inv = np.linalg.inv(A)
# Determinant
det = np.linalg.det(A)
# Eigenvalues
eigenvalues, eigenvectors = np.linalg.eig(A)

Pattern 5: Avoid Intermediate Arrays
python
import numpy as np
a = np.random.random(1_000_000)
b = np.random.random(1_000_000)
c = np.random.random(1_000_000)
# ❌ Creates intermediate arrays
result = a + b + c # Creates temp for (a+b), then adds c
# ✅ Use out parameter
result = np.empty_like(a)
np.add(a, b, out=result)
np.add(result, c, out=result)
# ✅ Or use numexpr for complex expressions
import numexpr as ne
result = ne.evaluate('a + b + c') # No intermediates!

Broadcasting Rules
What is Broadcasting?
Broadcasting allows NumPy to perform operations on arrays with different shapes.
python
import numpy as np
# Scalar + Array
a = np.array([1, 2, 3])
result = a + 10 # [11, 12, 13]
# 10 is "broadcast" to [10, 10, 10]
# 1D + 2D
a = np.array([[1, 2, 3], [4, 5, 6]]) # (2, 3)
b = np.array([10, 20, 30]) # (3,)
result = a + b
# [[11, 22, 33],
# [14, 25, 36]]
# b is broadcast to [[10, 20, 30], [10, 20, 30]]

Broadcasting Rules
Rule 1: If arrays have different ndim, prepend 1s to smaller shape
Rule 2: Arrays with size 1 in a dimension act as if they had the
size of the largest array in that dimension
Rule 3: Arrays are compatible if dimensions are equal or one is 1

python
import numpy as np
# Example 1: (3,) + (2, 3)
a = np.array([1, 2, 3]) # Shape: (3,)
b = np.array([[1, 2, 3], # Shape: (2, 3)
[4, 5, 6]])
# Step 1: Prepend 1 to a's shape → (1, 3)
# Step 2: Broadcast (1, 3) to (2, 3)
# Result shape: (2, 3)
result = a + b
# Example 2: (3, 1) + (1, 4)
a = np.array([[1], [2], [3]]) # Shape: (3, 1)
b = np.array([[10, 20, 30, 40]]) # Shape: (1, 4)
# Broadcast to (3, 4)
result = a + b
# [[11, 21, 31, 41],
# [12, 22, 32, 42],
# [13, 23, 33, 43]]

Broadcasting Visualization
Shape (3, 1) + Shape (1, 4) = Shape (3, 4)
[[1], [[10, 20, 30, 40]] [[11, 21, 31, 41],
[2], + = [12, 22, 32, 42],
[3]] [13, 23, 33, 43]]
↓ broadcast → ↓ broadcast ↓
[[1, 1, 1, 1], [[10, 20, 30, 40],
[2, 2, 2, 2], + [10, 20, 30, 40],
[3, 3, 3, 3]] [10, 20, 30, 40]]

Common Broadcasting Patterns
python
import numpy as np
# Pattern 1: Normalize columns
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
col_means = data.mean(axis=0) # [4, 5, 6]
normalized = data - col_means # Subtract mean from each column
# Pattern 2: Outer product
a = np.array([1, 2, 3])
b = np.array([10, 20])
outer = a[:, np.newaxis] * b # (3, 1) * (2,) = (3, 2)
# [[10, 20],
# [20, 40],
# [30, 60]]
# Pattern 3: Distance matrix
points = np.array([[0, 0], [1, 1], [2, 2]]) # (3, 2)
# Compute pairwise distances
diff = points[:, np.newaxis, :] - points[np.newaxis, :, :] # (3, 3, 2)
distances = np.sqrt(np.sum(diff ** 2, axis=2)) # (3, 3)

Broadcasting Errors
python
import numpy as np
a = np.array([[1, 2, 3], [4, 5, 6]]) # (2, 3)
b = np.array([1, 2]) # (2,)
# ❌ ERROR: Shapes not compatible
# (2, 3) and (2,) → trailing dimensions don't match (3 vs 2)
# result = a + b # ValueError!
# ✅ FIX: Reshape to make compatible
b_reshaped = b[:, np.newaxis] # (2, 1)
result = a + b_reshaped # (2, 3) + (2, 1) = (2, 3)

Avoiding Python Loops
Anti-pattern 1: Growing Arrays
python
import numpy as np
# ❌ SLOW: Growing array in loop
result = np.array([])
for i in range(10000):
result = np.append(result, i ** 2) # O(n²)!
# ✅ FAST: Pre-allocate
result = np.empty(10000)
for i in range(10000):
result[i] = i ** 2
# ✅ FASTEST: Vectorize
result = np.arange(10000) ** 2

Anti-pattern 2: Item-by-item Access
python
import numpy as np
arr = np.random.random((1000, 1000))
# ❌ SLOW: Python indexing
total = 0
for i in range(arr.shape[0]):
for j in range(arr.shape[1]):
total += arr[i, j]
# ✅ FAST: NumPy function
total = np.sum(arr)

Anti-pattern 3: Unnecessary Copies
python
import numpy as np
arr = np.random.random(1_000_000)
# ❌ SLOW: Redundant second copy
filtered = arr[arr > 0.5].copy() # Unnecessary .copy()
# ✅ FAST: Boolean (fancy) indexing already returns a new array, not a view
filtered = arr[arr > 0.5] # No extra .copy() needed
# ❌ SLOW: Concatenate in loop
result = np.array([])
for chunk in chunks:
result = np.concatenate([result, chunk])
# ✅ FAST: Concatenate once
result = np.concatenate(chunks)

Anti-pattern 4: Wrong dtype
python
import numpy as np
# ❌ SLOW: Object dtype (Python objects)
arr = np.array([1, 2, 3], dtype=object)
# ✅ FAST: Native dtype
arr = np.array([1, 2, 3], dtype=np.int64)
# ❌ SLOW: Unnecessary precision
arr = np.array([1.0, 2.0, 3.0], dtype=np.float64) # 8 bytes each
# ✅ FAST: Use float32 if precision allows
arr = np.array([1.0, 2.0, 3.0], dtype=np.float32) # 4 bytes each

When Loops Are OK
python
import numpy as np
# ✅ OK: Small number of iterations
for i in range(10): # Only 10 iterations
process_batch(data[i])
# ✅ OK: Complex logic that can't be vectorized
for i in range(n):
if complex_condition(data[i]):
result[i] = complex_computation(data[i])
else:
result[i] = other_computation(data[i])
# ✅ OK: Memory constraints (can't load all at once)
for chunk in np.array_split(huge_array, 100):
process(chunk)Advanced Techniques
np.einsum - Einstein Summation
python
import numpy as np
A = np.random.random((3, 4))
B = np.random.random((4, 5))
C = np.random.random((3, 4))
# Matrix multiplication
result = np.einsum('ij,jk->ik', A, B) # Same as A @ B
# Element-wise multiply then sum
result = np.einsum('ij,ij->', A, C) # Same as np.sum(A * C)
# Trace
result = np.einsum('ii->', A[:3, :3]) # Same as np.trace(A[:3, :3])
# Outer product
a = np.array([1, 2, 3])
b = np.array([4, 5])
result = np.einsum('i,j->ij', a, b) # Same as np.outer(a, b)
# Batch matrix multiply
batch_A = np.random.random((10, 3, 4))
batch_B = np.random.random((10, 4, 5))
result = np.einsum('bij,bjk->bik', batch_A, batch_B)

Structured Arrays
python
import numpy as np
# Define structured dtype
dt = np.dtype([
('name', 'U10'), # Unicode string, 10 chars
('age', 'i4'), # 32-bit integer
('weight', 'f8'), # 64-bit float
])
# Create structured array
people = np.array([
('Alice', 30, 55.5),
('Bob', 25, 70.2),
('Charlie', 35, 65.0),
], dtype=dt)
# Access fields
print(people['name']) # ['Alice' 'Bob' 'Charlie']
print(people['age']) # [30 25 35]
# Filter
adults = people[people['age'] >= 30]

Memory-mapped Arrays
python
import numpy as np
# Create memory-mapped file
shape = (10000, 10000)
dtype = np.float64
# Write mode
mmap = np.memmap('data.dat', dtype=dtype, mode='w+', shape=shape)
mmap[:] = np.random.random(shape)
mmap.flush() # Write to disk
del mmap
# Read mode (doesn't load into RAM)
mmap = np.memmap('data.dat', dtype=dtype, mode='r', shape=shape)
# Access like normal array, but data loaded on demand
print(mmap[0, 0])

Production Pitfalls
Pitfall 1: Silent Type Conversion
python
import numpy as np
# ❌ PROBLEM: Integer overflow
arr = np.array([2**62, 2**62], dtype=np.int64)
result = arr * 4 # Overflow! No warning!
print(result) # Garbage values
# ✅ FIX: Check for overflow or use larger dtype
arr = np.array([2**62, 2**62], dtype=np.float64)
result = arr * 4 # No wraparound (note: float64 represents integers exactly only up to 2**53)

Pitfall 2: View Modification
python
import numpy as np
# ❌ PROBLEM: Modifying view affects original
original = np.array([1, 2, 3, 4, 5])
view = original[1:4]
view[0] = 100
print(original) # [1, 100, 3, 4, 5] - Modified!
# ✅ FIX: Use copy if you need independence
view = original[1:4].copy()
view[0] = 100
print(original) # [1, 2, 3, 4, 5] - Unchanged

Pitfall 3: Broadcasting Surprises
python
import numpy as np
# ❌ PROBLEM: Unexpected broadcasting
a = np.array([[1, 2, 3]]) # (1, 3)
b = np.array([[1], [2], [3]]) # (3, 1)
result = a + b # (3, 3) - Maybe not what you wanted!
# ✅ FIX: Be explicit about shapes
print(f"a shape: {a.shape}")
print(f"b shape: {b.shape}")
# Reshape if needed

Pitfall 4: NaN Propagation
python
import numpy as np
# ❌ PROBLEM: NaN propagates silently
data = np.array([1.0, 2.0, np.nan, 4.0])
print(np.mean(data)) # nan!
print(np.sum(data)) # nan!
# ✅ FIX: Use nan-aware functions
print(np.nanmean(data)) # 2.333...
print(np.nansum(data)) # 7.0
# Or filter NaN
clean_data = data[~np.isnan(data)]

Pitfall 5: Floating Point Comparison
python
import numpy as np
# ❌ PROBLEM: Floating point precision
a = 0.1 + 0.2
b = 0.3
print(a == b) # False!
arr = np.array([0.1 + 0.2])
print(arr == 0.3) # [False]!
# ✅ FIX: Use np.isclose or np.allclose
print(np.isclose(a, b)) # True
print(np.allclose(arr, 0.3)) # True

Quick Reference
python
import numpy as np
# === Memory Layout ===
arr.shape # Dimensions
arr.strides # Bytes per dimension
arr.flags # C_CONTIGUOUS, F_CONTIGUOUS
arr.base # None if owns data, else base array
# === Vectorization ===
arr + scalar # Broadcast scalar
arr1 + arr2 # Element-wise
arr @ matrix # Matrix multiply
np.where(cond, x, y) # Conditional
# === Broadcasting ===
# Shapes must match from right, or be 1
# (3, 4) + (4,) → OK
# (3, 4) + (3, 1) → OK
# (3, 4) + (3,) → ERROR
# === Aggregations ===
np.sum(arr, axis=0) # Sum along axis
np.mean(arr) # Average
np.std(arr) # Standard deviation
# === Avoid ===
# - Python loops over array elements
# - np.append in loop
# - Item-by-item indexing
# - Object dtype
# === Use ===
# - Vectorized operations
# - Pre-allocated arrays
# - Native dtypes
# - Broadcasting

Cross-links
- Prerequisites: Data Structures - Python collections
- Previous: C Extensions & Cython - Native code
- Related: Memory Optimization - Memory efficiency
- Related: Profiling - Find bottlenecks