Skip to content

NumPy Internals Advanced

NumPy = Python's secret weapon cho numerical computing - Hiểu internals = Viết code nhanh hơn 100x

Learning Outcomes

Sau khi hoàn thành trang này, bạn sẽ:

  • 🎯 Hiểu ndarray memory layout (contiguous, strides)
  • 🎯 Áp dụng vectorization patterns để tránh Python loops
  • 🎯 Master broadcasting rules cho array operations
  • 🎯 Tránh các anti-patterns làm chậm NumPy code
  • 🎯 Biết khi nào NumPy không phải là giải pháp tốt nhất

ndarray Memory Layout

Array Structure

┌─────────────────────────────────────────────────────────┐
│                    ndarray Object                       │
├─────────────────────────────────────────────────────────┤
│  data      │  Pointer to actual data buffer            │
│  dtype     │  Data type (float64, int32, etc.)         │
│  shape     │  Dimensions (3, 4) = 3 rows, 4 cols       │
│  strides   │  Bytes to jump for each dimension         │
│  flags     │  C_CONTIGUOUS, F_CONTIGUOUS, etc.         │
└─────────────────────────────────────────────────────────┘
python
import numpy as np

# A 2x3 float64 array; every ndarray exposes its memory-layout metadata as attributes.
arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)

print(f"Shape: {arr.shape}")      # (2, 3)
print(f"Dtype: {arr.dtype}")      # float64
print(f"Strides: {arr.strides}")  # (24, 8) - bytes per dimension
print(f"Size: {arr.size}")        # 6 elements
print(f"Itemsize: {arr.itemsize}")  # 8 bytes per element
print(f"Nbytes: {arr.nbytes}")    # 48 bytes total (size * itemsize; excludes object overhead)

Strides Explained

python
import numpy as np

# Shows how strides map a 2-D index to a byte offset in the flat buffer.
# 2D array (2 rows, 3 cols) of float64 (8 bytes each)
arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)

# Memory layout (C-order, row-major):
# [1.0][2.0][3.0][4.0][5.0][6.0]
#  ^             ^
#  |             |
#  row 0         row 1

print(arr.strides)  # (24, 8)
# - To move to next row: jump 24 bytes (3 elements × 8 bytes)
# - To move to next column: jump 8 bytes (1 element × 8 bytes)

# Access arr[1, 2]:
# offset = 1 * 24 + 2 * 8 = 40 bytes from start

C-order vs Fortran-order

python
import numpy as np

# Same values, two physical layouts; order changes strides, not indexing semantics.
# C-order (row-major) - default
arr_c = np.array([[1, 2, 3], [4, 5, 6]], order='C')
print(f"C-order strides: {arr_c.strides}")  # (24, 8) - assumes the default 8-byte int (platform-dependent)

# Memory: [1, 2, 3, 4, 5, 6]
#         row 0    row 1

# Fortran-order (column-major)
arr_f = np.array([[1, 2, 3], [4, 5, 6]], order='F')
print(f"F-order strides: {arr_f.strides}")  # (8, 16) - next row is adjacent, next column jumps 2 elements

# Memory: [1, 4, 2, 5, 3, 6]
#         col0 col1 col2

# Check contiguity
print(arr_c.flags['C_CONTIGUOUS'])  # True
print(arr_f.flags['F_CONTIGUOUS'])  # True

Views vs Copies

python
import numpy as np

# Demonstrates view-vs-copy semantics of basic slicing:
# a basic slice shares the original buffer; .copy() allocates a new one.
arr = np.array([1, 2, 3, 4, 5, 6])

# View - shares memory (fast, no copy)
view = arr[::2]  # [1, 3, 5]
view[0] = 100
print(arr)  # [100, 2, 3, 4, 5, 6] - original changed!

# Copy - new memory (slower, independent)
# (renamed from `copy` so the stdlib/builtin name is not shadowed)
arr_copy = arr[::2].copy()
arr_copy[0] = 200
print(arr)  # [100, 2, 3, 4, 5, 6] - original unchanged

# Check if view or copy: a view keeps a reference to its base array,
# while an array that owns its data has base is None.
print(view.base is arr)  # True - view
print(arr_copy.base is None)  # True - copy (owns data)

Non-contiguous Arrays

python
import numpy as np

# A column slice is a strided view: valid, but not laid out contiguously in memory.
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Slicing can create non-contiguous views
col = arr[:, 1]  # [2, 5, 8]
print(col.flags['C_CONTIGUOUS'])  # False!
print(col.strides)  # (24,) - jumping over elements (assumes 8-byte default int)

# Non-contiguous arrays are slower for some operations
# Fix: make contiguous copy (no-op if already contiguous)
col_contig = np.ascontiguousarray(col)
print(col_contig.flags['C_CONTIGUOUS'])  # True

Vectorization Patterns

Why Vectorization?

python
import numpy as np
import timeit

n = 1_000_000
data = np.random.random(n)

# ❌ Python loop - SLOW
def python_loop(data):
    """Square each element with a Python-level loop (one interpreter iteration per element)."""
    result = np.empty_like(data)
    for i in range(len(data)):
        result[i] = data[i] ** 2
    return result

# ✅ Vectorized - FAST
def vectorized(data):
    """Square each element in a single C-level ufunc call."""
    return data ** 2

# Benchmark
# NOTE: runs at import time; 10 repetitions of the Python loop over 1e6
# elements take several seconds.
t_loop = timeit.timeit(lambda: python_loop(data), number=10)
t_vec = timeit.timeit(lambda: vectorized(data), number=10)

print(f"Python loop: {t_loop:.3f}s")
print(f"Vectorized: {t_vec:.3f}s")
print(f"Speedup: {t_loop/t_vec:.0f}x")
# Typical: 50-100x speedup! (exact ratio is machine-dependent)

Pattern 1: Element-wise Operations

python
import numpy as np

# Element-wise arithmetic: one ufunc call replaces an explicit Python loop.
a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])

# ❌ Python loop
result = []
for i in range(len(a)):
    result.append(a[i] + b[i])

# ✅ Vectorized
result = a + b  # [11, 22, 33, 44, 55]

# Works for all arithmetic operations (each returns a new array)
result = a * b      # Element-wise multiply
result = a / b      # Element-wise divide
result = a ** 2     # Element-wise power
result = np.sqrt(a) # Element-wise sqrt
result = np.sin(a)  # Element-wise sin

Pattern 2: Conditional Operations

python
import numpy as np

# Vectorized replacements for per-element if/else logic.
data = np.array([1, -2, 3, -4, 5])

# ❌ Python loop
result = []
for x in data:
    if x > 0:
        result.append(x)
    else:
        result.append(0)

# ✅ Vectorized with np.where (same length as input)
result = np.where(data > 0, data, 0)  # [1, 0, 3, 0, 5]

# ✅ Boolean indexing (drops non-matching elements, so length may shrink)
positives = data[data > 0]  # [1, 3, 5]

# ✅ np.clip for bounds
clipped = np.clip(data, 0, 10)  # [1, 0, 3, 0, 5]

Pattern 3: Aggregations

python
import numpy as np

# Reductions: axis=None reduces everything; axis=0/1 reduce down columns/rows.
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# ❌ Python loop
total = 0
for row in data:
    for val in row:
        total += val

# ✅ Vectorized
total = np.sum(data)        # 45
row_sums = np.sum(data, axis=1)   # [6, 15, 24]
col_sums = np.sum(data, axis=0)   # [12, 15, 18]

# Other aggregations (return values discarded here; listed for reference)
np.mean(data)       # Average
np.std(data)        # Standard deviation
np.min(data)        # Minimum
np.max(data)        # Maximum
np.prod(data)       # Product
np.argmax(data)     # Index of max (into the flattened array when axis is omitted)
np.cumsum(data)     # Cumulative sum (flattens to 1-D when axis is omitted)

Pattern 4: Matrix Operations

python
import numpy as np

# Core linear-algebra operations on small 2x2 matrices.
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Matrix multiplication
C = A @ B           # Preferred (Python 3.5+)
C = np.dot(A, B)    # Equivalent
C = np.matmul(A, B) # Equivalent (for 2-D inputs; they differ for >2-D stacks)

# Element-wise multiply (NOT matrix multiply!)
C = A * B

# Transpose (a view, no copy)
A_T = A.T

# Inverse (raises LinAlgError for singular matrices)
A_inv = np.linalg.inv(A)

# Determinant
det = np.linalg.det(A)

# Eigenvalues
eigenvalues, eigenvectors = np.linalg.eig(A)

Pattern 5: Avoid Intermediate Arrays

python
import numpy as np

# Reducing temporary-array allocations in chained arithmetic.
a = np.random.random(1_000_000)
b = np.random.random(1_000_000)
c = np.random.random(1_000_000)

# ❌ Creates intermediate arrays
result = a + b + c  # Creates temp for (a+b), then adds c

# ✅ Use out parameter to reuse one pre-allocated buffer
result = np.empty_like(a)
np.add(a, b, out=result)
np.add(result, c, out=result)

# ✅ Or use numexpr for complex expressions
# NOTE: numexpr is a third-party package (pip install numexpr), not part of NumPy.
import numexpr as ne
result = ne.evaluate('a + b + c')  # No intermediates!

Broadcasting Rules

What is Broadcasting?

Broadcasting allows NumPy to perform operations on arrays with different shapes.

python
import numpy as np

# Broadcasting stretches a smaller operand across a larger one without copying data.
# Scalar + Array
a = np.array([1, 2, 3])
result = a + 10  # [11, 12, 13]
# 10 is "broadcast" to [10, 10, 10]

# 1D + 2D: the 1-D array is applied to every row
a = np.array([[1, 2, 3], [4, 5, 6]])  # (2, 3)
b = np.array([10, 20, 30])             # (3,)
result = a + b
# [[11, 22, 33],
#  [14, 25, 36]]
# b is broadcast to [[10, 20, 30], [10, 20, 30]]

Broadcasting Rules

Rule 1: If arrays have different ndim, prepend 1s to smaller shape
Rule 2: A dimension of size 1 is stretched (virtually repeated, without
        copying data) to match the other array's size in that dimension
Rule 3: Arrays are compatible if dimensions are equal or one is 1
python
import numpy as np

# Worked examples of the broadcasting rules (align shapes from the right).
# Example 1: (3,) + (2, 3)
a = np.array([1, 2, 3])           # Shape: (3,)
b = np.array([[1, 2, 3],          # Shape: (2, 3)
              [4, 5, 6]])

# Step 1: Prepend 1 to a's shape → (1, 3)
# Step 2: Broadcast (1, 3) to (2, 3)
# Result shape: (2, 3)
result = a + b

# Example 2: (3, 1) + (1, 4) — both size-1 dimensions get stretched
a = np.array([[1], [2], [3]])     # Shape: (3, 1)
b = np.array([[10, 20, 30, 40]])  # Shape: (1, 4)

# Broadcast to (3, 4)
result = a + b
# [[11, 21, 31, 41],
#  [12, 22, 32, 42],
#  [13, 23, 33, 43]]

Broadcasting Visualization

Shape (3, 1) + Shape (1, 4) = Shape (3, 4)

    [[1],        [[10, 20, 30, 40]]      [[11, 21, 31, 41],
     [2],    +                       =    [12, 22, 32, 42],
     [3]]                                 [13, 23, 33, 43]]

    ↓ broadcast →    ↓ broadcast ↓

    [[1, 1, 1, 1],   [[10, 20, 30, 40],
     [2, 2, 2, 2], +  [10, 20, 30, 40],
     [3, 3, 3, 3]]    [10, 20, 30, 40]]

Common Broadcasting Patterns

python
import numpy as np

# Three idiomatic broadcasting uses: centering, outer product, pairwise distances.
# Pattern 1: Normalize columns (subtract each column's mean from that column)
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
col_means = data.mean(axis=0)  # [4, 5, 6]
normalized = data - col_means  # Subtract mean from each column

# Pattern 2: Outer product
a = np.array([1, 2, 3])
b = np.array([10, 20])
outer = a[:, np.newaxis] * b  # (3, 1) * (2,) = (3, 2)
# [[10, 20],
#  [20, 40],
#  [30, 60]]

# Pattern 3: Distance matrix
points = np.array([[0, 0], [1, 1], [2, 2]])  # (3, 2)
# Compute pairwise distances
# NOTE: the intermediate diff array is (n, n, d) — memory grows quadratically with n.
diff = points[:, np.newaxis, :] - points[np.newaxis, :, :]  # (3, 3, 2)
distances = np.sqrt(np.sum(diff ** 2, axis=2))  # (3, 3)

Broadcasting Errors

python
import numpy as np

# Incompatible trailing dimensions raise ValueError; an explicit axis fixes it.
a = np.array([[1, 2, 3], [4, 5, 6]])  # (2, 3)
b = np.array([1, 2])                   # (2,)

# ❌ ERROR: Shapes not compatible
# (2, 3) and (2,) → trailing dimensions don't match (3 vs 2)
# result = a + b  # ValueError!

# ✅ FIX: Reshape to make compatible (add a trailing size-1 axis)
b_reshaped = b[:, np.newaxis]  # (2, 1)
result = a + b_reshaped  # (2, 3) + (2, 1) = (2, 3)

Avoiding Python Loops

Anti-pattern 1: Growing Arrays

python
import numpy as np

# ❌ SLOW: Growing array in loop
# np.append copies the ENTIRE array on every call, so the loop is quadratic.
result = np.array([])
for i in range(10000):
    result = np.append(result, i ** 2)  # O(n²)!

# ✅ FAST: Pre-allocate once, then fill in place
result = np.empty(10000)
for i in range(10000):
    result[i] = i ** 2

# ✅ FASTEST: Vectorize
result = np.arange(10000) ** 2

Anti-pattern 2: Item-by-item Access

python
import numpy as np

# Per-element indexing crosses the Python/C boundary a million times;
# a single reduction does it once.
arr = np.random.random((1000, 1000))

# ❌ SLOW: Python indexing
total = 0
for i in range(arr.shape[0]):
    for j in range(arr.shape[1]):
        total += arr[i, j]

# ✅ FAST: NumPy function
total = np.sum(arr)

Anti-pattern 3: Unnecessary Copies

python
import numpy as np

arr = np.random.random(1_000_000)

# ❌ SLOW: Creates a second, redundant copy
filtered = arr[arr > 0.5].copy()  # Unnecessary .copy()

# ✅ FAST: Boolean (fancy) indexing already returns a fresh copy — no .copy() needed
filtered = arr[arr > 0.5]  # Already a copy due to fancy indexing

# ❌ SLOW: Concatenate in loop (re-copies the accumulated result every iteration)
# NOTE(review): `chunks` is assumed to be an iterable of 1-D arrays — it is not
# defined in this snippet.
result = np.array([])
for chunk in chunks:
    result = np.concatenate([result, chunk])

# ✅ FAST: Concatenate once
result = np.concatenate(chunks)

Anti-pattern 4: Wrong dtype

python
import numpy as np

# ❌ SLOW: Object dtype stores boxed Python objects — no vectorized fast path
arr = np.array([1, 2, 3], dtype=object)

# ✅ FAST: Native dtype
arr = np.array([1, 2, 3], dtype=np.int64)

# ❌ SLOW: Unnecessary precision
arr = np.array([1.0, 2.0, 3.0], dtype=np.float64)  # 8 bytes each

# ✅ FAST: Use float32 if precision allows
# (halves memory/bandwidth; actual speed gain is workload-dependent)
arr = np.array([1.0, 2.0, 3.0], dtype=np.float32)  # 4 bytes each

When Loops Are OK

python
import numpy as np

# NOTE(review): illustrative pseudo-code — process_batch, data, n, result,
# complex_condition, complex_computation, other_computation, process and
# huge_array are placeholders; this snippet is not meant to run as-is.

# ✅ OK: Small number of iterations
for i in range(10):  # Only 10 iterations
    process_batch(data[i])

# ✅ OK: Complex logic that can't be vectorized
for i in range(n):
    if complex_condition(data[i]):
        result[i] = complex_computation(data[i])
    else:
        result[i] = other_computation(data[i])

# ✅ OK: Memory constraints (can't load all at once)
for chunk in np.array_split(huge_array, 100):
    process(chunk)

Advanced Techniques

np.einsum - Einstein Summation

python
import numpy as np

# einsum: repeated index = summed over; indices after '->' define output axes.
A = np.random.random((3, 4))
B = np.random.random((4, 5))
C = np.random.random((3, 4))

# Matrix multiplication
result = np.einsum('ij,jk->ik', A, B)  # Same as A @ B

# Element-wise multiply then sum (empty output = full reduction to a scalar)
result = np.einsum('ij,ij->', A, C)  # Same as np.sum(A * C)

# Trace (requires a square input, hence the [:3, :3] slice)
result = np.einsum('ii->', A[:3, :3])  # Same as np.trace(A[:3, :3])

# Outer product
a = np.array([1, 2, 3])
b = np.array([4, 5])
result = np.einsum('i,j->ij', a, b)  # Same as np.outer(a, b)

# Batch matrix multiply ('b' is the batch axis, carried through unchanged)
batch_A = np.random.random((10, 3, 4))
batch_B = np.random.random((10, 4, 5))
result = np.einsum('bij,bjk->bik', batch_A, batch_B)

Structured Arrays

python
import numpy as np

# Structured arrays: one contiguous array of fixed-layout records, accessed by field name.
# Define structured dtype
dt = np.dtype([
    ('name', 'U10'),      # Unicode string, 10 chars
    ('age', 'i4'),        # 32-bit integer
    ('weight', 'f8'),     # 64-bit float
])

# Create structured array (each tuple is one record)
people = np.array([
    ('Alice', 30, 55.5),
    ('Bob', 25, 70.2),
    ('Charlie', 35, 65.0),
], dtype=dt)

# Access fields (returns a view of that column)
print(people['name'])    # ['Alice' 'Bob' 'Charlie']
print(people['age'])     # [30 25 35]

# Filter with a boolean mask over one field
adults = people[people['age'] >= 30]

Memory-mapped Arrays

python
import numpy as np

# Memory-mapped arrays: the OS pages data in/out on demand instead of loading it all.
# NOTE: this example writes an ~800 MB file (10000 * 10000 * 8 bytes) into the
# current directory, and np.random.random(shape) also materializes the full
# 800 MB array in RAM before it is copied into the map.
# Create memory-mapped file
shape = (10000, 10000)
dtype = np.float64

# Write mode ('w+' creates/overwrites the file)
mmap = np.memmap('data.dat', dtype=dtype, mode='w+', shape=shape)
mmap[:] = np.random.random(shape)
mmap.flush()  # Write to disk
del mmap

# Read mode (doesn't load into RAM)
mmap = np.memmap('data.dat', dtype=dtype, mode='r', shape=shape)
# Access like normal array, but data loaded on demand
print(mmap[0, 0])

Production Pitfalls

Pitfall 1: Silent Type Conversion

python
import numpy as np

# ❌ PROBLEM: Integer overflow
# NumPy fixed-width ints wrap around modulo 2**64 with no exception
# (here 2**62 * 4 == 2**64 wraps to 0).
arr = np.array([2**62, 2**62], dtype=np.int64)
result = arr * 4  # Overflow! No warning!
print(result)  # Garbage values

# ✅ FIX: Check for overflow or use larger dtype
# (works here since 2**62 and 2**64 are exact powers of two; in general
# float64 only represents integers exactly up to 2**53)
arr = np.array([2**62, 2**62], dtype=np.float64)
result = arr * 4  # Works correctly

Pitfall 2: View Modification

python
import numpy as np

# ❌ PROBLEM: Modifying view affects original (basic slices share the buffer)
original = np.array([1, 2, 3, 4, 5])
view = original[1:4]
view[0] = 100
print(original)  # [1, 100, 3, 4, 5] - Modified!

# ✅ FIX: Use copy if you need independence
view = original[1:4].copy()
view[0] = 100
print(original)  # [1, 2, 3, 4, 5] - Unchanged

Pitfall 3: Broadcasting Surprises

python
import numpy as np

# ❌ PROBLEM: Unexpected broadcasting — a row vector plus a column vector
# silently expands to a full matrix instead of raising an error.
a = np.array([[1, 2, 3]])  # (1, 3)
b = np.array([[1], [2], [3]])  # (3, 1)
result = a + b  # (3, 3) - Maybe not what you wanted!

# ✅ FIX: Be explicit about shapes
print(f"a shape: {a.shape}")
print(f"b shape: {b.shape}")
# Reshape if needed

Pitfall 4: NaN Propagation

python
import numpy as np

# ❌ PROBLEM: NaN propagates silently through ordinary reductions
data = np.array([1.0, 2.0, np.nan, 4.0])
print(np.mean(data))  # nan!
print(np.sum(data))   # nan!

# ✅ FIX: Use nan-aware functions (ignore NaN entries)
print(np.nanmean(data))  # 2.333...
print(np.nansum(data))   # 7.0

# Or filter NaN (note: nan != nan, so use np.isnan, never ==)
clean_data = data[~np.isnan(data)]

Pitfall 5: Floating Point Comparison

python
import numpy as np

# ❌ PROBLEM: Floating point precision (0.1 + 0.2 == 0.30000000000000004)
a = 0.1 + 0.2
b = 0.3
print(a == b)  # False!

arr = np.array([0.1 + 0.2])
print(arr == 0.3)  # [False]!

# ✅ FIX: Use np.isclose (element-wise) or np.allclose (single bool for whole array)
print(np.isclose(a, b))  # True
print(np.allclose(arr, 0.3))  # True

Quick Reference

python
import numpy as np

# NOTE(review): quick-reference pseudo-code — `arr`, `scalar`, `arr1`, `matrix`,
# `cond`, `x`, `y` are placeholders; this snippet is not meant to be executed as-is.

# === Memory Layout ===
arr.shape       # Dimensions
arr.strides     # Bytes per dimension
arr.flags       # C_CONTIGUOUS, F_CONTIGUOUS
arr.base        # None if owns data, else base array

# === Vectorization ===
arr + scalar    # Broadcast scalar
arr1 + arr2     # Element-wise
arr @ matrix    # Matrix multiply
np.where(cond, x, y)  # Conditional

# === Broadcasting ===
# Shapes must match from right, or be 1
# (3, 4) + (4,) → OK
# (3, 4) + (3, 1) → OK
# (3, 4) + (3,) → ERROR

# === Aggregations ===
np.sum(arr, axis=0)   # Sum along axis
np.mean(arr)          # Average
np.std(arr)           # Standard deviation

# === Avoid ===
# - Python loops over array elements
# - np.append in loop
# - Item-by-item indexing
# - Object dtype

# === Use ===
# - Vectorized operations
# - Pre-allocated arrays
# - Native dtypes
# - Broadcasting