Skip to content

Assembly Analysis Advanced

Từ Rust code đến CPU instructions

1. Xem Assembly Output

Tools

bash
# Cách 1: rustc trực tiếp
rustc --emit=asm -O main.rs

# Cách 2: cargo-show-asm (recommended)
cargo install cargo-show-asm
cargo asm my_crate::my_function

# Cách 3: Compiler Explorer (godbolt.org)
# Paste code vào, chọn rustc

Example: Simple Function

rust
/// Returns the sum of `a` and `b`.
///
/// Kept deliberately tiny so that the generated assembly is a single
/// addition followed by a return.
pub fn add(a: i32, b: i32) -> i32 {
    a + b
}
asm
; x86_64 assembly (Intel syntax) for `add`
add:
    lea     eax, [rdi + rsi]  ; eax = rdi + rsi (a + b, computed by the address unit)
    ret

Giải thích:

  • rdi = first argument (a)
  • rsi = second argument (b)
  • lea = Load Effective Address (fast addition)
  • eax = return value (lower 32 bits of rax)

2. Common Patterns

Bounds Check Elimination

rust
/// Sums every element of `arr` using explicit index-based access.
///
/// This is the "indexed" variant of the example: each element is read through
/// `arr[idx]`, which is the access form that can carry a bounds check.
/// Returns `0` for an empty slice.
pub fn sum_array(arr: &[i32]) -> i32 {
    let mut total = 0;
    for idx in 0..arr.len() {
        // Indexed access: bounds-checked unless the optimizer proves `idx < arr.len()`.
        total += arr[idx];
    }
    total
}

/// Sums every element of `arr` by iterating over the slice.
///
/// Better than the indexed version: walking the slice with an iterator never
/// indexes into it, so there is no per-element bounds check to eliminate.
/// Returns `0` for an empty slice.
pub fn sum_array_fast(arr: &[i32]) -> i32 {
    arr.iter().copied().sum()
}
asm
; sum_array_fast (no bounds checks): rdi = pointer to the elements, rsi = element count
; NOTE(review): simplified scalar listing; real optimized output may be unrolled or vectorized.
sum_array_fast:
    xor     eax, eax          ; sum = 0
    test    rsi, rsi          ; if len == 0
    je      .done
.loop:
    add     eax, [rdi]        ; sum += *ptr
    add     rdi, 4            ; ptr++ (advance by one 4-byte i32)
    dec     rsi               ; len--
    jne     .loop
.done:
    ret

Inlining

rust
/// Returns `x` multiplied by itself.
///
/// Marked `#[inline(always)]` so the multiplication is emitted directly at
/// each call site instead of as a `call`.
#[inline(always)]
fn square(x: i32) -> i32 {
    x * x
}

/// Returns `a² + b²`, squaring each argument with [`square`].
pub fn sum_of_squares(a: i32, b: i32) -> i32 {
    let a_squared = square(a);
    let b_squared = square(b);
    a_squared + b_squared
}
asm
; Inlined version: no `call square` appears, both multiplications are emitted in place
sum_of_squares:
    imul    edi, edi          ; a * a
    imul    esi, esi          ; b * b
    lea     eax, [rdi + rsi]  ; return a² + b²
    ret

3. SIMD Optimizations

Auto-vectorization

rust
/// Sums all `f32` values in `data` with the standard iterator `sum`.
///
/// Serves as the baseline for the vectorization discussion.
pub fn sum_vec(data: &[f32]) -> f32 {
    data.iter().copied().sum::<f32>()
}
asm
; With -C target-feature=+avx2 (illustrative listing)
; NOTE(review): rustc normally keeps the sequential order of f32 additions, so a
; plain f32 sum may not be auto-vectorized like this — confirm on godbolt.
; NOTE(review): the loop below shows no handling for a tail of fewer than 8 elements.
sum_vec:
    ; Process 8 floats at once
    vxorps  ymm0, ymm0, ymm0      ; sum = 0 (8 floats)
.loop:
    vaddps  ymm0, ymm0, [rdi]     ; sum += data[0..8]
    add     rdi, 32                ; ptr += 8 floats
    sub     rsi, 8                 ; remaining -= 8
    jg      .loop                  ; keep looping while remaining > 0
    ; Horizontal sum
    vextractf128 xmm1, ymm0, 1     ; xmm1 = upper 4 lanes of ymm0
    vaddps  xmm0, xmm0, xmm1       ; xmm0 = lower 4 lanes + upper 4 lanes
    ; ... final reduction

Manual SIMD với std::arch

rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Sums an `f32` slice using 256-bit vector additions.
///
/// The slice is consumed in chunks of eight lanes that are accumulated in a
/// single `__m256` register. The eight partial sums are then folded down to
/// four, stored to a small array and added up, and finally the fewer-than-eight
/// trailing elements are added with a scalar sum.
///
/// Because partial sums are accumulated per lane, the additions happen in a
/// different order than a sequential `iter().sum()`, so the result may differ
/// slightly due to floating-point rounding.
///
/// # Safety
///
/// The caller must ensure the executing CPU supports AVX2, for example by
/// checking `is_x86_feature_detected!("avx2")` before calling. Executing this
/// function on a CPU without that feature is undefined behavior.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
pub unsafe fn sum_avx2(data: &[f32]) -> f32 {
    let chunks = data.chunks_exact(8);
    let remainder = chunks.remainder();

    let mut sum = _mm256_setzero_ps();
    for chunk in chunks {
        // SAFETY: `chunks_exact(8)` only yields slices of exactly eight `f32`s,
        // so the unaligned 32-byte load stays inside the chunk.
        let v = unsafe { _mm256_loadu_ps(chunk.as_ptr()) };
        sum = _mm256_add_ps(sum, v);
    }

    // Horizontal sum: fold the upper four lanes onto the lower four.
    let low = _mm256_extractf128_ps(sum, 0);
    let high = _mm256_extractf128_ps(sum, 1);
    let sum128 = _mm_add_ps(low, high);

    let mut result = [0f32; 4];
    // SAFETY: `result` holds four `f32`s (16 bytes), exactly the size written
    // by the unaligned 128-bit store.
    unsafe { _mm_storeu_ps(result.as_mut_ptr(), sum128) };

    let scalar_sum: f32 = result.iter().sum();
    scalar_sum + remainder.iter().sum::<f32>()
}

SIMD Types

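| Type | Width | Elements |
|------|-------|----------|
| `__m128` | 128-bit | 4 × f32 |
| `__m128d` | 128-bit | 2 × f64 |
| `__m128i` | 128-bit | 4 × i32 / 8 × i16 / 16 × i8 |
| `__m256` | 256-bit | 8 × f32 |
| `__m256d` | 256-bit | 4 × f64 |
| `__m512` | 512-bit | 16 × f32 (AVX-512) |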

4. Common x86_64 Instructions

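| Instruction | Meaning | Example |
|-------------|---------|---------|
| `mov` | Move data | `mov rax, rbx` |
| `lea` | Load address (fast math) | `lea rax, [rbx + 4*rcx]` |
| `add`/`sub` | Arithmetic | `add rax, 5` |
| `imul` | Signed multiply | `imul rax, rbx` |
| `cmp` | Compare | `cmp rax, 10` |
| `je`/`jne` | Jump equal/not equal | `je .label` |
| `call` | Function call | `call my_func` |
| `ret` | Return | `ret` |
| `push`/`pop` | Stack operations | `push rbp` |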

Calling Convention (System V AMD64)

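| Register | Purpose |
|----------|---------|
| `rdi` | 1st argument |
| `rsi` | 2nd argument |
| `rdx` | 3rd argument |
| `rcx` | 4th argument |
| `r8` | 5th argument |
| `r9` | 6th argument |
| `rax` | Return value |
| `rsp` | Stack pointer |
| `rbp` | Base pointer |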

5. Optimization Tips

Đo lường trước khi Optimize

rust
/// Benchmarks `sum_vec` over a vector of 10 000 consecutive `f32` values.
///
/// NOTE(review): `#[bench]` and `Bencher` belong to the unstable `test` crate;
/// presumably the enclosing crate builds on nightly with `#![feature(test)]`
/// and imports `test::Bencher` — confirm.
#[bench]
fn bench_sum(b: &mut Bencher) {
    // Build the input once, outside the timed closure.
    let data: Vec<f32> = (0..10000).map(|x| x as f32).collect();
    // Pass the input through `black_box` so the optimizer cannot hoist or
    // constant-fold the summation out of the measured loop.
    b.iter(|| sum_vec(std::hint::black_box(data.as_slice())));
}

Compiler Hints

rust
// Likely/unlikely branches: `#[cold]` marks a function that is expected to be
// called rarely.
#[cold]
fn error_handler() { /* rarely called */ }

// NOTE(review): neither `likely` nor `condition` is defined in this snippet,
// and an `if` cannot appear at item level — treat this as pseudo-code. A
// `likely` hint is presumably only available on nightly; confirm before use.
if likely(condition) {
    // Hot path
}

// Force inline: ask the compiler to always inline this function.
#[inline(always)]
fn hot_function() {}

// Prevent inline (for profiling): the function keeps its own symbol, so it
// shows up separately in profiles and assembly listings.
#[inline(never)]
fn benchmark_this() {}

Target Features

bash
# Enable every feature of the build machine's CPU (the binary may not run on older CPUs)
RUSTFLAGS="-C target-cpu=native" cargo build --release

# Specific features
RUSTFLAGS="-C target-feature=+avx2,+fma" cargo build --release

🎯 Reading Assembly Checklist

  1. Look for bounds checks: call panic_bounds_check
  2. Check inlining: No call to small functions
  3. SIMD: packed instructions (e.g. vaddps, paddd) on xmm/ymm registers = vectorized; scalar float code (addss, mulss) also uses xmm
  4. Memory access: [rdi] = dereferencing pointer
  5. Loop unrolling: Multiple operations per iteration

💡 GODBOLT TIP

Sử dụng godbolt.org với -C opt-level=3 để xem optimized assembly online.