Giao diện
Assembly Analysis Advanced
Từ Rust code đến CPU instructions
1. Xem Assembly Output
Tools
bash
# Option 1: invoke rustc directly (the assembly is written to main.s)
# Note: rustc emits AT&T syntax by default; add
#   -C llvm-args=-x86-asm-syntax=intel
# to get Intel syntax.
rustc --emit=asm -O main.rs
# Option 2: cargo-show-asm (recommended)
cargo install cargo-show-asm
cargo asm my_crate::my_function
# Option 3: Compiler Explorer (godbolt.org)
# Paste code vào, chọn rustc
Example: Simple Function
rust
/// Returns the sum of `a` and `b`.
pub fn add(a: i32, b: i32) -> i32 {
a + b
}asm
; x86_64 Assembly (Intel syntax)
add:
lea eax, [rdi + rsi] ; eax = rdi + rsi
ret
Giải thích:
- `rdi` = first argument (a)
- `rsi` = second argument (b)
- `lea` = Load Effective Address (fast addition)
- `eax` = return value (lower 32 bits of rax)
2. Common Patterns
Bounds Check Elimination
rust
/// Adds up every element of `arr` using explicit indexing.
///
/// This is the deliberately index-based form: each `arr[idx]` is an indexed
/// access into the slice. Returns 0 for an empty slice.
pub fn sum_array(arr: &[i32]) -> i32 {
    let mut total = 0;
    for idx in 0..arr.len() {
        // Indexed access: subject to a bounds check per access.
        total += arr[idx];
    }
    total
}
// Better: Iterator eliminates bounds checks
/// Sums every element of `arr` by iterating over the slice.
///
/// The iterator walks the slice directly, so there is no per-element index
/// to bounds-check. Returns 0 for an empty slice.
pub fn sum_array_fast(arr: &[i32]) -> i32 {
arr.iter().sum()
}asm
; sum_array_fast (no bounds checks)
sum_array_fast:
xor eax, eax ; sum = 0
test rsi, rsi ; if len == 0
je .done
.loop:
add eax, [rdi] ; sum += *ptr
add rdi, 4 ; ptr++
dec rsi ; len--
jne .loop
.done:
ret
Inlining
rust
/// Returns `x` multiplied by itself.
///
/// `#[inline(always)]` asks the compiler to expand the multiplication at each
/// call site rather than emit a call to this function.
#[inline(always)]
fn square(x: i32) -> i32 {
    x * x
}
/// Returns `a² + b²`, computed by calling `square` on each argument and
/// adding the two results.
pub fn sum_of_squares(a: i32, b: i32) -> i32 {
square(a) + square(b)
}asm
; Inlined version
sum_of_squares:
imul edi, edi ; a * a
imul esi, esi ; b * b
lea eax, [rdi + rsi] ; return a² + b²
ret
3. SIMD Optimizations
Auto-vectorization
rust
/// Sums every element of `data` via the slice iterator.
///
/// NOTE(review): floating-point addition is not associative, so the compiler
/// may keep this as a strictly sequential scalar loop rather than the 8-wide
/// vector loop shown in the listing that follows — confirm the actual output
/// on Compiler Explorer before relying on it.
pub fn sum_vec(data: &[f32]) -> f32 {
data.iter().sum()
}asm
; With -C target-feature=+avx2
sum_vec:
; Process 8 floats at once
vxorps ymm0, ymm0, ymm0 ; sum = 0 (8 floats)
.loop:
vaddps ymm0, ymm0, [rdi] ; sum += data[0..8]
add rdi, 32 ; ptr += 8 floats
sub rsi, 8
jg .loop
; Horizontal sum
vextractf128 xmm1, ymm0, 1
vaddps xmm0, xmm0, xmm1
; ... final reduction
Manual SIMD với std::arch
rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Sums an `f32` slice using 256-bit vector adds (8 lanes per step).
///
/// The slice is consumed in exact chunks of 8 elements; the leftover tail
/// (fewer than 8 elements) is added with ordinary scalar arithmetic at the
/// end. Because the 8 lanes are accumulated independently and combined last,
/// the result can differ in rounding from a strict left-to-right sum.
///
/// # Safety
///
/// This function is compiled with `#[target_feature(enable = "avx2")]`. The
/// caller must make sure the CPU it runs on supports AVX2 (for example by
/// checking `is_x86_feature_detected!("avx2")`) before calling it.
///
/// NOTE(review): only the `use std::arch::x86_64::*;` line is gated on
/// `target_arch = "x86_64"`; this function carries no `#[cfg]` of its own, so
/// it presumably fails to build on other architectures — consider gating it.
/// NOTE(review): every intrinsic used below belongs to AVX or SSE; enabling
/// `avx2` works because it implies `avx`, but `avx` alone looks sufficient.
#[target_feature(enable = "avx2")]
pub unsafe fn sum_avx2(data: &[f32]) -> f32 {
// Eight running partial sums, all starting at 0.0.
let mut sum = _mm256_setzero_ps();
let chunks = data.chunks_exact(8);
// Tail that did not fill a whole chunk of 8; handled after the loop.
let remainder = chunks.remainder();
for chunk in chunks {
// Unaligned load: the slice is not required to be 32-byte aligned.
let v = _mm256_loadu_ps(chunk.as_ptr());
sum = _mm256_add_ps(sum, v);
}
// Horizontal sum: add the upper 128-bit half onto the lower half, spill the
// four remaining lanes to an array and add them with scalar code.
let low = _mm256_extractf128_ps(sum, 0);
let high = _mm256_extractf128_ps(sum, 1);
let sum128 = _mm_add_ps(low, high);
let mut result = [0f32; 4];
_mm_storeu_ps(result.as_mut_ptr(), sum128);
let scalar_sum: f32 = result.iter().sum();
// Finally fold in the elements that did not fit a full chunk.
scalar_sum + remainder.iter().sum::<f32>()
}SIMD Types
| Type | Width | Elements |
|---|---|---|
| `__m128` | 128-bit | 4 × f32 |
| `__m128d` | 128-bit | 2 × f64 |
| `__m128i` | 128-bit | 2 × i64 / 4 × i32 / 8 × i16 / 16 × i8 |
| `__m256` | 256-bit | 8 × f32 |
| `__m256d` | 256-bit | 4 × f64 |
| `__m512` | 512-bit | 16 × f32 (AVX-512) |
4. Common x86_64 Instructions
| Instruction | Meaning | Example |
|---|---|---|
| `mov` | Move data | `mov rax, rbx` |
| `lea` | Load address (fast math) | `lea rax, [rbx + 4*rcx]` |
| `add`/`sub` | Arithmetic | `add rax, 5` |
| `imul` | Signed multiply | `imul rax, rbx` |
| `cmp` | Compare | `cmp rax, 10` |
| `je`/`jne` | Jump equal/not equal | `je .label` |
| `call` | Function call | `call my_func` |
| `ret` | Return | `ret` |
| `push`/`pop` | Stack operations | `push rbp` |
Calling Convention (System V AMD64)
| Register | Purpose |
|---|---|
| `rdi` | 1st argument |
| `rsi` | 2nd argument |
| `rdx` | 3rd argument |
| `rcx` | 4th argument |
| `r8` | 5th argument |
| `r9` | 6th argument |
| `rax` | Return value |
| `rsp` | Stack pointer |
| `rbp` | Base pointer |
5. Optimization Tips
Đo lường trước khi Optimize
rust
// NOTE(review): `#[bench]` and `Bencher` come from the unstable `test` crate
// (nightly toolchain with `#![feature(test)]`); `Bencher` is not imported in
// this snippet — confirm the surrounding setup before copying it.
/// Benchmarks `sum_vec` on 10,000 `f32` values (0.0, 1.0, …, 9999.0).
#[bench]
fn bench_sum(b: &mut Bencher) {
// Build the input once, outside the closure that is being timed.
let data: Vec<f32> = (0..10000).map(|x| x as f32).collect();
b.iter(|| sum_vec(&data));
}Compiler Hints
rust
// Likely/unlikely branches
/// Rarely-called error path. `#[cold]` tells the compiler that this function
/// is unlikely to be called.
#[cold]
fn error_handler() { /* rarely called */ }
// NOTE(review): neither `likely` nor `condition` is defined in this snippet,
// and this `if` sits outside any function body, so it is illustrative only.
// `likely` is presumably the branch-prediction hint intrinsic, which is not
// available on stable Rust — verify before copying. On stable, calling a
// `#[cold]` function from the rare branch is the usual way to give this hint.
if likely(condition) {
// Hot path
}
/// Placeholder for a small function that is called very frequently.
///
/// `#[inline(always)]` asks the compiler to inline every call. It is a strong
/// hint rather than a guarantee.
#[inline(always)]
fn hot_function() {}
/// Placeholder for a function that should not be inlined into its callers,
/// so it can be looked at on its own when profiling.
///
/// `#[inline(never)]` asks the compiler never to inline calls to it.
#[inline(never)]
fn benchmark_this() {}Target Features
bash
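# Enable every feature supported by the CPU of the build machine
# (the resulting binary may not run on older or different CPUs)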
RUSTFLAGS="-C target-cpu=native" cargo build --release
# Specific features
RUSTFLAGS="-C target-feature=+avx2,+fma" cargo build --release
🎯 Reading Assembly Checklist
- Look for bounds checks: `call panic_bounds_check`
- Check inlining: No `call` to small functions
- SIMD: `ymm`/`xmm` registers = vectorized
- Memory access: `[rdi]` = dereferencing pointer
- Loop unrolling: Multiple operations per iteration
💡 GODBOLT TIP
Sử dụng godbolt.org với -C opt-level=3 để xem optimized assembly online.