Giao diện
Storage Engines Database
LSM Trees, WAL, B-Tree Internals
1. LSM Trees (Log-Structured Merge-Trees)
Architecture
LSM TREE ARCHITECTURE
┌─────────────────────────────────────────────────────────────────┐
│ │
│ Write Path │
│ ────────── │
│ │
│ Write ─▶ ┌──────────┐ │
│ │ MemTable │ In-memory (Red-Black Tree/SkipList) │
│ │ (mutable)│ Fast writes: O(log n) │
│ └─────┬────┘ │
│ │ Flush when full (~64MB) │
│ ▼ │
│ ┌──────────┐ │
│ │ SSTable │ Immutable, sorted on disk │
│ │ Level 0 │ │
│ └─────┬────┘ │
│ │ Compaction │
│ ▼ │
│ ┌──────────┐ │
│ │ SSTable │ Larger, merged files │
│ │ Level 1 │ │
│ └─────┬────┘ │
│ │ │
│ ▼ │
│ ┌──────────┐ │
│ │ SSTable │ Even larger │
│ │ Level N │ │
│ └──────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
MemTable Implementation
rust
use std::collections::BTreeMap;
use std::sync::RwLock;
/// In-memory write buffer (memtable) for the LSM tree.
///
/// Entries are kept sorted in a `BTreeMap` behind an `RwLock`; the
/// approximate number of stored key + value bytes is tracked separately
/// so callers know when to flush the table to an SSTable.
pub struct MemTable {
    data: RwLock<BTreeMap<Vec<u8>, Vec<u8>>>,
    // Approximate total of key + value bytes currently live in `data`.
    size: std::sync::atomic::AtomicUsize,
    // Flush threshold in bytes.
    max_size: usize,
}
impl MemTable {
    /// Creates an empty memtable that reports "full" once the stored
    /// key/value bytes reach `max_size`.
    pub fn new(max_size: usize) -> Self {
        Self {
            data: RwLock::new(BTreeMap::new()),
            size: std::sync::atomic::AtomicUsize::new(0),
            max_size,
        }
    }
    /// Inserts or overwrites `key`; returns `true` when the table has
    /// reached `max_size` and should be flushed.
    ///
    /// Fix: overwriting an existing key previously *added* the new entry
    /// size without subtracting the replaced entry, so repeated updates
    /// of the same key inflated `size` and caused premature flushes.
    pub fn put(&self, key: Vec<u8>, value: Vec<u8>) -> bool {
        let key_len = key.len();
        let entry_size = key_len + value.len();
        // All size bookkeeping happens under the write lock so the
        // counter stays consistent with the map contents.
        let mut data = self.data.write().unwrap();
        let replaced = data
            .insert(key, value)
            .map_or(0, |old_value| key_len + old_value.len());
        let before = self
            .size
            .fetch_add(entry_size, std::sync::atomic::Ordering::SeqCst);
        if replaced > 0 {
            self.size
                .fetch_sub(replaced, std::sync::atomic::Ordering::SeqCst);
        }
        before + entry_size - replaced >= self.max_size
    }
    /// Returns a copy of the value for `key`, if present.
    pub fn get(&self, key: &[u8]) -> Option<Vec<u8>> {
        let data = self.data.read().unwrap();
        data.get(key).cloned()
    }
    /// Snapshots all entries in key order, ready to be written out as an
    /// SSTable (which requires sorted input).
    pub fn to_sorted_entries(&self) -> Vec<(Vec<u8>, Vec<u8>)> {
        let data = self.data.read().unwrap();
        data.iter()
            .map(|(k, v)| (k.clone(), v.clone()))
            .collect()
    }
}
SSTable Format
SSTABLE FILE FORMAT
┌─────────────────────────────────────────────────────────────────┐
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Data Blocks │ │
│ │ ┌─────────────────────────────────────────────────┐ │ │
│ │ │ key1 | value1 | key2 | value2 | ... │ │ │
│ │ └─────────────────────────────────────────────────┘ │ │
│ │ ┌─────────────────────────────────────────────────┐ │ │
│ │ │ key_n | value_n | ... │ │ │
│ │ └─────────────────────────────────────────────────┘ │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Index Block │ │
│ │ block_0: offset, first_key │ │
│ │ block_1: offset, first_key │ │
│ │ ... │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Bloom Filter │ │
│ │ (Fast negative lookups) │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Footer │ │
│ │ index_offset | bloom_offset | magic_number │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
2. Write-Ahead Log (WAL)
Concept
WAL FOR DURABILITY
┌─────────────────────────────────────────────────────────────────┐
│ │
│ Write Request │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ 1. Append │ ──▶ WAL File (on disk) │
│ │ to WAL │ Sequential write = FAST │
│ └──────┬──────┘ │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ 2. fsync() │ Ensure data is on disk │
│ └──────┬──────┘ │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ 3. Update │ ──▶ MemTable (in memory) │
│ │ MemTable │ │
│ └──────┬──────┘ │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ 4. ACK to │ │
│ │ Client │ │
│ └─────────────┘ │
│ │
│ On Crash Recovery: │
│ 1. Read WAL from disk │
│ 2. Replay all entries into MemTable │
│ 3. Resume normal operation │
│ │
└─────────────────────────────────────────────────────────────────┘
WAL Implementation
rust
use std::fs::{File, OpenOptions};
use std::io::{Write, BufWriter, BufReader, Read};
/// Append-only write-ahead log providing durability for the memtable.
pub struct WriteAheadLog {
    // Buffered writer over the log file; flushed + fsynced per append.
    file: BufWriter<File>,
    // Path kept so `recover` can re-open the log for reading.
    path: String,
}
/// On-disk record layout (documentation only): two little-endian u32
/// length prefixes followed by the raw key bytes, then the value bytes.
#[repr(C)]
struct WalEntry {
    key_len: u32,
    value_len: u32,
    // followed by key bytes, then value bytes
}
impl WriteAheadLog {
    /// Opens (or creates) the log at `path` in append-only mode.
    pub fn new(path: &str) -> std::io::Result<Self> {
        let file = OpenOptions::new()
            .create(true)
            .append(true)
            .open(path)?;
        Ok(Self {
            file: BufWriter::new(file),
            path: path.to_string(),
        })
    }
    /// Appends one record and forces it to stable storage before returning.
    pub fn append(&mut self, key: &[u8], value: &[u8]) -> std::io::Result<()> {
        // Length-prefixed record: key_len | value_len | key | value.
        let key_len = key.len() as u32;
        let value_len = value.len() as u32;
        self.file.write_all(&key_len.to_le_bytes())?;
        self.file.write_all(&value_len.to_le_bytes())?;
        self.file.write_all(key)?;
        self.file.write_all(value)?;
        // CRITICAL: flush the userspace buffer, then fsync so the entry
        // is on disk before the caller is acknowledged.
        self.file.flush()?;
        self.file.get_ref().sync_all()?;
        Ok(())
    }
    /// Replays every complete record, invoking `callback(key, value)`.
    ///
    /// Fix: a crash in the middle of `append` can leave a truncated entry
    /// at the tail of the log. The old code only tolerated EOF at a record
    /// boundary and returned `UnexpectedEof` otherwise; recovery now stops
    /// cleanly at the last complete record. Dropping the torn tail is safe
    /// because that write was never acknowledged to the client.
    pub fn recover<F>(&self, mut callback: F) -> std::io::Result<()>
    where
        F: FnMut(Vec<u8>, Vec<u8>),
    {
        let file = File::open(&self.path)?;
        let mut reader = BufReader::new(file);
        while let Some((key, value)) = Self::read_entry(&mut reader)? {
            callback(key, value);
        }
        Ok(())
    }
    /// Reads one record; returns `Ok(None)` on clean EOF or a torn tail.
    fn read_entry<R: Read>(reader: &mut R) -> std::io::Result<Option<(Vec<u8>, Vec<u8>)>> {
        let mut len_buf = [0u8; 4];
        if reader.read_exact(&mut len_buf).is_err() {
            return Ok(None); // EOF at a record boundary
        }
        let key_len = u32::from_le_bytes(len_buf) as usize;
        let body = (|| -> std::io::Result<(Vec<u8>, Vec<u8>)> {
            reader.read_exact(&mut len_buf)?;
            let value_len = u32::from_le_bytes(len_buf) as usize;
            let mut key = vec![0u8; key_len];
            let mut value = vec![0u8; value_len];
            reader.read_exact(&mut key)?;
            reader.read_exact(&mut value)?;
            Ok((key, value))
        })();
        match body {
            Ok(entry) => Ok(Some(entry)),
            // Partial record: the process died mid-append; stop here.
            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
            Err(e) => Err(e),
        }
    }
}
3. B-Tree Node Serialization
B-Tree Structure
B-TREE (ORDER 4)
┌─────────────────────────────────────────────────────────────────┐
│ │
│ ┌───────────────┐ │
│ │ [30, 60] │ Root │
│ └───┬───┬───┬───┘ │
│ ┌──────┘ │ └──────┐ │
│ ▼ ▼ ▼ │
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
│ │[10, 20] │ │[40, 50] │ │[70, 80] │ Internal │
│ └┬──┬──┬──┘ └┬──┬──┬──┘ └┬──┬──┬──┘ │
│ │ │ │ │ │ │ │ │ │ │
│ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ │
│ ┌─┐┌─┐┌─┐ ┌─┐┌─┐┌─┐ ┌─┐┌─┐┌─┐ │
│ │5││15│25│ │35│45│55│ │65│75│85│ Leaves │
│ └─┘└─┘└─┘ └─┘└─┘└─┘ └─┘└─┘└─┘ │
│ │
│ Properties: │
│ • All leaves at same level │
│ • O(log n) search, insert, delete │
│ • High fanout = fewer disk reads │
│ │
└─────────────────────────────────────────────────────────────────┘
Disk Page Format
rust
/// Size of one on-disk page; matches the common filesystem block size so
/// one page corresponds to a single disk I/O.
const PAGE_SIZE: usize = 4096;
/// Maximum key slots per page: 16-byte header area + 255 * 8-byte slot
/// entries = 2056 bytes, which fits in a 4096-byte page.
const MAX_KEYS: usize = 255;
#[repr(C)]
pub struct BTreePage {
    header: PageHeader,
    keys: [KeyEntry; MAX_KEYS],
    // Values or child pointers follow
}
#[repr(C)]
pub struct PageHeader {
    page_type: u8, // 0 = internal, 1 = leaf
    num_keys: u16, // number of valid entries in `keys`
    parent_page: u32,
    next_leaf: u32, // For leaf pages (range scans)
    _padding: [u8; 7],
}
/// Slot-directory entry; offsets are relative to the start of the page.
// Fix: `Clone + Copy` are required — `deserialize` initializes the slot
// array with an array-repeat expression `[KeyEntry { .. }; MAX_KEYS]`,
// which only accepts `Copy` values; the original definition did not
// compile without these derives.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct KeyEntry {
    key_offset: u16, // Offset within page
    key_len: u16,
    value_offset: u16, // Or child page number
    value_len: u16,
}
impl BTreePage {
    /// Serializes the page into a fixed-size buffer using explicit
    /// little-endian encoding (stable across platforms, unlike dumping
    /// the raw `#[repr(C)]` struct bytes).
    pub fn serialize(&self) -> [u8; PAGE_SIZE] {
        let mut buffer = [0u8; PAGE_SIZE];
        // Header occupies bytes 0..11; bytes 11..16 are reserved.
        buffer[0] = self.header.page_type;
        buffer[1..3].copy_from_slice(&self.header.num_keys.to_le_bytes());
        buffer[3..7].copy_from_slice(&self.header.parent_page.to_le_bytes());
        buffer[7..11].copy_from_slice(&self.header.next_leaf.to_le_bytes());
        // Slot directory starts at byte 16, 8 bytes per entry.
        let num_keys = (self.header.num_keys as usize).min(MAX_KEYS);
        let mut offset = 16;
        for entry in &self.keys[..num_keys] {
            buffer[offset..offset + 2].copy_from_slice(&entry.key_offset.to_le_bytes());
            buffer[offset + 2..offset + 4].copy_from_slice(&entry.key_len.to_le_bytes());
            buffer[offset + 4..offset + 6].copy_from_slice(&entry.value_offset.to_le_bytes());
            buffer[offset + 6..offset + 8].copy_from_slice(&entry.value_len.to_le_bytes());
            offset += 8;
        }
        buffer
    }
    /// Reconstructs a page from a raw disk buffer.
    pub fn deserialize(buffer: &[u8; PAGE_SIZE]) -> Self {
        let header = PageHeader {
            page_type: buffer[0],
            num_keys: u16::from_le_bytes([buffer[1], buffer[2]]),
            parent_page: u32::from_le_bytes(buffer[3..7].try_into().unwrap()),
            next_leaf: u32::from_le_bytes(buffer[7..11].try_into().unwrap()),
            _padding: [0; 7],
        };
        let mut keys = [KeyEntry {
            key_offset: 0,
            key_len: 0,
            value_offset: 0,
            value_len: 0,
        }; MAX_KEYS];
        // Fix: clamp to MAX_KEYS so a corrupt `num_keys` read from disk
        // (u16 can hold up to 65535) cannot index past the 255-slot
        // array and panic.
        let num_keys = (header.num_keys as usize).min(MAX_KEYS);
        let mut offset = 16;
        for slot in keys.iter_mut().take(num_keys) {
            *slot = KeyEntry {
                key_offset: u16::from_le_bytes([buffer[offset], buffer[offset + 1]]),
                key_len: u16::from_le_bytes([buffer[offset + 2], buffer[offset + 3]]),
                value_offset: u16::from_le_bytes([buffer[offset + 4], buffer[offset + 5]]),
                value_len: u16::from_le_bytes([buffer[offset + 6], buffer[offset + 7]]),
            };
            offset += 8;
        }
        Self { header, keys }
    }
}
4. Comparison
| Feature | B-Tree | LSM Tree |
|---|---|---|
| Write | O(log n) | O(1) amortized |
| Read | O(log n) | O(log n) + merge |
| Space | In-place update | Append-only |
| Use case | OLTP, random reads | Write-heavy, time-series |
| Examples | PostgreSQL, MySQL | RocksDB, TiKV, Cassandra |
🎯 Key Takeaways
- LSM Trees tối ưu cho writes bằng cách convert random → sequential
- WAL đảm bảo durability với fsync
- B-Trees tối ưu cho reads với high fanout
- Page-based storage align với disk I/O
💡 LEARNING PATH
Học tiếp: RocksDB Wiki, Database Internals Book