Giao diện
OS Event Queues Expert
Below Tokio: Cách OS xử lý thousands of connections
1. C10K Problem
Mathematical Breakdown
Thread-per-connection model fails:
THREAD-PER-CONNECTION MODEL
┌─────────────────────────────────────────────────────────────────┐
│ │
│ 10,000 connections = 10,000 threads │
│ │
│ Memory per thread: 8 MB (stack) │
│ Total memory: 80 GB ❌ │
│ │
│ Context switch cost: ~10 μs │
│ Switches/second: 10,000 × 100 = 1,000,000 │
│ CPU overhead: 10 seconds/second ❌ │
│ │
│ Conclusion: Thread-per-connection KHÔNG SCALE │
│ │
└─────────────────────────────────────────────────────────────────┘
Non-blocking I/O solution:
EVENT-DRIVEN MODEL
┌─────────────────────────────────────────────────────────────────┐
│ │
│ 10,000 connections, 1-N threads │
│ │
│ Memory: N threads × 8MB + 10,000 × ~1KB = ~100 MB ✅ │
│ │
│ Workflow: │
│ 1. Register all sockets với event queue │
│ 2. epoll_wait() - Block cho đến khi có event │
│ 3. Process ready sockets (O(ready), not O(total)) │
│ 4. Repeat │
│ │
└─────────────────────────────────────────────────────────────────┘
2. epoll (Linux)
Architecture
EPOLL ARCHITECTURE
┌─────────────────────────────────────────────────────────────────┐
│ │
│ User Space Kernel Space │
│ ────────── ──────────── │
│ │
│ ┌──────────┐ ┌──────────────────────┐ │
│ │ App │ epoll_create() │ epoll instance │ │
│ │ │─────────────────▶│ ┌────────────────┐ │ │
│ │ │ │ │ Interest List │ │ │
│ │ │ epoll_ctl() │ │ ┌──┬──┬──┬──┐ │ │ │
│ │ │─────────────────▶│ │ │fd│fd│fd│fd│ │ │ │
│ │ │ ADD/MOD/DEL │ │ └──┴──┴──┴──┘ │ │ │
│ │ │ │ └────────────────┘ │ │
│ │ │ │ │ │
│ │ │ epoll_wait() │ ┌────────────────┐ │ │
│ │ │◀─────────────────│ │ Ready List │ │ │
│ │ │ returns ready │ │ (ready fds) │ │ │
│ └──────────┘ │ └────────────────┘ │ │
│ └──────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
Rust FFI với epoll
rust
use libc::{
epoll_create1, epoll_ctl, epoll_wait,
epoll_event, EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD,
EPOLLIN, EPOLLOUT, EPOLLET, EPOLLONESHOT,
c_int, close,
};
use std::io::{self, Error};
use std::os::unix::io::RawFd;
/// RAII wrapper around a Linux epoll instance.
pub struct Epoll {
    // Raw file descriptor of the epoll instance, owned exclusively by this struct.
    fd: RawFd,
}
impl Epoll {
    /// Creates a new epoll instance via `epoll_create1(0)`.
    ///
    /// # Errors
    /// Returns the OS error if the kernel cannot allocate an instance
    /// (e.g. the per-process fd limit is exhausted).
    pub fn new() -> io::Result<Self> {
        // SAFETY: epoll_create1 takes no pointers; flags = 0 is valid.
        let fd = unsafe { epoll_create1(0) };
        if fd < 0 {
            return Err(Error::last_os_error());
        }
        Ok(Self { fd })
    }
    /// Registers `fd` in the interest list with the given event mask.
    ///
    /// `data` is an opaque value the kernel hands back unchanged in
    /// `epoll_event.u64` when the fd becomes ready (commonly the fd itself
    /// or a token identifying the connection).
    pub fn add(&self, fd: RawFd, events: u32, data: u64) -> io::Result<()> {
        self.ctl(EPOLL_CTL_ADD, fd, events, data)
    }
    /// Changes the event mask / user data of an already-registered `fd`.
    pub fn modify(&self, fd: RawFd, events: u32, data: u64) -> io::Result<()> {
        self.ctl(EPOLL_CTL_MOD, fd, events, data)
    }
    /// Removes `fd` from the interest list.
    pub fn delete(&self, fd: RawFd) -> io::Result<()> {
        self.ctl(EPOLL_CTL_DEL, fd, 0, 0)
    }
    /// Shared `epoll_ctl` wrapper for ADD / MOD / DEL.
    fn ctl(&self, op: c_int, fd: RawFd, events: u32, data: u64) -> io::Result<()> {
        let mut event = epoll_event { events, u64: data };
        // SAFETY: `event` is a valid, initialized epoll_event. For DEL the
        // kernel ignores the event argument on Linux >= 2.6.9, but passing a
        // non-null pointer keeps older kernels happy.
        let result = unsafe { epoll_ctl(self.fd, op, fd, &mut event) };
        if result < 0 {
            return Err(Error::last_os_error());
        }
        Ok(())
    }
    /// Blocks until at least one registered fd is ready or `timeout_ms`
    /// elapses (`-1` = wait forever). Returns how many entries at the front
    /// of `events` were filled in by the kernel.
    pub fn wait(&self, events: &mut [epoll_event], timeout_ms: i32) -> io::Result<usize> {
        // SAFETY: the pointer/length pair comes from a live mutable slice,
        // so the kernel writes only into memory we own.
        let result = unsafe {
            epoll_wait(
                self.fd,
                events.as_mut_ptr(),
                events.len() as c_int,
                timeout_ms,
            )
        };
        if result < 0 {
            return Err(Error::last_os_error());
        }
        Ok(result as usize)
    }
}
impl Drop for Epoll {
    fn drop(&mut self) {
        // SAFETY: `self.fd` is a valid epoll fd owned exclusively by this
        // struct. Errors from close() cannot be surfaced from Drop and are
        // deliberately ignored.
        unsafe { close(self.fd); }
    }
}
Usage Example
rust
use std::net::TcpListener;
use std::os::unix::io::AsRawFd;
/// Minimal single-threaded epoll echo-style server skeleton.
///
/// Registers the listener edge-triggered and each accepted client
/// level-triggered, dispatching on the `u64` user-data token.
fn epoll_server() -> io::Result<()> {
    // Needed to take ownership of accepted fds without closing them on drop.
    use std::os::unix::io::IntoRawFd;

    let listener = TcpListener::bind("0.0.0.0:8080")?;
    // Non-blocking is mandatory: with edge-triggered epoll, accept() must be
    // drained until it would block instead of blocking the event loop.
    listener.set_nonblocking(true)?;
    let epoll = Epoll::new()?;
    // Register the listener edge-triggered; user data 0 marks "listener".
    // (Safe as a sentinel because fd 0 / stdin is already open, so no
    // accepted client fd can collide with it.)
    epoll.add(
        listener.as_raw_fd(),
        (EPOLLIN | EPOLLET) as u32,
        0,
    )?;
    let mut events = vec![epoll_event { events: 0, u64: 0 }; 1024];
    loop {
        // Block until at least one fd is ready (-1 = no timeout).
        let n = epoll.wait(&mut events, -1)?;
        for event in &events[..n] {
            if event.u64 == 0 {
                // Listener ready: drain the accept queue completely —
                // edge-triggered means there is no further notification
                // for connections that were already queued.
                loop {
                    match listener.accept() {
                        Ok((stream, _addr)) => {
                            stream.set_nonblocking(true)?;
                            // into_raw_fd() transfers ownership of the fd:
                            // the original `as_raw_fd()` + drop closed the
                            // socket immediately after registering it.
                            let fd = stream.into_raw_fd();
                            // Level-triggered read interest; the fd doubles
                            // as the user-data token.
                            epoll.add(fd, EPOLLIN as u32, fd as u64)?;
                        }
                        // Queue fully drained — back to epoll_wait().
                        Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => break,
                        // Real error (EMFILE, ECONNABORTED, ...): propagate
                        // instead of silently swallowing it.
                        Err(e) => return Err(e),
                    }
                }
            } else {
                // Client fd ready for I/O.
                let fd = event.u64 as RawFd;
                handle_client(fd);
            }
        }
    }
}
3. kqueue (BSD/macOS)
Differences from epoll
| Aspect | epoll | kqueue |
|---|---|---|
| Origin | Linux | BSD/macOS |
| Event types | Read/Write only | Any: file, signal, timer... |
| API | 3 syscalls | 2 syscalls |
| Batch changes | One at a time | Multiple in one call |
Rust FFI
rust
#[cfg(any(target_os = "macos", target_os = "freebsd"))]
mod kqueue_impl {
    use libc::{kqueue, kevent, close, EV_ADD, EV_DELETE, EVFILT_READ};
    use std::io::{self, Error};
    use std::os::unix::io::RawFd;

    /// RAII wrapper around a kqueue descriptor — the BSD/macOS analogue of
    /// the `Epoll` wrapper above.
    pub struct Kqueue {
        fd: RawFd,
    }

    impl Kqueue {
        /// Creates a new kqueue instance; returns the OS error on failure.
        pub fn new() -> io::Result<Self> {
            // SAFETY: kqueue() takes no arguments and only returns an fd.
            let fd = unsafe { kqueue() };
            if fd < 0 {
                return Err(Error::last_os_error());
            }
            Ok(Self { fd })
        }

        /// Registers `fd` for read-readiness events (EVFILT_READ / EV_ADD).
        pub fn add_read(&self, fd: RawFd) -> io::Result<()> {
            self.apply_read(fd, EV_ADD)
        }

        /// Unregisters `fd` from read-readiness events (EVFILT_READ / EV_DELETE).
        /// Added for symmetry: EV_DELETE was imported but previously unused.
        pub fn delete_read(&self, fd: RawFd) -> io::Result<()> {
            self.apply_read(fd, EV_DELETE)
        }

        /// Submits a single EVFILT_READ change-list entry for `fd`. Unlike
        /// epoll_ctl, kevent() can batch many changes per call; this helper
        /// submits just one for simplicity.
        fn apply_read(&self, fd: RawFd, flags: u16) -> io::Result<()> {
            let changes = [kevent {
                ident: fd as usize,
                filter: EVFILT_READ,
                flags,
                fflags: 0,
                data: 0,
                udata: std::ptr::null_mut(),
            }];
            // SAFETY: `changes` points at one valid kevent; the event list is
            // null with nevents = 0, so the kernel writes nothing back, and a
            // null timeout is permitted when no events are requested.
            let result = unsafe {
                kevent(self.fd, changes.as_ptr(), 1, std::ptr::null_mut(), 0, std::ptr::null())
            };
            if result < 0 {
                return Err(Error::last_os_error());
            }
            Ok(())
        }
    }

    impl Drop for Kqueue {
        fn drop(&mut self) {
            // Fix: the original leaked the kqueue fd (`close` was imported
            // but never called). SAFETY: self.fd is owned exclusively by us.
            unsafe { close(self.fd); }
        }
    }
}
4. IOCP (Windows)
Completion-based vs Readiness-based
READINESS vs COMPLETION
┌─────────────────────────────────────────────────────────────────┐
│ │
│ epoll/kqueue (Readiness-based): │
│ ───────────────────────────────── │
│ 1. Wait for "socket is READY to read" │
│ 2. You call read() yourself │
│ 3. Handle data │
│ │
│ IOCP (Completion-based): │
│ ─────────────────────────── │
│ 1. Submit "read into this buffer" │
│ 2. OS does the read │
│ 3. Wait for "read COMPLETED" │
│ 4. Handle data (already in buffer) │
│ │
└─────────────────────────────────────────────────────────────────┘
Windows Rust Example
rust
#[cfg(windows)]
mod iocp_impl {
    use windows_sys::Win32::Foundation::CloseHandle;
    use windows_sys::Win32::System::IO::{
        CreateIoCompletionPort, GetQueuedCompletionStatus,
        OVERLAPPED,
    };
    use std::io::{self, Error};

    /// RAII wrapper around an I/O completion port.
    /// NOTE(review): `isize` matches windows-sys <= 0.52 where HANDLE is an
    /// alias for isize; newer releases use `*mut c_void` — confirm the crate
    /// version before upgrading.
    pub struct Iocp {
        handle: isize,
    }

    impl Iocp {
        /// Creates a fresh, empty completion port.
        pub fn new() -> io::Result<Self> {
            // -1 == INVALID_HANDLE_VALUE: associate no file yet, just create
            // a new port. A NULL (0) return signals failure.
            let handle = unsafe {
                CreateIoCompletionPort(-1isize, 0, 0, 0)
            };
            if handle == 0 {
                return Err(Error::last_os_error());
            }
            Ok(Self { handle })
        }

        /// Blocks until one queued I/O operation completes, returning its
        /// transferred byte count and completion key.
        pub fn wait(&self) -> io::Result<CompletionEvent> {
            let mut bytes = 0u32;
            let mut key = 0usize;
            let mut overlapped: *mut OVERLAPPED = std::ptr::null_mut();
            // SAFETY: all three out-pointers reference live locals that
            // outlive the call.
            let result = unsafe {
                GetQueuedCompletionStatus(
                    self.handle,
                    &mut bytes,
                    &mut key,
                    &mut overlapped,
                    u32::MAX, // INFINITE timeout
                )
            };
            if result == 0 {
                return Err(Error::last_os_error());
            }
            Ok(CompletionEvent { bytes, key })
        }
    }

    impl Drop for Iocp {
        fn drop(&mut self) {
            // Fix: the original leaked the port handle (no CloseHandle).
            // SAFETY: `handle` is a valid completion-port handle owned
            // exclusively by this value.
            unsafe { CloseHandle(self.handle); }
        }
    }

    /// Result of one dequeued completion packet.
    pub struct CompletionEvent {
        /// Bytes transferred by the completed operation.
        pub bytes: u32,
        /// Completion key supplied when the handle was associated with the port.
        pub key: usize,
    }
}
5. Cross-Platform Abstraction (mio)
rust
use mio::{Events, Interest, Poll, Token};
use mio::net::TcpListener;
use std::io;
const SERVER: Token = Token(0);
/// Cross-platform accept loop using mio.
///
/// `Poll` is the portable front end: epoll on Linux, kqueue on BSD/macOS,
/// and IOCP (behind an internal readiness adapter) on Windows.
fn mio_server() -> io::Result<()> {
    let mut poll = Poll::new()?;
    let mut events = Events::with_capacity(1024);
    let addr = "0.0.0.0:8080".parse().unwrap();
    let mut server = TcpListener::bind(addr)?;
    // mio sockets are non-blocking and edge-triggered by design.
    poll.registry().register(
        &mut server,
        SERVER,
        Interest::READABLE,
    )?;
    loop {
        // Block until at least one registered source is ready.
        poll.poll(&mut events, None)?;
        for event in events.iter() {
            match event.token() {
                SERVER => {
                    // Edge-triggered: drain the accept queue completely.
                    // The original single `accept()?` both missed queued
                    // connections and crashed the server with a WouldBlock
                    // error once the queue was empty.
                    loop {
                        match server.accept() {
                            Ok((_conn, addr)) => {
                                println!("Connection from {}", addr);
                            }
                            // Queue drained — wait for the next readiness edge.
                            Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => break,
                            // Genuine accept failure: propagate.
                            Err(e) => return Err(e),
                        }
                    }
                }
                _ => {}
            }
        }
    }
}
6. Performance Comparison
| Metric | epoll | kqueue | IOCP |
|---|---|---|---|
| Add connection | O(1) | O(1) | O(1) |
| Wait for events | O(ready) | O(ready) | O(ready) |
| Memory/connection | ~160 bytes | ~200 bytes | ~300 bytes |
| Max connections | ~1M+ | ~1M+ | ~1M+ |
🎯 Best Practices
- Use Edge-Triggered (ET) — Ít syscalls hơn Level-Triggered
- Non-blocking sockets — Bắt buộc với event loops
- Batch operations — kqueue cho phép batch changes
- Use mio for cross-platform — Abstracts OS differences
💡 TOKIO INSIGHT
Tokio's runtime wraps mio, which wraps epoll/kqueue/IOCP. Hiểu layer này giúp debug performance issues.