Skip to content

OS Event Queues Expert

Below Tokio: Cách OS xử lý thousands of connections

1. C10K Problem

Mathematical Breakdown

Thread-per-connection model fails:

                    THREAD-PER-CONNECTION MODEL
┌─────────────────────────────────────────────────────────────────┐
│                                                                 │
│   10,000 connections = 10,000 threads                           │
│                                                                 │
│   Memory per thread:     8 MB (stack)                           │
│   Total memory:          80 GB ❌                               │
│                                                                 │
│   Context switch cost:   ~10 μs                                 │
│   Switches/second:       10,000 × 100 = 1,000,000               │
│   CPU overhead:          10 seconds/second ❌                   │
│                                                                 │
│   Conclusion: Thread-per-connection KHÔNG SCALE                 │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Non-blocking I/O solution:

                    EVENT-DRIVEN MODEL
┌─────────────────────────────────────────────────────────────────┐
│                                                                 │
│   10,000 connections, 1-N threads                               │
│                                                                 │
│   Memory:  N threads × 8MB + 10,000 × ~1KB = ~100 MB ✅         │
│                                                                 │
│   Workflow:                                                     │
│   1. Register all sockets với event queue                       │
│   2. epoll_wait() - Block cho đến khi có event                  │
│   3. Process ready sockets (O(ready), not O(total))             │
│   4. Repeat                                                     │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

2. epoll (Linux)

Architecture

                          EPOLL ARCHITECTURE
┌─────────────────────────────────────────────────────────────────┐
│                                                                 │
│   User Space                    Kernel Space                    │
│   ──────────                    ────────────                    │
│                                                                 │
│   ┌──────────┐                  ┌──────────────────────┐        │
│   │ App      │  epoll_create()  │    epoll instance    │        │
│   │          │─────────────────▶│  ┌────────────────┐  │        │
│   │          │                  │  │ Interest List  │  │        │
│   │          │  epoll_ctl()     │  │ ┌──┬──┬──┬──┐  │  │        │
│   │          │─────────────────▶│  │ │fd│fd│fd│fd│  │  │        │
│   │          │  ADD/MOD/DEL     │  │ └──┴──┴──┴──┘  │  │        │
│   │          │                  │  └────────────────┘  │        │
│   │          │                  │                      │        │
│   │          │  epoll_wait()    │  ┌────────────────┐  │        │
│   │          │◀─────────────────│  │  Ready List    │  │        │
│   │          │  returns ready   │  │  (ready fds)   │  │        │
│   └──────────┘                  │  └────────────────┘  │        │
│                                 └──────────────────────┘        │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Rust FFI với epoll

rust
use libc::{
    epoll_create1, epoll_ctl, epoll_wait,
    epoll_event, EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD,
    EPOLLIN, EPOLLOUT, EPOLLET, EPOLLONESHOT,
    c_int, close,
};
use std::io::{self, Error};
use std::os::unix::io::RawFd;

pub struct Epoll {
    fd: RawFd,
}

impl Epoll {
    /// Tạo epoll instance
    pub fn new() -> io::Result<Self> {
        let fd = unsafe { epoll_create1(0) };
        if fd < 0 {
            return Err(Error::last_os_error());
        }
        Ok(Self { fd })
    }
    
    /// Đăng ký file descriptor
    pub fn add(&self, fd: RawFd, events: u32, data: u64) -> io::Result<()> {
        let mut event = epoll_event {
            events,
            u64: data,
        };
        
        let result = unsafe {
            epoll_ctl(self.fd, EPOLL_CTL_ADD, fd, &mut event)
        };
        
        if result < 0 {
            return Err(Error::last_os_error());
        }
        Ok(())
    }
    
    /// Chờ events
    pub fn wait(&self, events: &mut [epoll_event], timeout_ms: i32) -> io::Result<usize> {
        let result = unsafe {
            epoll_wait(
                self.fd,
                events.as_mut_ptr(),
                events.len() as c_int,
                timeout_ms
            )
        };
        
        if result < 0 {
            return Err(Error::last_os_error());
        }
        Ok(result as usize)
    }
}

impl Drop for Epoll {
    fn drop(&mut self) {
        unsafe { close(self.fd); }
    }
}

Usage Example

rust
use std::net::TcpListener;
use std::os::unix::io::AsRawFd;

/// Single-threaded epoll-based accept loop on port 8080.
///
/// Token convention: user-data `0` means "the listener"; any other value
/// is the raw fd of a client connection.
fn epoll_server() -> io::Result<()> {
    let listener = TcpListener::bind("0.0.0.0:8080")?;
    // Non-blocking is mandatory for an event loop: accept()/read() must
    // never park the thread.
    listener.set_nonblocking(true)?;

    let epoll = Epoll::new()?;

    // Register listener
    epoll.add(
        listener.as_raw_fd(),
        (EPOLLIN | EPOLLET) as u32,  // Edge-triggered
        0  // data = 0 means listener
    )?;

    let mut events = vec![epoll_event { events: 0, u64: 0 }; 1024];

    loop {
        // Block until something is ready (-1 = no timeout).
        let n = epoll.wait(&mut events, -1)?;

        for event in &events[..n] {
            if event.u64 == 0 {
                // Listener ready. With edge-triggered mode we must drain
                // the accept queue until it reports WouldBlock, or pending
                // connections would be silently left behind.
                loop {
                    match listener.accept() {
                        Ok((stream, _addr)) => {
                            stream.set_nonblocking(true)?;
                            let fd = stream.as_raw_fd();
                            epoll.add(fd, EPOLLIN as u32, fd as u64)?;
                            // BUG FIX: dropping `stream` here closes the fd
                            // we just registered. A real server stores the
                            // stream in a map keyed by fd; for this demo we
                            // deliberately leak it so the fd stays open.
                            std::mem::forget(stream);
                        }
                        // Queue drained — go back to epoll_wait.
                        Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => break,
                        Err(e) => return Err(e),
                    }
                }
            } else {
                // Client ready - handle I/O
                let fd = event.u64 as RawFd;
                handle_client(fd);
            }
        }
    }
}

3. kqueue (BSD/macOS)

Differences from epoll

| Aspect        | epoll           | kqueue                      |
|---------------|-----------------|-----------------------------|
| Origin        | Linux           | BSD/macOS                   |
| Event types   | Read/Write only | Any: file, signal, timer... |
| API           | 3 syscalls      | 2 syscalls                  |
| Batch changes | One at a time   | Multiple in one call        |

Rust FFI

rust
#[cfg(any(target_os = "macos", target_os = "freebsd"))]
mod kqueue_impl {
    use libc::{kqueue, kevent, close, EV_ADD, EV_DELETE, EVFILT_READ};
    use std::io::{self, Error};
    use std::os::unix::io::RawFd;

    /// Thin safe wrapper around a BSD/macOS kqueue file descriptor.
    ///
    /// Owns the kqueue fd and closes it on drop.
    pub struct Kqueue {
        fd: RawFd,
    }

    impl Kqueue {
        /// Create a new kqueue instance.
        pub fn new() -> io::Result<Self> {
            // SAFETY: kqueue() takes no arguments.
            let fd = unsafe { kqueue() };
            if fd < 0 {
                return Err(Error::last_os_error());
            }
            Ok(Self { fd })
        }

        /// Register `fd` for read-readiness (EVFILT_READ) events.
        ///
        /// NOTE(review): this struct literal matches libc's macOS layout;
        /// newer FreeBSD ABIs add an `ext` field — confirm on that target.
        pub fn add_read(&self, fd: RawFd) -> io::Result<()> {
            let changes = [kevent {
                ident: fd as usize,
                filter: EVFILT_READ,
                flags: EV_ADD,
                fflags: 0,
                data: 0,
                udata: std::ptr::null_mut(),
            }];

            // SAFETY: `changes` is a valid array of 1 kevent; the event
            // list is empty (len 0), so null is acceptable there.
            let result = unsafe {
                kevent(self.fd, changes.as_ptr(), 1, std::ptr::null_mut(), 0, std::ptr::null())
            };

            if result < 0 {
                return Err(Error::last_os_error());
            }
            Ok(())
        }

        /// Unregister `fd` from read-readiness events.
        pub fn remove_read(&self, fd: RawFd) -> io::Result<()> {
            let changes = [kevent {
                ident: fd as usize,
                filter: EVFILT_READ,
                flags: EV_DELETE,
                fflags: 0,
                data: 0,
                udata: std::ptr::null_mut(),
            }];

            // SAFETY: same contract as in add_read.
            let result = unsafe {
                kevent(self.fd, changes.as_ptr(), 1, std::ptr::null_mut(), 0, std::ptr::null())
            };

            if result < 0 {
                return Err(Error::last_os_error());
            }
            Ok(())
        }
    }

    impl Drop for Kqueue {
        fn drop(&mut self) {
            // SAFETY: `self.fd` is a valid kqueue fd owned by this struct.
            // Without this Drop the kqueue fd would leak.
            unsafe { close(self.fd); }
        }
    }
}

4. IOCP (Windows)

Completion-based vs Readiness-based

                    READINESS vs COMPLETION
┌─────────────────────────────────────────────────────────────────┐
│                                                                 │
│   epoll/kqueue (Readiness-based):                               │
│   ─────────────────────────────────                             │
│   1. Wait for "socket is READY to read"                         │
│   2. You call read() yourself                                   │
│   3. Handle data                                                │
│                                                                 │
│   IOCP (Completion-based):                                      │
│   ───────────────────────────                                   │
│   1. Submit "read into this buffer"                             │
│   2. OS does the read                                           │
│   3. Wait for "read COMPLETED"                                  │
│   4. Handle data (already in buffer)                            │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Windows Rust Example

rust
#[cfg(windows)]
mod iocp_impl {
    use windows_sys::Win32::System::IO::{
        CreateIoCompletionPort, GetQueuedCompletionStatus,
        OVERLAPPED,
    };
    use std::io::{self, Error};
    
    /// Minimal wrapper around a Windows I/O Completion Port handle.
    ///
    /// NOTE(review): there is no Drop impl, so the port handle is never
    /// closed with CloseHandle — confirm whether that leak is acceptable
    /// for this example.
    pub struct Iocp {
        // Raw HANDLE stored as isize (windows-sys convention).
        handle: isize,
    }
    
    impl Iocp {
        /// Create a new, empty completion port not yet associated with
        /// any file handle.
        pub fn new() -> io::Result<Self> {
            let handle = unsafe {
                // -1 = INVALID_HANDLE_VALUE: create a fresh port instead
                // of associating an existing file handle. Concurrency
                // value 0 lets the OS use the number of processors.
                CreateIoCompletionPort(-1isize, 0, 0, 0)
            };
            // A null handle signals failure.
            if handle == 0 {
                return Err(Error::last_os_error());
            }
            Ok(Self { handle })
        }
        
        /// Block until one queued I/O operation completes and return its
        /// byte count and completion key.
        ///
        /// NOTE(review): GetQueuedCompletionStatus can return 0 while
        /// still dequeuing a *failed* operation (overlapped non-null);
        /// this code treats every 0 as a wait error — confirm that is
        /// the intended simplification for the example.
        pub fn wait(&self) -> io::Result<CompletionEvent> {
            let mut bytes = 0u32;
            let mut key = 0usize;
            let mut overlapped: *mut OVERLAPPED = std::ptr::null_mut();
            
            let result = unsafe {
                GetQueuedCompletionStatus(
                    self.handle,
                    &mut bytes,
                    &mut key,
                    &mut overlapped,
                    u32::MAX // Infinite timeout
                )
            };
            
            if result == 0 {
                return Err(Error::last_os_error());
            }
            
            Ok(CompletionEvent { bytes, key })
        }
    }
    
    /// One dequeued completion: how many bytes transferred and the key
    /// the handle was registered with.
    pub struct CompletionEvent {
        pub bytes: u32,
        pub key: usize,
    }
}

5. Cross-Platform Abstraction (mio)

rust
use mio::{Events, Interest, Poll, Token};
use mio::net::TcpListener;
use std::io;

const SERVER: Token = Token(0);

fn mio_server() -> io::Result<()> {
    let mut poll = Poll::new()?;
    let mut events = Events::with_capacity(1024);
    
    let addr = "0.0.0.0:8080".parse().unwrap();
    let mut server = TcpListener::bind(addr)?;
    
    // mio sử dụng epoll/kqueue/IOCP tùy platform
    poll.registry().register(
        &mut server,
        SERVER,
        Interest::READABLE
    )?;
    
    loop {
        poll.poll(&mut events, None)?;
        
        for event in events.iter() {
            match event.token() {
                SERVER => {
                    let (mut conn, addr) = server.accept()?;
                    println!("Connection from {}", addr);
                }
                _ => {}
            }
        }
    }
}

6. Performance Comparison

| Metric            | epoll      | kqueue     | IOCP       |
|-------------------|------------|------------|------------|
| Add connection    | O(1)       | O(1)       | O(1)       |
| Wait for events   | O(ready)   | O(ready)   | O(ready)   |
| Memory/connection | ~160 bytes | ~200 bytes | ~300 bytes |
| Max connections   | ~1M+       | ~1M+       | ~1M+       |

🎯 Best Practices

  1. Use Edge-Triggered (ET) — Ít syscalls hơn Level-Triggered
  2. Non-blocking sockets — Bắt buộc với event loops
  3. Batch operations — kqueue cho phép batch changes
  4. Use mio for cross-platform — Abstracts OS differences

💡 TOKIO INSIGHT

Tokio's runtime wraps mio, which wraps epoll/kqueue/IOCP. Hiểu layer này giúp debug performance issues.