dio provides production-grade, bare-metal I/O primitives for Go programs
that need to bypass the Linux page cache: page-aligned memory pooling,
io_uring/pread/pwrite scheduling, and OS-level file-descriptor wrappers
that correctly manage GC lifetime across raw syscalls.
Extracted from an internal storage engine and battle-tested on NVMe hardware.
go get github.com/miretskiy/dio
Requires Go 1.25+. Fully functional on Linux and Darwin; graceful no-ops on other POSIX platforms.
| Package | Purpose |
|---|---|
| dio/align | Page-aligned memory allocation and O_DIRECT arithmetic |
| dio/mempool | Fixed-capacity pool of pre-warmed mmap buffers |
| dio/sys | OS file-descriptor primitives: Fallocate, Fdatasync, PunchHole, … |
| dio/iosched | Pluggable I/O scheduler: synchronous pread/pwrite or async io_uring |
Alignment helpers for O_DIRECT I/O. All O_DIRECT operations on Linux require the buffer address, transfer size, and file offset to be multiples of the filesystem block size (4 KiB on most systems).
import "github.com/miretskiy/dio/align"
// Round a size up to the next 4 KiB boundary.
aligned := align.PageAlign(4097) // → 8192
// Compute aligned parameters for a byte range — useful before a pread.
off, length := align.AlignRange(offset, int(size))
buf := align.AllocAligned(int(length))
defer align.FreeAligned(buf)
n, err := f.ReadAt(buf, off)
data := buf[offset-off : offset-off+size] // trim lead padding
// Check whether a slice is page-aligned (required before O_DIRECT write).
if !align.IsAligned(buf) {
return errors.New("buffer not page-aligned")
}
// Grow a reusable aligned buffer without reallocating if already large enough.
buf = align.GrowAligned(buf, newSize) // old buf freed if too small

AllocAligned uses mmap(MAP_ANON) — guaranteed page-aligned, pre-warmed
to commit physical RAM. Not managed by the GC: you must call FreeAligned
when done.
buf := align.AllocAligned(1 << 20) // 1 MiB, page-aligned
defer align.FreeAligned(buf)
// buf is safe to pass to O_DIRECT read/write

A fixed-capacity pool of pre-allocated, page-aligned mmap buffers. All memory is committed at construction time — the hot path never touches the kernel allocator.
import "github.com/miretskiy/dio/mempool"
// 32 × 1 MiB slabs, pre-allocated and pre-warmed.
pool := mempool.NewMmapPool("writes", 1<<20, 32)
defer pool.Close()
// Acquire blocks until a buffer is available (token-bucket backpressure).
buf := pool.Acquire()
// Write data into it, then submit to the I/O path.
copy(buf.Bytes(), payload)
submitWrite(buf.AlignedBytes(int64(len(payload)))) // page-aligned prefix
// Release when done — returns the raw memory to the pool.
buf.Unpin()

buf, ok := pool.TryAcquire()
if !ok {
// Pool is currently empty; handle backpressure explicitly.
metrics.PoolExhausted.Inc()
buf = pool.Acquire() // block
}
defer buf.Unpin()

The pool uses reference counting so multiple goroutines can read from the same buffer simultaneously.
buf := pool.Acquire() // refCount = 1
for _, reader := range readers {
if !buf.TryInc() { // refCount++; returns false if already released
break
}
go func(b *mempool.MmapBuffer) {
defer b.Unpin()
process(b.Bytes())
}(buf)
}
buf.Unpin() // writer releases its pin; last reader will return to pool

ABA safety: each Acquire wraps the raw memory in a fresh *MmapBuffer
struct, so stale pointers held by racing readers remain distinct even when
the underlying memory is reused.
For allocations that exceed the pool's slab size:
buf := mempool.NewMmapBuffer(int64(largeSize))
defer buf.Unpin() // Munmaps directly — no pool involved

// In init() or TestMain — panics on double-release or other misuse.
mempool.SetPanicOnMisuse(true)

OS-level file-descriptor primitives. Every function that issues a raw
syscall accepts *os.File and calls runtime.KeepAlive after the syscall
to prevent the GC from finalizing the file while the kernel holds its fd.
import "github.com/miretskiy/dio/sys"
// Open with O_DIRECT | O_DSYNC — bypasses page cache, syncs data on write.
f, err := sys.CreateDirect(path, sys.FlDirectIO|sys.FlDSync)
if err != nil {
return err
}
defer f.Close()
// Pre-allocate contiguous disk space to reduce fragmentation.
if err := sys.Fallocate(f, 512<<20 /* 512 MiB */); err != nil {
return err
}

buf := align.AllocAligned(blockSize)
defer align.FreeAligned(buf)
fillBuffer(buf)
if _, err := f.Write(buf); err != nil {
return err
}
// On Darwin, O_DSYNC is unavailable at open time — explicit sync required.
if err := sys.SyncFile(f, sys.FlDSync); err != nil {
return err
}

// Reclaim disk space occupied by a deleted record.
// The range is aligned inward to block boundaries so adjacent data is safe.
reclaimed, err := sys.PunchHole(f, recordOffset, recordSize)
if err != nil {
return err
}
log.Printf("reclaimed %d bytes", reclaimed)

// Tell the kernel we will read sequentially — enables aggressive read-ahead.
sys.Fadvise(f, 0, fileSize, sys.FadvSequential)
// After reading, release the pages from the page cache.
sys.Fadvise(f, 0, fileSize, sys.FadvDontNeed)

if sys.RequiresAlignment {
// Linux O_DIRECT: buffer address, size, and offset must all be 4 KiB-aligned.
}
if sys.UseFadvise {
// Linux: posix_fadvise is effective.
// Darwin: F_RDAHEAD is coarser; this is false.
}

Pluggable positioned I/O scheduler. The interface is identical on all platforms; the implementation is swapped at construction time.
import "github.com/miretskiy/dio/iosched"
// Pick the best available backend automatically:
// Linux + io_uring kernel support -> URingScheduler
// Otherwise -> POSIXScheduler
sched := iosched.NewDefaultScheduler()
defer sched.Close()
f, _ := os.Open(path)
defer f.Close()
buf := make([]byte, blockSize)
ticket, err := sched.Submit(iosched.ReadOp(f, buf, offset))
if err != nil {
return err
}
defer ticket.Release()
ticket.Wait()
if err := ticket.Error(); err != nil {
return err
}

Synchronous Scheduler implementation for callers that want the same
Submit/Ticket API without io_uring. Each Submit performs the POSIX
operation before returning, so Ticket.Wait is immediate.
sched := iosched.NewPOSIXScheduler()
defer sched.Close()

Asynchronous io_uring scheduler. A single coordinator goroutine owns the ring.
Submitters publish Op values through an intrusive lock-free MPSC stack and a
buffered doorbell; the coordinator fills free SQ slots and reaps CQEs.
if !iosched.IOUringAvailable {
log.Println("io_uring not available, falling back to pwrite")
}
sched, err := iosched.NewURingScheduler(iosched.URingConfig{
RingDepth: 256, // SQ/CQ entries, must be power of two
SQPOLL: false, // true burns one CPU core for kernel-side polling
})
if err != nil {
return err
}
defer sched.Close()
ticket, err := sched.Submit(iosched.ReadOp(f, buf, offset))
if err != nil {
return err
}
defer ticket.Release()
ticket.Wait()
if err := ticket.Error(); err != nil {
return err
}
n := ticket.Op.Result.N

Submit does not implement application-level backpressure. It is a
non-blocking handoff into the scheduler; callers that need an in-flight cap
should wrap the scheduler with their own semaphore, token bucket, queue, or
retry policy.
var wg sync.WaitGroup
for _, req := range requests {
wg.Add(1)
go func(r Request) {
defer wg.Done()
buf := make([]byte, r.Size)
n, err := bio.ReadAt(f, buf, r.Offset)
r.Done(buf[:n], err)
}(req)
}
wg.Wait()
// URingScheduler coalesces ready submissions into fewer io_uring_enter syscalls.

stats := sched.Stats()
if stats.Syscalls != 0 {
fmt.Printf("ops/syscall=%.1f\n",
float64(stats.OpsPlaced)/float64(stats.Syscalls))
}

Every sys function that extracts a raw fd from *os.File and passes it
to a kernel syscall follows this pattern:
fd := int(f.Fd()) // extract once before any loop
var err error
for {
err = unix.Fdatasync(fd)
if err != unix.EINTR { break }
}
runtime.KeepAlive(f) // single KeepAlive after the loop
return err

f.Fd() returns a uintptr. Once extracted, the Go runtime no longer
considers the integer a reference to f, so the GC could finalize f
(closing the fd) while the kernel is blocked in the syscall. KeepAlive
extends f's lifetime to cover the entire loop.
# Unit tests (Darwin/Linux)
go test ./...
# Race detector
go test -race ./...
# Linux-only tests (hole punch, io_uring) — run on an NVMe volume
TMPDIR=/instance_storage go test ./... -v

Apache 2.0 — see LICENSE.