Good encrypted swap
I’m presently trying to build a Linux-based hypervisor that does some mild oversubscription of its RAM. I also want it to be unreasonably secure. In combination, this means that I want my swap partition to be both encrypted and authenticated: the typical aes-xts-plain algorithm used by LUKS is insufficient, because it straightforwardly allows an attacker (or just a buggy SSD) to corrupt pages at will. If this goes un-mitigated, this means that the system can enter any number of undefined states. I’d prefer the two binary states:
- the program continued doing what it was supposed to do
- the program crashed (SIGBUS) with a clear error message
This is a good use—one of the few good uses—for dm-integrity. But dm-integrity
strongly encourages you to do an initial format of the device by initializing
tags, which is an O(n) operation. And given that I want to 1) rotate keys on
every bootup and 2) boot quickly, I went looking for a way to avoid that setup
cost. It turns out the answer is straightforward: you can just pass the
--integrity-no-wipe option to cryptsetup luksFormat. We also pass --integrity-no-journal to
remove the performance-degrading journal: it’s only needed for crash recovery,
and we’d be wiping the device and starting over on crashes anyway.
The result works great, with the caveat that udev sees the new device and calls
blkid to probe it; those reads hit uninitialized integrity tags, so the kernel logs angry-looking messages to dmesg like
device-mapper: crypt: dm-0: INTEGRITY AEAD ERROR, sector 8.
#!/bin/bash
# Set up authenticated encrypted swap using LUKS2 with AEAD.
#
# Usage: <script> <block-device>
#
# Formats <block-device> as a LUKS2 volume (aegis128-random, AEAD integrity),
# opens it as /dev/mapper/swap_encrypted and writes a swap header onto it.
# WARNING: luksFormat runs in batch mode, so existing data on the device is
# destroyed without confirmation.
set -euo pipefail

if [ $# -ne 1 ]; then
    echo "usage: ${0##*/} <block-device>" >&2
    exit 1
fi

DEVICE=$1
DM_NAME=swap_encrypted
KEY_SIZE=16 # aegis128: 128-bit key

# Throwaway key material: a fresh volume key and passphrase are generated on
# every run and the files holding them are removed when the script exits.
KEY_FILE=$(mktemp)
PASS_FILE=$(mktemp)
trap 'rm -f "$KEY_FILE" "$PASS_FILE"' EXIT
head -c "$KEY_SIZE" /dev/urandom > "$KEY_FILE"
head -c 32 /dev/urandom > "$PASS_FILE"

# --integrity-no-wipe skips the O(device-size) initialization of integrity tags;
# --integrity-no-journal drops the journal, which is only useful for crash
# recovery (the device is reformatted from scratch on every run anyway).
# NOTE(review): the cheap PBKDF setting presumably relies on the passphrase
# being 32 random bytes rather than a human-chosen secret -- confirm.
cryptsetup luksFormat \
    --type luks2 \
    --cipher aegis128-random \
    --key-size $((KEY_SIZE * 8)) \
    --integrity aead \
    --sector-size 4096 \
    --integrity-no-wipe \
    --integrity-no-journal \
    --volume-key-file "$KEY_FILE" \
    --key-file "$PASS_FILE" \
    --pbkdf pbkdf2 --pbkdf-force-iterations 1000 \
    --batch-mode \
    "$DEVICE"

cryptsetup open \
    --type luks2 \
    --integrity-no-journal \
    --key-file "$PASS_FILE" \
    "$DEVICE" "$DM_NAME"

# Write a minimal swap header directly. mkswap reads the device first to check
# for existing signatures, which fails because AEAD tags are uninitialized
# (--integrity-no-wipe). Writing page 0 with pwrite avoids any reads.
#
# The program is fed through a quoted heredoc so the shell never rewrites the
# Python source; the device path is handed over as an ordinary argument.
python3 - "/dev/mapper/$DM_NAME" <<'PY'
import os, struct, sys

path = sys.argv[1]
with open(path, 'r+b') as dev:
    size = dev.seek(0, 2)
    last_page = size // 4096 - 1
    hdr = bytearray(4096)
    # version = 1, last_page, nr_badpages = 0 at byte offset 1024
    struct.pack_into('<III', hdr, 1024, 1, last_page, 0)
    hdr[1036:1052] = os.urandom(16)  # random UUID
    hdr[-10:] = b'SWAPSPACE2'        # signature occupies the last 10 bytes of the page
    os.pwrite(dev.fileno(), hdr, 0)
PY
Epilogue: why did I say that dm-integrity had few good uses?
I think it’s a perfectly nice piece of software, I’m glad the authors wrote it, and I really like that they built a journaling mode for doing AEAD on block devices without encountering a “write hole”. My two complaints are:
- it requires O(n) initialization of the device
- it doesn’t do key rotation well
I think that these complaints are not about dm-integrity the software, but rather about the entire solution-space it inhabits. I’m basically criticizing a banana for not being a mango.
My preferred approach would be to address integrity concerns, even cryptographic ones, at the filesystem level. The filesystem already knows which ranges of the disk are uninitialized, and can use that knowledge to obviate any O(n) setup steps. A filesystem can also straightforwardly rotate keys at frequent intervals (e.g. a new data key for each committed transaction), rather than the very static approach that LUKS takes. More on that in another post, hopefully.
Epilogue 2: but what if I really hate the errors from blkid scanning?
In this age of AI coding assistants, if you tell Claude “I really hate the blkid scanning”, then you’ll end up with a script that uses libdevmapper to create the device, passing all of the flags that tell udev to ignore this device.
The other option would be a name-based heuristic in a udev rule.
#!/usr/bin/python3
"""Set up authenticated encrypted swap using dm-integrity + dm-crypt.
The device stack is:
sys.argv[1]
-> dm-integrity (tag storage for AEAD, mode D / no journal / no wipe)
-> dm-crypt with AEGIS128-random (AEAD: combined encryption + authentication)
-> write swap header + swapon
Constructs the device-mapper stack directly (no LUKS) via libdevmapper, so we can set udev cookie
flags at device creation time. A fresh random key is generated every boot and exists only in
kernel memory after the dm-crypt device is created.
"""
import ctypes
import ctypes.util
import os
import struct
import sys
# -- Configuration --
# Name of the final dm-crypt device; it appears as /dev/mapper/<DM_NAME>. The underlying
# dm-integrity device is named "<DM_NAME>_integrity".
DM_NAME = "swap_encrypted"
# Size in bytes of the swap header page written by _write_swap_header.
PAGE_SIZE = 4096
# Passed as block_size: to dm-integrity and as sector_size: to dm-crypt.
BLOCK_SIZE = 4096
KEY_SIZE = 16 # aegis128: 128-bit key
# Per-block metadata size handed to both dm-integrity and dm-crypt (integrity:<TAG_SIZE>:aead).
TAG_SIZE = 32 # aegis128: 16-byte nonce + 16-byte auth tag
# -- libdevmapper ctypes bindings --
# The library is loaded at import time; if find_library cannot locate it, a hard-coded soname
# is tried instead.
_libdm_path = ctypes.util.find_library("devmapper") or "libdevmapper.so.1.02.1"
_libdm = ctypes.CDLL(_libdm_path)
# Task type codes passed to dm_task_create.
# NOTE(review): presumably mirror the task-type enum in libdevmapper.h -- verify against the
# installed header.
DM_DEVICE_CREATE = 0
DM_DEVICE_REMOVE = 2
# udev flag bits passed as the 16-bit flags argument of dm_task_set_cookie.
# NOTE(review): presumably mirror the DM_UDEV_* definitions in libdevmapper.h -- verify.
DM_UDEV_DISABLE_DM_RULES_FLAG = 0x0001
DM_UDEV_DISABLE_SUBSYSTEM_RULES_FLAG = 0x0002
DM_UDEV_DISABLE_DISK_RULES_FLAG = 0x0004
DM_UDEV_DISABLE_OTHER_RULES_FLAG = 0x0008
# All four flags OR-ed together; used for the throwaway device created by _format_integrity.
_UDEV_SUPPRESS_ALL = (
    DM_UDEV_DISABLE_DM_RULES_FLAG
    | DM_UDEV_DISABLE_SUBSYSTEM_RULES_FLAG
    | DM_UDEV_DISABLE_DISK_RULES_FLAG
    | DM_UDEV_DISABLE_OTHER_RULES_FLAG
)
# Explicit prototypes: the task handle is declared as c_void_p in both directions so ctypes
# does not fall back to its default C int return type for the pointer.
_libdm.dm_task_create.argtypes = [ctypes.c_int]
_libdm.dm_task_create.restype = ctypes.c_void_p
_libdm.dm_task_destroy.argtypes = [ctypes.c_void_p]
_libdm.dm_task_destroy.restype = None
_libdm.dm_task_set_name.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
_libdm.dm_task_set_name.restype = ctypes.c_int
# (task, start, size, target type, target params)
_libdm.dm_task_add_target.argtypes = [
    ctypes.c_void_p,
    ctypes.c_uint64,
    ctypes.c_uint64,
    ctypes.c_char_p,
    ctypes.c_char_p,
]
_libdm.dm_task_add_target.restype = ctypes.c_int
# (task, pointer to the cookie to fill in, udev flags)
_libdm.dm_task_set_cookie.argtypes = [
    ctypes.c_void_p,
    ctypes.POINTER(ctypes.c_uint32),
    ctypes.c_uint16,
]
_libdm.dm_task_set_cookie.restype = ctypes.c_int
_libdm.dm_task_run.argtypes = [ctypes.c_void_p]
_libdm.dm_task_run.restype = ctypes.c_int
# Takes the cookie by value (not by pointer).
_libdm.dm_udev_wait.argtypes = [ctypes.c_uint32]
_libdm.dm_udev_wait.restype = ctypes.c_int
# -- libc swapon --
# Calling swapon(2) directly avoids the userspace swapon binary, which reads the device (triggering
# page cache readahead) before issuing the syscall. Those readahead pages hit sectors with
# uninitialized AEAD integrity tags, producing spurious INTEGRITY AEAD ERROR messages. The kernel's
# sys_swapon reads only page 0 via read_cache_folio (no readahead), so calling the syscall directly
# avoids the problem entirely.
#
# use_errno=True makes ctypes keep a private copy of errno for calls through this handle, which
# start() reads back with ctypes.get_errno() when swapon fails.
_libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)
libc_swapon = _libc.swapon
# Arguments: path to the swap device (bytes) and an integer flags word.
libc_swapon.argtypes = [ctypes.c_char_p, ctypes.c_int]
libc_swapon.restype = ctypes.c_int
def _dm_create(name: str, size: int, target: str, params: str, udev_flags: int = 0) -> None:
    """Create a device-mapper device with a single-target table and udev cookie flags.

    Args:
        name: device-mapper name of the device to create.
        size: length of the target in sectors; the target always starts at sector 0.
        target: device-mapper target type (callers in this file use "integrity" and "crypt").
        params: target parameter string, passed to the kernel as-is.
        udev_flags: DM_UDEV_* flag bits attached to the udev cookie for this creation event.

    Raises:
        RuntimeError: if any libdevmapper call reports failure, including the final udev wait.
    """
    task = _libdm.dm_task_create(DM_DEVICE_CREATE)
    if not task:
        raise RuntimeError("dm_task_create failed")
    cookie = ctypes.c_uint32(0)
    try:
        if not _libdm.dm_task_set_name(task, name.encode()):
            raise RuntimeError(f"dm_task_set_name failed: {name}")
        if not _libdm.dm_task_add_target(task, 0, size, target.encode(), params.encode()):
            raise RuntimeError(f"dm_task_add_target failed: {target}")
        if not _libdm.dm_task_set_cookie(task, ctypes.byref(cookie), udev_flags):
            raise RuntimeError("dm_task_set_cookie failed")
        if not _libdm.dm_task_run(task):
            raise RuntimeError(f"dm_task_run failed for {name}")
    finally:
        _libdm.dm_task_destroy(task)
        # Always wait on the cookie, even when a step above failed: once the cookie has been
        # set, waiting is what releases it. The cookie is still 0 if it was never set.
        udev_ok = _libdm.dm_udev_wait(cookie.value)
    # Only reached when every step above succeeded; a failed wait means udev processing of the
    # new device did not complete, so report it instead of silently continuing.
    if not udev_ok:
        raise RuntimeError(f"dm_udev_wait failed for {name}")
def _dm_remove(name: str) -> None:
    """Remove a device-mapper device and wait for udev to finish processing the removal.

    Args:
        name: device-mapper name of the device to remove.

    Raises:
        RuntimeError: if any libdevmapper call reports failure, including the final udev wait.
    """
    task = _libdm.dm_task_create(DM_DEVICE_REMOVE)
    if not task:
        raise RuntimeError("dm_task_create(REMOVE) failed")
    cookie = ctypes.c_uint32(0)
    try:
        if not _libdm.dm_task_set_name(task, name.encode()):
            raise RuntimeError(f"dm_task_set_name failed: {name}")
        # No udev flags for removal: the cookie is only used to synchronize with udev.
        if not _libdm.dm_task_set_cookie(task, ctypes.byref(cookie), 0):
            raise RuntimeError("dm_task_set_cookie failed")
        if not _libdm.dm_task_run(task):
            raise RuntimeError(f"dm_task_run(REMOVE) failed for {name}")
    finally:
        _libdm.dm_task_destroy(task)
        # Always wait on the cookie, even when a step above failed: once the cookie has been
        # set, waiting is what releases it. The cookie is still 0 if it was never set.
        udev_ok = _libdm.dm_udev_wait(cookie.value)
    # Only reached when every step above succeeded.
    if not udev_ok:
        raise RuntimeError(f"dm_udev_wait failed for {name}")
# -- dm-integrity superblock --
# struct superblock { __u8 magic[8]; __u8 version; __u8 log2_interleave_sectors;
# __le16 integrity_tag_size; __le32 journal_sections;
# __le64 provided_data_sectors; ... };
# With that layout provided_data_sectors sits at byte offset 16 (8 + 1 + 1 + 2 + 4), which is
# the offset _read_provided_data_sectors unpacks from.
_SB_MAGIC = b"integrt\x00"
_SB_SECTORS = 8 # superblock occupies 8 x 512-byte sectors = 4096 bytes
def _zero_superblock(device: str) -> None:
    """Overwrite the dm-integrity superblock area at the start of *device* with zeros.

    With the old superblock gone, the kernel formats the device afresh on the next table load.
    """
    blank = b"\x00" * (_SB_SECTORS * 512)
    with open(device, "r+b") as raw:
        # pwrite at offset 0 on the raw descriptor: no seek, no reads.
        os.pwrite(raw.fileno(), blank, 0)
def _read_provided_data_sectors(device: str) -> int:
    """Return the provided_data_sectors field of the dm-integrity superblock on *device*.

    Raises:
        RuntimeError: if the first 8 bytes of the device are not the dm-integrity magic.
    """
    with open(device, "rb") as raw:
        # 24 bytes cover everything up to and including the little-endian u64 at offset 16.
        head = raw.read(24)
    magic = head[:8]
    if magic != _SB_MAGIC:
        raise RuntimeError(f"bad dm-integrity superblock magic: {magic!r}")
    (sectors,) = struct.unpack_from("<Q", head, 16)
    return sectors
def _format_integrity(device: str) -> int:
    """Put a fresh dm-integrity superblock on *device* and return its provided_data_sectors.

    The old superblock is zeroed, then a throwaway one-block dm-integrity device is created on
    top of the raw device, which makes the kernel write a new superblock. The throwaway device
    is removed right away; the superblock stays on disk and is read back from the raw device.
    """
    _zero_superblock(device)

    scratch_name = f"_integrity_fmt_{os.getpid()}"
    table = f"{device} 0 {TAG_SIZE} D 1 block_size:{BLOCK_SIZE}"
    one_block = BLOCK_SIZE // 512  # a single block, expressed in 512-byte sectors

    # All udev rules are suppressed for the scratch device.
    _dm_create(scratch_name, one_block, "integrity", table, udev_flags=_UDEV_SUPPRESS_ALL)
    _dm_remove(scratch_name)

    return _read_provided_data_sectors(device)
# -- swap header --
def _write_swap_header(path: str) -> None:
    """Write a minimal swap header directly, avoiding reads against uninitialized AEAD tags.

    mkswap reads the first page to check for existing signatures, which would fail because the
    integrity tags are uninitialized. Writing the header ourselves avoids the read entirely.

    Args:
        path: block device (or file) that receives the header in its first page.

    Raises:
        RuntimeError: if the device holds fewer than two pages (no room for any swap page after
            the header), or if the header could not be written in full.
    """
    with open(path, "r+b") as dev:
        device_size = dev.seek(0, 2)
        last_page = device_size // PAGE_SIZE - 1
        if last_page < 1:
            # Page 0 is the header itself; without at least one more page the swap area is
            # empty (and a negative value could not be packed at all).
            raise RuntimeError(
                f"{path} is too small for swap: {device_size} bytes, "
                f"need at least {2 * PAGE_SIZE}"
            )
        # last_page is a 32-bit field in the header; clamp so very large devices still get a
        # valid (truncated) swap area instead of a struct.error.
        last_page = min(last_page, 0xFFFFFFFF)
        # Make a valid swap superblock
        # If anything attempts to read past this superblock, it'll produce AEAD
        # integrity errors. In the past this was caused by the (useless, it
        # appears) blk-availability service.
        header = bytearray(PAGE_SIZE)
        # version = 1, last_page, nr_badpages = 0
        struct.pack_into("<III", header, 1024, 1, last_page, 0)
        header[1036:1052] = os.urandom(16)  # sws_uuid
        header[-10:] = b"SWAPSPACE2"
        written = os.pwrite(dev.fileno(), header, 0)
        if written != PAGE_SIZE:
            raise RuntimeError(f"short write of swap header to {path}: {written} bytes")
# -- main --
def start(device: str) -> None:
    """Build the dm-integrity + dm-crypt stack on *device* and enable swap on it.

    Steps: format dm-integrity on the raw device, create the dm-integrity device, create the
    dm-crypt (AEGIS128-random, AEAD) device on top of it with a fresh random key, write a swap
    header, and call swapon(2).

    If any step after device creation fails, the device-mapper devices created so far are
    removed again (best effort) so that a later retry does not trip over leftover names.

    Args:
        device: path of the raw block device to use; its contents are overwritten.

    Raises:
        RuntimeError: if formatting or a device-mapper operation fails.
        OSError: if swapon(2) fails; carries the errno reported by the syscall.
    """
    integrity_name = f"{DM_NAME}_integrity"
    provided_data_sectors = _format_integrity(device)
    created = []  # dm devices we own, in creation order
    try:
        # DM_UDEV_DISABLE_DISK_RULES_FLAG prevents udev's 60-persistent-storage-dm.rules from
        # running blkid on these devices. Without it, blkid reads sectors with uninitialized
        # integrity tags (we skip the O(device-size) tag wipe), and dm-crypt returns AEAD errors
        # for every probed sector. The flag is encoded in the DM cookie and decoded by
        # 55-dm.rules, so it travels with the device creation event rather than requiring a
        # separate udev rule matched on device name.
        integrity_params = f"{device} 0 {TAG_SIZE} D 1 block_size:{BLOCK_SIZE}"
        _dm_create(
            integrity_name,
            provided_data_sectors,
            "integrity",
            integrity_params,
            udev_flags=DM_UDEV_DISABLE_DISK_RULES_FLAG,
        )
        created.append(integrity_name)

        # Fresh random key on every invocation; it is handed to the kernel inside the table.
        key_hex = os.urandom(KEY_SIZE).hex()
        crypt_params = (
            f"capi:aegis128-random {key_hex} 0"
            f" /dev/mapper/{integrity_name} 0"
            f" 2 integrity:{TAG_SIZE}:aead sector_size:{BLOCK_SIZE}"
        )
        _dm_create(
            DM_NAME,
            provided_data_sectors,
            "crypt",
            crypt_params,
            udev_flags=DM_UDEV_DISABLE_DISK_RULES_FLAG,
        )
        created.append(DM_NAME)

        swap_path = f"/dev/mapper/{DM_NAME}"
        _write_swap_header(swap_path)
        if libc_swapon(swap_path.encode(), 0) != 0:
            err = ctypes.get_errno()
            raise OSError(err, os.strerror(err), swap_path)
    except BaseException:
        # Tear down top-to-bottom. Removal failures are ignored on purpose: the original
        # error is the one worth reporting, and it is re-raised below.
        for name in reversed(created):
            try:
                _dm_remove(name)
            except RuntimeError:
                pass
        raise
def main() -> None:
    """Command-line entry point: expects exactly one argument, the block device to use.

    Prints a usage line to stderr and exits with status 1 on any other argument count.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print(f"usage: {sys.argv[0]} <block-device>", file=sys.stderr)
        sys.exit(1)
    (device,) = arguments
    start(device)


if __name__ == "__main__":
    main()