//
// Syd: rock-solid application kernel
// src/workers/ipc.rs: `syd_ipc' remote-configuration thread
//
// Copyright (c) 2025 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    borrow::Cow,
    hash::{Hash, Hasher},
    io::{IoSlice, IoSliceMut},
    os::{
        fd::{AsFd, AsRawFd, FromRawFd, OwnedFd, RawFd},
        unix::net::{UnixListener, UnixStream},
    },
    sync::{atomic::Ordering, Arc, RwLock},
    thread,
};

use libseccomp::{scmp_cmp, ScmpAction, ScmpFilterContext, ScmpSyscall};
use memchr::{arch::all::is_equal, memchr};
use nix::{
    cmsg_space,
    errno::Errno,
    fcntl::OFlag,
    poll::PollTimeout,
    sched::{unshare, CloneFlags},
    sys::{
        epoll::{Epoll, EpollCreateFlags, EpollEvent, EpollFlags},
        socket::{
            accept4, bind, listen, recvmsg, sendmsg, setsockopt, socket, sockopt::PassCred,
            AddressFamily, Backlog, ControlMessageOwned, SockFlag, SockType, UnixAddr,
            UnixCredentials,
        },
        stat::{umask, Mode},
    },
    unistd::{write, Gid, Uid},
};
use serde::{ser::SerializeMap, Serialize, Serializer};

use crate::{
    alert,
    compat::{epoll_ctl_mod_safe, epoll_ctl_safe, MsgFlags},
    config::*,
    confine::{confine_scmp_madvise, confine_scmp_write, scmp_add_setid_rules},
    err::{err2no, scmp2no, SydJoinHandle, SydResult},
    fd::closeexcept,
    fs::peer_cred,
    hash::SydHashMap,
    info,
    landlock::Errata,
    landlock_policy::LandlockPolicy,
    path::{XPath, XPathBuf},
    retry::retry_on_eintr,
    rng::duprand,
    sandbox::{Flags, Sandbox, LINE_MAX},
};

// Canned replies sent to IPC clients.
//
// Each reply is a single newline-terminated JSON object
// carrying an integer `err` code and a `msg` string.

// Success reply (`err` 0) with message "ACK".
const IPC_ACK: &[u8] = b"{\"err\":0,\"msg\":\"ACK\"}\n";
// Reply with `err` 13, queued when a message fails authentication.
const IPC_AUTH: &[u8] = b"{\"err\":13,\"msg\":\"AUTH\"}\n";
// Reply with `err` 7, queued when buffered input would reach LINE_MAX.
const IPC_RATE: &[u8] = b"{\"err\":7,\"msg\":\"RATE\"}\n";
// Success reply (`err` 0) carrying a fixed text message.
const IPC_PINK: &[u8] = b"{\"err\":0,\
\"msg\":\"Change return success. Going and coming without error. Action brings good fortune.\"}\n";
// Success reply (`err` 0) with message "PONG".
const IPC_PONG: &[u8] = b"{\"err\":0,\"msg\":\"PONG\"}\n";

// Epoll-based, single-threaded IPC server over a UNIX socket.
//
// Lifecycle as visible in this file: `new()` stores the configuration,
// `setup()` creates the listener socket and the epoll instance,
// and `try_spawn()` moves the worker into the `syd_ipc' thread.
pub(crate) struct IpcWorker {
    // Path to the UNIX socket (possibly abstract if it starts with '@')
    //
    // Stored by `new()` and cleared to `None` by `setup()`,
    // which needs it only to build the bind address.
    addr: Option<XPathBuf>,
    // Epoll file descriptor, `None` until `setup()` succeeds.
    pub(crate) epoll: Option<Epoll>,
    // Listener socket, `None` until `setup()` succeeds.
    pub(crate) sock: Option<UnixListener>,
    // Reference to the Sandbox to be configured,
    // `None` until `set_sandbox()` is called.
    sandbox: Option<Arc<RwLock<Sandbox>>>,
    // Sandbox flags specified at startup.
    flags: Flags,
    // Credentials for authentication: (UID, GID).
    // A `None` component is not checked against the peer.
    creds: (Option<Uid>, Option<Gid>),

    // SafeSetId UID/GID transitions,
    // handed to the seccomp filter builder.
    transit_uids: Vec<(Uid, Uid)>,
    transit_gids: Vec<(Gid, Gid)>,
}

// Prompt modes supported (similar to HAProxy)
//
// Every new connection starts in `NonInteractive` mode.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum PromptMode {
    // Non-interactive: single command, then close.
    NonInteractive,
    // Interactive: accept multiple commands, no prompt.
    Interactive,
    // Prompt: accept multiple commands and send prompt "; " before each.
    Prompt,
}

// Serialize the prompt mode as its lowercase, human-readable name.
impl Serialize for PromptMode {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.serialize_str(match self {
            Self::NonInteractive => "non-interactive",
            Self::Interactive => "interactive",
            Self::Prompt => "prompt",
        })
    }
}

/// Per-connection state: buffering incoming bytes,
/// pending outgoing bytes, and tracking mode.
struct Connection {
    /// Credentials of the other end, captured when the connection is accepted.
    creds: UnixCredentials,
    /// Underlying UNIX-stream socket.
    stream: UnixStream,
    /// Buffered incoming bytes until a full line arrives.
    buf: Vec<u8>,
    /// Buffer of bytes to write (response + prompt).
    write_buf: Vec<u8>,
    /// Current position in `write_buf`:
    /// offset of the first byte that has not been sent yet.
    write_pos: usize,
    /// Prompt mode for this connection.
    mode: PromptMode,
    /// Whether to close after flushing the write buffer.
    should_close: bool,
}

// Type for Connection map, keyed by the raw fd of the connection's stream.
// We use HashMap because HashSet does not have get_mut!
type ConnectionMap = SydHashMap<RawFd, Connection>;

// A connection is identified solely by the raw fd of its stream.
impl Hash for Connection {
    fn hash<H: Hasher>(&self, state: &mut H) {
        let raw_fd: RawFd = self.stream.as_raw_fd();
        raw_fd.hash(state);
    }
}

impl PartialEq for Connection {
    fn eq(&self, other: &Self) -> bool {
        self.stream.as_fd().as_raw_fd() == other.stream.as_fd().as_raw_fd()
    }
}

impl Eq for Connection {}

// Newtype around UnixCredentials which adds a Serialize implementation.
struct Creds(UnixCredentials);

// Serialized as a three-entry map: "uid", "gid" and "pid", in that order.
impl Serialize for Creds {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let Self(peer) = self;

        let mut map = serializer.serialize_map(Some(3))?;
        map.serialize_entry("uid", &peer.uid())?;
        map.serialize_entry("gid", &peer.gid())?;
        map.serialize_entry("pid", &peer.pid())?;
        map.end()
    }
}

// Serialize a snapshot of the connection state as an eight-entry map:
// fd, input length/capacity, output length/capacity, mode,
// close-after-flush flag and peer credentials.
impl Serialize for Connection {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let Self {
            creds,
            stream,
            buf,
            write_buf,
            mode,
            should_close,
            ..
        } = self;

        let mut out = serializer.serialize_map(Some(8))?;

        out.serialize_entry("fd", &stream.as_raw_fd())?;
        out.serialize_entry("ilen", &buf.len())?;
        out.serialize_entry("icap", &buf.capacity())?;
        out.serialize_entry("olen", &write_buf.len())?;
        out.serialize_entry("ocap", &write_buf.capacity())?;
        out.serialize_entry("mode", mode)?;
        out.serialize_entry("close", should_close)?;
        out.serialize_entry("creds", &Creds(*creds))?;

        out.end()
    }
}

impl Connection {
    // Build the state for a new peer.
    //
    // Both buffers start empty with 1024 bytes of capacity,
    // the mode is NonInteractive and no close is pending.
    fn new(creds: UnixCredentials, stream: UnixStream) -> Self {
        let buf = Vec::with_capacity(1024);
        let write_buf = Vec::with_capacity(1024);

        Self {
            creds,
            stream,
            buf,
            write_buf,
            write_pos: 0,
            mode: PromptMode::NonInteractive,
            should_close: false,
        }
    }

    // Replace the epoll(7) interest set of this connection with `flags`.
    //
    // The event data carries the raw fd of the stream.
    fn ctl(&self, epoll: &Epoll, flags: EpollFlags) -> Result<(), Errno> {
        let raw = self.stream.as_raw_fd();

        #[expect(clippy::cast_sign_loss)]
        let interest = libc::epoll_event {
            events: flags.bits() as u32,
            u64: raw as u64,
        };

        epoll_ctl_mod_safe(&epoll.0, raw, interest)
    }

    // Queue `data` at the end of the write buffer.
    #[inline]
    fn enqueue_response(&mut self, data: &[u8]) {
        self.write_buf.extend_from_slice(data);
    }

    // Queue the prompt string ("; ") at the end of the write buffer.
    #[inline]
    fn enqueue_prompt(&mut self) {
        self.enqueue_response(b"; ");
    }
}

impl IpcWorker {
    // Create a new IpcWorker for the given path.
    //
    // If `addr` starts with '@', an abstract UNIX socket is used.
    //
    // This does not bind or listen yet; only stores the path.
    // Returns immediately without error.
    pub(crate) fn new(
        addr: &XPath,
        uid: Option<Uid>,
        gid: Option<Gid>,
        flags: Flags,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
    ) -> Self {
        IpcWorker {
            flags,
            creds: (uid, gid),
            addr: Some(addr.to_owned()),
            epoll: None,
            sock: None,
            sandbox: None,
            transit_uids: transit_uids.to_vec(),
            transit_gids: transit_gids.to_vec(),
        }
    }

    // Create the listener socket, bind it, make it non-blocking, listen, and create epoll.
    //
    // On success `self.sock` and `self.epoll` are populated and the
    // stored address is cleared. The address is released only after
    // bind(2) has succeeded, so a failure before that point keeps it
    // and a retry reports the real error rather than EADDRNOTAVAIL.
    //
    // Both the socket and the epoll fd are duplicated to random fd
    // numbers and the numbers are exported via the environment
    // variables ENV_IPC_UNIX_FD and ENV_IPC_POLL_FD.
    //
    // Returns EADDRNOTAVAIL if no address is stored, otherwise the
    // errno of the first failing step.
    //
    // This function sets umask(2) and therefore may not be thread-safe!
    pub(crate) fn setup(&mut self) -> Result<(), Errno> {
        // Prepare UNIX socket address, `@' prefix implies abstract socket.
        let addr = self.addr.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;
        let addr = if matches!(addr.first(), Some(b'@')) {
            UnixAddr::new_abstract(&addr.as_bytes()[1..])?
        } else {
            UnixAddr::new(addr.as_bytes())?
        };

        // Create UNIX stream socket, set to non-blocking.
        let sock = socket(
            AddressFamily::Unix,
            SockType::Stream,
            SockFlag::SOCK_CLOEXEC | SockFlag::SOCK_NONBLOCK,
            None,
        )?;

        // SAFETY: Randomize to make fd reuse harder.
        let sock_fd = duprand(sock.as_raw_fd(), OFlag::O_CLOEXEC).inspect(|fd| {
            // SAFETY: Used to prevent leaks in sandbox process:
            let mut fd_str = itoa::Buffer::new();
            let fd_str = fd_str.format(fd.as_raw_fd());
            std::env::set_var(ENV_IPC_UNIX_FD, fd_str);
        })?;
        drop(sock);
        let sock = sock_fd;

        // Set SO_PASSCRED for authentication.
        setsockopt(&sock, PassCred, &true)?;

        // SAFETY: Ensure socket is created with sane permissions.
        // The original umask is restored before the bind result is
        // inspected so that it is restored on failure too.
        let umask_orig = umask(Mode::from_bits_truncate(0o077));
        let result = bind(sock.as_raw_fd(), &addr);
        umask(umask_orig);
        result?;

        // Bind succeeded: the stored path is no longer needed.
        self.addr = None;

        listen(&sock, Backlog::MAXCONN)?;

        // Create epoll instance.
        let epoll = Epoll::new(EpollCreateFlags::EPOLL_CLOEXEC)?;

        // SAFETY: Randomize the epoll fd to make fd reuse harder.
        let epoll_fd = duprand(epoll.0.as_raw_fd(), OFlag::O_CLOEXEC).inspect(|fd| {
            // SAFETY: Used to prevent leaks in sandbox process:
            let mut fd_str = itoa::Buffer::new();
            let fd_str = fd_str.format(fd.as_raw_fd());
            std::env::set_var(ENV_IPC_POLL_FD, fd_str);
        })?;
        drop(epoll);
        let epoll = Epoll(epoll_fd);

        // Register listener_fd for EPOLLIN.
        // The event data carries the listener's raw fd so that the
        // main loop can tell it apart from client connections.
        #[expect(clippy::cast_sign_loss)]
        let event = libc::epoll_event {
            events: EpollFlags::EPOLLIN.bits() as u32,
            u64: sock.as_fd().as_raw_fd() as u64,
        };
        epoll_ctl_safe(&epoll.0, sock.as_fd().as_raw_fd(), Some(event))?;

        self.epoll = Some(epoll);
        self.sock = Some(UnixListener::from(sock));

        Ok(())
    }

    // Attach the shared Sandbox which this worker configures,
    // replacing any previously attached one.
    pub fn set_sandbox(&mut self, sandbox: Arc<RwLock<Sandbox>>) {
        self.sandbox = Option::from(sandbox);
    }

    // Confine IPC thread.
    //
    // Builds, but does not load, the seccomp(2) filter for the IPC thread.
    // Unless `dry_run` is set, a Landlock policy is also applied to the
    // calling thread first (best-effort: its result is ignored).
    //
    // Parameters:
    // - `epoll_fd`: the only fd for which the epoll(7) system calls are allowed.
    // - `flags`: sandbox flags; consulted for speculative-store-bypass
    //   handling and for the SafeSetId switches.
    // - `transit_uids`, `transit_gids`: allowed UID/GID transitions,
    //   forwarded to `scmp_add_setid_rules` when SafeSetId is enabled.
    // - `dry_run`: when true, skip the Landlock restriction and only
    //   build the filter.
    //
    // Returns the prepared filter context; the caller is responsible
    // for loading it. System call names which cannot be resolved are
    // logged and skipped rather than treated as errors.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        epoll_fd: RawFd,
        flags: Flags,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
        dry_run: bool,
    ) -> SydResult<ScmpFilterContext> {
        if !dry_run {
            // SAFETY: Set up a Landlock sandbox to disallow all access.
            let abi = crate::landlock::ABI::new_current();
            let errata = crate::landlock::Errata::query();
            let policy = LandlockPolicy {
                scoped_abs: true,
                scoped_sig: errata.contains(Errata::SCOPED_SIGNAL_SAME_TGID),
                ..Default::default()
            };
            // Best-effort: a failure to restrict is deliberately ignored.
            let _ = policy.restrict_self(abi);
        }

        // Create seccomp filter with default action.
        // Anything not explicitly allowed below kills the process.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // DO NOT synchronize filter to all threads.
        // Other threads will self-confine.
        ctx.set_ctl_tsync(false)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_exec_speculative:1
        ctx.set_ctl_ssb(flags.allow_unsafe_exec_speculative())?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        // Failure to set the optimization level is ignored.
        let _ = ctx.set_ctl_optimize(2);

        // SAFETY: Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Allow epoll(7) API to our single epoll fd only.
        // The first argument must equal `epoll_fd`.
        #[expect(clippy::cast_sign_loss)]
        for sysname in EPOLL_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[scmp_cmp!($arg0 == epoll_fd as u64)],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_ipc_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Deny open and stat family with ENOSYS rather than KillProcess.
        // We need this because Rust allocator has side effects such as
        // opening /proc/sys/vm/overcommit_memory on some architectures.
        //
        // Note, we avoid this when profiling is enabled,
        // as gperf requires it to write profiling data.
        for sysname in [
            "open",
            "openat",
            "openat2",
            "stat",
            "lstat",
            "statx",
            "newfstatat",
        ] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    // With the "prof" feature these calls are allowed instead.
                    let action = if !cfg!(feature = "prof") {
                        ScmpAction::Errno(Errno::ENOSYS as i32)
                    } else {
                        ScmpAction::Allow
                    };
                    ctx.add_rule(action, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_ipc_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow safe fcntl(2) utility calls.
        // Only the operations listed in IPC_FCNTL_OPS are permitted.
        for sysname in ["fcntl", "fcntl64"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_ipc_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            for op in IPC_FCNTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == *op)])?;
            }
        }

        // Allow safe prctl(2) operations.
        // Only the operations listed in IPC_PRCTL_OPS are permitted.
        let sysname = "prctl";
        if let Ok(syscall) = ScmpSyscall::from_name(sysname) {
            for (_, op) in IPC_PRCTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg0 == *op)])?;
            }
        } else {
            info!("ctx": "confine", "op": "allow_ipc_syscall",
                "msg": format!("invalid or unsupported syscall {sysname}"));
        }

        // Prevent executable memory.
        // Mappings and protection changes are allowed only when
        // PROT_EXEC is not set in the third argument.
        const PROT_EXEC: u64 = libc::PROT_EXEC as u64;
        for sysname in ["mmap", "mmap2", "mprotect"] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[scmp_cmp!($arg2 & PROT_EXEC == 0)],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_ipc_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow writes to the log-fd.
        // No proc_pid_mem(5) access required here.
        confine_scmp_write(&mut ctx, None, false)?;

        // Allow safe madvise(2) advice.
        confine_scmp_madvise(&mut ctx)?;

        // Allow safe system calls.
        // These are allowed unconditionally, without argument checks.
        for sysname in IPC_SYSCALLS.iter().chain(VDSO_SYSCALLS) {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_ipc_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow futex system calls.
        for sysname in FUTEX_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_ipc_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow getid system calls.
        for sysname in GET_ID_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_ipc_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow UID/GID changing system calls as necessary.
        // Rules are added only when at least one SafeSetId switch is on.
        let safe_setuid = flags.allow_safe_setuid();
        let safe_setgid = flags.allow_safe_setgid();
        if safe_setuid || safe_setgid {
            scmp_add_setid_rules(
                "ipc",
                &mut ctx,
                safe_setuid,
                safe_setgid,
                transit_uids,
                transit_gids,
            )?;
        }

        Ok(ctx)
    }

    // Spawn the IPC worker in a new thread named `syd_ipc'.
    pub fn try_spawn(mut self, notif_pipe: (RawFd, RawFd)) -> Result<SydJoinHandle<()>, Errno> {
        thread::Builder::new()
            .name("syd_ipc".into())
            .stack_size(IPC_STACK_SIZE)
            .spawn(move || {
                self.init(notif_pipe)?;
                self.main()
            })
            .map_err(|err| err2no(&err))
    }

    /// Prepare the IPC thread before it enters the main loop.
    ///
    /// Steps, in order:
    /// 1. Unshare the filesystem attributes and the fd table from the
    ///    rest of the process; failure terminates the process.
    /// 2. Take ownership of both ends of `notif_pipe`, close the read
    ///    end, write a single byte to the write end and close it.
    /// 3. Close every fd except the epoll fd, the listener and the log fd.
    ///
    /// Returns `EADDRNOTAVAIL` if `setup()` has not populated the epoll
    /// fd or the listener, and `EIO` if the pipe write transfers nothing.
    fn init(&mut self, notif_pipe: (RawFd, RawFd)) -> SydResult<()> {
        // SAFETY: We use exit_group(2) here to bail,
        // because this unsharing is a critical safety feature.
        let isolate = CloneFlags::CLONE_FS | CloneFlags::CLONE_FILES;
        if let Err(errno) = unshare(isolate) {
            alert!("ctx": "boot", "op": "unshare_ipc_thread",
                "msg": format!("failed to unshare(CLONE_FS|CLONE_FILES): {errno}"),
                "err": errno as i32);
            std::process::exit(101);
        }

        let (raw_rd, raw_wr) = notif_pipe;
        // SAFETY: notif_pipe points to valid FDs.
        let pipe_rd = unsafe { OwnedFd::from_raw_fd(raw_rd) };
        // SAFETY: notif_pipe points to valid FDs.
        let pipe_wr = unsafe { OwnedFd::from_raw_fd(raw_wr) };

        // The read end is not used by this thread.
        drop(pipe_rd);

        // Send the one-byte notification.
        let token = [42u8; 1];
        let written = retry_on_eintr(|| write(&pipe_wr, &token))?;
        match written {
            1 => {}
            0 => return Err(Errno::EIO.into()),
            n => unreachable!("BUG: invalid pipe write of size {n}!"),
        }

        // Close the notification pipe.
        drop(pipe_wr);

        // SAFETY: The IPC worker needs to inherit only the following FDs:
        // 1. epoll(7) FD.
        // 2. IPC socket FD.
        // 3. Log FD.
        // We have to sort the set as the FDs are randomized.
        let poll = self.epoll.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;
        let sock = self.sock.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;
        #[expect(clippy::cast_sign_loss)]
        let mut keep = vec![
            poll.0.as_raw_fd() as libc::c_uint,
            sock.as_raw_fd() as libc::c_uint,
            crate::log::LOG_FD.load(Ordering::Relaxed) as libc::c_uint,
        ];
        keep.sort_unstable();
        closeexcept(&keep)?;

        Ok(())
    }

    /// Main loop: waits on epoll, accepts new connections,
    /// reads commands, and writes responses.
    ///
    /// Before entering the loop the thread confines itself by building
    /// the seccomp(2) filter with `prepare_confine` and loading it.
    /// If either step fails, the whole process exits with status 101.
    ///
    /// The loop has no normal exit: this function returns only with an
    /// error, either from waiting on epoll (other than `EINTR`) or
    /// propagated from one of the event handlers. Returns
    /// `EADDRNOTAVAIL` if the epoll fd or the listener is missing.
    #[expect(clippy::cognitive_complexity)]
    fn main(&self) -> SydResult<()> {
        let epoll = self.epoll.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;
        let sock = self.sock.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;
        let sock_fd = sock.as_fd().as_raw_fd();

        // Confine `syd_ipc' thread.
        // SAFETY: We use exit_group(2) here to bail,
        // because this confinement is a critical safety feature.
        let ctx = match Self::prepare_confine(
            epoll.0.as_raw_fd(),
            self.flags,
            &self.transit_uids,
            &self.transit_gids,
            false,
        ) {
            Ok(ctx) => ctx,
            Err(error) => {
                let errno = error.errno().unwrap_or(Errno::ENOSYS);
                alert!("ctx": "boot", "op": "confine_ipc_thread",
                    "msg": format!("failed to confine: {error}"),
                    "err": errno as i32);
                std::process::exit(101);
            }
        };

        // Load seccomp(2) BPF into the kernel.
        // SAFETY: We use exit_group(2) here to bail,
        // because this confinement is a critical safety feature.
        if let Err(error) = ctx.load() {
            let errno = scmp2no(&error).unwrap_or(Errno::ENOSYS);
            alert!("ctx": "boot", "op": "confine_ipc_thread",
                "msg": format!("failed to confine: {error}"),
                "err": errno as i32);
            std::process::exit(101);
        }
        // The filter context is not needed once the filter is loaded.
        drop(ctx);

        info!("ctx": "confine", "op": "confine_ipc_thread",
            "msg": "IPC thread confined");

        // Set of active connections.
        let mut connections = ConnectionMap::default();

        // Buffer for epoll events.
        // TODO: IPC_EPOLL_MAX_EVENTS=1024 move to config.rs
        let mut events = [EpollEvent::empty(); 1024];

        // Wait for events and handle EINTR.
        loop {
            // Wait for events, block indefinitely.
            let nfds = match epoll.wait(&mut events, PollTimeout::NONE) {
                Ok(n) => n,
                Err(Errno::EINTR) => continue,
                Err(errno) => return Err(errno.into()),
            };

            for ev in events.iter().take(nfds) {
                // The event data carries the raw fd it was registered with.
                #[expect(clippy::cast_possible_truncation)]
                let fd = ev.data() as RawFd;
                let flags = ev.events();

                if fd == sock_fd {
                    // New incoming connection(s).
                    self.accept_new_connections(&mut connections)?;
                } else {
                    // Existing connection.
                    // Readable and writable are handled before error/hang-up,
                    // so pending data is processed before the connection is closed.
                    // NOTE(review): any handler error, including one caused by
                    // a single connection, ends the loop via `?` -- confirm
                    // that this is intended.
                    if flags.contains(EpollFlags::EPOLLIN) {
                        self.handle_readable(&mut connections, fd)?;
                    }
                    if flags.contains(EpollFlags::EPOLLOUT) {
                        self.handle_writable(&mut connections, fd)?;
                    }
                    if flags.intersects(
                        EpollFlags::EPOLLERR | EpollFlags::EPOLLHUP | EpollFlags::EPOLLRDHUP,
                    ) {
                        // Error or hang-up: close connection.
                        self.close_connection(&mut connections, fd)?;
                    }
                }
            }
        }
    }

    /// Accept new connections, set them non-blocking, register with epoll.
    ///
    /// Drains the listener's accept queue: every pending connection is
    /// accepted with `SOCK_CLOEXEC | SOCK_NONBLOCK`, registered with
    /// epoll for `EPOLLIN`, and stored in `connections` keyed by its
    /// raw fd together with the peer's credentials.
    ///
    /// Transient accept errors (`EINTR`, `ECONNABORTED`) concern a
    /// single connection attempt only; they are retried rather than
    /// returned, because the caller treats every error as fatal for
    /// the whole IPC loop.
    ///
    /// Returns `EADDRNOTAVAIL` if the epoll fd or the listener is
    /// missing, otherwise the first error from accept4(2), epoll
    /// registration or the peer credential lookup.
    fn accept_new_connections(&self, connections: &mut ConnectionMap) -> Result<(), Errno> {
        let epoll = self.epoll.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;
        let sock = self.sock.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;

        loop {
            #[expect(unreachable_patterns)]
            match accept4(
                sock.as_raw_fd(),
                SockFlag::SOCK_CLOEXEC | SockFlag::SOCK_NONBLOCK,
            )
            .map(|fd| {
                // SAFETY: accept4 returns a valid FD on success.
                UnixStream::from(unsafe { OwnedFd::from_raw_fd(fd) })
            }) {
                Ok(stream) => {
                    // Register the new socket with epoll for reading.
                    let fd = stream.as_fd().as_raw_fd();
                    #[expect(clippy::cast_sign_loss)]
                    let event = libc::epoll_event {
                        events: EpollFlags::EPOLLIN.bits() as u32,
                        u64: fd as u64,
                    };
                    epoll_ctl_safe(&epoll.0, fd, Some(event))?;

                    // Create Connection state.
                    let creds = peer_cred(&stream)?;
                    let mut conn = Connection::new(creds, stream);

                    // In prompt mode, send initial prompt.
                    // New connections currently always start in
                    // non-interactive mode, so this is a no-op today.
                    if conn.mode == PromptMode::Prompt {
                        conn.enqueue_prompt();
                    }
                    connections.insert(fd, conn);
                }
                Err(Errno::EAGAIN | Errno::EWOULDBLOCK) => {
                    // No more pending connections.
                    break;
                }
                Err(Errno::EINTR | Errno::ECONNABORTED) => {
                    // Interrupted, or the peer gave up before we
                    // accepted: try the next pending connection.
                    continue;
                }
                Err(errno) => {
                    // Unexpected error.
                    return Err(errno);
                }
            }
        }

        Ok(())
    }

    /// Handle a readable event on connection `fd`.
    ///
    /// Reads from the connection without blocking until no more data is
    /// available. For each chunk received:
    /// - zero bytes (EOF): close now, or after the pending reply is flushed;
    /// - buffered length plus chunk length reaching `LINE_MAX`: queue
    ///   `IPC_RATE` and close after the reply;
    /// - no acceptable credentials attached: queue `IPC_AUTH` and close
    ///   after the reply;
    /// - otherwise: append to the input buffer and process every
    ///   complete, newline-terminated line.
    ///
    /// An unknown `fd` is ignored. Receive errors other than `EINTR` and
    /// `EAGAIN` close the connection without reporting an error. Errors
    /// from command processing, epoll updates and connection closing are
    /// returned to the caller.
    #[expect(clippy::cognitive_complexity)]
    fn handle_readable(&self, connections: &mut ConnectionMap, fd: RawFd) -> Result<(), Errno> {
        // Lookup connection by fd.
        // We need a mutable reference; use get_mut.
        if let Some(conn) = connections.get_mut(&fd) {
            loop {
                // Receive buffers: payload and ancillary (credentials) data.
                let mut buf = [0u8; LINE_MAX];
                let mut iov = [IoSliceMut::new(&mut buf)];
                let mut cmsg = cmsg_space!(UnixCredentials);

                #[expect(clippy::arithmetic_side_effects)]
                #[expect(unreachable_patterns)]
                match recvmsg::<()>(
                    conn.stream.as_raw_fd(),
                    &mut iov,
                    Some(&mut cmsg),
                    MsgFlags::MSG_DONTWAIT.into(),
                ) {
                    Ok(msg) if msg.bytes == 0 => {
                        // EOF on read side: if there's a pending write,
                        // defer closing until after flush.
                        if conn.write_buf.is_empty() {
                            // no response queued => close immediately.
                            self.close_connection(connections, fd)?;
                        } else {
                            // response pending => mark to close after writing.
                            conn.should_close = true;
                        }
                        return Ok(());
                    }
                    Ok(msg) if conn.buf.len() + msg.bytes >= LINE_MAX => {
                        // Input too large:
                        // 1. Add EPOLLOUT to interests.
                        // 2. Reject with error message.
                        // 3. Close connection after reply.
                        //
                        // This check runs before authentication.
                        let epoll = self.epoll.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;
                        conn.ctl(epoll, EpollFlags::EPOLLOUT)?;
                        conn.enqueue_response(IPC_RATE);
                        conn.should_close = true;

                        info!("ctx": "ipc", "op": "reject_ipc_command",
                            "msg": format!("rejected IPC command from pid:{} with uid:{}/gid:{}",
                                conn.creds.pid(),
                                conn.creds.uid(),
                                conn.creds.gid()),
                            "len": conn.buf.len() + msg.bytes,
                            "conn": &conn);

                        return Ok(());
                    }
                    Ok(msg) => {
                        // Authenticate each and every message.
                        // The first credentials control message that
                        // passes all checks authenticates the message.
                        let mut auth = false;
                        if let Ok(cmsgs) = msg.cmsgs() {
                            for cmsg in cmsgs {
                                // Only credential control messages are considered.
                                let creds = if let ControlMessageOwned::ScmCredentials(creds) = cmsg
                                {
                                    creds
                                } else {
                                    continue;
                                };

                                if creds.pid() == 0 {
                                    // Invalid credentials, skip.
                                    continue;
                                }

                                if creds.uid() == 0 && creds.gid() == 0 {
                                    // Matched UID and GID for ROOT: Authenticated.
                                    auth = true;
                                    break;
                                }

                                // A configured UID must match; no UID configured
                                // means the UID is not checked.
                                if let Some(uid) = self.creds.0 {
                                    if creds.uid() != uid.as_raw() {
                                        // UID mismatch.
                                        continue;
                                    }
                                }

                                // A configured GID must match; no GID configured
                                // means the GID is not checked.
                                if let Some(gid) = self.creds.1 {
                                    if creds.gid() != gid.as_raw() {
                                        // GID mismatch.
                                        continue;
                                    }
                                }

                                // Matched UID and GID: Authenticated.
                                auth = true;
                                break;
                            }
                        }

                        if !auth {
                            // Authentication failed:
                            // 1. Add EPOLLOUT to interests.
                            // 2. Reject with error message.
                            // 3. Close connection after reply.
                            let epoll = self.epoll.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;
                            conn.ctl(epoll, EpollFlags::EPOLLOUT)?;
                            conn.enqueue_response(IPC_AUTH);
                            conn.should_close = true;

                            return Ok(());
                        }

                        // Extend buffer with the message content.
                        for iov in msg.iovs() {
                            conn.buf.extend_from_slice(iov);
                        }

                        // Process any full lines in buffer.
                        // `pos` is the index of the newline ending the line.
                        while let Some(pos) = memchr(b'\n', &conn.buf) {
                            // Handle the line (no allocations needed here).
                            // NOTE(review): an error here is returned to the
                            // caller instead of closing just this connection
                            // -- confirm that this is intended.
                            self.process_command(conn, pos)?;

                            // Remove the line *and* the '\n' from the buffer.
                            conn.buf.drain(..=pos);
                        }
                    }
                    Err(Errno::EINTR) => continue,
                    Err(Errno::EAGAIN | Errno::EWOULDBLOCK) => {
                        // No more data.
                        break;
                    }
                    Err(_) => {
                        // Other errors -> close connection.
                        self.close_connection(connections, fd)?;
                        return Ok(());
                    }
                }
            }
        }

        Ok(())
    }

    /// Flush the pending output of connection `fd` in response to a writable event.
    ///
    /// Sends without blocking until the write buffer is drained or the
    /// socket stops accepting data. Once everything is sent, the buffer
    /// is reset, the connection goes back to waiting for input only,
    /// and it is closed if a close was requested. A send error other
    /// than `EINTR` and `EAGAIN` closes the connection.
    ///
    /// An unknown `fd` is ignored. Returns `EADDRNOTAVAIL` if the epoll
    /// fd is missing, otherwise any error from updating epoll interests
    /// or closing the connection.
    fn handle_writable(&self, connections: &mut ConnectionMap, fd: RawFd) -> Result<(), Errno> {
        let epoll = self.epoll.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;

        let Some(conn) = connections.get_mut(&fd) else {
            // Not one of our connections: nothing to flush.
            return Ok(());
        };

        while conn.write_pos < conn.write_buf.len() {
            let pending = &conn.write_buf[conn.write_pos..];
            let iov = [IoSlice::new(pending)];

            #[expect(clippy::arithmetic_side_effects)]
            #[expect(unreachable_patterns)]
            match sendmsg::<()>(
                conn.stream.as_raw_fd(),
                &iov,
                &[],
                MsgFlags::MSG_DONTWAIT.into(),
                None,
            ) {
                // Nothing was sent: would block or closed; stop for now.
                Ok(0) => break,
                // Partial or full write: advance past the bytes sent.
                Ok(n) => conn.write_pos += n,
                Err(Errno::EINTR) => continue,
                // Can't write more right now.
                Err(Errno::EAGAIN | Errno::EWOULDBLOCK) => break,
                Err(_) => {
                    // Fatal write error; close.
                    self.close_connection(connections, fd)?;
                    return Ok(());
                }
            }
        }

        if conn.write_pos < conn.write_buf.len() {
            // Output still pending: keep waiting for the next writable event.
            return Ok(());
        }

        // All data written: disable EPOLLOUT and reset the buffer.
        conn.ctl(epoll, EpollFlags::EPOLLIN)?;
        conn.write_buf.clear();
        conn.write_pos = 0;

        // A close was requested once the reply is out: do it now.
        if conn.should_close {
            self.close_connection(connections, fd)?;
        }

        Ok(())
    }

    // Process the command line `conn.buf[..pos]` of connection `conn`.
    //
    // The line must be valid UTF-8, otherwise `EINVAL` is returned.
    // `prompt`, `quit`, `exit`, `ping`, `pink` and `version` are answered
    // here; any other command is forwarded to `handle_cmd`, whose errors
    // are propagated. Responses are queued on the connection and EPOLLOUT
    // is enabled when there is queued data. This function does not remove
    // the line from `conn.buf`.
    #[expect(clippy::cognitive_complexity)]
    fn process_command(&self, conn: &mut Connection, pos: usize) -> Result<(), Errno> {
        let epoll = self.epoll.as_ref().ok_or(Errno::EADDRNOTAVAIL)?;

        // Take an owned UTF-8 copy of the line; invalid UTF-8 is fatal.
        let cmd = String::from_utf8(conn.buf[..pos].to_vec()).map_err(|_| Errno::EINVAL)?;

        // Tokenize on ASCII whitespace; the first word selects the action.
        let mut words = cmd.split_ascii_whitespace();

        match words.next() {
            None => {
                // Empty line: only re-issue the prompt when in prompt mode.
                if conn.mode == PromptMode::Prompt {
                    conn.enqueue_prompt();
                }
            }
            Some("quit" | "exit") => {
                conn.should_close = true;
            }
            Some("prompt") => match words.next() {
                // Explicit mode selection.
                Some("n") => {
                    conn.mode = PromptMode::NonInteractive;
                    conn.should_close = true;
                }
                Some("i") => {
                    conn.mode = PromptMode::Interactive;
                }
                Some("p") => {
                    conn.mode = PromptMode::Prompt;
                    conn.enqueue_prompt();
                }
                // No argument: toggle.
                // Prompt → Interactive; every other mode → Prompt.
                None => {
                    if conn.mode == PromptMode::Prompt {
                        conn.mode = PromptMode::Interactive;
                    } else {
                        conn.mode = PromptMode::Prompt;
                        conn.enqueue_prompt();
                    }
                }
                Some(_) => {
                    conn.enqueue_response(b"Unknown prompt mode!\n");
                    if conn.mode == PromptMode::Prompt {
                        conn.enqueue_prompt();
                    }
                }
            },
            Some(verb) => {
                let response: Cow<'_, [u8]> = match verb {
                    // Alive check, send pong to ping.
                    "ping" => Cow::Borrowed(IPC_PONG),
                    // Pink Floyd check, reply quote.
                    "pink" => Cow::Borrowed(IPC_PINK),
                    // Report the API/IPC version as JSON.
                    "version" => Cow::Owned(
                        format!(
                            "{{\"major\":{API_MAJOR_VERSION},\"minor\":{IPC_MINOR_VERSION},\"version\":\"{API_MAJOR_VERSION}.{IPC_MINOR_VERSION}\"}}\n"
                        )
                        .into_bytes(),
                    ),
                    // Regular command: the whole line goes to handle_cmd.
                    _ => Cow::Owned(self.handle_cmd(&cmd)?),
                };
                conn.enqueue_response(&response);

                // Non-interactive connections end after one response;
                // prompt mode follows the response with a fresh prompt.
                if conn.mode == PromptMode::NonInteractive {
                    conn.should_close = true;
                } else if conn.mode == PromptMode::Prompt {
                    conn.enqueue_prompt();
                }

                info!("ctx": "ipc", "op": "handle_ipc_command",
                    "msg": format!("handled IPC command from pid:{} with uid:{}/gid:{}",
                        conn.creds.pid(),
                        conn.creds.uid(),
                        conn.creds.gid()),
                    "cmd": &cmd,
                    "resp": XPath::from_bytes(&response),
                    "conn": &conn);
            }
        }

        // Something was queued for sending: make sure EPOLLOUT is watched.
        if !conn.write_buf.is_empty() {
            conn.ctl(epoll, EpollFlags::EPOLLIN | EpollFlags::EPOLLOUT)?;
        }

        Ok(())
    }

    // Tear down connection `fd`: unregister it from epoll, then drop its
    // entry from `connections`.
    //
    // Returns `EADDRNOTAVAIL` when no epoll instance is set. If the
    // unregistration fails, its error is returned and the entry is left
    // in `connections`.
    fn close_connection(&self, connections: &mut ConnectionMap, fd: RawFd) -> Result<(), Errno> {
        let Some(epoll) = self.epoll.as_ref() else {
            return Err(Errno::EADDRNOTAVAIL);
        };

        // Stop watching the descriptor before the stream goes away.
        epoll_ctl_safe(&epoll.0, fd, None)?;

        // Removing the entry drops the connection (and with it the stream).
        drop(connections.remove(&fd));

        Ok(())
    }

    // handle_cmd: run one command string against the sandbox,
    // and return the bytes to send back to the client.
    //
    // Returns `EOWNERDEAD` when no sandbox is set, `EBUSY` when the
    // sandbox is locked before or after running the command, and
    // `EINVAL` when serializing the sandbox for `stat` fails. A failing
    // configuration command is not an error of this function: its errno
    // is reported inside the JSON response.
    fn handle_cmd(&self, cmd: &str) -> Result<Vec<u8>, Errno> {
        let Some(lock) = self.sandbox.as_ref() else {
            return Err(Errno::EOWNERDEAD);
        };

        // Take the write lock; a poisoned lock is used regardless.
        let mut sandbox = match lock.write() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };

        // Close connection immediately if sandbox is locked.
        // This also results in thread-exit.
        if sandbox.locked() {
            return Err(Errno::EBUSY);
        }

        // Display commands are answered over the socket
        // instead of Syd's standard error:
        // stat: Sandbox in compact JSON.
        // stats: Sandbox in human-readable format.
        let raw = cmd.as_bytes();
        if is_equal(raw, b"stat") {
            return match serde_json::to_string(&*sandbox) {
                Ok(json) => Ok(json.into_bytes()),
                Err(_) => Err(Errno::EINVAL),
            };
        }
        if is_equal(raw, b"stats") {
            return Ok(format!("{sandbox}").into_bytes());
        }

        // config_unchecked is used to skip logging,
        // and environment initialization.
        let outcome = sandbox.config_unchecked(cmd);

        // The command itself may have locked the sandbox:
        // close connection immediately, which also results in thread-exit.
        if sandbox.locked() {
            return Err(Errno::EBUSY);
        }

        // Success is acknowledged; failure is reported as a JSON error object.
        let reply = match outcome {
            Ok(()) => IPC_ACK.to_vec(),
            Err(errno) => {
                format!("{{\"err\":{},\"msg\":\"{errno}\"}}\n", errno as i32).into_bytes()
            }
        };

        Ok(reply)
    }
}
