composefs/
fs.rs

1//! Reading and writing filesystem trees to/from disk.
2//!
3//! This module provides functionality to read filesystem structures from
4//! disk into composefs tree representations and write them back, including
5//! handling of hardlinks, extended attributes, and repository integration.
6
7use std::{
8    cell::RefCell,
9    collections::{BTreeMap, HashMap},
10    ffi::{CStr, OsStr},
11    fs::File,
12    io::{Read, Write},
13    mem::MaybeUninit,
14    os::unix::ffi::OsStrExt,
15    path::Path,
16    rc::Rc,
17};
18
19use anyhow::{ensure, Context as _, Result};
20use rustix::{
21    buffer::spare_capacity,
22    fd::{AsFd, OwnedFd},
23    fs::{
24        fstat, getxattr, linkat, listxattr, mkdirat, mknodat, openat, readlinkat, symlinkat,
25        AtFlags, Dir, FileType, Mode, OFlags, CWD,
26    },
27    io::{read, Errno},
28};
29use zerocopy::IntoBytes;
30
31use crate::{
32    fsverity::{compute_verity, FsVerityHashValue},
33    repository::Repository,
34    tree::{Directory, FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat},
35    util::proc_self_fd,
36    INLINE_CONTENT_MAX,
37};
38
39/// Attempt to use O_TMPFILE + rename to atomically set file contents.
40/// Will fall back to a non-atomic write if the target doesn't support O_TMPFILE.
41fn set_file_contents(dirfd: &OwnedFd, name: &OsStr, stat: &Stat, data: &[u8]) -> Result<()> {
42    match openat(
43        dirfd,
44        ".",
45        OFlags::WRONLY | OFlags::TMPFILE | OFlags::CLOEXEC,
46        stat.st_mode.into(),
47    ) {
48        Ok(tmp) => {
49            let mut tmp = File::from(tmp);
50            tmp.write_all(data)?;
51            tmp.sync_data()?;
52            linkat(
53                CWD,
54                proc_self_fd(&tmp),
55                dirfd,
56                name,
57                AtFlags::SYMLINK_FOLLOW,
58            )?;
59        }
60        Err(Errno::OPNOTSUPP) => {
61            // vfat? yolo...
62            let fd = openat(
63                dirfd,
64                name,
65                OFlags::CREATE | OFlags::WRONLY | OFlags::CLOEXEC,
66                stat.st_mode.into(),
67            )?;
68            let mut f = File::from(fd);
69            f.write_all(data)?;
70            f.sync_data()?;
71        }
72        Err(e) => Err(e)?,
73    }
74    Ok(())
75}
76
77fn write_directory<ObjectID: FsVerityHashValue>(
78    dir: &Directory<ObjectID>,
79    dirfd: &OwnedFd,
80    name: &OsStr,
81    repo: &Repository<ObjectID>,
82) -> Result<()> {
83    match mkdirat(dirfd, name, dir.stat.st_mode.into()) {
84        Ok(()) | Err(Errno::EXIST) => {}
85        Err(e) => Err(e)?,
86    }
87
88    let fd = openat(dirfd, name, OFlags::PATH | OFlags::DIRECTORY, 0.into())?;
89    write_directory_contents(dir, &fd, repo)
90}
91
92fn write_leaf<ObjectID: FsVerityHashValue>(
93    leaf: &Leaf<ObjectID>,
94    dirfd: &OwnedFd,
95    name: &OsStr,
96    repo: &Repository<ObjectID>,
97) -> Result<()> {
98    let mode = leaf.stat.st_mode.into();
99
100    match &leaf.content {
101        LeafContent::Regular(RegularFile::Inline(ref data)) => {
102            set_file_contents(dirfd, name, &leaf.stat, data)?
103        }
104        LeafContent::Regular(RegularFile::External(ref id, size)) => {
105            let object = repo.open_object(id)?;
106            // TODO: make this better.  At least needs to be EINTR-safe.  Could even do reflink in some cases.
107            // Regardless we shouldn't read the whole file into memory.
108            let size = (*size).try_into().context("size overflow")?;
109            let mut buffer = vec![MaybeUninit::uninit(); size];
110            let (data, _) = read(object, &mut buffer)?;
111            set_file_contents(dirfd, name, &leaf.stat, data)?;
112        }
113        LeafContent::BlockDevice(rdev) => mknodat(dirfd, name, FileType::BlockDevice, mode, *rdev)?,
114        LeafContent::CharacterDevice(rdev) => {
115            mknodat(dirfd, name, FileType::CharacterDevice, mode, *rdev)?
116        }
117        LeafContent::Socket => mknodat(dirfd, name, FileType::Socket, mode, 0)?,
118        LeafContent::Fifo => mknodat(dirfd, name, FileType::Fifo, mode, 0)?,
119        LeafContent::Symlink(target) => symlinkat(target.as_ref(), dirfd, name)?,
120    }
121
122    Ok(())
123}
124
125fn write_directory_contents<ObjectID: FsVerityHashValue>(
126    dir: &Directory<ObjectID>,
127    fd: &OwnedFd,
128    repo: &Repository<ObjectID>,
129) -> Result<()> {
130    for (name, inode) in dir.entries() {
131        match inode {
132            Inode::Directory(ref dir) => write_directory(dir, fd, name, repo),
133            Inode::Leaf(ref leaf) => write_leaf(leaf, fd, name, repo),
134        }?;
135    }
136
137    Ok(())
138}
139
140/// Writes a directory tree from composefs representation to a filesystem path.
141///
142/// Reconstructs the filesystem structure at the specified output directory,
143/// creating directories, files, symlinks, and device nodes as needed. External
144/// file content is read from the repository. Note that hardlinks are not supported.
145pub fn write_to_path<ObjectID: FsVerityHashValue>(
146    repo: &Repository<ObjectID>,
147    dir: &Directory<ObjectID>,
148    output_dir: &Path,
149) -> Result<()> {
150    let fd = openat(CWD, output_dir, OFlags::PATH | OFlags::DIRECTORY, 0.into())?;
151    write_directory_contents(dir, &fd, repo)
152}
153
/// Helper for reading filesystem trees from disk into composefs representation.
///
/// Tracks hardlinks via inode numbers and handles integration with repositories
/// for storing large file content.
#[derive(Debug)]
pub struct FilesystemReader<'repo, ObjectID: FsVerityHashValue> {
    /// Optional repository.  When present, the content of regular files larger than
    /// `INLINE_CONTENT_MAX` is stored in it; when absent, only the content's fs-verity
    /// digest is computed.
    repo: Option<&'repo Repository<ObjectID>>,
    /// Leaves that have already been read, keyed by `(st_dev, st_ino)`, so that further
    /// names for the same inode share a single `Rc<Leaf>`.
    inodes: HashMap<(u64, u64), Rc<Leaf<ObjectID>>>,
}
163
164impl<ObjectID: FsVerityHashValue> FilesystemReader<'_, ObjectID> {
165    fn read_xattrs(fd: &OwnedFd) -> Result<BTreeMap<Box<OsStr>, Box<[u8]>>> {
166        // flistxattr() and fgetxattr() don't work with with O_PATH fds, so go via /proc/self/fd.
167        // Note: we want the symlink-following version of this call, which produces the correct
168        // behaviour even when trying to read xattrs from symlinks themselves.  See
169        // https://gist.github.com/allisonkarlitskaya/7a80f2ebb3314d80f45c653a1ba0e398
170        let filename = proc_self_fd(fd);
171
172        let mut xattrs = BTreeMap::new();
173
174        let mut names = [MaybeUninit::new(0); 65536];
175        let (names, _) = listxattr(&filename, &mut names)?;
176
177        for name in names.split_inclusive(|c| *c == 0) {
178            let mut buffer = [MaybeUninit::new(0); 65536];
179            let name: &[u8] = name.as_bytes();
180            let name = CStr::from_bytes_with_nul(name)?;
181            let (value, _) = getxattr(&filename, name, &mut buffer)?;
182            let key = Box::from(OsStr::from_bytes(name.to_bytes()));
183            xattrs.insert(key, Box::from(value));
184        }
185
186        Ok(xattrs)
187    }
188
189    fn stat(fd: &OwnedFd, ifmt: FileType) -> Result<(rustix::fs::Stat, Stat)> {
190        let buf = fstat(fd)?;
191
192        ensure!(
193            FileType::from_raw_mode(buf.st_mode) == ifmt,
194            "File type changed
195            between readdir() and fstat()"
196        );
197
198        Ok((
199            buf,
200            Stat {
201                st_mode: buf.st_mode & 0o7777,
202                st_uid: buf.st_uid,
203                st_gid: buf.st_gid,
204                st_mtim_sec: buf.st_mtime as i64,
205                xattrs: RefCell::new(Self::read_xattrs(fd)?),
206            },
207        ))
208    }
209
210    fn read_leaf_content(
211        &mut self,
212        fd: OwnedFd,
213        buf: rustix::fs::Stat,
214    ) -> Result<LeafContent<ObjectID>> {
215        let content = match FileType::from_raw_mode(buf.st_mode) {
216            FileType::Directory | FileType::Unknown => unreachable!(),
217            FileType::RegularFile => {
218                let size = buf.st_size.try_into().context("size overflow")?;
219                let mut buffer = Vec::with_capacity(size);
220                if buf.st_size > 0 {
221                    read(fd, spare_capacity(&mut buffer))?;
222                }
223                let buffer = Box::from(buffer);
224
225                if buf.st_size > INLINE_CONTENT_MAX as i64 {
226                    let id = if let Some(repo) = self.repo {
227                        repo.ensure_object(&buffer)?
228                    } else {
229                        compute_verity(&buffer)
230                    };
231                    LeafContent::Regular(RegularFile::External(id, buf.st_size as u64))
232                } else {
233                    LeafContent::Regular(RegularFile::Inline(buffer))
234                }
235            }
236            FileType::Symlink => {
237                let target = readlinkat(fd, "", [])?;
238                LeafContent::Symlink(OsStr::from_bytes(target.as_bytes()).into())
239            }
240            FileType::CharacterDevice => LeafContent::CharacterDevice(buf.st_rdev),
241            FileType::BlockDevice => LeafContent::BlockDevice(buf.st_rdev),
242            FileType::Fifo => LeafContent::Fifo,
243            FileType::Socket => LeafContent::Socket,
244        };
245        Ok(content)
246    }
247
248    fn read_leaf(
249        &mut self,
250        dirfd: &OwnedFd,
251        name: &OsStr,
252        ifmt: FileType,
253    ) -> Result<Rc<Leaf<ObjectID>>> {
254        let oflags = match ifmt {
255            FileType::RegularFile => OFlags::RDONLY,
256            _ => OFlags::PATH,
257        };
258
259        let fd = openat(
260            dirfd,
261            name,
262            oflags | OFlags::NOFOLLOW | OFlags::CLOEXEC,
263            Mode::empty(),
264        )?;
265
266        let (buf, stat) = Self::stat(&fd, ifmt)?;
267
268        // NB: We could check `st_nlink > 1` to find out if we should track a file as a potential
269        // hardlink or not, but some filesystems (like fuse-overlayfs) can report this incorrectly.
270        // Track all files.  https://github.com/containers/fuse-overlayfs/issues/435
271        let key = (buf.st_dev, buf.st_ino);
272        if let Some(leafref) = self.inodes.get(&key) {
273            Ok(Rc::clone(leafref))
274        } else {
275            let content = self.read_leaf_content(fd, buf)?;
276            let leaf = Rc::new(Leaf { stat, content });
277            self.inodes.insert(key, Rc::clone(&leaf));
278            Ok(leaf)
279        }
280    }
281
282    /// Reads a directory from disk into composefs representation.
283    ///
284    /// Recursively reads directory contents, tracking hardlinks and optionally
285    /// reading the directory's own metadata. Large files are stored in the repository
286    /// if one was provided.
287    fn read_directory(&mut self, dirfd: impl AsFd, name: &OsStr) -> Result<Directory<ObjectID>> {
288        let fd = openat(
289            dirfd,
290            name,
291            OFlags::RDONLY | OFlags::DIRECTORY | OFlags::NOFOLLOW | OFlags::CLOEXEC,
292            Mode::empty(),
293        )?;
294
295        let (_, stat) = Self::stat(&fd, FileType::Directory)?;
296        let mut directory = Directory::new(stat);
297
298        for item in Dir::read_from(&fd)? {
299            let entry = item?;
300            let name = OsStr::from_bytes(entry.file_name().to_bytes());
301
302            if name == "." || name == ".." {
303                continue;
304            }
305
306            let inode = self.read_inode(&fd, name, entry.file_type())?;
307            directory.insert(name, inode);
308        }
309
310        Ok(directory)
311    }
312
313    fn read_inode(
314        &mut self,
315        dirfd: &OwnedFd,
316        name: &OsStr,
317        ifmt: FileType,
318    ) -> Result<Inode<ObjectID>> {
319        if ifmt == FileType::Directory {
320            let dir = self.read_directory(dirfd, name)?;
321            Ok(Inode::Directory(Box::new(dir)))
322        } else {
323            let leaf = self.read_leaf(dirfd, name, ifmt)?;
324            Ok(Inode::Leaf(leaf))
325        }
326    }
327}
328
329/// Load a filesystem tree from the given path. A repository may
330/// be provided; if it is, then all files found in the filesystem
331/// are copied in.
332pub fn read_filesystem<ObjectID: FsVerityHashValue>(
333    dirfd: impl AsFd,
334    path: &Path,
335    repo: Option<&Repository<ObjectID>>,
336) -> Result<FileSystem<ObjectID>> {
337    let mut reader = FilesystemReader {
338        repo,
339        inodes: HashMap::new(),
340    };
341
342    let root = reader.read_directory(dirfd, path.as_os_str())?;
343
344    Ok(FileSystem { root })
345}
346
347/// Load a filesystem tree from the given path, filtering xattrs with a predicate.
348///
349/// This is a wrapper around [`read_filesystem`] that filters extended attributes
350/// using the provided predicate. Only xattrs for which the predicate returns `true`
351/// are retained. This is useful when reading from a mounted filesystem where host
352/// xattrs may leak into the image.
353///
354/// # Example
355///
356/// ```ignore
357/// use composefs::fs::{read_filesystem_filtered, CONTAINER_XATTR_ALLOWLIST};
358///
359/// // Filter to only allow security.capability
360/// let fs = read_filesystem_filtered(dirfd, path, repo, |name| {
361///     name.as_encoded_bytes() == b"security.capability"
362/// })?;
363/// ```
364pub fn read_filesystem_filtered<ObjectID, F>(
365    dirfd: impl AsFd,
366    path: &Path,
367    repo: Option<&Repository<ObjectID>>,
368    xattr_filter: F,
369) -> Result<FileSystem<ObjectID>>
370where
371    ObjectID: FsVerityHashValue,
372    F: Fn(&OsStr) -> bool,
373{
374    let fs = read_filesystem(dirfd, path, repo)?;
375    fs.filter_xattrs(xattr_filter);
376    Ok(fs)
377}
378
/// Default xattr allowlist for container filesystems.
///
/// When reading from a mounted container filesystem, host xattrs can leak into
/// the image (e.g., SELinux labels like `container_t` from overlayfs). This
/// allowlist specifies which xattrs are safe to preserve.
///
/// Currently only `security.capability` is allowed, as it represents actual
/// file capabilities that should be preserved. SELinux labels (`security.selinux`)
/// are excluded because they come from the build host and will be regenerated
/// by `transform_for_boot()` based on the target system's policy.
///
/// Use [`is_allowed_container_xattr`] to test a name against this list.
///
/// See: <https://github.com/containers/storage/pull/1608#issuecomment-1600915185>
pub const CONTAINER_XATTR_ALLOWLIST: &[&str] = &["security.capability"];
392
393/// Returns true if the given xattr name is in [`CONTAINER_XATTR_ALLOWLIST`].
394pub fn is_allowed_container_xattr(name: &OsStr) -> bool {
395    CONTAINER_XATTR_ALLOWLIST
396        .iter()
397        .any(|allowed| name.as_encoded_bytes() == allowed.as_bytes())
398}
399
400/// Load a container root filesystem from the given path.
401///
402/// This is a convenience wrapper around [`read_filesystem_filtered`] that also
403/// applies OCI container transformations via [`FileSystem::transform_for_oci`].
404///
405/// Equivalent to calling:
406/// ```ignore
407/// let mut fs = read_filesystem_filtered(dirfd, path, repo, is_allowed_container_xattr)?;
408/// fs.transform_for_oci()?;
409/// ```
410///
411/// This is the recommended way to read a container filesystem because:
412/// - OCI container runtimes don't preserve root directory metadata from layer tars
413/// - Host xattrs (especially `security.selinux`) can leak into mounted filesystems
414/// - `/run` should be empty (it's a tmpfs at runtime)
415/// - Podman/buildah's `RUN --mount` can leave directory stubs
416///
417/// By filtering xattrs and applying OCI transformations, we ensure consistent
418/// and reproducible composefs digests between build-time and install-time.
419pub fn read_container_root<ObjectID: FsVerityHashValue>(
420    dirfd: impl AsFd,
421    path: &Path,
422    repo: Option<&Repository<ObjectID>>,
423) -> Result<FileSystem<ObjectID>> {
424    let mut fs = read_filesystem_filtered(dirfd, path, repo, is_allowed_container_xattr)?;
425    fs.transform_for_oci()?;
426    Ok(fs)
427}
428
429/// Read the contents of a file.
430pub fn read_file<ObjectID: FsVerityHashValue>(
431    file: &RegularFile<ObjectID>,
432    repo: &Repository<ObjectID>,
433) -> Result<Box<[u8]>> {
434    match file {
435        RegularFile::Inline(data) => Ok(data.clone()),
436        RegularFile::External(id, size) => {
437            let capacity: usize = (*size).try_into().context("file too large for memory")?;
438            let mut data = Vec::with_capacity(capacity);
439            std::fs::File::from(repo.open_object(id)?).read_to_end(&mut data)?;
440            ensure!(
441                *size == data.len() as u64,
442                "File content doesn't have the expected length"
443            );
444            Ok(data.into_boxed_slice())
445        }
446    }
447}
448
#[cfg(test)]
mod tests {
    use super::*;
    use rustix::fs::{openat, CWD};

    /// `set_file_contents` must create the named file with exactly the given bytes.
    #[test]
    fn test_write_contents() -> Result<()> {
        let tmp = tempfile::tempdir()?;
        let expected_path = tmp.path().join("testfile");
        let dirfd = openat(
            CWD,
            tmp.path(),
            OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC,
            Mode::from_raw_mode(0),
        )?;
        let stat = Stat {
            st_mode: 0o755,
            st_uid: 0,
            st_gid: 0,
            st_mtim_sec: Default::default(),
            xattrs: Default::default(),
        };
        set_file_contents(&dirfd, OsStr::new("testfile"), &stat, b"new contents").unwrap();
        // Close the directory handle before reading the file back through its path.
        drop(dirfd);
        assert_eq!(std::fs::read(&expected_path)?, b"new contents");
        Ok(())
    }
}