// bootc_lib/podstorage.rs

1//! # bootc-managed instance of containers-storage:
2//!
3//! The backend for podman and other tools is known as `container-storage:`,
4//! with a canonical instance that lives in `/var/lib/containers`.
5//!
//! This is a `containers-storage:` instance which is owned by bootc and
7//! is stored at `/sysroot/ostree/bootc`.
8//!
9//! At the current time, this is only used for Logically Bound Images.
10
11use std::collections::HashSet;
12use std::io::{Seek, Write};
13use std::os::unix::process::CommandExt;
14use std::process::{Command, Stdio};
15use std::sync::Arc;
16
17use anyhow::{Context, Result};
18use bootc_utils::{AsyncCommandRunExt, CommandRunExt, ExitStatusExt};
19use camino::{Utf8Path, Utf8PathBuf};
20use cap_std_ext::cap_std::fs::Dir;
21use cap_std_ext::cap_tempfile::TempDir;
22use cap_std_ext::cmdext::CapStdExtCommandExt;
23use cap_std_ext::dirext::CapStdExtDirExt;
24use cap_std_ext::{cap_std, cap_tempfile};
25use fn_error_context::context;
26use ostree_ext::ostree::{self};
27use std::os::fd::{AsFd, AsRawFd, OwnedFd};
28use tokio::process::Command as AsyncCommand;
29
// Pass only 100 args at a time just to avoid potentially overflowing argument
// vectors; not that this should happen in reality, but just in case.
const SUBCMD_ARGV_CHUNKING: usize = 100;

/// Global directory path which we use for podman to point
/// it at our storage. Unfortunately we can't yet use the
/// /proc/self/fd/N trick because it currently breaks due
/// to how the untar process is forked in the child.
pub(crate) const STORAGE_ALIAS_DIR: &str = "/run/bootc/storage";
/// We pass this via /proc/self/fd to the child process.
const STORAGE_RUN_FD: i32 = 3;

/// Stamp file created in the storage root once SELinux labeling has
/// completed; its presence makes relabeling idempotent.
const LABELED: &str = ".bootc_labeled";

/// The path to the image storage, relative to the bootc root directory.
pub(crate) const SUBPATH: &str = "storage";
/// The path to the "runroot" with transient runtime state; this is
/// relative to the /run directory
const RUNROOT: &str = "bootc/storage";
49
/// A bootc-owned instance of `containers-storage:`.
pub(crate) struct CStorage {
    /// The root directory (used to locate the bootc-owned auth file)
    sysroot: Dir,
    /// The location of container storage (see [`SUBPATH`])
    storage_root: Dir,
    #[allow(dead_code)]
    /// Our runtime state (the podman "runroot", see [`RUNROOT`])
    run: Dir,
    /// Disallow using this across multiple threads concurrently; while we
    /// have internal locking in podman, in the future we may change how
    /// things work here. And we don't have a use case right now for
    /// concurrent operations.
    /// (`Cell` is `!Sync`, which makes the whole struct `!Sync`.)
    _unsync: std::cell::Cell<()>,
}
65
/// Policy for whether [`CStorage::pull`] should contact the registry
/// when the image is already present locally.
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum PullMode {
    /// Pull only if the image is not present
    IfNotExists,
    /// Always check for an update
    #[allow(dead_code)]
    Always,
}
74
/// Arrange for the child process spawned from `cmd` to see `storage_root`
/// bind-mounted at [`STORAGE_ALIAS_DIR`] (inside a private mount namespace,
/// via a `pre_exec` hook) and to inherit `run_root` as fd [`STORAGE_RUN_FD`].
#[allow(unsafe_code)]
#[context("Binding storage roots")]
fn bind_storage_roots(cmd: &mut Command, storage_root: &Dir, run_root: &Dir) -> Result<()> {
    // podman requires an absolute path, for two reasons right now:
    // - It writes the file paths into `db.sql`, a sqlite database for unknown reasons
    // - It forks helper binaries, so just giving it /proc/self/fd won't work as
    //   those helpers may not get the fd passed. (which is also true of skopeo)
    // We create a new mount namespace, which also has the helpful side effect
    // of automatically cleaning up the global bind mount that the storage stack
    // creates.

    // Cloned into Arcs because the pre_exec closure must own its captures
    // (it may run on every spawn of this Command).
    let storage_root = Arc::new(storage_root.try_clone().context("Cloning storage root")?);
    let run_root: Arc<OwnedFd> = Arc::new(run_root.try_clone().context("Cloning runroot")?.into());
    // SAFETY: All the APIs we call here are safe to invoke between fork and exec.
    unsafe {
        cmd.pre_exec(move || {
            use rustix::fs::{Mode, OFlags};
            // For reasons I don't understand, we can't just `mount("/proc/self/fd/N", "/path/to/target")`
            // but it *does* work to fchdir(fd) + mount(".", "/path/to/target").
            // I think it may be that mount doesn't like operating on the magic links?
            // This trick only works if we set our working directory to the target *before*
            // creating the new namespace too.
            //
            // I think we may be hitting this:
            //
            // "       EINVAL A bind operation (MS_BIND) was requested where source referred a mount namespace magic link (i.e., a /proc/pid/ns/mnt magic link or a bind mount to such a link) and the propagation type of the parent mount of target was
            // MS_SHARED, but propagation of the requested bind mount could lead to a circular dependency that might prevent the mount namespace from ever being freed."
            //
            // But...how did we avoid that circular dependency by using the process cwd?
            //
            // I tried making the mounts recursively private, but that didn't help.
            //
            // Save the current working directory so we can restore it afterwards.
            let oldwd = rustix::fs::open(
                ".",
                OFlags::DIRECTORY | OFlags::CLOEXEC | OFlags::RDONLY,
                Mode::empty(),
            )?;
            rustix::process::fchdir(&storage_root)?;
            rustix::thread::unshare_unsafe(rustix::thread::UnshareFlags::NEWNS)?;
            rustix::mount::mount_bind(".", STORAGE_ALIAS_DIR)?;
            rustix::process::fchdir(&oldwd)?;
            Ok(())
        })
    };
    // Pass the runroot to the child at a fixed fd number so the child can
    // reference it as /proc/self/fd/3.
    cmd.take_fd_n(run_root, STORAGE_RUN_FD);
    Ok(())
}
121
/// Initialize a `podman` subprocess with:
/// - storage overridden to point to storage_root
/// - Authentication (auth.json) using the bootc/ostree owned auth
fn new_podman_cmd_in(sysroot: &Dir, storage_root: &Dir, run_root: &Dir) -> Result<Command> {
    let mut cmd = Command::new("podman");
    bind_storage_roots(&mut cmd, storage_root, run_root)?;
    // The runroot fd bound above is visible to the child at a fixed number.
    let run_root = format!("/proc/self/fd/{STORAGE_RUN_FD}");
    cmd.args(["--root", STORAGE_ALIAS_DIR, "--runroot", run_root.as_str()]);

    // The auth config is staged into an anonymous (unlinked) tempfile which is
    // handed to the child purely via fd; it never has a visible path.
    let tmpd = &cap_std::fs::Dir::open_ambient_dir("/tmp", cap_std::ambient_authority())?;
    let mut tempfile = cap_tempfile::TempFile::new_anonymous(tmpd).map(std::io::BufWriter::new)?;

    // Keep this in sync with https://github.com/bootc-dev/containers-image-proxy-rs/blob/b5e0861ad5065f47eaf9cda0d48da3529cc1bc43/src/imageproxy.rs#L310
    // We always override the auth to match the bootc setup.
    let authfile_fd = ostree_ext::globals::get_global_authfile(sysroot)?.map(|v| v.1);
    if let Some(mut fd) = authfile_fd {
        std::io::copy(&mut fd, &mut tempfile)?;
    } else {
        // Note that if there's no bootc-owned auth, then we force an empty authfile to ensure
        // that podman doesn't fall back to searching the user-owned paths.
        tempfile.write_all(b"{}")?;
    }

    // Flush the BufWriter and unwrap down to a std File.
    let tempfile = tempfile
        .into_inner()
        .map_err(|e| e.into_error())?
        .into_std();
    // Pass the tempfile to the child at its current fd number, and point
    // podman's auth lookup at it via the environment.
    let fd: Arc<OwnedFd> = std::sync::Arc::new(tempfile.into());
    let target_fd = fd.as_fd().as_raw_fd();
    cmd.take_fd_n(fd, target_fd);
    cmd.env("REGISTRY_AUTH_FILE", format!("/proc/self/fd/{target_fd}"));

    Ok(cmd)
}
156
157/// Adjust the provided command (skopeo or podman e.g.) to reference
158/// the provided path as an additional image store.
159pub fn set_additional_image_store<'c>(
160    cmd: &'c mut Command,
161    ais: impl AsRef<Utf8Path>,
162) -> &'c mut Command {
163    let ais = ais.as_ref();
164    let storage_opt = format!("additionalimagestore={ais}");
165    cmd.env("STORAGE_OPTS", storage_opt)
166}
167
168/// Ensure that "podman" is the first thing to touch the global storage
169/// instance. This is a workaround for https://github.com/bootc-dev/bootc/pull/1101#issuecomment-2653862974
170/// Basically podman has special upgrade logic for when it is the first thing
171/// to initialize the c/storage instance it sets the networking to netavark.
172/// If it's not the first thing, then it assumes an upgrade scenario and we
173/// may be using CNI.
174///
175/// But this legacy path is triggered through us using skopeo, turning off netavark
176/// by default. Work around this by ensuring that /usr/bin/podman is
177/// always the first thing to touch c/storage (at least, when invoked by us).
178///
179/// Call this function any time we're going to write to containers-storage.
180pub(crate) fn ensure_floating_c_storage_initialized() {
181    if let Err(e) = Command::new("podman")
182        .args(["system", "info"])
183        .stdout(Stdio::null())
184        .run_capture_stderr()
185    {
186        // Out of conservatism we don't make this operation fatal right now.
187        // If something went wrong, then we'll probably fail on a later operation
188        // anyways.
189        tracing::warn!("Failed to query podman system info: {e}");
190    }
191}
192
impl CStorage {
    /// Create a `podman image` Command instance prepared to operate on our alternative
    /// root.
    pub(crate) fn new_image_cmd(&self) -> Result<Command> {
        let mut r = new_podman_cmd_in(&self.sysroot, &self.storage_root, &self.run)?;
        // We want to limit things to only manipulating images by default.
        r.arg("image");
        Ok(r)
    }

    /// Perform one-time global setup required before any storage operation.
    fn init_globals() -> Result<()> {
        // Ensure our global storage alias dir exists
        std::fs::create_dir_all(STORAGE_ALIAS_DIR)
            .with_context(|| format!("Creating {STORAGE_ALIAS_DIR}"))?;
        Ok(())
    }

    /// Ensure that the LSM (SELinux) labels are set on the bootc-owned
    /// containers-storage: instance. We use a `LABELED` stamp file for
    /// idempotence.
    #[context("Labeling imgstorage dirs")]
    fn ensure_labeled(root: &Dir, sepolicy: Option<&ostree::SePolicy>) -> Result<()> {
        // Already labeled on a previous run.
        if root.try_exists(LABELED)? {
            return Ok(());
        }
        // No policy available (e.g. SELinux disabled): nothing to do.
        // Note the stamp file is intentionally not written in this case.
        let Some(sepolicy) = sepolicy else {
            return Ok(());
        };

        // recursively set the labels because they were previously set to usr_t,
        // and there is no policy defined to set them to the c/storage labels
        crate::lsm::relabel_recurse(
            &root,
            ".",
            Some(Utf8Path::new("/var/lib/containers/storage")),
            sepolicy,
        )
        .context("labeling storage root")?;

        root.create(LABELED)?;

        Ok(())
    }

    /// Create the bootc-owned storage instance if it doesn't already exist,
    /// then open it. Initialization happens in a `<subpath>.tmp` directory
    /// which is renamed into place only after podman has successfully
    /// initialized it, so an interrupted run never leaves a half-initialized
    /// store at the final path.
    #[context("Creating imgstorage")]
    pub(crate) fn create(
        sysroot: &Dir,
        run: &Dir,
        sepolicy: Option<&ostree::SePolicy>,
    ) -> Result<Self> {
        Self::init_globals()?;
        let subpath = &Self::subpath();

        // SAFETY: We know there's a parent
        let parent = subpath.parent().unwrap();
        let tmp = format!("{subpath}.tmp");
        if !sysroot
            .try_exists(subpath)
            .with_context(|| format!("Querying {subpath}"))?
        {
            // Clear any leftover tmpdir from a previously interrupted creation.
            sysroot.remove_all_optional(&tmp).context("Removing tmp")?;
            sysroot
                .create_dir_all(parent)
                .with_context(|| format!("Creating {parent}"))?;
            sysroot.create_dir_all(&tmp).context("Creating tmpdir")?;
            let storage_root = sysroot.open_dir(&tmp).context("Open tmp")?;

            // There's no explicit API to initialize a containers-storage:
            // root, simply passing a path will attempt to auto-create it.
            // We run "podman images" in the new root.
            new_podman_cmd_in(&sysroot, &storage_root, &run)?
                .stdout(Stdio::null())
                .arg("images")
                .run_capture_stderr()
                .context("Initializing images")?;
            Self::ensure_labeled(&storage_root, sepolicy)?;
            drop(storage_root);
            // Atomically publish the fully-initialized store.
            sysroot
                .rename(&tmp, sysroot, subpath)
                .context("Renaming tmpdir")?;
            tracing::debug!("Created image store");
        } else {
            // the storage already exists, make sure it has selinux labels
            let storage_root = sysroot.open_dir(subpath).context("opening storage dir")?;
            Self::ensure_labeled(&storage_root, sepolicy)?;
        }

        Self::open(sysroot, run)
    }

    /// Open an existing storage instance, (re)creating the transient runroot
    /// under /run as needed.
    #[context("Opening imgstorage")]
    pub(crate) fn open(sysroot: &Dir, run: &Dir) -> Result<Self> {
        tracing::trace!("Opening container image store");
        Self::init_globals()?;
        let subpath = &Self::subpath();
        let storage_root = sysroot
            .open_dir(subpath)
            .with_context(|| format!("Opening {subpath}"))?;
        // Always auto-create this if missing
        run.create_dir_all(RUNROOT)
            .with_context(|| format!("Creating {RUNROOT}"))?;
        let run = run.open_dir(RUNROOT)?;
        Ok(Self {
            sysroot: sysroot.try_clone()?,
            storage_root,
            run,
            _unsync: Default::default(),
        })
    }

    /// List all images in this storage by running `podman image list
    /// --format=json` and parsing its output.
    #[context("Listing images")]
    pub(crate) async fn list_images(&self) -> Result<Vec<crate::podman::ImageListEntry>> {
        let mut cmd = self.new_image_cmd()?;
        cmd.args(["list", "--format=json"]);
        cmd.stdin(Stdio::null());
        // It's maximally convenient for us to just pipe the whole output to a tempfile
        let mut stdout = tempfile::tempfile()?;
        cmd.stdout(stdout.try_clone()?);
        // Allocate stderr, which is passed to the status checker
        let stderr = tempfile::tempfile()?;
        cmd.stderr(stderr.try_clone()?);

        // Spawn the child and wait
        AsyncCommand::from(cmd)
            .status()
            .await?
            .check_status_with_stderr(stderr)?;
        // Spawn a helper thread to avoid blocking the main thread
        // parsing JSON.
        tokio::task::spawn_blocking(move || -> Result<_> {
            // Rewind: the child left the file offset at end-of-output.
            stdout.seek(std::io::SeekFrom::Start(0))?;
            let stdout = std::io::BufReader::new(stdout);
            let r = serde_json::from_reader(stdout)?;
            Ok(r)
        })
        .await?
    }

    /// Remove every image none of whose names appears in `roots`, returning
    /// the IDs of the removed images. Note that untagged images (no names at
    /// all) are also removed.
    #[context("Pruning")]
    pub(crate) async fn prune_except_roots(&self, roots: &HashSet<&str>) -> Result<Vec<String>> {
        let all_images = self.list_images().await?;
        tracing::debug!("Images total: {}", all_images.len(),);
        let mut garbage = Vec::new();
        for image in all_images {
            // An image is garbage if no name of it is a root; for an image
            // with no names, `all` on the empty iterator is true.
            if image
                .names
                .iter()
                .flatten()
                .all(|name| !roots.contains(name.as_str()))
            {
                garbage.push(image.id);
            }
        }
        tracing::debug!("Images to prune: {}", garbage.len());
        // Remove in bounded batches to keep argv sizes sane.
        for garbage in garbage.chunks(SUBCMD_ARGV_CHUNKING) {
            let mut cmd = self.new_image_cmd()?;
            cmd.stdin(Stdio::null());
            cmd.stdout(Stdio::null());
            cmd.arg("rm");
            cmd.args(garbage);
            AsyncCommand::from(cmd).run().await?;
        }
        Ok(garbage)
    }

    /// Return true if the image exists in the storage.
    pub(crate) async fn exists(&self, image: &str) -> Result<bool> {
        // Sadly https://docs.rs/containers-image-proxy/latest/containers_image_proxy/struct.ImageProxy.html#method.open_image_optional
        // doesn't work with containers-storage yet
        let mut cmd = AsyncCommand::from(self.new_image_cmd()?);
        cmd.args(["exists", image]);
        // `podman image exists` signals presence purely via exit status.
        Ok(cmd.status().await?.success())
    }

    /// Fetch the image if it is not already present; return whether
    /// or not the image was fetched.
    pub(crate) async fn pull(&self, image: &str, mode: PullMode) -> Result<bool> {
        match mode {
            PullMode::IfNotExists => {
                if self.exists(image).await? {
                    tracing::debug!("Image is already present: {image}");
                    return Ok(false);
                }
            }
            PullMode::Always => {}
        };
        let mut cmd = self.new_image_cmd()?;
        cmd.stdin(Stdio::null());
        cmd.stdout(Stdio::null());
        cmd.args(["pull", image]);
        tracing::debug!("Pulling image: {image}");
        let mut cmd = AsyncCommand::from(cmd);
        cmd.run().await.context("Failed to pull image")?;
        Ok(true)
    }

    /// Copy an image from the default container storage (/var/lib/containers/)
    /// to this storage.
    #[context("Pulling from host storage: {image}")]
    pub(crate) async fn pull_from_host_storage(&self, image: &str) -> Result<()> {
        // This podman reads from the *host* storage (default --root), but
        // its runroot/alias bindings target *our* storage as the destination.
        let mut cmd = Command::new("podman");
        cmd.stdin(Stdio::null());
        cmd.stdout(Stdio::null());
        // An ephemeral place for the transient state;
        let temp_runroot = TempDir::new(cap_std::ambient_authority())?;
        bind_storage_roots(&mut cmd, &self.storage_root, &temp_runroot)?;

        // The destination (target stateroot) + container storage dest
        let storage_dest = &format!(
            "containers-storage:[overlay@{STORAGE_ALIAS_DIR}+/proc/self/fd/{STORAGE_RUN_FD}]"
        );
        cmd.args(["image", "push", "--remove-signatures", image])
            .arg(format!("{storage_dest}{image}"));
        let mut cmd = AsyncCommand::from(cmd);
        cmd.run().await?;
        // Explicitly close (and delete) the tempdir so errors are surfaced.
        temp_runroot.close()?;
        Ok(())
    }

    /// The absolute path of the image storage (BOOTC_ROOT + [`SUBPATH`]).
    pub(crate) fn subpath() -> Utf8PathBuf {
        Utf8Path::new(crate::store::BOOTC_ROOT).join(SUBPATH)
    }
}
416
#[cfg(test)]
mod tests {
    use super::*;

    // Compile-time guarantee: `CStorage` must not be `Sync` (enforced by the
    // `_unsync: Cell<()>` field), since concurrent use is unsupported.
    static_assertions::assert_not_impl_any!(CStorage: Sync);
}