ostree_ext/chunking.rs

//! Split an OSTree commit into separate chunks

// SPDX-License-Identifier: Apache-2.0 OR MIT

use std::borrow::{Borrow, Cow};
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Write;
use std::hash::{Hash, Hasher};
use std::num::NonZeroU32;
use std::rc::Rc;
use std::time::Instant;

use crate::container::{COMPONENT_SEPARATOR, CONTENT_ANNOTATION};
use crate::objectsource::{ContentID, ObjectMeta, ObjectMetaMap, ObjectSourceMeta};
use crate::objgv::*;
use crate::statistics;
use anyhow::{Result, anyhow};
use camino::{Utf8Path, Utf8PathBuf};
use containers_image_proxy::oci_spec;
use gvariant::aligned_bytes::TryAsAligned;
use gvariant::{Marker, Structure};
use indexmap::IndexMap;
use ostree::{gio, glib};
use serde::{Deserialize, Serialize};

/// Maximum number of layers (chunks) we will use.
// We take half the limit of 128.
// https://github.com/ostreedev/ostree-rs-ext/issues/69
pub(crate) const MAX_CHUNKS: u32 = 64;
/// Minimum number of layers we can create in a "chunked" flow; otherwise
/// we will just drop down to one.
const MIN_CHUNKED_LAYERS: u32 = 4;

/// A convenient alias for a reference-counted, immutable string.
pub(crate) type RcStr = Rc<str>;
/// Maps from a checksum to its size and file names (multiple in the case of
/// hard links).
pub(crate) type ChunkMapping = BTreeMap<RcStr, (u64, Vec<Utf8PathBuf>)>;
// TODO type PackageSet = HashSet<RcStr>;

const LOW_PARTITION: &str = "2ls";
const HIGH_PARTITION: &str = "1hs";

#[derive(Debug, Default)]
pub(crate) struct Chunk {
    pub(crate) name: String,
    pub(crate) content: ChunkMapping,
    pub(crate) size: u64,
    pub(crate) packages: Vec<String>,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
/// Object metadata, but with additional size data
pub struct ObjectSourceMetaSized {
    /// The original metadata
    #[serde(flatten)]
    pub meta: ObjectSourceMeta,
    /// Total size of associated objects
    pub size: u64,
}

impl Hash for ObjectSourceMetaSized {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.meta.identifier.hash(state);
    }
}

impl Eq for ObjectSourceMetaSized {}

impl PartialEq for ObjectSourceMetaSized {
    fn eq(&self, other: &Self) -> bool {
        self.meta.identifier == other.meta.identifier
    }
}

/// Extend content source metadata with sizes.
#[derive(Debug)]
pub struct ObjectMetaSized {
    /// Mapping from content object to source.
    pub map: ObjectMetaMap,
    /// Computed sizes of each content source
    pub sizes: Vec<ObjectSourceMetaSized>,
}

impl ObjectMetaSized {
    /// Given object metadata and a repo, compute the size of each content source.
    pub fn compute_sizes(repo: &ostree::Repo, meta: ObjectMeta) -> Result<ObjectMetaSized> {
        let cancellable = gio::Cancellable::NONE;
        // Destructure into component parts; we'll create the version with sizes
        let map = meta.map;
        let mut set = meta.set;
        // Maps content id -> total size of associated objects
        let mut sizes = BTreeMap::<&str, u64>::new();
        // Populate the size mapping above, iterating over the object -> contentid mapping
        for (checksum, contentid) in map.iter() {
            let finfo = repo.query_file(checksum, cancellable)?.0;
            let sz = sizes.entry(contentid).or_default();
            *sz += finfo.size() as u64;
        }
        // Combine data from sizes and the content mapping.
        let sized: Result<Vec<_>> = sizes
            .into_iter()
            .map(|(id, size)| -> Result<ObjectSourceMetaSized> {
                set.take(id)
                    .ok_or_else(|| anyhow!("Failed to find {} in content set", id))
                    .map(|meta| ObjectSourceMetaSized { meta, size })
            })
            .collect();
        let mut sizes = sized?;
        sizes.sort_by(|a, b| b.size.cmp(&a.size));
        Ok(ObjectMetaSized { map, sizes })
    }
}
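
// A minimal usage sketch for `ObjectMetaSized::compute_sizes` above; the repo
// path is hypothetical, and `meta` would normally be gathered from package
// (e.g. rpm) metadata:
//
//     let repo = ostree::Repo::new_for_path("/ostree/repo");
//     repo.open(gio::Cancellable::NONE)?;
//     let sized = ObjectMetaSized::compute_sizes(&repo, meta)?;
//     // `sizes` is sorted largest-first, so the biggest content source is:
//     if let Some(largest) = sized.sizes.first() {
//         println!("{}: {}", largest.meta.identifier, largest.size);
//     }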

/// How to split up an ostree commit into "chunks" - designed to map to container image layers.
#[derive(Debug, Default)]
pub struct Chunking {
    pub(crate) metadata_size: u64,
    pub(crate) remainder: Chunk,
    pub(crate) chunks: Vec<Chunk>,

    pub(crate) max: u32,

    processed_mapping: bool,
    /// Number of components (e.g. packages) provided originally
    pub(crate) n_provided_components: u32,
    /// The above, but only ones with non-zero size
    pub(crate) n_sized_components: u32,
}

#[derive(Default)]
struct Generation {
    path: Utf8PathBuf,
    metadata_size: u64,
    dirtree_found: BTreeSet<RcStr>,
    dirmeta_found: BTreeSet<RcStr>,
}

fn push_dirmeta(repo: &ostree::Repo, generation: &mut Generation, checksum: &str) -> Result<()> {
    // Consult the dirmeta set (not the dirtree one) so each dirmeta object is
    // counted exactly once.
    if generation.dirmeta_found.contains(checksum) {
        return Ok(());
    }
    let checksum = RcStr::from(checksum);
    generation.dirmeta_found.insert(RcStr::clone(&checksum));
    let child_v = repo.load_variant(ostree::ObjectType::DirMeta, checksum.borrow())?;
    generation.metadata_size += child_v.data_as_bytes().as_ref().len() as u64;
    Ok(())
}

fn push_dirtree(
    repo: &ostree::Repo,
    generation: &mut Generation,
    checksum: &str,
) -> Result<glib::Variant> {
    let child_v = repo.load_variant(ostree::ObjectType::DirTree, checksum)?;
    // On first sight of a dirtree, account for its size and record it so that
    // repeated references don't inflate `metadata_size`.
    if !generation.dirtree_found.contains(checksum) {
        generation.metadata_size += child_v.data_as_bytes().as_ref().len() as u64;
        generation.dirtree_found.insert(RcStr::from(checksum));
    }
    Ok(child_v)
}

fn generate_chunking_recurse(
    repo: &ostree::Repo,
    generation: &mut Generation,
    chunk: &mut Chunk,
    dt: &glib::Variant,
) -> Result<()> {
    let dt = dt.data_as_bytes();
    let dt = dt.try_as_aligned()?;
    let dt = gv_dirtree!().cast(dt);
    let (files, dirs) = dt.to_tuple();
    // A reusable buffer to avoid heap allocating these
    let mut hexbuf = [0u8; 64];
    for file in files {
        let (name, csum) = file.to_tuple();
        let fpath = generation.path.join(name.to_str());
        hex::encode_to_slice(csum, &mut hexbuf)?;
        let checksum = std::str::from_utf8(&hexbuf)?;
        let meta = repo.query_file(checksum, gio::Cancellable::NONE)?.0;
        let size = meta.size() as u64;
        let entry = chunk.content.entry(RcStr::from(checksum)).or_default();
        entry.0 = size;
        let first = entry.1.is_empty();
        if first {
            chunk.size += size;
        }
        entry.1.push(fpath);
    }
    for item in dirs {
        let (name, contents_csum, meta_csum) = item.to_tuple();
        let name = name.to_str();
        // Extend our current path
        generation.path.push(name);
        hex::encode_to_slice(contents_csum, &mut hexbuf)?;
        let checksum_s = std::str::from_utf8(&hexbuf)?;
        let dirtree_v = push_dirtree(repo, generation, checksum_s)?;
        generate_chunking_recurse(repo, generation, chunk, &dirtree_v)?;
        drop(dirtree_v);
        hex::encode_to_slice(meta_csum, &mut hexbuf)?;
        let checksum_s = std::str::from_utf8(&hexbuf)?;
        push_dirmeta(repo, generation, checksum_s)?;
        // We did a push above, so pop must succeed.
        assert!(generation.path.pop());
    }
    Ok(())
}

impl Chunk {
    fn new(name: &str) -> Self {
        Chunk {
            name: name.to_string(),
            ..Default::default()
        }
    }

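    // Sketch of `move_obj` semantics below (illustrative checksum): if `self`
    // holds {"abc…" -> (4096, ["/usr/bin/foo", "/usr/bin/bar"])}, moving "abc…"
    // into `dest` transfers the whole entry (all hard-linked paths) and 4096
    // bytes of accounted size in one step; a second call for the same checksum
    // returns false.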
    pub(crate) fn move_obj(&mut self, dest: &mut Self, checksum: &str) -> bool {
        // In most cases, we expect the object to exist in the source.  However, it's
        // convenient here to simply ignore objects which were already moved into
        // a chunk.
        if let Some((name, (size, paths))) = self.content.remove_entry(checksum) {
            let v = dest.content.insert(name, (size, paths));
            debug_assert!(v.is_none());
            self.size -= size;
            dest.size += size;
            true
        } else {
            false
        }
    }

    pub(crate) fn move_path(&mut self, dest: &mut Self, checksum: &str, path: &Utf8Path) {
        if let Some((_size, paths)) = self.content.get_mut(checksum) {
            let path_index = paths.iter().position(|p| *p == path);
            if let Some(index) = path_index {
                let removed_path = paths.remove(index);

                let dest_entry = dest
                    .content
                    .entry(RcStr::from(checksum))
                    .or_insert((0, Vec::new()));
                dest_entry.1.push(removed_path);

                if paths.is_empty() {
                    self.content.remove(checksum);
                }
            }
        }
    }
}

impl Chunking {
    /// Creates a reverse map from content IDs to checksums
    fn create_content_id_map(
        map: &IndexMap<String, ContentID>,
    ) -> IndexMap<ContentID, Vec<&String>> {
        let mut rmap = IndexMap::<ContentID, Vec<&String>>::new();
        for (checksum, contentid) in map.iter() {
            rmap.entry(Rc::clone(contentid)).or_default().push(checksum);
        }
        rmap
    }
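
    // For example (hypothetical IDs): `create_content_id_map` above inverts
    // {"csum_a" -> pkgX, "csum_b" -> pkgX, "csum_c" -> pkgY} into
    // {pkgX -> ["csum_a", "csum_b"], pkgY -> ["csum_c"]}, so every object
    // owned by one content source can be looked up in a single step.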

    /// Generate an initial single chunk.
    pub fn new(repo: &ostree::Repo, rev: &str) -> Result<Self> {
        // Find the target commit
        let rev = repo.require_rev(rev)?;

        // Load and parse the commit object
        let (commit_v, _) = repo.load_commit(&rev)?;
        let commit_v = commit_v.data_as_bytes();
        let commit_v = commit_v.try_as_aligned()?;
        let commit = gv_commit!().cast(commit_v);
        let commit = commit.to_tuple();

        // Load it all into a single chunk
        let mut generation = Generation {
            path: Utf8PathBuf::from("/"),
            ..Default::default()
        };
        let mut chunk: Chunk = Default::default();

        // Find the root directory tree
        let contents_checksum = &hex::encode(commit.6);
        let contents_v = repo.load_variant(ostree::ObjectType::DirTree, contents_checksum)?;
        push_dirtree(repo, &mut generation, contents_checksum)?;
        let meta_checksum = &hex::encode(commit.7);
        push_dirmeta(repo, &mut generation, meta_checksum.as_str())?;

        generate_chunking_recurse(repo, &mut generation, &mut chunk, &contents_v)?;

        let chunking = Chunking {
            metadata_size: generation.metadata_size,
            remainder: chunk,
            ..Default::default()
        };
        Ok(chunking)
    }

    /// Generate a chunking from an object mapping.
    pub fn from_mapping(
        repo: &ostree::Repo,
        rev: &str,
        meta: &ObjectMetaSized,
        max_layers: &Option<NonZeroU32>,
        prior_build_metadata: Option<&oci_spec::image::ImageManifest>,
        specific_contentmeta: Option<&BTreeMap<ContentID, Vec<(Utf8PathBuf, String)>>>,
    ) -> Result<Self> {
        let mut r = Self::new(repo, rev)?;
        r.process_mapping(meta, max_layers, prior_build_metadata, specific_contentmeta)?;
        Ok(r)
    }
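
    // A minimal end-to-end sketch for `from_mapping` above (the ref is
    // hypothetical, and `sized` would come from `ObjectMetaSized::compute_sizes`):
    //
    //     let chunking = Chunking::from_mapping(
    //         &repo,
    //         "exampleos/x86_64/stable",
    //         &sized,
    //         &NonZeroU32::new(MAX_CHUNKS),
    //         None, // no prior build to derive the layout from
    //         None, // no components requesting exclusive layers
    //     )?;
    //     chunking.print();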

    fn remaining(&self) -> u32 {
        self.max.saturating_sub(self.chunks.len() as u32)
    }

    /// Given metadata about which objects are owned by a particular content source,
    /// generate chunks that group together those objects.
    #[allow(clippy::or_fun_call)]
    pub fn process_mapping(
        &mut self,
        meta: &ObjectMetaSized,
        max_layers: &Option<NonZeroU32>,
        prior_build_metadata: Option<&oci_spec::image::ImageManifest>,
        specific_contentmeta: Option<&BTreeMap<ContentID, Vec<(Utf8PathBuf, String)>>>,
    ) -> Result<()> {
        self.max = max_layers
            .unwrap_or(NonZeroU32::new(MAX_CHUNKS).unwrap())
            .get();

        let sizes = &meta.sizes;
        // It doesn't make sense to handle multiple mappings
        assert!(!self.processed_mapping);
        self.processed_mapping = true;
        let remaining = self.remaining();
        if remaining == 0 {
            return Ok(());
        }

        // Create exclusive chunks first if specified
        let mut processed_specific_components = BTreeSet::new();
        if let Some(specific_meta) = specific_contentmeta {
            for (component, files) in specific_meta {
                let mut chunk = Chunk::new(&component);
                chunk.packages = vec![component.to_string()];

                // Move all objects belonging to this exclusive component
                for (path, checksum) in files {
                    self.remainder
                        .move_path(&mut chunk, checksum.as_str(), path);
                }

                self.chunks.push(chunk);
                processed_specific_components.insert(component.clone());
            }
        }

        // Safety: Let's assume no one has over 4 billion components.
        self.n_provided_components = meta.sizes.len().try_into().unwrap();
        self.n_sized_components = sizes
            .iter()
            .filter(|v| v.size > 0)
            .count()
            .try_into()
            .unwrap();

        // Filter out exclusive components for regular packing
        let regular_sizes: Vec<ObjectSourceMetaSized> = sizes
            .iter()
            .filter(|component| {
                !processed_specific_components.contains(&*component.meta.identifier)
            })
            .cloned()
            .collect();

        let rmap = Self::create_content_id_map(&meta.map);

        // Process regular components with bin packing if we have remaining layers
        if let Some(remaining) = NonZeroU32::new(self.remaining()) {
            let start = Instant::now();
            let packing = basic_packing(&regular_sizes, remaining, prior_build_metadata)?;
            let duration = start.elapsed();
            tracing::debug!("Time elapsed in packing: {:#?}", duration);

            for bin in packing.into_iter() {
                let name = match bin.len() {
                    0 => Cow::Borrowed("Reserved for new packages"),
                    1 => {
                        let first = bin[0];
                        let first_name = &*first.meta.identifier;
                        Cow::Borrowed(first_name)
                    }
                    2..=5 => {
                        let first = bin[0];
                        let first_name = &*first.meta.identifier;
                        let r = bin.iter().map(|v| &*v.meta.identifier).skip(1).fold(
                            String::from(first_name),
                            |mut acc, v| {
                                write!(acc, " and {v}").unwrap();
                                acc
                            },
                        );
                        Cow::Owned(r)
                    }
                    n => Cow::Owned(format!("{n} components")),
                };
                let mut chunk = Chunk::new(&name);
                chunk.packages = bin.iter().map(|v| String::from(&*v.meta.name)).collect();
                for szmeta in bin {
                    for &obj in rmap.get(&szmeta.meta.identifier).unwrap() {
                        self.remainder.move_obj(&mut chunk, obj.as_str());
                    }
                }
                self.chunks.push(chunk);
            }
        }

        // Check that all objects have been processed
        if !processed_specific_components.is_empty() || !regular_sizes.is_empty() {
            assert_eq!(self.remainder.content.len(), 0);
        }

        Ok(())
    }

    pub(crate) fn take_chunks(&mut self) -> Vec<Chunk> {
        let mut r = Vec::new();
        std::mem::swap(&mut self.chunks, &mut r);
        r
    }

    /// Print information about chunking to standard output.
    pub fn print(&self) {
        println!("Metadata: {}", glib::format_size(self.metadata_size));
        if self.n_provided_components > 0 {
            println!(
                "Components: provided={} sized={}",
                self.n_provided_components, self.n_sized_components
            );
        }
        for (n, chunk) in self.chunks.iter().enumerate() {
            let sz = glib::format_size(chunk.size);
            println!(
                "Chunk {}: \"{}\": objects:{} size:{}",
                n,
                chunk.name,
                chunk.content.len(),
                sz
            );
        }
        if !self.remainder.content.is_empty() {
            let sz = glib::format_size(self.remainder.size);
            println!(
                "Remainder: \"{}\": objects:{} size:{}",
                self.remainder.name,
                self.remainder.content.len(),
                sz
            );
        }
    }
}

#[cfg(test)]
fn components_size(components: &[&ObjectSourceMetaSized]) -> u64 {
    components.iter().map(|k| k.size).sum()
}

/// Compute the total size of a packing
#[cfg(test)]
fn packing_size(packing: &[Vec<&ObjectSourceMetaSized>]) -> u64 {
    packing.iter().map(|v| components_size(v)).sum()
}

/// Given a certain threshold, divide a list of packages into all combinations
/// of (high, medium, low) size and (high, medium, low) change frequency using
/// the following outlier detection methods:
/// - Median and Median Absolute Deviation Method
///   Aggressively detects outliers in size and classifies them as
///   high, medium, or low. The high-size and low-size packages form separate
///   partitions and deserve bins of their own.
/// - Mean and Standard Deviation Method
///   The medium partition from the previous step is less aggressively
///   classified using the mean for both size and frequency.
///
/// Note: Assumes `components` is sorted by descending size.
fn get_partitions_with_threshold<'a>(
    components: &[&'a ObjectSourceMetaSized],
    limit_hs_bins: usize,
    threshold: f64,
) -> Option<BTreeMap<String, Vec<&'a ObjectSourceMetaSized>>> {
    let mut partitions: BTreeMap<String, Vec<&ObjectSourceMetaSized>> = BTreeMap::new();
    let mut med_size: Vec<&ObjectSourceMetaSized> = Vec::new();
    let mut high_size: Vec<&ObjectSourceMetaSized> = Vec::new();

    let mut sizes: Vec<u64> = components.iter().map(|a| a.size).collect();
    let (median_size, mad_size) = statistics::median_absolute_deviation(&mut sizes)?;

    // We use abs here to ensure the lower limit stays positive
    let size_low_limit = 0.5 * f64::abs(median_size - threshold * mad_size);
    let size_high_limit = median_size + threshold * mad_size;

    for pkg in components {
        let size = pkg.size as f64;

        // high size (hs)
        if size >= size_high_limit {
            high_size.push(pkg);
        }
        // low size (ls)
        else if size <= size_low_limit {
            partitions
                .entry(LOW_PARTITION.to_string())
                .and_modify(|bin| bin.push(pkg))
                .or_insert_with(|| vec![pkg]);
        }
        // medium size (ms)
        else {
            med_size.push(pkg);
        }
    }

    // Extra high-size packages
    let mut remaining_pkgs: Vec<_> = if high_size.len() <= limit_hs_bins {
        Vec::new()
    } else {
        high_size.drain(limit_hs_bins..).collect()
    };
    assert!(high_size.len() <= limit_hs_bins);

    // Concatenate the extra high-size packages with med_size to keep the list sorted by descending size
    remaining_pkgs.append(&mut med_size);
    partitions.insert(HIGH_PARTITION.to_string(), high_size);

    // Sort ascending by frequency, so each partition within medium-size stays frequency-sorted
    remaining_pkgs.sort_by(|a, b| {
        a.meta
            .change_frequency
            .partial_cmp(&b.meta.change_frequency)
            .unwrap()
    });
    let med_sizes: Vec<u64> = remaining_pkgs.iter().map(|a| a.size).collect();
    let med_frequencies: Vec<u64> = remaining_pkgs
        .iter()
        .map(|a| a.meta.change_frequency.into())
        .collect();

    let med_mean_freq = statistics::mean(&med_frequencies)?;
    let med_stddev_freq = statistics::std_deviation(&med_frequencies)?;
    let med_mean_size = statistics::mean(&med_sizes)?;
    let med_stddev_size = statistics::std_deviation(&med_sizes)?;

    // We use abs to avoid the lower limit being negative
    let med_freq_low_limit = 0.5f64 * f64::abs(med_mean_freq - threshold * med_stddev_freq);
    let med_freq_high_limit = med_mean_freq + threshold * med_stddev_freq;
    let med_size_low_limit = 0.5f64 * f64::abs(med_mean_size - threshold * med_stddev_size);
    let med_size_high_limit = med_mean_size + threshold * med_stddev_size;

    for pkg in remaining_pkgs {
        let size = pkg.size as f64;
        let freq = pkg.meta.change_frequency as f64;

        let size_name;
        if size >= med_size_high_limit {
            size_name = "hs";
        } else if size <= med_size_low_limit {
            size_name = "ls";
        } else {
            size_name = "ms";
        }

        // Numbered to maintain order of partitions in a BTreeMap of hf, mf, lf
        let freq_name;
        if freq >= med_freq_high_limit {
            freq_name = "3hf";
        } else if freq <= med_freq_low_limit {
            freq_name = "5lf";
        } else {
            freq_name = "4mf";
        }

        let bucket = format!("{freq_name}_{size_name}");
        partitions
            .entry(bucket.to_string())
            .and_modify(|bin| bin.push(pkg))
            .or_insert_with(|| vec![pkg]);
    }

    for (name, pkgs) in &partitions {
        tracing::debug!("{:#?}: {:#?}", name, pkgs.len());
    }

    Some(partitions)
}
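
// Worked example for `get_partitions_with_threshold` above (illustrative
// numbers, assuming `statistics::median_absolute_deviation` returns the plain,
// unscaled MAD): for sizes [100, 90, 10, 9, 8, 1, 1] the median is 9 and the
// MAD is 8, so with threshold = 2 the high limit is 9 + 2*8 = 25 and the low
// limit is 0.5 * |9 - 2*8| = 3.5. Sizes 100 and 90 land in the high-size
// partition, the two 1s in the low-size partition, and 10, 9, 8 are medium.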

/// If the current rpm-ostree commit to be encapsulated is not one in which the packing
/// structure changes, then:
///  - Flatten out prior_build_metadata to view all the packages in the prior build as a single vec
///  - Compare the flattened vector to components to see which pkgs were added, updated,
///    removed, or kept the same
///  - If pkgs were added, add them to the last bin of the prior build
///  - If pkgs were removed, remove them from prior[i]
///  - Iterate through prior[i] and make bins according to the name in the nevra of pkgs to
///    update the required packages
/// Otherwise (the packing structure is to be changed || no prior build was specified):
///  - Recompute the optimal packing structure (compute partitions, place packages and optimize the build)
///
/// A return value of `Ok(None)` indicates there was not an error, but the prior packing structure
/// is incompatible in some way with the current build.
fn basic_packing_with_prior_build<'a>(
    components: &'a [ObjectSourceMetaSized],
    bin_size: NonZeroU32,
    prior_build: &oci_spec::image::ImageManifest,
) -> Result<Option<Vec<Vec<&'a ObjectSourceMetaSized>>>> {
    let before_processing_pkgs_len = components.len();

    tracing::debug!("Attempting to use old package structure");

    // The first layer is the ostree commit, which will always be different for different builds,
    // so we ignore it.  For the remaining layers, extract the components/packages in each one.
    let curr_build: Result<Vec<Vec<String>>> = prior_build
        .layers()
        .iter()
        .skip(1)
        .map(|layer| -> Result<_> {
            let annotation_layer = layer
                .annotations()
                .as_ref()
                .and_then(|annos| annos.get(CONTENT_ANNOTATION))
                .ok_or_else(|| anyhow!("Missing {CONTENT_ANNOTATION} on prior build"))?;
            Ok(annotation_layer
                .split(COMPONENT_SEPARATOR)
                .map(ToOwned::to_owned)
                .collect())
        })
        .collect();
    let mut curr_build = curr_build?;

    if (bin_size.get() as usize) < curr_build.len() {
        tracing::debug!("bin_size = {bin_size} is too small to be compatible with the prior build");
        return Ok(None);
    }

    // View the packages as unordered sets for lookups and differencing
    let prev_pkgs_set: BTreeSet<String> = curr_build
        .iter()
        .flat_map(|v| v.iter().cloned())
        .filter(|name| !name.is_empty())
        .collect();
    let curr_pkgs_set: BTreeSet<String> = components
        .iter()
        .map(|pkg| pkg.meta.name.to_string())
        .collect();

    // Added packages are included in the last bin, which was reserved as space for them.
    if let Some(last_bin) = curr_build.last_mut() {
        let added = curr_pkgs_set.difference(&prev_pkgs_set);
        last_bin.retain(|name| !name.is_empty());
        last_bin.extend(added.into_iter().cloned());
    } else {
        tracing::debug!("No empty last bin for added packages.");
        return Ok(None);
    }

    // Handle removed packages
    let removed: BTreeSet<&String> = prev_pkgs_set.difference(&curr_pkgs_set).collect();
    for bin in curr_build.iter_mut() {
        bin.retain(|pkg| !removed.contains(pkg));
    }

    // Handle updated packages
    let mut name_to_component: BTreeMap<String, &ObjectSourceMetaSized> = BTreeMap::new();
    for component in components.iter() {
        name_to_component
            .entry(component.meta.name.to_string())
            .or_insert(component);
    }
    let mut modified_build: Vec<Vec<&ObjectSourceMetaSized>> = Vec::new();
    for bin in curr_build {
        let mut mod_bin = Vec::new();
        for pkg in bin {
            // An empty component set can happen for the ostree commit layer; ignore that.
            if pkg.is_empty() {
                continue;
            }
            mod_bin.push(name_to_component[&pkg]);
        }
        modified_build.push(mod_bin);
    }

    // Verify all packages are included
    let after_processing_pkgs_len: usize = modified_build.iter().map(|b| b.len()).sum();
    assert_eq!(after_processing_pkgs_len, before_processing_pkgs_len);
    assert!(modified_build.len() <= bin_size.get() as usize);
    Ok(Some(modified_build))
}
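
// Sketch of the prior-build input consumed above (hypothetical package names,
// assuming COMPONENT_SEPARATOR renders as a comma): every layer after the
// ostree commit carries a CONTENT_ANNOTATION such as
//
//     "pkgA,pkgB,pkgC"
//
// which the code above splits back into ["pkgA", "pkgB", "pkgC"]; the commit
// layer itself has no component list, which is why it is skipped.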

/// Given a set of components with size metadata (e.g. boxes of a certain size)
/// and a number of bins (possible container layers) to use, determine which components
/// go in which bin.  This algorithm is pretty simple:
/// Total available bins = n
///
/// 1 bin for all the u32_max frequency pkgs
/// 1 bin for all newly added pkgs
/// 1 bin for all low size pkgs
///
/// 60% of n-3 bins for high size pkgs
/// 40% of n-3 bins for medium size pkgs
///
/// If there are more HS pkgs than HS bins, the extras spill over into MS
/// If there are more MS bins than the limit, fold by merging 2 bins from the end
///
fn basic_packing<'a>(
    components: &'a [ObjectSourceMetaSized],
    bin_size: NonZeroU32,
    prior_build_metadata: Option<&oci_spec::image::ImageManifest>,
) -> Result<Vec<Vec<&'a ObjectSourceMetaSized>>> {
    const HIGH_SIZE_CUTOFF: f32 = 0.6;
    let before_processing_pkgs_len = components.len();

    anyhow::ensure!(bin_size.get() >= MIN_CHUNKED_LAYERS);

    // If we have a prior build, then try to use that.
    if let Some(prior_build) = prior_build_metadata {
        if let Some(packing) = basic_packing_with_prior_build(components, bin_size, prior_build)? {
            tracing::debug!("Keeping old package structure");
            return Ok(packing);
        } else {
            tracing::debug!(
                "Failed to use old package structure; discarding info from prior build."
            );
        }
    }

    tracing::debug!("Creating new packing structure");

    // If there are fewer packages/components than there are bins, then we don't need to do
    // any "bin packing" at all; just assign a single component to each and we're done.
    if before_processing_pkgs_len < bin_size.get() as usize {
        let mut r = components.iter().map(|pkg| vec![pkg]).collect::<Vec<_>>();
        if before_processing_pkgs_len > 0 {
            let new_pkgs_bin: Vec<&ObjectSourceMetaSized> = Vec::new();
            r.push(new_pkgs_bin);
        }
        return Ok(r);
    }

    let mut r = Vec::new();
    // Split off the components which are "max frequency".
    let (components, max_freq_components) = components
        .iter()
        .partition::<Vec<_>, _>(|pkg| pkg.meta.change_frequency != u32::MAX);
    if !components.is_empty() {
        // Given a total number of bins (layers), compute how many should be assigned to our
        // partitioning based on size and frequency.
        let limit_ls_bins = 1usize;
        let limit_new_bins = 1usize;
        let _limit_new_pkgs = 0usize;
        let limit_max_frequency_pkgs = max_freq_components.len();
        let limit_max_frequency_bins = limit_max_frequency_pkgs.min(1);
        let low_and_other_bin_limit = limit_ls_bins + limit_new_bins + limit_max_frequency_bins;
        let limit_hs_bins = (HIGH_SIZE_CUTOFF
            * (bin_size.get() - low_and_other_bin_limit as u32) as f32)
            .floor() as usize;
        let limit_ms_bins =
            (bin_size.get() - (limit_hs_bins + low_and_other_bin_limit) as u32) as usize;
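        // Worked budget (illustrative): with bin_size = 64 and one max-frequency
        // bin, low_and_other_bin_limit = 1 (ls) + 1 (new) + 1 (max-freq) = 3, so
        // limit_hs_bins = floor(0.6 * 61) = 36 and limit_ms_bins = 64 - (36 + 3) = 25.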
        let partitions = get_partitions_with_threshold(&components, limit_hs_bins, 2f64)
            .expect("Partitioning components into sets");

        // Compute how many low-sized package/components we have.
        let low_sized_component_count = partitions
            .get(LOW_PARTITION)
            .map(|p| p.len())
            .unwrap_or_default();

        // Approximate number of components we should have per medium-size bin.
        let pkg_per_bin_ms: usize = (components.len() - limit_hs_bins - low_sized_component_count)
            .checked_div(limit_ms_bins)
            .ok_or_else(|| anyhow::anyhow!("number of bins should be >= {}", MIN_CHUNKED_LAYERS))?;

        // Bins assignment
        for (partition, pkgs) in partitions.iter() {
            if partition == HIGH_PARTITION {
                for pkg in pkgs {
                    r.push(vec![*pkg]);
                }
            } else if partition == LOW_PARTITION {
                let mut bin: Vec<&ObjectSourceMetaSized> = Vec::new();
                for pkg in pkgs {
                    bin.push(*pkg);
                }
                r.push(bin);
            } else {
                let mut bin: Vec<&ObjectSourceMetaSized> = Vec::new();
                for (i, pkg) in pkgs.iter().enumerate() {
                    if bin.len() < pkg_per_bin_ms {
                        bin.push(*pkg);
                    } else {
                        r.push(bin.clone());
                        bin.clear();
                        bin.push(*pkg);
                    }
                    if i == pkgs.len() - 1 && !bin.is_empty() {
                        r.push(bin.clone());
                        bin.clear();
                    }
                }
            }
        }
        tracing::debug!("Bins before optimization: {}", r.len());

        // Despite allocating a certain number of pkgs per bin in the medium-size partitions,
        // the hard limit on the number of medium-size bins can be exceeded. This is because
        // pkg_per_bin_ms is only an upper limit and there is no lower limit. Thus, if a
        // medium-size partition has only 1 pkg but pkg_per_bin_ms > 1, then the entire bin
        // will have 1 pkg. This prevents partition mixing.
        //
        // Address the medium-size bin limit breach by merging internal MS partitions.
        // The medium-size partitions are merged beginning from the end so as not to mix
        // high-frequency bins with low-frequency bins. The bins are kept in this order:
        // high-frequency, medium-frequency, low-frequency.
        while r.len() > (bin_size.get() as usize - limit_new_bins - limit_max_frequency_bins) {
            for i in (limit_ls_bins + limit_hs_bins..r.len() - 1)
                .step_by(2)
                .rev()
            {
                if r.len() <= (bin_size.get() as usize - limit_new_bins - limit_max_frequency_bins)
                {
                    break;
                }
                let prev = &r[i - 1];
                let curr = &r[i];
                let mut merge: Vec<&ObjectSourceMetaSized> = Vec::new();
                merge.extend(prev.iter());
                merge.extend(curr.iter());
                r.remove(i);
                r.remove(i - 1);
                r.insert(i, merge);
            }
        }
        tracing::debug!("Bins after optimization: {}", r.len());
    }

    if !max_freq_components.is_empty() {
        r.push(max_freq_components);
    }

    // Allocate an empty bin for new packages
    r.push(Vec::new());
    let after_processing_pkgs_len = r.iter().map(|b| b.len()).sum::<usize>();
    assert_eq!(after_processing_pkgs_len, before_processing_pkgs_len);
    assert!(r.len() <= bin_size.get() as usize);
    Ok(r)
}

#[cfg(test)]
mod test {
    use super::*;

    use oci_spec::image as oci_image;
    use std::str::FromStr;

    const FCOS_CONTENTMETA: &[u8] = include_bytes!("fixtures/fedora-coreos-contentmeta.json.gz");
    const SHA256_EXAMPLE: &str =
        "sha256:0000111122223333444455556666777788889999aaaabbbbccccddddeeeeffff";

    #[test]
    fn test_packing_basics() -> Result<()> {
        // null cases
        for v in [4, 7].map(|v| NonZeroU32::new(v).unwrap()) {
            assert_eq!(basic_packing(&[], v, None).unwrap().len(), 0);
        }
        Ok(())
    }

    #[test]
    fn test_packing_fcos() -> Result<()> {
        let contentmeta: Vec<ObjectSourceMetaSized> =
            serde_json::from_reader(flate2::read::GzDecoder::new(FCOS_CONTENTMETA))?;
        let total_size = contentmeta.iter().map(|v| v.size).sum::<u64>();

        let packing =
            basic_packing(&contentmeta, NonZeroU32::new(MAX_CHUNKS).unwrap(), None).unwrap();
        assert!(!contentmeta.is_empty());
        // We should fit into the assigned chunk size
        assert_eq!(packing.len() as u32, MAX_CHUNKS);
        // And verify that the sizes match
        let packed_total_size = packing_size(&packing);
        assert_eq!(total_size, packed_total_size);
        Ok(())
    }

    #[test]
    fn test_packing_one_layer() -> Result<()> {
        let contentmeta: Vec<ObjectSourceMetaSized> =
            serde_json::from_reader(flate2::read::GzDecoder::new(FCOS_CONTENTMETA))?;
        let r = basic_packing(&contentmeta, NonZeroU32::new(1).unwrap(), None);
        assert!(r.is_err());
        Ok(())
    }

    fn create_manifest(prev_expected_structure: Vec<Vec<&str>>) -> oci_spec::image::ImageManifest {
        use std::collections::HashMap;

        let mut p = prev_expected_structure
            .iter()
            .map(|b| {
                b.iter()
                    .map(|p| p.split('.').collect::<Vec<&str>>()[0].to_string())
                    .collect()
            })
            .collect();
        let mut metadata_with_ostree_commit = vec![vec![String::from("ostree_commit")]];
        metadata_with_ostree_commit.append(&mut p);

        let config = oci_spec::image::DescriptorBuilder::default()
            .media_type(oci_spec::image::MediaType::ImageConfig)
            .size(7023_u64)
            .digest(oci_image::Digest::from_str(SHA256_EXAMPLE).unwrap())
            .build()
            .expect("build config descriptor");

        let layers: Vec<oci_spec::image::Descriptor> = metadata_with_ostree_commit
            .iter()
            .map(|l| {
                let mut buf = [0; 8];
                let sep = COMPONENT_SEPARATOR.encode_utf8(&mut buf);
                oci_spec::image::DescriptorBuilder::default()
                    .media_type(oci_spec::image::MediaType::ImageLayerGzip)
                    .size(100_u64)
                    .digest(oci_image::Digest::from_str(SHA256_EXAMPLE).unwrap())
                    .annotations(HashMap::from([(
                        CONTENT_ANNOTATION.to_string(),
                        l.join(sep),
                    )]))
                    .build()
                    .expect("build layer")
            })
            .collect();

        oci_spec::image::ImageManifestBuilder::default()
            .schema_version(oci_spec::image::SCHEMA_VERSION)
            .config(config)
            .layers(layers)
            .build()
            .expect("build image manifest")
    }

    #[test]
    fn test_advanced_packing() -> Result<()> {
        // Step 1: Initial build (packing structure computed)
        let contentmeta_v0: Vec<ObjectSourceMetaSized> = vec![
            vec![1, u32::MAX, 100000],
            vec![2, u32::MAX, 99999],
            vec![3, 30, 99998],
            vec![4, 100, 99997],
            vec![10, 51, 1000],
            vec![8, 50, 500],
            vec![9, 1, 200],
            vec![11, 100000, 199],
            vec![6, 30, 2],
            vec![7, 30, 1],
        ]
        .iter()
        .map(|data| ObjectSourceMetaSized {
            meta: ObjectSourceMeta {
                identifier: RcStr::from(format!("pkg{}.0", data[0])),
                name: RcStr::from(format!("pkg{}", data[0])),
                srcid: RcStr::from(format!("srcpkg{}", data[0])),
                change_time_offset: 0,
                change_frequency: data[1],
            },
            size: data[2] as u64,
        })
        .collect();

        let packing = basic_packing(
            &contentmeta_v0.as_slice(),
            NonZeroU32::new(6).unwrap(),
            None,
        )
        .unwrap();
        let structure: Vec<Vec<&str>> = packing
            .iter()
            .map(|bin| bin.iter().map(|pkg| &*pkg.meta.identifier).collect())
            .collect();
        let v0_expected_structure = vec![
            vec!["pkg3.0"],
            vec!["pkg4.0"],
            vec!["pkg6.0", "pkg7.0", "pkg11.0"],
            vec!["pkg9.0", "pkg8.0", "pkg10.0"],
            vec!["pkg1.0", "pkg2.0"],
            vec![],
        ];
        assert_eq!(structure, v0_expected_structure);

        // Step 2: Derive packing structure from last build

        let mut contentmeta_v1: Vec<ObjectSourceMetaSized> = contentmeta_v0;
        // Upgrade pkg1.0 to 1.1
        contentmeta_v1[0].meta.identifier = RcStr::from("pkg1.1");
        // Remove pkg7
        contentmeta_v1.remove(contentmeta_v1.len() - 1);
        // Add pkg5
        contentmeta_v1.push(ObjectSourceMetaSized {
            meta: ObjectSourceMeta {
                identifier: RcStr::from("pkg5.0"),
                name: RcStr::from("pkg5"),
                srcid: RcStr::from("srcpkg5"),
                change_time_offset: 0,
                change_frequency: 42,
            },
            size: 100000,
        });

        let image_manifest_v0 = create_manifest(v0_expected_structure);
        let packing_derived = basic_packing(
            &contentmeta_v1.as_slice(),
            NonZeroU32::new(6).unwrap(),
            Some(&image_manifest_v0),
        )
        .unwrap();
        let structure_derived: Vec<Vec<&str>> = packing_derived
            .iter()
            .map(|bin| bin.iter().map(|pkg| &*pkg.meta.identifier).collect())
            .collect();
        let v1_expected_structure = vec![
            vec!["pkg3.0"],
            vec!["pkg4.0"],
            vec!["pkg6.0", "pkg11.0"],
            vec!["pkg9.0", "pkg8.0", "pkg10.0"],
            vec!["pkg1.1", "pkg2.0"],
            vec!["pkg5.0"],
        ];

        assert_eq!(structure_derived, v1_expected_structure);

        // Step 3: Another update on derived where the pkg in the last bin updates

        let mut contentmeta_v2: Vec<ObjectSourceMetaSized> = contentmeta_v1;
        // Upgrade pkg5.0 to 5.1
        contentmeta_v2[9].meta.identifier = RcStr::from("pkg5.1");
        // Add pkg12
        contentmeta_v2.push(ObjectSourceMetaSized {
            meta: ObjectSourceMeta {
                identifier: RcStr::from("pkg12.0"),
                name: RcStr::from("pkg12"),
                srcid: RcStr::from("srcpkg12"),
                change_time_offset: 0,
                change_frequency: 42,
            },
            size: 100000,
        });

        let image_manifest_v1 = create_manifest(v1_expected_structure);
        let packing_derived = basic_packing(
            &contentmeta_v2.as_slice(),
            NonZeroU32::new(6).unwrap(),
            Some(&image_manifest_v1),
        )
        .unwrap();
        let structure_derived: Vec<Vec<&str>> = packing_derived
            .iter()
            .map(|bin| bin.iter().map(|pkg| &*pkg.meta.identifier).collect())
            .collect();
        let v2_expected_structure = vec![
            vec!["pkg3.0"],
            vec!["pkg4.0"],
            vec!["pkg6.0", "pkg11.0"],
            vec!["pkg9.0", "pkg8.0", "pkg10.0"],
            vec!["pkg1.1", "pkg2.0"],
            vec!["pkg5.1", "pkg12.0"],
        ];

        assert_eq!(structure_derived, v2_expected_structure);
        Ok(())
    }

    fn setup_exclusive_test(
        component_data: &[(u32, u32, u64)],
        max_layers: u32,
        num_fake_objects: Option<usize>,
    ) -> Result<(
        Vec<ObjectSourceMetaSized>,
        ObjectMetaSized,
        BTreeMap<ContentID, Vec<(Utf8PathBuf, String)>>,
        Chunking,
    )> {
        // Create content metadata from provided data
        let contentmeta: Vec<ObjectSourceMetaSized> = component_data
            .iter()
            .map(|&(id, freq, size)| ObjectSourceMetaSized {
                meta: ObjectSourceMeta {
                    identifier: RcStr::from(format!("pkg{id}.0")),
                    name: RcStr::from(format!("pkg{id}")),
                    srcid: RcStr::from(format!("srcpkg{id}")),
                    change_time_offset: 0,
                    change_frequency: freq,
                },
                size,
            })
            .collect();

        // Create object maps with fake checksums
        let mut object_map = IndexMap::new();

        for (i, component) in contentmeta.iter().enumerate() {
            let checksum = format!("checksum_{i}");
            object_map.insert(checksum, component.meta.identifier.clone());
        }

        let regular_meta = ObjectMetaSized {
            map: object_map,
            sizes: contentmeta.clone(),
        };

        // Create exclusive metadata (initially empty, to be populated by individual tests)
        let specific_contentmeta = BTreeMap::new();

        // Set up chunking with remainder chunk
        let mut chunking = Chunking::default();
        chunking.max = max_layers;
        chunking.remainder = Chunk::new("remainder");

        // Add fake objects to the remainder chunk if specified
        if let Some(num_objects) = num_fake_objects {
            for i in 0..num_objects {
                let checksum = format!("checksum_{i}");
                chunking.remainder.content.insert(
                    RcStr::from(checksum),
                    (
                        1000,
                        vec![Utf8PathBuf::from(format!("/path/to/checksum_{i}"))],
                    ),
                );
                chunking.remainder.size += 1000;
            }
        }

        Ok((contentmeta, regular_meta, specific_contentmeta, chunking))
    }

    #[test]
    fn test_exclusive_chunks() -> Result<()> {
        // Test that exclusive chunks are created first and get their own layers
        let component_data = [
            (1, 100, 50000),
            (2, 200, 40000),
            (3, 300, 30000),
            (4, 400, 20000),
            (5, 500, 10000),
        ];

        let (contentmeta, regular_meta, mut specific_contentmeta, mut chunking) =
            setup_exclusive_test(&component_data, 8, Some(5))?;

        // Create specific content metadata for pkg1 and pkg2
        specific_contentmeta.insert(
            contentmeta[0].meta.identifier.clone(),
            vec![(
                Utf8PathBuf::from("/path/to/checksum_0"),
                "checksum_0".to_string(),
            )],
        );
        specific_contentmeta.insert(
            contentmeta[1].meta.identifier.clone(),
            vec![(
                Utf8PathBuf::from("/path/to/checksum_1"),
                "checksum_1".to_string(),
            )],
        );

        chunking.process_mapping(
            &regular_meta,
            &Some(NonZeroU32::new(8).unwrap()),
            None,
            Some(&specific_contentmeta),
        )?;

        // Verify exclusive chunks are created first
        assert!(chunking.chunks.len() >= 2);
        assert_eq!(chunking.chunks[0].name, "pkg1.0");
        assert_eq!(chunking.chunks[1].name, "pkg2.0");
        assert_eq!(chunking.chunks[0].packages, vec!["pkg1.0".to_string()]);
        assert_eq!(chunking.chunks[1].packages, vec!["pkg2.0".to_string()]);

        Ok(())
    }

    #[test]
    fn test_exclusive_chunks_with_regular_packing() -> Result<()> {
        // Test that exclusive chunks are created first, then regular packing continues
        let component_data = [
            (1, 100, 50000), // exclusive
            (2, 200, 40000), // exclusive
            (3, 300, 30000), // regular
            (4, 400, 20000), // regular
            (5, 500, 10000), // regular
            (6, 600, 5000),  // regular
        ];

        let (contentmeta, regular_meta, mut specific_contentmeta, mut chunking) =
            setup_exclusive_test(&component_data, 8, Some(6))?;

        // Create specific content metadata for pkg1 and pkg2
        specific_contentmeta.insert(
            contentmeta[0].meta.identifier.clone(),
            vec![(
                Utf8PathBuf::from("/path/to/checksum_0"),
                "checksum_0".to_string(),
            )],
        );
        specific_contentmeta.insert(
            contentmeta[1].meta.identifier.clone(),
            vec![(
                Utf8PathBuf::from("/path/to/checksum_1"),
                "checksum_1".to_string(),
            )],
        );

        chunking.process_mapping(
            &regular_meta,
            &Some(NonZeroU32::new(8).unwrap()),
            None,
            Some(&specific_contentmeta),
        )?;

        // Verify exclusive chunks are created first
        assert!(chunking.chunks.len() >= 2);
        assert_eq!(chunking.chunks[0].name, "pkg1.0");
        assert_eq!(chunking.chunks[1].name, "pkg2.0");
        assert_eq!(chunking.chunks[0].packages, vec!["pkg1.0".to_string()]);
        assert_eq!(chunking.chunks[1].packages, vec!["pkg2.0".to_string()]);

        // Verify regular components are not in exclusive chunks
        for chunk in &chunking.chunks[2..] {
            assert!(!chunk.packages.contains(&"pkg1.0".to_string()));
            assert!(!chunk.packages.contains(&"pkg2.0".to_string()));
        }

        Ok(())
    }

    #[test]
    fn test_exclusive_chunks_isolation() -> Result<()> {
        // Test that exclusive chunks properly isolate components
        let component_data = [(1, 100, 50000), (2, 200, 40000), (3, 300, 30000)];

        let (contentmeta, regular_meta, mut specific_contentmeta, mut chunking) =
            setup_exclusive_test(&component_data, 8, Some(3))?;

        // Create specific content metadata for pkg1 only
        specific_contentmeta.insert(
            contentmeta[0].meta.identifier.clone(),
            vec![(
                Utf8PathBuf::from("/path/to/checksum_0"),
                "checksum_0".to_string(),
            )],
        );

        chunking.process_mapping(
            &regular_meta,
            &Some(NonZeroU32::new(8).unwrap()),
            None,
            Some(&specific_contentmeta),
        )?;

        // Verify pkg1 is in its own exclusive chunk
        assert!(!chunking.chunks.is_empty());
        assert_eq!(chunking.chunks[0].name, "pkg1.0");
        assert_eq!(chunking.chunks[0].packages, vec!["pkg1.0".to_string()]);

        // Verify pkg2 and pkg3 are in regular chunks, not mixed with pkg1
        let mut found_pkg2 = false;
        let mut found_pkg3 = false;
        for chunk in &chunking.chunks[1..] {
            if chunk.packages.contains(&"pkg2".to_string()) {
                found_pkg2 = true;
                assert!(!chunk.packages.contains(&"pkg1.0".to_string()));
            }
            if chunk.packages.contains(&"pkg3".to_string()) {
                found_pkg3 = true;
                assert!(!chunk.packages.contains(&"pkg1.0".to_string()));
            }
        }
        assert!(found_pkg2 && found_pkg3);

        Ok(())
    }

    #[test]
    fn test_process_mapping_specific_components_contain_correct_objects() -> Result<()> {
        // This test validates that specific components get their own dedicated layers
        // and that their objects are properly isolated from regular package layers

        // Setup: Create 5 packages
        // - pkg1, pkg2: Will be marked as "specific components" (get their own layers)
        // - pkg3, pkg4, pkg5: Regular packages (will be bin-packed together)
        let packages = [
            (1, 100, 50000), // pkg1 - SPECIFIC COMPONENT
            (2, 200, 40000), // pkg2 - SPECIFIC COMPONENT
            (3, 300, 30000), // pkg3 - regular package
            (4, 400, 20000), // pkg4 - regular package
            (5, 500, 10000), // pkg5 - regular package
        ];

        let (contentmeta, mut system_metadata, mut specific_contentmeta, mut chunking) =
            setup_exclusive_test(&packages, 8, None)?;

        // Create object mappings
        // - system_objects_map: Contains ALL objects in the system (both specific and regular)
        let mut system_objects_map = IndexMap::new();

        // SPECIFIC COMPONENT 1 (pkg1): owns 3 objects
        let pkg1_objects = ["checksum_1_a", "checksum_1_b", "checksum_1_c"];

        // SPECIFIC COMPONENT 2 (pkg2): owns 2 objects
        let pkg2_objects = ["checksum_2_a", "checksum_2_b"];

        // REGULAR PACKAGE 1 (pkg3): owns 2 objects
        let pkg3_objects = ["checksum_3_a", "checksum_3_b"];
        for obj in &pkg3_objects {
            system_objects_map.insert(obj.to_string(), contentmeta[2].meta.identifier.clone());
        }

        // REGULAR PACKAGE 2 (pkg4): owns 1 object
        let pkg4_objects = ["checksum_4_a"];
        for obj in &pkg4_objects {
            system_objects_map.insert(obj.to_string(), contentmeta[3].meta.identifier.clone());
        }

        // REGULAR PACKAGE 3 (pkg5): owns 3 objects
        let pkg5_objects = ["checksum_5_a", "checksum_5_b", "checksum_5_c"];
        for obj in &pkg5_objects {
            system_objects_map.insert(obj.to_string(), contentmeta[4].meta.identifier.clone());
        }

        // Set up metadata
        system_metadata.map = system_objects_map;

        // Create specific content metadata for pkg1 and pkg2
        specific_contentmeta.insert(
            contentmeta[0].meta.identifier.clone(),
            pkg1_objects
                .iter()
                .map(|obj| {
                    (
                        Utf8PathBuf::from(format!("/path/to/{obj}")),
                        obj.to_string(),
                    )
                })
                .collect(),
        );
        specific_contentmeta.insert(
            contentmeta[1].meta.identifier.clone(),
            pkg2_objects
                .iter()
                .map(|obj| {
                    (
                        Utf8PathBuf::from(format!("/path/to/{obj}")),
                        obj.to_string(),
                    )
                })
                .collect(),
        );

        // Initialize: Add ALL objects to the remainder chunk before processing
        // This includes both specific component objects and regular package objects
        // because process_mapping needs to move them from remainder to their final layers
        for (checksum, _) in &system_metadata.map {
            chunking.remainder.content.insert(
                RcStr::from(checksum.as_str()),
                (
                    1000,
                    vec![Utf8PathBuf::from(format!("/path/to/{checksum}"))],
                ),
            );
            chunking.remainder.size += 1000;
        }
        for paths in specific_contentmeta.values() {
            for (p, checksum) in paths {
                chunking
                    .remainder
                    .content
                    .insert(RcStr::from(checksum.as_str()), (1000, vec![p.clone()]));
            }
        }

        // Process the mapping
        // - system_metadata contains ALL objects in the system
        // - specific_contentmeta tells process_mapping which objects belong to specific components
        chunking.process_mapping(
            &system_metadata,
            &Some(NonZeroU32::new(8).unwrap()),
            None,
            Some(&specific_contentmeta),
        )?;

        // VALIDATION PART 1: Specific components get their own dedicated chunks
        assert!(
            chunking.chunks.len() >= 2,
            "Should have at least 2 chunks for specific components"
        );

        // Specific Component Layer 1: pkg1 only
        let specific_component_1_layer = &chunking.chunks[0];
        assert_eq!(specific_component_1_layer.name, "pkg1.0");
        assert_eq!(specific_component_1_layer.packages, vec!["pkg1.0"]);
        assert_eq!(specific_component_1_layer.content.len(), 3);
        for obj in &pkg1_objects {
            assert!(
                specific_component_1_layer.content.contains_key(*obj),
                "Specific component 1 layer should contain {obj}"
            );
        }

        // Specific Component Layer 2: pkg2 only
        let specific_component_2_layer = &chunking.chunks[1];
        assert_eq!(specific_component_2_layer.name, "pkg2.0");
        assert_eq!(specific_component_2_layer.packages, vec!["pkg2.0"]);
        assert_eq!(specific_component_2_layer.content.len(), 2);
        for obj in &pkg2_objects {
            assert!(
                specific_component_2_layer.content.contains_key(*obj),
                "Specific component 2 layer should contain {obj}"
            );
        }

        // VALIDATION PART 2: Specific component layers contain NO regular package objects
        for specific_layer in &chunking.chunks[0..2] {
            for obj in pkg3_objects
                .iter()
                .chain(&pkg4_objects)
                .chain(&pkg5_objects)
            {
                assert!(
                    !specific_layer.content.contains_key(*obj),
                    "Specific component layer '{}' should NOT contain regular package object {}",
                    specific_layer.name,
                    obj
                );
            }
        }

        // VALIDATION PART 3: Regular package layers contain NO specific component objects
        let regular_package_layers = &chunking.chunks[2..];
        for regular_layer in regular_package_layers {
            for obj in pkg1_objects.iter().chain(&pkg2_objects) {
                assert!(
                    !regular_layer.content.contains_key(*obj),
                    "Regular package layer should NOT contain specific component object {obj}"
                );
            }
        }

        // VALIDATION PART 4: All regular package objects are in some regular layer
        let mut found_regular_objects = BTreeSet::new();
        for regular_layer in regular_package_layers {
            for obj in regular_layer.content.keys() {
                found_regular_objects.insert(obj.as_ref());
            }
        }

        for obj in pkg3_objects
            .iter()
            .chain(&pkg4_objects)
            .chain(&pkg5_objects)
        {
            assert!(
                found_regular_objects.contains(*obj),
                "Regular package object {obj} should be in some regular layer"
            );
        }

        // VALIDATION PART 5: All objects moved from remainder
        assert_eq!(
            chunking.remainder.content.len(),
            0,
            "All objects should be moved from remainder"
        );
        assert_eq!(chunking.remainder.size, 0, "Remainder size should be 0");

        Ok(())
    }
}