composefs/erofs/
writer.rs

1//! EROFS image generation and writing functionality.
2//!
3//! This module provides functionality to generate EROFS filesystem images
4//! from composefs tree structures, handling inode layout, directory blocks,
5//! and metadata serialization.
6
7use std::{
8    cell::RefCell,
9    collections::{BTreeMap, HashMap},
10    mem::size_of,
11    os::unix::ffi::OsStrExt,
12    rc::Rc,
13};
14
15use log::trace;
16use xxhash_rust::xxh32::xxh32;
17use zerocopy::{Immutable, IntoBytes};
18
19use crate::{
20    erofs::{composefs::OverlayMetacopy, format, reader::round_up},
21    fsverity::FsVerityHashValue,
22    tree,
23};
24
25#[derive(Clone, Copy, Debug)]
26enum Offset {
27    Header,
28    Superblock,
29    Inode,
30    XAttr,
31    Block,
32    End,
33}
34
35trait Output {
36    fn note_offset(&mut self, offset_type: Offset);
37    fn get(&self, offset_type: Offset, idx: usize) -> usize;
38    fn write(&mut self, data: &[u8]);
39    fn pad(&mut self, alignment: usize);
40    fn len(&self) -> usize;
41
42    fn get_div(&self, offset_type: Offset, idx: usize, div: usize) -> usize {
43        let offset = self.get(offset_type, idx);
44        assert_eq!(offset % div, 0);
45        offset / div
46    }
47
48    fn get_nid(&self, idx: usize) -> u64 {
49        self.get_div(Offset::Inode, idx, 32) as u64
50    }
51
52    fn get_xattr(&self, idx: usize) -> u32 {
53        self.get_div(Offset::XAttr, idx, 4).try_into().unwrap()
54    }
55
56    fn write_struct(&mut self, st: impl IntoBytes + Immutable) {
57        self.write(st.as_bytes());
58    }
59}
60
61#[derive(PartialOrd, PartialEq, Eq, Ord, Clone)]
62struct XAttr {
63    prefix: u8,
64    suffix: Box<[u8]>,
65    value: Box<[u8]>,
66}
67
68#[derive(Clone, Default)]
69struct InodeXAttrs {
70    shared: Vec<usize>,
71    local: Vec<XAttr>,
72    filter: u32,
73}
74
75#[derive(Debug)]
76struct DirEnt<'a> {
77    name: &'a [u8],
78    inode: usize,
79    file_type: format::FileType,
80}
81
82#[derive(Debug, Default)]
83struct Directory<'a> {
84    blocks: Box<[Box<[DirEnt<'a>]>]>,
85    inline: Box<[DirEnt<'a>]>,
86    size: u64,
87    nlink: usize,
88}
89
90#[derive(Debug)]
91struct Leaf<'a, ObjectID: FsVerityHashValue> {
92    content: &'a tree::LeafContent<ObjectID>,
93    nlink: usize,
94}
95
96#[derive(Debug)]
97enum InodeContent<'a, ObjectID: FsVerityHashValue> {
98    Directory(Directory<'a>),
99    Leaf(Leaf<'a, ObjectID>),
100}
101
102struct Inode<'a, ObjectID: FsVerityHashValue> {
103    stat: &'a tree::Stat,
104    xattrs: InodeXAttrs,
105    content: InodeContent<'a, ObjectID>,
106}
107
108impl XAttr {
109    pub fn write(&self, output: &mut impl Output) {
110        output.write_struct(format::XAttrHeader {
111            name_len: self.suffix.len() as u8,
112            name_index: self.prefix,
113            value_size: (self.value.len() as u16).into(),
114        });
115        output.write(&self.suffix);
116        output.write(&self.value);
117        output.pad(4);
118    }
119}
120
121impl InodeXAttrs {
122    fn add(&mut self, name: &[u8], value: &[u8]) {
123        for (idx, prefix) in format::XATTR_PREFIXES.iter().enumerate().rev() {
124            if let Some(suffix) = name.strip_prefix(*prefix) {
125                self.filter |= 1 << (xxh32(suffix, format::XATTR_FILTER_SEED + idx as u32) % 32);
126                self.local.push(XAttr {
127                    prefix: idx as u8,
128                    suffix: Box::from(suffix),
129                    value: Box::from(value),
130                });
131                return;
132            }
133        }
134        unreachable!("{:?}", std::str::from_utf8(name)); // worst case: we matched the empty prefix (0)
135    }
136
137    fn write(&self, output: &mut impl Output) {
138        if self.filter != 0 {
139            trace!("  write xattrs block");
140            output.write_struct(format::InodeXAttrHeader {
141                name_filter: (!self.filter).into(),
142                shared_count: self.shared.len() as u8,
143                ..Default::default()
144            });
145            for idx in &self.shared {
146                trace!("    shared {} @{}", idx, output.len());
147                output.write(&output.get_xattr(*idx).to_le_bytes());
148            }
149            for attr in &self.local {
150                trace!("    local @{}", output.len());
151                attr.write(output);
152            }
153        }
154        // our alignment is equal to xattr alignment: no need to pad
155    }
156}
157
158impl<'a> Directory<'a> {
159    pub fn from_entries(entries: Vec<DirEnt<'a>>) -> Self {
160        let mut blocks = vec![];
161        let mut rest = vec![];
162
163        let mut n_bytes = 0u64;
164        let mut nlink = 0;
165
166        trace!("Directory with {} items", entries.len());
167
168        // The content of the directory is fixed at this point so we may as well split it into
169        // blocks.  This lets us avoid measuring and re-measuring.
170        for entry in entries.into_iter() {
171            let entry_size: u64 = (size_of::<format::DirectoryEntryHeader>() + entry.name.len())
172                .try_into()
173                .unwrap();
174            assert!(entry_size <= 4096);
175
176            trace!("    {:?}", entry.file_type);
177
178            if matches!(entry.file_type, format::FileType::Directory) {
179                nlink += 1;
180            }
181
182            n_bytes += entry_size;
183            if n_bytes <= 4096 {
184                rest.push(entry);
185            } else {
186                // It won't fit, so we need to store the existing entries in a block.
187                trace!("    block {}", rest.len());
188                blocks.push(rest.into_boxed_slice());
189
190                // Start over
191                rest = vec![entry];
192                n_bytes = entry_size;
193            }
194        }
195
196        // Don't try to store more than 2048 bytes of tail data
197        if n_bytes > 2048 {
198            blocks.push(rest.into_boxed_slice());
199            rest = vec![];
200            n_bytes = 0;
201        }
202
203        trace!(
204            "  blocks {} inline {} inline_size {n_bytes}",
205            blocks.len(),
206            rest.len()
207        );
208
209        let block_size: u64 = format::BLOCK_SIZE.into();
210        let size = block_size * blocks.len() as u64 + n_bytes;
211        Self {
212            blocks: blocks.into_boxed_slice(),
213            inline: rest.into_boxed_slice(),
214            size,
215            nlink,
216        }
217    }
218
219    fn write_block(&self, output: &mut impl Output, block: &[DirEnt]) {
220        trace!("    write dir block {} @{}", block.len(), output.len());
221        let mut nameofs = size_of::<format::DirectoryEntryHeader>() * block.len();
222
223        for entry in block {
224            trace!(
225                "      entry {:?} name {} @{}",
226                entry.file_type,
227                nameofs,
228                output.len()
229            );
230            output.write_struct(format::DirectoryEntryHeader {
231                name_offset: (nameofs as u16).into(),
232                inode_offset: output.get_nid(entry.inode).into(),
233                file_type: entry.file_type.into(),
234                ..Default::default()
235            });
236            nameofs += entry.name.len();
237        }
238
239        for entry in block {
240            trace!("      name @{}", output.len());
241            output.write(entry.name.as_bytes());
242        }
243    }
244
245    fn write_inline(&self, output: &mut impl Output) {
246        trace!(
247            "  write inline len {} expected size {} of {}",
248            self.inline.len(),
249            self.size % 4096,
250            self.size
251        );
252        self.write_block(output, &self.inline);
253    }
254
255    fn write_blocks(&self, output: &mut impl Output) {
256        let block_size: usize = format::BLOCK_SIZE.into();
257        for block in &self.blocks {
258            assert_eq!(output.len() % block_size, 0);
259            self.write_block(output, block);
260            output.pad(block_size);
261        }
262    }
263
264    fn inode_meta(&self, block_offset: usize) -> (format::DataLayout, u32, u64, usize) {
265        let (layout, u) = if self.inline.is_empty() {
266            (format::DataLayout::FlatPlain, block_offset as u32 / 4096)
267        } else if !self.blocks.is_empty() {
268            (format::DataLayout::FlatInline, block_offset as u32 / 4096)
269        } else {
270            (format::DataLayout::FlatInline, 0)
271        };
272        (layout, u, self.size, self.nlink)
273    }
274}
275
276impl<ObjectID: FsVerityHashValue> Leaf<'_, ObjectID> {
277    fn inode_meta(&self) -> (format::DataLayout, u32, u64, usize) {
278        let (layout, u, size) = match &self.content {
279            tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => {
280                if data.is_empty() {
281                    (format::DataLayout::FlatPlain, 0, data.len() as u64)
282                } else {
283                    (format::DataLayout::FlatInline, 0, data.len() as u64)
284                }
285            }
286            tree::LeafContent::Regular(tree::RegularFile::External(.., size)) => {
287                (format::DataLayout::ChunkBased, 31, *size)
288            }
289            tree::LeafContent::CharacterDevice(rdev) | tree::LeafContent::BlockDevice(rdev) => {
290                (format::DataLayout::FlatPlain, *rdev as u32, 0)
291            }
292            tree::LeafContent::Fifo | tree::LeafContent::Socket => {
293                (format::DataLayout::FlatPlain, 0, 0)
294            }
295            tree::LeafContent::Symlink(target) => {
296                (format::DataLayout::FlatInline, 0, target.len() as u64)
297            }
298        };
299        (layout, u, size, self.nlink)
300    }
301
302    fn write_inline(&self, output: &mut impl Output) {
303        output.write(match self.content {
304            tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => data,
305            tree::LeafContent::Regular(tree::RegularFile::External(..)) => b"\xff\xff\xff\xff", // null chunk
306            tree::LeafContent::Symlink(target) => target.as_bytes(),
307            _ => &[],
308        });
309    }
310}
311
312impl<ObjectID: FsVerityHashValue> Inode<'_, ObjectID> {
313    fn file_type(&self) -> format::FileType {
314        match &self.content {
315            InodeContent::Directory(..) => format::FileType::Directory,
316            InodeContent::Leaf(leaf) => match &leaf.content {
317                tree::LeafContent::Regular(..) => format::FileType::RegularFile,
318                tree::LeafContent::CharacterDevice(..) => format::FileType::CharacterDevice,
319                tree::LeafContent::BlockDevice(..) => format::FileType::BlockDevice,
320                tree::LeafContent::Fifo => format::FileType::Fifo,
321                tree::LeafContent::Socket => format::FileType::Socket,
322                tree::LeafContent::Symlink(..) => format::FileType::Symlink,
323            },
324        }
325    }
326
327    fn write_inode(&self, output: &mut impl Output, idx: usize) {
328        let (layout, u, size, nlink) = match &self.content {
329            InodeContent::Directory(dir) => dir.inode_meta(output.get(Offset::Block, idx)),
330            InodeContent::Leaf(leaf) => leaf.inode_meta(),
331        };
332
333        let xattr_size = {
334            let mut xattr = FirstPass::default();
335            self.xattrs.write(&mut xattr);
336            xattr.offset
337        };
338
339        // We need to make sure the inline part doesn't overlap a block boundary
340        output.pad(32);
341        if matches!(layout, format::DataLayout::FlatInline) {
342            let block_size = u64::from(format::BLOCK_SIZE);
343            let inode_and_xattr_size: u64 = (size_of::<format::ExtendedInodeHeader>() + xattr_size)
344                .try_into()
345                .unwrap();
346            let inline_start: u64 = output.len().try_into().unwrap();
347            let inline_start = inline_start + inode_and_xattr_size;
348            let end_of_metadata = inline_start - 1;
349            let inline_end = inline_start + (size % block_size);
350            if end_of_metadata / block_size != inline_end / block_size {
351                // If we proceed, then we'll violate the rule about crossing block boundaries.
352                // The easiest thing to do is to add padding so that the inline data starts close
353                // to the start of a fresh block boundary, while ensuring inode alignment.
354                // pad_size is always < block_size (4096), so fits in usize
355                let pad_size = (block_size - end_of_metadata % block_size) as usize;
356                let pad = vec![0; pad_size];
357                trace!("added pad {}", pad.len());
358                output.write(&pad);
359                output.pad(32);
360            }
361        }
362
363        let format = format::InodeLayout::Extended | layout;
364
365        trace!(
366            "write inode {idx} nid {} {:?} {:?} xattrsize{xattr_size} icount{} inline{} @{}",
367            output.len() / 32,
368            format,
369            self.file_type(),
370            match xattr_size {
371                0 => 0,
372                n => (1 + (n - 12) / 4) as u16,
373            },
374            size % 4096,
375            output.len()
376        );
377
378        output.note_offset(Offset::Inode);
379        output.write_struct(format::ExtendedInodeHeader {
380            format,
381            xattr_icount: match xattr_size {
382                0 => 0,
383                n => (1 + (n - 12) / 4) as u16,
384            }
385            .into(),
386            mode: self.file_type() | self.stat.st_mode,
387            size: size.into(),
388            u: u.into(),
389            ino: ((output.len() / 32) as u32).into(),
390            uid: self.stat.st_uid.into(),
391            gid: self.stat.st_gid.into(),
392            mtime: (self.stat.st_mtim_sec as u64).into(),
393            nlink: (nlink as u32).into(),
394            ..Default::default()
395        });
396
397        self.xattrs.write(output);
398
399        match &self.content {
400            InodeContent::Directory(dir) => dir.write_inline(output),
401            InodeContent::Leaf(leaf) => leaf.write_inline(output),
402        };
403
404        output.pad(32);
405    }
406
407    fn write_blocks(&self, output: &mut impl Output) {
408        if let InodeContent::Directory(dir) = &self.content {
409            dir.write_blocks(output);
410        }
411    }
412}
413
414struct InodeCollector<'a, ObjectID: FsVerityHashValue> {
415    inodes: Vec<Inode<'a, ObjectID>>,
416    hardlinks: HashMap<*const tree::Leaf<ObjectID>, usize>,
417}
418
419impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> {
420    fn push_inode(&mut self, stat: &'a tree::Stat, content: InodeContent<'a, ObjectID>) -> usize {
421        let mut xattrs = InodeXAttrs::default();
422
423        // We need to record extra xattrs for some files.  These come first.
424        if let InodeContent::Leaf(Leaf {
425            content: tree::LeafContent::Regular(tree::RegularFile::External(id, ..)),
426            ..
427        }) = content
428        {
429            xattrs.add(
430                b"trusted.overlay.metacopy",
431                OverlayMetacopy::new(id).as_bytes(),
432            );
433
434            let redirect = format!("/{}", id.to_object_pathname());
435            xattrs.add(b"trusted.overlay.redirect", redirect.as_bytes());
436        }
437
438        // Add the normal xattrs.  They're already listed in sorted order.
439        for (name, value) in RefCell::borrow(&stat.xattrs).iter() {
440            let name = name.as_bytes();
441
442            if let Some(escapee) = name.strip_prefix(b"trusted.overlay.") {
443                let escaped = [b"trusted.overlay.overlay.", escapee].concat();
444                xattrs.add(&escaped, value);
445            } else {
446                xattrs.add(name, value);
447            }
448        }
449
450        // Allocate an inode for ourselves.  At first we write all xattrs as local.  Later (after
451        // we've determined which xattrs ought to be shared) we'll come and move some of them over.
452        let inode = self.inodes.len();
453        self.inodes.push(Inode {
454            stat,
455            xattrs,
456            content,
457        });
458        inode
459    }
460
461    fn collect_leaf(&mut self, leaf: &'a Rc<tree::Leaf<ObjectID>>) -> usize {
462        let nlink = Rc::strong_count(leaf);
463
464        if nlink > 1 {
465            if let Some(inode) = self.hardlinks.get(&Rc::as_ptr(leaf)) {
466                return *inode;
467            }
468        }
469
470        let inode = self.push_inode(
471            &leaf.stat,
472            InodeContent::Leaf(Leaf {
473                content: &leaf.content,
474                nlink,
475            }),
476        );
477
478        if nlink > 1 {
479            self.hardlinks.insert(Rc::as_ptr(leaf), inode);
480        }
481
482        inode
483    }
484
485    fn insert_sorted(
486        entries: &mut Vec<DirEnt<'a>>,
487        name: &'a [u8],
488        inode: usize,
489        file_type: format::FileType,
490    ) {
491        let entry = DirEnt {
492            name,
493            inode,
494            file_type,
495        };
496        let point = entries.partition_point(|e| e.name < entry.name);
497        entries.insert(point, entry);
498    }
499
500    fn collect_dir(&mut self, dir: &'a tree::Directory<ObjectID>, parent: usize) -> usize {
501        // The root inode number needs to fit in a u16.  That more or less compels us to write the
502        // directory inode before the inode of the children of the directory.  Reserve a slot.
503        let me = self.push_inode(&dir.stat, InodeContent::Directory(Directory::default()));
504
505        let mut entries = vec![];
506
507        for (name, inode) in dir.sorted_entries() {
508            let child = match inode {
509                tree::Inode::Directory(dir) => self.collect_dir(dir, me),
510                tree::Inode::Leaf(leaf) => self.collect_leaf(leaf),
511            };
512            entries.push(DirEnt {
513                name: name.as_bytes(),
514                inode: child,
515                file_type: self.inodes[child].file_type(),
516            });
517        }
518
519        // We're expected to add those, too
520        Self::insert_sorted(&mut entries, b".", me, format::FileType::Directory);
521        Self::insert_sorted(&mut entries, b"..", parent, format::FileType::Directory);
522
523        // Now that we know the actual content, we can write it to our reserved slot
524        self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries));
525        me
526    }
527
528    pub fn collect(fs: &'a tree::FileSystem<ObjectID>) -> Vec<Inode<'a, ObjectID>> {
529        let mut this = Self {
530            inodes: vec![],
531            hardlinks: HashMap::new(),
532        };
533
534        // '..' of the root directory is the root directory again
535        let root_inode = this.collect_dir(&fs.root, 0);
536        assert_eq!(root_inode, 0);
537
538        this.inodes
539    }
540}
541
542/// Takes a list of inodes where each inode contains only local xattr values, determines which
543/// xattrs (key, value) pairs appear more than once, and shares them.
544fn share_xattrs(inodes: &mut [Inode<impl FsVerityHashValue>]) -> Vec<XAttr> {
545    let mut xattrs: BTreeMap<XAttr, usize> = BTreeMap::new();
546
547    // Collect all xattrs from the inodes
548    for inode in inodes.iter() {
549        for attr in &inode.xattrs.local {
550            if let Some(count) = xattrs.get_mut(attr) {
551                *count += 1;
552            } else {
553                xattrs.insert(attr.clone(), 1);
554            }
555        }
556    }
557
558    // Share only xattrs with more than one user
559    xattrs.retain(|_k, v| *v > 1);
560
561    // Repurpose the refcount field as an index lookup
562    for (idx, value) in xattrs.values_mut().enumerate() {
563        *value = idx;
564    }
565
566    // Visit each inode and change local xattrs into shared xattrs
567    for inode in inodes.iter_mut() {
568        inode.xattrs.local.retain(|attr| {
569            if let Some(idx) = xattrs.get(attr) {
570                inode.xattrs.shared.push(*idx);
571                false // drop the local xattr: we converted it
572            } else {
573                true // retain the local xattr: we didn't convert it
574            }
575        });
576    }
577
578    // Return the shared xattrs as a vec
579    xattrs.into_keys().collect()
580}
581
582fn write_erofs(
583    output: &mut impl Output,
584    inodes: &[Inode<impl FsVerityHashValue>],
585    xattrs: &[XAttr],
586) {
587    // Write composefs header
588    output.note_offset(Offset::Header);
589    output.write_struct(format::ComposefsHeader {
590        magic: format::COMPOSEFS_MAGIC,
591        version: format::VERSION,
592        flags: 0.into(),
593        composefs_version: format::COMPOSEFS_VERSION,
594        ..Default::default()
595    });
596    output.pad(1024);
597
598    // Write superblock
599    output.note_offset(Offset::Superblock);
600    output.write_struct(format::Superblock {
601        magic: format::MAGIC_V1,
602        blkszbits: format::BLOCK_BITS,
603        feature_compat: format::FEATURE_COMPAT_MTIME | format::FEATURE_COMPAT_XATTR_FILTER,
604        root_nid: (output.get_nid(0) as u16).into(),
605        inos: (inodes.len() as u64).into(),
606        blocks: ((output.get(Offset::End, 0) / usize::from(format::BLOCK_SIZE)) as u32).into(),
607        ..Default::default()
608    });
609
610    // Write inode table
611    for (idx, inode) in inodes.iter().enumerate() {
612        // The inode may add padding to itself, so it notes its own offset
613        inode.write_inode(output, idx);
614    }
615
616    // Write shared xattr table
617    for xattr in xattrs {
618        output.note_offset(Offset::XAttr);
619        xattr.write(output);
620    }
621
622    // Write blocks from inodes that have them
623    output.pad(4096);
624    for inode in inodes.iter() {
625        output.note_offset(Offset::Block);
626        inode.write_blocks(output);
627    }
628
629    // That's it
630    output.note_offset(Offset::End);
631}
632
633#[derive(Default)]
634struct Layout {
635    offset_types: Vec<usize>,
636    offsets: Vec<usize>,
637}
638
639#[derive(Default)]
640struct FirstPass {
641    offset: usize,
642    layout: Layout,
643}
644
645struct SecondPass {
646    output: Vec<u8>,
647    layout: Layout,
648}
649
650impl Output for SecondPass {
651    fn note_offset(&mut self, _offset_type: Offset) {
652        /* no-op */
653    }
654
655    fn get(&self, offset_type: Offset, idx: usize) -> usize {
656        let start = self.layout.offset_types[offset_type as usize];
657        self.layout.offsets[start + idx]
658    }
659
660    fn write(&mut self, data: &[u8]) {
661        self.output.extend_from_slice(data);
662    }
663
664    fn pad(&mut self, alignment: usize) {
665        self.output
666            .resize(round_up(self.output.len(), alignment), 0);
667    }
668
669    fn len(&self) -> usize {
670        self.output.len()
671    }
672}
673
674impl Output for FirstPass {
675    fn note_offset(&mut self, offset_type: Offset) {
676        while self.layout.offset_types.len() <= offset_type as usize {
677            self.layout.offset_types.push(self.layout.offsets.len());
678        }
679        assert_eq!(self.layout.offset_types.len(), offset_type as usize + 1);
680
681        trace!(
682            "{:?} #{} @{}",
683            offset_type,
684            self.layout.offsets.len() - self.layout.offset_types[offset_type as usize],
685            self.offset
686        );
687        self.layout.offsets.push(self.offset);
688    }
689
690    fn get(&self, _: Offset, _: usize) -> usize {
691        0 // We don't know offsets in the first pass, so fake it
692    }
693
694    fn write(&mut self, data: &[u8]) {
695        self.offset += data.len();
696    }
697
698    fn pad(&mut self, alignment: usize) {
699        self.offset = round_up(self.offset, alignment);
700    }
701
702    fn len(&self) -> usize {
703        self.offset
704    }
705}
706
707/// Creates an EROFS filesystem image from a composefs tree
708///
709/// This function performs a two-pass generation:
710/// 1. First pass determines the layout and sizes of all structures
711/// 2. Second pass writes the actual image data
712///
713/// Returns the complete EROFS image as a byte array.
714pub fn mkfs_erofs<ObjectID: FsVerityHashValue>(fs: &tree::FileSystem<ObjectID>) -> Box<[u8]> {
715    // Create the intermediate representation: flattened inodes and shared xattrs
716    let mut inodes = InodeCollector::collect(fs);
717    let xattrs = share_xattrs(&mut inodes);
718
719    // Do a first pass with the writer to determine the layout
720    let mut first_pass = FirstPass::default();
721    write_erofs(&mut first_pass, &inodes, &xattrs);
722
723    // Do a second pass with the writer to get the actual bytes
724    let mut second_pass = SecondPass {
725        output: vec![],
726        layout: first_pass.layout,
727    };
728    write_erofs(&mut second_pass, &inodes, &xattrs);
729
730    // That's it
731    second_pass.output.into_boxed_slice()
732}