diff --git a/README.md b/README.md index 8d3a494..8648811 100644 --- a/README.md +++ b/README.md @@ -98,8 +98,11 @@ Recommended: Brotli/2-7 ## TODO ### Core functionality +- Fix vacuum inconsistencies (either index related, or bundle syncing related) +- Proper bundle usage analysis with compressed size estimation - Recompress & combine bundles - Allow to use tar files for backup and restore (--tar, http://alexcrichton.com/tar-rs/tar/index.html) +- Allow to mount backups (inode id == position in index, lru cache) - File attributes - xattrs https://crates.io/crates/xattr diff --git a/src/cli/args.rs b/src/cli/args.rs index 5f02a58..cde15df 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -65,6 +65,9 @@ pub enum Arguments { backup_name: Option, inode: Option }, + Analyze { + repo_path: String + }, BundleList { repo_path: String }, @@ -176,7 +179,7 @@ fn parse_bundle_id(val: &str) -> BundleId { } } - +#[allow(unknown_lints,cyclomatic_complexity)] pub fn parse() -> Arguments { let args = clap_app!(zvault => (version: crate_version!()) @@ -259,6 +262,10 @@ pub fn parse() -> Arguments { (about: "displays information on a repository, a backup or a path in a backup") (@arg PATH: +required "repository[::backup[::subpath]] path") ) + (@subcommand analyze => + (about: "analyze the used and reclaimable space of bundles") + (@arg REPO: +required "repository path") + ) (@subcommand configure => (about: "changes the configuration") (@arg REPO: +required "path of the repository") @@ -425,6 +432,16 @@ pub fn parse() -> Arguments { inode: inode.map(|v| v.to_string()) } } + if let Some(args) = args.subcommand_matches("analyze") { + let (repository, backup, inode) = split_repo_path(args.value_of("REPO").unwrap()); + if backup.is_some() || inode.is_some() { + println!("No backups or subpaths may be given here"); + exit(1); + } + return Arguments::Analyze { + repo_path: repository.to_string() + } + } if let Some(args) = args.subcommand_matches("import") { let (repository, backup, inode) = split_repo_path(args.value_of("REPO").unwrap()); if backup.is_some() || inode.is_some() { diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 2a29ab0..13ef0bf 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -169,6 +169,28 @@ fn print_config(config: &Config) { println!("Hash method: {}", config.hash.name()); } +fn print_analysis(analysis: &HashMap) { + let mut reclaim_space = [0; 11]; + let mut data_total = 0; + for bundle in analysis.values() { + data_total += bundle.info.encoded_size; + #[allow(unknown_lints,needless_range_loop)] + for i in 0..11 { + if bundle.get_usage_ratio() <= i as f32 * 0.1 { + reclaim_space[i] += bundle.get_unused_size(); + } + } + } + println!("Total bundle size: {}", to_file_size(data_total as u64)); + let used = data_total - reclaim_space[10]; + println!("Space used: {}, {:.1} %", to_file_size(used as u64), used as f32 / data_total as f32 * 100.0); + println!("Reclaimable space (depending on vacuum ratio)"); + #[allow(unknown_lints,needless_range_loop)] + for i in 0..11 { + println!(" - ratio={:3}: {:6}, {:4.1} %", i*10, to_file_size(reclaim_space[i] as u64), reclaim_space[i] as f32 / data_total as f32 * 100.0); + } +} + #[allow(unknown_lints,cyclomatic_complexity)] pub fn run() { @@ -346,6 +368,10 @@ pub fn run() { print_repoinfo(&repo.info()); } }, + Arguments::Analyze{repo_path} => { + let mut repo = open_repository(&repo_path); + print_analysis(&checked(repo.analyze_usage(), "analyze repository")); + }, Arguments::BundleList{repo_path} => { let repo = open_repository(&repo_path); for bundle in repo.list_bundles() { diff --git a/src/prelude.rs b/src/prelude.rs index 64e599a..9978ded 100644 --- a/src/prelude.rs +++ b/src/prelude.rs @@ -1,7 +1,7 @@ pub use ::util::*; pub use ::bundledb::{BundleReader, BundleMode, BundleWriter, BundleInfo, BundleId, BundleDbError, BundleDb, BundleWriterError}; pub use ::chunker::{ChunkerType, Chunker, ChunkerStatus, IChunker, ChunkerError}; -pub use ::repository::{Repository, Backup, Config, RepositoryError, RepositoryInfo, Inode, FileType, RepositoryIntegrityError, BackupFileError, BackupError, BackupOptions}; +pub use ::repository::{Repository, Backup, Config, RepositoryError, RepositoryInfo, Inode, FileType, RepositoryIntegrityError, BackupFileError, BackupError, BackupOptions, BundleAnalysis}; pub use ::index::{Index, Location, IndexError}; pub use serde::{Serialize, Deserialize}; diff --git a/src/repository/info.rs b/src/repository/info.rs index bdd46bb..15507a0 100644 --- a/src/repository/info.rs +++ b/src/repository/info.rs @@ -1,5 +1,32 @@ use ::prelude::*; +use super::metadata::FileContents; + +use std::collections::{HashMap, VecDeque}; + + +pub struct BundleAnalysis { + pub info: BundleInfo, + pub chunk_usage: Bitmap, + pub used_raw_size: usize +} + +impl BundleAnalysis { + #[inline] + pub fn get_usage_ratio(&self) -> f32 { + self.used_raw_size as f32 / self.info.raw_size as f32 + } + + #[inline] + pub fn get_used_size(&self) -> usize { + (self.get_usage_ratio() * self.info.encoded_size as f32) as usize + } + + #[inline] + pub fn get_unused_size(&self) -> usize { + ((1.0 - self.get_usage_ratio()) * self.info.encoded_size as f32) as usize + } +} pub struct RepositoryInfo { pub bundle_count: usize, @@ -15,6 +42,70 @@ pub struct RepositoryInfo { impl Repository { + fn mark_used(&self, bundles: &mut HashMap, chunks: &[Chunk]) -> Result { + let mut new = false; + for &(hash, len) in chunks { + if let Some(pos) = self.index.get(&hash) { + if let Some(bundle) = bundles.get_mut(&pos.bundle) { + if !bundle.chunk_usage.get(pos.chunk as usize) { + new = true; + bundle.chunk_usage.set(pos.chunk as usize); + bundle.used_raw_size += len as usize; + } + } else { + return Err(RepositoryIntegrityError::MissingBundleId(pos.bundle).into()); + } + } else { + return Err(RepositoryIntegrityError::MissingChunk(hash).into()); + } + } + Ok(new) + } + + pub fn analyze_usage(&mut self) -> Result, RepositoryError> { + let mut usage = HashMap::new(); + for (id, bundle) in self.bundle_map.bundles() { + let bundle = try!(self.bundles.get_bundle_info(&bundle).ok_or_else(|| RepositoryIntegrityError::MissingBundle(bundle))); + usage.insert(id, BundleAnalysis { + chunk_usage: Bitmap::new(bundle.chunk_count), + info: bundle.clone(), + used_raw_size: 0 + }); + } + let backups = try!(self.get_backups()); + let mut todo = VecDeque::new(); + for (_name, backup) in backups { + todo.push_back(backup.root); + } + while let Some(chunks) = todo.pop_back() { + if !try!(self.mark_used(&mut usage, &chunks)) { + continue + } + let inode = try!(self.get_inode(&chunks)); + // Mark the content chunks as used + match inode.contents { + None | Some(FileContents::Inline(_)) => (), + Some(FileContents::ChunkedDirect(chunks)) => { + try!(self.mark_used(&mut usage, &chunks)); + }, + Some(FileContents::ChunkedIndirect(chunks)) => { + if try!(self.mark_used(&mut usage, &chunks)) { + let chunk_data = try!(self.get_data(&chunks)); + let chunks = ChunkList::read_from(&chunk_data); + try!(self.mark_used(&mut usage, &chunks)); + } + } + } + // Put children in todo + if let Some(children) = inode.children { + for (_name, chunks) in children { + todo.push_back(chunks); + } + } + } + Ok(usage) + } + #[inline] pub fn list_bundles(&self) -> Vec<&BundleInfo> { self.bundles.list_bundles() diff --git a/src/repository/mod.rs b/src/repository/mod.rs index 9e2c8e4..3c08e39 100644 --- a/src/repository/mod.rs +++ b/src/repository/mod.rs @@ -24,7 +24,7 @@ pub use self::metadata::{Inode, FileType}; pub use self::backup::{BackupError, BackupOptions}; pub use self::backup_file::{Backup, BackupFileError}; pub use self::integrity::RepositoryIntegrityError; -pub use self::info::RepositoryInfo; +pub use self::info::{RepositoryInfo, BundleAnalysis}; use self::bundle_map::BundleMap; diff --git a/src/repository/vacuum.rs b/src/repository/vacuum.rs index 811a3d1..8bc73d9 100644 --- a/src/repository/vacuum.rs +++ b/src/repository/vacuum.rs @@ -1,86 +1,9 @@ use ::prelude::*; -use super::metadata::FileContents; +use std::collections::HashSet; -use std::collections::{HashMap, HashSet, VecDeque}; - - -pub struct BundleUsage { - pub used: Bitmap, - pub mode: Bitmap, - pub chunk_count: usize, - pub total_size: usize, - pub used_size: usize -} impl Repository { - fn mark_used(&self, bundles: &mut HashMap, chunks: &[Chunk], mode: BundleMode) -> Result { - let mut new = false; - for chunk in chunks { - if let Some(pos) = self.index.get(&chunk.0) { - if let Some(bundle) = bundles.get_mut(&pos.bundle) { - if !bundle.used.get(pos.chunk as usize) { - new = true; - bundle.used.set(pos.chunk as usize); - bundle.used_size += chunk.1 as usize; - if mode == BundleMode::Meta { - bundle.mode.set(pos.chunk as usize); - } - } - } - } else { - return Err(RepositoryIntegrityError::MissingChunk(chunk.0).into()); - } - } - Ok(new) - } - - pub fn analyze_usage(&mut self) -> Result, RepositoryError> { - let mut usage = HashMap::new(); - for (id, bundle) in self.bundle_map.bundles() { - let bundle = try!(self.bundles.get_bundle_info(&bundle).ok_or_else(|| RepositoryIntegrityError::MissingBundle(bundle))); - usage.insert(id, BundleUsage { - used: Bitmap::new(bundle.chunk_count), - mode: Bitmap::new(bundle.chunk_count), - chunk_count: bundle.chunk_count, - total_size: bundle.raw_size, - used_size: 0 - }); - } - let backups = try!(self.get_backups()); - for (_name, backup) in backups { - let mut todo = VecDeque::new(); - todo.push_back(backup.root); - while let Some(chunks) = todo.pop_front() { - if !try!(self.mark_used(&mut usage, &chunks, BundleMode::Meta)) { - continue - } - let inode = try!(self.get_inode(&chunks)); - // Mark the content chunks as used - match inode.contents { - Some(FileContents::ChunkedDirect(chunks)) => { - try!(self.mark_used(&mut usage, &chunks, BundleMode::Content)); - }, - Some(FileContents::ChunkedIndirect(chunks)) => { - if try!(self.mark_used(&mut usage, &chunks, BundleMode::Meta)) { - let chunk_data = try!(self.get_data(&chunks)); - let chunks = ChunkList::read_from(&chunk_data); - try!(self.mark_used(&mut usage, &chunks, BundleMode::Content)); - } - } - _ => () - } - // Put children in todo - if let Some(children) = inode.children { - for (_name, chunks) in children { - todo.push_back(chunks); - } - } - } - } - Ok(usage) - } - fn delete_bundle(&mut self, id: u32) -> Result<(), RepositoryError> { if let Some(bundle) = self.bundle_map.remove(id) { try!(self.bundles.delete_bundle(&bundle)); @@ -96,15 +19,19 @@ impl Repository { let _lock = try!(self.lock(true)); info!("Analyzing chunk usage"); let usage = try!(self.analyze_usage()); - let total = usage.values().map(|b| b.total_size).sum::(); - let used = usage.values().map(|b| b.used_size).sum::(); - info!("Usage: {} of {}, {:.1}%", to_file_size(used as u64), to_file_size(total as u64), used as f32/total as f32*100.0); + let mut data_total = 0; + let mut data_used = 0; + for bundle in usage.values() { + data_total += bundle.info.encoded_size; + data_used += bundle.get_used_size(); + } + info!("Usage: {} of {}, {:.1}%", to_file_size(data_used as u64), to_file_size(data_total as u64), data_used as f32/data_total as f32*100.0); let mut rewrite_bundles = HashSet::new(); let mut reclaim_space = 0; for (id, bundle) in &usage { - if bundle.used_size as f32 / bundle.total_size as f32 <= ratio { + if bundle.get_usage_ratio() <= ratio { rewrite_bundles.insert(*id); - reclaim_space += bundle.total_size - bundle.used_size; + reclaim_space += bundle.get_unused_size(); } } info!("Reclaiming {} by rewriting {} bundles", to_file_size(reclaim_space as u64), rewrite_bundles.len()); @@ -115,35 +42,24 @@ impl Repository { let bundle = &usage[id]; let bundle_id = self.bundle_map.get(*id).unwrap(); let chunks = try!(self.bundles.get_chunk_list(&bundle_id)); + let mode = usage[id].info.mode; for (chunk, &(hash, _len)) in chunks.into_iter().enumerate() { - if !bundle.used.get(chunk) { + if !bundle.chunk_usage.get(chunk) { try!(self.index.delete(&hash)); continue } let data = try!(self.bundles.get_chunk(&bundle_id, chunk)); - let mode = if bundle.mode.get(chunk) { - BundleMode::Meta - } else { - BundleMode::Content - }; try!(self.put_chunk_override(mode, hash, &data)); } } try!(self.flush()); info!("Checking index"); - let mut pos = 0; - loop { - pos = if let Some(pos) = self.index.next_entry(pos) { - pos - } else { - break - }; - let entry = self.index.get_entry(pos).unwrap(); - if rewrite_bundles.contains(&entry.data.bundle) { + self.index.walk::<_, ()>(|_hash, location| { + if rewrite_bundles.contains(&location.bundle) { panic!("Removed bundle is still referenced in index"); } - pos += 1; - } + Ok(()) + }).ok(); info!("Deleting {} bundles", rewrite_bundles.len()); for id in rewrite_bundles { try!(self.delete_bundle(id));