diff --git a/.travis.yml b/.travis.yml index 0a381a7..05ed5a0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,9 @@ rust: - nightly matrix: allow_failures: - - rust: nightly + - rust: + - beta + - stable script: - cargo clean - cargo build diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a40b5d..5d8886d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,9 +5,15 @@ This project follows [semantic versioning](http://semver.org). ### UNRELEASED * [added] Translation infrastructure (**requires nightly rust**) +* [added] Checking hashes of chunks in check --bundle-data +* [added] Debian package for libsodium23 * [modified] Updated dependencies * [modified] Updated copyright date +* [modified] Moved all code into one crate for easier translation +* [modified] Compression ratio is now displayed in a clearer format * [fixed] Also including the first min_size bytes in hash +* [fixed] Fixed some texts in manpages +* [fixed] Calling strip on final binaries ### v0.4.0 (2017-07-21) diff --git a/Cargo.lock b/Cargo.lock index a296e6e..bcfc2bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,7 +24,7 @@ dependencies = [ [[package]] name = "atty" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "libc 0.2.39 (registry+https://github.com/rust-lang/crates.io-index)", @@ -76,7 +76,7 @@ version = "2.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", - "atty 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", + "atty 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", "bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", "textwrap 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -611,7 +611,7 @@ dependencies = [ "checksum aho-corasick 0.6.4 
(registry+https://github.com/rust-lang/crates.io-index)" = "d6531d44de723825aa81398a6415283229725a00fa30713812ab9323faa82fc4" "checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" "checksum arrayvec 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a1e964f9e24d588183fcb43503abda40d288c8657dfc27311516ce2f05675aef" -"checksum atty 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "8352656fd42c30a0c3c89d26dea01e3b77c0ab2af18230835c15e2e13cd51859" +"checksum atty 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "af80143d6f7608d746df1520709e5d141c96f240b0e62b0aa41bdfb53374d9d4" "checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d" "checksum bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b3c30d3802dfb7281680d6285f2ccdaa8c2d8fee41f93805dba5c4cf50dc23cf" "checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" diff --git a/src/bundledb/db.rs b/src/bundledb/db.rs index 4d76399..09e26c3 100644 --- a/src/bundledb/db.rs +++ b/src/bundledb/db.rs @@ -529,12 +529,14 @@ impl BundleDb { let bundles_data: Vec<_> = bundles.iter().filter(|b| b.mode == BundleMode::Data).collect(); let mut hash_methods = HashMap::new(); let mut compressions = HashMap::new(); + let mut encryptions = HashMap::new(); for bundle in &bundles { *hash_methods.entry(bundle.hash_method).or_insert(0) += 1; *compressions.entry(bundle.compression.clone()).or_insert(0) += 1; + *encryptions.entry(bundle.encryption.clone()).or_insert(0) += 1; } BundleStatistics { - hash_methods, compressions, + hash_methods, compressions, encryptions, raw_size: ValueStats::from_iter(|| bundles.iter().map(|b| b.raw_size as f32)), encoded_size: ValueStats::from_iter(|| 
bundles.iter().map(|b| b.encoded_size as f32)), chunk_count: ValueStats::from_iter(|| bundles.iter().map(|b| b.chunk_count as f32)), diff --git a/src/bundledb/mod.rs b/src/bundledb/mod.rs index acb4aa0..d53ef02 100644 --- a/src/bundledb/mod.rs +++ b/src/bundledb/mod.rs @@ -148,5 +148,6 @@ pub struct BundleStatistics { pub encoded_size_data: ValueStats, pub chunk_count_data: ValueStats, pub hash_methods: HashMap, - pub compressions: HashMap, usize> + pub compressions: HashMap, usize>, + pub encryptions: HashMap, usize> } \ No newline at end of file diff --git a/src/cli/args.rs b/src/cli/args.rs index 93379b8..98567f1 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -41,6 +41,12 @@ pub enum Arguments { inode: Option, force: bool }, + Duplicates { + repo_path: PathBuf, + backup_name: String, + inode: Option, + min_size: u64 + }, Prune { repo_path: PathBuf, prefix: String, @@ -75,7 +81,7 @@ pub enum Arguments { backup_name: Option, inode: Option }, - Stats { + Statistics { repo_path: PathBuf }, Copy { @@ -206,6 +212,31 @@ fn validate_repo_path( parse_repo_path(&repo_path, existing, backup_restr, path_restr).map(|_| ()) } + +fn parse_filesize(num: &str) -> Result { + let (num, suffix) = if num.len() > 0 { + num.split_at(num.len() - 1) + } else { + (num, "b") + }; + let factor = match suffix { + "b" | "B" => 1, + "k" | "K" => 1024, + "m" | "M" => 1024*1024, + "g" | "G" => 1024*1024*1024, + "t" | "T" => 1024*1024*1024*1024, + _ => return Err(tr!("Unknown suffix").to_string()) + }; + let num = try!(parse_num(num)); + Ok(num * factor) +} + +#[allow(unknown_lints, needless_pass_by_value)] +fn validate_filesize(val: String) -> Result<(), String> { + parse_filesize(&val).map(|_| ()) +} + + fn parse_num(num: &str) -> Result { if let Ok(num) = num.parse::() { Ok(num) @@ -467,7 +498,8 @@ pub fn parse() -> Result<(log::Level, Arguments), ErrorCode> { .arg(Arg::from_usage("") .help(tr!("Path of the repository")) .validator(|val| validate_repo_path(val, true, Some(false), 
Some(false))))) - .subcommand(SubCommand::with_name("stats") + .subcommand(SubCommand::with_name("statistics") + .alias("stats") .about(tr!("Display statistics on a repository")) .arg(Arg::from_usage("") .help(tr!("Path of the repository")) @@ -514,6 +546,16 @@ pub fn parse() -> Result<(log::Level, Arguments), ErrorCode> { .arg(Arg::from_usage("") .help(tr!("New version, [repository]::backup[::subpath]")) .validator(|val| validate_repo_path(val, true, Some(true), None)))) + .subcommand(SubCommand::with_name("duplicates") + .aliases(&["dups"]) + .about(tr!("Find duplicate files in a backup")) + .arg(Arg::from_usage("[min_size] --min-size [SIZE]") + .help(tr!("Set the minimum file size")) + .default_value(DEFAULT_DUPLICATES_MIN_SIZE_STR) + .validator(validate_filesize)) + .arg(Arg::from_usage("") + .help(tr!("The backup/subtree path, [repository]::backup[::subtree]")) + .validator(|val| validate_repo_path(val, true, Some(true), None)))) .subcommand(SubCommand::with_name("copy") .alias("cp") .about(tr!("Create a copy of a backup")) @@ -747,14 +789,14 @@ pub fn parse() -> Result<(log::Level, Arguments), ErrorCode> { inode: inode.map(|v| v.to_string()) } } - ("stats", Some(args)) => { + ("statistics", Some(args)) => { let (repository, _backup, _inode) = parse_repo_path( args.value_of("REPO").unwrap(), true, Some(false), Some(false) ).unwrap(); - Arguments::Stats { repo_path: repository } + Arguments::Statistics { repo_path: repository } } ("copy", Some(args)) => { let (repository_src, backup_src, _inode) = @@ -830,6 +872,18 @@ pub fn parse() -> Result<(log::Level, Arguments), ErrorCode> { .unwrap_or_else(|| vec![]) } } + ("duplicates", Some(args)) => { + let (repository, backup, inode) = + parse_repo_path(args.value_of("BACKUP").unwrap(), true, Some(true), None).unwrap(); + Arguments::Duplicates { + repo_path: repository, + backup_name: backup.unwrap().to_string(), + inode: inode.map(|v| v.to_string()), + min_size: args.value_of("min_size").map(|v| { + 
parse_filesize(v).unwrap() + }).unwrap() + } + } ("config", Some(args)) => { let (repository, _backup, _inode) = parse_repo_path( args.value_of("REPO").unwrap(), diff --git a/src/cli/mod.rs b/src/cli/mod.rs index b2a2d9a..f3198fb 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -45,7 +45,8 @@ pub enum ErrorCode { DiffRun, VersionsRun, ImportRun, - FuseMount + FuseMount, + DuplicatesRun } impl ErrorCode { pub fn code(&self) -> i32 { @@ -81,6 +82,7 @@ impl ErrorCode { ErrorCode::VersionsRun => 22, ErrorCode::ImportRun => 23, ErrorCode::FuseMount => 24, + ErrorCode::DuplicatesRun => 27, // ErrorCode::NoSuchBackup => 25, ErrorCode::BackupAlreadyExists => 26, @@ -94,6 +96,7 @@ pub const DEFAULT_HASH: &str = "blake2"; pub const DEFAULT_COMPRESSION: &str = "brotli/3"; pub const DEFAULT_BUNDLE_SIZE_STR: &str = "25"; pub const DEFAULT_VACUUM_RATIO_STR: &str = "0"; +pub const DEFAULT_DUPLICATES_MIN_SIZE_STR: &str = "1b"; lazy_static! { pub static ref ZVAULT_FOLDER: PathBuf = { env::home_dir().unwrap().join(".zvault") @@ -132,6 +135,22 @@ fn get_backup(repo: &Repository, backup_name: &str) -> Result )) } +fn get_inode(repo: &mut Repository, backup: &Backup, inode: Option<&String>) -> Result { + Ok(if let Some(inode) = inode { + checked!( + repo.get_backup_inode(&backup, &inode), + "load subpath inode", + ErrorCode::LoadInode + ) + } else { + checked!( + repo.get_inode(&backup.root), + "load root inode", + ErrorCode::LoadInode + ) + }) +} + fn find_reference_backup( repo: &Repository, path: &str, @@ -322,37 +341,37 @@ fn print_repostats(stats: &RepositoryStatistics) { tr_println!("Displacement:\n - average: {:.1}\n - stddev: {:.1}\n - over {:.1}: {:.0}, {:.1}%\n - maximum: {:.0}", disp.avg, disp.stddev, disp.avg + 2.0 * disp.stddev, disp.count_xl, disp.count_xl as f32 / disp.count as f32 * 100.0, disp.max); println!(""); - tr_println!("Bundles (all)\n============="); + tr_println!("Bundles\n======="); + let tsize = (stats.bundles.raw_size.count as f32 * 
stats.bundles.encoded_size.avg) as u64; + tr_println!("All bundles: {} in {} bundles", to_file_size(tsize), stats.bundles.raw_size.count); let rsize = &stats.bundles.raw_size; - tr_println!("Raw size:\n - average: {}\n - stddev: {}\n - maximum: {}", - to_file_size(rsize.avg as u64), to_file_size(rsize.stddev as u64), to_file_size(rsize.max as u64)); + tr_println!(" - raw size: ø = {}, maximum: {}", to_file_size(rsize.avg as u64), to_file_size(rsize.max as u64)); let esize = &stats.bundles.encoded_size; - tr_println!("Encoded size:\n - average: {}\n - stddev: {}\n - maximum: {}", - to_file_size(esize.avg as u64), to_file_size(esize.stddev as u64), to_file_size(esize.max as u64)); + tr_println!(" - encoded size: ø = {}, maximum: {}", to_file_size(esize.avg as u64), to_file_size(esize.max as u64)); let ccount = &stats.bundles.chunk_count; - tr_println!("Chunk count:\n - average: {:.1}\n - stddev: {:.1}\n - minimum: {:.0}\n - maximum: {:.0}", ccount.avg, ccount.stddev, ccount.min, ccount.max); - println!(""); - tr_println!("Meta bundles\n============"); + tr_println!(" - chunk count: ø = {:.1}, maximum: {:.0}", ccount.avg, ccount.max); + let tsize = (stats.bundles.raw_size_meta.count as f32 * stats.bundles.encoded_size_meta.avg) as u64; + tr_println!("Meta bundles: {} in {} bundles", to_file_size(tsize), stats.bundles.raw_size_meta.count); let rsize = &stats.bundles.raw_size_meta; - tr_println!("Raw size:\n - average: {}\n - stddev: {}\n - maximum: {}", - to_file_size(rsize.avg as u64), to_file_size(rsize.stddev as u64), to_file_size(rsize.max as u64)); + tr_println!(" - raw size: ø = {}, maximum: {}", to_file_size(rsize.avg as u64), to_file_size(rsize.max as u64)); let esize = &stats.bundles.encoded_size_meta; - tr_println!("Encoded size:\n - average: {}\n - stddev: {}\n - maximum: {}", - to_file_size(esize.avg as u64), to_file_size(esize.stddev as u64), to_file_size(esize.max as u64)); + tr_println!(" - encoded size: ø = {}, maximum: {}", to_file_size(esize.avg as 
u64), to_file_size(esize.max as u64)); let ccount = &stats.bundles.chunk_count_meta; - tr_println!("Chunk count:\n - average: {:.1}\n - stddev: {:.1}\n - minimum: {:.0}\n - maximum: {:.0}", ccount.avg, ccount.stddev, ccount.min, ccount.max); - println!(""); - tr_println!("Data bundles\n============"); + tr_println!(" - chunk count: ø = {:.1}, maximum: {:.0}", ccount.avg, ccount.max); + let tsize = (stats.bundles.raw_size_data.count as f32 * stats.bundles.encoded_size_data.avg) as u64; + tr_println!("Data bundles: {} in {} bundles", to_file_size(tsize), stats.bundles.raw_size_data.count); let rsize = &stats.bundles.raw_size_data; - tr_println!("Raw size:\n - average: {}\n - stddev: {}\n - maximum: {}", - to_file_size(rsize.avg as u64), to_file_size(rsize.stddev as u64), to_file_size(rsize.max as u64)); + tr_println!(" - raw size: ø = {}, maximum: {}", to_file_size(rsize.avg as u64), to_file_size(rsize.max as u64)); let esize = &stats.bundles.encoded_size_data; - tr_println!("Encoded size:\n - average: {}\n - stddev: {}\n - maximum: {}", - to_file_size(esize.avg as u64), to_file_size(esize.stddev as u64), to_file_size(esize.max as u64)); + tr_println!(" - encoded size: ø = {}, maximum: {}", to_file_size(esize.avg as u64), to_file_size(esize.max as u64)); let ccount = &stats.bundles.chunk_count_data; - tr_println!("Chunk count:\n - average: {:.1}\n - stddev: {:.1}\n - minimum: {:.0}\n - maximum: {:.0}", ccount.avg, ccount.stddev, ccount.min, ccount.max); + tr_println!(" - chunk count: ø = {:.1}, maximum: {:.0}", ccount.avg, ccount.max); println!(""); tr_println!("Bundle methods\n=============="); + tr_println!("Hash:"); + for (hash, &count) in &stats.bundles.hash_methods { + tr_println!(" - {}: {}, {:.1}%", hash.name(), count, count as f32 / stats.bundles.raw_size.count as f32 * 100.0); + } tr_println!("Compression:"); for (compr, &count) in &stats.bundles.compressions { let compr_name = if let &Some(ref compr) = compr { @@ -362,9 +381,14 @@ fn print_repostats(stats: 
&RepositoryStatistics) { }; tr_println!(" - {}: {}, {:.1}%", compr_name, count, count as f32 / stats.bundles.raw_size.count as f32 * 100.0); } - tr_println!("Hash:"); - for (hash, &count) in &stats.bundles.hash_methods { - tr_println!(" - {}: {}, {:.1}%", hash.name(), count, count as f32 / stats.bundles.raw_size.count as f32 * 100.0); + tr_println!("Encryption:"); + for (encr, &count) in &stats.bundles.encryptions { + let encr_name = if let &Some(ref encr) = encr { + to_hex(&encr.1[..]) + } else { + tr!("none").to_string() + }; + tr_println!(" - {}: {}, {:.1}%", encr_name, count, count as f32 / stats.bundles.raw_size.count as f32 * 100.0); } } @@ -465,6 +489,17 @@ fn print_analysis(analysis: &HashMap) { } } +fn print_duplicates(dups: Vec<(Vec, u64)>) { + for (group, size) in dups { + tr_println!("{} duplicates found, size: {}", group.len(), to_file_size(size)); + for dup in group { + println!(" - {}", dup.to_string_lossy()); + } + println!(); + } +} + + #[allow(unknown_lints, cyclomatic_complexity)] pub fn run() -> Result<(), ErrorCode> { @@ -652,19 +687,7 @@ pub fn run() -> Result<(), ErrorCode> { } => { let mut repo = try!(open_repository(&repo_path, true)); let backup = try!(get_backup(&repo, &backup_name)); - let inode = if let Some(inode) = inode { - checked!( - repo.get_backup_inode(&backup, &inode), - "load subpath inode", - ErrorCode::LoadInode - ) - } else { - checked!( - repo.get_inode(&backup.root), - "load root inode", - ErrorCode::LoadInode - ) - }; + let inode = try!(get_inode(&mut repo, &backup, inode.as_ref())); if tar { checked!( repo.export_tarfile(&backup, inode, &dst_path), @@ -917,12 +940,28 @@ pub fn run() -> Result<(), ErrorCode> { print_repoinfo(&repo.info()); } } - Arguments::Stats { + Arguments::Statistics { repo_path } => { let mut repo = try!(open_repository(&repo_path, false)); print_repostats(&repo.statistics()); } + Arguments::Duplicates { + repo_path, + backup_name, + inode, + min_size + } => { + let mut repo = 
try!(open_repository(&repo_path, true)); + let backup = try!(get_backup(&repo, &backup_name)); + let inode = try!(get_inode(&mut repo, &backup, inode.as_ref())); + let dups = checked!( + repo.find_duplicates(&inode, min_size), + "find duplicates", + ErrorCode::DuplicatesRun + ); + print_duplicates(dups); + } Arguments::Mount { repo_path, backup_name, diff --git a/src/repository/backup.rs b/src/repository/backup.rs index 48ce955..7fefdde 100644 --- a/src/repository/backup.rs +++ b/src/repository/backup.rs @@ -542,4 +542,49 @@ impl Repository { )); Ok(diffs) } + + fn count_sizes_recursive(&mut self, inode: &Inode, sizes: &mut HashMap, min_size: u64) -> Result<(), RepositoryError> { + if inode.size >= min_size { + *sizes.entry(inode.size).or_insert(0) += 1; + } + if let Some(ref children) = inode.children { + for chunks in children.values() { + let ch = try!(self.get_inode(&chunks)); + try!(self.count_sizes_recursive(&ch, sizes, min_size)); + } + } + Ok(()) + } + + fn find_duplicates_recursive(&mut self, inode: &Inode, path: &Path, sizes: &HashMap, hashes: &mut HashMap, u64)>) -> Result<(), RepositoryError> { + let path = path.join(&inode.name); + if sizes.get(&inode.size).cloned().unwrap_or(0) > 1 { + if let Some(ref data) = inode.data { + let chunk_data = try!(msgpack::encode(data).map_err(InodeError::from)); + let hash = HashMethod::Blake2.hash(&chunk_data); + hashes.entry(hash).or_insert((Vec::new(), inode.size)).0.push(path.clone()); + } + } + if let Some(ref children) = inode.children { + for chunks in children.values() { + let ch = try!(self.get_inode(&chunks)); + try!(self.find_duplicates_recursive(&ch, &path, sizes, hashes)); + } + } + Ok(()) + } + + pub fn find_duplicates(&mut self, inode: &Inode, min_size: u64) -> Result, u64)>, RepositoryError> { + let mut sizes = HashMap::new(); + try!(self.count_sizes_recursive(inode, &mut sizes, min_size)); + let mut hashes = HashMap::new(); + if let Some(ref children) = inode.children { + for chunks in 
children.values() { + let ch = try!(self.get_inode(&chunks)); + try!(self.find_duplicates_recursive(&ch, Path::new(""), &sizes, &mut hashes)); + } + } + let dups = hashes.into_iter().map(|(_,v)| v).filter(|&(ref v, _)| v.len() > 1).collect(); + Ok(dups) + } }