stats & dups

This commit is contained in:
Dennis Schwerdel 2018-03-08 15:20:20 +01:00
parent 56c916f585
commit 2f3c97a043
8 changed files with 197 additions and 48 deletions

View File

@ -21,7 +21,9 @@ rust:
- nightly
matrix:
allow_failures:
- rust: nightly
- rust:
- beta
- stable
script:
- cargo clean
- cargo build

View File

@ -5,9 +5,15 @@ This project follows [semantic versioning](http://semver.org).
### UNRELEASED
* [added] Translation infrastructure (**requires nightly rust**)
* [added] Checking hashes of chunks in check --bundle-data
* [added] Debian package for libsodium23
* [modified] Updated dependencies
* [modified] Updated copyright date
* [modified] Moved all code into one crate for easier translation
* [modified] Compression ratio is now displayed in a clearer format
* [fixed] Also including the first min_size bytes in hash
* [fixed] Fixed some texts in manpages
* [fixed] Calling strip on final binaries
### v0.4.0 (2017-07-21)

6
Cargo.lock generated
View File

@ -24,7 +24,7 @@ dependencies = [
[[package]]
name = "atty"
version = "0.2.6"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.39 (registry+https://github.com/rust-lang/crates.io-index)",
@ -76,7 +76,7 @@ version = "2.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
"atty 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
"atty 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"textwrap 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
@ -611,7 +611,7 @@ dependencies = [
"checksum aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "d6531d44de723825aa81398a6415283229725a00fa30713812ab9323faa82fc4"
"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
"checksum arrayvec 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a1e964f9e24d588183fcb43503abda40d288c8657dfc27311516ce2f05675aef"
"checksum atty 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "8352656fd42c30a0c3c89d26dea01e3b77c0ab2af18230835c15e2e13cd51859"
"checksum atty 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "af80143d6f7608d746df1520709e5d141c96f240b0e62b0aa41bdfb53374d9d4"
"checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d"
"checksum bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b3c30d3802dfb7281680d6285f2ccdaa8c2d8fee41f93805dba5c4cf50dc23cf"
"checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400"

View File

@ -529,12 +529,14 @@ impl BundleDb {
let bundles_data: Vec<_> = bundles.iter().filter(|b| b.mode == BundleMode::Data).collect();
let mut hash_methods = HashMap::new();
let mut compressions = HashMap::new();
let mut encryptions = HashMap::new();
for bundle in &bundles {
*hash_methods.entry(bundle.hash_method).or_insert(0) += 1;
*compressions.entry(bundle.compression.clone()).or_insert(0) += 1;
*encryptions.entry(bundle.encryption.clone()).or_insert(0) += 1;
}
BundleStatistics {
hash_methods, compressions,
hash_methods, compressions, encryptions,
raw_size: ValueStats::from_iter(|| bundles.iter().map(|b| b.raw_size as f32)),
encoded_size: ValueStats::from_iter(|| bundles.iter().map(|b| b.encoded_size as f32)),
chunk_count: ValueStats::from_iter(|| bundles.iter().map(|b| b.chunk_count as f32)),

View File

@ -148,5 +148,6 @@ pub struct BundleStatistics {
pub encoded_size_data: ValueStats,
pub chunk_count_data: ValueStats,
pub hash_methods: HashMap<HashMethod, usize>,
pub compressions: HashMap<Option<Compression>, usize>
pub compressions: HashMap<Option<Compression>, usize>,
pub encryptions: HashMap<Option<Encryption>, usize>
}

View File

@ -41,6 +41,12 @@ pub enum Arguments {
inode: Option<String>,
force: bool
},
Duplicates {
repo_path: PathBuf,
backup_name: String,
inode: Option<String>,
min_size: u64
},
Prune {
repo_path: PathBuf,
prefix: String,
@ -75,7 +81,7 @@ pub enum Arguments {
backup_name: Option<String>,
inode: Option<String>
},
Stats {
Statistics {
repo_path: PathBuf
},
Copy {
@ -206,6 +212,31 @@ fn validate_repo_path(
parse_repo_path(&repo_path, existing, backup_restr, path_restr).map(|_| ())
}
fn parse_filesize(num: &str) -> Result<u64, String> {
let (num, suffix) = if num.len() > 0 {
num.split_at(num.len() - 1)
} else {
(num, "b")
};
let factor = match suffix {
"b" | "B" => 1,
"k" | "K" => 1024,
"m" | "M" => 1024*1024,
"g" | "G" => 1024*1024*1024,
"t" | "T" => 1024*1024*1024*1024,
_ => return Err(tr!("Unknown suffix").to_string())
};
let num = try!(parse_num(num));
Ok(num * factor)
}
/// Validator for clap: accepts any value that parses as a file size.
#[allow(unknown_lints, needless_pass_by_value)]
fn validate_filesize(val: String) -> Result<(), String> {
    match parse_filesize(&val) {
        Ok(_) => Ok(()),
        Err(msg) => Err(msg)
    }
}
fn parse_num(num: &str) -> Result<u64, String> {
if let Ok(num) = num.parse::<u64>() {
Ok(num)
@ -467,7 +498,8 @@ pub fn parse() -> Result<(log::Level, Arguments), ErrorCode> {
.arg(Arg::from_usage("<REPO>")
.help(tr!("Path of the repository"))
.validator(|val| validate_repo_path(val, true, Some(false), Some(false)))))
.subcommand(SubCommand::with_name("stats")
.subcommand(SubCommand::with_name("statistics")
.alias("stats")
.about(tr!("Display statistics on a repository"))
.arg(Arg::from_usage("<REPO>")
.help(tr!("Path of the repository"))
@ -514,6 +546,16 @@ pub fn parse() -> Result<(log::Level, Arguments), ErrorCode> {
.arg(Arg::from_usage("<NEW>")
.help(tr!("New version, [repository]::backup[::subpath]"))
.validator(|val| validate_repo_path(val, true, Some(true), None))))
.subcommand(SubCommand::with_name("duplicates")
.aliases(&["dups"])
.about(tr!("Find duplicate files in a backup"))
.arg(Arg::from_usage("[min_size] --min-size [SIZE]")
.help(tr!("Set the minimum file size"))
.default_value(DEFAULT_DUPLICATES_MIN_SIZE_STR)
.validator(validate_filesize))
.arg(Arg::from_usage("<BACKUP>")
.help(tr!("The backup/subtree path, [repository]::backup[::subtree]"))
.validator(|val| validate_repo_path(val, true, Some(true), None))))
.subcommand(SubCommand::with_name("copy")
.alias("cp")
.about(tr!("Create a copy of a backup"))
@ -747,14 +789,14 @@ pub fn parse() -> Result<(log::Level, Arguments), ErrorCode> {
inode: inode.map(|v| v.to_string())
}
}
("stats", Some(args)) => {
("statistics", Some(args)) => {
let (repository, _backup, _inode) = parse_repo_path(
args.value_of("REPO").unwrap(),
true,
Some(false),
Some(false)
).unwrap();
Arguments::Stats { repo_path: repository }
Arguments::Statistics { repo_path: repository }
}
("copy", Some(args)) => {
let (repository_src, backup_src, _inode) =
@ -830,6 +872,18 @@ pub fn parse() -> Result<(log::Level, Arguments), ErrorCode> {
.unwrap_or_else(|| vec![])
}
}
("duplicates", Some(args)) => {
let (repository, backup, inode) =
parse_repo_path(args.value_of("BACKUP").unwrap(), true, Some(true), None).unwrap();
Arguments::Duplicates {
repo_path: repository,
backup_name: backup.unwrap().to_string(),
inode: inode.map(|v| v.to_string()),
min_size: args.value_of("min_size").map(|v| {
parse_filesize(v).unwrap()
}).unwrap()
}
}
("config", Some(args)) => {
let (repository, _backup, _inode) = parse_repo_path(
args.value_of("REPO").unwrap(),

View File

@ -45,7 +45,8 @@ pub enum ErrorCode {
DiffRun,
VersionsRun,
ImportRun,
FuseMount
FuseMount,
DuplicatesRun
}
impl ErrorCode {
pub fn code(&self) -> i32 {
@ -81,6 +82,7 @@ impl ErrorCode {
ErrorCode::VersionsRun => 22,
ErrorCode::ImportRun => 23,
ErrorCode::FuseMount => 24,
ErrorCode::DuplicatesRun => 27,
//
ErrorCode::NoSuchBackup => 25,
ErrorCode::BackupAlreadyExists => 26,
@ -94,6 +96,7 @@ pub const DEFAULT_HASH: &str = "blake2";
pub const DEFAULT_COMPRESSION: &str = "brotli/3";
pub const DEFAULT_BUNDLE_SIZE_STR: &str = "25";
pub const DEFAULT_VACUUM_RATIO_STR: &str = "0";
pub const DEFAULT_DUPLICATES_MIN_SIZE_STR: &str = "1b";
lazy_static! {
pub static ref ZVAULT_FOLDER: PathBuf = {
env::home_dir().unwrap().join(".zvault")
@ -132,6 +135,22 @@ fn get_backup(repo: &Repository, backup_name: &str) -> Result<Backup, ErrorCode>
))
}
/// Resolves the inode a command should operate on: the inode at the
/// given subpath when one was supplied, otherwise the root inode of
/// the backup.
fn get_inode(repo: &mut Repository, backup: &Backup, inode: Option<&String>) -> Result<Inode, ErrorCode> {
    match inode {
        Some(subpath) => Ok(checked!(
            repo.get_backup_inode(&backup, &subpath),
            "load subpath inode",
            ErrorCode::LoadInode
        )),
        None => Ok(checked!(
            repo.get_inode(&backup.root),
            "load root inode",
            ErrorCode::LoadInode
        ))
    }
}
fn find_reference_backup(
repo: &Repository,
path: &str,
@ -322,37 +341,37 @@ fn print_repostats(stats: &RepositoryStatistics) {
tr_println!("Displacement:\n - average: {:.1}\n - stddev: {:.1}\n - over {:.1}: {:.0}, {:.1}%\n - maximum: {:.0}",
disp.avg, disp.stddev, disp.avg + 2.0 * disp.stddev, disp.count_xl, disp.count_xl as f32 / disp.count as f32 * 100.0, disp.max);
println!("");
tr_println!("Bundles (all)\n=============");
tr_println!("Bundles\n=======");
let tsize = (stats.bundles.raw_size.count as f32 * stats.bundles.encoded_size.avg) as u64;
tr_println!("All bundles: {} in {} bundles", to_file_size(tsize), stats.bundles.raw_size.count);
let rsize = &stats.bundles.raw_size;
tr_println!("Raw size:\n - average: {}\n - stddev: {}\n - maximum: {}",
to_file_size(rsize.avg as u64), to_file_size(rsize.stddev as u64), to_file_size(rsize.max as u64));
tr_println!(" - raw size: ø = {}, maximum: {}", to_file_size(rsize.avg as u64), to_file_size(rsize.max as u64));
let esize = &stats.bundles.encoded_size;
tr_println!("Encoded size:\n - average: {}\n - stddev: {}\n - maximum: {}",
to_file_size(esize.avg as u64), to_file_size(esize.stddev as u64), to_file_size(esize.max as u64));
tr_println!(" - encoded size: ø = {}, maximum: {}", to_file_size(esize.avg as u64), to_file_size(esize.max as u64));
let ccount = &stats.bundles.chunk_count;
tr_println!("Chunk count:\n - average: {:.1}\n - stddev: {:.1}\n - minimum: {:.0}\n - maximum: {:.0}", ccount.avg, ccount.stddev, ccount.min, ccount.max);
println!("");
tr_println!("Meta bundles\n============");
tr_println!(" - chunk count: ø = {:.1}, maximum: {:.0}", ccount.avg, ccount.max);
let tsize = (stats.bundles.raw_size_meta.count as f32 * stats.bundles.encoded_size_meta.avg) as u64;
tr_println!("Meta bundles: {} in {} bundles", to_file_size(tsize), stats.bundles.raw_size_meta.count);
let rsize = &stats.bundles.raw_size_meta;
tr_println!("Raw size:\n - average: {}\n - stddev: {}\n - maximum: {}",
to_file_size(rsize.avg as u64), to_file_size(rsize.stddev as u64), to_file_size(rsize.max as u64));
tr_println!(" - raw size: ø = {}, maximum: {}", to_file_size(rsize.avg as u64), to_file_size(rsize.max as u64));
let esize = &stats.bundles.encoded_size_meta;
tr_println!("Encoded size:\n - average: {}\n - stddev: {}\n - maximum: {}",
to_file_size(esize.avg as u64), to_file_size(esize.stddev as u64), to_file_size(esize.max as u64));
tr_println!(" - encoded size: ø = {}, maximum: {}", to_file_size(esize.avg as u64), to_file_size(esize.max as u64));
let ccount = &stats.bundles.chunk_count_meta;
tr_println!("Chunk count:\n - average: {:.1}\n - stddev: {:.1}\n - minimum: {:.0}\n - maximum: {:.0}", ccount.avg, ccount.stddev, ccount.min, ccount.max);
println!("");
tr_println!("Data bundles\n============");
tr_println!(" - chunk count: ø = {:.1}, maximum: {:.0}", ccount.avg, ccount.max);
let tsize = (stats.bundles.raw_size_data.count as f32 * stats.bundles.encoded_size_data.avg) as u64;
tr_println!("Data bundles: {} in {} bundles", to_file_size(tsize), stats.bundles.raw_size_data.count);
let rsize = &stats.bundles.raw_size_data;
tr_println!("Raw size:\n - average: {}\n - stddev: {}\n - maximum: {}",
to_file_size(rsize.avg as u64), to_file_size(rsize.stddev as u64), to_file_size(rsize.max as u64));
tr_println!(" - raw size: ø = {}, maximum: {}", to_file_size(rsize.avg as u64), to_file_size(rsize.max as u64));
let esize = &stats.bundles.encoded_size_data;
tr_println!("Encoded size:\n - average: {}\n - stddev: {}\n - maximum: {}",
to_file_size(esize.avg as u64), to_file_size(esize.stddev as u64), to_file_size(esize.max as u64));
tr_println!(" - encoded size: ø = {}, maximum: {}", to_file_size(esize.avg as u64), to_file_size(esize.max as u64));
let ccount = &stats.bundles.chunk_count_data;
tr_println!("Chunk count:\n - average: {:.1}\n - stddev: {:.1}\n - minimum: {:.0}\n - maximum: {:.0}", ccount.avg, ccount.stddev, ccount.min, ccount.max);
tr_println!(" - chunk count: ø = {:.1}, maximum: {:.0}", ccount.avg, ccount.max);
println!("");
tr_println!("Bundle methods\n==============");
tr_println!("Hash:");
for (hash, &count) in &stats.bundles.hash_methods {
tr_println!(" - {}: {}, {:.1}%", hash.name(), count, count as f32 / stats.bundles.raw_size.count as f32 * 100.0);
}
tr_println!("Compression:");
for (compr, &count) in &stats.bundles.compressions {
let compr_name = if let &Some(ref compr) = compr {
@ -362,9 +381,14 @@ fn print_repostats(stats: &RepositoryStatistics) {
};
tr_println!(" - {}: {}, {:.1}%", compr_name, count, count as f32 / stats.bundles.raw_size.count as f32 * 100.0);
}
tr_println!("Hash:");
for (hash, &count) in &stats.bundles.hash_methods {
tr_println!(" - {}: {}, {:.1}%", hash.name(), count, count as f32 / stats.bundles.raw_size.count as f32 * 100.0);
tr_println!("Encryption:");
for (encr, &count) in &stats.bundles.encryptions {
let encr_name = if let &Some(ref encr) = encr {
to_hex(&encr.1[..])
} else {
tr!("none").to_string()
};
tr_println!(" - {}: {}, {:.1}%", encr_name, count, count as f32 / stats.bundles.raw_size.count as f32 * 100.0);
}
}
@ -465,6 +489,17 @@ fn print_analysis(analysis: &HashMap<u32, BundleAnalysis>) {
}
}
/// Prints every group of duplicate files, one group at a time, with
/// the common file size and the member paths.
fn print_duplicates(dups: Vec<(Vec<PathBuf>, u64)>) {
    for (paths, size) in dups {
        tr_println!("{} duplicates found, size: {}", paths.len(), to_file_size(size));
        for path in &paths {
            println!(" - {}", path.to_string_lossy());
        }
        println!();
    }
}
#[allow(unknown_lints, cyclomatic_complexity)]
pub fn run() -> Result<(), ErrorCode> {
@ -652,19 +687,7 @@ pub fn run() -> Result<(), ErrorCode> {
} => {
let mut repo = try!(open_repository(&repo_path, true));
let backup = try!(get_backup(&repo, &backup_name));
let inode = if let Some(inode) = inode {
checked!(
repo.get_backup_inode(&backup, &inode),
"load subpath inode",
ErrorCode::LoadInode
)
} else {
checked!(
repo.get_inode(&backup.root),
"load root inode",
ErrorCode::LoadInode
)
};
let inode = try!(get_inode(&mut repo, &backup, inode.as_ref()));
if tar {
checked!(
repo.export_tarfile(&backup, inode, &dst_path),
@ -917,12 +940,28 @@ pub fn run() -> Result<(), ErrorCode> {
print_repoinfo(&repo.info());
}
}
Arguments::Stats {
Arguments::Statistics {
repo_path
} => {
let mut repo = try!(open_repository(&repo_path, false));
print_repostats(&repo.statistics());
}
Arguments::Duplicates {
repo_path,
backup_name,
inode,
min_size
} => {
let mut repo = try!(open_repository(&repo_path, true));
let backup = try!(get_backup(&repo, &backup_name));
let inode = try!(get_inode(&mut repo, &backup, inode.as_ref()));
let dups = checked!(
repo.find_duplicates(&inode, min_size),
"find duplicates",
ErrorCode::DuplicatesRun
);
print_duplicates(dups);
}
Arguments::Mount {
repo_path,
backup_name,

View File

@ -542,4 +542,49 @@ impl Repository {
));
Ok(diffs)
}
/// Recursively counts, per exact file size, how many inodes of at
/// least `min_size` bytes have that size. Used as a cheap pre-filter:
/// only sizes occurring more than once can hold duplicates.
fn count_sizes_recursive(&mut self, inode: &Inode, sizes: &mut HashMap<u64, usize>, min_size: u64) -> Result<(), RepositoryError> {
    if inode.size >= min_size {
        let count = sizes.entry(inode.size).or_insert(0);
        *count += 1;
    }
    if let Some(ref children) = inode.children {
        for child_chunks in children.values() {
            let child = try!(self.get_inode(child_chunks));
            try!(self.count_sizes_recursive(&child, sizes, min_size));
        }
    }
    Ok(())
}
/// Recursively collects file paths grouped by a hash of their chunk
/// list. Only files whose size appears more than once in `sizes` are
/// hashed, since a unique size cannot have duplicates.
fn find_duplicates_recursive(&mut self, inode: &Inode, path: &Path, sizes: &HashMap<u64, usize>, hashes: &mut HashMap<Hash, (Vec<PathBuf>, u64)>) -> Result<(), RepositoryError> {
    let path = path.join(&inode.name);
    if sizes.get(&inode.size).map_or(0, |&c| c) > 1 {
        if let Some(ref data) = inode.data {
            // Hash the serialized chunk list as a content fingerprint;
            // identical chunk lists imply identical file contents.
            let encoded = try!(msgpack::encode(data).map_err(InodeError::from));
            let digest = HashMethod::Blake2.hash(&encoded);
            let entry = hashes.entry(digest).or_insert((Vec::new(), inode.size));
            entry.0.push(path.clone());
        }
    }
    if let Some(ref children) = inode.children {
        for child_chunks in children.values() {
            let child = try!(self.get_inode(child_chunks));
            try!(self.find_duplicates_recursive(&child, &path, sizes, hashes));
        }
    }
    Ok(())
}
/// Finds groups of duplicate files below `inode` that are at least
/// `min_size` bytes large. Returns one `(paths, size)` entry per group
/// that has more than one member.
pub fn find_duplicates(&mut self, inode: &Inode, min_size: u64) -> Result<Vec<(Vec<PathBuf>, u64)>, RepositoryError> {
    // First pass: tally file sizes so the second pass only hashes
    // files whose size is shared with at least one other file.
    let mut sizes = HashMap::new();
    try!(self.count_sizes_recursive(inode, &mut sizes, min_size));
    // Second pass: group candidate files by chunk-list hash. Iteration
    // starts at the children so the root inode itself is not reported.
    let mut hashes = HashMap::new();
    if let Some(ref children) = inode.children {
        for child_chunks in children.values() {
            let child = try!(self.get_inode(child_chunks));
            try!(self.find_duplicates_recursive(&child, Path::new(""), &sizes, &mut hashes));
        }
    }
    // Keep only groups with at least two members.
    let dups: Vec<_> = hashes
        .into_iter()
        .map(|(_, group)| group)
        .filter(|&(ref paths, _)| paths.len() > 1)
        .collect();
    Ok(dups)
}
}