diff --git a/docs/man/zvault.1.md b/docs/man/zvault.1.md index d2d7f57..e55fd1e 100644 --- a/docs/man/zvault.1.md +++ b/docs/man/zvault.1.md @@ -100,8 +100,8 @@ regarded as not set at all. Examples: -- `~/.zvault` references the repository in `~/.zvault` and is identical with - `::`. +- `~/.zvault/repos/default` references the repository in + `~/.zvault/repos/default` and is identical with `::`. - `::backup1` references the backup `backup1` in the default repository - `::backup1::/` references the root folder of the backup `backup1` in the default repository @@ -189,7 +189,7 @@ The chunker algortihm and chunk size are configured together in the format `algorithm/size` where algorithm is one of `rabin`, `ae` and `fastcdc` and size is the size in KiB e.g. `16`. So the recommended configuration is `fastcdc/16`. -Please not that since the chunker algorithm and chunk size affect the chunks +Please note that since the chunker algorithm and chunk size affect the chunks created from the input data, any change to those values will make existing chunks inaccessible for deduplication purposes. The old data is still readable but new backups will have to store all data again. @@ -198,7 +198,7 @@ but new backups will have to store all data again. ### Compression ZVault offers different compression algorithms that can be used to compress the stored data after deduplication. The compression ratio that can be achieved -mostly depends on the input data (test data can be compressed well and media +mostly depends on the input data (text data can be compressed well and media data like music and videos are already compressed and can not be compressed significantly). 
diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 8072738..f1331b2 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -859,6 +859,7 @@ pub fn run() -> Result<(), ErrorCode> { print_backup(&backup); } } else { + println!("{:?}", repo.statistics()); print_repoinfo(&repo.info()); } } diff --git a/src/index.rs b/src/index.rs index 2031bc9..e9a492e 100644 --- a/src/index.rs +++ b/src/index.rs @@ -8,6 +8,7 @@ use std::os::unix::io::AsRawFd; use mmap::{MemoryMap, MapOption, MapError}; +use ::prelude::*; pub const MAX_USAGE: f64 = 0.9; pub const MIN_USAGE: f64 = 0.35; @@ -373,6 +374,11 @@ impl Index { self.header.capacity = self.capacity as u64; } + #[inline] + fn get_displacement(&self, entry: &Entry, pos: usize) -> usize { + (pos + self.capacity - (entry.get_key().hash() as usize & self.mask)) & self.mask + } + /// Finds the position for this key /// If the key is in the table, it will be the position of the key, /// otherwise it will be the position where this key should be inserted @@ -387,7 +393,7 @@ impl Index { if entry.get_key() == key { return LocateResult::Found(pos); } - let odist = (pos + self.capacity - (entry.get_key().hash() as usize & self.mask)) & self.mask; + let odist = self.get_displacement(entry, pos); if dist > odist { return LocateResult::Steal(pos); } @@ -579,4 +585,20 @@ impl Index { } self.entries = 0; } + + #[allow(dead_code)] + pub fn statistics(&self) -> IndexStatistics { + IndexStatistics { + displacement: ValueStats::from_iter(|| self.data.iter().enumerate().filter( + |&(_, entry)| entry.is_used()).map( + |(index, entry)| self.get_displacement(entry, index) as f32)) + } + } + } + + +#[derive(Debug)] +pub struct IndexStatistics { + pub displacement: ValueStats +} \ No newline at end of file diff --git a/src/prelude.rs b/src/prelude.rs index 5f1d6a4..3979c10 100644 --- a/src/prelude.rs +++ b/src/prelude.rs @@ -4,8 +4,9 @@ pub use bundledb::{BundleReader, BundleMode, BundleWriter, BundleInfo, BundleId, pub use chunker::{ChunkerType, Chunker, 
ChunkerStatus, ChunkerError}; pub use repository::{Repository, Backup, Config, RepositoryError, RepositoryInfo, Inode, FileType, IntegrityError, BackupFileError, BackupError, BackupOptions, BundleAnalysis, - FileData, DiffType, InodeError, RepositoryLayout, Location}; -pub use index::{Index, IndexError}; + FileData, DiffType, InodeError, RepositoryLayout, Location, + RepositoryStatistics}; +pub use index::{Index, IndexError, IndexStatistics}; pub use mount::FuseFilesystem; pub use translation::CowStr; diff --git a/src/repository/info.rs b/src/repository/info.rs index 9717928..52ffb65 100644 --- a/src/repository/info.rs +++ b/src/repository/info.rs @@ -39,6 +39,12 @@ pub struct RepositoryInfo { } +#[derive(Debug)] +pub struct RepositoryStatistics { + pub index: IndexStatistics +} + + impl Repository { fn mark_used( &self, @@ -147,4 +153,11 @@ impl Repository { index_entries: self.index.len() } } + + #[allow(dead_code)] + pub fn statistics(&self) -> RepositoryStatistics { + RepositoryStatistics { + index: self.index.statistics() + } + } } diff --git a/src/repository/mod.rs b/src/repository/mod.rs index 301932d..e90a716 100644 --- a/src/repository/mod.rs +++ b/src/repository/mod.rs @@ -27,7 +27,7 @@ pub use self::metadata::{Inode, FileType, FileData, InodeError}; pub use self::backup::{BackupError, BackupOptions, DiffType}; pub use self::backup_file::{Backup, BackupFileError}; pub use self::integrity::IntegrityError; -pub use self::info::{RepositoryInfo, BundleAnalysis}; +pub use self::info::{RepositoryInfo, BundleAnalysis, RepositoryStatistics}; pub use self::layout::RepositoryLayout; use self::bundle_map::BundleMap; diff --git a/src/util/mod.rs b/src/util/mod.rs index 0e26fc1..e651777 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -9,6 +9,7 @@ mod cli; mod hostname; mod fs; mod lock; +mod statistics; pub mod msgpack; pub use self::fs::*; @@ -22,3 +23,4 @@ pub use self::hex::*; pub use self::cli::*; pub use self::hostname::*; pub use self::lock::*; +pub use 
self::statistics::*;
\ No newline at end of file
diff --git a/src/util/statistics.rs b/src/util/statistics.rs
new file mode 100644
index 0000000..a4664ac
--- /dev/null
+++ b/src/util/statistics.rs
@@ -0,0 +1,90 @@
+//! Summary statistics over a stream of `f32` samples.
+
+/// Distribution summary of a set of `f32` samples.
+///
+/// The `count_*` fields bucket the samples by their distance from `avg`,
+/// measured in units of `stddev`.
+#[derive(Debug, Default)]
+pub struct ValueStats {
+    /// Smallest sample (`0.0` when there are no samples).
+    pub min: f32,
+    /// Largest sample (`0.0` when there are no samples).
+    pub max: f32,
+    /// Arithmetic mean of the samples.
+    pub avg: f32,
+    /// Sample standard deviation (Bessel-corrected, divisor `count - 1`).
+    pub stddev: f32,
+    /// Total number of samples.
+    pub count: usize,
+    /// Samples below `avg - 2 * stddev`.
+    pub count_xs: usize,
+    /// Samples in `[avg - 2 * stddev, avg - stddev)`.
+    pub count_s: usize,
+    /// Samples in `[avg - stddev, avg + stddev)`.
+    pub count_m: usize,
+    /// Samples in `[avg + stddev, avg + 2 * stddev)`.
+    pub count_l: usize,
+    /// Samples at or above `avg + 2 * stddev`.
+    pub count_xl: usize,
+}
+
+impl ValueStats {
+    /// Computes the statistics of the samples produced by `iter`.
+    ///
+    /// `iter` is a factory rather than an iterator because the samples are
+    /// traversed up to three times (mean, deviation, bucketing) without being
+    /// buffered; every call must yield the same sequence.
+    ///
+    /// An empty sequence yields `ValueStats::default()`. With fewer than two
+    /// samples, or when all samples are equal, `stddev` is `0.0` and every
+    /// sample is counted in `count_m`.
+    pub fn from_iter<T: Iterator<Item = f32>, F: Fn() -> T>(iter: F) -> ValueStats {
+        let mut stats = ValueStats::default();
+        stats.min = ::std::f32::INFINITY;
+        stats.max = ::std::f32::NEG_INFINITY;
+        let mut sum = 0.0f64;
+        for val in iter() {
+            stats.min = stats.min.min(val);
+            stats.max = stats.max.max(val);
+            sum += f64::from(val);
+            stats.count += 1;
+        }
+        if stats.count == 0 {
+            // No samples: report zeros instead of an infinite min and a NaN average.
+            return ValueStats::default();
+        }
+        let avg = sum / stats.count as f64;
+        stats.avg = avg as f32;
+        if stats.count < 2 {
+            stats.count_m = stats.count;
+            return stats;
+        }
+        let mut sq_sum = 0.0f64;
+        for val in iter() {
+            let dev = f64::from(val) - avg;
+            sq_sum += dev * dev;
+        }
+        // Sample standard deviation: the square root of the variance, i.e. the
+        // division by `count - 1` has to happen before taking the root.
+        stats.stddev = (sq_sum / (stats.count - 1) as f64).sqrt() as f32;
+        if stats.stddev == 0.0 {
+            // All samples are equal; they are average, not outliers.
+            stats.count_m = stats.count;
+            return stats;
+        }
+        for val in iter() {
+            if val < stats.avg - 2.0 * stats.stddev {
+                stats.count_xs += 1;
+            } else if val < stats.avg - stats.stddev {
+                stats.count_s += 1;
+            } else if val < stats.avg + stats.stddev {
+                stats.count_m += 1;
+            } else if val < stats.avg + 2.0 * stats.stddev {
+                stats.count_l += 1;
+            } else {
+                stats.count_xl += 1;
+            }
+        }
+        stats
+    }
+}
\ No newline at end of file