Index stats

master
Dennis Schwerdel 2018-03-06 22:22:52 +01:00
parent 3b7bb52620
commit 224bf1d25c
8 changed files with 104 additions and 8 deletions

View File

@ -100,8 +100,8 @@ regarded as not set at all.
Examples:
- `~/.zvault` references the repository in `~/.zvault` and is identical with
`::`.
- `~/.zvault/repos/default` references the repository in
`~/.zvault/repos/default` and is identical with `::`.
- `::backup1` references the backup `backup1` in the default repository
- `::backup1::/` references the root folder of the backup `backup1` in the
default repository
@ -189,7 +189,7 @@ The chunker algorithm and chunk size are configured together in the format
`algorithm/size` where algorithm is one of `rabin`, `ae` and `fastcdc` and size
is the size in KiB e.g. `16`. So the recommended configuration is `fastcdc/16`.
Please not that since the chunker algorithm and chunk size affect the chunks
Please note that since the chunker algorithm and chunk size affect the chunks
created from the input data, any change to those values will make existing
chunks inaccessible for deduplication purposes. The old data is still readable
but new backups will have to store all data again.
@ -198,7 +198,7 @@ but new backups will have to store all data again.
### Compression
ZVault offers different compression algorithms that can be used to compress the
stored data after deduplication. The compression ratio that can be achieved
mostly depends on the input data (test data can be compressed well and media
mostly depends on the input data (text data can be compressed well and media
data like music and videos are already compressed and can not be compressed
significantly).

View File

@ -859,6 +859,7 @@ pub fn run() -> Result<(), ErrorCode> {
print_backup(&backup);
}
} else {
println!("{:?}", repo.statistics());
print_repoinfo(&repo.info());
}
}

View File

@ -8,6 +8,7 @@ use std::os::unix::io::AsRawFd;
use mmap::{MemoryMap, MapOption, MapError};
use ::prelude::*;
pub const MAX_USAGE: f64 = 0.9;
pub const MIN_USAGE: f64 = 0.35;
@ -373,6 +374,11 @@ impl<K: Key, V: Value> Index<K, V> {
self.header.capacity = self.capacity as u64;
}
/// Returns the displacement of `entry` when stored at table position `pos`:
/// the number of slots between the entry's ideal position (`hash & mask`)
/// and `pos`, wrapping around the table. `self.capacity` is added before
/// subtracting so the value stays non-negative; masking folds it back into
/// table range. Used as the probe distance for the steal decision in
/// `locate` (robin-hood style probing).
#[inline]
fn get_displacement(&self, entry: &Entry<K, V>, pos: usize) -> usize {
(pos + self.capacity - (entry.get_key().hash() as usize & self.mask)) & self.mask
}
/// Finds the position for this key
/// If the key is in the table, it will be the position of the key,
/// otherwise it will be the position where this key should be inserted
@ -387,7 +393,7 @@ impl<K: Key, V: Value> Index<K, V> {
if entry.get_key() == key {
return LocateResult::Found(pos);
}
let odist = (pos + self.capacity - (entry.get_key().hash() as usize & self.mask)) & self.mask;
let odist = self.get_displacement(entry, pos);
if dist > odist {
return LocateResult::Steal(pos);
}
@ -579,4 +585,20 @@ impl<K: Key, V: Value> Index<K, V> {
}
self.entries = 0;
}
/// Gathers distribution statistics over the displacements (probe distances)
/// of all used entries, scanning the whole backing table once per pass of
/// `ValueStats::from_iter` (the closure re-creates the iterator on each call).
/// Unused slots are skipped via `entry.is_used()`.
#[allow(dead_code)]
pub fn statistics(&self) -> IndexStatistics {
IndexStatistics {
displacement: ValueStats::from_iter(|| self.data.iter().enumerate().filter(
|&(_, entry)| entry.is_used()).map(
|(index, entry)| self.get_displacement(entry, index) as f32))
}
}
}
/// Statistics describing the state of the index hash table.
#[derive(Debug)]
pub struct IndexStatistics {
// Distribution of entry displacements (distance from the slot the
// entry's hash maps to), as produced by `Index::statistics`.
pub displacement: ValueStats
}

View File

@ -4,8 +4,9 @@ pub use bundledb::{BundleReader, BundleMode, BundleWriter, BundleInfo, BundleId,
pub use chunker::{ChunkerType, Chunker, ChunkerStatus, ChunkerError};
pub use repository::{Repository, Backup, Config, RepositoryError, RepositoryInfo, Inode, FileType,
IntegrityError, BackupFileError, BackupError, BackupOptions, BundleAnalysis,
FileData, DiffType, InodeError, RepositoryLayout, Location};
pub use index::{Index, IndexError};
FileData, DiffType, InodeError, RepositoryLayout, Location,
RepositoryStatistics};
pub use index::{Index, IndexError, IndexStatistics};
pub use mount::FuseFilesystem;
pub use translation::CowStr;

View File

@ -39,6 +39,12 @@ pub struct RepositoryInfo {
}
/// Statistics about a repository.
#[derive(Debug)]
pub struct RepositoryStatistics {
// Statistics of the repository's chunk index.
pub index: IndexStatistics
}
impl Repository {
fn mark_used(
&self,
@ -147,4 +153,11 @@ impl Repository {
index_entries: self.index.len()
}
}
/// Collects statistics about this repository.
/// Currently only the index statistics are gathered; this performs a full
/// scan of the index table.
#[allow(dead_code)]
pub fn statistics(&self) -> RepositoryStatistics {
RepositoryStatistics {
index: self.index.statistics()
}
}
}

View File

@ -27,7 +27,7 @@ pub use self::metadata::{Inode, FileType, FileData, InodeError};
pub use self::backup::{BackupError, BackupOptions, DiffType};
pub use self::backup_file::{Backup, BackupFileError};
pub use self::integrity::IntegrityError;
pub use self::info::{RepositoryInfo, BundleAnalysis};
pub use self::info::{RepositoryInfo, BundleAnalysis, RepositoryStatistics};
pub use self::layout::RepositoryLayout;
use self::bundle_map::BundleMap;

View File

@ -9,6 +9,7 @@ mod cli;
mod hostname;
mod fs;
mod lock;
mod statistics;
pub mod msgpack;
pub use self::fs::*;
@ -22,3 +23,4 @@ pub use self::hex::*;
pub use self::cli::*;
pub use self::hostname::*;
pub use self::lock::*;
pub use self::statistics::*;

57
src/util/statistics.rs Normal file
View File

@ -0,0 +1,57 @@
/// Aggregate statistics (min/max/mean/standard deviation plus a five-bucket
/// histogram around the mean) over a stream of `f32` samples.
#[derive(Debug, Default)]
pub struct ValueStats {
    pub min: f32,
    pub max: f32,
    pub avg: f32,
    /// Sample standard deviation (Bessel-corrected, i.e. divides by n-1).
    pub stddev: f32,
    /// Total number of samples.
    pub count: usize,
    /// Samples below avg - 2*stddev.
    pub count_xs: usize,
    /// Samples in [avg - 2*stddev, avg - stddev).
    pub count_s: usize,
    /// Samples in [avg - stddev, avg + stddev).
    pub count_m: usize,
    /// Samples in [avg + stddev, avg + 2*stddev).
    pub count_l: usize,
    /// Samples at or above avg + 2*stddev.
    pub count_xl: usize,
}

impl ValueStats {
    /// Computes statistics over the values yielded by the given iterator.
    ///
    /// The closure is invoked up to three times (one pass each for the basic
    /// moments, the variance, and the histogram), so it must yield the same
    /// sequence of values on every call.
    ///
    /// An empty sequence returns all-zero statistics (instead of
    /// `min = +inf` / `avg = NaN`); a single sample has `stddev = 0` and is
    /// counted in the middle bucket.
    pub fn from_iter<T: Iterator<Item=f32>, F: Fn() -> T>(iter: F) -> ValueStats {
        let mut stats = ValueStats::default();
        stats.min = ::std::f32::INFINITY;
        let mut sum = 0.0f64;
        for val in iter() {
            if stats.min > val {
                stats.min = val;
            }
            if stats.max < val {
                stats.max = val;
            }
            sum += f64::from(val);
            stats.count += 1;
        }
        if stats.count == 0 {
            // No samples: report zeros rather than min = +inf and avg = NaN.
            stats.min = 0.0;
            return stats;
        }
        stats.avg = (sum as f32) / (stats.count as f32);
        if stats.count < 2 {
            // stddev is undefined for a single sample; bucket it as "medium".
            stats.count_m = stats.count;
            return stats;
        }
        // Second pass: sum of squared deviations from the mean (in f64 to
        // limit accumulation error).
        sum = 0.0;
        for val in iter() {
            let diff = f64::from(val - stats.avg);
            sum += diff * diff;
        }
        // Sample standard deviation: sqrt(sum / (n - 1)).
        // BUGFIX: the previous formula sqrt(sum) / (n - 1) took the square
        // root before dividing, which is not a standard deviation at all.
        stats.stddev = (sum / (stats.count as f64 - 1.0)).sqrt() as f32;
        // Third pass: classify each sample into a bucket relative to the mean.
        for val in iter() {
            if val < stats.avg - 2.0 * stats.stddev {
                stats.count_xs += 1;
            } else if val < stats.avg - stats.stddev {
                stats.count_s += 1;
            } else if val < stats.avg + stats.stddev {
                stats.count_m += 1;
            } else if val < stats.avg + 2.0 * stats.stddev {
                stats.count_l += 1;
            } else {
                stats.count_xl += 1;
            }
        }
        stats
    }
}