Dennis Schwerdel 2017-03-17 11:03:07 +01:00
parent 47d316ace3
commit 69eaf4085e
10 changed files with 489 additions and 419 deletions

.gitignore vendored (1 line changed)

@@ -2,3 +2,4 @@ target
squash
test.tar
test_*
restored

Cargo.lock generated (98 lines changed)

@@ -5,7 +5,7 @@ dependencies = [
"blake2-rfc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"chrono 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.21.1 (registry+https://github.com/rust-lang/crates.io-index)",
"docopt 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
"mmap 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"murmurhash3 0.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
"quick-error 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -17,14 +17,6 @@ dependencies = [
"squash-sys 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "aho-corasick"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "ansi_term"
version = "0.9.0"
@@ -92,17 +84,6 @@ name = "constant_time_eq"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "docopt"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lazy_static 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-serialize 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)",
"strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "kernel32-sys"
version = "0.2.2"
@@ -112,11 +93,6 @@ dependencies = [
"winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "lazy_static"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libc"
version = "0.1.12"
@@ -133,12 +109,9 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "memchr"
version = "1.0.1"
name = "log"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "mmap"
@@ -209,23 +182,6 @@ name = "redox_syscall"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "regex"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rmp"
version = "0.8.4"
@@ -306,24 +262,6 @@ dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "thread-id"
version = "3.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "thread_local"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "time"
version = "0.1.36"
@@ -345,29 +283,11 @@ name = "unicode-width"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unreachable"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "utf8-ranges"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "vec_map"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "void"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi"
version = "0.2.8"
@@ -387,7 +307,6 @@ dependencies = [
]
[metadata]
"checksum aho-corasick 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0638fd549427caa90c499814196d1b9e3725eb4d15d7339d6de073a680ed0ca2"
"checksum ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "23ac7c30002a5accbf7e8987d0632fa6de155b7c3d39d0067317a391e00a2ef6"
"checksum atty 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d912da0db7fa85514874458ca3651fe2cddace8d0b0505571dbdcd41ab490159"
"checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d"
@@ -397,13 +316,11 @@ dependencies = [
"checksum chrono 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "158b0bd7d75cbb6bf9c25967a48a2e9f77da95876b858eadfabaa99cd069de6e"
"checksum clap 2.21.1 (registry+https://github.com/rust-lang/crates.io-index)" = "74a80f603221c9cd9aa27a28f52af452850051598537bb6b359c38a7d61e5cda"
"checksum constant_time_eq 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "07dcb7959f0f6f1cf662f9a7ff389bcb919924d99ac41cf31f10d611d8721323"
"checksum docopt 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ab32ea6e284d87987066f21a9e809a73c14720571ef34516f0890b3d355ccfd8"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum lazy_static 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7291b1dd97d331f752620b02dfdbc231df7fc01bf282a00769e1cdb963c460dc"
"checksum libc 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "e32a70cf75e5846d53a673923498228bbec6a8624708a9ea5645f075d6276122"
"checksum libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)" = "88ee81885f9f04bff991e306fea7c1c60a5f0f9e409e99f6b40e3311a3363135"
"checksum linked-hash-map 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6d262045c5b87c0861b3f004610afd0e2c851e2908d08b6c870cbb9d5f494ecd"
"checksum memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1dbccc0e46f1ea47b9f17e6d67c5a96bd27030519c519c9c91327e31275a47b4"
"checksum log 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "5141eca02775a762cc6cd564d8d2c50f67c0ea3a372cbf1c51592b3e029e10ad"
"checksum mmap 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0bc85448a6006dd2ba26a385a564a8a0f1f2c7e78c70f1a70b2e0f4af286b823"
"checksum murmurhash3 0.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664"
"checksum num 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "98b15ba84e910ea7a1973bccd3df7b31ae282bf9d8bd2897779950c9b8303d40"
@@ -414,8 +331,6 @@ dependencies = [
"checksum quick-error 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0aad603e8d7fb67da22dbdf1f4b826ce8829e406124109e73cf1b2454b93a71c"
"checksum rand 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "022e0636ec2519ddae48154b028864bdce4eaf7d35226ab8e65c611be97b189d"
"checksum redox_syscall 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "8dd35cc9a8bdec562c757e3d43c1526b5c6d2653e23e2315065bc25556550753"
"checksum regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4278c17d0f6d62dfef0ab00028feb45bd7d2102843f80763474eeb1be8a10c01"
"checksum regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9191b1f57603095f105d317e375d19b1c9c5c3185ea9633a99a6dcbed04457"
"checksum rmp 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "e59917c01f49718a59c644a621a4848aafc6577c4a47d66270d78951a807541a"
"checksum rmp-serde 0.12.2 (registry+https://github.com/rust-lang/crates.io-index)" = "06ec4d0cdea2645de5d0e649f90c3e654205d913e14adefa452257314a24e76e"
"checksum rustc-serialize 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)" = "237546c689f20bb44980270c73c3b9edd0891c1be49cc1274406134a66d3957b"
@@ -426,15 +341,10 @@ dependencies = [
"checksum strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b4d15c810519a91cf877e7e36e63fe068815c678181439f2f29e2562147c3694"
"checksum tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "87974a6f5c1dfb344d733055601650059a3363de2a6104819293baff662132d6"
"checksum term_size 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "07b6c1ac5b3fffd75073276bca1ceed01f67a28537097a2a9539e116e50fb21a"
"checksum thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4437c97558c70d129e40629a5b385b3fb1ffac301e63941335e4d354081ec14a"
"checksum thread_local 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c85048c6260d17cf486ceae3282d9fb6b90be220bf5b28c400f5485ffc29f0c7"
"checksum time 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "211b63c112206356ef1ff9b19355f43740fc3f85960c598a93d3a3d3ba7beade"
"checksum unicode-segmentation 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18127285758f0e2c6cf325bb3f3d138a12fee27de4f23e146cd6a179f26c2cf3"
"checksum unicode-width 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "bf3a113775714a22dcb774d8ea3655c53a32debae63a063acc00a91cc586245f"
"checksum unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1f2ae5ddb18e1c92664717616dd9549dde73f539f01bd7b77c2edb2446bdff91"
"checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122"
"checksum vec_map 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f8cdc8b93bd0198ed872357fb2e667f7125646b1762f16d60b2c96350d361897"
"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
"checksum yaml-rust 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e66366e18dc58b46801afbf2ca7661a9f59cc8c5962c29892b6039b4f86fa992"

Cargo.toml

@@ -13,7 +13,7 @@ mmap = "*"
quick-error = "1.1"
blake2-rfc = "*"
murmurhash3 = "*"
docopt = "0.7"
rustc-serialize = "0.3"
chrono = "0.3"
clap = "2.19"
log = "0.3"

src/chunker/mod.rs

@@ -1,4 +1,5 @@
use std::io::{self, Write, Read};
use std::str::FromStr;
mod ae;
mod rabin;
@@ -100,6 +101,19 @@ impl ChunkerType {
}
}
#[inline]
pub fn from_string(name: &str) -> Result<Self, &'static str> {
let (name, size) = if let Some(pos) = name.find('/') {
let size = try!(usize::from_str(&name[pos+1..]).map_err(|_| "Chunk size must be a number"));
let name = &name[..pos];
(name, size)
} else {
(name, 8)
};
Self::from(name, size * 1024, 0)
}
#[inline]
pub fn create(&self) -> Chunker {
match *self {

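The new from_string lets a chunker be specified on the command line as name/size, where the part after the slash is the average chunk size in KiB and 8 is the default when it is omitted. A minimal standalone sketch of that parsing, leaving out the ChunkerType enum itself:

use std::str::FromStr;

// Parses a spec like "fastcdc/8" into (algorithm name, avg chunk size in bytes).
// Mirrors ChunkerType::from_string above; without "/size" it defaults to 8 KiB.
fn parse_chunker_spec(spec: &str) -> Result<(&str, usize), &'static str> {
    let (name, size) = match spec.find('/') {
        Some(pos) => {
            let size = usize::from_str(&spec[pos + 1..])
                .map_err(|_| "Chunk size must be a number")?;
            (&spec[..pos], size)
        }
        None => (spec, 8),
    };
    Ok((name, size * 1024))
}

fn main() {
    assert_eq!(parse_chunker_spec("fastcdc/8"), Ok(("fastcdc", 8192)));
    assert_eq!(parse_chunker_spec("rabin"), Ok(("rabin", 8192)));
    assert!(parse_chunker_spec("ae/x").is_err());
}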
src/cli/algotest.rs

@@ -1,122 +1,152 @@
use std::io::{Cursor, Read};
use std::io::{self, Cursor, Read, Write};
use std::fs::File;
use std::time;
use std::collections::HashSet;
use chrono::Duration;
use ::chunker::*;
use ::util::*;
use ::util::cli::*;
fn speed_chunk<C: IChunker>(chunker: &mut C, data: &[u8]) {
let mut input = Cursor::new(data);
let mut chunk = Vec::with_capacity(1_000_000);
loop {
chunk.clear();
let result = chunker.chunk(&mut input, &mut chunk).unwrap();
if result == ChunkerStatus::Finished {
return
struct ChunkSink {
chunks: Vec<(usize, usize)>,
pos: usize,
written: usize
}
impl ChunkSink {
fn end_chunk(&mut self) {
self.chunks.push((self.pos, self.written));
self.pos += self.written;
self.written = 0;
}
}
fn chunk<C: IChunker>(chunker: &mut C, data: &[u8]) -> Vec<Vec<u8>> {
let mut input = Cursor::new(data);
let mut chunks = Vec::with_capacity(100_000);
loop {
let mut chunk = Vec::with_capacity(100_000);
let result = chunker.chunk(&mut input, &mut chunk).unwrap();
chunks.push(chunk);
if result == ChunkerStatus::Finished {
return chunks;
impl Write for ChunkSink {
fn write(&mut self, data: &[u8]) -> Result<usize, io::Error> {
self.written += data.len();
Ok(data.len())
}
fn flush(&mut self) -> Result<(), io::Error> {
Ok(())
}
}
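ChunkSink is a Write impl that never stores the data it receives: write only counts bytes, and end_chunk records a (position, length) pair at every chunk boundary. The benchmark thus measures chunking itself, with no per-chunk copies. A self-contained sketch of the same counting-sink pattern (hypothetical CountingSink, with boundaries ended by hand instead of by a chunker):

use std::io::{self, Write};

// Discards the data, recording only (offset, length) per chunk.
struct CountingSink {
    chunks: Vec<(usize, usize)>,
    pos: usize,
    written: usize,
}

impl CountingSink {
    fn end_chunk(&mut self) {
        self.chunks.push((self.pos, self.written));
        self.pos += self.written;
        self.written = 0;
    }
}

impl Write for CountingSink {
    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
        self.written += data.len(); // count, don't copy
        Ok(data.len())
    }
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}

fn main() {
    let mut sink = CountingSink { chunks: vec![], pos: 0, written: 0 };
    sink.write_all(b"hello").unwrap();
    sink.end_chunk();
    sink.write_all(b"world!").unwrap();
    sink.end_chunk();
    assert_eq!(sink.chunks, vec![(0, 5), (5, 6)]);
}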
fn analyze_chunks(mut chunks: Vec<Vec<u8>>) -> (usize, f64, f64, f64) {
let count = chunks.len();
let total = chunks.iter().map(|c| c.len()).sum::<usize>();
let avg_size = total as f64 / count as f64;
let stddev = (chunks.iter().map(|c| (c.len() as f64 - avg_size).powi(2)).sum::<f64>() / (count as f64 - 1.0)).sqrt();
chunks.sort();
chunks.dedup();
let non_dup: usize = chunks.iter().map(|c| c.len()).sum();
let saved = 1.0 - non_dup as f64 / total as f64;
(count, avg_size, stddev, saved)
fn chunk(data: &[u8], mut chunker: Chunker, sink: &mut ChunkSink) {
let mut cursor = Cursor::new(data);
while chunker.chunk(&mut cursor, sink).unwrap() == ChunkerStatus::Continue {
sink.end_chunk();
}
fn compare_chunker<C: IChunker>(name: &str, mut chunker: C, data: &[u8]) {
let start = time::Instant::now();
speed_chunk(&mut chunker, data);
let elapsed = start.elapsed();
let chunks = chunk(&mut chunker, data);
let duration = elapsed.as_secs() as f64 * 1.0 + elapsed.subsec_nanos() as f64 / 1_000_000_000.0;
let speed = data.len() as f64 / duration;
assert_eq!(chunks.iter().map(|c| c.len()).sum::<usize>(), data.len());
let (_count, avg_size, stddev, saved) = analyze_chunks(chunks);
println!("{}: \tavg chunk size {:.1}\t± {:.1} bytes, \t{:.1}% saved,\tspeed {:.1} MB/s",
name, avg_size, stddev, saved * 100.0, speed / 1_000_000.0);
}
fn compare_hash(name: &str, hash: HashMethod, data: &[u8]) {
let start = time::Instant::now();
let _ = hash.hash(data);
let elapsed = start.elapsed();
let duration = elapsed.as_secs() as f64 * 1.0 + elapsed.subsec_nanos() as f64 / 1_000_000_000.0;
let speed = data.len() as f64 / duration;
println!("{}: {:.1} MB/s", name, speed / 1_000_000.0);
}
fn compare_compression(name: &str, method: Compression, data: &[u8]) {
let start = time::Instant::now();
let compressed = method.compress(data).unwrap();
let elapsed = start.elapsed();
let duration = elapsed.as_secs() as f64 * 1.0 + elapsed.subsec_nanos() as f64 / 1_000_000_000.0;
let cspeed = data.len() as f64 / duration;
let ratio = compressed.len() as f64 / data.len() as f64;
let start = time::Instant::now();
let uncompressed = method.decompress(&compressed).unwrap();
if uncompressed != data {
panic!("{} did not uncompress to the same value", name);
}
let elapsed = start.elapsed();
let duration = elapsed.as_secs() as f64 * 1.0 + elapsed.subsec_nanos() as f64 / 1_000_000_000.0;
let dspeed = data.len() as f64 / duration;
println!("{}:\tratio: {:.1}%,\tcompress: {:.1} MB/s,\tdecompress: {:.1} MB/s",
name, ratio * 100.0, cspeed / 1_000_000.0, dspeed / 1_000_000.0);
sink.end_chunk();
}
#[allow(dead_code)]
pub fn run(path: &str) {
println!("Algorithm comparison on file {}", path);
println!();
print!("Reading input file...");
pub fn run(path: &str, bundle_size: usize, chunker: ChunkerType, compression: Option<Compression>, hash: HashMethod) {
let mut total_time = 0.0;
println!("Reading input file ...");
let mut file = File::open(path).unwrap();
let mut data = Vec::new();
let total_size = file.metadata().unwrap().len();
let mut size = total_size;
let mut data = Vec::with_capacity(size as usize);
let read_time = Duration::span(|| {
file.read_to_end(&mut data).unwrap();
println!(" done. {} bytes", data.len());
}).num_milliseconds() as f32 / 1_000.0;
println!("- {}, {}", to_duration(read_time), to_speed(size, read_time));
println!();
println!("Chunker algorithms");
for size in &[4usize, 8, 16, 32, 64] {
println!(" Chunk size: {} KiB", size);
compare_chunker(" AE", AeChunker::new(size*1024), &data);
compare_chunker(" Rabin", RabinChunker::new(size*1024, 0), &data);
compare_chunker(" FastCdc", FastCdcChunker::new(size*1024, 0), &data);
}
println!("Chunking data with {}, avg chunk size {} ...", chunker.name(), to_file_size(chunker.avg_size() as u64));
let mut chunk_sink = ChunkSink {
chunks: Vec::with_capacity(2*size as usize/chunker.avg_size()),
written: 0,
pos: 0
};
let chunker = chunker.create();
let chunk_time = Duration::span(|| {
chunk(&data, chunker, &mut chunk_sink)
}).num_milliseconds() as f32 / 1_000.0;
total_time += chunk_time;
println!("- {}, {}", to_duration(chunk_time), to_speed(size, chunk_time));
let mut chunks = chunk_sink.chunks;
assert_eq!(chunks.iter().map(|c| c.1).sum::<usize>(), size as usize);
let chunk_size_avg = size as f32 / chunks.len() as f32;
let chunk_size_stddev = (chunks.iter().map(|c| (c.1 as f32 - chunk_size_avg).powi(2)).sum::<f32>() / (chunks.len() as f32 - 1.0)).sqrt();
println!("- {} chunks, avg size: {}, stddev: {}", chunks.len(), to_file_size(chunk_size_avg as u64), to_file_size(chunk_size_stddev as u64));
println!();
println!("Hash algorithms");
compare_hash(" Blake2", HashMethod::Blake2, &data);
compare_hash(" Murmur3", HashMethod::Murmur3, &data);
println!("Hashing chunks with {} ...", hash.name());
let mut hashes = Vec::with_capacity(chunks.len());
let hash_time = Duration::span(|| {
for &(pos, len) in &chunks {
hashes.push(hash.hash(&data[pos..pos+len]))
}
}).num_milliseconds() as f32 / 1_000.0;
total_time += hash_time;
println!("- {}, {}", to_duration(hash_time), to_speed(size, hash_time));
let mut seen_hashes = HashSet::with_capacity(hashes.len());
let mut dups = Vec::new();
for (i, hash) in hashes.into_iter().enumerate() {
if !seen_hashes.insert(hash) {
dups.push(i);
}
}
let mut dup_size = 0;
dups.reverse();
for i in &dups {
let (_, len) = chunks.remove(*i);
dup_size += len;
}
println!("- {} duplicate chunks, {}, {:.1}% saved", dups.len(), to_file_size(dup_size as u64), dup_size as f32 / size as f32*100.0);
size -= dup_size as u64;
if let Some(compression) = compression {
println!();
println!("Compression algorithms");
compare_compression(" Snappy", Compression::Snappy(()), &data);
for level in 1..10 {
compare_compression(&format!(" ZStd/{}", level), Compression::ZStd(level), &data);
}
for level in 1..10 {
compare_compression(&format!(" Deflate/{}", level), Compression::Deflate(level), &data);
}
for level in 1..10 {
compare_compression(&format!(" Brotli/{}", level), Compression::Brotli(level), &data);
}
for level in 1..7 {
compare_compression(&format!(" Lzma2/{}", level), Compression::Lzma2(level), &data);
println!("Compressing chunks with {} ...", compression.to_string());
let mut bundles = Vec::new();
let compress_time = Duration::span(|| {
let mut bundle = Vec::with_capacity(bundle_size + 2*chunk_size_avg as usize);
let mut c = compression.compress_stream().unwrap();
for &(pos, len) in &chunks {
c.process(&data[pos..pos+len], &mut bundle).unwrap();
if bundle.len() >= bundle_size {
c.finish(&mut bundle).unwrap();
bundles.push(bundle);
bundle = Vec::with_capacity(bundle_size + 2*chunk_size_avg as usize);
c = compression.compress_stream().unwrap();
}
}
c.finish(&mut bundle).unwrap();
bundles.push(bundle);
}).num_milliseconds() as f32 / 1_000.0;
total_time += compress_time;
println!("- {}, {}", to_duration(compress_time), to_speed(size, compress_time));
let compressed_size = bundles.iter().map(|b| b.len()).sum::<usize>();
println!("- {} bundles, {}, {:.1}% saved", bundles.len(), to_file_size(compressed_size as u64), (size as f32 - compressed_size as f32)/size as f32*100.0);
size = compressed_size as u64;
println!();
println!("Decompressing bundles with {} ...", compression.to_string());
let mut dummy = ChunkSink { chunks: vec![], written: 0, pos: 0 };
let decompress_time = Duration::span(|| {
for bundle in &bundles {
let mut c = compression.decompress_stream().unwrap();
c.process(bundle, &mut dummy).unwrap();
c.finish(&mut dummy).unwrap();
}
}).num_milliseconds() as f32 / 1_000.0;
println!("- {}, {}", to_duration(decompress_time), to_speed(size, decompress_time));
}
println!();
let total_saved = total_size - size;
println!("Total space saved: {}, {:.1}%", to_file_size(total_saved as u64), total_saved as f32/total_size as f32*100.0);
println!("Total processing speed: {}", to_speed(total_size, total_time));
}
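Duplicate detection above boils down to: hash every chunk, insert the hashes into a HashSet, and treat any hash already present as a duplicate whose length counts toward the saved size. A compact sketch with u64 stand-ins for the real digest type:

use std::collections::HashSet;

// Given (hash, len) per chunk, returns (duplicate count, bytes saved).
fn dedup_stats(chunks: &[(u64, usize)]) -> (usize, usize) {
    let mut seen = HashSet::with_capacity(chunks.len());
    let (mut dups, mut saved) = (0, 0);
    for &(hash, len) in chunks {
        if !seen.insert(hash) {
            // Hash already seen: this chunk is a duplicate.
            dups += 1;
            saved += len;
        }
    }
    (dups, saved)
}

fn main() {
    let chunks = [(1, 100), (2, 200), (1, 100), (3, 50), (2, 200)];
    assert_eq!(dedup_stats(&chunks), (2, 300));
}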

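Bundle packing in the compression benchmark streams chunk after chunk through one compressor and cuts a new bundle whenever the compressed output passes bundle_size. The compress_stream/process/finish API is internal to zvault, so this sketch keeps only the grouping logic and skips actual compression:

// Packs chunk payloads into bundles of roughly `bundle_size` bytes.
// The real code feeds each chunk through compressor.process() and calls
// compressor.finish() before starting a new bundle; this sketch just groups.
fn pack_bundles(chunks: &[&[u8]], bundle_size: usize) -> Vec<Vec<u8>> {
    let mut bundles = Vec::new();
    let mut bundle: Vec<u8> = Vec::with_capacity(bundle_size);
    for chunk in chunks {
        bundle.extend_from_slice(chunk);
        if bundle.len() >= bundle_size {
            bundles.push(std::mem::replace(&mut bundle, Vec::with_capacity(bundle_size)));
        }
    }
    if !bundle.is_empty() {
        bundles.push(bundle);
    }
    bundles
}

fn main() {
    let chunks: Vec<&[u8]> = vec![b"aaaa", b"bbbb", b"cc", b"dddddd"];
    let bundles = pack_bundles(&chunks, 8);
    assert_eq!(bundles.len(), 2); // [aaaabbbb] and [ccdddddd]
}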
src/cli/args.rs

@@ -1,68 +1,7 @@
use clap::{Arg, App, SubCommand};
use docopt::Docopt;
use ::chunker::ChunkerType;
use ::util::{ChecksumType, Compression, HashMethod};
use ::util::{Compression, HashMethod, ChecksumType};
use std::process::exit;
use std::path::Path;
static USAGE: &'static str = "
Usage:
zvault init [--bundle-size SIZE] [--chunker METHOD] [--chunk-size SIZE] [--compression COMPRESSION] <repo>
zvault backup [--full] <backup> <path>
zvault restore <backup> [<src>] <dst>
zvault check [--full] <repo>
zvault backups <repo>
zvault info <backup>
zvault list [--tree] <backup> <path>
zvault stats <repo>
zvault bundles <repo>
zvault algotest <file>
Options:
--tree Print the whole (sub-)tree from the backup
--full Whether to verify the repository by loading all bundles
--bundle-size SIZE The target size of a full bundle in MiB [default: 25]
--chunker METHOD The chunking algorithm to use [default: fastcdc]
--chunk-size SIZE The target average chunk size in KiB [default: 8]
--compression COMPRESSION The compression to use [default: brotli/3]
";
#[derive(RustcDecodable, Debug)]
pub struct DocoptArgs {
pub cmd_init: bool,
pub cmd_backup: bool,
pub cmd_restore: bool,
pub cmd_check: bool,
pub cmd_backups: bool,
pub cmd_info: bool,
pub cmd_list: bool,
pub cmd_stats: bool,
pub cmd_bundles: bool,
pub cmd_algotest: bool,
pub cmd_stat: bool,
pub arg_file: Option<String>,
pub arg_repo: Option<String>,
pub arg_path: Option<String>,
pub arg_src: Option<String>,
pub arg_dst: Option<String>,
pub arg_backup: Option<String>,
pub flag_full: bool,
pub flag_bundle_size: usize,
pub flag_chunker: String,
pub flag_chunk_size: usize,
pub flag_compression: String,
pub flag_tree: bool
}
pub enum Arguments {
@@ -70,8 +9,8 @@ pub enum Arguments {
repo_path: String,
bundle_size: usize,
chunker: ChunkerType,
chunk_size: usize,
compression: Compression
compression: Option<Compression>,
hash: HashMethod
},
Backup {
repo_path: String,
@@ -105,26 +44,89 @@ pub enum Arguments {
repo_path: String
},
AlgoTest {
file: String
file: String,
bundle_size: usize,
chunker: ChunkerType,
compression: Option<Compression>,
hash: HashMethod
}
}
pub fn parse() -> DocoptArgs {
Docopt::new(USAGE).and_then(|d| d.decode()).unwrap_or_else(|e| e.exit())
pub fn split_repo_path(repo_path: &str) -> (&str, Option<&str>, Option<&str>) {
let mut parts = repo_path.splitn(3, "::");
let repo = parts.next().unwrap();
let backup = parts.next();
let inode = parts.next();
(repo, backup, inode)
}
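split_repo_path implements the repo::backup::subpath address syntax: up to three ::-separated parts, with missing parts returned as None. Restated as a runnable example:

fn split_repo_path(repo_path: &str) -> (&str, Option<&str>, Option<&str>) {
    let mut parts = repo_path.splitn(3, "::");
    let repo = parts.next().unwrap();
    (repo, parts.next(), parts.next())
}

fn main() {
    assert_eq!(split_repo_path("repo"), ("repo", None, None));
    assert_eq!(split_repo_path("repo::daily"), ("repo", Some("daily"), None));
    assert_eq!(
        split_repo_path("repo::daily::/home/user"),
        ("repo", Some("daily"), Some("/home/user"))
    );
}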
pub fn parse2() -> Arguments {
fn parse_num(num: &str, name: &str) -> u64 {
if let Ok(num) = num.parse::<u64>() {
num
} else {
error!("{} must be a number, was '{}'", name, num);
exit(1);
}
}
fn parse_chunker(val: Option<&str>) -> ChunkerType {
if let Ok(chunker) = ChunkerType::from_string(val.unwrap_or("fastcdc/8")) {
chunker
} else {
error!("Invalid chunker method/size: {}", val.unwrap());
exit(1);
}
}
fn parse_compression(val: Option<&str>) -> Option<Compression> {
let val = val.unwrap_or("brotli/3");
if val == "none" {
return None
}
if let Ok(compression) = Compression::from_string(val) {
Some(compression)
} else {
error!("Invalid compression method/level: {}", val);
exit(1);
}
}
#[allow(dead_code)]
fn parse_checksum(val: Option<&str>) -> ChecksumType {
if let Ok(checksum) = ChecksumType::from(val.unwrap_or("blake2")) {
checksum
} else {
error!("Invalid checksum method: {}", val.unwrap());
exit(1);
}
}
fn parse_hash(val: Option<&str>) -> HashMethod {
if let Ok(hash) = HashMethod::from(val.unwrap_or("blake2")) {
hash
} else {
error!("Invalid hash method: {}", val.unwrap());
exit(1);
}
}
pub fn parse() -> Arguments {
let args = clap_app!(zvault =>
(version: "0.1")
(version: env!("CARGO_PKG_VERSION"))
(author: "Dennis Schwerdel <schwerdel@googlemail.com>")
(about: "Deduplicating backup tool")
(@setting SubcommandRequiredElseHelp)
(@setting GlobalVersion)
(@setting VersionlessSubcommands)
(@setting UnifiedHelpMessage)
(@subcommand init =>
(about: "initializes a new repository")
(@arg bundle_size: --bundle-size +takes_value "maximal bundle size")
(@arg chunker: --chunker +takes_value "chunker algorithm")
(@arg chunk_size: --chunk-size +takes_value "average chunk size")
(@arg compression: --compression -c +takes_value "compression to use")
(@arg bundle_size: --bundle-size +takes_value "maximal bundle size in MiB [default: 25]")
(@arg chunker: --chunker +takes_value "chunker algorithm [default: fastcdc/8]")
(@arg compression: --compression -c +takes_value "compression to use [default: brotli/3]")
(@arg hash: --hash +takes_value "hash method to use [default: blake2]")
(@arg REPO: +required "path of the repository")
)
(@subcommand backup =>
@@ -157,14 +159,101 @@ pub fn parse2() -> Arguments {
)
(@subcommand algotest =>
(about: "test a specific algorithm combination")
(@arg bundle_size: --bundle-size +takes_value "maximal bundle size")
(@arg chunker: --chunker +takes_value "chunker algorithm")
(@arg chunk_size: --chunk-size +takes_value "average chunk size")
(@arg compression: --compression -c +takes_value "compression to use")
(@arg bundle_size: --bundle-size +takes_value "maximal bundle size in MiB [default: 25]")
(@arg chunker: --chunker +takes_value "chunker algorithm [default: fastcdc/8]")
(@arg compression: --compression -c +takes_value "compression to use [default: brotli/3]")
(@arg hash: --hash +takes_value "hash method to use [default: blake2]")
(@arg FILE: +required "the file to test the algorithms with")
)
).get_matches();
if let Some(args) = args.subcommand_matches("init") {
let (repository, backup, inode) = split_repo_path(args.value_of("REPO").unwrap());
if backup.is_some() || inode.is_some() {
println!("No backups or subpaths may be given here");
exit(1);
}
unimplemented!()
return Arguments::Init {
bundle_size: (parse_num(args.value_of("bundle_size").unwrap_or("25"), "Bundle size") * 1024 * 1024) as usize,
chunker: parse_chunker(args.value_of("chunker")),
compression: parse_compression(args.value_of("compression")),
hash: parse_hash(args.value_of("hash")),
repo_path: repository.to_string(),
}
}
if let Some(args) = args.subcommand_matches("backup") {
let (repository, backup, inode) = split_repo_path(args.value_of("BACKUP").unwrap());
if backup.is_none() {
println!("A backup must be specified");
exit(1);
}
if inode.is_some() {
println!("No subpaths may be given here");
exit(1);
}
return Arguments::Backup {
repo_path: repository.to_string(),
backup_name: backup.unwrap().to_string(),
full: args.is_present("full"),
src_path: args.value_of("SRC").unwrap().to_string()
}
}
if let Some(args) = args.subcommand_matches("restore") {
let (repository, backup, inode) = split_repo_path(args.value_of("BACKUP").unwrap());
if backup.is_none() {
println!("A backup must be specified");
exit(1);
}
return Arguments::Restore {
repo_path: repository.to_string(),
backup_name: backup.unwrap().to_string(),
inode: inode.map(|v| v.to_string()),
dst_path: args.value_of("DST").unwrap().to_string()
}
}
if let Some(args) = args.subcommand_matches("check") {
let (repository, backup, inode) = split_repo_path(args.value_of("PATH").unwrap());
return Arguments::Check {
repo_path: repository.to_string(),
backup_name: backup.map(|v| v.to_string()),
inode: inode.map(|v| v.to_string()),
full: args.is_present("full")
}
}
if let Some(args) = args.subcommand_matches("list") {
let (repository, backup, inode) = split_repo_path(args.value_of("PATH").unwrap());
return Arguments::List {
repo_path: repository.to_string(),
backup_name: backup.map(|v| v.to_string()),
inode: inode.map(|v| v.to_string())
}
}
if let Some(args) = args.subcommand_matches("listbundles") {
let (repository, backup, inode) = split_repo_path(args.value_of("PATH").unwrap());
if backup.is_some() || inode.is_some() {
println!("No backups or subpaths may be given here");
exit(1);
}
return Arguments::ListBundles {
repo_path: repository.to_string(),
}
}
if let Some(args) = args.subcommand_matches("info") {
let (repository, backup, inode) = split_repo_path(args.value_of("PATH").unwrap());
return Arguments::Info {
repo_path: repository.to_string(),
backup_name: backup.map(|v| v.to_string()),
inode: inode.map(|v| v.to_string())
}
}
if let Some(args) = args.subcommand_matches("algotest") {
return Arguments::AlgoTest {
bundle_size: (parse_num(args.value_of("bundle_size").unwrap_or("25"), "Bundle size") * 1024 * 1024) as usize,
chunker: parse_chunker(args.value_of("chunker")),
compression: parse_compression(args.value_of("compression")),
hash: parse_hash(args.value_of("hash")),
file: args.value_of("FILE").unwrap().to_string(),
}
}
error!("No subcommand given");
exit(1);
}

src/cli/logger.rs (new file, 23 lines)

@@ -0,0 +1,23 @@
use log::{self, LogRecord, LogLevel, LogMetadata, LogLevelFilter};
pub use log::SetLoggerError;
struct Logger;
impl log::Log for Logger {
fn enabled(&self, metadata: &LogMetadata) -> bool {
metadata.level() <= LogLevel::Info
}
fn log(&self, record: &LogRecord) {
if self.enabled(record.metadata()) {
println!("{} - {}", record.level(), record.args());
}
}
}
pub fn init() -> Result<(), SetLoggerError> {
log::set_logger(|max_log_level| {
max_log_level.set(LogLevelFilter::Info);
Box::new(Logger)
})
}
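Once installed via logger::init(), the error!/warn!/info! macros from the log crate print through this backend, and anything below Info is filtered out; the closure-based log::set_logger shown here is the log 0.3.x API. A usage sketch, assuming the module is reachable as logger:

#[macro_use] extern crate log;

mod logger; // the file above

fn main() {
    if let Err(err) = logger::init() {
        println!("Failed to initialize the logger: {}", err);
        std::process::exit(-1);
    }
    info!("repository opened");  // printed as "INFO - repository opened"
    debug!("noisy details");     // filtered: Debug is below the Info threshold
    error!("backup failed");     // printed as "ERROR - backup failed"
}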

src/cli/mod.rs

@@ -1,62 +1,119 @@
mod args;
mod logger;
mod algotest;
use chrono::prelude::*;
use std::process::exit;
use ::chunker::ChunkerType;
use ::repository::{Repository, Config, Inode};
use ::util::{ChecksumType, Compression, HashMethod};
use ::repository::{Repository, Config, Inode, Backup};
use ::util::ChecksumType;
use ::util::cli::*;
use self::args::Arguments;
pub fn run() {
let args = args::parse();
if args.cmd_algotest {
let file = args.arg_file.unwrap();
algotest::run(&file);
return
fn open_repository(path: &str) -> Repository {
match Repository::open(path) {
Ok(repo) => repo,
Err(err) => {
error!("Failed to open repository: {}", err);
exit(2);
}
}
}
if args.cmd_init {
let chunker = ChunkerType::from(&args.flag_chunker, args.flag_chunk_size*1024, 0).expect("No such chunk algorithm");
let compression = if args.flag_compression == "none" {
None
} else {
Some(Compression::from_string(&args.flag_compression).expect("Failed to parse compression"))
};
Repository::create(&args.arg_repo.unwrap(), Config {
bundle_size: args.flag_bundle_size*1024*1024,
fn get_backup(repo: &Repository, backup_name: &str) -> Backup {
match repo.get_backup(backup_name) {
Ok(backup) => backup,
Err(err) => {
error!("Failed to load backup: {}", err);
exit(3);
}
}
}
pub fn run() {
if let Err(err) = logger::init() {
println!("Failed to initialize the logger: {}", err);
exit(-1)
}
match args::parse() {
Arguments::Init{repo_path, bundle_size, chunker, compression, hash} => {
Repository::create(repo_path, Config {
bundle_size: bundle_size,
checksum: ChecksumType::Blake2_256,
chunker: chunker,
compression: compression,
hash: HashMethod::Blake2
hash: hash
}).unwrap();
return
},
Arguments::Backup{repo_path, backup_name, src_path, full} => {
let mut repo = open_repository(&repo_path);
if !full {
warn!("Partial backups are not implemented yet, creating full backup");
}
if args.cmd_stat {
println!("{:?}", Inode::get_from(&args.arg_path.unwrap()).unwrap());
return
}
let mut repo;
if let Some(path) = args.arg_repo {
repo = Repository::open(path).unwrap();
} else if let Some(ref backup) = args.arg_backup {
let path = backup.splitn(2, "::").nth(0).unwrap();
repo = Repository::open(path).unwrap();
let backup = repo.create_full_backup(&src_path).unwrap();
repo.save_backup(&backup, &backup_name).unwrap();
},
Arguments::Restore{repo_path, backup_name, inode, dst_path} => {
let mut repo = open_repository(&repo_path);
let backup = get_backup(&repo, &backup_name);
if let Some(inode) = inode {
let inode = repo.get_backup_inode(&backup, &inode).unwrap();
repo.restore_inode_tree(inode, &dst_path).unwrap();
} else {
panic!("Repository is needed");
repo.restore_backup(&backup, &dst_path).unwrap();
}
if args.cmd_check {
repo.check(args.flag_full).unwrap();
return
},
Arguments::Check{repo_path, backup_name, inode, full} => {
let mut repo = open_repository(&repo_path);
if let Some(backup_name) = backup_name {
let backup = get_backup(&repo, &backup_name);
if let Some(inode) = inode {
unimplemented!()
} else {
unimplemented!()
}
if args.cmd_stats {
} else {
repo.check(full).unwrap()
}
},
Arguments::List{repo_path, backup_name, inode} => {
let mut repo = open_repository(&repo_path);
if let Some(backup_name) = backup_name {
let backup = get_backup(&repo, &backup_name);
let inode = repo.get_backup_inode(&backup, inode.as_ref().map(|v| v as &str).unwrap_or("/")).unwrap();
println!("{}", format_inode_one_line(&inode));
if let Some(children) = inode.children {
for chunks in children.values() {
let inode = repo.get_inode(chunks).unwrap();
println!("- {}", format_inode_one_line(&inode));
}
}
} else {
for backup in repo.list_backups().unwrap() {
println!("{}", backup);
}
}
}
Arguments::Info{repo_path, backup_name, inode} => {
let repo = open_repository(&repo_path);
if let Some(backup_name) = backup_name {
let backup = get_backup(&repo, &backup_name);
if let Some(inode) = inode {
unimplemented!()
} else {
println!("Date: {}", Local.timestamp(backup.date, 0).to_rfc2822());
println!("Duration: {}", to_duration(backup.duration));
println!("Entries: {} files, {} dirs", backup.file_count, backup.dir_count);
println!("Total backup size: {}", to_file_size(backup.total_data_size));
println!("Modified data size: {}", to_file_size(backup.changed_data_size));
let dedup_ratio = backup.deduplicated_data_size as f32 / backup.changed_data_size as f32;
println!("Deduplicated size: {}, {:.1}% saved", to_file_size(backup.deduplicated_data_size), (1.0 - dedup_ratio)*100.0);
let compress_ratio = backup.encoded_data_size as f32 / backup.deduplicated_data_size as f32;
println!("Compressed size: {} in {} bundles, {:.1}% saved", to_file_size(backup.encoded_data_size), backup.bundle_count, (1.0 - compress_ratio)*100.0);
println!("Chunk count: {}, avg size: {}", backup.chunk_count, to_file_size(backup.avg_chunk_size as u64));
}
} else {
let info = repo.info();
println!("Bundles: {}", info.bundle_count);
println!("Total size: {}", to_file_size(info.encoded_data_size));
@@ -66,17 +123,10 @@ pub fn run() {
println!("Average chunk size: {}", to_file_size(info.avg_chunk_size as u64));
let index_usage = info.index_entries as f32 / info.index_capacity as f32;
println!("Index: {}, {:.0}% full", to_file_size(info.index_size as u64), index_usage * 100.0);
return
}
if args.cmd_backups {
for backup in repo.list_backups().unwrap() {
println!("{}", backup);
}
return
}
if args.cmd_bundles {
Arguments::ListBundles{repo_path} => {
let repo = open_repository(&repo_path);
for bundle in repo.list_bundles() {
println!("Bundle {}", bundle.id);
println!(" - Mode: {:?}", bundle.mode);
@@ -92,53 +142,9 @@ pub fn run() {
println!(" - Compression: {}, ratio: {:.1}%", compression, ratio * 100.0);
println!();
}
return
}
let backup_name = args.arg_backup.unwrap().splitn(2, "::").nth(1).unwrap().to_string();
if args.cmd_backup {
let backup = repo.create_full_backup(&args.arg_path.unwrap()).unwrap();
repo.save_backup(&backup, &backup_name).unwrap();
return
}
let backup = repo.get_backup(&backup_name).unwrap();
if args.cmd_info {
println!("Date: {}", Local.timestamp(backup.date, 0).to_rfc2822());
println!("Duration: {}", to_duration(backup.duration));
println!("Entries: {} files, {} dirs", backup.file_count, backup.dir_count);
println!("Total backup size: {}", to_file_size(backup.total_data_size));
println!("Modified data size: {}", to_file_size(backup.changed_data_size));
let dedup_ratio = backup.deduplicated_data_size as f32 / backup.changed_data_size as f32;
println!("Deduplicated size: {}, {:.1}% saved", to_file_size(backup.deduplicated_data_size), (1.0 - dedup_ratio)*100.0);
let compress_ratio = backup.encoded_data_size as f32 / backup.deduplicated_data_size as f32;
println!("Compressed size: {} in {} bundles, {:.1}% saved", to_file_size(backup.encoded_data_size), backup.bundle_count, (1.0 - compress_ratio)*100.0);
println!("Chunk count: {}, avg size: {}", backup.chunk_count, to_file_size(backup.avg_chunk_size as u64));
return
}
if args.cmd_restore {
let dst = args.arg_dst.unwrap();
if let Some(src) = args.arg_src {
let inode = repo.get_backup_inode(&backup, src).unwrap();
repo.restore_inode_tree(inode, &dst).unwrap();
} else {
repo.restore_backup(&backup, &dst).unwrap();
}
return
}
if args.cmd_list {
let inode = repo.get_backup_inode(&backup, &args.arg_path.unwrap()).unwrap();
println!("{}", format_inode_one_line(&inode));
if let Some(children) = inode.children {
for chunks in children.values() {
let inode = repo.get_inode(chunks).unwrap();
println!("- {}", format_inode_one_line(&inode));
Arguments::AlgoTest{bundle_size, chunker, compression, hash, file} => {
algotest::run(&file, bundle_size, chunker, compression, hash);
}
}
return
}
}

src/main.rs

@@ -7,10 +7,10 @@ extern crate blake2_rfc as blake2;
extern crate murmurhash3;
extern crate serde_yaml;
#[macro_use] extern crate quick_error;
extern crate docopt;
extern crate rustc_serialize;
extern crate chrono;
#[macro_use] extern crate clap;
#[macro_use] extern crate log;
pub mod util;
pub mod bundle;
@@ -22,7 +22,8 @@ mod cli;
// TODO: Separate remote folder
// TODO: Copy backup files to remote folder
// TODO: Keep meta bundles also locally
// TODO: Remove backups (based on age like attic)
// TODO: Remove backups/subtrees
// TODO: Prune backups (based on age like attic)
// TODO: Backup files tree structure
// TODO: Recompress & combine bundles
// TODO: Check backup integrity
@@ -30,7 +31,6 @@ mod cli;
// TODO: list --tree
// TODO: Partial backups
// TODO: Load and compare remote bundles to bundle map
// TODO: Nice errors / checks for CLI
// TODO: Import remote backup
// TODO: Continue on errors

src/util/cli.rs

@@ -1,13 +1,5 @@
use ::repository::{Inode, FileType};
pub fn split_repo_path(repo_path: &str) -> (&str, Option<&str>, Option<&str>) {
let mut parts = repo_path.splitn(3, "::");
let repo = parts.next().unwrap();
let backup = parts.next();
let inode = parts.next();
(repo, backup, inode)
}
pub fn to_file_size(size: u64) -> String {
let mut size = size as f32;
if size >= 512.0 {
@@ -33,6 +25,11 @@ pub fn to_file_size(size: u64) -> String {
format!("{:.1} TiB", size)
}
pub fn to_speed(size: u64, dur: f32) -> String {
let speed = (size as f32 / dur) as u64;
to_file_size(speed) + "/s"
}
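to_speed divides bytes by seconds and reuses to_file_size, so throughput is reported in binary units per second. For example, 100 MiB processed in 2 s:

fn main() {
    let (size, dur): (u64, f32) = (100 * 1024 * 1024, 2.0);
    let speed = (size as f32 / dur) as u64;
    assert_eq!(speed, 52_428_800); // 50 MiB/s, in bytes per second
    // to_file_size(52_428_800) formats this as "50.0 MiB",
    // so to_speed(size, dur) would return "50.0 MiB/s".
}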
pub fn to_duration(dur: f32) -> String {
let secs = dur.floor() as u64;
let subsecs = dur - dur.floor();