Browse Source

First working version

pull/10/head
Dennis Schwerdel 5 years ago
parent
commit
231aa9fb58
  1. 4
      .gitignore
  2. 68
      Algotest1.txt
  3. 246
      Cargo.lock
  4. 16
      Cargo.toml
  5. 143
      README.md
  6. 122
      src/algotest.rs
  7. 474
      src/bundle.rs
  8. 70
      src/chunker/ae.rs
  9. 120
      src/chunker/fastcdc.rs
  10. 118
      src/chunker/mod.rs
  11. 116
      src/chunker/rabin.rs
  12. 72
      src/errors.rs
  13. 495
      src/index.rs
  14. 79
      src/main.rs
  15. 124
      src/repository/basic_io.rs
  16. 74
      src/repository/bundle_map.rs
  17. 182
      src/repository/config.rs
  18. 61
      src/repository/integrity.rs
  19. 138
      src/repository/mod.rs
  20. 66
      src/util/checksum.rs
  21. 216
      src/util/compression.rs
  22. 54
      src/util/encryption.rs
  23. 104
      src/util/hash.rs
  24. 53
      src/util/lru_cache.rs
  25. 11
      src/util/mod.rs

4
.gitignore vendored

@ -0,0 +1,4 @@
target
squash
test.tar
test_data

68
Algotest1.txt

@ -0,0 +1,68 @@
~/shared
Algorithm comparison on file test.tar
Reading input file... done. 2175416320 bytes
Chunker algorithms
Chunk size: 4 KiB
AE: avg chunk size 2756.5 ± 543.6 bytes, 12.1% saved, speed 748.9 MB/s
Rabin: avg chunk size 4902.3 ± 3826.2 bytes, 11.7% saved, speed 336.7 MB/s
FastCdc: avg chunk size 4783.3 ± 1940.5 bytes, 12.1% saved, speed 544.1 MB/s
Chunk size: 8 KiB
AE: avg chunk size 5245.1 ± 890.8 bytes, 10.0% saved, speed 756.3 MB/s
Rabin: avg chunk size 9774.2 ± 7636.0 bytes, 10.3% saved, speed 344.9 MB/s
FastCdc: avg chunk size 9583.2 ± 3933.2 bytes, 10.7% saved, speed 541.6 MB/s
Chunk size: 16 KiB
AE: avg chunk size 10169.5 ± 1485.8 bytes, 7.4% saved, speed 781.5 MB/s
Rabin: avg chunk size 19641.7 ± 15292.5 bytes, 9.0% saved, speed 345.9 MB/s
FastCdc: avg chunk size 19262.9 ± 7697.4 bytes, 9.0% saved, speed 548.1 MB/s
Chunk size: 32 KiB
AE: avg chunk size 20004.6 ± 2705.6 bytes, 5.6% saved, speed 787.0 MB/s
Rabin: avg chunk size 38963.6 ± 30218.2 bytes, 7.6% saved, speed 345.7 MB/s
FastCdc: avg chunk size 39159.3 ± 16834.6 bytes, 7.7% saved, speed 547.1 MB/s
Chunk size: 64 KiB
AE: avg chunk size 39627.2 ± 5310.6 bytes, 3.8% saved, speed 788.2 MB/s
Rabin: avg chunk size 78339.7 ± 60963.7 bytes, 6.4% saved, speed 345.6 MB/s
FastCdc: avg chunk size 76981.4 ± 30784.6 bytes, 6.1% saved, speed 548.4 MB/s
Hash algorithms
Blake2: 724.2 MB/s
Murmur3: 5358.3 MB/s
Compression algorithms
Snappy: ratio: 83.6%, compress: 301.7 MB/s, decompress: 876.2 MB/s
fatal runtime error: out of memory
ZStd/1: ratio: 77.2%, compress: 493.9 MB/s, decompress: 0.0 MB/s
ZStd/2: ratio: 76.7%, compress: 420.6 MB/s, decompress: 0.0 MB/s
ZStd/3: ratio: 75.4%, compress: 314.6 MB/s, decompress: 0.0 MB/s
ZStd/4: ratio: 75.3%, compress: 273.0 MB/s, decompress: 0.0 MB/s
ZStd/5: ratio: 74.9%, compress: 131.4 MB/s, decompress: 0.0 MB/s
ZStd/6: ratio: 73.6%, compress: 121.4 MB/s, decompress: 0.0 MB/s
ZStd/7: ratio: 73.5%, compress: 88.7 MB/s, decompress: 0.0 MB/s
ZStd/8: ratio: 73.4%, compress: 76.8 MB/s, decompress: 0.0 MB/s
ZStd/9: ratio: 73.3%, compress: 51.8 MB/s, decompress: 0.0 MB/s
Deflate/1: ratio: 78.3%, compress: 95.7 MB/s, decompress: 0.0 MB/s
Deflate/2: ratio: 78.2%, compress: 94.7 MB/s, decompress: 0.0 MB/s
Deflate/3: ratio: 78.1%, compress: 92.5 MB/s, decompress: 0.0 MB/s
Deflate/4: ratio: 78.0%, compress: 87.9 MB/s, decompress: 0.0 MB/s
Deflate/5: ratio: 77.8%, compress: 86.5 MB/s, decompress: 0.0 MB/s
Deflate/6: ratio: 77.7%, compress: 83.8 MB/s, decompress: 0.0 MB/s
Deflate/7: ratio: 77.7%, compress: 73.4 MB/s, decompress: 0.0 MB/s
Deflate/8: ratio: 77.6%, compress: 31.6 MB/s, decompress: 0.0 MB/s
Deflate/9: ratio: 77.4%, compress: 25.8 MB/s, decompress: 0.0 MB/s
Brotli/1: ratio: 77.6%, compress: 433.1 MB/s, decompress: 0.0 MB/s
Brotli/2: ratio: 75.4%, compress: 242.2 MB/s, decompress: 0.0 MB/s
Brotli/3: ratio: 75.3%, compress: 195.5 MB/s, decompress: 0.0 MB/s
Brotli/4: ratio: 72.4%, compress: 81.6 MB/s, decompress: 0.0 MB/s
Brotli/5: ratio: 73.9%, compress: 62.4 MB/s, decompress: 0.0 MB/s
Brotli/6: ratio: 72.9%, compress: 46.6 MB/s, decompress: 0.0 MB/s
Brotli/7: ratio: 71.5%, compress: 23.4 MB/s, decompress: 0.0 MB/s
Brotli/8: ratio: 71.5%, compress: 20.7 MB/s, decompress: 0.0 MB/s
Brotli/9: ratio: 71.2%, compress: 11.2 MB/s, decompress: 0.0 MB/s
Lzma2/1: ratio: 69.8%, compress: 4.2 MB/s, decompress: 0.0 MB/s

246
Cargo.lock generated

@ -0,0 +1,246 @@
[root]
name = "zvault"
version = "0.1.0"
dependencies = [
"blake2-rfc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"mmap 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"murmurhash3 0.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
"quick-error 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rmp-serde 0.12.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rust-crypto 0.2.36 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_utils 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_yaml 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
"squash-sys 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "bitflags"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "blake2-rfc"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"constant_time_eq 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "byteorder"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "constant_time_eq"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "gcc"
version = "0.3.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "kernel32-sys"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "libc"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libc"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "linked-hash-map"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "mmap"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
"tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "murmurhash3"
version = "0.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "num-traits"
version = "0.1.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "pkg-config"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "quick-error"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rand"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "redox_syscall"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rmp"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rmp-serde"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rmp 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rust-crypto"
version = "0.2.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"gcc 0.3.43 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-serialize 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rustc-serialize"
version = "0.3.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "serde"
version = "0.9.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "serde_utils"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"serde 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "serde_yaml"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"linked-hash-map 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)",
"yaml-rust 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "squash-sys"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)",
"pkg-config 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tempdir"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "time"
version = "0.1.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_syscall 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi-build"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "yaml-rust"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"linked-hash-map 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d"
"checksum blake2-rfc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "0c6a476f32fef3402f1161f89d0d39822809627754a126f8441ff2a9d45e2d59"
"checksum byteorder 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c40977b0ee6b9885c9013cd41d9feffdd22deb3bb4dc3a71d901cc7a77de18c8"
"checksum constant_time_eq 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "07dcb7959f0f6f1cf662f9a7ff389bcb919924d99ac41cf31f10d611d8721323"
"checksum gcc 0.3.43 (registry+https://github.com/rust-lang/crates.io-index)" = "c07c758b972368e703a562686adb39125707cc1ef3399da8c019fc6c2498a75d"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum libc 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "e32a70cf75e5846d53a673923498228bbec6a8624708a9ea5645f075d6276122"
"checksum libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)" = "88ee81885f9f04bff991e306fea7c1c60a5f0f9e409e99f6b40e3311a3363135"
"checksum linked-hash-map 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6d262045c5b87c0861b3f004610afd0e2c851e2908d08b6c870cbb9d5f494ecd"
"checksum mmap 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0bc85448a6006dd2ba26a385a564a8a0f1f2c7e78c70f1a70b2e0f4af286b823"
"checksum murmurhash3 0.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664"
"checksum num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "e1cbfa3781f3fe73dc05321bed52a06d2d491eaa764c52335cf4399f046ece99"
"checksum pkg-config 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "3a8b4c6b8165cd1a1cd4b9b120978131389f64bdaf456435caa41e630edba903"
"checksum quick-error 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0aad603e8d7fb67da22dbdf1f4b826ce8829e406124109e73cf1b2454b93a71c"
"checksum rand 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "022e0636ec2519ddae48154b028864bdce4eaf7d35226ab8e65c611be97b189d"
"checksum redox_syscall 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "8dd35cc9a8bdec562c757e3d43c1526b5c6d2653e23e2315065bc25556550753"
"checksum rmp 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "e59917c01f49718a59c644a621a4848aafc6577c4a47d66270d78951a807541a"
"checksum rmp-serde 0.12.2 (registry+https://github.com/rust-lang/crates.io-index)" = "06ec4d0cdea2645de5d0e649f90c3e654205d913e14adefa452257314a24e76e"
"checksum rust-crypto 0.2.36 (registry+https://github.com/rust-lang/crates.io-index)" = "f76d05d3993fd5f4af9434e8e436db163a12a9d40e1a58a726f27a01dfd12a2a"
"checksum rustc-serialize 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)" = "237546c689f20bb44980270c73c3b9edd0891c1be49cc1274406134a66d3957b"
"checksum serde 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "a702319c807c016e51f672e5c77d6f0b46afddd744b5e437d6b8436b888b458f"
"checksum serde_utils 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b34a52969c7fc0254e214b82518c9a95dc88c84fc84cd847add314996a031be6"
"checksum serde_yaml 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f8bd3f24ad8c7bcd34a6d70ba676dc11302b96f4f166aa5f947762e01098844d"
"checksum squash-sys 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db1f9dde91d819b7746e153bc32489fa19e6a106c3d7f2b92187a4efbdc88b40"
"checksum tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "87974a6f5c1dfb344d733055601650059a3363de2a6104819293baff662132d6"
"checksum time 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "211b63c112206356ef1ff9b19355f43740fc3f85960c598a93d3a3d3ba7beade"
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
"checksum yaml-rust 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e66366e18dc58b46801afbf2ca7661a9f59cc8c5962c29892b6039b4f86fa992"

16
Cargo.toml

@ -0,0 +1,16 @@
[package]
name = "zvault"
version = "0.1.0"
authors = ["Dennis Schwerdel <schwerdel@informatik.uni-kl.de>"]
[dependencies]
serde = "0.9"
rmp-serde = "0.12"
serde_yaml = "0.6"
serde_utils = "0.5.1"
rust-crypto = "0.2"
squash-sys = "0.9"
mmap = "*"
quick-error = "1.1"
blake2-rfc = "*"
murmurhash3 = "*"

143
README.md

@ -1,10 +1,96 @@
# ZVault Backup solution
## Goals
## Goals / Features
### Space-efficient storage with deduplication
The backup data is split into chunks. Fingerprints make sure that each chunk is
only stored once. The chunking algorithm is designed so that small changes to a
file only change a few chunks and leave most chunks unchanged.
Multiple backups of the same data set will only take up the space of one copy.
The chunks are combined into bundles. Each bundle holds chunks up to a maximum
data size and is compressed as a whole to save space ("solid archive").
### Independent backups
All backups share common data in form of chunks but are independent on a higher
level. Backups can be delete and chunks that are not used by any backup can be
removed.
Other backup solutions use differential backups organized in chains. This makes
those backups dependent on previous backups in the chain, so that those backups
can not be deleted. Also, restoring chained backups is much less efficient.
### Fast backup runs
* Only adding changed files
* In-Memory Hashtable
### Backup verification
* Bundles verification
* Index verification
* File structure verification
## Configuration options
There are several configuration options with trade-offs attached so these are
exposed to users.
### Chunker algorithm
The chunker algorithm is responsible for splitting files into chunks in a way
that survives small changes to the file so that small changes still yield
many matching chunks. The quality of the algorithm affects the deduplication
rate and its speed affects the backup speed.
There are 3 algorithms to choose from:
The **Rabin chunker** is a very common algorithm with a good quality but a
mediocre speed (about 350 MB/s).
The **AE chunker** is a novel approach that can reach very high speeds
(over 750 MB/s) but at a cost of quality.
The **FastCDC** algorithm has a slightly higher quality than the Rabin chunker
and is quite fast (about 550 MB/s).
The recommendation is **FastCDC**.
### Chunk size
The chunk size determines the memory usage during backup runs. For every chunk
in the backup repository, 24 bytes of memory are needed. That means that for
every GiB stored in the repository the following amount of memory is needed:
- 8 KiB chunks => 3 MiB / GiB
- 16 KiB chunks => 1.5 MiB / GiB
- 32 KiB chunks => 750 KiB / GiB
- 64 KiB chunks => 375 KiB / GiB
On the other hand, bigger chunks reduce the deduplication efficiency. Even small
changes of only one byte will result in at least one complete chunk changing.
### Hash algorithm
Blake2
Murmur3
Recommended: Blake2
### Bundle size
10 M
25 M
100 M
Recommended: 25 MiB
### Compression
Recommended: Brotli/2-7
- Blazingly fast backup runs
- Space-efficient storage
- Independent backups
## Design
@ -43,3 +129,52 @@
- Remote block writing and compression/encryption
- Inode data serialization
- Recursive directory scanning, difference calculation, new entry sorting
### ChunkDB
- Stores data in chunks
- A chunk is a file
- Per Chunk properties
- Format version
- Encryption method
- Encryption key
- Compression method / level
- Chunk ID is the hash of the contents
- No locks needed on shared chunk repository !!!
- Chunk ID is calculated after compression and encryption
- Chunk header
- "zvault01"
- Chunk size compressed / raw
- Content hash method / value
- Encryption method / options / key hash
- Compression method / options
- Chunks are write-once read-often
- Chunks are prepared outside the repository
- Only one chunk is being prepared at a time
- Adding data to the chunk returns starting position in raw data
- Operations:
- List available chunks
- Add data
- Flush chunk
- Delete chunk
- Get data
- Check chunk
- Chunk path is `checksum.chunk` or `chec/ksum.chunk`
- Data is added to current chunk and compressed in memory
- Operations on chunk files are just sequencial read/write and delete
- Ability to recompress chunks
### Index
16 Bytes per hash key
8 Bytes data per entry (4 bytes bundle id, 4 bytes chunk id)
=> 24 Bytes per entry
Average chunk sizes
8 Kib => 3 MiB / 1 GiB
16 Kib => 1.5 MiB / 1 GiB
24 Kib => 1.0 MiB / 1 GiB
32 Kib => 750 Kib / 1 GiB
64 Kib => 375 Kib / 1 GiB

122
src/algotest.rs

@ -0,0 +1,122 @@
use std::io::{Cursor, Read};
use std::fs::File;
use std::time;
use super::chunker::*;
use super::util::*;
fn speed_chunk<C: IChunker>(chunker: &mut C, data: &[u8]) {
let mut input = Cursor::new(data);
let mut chunk = Vec::with_capacity(1_000_000);
loop {
chunk.clear();
let result = chunker.chunk(&mut input, &mut chunk).unwrap();
if result == ChunkerStatus::Finished {
return
}
}
}
fn chunk<C: IChunker>(chunker: &mut C, data: &[u8]) -> Vec<Vec<u8>> {
let mut input = Cursor::new(data);
let mut chunks = Vec::with_capacity(100_000);
loop {
let mut chunk = Vec::with_capacity(100_000);
let result = chunker.chunk(&mut input, &mut chunk).unwrap();
chunks.push(chunk);
if result == ChunkerStatus::Finished {
return chunks;
}
}
}
fn analyze_chunks(mut chunks: Vec<Vec<u8>>) -> (usize, f64, f64, f64) {
let count = chunks.len();
let total = chunks.iter().map(|c| c.len()).sum::<usize>();
let avg_size = total as f64 / count as f64;
let stddev = (chunks.iter().map(|c| (c.len() as f64 - avg_size).powi(2)).sum::<f64>() / (count as f64 - 1.0)).sqrt();
chunks.sort();
chunks.dedup();
let non_dup: usize = chunks.iter().map(|c| c.len()).sum();
let saved = 1.0 - non_dup as f64 / total as f64;
(count, avg_size, stddev, saved)
}
fn compare_chunker<C: IChunker>(name: &str, mut chunker: C, data: &[u8]) {
let start = time::Instant::now();
speed_chunk(&mut chunker, data);
let elapsed = start.elapsed();
let chunks = chunk(&mut chunker, data);
let duration = elapsed.as_secs() as f64 * 1.0 + elapsed.subsec_nanos() as f64 / 1_000_000_000.0;
let speed = data.len() as f64 / duration;
assert_eq!(chunks.iter().map(|c| c.len()).sum::<usize>(), data.len());
let (_count, avg_size, stddev, saved) = analyze_chunks(chunks);
println!("{}: \tavg chunk size {:.1}\t± {:.1} bytes, \t{:.1}% saved,\tspeed {:.1} MB/s",
name, avg_size, stddev, saved * 100.0, speed / 1_000_000.0);
}
fn compare_hash(name: &str, hash: HashMethod, data: &[u8]) {
let start = time::Instant::now();
let _ = hash.hash(data);
let elapsed = start.elapsed();
let duration = elapsed.as_secs() as f64 * 1.0 + elapsed.subsec_nanos() as f64 / 1_000_000_000.0;
let speed = data.len() as f64 / duration;
println!("{}: {:.1} MB/s", name, speed / 1_000_000.0);
}
fn compare_compression(name: &str, method: Compression, data: &[u8]) {
let start = time::Instant::now();
let compressed = method.compress(data).unwrap();
let elapsed = start.elapsed();
let duration = elapsed.as_secs() as f64 * 1.0 + elapsed.subsec_nanos() as f64 / 1_000_000_000.0;
let cspeed = data.len() as f64 / duration;
let ratio = compressed.len() as f64 / data.len() as f64;
/*let start = time::Instant::now();
let uncompressed = method.decompress(&compressed).unwrap();
if uncompressed != data {
panic!("{} did not uncompress to the same value", name);
}
let elapsed = start.elapsed();
let duration = elapsed.as_secs() as f64 * 1.0 + elapsed.subsec_nanos() as f64 / 1_000_000_000.0;*/
let dspeed = 0.0;//data.len() as f64 / duration;
println!("{}:\tratio: {:.1}%,\tcompress: {:.1} MB/s,\tdecompress: {:.1} MB/s",
name, ratio * 100.0, cspeed / 1_000_000.0, dspeed / 1_000_000.0);
}
#[allow(dead_code)]
pub fn run(path: &str) {
println!("Algorithm comparison on file {}", path);
println!();
print!("Reading input file...");
let mut file = File::open(path).unwrap();
let mut data = Vec::new();
file.read_to_end(&mut data).unwrap();
println!(" done. {} bytes", data.len());
println!();
println!("Chunker algorithms");
for size in &[4usize, 8, 16, 32, 64] {
println!(" Chunk size: {} KiB", size);
compare_chunker(" AE", AeChunker::new(size*1024), &data);
compare_chunker(" Rabin", RabinChunker::new(size*1024, 0), &data);
compare_chunker(" FastCdc", FastCdcChunker::new(size*1024, 0), &data);
}
println!();
println!("Hash algorithms");
compare_hash(" Blake2", HashMethod::Blake2, &data);
compare_hash(" Murmur3", HashMethod::Murmur3, &data);
println!();
println!("Compression algorithms");
compare_compression(" Snappy", Compression::Snappy(()), &data);
for level in 1..10 {
compare_compression(&format!(" ZStd/{}", level), Compression::ZStd(level), &data);
}
for level in 1..10 {
compare_compression(&format!(" Deflate/{}", level), Compression::Deflate(level), &data);
}
for level in 1..10 {
compare_compression(&format!(" Brotli/{}", level), Compression::Brotli(level), &data);
}
for level in 1..7 {
compare_compression(&format!(" Lzma2/{}", level), Compression::Lzma2(level), &data);
}
}

474
src/bundle.rs

@ -0,0 +1,474 @@
use std::path::{Path, PathBuf};
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{Read, Write, Seek, SeekFrom, BufWriter, BufReader};
use std::cmp::max;
use std::fmt::{self, Debug, Write as FmtWrite};
use std::sync::{Arc, Mutex};
use serde::{self, Serialize, Deserialize};
use serde::bytes::ByteBuf;
use rmp_serde;
use errors::BundleError;
use util::*;
static HEADER_STRING: [u8; 7] = *b"zbundle";
static HEADER_VERSION: u8 = 1;
// TODO: Test cases
// TODO: Benchmarks
#[derive(Hash, PartialEq, Eq, Clone, Default)]
pub struct BundleId(pub Vec<u8>);
impl Serialize for BundleId {
fn serialize<S: serde::Serializer>(&self, ser: S) -> Result<S::Ok, S::Error> {
ser.serialize_bytes(&self.0)
}
}
impl Deserialize for BundleId {
fn deserialize<D: serde::Deserializer>(de: D) -> Result<Self, D::Error> {
let bytes = try!(ByteBuf::deserialize(de));
Ok(BundleId(bytes.into()))
}
}
impl BundleId {
#[inline]
fn to_string(&self) -> String {
let mut buf = String::with_capacity(self.0.len()*2);
for b in &self.0 {
write!(&mut buf, "{:2x}", b).unwrap()
}
buf
}
}
impl fmt::Display for BundleId {
#[inline]
fn fmt(&self, fmt: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(fmt, "{}", self.to_string())
}
}
impl fmt::Debug for BundleId {
#[inline]
fn fmt(&self, fmt: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(fmt, "{}", self.to_string())
}
}
#[derive(Clone)]
pub struct BundleHeader {
pub id: BundleId,
pub compression: Option<Compression>,
pub encryption: Option<Encryption>,
pub checksum: Checksum,
pub raw_size: usize,
pub encoded_size: usize,
pub chunk_count: usize,
pub chunk_sizes: Vec<usize>
}
serde_impl!(BundleHeader(u64) {
id: BundleId => 0,
compression: Option<Compression> => 1,
encryption: Option<Encryption> => 2,
checksum: Checksum => 3,
raw_size: usize => 4,
encoded_size: usize => 5,
chunk_count: usize => 6,
chunk_sizes: Vec<usize> => 7
});
impl Default for BundleHeader {
fn default() -> Self {
BundleHeader {
id: BundleId(vec![]),
compression: None,
encryption: None,
checksum: (ChecksumType::Sha3_256, ByteBuf::new()),
raw_size: 0,
encoded_size: 0,
chunk_count: 0,
chunk_sizes: vec![]
}
}
}
pub struct Bundle {
pub id: BundleId,
pub version: u8,
pub path: PathBuf,
crypto: Arc<Mutex<Crypto>>,
pub compression: Option<Compression>,
pub encryption: Option<Encryption>,
pub raw_size: usize,
pub encoded_size: usize,
pub checksum: Checksum,
pub content_start: usize,
pub chunk_count: usize,
pub chunk_sizes: Vec<usize>,
pub chunk_positions: Vec<usize>
}
impl Bundle {
fn new(path: PathBuf, version: u8, content_start: usize, crypto: Arc<Mutex<Crypto>>, header: BundleHeader) -> Self {
let mut chunk_positions = Vec::with_capacity(header.chunk_sizes.len());
let mut pos = 0;
for len in &header.chunk_sizes {
chunk_positions.push(pos);
pos += *len;
}
Bundle {
id: header.id,
version: version,
path: path,
crypto: crypto,
compression: header.compression,
encryption: header.encryption,
raw_size: header.raw_size,
encoded_size: header.encoded_size,
chunk_count: header.chunk_count,
checksum: header.checksum,
content_start: content_start,
chunk_sizes: header.chunk_sizes,
chunk_positions: chunk_positions
}
}
pub fn load(path: PathBuf, crypto: Arc<Mutex<Crypto>>) -> Result<Self, BundleError> {
let mut file = BufReader::new(try!(File::open(&path)
.map_err(|e| BundleError::Read(e, path.clone(), "Failed to open bundle file"))));
let mut header = [0u8; 8];
try!(file.read_exact(&mut header)
.map_err(|e| BundleError::Read(e, path.clone(), "Failed to read bundle header")));
if header[..HEADER_STRING.len()] != HEADER_STRING {
return Err(BundleError::Format(path.clone(), "Wrong header string"))
}
let version = header[HEADER_STRING.len()];
if version != HEADER_VERSION {
return Err(BundleError::Format(path.clone(), "Unsupported bundle file version"))
}
let mut reader = rmp_serde::Deserializer::new(file);
let header = try!(BundleHeader::deserialize(&mut reader)
.map_err(|e| BundleError::Decode(e, path.clone())));
file = reader.into_inner();
let content_start = file.seek(SeekFrom::Current(0)).unwrap() as usize;
Ok(Bundle::new(path, version, content_start, crypto, header))
}
#[inline]
fn load_encoded_contents(&self) -> Result<Vec<u8>, BundleError> {
let mut file = BufReader::new(try!(File::open(&self.path)
.map_err(|e| BundleError::Read(e, self.path.clone(), "Failed to open bundle file"))));
try!(file.seek(SeekFrom::Start(self.content_start as u64))
.map_err(|e| BundleError::Read(e, self.path.clone(), "Failed to seek to data")));
let mut data = Vec::with_capacity(max(self.encoded_size, self.raw_size)+1024);
try!(file.read_to_end(&mut data).map_err(|_| "Failed to read data"));
Ok(data)
}
#[inline]
fn decode_contents(&self, mut data: Vec<u8>) -> Result<Vec<u8>, BundleError> {
if let Some(ref encryption) = self.encryption {
data = try!(self.crypto.lock().unwrap().decrypt(encryption.clone(), &data));
}
if let Some(ref compression) = self.compression {
data = try!(compression.decompress(&data));
}
Ok(data)
}
#[inline]
pub fn load_contents(&self) -> Result<Vec<u8>, BundleError> {
self.load_encoded_contents().and_then(|data| self.decode_contents(data))
}
#[inline]
pub fn get_chunk_position(&self, id: usize) -> Result<(usize, usize), BundleError> {
if id >= self.chunk_count {
return Err("Invalid chunk id".into())
}
Ok((self.chunk_positions[id], self.chunk_sizes[id]))
}
pub fn check(&self, full: bool) -> Result<(), BundleError> {
if self.chunk_count != self.chunk_sizes.len() {
return Err(BundleError::Integrity(self.id.clone(),
"Chunk list size does not match chunk count"))
}
if self.chunk_sizes.iter().sum::<usize>() != self.raw_size {
return Err(BundleError::Integrity(self.id.clone(),
"Individual chunk sizes do not add up to total size"))
}
if !full {
let size = try!(fs::metadata(&self.path)
.map_err(|e| BundleError::Read(e, self.path.clone(), "Failed to get size of file"))
).len();
if size as usize != self.encoded_size + self.content_start {
return Err(BundleError::Integrity(self.id.clone(),
"File size does not match size in header, truncated file"))
}
return Ok(())
}
let encoded_contents = try!(self.load_encoded_contents());
if self.encoded_size != encoded_contents.len() {
return Err(BundleError::Integrity(self.id.clone(),
"Encoded data size does not match size in header, truncated bundle"))
}
let contents = try!(self.decode_contents(encoded_contents));
if self.raw_size != contents.len() {
return Err(BundleError::Integrity(self.id.clone(),
"Raw data size does not match size in header, truncated bundle"))
}
//TODO: verify checksum
Ok(())
}
}
impl Debug for Bundle {
fn fmt(&self, fmt: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(fmt, "Bundle(\n\tid: {}\n\tpath: {:?}\n\tchunks: {}\n\tsize: {}, encoded: {}\n\tcompression: {:?}\n)",
self.id.to_string(), self.path, self.chunk_count, self.raw_size, self.encoded_size, self.compression)
}
}
pub struct BundleWriter {
data: Vec<u8>,
compression: Option<Compression>,
compression_stream: Option<CompressionStream>,
encryption: Option<Encryption>,
crypto: Arc<Mutex<Crypto>>,
checksum: ChecksumCreator,
raw_size: usize,
chunk_count: usize,
chunk_sizes: Vec<usize>
}
impl BundleWriter {
fn new(compression: Option<Compression>, encryption: Option<Encryption>, crypto: Arc<Mutex<Crypto>>, checksum: ChecksumType) -> Result<Self, BundleError> {
let compression_stream = match compression {
Some(ref compression) => Some(try!(compression.compress_stream())),
None => None
};
Ok(BundleWriter {
data: vec![],
compression: compression,
compression_stream: compression_stream,
encryption: encryption,
crypto: crypto,
checksum: ChecksumCreator::new(checksum),
raw_size: 0,
chunk_count: 0,
chunk_sizes: vec![]
})
}
pub fn add(&mut self, chunk: &[u8]) -> Result<usize, BundleError> {
if let Some(ref mut stream) = self.compression_stream {
try!(stream.process(chunk, &mut self.data))
} else {
self.data.extend_from_slice(chunk)
}
self.checksum.update(chunk);
self.raw_size += chunk.len();
self.chunk_count += 1;
self.chunk_sizes.push(chunk.len());
Ok(self.chunk_count-1)
}
fn finish(mut self, db: &BundleDb) -> Result<Bundle, BundleError> {
if let Some(stream) = self.compression_stream {
try!(stream.finish(&mut self.data))
}
if let Some(ref encryption) = self.encryption {
self.data = try!(self.crypto.lock().unwrap().encrypt(encryption.clone(), &self.data));
}
let encoded_size = self.data.len();
let checksum = self.checksum.finish();
let id = BundleId(checksum.1.to_vec());
let (folder, file) = db.bundle_path(&id);
let path = folder.join(file);
try!(fs::create_dir_all(&folder)
.map_err(|e| BundleError::Write(e, path.clone(), "Failed to create folder")));
let mut file = BufWriter::new(try!(File::create(&path)
.map_err(|e| BundleError::Write(e, path.clone(), "Failed to create bundle file"))));
try!(file.write_all(&HEADER_STRING)
.map_err(|e| BundleError::Write(e, path.clone(), "Failed to write bundle header")));
try!(file.write_all(&[HEADER_VERSION])
.map_err(|e| BundleError::Write(e, path.clone(), "Failed to write bundle header")));
let header = BundleHeader {
checksum: checksum,
compression: self.compression,
encryption: self.encryption,
chunk_count: self.chunk_count,
id: id.clone(),
raw_size: self.raw_size,
encoded_size: encoded_size,
chunk_sizes: self.chunk_sizes
};
{
let mut writer = rmp_serde::Serializer::new(&mut file);
try!(header.serialize(&mut writer)
.map_err(|e| BundleError::Encode(e, path.clone())));
}
let content_start = file.seek(SeekFrom::Current(0)).unwrap() as usize;
try!(file.write_all(&self.data)
.map_err(|e| BundleError::Write(e, path.clone(), "Failed to write bundle data")));
Ok(Bundle::new(path, HEADER_VERSION, content_start, self.crypto, header))
}
#[inline]
pub fn size(&self) -> usize {
self.data.len()
}
}
pub struct BundleDb {
path: PathBuf,
compression: Option<Compression>,
encryption: Option<Encryption>,
crypto: Arc<Mutex<Crypto>>,
checksum: ChecksumType,
bundles: HashMap<BundleId, Bundle>,
bundle_cache: LruCache<BundleId, Vec<u8>>
}
impl BundleDb {
fn new(path: PathBuf, compression: Option<Compression>, encryption: Option<Encryption>, checksum: ChecksumType) -> Self {
BundleDb {
path: path,
compression:
compression,
crypto: Arc::new(Mutex::new(Crypto::new())),
encryption: encryption,
checksum: checksum,
bundles: HashMap::new(),
bundle_cache: LruCache::new(5, 10)
}
}
fn bundle_path(&self, bundle: &BundleId) -> (PathBuf, PathBuf) {
let mut folder = self.path.clone();
let mut file = bundle.to_string() + ".bundle";
let mut count = self.bundles.len();
while count >= 1000 {
if file.len() < 10 {
break
}
folder = folder.join(&file[0..3]);
file = file[3..].to_string();
count /= 1000;
}
(folder, file.into())
}
fn load_bundle_list(&mut self) -> Result<(), BundleError> {
self.bundles.clear();
let mut paths = Vec::new();
paths.push(self.path.clone());
while let Some(path) = paths.pop() {
for entry in try!(fs::read_dir(path).map_err(BundleError::List)) {
let entry = try!(entry.map_err(BundleError::List));
let path = entry.path();
if path.is_dir() {
paths.push(path);
} else {
let bundle = try!(Bundle::load(path, self.crypto.clone()));
self.bundles.insert(bundle.id.clone(), bundle);
}
}
}
Ok(())
}
#[inline]
pub fn open<P: AsRef<Path>>(path: P, compression: Option<Compression>, encryption: Option<Encryption>, checksum: ChecksumType) -> Result<Self, BundleError> {
let path = path.as_ref().to_owned();
let mut self_ = Self::new(path, compression, encryption, checksum);
try!(self_.load_bundle_list());
Ok(self_)
}
#[inline]
pub fn create<P: AsRef<Path>>(path: P, compression: Option<Compression>, encryption: Option<Encryption>, checksum: ChecksumType) -> Result<Self, BundleError> {
let path = path.as_ref().to_owned();
try!(fs::create_dir_all(&path)
.map_err(|e| BundleError::Write(e, path.clone(), "Failed to create folder")));
Ok(Self::new(path, compression, encryption, checksum))
}
#[inline]
pub fn open_or_create<P: AsRef<Path>>(path: P, compression: Option<Compression>, encryption: Option<Encryption>, checksum: ChecksumType) -> Result<Self, BundleError> {
if path.as_ref().exists() {
Self::open(path, compression, encryption, checksum)
} else {
Self::create(path, compression, encryption, checksum)
}
}
#[inline]
pub fn create_bundle(&self) -> Result<BundleWriter, BundleError> {
BundleWriter::new(self.compression.clone(), self.encryption.clone(), self.crypto.clone(), self.checksum)
}
pub fn get_chunk(&mut self, bundle_id: &BundleId, id: usize) -> Result<Vec<u8>, BundleError> {
let bundle = try!(self.bundles.get(bundle_id).ok_or("Bundle not found"));
let (pos, len) = try!(bundle.get_chunk_position(id));
let mut chunk = Vec::with_capacity(len);
if let Some(data) = self.bundle_cache.get(bundle_id) {
chunk.extend_from_slice(&data[pos..pos+len]);
return Ok(chunk);
}
let data = try!(bundle.load_contents());
chunk.extend_from_slice(&data[pos..pos+len]);
self.bundle_cache.put(bundle_id.clone(), data);
Ok(chunk)
}
#[inline]
pub fn add_bundle(&mut self, bundle: BundleWriter) -> Result<&Bundle, BundleError> {
let bundle = try!(bundle.finish(&self));
let id = bundle.id.clone();
self.bundles.insert(id.clone(), bundle);
Ok(self.get_bundle(&id).unwrap())
}
#[inline]
pub fn get_bundle(&self, bundle: &BundleId) -> Option<&Bundle> {
self.bundles.get(bundle)
}
#[inline]
pub fn list_bundles(&self) -> Vec<&Bundle> {
self.bundles.values().collect()
}
#[inline]
pub fn delete_bundle(&mut self, bundle: &BundleId) -> Result<(), BundleError> {
if let Some(bundle) = self.bundles.remove(bundle) {
fs::remove_file(&bundle.path).map_err(|e| BundleError::Remove(e, bundle.id.clone()))
} else {
Err("No such bundle".into())
}
}
#[inline]
pub fn check(&self, full: bool) -> Result<(), BundleError> {
for bundle in self.bundles.values() {
try!(bundle.check(full))
}
Ok(())
}
}

70
src/chunker/ae.rs

@ -0,0 +1,70 @@
use super::*;
//use std::f64::consts;
use std::ptr;
// AE Chunker
// Paper: "AE: An Asymmetric Extremum Content Defined Chunking Algorithm for Fast and Bandwidth-Efficient Data Deduplication"
pub struct AeChunker {
buffer: [u8; 4096],
buffered: usize,
avg_size: usize,
window_size: usize
}
impl AeChunker {
pub fn new(avg_size: usize) -> AeChunker {
// Experiments show that this claim from the paper is wrong and results in smaller chunks
//let window_size = (avg_size as f64 / (consts::E - 1.0)) as usize;
let window_size = avg_size - 256;
AeChunker{
buffer: [0; 4096],
buffered: 0,
window_size: window_size,
avg_size: avg_size
}
}
}
impl IChunker for AeChunker {
#[inline]
fn get_type(&self) -> ChunkerType {
ChunkerType::Ae(self.avg_size)
}
#[allow(unknown_lints,explicit_counter_loop)]
fn chunk<R: Read, W: Write>(&mut self, r: &mut R, mut w: &mut W) -> Result<ChunkerStatus, ChunkerError> {
let mut max;
let mut pos = 0;
let mut max_pos = 0;
let mut max_val = 0;
loop {
// Fill the buffer, there might be some bytes still in there from last chunk
max = try!(r.read(&mut self.buffer[self.buffered..]).map_err(ChunkerError::Read)) + self.buffered;
// If nothing to do, finish
if max == 0 {
return Ok(ChunkerStatus::Finished)
}
for i in 0..max {
let val = self.buffer[i];
if val <= max_val {
if pos == max_pos + self.window_size {
// Write all bytes from this chunk out to sink and store rest for next chunk
try!(w.write_all(&self.buffer[..i+1]).map_err(ChunkerError::Write));
unsafe { ptr::copy(self.buffer[i+1..].as_ptr(), self.buffer.as_mut_ptr(), max-i-1) };
self.buffered = max-i-1;
return Ok(ChunkerStatus::Continue);
}
} else {
max_val = val;
max_pos = pos;
}
pos += 1;
}
try!(w.write_all(&self.buffer[..max]).map_err(ChunkerError::Write));
self.buffered = 0;
}
}
}

120
src/chunker/fastcdc.rs

@ -0,0 +1,120 @@
use super::*;
use std::ptr;
// FastCDC
// Paper: "FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication"
// Paper-URL: https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf
// Presentation: https://www.usenix.org/sites/default/files/conference/protected-files/atc16_slides_xia.pdf
// Creating 256 pseudo-random values (based on Knuth's MMIX)
fn create_gear(seed: u64) -> [u64; 256] {
let mut table = [0u64; 256];
let a = 6364136223846793005;
let c = 1442695040888963407;
let mut v = seed;
for t in &mut table.iter_mut() {
v = v.wrapping_mul(a).wrapping_add(c);
*t = v;
}
table
}
fn get_masks(avg_size: usize, nc_level: usize, seed: u64) -> (u64, u64) {
let bits = (avg_size.next_power_of_two() - 1).count_ones();
if bits == 13 {
// From the paper
return (0x0003590703530000, 0x0000d90003530000);
}
let mut mask = 0u64;
let mut v = seed;
let a = 6364136223846793005;
let c = 1442695040888963407;
while mask.count_ones() < bits - nc_level as u32 {
v = v.wrapping_mul(a).wrapping_add(c);
mask = (mask | 1).rotate_left(v as u32 & 0x3f);
}
let mask_long = mask;
while mask.count_ones() < bits + nc_level as u32 {
v = v.wrapping_mul(a).wrapping_add(c);
mask = (mask | 1).rotate_left(v as u32 & 0x3f);
}
let mask_short = mask;
(mask_short, mask_long)
}
pub struct FastCdcChunker {
buffer: [u8; 4096],
buffered: usize,
gear: [u64; 256],
min_size: usize,
max_size: usize,
avg_size: usize,
mask_long: u64,
mask_short: u64,
seed: u64
}
impl FastCdcChunker {
pub fn new(avg_size: usize, seed: u64) -> Self {
let (mask_short, mask_long) = get_masks(avg_size, 2, seed);
FastCdcChunker {
buffer: [0; 4096],
buffered: 0,
gear: create_gear(seed),
min_size: avg_size/4,
max_size: avg_size*8,
avg_size: avg_size,
mask_long: mask_long,
mask_short: mask_short,
seed: seed
}
}
}
impl IChunker for FastCdcChunker {
#[inline]
fn get_type(&self) -> ChunkerType {
ChunkerType::FastCdc((self.avg_size, self.seed))
}
#[allow(unknown_lints,explicit_counter_loop)]
fn chunk<R: Read, W: Write>(&mut self, r: &mut R, mut w: &mut W) -> Result<ChunkerStatus, ChunkerError> {
let mut max;
let mut hash = 0u64;
let mut pos = 0;
loop {
// Fill the buffer, there might be some bytes still in there from last chunk
max = try!(r.read(&mut self.buffer[self.buffered..]).map_err(ChunkerError::Read)) + self.buffered;
// If nothing to do, finish
if max == 0 {
return Ok(ChunkerStatus::Finished)
}
for i in 0..max {
if pos >= self.min_size {
// Hash update
hash = (hash << 1).wrapping_add(self.gear[self.buffer[i] as usize]);
// 3 options for break point
// 1) mask_short matches and chunk is smaller than average
// 2) mask_long matches and chunk is longer or equal to average
// 3) chunk reached max_size
if pos < self.avg_size && hash & self.mask_short == 0
|| pos >= self.avg_size && hash & self.mask_long == 0
|| pos >= self.max_size {
// Write all bytes from this chunk out to sink and store rest for next chunk
try!(w.write_all(&self.buffer[..i+1]).map_err(ChunkerError::Write));
unsafe { ptr::copy(self.buffer[i+1..].as_ptr(), self.buffer.as_mut_ptr(), max-i-1) };
self.buffered = max-i-1;
return Ok(ChunkerStatus::Continue);
}
}
pos += 1;
}
try!(w.write_all(&self.buffer[..max]).map_err(ChunkerError::Write));
self.buffered = 0;
}
}
}

118
src/chunker/mod.rs

@ -0,0 +1,118 @@
use std::io::{Write, Read};
use super::errors::ChunkerError;
mod ae;
mod rabin;
mod fastcdc;
pub use self::ae::AeChunker;
pub use self::rabin::RabinChunker;
pub use self::fastcdc::FastCdcChunker;
// https://moinakg.wordpress.com/2013/06/22/high-performance-content-defined-chunking/
// Paper: "A Comprehensive Study of the Past, Present, and Future of Data Deduplication"
// Paper-URL: http://wxia.hustbackup.cn/IEEE-Survey-final.pdf
// https://borgbackup.readthedocs.io/en/stable/internals.html#chunks
// https://github.com/bup/bup/blob/master/lib/bup/bupsplit.c
#[derive(Debug, Eq, PartialEq)]
pub enum ChunkerStatus {
Continue,
Finished
}
pub trait IChunker: Sized {
fn chunk<R: Read, W: Write>(&mut self, r: &mut R, w: &mut W) -> Result<ChunkerStatus, ChunkerError>;
fn get_type(&self) -> ChunkerType;
}
pub enum Chunker {
Ae(Box<AeChunker>),
Rabin(Box<RabinChunker>),
FastCdc(Box<FastCdcChunker>)
}
impl IChunker for Chunker {
#[inline]
fn get_type(&self) -> ChunkerType {
match *self {
Chunker::Ae(ref c) => c.get_type(),
Chunker::Rabin(ref c) => c.get_type(),
Chunker::FastCdc(ref c) => c.get_type()
}
}
#[inline]
fn chunk<R: Read, W: Write>(&mut self, r: &mut R, w: &mut W) -> Result<ChunkerStatus, ChunkerError> {
match *self {
Chunker::Ae(ref mut c) => c.chunk(r, w),
Chunker::Rabin(ref mut c) => c.chunk(r, w),
Chunker::FastCdc(ref mut c) => c.chunk(r, w)
}
}
}
#[derive(Debug)]
pub enum ChunkerType {
Ae(usize),
Rabin((usize, u32)),
FastCdc((usize, u64))
}
serde_impl!(ChunkerType(u64) {
Ae(usize) => 1,
Rabin((usize, u32)) => 2,
FastCdc