use super::*;

use std::ptr;
use std::cmp;

// FastCDC
// Paper: "FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication"
// Paper-URL: https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf
// Presentation: https://www.usenix.org/sites/default/files/conference/protected-files/atc16_slides_xia.pdf
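//
// Summary: a gear-based rolling hash is computed over the input and a chunk boundary is
// declared whenever the hash matches a bit mask (hash & mask == 0). Normalized chunking
// applies a stricter mask (mask_short) before the average chunk size and a looser mask
// (mask_long) after it, which concentrates the chunk size distribution around avg_size.
// Chunk sizes are additionally bounded by min_size and max_size.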

// Creating 256 pseudo-random values (based on Knuth's MMIX)
fn create_gear(seed: u64) -> [u64; 256] {
    let mut table = [0u64; 256];
    let a = 6_364_136_223_846_793_005;
    let c = 1_442_695_040_888_963_407;
    let mut v = seed;
    for t in table.iter_mut() {
        v = v.wrapping_mul(a).wrapping_add(c);
        *t = v;
    }
    table
}
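
// Computes the two cut-point masks used for normalized chunking. For the 8 KiB average
// (13 mask bits) the constants from the paper are returned; for other sizes, masks with
// bits + nc_level (mask_short) and bits - nc_level (mask_long) one bits are generated
// pseudo-randomly from the seed.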
fn get_masks(avg_size: usize, nc_level: usize, seed: u64) -> (u64, u64) {
    let bits = (avg_size.next_power_of_two() - 1).count_ones();
    if bits == 13 {
        // From the paper
        return (0x0003_5907_0353_0000, 0x0000_d900_0353_0000);
    }
    let mut mask = 0u64;
    let mut v = seed;
    let a = 6_364_136_223_846_793_005;
    let c = 1_442_695_040_888_963_407;
    while mask.count_ones() < bits - nc_level as u32 {
        v = v.wrapping_mul(a).wrapping_add(c);
        mask = (mask | 1).rotate_left(v as u32 & 0x3f);
    }
    let mask_long = mask;
    while mask.count_ones() < bits + nc_level as u32 {
        v = v.wrapping_mul(a).wrapping_add(c);
        mask = (mask | 1).rotate_left(v as u32 & 0x3f);
    }
    let mask_short = mask;
    (mask_short, mask_long)
}
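
// Streaming FastCDC chunker. The internal 4 KiB buffer holds input data; the first `buffered`
// bytes are left over from the previous chunk and are processed before new data is read.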
pub struct FastCdcChunker {
    buffer: [u8; 0x1000],
    buffered: usize,
    gear: [u64; 256],
    min_size: usize,
    max_size: usize,
    avg_size: usize,
    mask_long: u64,
    mask_short: u64,
}

impl FastCdcChunker {
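    // Creates a chunker with the given average chunk size and seed. The minimum and maximum
    // chunk sizes are derived as avg_size/4 and avg_size*8, and normalization level 2 is
    // used for the masks.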
    pub fn new(avg_size: usize, seed: u64) -> Self {
        let (mask_short, mask_long) = get_masks(avg_size, 2, seed);
        FastCdcChunker {
            buffer: [0; 0x1000],
            buffered: 0,
            gear: create_gear(seed),
            min_size: avg_size/4,
            max_size: avg_size*8,
            avg_size: avg_size,
            mask_long: mask_long,
            mask_short: mask_short,
        }
    }
}

impl FastCdcChunker {
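    // Writes the finished chunk (buffer[..pos]) to the output, moves the remaining bytes
    // (buffer[pos..max]) to the front of the buffer and remembers them in `buffered` so the
    // next call to chunk() continues with them.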
    fn write_output(&mut self, w: &mut Write, pos: usize, max: usize) -> Result<ChunkerStatus, ChunkerError> {
        debug_assert!(max <= self.buffer.len());
        debug_assert!(pos <= self.buffer.len());
        try!(w.write_all(&self.buffer[..pos]).map_err(ChunkerError::Write));
        unsafe { ptr::copy(self.buffer[pos..].as_ptr(), self.buffer.as_mut_ptr(), max-pos) };
        self.buffered = max-pos;
        Ok(ChunkerStatus::Continue)
    }
}

impl Chunker for FastCdcChunker {
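    // Reads data from `r` and writes exactly one chunk to `w`, returning Continue if more
    // input may follow and Finished once the input is exhausted. Note that the call which
    // returns Finished may still have written the tail of the last chunk to `w`.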
    #[allow(unknown_lints,explicit_counter_loop,needless_range_loop)]
    fn chunk(&mut self, r: &mut Read, w: &mut Write) -> Result<ChunkerStatus, ChunkerError> {
        let mut max;
        let mut hash = 0u64;
        let mut pos = 0;
        loop {
            // Fill the buffer; there might still be bytes left over from the last chunk
            max = try!(r.read(&mut self.buffer[self.buffered..]).map_err(ChunkerError::Read)) + self.buffered;
            // If there is nothing to do, finish
            if max == 0 {
                return Ok(ChunkerStatus::Finished)
            }
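            // Clamp the min/avg/max thresholds to the current buffer, relative to the bytes
            // of this chunk that were already processed in earlier iterations (pos).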
            let min_size_p = cmp::min(max, cmp::max(self.min_size as isize - pos as isize, 0) as usize);
            let avg_size_p = cmp::min(max, cmp::max(self.avg_size as isize - pos as isize, 0) as usize);
            let max_size_p = cmp::min(max, cmp::max(self.max_size as isize - pos as isize, 0) as usize);
            // Skip the first min_size bytes. This is ok because the same data still results in the same hash.
            if self.avg_size > pos {
                for i in min_size_p..avg_size_p {
                    hash = (hash << 1).wrapping_add(self.gear[self.buffer[i] as usize]);
                    if hash & self.mask_short == 0 {
                        return self.write_output(w, i + 1, max);
                    }
                }
            }
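            // Between avg_size and max_size the looser mask is used, making a cut more likely.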
            if self.max_size > pos {
                for i in avg_size_p..max_size_p {
                    hash = (hash << 1).wrapping_add(self.gear[self.buffer[i] as usize]);
                    if hash & self.mask_long == 0 {
                        return self.write_output(w, i + 1, max);
                    }
                }
            }
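            // No boundary found but max_size is reached: cut the chunk at max_size.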
            if max + pos >= self.max_size {
                return self.write_output(w, max_size_p, max);
            }
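            // No boundary and max_size not reached yet: emit the whole buffer as part of the
            // current chunk and keep scanning.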
            pos += max;
            try!(w.write_all(&self.buffer[..max]).map_err(ChunkerError::Write));
            self.buffered = 0;
        }
    }
}
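
// The sketch below is not part of the original module: a minimal round-trip test, assuming
// that the `Chunker` trait, `ChunkerStatus` and `FastCdcChunker` defined above are importable
// here via `use super::*`. It checks that concatenating all emitted chunks reproduces the input.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn chunks_concatenate_to_input() {
        // Deterministic pseudo-random input, larger than the internal buffer.
        let data: Vec<u8> = (0..100_000u32).map(|i| i.wrapping_mul(2_654_435_761) as u8).collect();
        let mut chunker = FastCdcChunker::new(8 * 1024, 0);
        let mut input = Cursor::new(&data[..]);
        let mut output = Vec::new();
        loop {
            let mut chunk = Vec::new();
            let status = match chunker.chunk(&mut input, &mut chunk) {
                Ok(status) => status,
                Err(_) => panic!("chunking failed"),
            };
            // The call returning Finished may still have emitted the tail of the last chunk.
            output.extend_from_slice(&chunk);
            if let ChunkerStatus::Finished = status {
                break;
            }
        }
        assert_eq!(output, data);
    }
}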