Also include the first min_size bytes in the hash (oops); performance improvements

This commit is contained in:
Dennis Schwerdel 2017-08-02 22:18:37 +02:00
parent 54e2329228
commit 837df8bbd3
1 changed file with 31 additions and 12 deletions

View File

@ -1,6 +1,7 @@
use super::*; use super::*;
use std::ptr; use std::ptr;
use std::cmp;
// FastCDC // FastCDC
// Paper: "FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication" // Paper: "FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication"
@ -92,26 +93,44 @@ impl Chunker for FastCdcChunker {
if max == 0 { if max == 0 {
return Ok(ChunkerStatus::Finished) return Ok(ChunkerStatus::Finished)
} }
for i in 0..max { let min_size_p = cmp::min(max, cmp::max(min_size as isize - pos as isize, 0) as usize);
if pos >= min_size { let avg_size_p = cmp::min(max, cmp::max(avg_size as isize - pos as isize, 0) as usize);
// Hash update let max_size_p = cmp::min(max, cmp::max(max_size as isize - pos as isize, 0) as usize);
if min_size > pos {
for i in 0..min_size_p {
hash = (hash << 1).wrapping_add(gear[buffer[i] as usize]); hash = (hash << 1).wrapping_add(gear[buffer[i] as usize]);
// 3 options for break point }
// 1) mask_short matches and chunk is smaller than average }
// 2) mask_long matches and chunk is longer or equal to average if avg_size > pos {
// 3) chunk reached max_size for i in min_size_p..avg_size_p {
if pos < avg_size && hash & mask_short == 0 hash = (hash << 1).wrapping_add(gear[buffer[i] as usize]);
|| pos >= avg_size && hash & mask_long == 0 if hash & mask_short == 0 {
|| pos >= max_size {
// Write all bytes from this chunk out to sink and store rest for next chunk
try!(w.write_all(&buffer[..i+1]).map_err(ChunkerError::Write)); try!(w.write_all(&buffer[..i+1]).map_err(ChunkerError::Write));
unsafe { ptr::copy(buffer[i+1..].as_ptr(), buffer.as_mut_ptr(), max-i-1) }; unsafe { ptr::copy(buffer[i+1..].as_ptr(), buffer.as_mut_ptr(), max-i-1) };
self.buffered = max-i-1; self.buffered = max-i-1;
return Ok(ChunkerStatus::Continue); return Ok(ChunkerStatus::Continue);
} }
} }
pos += 1;
} }
if max_size > pos {
for i in avg_size_p..max_size_p {
hash = (hash << 1).wrapping_add(gear[buffer[i] as usize]);
if hash & mask_long == 0 {
try!(w.write_all(&buffer[..i+1]).map_err(ChunkerError::Write));
unsafe { ptr::copy(buffer[i+1..].as_ptr(), buffer.as_mut_ptr(), max-i-1) };
self.buffered = max-i-1;
return Ok(ChunkerStatus::Continue);
}
}
}
if max + pos >= max_size {
let i = max_size_p;
try!(w.write_all(&buffer[..i]).map_err(ChunkerError::Write));
unsafe { ptr::copy(buffer[i..].as_ptr(), buffer.as_mut_ptr(), max-i) };
self.buffered = max-i;
return Ok(ChunkerStatus::Continue);
}
pos += max;
try!(w.write_all(&buffer[..max]).map_err(ChunkerError::Write)); try!(w.write_all(&buffer[..max]).map_err(ChunkerError::Write));
self.buffered = 0; self.buffered = 0;
} }