From fc45fa4e3348b7cc7d1ec59b0bf42bc66d4069ab Mon Sep 17 00:00:00 2001 From: Dennis Schwerdel Date: Mon, 20 Mar 2017 14:03:29 +0100 Subject: [PATCH] Vacuum --- src/cli/algotest.rs | 1 - src/cli/args.rs | 10 ++- src/cli/mod.rs | 14 ++-- src/index.rs | 9 +++ src/main.rs | 1 - src/repository/basic_io.rs | 5 ++ src/repository/bundle_map.rs | 9 ++- src/repository/info.rs | 2 +- src/repository/integrity.rs | 114 +++++++++++++++++++------- src/repository/mod.rs | 2 + src/repository/vacuum.rs | 153 +++++++++++++++++++++++++++++++++++ src/util/bitmap.rs | 77 ++++++++++++++++++ src/util/hex.rs | 31 +++++++ src/util/mod.rs | 39 ++------- 14 files changed, 391 insertions(+), 76 deletions(-) create mode 100644 src/repository/vacuum.rs create mode 100644 src/util/bitmap.rs create mode 100644 src/util/hex.rs diff --git a/src/cli/algotest.rs b/src/cli/algotest.rs index d4edc9d..991089f 100644 --- a/src/cli/algotest.rs +++ b/src/cli/algotest.rs @@ -6,7 +6,6 @@ use chrono::Duration; use ::chunker::*; use ::util::*; -use ::util::cli::*; struct ChunkSink { diff --git a/src/cli/args.rs b/src/cli/args.rs index e470f95..e348a8b 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -28,11 +28,13 @@ pub enum Arguments { Remove { repo_path: String, backup_name: String, + vacuum: bool, inode: Option<String> }, Vacuum { repo_path: String, - ratio: f32 + ratio: f32, + simulate: bool }, Check { repo_path: String, @@ -204,11 +206,13 @@ pub fn parse() -> Arguments { ) (@subcommand remove => (about: "removes a backup or a subpath") + (@arg vacuum: --vacuum "run vacuum afterwards to reclaim space") (@arg BACKUP: +required "repository::backup[::subpath] path") ) (@subcommand vacuum => (about: "saves space by combining and recompressing bundles") - (@arg ratio: --ratio -r "ratio of unused chunks in a bundle to rewrite that bundle") + (@arg ratio: --ratio -r +takes_value "ratio of unused chunks in a bundle to rewrite that bundle") + (@arg simulate: --simulate "only simulate the vacuum, do not remove any
bundles") (@arg REPO: +required "path of the repository") ) (@subcommand check => @@ -317,6 +321,7 @@ pub fn parse() -> Arguments { return Arguments::Remove { repo_path: repository.to_string(), backup_name: backup.unwrap().to_string(), + vacuum: args.is_present("vacuum"), inode: inode.map(|v| v.to_string()) } } @@ -328,6 +333,7 @@ pub fn parse() -> Arguments { } return Arguments::Vacuum { repo_path: repository.to_string(), + simulate: args.is_present("simulate"), ratio: parse_float(args.value_of("ratio").unwrap_or("0.5"), "ratio") as f32 } } diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 32d0ca1..1ce8c4b 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -6,7 +6,6 @@ use chrono::prelude::*; use std::process::exit; use ::repository::{Repository, Config, Backup}; -use ::util::cli::*; use ::util::*; use self::args::Arguments; @@ -72,8 +71,8 @@ pub fn run() { repo.restore_backup(&backup, &dst_path).unwrap(); } }, - Arguments::Remove{repo_path, backup_name, inode} => { - let repo = open_repository(&repo_path); + Arguments::Remove{repo_path, backup_name, inode, vacuum} => { + let mut repo = open_repository(&repo_path); if let Some(_inode) = inode { let _backup = get_backup(&repo, &backup_name); error!("Removing backup subtrees is not implemented yet"); @@ -82,10 +81,13 @@ pub fn run() { repo.delete_backup(&backup_name).unwrap(); info!("The backup has been deleted, run vacuum to reclaim space"); } + if vacuum { + repo.vacuum(0.5, false).unwrap(); + } }, - Arguments::Vacuum{repo_path, ..} => { - let _repo = open_repository(&repo_path); - error!("Vaccum is not implemented yet"); + Arguments::Vacuum{repo_path, ratio, simulate} => { + let mut repo = open_repository(&repo_path); + repo.vacuum(ratio, simulate).unwrap(); return }, Arguments::Check{repo_path, backup_name, inode, full} => { diff --git a/src/index.rs b/src/index.rs index 24d8373..ba1c474 100644 --- a/src/index.rs +++ b/src/index.rs @@ -392,6 +392,15 @@ impl Index { } } + #[inline] + pub fn pos(&self, key: 
&Hash) -> Option { + debug_assert!(self.check().is_ok(), "Inconsistent before get"); + match self.locate(key) { + LocateResult::Found(pos) => Some(pos), + _ => None + } + } + #[inline] pub fn get(&self, key: &Hash) -> Option { debug_assert!(self.check().is_ok(), "Inconsistent before get"); diff --git a/src/main.rs b/src/main.rs index a8a5d56..b9f9a95 100644 --- a/src/main.rs +++ b/src/main.rs @@ -31,7 +31,6 @@ mod cli; // TODO: Remove backup subtrees // TODO: Recompress & combine bundles // TODO: Prune backups (based on age like attic) -// TODO: Check backup integrity too // TODO: Encrypt backup files too // TODO: list --tree // TODO: Partial backups diff --git a/src/repository/basic_io.rs b/src/repository/basic_io.rs index 7345664..5b73d77 100644 --- a/src/repository/basic_io.rs +++ b/src/repository/basic_io.rs @@ -32,11 +32,16 @@ impl Repository { Ok(Some(try!(self.bundles.get_chunk(&bundle_id, found.chunk as usize)))) } + #[inline] pub fn put_chunk(&mut self, mode: BundleMode, hash: Hash, data: &[u8]) -> Result<(), RepositoryError> { // If this chunk is in the index, ignore it if self.index.contains(&hash) { return Ok(()) } + self.put_chunk_override(mode, hash, data) + } + + pub fn put_chunk_override(&mut self, mode: BundleMode, hash: Hash, data: &[u8]) -> Result<(), RepositoryError> { // Calculate the next free bundle id now (late lifetime prevents this) let next_free_bundle_id = self.next_free_bundle_id(); // Select a bundle writer according to the mode and... 
diff --git a/src/repository/bundle_map.rs b/src/repository/bundle_map.rs index b8aaa04..91964a9 100644 --- a/src/repository/bundle_map.rs +++ b/src/repository/bundle_map.rs @@ -89,6 +89,11 @@ impl BundleMap { self.0.get(&id) } + #[inline] + pub fn remove(&mut self, id: u32) -> Option { + self.0.remove(&id) + } + #[inline] pub fn set(&mut self, id: u32, bundle: &Bundle) { let data = BundleData { info: bundle.info.clone() }; @@ -96,7 +101,7 @@ impl BundleMap { } #[inline] - pub fn bundles(&self) -> Vec<&BundleData> { - self.0.values().collect() + pub fn bundles(&self) -> Vec<(u32, &BundleData)> { + self.0.iter().map(|(id, bundle)| (*id, bundle)).collect() } } diff --git a/src/repository/info.rs b/src/repository/info.rs index fd961a2..50582ea 100644 --- a/src/repository/info.rs +++ b/src/repository/info.rs @@ -17,7 +17,7 @@ pub struct RepositoryInfo { impl Repository { #[inline] pub fn list_bundles(&self) -> Vec<&BundleInfo> { - self.bundle_map.bundles().iter().map(|b| &b.info).collect() + self.bundle_map.bundles().into_iter().map(|(_id, b)| &b.info).collect() } pub fn info(&self) -> RepositoryInfo { diff --git a/src/repository/integrity.rs b/src/repository/integrity.rs index 22f7e5c..7ec92e9 100644 --- a/src/repository/integrity.rs +++ b/src/repository/integrity.rs @@ -1,7 +1,10 @@ use super::{Repository, RepositoryError}; +use super::metadata::FileContents; use ::bundle::BundleId; -use ::util::Hash; +use ::util::*; + +use std::collections::VecDeque; quick_error!{ @@ -33,34 +36,7 @@ quick_error!{ } impl Repository { - fn check_chunk(&self, hash: Hash) -> Result<(), RepositoryError> { - // Find bundle and chunk id in index - let found = if let Some(found) = self.index.get(&hash) { - found - } else { - return Err(RepositoryIntegrityError::MissingChunk(hash).into()); - }; - // Lookup bundle id from map - let bundle_id = try!(self.get_bundle_id(found.bundle)); - // Get bundle object from bundledb - let bundle = if let Some(bundle) = self.bundles.get_bundle(&bundle_id) { 
- bundle - } else { - return Err(RepositoryIntegrityError::MissingBundle(bundle_id.clone()).into()) - }; - // Get chunk from bundle - if bundle.info.chunk_count > found.chunk as usize { - Ok(()) - } else { - Err(RepositoryIntegrityError::NoSuchChunk(bundle_id.clone(), found.chunk).into()) - } - //TODO: check that contents match their hash - } - - pub fn check(&mut self, full: bool) -> Result<(), RepositoryError> { - try!(self.flush()); - try!(self.bundles.check(full)); - try!(self.index.check()); + fn check_index_chunks(&self) -> Result<(), RepositoryError> { let mut pos = 0; loop { pos = if let Some(pos) = self.index.next_entry(pos) { @@ -69,9 +45,24 @@ impl Repository { break }; let entry = self.index.get_entry(pos).unwrap(); - try!(self.check_chunk(entry.key)); + // Lookup bundle id from map + let bundle_id = try!(self.get_bundle_id(entry.data.bundle)); + // Get bundle object from bundledb + let bundle = if let Some(bundle) = self.bundles.get_bundle(&bundle_id) { + bundle + } else { + return Err(RepositoryIntegrityError::MissingBundle(bundle_id.clone()).into()) + }; + // Get chunk from bundle + if bundle.info.chunk_count <= entry.data.chunk as usize { + return Err(RepositoryIntegrityError::NoSuchChunk(bundle_id.clone(), entry.data.chunk).into()) + } pos += 1; } + Ok(()) + } + + fn check_repository(&self) -> Result<(), RepositoryError> { if self.next_content_bundle == self.next_meta_bundle { return Err(RepositoryIntegrityError::InvalidNextBundleId.into()) } @@ -83,4 +74,67 @@ impl Repository { } Ok(()) } + + fn check_chunks(&self, checked: &mut Bitmap, chunks: &[Chunk]) -> Result<bool, RepositoryError> { + let mut new = false; + for &(hash, _len) in chunks { + if let Some(pos) = self.index.pos(&hash) { + new |= !checked.get(pos); + checked.set(pos); + } else { + return Err(RepositoryIntegrityError::MissingChunk(hash).into()) + } + } + Ok(new) + } + + fn check_backups(&mut self) -> Result<(), RepositoryError> { + let mut checked = Bitmap::new(self.index.capacity()); + for name in
try!(self.list_backups()) { + let backup = try!(self.get_backup(&name)); + let mut todo = VecDeque::new(); + todo.push_back(backup.root); + while let Some(chunks) = todo.pop_front() { + if !try!(self.check_chunks(&mut checked, &chunks)) { + continue + } + let inode = try!(self.get_inode(&chunks)); + // Mark the content chunks as used + match inode.contents { + Some(FileContents::ChunkedDirect(chunks)) => { + try!(self.check_chunks(&mut checked, &chunks)); + }, + Some(FileContents::ChunkedIndirect(chunks)) => { + if try!(self.check_chunks(&mut checked, &chunks)) { + let chunk_data = try!(self.get_data(&chunks)); + let chunks = ChunkList::read_from(&chunk_data); + try!(self.check_chunks(&mut checked, &chunks)); + } + } + _ => () + } + // Put children in todo + if let Some(children) = inode.children { + for (_name, chunks) in children { + todo.push_back(chunks); + } + } + } + } + Ok(()) + } + + pub fn check(&mut self, full: bool) -> Result<(), RepositoryError> { + try!(self.flush()); + info!("Checking bundle integrity..."); + try!(self.bundles.check(full)); + info!("Checking index integrity..."); + try!(self.index.check()); + try!(self.check_index_chunks()); + info!("Checking backup integrity..."); + try!(self.check_backups()); + info!("Checking repository integrity..."); + try!(self.check_repository()); + Ok(()) + } } diff --git a/src/repository/mod.rs b/src/repository/mod.rs index 35497b1..ecd5d1b 100644 --- a/src/repository/mod.rs +++ b/src/repository/mod.rs @@ -6,6 +6,7 @@ mod info; mod metadata; mod backup; mod error; +mod vacuum; use std::mem; use std::cmp::max; @@ -22,6 +23,7 @@ pub use self::error::RepositoryError; pub use self::config::Config; pub use self::metadata::{Inode, FileType}; pub use self::backup::Backup; +pub use self::integrity::RepositoryIntegrityError; use self::bundle_map::BundleMap; diff --git a/src/repository/vacuum.rs b/src/repository/vacuum.rs new file mode 100644 index 0000000..1660132 --- /dev/null +++ b/src/repository/vacuum.rs @@ -0,0 
+1,153 @@ +use super::{Repository, RepositoryError, RepositoryIntegrityError}; +use super::metadata::FileContents; + +use std::collections::{HashMap, HashSet, VecDeque}; + +use ::bundle::BundleMode; +use ::util::*; + + +pub struct BundleUsage { + pub used: Bitmap, + pub mode: Bitmap, + pub chunk_count: usize, + pub total_size: usize, + pub used_size: usize +} + +impl Repository { + fn mark_used(&self, bundles: &mut HashMap, chunks: &[Chunk], mode: BundleMode) -> Result { + let mut new = false; + for chunk in chunks { + if let Some(pos) = self.index.get(&chunk.0) { + if let Some(bundle) = bundles.get_mut(&pos.bundle) { + if !bundle.used.get(pos.chunk as usize) { + new = true; + bundle.used.set(pos.chunk as usize); + bundle.used_size += chunk.1 as usize; + if mode == BundleMode::Meta { + bundle.mode.set(pos.chunk as usize); + } + } + } + } else { + return Err(RepositoryIntegrityError::MissingChunk(chunk.0).into()); + } + } + Ok(new) + } + + pub fn analyze_usage(&mut self) -> Result, RepositoryError> { + let mut usage = HashMap::new(); + for (id, bundle) in self.bundle_map.bundles() { + usage.insert(id, BundleUsage { + used: Bitmap::new(bundle.info.chunk_count), + mode: Bitmap::new(bundle.info.chunk_count), + chunk_count: bundle.info.chunk_count, + total_size: bundle.info.raw_size, + used_size: 0 + }); + } + for name in try!(self.list_backups()) { + let backup = try!(self.get_backup(&name)); + let mut todo = VecDeque::new(); + todo.push_back(backup.root); + while let Some(chunks) = todo.pop_front() { + if !try!(self.mark_used(&mut usage, &chunks, BundleMode::Meta)) { + continue + } + let inode = try!(self.get_inode(&chunks)); + // Mark the content chunks as used + match inode.contents { + Some(FileContents::ChunkedDirect(chunks)) => { + try!(self.mark_used(&mut usage, &chunks, BundleMode::Content)); + }, + Some(FileContents::ChunkedIndirect(chunks)) => { + if try!(self.mark_used(&mut usage, &chunks, BundleMode::Meta)) { + let chunk_data = try!(self.get_data(&chunks)); 
+ let chunks = ChunkList::read_from(&chunk_data); + try!(self.mark_used(&mut usage, &chunks, BundleMode::Content)); + } + } + _ => () + } + // Put children in todo + if let Some(children) = inode.children { + for (_name, chunks) in children { + todo.push_back(chunks); + } + } + } + } + Ok(usage) + } + + fn delete_bundle(&mut self, id: u32) -> Result<(), RepositoryError> { + if let Some(bundle) = self.bundle_map.remove(id) { + try!(self.bundles.delete_bundle(&bundle.id())); + Ok(()) + } else { + Err(RepositoryIntegrityError::MissingBundleId(id).into()) + } + } + + pub fn vacuum(&mut self, ratio: f32, simulate: bool) -> Result<(), RepositoryError> { + try!(self.flush()); + info!("Analyzing chunk usage"); + let usage = try!(self.analyze_usage()); + let total = usage.values().map(|b| b.total_size).sum::(); + let used = usage.values().map(|b| b.used_size).sum::(); + info!("Usage: {} of {}, {:.1}%", to_file_size(used as u64), to_file_size(total as u64), used as f32/total as f32*100.0); + let mut rewrite_bundles = HashSet::new(); + let mut reclaim_space = 0; + for (id, bundle) in &usage { + if bundle.used_size as f32 / bundle.total_size as f32 <= ratio { + rewrite_bundles.insert(*id); + reclaim_space += bundle.total_size - bundle.used_size; + } + } + info!("Reclaiming {} by rewriting {} bundles", to_file_size(reclaim_space as u64), rewrite_bundles.len()); + if simulate { + return Ok(()) + } + for id in &rewrite_bundles { + let bundle = usage.get(id).unwrap(); + let bundle_id = self.bundle_map.get(*id).unwrap().id(); + for chunk in 0..bundle.chunk_count { + let data = try!(self.bundles.get_chunk(&bundle_id, chunk)); + let hash = self.config.hash.hash(&data); + if !bundle.used.get(chunk) { + try!(self.index.delete(&hash)); + continue + } + let mode = if bundle.mode.get(chunk) { + BundleMode::Meta + } else { + BundleMode::Content + }; + try!(self.put_chunk_override(mode, hash, &data)); + } + } + try!(self.flush()); + info!("Checking index"); + let mut pos = 0; + loop { + pos 
= if let Some(pos) = self.index.next_entry(pos) { + pos + } else { + break + }; + let entry = self.index.get_entry(pos).unwrap(); + if rewrite_bundles.contains(&entry.data.bundle) { + panic!("Removed bundle is still referenced from index"); + } + pos += 1; + } + info!("Deleting {} bundles", rewrite_bundles.len()); + for id in rewrite_bundles { + try!(self.delete_bundle(id)); + } + try!(self.bundle_map.save(self.path.join("bundles.map"))); + Ok(()) + } +} diff --git a/src/util/bitmap.rs b/src/util/bitmap.rs new file mode 100644 index 0000000..ddbc2e2 --- /dev/null +++ b/src/util/bitmap.rs @@ -0,0 +1,77 @@ +use std::ops::Deref; + +pub struct Bitmap { + bytes: Vec +} + +impl Bitmap { + pub fn new(len: usize) -> Self { + let len = (len+7)/8; + let mut bytes = Vec::with_capacity(len); + bytes.resize(len, 0); + Self { bytes: bytes } + } + + #[inline] + pub fn len(&self) -> usize { + self.bytes.len() * 8 + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + #[inline] + fn convert_index(&self, index: usize) -> (usize, u8) { + (index/8, 1u8<<(index%8)) + } + + #[inline] + pub fn set(&mut self, index: usize) { + let (byte, mask) = self.convert_index(index); + self.bytes[byte] |= mask + } + + #[inline] + pub fn unset(&mut self, index: usize) { + let (byte, mask) = self.convert_index(index); + self.bytes[byte] &= !mask + } + + #[inline] + pub fn flip(&mut self, index: usize) { + let (byte, mask) = self.convert_index(index); + self.bytes[byte] ^= mask + } + + #[inline] + pub fn get(&self, index: usize) -> bool { + let (byte, mask) = self.convert_index(index); + self.bytes[byte] & mask != 0 + } + + #[inline] + pub fn into_bytes(self) -> Vec { + self.bytes + } + + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.bytes + } + + #[inline] + pub fn from_bytes(bytes: Vec) -> Self { + Self { bytes: bytes } + } +} + +impl Deref for Bitmap { + type Target = [u8]; + + #[inline] + fn deref(&self) -> &[u8] { + &self.bytes + } +} diff --git a/src/util/hex.rs 
b/src/util/hex.rs new file mode 100644 index 0000000..1b99904 --- /dev/null +++ b/src/util/hex.rs @@ -0,0 +1,31 @@ +pub fn to_hex(data: &[u8]) -> String { + data.iter().map(|b| format!("{:02x}", b)).collect::>().join("") +} + +pub fn parse_hex(hex: &str) -> Result, ()> { + let mut b = Vec::with_capacity(hex.len() / 2); + let mut modulus = 0; + let mut buf = 0; + for (_, byte) in hex.bytes().enumerate() { + buf <<= 4; + match byte { + b'A'...b'F' => buf |= byte - b'A' + 10, + b'a'...b'f' => buf |= byte - b'a' + 10, + b'0'...b'9' => buf |= byte - b'0', + b' '|b'\r'|b'\n'|b'\t' => { + buf >>= 4; + continue + } + _ => return Err(()), + } + modulus += 1; + if modulus == 2 { + modulus = 0; + b.push(buf); + } + } + match modulus { + 0 => Ok(b.into_iter().collect()), + _ => Err(()), + } +} diff --git a/src/util/mod.rs b/src/util/mod.rs index 3da389e..a7a9e3b 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -4,7 +4,9 @@ mod encryption; mod hash; mod lru_cache; mod chunk; -pub mod cli; +mod bitmap; +mod hex; +mod cli; pub mod msgpack; pub use self::chunk::*; @@ -12,35 +14,6 @@ pub use self::compression::*; pub use self::encryption::*; pub use self::hash::*; pub use self::lru_cache::*; - -pub fn to_hex(data: &[u8]) -> String { - data.iter().map(|b| format!("{:02x}", b)).collect::>().join("") -} - -pub fn parse_hex(hex: &str) -> Result, ()> { - let mut b = Vec::with_capacity(hex.len() / 2); - let mut modulus = 0; - let mut buf = 0; - for (_, byte) in hex.bytes().enumerate() { - buf <<= 4; - match byte { - b'A'...b'F' => buf |= byte - b'A' + 10, - b'a'...b'f' => buf |= byte - b'a' + 10, - b'0'...b'9' => buf |= byte - b'0', - b' '|b'\r'|b'\n'|b'\t' => { - buf >>= 4; - continue - } - _ => return Err(()), - } - modulus += 1; - if modulus == 2 { - modulus = 0; - b.push(buf); - } - } - match modulus { - 0 => Ok(b.into_iter().collect()), - _ => Err(()), - } -} +pub use self::bitmap::*; +pub use self::hex::*; +pub use self::cli::*;