|
|
@ -3,7 +3,13 @@ |
|
|
|
#include <sys/ioctl.h> |
|
|
|
#include <sys/stat.h> |
|
|
|
#include <fcntl.h> |
|
|
|
#include <unistd.h> |
|
|
|
#include <stdint.h> |
|
|
|
#include <linux/fs.h> |
|
|
|
|
|
|
|
#include <vector> |
|
|
|
#include <map> |
|
|
|
|
|
|
|
#include "allocator.h" |
|
|
|
#include "sparsepp/sparsepp/spp.h" |
|
|
|
|
|
|
@ -23,65 +29,147 @@ |
|
|
|
#define ST_D_META_MOVED 21 |
|
|
|
#define ST_D_META_COMMITTED 22 |
|
|
|
#define ST_CURRENT 32 |
|
|
|
#define IS_STABLE(st) ((st) == 4 || (st) == 5 || (st) == 6 || (st) == 20 || (st) == 21 || (st) == 22 || (st) == 32) |
|
|
|
#define IS_JOURNAL(st) (st >= 2 && st <= 6) |
|
|
|
|
|
|
|
// Default object size is 128 KB
|
|
|
|
#define DEFAULT_ORDER 17 |
|
|
|
#define MAX_BLOCK_SIZE 128*1024*1024 |
|
|
|
#define DISK_ALIGNMENT 4096 |
|
|
|
#define MIN_JOURNAL_SIZE 4*1024*1024 |
|
|
|
#define JOURNAL_MAGIC 0x4A33 |
|
|
|
|
|
|
|
#define STRIPE_NUM(oid) ((oid) >> 4) |
|
|
|
#define STRIPE_REPLICA(oid) ((oid) & 0xf) |
|
|
|
|
|
|
|
struct __attribute__((__packed__)) oid |
|
|
|
// 16 bytes per object/stripe id
|
|
|
|
// stripe includes replica number in 4 least significant bits
|
|
|
|
struct __attribute__((__packed__)) object_id |
|
|
|
{ |
|
|
|
uint64_t inode; |
|
|
|
uint64_t stripe; |
|
|
|
}; |
|
|
|
|
|
|
|
struct __attribute__((__packed__)) meta_entry |
|
|
|
bool operator == (const object_id & a, const object_id & b) |
|
|
|
{ |
|
|
|
return b.inode == a.inode && b.stripe == a.stripe; |
|
|
|
} |
|
|
|
|
|
|
|
// 32 bytes per "clean" entry on disk with fixed metadata tables
|
|
|
|
struct __attribute__((__packed__)) clean_disk_entry |
|
|
|
{ |
|
|
|
uint64_t inode; |
|
|
|
uint64_t stripe; |
|
|
|
uint32_t epoch; |
|
|
|
uint32_t version; |
|
|
|
uint64_t location_flags; |
|
|
|
uint64_t version; |
|
|
|
uint8_t flags; |
|
|
|
uint8_t reserved[7]; |
|
|
|
}; |
|
|
|
|
|
|
|
struct __attribute__((__packed__)) object_version |
|
|
|
// 28 bytes per "clean" entry in memory
|
|
|
|
struct __attribute__((__packed__)) clean_entry |
|
|
|
{ |
|
|
|
uint32_t epoch; |
|
|
|
uint32_t version; |
|
|
|
uint64_t version; |
|
|
|
uint32_t state; |
|
|
|
uint64_t location; |
|
|
|
uint32_t size; |
|
|
|
}; |
|
|
|
|
|
|
|
// 48 bytes per dirty entry in memory
|
|
|
|
struct __attribute__((__packed__)) dirty_entry |
|
|
|
{ |
|
|
|
uint64_t version; |
|
|
|
uint32_t state; |
|
|
|
uint32_t flags; |
|
|
|
uint64_t location; // location in either journal or data
|
|
|
|
uint32_t offset; // offset within stripe
|
|
|
|
uint32_t size; // entry size
|
|
|
|
}; |
|
|
|
|
|
|
|
bool in_journal() |
|
|
|
{ |
|
|
|
return (location & (1 << 63)); |
|
|
|
} |
|
|
|
// Journal entries
|
|
|
|
// Journal entries are linked to each other by their crc32 value
|
|
|
|
// The journal is almost a blockchain, because object versions constantly increase
|
|
|
|
#define JE_START 0x01 |
|
|
|
#define JE_SMALL_WRITE 0x02 |
|
|
|
#define JE_BIG_WRITE 0x03 |
|
|
|
#define JE_STABLE 0x04 |
|
|
|
#define JE_DELETE 0x05 |
|
|
|
|
|
|
|
uint64_t offset() |
|
|
|
{ |
|
|
|
return (location & ~(1 << 63)); |
|
|
|
} |
|
|
|
struct __attribute__((__packed__)) journal_entry_start |
|
|
|
{ |
|
|
|
uint32_t type; |
|
|
|
uint32_t size; |
|
|
|
uint32_t crc32; |
|
|
|
uint32_t reserved1; |
|
|
|
uint64_t offset; |
|
|
|
}; |
|
|
|
|
|
|
|
struct __attribute__((__packed__)) journal_entry_small_write |
|
|
|
{ |
|
|
|
uint32_t type; |
|
|
|
uint32_t size; |
|
|
|
uint32_t crc32; |
|
|
|
uint32_t crc32_prev; |
|
|
|
object_id oid; |
|
|
|
uint64_t version; |
|
|
|
uint32_t offset; |
|
|
|
uint32_t len; |
|
|
|
}; |
|
|
|
|
|
|
|
struct __attribute__((__packed__)) journal_entry_big_write |
|
|
|
{ |
|
|
|
uint32_t type; |
|
|
|
uint32_t size; |
|
|
|
uint32_t crc32; |
|
|
|
uint32_t crc32_prev; |
|
|
|
object_id oid; |
|
|
|
uint64_t version; |
|
|
|
uint64_t block; |
|
|
|
}; |
|
|
|
|
|
|
|
struct __attribute__((__packed__)) journal_entry_stable |
|
|
|
{ |
|
|
|
uint32_t type; |
|
|
|
uint32_t size; |
|
|
|
uint32_t crc32; |
|
|
|
uint32_t crc32_prev; |
|
|
|
object_id oid; |
|
|
|
uint64_t version; |
|
|
|
}; |
|
|
|
|
|
|
|
struct __attribute__((__packed__)) object_ver_list |
|
|
|
struct __attribute__((__packed__)) journal_entry_del |
|
|
|
{ |
|
|
|
uint64_t count; |
|
|
|
object_version versions[]; |
|
|
|
uint32_t type; |
|
|
|
uint32_t size; |
|
|
|
uint32_t crc32; |
|
|
|
uint32_t crc32_prev; |
|
|
|
object_id oid; |
|
|
|
uint64_t version; |
|
|
|
}; |
|
|
|
|
|
|
|
struct __attribute__((__packed__)) object_info |
|
|
|
struct __attribute__((__packed__)) journal_entry |
|
|
|
{ |
|
|
|
object_version first; |
|
|
|
object_ver_list *other; |
|
|
|
union |
|
|
|
{ |
|
|
|
struct __attribute__((__packed__)) |
|
|
|
{ |
|
|
|
uint16_t magic; |
|
|
|
uint16_t type; |
|
|
|
uint32_t size; |
|
|
|
uint32_t crc32; |
|
|
|
}; |
|
|
|
journal_entry_start start; |
|
|
|
journal_entry_small_write small_write; |
|
|
|
journal_entry_big_write big_write; |
|
|
|
journal_entry_stable stable; |
|
|
|
journal_entry_del del; |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
typedef std::vector<dirty_entry> dirty_list; |
|
|
|
|
|
|
|
class oid_hash |
|
|
|
{ |
|
|
|
public: |
|
|
|
size_t operator()(const oid &s) const |
|
|
|
size_t operator()(const object_id &s) const |
|
|
|
{ |
|
|
|
size_t seed = 0; |
|
|
|
spp::hash_combine(seed, s.inode); |
|
|
@ -93,18 +181,23 @@ public: |
|
|
|
class blockstore |
|
|
|
{ |
|
|
|
public: |
|
|
|
spp::sparse_hash_map<oid, object_info, oid_hash> object_db; |
|
|
|
spp::sparse_hash_map<object_id, clean_entry, oid_hash> object_db; |
|
|
|
spp::sparse_hash_map<object_id, dirty_list, oid_hash> dirty_queue; |
|
|
|
int block_order, block_size; |
|
|
|
uint64_t block_count; |
|
|
|
allocator *data_alloc; |
|
|
|
|
|
|
|
int journal_fd; |
|
|
|
int meta_fd; |
|
|
|
int data_fd; |
|
|
|
|
|
|
|
uint64_t journal_offset, journal_size, journal_len; |
|
|
|
uint64_t meta_offset, meta_size, meta_len; |
|
|
|
uint64_t data_offset, data_size, data_len; |
|
|
|
|
|
|
|
blockstore(std::unordered_map<std::string, std::string> & config) |
|
|
|
uint64_t journal_start, journal_end; |
|
|
|
|
|
|
|
blockstore(spp::sparse_hash_map<std::string, std::string> & config) |
|
|
|
{ |
|
|
|
block_order = stoll(config["block_size_order"]); |
|
|
|
block_size = 1 << block_order; |
|
|
@ -145,7 +238,7 @@ public: |
|
|
|
close(journal_fd); |
|
|
|
} |
|
|
|
|
|
|
|
void calc_lengths(std::unordered_map<std::string, std::string> & config) |
|
|
|
void calc_lengths(spp::sparse_hash_map<std::string, std::string> & config) |
|
|
|
{ |
|
|
|
// data
|
|
|
|
data_len = data_size - data_offset; |
|
|
@ -182,7 +275,7 @@ public: |
|
|
|
} |
|
|
|
// required metadata size
|
|
|
|
block_count = data_len / block_size; |
|
|
|
uint64_t meta_required = block_count * sizeof(meta_entry); |
|
|
|
uint64_t meta_required = block_count * sizeof(clean_disk_entry); |
|
|
|
if (meta_len < meta_required) |
|
|
|
{ |
|
|
|
throw new std::runtime_error("Metadata area is too small"); |
|
|
@ -203,7 +296,7 @@ public: |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void open_data(std::unordered_map<std::string, std::string> & config) |
|
|
|
void open_data(spp::sparse_hash_map<std::string, std::string> & config) |
|
|
|
{ |
|
|
|
int sectsize; |
|
|
|
data_offset = stoll(config["data_offset"]); |
|
|
@ -211,7 +304,7 @@ public: |
|
|
|
{ |
|
|
|
throw new std::runtime_error("data_offset not aligned"); |
|
|
|
} |
|
|
|
data_fd = open(config["data_device"], O_DIRECT|O_RDWR); |
|
|
|
data_fd = open(config["data_device"].c_str(), O_DIRECT|O_RDWR); |
|
|
|
if (data_fd == -1) |
|
|
|
{ |
|
|
|
throw new std::runtime_error("Failed to open data device"); |
|
|
@ -228,7 +321,7 @@ public: |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void open_meta(std::unordered_map<std::string, std::string> & config) |
|
|
|
void open_meta(spp::sparse_hash_map<std::string, std::string> & config) |
|
|
|
{ |
|
|
|
int sectsize; |
|
|
|
meta_offset = stoll(config["meta_offset"]); |
|
|
@ -239,7 +332,7 @@ public: |
|
|
|
if (config["meta_device"] != "") |
|
|
|
{ |
|
|
|
meta_offset = 0; |
|
|
|
meta_fd = open(config["meta_device"], O_DIRECT|O_RDWR); |
|
|
|
meta_fd = open(config["meta_device"].c_str(), O_DIRECT|O_RDWR); |
|
|
|
if (meta_fd == -1) |
|
|
|
{ |
|
|
|
throw new std::runtime_error("Failed to open metadata device"); |
|
|
@ -266,7 +359,7 @@ public: |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void open_journal(std::unordered_map<std::string, std::string> & config) |
|
|
|
void open_journal(spp::sparse_hash_map<std::string, std::string> & config) |
|
|
|
{ |
|
|
|
int sectsize; |
|
|
|
journal_offset = stoll(config["journal_offset"]); |
|
|
@ -276,7 +369,7 @@ public: |
|
|
|
} |
|
|
|
if (config["journal_device"] != "") |
|
|
|
{ |
|
|
|
journal_fd = open(config["journal_device"], O_DIRECT|O_RDWR); |
|
|
|
journal_fd = open(config["journal_device"].c_str(), O_DIRECT|O_RDWR); |
|
|
|
if (journal_fd == -1) |
|
|
|
{ |
|
|
|
throw new std::runtime_error("Failed to open journal device"); |
|
|
@ -299,15 +392,83 @@ public: |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
int read(oid stripe, uint32_t offset, uint32_t len, void *buf, void (*callback)(int arg), int arg) |
|
|
|
struct read_fulfill |
|
|
|
{ |
|
|
|
uint64_t flags; |
|
|
|
uint64_t offset; |
|
|
|
uint64_t len; |
|
|
|
void *buf; |
|
|
|
}; |
|
|
|
|
|
|
|
void fulfill_read(std::map<uint64_t, read_fulfill> & fulfill, uint8_t* buf, uint32_t offset, uint32_t len, |
|
|
|
uint32_t item_start, uint32_t dirty_end, uint32_t item_state, uint64_t item_location) |
|
|
|
{ |
|
|
|
auto o = object_db.find(stripe); |
|
|
|
if (o == object_db.end()) |
|
|
|
uint32_t dirty_start = item_start; |
|
|
|
if (dirty_start < offset+len && dirty_end > offset) |
|
|
|
{ |
|
|
|
dirty_start = dirty_start < offset ? offset : dirty_start; |
|
|
|
dirty_end = dirty_end > offset+len ? offset+len : dirty_end; |
|
|
|
auto fulfill_near = fulfill.lower_bound(dirty_start); |
|
|
|
if (fulfill_near != fulfill.begin()) |
|
|
|
{ |
|
|
|
fulfill_near--; |
|
|
|
if (fulfill_near->second.offset + fulfill_near->second.len <= dirty_start) |
|
|
|
fulfill_near++; |
|
|
|
} |
|
|
|
while (fulfill_near != fulfill.end() && fulfill_near->second.offset < dirty_end) |
|
|
|
{ |
|
|
|
if (fulfill_near->second.offset > dirty_start) |
|
|
|
{ |
|
|
|
fulfill[dirty_start] = (read_fulfill){ |
|
|
|
item_state, |
|
|
|
item_location + dirty_start - item_start, |
|
|
|
fulfill_near->second.offset - dirty_start, |
|
|
|
buf + dirty_start - offset, |
|
|
|
}; |
|
|
|
} |
|
|
|
dirty_start = fulfill_near->second.offset + fulfill_near->second.len; |
|
|
|
} |
|
|
|
if (dirty_start < dirty_end) |
|
|
|
{ |
|
|
|
fulfill[dirty_start] = (read_fulfill){ |
|
|
|
item_state, |
|
|
|
item_location + dirty_start - item_start, |
|
|
|
dirty_end - dirty_start, |
|
|
|
buf + dirty_start - offset |
|
|
|
}; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// flags: READ_DIRTY
|
|
|
|
#define READ_DIRTY 1 |
|
|
|
int read(object_id oid, uint32_t offset, uint32_t len, uint32_t flags, uint8_t *buf, void (*callback)(int arg), int arg) |
|
|
|
{ |
|
|
|
auto clean_it = object_db.find(oid); |
|
|
|
auto dirty_it = dirty_queue.find(oid); |
|
|
|
if (clean_it == object_db.end() && dirty_it == object_db.end()) |
|
|
|
{ |
|
|
|
memset(buf, 0, len); |
|
|
|
callback(arg); |
|
|
|
return; |
|
|
|
return 0; |
|
|
|
} |
|
|
|
uint64_t fulfilled = 0; |
|
|
|
std::map<uint64_t, read_fulfill> fulfill; |
|
|
|
//std::vector<read_fulfill> fulfill;
|
|
|
|
if (dirty_it != object_db.end()) |
|
|
|
{ |
|
|
|
dirty_list dirty = dirty_it->second; |
|
|
|
for (int i = dirty.size()-1; i >= 0; i--) |
|
|
|
{ |
|
|
|
if ((flags & READ_DIRTY) || IS_STABLE(dirty[i].state)) |
|
|
|
{ |
|
|
|
fulfill_read(fulfill, buf, offset, len, dirty[i].offset, dirty[i].offset + dirty[i].size, IS_JOURNAL(dirty[i].state), dirty[i].location); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
if (clean_it != object_db.end()) |
|
|
|
{ |
|
|
|
fulfill_read(fulfill, buf, offset, len, 0, block_size, 0, clean_it->second.location); |
|
|
|
} |
|
|
|
auto info = o->second; |
|
|
|
} |
|
|
|
}; |
|
|
|