Begin journal init reader
parent
9babacb00a
commit
e1c92d2227
4
Makefile
4
Makefile
|
@ -1,4 +1,6 @@
|
||||||
all: allocator.o blockstore.o blockstore_init.o blockstore_open.o blockstore_read.o test
|
all: allocator.o blockstore.o blockstore_init.o blockstore_open.o blockstore_read.o crc32c.o test
|
||||||
|
crc32c.o: crc32c.c
|
||||||
|
gcc -c -o $@ $<
|
||||||
%.o: %.cpp
|
%.o: %.cpp
|
||||||
gcc -c -o $@ $<
|
gcc -c -o $@ $<
|
||||||
test: test.cpp
|
test: test.cpp
|
||||||
|
|
|
@ -147,7 +147,7 @@ public:
|
||||||
spp::sparse_hash_map<object_id, dirty_list, oid_hash> dirty_queue;
|
spp::sparse_hash_map<object_id, dirty_list, oid_hash> dirty_queue;
|
||||||
std::deque<blockstore_operation*> submit_queue;
|
std::deque<blockstore_operation*> submit_queue;
|
||||||
std::set<blockstore_operation*> in_process_ops;
|
std::set<blockstore_operation*> in_process_ops;
|
||||||
int block_order, block_size;
|
uint32_t block_order, block_size;
|
||||||
uint64_t block_count;
|
uint64_t block_count;
|
||||||
allocator *data_alloc;
|
allocator *data_alloc;
|
||||||
|
|
||||||
|
@ -160,6 +160,7 @@ public:
|
||||||
uint64_t data_offset, data_size, data_len;
|
uint64_t data_offset, data_size, data_len;
|
||||||
|
|
||||||
uint64_t journal_start, journal_end;
|
uint64_t journal_start, journal_end;
|
||||||
|
uint32_t journal_crc32_last;
|
||||||
|
|
||||||
struct io_uring *ring;
|
struct io_uring *ring;
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#include "blockstore.h"
|
#include "blockstore.h"
|
||||||
|
#include "crc32c.h"
|
||||||
|
|
||||||
blockstore_init_meta::blockstore_init_meta(blockstore *bs)
|
blockstore_init_meta::blockstore_init_meta(blockstore *bs)
|
||||||
{
|
{
|
||||||
|
@ -92,12 +93,223 @@ blockstore_init_journal::blockstore_init_journal(blockstore *bs)
|
||||||
this->bs = bs;
|
this->bs = bs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool iszero(uint64_t *buf, int len)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < len; i++)
|
||||||
|
if (buf[i] != 0)
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline uint32_t je_crc32(journal_entry *je)
|
||||||
|
{
|
||||||
|
return crc32c_zero4(((uint8_t*)je)+4, je->size-4);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define JOURNAL_BUFFER_SIZE 4*1024*1024
|
||||||
|
|
||||||
int blockstore_init_journal::read_loop()
|
int blockstore_init_journal::read_loop()
|
||||||
{
|
{
|
||||||
|
if (step == 100)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
if (!journal_buffer)
|
if (!journal_buffer)
|
||||||
{
|
{
|
||||||
journal_buffer = new uint8_t[4*1024*1024];
|
journal_buffer = (uint8_t*)memalign(DISK_ALIGNMENT, 2*JOURNAL_BUFFER_SIZE);
|
||||||
}
|
}
|
||||||
|
if (step == 0)
|
||||||
return 0;
|
{
|
||||||
|
// Step 1: Read first block of the journal
|
||||||
|
struct io_uring_sqe *sqe = io_uring_get_sqe(bs->ring);
|
||||||
|
if (!sqe)
|
||||||
|
{
|
||||||
|
throw new std::runtime_error("io_uring is full while trying to read journal");
|
||||||
|
}
|
||||||
|
submit_iov = { journal_buffer, 512 };
|
||||||
|
io_uring_prep_readv(sqe, bs->journal_fd, &submit_iov, 1, bs->journal_offset);
|
||||||
|
io_uring_submit(bs->ring);
|
||||||
|
step = 1;
|
||||||
|
}
|
||||||
|
if (step == 1)
|
||||||
|
{
|
||||||
|
// Step 2: Get the completion event and check the beginning for <START> entry
|
||||||
|
struct io_uring_cqe *cqe;
|
||||||
|
io_uring_peek_cqe(bs->ring, &cqe);
|
||||||
|
if (cqe)
|
||||||
|
{
|
||||||
|
if (cqe->res < 0)
|
||||||
|
{
|
||||||
|
throw new std::runtime_error(
|
||||||
|
std::string("read journal failed at offset ") + std::to_string(0) +
|
||||||
|
std::string(": ") + strerror(-cqe->res)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (iszero((uint64_t*)journal_buffer, 3))
|
||||||
|
{
|
||||||
|
// Journal is empty
|
||||||
|
bs->journal_start = 0;
|
||||||
|
bs->journal_end = 0;
|
||||||
|
step = 99;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// First block always contains a single JE_START entry
|
||||||
|
journal_entry_start *je = (journal_entry_start*)journal_buffer;
|
||||||
|
if (je->magic != JOURNAL_MAGIC ||
|
||||||
|
je->type != JE_START ||
|
||||||
|
je->size != sizeof(journal_entry_start) ||
|
||||||
|
je_crc32((journal_entry*)je) != je->crc32)
|
||||||
|
{
|
||||||
|
// Entry is corrupt
|
||||||
|
throw new std::runtime_error("first entry of the journal is corrupt");
|
||||||
|
}
|
||||||
|
journal_pos = bs->journal_start = je->journal_start;
|
||||||
|
crc32_last = je->crc32_replaced;
|
||||||
|
step = 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (step == 2 || step == 3)
|
||||||
|
{
|
||||||
|
// Step 3: Read journal
|
||||||
|
if (submitted)
|
||||||
|
{
|
||||||
|
struct io_uring_cqe *cqe;
|
||||||
|
io_uring_peek_cqe(bs->ring, &cqe);
|
||||||
|
if (cqe)
|
||||||
|
{
|
||||||
|
if (cqe->res < 0)
|
||||||
|
{
|
||||||
|
throw new std::runtime_error(
|
||||||
|
std::string("read journal failed at offset ") + std::to_string(journal_pos) +
|
||||||
|
std::string(": ") + strerror(-cqe->res)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
done_pos = journal_pos;
|
||||||
|
done_buf = submitted;
|
||||||
|
done_len = cqe->res;
|
||||||
|
journal_pos += cqe->res;
|
||||||
|
if (journal_pos >= bs->journal_len)
|
||||||
|
{
|
||||||
|
// Continue from the beginning
|
||||||
|
journal_pos = 512;
|
||||||
|
}
|
||||||
|
submitted = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!submitted && step != 3)
|
||||||
|
{
|
||||||
|
struct io_uring_sqe *sqe = io_uring_get_sqe(bs->ring);
|
||||||
|
if (!sqe)
|
||||||
|
{
|
||||||
|
throw new std::runtime_error("io_uring is full while trying to read journal");
|
||||||
|
}
|
||||||
|
uint64_t end = bs->journal_len;
|
||||||
|
if (journal_pos < bs->journal_start)
|
||||||
|
{
|
||||||
|
end = bs->journal_start;
|
||||||
|
}
|
||||||
|
submit_iov = {
|
||||||
|
journal_buffer + (done_buf == 1 ? JOURNAL_BUFFER_SIZE : 0),
|
||||||
|
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
|
||||||
|
};
|
||||||
|
io_uring_prep_readv(sqe, bs->journal_fd, &submit_iov, 1, bs->journal_offset + journal_pos);
|
||||||
|
io_uring_submit(bs->ring);
|
||||||
|
submitted = done_buf == 1 ? 2 : 1;
|
||||||
|
}
|
||||||
|
if (done_buf && step != 3)
|
||||||
|
{
|
||||||
|
// handle journal entries
|
||||||
|
if (handle_journal(journal_buffer + (done_buf == 1 ? 0 : JOURNAL_BUFFER_SIZE), done_len) == 0)
|
||||||
|
{
|
||||||
|
// finish
|
||||||
|
step = 3;
|
||||||
|
}
|
||||||
|
done_buf = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (step == 99)
|
||||||
|
{
|
||||||
|
free(journal_buffer);
|
||||||
|
journal_buffer = NULL;
|
||||||
|
step = 100;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int blockstore_init_journal::handle_journal(void *buf, int len)
|
||||||
|
{
|
||||||
|
int total_pos = 0;
|
||||||
|
while (total_pos < len)
|
||||||
|
{
|
||||||
|
int pos = 0, skip = 0;
|
||||||
|
while (pos < 512)
|
||||||
|
{
|
||||||
|
journal_entry *je = (journal_entry*)((uint8_t*)buf + total_pos + pos);
|
||||||
|
if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
|
||||||
|
je->type < JE_SMALL_WRITE || je->type > JE_DELETE || je->crc32_prev != crc32_last)
|
||||||
|
{
|
||||||
|
// Invalid entry - end of the journal
|
||||||
|
bs->journal_end = done_pos + total_pos + pos;
|
||||||
|
// FIXME: save <skip>
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
pos += je->size;
|
||||||
|
if (je->type == JE_SMALL_WRITE)
|
||||||
|
{
|
||||||
|
// oid, version, offset, len
|
||||||
|
bs->dirty_queue[je->small_write.oid].push_back((dirty_entry){
|
||||||
|
.version = je->small_write.version,
|
||||||
|
.state = ST_J_SYNCED,
|
||||||
|
.flags = 0,
|
||||||
|
// FIXME: data in journal may never be non-contiguous
|
||||||
|
.location = done_pos + total_pos + 512 + skip,
|
||||||
|
.offset = je->small_write.offset,
|
||||||
|
.size = je->small_write.len,
|
||||||
|
});
|
||||||
|
skip += je->small_write.len;
|
||||||
|
}
|
||||||
|
else if (je->type == JE_BIG_WRITE)
|
||||||
|
{
|
||||||
|
// oid, version, block
|
||||||
|
bs->dirty_queue[je->big_write.oid].push_back((dirty_entry){
|
||||||
|
.version = je->big_write.version,
|
||||||
|
.state = ST_D_META_SYNCED,
|
||||||
|
.flags = 0,
|
||||||
|
.location = je->big_write.block,
|
||||||
|
.offset = 0,
|
||||||
|
.size = bs->block_size,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else if (je->type == JE_STABLE)
|
||||||
|
{
|
||||||
|
// oid, version
|
||||||
|
auto it = bs->dirty_queue.find(je->stable.oid);
|
||||||
|
if (it == bs->dirty_queue.end())
|
||||||
|
{
|
||||||
|
// FIXME ignore entry, but warn
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
auto & lst = it->second;
|
||||||
|
for (int i = 0; i < lst.size(); i++)
|
||||||
|
{
|
||||||
|
if (lst[i].version == je->stable.version)
|
||||||
|
{
|
||||||
|
lst[i].state = (lst[i].state == ST_D_META_SYNCED ? ST_D_STABLE : ST_J_STABLE);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (je->type == JE_DELETE)
|
||||||
|
{
|
||||||
|
// oid, version
|
||||||
|
// FIXME
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total_pos += 512 + skip;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,20 +3,27 @@
|
||||||
class blockstore_init_meta
|
class blockstore_init_meta
|
||||||
{
|
{
|
||||||
blockstore *bs;
|
blockstore *bs;
|
||||||
uint8_t *metadata_buffer;
|
uint8_t *metadata_buffer = NULL;
|
||||||
uint64_t metadata_read = 0;
|
uint64_t metadata_read = 0;
|
||||||
struct iovec submit_iov;
|
struct iovec submit_iov;
|
||||||
int prev = 0, prev_done = 0, done_len = 0, submitted = 0, done_cnt = 0;
|
int prev = 0, prev_done = 0, done_len = 0, submitted = 0, done_cnt = 0;
|
||||||
|
void handle_entries(struct clean_disk_entry* entries, int count);
|
||||||
public:
|
public:
|
||||||
blockstore_init_meta(blockstore* bs);
|
blockstore_init_meta(blockstore* bs);
|
||||||
int read_loop();
|
int read_loop();
|
||||||
void handle_entries(struct clean_disk_entry* entries, int count);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class blockstore_init_journal
|
class blockstore_init_journal
|
||||||
{
|
{
|
||||||
blockstore *bs;
|
blockstore *bs;
|
||||||
uint8_t *journal_buffer;
|
uint8_t *journal_buffer = NULL;
|
||||||
|
int step = 0;
|
||||||
|
uint32_t crc32_last = 0;
|
||||||
|
struct iovec submit_iov;
|
||||||
|
uint64_t done_pos = 0, journal_pos = 0;
|
||||||
|
uint64_t cur_skip = 0;
|
||||||
|
int submitted = 0, done_buf = 0, done_len = 0;
|
||||||
|
int handle_journal(void *buf, int len);
|
||||||
public:
|
public:
|
||||||
blockstore_init_journal(blockstore* bs);
|
blockstore_init_journal(blockstore* bs);
|
||||||
int read_loop();
|
int read_loop();
|
||||||
|
|
|
@ -12,22 +12,23 @@
|
||||||
#define JE_STABLE 0x04
|
#define JE_STABLE 0x04
|
||||||
#define JE_DELETE 0x05
|
#define JE_DELETE 0x05
|
||||||
|
|
||||||
|
// crc32c comes first to ease calculation and is equal to crc32()
|
||||||
struct __attribute__((__packed__)) journal_entry_start
|
struct __attribute__((__packed__)) journal_entry_start
|
||||||
{
|
{
|
||||||
|
uint32_t crc32;
|
||||||
uint16_t magic;
|
uint16_t magic;
|
||||||
uint16_t type;
|
uint16_t type;
|
||||||
uint32_t size;
|
uint32_t size;
|
||||||
uint32_t crc32;
|
uint32_t crc32_replaced;
|
||||||
uint32_t reserved1;
|
uint64_t journal_start;
|
||||||
uint64_t offset;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct __attribute__((__packed__)) journal_entry_small_write
|
struct __attribute__((__packed__)) journal_entry_small_write
|
||||||
{
|
{
|
||||||
|
uint32_t crc32;
|
||||||
uint16_t magic;
|
uint16_t magic;
|
||||||
uint16_t type;
|
uint16_t type;
|
||||||
uint32_t size;
|
uint32_t size;
|
||||||
uint32_t crc32;
|
|
||||||
uint32_t crc32_prev;
|
uint32_t crc32_prev;
|
||||||
object_id oid;
|
object_id oid;
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
|
@ -38,10 +39,10 @@ struct __attribute__((__packed__)) journal_entry_small_write
|
||||||
|
|
||||||
struct __attribute__((__packed__)) journal_entry_big_write
|
struct __attribute__((__packed__)) journal_entry_big_write
|
||||||
{
|
{
|
||||||
|
uint32_t crc32;
|
||||||
uint16_t magic;
|
uint16_t magic;
|
||||||
uint16_t type;
|
uint16_t type;
|
||||||
uint32_t size;
|
uint32_t size;
|
||||||
uint32_t crc32;
|
|
||||||
uint32_t crc32_prev;
|
uint32_t crc32_prev;
|
||||||
object_id oid;
|
object_id oid;
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
|
@ -50,10 +51,10 @@ struct __attribute__((__packed__)) journal_entry_big_write
|
||||||
|
|
||||||
struct __attribute__((__packed__)) journal_entry_stable
|
struct __attribute__((__packed__)) journal_entry_stable
|
||||||
{
|
{
|
||||||
|
uint32_t crc32;
|
||||||
uint16_t magic;
|
uint16_t magic;
|
||||||
uint16_t type;
|
uint16_t type;
|
||||||
uint32_t size;
|
uint32_t size;
|
||||||
uint32_t crc32;
|
|
||||||
uint32_t crc32_prev;
|
uint32_t crc32_prev;
|
||||||
object_id oid;
|
object_id oid;
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
|
@ -61,10 +62,10 @@ struct __attribute__((__packed__)) journal_entry_stable
|
||||||
|
|
||||||
struct __attribute__((__packed__)) journal_entry_del
|
struct __attribute__((__packed__)) journal_entry_del
|
||||||
{
|
{
|
||||||
|
uint32_t crc32;
|
||||||
uint16_t magic;
|
uint16_t magic;
|
||||||
uint16_t type;
|
uint16_t type;
|
||||||
uint32_t size;
|
uint32_t size;
|
||||||
uint32_t crc32;
|
|
||||||
uint32_t crc32_prev;
|
uint32_t crc32_prev;
|
||||||
object_id oid;
|
object_id oid;
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
|
@ -76,10 +77,11 @@ struct __attribute__((__packed__)) journal_entry
|
||||||
{
|
{
|
||||||
struct __attribute__((__packed__))
|
struct __attribute__((__packed__))
|
||||||
{
|
{
|
||||||
|
uint32_t crc32;
|
||||||
uint16_t magic;
|
uint16_t magic;
|
||||||
uint16_t type;
|
uint16_t type;
|
||||||
uint32_t size;
|
uint32_t size;
|
||||||
uint32_t crc32;
|
uint32_t crc32_prev;
|
||||||
};
|
};
|
||||||
journal_entry_start start;
|
journal_entry_start start;
|
||||||
journal_entry_small_write small_write;
|
journal_entry_small_write small_write;
|
||||||
|
|
15
crc32c.c
15
crc32c.c
|
@ -96,3 +96,18 @@ uint32_t crc32c(uint8_t *buf, int len)
|
||||||
}
|
}
|
||||||
return crc^0xffffffff;
|
return crc^0xffffffff;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint32_t crc32c_zero4(uint8_t *buf, int len)
|
||||||
|
{
|
||||||
|
uint32_t crc = 0xffffffff;
|
||||||
|
// pretend that first 4 bytes are zero
|
||||||
|
crc = (crc>>8) ^ crctable[(crc ^ 0) & 0xFF];
|
||||||
|
crc = (crc>>8) ^ crctable[(crc ^ 0) & 0xFF];
|
||||||
|
crc = (crc>>8) ^ crctable[(crc ^ 0) & 0xFF];
|
||||||
|
crc = (crc>>8) ^ crctable[(crc ^ 0) & 0xFF];
|
||||||
|
while (len-- > 0)
|
||||||
|
{
|
||||||
|
crc = (crc>>8) ^ crctable[(crc ^ (*buf++)) & 0xFF];
|
||||||
|
}
|
||||||
|
return crc^0xffffffff;
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue