forked from vitalif/vitastor
Implement blockstore rollback operation
Rollback operation is required for the primary OSD to kill unstable object versions in OSD peers so they don't occupy journal spaceblocking-uring-test
parent
98efdb78bd
commit
2b09710d6f
2
Makefile
2
Makefile
|
@ -1,5 +1,5 @@
|
||||||
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
|
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
|
||||||
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
|
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
|
||||||
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
|
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
|
||||||
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd
|
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd
|
||||||
clean:
|
clean:
|
||||||
|
|
|
@ -25,8 +25,8 @@
|
||||||
#define BS_OP_STABLE 4
|
#define BS_OP_STABLE 4
|
||||||
#define BS_OP_DELETE 5
|
#define BS_OP_DELETE 5
|
||||||
#define BS_OP_LIST 6
|
#define BS_OP_LIST 6
|
||||||
#define BS_OP_MAX 6
|
#define BS_OP_ROLLBACK 7
|
||||||
#define BS_OP_TYPE_MASK 0x7
|
#define BS_OP_MAX 7
|
||||||
|
|
||||||
#define BS_OP_PRIVATE_DATA_SIZE 256
|
#define BS_OP_PRIVATE_DATA_SIZE 256
|
||||||
|
|
||||||
|
|
|
@ -438,7 +438,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
|
||||||
{
|
{
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
}
|
}
|
||||||
dirty_it = dirty_end;
|
dirty_it = dirty_start = dirty_end;
|
||||||
v.clear();
|
v.clear();
|
||||||
wait_count = 0;
|
wait_count = 0;
|
||||||
copy_count = 0;
|
copy_count = 0;
|
||||||
|
@ -511,6 +511,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
|
||||||
);
|
);
|
||||||
throw std::runtime_error(err);
|
throw std::runtime_error(err);
|
||||||
}
|
}
|
||||||
|
dirty_start = dirty_it;
|
||||||
if (dirty_it == bs->dirty_db.begin())
|
if (dirty_it == bs->dirty_db.begin())
|
||||||
{
|
{
|
||||||
break;
|
break;
|
||||||
|
@ -593,38 +594,7 @@ void journal_flusher_co::update_clean_db()
|
||||||
.location = clean_loc,
|
.location = clean_loc,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
dirty_it = dirty_end;
|
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
|
|
||||||
{
|
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
printf("Free block %lu\n", dirty_it->second.location >> bs->block_order);
|
|
||||||
#endif
|
|
||||||
bs->data_alloc->set(dirty_it->second.location >> bs->block_order, false);
|
|
||||||
}
|
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
|
||||||
#endif
|
|
||||||
int used = --bs->journal.used_sectors[dirty_it->second.journal_sector];
|
|
||||||
if (used == 0)
|
|
||||||
{
|
|
||||||
bs->journal.used_sectors.erase(dirty_it->second.journal_sector);
|
|
||||||
}
|
|
||||||
if (dirty_it == bs->dirty_db.begin())
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
dirty_it--;
|
|
||||||
if (dirty_it->first.oid != cur.oid)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Then, basically, remove everything up to the current version from dirty_db...
|
|
||||||
if (dirty_it->first.oid != cur.oid)
|
|
||||||
dirty_it++;
|
|
||||||
bs->dirty_db.erase(dirty_it, std::next(dirty_end));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
||||||
|
|
|
@ -41,7 +41,7 @@ class journal_flusher_co
|
||||||
std::list<flusher_sync_t>::iterator cur_sync;
|
std::list<flusher_sync_t>::iterator cur_sync;
|
||||||
|
|
||||||
obj_ver_id cur;
|
obj_ver_id cur;
|
||||||
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_end;
|
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
|
||||||
std::map<object_id, uint64_t>::iterator repeat_it;
|
std::map<object_id, uint64_t>::iterator repeat_it;
|
||||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
||||||
|
|
||||||
|
|
|
@ -127,8 +127,7 @@ void blockstore_impl_t::loop()
|
||||||
}
|
}
|
||||||
else if (PRIV(op)->wait_for)
|
else if (PRIV(op)->wait_for)
|
||||||
{
|
{
|
||||||
if ((op->opcode & BS_OP_TYPE_MASK) == BS_OP_WRITE ||
|
if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
|
||||||
(op->opcode & BS_OP_TYPE_MASK) == BS_OP_DELETE)
|
|
||||||
{
|
{
|
||||||
has_writes = 2;
|
has_writes = 2;
|
||||||
}
|
}
|
||||||
|
@ -138,12 +137,11 @@ void blockstore_impl_t::loop()
|
||||||
unsigned ring_space = ringloop->space_left();
|
unsigned ring_space = ringloop->space_left();
|
||||||
unsigned prev_sqe_pos = ringloop->save();
|
unsigned prev_sqe_pos = ringloop->save();
|
||||||
int dequeue_op = 0;
|
int dequeue_op = 0;
|
||||||
if ((op->opcode & BS_OP_TYPE_MASK) == BS_OP_READ)
|
if (op->opcode == BS_OP_READ)
|
||||||
{
|
{
|
||||||
dequeue_op = dequeue_read(op);
|
dequeue_op = dequeue_read(op);
|
||||||
}
|
}
|
||||||
else if ((op->opcode & BS_OP_TYPE_MASK) == BS_OP_WRITE ||
|
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
|
||||||
(op->opcode & BS_OP_TYPE_MASK) == BS_OP_DELETE)
|
|
||||||
{
|
{
|
||||||
if (has_writes == 2)
|
if (has_writes == 2)
|
||||||
{
|
{
|
||||||
|
@ -153,7 +151,7 @@ void blockstore_impl_t::loop()
|
||||||
dequeue_op = dequeue_write(op);
|
dequeue_op = dequeue_write(op);
|
||||||
has_writes = dequeue_op ? 1 : 2;
|
has_writes = dequeue_op ? 1 : 2;
|
||||||
}
|
}
|
||||||
else if ((op->opcode & BS_OP_TYPE_MASK) == BS_OP_SYNC)
|
else if (op->opcode == BS_OP_SYNC)
|
||||||
{
|
{
|
||||||
// wait for all small writes to be submitted
|
// wait for all small writes to be submitted
|
||||||
// wait for all big writes to complete, submit data device fsync
|
// wait for all big writes to complete, submit data device fsync
|
||||||
|
@ -166,11 +164,15 @@ void blockstore_impl_t::loop()
|
||||||
}
|
}
|
||||||
dequeue_op = dequeue_sync(op);
|
dequeue_op = dequeue_sync(op);
|
||||||
}
|
}
|
||||||
else if ((op->opcode & BS_OP_TYPE_MASK) == BS_OP_STABLE)
|
else if (op->opcode == BS_OP_STABLE)
|
||||||
{
|
{
|
||||||
dequeue_op = dequeue_stable(op);
|
dequeue_op = dequeue_stable(op);
|
||||||
}
|
}
|
||||||
else if ((op->opcode & BS_OP_TYPE_MASK) == BS_OP_LIST)
|
else if (op->opcode == BS_OP_ROLLBACK)
|
||||||
|
{
|
||||||
|
dequeue_op = dequeue_rollback(op);
|
||||||
|
}
|
||||||
|
else if (op->opcode == BS_OP_LIST)
|
||||||
{
|
{
|
||||||
process_list(op);
|
process_list(op);
|
||||||
dequeue_op = true;
|
dequeue_op = true;
|
||||||
|
@ -296,15 +298,14 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||||
|
|
||||||
void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
|
void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
|
||||||
{
|
{
|
||||||
int type = op->opcode & BS_OP_TYPE_MASK;
|
if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
|
||||||
if (type < BS_OP_MIN || type > BS_OP_MAX ||
|
((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE) && (
|
||||||
((type == BS_OP_READ || type == BS_OP_WRITE) && (
|
|
||||||
op->offset >= block_size ||
|
op->offset >= block_size ||
|
||||||
op->len > block_size-op->offset ||
|
op->len > block_size-op->offset ||
|
||||||
(op->len % disk_alignment)
|
(op->len % disk_alignment)
|
||||||
)) ||
|
)) ||
|
||||||
readonly && type != BS_OP_READ ||
|
readonly && op->opcode != BS_OP_READ ||
|
||||||
first && type == BS_OP_WRITE)
|
first && op->opcode == BS_OP_WRITE)
|
||||||
{
|
{
|
||||||
// Basic verification not passed
|
// Basic verification not passed
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
|
@ -324,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
|
||||||
{
|
{
|
||||||
submit_queue.push_front(op);
|
submit_queue.push_front(op);
|
||||||
}
|
}
|
||||||
if (type == BS_OP_WRITE)
|
if (op->opcode == BS_OP_WRITE)
|
||||||
{
|
{
|
||||||
enqueue_write(op);
|
enqueue_write(op);
|
||||||
}
|
}
|
||||||
|
|
|
@ -167,6 +167,8 @@ struct blockstore_op_private_t
|
||||||
int sync_state, prev_sync_count;
|
int sync_state, prev_sync_count;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
|
||||||
|
|
||||||
#include "blockstore_init.h"
|
#include "blockstore_init.h"
|
||||||
|
|
||||||
#include "blockstore_flush.h"
|
#include "blockstore_flush.h"
|
||||||
|
@ -199,7 +201,7 @@ class blockstore_impl_t
|
||||||
// Another option is https://github.com/algorithm-ninja/cpp-btree
|
// Another option is https://github.com/algorithm-ninja/cpp-btree
|
||||||
spp::sparse_hash_map<object_id, clean_entry> clean_db;
|
spp::sparse_hash_map<object_id, clean_entry> clean_db;
|
||||||
uint8_t *clean_bitmap = NULL;
|
uint8_t *clean_bitmap = NULL;
|
||||||
std::map<obj_ver_id, dirty_entry> dirty_db;
|
blockstore_dirty_db_t dirty_db;
|
||||||
std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
|
std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
|
||||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||||
std::list<blockstore_op_t*> in_progress_syncs; // ...and probably here, too
|
std::list<blockstore_op_t*> in_progress_syncs; // ...and probably here, too
|
||||||
|
@ -278,6 +280,11 @@ class blockstore_impl_t
|
||||||
void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
|
void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
|
||||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||||
|
|
||||||
|
// Rollback
|
||||||
|
int dequeue_rollback(blockstore_op_t *op);
|
||||||
|
void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
|
||||||
|
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
|
||||||
|
|
||||||
// List
|
// List
|
||||||
void process_list(blockstore_op_t *op);
|
void process_list(blockstore_op_t *op);
|
||||||
|
|
||||||
|
|
|
@ -559,7 +559,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
if (it == bs->dirty_db.end())
|
if (it == bs->dirty_db.end())
|
||||||
{
|
{
|
||||||
// journal contains a legitimate STABLE entry for a non-existing dirty write
|
// journal contains a legitimate STABLE entry for a non-existing dirty write
|
||||||
// this probably means that journal was trimmed between WRITTEN and STABLE entries
|
// this probably means that journal was trimmed between WRITE and STABLE entries
|
||||||
// skip it
|
// skip it
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -583,6 +583,57 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
bs->unstable_writes.erase(unstab_it);
|
bs->unstable_writes.erase(unstab_it);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (je->type == JE_ROLLBACK)
|
||||||
|
{
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
|
||||||
|
#endif
|
||||||
|
// rollback dirty writes of <oid> up to <version>
|
||||||
|
auto it = bs->dirty_db.lower_bound((obj_ver_id){
|
||||||
|
.oid = je->rollback.oid,
|
||||||
|
.version = UINT64_MAX,
|
||||||
|
});
|
||||||
|
if (it != bs->dirty_db.begin())
|
||||||
|
{
|
||||||
|
uint64_t max_unstable = 0;
|
||||||
|
auto rm_start = it;
|
||||||
|
auto rm_end = it;
|
||||||
|
it--;
|
||||||
|
while (it->first.oid == je->rollback.oid &&
|
||||||
|
it->first.version > je->rollback.version &&
|
||||||
|
!IS_IN_FLIGHT(it->second.state) &&
|
||||||
|
!IS_STABLE(it->second.state))
|
||||||
|
{
|
||||||
|
if (it->first.oid != je->rollback.oid)
|
||||||
|
break;
|
||||||
|
else if (it->first.version <= je->rollback.version)
|
||||||
|
{
|
||||||
|
if (!IS_STABLE(it->second.state))
|
||||||
|
max_unstable = it->first.version;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if (IS_STABLE(it->second.state))
|
||||||
|
break;
|
||||||
|
// Remove entry
|
||||||
|
rm_start = it;
|
||||||
|
if (it == bs->dirty_db.begin())
|
||||||
|
break;
|
||||||
|
it--;
|
||||||
|
}
|
||||||
|
if (rm_start != rm_end)
|
||||||
|
{
|
||||||
|
bs->erase_dirty(rm_start, rm_end, UINT64_MAX);
|
||||||
|
}
|
||||||
|
auto unstab_it = bs->unstable_writes.find(je->rollback.oid);
|
||||||
|
if (unstab_it != bs->unstable_writes.end())
|
||||||
|
{
|
||||||
|
if (max_unstable == 0)
|
||||||
|
bs->unstable_writes.erase(unstab_it);
|
||||||
|
else
|
||||||
|
unstab_it->second = max_unstable;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
else if (je->type == JE_DELETE)
|
else if (je->type == JE_DELETE)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
#define JE_BIG_WRITE 0x03
|
#define JE_BIG_WRITE 0x03
|
||||||
#define JE_STABLE 0x04
|
#define JE_STABLE 0x04
|
||||||
#define JE_DELETE 0x05
|
#define JE_DELETE 0x05
|
||||||
|
#define JE_ROLLBACK 0x06
|
||||||
|
|
||||||
// crc32c comes first to ease calculation and is equal to crc32()
|
// crc32c comes first to ease calculation and is equal to crc32()
|
||||||
struct __attribute__((__packed__)) journal_entry_start
|
struct __attribute__((__packed__)) journal_entry_start
|
||||||
|
@ -71,6 +72,17 @@ struct __attribute__((__packed__)) journal_entry_stable
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct __attribute__((__packed__)) journal_entry_rollback
|
||||||
|
{
|
||||||
|
uint32_t crc32;
|
||||||
|
uint16_t magic;
|
||||||
|
uint16_t type;
|
||||||
|
uint32_t size;
|
||||||
|
uint32_t crc32_prev;
|
||||||
|
object_id oid;
|
||||||
|
uint64_t version;
|
||||||
|
};
|
||||||
|
|
||||||
struct __attribute__((__packed__)) journal_entry_del
|
struct __attribute__((__packed__)) journal_entry_del
|
||||||
{
|
{
|
||||||
uint32_t crc32;
|
uint32_t crc32;
|
||||||
|
@ -98,6 +110,7 @@ struct __attribute__((__packed__)) journal_entry
|
||||||
journal_entry_small_write small_write;
|
journal_entry_small_write small_write;
|
||||||
journal_entry_big_write big_write;
|
journal_entry_big_write big_write;
|
||||||
journal_entry_stable stable;
|
journal_entry_stable stable;
|
||||||
|
journal_entry_rollback rollback;
|
||||||
journal_entry_del del;
|
journal_entry_del del;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -0,0 +1,187 @@
|
||||||
|
#include "blockstore_impl.h"
|
||||||
|
|
||||||
|
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
||||||
|
{
|
||||||
|
obj_ver_id* v;
|
||||||
|
int i, todo = op->len;
|
||||||
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
|
{
|
||||||
|
// Check that there are some versions greater than v->version (which may be zero),
|
||||||
|
// check that they're unstable, synced, and not currently written to
|
||||||
|
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
|
||||||
|
.oid = v->oid,
|
||||||
|
.version = UINT64_MAX,
|
||||||
|
});
|
||||||
|
if (dirty_it == dirty_db.begin())
|
||||||
|
{
|
||||||
|
bad_op:
|
||||||
|
op->retval = -EINVAL;
|
||||||
|
FINISH_OP(op);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
dirty_it--;
|
||||||
|
if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
|
||||||
|
{
|
||||||
|
goto bad_op;
|
||||||
|
}
|
||||||
|
while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
|
||||||
|
{
|
||||||
|
if (!IS_SYNCED(dirty_it->second.state) ||
|
||||||
|
IS_STABLE(dirty_it->second.state))
|
||||||
|
{
|
||||||
|
goto bad_op;
|
||||||
|
}
|
||||||
|
if (dirty_it == dirty_db.begin())
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
dirty_it--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Check journal space
|
||||||
|
blockstore_journal_check_t space_check(this);
|
||||||
|
if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
// There is sufficient space. Get SQEs
|
||||||
|
struct io_uring_sqe *sqe[space_check.sectors_required];
|
||||||
|
for (i = 0; i < space_check.sectors_required; i++)
|
||||||
|
{
|
||||||
|
BS_SUBMIT_GET_SQE_DECL(sqe[i]);
|
||||||
|
}
|
||||||
|
// Prepare and submit journal entries
|
||||||
|
auto cb = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
|
||||||
|
int s = 0, cur_sector = -1;
|
||||||
|
if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_rollback) &&
|
||||||
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
{
|
||||||
|
if (cur_sector == -1)
|
||||||
|
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
cur_sector = journal.cur_sector;
|
||||||
|
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
|
||||||
|
}
|
||||||
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
|
{
|
||||||
|
// FIXME This is here only for the purpose of tracking unstable_writes. Remove if not required
|
||||||
|
// FIXME ...aaaand this is similar to blockstore_init.cpp - maybe dedup it?
|
||||||
|
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
|
||||||
|
.oid = v->oid,
|
||||||
|
.version = UINT64_MAX,
|
||||||
|
});
|
||||||
|
uint64_t max_unstable = 0;
|
||||||
|
while (dirty_it != dirty_db.begin())
|
||||||
|
{
|
||||||
|
dirty_it--;
|
||||||
|
if (dirty_it->first.oid != v->oid)
|
||||||
|
break;
|
||||||
|
else if (dirty_it->first.version <= v->version)
|
||||||
|
{
|
||||||
|
if (!IS_STABLE(dirty_it->second.state))
|
||||||
|
max_unstable = dirty_it->first.version;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
auto unstab_it = unstable_writes.find(v->oid);
|
||||||
|
if (unstab_it != unstable_writes.end())
|
||||||
|
{
|
||||||
|
if (max_unstable == 0)
|
||||||
|
unstable_writes.erase(unstab_it);
|
||||||
|
else
|
||||||
|
unstab_it->second = max_unstable;
|
||||||
|
}
|
||||||
|
journal_entry_rollback *je = (journal_entry_rollback*)
|
||||||
|
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
|
||||||
|
journal.sector_info[journal.cur_sector].dirty = false;
|
||||||
|
je->oid = v->oid;
|
||||||
|
je->version = v->version;
|
||||||
|
je->crc32 = je_crc32((journal_entry*)je);
|
||||||
|
journal.crc32_last = je->crc32;
|
||||||
|
if (cur_sector != journal.cur_sector)
|
||||||
|
{
|
||||||
|
if (cur_sector == -1)
|
||||||
|
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
cur_sector = journal.cur_sector;
|
||||||
|
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
PRIV(op)->pending_ops = s;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
|
||||||
|
{
|
||||||
|
live = true;
|
||||||
|
if (data->res != data->iov.iov_len)
|
||||||
|
{
|
||||||
|
throw std::runtime_error(
|
||||||
|
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
|
||||||
|
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
PRIV(op)->pending_ops--;
|
||||||
|
if (PRIV(op)->pending_ops == 0)
|
||||||
|
{
|
||||||
|
// Release used journal sectors
|
||||||
|
release_journal_sectors(op);
|
||||||
|
obj_ver_id* v;
|
||||||
|
int i;
|
||||||
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
|
{
|
||||||
|
// Erase dirty_db entries
|
||||||
|
auto rm_end = dirty_db.lower_bound((obj_ver_id){
|
||||||
|
.oid = v->oid,
|
||||||
|
.version = UINT64_MAX,
|
||||||
|
});
|
||||||
|
rm_end--;
|
||||||
|
auto rm_start = rm_end;
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
if (rm_end->first.oid != v->oid)
|
||||||
|
break;
|
||||||
|
else if (rm_end->first.version <= v->version)
|
||||||
|
break;
|
||||||
|
rm_start = rm_end;
|
||||||
|
if (rm_end == dirty_db.begin())
|
||||||
|
break;
|
||||||
|
rm_end--;
|
||||||
|
}
|
||||||
|
if (rm_end != rm_start)
|
||||||
|
erase_dirty(rm_start, rm_end, UINT64_MAX);
|
||||||
|
}
|
||||||
|
journal.trim();
|
||||||
|
// Acknowledge op
|
||||||
|
op->retval = 0;
|
||||||
|
FINISH_OP(op);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
|
||||||
|
{
|
||||||
|
auto dirty_it = dirty_end;
|
||||||
|
while (dirty_it != dirty_start)
|
||||||
|
{
|
||||||
|
dirty_it--;
|
||||||
|
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
|
||||||
|
{
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Free block %lu\n", dirty_it->second.location >> block_order);
|
||||||
|
#endif
|
||||||
|
data_alloc->set(dirty_it->second.location >> block_order, false);
|
||||||
|
}
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector,
|
||||||
|
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||||
|
#endif
|
||||||
|
int used = --journal.used_sectors[dirty_it->second.journal_sector];
|
||||||
|
if (used == 0)
|
||||||
|
{
|
||||||
|
journal.used_sectors.erase(dirty_it->second.journal_sector);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dirty_db.erase(dirty_start, dirty_end);
|
||||||
|
}
|
|
@ -145,7 +145,7 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
|
||||||
{
|
{
|
||||||
// Release used journal sectors
|
// Release used journal sectors
|
||||||
release_journal_sectors(op);
|
release_journal_sectors(op);
|
||||||
// First step: mark dirty_db entries as stable, acknowledge op completion
|
// Mark dirty_db entries as stable, acknowledge op completion
|
||||||
obj_ver_id* v;
|
obj_ver_id* v;
|
||||||
int i;
|
int i;
|
||||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
void blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
void blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
// Check or assign version number
|
// Check or assign version number
|
||||||
bool found = false, deleted = false, is_del = (op->opcode & BS_OP_TYPE_MASK) == BS_OP_DELETE;
|
bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
|
||||||
uint64_t version = 1;
|
uint64_t version = 1;
|
||||||
if (dirty_db.size() > 0)
|
if (dirty_db.size() > 0)
|
||||||
{
|
{
|
||||||
|
|
|
@ -139,5 +139,9 @@ void osd_t::make_reply(osd_op_t *op)
|
||||||
op->reply.hdr.retval = op->bs_op.retval;
|
op->reply.hdr.retval = op->bs_op.retval;
|
||||||
if (op->op.hdr.opcode == OSD_OP_SECONDARY_LIST)
|
if (op->op.hdr.opcode == OSD_OP_SECONDARY_LIST)
|
||||||
op->reply.sec_list.stable_count = op->bs_op.version;
|
op->reply.sec_list.stable_count = op->bs_op.version;
|
||||||
|
else if (op->op.hdr.opcode == OSD_OP_SECONDARY_WRITE)
|
||||||
|
op->reply.sec_rw.version = op->bs_op.version;
|
||||||
|
else if (op->op.hdr.opcode == OSD_OP_SECONDARY_DELETE)
|
||||||
|
op->reply.sec_del.version = op->bs_op.version;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -64,6 +64,8 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
|
||||||
struct __attribute__((__packed__)) osd_reply_secondary_rw_t
|
struct __attribute__((__packed__)) osd_reply_secondary_rw_t
|
||||||
{
|
{
|
||||||
osd_reply_header_t header;
|
osd_reply_header_t header;
|
||||||
|
// for writes: assigned version number
|
||||||
|
uint64_t version;
|
||||||
};
|
};
|
||||||
|
|
||||||
// delete object on the secondary OSD
|
// delete object on the secondary OSD
|
||||||
|
@ -79,6 +81,7 @@ struct __attribute__((__packed__)) osd_op_secondary_del_t
|
||||||
struct __attribute__((__packed__)) osd_reply_secondary_del_t
|
struct __attribute__((__packed__)) osd_reply_secondary_del_t
|
||||||
{
|
{
|
||||||
osd_reply_header_t header;
|
osd_reply_header_t header;
|
||||||
|
uint64_t version;
|
||||||
};
|
};
|
||||||
|
|
||||||
// sync to the secondary OSD
|
// sync to the secondary OSD
|
||||||
|
|
Loading…
Reference in New Issue