WIP OP_DELETE

blocking-uring-test
Vitaliy Filippov 2019-12-01 17:25:59 +03:00
parent 14b2f49f4d
commit a7a0946ba8
4 changed files with 75 additions and 11 deletions

View File

@ -29,19 +29,20 @@
// States are not stored on disk. Instead, they're deduced from the journal // States are not stored on disk. Instead, they're deduced from the journal
#define ST_IN_FLIGHT 1 #define ST_J_IN_FLIGHT 1
#define ST_J_SUBMITTED 2 #define ST_J_SUBMITTED 2
#define ST_J_WRITTEN 3 #define ST_J_WRITTEN 3
#define ST_J_SYNCED 4 #define ST_J_SYNCED 4
#define ST_J_STABLE 5 #define ST_J_STABLE 5
#define ST_D_IN_FLIGHT 15
#define ST_D_SUBMITTED 16 #define ST_D_SUBMITTED 16
#define ST_D_WRITTEN 17 #define ST_D_WRITTEN 17
#define ST_D_META_WRITTEN 19 #define ST_D_META_WRITTEN 19
#define ST_D_META_SYNCED 20 #define ST_D_META_SYNCED 20
#define ST_D_STABLE 21 #define ST_D_STABLE 21
#define ST_DEL_IN_FLIGHT 31
#define ST_DEL_SUBMITTED 32 #define ST_DEL_SUBMITTED 32
#define ST_DEL_WRITTEN 33 #define ST_DEL_WRITTEN 33
#define ST_DEL_SYNCED 34 #define ST_DEL_SYNCED 34
@ -49,9 +50,9 @@
#define ST_CURRENT 48 #define ST_CURRENT 48
#define IS_IN_FLIGHT(st) (st == ST_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED) #define IS_IN_FLIGHT(st) (st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
#define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT) #define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT)
#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_META_SYNCED) #define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_META_SYNCED || st == ST_DEL_SYNCED)
#define IS_JOURNAL(st) (st >= ST_J_SUBMITTED && st <= ST_J_STABLE) #define IS_JOURNAL(st) (st >= ST_J_SUBMITTED && st <= ST_J_STABLE)
#define IS_BIG_WRITE(st) (st >= ST_D_SUBMITTED && st <= ST_D_STABLE) #define IS_BIG_WRITE(st) (st >= ST_D_SUBMITTED && st <= ST_D_STABLE)
#define IS_DELETE(st) (st >= ST_DEL_SUBMITTED && st <= ST_DEL_STABLE) #define IS_DELETE(st) (st >= ST_DEL_SUBMITTED && st <= ST_DEL_STABLE)
@ -314,6 +315,7 @@ class blockstore
// Write // Write
void enqueue_write(blockstore_operation *op); void enqueue_write(blockstore_operation *op);
int dequeue_write(blockstore_operation *op); int dequeue_write(blockstore_operation *op);
int dequeue_del(blockstore_operation *op);
void handle_write_event(ring_data_t *data, blockstore_operation *op); void handle_write_event(ring_data_t *data, blockstore_operation *op);
// Sync // Sync

View File

@ -19,6 +19,14 @@
// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync. // 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
// WA = 1.012. Very good :) // WA = 1.012. Very good :)
// Stabilize delete:
// 1) Remove metadata entry and sync it
// 2) Remove dirty_db entry and clear previous journal entries
// Note that it will lead to problems in a degraded cluster, because deleting 2 of 3 replicas
// and restarting the last replica will then result in extra "missing" objects. To solve that
// we need to store the "tombstones" of deleted objects. We can't do that with current simple
// metadata storage so we'll skip TRIM implementation for now.
// AND We must do it in batches, for the sake of reduced fsync call count // AND We must do it in batches, for the sake of reduced fsync call count
// AND We must know what we stabilize. Basic workflow is like: // AND We must know what we stabilize. Basic workflow is like:
// 1) primary OSD receives sync request // 1) primary OSD receives sync request
@ -154,6 +162,10 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
{ {
dirty_it->second.state = ST_D_STABLE; dirty_it->second.state = ST_D_STABLE;
} }
else if (dirty_it->second.state == ST_DEL_SYNCED)
{
dirty_it->second.state = ST_DEL_STABLE;
}
else if (IS_STABLE(dirty_it->second.state)) else if (IS_STABLE(dirty_it->second.state))
{ {
break; break;

View File

@ -218,7 +218,7 @@ void blockstore::ack_one_sync(blockstore_operation *op)
#endif #endif
auto & unstab = unstable_writes[it->oid]; auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab; unstab = unstab < it->version ? it->version : unstab;
dirty_db[*it].state = ST_J_SYNCED; dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
} }
in_progress_syncs.erase(op->in_progress_ptr); in_progress_syncs.erase(op->in_progress_ptr);
op->retval = 0; op->retval = 0;

View File

@ -3,7 +3,7 @@
void blockstore::enqueue_write(blockstore_operation *op) void blockstore::enqueue_write(blockstore_operation *op)
{ {
// Assign version number // Assign version number
bool found = false; bool found = false, deleted = false, is_del = (op->flags & OP_TYPE_MASK) == OP_DELETE;
if (dirty_db.size() > 0) if (dirty_db.size() > 0)
{ {
auto dirty_it = dirty_db.upper_bound((obj_ver_id){ auto dirty_it = dirty_db.upper_bound((obj_ver_id){
@ -15,6 +15,7 @@ void blockstore::enqueue_write(blockstore_operation *op)
{ {
found = true; found = true;
op->version = dirty_it->first.version + 1; op->version = dirty_it->first.version + 1;
deleted = IS_DELETE(dirty_it->second.state);
} }
} }
if (!found) if (!found)
@ -26,22 +27,34 @@ void blockstore::enqueue_write(blockstore_operation *op)
} }
else else
{ {
deleted = true;
op->version = 1; op->version = 1;
} }
} }
if (deleted && is_del)
{
// Already deleted
op->retval = 0;
op->callback(op);
return;
}
// Immediately add the operation into dirty_db, so subsequent reads could see it // Immediately add the operation into dirty_db, so subsequent reads could see it
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf("Write %lu:%lu v%lu\n", op->oid.inode, op->oid.stripe, op->version); printf("%s %lu:%lu v%lu\n", is_del ? "Delete" : "Write", op->oid.inode, op->oid.stripe, op->version);
#endif #endif
dirty_db.emplace((obj_ver_id){ dirty_db.emplace((obj_ver_id){
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}, (dirty_entry){ }, (dirty_entry){
.state = ST_IN_FLIGHT, .state = (uint32_t)(
is_del
? ST_DEL_IN_FLIGHT
: (op->len == block_size || deleted ? ST_D_IN_FLIGHT : ST_J_IN_FLIGHT)
),
.flags = 0, .flags = 0,
.location = 0, .location = 0,
.offset = op->offset, .offset = is_del ? 0 : op->offset,
.len = op->len, .len = is_del ? 0 : op->len,
.journal_sector = 0, .journal_sector = 0,
}); });
} }
@ -53,7 +66,7 @@ int blockstore::dequeue_write(blockstore_operation *op)
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}); });
if (op->len == block_size || op->version == 1) if (dirty_it->second.state == ST_D_IN_FLIGHT)
{ {
// Big (redirect) write // Big (redirect) write
uint64_t loc = data_alloc->find_free(); uint64_t loc = data_alloc->find_free();
@ -222,3 +235,40 @@ void blockstore::handle_write_event(ring_data_t *data, blockstore_operation *op)
op->callback(op); op->callback(op);
} }
} }
int blockstore::dequeue_del(blockstore_operation *op)
{
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
{
return 0;
}
BS_SUBMIT_GET_ONLY_SQE(sqe);
// Prepare journal sector write
journal_entry_del *je = (journal_entry_del*)
prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
je->oid = op->oid;
je->version = op->version;
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
prepare_journal_sector_write(journal, sqe, cb);
op->min_used_journal_sector = op->max_used_journal_sector = 1 + journal.cur_sector;
op->pending_ops = 1;
dirty_it->second.state = ST_DEL_SUBMITTED;
// Remember small write as unsynced
unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
return 1;
}