// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details)

#include "osd_primary.h"

// read: read directly or read paired stripe(s), reconstruct, return
// write: read paired stripe(s), reconstruct, modify, calculate parity, write
//
// nuance: take care to read the same version from paired stripes!
// to do so, we remember "last readable" version until a write request completes
// and we postpone other write requests to the same stripe until completion of previous ones
//
// sync: sync peers, get unstable versions, stabilize them
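
// Illustration of the offset -> PG mapping below (numbers are hypothetical,
// not from any real config): with bs_block_size = 128 KiB and an EC/XOR pool
// with pg_minsize = 2, pg_block_size = 2*128 KiB = 256 KiB. A request at
// offset 300 KiB then belongs to the object at stripe 256 KiB, and the PG
// number is (inode + stripe/pg_stripe_size) modulo the pool's PG count, +1.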
bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{
    // PG number is calculated from the offset
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
    // K = pg_minsize in case of EC/XOR, or 1 for replicated pools
    pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
    auto pool_cfg_it = st_cli.pool_config.find(pool_id);
    if (pool_cfg_it == st_cli.pool_config.end())
    {
        // Pool config is not loaded yet
        finish_op(cur_op, -EPIPE);
        return false;
    }
    auto & pool_cfg = pool_cfg_it->second;
    uint64_t pg_block_size = bs_block_size * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize);
    object_id oid = {
        .inode = cur_op->req.rw.inode,
        // oid.stripe = starting offset of the parity stripe
        .stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
    };
    pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pool_cfg.pg_stripe_size) % pg_counts[pool_id] + 1;
    auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
    {
        // This OSD is not primary for this PG or the PG is inactive
        finish_op(cur_op, -EPIPE);
        return false;
    }
    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
        (cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
        (cur_op->req.rw.len % bs_disk_alignment) != 0)
    {
        finish_op(cur_op, -EINVAL);
        return false;
    }
    osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die(
        1, sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size)
    );
    op_data->pg_num = pg_num;
    op_data->oid = oid;
    op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
    op_data->scheme = pool_cfg.scheme;
    cur_op->op_data = op_data;
    split_stripes((pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_minsize),
        bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
    pg_it->second.inflight++;
    return true;
}

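// Return the OSD set from which the object should currently be read:
// the recorded read_target if the object is incomplete, degraded or misplaced,
// or the default set "def" (normally pg.cur_set) for clean objects.
// *object_state is set to the object's state entry, or NULL for clean objects.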
static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
{
    if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
    {
        *object_state = NULL;
        return def;
    }
    auto st_it = pg.incomplete_objects.find(oid);
    if (st_it != pg.incomplete_objects.end())
    {
        *object_state = st_it->second;
        return st_it->second->read_target.data();
    }
    st_it = pg.degraded_objects.find(oid);
    if (st_it != pg.degraded_objects.end())
    {
        *object_state = st_it->second;
        return st_it->second->read_target.data();
    }
    st_it = pg.misplaced_objects.find(oid);
    if (st_it != pg.misplaced_objects.end())
    {
        *object_state = st_it->second;
        return st_it->second->read_target.data();
    }
    *object_state = NULL;
    return def;
}

void osd_t::continue_primary_read(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == 1) goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    {
        auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
        for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize); role++)
        {
            op_data->stripes[role].read_start = op_data->stripes[role].req_start;
            op_data->stripes[role].read_end = op_data->stripes[role].req_end;
        }
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
            cur_op->buf = alloc_read_buffer(op_data->stripes,
                (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize), 0);
            submit_primary_subops(SUBMIT_READ, op_data->target_ver,
                (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : pg.pg_minsize), pg.cur_set.data(), cur_op);
            op_data->st = 1;
        }
        else
        {
            // PG may be degraded or have misplaced objects
            uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
            if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
            {
                finish_op(cur_op, -EIO);
                return;
            }
            // Submit reads
            op_data->pg_minsize = pg.pg_minsize;
            op_data->pg_size = pg.pg_size;
            op_data->degraded = 1;
            cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
            submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op);
            op_data->st = 1;
        }
    }
resume_1:
    return;
resume_2:
    if (op_data->errors > 0)
    {
        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
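    // A reminder of how XOR reconstruction below works (illustrative): with
    // 2+1 XOR, any single missing stripe equals the XOR of the two remaining
    // ones, so a degraded read fetches the surviving stripes and XORs them.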
    if (op_data->degraded)
    {
        // Reconstruct missing stripes
        // FIXME: Always EC(k+1) by now. Add different coding schemes
        osd_rmw_stripe_t *stripes = op_data->stripes;
        for (int role = 0; role < op_data->pg_minsize; role++)
        {
            if (stripes[role].read_end != 0 && stripes[role].missing)
            {
                reconstruct_stripe_xor(stripes, op_data->pg_size, role);
            }
            if (stripes[role].req_end != 0)
            {
                // Send buffer in parts to avoid copying
                cur_op->iov.push_back(
                    stripes[role].read_buf + (stripes[role].req_start - stripes[role].read_start),
                    stripes[role].req_end - stripes[role].req_start
                );
            }
        }
    }
    else
    {
        cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
    }
    finish_op(cur_op, cur_op->req.rw.len);
}

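// Check whether a write to the object may proceed right now. If a flush or
// another write for the same object is already in progress, the operation is
// parked in pg.write_queue and false is returned; it is resumed later from
// continue_primary_write()/continue_primary_del() when its predecessor ends.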
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    // Check if actions are pending for this object
    auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
        .oid = op_data->oid,
        .osd_num = 0,
    });
    if (act_it != pg.flush_actions.end() &&
        act_it->first.oid.inode == op_data->oid.inode &&
        (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
    {
        pg.write_queue.emplace(op_data->oid, cur_op);
        return false;
    }
    // Check if there are other write requests to the same object
    auto vo_it = pg.write_queue.find(op_data->oid);
    if (vo_it != pg.write_queue.end())
    {
        op_data->st = 1;
        pg.write_queue.emplace(op_data->oid, cur_op);
        return false;
    }
    pg.write_queue.emplace(op_data->oid, cur_op);
    return true;
}

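// Write state machine sketch: even st values mean "subops in flight", and
// subop completion appears to advance st by one, so the odd resume_* labels
// handle the results: 1 = queued behind another write, 2/3 = RMW reads,
// 4/5 = writes, 6/7 = stabilize, 8/9 = deletion of extra chunks,
// 10 = waiting for the PG epoch to be reported.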
void osd_t::continue_primary_write(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
    if (op_data->st == 1) goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
    else if (op_data->st == 6) goto resume_6;
    else if (op_data->st == 7) goto resume_7;
    else if (op_data->st == 8) goto resume_8;
    else if (op_data->st == 9) goto resume_9;
    else if (op_data->st == 10) goto resume_10;
    assert(op_data->st == 0);
    if (!check_write_queue(cur_op, pg))
    {
        return;
    }
resume_1:
    // Determine blocks to read and write
    // Missing chunks are allowed to be overwritten even in incomplete objects
    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Simplified algorithm
        op_data->stripes[0].write_start = op_data->stripes[0].req_start;
        op_data->stripes[0].write_end = op_data->stripes[0].req_end;
        op_data->stripes[0].write_buf = cur_op->buf;
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
            // Object is degraded/misplaced and will be moved to <write_osd_set>
            op_data->stripes[0].read_start = 0;
            op_data->stripes[0].read_end = bs_block_size;
            cur_op->rmw_buf = op_data->stripes[0].read_buf = memalign_or_die(MEM_ALIGNMENT, bs_block_size);
        }
    }
    else
    {
        cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
            pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
        if (!cur_op->rmw_buf)
        {
            // Refuse partial overwrite of an incomplete object
            cur_op->reply.hdr.retval = -EINVAL;
            goto continue_others;
        }
    }
    // Read required blocks
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
resume_2:
    op_data->st = 2;
    return;
resume_3:
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    // Save version override for parallel reads
    pg.ver_override[op_data->oid] = op_data->fact_ver;
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Only (possibly) copy new data from the request into the recovery buffer
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
            memcpy(
                op_data->stripes[0].read_buf + op_data->stripes[0].req_start,
                op_data->stripes[0].write_buf,
                op_data->stripes[0].req_end - op_data->stripes[0].req_start
            );
            op_data->stripes[0].write_buf = op_data->stripes[0].read_buf;
            op_data->stripes[0].write_start = 0;
            op_data->stripes[0].write_end = bs_block_size;
        }
    }
    else
    {
        // Recover missing stripes, calculate parity
        calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
    }
    // Send writes
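    // Object versions pack the PG epoch into the top PG_EPOCH_BITS bits and a
    // per-object counter into the remaining low bits. For illustration, if
    // PG_EPOCH_BITS were 16, the layout would be [16-bit epoch | 48-bit
    // counter]. A write either restarts the counter at 1 under a newer epoch
    // or just increments it; when the counter saturates, the epoch is bumped.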
    if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
    {
        op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
    }
    else
    {
        // If the version counter is exhausted, move on to the next epoch
        if ((op_data->fact_ver & ((1ul << (64-PG_EPOCH_BITS)) - 1)) == ((1ul << (64-PG_EPOCH_BITS)) - 1))
        {
            assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
            pg.epoch++;
        }
        op_data->target_ver = op_data->fact_ver + 1;
    }
    if (pg.epoch > pg.reported_epoch)
    {
        // Report newer epoch before writing
        // FIXME: We may report only one PG state here...
        this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        pg.history_changed = true;
        report_pg_states();
resume_10:
        if (pg.epoch > pg.reported_epoch)
        {
            op_data->st = 10;
            return;
        }
    }
    submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.pg_size, pg.cur_set.data(), cur_op);
resume_4:
    op_data->st = 4;
    return;
resume_5:
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
resume_6:
resume_7:
    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
    {
        // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
        return;
    }
    if (op_data->fact_ver == 1)
    {
        // Object is created
        pg.clean_count++;
        pg.total_count++;
    }
    if (op_data->object_state)
    {
        {
            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
            recovery_stat_count[0][recovery_type]++;
            if (!recovery_stat_count[0][recovery_type])
            {
                recovery_stat_count[0][recovery_type]++;
                recovery_stat_bytes[0][recovery_type] = 0;
            }
            for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
            {
                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
            }
        }
        if (op_data->object_state->state & OBJ_MISPLACED)
        {
            // Remove extra chunks
            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
            if (op_data->n_subops > 0)
            {
resume_8:
                op_data->st = 8;
                return;
resume_9:
                if (op_data->errors > 0)
                {
                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
                    return;
                }
            }
        }
        // Clear object state
        remove_object_from_state(op_data->oid, op_data->object_state, pg);
        pg.clean_count++;
    }
    cur_op->reply.hdr.retval = cur_op->req.rw.len;
continue_others:
    // Remove version override
    pg.ver_override.erase(op_data->oid);
    object_id oid = op_data->oid;
    finish_op(cur_op, cur_op->reply.hdr.retval);
    // Continue other write operations to the same object
    auto next_it = pg.write_queue.find(oid);
    auto this_it = next_it;
    if (this_it != pg.write_queue.end() && this_it->second == cur_op)
    {
        next_it++;
        pg.write_queue.erase(this_it);
        if (next_it != pg.write_queue.end() && next_it->first == oid)
        {
            osd_op_t *next_op = next_it->second;
            continue_primary_write(next_op);
        }
    }
}

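// Track the written object versions as "unstable" until they are stabilized.
// With immediate_commit == IMMEDIATE_ALL the versions are stabilized in place:
// the function returns false while STABILIZE subops are in flight and true
// once it is safe to proceed. Otherwise it just records the versions for a
// later SYNC and returns true immediately.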
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == base_state)
    {
        goto resume_6;
    }
    else if (op_data->st == base_state+1)
    {
        goto resume_7;
    }
    // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
    if (immediate_commit == IMMEDIATE_ALL)
    {
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Send STABILIZE ops immediately
            op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
            op_data->unstable_writes = new obj_ver_id[loc_set.size()];
            {
                int last_start = 0;
                for (auto & chunk: loc_set)
                {
                    op_data->unstable_writes[last_start] = (obj_ver_id){
                        .oid = {
                            .inode = op_data->oid.inode,
                            .stripe = op_data->oid.stripe | chunk.role,
                        },
                        .version = op_data->fact_ver,
                    };
                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                        .osd_num = chunk.osd_num,
                        .start = last_start,
                        .len = 1,
                    });
                    last_start++;
                }
            }
            submit_primary_stab_subops(cur_op);
resume_6:
            op_data->st = 6;
            return false;
resume_7:
            // FIXME: Free those in the destructor?
            delete op_data->unstable_write_osds;
            delete[] op_data->unstable_writes;
            op_data->unstable_writes = NULL;
            op_data->unstable_write_osds = NULL;
            if (op_data->errors > 0)
            {
                pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
                return false;
            }
        }
    }
    else
    {
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Remember version as unstable for EC/XOR
            for (auto & chunk: loc_set)
            {
                this->dirty_osds.insert(chunk.osd_num);
                this->unstable_writes[(osd_object_id_t){
                    .osd_num = chunk.osd_num,
                    .oid = {
                        .inode = op_data->oid.inode,
                        .stripe = op_data->oid.stripe | chunk.role,
                    },
                }] = op_data->fact_ver;
            }
        }
        else
        {
            // Only remember to sync OSDs for replicated pools
            for (auto & chunk: loc_set)
            {
                this->dirty_osds.insert(chunk.osd_num);
            }
        }
        // Remember PG as dirty to drop the connection when PG goes offline
        // (this is required because of the "lazy sync")
        c_cli.clients[cur_op->peer_fd]->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    }
    return true;
}

// Save and clear unstable_writes -> SYNC all -> STABLE all
void osd_t::continue_primary_sync(osd_op_t *cur_op)
{
    if (!cur_op->op_data)
    {
        cur_op->op_data = (osd_primary_op_data_t*)calloc_or_die(1, sizeof(osd_primary_op_data_t));
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == 1) goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
    else if (op_data->st == 6) goto resume_6;
    assert(op_data->st == 0);
    if (syncs_in_progress.size() > 0)
    {
        // Wait for previous syncs, if any
        // FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
        syncs_in_progress.push_back(cur_op);
        op_data->st = 1;
resume_1:
        return;
    }
    else
    {
        syncs_in_progress.push_back(cur_op);
    }
resume_2:
    if (dirty_osds.size() == 0)
    {
        // Nothing to sync
        goto finish;
    }
    // Save and clear unstable_writes
    // In theory it is possible to do this on a per-client basis, but it seems to be an unnecessary complication
    // It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
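    // this->unstable_writes is an ordered map keyed by (osd_num, oid), so one
    // pass over it yields contiguous per-OSD ranges. Illustration (entries are
    // hypothetical): writes on OSDs 2, 2, 5 become ranges {osd 2: start 0,
    // len 2} and {osd 5: start 2, len 1} over the flat unstable_writes array.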
    if (unstable_writes.size() > 0)
    {
        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
        op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
        osd_num_t last_osd = 0;
        int last_start = 0, last_end = 0;
        for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
        {
            if (last_osd != it->first.osd_num)
            {
                if (last_osd != 0)
                {
                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                        .osd_num = last_osd,
                        .start = last_start,
                        .len = last_end - last_start,
                    });
                }
                last_osd = it->first.osd_num;
                last_start = last_end;
            }
            op_data->unstable_writes[last_end] = (obj_ver_id){
                .oid = it->first.oid,
                .version = it->second,
            };
            last_end++;
        }
        if (last_osd != 0)
        {
            op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                .osd_num = last_osd,
                .start = last_start,
                .len = last_end - last_start,
            });
        }
        this->unstable_writes.clear();
    }
    {
        void *dirty_buf = malloc_or_die(sizeof(pool_pg_num_t)*dirty_pgs.size() + sizeof(osd_num_t)*dirty_osds.size());
        op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
        op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
        op_data->dirty_pg_count = dirty_pgs.size();
        op_data->dirty_osd_count = dirty_osds.size();
        int dpg = 0;
        for (auto dirty_pg_num: dirty_pgs)
        {
            pgs[dirty_pg_num].inflight++;
            op_data->dirty_pgs[dpg++] = dirty_pg_num;
        }
        dirty_pgs.clear();
        dpg = 0;
        for (auto osd_num: dirty_osds)
        {
            op_data->dirty_osds[dpg++] = osd_num;
        }
        dirty_osds.clear();
    }
    if (immediate_commit != IMMEDIATE_ALL)
    {
        // SYNC
        submit_primary_sync_subops(cur_op);
resume_3:
        op_data->st = 3;
        return;
resume_4:
        if (op_data->errors > 0)
        {
            goto resume_6;
        }
    }
    if (op_data->unstable_writes)
    {
        // Stabilize version sets, if any
        submit_primary_stab_subops(cur_op);
resume_5:
        op_data->st = 5;
        return;
    }
resume_6:
    if (op_data->errors > 0)
    {
        // Return PGs and OSDs back into their dirty sets
        for (int i = 0; i < op_data->dirty_pg_count; i++)
        {
            dirty_pgs.insert(op_data->dirty_pgs[i]);
        }
        for (int i = 0; i < op_data->dirty_osd_count; i++)
        {
            dirty_osds.insert(op_data->dirty_osds[i]);
        }
        if (op_data->unstable_writes)
        {
            // Return objects back into the unstable write set
            for (auto unstable_osd: *(op_data->unstable_write_osds))
            {
                for (int i = 0; i < unstable_osd.len; i++)
                {
                    // Except those from peered PGs
                    auto & w = op_data->unstable_writes[unstable_osd.start + i];
                    pool_pg_num_t wpg = {
                        .pool_id = INODE_POOL(w.oid.inode),
                        .pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
                    };
                    if (pgs[wpg].state & PG_ACTIVE)
                    {
                        uint64_t & dest = this->unstable_writes[(osd_object_id_t){
                            .osd_num = unstable_osd.osd_num,
                            .oid = w.oid,
                        }];
                        dest = dest < w.version ? w.version : dest;
                        dirty_pgs.insert(wpg);
                    }
                }
            }
        }
    }
    for (int i = 0; i < op_data->dirty_pg_count; i++)
    {
        auto & pg = pgs.at(op_data->dirty_pgs[i]);
        pg.inflight--;
        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
        {
            finish_stop_pg(pg);
        }
    }
    // FIXME: Free those in the destructor?
    free(op_data->dirty_pgs);
    op_data->dirty_pgs = NULL;
    op_data->dirty_osds = NULL;
    if (op_data->unstable_writes)
    {
        delete op_data->unstable_write_osds;
        delete[] op_data->unstable_writes;
        op_data->unstable_writes = NULL;
        op_data->unstable_write_osds = NULL;
    }
    if (op_data->errors > 0)
    {
        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
    }
    else
    {
finish:
        if (cur_op->peer_fd)
        {
            auto it = c_cli.clients.find(cur_op->peer_fd);
            if (it != c_cli.clients.end())
                it->second->dirty_pgs.clear();
        }
        finish_op(cur_op, 0);
    }
    assert(syncs_in_progress.front() == cur_op);
    syncs_in_progress.pop_front();
    if (syncs_in_progress.size() > 0)
    {
        cur_op = syncs_in_progress.front();
        op_data = cur_op->op_data;
        op_data->st++;
        goto resume_2;
    }
}

// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
{
    if (object_state->state & OBJ_INCOMPLETE)
    {
        // Successful write means that object is not incomplete anymore
        this->incomplete_objects--;
        pg.incomplete_objects.erase(oid);
        if (!pg.incomplete_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_INCOMPLETE;
            report_pg_state(pg);
        }
    }
    else if (object_state->state & OBJ_DEGRADED)
    {
        this->degraded_objects--;
        pg.degraded_objects.erase(oid);
        if (!pg.degraded_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_DEGRADED;
            report_pg_state(pg);
        }
    }
    else if (object_state->state & OBJ_MISPLACED)
    {
        this->misplaced_objects--;
        pg.misplaced_objects.erase(oid);
        if (!pg.misplaced_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_MISPLACED;
            report_pg_state(pg);
        }
    }
    else
    {
        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
    }
    object_state->object_count--;
    if (!object_state->object_count)
    {
        pg.state_dict.erase(object_state->osd_set);
    }
}

void osd_t::continue_primary_del(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
    if (op_data->st == 1) goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
    assert(op_data->st == 0);
    // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
    if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
    {
        finish_op(cur_op, -EBUSY);
        return;
    }
    if (!check_write_queue(cur_op, pg))
    {
        return;
    }
resume_1:
    // Determine which OSDs contain this object and delete it
    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    // Submit 1 read to determine the actual version number
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
resume_2:
    op_data->st = 2;
    return;
resume_3:
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    // Save version override for parallel reads
    pg.ver_override[op_data->oid] = op_data->fact_ver;
    // Submit deletes
    op_data->fact_ver++;
    submit_primary_del_subops(cur_op, NULL, 0, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
resume_4:
    op_data->st = 4;
    return;
resume_5:
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    // Remove version override
    pg.ver_override.erase(op_data->oid);
    // Adjust PG stats after "instant stabilize", because we need object_state above
    if (!op_data->object_state)
    {
        pg.clean_count--;
    }
    else
    {
        remove_object_from_state(op_data->oid, op_data->object_state, pg);
    }
    pg.total_count--;
    object_id oid = op_data->oid;
    finish_op(cur_op, cur_op->req.rw.len);
    // Continue other write operations to the same object
    auto next_it = pg.write_queue.find(oid);
    auto this_it = next_it;
    if (this_it != pg.write_queue.end() && this_it->second == cur_op)
    {
        next_it++;
        pg.write_queue.erase(this_it);
        if (next_it != pg.write_queue.end() &&
            next_it->first == oid)
        {
            osd_op_t *next_op = next_it->second;
            continue_primary_write(next_op);
        }
    }
}