diff --git a/src/osd.h b/src/osd.h
index 3a86efad6..01a17619a 100644
--- a/src/osd.h
+++ b/src/osd.h
@@ -198,6 +198,7 @@ class osd_t
     void continue_primary_del(osd_op_t *cur_op);
     bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
     void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
+    void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
     bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
     void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
     void handle_primary_bs_subop(osd_op_t *subop);
diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp
index cd12f395e..c08445363 100644
--- a/src/osd_primary.cpp
+++ b/src/osd_primary.cpp
@@ -18,7 +18,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
     // Our EC scheme stores data in fixed chunks equal to (K*block size)
     // K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
     pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
-    // FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
+    // Note: We read pool config here, so we must NOT change it when PGs are active
     auto pool_cfg_it = st_cli.pool_config.find(pool_id);
     if (pool_cfg_it == st_cli.pool_config.end())
     {
@@ -269,8 +269,6 @@ resume_3:
         pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
         return;
     }
-    // Save version override for parallel reads
-    pg.ver_override[op_data->oid] = op_data->fact_ver;
     if (op_data->scheme == POOL_SCHEME_REPLICATED)
     {
         // Only (possibly) copy new data from the request into the recovery buffer
@@ -289,6 +287,9 @@ resume_3:
     }
     else
     {
+        // For EC/XOR pools, save version override to make it impossible
+        // for parallel reads to read different versions of data and parity
+        pg.ver_override[op_data->oid] = op_data->fact_ver;
         // Recover missing stripes, calculate parity
         if (pg.scheme == POOL_SCHEME_XOR)
         {
@@ -332,8 +333,22 @@ resume_4:
     op_data->st = 4;
     return;
 resume_5:
+    if (op_data->scheme != POOL_SCHEME_REPLICATED)
+    {
+        // Remove version override just after the write, but before stabilizing
+        pg.ver_override.erase(op_data->oid);
+    }
+    if (op_data->object_state)
+    {
+        // We must forget the unclean state of the object before deleting it
+        // so the next reads don't accidentally read a deleted version
+        // And it should be done at the same time as the removal of the version override
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        pg.clean_count++;
+    }
     if (op_data->errors > 0)
     {
+        free_object_state(pg, &op_data->object_state);
         pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
         return;
     }
@@ -342,6 +357,7 @@ resume_7:
     if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
     {
         // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
+        free_object_state(pg, &op_data->object_state);
         return;
     }
     if (op_data->fact_ver == 1)
@@ -390,10 +406,12 @@ resume_7:
                     copies_to_delete_after_sync_count++;
                 }
             }
+            free_object_state(pg, &op_data->object_state);
         }
         else
         {
             submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
+            free_object_state(pg, &op_data->object_state);
             if (op_data->n_subops > 0)
             {
 resume_8:
@@ -407,14 +425,9 @@ resume_9:
                 }
             }
         }
-        // Clear object state
-        remove_object_from_state(op_data->oid, op_data->object_state, pg);
-        pg.clean_count++;
     }
     cur_op->reply.hdr.retval = cur_op->req.rw.len;
 continue_others:
-    // Remove version override
-    pg.ver_override.erase(op_data->oid);
     object_id oid = op_data->oid;
     // Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
     auto next_it = pg.write_queue.find(oid);
@@ -818,10 +831,14 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
     {
         throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
     }
-    object_state->object_count--;
-    if (!object_state->object_count)
+}
+
+void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
+{
+    if (*object_state && !(--(*object_state)->object_count))
     {
-        pg.state_dict.erase(object_state->osd_set);
+        pg.state_dict.erase((*object_state)->osd_set);
+        *object_state = NULL;
     }
 }
 
@@ -887,6 +904,7 @@ resume_5:
     else
     {
         remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        free_object_state(pg, &op_data->object_state);
     }
     pg.total_count--;
     object_id oid = op_data->oid;
diff --git a/tests/test_change_pg_count.sh b/tests/test_change_pg_count.sh
index 5ec682944..33a23e6cd 100755
--- a/tests/test_change_pg_count.sh
+++ b/tests/test_change_pg_count.sh
@@ -2,6 +2,14 @@
 
 . `dirname $0`/common.sh
 
+if [ "$EC" != "" ]; then
+    POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
+    NOBJ=512
+else
+    POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
+    NOBJ=1024
+fi
+
 dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
 dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
 dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
@@ -28,7 +36,7 @@ cd ..
 node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
 MON_PID=$!
 
-$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
+$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
 
 sleep 2
 
@@ -52,7 +60,7 @@ try_change()
         echo --- Change PG count to $n --- >>testdata/osd$i.log
     done
 
-    $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
+    $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'
 
     for i in {1..10}; do
         ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
@@ -82,8 +90,8 @@ try_change()
 
     # Check that no objects are lost !
     nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
-    if [ "$nobj" -ne 1024 ]; then
-        format_error "Data lost after changing PG count to $n: 1024 objects expected, but got $nobj"
+    if [ "$nobj" -ne $NOBJ ]; then
+        format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
     fi
 }
 