Fix two potential read/write ordering problems (not yet seen in tests, but possible in theory)

- Write operations could be "stabilized", and previous object versions could be
  purged from OSDs, before the removal of the version override (ver_override),
  so subsequent reads could potentially hit different versions of data and
  parity in EC pools
- The object was marked clean only after the delete completed during recovery,
  so reads could in theory still hit a deleted version and return nothing
rel-0.5
Vitaliy Filippov 2021-03-21 15:02:24 +03:00
parent 98b54ca948
commit 05db1308aa
3 changed files with 42 additions and 15 deletions
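For illustration, a minimal compilable sketch of the first problem (all names
here are hypothetical; only the ordering mirrors the patch): in EC pools,
parallel reads are pinned to the in-flight version through a version override.
Once a write is stabilized, OSDs may purge older versions, so the override has
to be erased before stabilization, not after it.

#include <cstdint>
#include <map>

using object_id = uint64_t;
using version_t = uint64_t;

std::map<object_id, version_t> ver_override;

// Placeholder: after stabilization, OSDs may purge versions older than <ver>
void stabilize(object_id /*oid*/, version_t /*ver*/) {}

// Buggy order (before this commit): a read issued between stabilize() and
// erase() still sees the override and may request an already-purged version
void finish_write_buggy(object_id oid, version_t fact_ver)
{
    stabilize(oid, fact_ver);
    ver_override.erase(oid);
}

// Fixed order: drop the override (and, per the second fix, the object's
// unclean recovery state) right after the write, before stabilizing
void finish_write_fixed(object_id oid, version_t fact_ver)
{
    ver_override.erase(oid);
    stabilize(oid, fact_ver);
}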

@@ -198,6 +198,7 @@ class osd_t
     void continue_primary_del(osd_op_t *cur_op);
     bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
     void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
+    void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
     bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
     void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
     void handle_primary_bs_subop(osd_op_t *subop);

@@ -18,7 +18,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
     // Our EC scheme stores data in fixed chunks equal to (K*block size)
     // K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
     pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
-    // FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
+    // Note: We read pool config here, so we must NOT change it when PGs are active
     auto pool_cfg_it = st_cli.pool_config.find(pool_id);
     if (pool_cfg_it == st_cli.pool_config.end())
     {
@@ -269,8 +269,6 @@ resume_3:
         pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
         return;
     }
-    // Save version override for parallel reads
-    pg.ver_override[op_data->oid] = op_data->fact_ver;
     if (op_data->scheme == POOL_SCHEME_REPLICATED)
     {
         // Only (possibly) copy new data from the request into the recovery buffer
@@ -289,6 +287,9 @@
     }
     else
     {
+        // For EC/XOR pools, save version override to make it impossible
+        // for parallel reads to read different versions of data and parity
+        pg.ver_override[op_data->oid] = op_data->fact_ver;
         // Recover missing stripes, calculate parity
         if (pg.scheme == POOL_SCHEME_XOR)
         {
@@ -332,8 +333,22 @@
         op_data->st = 4;
         return;
 resume_5:
+    if (op_data->scheme != POOL_SCHEME_REPLICATED)
+    {
+        // Remove version override just after the write, but before stabilizing
+        pg.ver_override.erase(op_data->oid);
+    }
+    if (op_data->object_state)
+    {
+        // We must forget the unclean state of the object before deleting it
+        // so the next reads don't accidentally read a deleted version
+        // And it should be done at the same time as the removal of the version override
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        pg.clean_count++;
+    }
     if (op_data->errors > 0)
     {
+        free_object_state(pg, &op_data->object_state);
         pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
         return;
     }
@@ -342,6 +357,7 @@ resume_7:
     if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
     {
         // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
+        free_object_state(pg, &op_data->object_state);
         return;
     }
     if (op_data->fact_ver == 1)
@@ -390,10 +406,12 @@ resume_7:
                 copies_to_delete_after_sync_count++;
             }
         }
+        free_object_state(pg, &op_data->object_state);
     }
     else
     {
         submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
+        free_object_state(pg, &op_data->object_state);
         if (op_data->n_subops > 0)
         {
 resume_8:
@@ -407,14 +425,9 @@ resume_9:
                 }
             }
         }
-        // Clear object state
-        remove_object_from_state(op_data->oid, op_data->object_state, pg);
-        pg.clean_count++;
     }
     cur_op->reply.hdr.retval = cur_op->req.rw.len;
 continue_others:
-    // Remove version override
-    pg.ver_override.erase(op_data->oid);
     object_id oid = op_data->oid;
     // Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
     auto next_it = pg.write_queue.find(oid);
@@ -818,10 +831,14 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg)
     {
         throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
     }
-    object_state->object_count--;
-    if (!object_state->object_count)
+}
+
+void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
+{
+    if (*object_state && !(--(*object_state)->object_count))
     {
-        pg.state_dict.erase(object_state->osd_set);
+        pg.state_dict.erase((*object_state)->osd_set);
+        *object_state = NULL;
     }
 }
@@ -887,6 +904,7 @@ resume_5:
     else
     {
         remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        free_object_state(pg, &op_data->object_state);
     }
     pg.total_count--;
     object_id oid = op_data->oid;
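A note on the new helper pair (inferred from the diff above):
remove_object_from_state() no longer decrements object_count, so the write
path can detach an object from its unclean state early, while the new
free_object_state() releases the shared per-osd-set state later, once nothing
still needs it (e.g. submit_primary_del_subops(), which reads
object_state->osd_set). A compilable sketch of just the refcount mechanics,
with simplified, hypothetical stand-in types:

#include <cstdint>
#include <map>
#include <vector>

// Simplified stand-ins for pg_osd_set_state_t / pg_t (hypothetical)
struct osd_set_state_sketch_t
{
    std::vector<uint64_t> osd_set;  // also the key in state_dict
    uint64_t object_count = 0;      // number of objects sharing this state
};

struct pg_sketch_t
{
    std::map<std::vector<uint64_t>, osd_set_state_sketch_t> state_dict;
};

// Mirrors the new osd_t::free_object_state(): drop one reference and, when
// the last object leaves this osd-set state, erase the shared entry and
// clear the caller's pointer so the freed state cannot be touched again
void free_object_state_sketch(pg_sketch_t & pg, osd_set_state_sketch_t **object_state)
{
    if (*object_state && !(--(*object_state)->object_count))
    {
        pg.state_dict.erase((*object_state)->osd_set);
        *object_state = nullptr; // the real code uses NULL
    }
}

Passing a pointer-to-pointer lets one helper both free the state and
invalidate the caller's reference in a single step.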

@@ -2,6 +2,14 @@
 . `dirname $0`/common.sh
 
+if [ "$EC" != "" ]; then
+    POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
+    NOBJ=512
+else
+    POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
+    NOBJ=1024
+fi
+
 dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
 dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
 dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
@@ -28,7 +36,7 @@ cd ..
 node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
 MON_PID=$!
-$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
+$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
 sleep 2
@@ -52,7 +60,7 @@ try_change()
         echo --- Change PG count to $n --- >>testdata/osd$i.log
     done
-    $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
+    $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'
     for i in {1..10}; do
         ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
@@ -82,8 +90,8 @@ try_change()
     # Check that no objects are lost !
     nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
-    if [ "$nobj" -ne 1024 ]; then
-        format_error "Data lost after changing PG count to $n: 1024 objects expected, but got $nobj"
+    if [ "$nobj" -ne $NOBJ ]; then
+        format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
     fi
 }
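A note on the changed object counts (an inference, not stated in the commit
itself): per the comment in prepare_primary_rw() above, EC/XOR objects store
data in fixed chunks of K*block_size with K = pg_size - parity_chunks, so with
pg_size=3 and parity_chunks=1 each object holds K = 2 data blocks, and the
same amount of test data yields half as many objects: NOBJ=512 instead of the
replicated pool's 1024.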