Browse Source

Fix two potential read/write ordering problems (even though not yet seen in tests)

- Write operations could be 'stabilized' and previous versions could be
  purged from OSDs before the removal of version_override and following
  reads could potentially hit different version in EC pools
- Object was marked clean after completing the delete during recovery, so
  reads could in theory hit a deleted version and return nothing
rel-0.5
Vitaliy Filippov 2 years ago
parent
commit
05db1308aa
  1. 1
      src/osd.h
  2. 40
      src/osd_primary.cpp
  3. 16
      tests/test_change_pg_count.sh

1
src/osd.h

@ -198,6 +198,7 @@ class osd_t
void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
void handle_primary_bs_subop(osd_op_t *subop);

40
src/osd_primary.cpp

@ -18,7 +18,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
// Our EC scheme stores data in fixed chunks equal to (K*block size)
// K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
// FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
// Note: We read pool config here, so we must NOT change it when PGs are active
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
if (pool_cfg_it == st_cli.pool_config.end())
{
@ -269,8 +269,6 @@ resume_3:
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Only (possibly) copy new data from the request into the recovery buffer
@ -289,6 +287,9 @@ resume_3:
}
else
{
// For EC/XOR pools, save version override to make it impossible
// for parallel reads to read different versions of data and parity
pg.ver_override[op_data->oid] = op_data->fact_ver;
// Recover missing stripes, calculate parity
if (pg.scheme == POOL_SCHEME_XOR)
{
@ -332,8 +333,22 @@ resume_4:
op_data->st = 4;
return;
resume_5:
if (op_data->scheme != POOL_SCHEME_REPLICATED)
{
// Remove version override just after the write, but before stabilizing
pg.ver_override.erase(op_data->oid);
}
if (op_data->object_state)
{
// We must forget the unclean state of the object before deleting it
// so the next reads don't accidentally read a deleted version
// And it should be done at the same time as the removal of the version override
remove_object_from_state(op_data->oid, op_data->object_state, pg);
pg.clean_count++;
}
if (op_data->errors > 0)
{
free_object_state(pg, &op_data->object_state);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
@ -342,6 +357,7 @@ resume_7:
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
{
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
free_object_state(pg, &op_data->object_state);
return;
}
if (op_data->fact_ver == 1)
@ -390,10 +406,12 @@ resume_7:
copies_to_delete_after_sync_count++;
}
}
free_object_state(pg, &op_data->object_state);
}
else
{
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
free_object_state(pg, &op_data->object_state);
if (op_data->n_subops > 0)
{
resume_8:
@ -407,14 +425,9 @@ resume_9:
}
}
}
// Clear object state
remove_object_from_state(op_data->oid, op_data->object_state, pg);
pg.clean_count++;
}
cur_op->reply.hdr.retval = cur_op->req.rw.len;
continue_others:
// Remove version override
pg.ver_override.erase(op_data->oid);
object_id oid = op_data->oid;
// Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
auto next_it = pg.write_queue.find(oid);
@ -818,10 +831,14 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
{
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
}
object_state->object_count--;
if (!object_state->object_count)
}
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
{
if (*object_state && !(--(*object_state)->object_count))
{
pg.state_dict.erase(object_state->osd_set);
pg.state_dict.erase((*object_state)->osd_set);
*object_state = NULL;
}
}
@ -887,6 +904,7 @@ resume_5:
else
{
remove_object_from_state(op_data->oid, op_data->object_state, pg);
free_object_state(pg, &op_data->object_state);
}
pg.total_count--;
object_id oid = op_data->oid;

16
tests/test_change_pg_count.sh

@ -2,6 +2,14 @@
. `dirname $0`/common.sh
if [ "$EC" != "" ]; then
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
NOBJ=512
else
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
NOBJ=1024
fi
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
@ -28,7 +36,7 @@ cd ..
node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$!
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
sleep 2
@ -52,7 +60,7 @@ try_change()
echo --- Change PG count to $n --- >>testdata/osd$i.log
done
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'
for i in {1..10}; do
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
@ -82,8 +90,8 @@ try_change()
# Check that no objects are lost !
nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
if [ "$nobj" -ne 1024 ]; then
format_error "Data lost after changing PG count to $n: 1024 objects expected, but got $nobj"
if [ "$nobj" -ne $NOBJ ]; then
format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
fi
}

Loading…
Cancel
Save