Fix two potential read/write ordering problems (even though not yet seen in tests)
- Write operations could be 'stabilized' and previous versions could be purged from OSDs before the removal of version_override, so following reads could potentially hit a different version in EC pools
- Object was marked clean after completing the delete during recovery, so reads could in theory hit a deleted version and return nothing

rel-0.5
parent
98b54ca948
commit
05db1308aa
|
@@ -198,6 +198,7 @@ class osd_t
|
||||||
void continue_primary_del(osd_op_t *cur_op);
|
void continue_primary_del(osd_op_t *cur_op);
|
||||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
||||||
|
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
|
||||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||||
void handle_primary_bs_subop(osd_op_t *subop);
|
void handle_primary_bs_subop(osd_op_t *subop);
|
||||||
|
|
|
@@ -18,7 +18,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||||
// Our EC scheme stores data in fixed chunks equal to (K*block size)
|
// Our EC scheme stores data in fixed chunks equal to (K*block size)
|
||||||
// K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
|
// K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
|
||||||
pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
|
pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
|
||||||
// FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
|
// Note: We read pool config here, so we must NOT change it when PGs are active
|
||||||
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
|
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
|
||||||
if (pool_cfg_it == st_cli.pool_config.end())
|
if (pool_cfg_it == st_cli.pool_config.end())
|
||||||
{
|
{
|
||||||
|
@@ -269,8 +269,6 @@ resume_3:
|
||||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Save version override for parallel reads
|
|
||||||
pg.ver_override[op_data->oid] = op_data->fact_ver;
|
|
||||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
// Only (possibly) copy new data from the request into the recovery buffer
|
// Only (possibly) copy new data from the request into the recovery buffer
|
||||||
|
@@ -289,6 +287,9 @@ resume_3:
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// For EC/XOR pools, save version override to make it impossible
|
||||||
|
// for parallel reads to read different versions of data and parity
|
||||||
|
pg.ver_override[op_data->oid] = op_data->fact_ver;
|
||||||
// Recover missing stripes, calculate parity
|
// Recover missing stripes, calculate parity
|
||||||
if (pg.scheme == POOL_SCHEME_XOR)
|
if (pg.scheme == POOL_SCHEME_XOR)
|
||||||
{
|
{
|
||||||
|
@@ -332,8 +333,22 @@ resume_4:
|
||||||
op_data->st = 4;
|
op_data->st = 4;
|
||||||
return;
|
return;
|
||||||
resume_5:
|
resume_5:
|
||||||
|
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||||
|
{
|
||||||
|
// Remove version override just after the write, but before stabilizing
|
||||||
|
pg.ver_override.erase(op_data->oid);
|
||||||
|
}
|
||||||
|
if (op_data->object_state)
|
||||||
|
{
|
||||||
|
// We must forget the unclean state of the object before deleting it
|
||||||
|
// so the next reads don't accidentally read a deleted version
|
||||||
|
// And it should be done at the same time as the removal of the version override
|
||||||
|
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||||
|
pg.clean_count++;
|
||||||
|
}
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
|
free_object_state(pg, &op_data->object_state);
|
||||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@@ -342,6 +357,7 @@ resume_7:
|
||||||
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
|
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
|
||||||
{
|
{
|
||||||
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
|
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
|
||||||
|
free_object_state(pg, &op_data->object_state);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (op_data->fact_ver == 1)
|
if (op_data->fact_ver == 1)
|
||||||
|
@@ -390,10 +406,12 @@ resume_7:
|
||||||
copies_to_delete_after_sync_count++;
|
copies_to_delete_after_sync_count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
free_object_state(pg, &op_data->object_state);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
||||||
|
free_object_state(pg, &op_data->object_state);
|
||||||
if (op_data->n_subops > 0)
|
if (op_data->n_subops > 0)
|
||||||
{
|
{
|
||||||
resume_8:
|
resume_8:
|
||||||
|
@@ -407,14 +425,9 @@ resume_9:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Clear object state
|
|
||||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
|
||||||
pg.clean_count++;
|
|
||||||
}
|
}
|
||||||
cur_op->reply.hdr.retval = cur_op->req.rw.len;
|
cur_op->reply.hdr.retval = cur_op->req.rw.len;
|
||||||
continue_others:
|
continue_others:
|
||||||
// Remove version override
|
|
||||||
pg.ver_override.erase(op_data->oid);
|
|
||||||
object_id oid = op_data->oid;
|
object_id oid = op_data->oid;
|
||||||
// Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
|
// Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
|
||||||
auto next_it = pg.write_queue.find(oid);
|
auto next_it = pg.write_queue.find(oid);
|
||||||
|
@@ -818,10 +831,14 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
|
||||||
{
|
{
|
||||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
|
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
|
||||||
}
|
}
|
||||||
object_state->object_count--;
|
}
|
||||||
if (!object_state->object_count)
|
|
||||||
|
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
|
||||||
{
|
{
|
||||||
pg.state_dict.erase(object_state->osd_set);
|
if (*object_state && !(--(*object_state)->object_count))
|
||||||
|
{
|
||||||
|
pg.state_dict.erase((*object_state)->osd_set);
|
||||||
|
*object_state = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -887,6 +904,7 @@ resume_5:
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||||
|
free_object_state(pg, &op_data->object_state);
|
||||||
}
|
}
|
||||||
pg.total_count--;
|
pg.total_count--;
|
||||||
object_id oid = op_data->oid;
|
object_id oid = op_data->oid;
|
||||||
|
|
|
@@ -2,6 +2,14 @@
|
||||||
|
|
||||||
. `dirname $0`/common.sh
|
. `dirname $0`/common.sh
|
||||||
|
|
||||||
|
if [ "$EC" != "" ]; then
|
||||||
|
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
|
||||||
|
NOBJ=512
|
||||||
|
else
|
||||||
|
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
|
||||||
|
NOBJ=1024
|
||||||
|
fi
|
||||||
|
|
||||||
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
|
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||||
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
|
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||||
dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
|
dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||||
|
@@ -28,7 +36,7 @@ cd ..
|
||||||
node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
|
node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
|
||||||
MON_PID=$!
|
MON_PID=$!
|
||||||
|
|
||||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
|
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
|
||||||
|
|
||||||
sleep 2
|
sleep 2
|
||||||
|
|
||||||
|
@@ -52,7 +60,7 @@ try_change()
|
||||||
echo --- Change PG count to $n --- >>testdata/osd$i.log
|
echo --- Change PG count to $n --- >>testdata/osd$i.log
|
||||||
done
|
done
|
||||||
|
|
||||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
|
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'
|
||||||
|
|
||||||
for i in {1..10}; do
|
for i in {1..10}; do
|
||||||
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
|
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
|
||||||
|
@@ -82,8 +90,8 @@ try_change()
|
||||||
|
|
||||||
# Check that no objects are lost !
|
# Check that no objects are lost !
|
||||||
nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
|
nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
|
||||||
if [ "$nobj" -ne 1024 ]; then
|
if [ "$nobj" -ne $NOBJ ]; then
|
||||||
format_error "Data lost after changing PG count to $n: 1024 objects expected, but got $nobj"
|
format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue