Add a test for snapshots, fix bugs. Now the test passes

rdma-zerocopy
Vitaliy Filippov 2021-02-07 16:14:54 +03:00
parent 9038d42327
commit d0c2e31312
10 changed files with 166 additions and 13 deletions

23
mon/merge.js Normal file
View File

@ -0,0 +1,23 @@
const fsp = require('fs').promises;
async function merge(file1, file2, out)
{
if (!out)
{
console.error('USAGE: nodejs merge.js layer1 layer2 output');
process.exit();
}
const layer1 = await fsp.readFile(file1);
const layer2 = await fsp.readFile(file2);
const zero = Buffer.alloc(4096);
for (let i = 0; i < layer2.length; i += 4096)
{
if (zero.compare(layer2, i, i+4096) != 0)
{
layer2.copy(layer1, i, i, i+4096);
}
}
await fsp.writeFile(out, layer1);
}
merge(process.argv[2], process.argv[3], process.argv[4]);

View File

@ -153,6 +153,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
}
// FIXME: Due to the recent changes in entry_attr handling rename it back to bitmap
if (entry_attr_size > meta_block_size/2)
{
throw std::runtime_error("entry_attr_size is too big");
}
if (journal.offset % journal_block_size)
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));

View File

@ -8,7 +8,12 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
// Check or assign version number
bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
bool wait_big = false, wait_del = false;
void *bmp = NULL;
uint64_t version = 1;
if (!is_del && entry_attr_size > sizeof(void*))
{
bmp = calloc_or_die(1, entry_attr_size);
}
if (dirty_db.size() > 0)
{
auto dirty_it = dirty_db.upper_bound((obj_ver_id){
@ -25,6 +30,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
? !IS_SYNCED(dirty_it->second.state)
: ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
if (entry_attr_size > sizeof(void*))
memcpy(bmp, dirty_it->second.bitmap, entry_attr_size);
else
bmp = dirty_it->second.bitmap;
}
}
if (!found)
@ -33,6 +42,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
if (clean_it != clean_db.end())
{
version = clean_it->second.version + 1;
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
memcpy((entry_attr_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, entry_attr_size);
}
else
{
@ -72,6 +83,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{
// Invalid version requested
op->retval = -EEXIST;
if (!is_del && entry_attr_size > sizeof(void*))
{
free(bmp);
}
return false;
}
}
@ -95,7 +110,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
#endif
// FIXME No strict need to add it into dirty_db here, it's just left
// from the previous implementation where reads waited for writes
void *bmp = NULL;
uint32_t state;
if (is_del)
state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
@ -110,10 +124,28 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
state |= BS_ST_IN_FLIGHT;
if (op->opcode == BS_OP_WRITE_STABLE)
state |= BS_ST_INSTANT;
if (entry_attr_size > sizeof(void*))
bmp = calloc_or_die(1, entry_attr_size);
if (op->bitmap)
memcpy((entry_attr_size > sizeof(void*) ? bmp : &bmp), op->bitmap, entry_attr_size);
{
// Only allow to overwrite part of the object bitmap respective to the write's offset/len
uint8_t *bmp_ptr = (uint8_t*)(entry_attr_size > sizeof(void*) ? bmp : &bmp);
uint32_t bit = op->offset/bitmap_granularity;
uint32_t bits_left = op->len/bitmap_granularity;
while (!(bit % 8) && bits_left > 8)
{
// Copy bytes
bmp_ptr[bit/8] = ((uint8_t*)op->bitmap)[bit/8];
bit += 8;
bits_left -= 8;
}
while (bits_left > 0)
{
// Copy bits
bmp_ptr[bit/8] = (bmp_ptr[bit/8] & ~(1 << (bit%8)))
| (((uint8_t*)op->bitmap)[bit/8] & (1 << bit%8));
bit++;
bits_left--;
}
}
}
dirty_db.emplace((obj_ver_id){
.oid = op->oid,

View File

@ -737,7 +737,11 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
}
assert(cur > prev);
if (skip_prev)
{
// Just advance iov_idx & iov_pos
add_iov(end-prev, true, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
end = prev;
}
else
add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size);
if (end == begin)

View File

@ -271,6 +271,12 @@ void etcd_state_client_t::load_pgs()
{ "key", base64_encode(etcd_prefix+"/config/pgs") },
} }
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/inode/") },
{ "range_end", base64_encode(etcd_prefix+"/config/inode0") },
} }
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/history/") },
@ -666,7 +672,9 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
if (parent_inode_num && !(parent_inode_num >> (64-POOL_ID_BITS)))
{
uint64_t parent_pool_id = value["parent_pool"].uint64_value();
if (parent_pool_id >= POOL_ID_MAX)
if (!parent_pool_id)
parent_inode_num |= pool_id << (64-POOL_ID_BITS);
else if (parent_pool_id >= POOL_ID_MAX)
{
printf(
"Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",

View File

@ -273,11 +273,9 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
osd_op_t *op = req_it->second;
memcpy(op->reply.buf, cl->read_op->req.buf, OSD_PACKET_SIZE);
cl->sent_ops.erase(req_it);
if ((op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ) &&
op->reply.hdr.retval > 0)
if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ)
{
// Read data. In this case we assume that the buffer is preallocated by the caller (!)
assert(op->iov.count > 0);
unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len);
if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len) ||
bmp_len > op->bitmap_len)
@ -292,11 +290,19 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
{
cl->recv_list.push_back(op->bitmap, bmp_len);
}
cl->recv_list.append(op->iov);
if (op->reply.hdr.retval > 0)
{
assert(op->iov.count > 0);
cl->recv_list.append(op->iov);
}
cl->read_remaining = op->reply.hdr.retval + bmp_len;
if (cl->read_remaining == 0)
{
goto reuse;
}
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.hdr.retval + bmp_len;
}
else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0)
{
@ -320,6 +326,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
}
else
{
reuse:
// It's fine to reuse cl->read_op for the next reply
handle_reply_ready(op);
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);

View File

@ -21,7 +21,7 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
// Force external bitmap size
entry_attr_size = bs_block_size / bs_bitmap_granularity / 8;
config["entry_attr_size"] = entry_attr_size;
config["entry_attr_size"] = std::to_string(entry_attr_size);
this->config = config;
this->ringloop = ringloop;

View File

@ -154,6 +154,7 @@ resume_2:
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * entry_attr_size;
if (op_data->degraded)
{
// Reconstruct missing stripes
@ -166,6 +167,7 @@ resume_2:
{
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, entry_attr_size);
}
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
for (int role = 0; role < op_data->pg_size; role++)
{
if (stripes[role].req_end != 0)
@ -180,10 +182,9 @@ resume_2:
}
else
{
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
}
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * entry_attr_size;
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
finish_op(cur_op, cur_op->req.rw.len);
}

View File

@ -154,6 +154,8 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
{
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
subops[i].op_type = (uint64_t)cur_op;
subops[i].bitmap = stripes[stripe_num].bmp_buf;
subops[i].bitmap_len = entry_attr_size;
subops[i].bs_op = new blockstore_op_t({
.opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)

71
tests/test_snapshot.sh Executable file
View File

@ -0,0 +1,71 @@
#!/bin/bash -ex
. `dirname $0`/common.sh
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log &
OSD1_PID=$!
build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) &>./testdata/osd2.log &
OSD2_PID=$!
build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) &>./testdata/osd3.log &
OSD3_PID=$!
cd mon
npm install
cd ..
node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" &>./testdata/mon.log &
MON_PID=$!
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":1,"failure_domain":"osd"}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and (.[0].items["1"]["1"].osd_set | sort) == ["1","2","3"]'); then
format_error "FAILED: 1 PG NOT CONFIGURED"
fi
if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
format_error "FAILED: 1 PG NOT UP"
fi
if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then
sudo rm -f /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
fi
# Test basic write and snapshot
LD_PRELOAD=libasan.so.5 \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=2 -size=32M -cluster_log_level=10
$ETCDCTL put /vitastor/config/inode/1/2 '{"name":"testimg@0"}'
$ETCDCTL put /vitastor/config/inode/1/3 '{"parent_id":2,"name":"testimg"}'
LD_PRELOAD=libasan.so.5 \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=1 -fsync=32 -buffer_pattern=0xdeadface -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M -number_ios=1024
LD_PRELOAD=libasan.so.5 \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
-O raw ./testdata/merged.bin
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size=$((32*1024*1024))" \
-O raw ./testdata/layer0.bin
$ETCDCTL put /vitastor/config/inode/1/3 '{"name":"testimg"}'
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
-O raw ./testdata/layer1.bin
node mon/merge.js ./testdata/layer0.bin ./testdata/layer1.bin ./testdata/check.bin
cmp ./testdata/merged.bin ./testdata/check.bin
format_green OK