From d0c2e313128b13288c6f733968aacbdc4ed4a86c Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 7 Feb 2021 16:14:54 +0300 Subject: [PATCH] Add a test for snapshots, fix bugs. Now the test passes --- mon/merge.js | 23 ++++++++++++ src/blockstore_open.cpp | 5 +++ src/blockstore_write.cpp | 40 ++++++++++++++++++--- src/cluster_client.cpp | 4 +++ src/etcd_state_client.cpp | 10 +++++- src/msgr_receive.cpp | 17 ++++++--- src/osd.cpp | 2 +- src/osd_primary.cpp | 5 +-- src/osd_primary_subops.cpp | 2 ++ tests/test_snapshot.sh | 71 ++++++++++++++++++++++++++++++++++++++ 10 files changed, 166 insertions(+), 13 deletions(-) create mode 100644 mon/merge.js create mode 100755 tests/test_snapshot.sh diff --git a/mon/merge.js b/mon/merge.js new file mode 100644 index 00000000..bf69c7ed --- /dev/null +++ b/mon/merge.js @@ -0,0 +1,23 @@ +const fsp = require('fs').promises; + +async function merge(file1, file2, out) +{ + if (!out) + { + console.error('USAGE: nodejs merge.js layer1 layer2 output'); + process.exit(); + } + const layer1 = await fsp.readFile(file1); + const layer2 = await fsp.readFile(file2); + const zero = Buffer.alloc(4096); + for (let i = 0; i < layer2.length; i += 4096) + { + if (zero.compare(layer2, i, i+4096) != 0) + { + layer2.copy(layer1, i, i, i+4096); + } + } + await fsp.writeFile(out, layer1); +} + +merge(process.argv[2], process.argv[3], process.argv[4]); diff --git a/src/blockstore_open.cpp b/src/blockstore_open.cpp index 8531c13c..a9014162 100644 --- a/src/blockstore_open.cpp +++ b/src/blockstore_open.cpp @@ -153,6 +153,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) { throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size)); } + // FIXME: Due to the recent changes in entry_attr handling rename it back to bitmap + if (entry_attr_size > meta_block_size/2) + { + throw std::runtime_error("entry_attr_size is too big"); + } if (journal.offset % journal_block_size) { throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size)); diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index 3f840f2b..e7a5ef3a 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -8,7 +8,12 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) // Check or assign version number bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE); bool wait_big = false, wait_del = false; + void *bmp = NULL; uint64_t version = 1; + if (!is_del && entry_attr_size > sizeof(void*)) + { + bmp = calloc_or_die(1, entry_attr_size); + } if (dirty_db.size() > 0) { auto dirty_it = dirty_db.upper_bound((obj_ver_id){ @@ -25,6 +30,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE ? !IS_SYNCED(dirty_it->second.state) : ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG); + if (entry_attr_size > sizeof(void*)) + memcpy(bmp, dirty_it->second.bitmap, entry_attr_size); + else + bmp = dirty_it->second.bitmap; } } if (!found) @@ -33,6 +42,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) if (clean_it != clean_db.end()) { version = clean_it->second.version + 1; + void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size); + memcpy((entry_attr_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, entry_attr_size); } else { @@ -72,6 +83,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) { // Invalid version requested op->retval = -EEXIST; + if (!is_del && entry_attr_size > sizeof(void*)) + { + free(bmp); + } return false; } } @@ -95,7 +110,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) #endif // FIXME No strict need to add it into dirty_db here, it's just left // from the previous implementation where reads waited for writes - void *bmp = NULL; uint32_t state; if (is_del) state = BS_ST_DELETE | BS_ST_IN_FLIGHT; @@ -110,10 +124,28 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) state |= BS_ST_IN_FLIGHT; if (op->opcode == BS_OP_WRITE_STABLE) state |= BS_ST_INSTANT; - if (entry_attr_size > sizeof(void*)) - bmp = calloc_or_die(1, entry_attr_size); if (op->bitmap) - memcpy((entry_attr_size > sizeof(void*) ? bmp : &bmp), op->bitmap, entry_attr_size); + { + // Only allow to overwrite part of the object bitmap respective to the write's offset/len + uint8_t *bmp_ptr = (uint8_t*)(entry_attr_size > sizeof(void*) ? bmp : &bmp); + uint32_t bit = op->offset/bitmap_granularity; + uint32_t bits_left = op->len/bitmap_granularity; + while (!(bit % 8) && bits_left > 8) + { + // Copy bytes + bmp_ptr[bit/8] = ((uint8_t*)op->bitmap)[bit/8]; + bit += 8; + bits_left -= 8; + } + while (bits_left > 0) + { + // Copy bits + bmp_ptr[bit/8] = (bmp_ptr[bit/8] & ~(1 << (bit%8))) + | (((uint8_t*)op->bitmap)[bit/8] & (1 << bit%8)); + bit++; + bits_left--; + } + } } dirty_db.emplace((obj_ver_id){ .oid = op->oid, diff --git a/src/cluster_client.cpp b/src/cluster_client.cpp index 23019a93..bfed1180 100644 --- a/src/cluster_client.cpp +++ b/src/cluster_client.cpp @@ -737,7 +737,11 @@ void cluster_client_t::slice_rw(cluster_op_t *op) } assert(cur > prev); if (skip_prev) + { + // Just advance iov_idx & iov_pos + add_iov(end-prev, true, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0); end = prev; + } else add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size); if (end == begin) diff --git a/src/etcd_state_client.cpp b/src/etcd_state_client.cpp index 37ee521e..cbef34b6 100644 --- a/src/etcd_state_client.cpp +++ b/src/etcd_state_client.cpp @@ -271,6 +271,12 @@ void etcd_state_client_t::load_pgs() { "key", base64_encode(etcd_prefix+"/config/pgs") }, } } }, + json11::Json::object { + { "request_range", json11::Json::object { + { "key", base64_encode(etcd_prefix+"/config/inode/") }, + { "range_end", base64_encode(etcd_prefix+"/config/inode0") }, + } } + }, json11::Json::object { { "request_range", json11::Json::object { { "key", base64_encode(etcd_prefix+"/pg/history/") }, @@ -666,7 +672,9 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso if (parent_inode_num && !(parent_inode_num >> (64-POOL_ID_BITS))) { uint64_t parent_pool_id = value["parent_pool"].uint64_value(); - if (parent_pool_id >= POOL_ID_MAX) + if (!parent_pool_id) + parent_inode_num |= pool_id << (64-POOL_ID_BITS); + else if (parent_pool_id >= POOL_ID_MAX) { printf( "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n", diff --git a/src/msgr_receive.cpp b/src/msgr_receive.cpp index 2c23a778..afcd2a0c 100644 --- a/src/msgr_receive.cpp +++ b/src/msgr_receive.cpp @@ -273,11 +273,9 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl) osd_op_t *op = req_it->second; memcpy(op->reply.buf, cl->read_op->req.buf, OSD_PACKET_SIZE); cl->sent_ops.erase(req_it); - if ((op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ) && - op->reply.hdr.retval > 0) + if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ) { // Read data. In this case we assume that the buffer is preallocated by the caller (!) - assert(op->iov.count > 0); unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len); if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len) || bmp_len > op->bitmap_len) @@ -292,11 +290,19 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl) { cl->recv_list.push_back(op->bitmap, bmp_len); } - cl->recv_list.append(op->iov); + if (op->reply.hdr.retval > 0) + { + assert(op->iov.count > 0); + cl->recv_list.append(op->iov); + } + cl->read_remaining = op->reply.hdr.retval + bmp_len; + if (cl->read_remaining == 0) + { + goto reuse; + } delete cl->read_op; cl->read_op = op; cl->read_state = CL_READ_REPLY_DATA; - cl->read_remaining = op->reply.hdr.retval + bmp_len; } else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0) { @@ -320,6 +326,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl) } else { +reuse: // It's fine to reuse cl->read_op for the next reply handle_reply_ready(op); cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE); diff --git a/src/osd.cpp b/src/osd.cpp index 601aa434..c8cb2ec0 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -21,7 +21,7 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop) // Force external bitmap size entry_attr_size = bs_block_size / bs_bitmap_granularity / 8; - config["entry_attr_size"] = entry_attr_size; + config["entry_attr_size"] = std::to_string(entry_attr_size); this->config = config; this->ringloop = ringloop; diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index 8994d695..1ccc5d78 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -154,6 +154,7 @@ resume_2: finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO); return; } + cur_op->reply.rw.bitmap_len = op_data->pg_data_size * entry_attr_size; if (op_data->degraded) { // Reconstruct missing stripes @@ -166,6 +167,7 @@ resume_2: { reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, entry_attr_size); } + cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len); for (int role = 0; role < op_data->pg_size; role++) { if (stripes[role].req_end != 0) @@ -180,10 +182,9 @@ resume_2: } else { + cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len); cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len); } - cur_op->reply.rw.bitmap_len = op_data->pg_data_size * entry_attr_size; - cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len); finish_op(cur_op, cur_op->req.rw.len); } diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 1b6e4f5b..e4fb06aa 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -154,6 +154,8 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s { clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin); subops[i].op_type = (uint64_t)cur_op; + subops[i].bitmap = stripes[stripe_num].bmp_buf; + subops[i].bitmap_len = entry_attr_size; subops[i].bs_op = new blockstore_op_t({ .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ), .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop) diff --git a/tests/test_snapshot.sh b/tests/test_snapshot.sh new file mode 100755 index 00000000..7f15464a --- /dev/null +++ b/tests/test_snapshot.sh @@ -0,0 +1,71 @@ +#!/bin/bash -ex + +. `dirname $0`/common.sh + +dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1)) +dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1)) +dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1)) + +build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log & +OSD1_PID=$! +build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) &>./testdata/osd2.log & +OSD2_PID=$! +build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) &>./testdata/osd3.log & +OSD3_PID=$! + +cd mon +npm install +cd .. +node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" &>./testdata/mon.log & +MON_PID=$! + +$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":1,"failure_domain":"osd"}}' + +sleep 2 + +if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and (.[0].items["1"]["1"].osd_set | sort) == ["1","2","3"]'); then + format_error "FAILED: 1 PG NOT CONFIGURED" +fi + +if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then + format_error "FAILED: 1 PG NOT UP" +fi + +if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then + sudo rm -f /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so + sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so +fi + +# Test basic write and snapshot + +LD_PRELOAD=libasan.so.5 \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=2 -size=32M -cluster_log_level=10 + +$ETCDCTL put /vitastor/config/inode/1/2 '{"name":"testimg@0"}' +$ETCDCTL put /vitastor/config/inode/1/3 '{"parent_id":2,"name":"testimg"}' + +LD_PRELOAD=libasan.so.5 \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=1 -fsync=32 -buffer_pattern=0xdeadface -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M -number_ios=1024 + +LD_PRELOAD=libasan.so.5 \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M + +qemu-img convert -S 4096 -p \ + -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \ + -O raw ./testdata/merged.bin + +qemu-img convert -S 4096 -p \ + -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size=$((32*1024*1024))" \ + -O raw ./testdata/layer0.bin + +$ETCDCTL put /vitastor/config/inode/1/3 '{"name":"testimg"}' + +qemu-img convert -S 4096 -p \ + -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \ + -O raw ./testdata/layer1.bin + +node mon/merge.js ./testdata/layer0.bin ./testdata/layer1.bin ./testdata/check.bin + +cmp ./testdata/merged.bin ./testdata/check.bin + +format_green OK