From 47663bd1dcee2003f14f14b411c7af16ce796e3e Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Tue, 28 Jan 2020 22:40:50 +0300 Subject: [PATCH] Add (empty) osd_primary.cpp, rename osd_read to osd_receive, add FIXMEs for fsync --- Makefile | 6 ++++-- blockstore_impl.cpp | 2 +- blockstore_init.cpp | 2 ++ blockstore_sync.cpp | 4 ++++ osd.cpp | 13 ++----------- osd.h | 6 ++++++ osd_primary.cpp | 21 +++++++++++++++++++++ osd_read.cpp => osd_receive.cpp | 0 8 files changed, 40 insertions(+), 14 deletions(-) create mode 100644 osd_primary.cpp rename osd_read.cpp => osd_receive.cpp (100%) diff --git a/Makefile b/Makefile index 1048b7b5..b9166d20 100644 --- a/Makefile +++ b/Makefile @@ -24,10 +24,10 @@ libblockstore.so: $(BLOCKSTORE_OBJS) libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring -OSD_OBJS := osd.o osd_exec_secondary.o osd_read.o osd_send.o osd_peering.o osd_peering_pg.o json11.o +OSD_OBJS := osd.o osd_exec_secondary.o osd_receive.o osd_send.o osd_peering.o osd_peering_pg.o osd_primary.o json11.o osd_exec_secondary.o: osd_exec_secondary.cpp osd.h osd_ops.h g++ $(CXXFLAGS) -c -o $@ $< -osd_read.o: osd_read.cpp osd.h osd_ops.h +osd_receive.o: osd_receive.cpp osd.h osd_ops.h g++ $(CXXFLAGS) -c -o $@ $< osd_send.o: osd_send.cpp osd.h osd_ops.h g++ $(CXXFLAGS) -c -o $@ $< @@ -35,6 +35,8 @@ osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h g++ $(CXXFLAGS) -c -o $@ $< osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h g++ $(CXXFLAGS) -c -o $@ $< +osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h + g++ $(CXXFLAGS) -c -o $@ $< osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h g++ $(CXXFLAGS) -c -o $@ $< osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS) diff --git a/blockstore_impl.cpp b/blockstore_impl.cpp index 9f4cb496..98cf43f6 100644 --- a/blockstore_impl.cpp +++ b/blockstore_impl.cpp @@ -110,7 +110,7 @@ void blockstore_impl_t::loop() auto op_ptr = cur; auto op = *(cur++); // FIXME: This needs some simplification - // Writes should not block reads if the ring is not full and if reads don't depend on them + // Writes should not block reads if the ring is not full and reads don't depend on them // In all other cases we should stop submission if (PRIV(op)->wait_for) { diff --git a/blockstore_init.cpp b/blockstore_init.cpp index bdbe12d6..c031e05f 100644 --- a/blockstore_init.cpp +++ b/blockstore_init.cpp @@ -254,6 +254,7 @@ resume_1: if (!bs->disable_journal_fsync) { GET_SQE(); + // FIXME Wait for completion of writes before issuing an fsync my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); data->iov = { 0 }; data->callback = simple_callback; @@ -337,6 +338,7 @@ resume_1: data->iov = { 0 }; data->callback = simple_callback; wait_count++; + // FIXME Wait for completion of writes before issuing an fsync my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); } bs->ringloop->submit(); diff --git a/blockstore_sync.cpp b/blockstore_sync.cpp index 5bbfc436..1832de11 100644 --- a/blockstore_sync.cpp +++ b/blockstore_sync.cpp @@ -63,6 +63,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) } if (!disable_journal_fsync) { + // FIXME: Wait for completion of writes before issuing an fsync + // Fsync and write requests posted at the same time can be reordered ring_data_t *data = ((ring_data_t*)sqes[s]->user_data); my_uring_prep_fsync(sqes[s++], journal.fd, IORING_FSYNC_DATASYNC); data->iov = { 0 }; @@ -81,6 +83,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) // 1st step: fsync data if (!disable_data_fsync) { + // FIXME Wait for completion of writes before issuing an fsync BS_SUBMIT_GET_SQE(sqe, data); my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC); data->iov = { 0 }; @@ -150,6 +153,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) // ... And a journal fsync if (!disable_journal_fsync) { + // FIXME Wait for completion of writes before issuing an fsync my_uring_prep_fsync(sqe[s], journal.fd, IORING_FSYNC_DATASYNC); struct ring_data_t *data = ((ring_data_t*)sqe[s]->user_data); data->iov = { 0 }; diff --git a/osd.cpp b/osd.cpp index de216619..afbddccc 100644 --- a/osd.cpp +++ b/osd.cpp @@ -278,18 +278,9 @@ void osd_t::exec_op(osd_op_t *cur_op) { exec_show_config(cur_op); } - else if (cur_op->op.hdr.opcode == OSD_OP_READ) + else if (cur_op->op.hdr.opcode == OSD_OP_READ || cur_op->op.hdr.opcode == OSD_OP_WRITE) { - // Primary OSD also works with individual stripes, but they're twice the size of the blockstore's stripe - // - convert offset & len to stripe number - // - fail operation if offset & len span multiple stripes - // - calc stripe hash and determine PG - // - check if this is our PG - // - redirect or fail operation if not - // - determine whether we need to read A and B or just A or just B or A + parity or B + parity - // and determine read ranges for both objects - // - send read requests - // - reconstruct result + exec_primary(cur_op); } else { diff --git a/osd.h b/osd.h index 88159b06..297809f7 100644 --- a/osd.h +++ b/osd.h @@ -164,10 +164,16 @@ class osd_t // op execution void exec_op(osd_op_t *cur_op); + + // secondary ops void exec_sync_stab_all(osd_op_t *cur_op); void exec_show_config(osd_op_t *cur_op); void exec_secondary(osd_op_t *cur_op); void secondary_op_callback(osd_op_t *cur_op); + + // primary ops + void exec_primary(osd_op_t *cur_op); + void make_primary_reply(osd_op_t *op); public: osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop); ~osd_t(); diff --git a/osd_primary.cpp b/osd_primary.cpp new file mode 100644 index 00000000..13eacbe5 --- /dev/null +++ b/osd_primary.cpp @@ -0,0 +1,21 @@ +#include "osd.h" + +void osd_t::exec_primary(osd_op_t *cur_op) +{ + // read: read directly or read paired stripe(s), reconstruct, return + // write: read paired stripe(s), modify, write + // nuance: take care to read the same version from paired stripes! + // if there are no write requests in progress we're good (stripes must be in sync) + // and... remember the last readable version during a write request + // and... postpone other write requests to the same stripe until the completion of previous ones + // + // sync: sync peers, get unstable versions from somewhere, stabilize them + +} + +void osd_t::make_primary_reply(osd_op_t *op) +{ + op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC; + op->reply.hdr.id = op->op.hdr.id; + op->reply.hdr.opcode = op->op.hdr.opcode; +} diff --git a/osd_read.cpp b/osd_receive.cpp similarity index 100% rename from osd_read.cpp rename to osd_receive.cpp