Make basic primary-write work
parent
09588a349f
commit
74673c761f
6
Makefile
6
Makefile
|
@ -1,7 +1,7 @@
|
||||||
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
|
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
|
||||||
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
|
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
|
||||||
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
|
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
|
||||||
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd test_osd
|
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd osd_test
|
||||||
clean:
|
clean:
|
||||||
rm -f *.o
|
rm -f *.o
|
||||||
|
|
||||||
|
@ -49,8 +49,8 @@ stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
|
||||||
g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
|
g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
|
||||||
rw_blocking.o: rw_blocking.cpp rw_blocking.h
|
rw_blocking.o: rw_blocking.cpp rw_blocking.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
test_osd: test_osd.cpp osd_ops.h rw_blocking.o
|
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
|
||||||
g++ $(CXXFLAGS) -o test_osd test_osd.cpp rw_blocking.o -ltcmalloc_minimal
|
g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
|
||||||
|
|
||||||
libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
|
libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
|
||||||
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring
|
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring
|
||||||
|
|
|
@ -535,7 +535,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
else if (je->type == JE_BIG_WRITE)
|
else if (je->type == JE_BIG_WRITE)
|
||||||
{
|
{
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("je_big_write oid=%lu:%lu ver=%lu\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version);
|
printf("je_big_write oid=%lu:%lu ver=%lu loc=%lu\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
|
||||||
#endif
|
#endif
|
||||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||||
if (clean_it == bs->clean_db.end() ||
|
if (clean_it == bs->clean_db.end() ||
|
||||||
|
|
8
osd.cpp
8
osd.cpp
|
@ -310,15 +310,17 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||||
}
|
}
|
||||||
if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
|
if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
|
||||||
cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
|
cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
|
||||||
(cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
|
(cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) &&
|
||||||
cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
|
(cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN) ||
|
||||||
(cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN))
|
(cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
|
||||||
|
(cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN))
|
||||||
{
|
{
|
||||||
// Bad command
|
// Bad command
|
||||||
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
||||||
cur_op->reply.hdr.id = cur_op->req.hdr.id;
|
cur_op->reply.hdr.id = cur_op->req.hdr.id;
|
||||||
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
|
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
|
||||||
cur_op->reply.hdr.retval = -EINVAL;
|
cur_op->reply.hdr.retval = -EINVAL;
|
||||||
|
outbox_push(this->clients[cur_op->peer_fd], cur_op);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
inflight_ops++;
|
inflight_ops++;
|
||||||
|
|
8
osd.h
8
osd.h
|
@ -157,7 +157,6 @@ struct osd_client_t
|
||||||
int write_state = 0;
|
int write_state = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct osd_primary_read_t;
|
|
||||||
struct osd_rmw_stripe_t;
|
struct osd_rmw_stripe_t;
|
||||||
|
|
||||||
class osd_t
|
class osd_t
|
||||||
|
@ -181,6 +180,9 @@ class osd_t
|
||||||
unsigned pg_count = 0;
|
unsigned pg_count = 0;
|
||||||
uint64_t next_subop_id = 1;
|
uint64_t next_subop_id = 1;
|
||||||
|
|
||||||
|
// Unstable writes
|
||||||
|
spp::sparse_hash_map<osd_num_t, spp::sparse_hash_map<object_id, uint64_t>> unstable_writes;
|
||||||
|
|
||||||
// client & peer I/O
|
// client & peer I/O
|
||||||
|
|
||||||
bool stopping = false;
|
bool stopping = false;
|
||||||
|
@ -207,8 +209,8 @@ class osd_t
|
||||||
int handle_epoll_events();
|
int handle_epoll_events();
|
||||||
void read_requests();
|
void read_requests();
|
||||||
void handle_read(ring_data_t *data, int peer_fd);
|
void handle_read(ring_data_t *data, int peer_fd);
|
||||||
void handle_read_op(osd_client_t *cl);
|
void handle_op_hdr(osd_client_t *cl);
|
||||||
void handle_read_reply(osd_client_t *cl);
|
void handle_reply_hdr(osd_client_t *cl);
|
||||||
void send_replies();
|
void send_replies();
|
||||||
void handle_send(ring_data_t *data, int peer_fd);
|
void handle_send(ring_data_t *data, int peer_fd);
|
||||||
void outbox_push(osd_client_t & cl, osd_op_t *op);
|
void outbox_push(osd_client_t & cl, osd_op_t *op);
|
||||||
|
|
|
@ -332,7 +332,7 @@ void osd_t::start_pg_peering(int pg_idx)
|
||||||
throw std::runtime_error("local OP_LIST failed");
|
throw std::runtime_error("local OP_LIST failed");
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
"Got object list from OSD %lu (local): %d objects (%lu of them stable)\n",
|
"Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
|
||||||
role_osd, bs_op->retval, bs_op->version
|
role_osd, bs_op->retval, bs_op->version
|
||||||
);
|
);
|
||||||
ps->list_results[role_osd] = {
|
ps->list_results[role_osd] = {
|
||||||
|
@ -377,7 +377,7 @@ void osd_t::start_pg_peering(int pg_idx)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
"Got object list from OSD %lu: %ld objects (%lu of them stable)\n",
|
"Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
|
||||||
role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
|
role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
|
||||||
);
|
);
|
||||||
ps->list_results[role_osd] = {
|
ps->list_results[role_osd] = {
|
||||||
|
|
|
@ -17,11 +17,13 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &
|
||||||
}
|
}
|
||||||
else if (st.n_roles < pg.pg_minsize)
|
else if (st.n_roles < pg.pg_minsize)
|
||||||
{
|
{
|
||||||
|
printf("Object is unfound: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
|
||||||
state = OBJ_INCOMPLETE;
|
state = OBJ_INCOMPLETE;
|
||||||
pg.state = pg.state | PG_HAS_UNFOUND;
|
pg.state = pg.state | PG_HAS_UNFOUND;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
|
||||||
state = OBJ_DEGRADED;
|
state = OBJ_DEGRADED;
|
||||||
pg.state = pg.state | PG_HAS_DEGRADED;
|
pg.state = pg.state | PG_HAS_DEGRADED;
|
||||||
}
|
}
|
||||||
|
@ -133,6 +135,7 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &
|
||||||
pg.clean_count++;
|
pg.clean_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FIXME: Write at least some tests for this function
|
||||||
void pg_t::calc_object_states()
|
void pg_t::calc_object_states()
|
||||||
{
|
{
|
||||||
auto & pg = *this;
|
auto & pg = *this;
|
||||||
|
@ -188,7 +191,7 @@ void pg_t::calc_object_states()
|
||||||
{
|
{
|
||||||
if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
|
if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
|
||||||
{
|
{
|
||||||
// Version is either recoverable or stable, choose it as target and skip previous versions
|
// Last processed version is either recoverable or stable, choose it as target and skip previous versions
|
||||||
st.ver_end = i;
|
st.ver_end = i;
|
||||||
i++;
|
i++;
|
||||||
while (i < all.size() && st.oid.inode == all[i].oid.inode &&
|
while (i < all.size() && st.oid.inode == all[i].oid.inode &&
|
||||||
|
@ -201,13 +204,13 @@ void pg_t::calc_object_states()
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
st.obj_end = i;
|
st.obj_end = i;
|
||||||
remember_object(st, all);
|
|
||||||
i--;
|
i--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Remember that there are newer unrecoverable versions
|
// Last processed version is unstable and unrecoverable
|
||||||
|
// We'll know that because target_ver < max_ver
|
||||||
st.ver_start = i;
|
st.ver_start = i;
|
||||||
st.target_ver = all[i].version;
|
st.target_ver = all[i].version;
|
||||||
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
|
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
|
||||||
|
|
|
@ -127,6 +127,8 @@ struct pg_t
|
||||||
spp::sparse_hash_map<object_id, uint64_t> ver_override;
|
spp::sparse_hash_map<object_id, uint64_t> ver_override;
|
||||||
pg_peering_state_t *peering_state = NULL;
|
pg_peering_state_t *peering_state = NULL;
|
||||||
|
|
||||||
|
std::multimap<object_id, osd_op_t*> write_queue;
|
||||||
|
|
||||||
void calc_object_states();
|
void calc_object_states();
|
||||||
void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
|
void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
|
||||||
};
|
};
|
||||||
|
@ -139,11 +141,13 @@ inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
||||||
|
|
||||||
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
|
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
|
||||||
{
|
{
|
||||||
return a.oid < b.oid ||
|
// ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, osd_num ASC
|
||||||
// object versions come in descending order
|
return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
|
||||||
a.oid == b.oid && a.version > b.version ||
|
(a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
|
||||||
a.oid == b.oid && a.version == b.version ||
|
(a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
|
||||||
a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num;
|
a.version > b.version || a.version == b.version && a.osd_num < b.osd_num
|
||||||
|
)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
|
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
|
||||||
|
|
|
@ -174,8 +174,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
|
||||||
{
|
{
|
||||||
zero_read = role;
|
zero_read = role;
|
||||||
}
|
}
|
||||||
if (osd_set[role] != 0 &&
|
if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
|
||||||
(w ? stripes[role].write_end : stripes[role].read_end) != 0)
|
|
||||||
{
|
{
|
||||||
n_subops++;
|
n_subops++;
|
||||||
}
|
}
|
||||||
|
@ -195,11 +194,12 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
|
||||||
int subop = 0;
|
int subop = 0;
|
||||||
for (int role = 0; role < pg_size; role++)
|
for (int role = 0; role < pg_size; role++)
|
||||||
{
|
{
|
||||||
if ((submit_type == SUBMIT_WRITE ? stripes[role].write_end : stripes[role].read_end) == 0 && zero_read != role)
|
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
||||||
|
if (!(w || stripes[role].read_end != 0 || zero_read == role))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
auto role_osd_num = osd_set[role];
|
osd_num_t role_osd_num = osd_set[role];
|
||||||
if (role_osd_num != 0)
|
if (role_osd_num != 0)
|
||||||
{
|
{
|
||||||
if (role_osd_num == this->osd_num)
|
if (role_osd_num == this->osd_num)
|
||||||
|
@ -240,6 +240,10 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
|
||||||
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
|
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
|
||||||
};
|
};
|
||||||
subops[subop].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
|
subops[subop].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
|
||||||
|
if (w && stripes[role].write_end > 0)
|
||||||
|
{
|
||||||
|
subops[subop].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
|
||||||
|
}
|
||||||
subops[subop].callback = [cur_op, this](osd_op_t *subop)
|
subops[subop].callback = [cur_op, this](osd_op_t *subop)
|
||||||
{
|
{
|
||||||
// so it doesn't get freed
|
// so it doesn't get freed
|
||||||
|
@ -318,7 +322,7 @@ resume_1:
|
||||||
if (vo_it != pg.ver_override.end())
|
if (vo_it != pg.ver_override.end())
|
||||||
{
|
{
|
||||||
op_data->st = 1;
|
op_data->st = 1;
|
||||||
//pg.write_queue.push_back(cur_op);
|
pg.write_queue.emplace(op_data->oid, cur_op);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -326,25 +330,48 @@ resume_1:
|
||||||
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
|
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
|
||||||
// Read required blocks
|
// Read required blocks
|
||||||
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
|
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
|
||||||
op_data->st = 2;
|
|
||||||
resume_2:
|
resume_2:
|
||||||
|
op_data->st = 2;
|
||||||
return;
|
return;
|
||||||
resume_3:
|
resume_3:
|
||||||
// Save version override
|
// Save version override for parallel reads
|
||||||
pg.ver_override[op_data->oid] = op_data->fact_ver;
|
pg.ver_override[op_data->oid] = op_data->fact_ver;
|
||||||
// Calculate parity
|
// Calculate parity
|
||||||
calc_rmw_parity(op_data->stripes, op_data->pg_size);
|
calc_rmw_parity(op_data->stripes, pg.pg_size);
|
||||||
// Send writes
|
// Send writes
|
||||||
submit_primary_subops(SUBMIT_WRITE, pg.pg_size, pg.cur_set.data(), cur_op);
|
submit_primary_subops(SUBMIT_WRITE, pg.pg_size, pg.cur_set.data(), cur_op);
|
||||||
op_data->st = 4;
|
|
||||||
resume_4:
|
resume_4:
|
||||||
|
op_data->st = 4;
|
||||||
return;
|
return;
|
||||||
resume_5:
|
resume_5:
|
||||||
// Remember version as unstable
|
// Remember version as unstable
|
||||||
|
osd_num_t *osd_set = pg.cur_set.data();
|
||||||
// Remove version override if degraded
|
for (int role = 0; role < pg.pg_size; role++)
|
||||||
|
{
|
||||||
|
if (osd_set[role] != 0)
|
||||||
|
{
|
||||||
|
this->unstable_writes[osd_set[role]][(object_id){
|
||||||
|
.inode = op_data->oid.inode,
|
||||||
|
.stripe = op_data->oid.stripe | role,
|
||||||
|
}] = op_data->fact_ver;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Remember PG as dirty to drop the connection when PG goes offline
|
||||||
|
// (this is required because of the "lazy sync")
|
||||||
|
this->clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
|
||||||
|
// Remove version override
|
||||||
|
pg.ver_override.erase(op_data->oid);
|
||||||
finish_primary_op(cur_op, cur_op->req.rw.len);
|
finish_primary_op(cur_op, cur_op->req.rw.len);
|
||||||
|
// Continue other write operations to the same object
|
||||||
|
{
|
||||||
|
auto next_it = pg.write_queue.find(op_data->oid);
|
||||||
|
if (next_it != pg.write_queue.end())
|
||||||
|
{
|
||||||
|
osd_op_t *next_op = next_it->second;
|
||||||
|
pg.write_queue.erase(next_it);
|
||||||
|
continue_primary_write(next_op);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::exec_primary_sync(osd_op_t *cur_op)
|
void osd_t::exec_primary_sync(osd_op_t *cur_op)
|
||||||
|
|
|
@ -68,11 +68,11 @@ void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
||||||
{
|
{
|
||||||
if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
|
if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
|
||||||
{
|
{
|
||||||
handle_read_reply(&cl);
|
handle_reply_hdr(&cl);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
handle_read_op(&cl);
|
handle_op_hdr(&cl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (cl.read_state == CL_READ_DATA)
|
else if (cl.read_state == CL_READ_DATA)
|
||||||
|
@ -97,32 +97,39 @@ void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::handle_read_op(osd_client_t *cl)
|
void osd_t::handle_op_hdr(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
osd_op_t *cur_op = cl->read_op;
|
osd_op_t *cur_op = cl->read_op;
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
|
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
|
{
|
||||||
|
if (cur_op->req.sec_rw.len > 0)
|
||||||
|
cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
|
||||||
|
cl->read_remaining = 0;
|
||||||
|
}
|
||||||
|
else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
|
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
|
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
|
||||||
{
|
{
|
||||||
// Allocate a buffer
|
if (cur_op->req.sec_rw.len > 0)
|
||||||
cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
|
cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
|
||||||
|
cl->read_remaining = cur_op->req.sec_rw.len;
|
||||||
}
|
}
|
||||||
else if (cur_op->req.hdr.opcode == OSD_OP_READ ||
|
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
|
||||||
cur_op->req.hdr.opcode == OSD_OP_WRITE)
|
|
||||||
{
|
{
|
||||||
|
if (cur_op->req.rw.len > 0)
|
||||||
cur_op->buf = memalign(512, cur_op->req.rw.len);
|
cur_op->buf = memalign(512, cur_op->req.rw.len);
|
||||||
|
cl->read_remaining = 0;
|
||||||
}
|
}
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
|
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
|
{
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK ||
|
if (cur_op->req.rw.len > 0)
|
||||||
cur_op->req.hdr.opcode == OSD_OP_WRITE)
|
cur_op->buf = memalign(512, cur_op->req.rw.len);
|
||||||
|
cl->read_remaining = cur_op->req.rw.len;
|
||||||
|
}
|
||||||
|
if (cl->read_remaining > 0)
|
||||||
{
|
{
|
||||||
// Read data
|
// Read data
|
||||||
cl->read_buf = cur_op->buf;
|
cl->read_buf = cur_op->buf;
|
||||||
cl->read_remaining = (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE
|
|
||||||
? cur_op->req.sec_rw.len
|
|
||||||
: cur_op->req.rw.len);
|
|
||||||
cl->read_state = CL_READ_DATA;
|
cl->read_state = CL_READ_DATA;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -134,7 +141,7 @@ void osd_t::handle_read_op(osd_client_t *cl)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::handle_read_reply(osd_client_t *cl)
|
void osd_t::handle_reply_hdr(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
osd_op_t *cur_op = cl->read_op;
|
osd_op_t *cur_op = cl->read_op;
|
||||||
auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
|
auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
|
||||||
|
|
|
@ -170,6 +170,8 @@ void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_s
|
||||||
{
|
{
|
||||||
start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
|
start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
|
||||||
end = std::max(stripes[role].req_end, end);
|
end = std::max(stripes[role].req_end, end);
|
||||||
|
stripes[role].write_start = stripes[role].req_start;
|
||||||
|
stripes[role].write_end = stripes[role].req_end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int role = 0; role < pg_minsize; role++)
|
for (int role = 0; role < pg_minsize; role++)
|
||||||
|
@ -251,7 +253,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
|
||||||
stripe.read_start < wr_end)
|
stripe.read_start < wr_end)
|
||||||
{
|
{
|
||||||
os = std::max(stripe.read_start, wr_start);
|
os = std::max(stripe.read_start, wr_start);
|
||||||
oe = std::min(stripe.req_end, wr_end);
|
oe = std::min(stripe.read_end, wr_end);
|
||||||
}
|
}
|
||||||
if (ne && (!oe || ns <= os))
|
if (ne && (!oe || ns <= os))
|
||||||
{
|
{
|
||||||
|
|
|
@ -1,13 +1,6 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "osd_rmw.cpp"
|
#include "osd_rmw.cpp"
|
||||||
|
#include "test_pattern.h"
|
||||||
#define PATTERN0 0x8c4641acc762840e
|
|
||||||
#define PATTERN1 0x70a549add9a2280a
|
|
||||||
#define PATTERN2 0xffe3bad5f578a78e
|
|
||||||
#define PATTERN3 0x426bd7854eb08509
|
|
||||||
|
|
||||||
#define set_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { *(uint64_t*)((void*)buf + i) = pattern; }
|
|
||||||
#define check_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { assert(*(uint64_t*)(buf + i) == pattern); }
|
|
||||||
|
|
||||||
int main(int narg, char *args[])
|
int main(int narg, char *args[])
|
||||||
{
|
{
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
|
|
||||||
#include "osd_ops.h"
|
#include "osd_ops.h"
|
||||||
#include "rw_blocking.h"
|
#include "rw_blocking.h"
|
||||||
|
#include "test_pattern.h"
|
||||||
|
|
||||||
int connect_osd(const char *osd_address, int osd_port);
|
int connect_osd(const char *osd_address, int osd_port);
|
||||||
|
|
||||||
|
@ -22,11 +23,9 @@ uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ve
|
||||||
|
|
||||||
void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
|
void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
|
||||||
|
|
||||||
bool check_pattern(void *buf, uint64_t offset, uint64_t len, uint64_t pattern);
|
void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len, uint64_t pattern);
|
||||||
|
|
||||||
#define PATTERN0 0x8c4641acc762840e
|
void test_sync_stab_all(int connect_fd);
|
||||||
#define PATTERN1 0x70a549add9a2280a
|
|
||||||
#define PATTERN2 (PATTERN0 ^ PATTERN1)
|
|
||||||
|
|
||||||
int main0(int narg, char *args[])
|
int main0(int narg, char *args[])
|
||||||
{
|
{
|
||||||
|
@ -39,7 +38,32 @@ int main0(int narg, char *args[])
|
||||||
test_write(connect_fd, 2, 1, 1, PATTERN1);
|
test_write(connect_fd, 2, 1, 1, PATTERN1);
|
||||||
close(connect_fd);
|
close(connect_fd);
|
||||||
connect_fd = connect_osd("127.0.0.1", 11205);
|
connect_fd = connect_osd("127.0.0.1", 11205);
|
||||||
test_write(connect_fd, 2, 2, 1, PATTERN2);
|
test_write(connect_fd, 2, 2, 1, PATTERN0^PATTERN1);
|
||||||
|
close(connect_fd);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main1(int narg, char *args[])
|
||||||
|
{
|
||||||
|
int connect_fd;
|
||||||
|
void *data;
|
||||||
|
// Cluster read
|
||||||
|
connect_fd = connect_osd("127.0.0.1", 11203);
|
||||||
|
data = test_primary_read(connect_fd, 2, 0, 128*1024);
|
||||||
|
if (data)
|
||||||
|
{
|
||||||
|
check_pattern(data, 128*1024, PATTERN0);
|
||||||
|
printf("inode=2 0-128K OK\n");
|
||||||
|
free(data);
|
||||||
|
}
|
||||||
|
data = test_primary_read(connect_fd, 2, 0, 256*1024);
|
||||||
|
if (data)
|
||||||
|
{
|
||||||
|
check_pattern(data, 128*1024, PATTERN0);
|
||||||
|
check_pattern(data+128*1024, 128*1024, PATTERN1);
|
||||||
|
printf("inode=2 0-256K OK\n");
|
||||||
|
free(data);
|
||||||
|
}
|
||||||
close(connect_fd);
|
close(connect_fd);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -47,21 +71,24 @@ int main0(int narg, char *args[])
|
||||||
int main(int narg, char *args[])
|
int main(int narg, char *args[])
|
||||||
{
|
{
|
||||||
int connect_fd;
|
int connect_fd;
|
||||||
void *data;
|
// Cluster write (sync not implemented yet)
|
||||||
// Cluster read
|
|
||||||
connect_fd = connect_osd("127.0.0.1", 11203);
|
connect_fd = connect_osd("127.0.0.1", 11203);
|
||||||
data = test_primary_read(connect_fd, 2, 0, 128*1024);
|
test_primary_write(connect_fd, 2, 0, 128*1024, PATTERN0);
|
||||||
if (data && check_pattern(data, 0, 128*1024, PATTERN0))
|
test_primary_write(connect_fd, 2, 128*1024, 128*1024, PATTERN1);
|
||||||
printf("inode=2 0-128K OK\n");
|
test_sync_stab_all(connect_fd);
|
||||||
if (data)
|
|
||||||
free(data);
|
|
||||||
data = test_primary_read(connect_fd, 2, 0, 256*1024);
|
|
||||||
if (data && check_pattern(data, 0, 128*1024, PATTERN0) &&
|
|
||||||
check_pattern(data, 128*1024, 128*1024, PATTERN1))
|
|
||||||
printf("inode=2 0-256K OK\n");
|
|
||||||
if (data)
|
|
||||||
free(data);
|
|
||||||
close(connect_fd);
|
close(connect_fd);
|
||||||
|
connect_fd = connect_osd("127.0.0.1", 11204);
|
||||||
|
if (connect_fd >= 0)
|
||||||
|
{
|
||||||
|
test_sync_stab_all(connect_fd);
|
||||||
|
close(connect_fd);
|
||||||
|
}
|
||||||
|
connect_fd = connect_osd("127.0.0.1", 11205);
|
||||||
|
if (connect_fd >= 0)
|
||||||
|
{
|
||||||
|
test_sync_stab_all(connect_fd);
|
||||||
|
close(connect_fd);
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -182,15 +209,33 @@ void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool check_pattern(void *buf, uint64_t offset, uint64_t len, uint64_t pattern)
|
void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len, uint64_t pattern)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < len/sizeof(uint64_t); i++)
|
osd_any_op_t op;
|
||||||
{
|
osd_any_reply_t reply;
|
||||||
if (((uint64_t*)(buf+offset))[i] != pattern)
|
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
|
||||||
{
|
op.hdr.id = 1;
|
||||||
printf("(result + %lu bytes = %lx) != %lx\n", i*sizeof(uint64_t)+offset, ((uint64_t*)buf+offset)[i], pattern);
|
op.hdr.opcode = OSD_OP_WRITE;
|
||||||
return false;
|
op.rw.inode = inode;
|
||||||
}
|
op.rw.offset = offset;
|
||||||
}
|
op.rw.len = len;
|
||||||
return true;
|
void *data = memalign(512, len);
|
||||||
|
set_pattern(data, len, pattern);
|
||||||
|
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
|
||||||
|
write_blocking(connect_fd, data, len);
|
||||||
|
free(data);
|
||||||
|
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
|
||||||
|
assert(check_reply(r, op, reply, len));
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_sync_stab_all(int connect_fd)
|
||||||
|
{
|
||||||
|
osd_any_op_t op;
|
||||||
|
osd_any_reply_t reply;
|
||||||
|
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
|
||||||
|
op.hdr.id = 1;
|
||||||
|
op.hdr.opcode = OSD_OP_TEST_SYNC_STAB_ALL;
|
||||||
|
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
|
||||||
|
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
|
||||||
|
assert(check_reply(r, op, reply, 0));
|
||||||
}
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#define PATTERN0 0x8c4641acc762840e
|
||||||
|
#define PATTERN1 0x70a549add9a2280a
|
||||||
|
#define PATTERN2 0xffe3bad5f578a78e
|
||||||
|
#define PATTERN3 0x426bd7854eb08509
|
||||||
|
|
||||||
|
#define set_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { *(uint64_t*)((void*)buf + i) = pattern; }
|
||||||
|
#define check_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { assert(*(uint64_t*)(buf + i) == pattern); }
|
Loading…
Reference in New Issue