diff --git a/src/blockstore.h b/src/blockstore.h index b99f6f0e..c0b24a7d 100644 --- a/src/blockstore.h +++ b/src/blockstore.h @@ -65,10 +65,9 @@ Input: - offset, len = offset and length within object. length may be zero, in that case read operation only returns the version / write operation only bumps the version - buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0. -- bitmap = <entry_attr_size> bytes long arbitrary data stored for each object in the metadata area. - when fits into pointer size, it should be passed as this field's value. - when it doesn't fit, this field should be a pointer to that piece of data. - named "bitmap" because it's used for the "external bitmap" in Vitastor. +- bitmap = pointer to <entry_attr_size> bytes long (usually very short) arbitrary data + stored for each object in the metadata area. + Called "bitmap" because it's used for the "external bitmap" in Vitastor. Output: - retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC) diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp index c0d79576..4747709a 100644 --- a/src/blockstore_read.cpp +++ b/src/blockstore_read.cpp @@ -149,10 +149,11 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) if (!result_version) { result_version = dirty_it->first.version; - if (entry_attr_size <= sizeof(void*)) - read_op->bitmap = dirty_it->second.bitmap; - else if (read_op->bitmap) - memcpy(read_op->bitmap, dirty_it->second.bitmap, entry_attr_size); + if (read_op->bitmap) + { + void *bmp_ptr = (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap); + memcpy(read_op->bitmap, bmp_ptr, entry_attr_size); + } } if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len, dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 
0 : dirty.offset))) @@ -174,11 +175,11 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) if (!result_version) { result_version = clean_it->second.version; - void *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size); - if (entry_attr_size <= sizeof(void*)) - memcpy(&read_op->bitmap, clean_entry_bitmap, entry_attr_size); - else if (read_op->bitmap) - memcpy(read_op->bitmap, clean_entry_bitmap, entry_attr_size); + if (read_op->bitmap) + { + void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size); + memcpy(read_op->bitmap, bmp_ptr, entry_attr_size); + } } if (fulfilled < read_op->len) { diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index 83a0e81e..3f840f2b 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -111,13 +111,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) if (op->opcode == BS_OP_WRITE_STABLE) state |= BS_ST_INSTANT; if (entry_attr_size > sizeof(void*)) - { bmp = calloc_or_die(1, entry_attr_size); - if (op->bitmap) - memcpy(bmp, op->bitmap, entry_attr_size); - } - else - bmp = op->bitmap; + if (op->bitmap) + memcpy((entry_attr_size > sizeof(void*) ? 
bmp : &bmp), op->bitmap, entry_attr_size); } dirty_db.emplace((obj_ver_id){ .oid = op->oid, diff --git a/src/msgr_op.h b/src/msgr_op.h index 96612297..b2d268f1 100644 --- a/src/msgr_op.h +++ b/src/msgr_op.h @@ -162,6 +162,7 @@ struct osd_op_t blockstore_op_t *bs_op = NULL; void *buf = NULL; void *bitmap = NULL; + unsigned bmp_data = 0; void *rmw_buf = NULL; osd_primary_op_data_t* op_data = NULL; std::function callback; diff --git a/src/msgr_receive.cpp b/src/msgr_receive.cpp index 20fd085a..6f0d1188 100644 --- a/src/msgr_receive.cpp +++ b/src/msgr_receive.cpp @@ -209,13 +209,11 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl) { if (cur_op->req.sec_rw.attr_len > 0) { - if (cur_op->req.sec_rw.attr_len > sizeof(void*)) - { + if (cur_op->req.sec_rw.attr_len > sizeof(unsigned)) cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len); - cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len); - } else - cl->recv_list.push_back(&cur_op->bitmap, cur_op->req.sec_rw.attr_len); + cur_op->bitmap = &cur_op->bmp_data; + cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len); } if (cur_op->req.sec_rw.len > 0) { diff --git a/src/msgr_send.cpp b/src/msgr_send.cpp index c3cdefdf..d5c8090d 100644 --- a/src/msgr_send.cpp +++ b/src/msgr_send.cpp @@ -66,7 +66,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op) cur_op->req.sec_rw.attr_len > 0) { to_send_list.push_back((iovec){ - .iov_base = (cur_op->reply.sec_rw.attr_len > sizeof(void*) ? 
cur_op->bitmap : &cur_op->bitmap), + .iov_base = cur_op->bitmap, .iov_len = cur_op->reply.sec_rw.attr_len, }); to_outbox.push_back(NULL); diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index 1bcf60df..c7da0fa9 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -2,6 +2,7 @@ // License: VNPL-1.1 (see README.md for details) #include "osd_primary.h" +#include "allocator.h" // read: read directly or read paired stripe(s), reconstruct, return // write: read paired stripe(s), reconstruct, modify, calculate parity, write @@ -51,7 +52,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op) return false; } osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die( - 1, sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size) + 1, sizeof(osd_primary_op_data_t) + entry_attr_size + + sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size) ); op_data->pg_num = pg_num; op_data->oid = oid; @@ -115,7 +117,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op) if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED) { // Fast happy-path - cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0, 0); + cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0, entry_attr_size); submit_primary_subops(SUBMIT_READ, op_data->target_ver, (op_data->scheme == POOL_SCHEME_REPLICATED ? 
pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op); op_data->st = 1; @@ -133,7 +135,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op) op_data->pg_size = pg.pg_size; op_data->scheme = pg.scheme; op_data->degraded = 1; - cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0, 0); + cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0, entry_attr_size); submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op); op_data->st = 1; } @@ -152,11 +154,11 @@ resume_2: osd_rmw_stripe_t *stripes = op_data->stripes; if (op_data->scheme == POOL_SCHEME_XOR) { - reconstruct_stripes_xor(stripes, op_data->pg_size, 0); + reconstruct_stripes_xor(stripes, op_data->pg_size, entry_attr_size); } else if (op_data->scheme == POOL_SCHEME_JERASURE) { - reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, 0); + reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, entry_attr_size); } for (int role = 0; role < op_data->pg_size; role++) { diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 47aa0a0c..0a81ebf4 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -145,6 +145,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start, .buf = wr ? 
stripes[stripe_num].write_buf : stripes[stripe_num].read_buf, + .bitmap = stripes[stripe_num].bmp_buf, }); #ifdef OSD_DEBUG printf( @@ -159,6 +160,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s { subops[i].op_type = OSD_OP_OUT; subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num); + subops[i].bitmap = stripes[stripe_num].bmp_buf; subops[i].req.sec_rw = { .header = { .magic = SECONDARY_OSD_OP_MAGIC, @@ -172,6 +174,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s .version = op_version, .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start, + .attr_len = entry_attr_size, }; #ifdef OSD_DEBUG printf( diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index e984b219..0b242182 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -65,6 +65,7 @@ resume_1: op_data->stripes[0].write_start = op_data->stripes[0].req_start; op_data->stripes[0].write_end = op_data->stripes[0].req_end; op_data->stripes[0].write_buf = cur_op->buf; + op_data->stripes[0].bmp_buf = (void*)(op_data->stripes+1); if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 || op_data->stripes[0].write_end != bs_block_size)) { @@ -77,7 +78,7 @@ resume_1: else { cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set, - pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, 0); + pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, entry_attr_size); if (!cur_op->rmw_buf) { // Refuse partial overwrite of an incomplete object @@ -98,7 +99,9 @@ resume_3: } if (op_data->scheme == POOL_SCHEME_REPLICATED) { - // Only (possibly) copy new data from the request into the recovery buffer + // Set bitmap bits + 
bitmap_set(op_data->stripes[0].bmp_buf, op_data->stripes[0].write_start, op_data->stripes[0].write_end, bs_bitmap_granularity); + // Possibly copy new data from the request into the recovery buffer if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 || op_data->stripes[0].write_end != bs_block_size)) { @@ -120,11 +123,11 @@ resume_3: // Recover missing stripes, calculate parity if (pg.scheme == POOL_SCHEME_XOR) { - calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, 0); + calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, entry_attr_size); } else if (pg.scheme == POOL_SCHEME_JERASURE) { - calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, 0); + calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, entry_attr_size); } } // Send writes diff --git a/src/osd_rmw.cpp b/src/osd_rmw.cpp index 1796b9ca..0000d83f 100644 --- a/src/osd_rmw.cpp +++ b/src/osd_rmw.cpp @@ -332,6 +332,7 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad } } // Bitmaps are allocated in the end so data buffers remain aligned + // FIXME: Don't allocate bitmaps here because it probably increases memory fragmentation if (bitmap_size > 0) { for (int role = 0; role < read_pg_size; role++) diff --git a/src/osd_secondary.cpp b/src/osd_secondary.cpp index e25850db..bb596612 100644 --- a/src/osd_secondary.cpp +++ b/src/osd_secondary.cpp @@ -19,11 +19,6 @@ void osd_t::secondary_op_callback(osd_op_t *op) } if (op->req.hdr.opcode == OSD_OP_SEC_READ) { - if (entry_attr_size > 0) - { - op->reply.sec_rw.attr_len = entry_attr_size; - op->iov.push_back((entry_attr_size > sizeof(void*) ? 
op->bitmap : &op->bs_op->bitmap), entry_attr_size); - } if (op->bs_op->retval > 0) { op->iov.push_back(op->buf, op->bs_op->retval); @@ -65,8 +60,10 @@ void osd_t::exec_secondary(osd_op_t *cur_op) if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ) { // Allocate memory for the read operation - if (entry_attr_size > sizeof(void*)) + if (entry_attr_size > sizeof(unsigned)) cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(entry_attr_size); + else + cur_op->bitmap = &cur_op->bmp_data; if (cur_op->req.sec_rw.len > 0) cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len); }