diff --git a/src/blockstore.h b/src/blockstore.h index 05dd9dbd..6d5f0f5f 100644 --- a/src/blockstore.h +++ b/src/blockstore.h @@ -64,6 +64,10 @@ Input: - offset, len = offset and length within object. length may be zero, in that case read operation only returns the version / write operation only bumps the version - buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0. +- bitmap = entry_attr_size bytes of arbitrary data stored for each object in the metadata area. + when it fits into the pointer size, it should be passed as this field's value. + when it doesn't fit, this field should be a pointer to that piece of data. + named "bitmap" because it's used for the "external bitmap" in Vitastor. Output: - retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC) @@ -141,6 +145,7 @@ struct blockstore_op_t uint32_t offset; uint32_t len; void *buf; + void *bitmap; int retval; uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE]; diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index ee573954..b27d7a40 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -428,7 +428,7 @@ resume_1: { new_clean_bitmap = (bs->inmemory_meta ?
meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry) - : bs->clean_bitmap + (clean_loc >> bs->block_order)*bs->clean_entry_bitmap_size); + : bs->clean_bitmap + (clean_loc >> bs->block_order)*(bs->clean_entry_bitmap_size + bs->entry_attr_size)); if (clean_init_bitmap) { memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size); @@ -473,6 +473,7 @@ resume_1: wait_state = 5; return false; } + // zero out old metadata entry memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size); await_sqe(15); data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size }; @@ -509,6 +510,12 @@ resume_1: { memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size); } + if (bs->entry_attr_size) + { + // copy latest external bitmap/attributes + void *bmp_ptr = bs->entry_attr_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap; + memcpy((void*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->entry_attr_size); + } } await_sqe(6); data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size }; diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index 8503f81c..035ca280 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -77,7 +77,8 @@ #include "blockstore_journal.h" -// 24 bytes + block bitmap per "clean" entry on disk with fixed metadata tables +// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default) +// per "clean" entry on disk with fixed metadata tables // FIXME: maybe add crc32's to metadata struct __attribute__((__packed__)) clean_disk_entry { @@ -93,7 +94,7 @@ struct __attribute__((__packed__)) clean_entry uint64_t location; }; -// 56 = 24 + 32 bytes per dirty entry in memory (obj_ver_id => dirty_entry) +// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry) struct __attribute__((__packed__)) dirty_entry { uint32_t state; @@ -102,6 +103,7 @@ struct __attribute__((__packed__)) dirty_entry 
uint32_t offset; // data offset within object (stripe) uint32_t len; // data length uint64_t journal_sector; // journal sector used for this entry + void* bitmap; // either external bitmap itself when it fits, or a pointer to it when it doesn't }; // - Sync must be submitted after previous writes/deletes (not before!) @@ -216,7 +218,7 @@ class blockstore_impl_t uint32_t block_order; uint64_t block_count; - uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0; + uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, entry_attr_size = 0; int meta_fd; int data_fd; @@ -250,6 +252,7 @@ class blockstore_impl_t void open_data(); void open_meta(); void open_journal(); + uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset); // Asynchronous init int initialized; diff --git a/src/blockstore_init.cpp b/src/blockstore_init.cpp index 3978112e..406d8eed 100644 --- a/src/blockstore_init.cpp +++ b/src/blockstore_init.cpp @@ -98,9 +98,9 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo for (unsigned i = 0; i < count; i++) { clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size); - if (!bs->inmemory_meta && bs->clean_entry_bitmap_size) + if (!bs->inmemory_meta && (bs->clean_entry_bitmap_size || bs->entry_attr_size)) { - memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size); + memcpy(bs->clean_bitmap + (done_cnt+i)*(bs->clean_entry_bitmap_size + bs->entry_attr_size), &entry->bitmap, (bs->clean_entry_bitmap_size + bs->entry_attr_size)); } if (entry->oid.inode > 0) { @@ -545,6 +545,21 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u .oid = je->small_write.oid, .version = je->small_write.version, }; + void *bmp = (void*)je + sizeof(journal_entry_small_write); + if (bs->entry_attr_size <= sizeof(void*)) + { + memcpy(&bmp, bmp, bs->entry_attr_size); + } + else if (!bs->journal.inmemory) + { + // FIXME Using large 
blockstore objects and not keeping journal in memory + // will result in a lot of small allocations for entry bitmaps. This can + // only be fixed by using a patched map with dynamic entry size, but not + // the btree_map, because it doesn't keep iterators valid all the time. + void *bmp_cp = malloc_or_die(bs->entry_attr_size); + memcpy(bmp_cp, bmp, bs->entry_attr_size); + bmp = bmp_cp; + } bs->dirty_db.emplace(ov, (dirty_entry){ .state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED), .flags = 0, @@ -552,6 +567,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u .offset = je->small_write.offset, .len = je->small_write.len, .journal_sector = proc_pos, + .bitmap = bmp, }); bs->journal.used_sectors[proc_pos]++; #ifdef BLOCKSTORE_DEBUG @@ -609,6 +625,21 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u .oid = je->big_write.oid, .version = je->big_write.version, }; + void *bmp = (void*)je + sizeof(journal_entry_big_write); + if (bs->entry_attr_size <= sizeof(void*)) + { + memcpy(&bmp, bmp, bs->entry_attr_size); + } + else if (!bs->journal.inmemory) + { + // FIXME Using large blockstore objects and not keeping journal in memory + // will result in a lot of small allocations for entry bitmaps. This can + // only be fixed by using a patched map with dynamic entry size, but not + // the btree_map, because it doesn't keep iterators valid all the time. 
+ void *bmp_cp = malloc_or_die(bs->entry_attr_size); + memcpy(bmp_cp, bmp, bs->entry_attr_size); + bmp = bmp_cp; + } auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){ .state = (BS_ST_BIG_WRITE | BS_ST_SYNCED), .flags = 0, @@ -616,6 +647,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u .offset = je->big_write.offset, .len = je->big_write.len, .journal_sector = proc_pos, + .bitmap = bmp, }).first; if (bs->data_alloc->get(je->big_write.location >> bs->block_order)) { diff --git a/src/blockstore_journal.h b/src/blockstore_journal.h index b6600447..aadc01a6 100644 --- a/src/blockstore_journal.h +++ b/src/blockstore_journal.h @@ -54,6 +54,9 @@ struct __attribute__((__packed__)) journal_entry_small_write // data_offset is its offset within journal uint64_t data_offset; uint32_t crc32_data; + // small_write and big_write entries are followed by the "external" bitmap + // its size is dynamic and is accounted for in the journal entry's size field + uint8_t bitmap[]; }; struct __attribute__((__packed__)) journal_entry_big_write @@ -68,6 +71,9 @@ struct __attribute__((__packed__)) journal_entry_big_write uint32_t offset; uint32_t len; uint64_t location; + // small_write and big_write entries are followed by the "external" bitmap + // its size is dynamic and is accounted for in the journal entry's size field + uint8_t bitmap[]; }; struct __attribute__((__packed__)) journal_entry_stable diff --git a/src/blockstore_open.cpp b/src/blockstore_open.cpp index 4c70dac5..eaebeaf9 100644 --- a/src/blockstore_open.cpp +++ b/src/blockstore_open.cpp @@ -62,6 +62,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10); meta_device = config["meta_device"]; meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10); + entry_attr_size = strtoull(config["entry_attr_size"].c_str(), NULL, 10); block_size = strtoull(config["block_size"].c_str(), NULL, 10); inmemory_meta = config["inmemory_metadata"] !=
"false"; journal_device = config["journal_device"]; @@ -106,7 +107,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) } else if (disk_alignment % MEM_ALIGNMENT) { - throw std::runtime_error("disk_alingment must be a multiple of "+std::to_string(MEM_ALIGNMENT)); + throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT)); } if (!journal_block_size) { @@ -182,7 +183,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) } // init some fields clean_entry_bitmap_size = block_size / bitmap_granularity / 8; - clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size; + clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size + entry_attr_size; journal.block_size = journal_block_size; journal.next_free = journal_block_size; journal.used_start = journal_block_size; @@ -247,9 +248,9 @@ void blockstore_impl_t::calc_lengths() if (!metadata_buffer) throw std::runtime_error("Failed to allocate memory for the metadata"); } - else if (clean_entry_bitmap_size) + else if (clean_entry_bitmap_size || entry_attr_size) { - clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size); + clean_bitmap = (uint8_t*)malloc(block_count * (clean_entry_bitmap_size + entry_attr_size)); if (!clean_bitmap) throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap"); } diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp index 4b84449d..c0d79576 100644 --- a/src/blockstore_read.cpp +++ b/src/blockstore_read.cpp @@ -94,6 +94,21 @@ endwhile: return 1; } +uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset) +{ + uint8_t *clean_entry_bitmap; + uint64_t meta_loc = block_loc >> block_order; + if (inmemory_meta) + { + uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size; + uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size)); + clean_entry_bitmap = (uint8_t*)(metadata_buffer + 
sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset); + } + else + clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*(clean_entry_bitmap_size + entry_attr_size) + offset); + return clean_entry_bitmap; +} + int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) { auto clean_it = clean_db.find(read_op->oid); @@ -134,6 +149,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) if (!result_version) { result_version = dirty_it->first.version; + if (entry_attr_size <= sizeof(void*)) + read_op->bitmap = dirty_it->second.bitmap; + else if (read_op->bitmap) + memcpy(read_op->bitmap, dirty_it->second.bitmap, entry_attr_size); } if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len, dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset))) @@ -155,6 +174,11 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) if (!result_version) { result_version = clean_it->second.version; + void *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size); + if (entry_attr_size <= sizeof(void*)) + memcpy(&read_op->bitmap, clean_entry_bitmap, entry_attr_size); + else if (read_op->bitmap) + memcpy(read_op->bitmap, clean_entry_bitmap, entry_attr_size); } if (fulfilled < read_op->len) { @@ -169,18 +193,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) } else { - uint64_t meta_loc = clean_it->second.location >> block_order; - uint8_t *clean_entry_bitmap; - if (inmemory_meta) - { - uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size; - uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size)); - clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry)); - } - else - { - clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size); - } + uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 
0); uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity; while (bmp_start < bmp_size) { diff --git a/src/blockstore_rollback.cpp b/src/blockstore_rollback.cpp index d5037b09..da49aaed 100644 --- a/src/blockstore_rollback.cpp +++ b/src/blockstore_rollback.cpp @@ -268,6 +268,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, { journal.used_sectors.erase(dirty_it->second.journal_sector); } + if (entry_attr_size > sizeof(void*)) + { + free(dirty_it->second.bitmap); + dirty_it->second.bitmap = NULL; + } if (dirty_it == dirty_start) { break; diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index a2c62f6e..83a0e81e 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -95,6 +95,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) #endif // FIXME No strict need to add it into dirty_db here, it's just left // from the previous implementation where reads waited for writes + void *bmp = NULL; uint32_t state; if (is_del) state = BS_ST_DELETE | BS_ST_IN_FLIGHT; @@ -109,6 +110,14 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) state |= BS_ST_IN_FLIGHT; if (op->opcode == BS_OP_WRITE_STABLE) state |= BS_ST_INSTANT; + if (entry_attr_size > sizeof(void*)) + { + bmp = calloc_or_die(1, entry_attr_size); + if (op->bitmap) + memcpy(bmp, op->bitmap, entry_attr_size); + } + else + bmp = op->bitmap; } dirty_db.emplace((obj_ver_id){ .oid = op->oid, @@ -120,6 +129,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) .offset = is_del ? 0 : op->offset, .len = is_del ? 
0 : op->len, .journal_sector = 0, + .bitmap = bmp, }); return true; } @@ -128,6 +138,8 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_ { while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid) { + if (entry_attr_size > sizeof(void*)) + free(dirty_it->second.bitmap); dirty_db.erase(dirty_it++); } bool found = false; @@ -305,7 +317,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) // Then pre-fill journal entry journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry( journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE, - sizeof(journal_entry_small_write) + sizeof(journal_entry_small_write) + entry_attr_size ); dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; @@ -324,6 +336,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) je->len = op->len; je->data_offset = journal.next_free; je->crc32_data = crc32c(0, op->buf, op->len); + memcpy((void*)(je+1), (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), entry_attr_size); je->crc32 = je_crc32((journal_entry*)je); journal.crc32_last = je->crc32; if (immediate_commit != IMMEDIATE_NONE) @@ -396,7 +409,7 @@ resume_2: BS_SUBMIT_GET_SQE_DECL(sqe); je = (journal_entry_big_write*)prefill_single_journal_entry( journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE, - sizeof(journal_entry_big_write) + sizeof(journal_entry_big_write) + entry_attr_size ); dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; @@ -412,6 +425,7 @@ resume_2: je->offset = op->offset; je->len = op->len; je->location = dirty_it->second.location; + memcpy((void*)(je+1), (entry_attr_size > sizeof(void*) ? 
dirty_it->second.bitmap : &dirty_it->second.bitmap), entry_attr_size); je->crc32 = je_crc32((journal_entry*)je); journal.crc32_last = je->crc32; prepare_journal_sector_write(journal, journal.cur_sector, sqe, diff --git a/src/osd_rmw.cpp b/src/osd_rmw.cpp index 02ef23e4..1796b9ca 100644 --- a/src/osd_rmw.cpp +++ b/src/osd_rmw.cpp @@ -215,7 +215,7 @@ int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size }); if (dec_it == matrix->decodings.end()) { - int *dm_ids = (int*)malloc(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size)); + int *dm_ids = (int*)malloc_or_die(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size)); int *decoding_matrix = dm_ids + pg_minsize; if (!dm_ids) throw std::bad_alloc();