diff --git a/src/cli_snap_rm.cpp b/src/cli_snap_rm.cpp index 0e5eba18..0cd40156 100644 --- a/src/cli_snap_rm.cpp +++ b/src/cli_snap_rm.cpp @@ -6,10 +6,21 @@ #include "base64.h" // Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets -// If the requested snapshot chain has only 1 child and --writers-stopped is specified -// then that child can be merged "down" into the snapshot chain. -// Otherwise we iterate over all children of the chain, merge removed parents into them, -// and delete children afterwards. +// +// Exactly one child of the requested layers may be merged using the "inverted" workflow, +// where we merge it "down" into one of the "to-be-removed" layers and then rename the +// "to-be-removed" layer to the child. It may be done either if all writers are stopped +// before trying to delete layers (which is signaled by --writers-stopped) or if that child +// is a read-only layer (snapshot) itself. +// +// This "inverted" workflow trades copying data of one of the deleted layers for copying +// data of one child of the chain which is also a child of the "traded" layer. So we +// choose the (parent,child) pair which has the largest difference between "parent" and +// "child" inode sizes. +// +// All other children of the chain are processed by iterating though them, merging removed +// parents into them and rebasing them to the last layer which isn't a member of the removed +// chain. // // Example: // @@ -18,19 +29,17 @@ // \ \- // \- // -// 1) Merge .. to -// 2) Set parent to -// 3) Variant #1, trickier, beneficial when has less data than -// (not implemented yet): -// - Merge .. to -// - Rename to -// It can be done without extra precautions if is a read-only layer itself -// Otherwise it should be either done offline or by pausing writers -// - is now deleted, repeat deletion with .. -// 4) Variant #2, simple: -// - Repeat 1-2 with -// - Delete -// 5) Process all other children +// 1) Find optimal pair for the "reverse" scenario +// Imagine that it's (, ) in this example +// 2) Process all children except : +// - Merge .. to +// - Set parent to +// - Repeat for others +// 3) Process : +// - Merge .. to +// - Set parent to +// - Rename to +// 4) Delete other layers of the chain (, ) struct snap_remover_t { cli_tool_t *parent; @@ -44,13 +53,123 @@ struct snap_remover_t // interval between fsyncs int fsync_interval = 128; + std::map sources; + std::map inode_used; std::vector merge_children; std::vector chain_list; + std::map inverse_candidates; + inode_t inverse_parent = 0, inverse_child = 0; inode_t new_parent = 0; int state = 0; int current_child = 0; std::function cb; + bool is_done() + { + return state == 9; + } + + void loop() + { + if (state == 1) + goto resume_1; + else if (state == 2) + goto resume_2; + else if (state == 3) + goto resume_3; + else if (state == 4) + goto resume_4; + else if (state == 5) + goto resume_5; + else if (state == 6) + goto resume_6; + else if (state == 7) + goto resume_7; + else if (state == 8) + goto resume_8; + else if (state == 9) + goto resume_9; + // Get children to merge + get_merge_children(); + // Try to select an inode for the "inverse" optimized scenario + // Read statistics from etcd to do it + read_stats(); + state = 1; +resume_1: + if (parent->waiting > 0) + return; + choose_inverse_candidate(); + // Merge children one by one, except our "inverse" child + for (current_child = 0; current_child < merge_children.size(); current_child++) + { + if (merge_children[current_child] == inverse_child) + continue; + start_merge_child(merge_children[current_child], merge_children[current_child]); +resume_2: + while (!cb()) + { + state = 2; + return; + } + cb = NULL; + parent->change_parent(merge_children[current_child], new_parent); + state = 3; +resume_3: + if (parent->waiting > 0) + return; + } + // Merge our "inverse" child into our "inverse" parent + if (inverse_child != 0) + { + start_merge_child(inverse_child, inverse_parent); +resume_4: + while (!cb()) + { + state = 4; + return; + } + cb = NULL; + // Delete "inverse" child data + start_delete_source(inverse_child); +resume_5: + while (!cb()) + { + state = 5; + return; + } + cb = NULL; + // Delete "inverse" child metadata and rename parent over it + rename_inverse_parent(); + state = 6; +resume_6: + if (parent->waiting > 0) + return; + } + // Delete parents, except the "inverse" one + for (current_child = 0; current_child < chain_list.size(); current_child++) + { + if (chain_list[current_child] == inverse_parent) + continue; + start_delete_source(chain_list[current_child]); +resume_7: + while (!cb()) + { + state = 7; + return; + } + cb = NULL; + delete_inode_config(chain_list[current_child]); + state = 8; +resume_8: + if (parent->waiting > 0) + return; + } + state = 9; +resume_9: + // Done + return; + } + void get_merge_children() { // Get all children of from..to @@ -78,7 +197,6 @@ struct snap_remover_t } new_parent = from_cfg->parent_id; // Calculate ranks - std::map sources; int i = chain_list.size()-1; for (inode_t item: chain_list) { @@ -94,67 +212,196 @@ struct snap_remover_t if (it != sources.end() && sources.find(ic.second.num) == sources.end()) { merge_children.push_back(ic.second.num); + if (ic.second.readonly || writers_stopped) + { + inverse_candidates[ic.second.num] = it->second; + } } } } - bool is_done() + void read_stats() { - return state == 5; + if (inverse_candidates.size() == 0) + { + return; + } + json11::Json::array reads; + for (auto cp: inverse_candidates) + { + inode_t inode = cp.first; + reads.push_back(json11::Json::object { + { "request_range", json11::Json::object { + { "key", base64_encode( + parent->cli->st_cli.etcd_prefix+ + "/inode/stats/"+std::to_string(INODE_POOL(inode))+ + "/"+std::to_string(INODE_NO_POOL(inode)) + ) }, + } } + }); + } + for (auto cp: sources) + { + inode_t inode = cp.first; + reads.push_back(json11::Json::object { + { "request_range", json11::Json::object { + { "key", base64_encode( + parent->cli->st_cli.etcd_prefix+ + "/inode/stats/"+std::to_string(INODE_POOL(inode))+ + "/"+std::to_string(INODE_NO_POOL(inode)) + ) }, + } } + }); + } + parent->waiting++; + parent->cli->st_cli.etcd_txn(json11::Json::object { + { "success", reads }, + }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data) + { + parent->waiting--; + if (err != "") + { + fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str()); + exit(1); + } + for (auto inode_result: data["responses"].array_items()) + { + auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]); + pool_id_t pool_id = 0; + inode_t inode = 0; + char null_byte = 0; + sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte); + if (!inode || null_byte != 0) + { + fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str()); + exit(1); + } + auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id); + if (pool_cfg_it == parent->cli->st_cli.pool_config.end()) + { + fprintf(stderr, "Pool %u does not exist\n", pool_id); + exit(1); + } + inode = INODE_WITH_POOL(pool_id, inode); + auto & pool_cfg = pool_cfg_it->second; + uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size; + if (pool_cfg.scheme != POOL_SCHEME_REPLICATED) + { + used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks); + } + inode_used[inode] = used_bytes; + } + parent->ringloop->wakeup(); + }); } - void loop() + void choose_inverse_candidate() { - if (state == 1) - goto resume_1; - else if (state == 2) - goto resume_2; - else if (state == 3) - goto resume_3; - else if (state == 4) - goto resume_4; - else if (state == 5) - goto resume_5; - // Get children to merge - get_merge_children(); - // Merge children one by one - for (current_child = 0; current_child < merge_children.size(); current_child++) + uint64_t max_diff = 0; + for (auto cp: inverse_candidates) { - start_merge_child(); -resume_1: - while (!cb()) + inode_t child = cp.first; + uint64_t child_used = inode_used[child]; + int rank = cp.second; + for (int i = chain_list.size()-rank; i < chain_list.size(); i++) { - state = 1; - return; + inode_t parent = chain_list[i]; + uint64_t parent_used = inode_used[parent]; + if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used))) + { + max_diff = (parent_used-child_used); + inverse_parent = parent; + inverse_child = child; + } } - cb = NULL; - parent->change_parent(merge_children[current_child], new_parent); - state = 2; -resume_2: - if (parent->waiting > 0) - return; } - // Delete sources - for (current_child = 0; current_child < chain_list.size(); current_child++) + } + + void rename_inverse_parent() + { + auto child_it = parent->cli->st_cli.inode_config.find(inverse_child); + if (child_it == parent->cli->st_cli.inode_config.end()) { - start_delete_source(); -resume_3: - while (!cb()) - { - state = 3; - return; - } - cb = NULL; - delete_inode_config(chain_list[current_child]); - state = 4; -resume_4: - if (parent->waiting > 0) - return; + fprintf(stderr, "Inode %ld disappeared\n", inverse_child); + exit(1); } - state = 5; -resume_5: - // Done - return; + auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent); + if (target_it == parent->cli->st_cli.inode_config.end()) + { + fprintf(stderr, "Inode %ld disappeared\n", inverse_parent); + exit(1); + } + inode_config_t *child_cfg = &child_it->second; + inode_config_t *target_cfg = &target_it->second; + std::string child_name = child_cfg->name; + std::string target_name = target_cfg->name; + std::string child_cfg_key = base64_encode( + parent->cli->st_cli.etcd_prefix+ + "/config/inode/"+std::to_string(INODE_POOL(inverse_child))+ + "/"+std::to_string(INODE_NO_POOL(inverse_child)) + ); + std::string target_cfg_key = base64_encode( + parent->cli->st_cli.etcd_prefix+ + "/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+ + "/"+std::to_string(INODE_NO_POOL(inverse_parent)) + ); + json11::Json::object new_cfg = json11::Json::object { + { "name", child_cfg->name }, + { "size", child_cfg->size }, + }; + if (new_parent) + { + if (INODE_POOL(inverse_parent) != INODE_POOL(new_parent)) + new_cfg["parent_pool"] = (uint64_t)INODE_POOL(new_parent); + new_cfg["parent_id"] = (uint64_t)INODE_NO_POOL(new_parent); + } + if (child_cfg->readonly) + { + new_cfg["readonly"] = true; + } + parent->waiting++; + parent->cli->st_cli.etcd_txn(json11::Json::object { + { "compare", json11::Json::array { + json11::Json::object { + { "target", "MOD" }, + { "key", child_cfg_key }, + { "result", "LESS" }, + { "mod_revision", child_cfg->mod_revision+1 }, + }, + json11::Json::object { + { "target", "MOD" }, + { "key", target_cfg_key }, + { "result", "LESS" }, + { "mod_revision", target_cfg->mod_revision+1 }, + }, + } }, + { "success", json11::Json::array { + json11::Json::object { + { "request_delete_range", json11::Json::object { + { "key", child_cfg_key }, + } }, + { "request_put", json11::Json::object { + { "key", target_cfg_key }, + { "value", base64_encode(json11::Json(new_cfg).dump()) }, + } } + }, + } }, + }, ETCD_SLOW_TIMEOUT, [this, target_name, child_name](std::string err, json11::Json res) + { + parent->waiting--; + if (err != "") + { + fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str()); + exit(1); + } + if (!res["succeeded"].bool_value()) + { + fprintf(stderr, "Layer %s or %s configuration was modified during renaming\n", target_name.c_str(), child_name.c_str()); + exit(1); + } + printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str()); + parent->ringloop->wakeup(); + }); } void delete_inode_config(inode_t cur) @@ -184,11 +431,12 @@ resume_5: json11::Json::object { { "request_delete_range", json11::Json::object { { "key", cur_cfg_key }, - } } + } }, }, } }, }, ETCD_SLOW_TIMEOUT, [this, cur_name](std::string err, json11::Json res) { + parent->waiting--; if (err != "") { fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str()); @@ -200,39 +448,44 @@ resume_5: exit(1); } printf("Layer %s deleted\n", cur_name.c_str()); - parent->waiting--; parent->ringloop->wakeup(); }); } - void start_merge_child() + void start_merge_child(inode_t child_inode, inode_t target_inode) { - auto target = parent->cli->st_cli.inode_config.find(merge_children[current_child]); - if (target == parent->cli->st_cli.inode_config.end()) + auto child_it = parent->cli->st_cli.inode_config.find(child_inode); + if (child_it == parent->cli->st_cli.inode_config.end()) { - fprintf(stderr, "Inode %ld disappeared\n", merge_children[current_child]); + fprintf(stderr, "Inode %ld disappeared\n", child_inode); + exit(1); + } + auto target_it = parent->cli->st_cli.inode_config.find(target_inode); + if (target_it == parent->cli->st_cli.inode_config.end()) + { + fprintf(stderr, "Inode %ld disappeared\n", target_inode); exit(1); } cb = parent->start_merge(json11::Json::object { - { "command", json11::Json::array{ "merge", from_name, target->second.name } }, - { "target", target->second.name }, + { "command", json11::Json::array{ "merge", from_name, child_it->second.name } }, + { "target", target_it->second.name }, { "delete-source", false }, { "cas", use_cas }, { "fsync-interval", fsync_interval }, }); } - void start_delete_source() + void start_delete_source(inode_t inode) { - auto source = parent->cli->st_cli.inode_config.find(chain_list[current_child]); + auto source = parent->cli->st_cli.inode_config.find(inode); if (source == parent->cli->st_cli.inode_config.end()) { - fprintf(stderr, "Inode %ld disappeared\n", chain_list[current_child]); + fprintf(stderr, "Inode %ld disappeared\n", inode); exit(1); } cb = parent->start_rm(json11::Json::object { - { "inode", chain_list[current_child] }, - { "pool", (uint64_t)INODE_POOL(chain_list[current_child]) }, + { "inode", inode }, + { "pool", (uint64_t)INODE_POOL(inode) }, { "fsync-interval", fsync_interval }, }); } @@ -259,6 +512,8 @@ std::function cli_tool_t::start_snap_rm(json11::Json cfg) snap_remover->fsync_interval = 128; if (!cfg["cas"].is_null()) snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0; + if (!cfg["writers_stopped"].is_null()) + snap_remover->writers_stopped = true; return [snap_remover]() { snap_remover->loop(); diff --git a/src/osd_id.h b/src/osd_id.h index b14f11d3..a1154eb5 100644 --- a/src/osd_id.h +++ b/src/osd_id.h @@ -10,6 +10,7 @@ #define POOL_ID_BITS 16 #define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS)) #define INODE_NO_POOL(inode) (inode_t)(inode & ((1l << (64-POOL_ID_BITS)) - 1)) +#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode)) // Pool ID is 16 bits long typedef uint32_t pool_id_t;