// Copyright (c) Vitaliy Filippov, 2019+ // License: VNPL-1.1 (see README.md for details) #include #include "cli.h" #include "cluster_client.h" #include "base64.h" // Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets // // Exactly one child of the requested layers may be merged using the "inverted" workflow, // where we merge it "down" into one of the "to-be-removed" layers and then rename the // "to-be-removed" layer to the child. It may be done either if all writers are stopped // before trying to delete layers (which is signaled by --writers-stopped) or if that child // is a read-only layer (snapshot) itself. // // This "inverted" workflow trades copying data of one of the deleted layers for copying // data of one child of the chain which is also a child of the "traded" layer. So we // choose the (parent,child) pair which has the largest difference between "parent" and // "child" inode sizes. // // All other children of the chain are processed by iterating though them, merging removed // parents into them and rebasing them to the last layer which isn't a member of the removed // chain. // // Example: // // - - - - // \ \ \- // \ \- // \- // // 1) Find optimal pair for the "reverse" scenario // Imagine that it's (, ) in this example // 2) Process all children except : // - Merge .. to // - Set parent to // - Repeat for others // 3) Process : // - Merge .. to // - Set parent to // - Rename to // 4) Delete other layers of the chain (, ) struct snap_remover_t { cli_tool_t *parent; // remove from..to std::string from_name, to_name; // writers are stopped, we can safely change writable layers bool writers_stopped = false; // use CAS writes (0 = never, 1 = auto, 2 = always) int use_cas = 1; // interval between fsyncs int fsync_interval = 128; std::map sources; std::map inode_used; std::vector merge_children; std::vector chain_list; std::map inverse_candidates; inode_t inverse_parent = 0, inverse_child = 0; inode_t new_parent = 0; int state = 0; int current_child = 0; std::function cb; bool is_done() { return state == 9; } void loop() { if (state == 1) goto resume_1; else if (state == 2) goto resume_2; else if (state == 3) goto resume_3; else if (state == 4) goto resume_4; else if (state == 5) goto resume_5; else if (state == 6) goto resume_6; else if (state == 7) goto resume_7; else if (state == 8) goto resume_8; else if (state == 9) goto resume_9; // Get children to merge get_merge_children(); // Try to select an inode for the "inverse" optimized scenario // Read statistics from etcd to do it read_stats(); state = 1; resume_1: if (parent->waiting > 0) return; choose_inverse_candidate(); // Merge children one by one, except our "inverse" child for (current_child = 0; current_child < merge_children.size(); current_child++) { if (merge_children[current_child] == inverse_child) continue; start_merge_child(merge_children[current_child], merge_children[current_child]); resume_2: while (!cb()) { state = 2; return; } cb = NULL; parent->change_parent(merge_children[current_child], new_parent); state = 3; resume_3: if (parent->waiting > 0) return; } // Merge our "inverse" child into our "inverse" parent if (inverse_child != 0) { start_merge_child(inverse_child, inverse_parent); resume_4: while (!cb()) { state = 4; return; } cb = NULL; // Delete "inverse" child data start_delete_source(inverse_child); resume_5: while (!cb()) { state = 5; return; } cb = NULL; // Delete "inverse" child metadata, rename parent over it, // and also change parent links of the previous "inverse" child rename_inverse_parent(); state = 6; resume_6: if (parent->waiting > 0) return; } // Delete parents, except the "inverse" one for (current_child = 0; current_child < chain_list.size(); current_child++) { if (chain_list[current_child] == inverse_parent) continue; start_delete_source(chain_list[current_child]); resume_7: while (!cb()) { state = 7; return; } cb = NULL; delete_inode_config(chain_list[current_child]); state = 8; resume_8: if (parent->waiting > 0) return; } state = 9; resume_9: // Done return; } void get_merge_children() { // Get all children of from..to inode_config_t *from_cfg = parent->get_inode_cfg(from_name); inode_config_t *to_cfg = parent->get_inode_cfg(to_name); // Check that to_cfg is actually a child of from_cfg // FIXME de-copypaste the following piece of code with snap_merger_t inode_config_t *cur = to_cfg; chain_list.push_back(cur->num); while (cur->num != from_cfg->num && cur->parent_id != 0) { auto it = parent->cli->st_cli.inode_config.find(cur->parent_id); if (it == parent->cli->st_cli.inode_config.end()) { fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id); exit(1); } cur = &it->second; chain_list.push_back(cur->num); } if (cur->num != from_cfg->num) { fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str()); exit(1); } new_parent = from_cfg->parent_id; // Calculate ranks int i = chain_list.size()-1; for (inode_t item: chain_list) { sources[item] = i--; } for (auto & ic: parent->cli->st_cli.inode_config) { if (!ic.second.parent_id) { continue; } auto it = sources.find(ic.second.parent_id); if (it != sources.end() && sources.find(ic.second.num) == sources.end()) { merge_children.push_back(ic.second.num); if (ic.second.readonly || writers_stopped) { inverse_candidates[ic.second.num] = it->second; } } } } void read_stats() { if (inverse_candidates.size() == 0) { return; } json11::Json::array reads; for (auto cp: inverse_candidates) { inode_t inode = cp.first; reads.push_back(json11::Json::object { { "request_range", json11::Json::object { { "key", base64_encode( parent->cli->st_cli.etcd_prefix+ "/inode/stats/"+std::to_string(INODE_POOL(inode))+ "/"+std::to_string(INODE_NO_POOL(inode)) ) }, } } }); } for (auto cp: sources) { inode_t inode = cp.first; reads.push_back(json11::Json::object { { "request_range", json11::Json::object { { "key", base64_encode( parent->cli->st_cli.etcd_prefix+ "/inode/stats/"+std::to_string(INODE_POOL(inode))+ "/"+std::to_string(INODE_NO_POOL(inode)) ) }, } } }); } parent->waiting++; parent->cli->st_cli.etcd_txn(json11::Json::object { { "success", reads }, }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data) { parent->waiting--; if (err != "") { fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str()); exit(1); } for (auto inode_result: data["responses"].array_items()) { auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]); pool_id_t pool_id = 0; inode_t inode = 0; char null_byte = 0; sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte); if (!inode || null_byte != 0) { fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str()); exit(1); } auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id); if (pool_cfg_it == parent->cli->st_cli.pool_config.end()) { fprintf(stderr, "Pool %u does not exist\n", pool_id); exit(1); } inode = INODE_WITH_POOL(pool_id, inode); auto & pool_cfg = pool_cfg_it->second; uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size; if (pool_cfg.scheme != POOL_SCHEME_REPLICATED) { used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks); } inode_used[inode] = used_bytes; } parent->ringloop->wakeup(); }); } void choose_inverse_candidate() { uint64_t max_diff = 0; for (auto cp: inverse_candidates) { inode_t child = cp.first; uint64_t child_used = inode_used[child]; int rank = cp.second; for (int i = chain_list.size()-rank; i < chain_list.size(); i++) { inode_t parent = chain_list[i]; uint64_t parent_used = inode_used[parent]; if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used))) { max_diff = (parent_used-child_used); inverse_parent = parent; inverse_child = child; } } } } void rename_inverse_parent() { auto child_it = parent->cli->st_cli.inode_config.find(inverse_child); if (child_it == parent->cli->st_cli.inode_config.end()) { fprintf(stderr, "Inode %ld disappeared\n", inverse_child); exit(1); } auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent); if (target_it == parent->cli->st_cli.inode_config.end()) { fprintf(stderr, "Inode %ld disappeared\n", inverse_parent); exit(1); } inode_config_t *child_cfg = &child_it->second; inode_config_t *target_cfg = &target_it->second; std::string child_name = child_cfg->name; std::string target_name = target_cfg->name; std::string child_cfg_key = base64_encode( parent->cli->st_cli.etcd_prefix+ "/config/inode/"+std::to_string(INODE_POOL(inverse_child))+ "/"+std::to_string(INODE_NO_POOL(inverse_child)) ); std::string target_cfg_key = base64_encode( parent->cli->st_cli.etcd_prefix+ "/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+ "/"+std::to_string(INODE_NO_POOL(inverse_parent)) ); // Fill new configuration inode_config_t new_cfg = *child_cfg; new_cfg.num = target_cfg->num; new_cfg.parent_id = new_parent; json11::Json::array cmp = json11::Json::array { json11::Json::object { { "target", "MOD" }, { "key", child_cfg_key }, { "result", "LESS" }, { "mod_revision", child_cfg->mod_revision+1 }, }, json11::Json::object { { "target", "MOD" }, { "key", target_cfg_key }, { "result", "LESS" }, { "mod_revision", target_cfg->mod_revision+1 }, }, }; json11::Json::array txn = json11::Json::array { json11::Json::object { { "request_delete_range", json11::Json::object { { "key", child_cfg_key }, } }, }, json11::Json::object { { "request_put", json11::Json::object { { "key", target_cfg_key }, { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) }, } }, }, json11::Json::object { { "request_put", json11::Json::object { { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) }, { "value", base64_encode(json11::Json({ { "id", INODE_NO_POOL(inverse_parent) }, { "pool_id", (uint64_t)INODE_POOL(inverse_parent) }, }).dump()) }, } }, }, }; // Reparent children of inverse_child for (auto & cp: parent->cli->st_cli.inode_config) { if (cp.second.parent_id == child_cfg->num) { auto cp_cfg = cp.second; cp_cfg.parent_id = inverse_parent; auto cp_key = base64_encode( parent->cli->st_cli.etcd_prefix+ "/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+ "/"+std::to_string(INODE_NO_POOL(cp.second.num)) ); cmp.push_back(json11::Json::object { { "target", "MOD" }, { "key", cp_key }, { "result", "LESS" }, { "mod_revision", cp.second.mod_revision+1 }, }); txn.push_back(json11::Json::object { { "request_put", json11::Json::object { { "key", cp_key }, { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) }, } }, }); } } parent->waiting++; parent->cli->st_cli.etcd_txn(json11::Json::object { { "compare", cmp }, { "success", txn }, }, ETCD_SLOW_TIMEOUT, [this, target_name, child_name](std::string err, json11::Json res) { parent->waiting--; if (err != "") { fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str()); exit(1); } if (!res["succeeded"].bool_value()) { fprintf( stderr, "Parent (%s), child (%s), or one of its children" " configuration was modified during rename\n", target_name.c_str(), child_name.c_str() ); exit(1); } printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str()); parent->ringloop->wakeup(); }); } void delete_inode_config(inode_t cur) { auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur); if (cur_cfg_it == parent->cli->st_cli.inode_config.end()) { fprintf(stderr, "Inode 0x%lx disappeared\n", cur); exit(1); } inode_config_t *cur_cfg = &cur_cfg_it->second; std::string cur_name = cur_cfg->name; std::string cur_cfg_key = base64_encode( parent->cli->st_cli.etcd_prefix+ "/config/inode/"+std::to_string(INODE_POOL(cur))+ "/"+std::to_string(INODE_NO_POOL(cur)) ); parent->waiting++; parent->cli->st_cli.etcd_txn(json11::Json::object { { "compare", json11::Json::array { json11::Json::object { { "target", "MOD" }, { "key", cur_cfg_key }, { "result", "LESS" }, { "mod_revision", cur_cfg->mod_revision+1 }, }, } }, { "success", json11::Json::array { json11::Json::object { { "request_delete_range", json11::Json::object { { "key", cur_cfg_key }, } }, { "request_delete_range", json11::Json::object { { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) }, } }, }, } }, }, ETCD_SLOW_TIMEOUT, [this, cur_name](std::string err, json11::Json res) { parent->waiting--; if (err != "") { fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str()); exit(1); } if (!res["succeeded"].bool_value()) { fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str()); exit(1); } printf("Layer %s deleted\n", cur_name.c_str()); parent->ringloop->wakeup(); }); } void start_merge_child(inode_t child_inode, inode_t target_inode) { auto child_it = parent->cli->st_cli.inode_config.find(child_inode); if (child_it == parent->cli->st_cli.inode_config.end()) { fprintf(stderr, "Inode %ld disappeared\n", child_inode); exit(1); } auto target_it = parent->cli->st_cli.inode_config.find(target_inode); if (target_it == parent->cli->st_cli.inode_config.end()) { fprintf(stderr, "Inode %ld disappeared\n", target_inode); exit(1); } cb = parent->start_merge(json11::Json::object { { "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } }, { "target", target_it->second.name }, { "delete-source", false }, { "cas", use_cas }, { "fsync-interval", fsync_interval }, }); } void start_delete_source(inode_t inode) { auto source = parent->cli->st_cli.inode_config.find(inode); if (source == parent->cli->st_cli.inode_config.end()) { fprintf(stderr, "Inode %ld disappeared\n", inode); exit(1); } cb = parent->start_rm(json11::Json::object { { "inode", inode }, { "pool", (uint64_t)INODE_POOL(inode) }, { "fsync-interval", fsync_interval }, }); } }; std::function cli_tool_t::start_snap_rm(json11::Json cfg) { json11::Json::array cmd = cfg["command"].array_items(); auto snap_remover = new snap_remover_t(); snap_remover->parent = this; snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : ""; snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : ""; if (snap_remover->from_name == "") { fprintf(stderr, "Layer to remove argument is missing\n"); exit(1); } if (snap_remover->to_name == "") { snap_remover->to_name = snap_remover->from_name; } snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value(); if (!snap_remover->fsync_interval) snap_remover->fsync_interval = 128; if (!cfg["cas"].is_null()) snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0; if (!cfg["writers_stopped"].is_null()) snap_remover->writers_stopped = true; return [snap_remover]() { snap_remover->loop(); if (snap_remover->is_done()) { delete snap_remover; return true; } return false; }; }