vitastor/osd.h

252 lines
6.1 KiB
C
Raw Normal View History

2019-12-15 01:11:51 +03:00
#pragma once
#include <sys/types.h>
2020-01-09 20:20:56 +03:00
#include <sys/time.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <malloc.h>
2019-12-15 01:11:51 +03:00
#include <arpa/inet.h>
#include <malloc.h>
2019-12-15 01:11:51 +03:00
#include <set>
#include <deque>
2019-12-15 01:11:51 +03:00
#include "blockstore.h"
2019-12-15 01:11:51 +03:00
#include "ringloop.h"
2020-02-11 02:30:46 +03:00
#include "timerfd_interval.h"
2019-12-15 01:11:51 +03:00
#include "osd_ops.h"
#include "osd_peering_pg.h"
2019-12-15 01:11:51 +03:00
#include "sparsepp/sparsepp/spp.h"
// Operation type codes.
// NOTE(review): presumably stored in osd_op_t::op_type to tell inbound operations
// from outbound ones — confirm against the implementation file.
#define OSD_OP_IN 0
#define OSD_OP_OUT 1
2019-12-28 01:25:55 +03:00
// Read-side state codes.
// NOTE(review): presumably the values of osd_client_t::read_state — confirm.
#define CL_READ_OP 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
// Flag bit 0x100 (a long literal), distinct from the small state codes above and below.
// NOTE(review): presumably OR-ed into a state value after a ring submission — confirm.
#define SQE_SENT 0x100l
// Write-side state codes.
// NOTE(review): presumably the values of osd_client_t::write_state — confirm.
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define CL_WRITE_DATA 3
// NOTE(review): presumably the maximum number of events fetched per epoll_wait() call — confirm.
#define MAX_EPOLL_EVENTS 16
2020-02-03 12:35:02 +03:00
// Number of osd_op_buf_t slots embedded directly in osd_op_buf_list_t (its inline_buf array);
// push_back() also uses it as the list's first capacity, before any heap allocation happens.
#define OSD_OP_INLINE_BUF_COUNT 4
2019-12-28 01:25:55 +03:00
// Peer connection state codes.
// NOTE(review): presumably the values of osd_client_t::peer_state — confirm.
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
2020-02-11 02:30:46 +03:00
// Peering phase codes.
// NOTE(review): presumably the values (or flags) kept in osd_t::peering_state — confirm.
#define OSD_PEERING_PEERS 1
#define OSD_PEERING_PGS 2
// Disabled compile-time switch; what OSD_STUB changes is not visible in this header.
//#define OSD_STUB
2020-02-03 12:35:02 +03:00
// Descriptor of a single memory buffer: a pointer plus a length.
// The struct has no destructor, so it never frees what buf points to.
struct osd_op_buf_t
{
// Start of the buffer
void *buf;
// Length of the buffer (presumably in bytes — TODO confirm)
int len;
};
// Append-only list of buffer descriptors with small-buffer optimisation.
//
// The first OSD_OP_INLINE_BUF_COUNT entries are stored in the embedded
// inline_buf array; once it is full the entries are moved to a malloc()ed
// array that grows in steps of 16 entries.
//
// NOTE(review): the struct owns `buf` when it is heap-allocated but declares no
// copy/move operations, so copying an instance would alias (and later double
// free) the heap array, or leave `buf` pointing into the source object's
// inline_buf. Instances must not be copied.
// NOTE(review): malloc()/realloc() results are not checked for NULL.
struct osd_op_buf_list_t
{
    // count: number of entries in use; alloc: current capacity (0 until the first push_back);
    // sent: not touched by this struct (presumably a send progress cursor for users — confirm)
    int count = 0, alloc = 0, sent = 0;
    // Points to inline_buf, to a heap array, or is NULL before the first push_back()
    osd_op_buf_t *buf = NULL;
    osd_op_buf_t inline_buf[OSD_OP_INLINE_BUF_COUNT];

    // Releases the heap array, if the list ever outgrew inline_buf
    ~osd_op_buf_list_t()
    {
        if (buf && buf != inline_buf)
        {
            free(buf);
        }
    }

    // Appends the descriptor {nbuf, len}, growing the storage when it is full.
    // The memory behind nbuf is neither copied nor owned by the list.
    inline void push_back(void *nbuf, int len)
    {
        if (count >= alloc)
        {
            if (!alloc)
            {
                // First use: start with the embedded array, no allocation needed
                alloc = OSD_OP_INLINE_BUF_COUNT;
                buf = inline_buf;
            }
            else if (buf == inline_buf)
            {
                // Inline storage exhausted: move the entries to the heap.
                // The capacity is raised to the next multiple of 16 that is strictly
                // greater than the current one. The previous formula,
                // (alloc/16)*16 + 1, evaluated to 1 here, so the memcpy() below and
                // every later append wrote past the end of the allocation.
                int old = alloc;
                alloc = ((alloc/16) + 1) * 16;
                buf = (osd_op_buf_t*)malloc(sizeof(osd_op_buf_t) * alloc);
                memcpy(buf, inline_buf, sizeof(osd_op_buf_t)*old);
            }
            else
            {
                // Already on the heap: grow by another 16 entries
                alloc = ((alloc/16) + 1) * 16;
                buf = (osd_op_buf_t*)realloc(buf, sizeof(osd_op_buf_t) * alloc);
            }
        }
        // Plain member assignment instead of a designated initializer,
        // which is only standard C++ starting with C++20
        buf[count].buf = nbuf;
        buf[count].len = len;
        count++;
    }

    // Unchecked access to the i-th entry; i must be in [0, count)
    inline osd_op_buf_t & operator [] (int i)
    {
        return buf[i];
    }
};
// Forward declaration: this header only stores a pointer to the type (osd_op_t::op_data),
// so the full definition is not needed here.
struct osd_primary_op_data_t;
2019-12-15 01:11:51 +03:00
struct osd_op_t
{
int op_type;
2019-12-15 01:11:51 +03:00
int peer_fd;
2020-02-23 19:03:06 +03:00
osd_any_op_t req;
osd_any_reply_t reply;
2020-02-23 23:19:11 +03:00
blockstore_op_t *bs_op = NULL;
2019-12-15 01:11:51 +03:00
void *buf = NULL;
2020-02-24 01:01:34 +03:00
void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL;
std::function<void(osd_op_t*)> callback;
2019-12-15 01:11:51 +03:00
2020-02-03 12:35:02 +03:00
osd_op_buf_list_t send_list;
~osd_op_t();
2019-12-15 01:11:51 +03:00
};
struct osd_peer_def_t
{
2020-02-03 12:35:02 +03:00
osd_num_t osd_num = 0;
std::string addr;
int port = 0;
time_t last_connect_attempt = 0;
};
2020-01-04 01:23:25 +03:00
2019-12-15 01:11:51 +03:00
struct osd_client_t
{
sockaddr_in peer_addr;
2020-01-04 01:23:25 +03:00
int peer_port;
2019-12-15 01:11:51 +03:00
int peer_fd;
2020-01-04 01:23:25 +03:00
int peer_state;
2020-02-09 18:22:29 +03:00
std::function<void(osd_num_t, int)> connect_callback;
2020-02-03 12:35:02 +03:00
osd_num_t osd_num = 0;
2019-12-15 01:11:51 +03:00
// Read state
bool read_ready = false;
bool reading = false;
osd_op_t *read_op = NULL;
int read_reply_id = 0;
2019-12-15 01:11:51 +03:00
iovec read_iov;
msghdr read_msg;
void *read_buf = NULL;
int read_remaining = 0;
int read_state = 0;
// Outbound operations sent to this client (which is probably an OSD peer)
std::map<int, osd_op_t*> sent_ops;
// Outbound messages (replies or requests)
std::deque<osd_op_t*> outbox;
2019-12-15 01:11:51 +03:00
// PGs dirtied by this client's primary-writes
std::set<pg_num_t> dirty_pgs;
2019-12-15 01:11:51 +03:00
// Write state
osd_op_t *write_op = NULL;
iovec write_iov;
msghdr write_msg;
void *write_buf = NULL;
int write_remaining = 0;
int write_state = 0;
};
// Forward declaration; no other declaration in the visible part of this header refers to this type.
struct osd_rmw_stripe_t;
2020-02-03 12:35:02 +03:00
2019-12-15 01:11:51 +03:00
class osd_t
{
// config
2020-02-03 12:35:02 +03:00
osd_num_t osd_num = 1; // OSD numbers start with 1
bool run_primary = false;
std::vector<osd_peer_def_t> peers;
blockstore_config_t config;
2019-12-15 01:52:08 +03:00
std::string bind_address;
int bind_port, listen_backlog;
2019-12-15 01:11:51 +03:00
int client_queue_depth = 128;
2019-12-15 15:30:51 +03:00
bool allow_test_ops = true;
2019-12-15 01:11:51 +03:00
// peer OSDs
std::map<uint64_t, int> osd_peer_fds;
std::vector<pg_t> pgs;
int peering_state = 0;
unsigned pg_count = 0;
uint64_t next_subop_id = 1;
2020-02-25 01:20:45 +03:00
// Unstable writes
spp::sparse_hash_map<osd_num_t, spp::sparse_hash_map<object_id, uint64_t>> unstable_writes;
// client & peer I/O
2019-12-15 01:11:51 +03:00
2019-12-19 22:16:04 +03:00
bool stopping = false;
int inflight_ops = 0;
blockstore_t *bs;
2020-01-30 22:06:46 +03:00
uint32_t bs_block_size, bs_disk_alignment;
uint64_t parity_block_size = 4*1024*1024; // 4 MB by default
2019-12-15 01:11:51 +03:00
ring_loop_t *ringloop;
2020-02-11 02:30:46 +03:00
timerfd_interval *tick_tfd;
2019-12-15 01:11:51 +03:00
int wait_state = 0;
int epoll_fd = 0;
int listen_fd = 0;
ring_consumer_t consumer;
std::unordered_map<int,osd_client_t> clients;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
// methods
2020-01-04 01:23:25 +03:00
// event loop, socket read/write
2019-12-15 01:11:51 +03:00
void loop();
int handle_epoll_events();
void read_requests();
void handle_read(ring_data_t *data, int peer_fd);
2020-02-25 01:20:45 +03:00
void handle_op_hdr(osd_client_t *cl);
void handle_reply_hdr(osd_client_t *cl);
2019-12-15 01:11:51 +03:00
void send_replies();
void handle_send(ring_data_t *data, int peer_fd);
void outbox_push(osd_client_t & cl, osd_op_t *op);
// peer handling (primary OSD logic)
2020-02-09 18:22:29 +03:00
void connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback);
2020-01-04 01:23:25 +03:00
void handle_connect_result(int peer_fd);
void cancel_osd_ops(osd_client_t & cl);
void cancel_op(osd_op_t *op);
2020-01-04 01:23:25 +03:00
void stop_client(int peer_fd);
osd_peer_def_t parse_peer(std::string peer);
void init_primary();
void handle_peers();
2020-02-11 02:30:46 +03:00
void repeer_pgs(osd_num_t osd_num, bool is_connected);
void start_pg_peering(int i);
2020-01-04 01:23:25 +03:00
// op execution
2019-12-28 01:25:55 +03:00
void exec_op(osd_op_t *cur_op);
// secondary ops
2019-12-28 01:25:55 +03:00
void exec_sync_stab_all(osd_op_t *cur_op);
void exec_show_config(osd_op_t *cur_op);
void exec_secondary(osd_op_t *cur_op);
2019-12-19 13:56:26 +03:00
void secondary_op_callback(osd_op_t *cur_op);
// primary ops
bool prepare_primary_rw(osd_op_t *cur_op);
void continue_primary_read(osd_op_t *cur_op);
void continue_primary_write(osd_op_t *cur_op);
2020-01-30 22:06:46 +03:00
void exec_primary_sync(osd_op_t *cur_op);
2020-02-03 12:35:02 +03:00
void finish_primary_op(osd_op_t *cur_op, int retval);
void handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version);
void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
2019-12-15 01:11:51 +03:00
public:
osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
2019-12-15 01:11:51 +03:00
~osd_t();
2019-12-15 01:52:08 +03:00
bool shutdown();
2019-12-15 01:11:51 +03:00
};