summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author智皓 张 <[email protected]>2023-08-04 15:26:31 +0800
committer智皓 张 <[email protected]>2023-08-04 15:26:31 +0800
commite0ded3bfc164f48b3f7a4083cffd6c1b2fff4439 (patch)
tree9a7eabc2bebe3f5e5a967b1170abcacd3356828f /src
parent1c82c0c7a27ea7778a5d2ca5104d822209afeb75 (diff)
update
Diffstat (limited to 'src')
-rw-r--r--src/rdma_client.c553
-rw-r--r--src/rdma_common.c210
-rw-r--r--src/rdma_common.h133
-rw-r--r--src/rdma_server.c489
4 files changed, 1385 insertions, 0 deletions
diff --git a/src/rdma_client.c b/src/rdma_client.c
new file mode 100644
index 0000000..a640be6
--- /dev/null
+++ b/src/rdma_client.c
@@ -0,0 +1,553 @@
+/*
+ * An example RDMA client side code.
+ * Author: Animesh Trivedi
+ */
+
+#include "rdma_common.h"
+
+/* Client-side RDMA state shared by every helper in this file. */
+/* Connection-management (CM) and verbs resources */
+static struct rdma_event_channel *cm_event_channel = NULL; /* CM event channel */
+static struct rdma_cm_id *cm_client_id = NULL; /* CM id of this client's connection */
+static struct ibv_pd *pd = NULL; /* protection domain used for all registrations */
+static struct ibv_comp_channel *io_completion_channel = NULL; /* delivers CQ notifications */
+static struct ibv_cq *client_cq = NULL; /* one CQ, used for both send and receive */
+static struct ibv_qp_init_attr qp_init_attr;
+static struct ibv_qp *client_qp;
+/* Memory regions, exchanged buffer metadata and the work requests built on them */
+static struct ibv_mr *client_metadata_mr = NULL,
+ *client_src_mr = NULL,
+ *client_dst_mr = NULL,
+ *server_metadata_mr = NULL;
+static struct rdma_buffer_attr client_metadata_attr, server_metadata_attr;
+static struct ibv_send_wr client_send_wr, *bad_client_send_wr = NULL;
+static struct ibv_recv_wr server_recv_wr, *bad_server_recv_wr = NULL;
+static struct ibv_sge client_send_sge, server_recv_sge;
+/* Source and destination buffers: src holds the string passed with -s and is
+ * RDMA-written to the server buffer; dst receives the data RDMA-read back. */
+static char *src = NULL, *dst = NULL;
+
+/* Test helper: compares the first strlen(src) bytes of src and dst.
+ * Returns 0 when they match, otherwise the non-zero memcmp() result.
+ */
+static int check_src_dst()
+{
+	const size_t nbytes = strlen(src);
+
+	return memcmp(src, dst, nbytes);
+}
+
+/* Prepares every client-side resource needed before rdma_connect() can be
+ * called: CM event channel, CM id, address and route resolution, protection
+ * domain, completion channel, completion queue and queue pair.
+ * @s_addr: IPv4 address and port of the server
+ * Returns 0 on success and a non-zero value on failure. Resources created
+ * before a failing step are not released here.
+ */
+static int client_prepare_connection(struct sockaddr_in *s_addr)
+{
+	struct rdma_cm_event *cm_event = NULL;
+	int ret = -1;
+	/* Open a channel used to report asynchronous communication events */
+	cm_event_channel = rdma_create_event_channel();
+	if (!cm_event_channel) {
+		/* Read errno before logging (stdio may overwrite it) and never
+		 * return 0 from a failure path, even if errno was left at 0. */
+		ret = errno ? -errno : -1;
+		rdma_error("Creating cm event channel failed, errno: %d \n", ret);
+		return ret;
+	}
+	debug("RDMA CM event channel is created at : %p \n", cm_event_channel);
+	/* rdma_cm_id is the connection identifier (like a socket) which is used
+	 * to define an RDMA connection.
+	 */
+	ret = rdma_create_id(cm_event_channel, &cm_client_id,
+			NULL,
+			RDMA_PS_TCP);
+	if (ret) {
+		ret = errno ? -errno : -1;
+		rdma_error("Creating cm id failed with errno: %d \n", ret);
+		return ret;
+	}
+	/* Resolve destination and optional source addresses from IP addresses to
+	 * an RDMA address. If successful, the specified rdma_cm_id will be bound
+	 * to a local device. */
+	ret = rdma_resolve_addr(cm_client_id, NULL, (struct sockaddr*) s_addr, 2000);
+	if (ret) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to resolve address, errno: %d \n", ret);
+		return ret;
+	}
+	debug("waiting for cm event: RDMA_CM_EVENT_ADDR_RESOLVED\n");
+	ret = process_rdma_cm_event(cm_event_channel,
+			RDMA_CM_EVENT_ADDR_RESOLVED,
+			&cm_event);
+	if (ret) {
+		rdma_error("Failed to receive a valid event, ret = %d \n", ret);
+		return ret;
+	}
+	/* we ack the event */
+	ret = rdma_ack_cm_event(cm_event);
+	if (ret) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to acknowledge the CM event, errno: %d\n", ret);
+		return ret;
+	}
+	debug("RDMA address is resolved \n");
+
+	/* Resolves an RDMA route to the destination address in order to
+	 * establish a connection */
+	ret = rdma_resolve_route(cm_client_id, 2000);
+	if (ret) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to resolve route, erno: %d \n", ret);
+		return ret;
+	}
+	debug("waiting for cm event: RDMA_CM_EVENT_ROUTE_RESOLVED\n");
+	ret = process_rdma_cm_event(cm_event_channel,
+			RDMA_CM_EVENT_ROUTE_RESOLVED,
+			&cm_event);
+	if (ret) {
+		rdma_error("Failed to receive a valid event, ret = %d \n", ret);
+		return ret;
+	}
+	/* we ack the event */
+	ret = rdma_ack_cm_event(cm_event);
+	if (ret) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to acknowledge the CM event, errno: %d \n", ret);
+		return ret;
+	}
+	printf("Trying to connect to server at : %s port: %d \n",
+			inet_ntoa(s_addr->sin_addr),
+			ntohs(s_addr->sin_port));
+	/* Protection Domain (PD) is similar to a "process abstraction"
+	 * in the operating system. All resources are tied to a particular PD.
+	 * Accessing resources across PDs results in a protection fault.
+	 */
+	pd = ibv_alloc_pd(cm_client_id->verbs);
+	if (!pd) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to alloc pd, errno: %d \n", ret);
+		return ret;
+	}
+	debug("pd allocated at %p \n", pd);
+	/* Now we need a completion channel, where the I/O completion
+	 * notifications are sent. Remember, this is different from connection
+	 * management (CM) event notifications.
+	 * A completion channel is also tied to an RDMA device, hence we will
+	 * use cm_client_id->verbs.
+	 */
+	io_completion_channel = ibv_create_comp_channel(cm_client_id->verbs);
+	if (!io_completion_channel) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to create IO completion event channel, errno: %d\n",
+				ret);
+		return ret;
+	}
+	debug("completion event channel created at : %p \n", io_completion_channel);
+	/* Now we create a completion queue (CQ) where actual I/O
+	 * completion metadata is placed. The metadata is packed into a structure
+	 * called struct ibv_wc (wc = work completion). ibv_wc has detailed
+	 * information about the work completion. An I/O request in RDMA world
+	 * is called "work" ;)
+	 */
+	client_cq = ibv_create_cq(cm_client_id->verbs /* which device*/,
+			CQ_CAPACITY /* maximum capacity*/,
+			NULL /* user context, not used here */,
+			io_completion_channel /* which IO completion channel */,
+			0 /* signaling vector, not used here*/);
+	if (!client_cq) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to create CQ, errno: %d \n", ret);
+		return ret;
+	}
+	debug("CQ created at %p with %d elements \n", client_cq, client_cq->cqe);
+	ret = ibv_req_notify_cq(client_cq, 0);
+	if (ret) {
+		/* ibv_req_notify_cq() hands back the error as its return value;
+		 * errno is not the source of truth here. */
+		rdma_error("Failed to request notifications, errno: %d\n", -ret);
+		return -ret;
+	}
+	/* Now the last step, set up the queue pair (send, recv) queues and their capacity.
+	 * The capacity here is defined statically but this can be probed from the
+	 * device. We just use a small number as defined in rdma_common.h */
+	memset(&qp_init_attr, 0, sizeof qp_init_attr);
+	qp_init_attr.cap.max_recv_sge = MAX_SGE; /* Maximum SGE per receive posting */
+	qp_init_attr.cap.max_recv_wr = MAX_WR; /* Maximum receive posting capacity */
+	qp_init_attr.cap.max_send_sge = MAX_SGE; /* Maximum SGE per send posting */
+	qp_init_attr.cap.max_send_wr = MAX_WR; /* Maximum send posting capacity */
+	qp_init_attr.qp_type = IBV_QPT_RC; /* QP type, RC = Reliable connection */
+	/* We use the same completion queue, but one can use different queues */
+	qp_init_attr.recv_cq = client_cq; /* Where receive completions are reported */
+	qp_init_attr.send_cq = client_cq; /* Where send completions are reported */
+	/* Let's create a QP */
+	ret = rdma_create_qp(cm_client_id /* which connection id */,
+			pd /* which protection domain*/,
+			&qp_init_attr /* Initial attributes */);
+	if (ret) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to create QP, errno: %d \n", ret);
+		return ret;
+	}
+	client_qp = cm_client_id->qp;
+	debug("QP created at %p \n", client_qp);
+	return 0;
+}
+
+/* Registers server_metadata_attr as a receive buffer and posts one receive
+ * work request for it. Meant to run before rdma_connect().
+ * Returns 0 on success, -ENOMEM when the registration fails, or the value
+ * returned by ibv_post_recv() when posting fails.
+ */
+static int client_pre_post_recv_buffer()
+{
+	int rc;
+
+	server_metadata_mr = rdma_buffer_register(pd,
+			&server_metadata_attr,
+			sizeof(server_metadata_attr),
+			IBV_ACCESS_LOCAL_WRITE);
+	if (server_metadata_mr == NULL) {
+		rdma_error("Failed to setup the server metadata mr , -ENOMEM\n");
+		return -ENOMEM;
+	}
+	/* A single scatter/gather entry covering the whole registered region */
+	server_recv_sge.lkey = (uint32_t) server_metadata_mr->lkey;
+	server_recv_sge.length = (uint32_t) server_metadata_mr->length;
+	server_recv_sge.addr = (uint64_t) server_metadata_mr->addr;
+	/* Attach it to a freshly zeroed receive work request */
+	bzero(&server_recv_wr, sizeof(server_recv_wr));
+	server_recv_wr.num_sge = 1;
+	server_recv_wr.sg_list = &server_recv_sge;
+	rc = ibv_post_recv(client_qp, &server_recv_wr, &bad_server_recv_wr);
+	if (rc != 0) {
+		rdma_error("Failed to pre-post the receive buffer, errno: %d \n", rc);
+		return rc;
+	}
+	debug("Receive buffer pre-posting is successful \n");
+	return 0;
+}
+
+/* Connects to the RDMA server and waits for RDMA_CM_EVENT_ESTABLISHED.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int client_connect_to_server()
+{
+	struct rdma_conn_param conn_param;
+	struct rdma_cm_event *cm_event = NULL;
+	int ret = -1;
+	memset(&conn_param, 0, sizeof(conn_param));
+	conn_param.initiator_depth = 3;
+	conn_param.responder_resources = 3;
+	conn_param.retry_count = 3; // if fail, then how many times to retry
+	ret = rdma_connect(cm_client_id, &conn_param);
+	if (ret) {
+		/* Read errno before logging (stdio may overwrite it) and never
+		 * return 0 from a failure path. */
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to connect to remote host , errno: %d\n", ret);
+		return ret;
+	}
+	debug("waiting for cm event: RDMA_CM_EVENT_ESTABLISHED\n");
+	ret = process_rdma_cm_event(cm_event_channel,
+			RDMA_CM_EVENT_ESTABLISHED,
+			&cm_event);
+	if (ret) {
+		rdma_error("Failed to get cm event, ret = %d \n", ret);
+		return ret;
+	}
+	ret = rdma_ack_cm_event(cm_event);
+	if (ret) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to acknowledge cm event, errno: %d\n",
+				ret);
+		return ret;
+	}
+	printf("The client is connected successfully \n");
+	return 0;
+}
+
+/* Exchanges buffer metadata with the server. The client registers src,
+ * sends a description of it (address, length, key) to the server, and then
+ * waits for two work completions: its own send, and the receive (posted
+ * earlier) that carries the server's buffer description into
+ * server_metadata_attr. The client-side metadata is _not_ used by the
+ * server because this program is client driven; it is sent for illustration.
+ * Returns 0 on success and a negative value on failure.
+ */
+static int client_xchange_metadata_with_server()
+{
+	struct ibv_wc wc[2];
+	int ret = -1;
+	client_src_mr = rdma_buffer_register(pd,
+			src,
+			strlen(src),
+			(IBV_ACCESS_LOCAL_WRITE|
+			 IBV_ACCESS_REMOTE_READ|
+			 IBV_ACCESS_REMOTE_WRITE));
+	if(!client_src_mr){
+		rdma_error("Failed to register the first buffer, ret = %d \n", ret);
+		return ret;
+	}
+	/* we prepare metadata for the first buffer */
+	client_metadata_attr.address = (uint64_t) client_src_mr->addr;
+	client_metadata_attr.length = client_src_mr->length;
+	client_metadata_attr.stag.local_stag = client_src_mr->lkey;
+	/* now we register the metadata memory */
+	client_metadata_mr = rdma_buffer_register(pd,
+			&client_metadata_attr,
+			sizeof(client_metadata_attr),
+			IBV_ACCESS_LOCAL_WRITE);
+	if(!client_metadata_mr) {
+		rdma_error("Failed to register the client metadata buffer, ret = %d \n", ret);
+		return ret;
+	}
+	/* now we fill up SGE */
+	client_send_sge.addr = (uint64_t) client_metadata_mr->addr;
+	client_send_sge.length = (uint32_t) client_metadata_mr->length;
+	client_send_sge.lkey = client_metadata_mr->lkey;
+	/* now we link to the send work request */
+	memset(&client_send_wr, 0, sizeof(client_send_wr));
+	client_send_wr.sg_list = &client_send_sge;
+	client_send_wr.num_sge = 1;
+	client_send_wr.opcode = IBV_WR_SEND;
+	client_send_wr.send_flags = IBV_SEND_SIGNALED;
+	/* Now we post it */
+	ret = ibv_post_send(client_qp,
+			&client_send_wr,
+			&bad_client_send_wr);
+	if (ret) {
+		/* ibv_post_send() reports the error through its return value, not
+		 * errno; using -errno here could return 0 and hide the failure. */
+		rdma_error("Failed to send client metadata, errno: %d \n",
+				-ret);
+		return -ret;
+	}
+	/* at this point we are expecting 2 work completions. One for our
+	 * send and one for the recv that we will get from the server for
+	 * its buffer information */
+	ret = process_work_completion_events(io_completion_channel,
+			wc, 2);
+	if(ret != 2) {
+		rdma_error("We failed to get 2 work completions , ret = %d \n",
+				ret);
+		/* never hand a non-negative count back as an error code */
+		return ret < 0 ? ret : -1;
+	}
+	debug("Server sent us its buffer location and credentials, showing \n");
+	show_rdma_buffer_attr(&server_metadata_attr);
+	return 0;
+}
+
+/* This function does :
+ * 1) Registers the destination buffer for RDMA operations
+ * 2) RDMA write from src -> remote buffer
+ * 3) RDMA read from remote buffer -> dst
+ * The remote address and key come from server_metadata_attr.
+ * Returns 0 on success and a negative value on failure.
+ */
+static int client_remote_memory_ops()
+{
+	struct ibv_wc wc;
+	int ret = -1;
+	client_dst_mr = rdma_buffer_register(pd,
+			dst,
+			strlen(src),
+			(IBV_ACCESS_LOCAL_WRITE |
+			 IBV_ACCESS_REMOTE_WRITE |
+			 IBV_ACCESS_REMOTE_READ));
+	if (!client_dst_mr) {
+		rdma_error("We failed to create the destination buffer, -ENOMEM\n");
+		return -ENOMEM;
+	}
+	/* Step 1: copy the local buffer into the remote buffer. We will
+	 * reuse the previous variables. */
+	/* now we fill up SGE */
+	client_send_sge.addr = (uint64_t) client_src_mr->addr;
+	client_send_sge.length = (uint32_t) client_src_mr->length;
+	client_send_sge.lkey = client_src_mr->lkey;
+	/* now we link to the send work request */
+	memset(&client_send_wr, 0, sizeof(client_send_wr));
+	client_send_wr.sg_list = &client_send_sge;
+	client_send_wr.num_sge = 1;
+	client_send_wr.opcode = IBV_WR_RDMA_WRITE;
+	client_send_wr.send_flags = IBV_SEND_SIGNALED;
+	/* we have to tell server side info for RDMA */
+	client_send_wr.wr.rdma.rkey = server_metadata_attr.stag.remote_stag;
+	client_send_wr.wr.rdma.remote_addr = server_metadata_attr.address;
+	/* Now we post it */
+	ret = ibv_post_send(client_qp,
+			&client_send_wr,
+			&bad_client_send_wr);
+	if (ret) {
+		/* ibv_post_send() reports the error through its return value, not
+		 * errno; using -errno here could return 0 and hide the failure. */
+		rdma_error("Failed to write client src buffer, errno: %d \n",
+				-ret);
+		return -ret;
+	}
+	/* at this point we are expecting 1 work completion for the write */
+	ret = process_work_completion_events(io_completion_channel,
+			&wc, 1);
+	if(ret != 1) {
+		rdma_error("We failed to get 1 work completions , ret = %d \n",
+				ret);
+		/* never hand a non-negative count back as an error code */
+		return ret < 0 ? ret : -1;
+	}
+	debug("Client side WRITE is complete \n");
+	/* Step 2: prepare a READ using the same variables but for destination */
+	client_send_sge.addr = (uint64_t) client_dst_mr->addr;
+	client_send_sge.length = (uint32_t) client_dst_mr->length;
+	client_send_sge.lkey = client_dst_mr->lkey;
+	/* now we link to the send work request */
+	memset(&client_send_wr, 0, sizeof(client_send_wr));
+	client_send_wr.sg_list = &client_send_sge;
+	client_send_wr.num_sge = 1;
+	client_send_wr.opcode = IBV_WR_RDMA_READ;
+	client_send_wr.send_flags = IBV_SEND_SIGNALED;
+	/* we have to tell server side info for RDMA */
+	client_send_wr.wr.rdma.rkey = server_metadata_attr.stag.remote_stag;
+	client_send_wr.wr.rdma.remote_addr = server_metadata_attr.address;
+	/* Now we post it */
+	ret = ibv_post_send(client_qp,
+			&client_send_wr,
+			&bad_client_send_wr);
+	if (ret) {
+		rdma_error("Failed to read client dst buffer from the master, errno: %d \n",
+				-ret);
+		return -ret;
+	}
+	/* at this point we are expecting 1 work completion for the read */
+	ret = process_work_completion_events(io_completion_channel,
+			&wc, 1);
+	if(ret != 1) {
+		rdma_error("We failed to get 1 work completions , ret = %d \n",
+				ret);
+		return ret < 0 ? ret : -1;
+	}
+	debug("Client side READ is complete \n");
+	return 0;
+}
+
+/* Disconnects the RDMA connection from the server and releases all client
+ * resources. Teardown is best effort: each failure is logged and the
+ * function carries on with the next resource. Always returns 0.
+ */
+static int client_disconnect_and_clean()
+{
+	struct rdma_cm_event *cm_event = NULL;
+	int ret = -1;
+	/* active disconnect from the client side */
+	ret = rdma_disconnect(cm_client_id);
+	if (ret) {
+		rdma_error("Failed to disconnect, errno: %d \n", -errno);
+		//continuing anyways
+	}
+	ret = process_rdma_cm_event(cm_event_channel,
+			RDMA_CM_EVENT_DISCONNECTED,
+			&cm_event);
+	if (ret) {
+		rdma_error("Failed to get RDMA_CM_EVENT_DISCONNECTED event, ret = %d\n",
+				ret);
+		//continuing anyways
+	} else {
+		/* Ack only an event we actually own. On failure
+		 * process_rdma_cm_event() has either received nothing or already
+		 * acknowledged the event itself, so acking again would be wrong. */
+		ret = rdma_ack_cm_event(cm_event);
+		if (ret) {
+			rdma_error("Failed to acknowledge cm event, errno: %d\n",
+					-errno);
+			//continuing anyways
+		}
+	}
+	/* Destroy QP */
+	rdma_destroy_qp(cm_client_id);
+	/* Destroy client cm id */
+	ret = rdma_destroy_id(cm_client_id);
+	if (ret) {
+		rdma_error("Failed to destroy client id cleanly, %d \n", -errno);
+		// we continue anyways;
+	}
+	/* Destroy CQ. The verbs destroy calls return the error value directly,
+	 * so that is what gets logged. */
+	ret = ibv_destroy_cq(client_cq);
+	if (ret) {
+		rdma_error("Failed to destroy completion queue cleanly, %d \n", -ret);
+		// we continue anyways;
+	}
+	/* Destroy completion channel */
+	ret = ibv_destroy_comp_channel(io_completion_channel);
+	if (ret) {
+		rdma_error("Failed to destroy completion channel cleanly, %d \n", -ret);
+		// we continue anyways;
+	}
+	/* Destroy memory buffers */
+	rdma_buffer_deregister(server_metadata_mr);
+	rdma_buffer_deregister(client_metadata_mr);
+	rdma_buffer_deregister(client_src_mr);
+	rdma_buffer_deregister(client_dst_mr);
+	/* We free the buffers */
+	free(src);
+	free(dst);
+	src = dst = NULL;
+	/* Destroy protection domain */
+	ret = ibv_dealloc_pd(pd);
+	if (ret) {
+		rdma_error("Failed to destroy client protection domain cleanly, %d \n", -ret);
+		// we continue anyways;
+	}
+	rdma_destroy_event_channel(cm_event_channel);
+	printf("Client resource clean up is complete \n");
+	return 0;
+}
+
+/* Prints the command-line help to stdout and terminates the process with
+ * exit status 1. Does not return.
+ */
+void usage() {
+	fputs("Usage:\n", stdout);
+	fputs("rdma_client: [-a <server_addr>] [-p <server_port>] -s string (required)\n", stdout);
+	fprintf(stdout, "(default IP is 127.0.0.1 and port is %d)\n", DEFAULT_RDMA_PORT);
+	exit(1);
+}
+
+/* Client entry point: parses -s/-a/-p, sets up the connection, exchanges
+ * buffer metadata with the server, writes src to the server buffer, reads it
+ * back into dst, compares both and finally tears everything down.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int main(int argc, char **argv) {
+	struct sockaddr_in server_sockaddr;
+	int ret, option;
+	bzero(&server_sockaddr, sizeof server_sockaddr);
+	server_sockaddr.sin_family = AF_INET;
+	server_sockaddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+	/* buffers are NULL */
+	src = dst = NULL;
+	/* Parse Command Line Arguments */
+	while ((option = getopt(argc, argv, "s:a:p:")) != -1) {
+		switch (option) {
+			case 's': {
+				size_t len = strlen(optarg);
+				printf("Passed string is : %s , with count %u \n",
+						optarg,
+						(unsigned int) len);
+				/* a repeated -s replaces the previous string */
+				free(src);
+				free(dst);
+				src = dst = NULL;
+				/* +1 for the terminating NUL: the rest of the client
+				 * measures these buffers with strlen(src) */
+				src = calloc(len + 1, 1);
+				if (!src) {
+					rdma_error("Failed to allocate memory : -ENOMEM\n");
+					return -ENOMEM;
+				}
+				/* Copy the passed argument; calloc already supplied the NUL */
+				memcpy(src, optarg, len);
+				dst = calloc(len + 1, 1);
+				if (!dst) {
+					rdma_error("Failed to allocate destination memory, -ENOMEM\n");
+					free(src);
+					return -ENOMEM;
+				}
+				break;
+			}
+			case 'a': {
+				/* get_addr() overwrites the whole sockaddr, port included,
+				 * so keep a port given by an earlier -p */
+				uint16_t saved_port = server_sockaddr.sin_port;
+				ret = get_addr(optarg, (struct sockaddr*) &server_sockaddr);
+				if (ret) {
+					rdma_error("Invalid IP \n");
+					return ret;
+				}
+				server_sockaddr.sin_port = saved_port;
+				break;
+			}
+			case 'p': {
+				/* server port to connect to; 0 selects the default below */
+				char *end = NULL;
+				long port;
+				errno = 0;
+				port = strtol(optarg, &end, 0);
+				if (errno || end == optarg || *end != '\0' ||
+						port < 0 || port > 65535) {
+					rdma_error("Invalid port: %s \n", optarg);
+					usage();
+				}
+				server_sockaddr.sin_port = htons((uint16_t) port);
+				break;
+			}
+			default:
+				usage();
+				break;
+		}
+	}
+	if (!server_sockaddr.sin_port) {
+		/* no port provided, use the default port */
+		server_sockaddr.sin_port = htons(DEFAULT_RDMA_PORT);
+	}
+	if (src == NULL) {
+		printf("Please provide a string to copy \n");
+		usage();
+	}
+	ret = client_prepare_connection(&server_sockaddr);
+	if (ret) {
+		rdma_error("Failed to setup client connection , ret = %d \n", ret);
+		return ret;
+	}
+	ret = client_pre_post_recv_buffer();
+	if (ret) {
+		rdma_error("Failed to setup client connection , ret = %d \n", ret);
+		return ret;
+	}
+	ret = client_connect_to_server();
+	if (ret) {
+		rdma_error("Failed to setup client connection , ret = %d \n", ret);
+		return ret;
+	}
+	ret = client_xchange_metadata_with_server();
+	if (ret) {
+		rdma_error("Failed to setup client connection , ret = %d \n", ret);
+		return ret;
+	}
+	ret = client_remote_memory_ops();
+	if (ret) {
+		rdma_error("Failed to finish remote memory ops, ret = %d \n", ret);
+		return ret;
+	}
+	if (check_src_dst()) {
+		rdma_error("src and dst buffers do not match \n");
+	} else {
+		printf("...\nSUCCESS, source and destination buffers match \n");
+	}
+	ret = client_disconnect_and_clean();
+	if (ret) {
+		rdma_error("Failed to cleanly disconnect and clean up resources \n");
+	}
+	return ret;
+}
+
diff --git a/src/rdma_common.c b/src/rdma_common.c
new file mode 100644
index 0000000..0478c29
--- /dev/null
+++ b/src/rdma_common.c
@@ -0,0 +1,210 @@
+/*
+ * Implementation of the common RDMA functions.
+ *
+ * Authors: Animesh Trivedi
+ */
+
+#include "rdma_common.h"
+
+/* Prints a few fields of an RDMA CM id to stdout: its address, the device
+ * context and device name (when present), the event channel (when present),
+ * the QP, the port space and the port number.
+ * @id: CM id to describe; a NULL pointer is reported as an error
+ */
+void show_rdma_cmid(struct rdma_cm_id *id)
+{
+	if (id == NULL) {
+		rdma_error("Passed ptr is NULL\n");
+		return;
+	}
+	printf("RDMA cm id at %p \n", id);
+	if (id->verbs != NULL && id->verbs->device != NULL) {
+		printf("dev_ctx: %p (device name: %s) \n", id->verbs,
+				id->verbs->device->name);
+	}
+	if (id->channel != NULL) {
+		printf("cm event channel %p\n", id->channel);
+	}
+	printf("QP: %p, port_space %x, port_num %u \n", id->qp,
+			id->ps,
+			id->port_num);
+}
+
+/* Prints the address, length and stag of a buffer description to stdout,
+ * framed by two separator lines.
+ * @attr: buffer description to print; a NULL pointer is reported as an error
+ */
+void show_rdma_buffer_attr(struct rdma_buffer_attr *attr){
+	static const char separator[] =
+		"---------------------------------------------------------\n";
+
+	if (attr == NULL) {
+		rdma_error("Passed attr is NULL\n");
+		return;
+	}
+	fputs(separator, stdout);
+	printf("buffer attr, addr: %p , len: %u , stag : 0x%x \n",
+			(void*) attr->address,
+			(unsigned int) attr->length,
+			attr->stag.local_stag);
+	fputs(separator, stdout);
+}
+
+/* Allocates a zeroed buffer of 'size' bytes and registers it with 'pd'.
+ * @pd: protection domain to register the buffer in
+ * @size: number of bytes to allocate
+ * @permission: access flags passed on to the registration
+ * Returns the memory region, or NULL when pd is NULL, the allocation fails
+ * or the registration fails (the buffer is freed again in that case).
+ */
+struct ibv_mr* rdma_buffer_alloc(struct ibv_pd *pd, uint32_t size,
+ enum ibv_access_flags permission)
+{
+	struct ibv_mr *region;
+	void *memory;
+
+	if (pd == NULL) {
+		rdma_error("Protection domain is NULL \n");
+		return NULL;
+	}
+	memory = calloc(1, size);
+	if (memory == NULL) {
+		rdma_error("failed to allocate buffer, -ENOMEM\n");
+		return NULL;
+	}
+	debug("Buffer allocated: %p , len: %u \n", memory, size);
+	region = rdma_buffer_register(pd, memory, size, permission);
+	if (region != NULL) {
+		return region;
+	}
+	/* registration failed: do not leak the allocation */
+	free(memory);
+	return NULL;
+}
+
+/* Registers an existing buffer with a protection domain via ibv_reg_mr().
+ * @pd: protection domain to register in
+ * @addr: start of the buffer
+ * @length: length of the buffer in bytes
+ * @permission: access flags for the region
+ * Returns the memory region, or NULL when pd is NULL or registration fails.
+ */
+struct ibv_mr *rdma_buffer_register(struct ibv_pd *pd,
+ void *addr, uint32_t length,
+ enum ibv_access_flags permission)
+{
+	struct ibv_mr *region;
+
+	if (pd == NULL) {
+		rdma_error("Protection domain is NULL, ignoring \n");
+		return NULL;
+	}
+	region = ibv_reg_mr(pd, addr, length, permission);
+	if (region == NULL) {
+		rdma_error("Failed to create mr on buffer, errno: %d \n", -errno);
+		return NULL;
+	}
+	debug("Registered: %p , len: %u , stag: 0x%x \n",
+			region->addr,
+			(unsigned int) region->length,
+			region->lkey);
+	return region;
+}
+
+/* Deregisters a memory region and frees the buffer it covered.
+ * @mr: memory region to release; a NULL pointer is reported and ignored
+ */
+void rdma_buffer_free(struct ibv_mr *mr)
+{
+	void *backing;
+
+	if (mr == NULL) {
+		rdma_error("Passed memory region is NULL, ignoring\n");
+		return ;
+	}
+	/* remember the buffer address before the region is deregistered */
+	backing = mr->addr;
+	rdma_buffer_deregister(mr);
+	debug("Buffer %p free'ed\n", backing);
+	free(backing);
+}
+
+/* Deregisters a memory region with ibv_dereg_mr(). The underlying buffer is
+ * not freed. A failure to deregister is logged.
+ * @mr: memory region to deregister; a NULL pointer is reported and ignored
+ */
+void rdma_buffer_deregister(struct ibv_mr *mr)
+{
+	int ret;
+	if (!mr) {
+		rdma_error("Passed memory region is NULL, ignoring\n");
+		return;
+	}
+	debug("Deregistered: %p , len: %u , stag : 0x%x \n",
+			mr->addr,
+			(unsigned int) mr->length,
+			mr->lkey);
+	/* ibv_dereg_mr() returns 0 or the error value; do not drop it silently */
+	ret = ibv_dereg_mr(mr);
+	if (ret) {
+		rdma_error("Failed to deregister memory region, errno: %d \n", -ret);
+	}
+}
+
+/* Waits for the next CM event on 'echannel' and checks that it has zero
+ * status and the expected type.
+ * @echannel: CM event channel to read from
+ * @expected_event: event type the caller is waiting for
+ * @cm_event: receives the event
+ * Returns 0 on success; the caller then owns the event and must acknowledge
+ * it with rdma_ack_cm_event(). On failure a non-zero value is returned, any
+ * received event has already been acknowledged here, and *cm_event is set
+ * to NULL so the caller cannot touch the released event.
+ */
+int process_rdma_cm_event(struct rdma_event_channel *echannel,
+ enum rdma_cm_event_type expected_event,
+ struct rdma_cm_event **cm_event)
+{
+	int ret = 1;
+	ret = rdma_get_cm_event(echannel, cm_event);
+	if (ret) {
+		/* read errno before logging and never return 0 on failure */
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to retrieve a cm event, errno: %d \n",
+				ret);
+		return ret;
+	}
+	/* lets see, if it was a good event */
+	if(0 != (*cm_event)->status){
+		rdma_error("CM event has non zero status: %d\n", (*cm_event)->status);
+		ret = -((*cm_event)->status);
+		/* important, we acknowledge the event */
+		rdma_ack_cm_event(*cm_event);
+		*cm_event = NULL;
+		return ret;
+	}
+	/* if it was a good event, was it of the expected type */
+	if ((*cm_event)->event != expected_event) {
+		rdma_error("Unexpected event received: %s [ expecting: %s ]\n",
+				rdma_event_str((*cm_event)->event),
+				rdma_event_str(expected_event));
+		/* important, we acknowledge the event */
+		rdma_ack_cm_event(*cm_event);
+		*cm_event = NULL;
+		return -1; // unexpected event :(
+	}
+	debug("A new %s type event is received \n", rdma_event_str((*cm_event)->event));
+	/* The caller must acknowledge the event */
+	return ret;
+}
+
+
+/* Waits for one notification on 'comp_channel', re-arms the CQ and then
+ * polls it until 'max_wc' work completions have been collected.
+ * @comp_channel: completion channel to wait on
+ * @wc: array that receives the work completions (at least max_wc entries)
+ * @max_wc: number of work completions to collect
+ * Returns the number of completions collected, or a negative value when
+ * waiting, re-arming or polling fails or when any completion carries an
+ * error status.
+ */
+int process_work_completion_events (struct ibv_comp_channel *comp_channel,
+ struct ibv_wc *wc, int max_wc)
+{
+	struct ibv_cq *cq_ptr = NULL;
+	void *context = NULL;
+	int ret = -1, i, total_wc = 0;
+	/* We wait for the notification on the CQ channel */
+	ret = ibv_get_cq_event(comp_channel, /* IO channel where we are expecting the notification */
+			&cq_ptr, /* which CQ has an activity. This should be the same as CQ we created before */
+			&context); /* Associated CQ user context */
+	if (ret) {
+		ret = errno ? -errno : -1;
+		rdma_error("Failed to get next CQ event due to %d \n", ret);
+		return ret;
+	}
+	/* Acknowledge the notification right away (one event, not one per WC)
+	 * so that no error return below leaves it unacknowledged; an
+	 * unacknowledged event makes ibv_destroy_cq() wait. */
+	ibv_ack_cq_events(cq_ptr, 1);
+	/* Request for more notifications. */
+	ret = ibv_req_notify_cq(cq_ptr, 0);
+	if (ret){
+		/* the error comes back as the return value, not through errno */
+		rdma_error("Failed to request further notifications %d \n", -ret);
+		return -ret;
+	}
+	/* We got notification. We reap the work completion (WC) elements. It is
+	 * unlikely but good practice to write the CQ polling code so that it
+	 * can handle zero WCs: ibv_poll_cq can return zero.
+	 */
+	total_wc = 0;
+	do {
+		ret = ibv_poll_cq(cq_ptr /* the CQ, we got notification for */,
+				max_wc - total_wc /* number of remaining WC elements*/,
+				wc + total_wc/* where to store */);
+		if (ret < 0) {
+			rdma_error("Failed to poll cq for wc due to %d \n", ret);
+			return ret;
+		}
+		total_wc += ret;
+	} while (total_wc < max_wc);
+	debug("%d WC are completed \n", total_wc);
+	/* Now we check validity and status of I/O work completions */
+	for( i = 0 ; i < total_wc ; i++) {
+		if (wc[i].status != IBV_WC_SUCCESS) {
+			rdma_error("Work completion (WC) has error status: %s at index %d\n",
+					ibv_wc_status_str(wc[i].status), i);
+			/* return negative value */
+			return -(wc[i].status);
+		}
+	}
+	return total_wc;
+}
+
+
+/* Code acknowledgment: rping.c from librdmacm/examples */
+/* Resolves a host name or IP string to an IPv4 socket address.
+ * @dst: host name or IP address string
+ * @addr: receives a struct sockaddr_in; the port is whatever getaddrinfo()
+ *        returns for a NULL service, so callers set the port afterwards
+ * Returns 0 on success or the getaddrinfo() error code.
+ */
+int get_addr(char *dst, struct sockaddr *addr)
+{
+	struct addrinfo hints;
+	struct addrinfo *res = NULL;
+	int ret = -1;
+	/* Only sizeof(struct sockaddr_in) bytes are copied below, so restrict
+	 * the lookup to IPv4; an IPv6 result would be truncated. */
+	memset(&hints, 0, sizeof hints);
+	hints.ai_family = AF_INET;
+	ret = getaddrinfo(dst, NULL, &hints, &res);
+	if (ret) {
+		rdma_error("getaddrinfo failed - invalid hostname or IP address\n");
+		return ret;
+	}
+	memcpy(addr, res->ai_addr, sizeof(struct sockaddr_in));
+	freeaddrinfo(res);
+	return ret;
+}
+
diff --git a/src/rdma_common.h b/src/rdma_common.h
new file mode 100644
index 0000000..5a228c9
--- /dev/null
+++ b/src/rdma_common.h
@@ -0,0 +1,133 @@
+/*
+ * Header file for the common RDMA routines used in the server/client example
+ * program.
+ *
+ * Author: Animesh Trivedi
+ *
+ */
+
+#ifndef RDMA_COMMON_H
+#define RDMA_COMMON_H
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <getopt.h>
+
+#include <netdb.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+
+#include <rdma/rdma_cma.h>
+#include <infiniband/verbs.h>
+
+/* Error Macro*/
+#define rdma_error(msg, args...) do {\
+ fprintf(stderr, "%s : %d : ERROR : "msg, __FILE__, __LINE__, ## args);\
+}while(0);
+
+#ifdef ACN_RDMA_DEBUG
+/* Debug Macro */
+#define debug(msg, args...) do {\
+ printf("DEBUG: "msg, ## args);\
+}while(0);
+
+#else
+
+#define debug(msg, args...)
+
+#endif /* ACN_RDMA_DEBUG */
+
+/* Capacity of the completion queue (CQ): the number of entries requested
+ * when the CQ is created */
+#define CQ_CAPACITY (16)
+/* Maximum scatter/gather entries (SGE) per posted work request; used for
+ * both the send and the receive side of the queue pair */
+#define MAX_SGE (2)
+/* Maximum outstanding work requests; used for both the send and the receive
+ * queue of the queue pair */
+#define MAX_WR (8)
+/* Default port where the RDMA server is listening; the client falls back to
+ * it when no -p option is given */
+#define DEFAULT_RDMA_PORT (20886)
+
+/*
+ * Description of an RDMA buffer (address, length, key). This structure is
+ * what the server and the client exchange to tell each other about their
+ * buffers.
+ * The packed attribute keeps the compiler from inserting padding into the
+ * structure.
+ *
+ * For details see: http://gcc.gnu.org/onlinedocs/gcc/Type-Attributes.html
+ */
+struct __attribute((packed)) rdma_buffer_attr {
+ uint64_t address; /* address of the described buffer */
+ uint32_t length; /* length of the described buffer in bytes */
+ union stag {
+ /* when we send the description, this is the key of our own buffer */
+ uint32_t local_stag;
+ /* when we receive the description, this is the key used to access the
+ * peer's buffer */
+ uint32_t remote_stag;
+ }stag;
+};
+/* resolves a given destination name to sin_addr */
+int get_addr(char *dst, struct sockaddr *addr);
+
+/* prints RDMA buffer info structure */
+void show_rdma_buffer_attr(struct rdma_buffer_attr *attr);
+
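+/*
+ * Processes an RDMA connection management (CM) event.
+ * @echannel: CM event channel where the event is expected.
+ * @expected_event: Expected event type
+ * @cm_event: where the event will be stored
+ * Returns 0 when the expected event arrived with zero status; the caller must
+ * then acknowledge it with rdma_ack_cm_event(). On any failure a non-zero
+ * value is returned and an event that was received has already been
+ * acknowledged by this function.
+ */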
+int process_rdma_cm_event(struct rdma_event_channel *echannel,
+ enum rdma_cm_event_type expected_event,
+ struct rdma_cm_event **cm_event);
+
+/* Allocates an RDMA buffer of size 'length' with permission permission. This
+ * function will also register the memory and returns a memory region (MR)
+ * identifier or NULL on error.
+ * @pd: Protection domain where the buffer should be allocated
+ * @length: Length of the buffer
+ * @permission: OR of IBV_ACCESS_* permissions as defined for the enum ibv_access_flags
+ */
+struct ibv_mr* rdma_buffer_alloc(struct ibv_pd *pd,
+ uint32_t length,
+ enum ibv_access_flags permission);
+
+/* Frees a previously allocated RDMA buffer. The buffer must be allocated by
+ * calling rdma_buffer_alloc();
+ * @mr: RDMA memory region to free
+ */
+void rdma_buffer_free(struct ibv_mr *mr);
+
+/* This function registers a previously allocated memory. Returns a memory region
+ * (MR) identifier or NULL on error.
+ * @pd: protection domain where to register memory
+ * @addr: Buffer address
+ * @length: Length of the buffer
+ * @permission: OR of IBV_ACCESS_* permissions as defined for the enum ibv_access_flags
+ */
+struct ibv_mr *rdma_buffer_register(struct ibv_pd *pd,
+ void *addr,
+ uint32_t length,
+ enum ibv_access_flags permission);
+/* Deregisters a previously registered memory region. The underlying buffer
+ * is not freed.
+ * @mr: Memory region to deregister
+ */
+void rdma_buffer_deregister(struct ibv_mr *mr);
+
+/* Processes a work completion (WC) notification.
+ * @comp_channel: Completion channel where the notifications are expected to arrive
+ * @wc: Array where to hold the work completion elements
+ * @max_wc: Number of work completion (WC) elements to wait for. wc must be
+ * at least this size.
+ * Returns the number of work completions collected, or a negative value on
+ * error.
+ */
+int process_work_completion_events(struct ibv_comp_channel *comp_channel,
+ struct ibv_wc *wc,
+ int max_wc);
+
+/* prints some details from the cm id */
+void show_rdma_cmid(struct rdma_cm_id *id);
+
+#endif /* RDMA_COMMON_H */
diff --git a/src/rdma_server.c b/src/rdma_server.c
new file mode 100644
index 0000000..42f018f
--- /dev/null
+++ b/src/rdma_server.c
@@ -0,0 +1,489 @@
+/*
+ * This is a RDMA server side code.
+ *
+ * Author: Animesh Trivedi
+ *
+ * TODO: Cleanup previously allocated resources in case of an error condition
+ */
+
+#include "rdma_common.h"
+
+/* File-scope state of the server. Every routine below reads or fills these. */
+
+/* Channel on which connection management (cm) events are relayed */
+static struct rdma_event_channel *cm_event_channel = NULL;
+/* Identifier the server listens on */
+static struct rdma_cm_id *cm_server_id = NULL;
+/* Identifier of the client, taken from the connect request event */
+static struct rdma_cm_id *cm_client_id = NULL;
+/* Protection domain that all resources of this connection belong to */
+static struct ibv_pd *pd = NULL;
+/* Channel on which I/O completion notifications arrive */
+static struct ibv_comp_channel *io_completion_channel = NULL;
+/* Completion queue shared by the send and the receive side of the QP */
+static struct ibv_cq *cq = NULL;
+/* Attributes used when the client queue pair is created */
+static struct ibv_qp_init_attr qp_init_attr;
+/* Shortcut to cm_client_id->qp */
+static struct ibv_qp *client_qp = NULL;
+
+/* Memory regions: client metadata, server data buffer, server metadata */
+static struct ibv_mr *client_metadata_mr = NULL;
+static struct ibv_mr *server_buffer_mr = NULL;
+static struct ibv_mr *server_metadata_mr = NULL;
+/* Buffer descriptions exchanged between the client and the server */
+static struct rdma_buffer_attr client_metadata_attr;
+static struct rdma_buffer_attr server_metadata_attr;
+/* Receive work request for the client metadata and its failure slot */
+static struct ibv_recv_wr client_recv_wr;
+static struct ibv_recv_wr *bad_client_recv_wr = NULL;
+/* Send work request for the server metadata and its failure slot */
+static struct ibv_send_wr server_send_wr;
+static struct ibv_send_wr *bad_server_send_wr = NULL;
+/* Scatter/gather elements referenced by the two work requests above */
+static struct ibv_sge client_recv_sge;
+static struct ibv_sge server_send_sge;
+
+/* Prepares the client connection before it is accepted. When this function is
+ * called cm_client_id must be set to a valid identifier.
+ *
+ * It creates, in this order, the protection domain (PD), the I/O completion
+ * channel, the completion queue (CQ) and the queue pair (QP), and stores them
+ * in the file-scope pd, io_completion_channel, cq and client_qp variables.
+ *
+ * Returns 0 on success and a negative value on failure. On failure everything
+ * that this call created is released again and the matching globals are reset
+ * to NULL, so nothing is leaked.
+ */
+static int setup_client_resources()
+{
+	int ret = -1;
+
+	if (!cm_client_id) {
+		rdma_error("Client id is still NULL \n");
+		return -EINVAL;
+	}
+	/* Protection Domain (PD) is similar to a "process abstraction"
+	 * in the operating system. All resources are tied to a particular PD.
+	 * And accessing resources across PDs will result in a protection fault.
+	 * verbs identifies the RDMA device on which the client connection came in.
+	 */
+	pd = ibv_alloc_pd(cm_client_id->verbs);
+	if (!pd) {
+		/* Capture errno before logging, the logging call may change it */
+		ret = -errno;
+		rdma_error("Failed to allocate a protection domain errno: %d\n",
+				ret);
+		goto err_out;
+	}
+	debug("A new protection domain is allocated at %p \n", pd);
+	/* The completion channel is where the I/O completion notifications are
+	 * sent. This is different from connection management (CM) event
+	 * notifications. It is tied to the same RDMA device as the PD.
+	 */
+	io_completion_channel = ibv_create_comp_channel(cm_client_id->verbs);
+	if (!io_completion_channel) {
+		ret = -errno;
+		rdma_error("Failed to create an I/O completion event channel, %d\n",
+				ret);
+		goto err_dealloc_pd;
+	}
+	debug("An I/O completion event channel is created at %p \n",
+			io_completion_channel);
+	/* The completion queue (CQ) is where the actual I/O completion metadata
+	 * is placed, packed into struct ibv_wc (wc = work completion).
+	 */
+	cq = ibv_create_cq(cm_client_id->verbs /* which device */,
+			CQ_CAPACITY /* maximum capacity */,
+			NULL /* user context, not used here */,
+			io_completion_channel /* which IO completion channel */,
+			0 /* signaling vector, not used here */);
+	if (!cq) {
+		ret = -errno;
+		rdma_error("Failed to create a completion queue (cq), errno: %d\n",
+				ret);
+		goto err_destroy_channel;
+	}
+	debug("Completion queue (CQ) is created at %p with %d elements \n",
+			cq, cq->cqe);
+	/* Ask for the event for all activities in the completion queue */
+	ret = ibv_req_notify_cq(cq /* on which CQ */,
+			0 /* 0 = all event type, no filter */);
+	if (ret) {
+		/* This verb hands back the error code itself instead of setting
+		 * errno, so use the return value and make it negative. */
+		ret = (ret > 0) ? -ret : ret;
+		rdma_error("Failed to request notifications on CQ errno: %d \n",
+				ret);
+		goto err_destroy_cq;
+	}
+	/* Last step: the queue pair with its send and receive capacities. The
+	 * capacity is defined statically by the MAX_WR and MAX_SGE constants. */
+	bzero(&qp_init_attr, sizeof qp_init_attr);
+	qp_init_attr.cap.max_recv_sge = MAX_SGE; /* Maximum SGE per receive posting */
+	qp_init_attr.cap.max_recv_wr = MAX_WR; /* Maximum receive posting capacity */
+	qp_init_attr.cap.max_send_sge = MAX_SGE; /* Maximum SGE per send posting */
+	qp_init_attr.cap.max_send_wr = MAX_WR; /* Maximum send posting capacity */
+	qp_init_attr.qp_type = IBV_QPT_RC; /* QP type, RC = Reliable connection */
+	/* The same completion queue serves both directions */
+	qp_init_attr.recv_cq = cq;
+	qp_init_attr.send_cq = cq;
+	ret = rdma_create_qp(cm_client_id /* which connection id */,
+			pd /* which protection domain */,
+			&qp_init_attr /* Initial attributes */);
+	if (ret) {
+		ret = -errno;
+		rdma_error("Failed to create QP due to errno: %d\n", ret);
+		goto err_destroy_cq;
+	}
+	/* Save the reference for handy typing but is not required */
+	client_qp = cm_client_id->qp;
+	debug("Client QP created at %p\n", client_qp);
+	return 0;
+
+	/* Unwind in the reverse order of creation */
+err_destroy_cq:
+	ibv_destroy_cq(cq);
+	cq = NULL;
+err_destroy_channel:
+	ibv_destroy_comp_channel(io_completion_channel);
+	io_completion_channel = NULL;
+err_dealloc_pd:
+	ibv_dealloc_pd(pd);
+	pd = NULL;
+err_out:
+	/* Never report success from a failure path, even if errno was 0 */
+	return ret ? ret : -1;
+}
+
+/* Starts an RDMA server by allocating basic connection resources.
+ *
+ * Creates the cm event channel and the listening identifier, binds and listens
+ * on the given address and then blocks until a client connect request arrives.
+ * The identifier of that client is stored in cm_client_id.
+ *
+ * @server_addr: IPv4 address and port to listen on
+ *
+ * Returns 0 on success and a negative value on failure. If creating, binding
+ * or listening fails, the event channel and the listening identifier created
+ * so far are destroyed again.
+ */
+static int start_rdma_server(struct sockaddr_in *server_addr)
+{
+	struct rdma_cm_event *cm_event = NULL;
+	int ret = -1;
+
+	if (!server_addr) {
+		rdma_error("Server address is NULL \n");
+		return -EINVAL;
+	}
+	/* Open a channel used to report asynchronous communication event */
+	cm_event_channel = rdma_create_event_channel();
+	if (!cm_event_channel) {
+		/* Capture errno before logging, the logging call may change it */
+		ret = -errno;
+		rdma_error("Creating cm event channel failed with errno : (%d)", ret);
+		return ret ? ret : -1;
+	}
+	debug("RDMA CM event channel is created successfully at %p \n",
+			cm_event_channel);
+	/* rdma_cm_id is the connection identifier (like socket) which is used
+	 * to define an RDMA connection.
+	 */
+	ret = rdma_create_id(cm_event_channel, &cm_server_id, NULL, RDMA_PS_TCP);
+	if (ret) {
+		ret = -errno;
+		rdma_error("Creating server cm id failed with errno: %d ", ret);
+		goto err_destroy_channel;
+	}
+	debug("A RDMA connection id for the server is created \n");
+	/* Explicit binding of rdma cm id to the socket credentials */
+	ret = rdma_bind_addr(cm_server_id, (struct sockaddr*) server_addr);
+	if (ret) {
+		ret = -errno;
+		rdma_error("Failed to bind server address, errno: %d \n", ret);
+		goto err_destroy_id;
+	}
+	debug("Server RDMA CM id is successfully binded \n");
+	/* Unlike a normal TCP listen this is a non-blocking call. When a new
+	 * client connects, a connection management (CM) event is generated on
+	 * the event channel from which the listening id was created. */
+	ret = rdma_listen(cm_server_id, 8); /* backlog = 8 clients, same as TCP, see man listen */
+	if (ret) {
+		ret = -errno;
+		rdma_error("rdma_listen failed to listen on server address, errno: %d ",
+				ret);
+		goto err_destroy_id;
+	}
+	printf("Server is listening successfully at: %s , port: %d \n",
+			inet_ntoa(server_addr->sin_addr),
+			ntohs(server_addr->sin_port));
+	/* Now we expect a client to connect and generate an
+	 * RDMA_CM_EVENT_CONNECT_REQUEST. We block on the event channel for it.
+	 */
+	ret = process_rdma_cm_event(cm_event_channel,
+			RDMA_CM_EVENT_CONNECT_REQUEST,
+			&cm_event);
+	if (ret) {
+		rdma_error("Failed to get cm event, ret = %d \n" , ret);
+		/* Whether an event is still unacknowledged is not visible from
+		 * here, and destroying an id with pending events would block,
+		 * so no teardown is attempted on this path. */
+		return ret;
+	}
+	/* Much like TCP connection, listening returns a new connection identifier
+	 * for the newly connected client. It is stored in the id field of the
+	 * event. For more details: man rdma_get_cm_event
+	 */
+	cm_client_id = cm_event->id;
+	/* Acknowledging the event frees the event structure, hence the client id
+	 * was copied out of it before this call. */
+	ret = rdma_ack_cm_event(cm_event);
+	if (ret) {
+		ret = -errno;
+		rdma_error("Failed to acknowledge the cm event errno: %d \n", ret);
+		return ret ? ret : -1;
+	}
+	debug("A new RDMA client connection id is stored at %p\n", cm_client_id);
+	return 0;
+
+	/* Unwind in the reverse order of creation */
+err_destroy_id:
+	rdma_destroy_id(cm_server_id);
+	cm_server_id = NULL;
+err_destroy_channel:
+	rdma_destroy_event_channel(cm_event_channel);
+	cm_event_channel = NULL;
+	/* Never report success from a failure path, even if errno was 0 */
+	return ret ? ret : -1;
+}
+
+/* Pre-posts a receive buffer and accepts an RDMA client connection.
+ *
+ * The receive buffer is client_metadata_attr; it is registered here and posted
+ * on client_qp so that the metadata sent by the client has a place to land.
+ * After that the connection is accepted and the function blocks until the
+ * RDMA_CM_EVENT_ESTABLISHED event has been received and acknowledged.
+ *
+ * Requires cm_client_id and client_qp to be set up already.
+ * Returns 0 on success and a negative value on failure.
+ */
+static int accept_client_connection()
+{
+	struct rdma_conn_param conn_param;
+	struct rdma_cm_event *cm_event = NULL;
+	struct sockaddr_in remote_sockaddr;
+	int ret = -1;
+
+	if (!cm_client_id || !client_qp) {
+		rdma_error("Client resources are not properly setup\n");
+		return -EINVAL;
+	}
+	/* we prepare the receive buffer in which we will receive the client metadata */
+	client_metadata_mr = rdma_buffer_register(pd /* which protection domain */,
+			&client_metadata_attr /* what memory */,
+			sizeof(client_metadata_attr) /* what length */,
+			(IBV_ACCESS_LOCAL_WRITE) /* access permissions */);
+	if (!client_metadata_mr) {
+		rdma_error("Failed to register client attr buffer\n");
+		/* we assume ENOMEM */
+		return -ENOMEM;
+	}
+	/* We pre-post this receive buffer on the QP. The SGE describes where the
+	 * metadata from the client is received. */
+	client_recv_sge.addr = (uint64_t) client_metadata_mr->addr; /* same as &client_metadata_attr */
+	client_recv_sge.length = client_metadata_mr->length;
+	client_recv_sge.lkey = client_metadata_mr->lkey;
+	/* Now we link this SGE to the work request (WR) */
+	bzero(&client_recv_wr, sizeof(client_recv_wr));
+	client_recv_wr.sg_list = &client_recv_sge;
+	client_recv_wr.num_sge = 1; /* only one SGE */
+	ret = ibv_post_recv(client_qp /* which QP */,
+			&client_recv_wr /* receive work request */,
+			&bad_client_recv_wr /* error WRs */);
+	if (ret) {
+		/* The verb returns the error code itself. Make it negative like
+		 * every other error this function returns. */
+		ret = (ret > 0) ? -ret : ret;
+		rdma_error("Failed to pre-post the receive buffer, errno: %d \n", ret);
+		/* Nothing was posted, so the registration can be dropped again */
+		rdma_buffer_deregister(client_metadata_mr);
+		client_metadata_mr = NULL;
+		return ret;
+	}
+	debug("Receive buffer pre-posting is successful \n");
+	/* Now we accept the connection. It was not accepted earlier because the
+	 * resources above had to be in place first. */
+	memset(&conn_param, 0, sizeof(conn_param));
+	/* this tells how many outstanding requests we can handle */
+	conn_param.initiator_depth = 3; /* For this exercise, we put a small number here */
+	/* This tells how many outstanding requests we expect the other side to handle */
+	conn_param.responder_resources = 3; /* For this exercise, we put a small number */
+	ret = rdma_accept(cm_client_id, &conn_param);
+	if (ret) {
+		/* Capture errno before logging, the logging call may change it */
+		ret = -errno;
+		rdma_error("Failed to accept the connection, errno: %d \n", ret);
+		return ret ? ret : -1;
+	}
+	/* We expect an RDMA_CM_EVENT_ESTABLISHED to indicate that the RDMA
+	 * connection has been established on both the server and the client.
+	 */
+	debug("Going to wait for : RDMA_CM_EVENT_ESTABLISHED event \n");
+	ret = process_rdma_cm_event(cm_event_channel,
+			RDMA_CM_EVENT_ESTABLISHED,
+			&cm_event);
+	if (ret) {
+		/* The helper reports its own error code; errno may well be 0
+		 * here, which would wrongly signal success to the caller. */
+		rdma_error("Failed to get the cm event, ret = %d \n", ret);
+		return ret;
+	}
+	/* We acknowledge the event */
+	ret = rdma_ack_cm_event(cm_event);
+	if (ret) {
+		ret = -errno;
+		rdma_error("Failed to acknowledge the cm event %d\n", ret);
+		return ret ? ret : -1;
+	}
+	/* Just FYI: How to extract connection information */
+	memcpy(&remote_sockaddr /* where to save */,
+			rdma_get_peer_addr(cm_client_id) /* gives you remote sockaddr */,
+			sizeof(struct sockaddr_in) /* max size */);
+	printf("A new connection is accepted from %s \n",
+			inet_ntoa(remote_sockaddr.sin_addr));
+	return 0;
+}
+
+/* This function sends server side buffer metadata to the connected client.
+ *
+ * Steps:
+ *  1. wait for the completion of the pre-posted receive, which carries the
+ *     client metadata (client_metadata_attr);
+ *  2. allocate and register a buffer of the length the client asked for;
+ *  3. describe that buffer in server_metadata_attr, register the description
+ *     and send it to the client;
+ *  4. wait for the completion of that send.
+ *
+ * Returns 0 on success and a negative value on failure. Memory regions created
+ * here are released again when a step before the send has been posted fails.
+ */
+static int send_server_metadata_to_client()
+{
+	struct ibv_wc wc;
+	int ret = -1;
+
+	/* The client starts the communication by sending its metadata. The
+	 * server does not use it beyond the requested length. We get a work
+	 * completion notification for our pre-posted receive request.
+	 */
+	ret = process_work_completion_events(io_completion_channel, &wc, 1);
+	if (ret != 1) {
+		rdma_error("Failed to receive , ret = %d \n", ret);
+		/* Anything but exactly one completion is a failure. Make sure
+		 * the caller never sees a non-negative value for it. */
+		return (ret < 0) ? ret : -EIO;
+	}
+	/* if all good, then we should have client's buffer information, lets see */
+	printf("Client side buffer information is received...\n");
+	show_rdma_buffer_attr(&client_metadata_attr);
+	printf("The client has requested buffer length of : %u bytes \n",
+			client_metadata_attr.length);
+	/* We need to setup requested memory buffer. This is where the client will
+	 * do RDMA READs and WRITEs. */
+	server_buffer_mr = rdma_buffer_alloc(pd /* which protection domain */,
+			client_metadata_attr.length /* what size to allocate */,
+			(IBV_ACCESS_LOCAL_WRITE|
+			 IBV_ACCESS_REMOTE_READ|
+			 IBV_ACCESS_REMOTE_WRITE) /* access permissions */);
+	if (!server_buffer_mr) {
+		rdma_error("Server failed to create a buffer \n");
+		/* we assume that it is due to out of memory error */
+		return -ENOMEM;
+	}
+	/* server_metadata_attr describes the buffer above to the client, hence
+	 * it is called the metadata buffer. It is a static object, so it only
+	 * has to be registered.
+	 */
+	server_metadata_attr.address = (uint64_t) server_buffer_mr->addr;
+	server_metadata_attr.length = (uint32_t) server_buffer_mr->length;
+	/* The peer uses this key for its RDMA READ/WRITE on our buffer, so it
+	 * must be the remote key (rkey). The lkey is only valid for local work
+	 * requests and is not guaranteed to have the same value.
+	 * NOTE(review): the field is named local_stag in rdma_buffer_attr;
+	 * confirm that the client reads this field as its rkey. */
+	server_metadata_attr.stag.local_stag = (uint32_t) server_buffer_mr->rkey;
+	server_metadata_mr = rdma_buffer_register(pd /* which protection domain */,
+			&server_metadata_attr /* which memory to register */,
+			sizeof(server_metadata_attr) /* what is the size of memory */,
+			IBV_ACCESS_LOCAL_WRITE /* what access permission */);
+	if (!server_metadata_mr) {
+		rdma_error("Server failed to create to hold server metadata \n");
+		/* we assume that this is due to out of memory error */
+		ret = -ENOMEM;
+		goto err_free_buffer;
+	}
+	/* We need to transmit this buffer. So we create a send request.
+	 * A send request consists of multiple SGE elements. In our case, we only
+	 * have one
+	 */
+	server_send_sge.addr = (uint64_t) &server_metadata_attr;
+	server_send_sge.length = sizeof(server_metadata_attr);
+	server_send_sge.lkey = server_metadata_mr->lkey;
+	/* now we link this sge to the send request */
+	bzero(&server_send_wr, sizeof(server_send_wr));
+	server_send_wr.sg_list = &server_send_sge;
+	server_send_wr.num_sge = 1; /* only 1 SGE element in the array */
+	server_send_wr.opcode = IBV_WR_SEND; /* This is a send request */
+	server_send_wr.send_flags = IBV_SEND_SIGNALED; /* We want to get notification */
+	/* This is a fast data path operation. Posting an I/O request */
+	ret = ibv_post_send(client_qp /* which QP */,
+			&server_send_wr /* Send request that we prepared before */,
+			&bad_server_send_wr /* In case of error, this will contain failed requests */);
+	if (ret) {
+		/* The verb returns the error code itself instead of setting
+		 * errno, so -errno could be 0 here and hide the failure. */
+		ret = (ret > 0) ? -ret : ret;
+		rdma_error("Posting of server metdata failed, errno: %d \n",
+				ret);
+		goto err_dereg_metadata;
+	}
+	/* We check for completion notification */
+	ret = process_work_completion_events(io_completion_channel, &wc, 1);
+	if (ret != 1) {
+		rdma_error("Failed to send server metadata, ret = %d \n", ret);
+		/* The send may still be in flight, so the regions are left
+		 * registered on this path. */
+		return (ret < 0) ? ret : -EIO;
+	}
+	debug("Local buffer metadata has been sent to the client \n");
+	return 0;
+
+	/* Unwind in the reverse order of creation */
+err_dereg_metadata:
+	rdma_buffer_deregister(server_metadata_mr);
+	server_metadata_mr = NULL;
+err_free_buffer:
+	rdma_buffer_free(server_buffer_mr);
+	server_buffer_mr = NULL;
+	return ret;
+}
+
+/* This is server side logic. Server passively waits for the client to call
+ * rdma_disconnect() and then it will clean up its resources.
+ *
+ * Blocks until RDMA_CM_EVENT_DISCONNECTED has been received and acknowledged.
+ * After that the QP, the client id, the CQ, the completion channel, the
+ * memory regions, the PD, the server id and the event channel are released in
+ * that order. A failure to release one resource is logged and the teardown
+ * continues with the next one. Released globals are reset to NULL.
+ *
+ * Returns 0 once the teardown has run, or a negative value if the disconnect
+ * event could not be obtained or acknowledged (nothing is released then).
+ */
+static int disconnect_and_cleanup()
+{
+	struct rdma_cm_event *cm_event = NULL;
+	int ret = -1;
+
+	/* Now we wait for the client to send us disconnect event */
+	debug("Waiting for cm event: RDMA_CM_EVENT_DISCONNECTED\n");
+	ret = process_rdma_cm_event(cm_event_channel,
+			RDMA_CM_EVENT_DISCONNECTED,
+			&cm_event);
+	if (ret) {
+		rdma_error("Failed to get disconnect event, ret = %d \n", ret);
+		return ret;
+	}
+	/* We acknowledge the event */
+	ret = rdma_ack_cm_event(cm_event);
+	if (ret) {
+		/* Capture errno before logging, the logging call may change it */
+		ret = -errno;
+		rdma_error("Failed to acknowledge the cm event %d\n", ret);
+		return ret ? ret : -1;
+	}
+	printf("A disconnect event is received from the client...\n");
+	/* We free all the resources */
+	/* Destroy QP */
+	rdma_destroy_qp(cm_client_id);
+	client_qp = NULL;
+	/* Destroy client cm id */
+	ret = rdma_destroy_id(cm_client_id);
+	if (ret) {
+		rdma_error("Failed to destroy client id cleanly, %d \n", -errno);
+		/* we continue anyways */
+	}
+	cm_client_id = NULL;
+	/* Destroy CQ. The ibv_* destroy calls return the error code itself
+	 * rather than setting errno, so the return value is what gets logged. */
+	ret = ibv_destroy_cq(cq);
+	if (ret) {
+		rdma_error("Failed to destroy completion queue cleanly, %d \n",
+				(ret > 0) ? -ret : ret);
+		/* we continue anyways */
+	}
+	cq = NULL;
+	/* Destroy completion channel */
+	ret = ibv_destroy_comp_channel(io_completion_channel);
+	if (ret) {
+		rdma_error("Failed to destroy completion channel cleanly, %d \n",
+				(ret > 0) ? -ret : ret);
+		/* we continue anyways */
+	}
+	io_completion_channel = NULL;
+	/* Destroy memory buffers */
+	rdma_buffer_free(server_buffer_mr);
+	server_buffer_mr = NULL;
+	rdma_buffer_deregister(server_metadata_mr);
+	server_metadata_mr = NULL;
+	rdma_buffer_deregister(client_metadata_mr);
+	client_metadata_mr = NULL;
+	/* Destroy protection domain */
+	ret = ibv_dealloc_pd(pd);
+	if (ret) {
+		rdma_error("Failed to destroy client protection domain cleanly, %d \n",
+				(ret > 0) ? -ret : ret);
+		/* we continue anyways */
+	}
+	pd = NULL;
+	/* Destroy rdma server id */
+	ret = rdma_destroy_id(cm_server_id);
+	if (ret) {
+		rdma_error("Failed to destroy server id cleanly, %d \n", -errno);
+		/* we continue anyways */
+	}
+	cm_server_id = NULL;
+	rdma_destroy_event_channel(cm_event_channel);
+	cm_event_channel = NULL;
+	printf("Server shut-down is complete \n");
+	return 0;
+}
+
+
+/* Writes the command line synopsis and the default port to stdout and then
+ * terminates the process with exit status 1. It does not return. */
+void usage()
+{
+	fputs("Usage:\n", stdout);
+	fputs("rdma_server: [-a <server_addr>] [-p <server_port>]\n", stdout);
+	fprintf(stdout, "(default port is %d)\n", DEFAULT_RDMA_PORT);
+	exit(1);
+}
+
+/* Entry point of the RDMA server.
+ *
+ * Options:
+ *   -a <server_addr>  address to listen on (default: any address)
+ *   -p <server_port>  port to listen on (default: DEFAULT_RDMA_PORT)
+ *
+ * The server handles a single client: it starts listening, prepares the
+ * client resources, accepts the connection, sends the server buffer metadata
+ * and finally waits for the client to disconnect before cleaning up.
+ *
+ * Returns 0 on success or the error value of the stage that failed.
+ */
+int main(int argc, char **argv)
+{
+	int ret, option;
+	long port_arg;
+	char *end = NULL;
+	/* Port given with -p, in network byte order; 0 means "not given" */
+	uint16_t requested_port = 0;
+	struct sockaddr_in server_sockaddr;
+
+	bzero(&server_sockaddr, sizeof server_sockaddr);
+	server_sockaddr.sin_family = AF_INET; /* standard IP NET address */
+	server_sockaddr.sin_addr.s_addr = htonl(INADDR_ANY); /* passed address */
+	/* Parse Command Line Arguments */
+	while ((option = getopt(argc, argv, "a:p:")) != -1) {
+		switch (option) {
+		case 'a':
+			/* Remember, this will overwrite the port info. That is
+			 * why the -p value is kept in requested_port and only
+			 * applied once all options have been parsed. */
+			ret = get_addr(optarg, (struct sockaddr*) &server_sockaddr);
+			if (ret) {
+				rdma_error("Invalid IP \n");
+				return ret;
+			}
+			break;
+		case 'p':
+			/* passed port to listen on; reject anything that is not
+			 * a complete number in the 16 bit port range */
+			errno = 0;
+			port_arg = strtol(optarg, &end, 0);
+			if (errno || end == optarg || *end != '\0' ||
+					port_arg < 0 || port_arg > 65535) {
+				rdma_error("Invalid port: %s \n", optarg);
+				usage();
+			}
+			requested_port = htons((uint16_t) port_arg);
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+	if (requested_port) {
+		/* An explicit -p wins regardless of where -a appeared */
+		server_sockaddr.sin_port = requested_port;
+	}
+	if (!server_sockaddr.sin_port) {
+		/* If still zero, that mean no port info provided */
+		server_sockaddr.sin_port = htons(DEFAULT_RDMA_PORT); /* use default port */
+	}
+	ret = start_rdma_server(&server_sockaddr);
+	if (ret) {
+		rdma_error("RDMA server failed to start cleanly, ret = %d \n", ret);
+		return ret;
+	}
+	ret = setup_client_resources();
+	if (ret) {
+		rdma_error("Failed to setup client resources, ret = %d \n", ret);
+		return ret;
+	}
+	ret = accept_client_connection();
+	if (ret) {
+		rdma_error("Failed to handle client cleanly, ret = %d \n", ret);
+		return ret;
+	}
+	ret = send_server_metadata_to_client();
+	if (ret) {
+		rdma_error("Failed to send server metadata to the client, ret = %d \n", ret);
+		return ret;
+	}
+	ret = disconnect_and_cleanup();
+	if (ret) {
+		rdma_error("Failed to clean up resources properly, ret = %d \n", ret);
+		return ret;
+	}
+	return 0;
+}