diff options
| author | 智皓 张 <[email protected]> | 2023-08-04 15:24:55 +0800 |
|---|---|---|
| committer | 智皓 张 <[email protected]> | 2023-08-04 15:24:55 +0800 |
| commit | 1c82c0c7a27ea7778a5d2ca5104d822209afeb75 (patch) | |
| tree | a6911a2886f1fe4339e3d2b8dc0cded4f0c82618 | |
| parent | 8f0fe11da82349c15272b92115b2bc427a8e3a8e (diff) | |
update
| -rw-r--r-- | rdma-example-master/rdma-example-master/CMakeLists.txt | 34 | ||||
| -rw-r--r-- | rdma-example-master/rdma-example-master/LICENSE | 201 | ||||
| -rw-r--r-- | rdma-example-master/rdma-example-master/README.md | 51 | ||||
| -rw-r--r-- | rdma-example-master/rdma-example-master/src/rdma_client.c | 553 | ||||
| -rw-r--r-- | rdma-example-master/rdma-example-master/src/rdma_common.c | 210 | ||||
| -rw-r--r-- | rdma-example-master/rdma-example-master/src/rdma_common.h | 133 | ||||
| -rw-r--r-- | rdma-example-master/rdma-example-master/src/rdma_server.c | 489 |
7 files changed, 1671 insertions, 0 deletions
diff --git a/rdma-example-master/rdma-example-master/CMakeLists.txt b/rdma-example-master/rdma-example-master/CMakeLists.txt new file mode 100644 index 0000000..2471b09 --- /dev/null +++ b/rdma-example-master/rdma-example-master/CMakeLists.txt @@ -0,0 +1,34 @@ +# Author : Animesh Trivedi + +cmake_minimum_required (VERSION 2.6) + +project (rdma-example) + +set(PROJECT_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src) +set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin) +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(CMAKE_BUILD_TYPE Release) # or debug if you need to use gdb on it. + +# Some how g++ has better performance than clang++. Of course I don't know all flags for clang++. +set(CMAKE_CXX_COMPILER g++) # or clang++ +#set(CMAKE_CXX_COMPILER clang++) # or clang++ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -Ofast -ffast-math -funroll-loops -march=native") # get crazy here +#add_compile_options("-std=c++17") +#set(CMAKE_CXX_FLAGS "-O0 -ggdb") + +find_library(IBVERBS_LIBRARY ibverbs HINTS /home/atr/local/lib) +find_library(RDMACM_LIBRARY rdmacm HINTS /home/atr/local/lib) + +link_libraries(pthread ${IBVERBS_LIBRARY} ${RDMACM_LIBRARY}) + +include_directories("${PROJECT_SOURCE_DIR}" "/home/atr/local/include/") + +add_executable(rdma_server ${PROJECT_SOURCE_DIR}/rdma_common.c ${PROJECT_SOURCE_DIR}/rdma_server.c) +add_executable(rdma_client ${PROJECT_SOURCE_DIR}/rdma_common.c ${PROJECT_SOURCE_DIR}/rdma_client.c) + diff --git a/rdma-example-master/rdma-example-master/LICENSE b/rdma-example-master/rdma-example-master/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/rdma-example-master/rdma-example-master/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/rdma-example-master/rdma-example-master/README.md b/rdma-example-master/rdma-example-master/README.md new file mode 100644 index 0000000..fc6af6e --- /dev/null +++ b/rdma-example-master/rdma-example-master/README.md @@ -0,0 +1,51 @@ +# RDMA example + +A simple RDMA server client example. The code contains a lot of comments. Here is the workflow that happens in the example: + +Client: + 1. setup RDMA resources + 2. connect to the server + 3. receive server side buffer information via send/recv exchange + 4. do an RDMA write to the server buffer from a (first) local buffer. The content of the buffer is the string passed with the `-s` argument. + 5. do an RDMA read to read the content of the server buffer into a second local buffer. + 6. 
compare the contents of the first and second buffers and check that they match. + 7. disconnect + +Server: + 1. setup RDMA resources + 2. wait for a client to connect + 3. allocate and pin a server buffer + 4. accept the incoming client connection + 5. send information about the local server buffer to the client + 6. wait for disconnect + +###### How to run +```text +git clone https://github.com/animeshtrivedi/rdma-example.git +cd ./rdma-example +cmake . +make +``` + +###### server +```text +./bin/rdma_server +``` +###### client +```text +atr@atr:~/rdma-example$ ./bin/rdma_client -a 127.0.0.1 -s textstring +Passed string is : textstring , with count 10 +Trying to connect to server at : 127.0.0.1 port: 20886 +The client is connected successfully +--------------------------------------------------------- +buffer attr, addr: 0x5629832e22c0 , len: 10 , stag : 0x1617b400 +--------------------------------------------------------- +... +SUCCESS, source and destination buffers match +Client resource clean up is complete +atr@atr:~/rdma-example$ + +``` + +## Don't have an RDMA device? +In case you do not have an RDMA device to test the code, you can set up the SoftiWARP software RDMA device on your Linux machine. Follow instructions here: [https://github.com/animeshtrivedi/blog/blob/master/post/2019-06-26-siw.md](https://github.com/animeshtrivedi/blog/blob/master/post/2019-06-26-siw.md). diff --git a/rdma-example-master/rdma-example-master/src/rdma_client.c b/rdma-example-master/rdma-example-master/src/rdma_client.c new file mode 100644 index 0000000..a640be6 --- /dev/null +++ b/rdma-example-master/rdma-example-master/src/rdma_client.c @@ -0,0 +1,553 @@ +/* + * An example RDMA client side code. 
+ * Author: Animesh Trivedi + */ + +#include "rdma_common.h" + +/* These are basic RDMA resources */ +/* These are RDMA connection related resources */ +static struct rdma_event_channel *cm_event_channel = NULL; +static struct rdma_cm_id *cm_client_id = NULL; +static struct ibv_pd *pd = NULL; +static struct ibv_comp_channel *io_completion_channel = NULL; +static struct ibv_cq *client_cq = NULL; +static struct ibv_qp_init_attr qp_init_attr; +static struct ibv_qp *client_qp; +/* These are memory buffers related resources */ +static struct ibv_mr *client_metadata_mr = NULL, + *client_src_mr = NULL, + *client_dst_mr = NULL, + *server_metadata_mr = NULL; +static struct rdma_buffer_attr client_metadata_attr, server_metadata_attr; +static struct ibv_send_wr client_send_wr, *bad_client_send_wr = NULL; +static struct ibv_recv_wr server_recv_wr, *bad_server_recv_wr = NULL; +static struct ibv_sge client_send_sge, server_recv_sge; +/* Source and Destination buffers, where RDMA operations source and sink */ +static char *src = NULL, *dst = NULL; + +/* This is our testing function */ +static int check_src_dst() +{ + return memcmp((void*) src, (void*) dst, strlen(src)); +} + +/* This function prepares client side connection resources for an RDMA connection */ +static int client_prepare_connection(struct sockaddr_in *s_addr) +{ + struct rdma_cm_event *cm_event = NULL; + int ret = -1; + /* Open a channel used to report asynchronous communication event */ + cm_event_channel = rdma_create_event_channel(); + if (!cm_event_channel) { + rdma_error("Creating cm event channel failed, errno: %d \n", -errno); + return -errno; + } + debug("RDMA CM event channel is created at : %p \n", cm_event_channel); + /* rdma_cm_id is the connection identifier (like socket) which is used + * to define an RDMA connection. 
+ */ + ret = rdma_create_id(cm_event_channel, &cm_client_id, + NULL, + RDMA_PS_TCP); + if (ret) { + rdma_error("Creating cm id failed with errno: %d \n", -errno); + return -errno; + } + /* Resolve destination and optional source addresses from IP addresses to + * an RDMA address. If successful, the specified rdma_cm_id will be bound + * to a local device. */ + ret = rdma_resolve_addr(cm_client_id, NULL, (struct sockaddr*) s_addr, 2000); + if (ret) { + rdma_error("Failed to resolve address, errno: %d \n", -errno); + return -errno; + } + debug("waiting for cm event: RDMA_CM_EVENT_ADDR_RESOLVED\n"); + ret = process_rdma_cm_event(cm_event_channel, + RDMA_CM_EVENT_ADDR_RESOLVED, + &cm_event); + if (ret) { + rdma_error("Failed to receive a valid event, ret = %d \n", ret); + return ret; + } + /* we ack the event */ + ret = rdma_ack_cm_event(cm_event); + if (ret) { + rdma_error("Failed to acknowledge the CM event, errno: %d\n", -errno); + return -errno; + } + debug("RDMA address is resolved \n"); + + /* Resolves an RDMA route to the destination address in order to + * establish a connection */ + ret = rdma_resolve_route(cm_client_id, 2000); + if (ret) { + rdma_error("Failed to resolve route, erno: %d \n", -errno); + return -errno; + } + debug("waiting for cm event: RDMA_CM_EVENT_ROUTE_RESOLVED\n"); + ret = process_rdma_cm_event(cm_event_channel, + RDMA_CM_EVENT_ROUTE_RESOLVED, + &cm_event); + if (ret) { + rdma_error("Failed to receive a valid event, ret = %d \n", ret); + return ret; + } + /* we ack the event */ + ret = rdma_ack_cm_event(cm_event); + if (ret) { + rdma_error("Failed to acknowledge the CM event, errno: %d \n", -errno); + return -errno; + } + printf("Trying to connect to server at : %s port: %d \n", + inet_ntoa(s_addr->sin_addr), + ntohs(s_addr->sin_port)); + /* Protection Domain (PD) is similar to a "process abstraction" + * in the operating system. All resources are tied to a particular PD. 
+ * And accessing recourses across PD will result in a protection fault. + */ + pd = ibv_alloc_pd(cm_client_id->verbs); + if (!pd) { + rdma_error("Failed to alloc pd, errno: %d \n", -errno); + return -errno; + } + debug("pd allocated at %p \n", pd); + /* Now we need a completion channel, were the I/O completion + * notifications are sent. Remember, this is different from connection + * management (CM) event notifications. + * A completion channel is also tied to an RDMA device, hence we will + * use cm_client_id->verbs. + */ + io_completion_channel = ibv_create_comp_channel(cm_client_id->verbs); + if (!io_completion_channel) { + rdma_error("Failed to create IO completion event channel, errno: %d\n", + -errno); + return -errno; + } + debug("completion event channel created at : %p \n", io_completion_channel); + /* Now we create a completion queue (CQ) where actual I/O + * completion metadata is placed. The metadata is packed into a structure + * called struct ibv_wc (wc = work completion). ibv_wc has detailed + * information about the work completion. An I/O request in RDMA world + * is called "work" ;) + */ + client_cq = ibv_create_cq(cm_client_id->verbs /* which device*/, + CQ_CAPACITY /* maximum capacity*/, + NULL /* user context, not used here */, + io_completion_channel /* which IO completion channel */, + 0 /* signaling vector, not used here*/); + if (!client_cq) { + rdma_error("Failed to create CQ, errno: %d \n", -errno); + return -errno; + } + debug("CQ created at %p with %d elements \n", client_cq, client_cq->cqe); + ret = ibv_req_notify_cq(client_cq, 0); + if (ret) { + rdma_error("Failed to request notifications, errno: %d\n", -errno); + return -errno; + } + /* Now the last step, set up the queue pair (send, recv) queues and their capacity. + * The capacity here is define statically but this can be probed from the + * device. 
We just use a small number as defined in rdma_common.h */ + bzero(&qp_init_attr, sizeof qp_init_attr); + qp_init_attr.cap.max_recv_sge = MAX_SGE; /* Maximum SGE per receive posting */ + qp_init_attr.cap.max_recv_wr = MAX_WR; /* Maximum receive posting capacity */ + qp_init_attr.cap.max_send_sge = MAX_SGE; /* Maximum SGE per send posting */ + qp_init_attr.cap.max_send_wr = MAX_WR; /* Maximum send posting capacity */ + qp_init_attr.qp_type = IBV_QPT_RC; /* QP type, RC = Reliable connection */ + /* We use same completion queue, but one can use different queues */ + qp_init_attr.recv_cq = client_cq; /* Where should I notify for receive completion operations */ + qp_init_attr.send_cq = client_cq; /* Where should I notify for send completion operations */ + /*Lets create a QP */ + ret = rdma_create_qp(cm_client_id /* which connection id */, + pd /* which protection domain*/, + &qp_init_attr /* Initial attributes */); + if (ret) { + rdma_error("Failed to create QP, errno: %d \n", -errno); + return -errno; + } + client_qp = cm_client_id->qp; + debug("QP created at %p \n", client_qp); + return 0; +} + +/* Pre-posts a receive buffer before calling rdma_connect () */ +static int client_pre_post_recv_buffer() +{ + int ret = -1; + server_metadata_mr = rdma_buffer_register(pd, + &server_metadata_attr, + sizeof(server_metadata_attr), + (IBV_ACCESS_LOCAL_WRITE)); + if(!server_metadata_mr){ + rdma_error("Failed to setup the server metadata mr , -ENOMEM\n"); + return -ENOMEM; + } + server_recv_sge.addr = (uint64_t) server_metadata_mr->addr; + server_recv_sge.length = (uint32_t) server_metadata_mr->length; + server_recv_sge.lkey = (uint32_t) server_metadata_mr->lkey; + /* now we link it to the request */ + bzero(&server_recv_wr, sizeof(server_recv_wr)); + server_recv_wr.sg_list = &server_recv_sge; + server_recv_wr.num_sge = 1; + ret = ibv_post_recv(client_qp /* which QP */, + &server_recv_wr /* receive work request*/, + &bad_server_recv_wr /* error WRs */); + if (ret) { + 
rdma_error("Failed to pre-post the receive buffer, errno: %d \n", ret); + return ret; + } + debug("Receive buffer pre-posting is successful \n"); + return 0; +} + +/* Connects to the RDMA server */ +static int client_connect_to_server() +{ + struct rdma_conn_param conn_param; + struct rdma_cm_event *cm_event = NULL; + int ret = -1; + bzero(&conn_param, sizeof(conn_param)); + conn_param.initiator_depth = 3; + conn_param.responder_resources = 3; + conn_param.retry_count = 3; // if fail, then how many times to retry + ret = rdma_connect(cm_client_id, &conn_param); + if (ret) { + rdma_error("Failed to connect to remote host , errno: %d\n", -errno); + return -errno; + } + debug("waiting for cm event: RDMA_CM_EVENT_ESTABLISHED\n"); + ret = process_rdma_cm_event(cm_event_channel, + RDMA_CM_EVENT_ESTABLISHED, + &cm_event); + if (ret) { + rdma_error("Failed to get cm event, ret = %d \n", ret); + return ret; + } + ret = rdma_ack_cm_event(cm_event); + if (ret) { + rdma_error("Failed to acknowledge cm event, errno: %d\n", + -errno); + return -errno; + } + printf("The client is connected successfully \n"); + return 0; +} + +/* Exchange buffer metadata with the server. The client sends its, and then receives + * from the server. The client-side metadata on the server is _not_ used because + * this program is client driven. 
But it shown here how to do it for the illustration + * purposes + */ +static int client_xchange_metadata_with_server() +{ + struct ibv_wc wc[2]; + int ret = -1; + client_src_mr = rdma_buffer_register(pd, + src, + strlen(src), + (IBV_ACCESS_LOCAL_WRITE| + IBV_ACCESS_REMOTE_READ| + IBV_ACCESS_REMOTE_WRITE)); + if(!client_src_mr){ + rdma_error("Failed to register the first buffer, ret = %d \n", ret); + return ret; + } + /* we prepare metadata for the first buffer */ + client_metadata_attr.address = (uint64_t) client_src_mr->addr; + client_metadata_attr.length = client_src_mr->length; + client_metadata_attr.stag.local_stag = client_src_mr->lkey; + /* now we register the metadata memory */ + client_metadata_mr = rdma_buffer_register(pd, + &client_metadata_attr, + sizeof(client_metadata_attr), + IBV_ACCESS_LOCAL_WRITE); + if(!client_metadata_mr) { + rdma_error("Failed to register the client metadata buffer, ret = %d \n", ret); + return ret; + } + /* now we fill up SGE */ + client_send_sge.addr = (uint64_t) client_metadata_mr->addr; + client_send_sge.length = (uint32_t) client_metadata_mr->length; + client_send_sge.lkey = client_metadata_mr->lkey; + /* now we link to the send work request */ + bzero(&client_send_wr, sizeof(client_send_wr)); + client_send_wr.sg_list = &client_send_sge; + client_send_wr.num_sge = 1; + client_send_wr.opcode = IBV_WR_SEND; + client_send_wr.send_flags = IBV_SEND_SIGNALED; + /* Now we post it */ + ret = ibv_post_send(client_qp, + &client_send_wr, + &bad_client_send_wr); + if (ret) { + rdma_error("Failed to send client metadata, errno: %d \n", + -errno); + return -errno; + } + /* at this point we are expecting 2 work completion. 
One for our + * send and one for recv that we will get from the server for + * its buffer information */ + ret = process_work_completion_events(io_completion_channel, + wc, 2); + if(ret != 2) { + rdma_error("We failed to get 2 work completions , ret = %d \n", + ret); + return ret; + } + debug("Server sent us its buffer location and credentials, showing \n"); + show_rdma_buffer_attr(&server_metadata_attr); + return 0; +} + +/* This function does : + * 1) Prepare memory buffers for RDMA operations + * 1) RDMA write from src -> remote buffer + * 2) RDMA read from remote bufer -> dst + */ +static int client_remote_memory_ops() +{ + struct ibv_wc wc; + int ret = -1; + client_dst_mr = rdma_buffer_register(pd, + dst, + strlen(src), + (IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ)); + if (!client_dst_mr) { + rdma_error("We failed to create the destination buffer, -ENOMEM\n"); + return -ENOMEM; + } + /* Step 1: is to copy the local buffer into the remote buffer. We will + * reuse the previous variables. 
*/ + /* now we fill up SGE */ + client_send_sge.addr = (uint64_t) client_src_mr->addr; + client_send_sge.length = (uint32_t) client_src_mr->length; + client_send_sge.lkey = client_src_mr->lkey; + /* now we link to the send work request */ + bzero(&client_send_wr, sizeof(client_send_wr)); + client_send_wr.sg_list = &client_send_sge; + client_send_wr.num_sge = 1; + client_send_wr.opcode = IBV_WR_RDMA_WRITE; + client_send_wr.send_flags = IBV_SEND_SIGNALED; + /* we have to tell server side info for RDMA */ + client_send_wr.wr.rdma.rkey = server_metadata_attr.stag.remote_stag; + client_send_wr.wr.rdma.remote_addr = server_metadata_attr.address; + /* Now we post it */ + ret = ibv_post_send(client_qp, + &client_send_wr, + &bad_client_send_wr); + if (ret) { + rdma_error("Failed to write client src buffer, errno: %d \n", + -errno); + return -errno; + } + /* at this point we are expecting 1 work completion for the write */ + ret = process_work_completion_events(io_completion_channel, + &wc, 1); + if(ret != 1) { + rdma_error("We failed to get 1 work completions , ret = %d \n", + ret); + return ret; + } + debug("Client side WRITE is complete \n"); + /* Now we prepare a READ using same variables but for destination */ + client_send_sge.addr = (uint64_t) client_dst_mr->addr; + client_send_sge.length = (uint32_t) client_dst_mr->length; + client_send_sge.lkey = client_dst_mr->lkey; + /* now we link to the send work request */ + bzero(&client_send_wr, sizeof(client_send_wr)); + client_send_wr.sg_list = &client_send_sge; + client_send_wr.num_sge = 1; + client_send_wr.opcode = IBV_WR_RDMA_READ; + client_send_wr.send_flags = IBV_SEND_SIGNALED; + /* we have to tell server side info for RDMA */ + client_send_wr.wr.rdma.rkey = server_metadata_attr.stag.remote_stag; + client_send_wr.wr.rdma.remote_addr = server_metadata_attr.address; + /* Now we post it */ + ret = ibv_post_send(client_qp, + &client_send_wr, + &bad_client_send_wr); + if (ret) { + rdma_error("Failed to read client dst 
buffer from the master, errno: %d \n", + -errno); + return -errno; + } + /* at this point we are expecting 1 work completion for the write */ + ret = process_work_completion_events(io_completion_channel, + &wc, 1); + if(ret != 1) { + rdma_error("We failed to get 1 work completions , ret = %d \n", + ret); + return ret; + } + debug("Client side READ is complete \n"); + return 0; +} + +/* This function disconnects the RDMA connection from the server and cleans up + * all the resources. + */ +static int client_disconnect_and_clean() +{ + struct rdma_cm_event *cm_event = NULL; + int ret = -1; + /* active disconnect from the client side */ + ret = rdma_disconnect(cm_client_id); + if (ret) { + rdma_error("Failed to disconnect, errno: %d \n", -errno); + //continuing anyways + } + ret = process_rdma_cm_event(cm_event_channel, + RDMA_CM_EVENT_DISCONNECTED, + &cm_event); + if (ret) { + rdma_error("Failed to get RDMA_CM_EVENT_DISCONNECTED event, ret = %d\n", + ret); + //continuing anyways + } + ret = rdma_ack_cm_event(cm_event); + if (ret) { + rdma_error("Failed to acknowledge cm event, errno: %d\n", + -errno); + //continuing anyways + } + /* Destroy QP */ + rdma_destroy_qp(cm_client_id); + /* Destroy client cm id */ + ret = rdma_destroy_id(cm_client_id); + if (ret) { + rdma_error("Failed to destroy client id cleanly, %d \n", -errno); + // we continue anyways; + } + /* Destroy CQ */ + ret = ibv_destroy_cq(client_cq); + if (ret) { + rdma_error("Failed to destroy completion queue cleanly, %d \n", -errno); + // we continue anyways; + } + /* Destroy completion channel */ + ret = ibv_destroy_comp_channel(io_completion_channel); + if (ret) { + rdma_error("Failed to destroy completion channel cleanly, %d \n", -errno); + // we continue anyways; + } + /* Destroy memory buffers */ + rdma_buffer_deregister(server_metadata_mr); + rdma_buffer_deregister(client_metadata_mr); + rdma_buffer_deregister(client_src_mr); + rdma_buffer_deregister(client_dst_mr); + /* We free the buffers */ + 
free(src); + free(dst); + /* Destroy protection domain */ + ret = ibv_dealloc_pd(pd); + if (ret) { + rdma_error("Failed to destroy client protection domain cleanly, %d \n", -errno); + // we continue anyways; + } + rdma_destroy_event_channel(cm_event_channel); + printf("Client resource clean up is complete \n"); + return 0; +} + +void usage() { + printf("Usage:\n"); + printf("rdma_client: [-a <server_addr>] [-p <server_port>] -s string (required)\n"); + printf("(default IP is 127.0.0.1 and port is %d)\n", DEFAULT_RDMA_PORT); + exit(1); +} + +int main(int argc, char **argv) { + struct sockaddr_in server_sockaddr; + int ret, option; + bzero(&server_sockaddr, sizeof server_sockaddr); + server_sockaddr.sin_family = AF_INET; + server_sockaddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + /* buffers are NULL */ + src = dst = NULL; + /* Parse Command Line Arguments */ + while ((option = getopt(argc, argv, "s:a:p:")) != -1) { + switch (option) { + case 's': + printf("Passed string is : %s , with count %u \n", + optarg, + (unsigned int) strlen(optarg)); + src = calloc(strlen(optarg) , 1); + if (!src) { + rdma_error("Failed to allocate memory : -ENOMEM\n"); + return -ENOMEM; + } + /* Copy the passes arguments */ + strncpy(src, optarg, strlen(optarg)); + dst = calloc(strlen(optarg), 1); + if (!dst) { + rdma_error("Failed to allocate destination memory, -ENOMEM\n"); + free(src); + return -ENOMEM; + } + break; + case 'a': + /* remember, this overwrites the port info */ + ret = get_addr(optarg, (struct sockaddr*) &server_sockaddr); + if (ret) { + rdma_error("Invalid IP \n"); + return ret; + } + break; + case 'p': + /* passed port to listen on */ + server_sockaddr.sin_port = htons(strtol(optarg, NULL, 0)); + break; + default: + usage(); + break; + } + } + if (!server_sockaddr.sin_port) { + /* no port provided, use the default port */ + server_sockaddr.sin_port = htons(DEFAULT_RDMA_PORT); + } + if (src == NULL) { + printf("Please provide a string to copy \n"); + usage(); + } + ret = 
client_prepare_connection(&server_sockaddr); + if (ret) { + rdma_error("Failed to setup client connection , ret = %d \n", ret); + return ret; + } + ret = client_pre_post_recv_buffer(); + if (ret) { + rdma_error("Failed to setup client connection , ret = %d \n", ret); + return ret; + } + ret = client_connect_to_server(); + if (ret) { + rdma_error("Failed to setup client connection , ret = %d \n", ret); + return ret; + } + ret = client_xchange_metadata_with_server(); + if (ret) { + rdma_error("Failed to setup client connection , ret = %d \n", ret); + return ret; + } + ret = client_remote_memory_ops(); + if (ret) { + rdma_error("Failed to finish remote memory ops, ret = %d \n", ret); + return ret; + } + if (check_src_dst()) { + rdma_error("src and dst buffers do not match \n"); + } else { + printf("...\nSUCCESS, source and destination buffers match \n"); + } + ret = client_disconnect_and_clean(); + if (ret) { + rdma_error("Failed to cleanly disconnect and clean up resources \n"); + } + return ret; +} + diff --git a/rdma-example-master/rdma-example-master/src/rdma_common.c b/rdma-example-master/rdma-example-master/src/rdma_common.c new file mode 100644 index 0000000..0478c29 --- /dev/null +++ b/rdma-example-master/rdma-example-master/src/rdma_common.c @@ -0,0 +1,210 @@ +/* + * Implementation of the common RDMA functions. 
+ * + * Authors: Animesh Trivedi + */ + +#include "rdma_common.h" + +void show_rdma_cmid(struct rdma_cm_id *id) +{ + if(!id){ + rdma_error("Passed ptr is NULL\n"); + return; + } + printf("RDMA cm id at %p \n", id); + if(id->verbs && id->verbs->device) + printf("dev_ctx: %p (device name: %s) \n", id->verbs, + id->verbs->device->name); + if(id->channel) + printf("cm event channel %p\n", id->channel); + printf("QP: %p, port_space %x, port_num %u \n", id->qp, + id->ps, + id->port_num); +} + +void show_rdma_buffer_attr(struct rdma_buffer_attr *attr){ + if(!attr){ + rdma_error("Passed attr is NULL\n"); + return; + } + printf("---------------------------------------------------------\n"); + printf("buffer attr, addr: %p , len: %u , stag : 0x%x \n", + (void*) attr->address, + (unsigned int) attr->length, + attr->stag.local_stag); + printf("---------------------------------------------------------\n"); +} + +struct ibv_mr* rdma_buffer_alloc(struct ibv_pd *pd, uint32_t size, + enum ibv_access_flags permission) +{ + struct ibv_mr *mr = NULL; + if (!pd) { + rdma_error("Protection domain is NULL \n"); + return NULL; + } + void *buf = calloc(1, size); + if (!buf) { + rdma_error("failed to allocate buffer, -ENOMEM\n"); + return NULL; + } + debug("Buffer allocated: %p , len: %u \n", buf, size); + mr = rdma_buffer_register(pd, buf, size, permission); + if(!mr){ + free(buf); + } + return mr; +} + +struct ibv_mr *rdma_buffer_register(struct ibv_pd *pd, + void *addr, uint32_t length, + enum ibv_access_flags permission) +{ + struct ibv_mr *mr = NULL; + if (!pd) { + rdma_error("Protection domain is NULL, ignoring \n"); + return NULL; + } + mr = ibv_reg_mr(pd, addr, length, permission); + if (!mr) { + rdma_error("Failed to create mr on buffer, errno: %d \n", -errno); + return NULL; + } + debug("Registered: %p , len: %u , stag: 0x%x \n", + mr->addr, + (unsigned int) mr->length, + mr->lkey); + return mr; +} + +void rdma_buffer_free(struct ibv_mr *mr) +{ + if (!mr) { + rdma_error("Passed 
memory region is NULL, ignoring\n"); + return ; + } + void *to_free = mr->addr; + rdma_buffer_deregister(mr); + debug("Buffer %p free'ed\n", to_free); + free(to_free); +} + +void rdma_buffer_deregister(struct ibv_mr *mr) +{ + if (!mr) { + rdma_error("Passed memory region is NULL, ignoring\n"); + return; + } + debug("Deregistered: %p , len: %u , stag : 0x%x \n", + mr->addr, + (unsigned int) mr->length, + mr->lkey); + ibv_dereg_mr(mr); +} + +int process_rdma_cm_event(struct rdma_event_channel *echannel, + enum rdma_cm_event_type expected_event, + struct rdma_cm_event **cm_event) +{ + int ret = 1; + ret = rdma_get_cm_event(echannel, cm_event); + if (ret) { + rdma_error("Failed to retrieve a cm event, errno: %d \n", + -errno); + return -errno; + } + /* lets see, if it was a good event */ + if(0 != (*cm_event)->status){ + rdma_error("CM event has non zero status: %d\n", (*cm_event)->status); + ret = -((*cm_event)->status); + /* important, we acknowledge the event */ + rdma_ack_cm_event(*cm_event); + return ret; + } + /* if it was a good event, was it of the expected type */ + if ((*cm_event)->event != expected_event) { + rdma_error("Unexpected event received: %s [ expecting: %s ]", + rdma_event_str((*cm_event)->event), + rdma_event_str(expected_event)); + /* important, we acknowledge the event */ + rdma_ack_cm_event(*cm_event); + return -1; // unexpected event :( + } + debug("A new %s type event is received \n", rdma_event_str((*cm_event)->event)); + /* The caller must acknowledge the event */ + return ret; +} + + +int process_work_completion_events (struct ibv_comp_channel *comp_channel, + struct ibv_wc *wc, int max_wc) +{ + struct ibv_cq *cq_ptr = NULL; + void *context = NULL; + int ret = -1, i, total_wc = 0; + /* We wait for the notification on the CQ channel */ + ret = ibv_get_cq_event(comp_channel, /* IO channel where we are expecting the notification */ + &cq_ptr, /* which CQ has an activity. 
This should be the same as CQ we created before */ + &context); /* Associated CQ user context, which we did set */ + if (ret) { + rdma_error("Failed to get next CQ event due to %d \n", -errno); + return -errno; + } + /* Request for more notifications. */ + ret = ibv_req_notify_cq(cq_ptr, 0); + if (ret){ + rdma_error("Failed to request further notifications %d \n", -errno); + return -errno; + } + /* We got notification. We reap the work completion (WC) element. It is + * unlikely but a good practice it write the CQ polling code that + * can handle zero WCs. ibv_poll_cq can return zero. Same logic as + * MUTEX conditional variables in pthread programming. + */ + total_wc = 0; + do { + ret = ibv_poll_cq(cq_ptr /* the CQ, we got notification for */, + max_wc - total_wc /* number of remaining WC elements*/, + wc + total_wc/* where to store */); + if (ret < 0) { + rdma_error("Failed to poll cq for wc due to %d \n", ret); + /* ret is errno here */ + return ret; + } + total_wc += ret; + } while (total_wc < max_wc); + debug("%d WC are completed \n", total_wc); + /* Now we check validity and status of I/O work completions */ + for( i = 0 ; i < total_wc ; i++) { + if (wc[i].status != IBV_WC_SUCCESS) { + rdma_error("Work completion (WC) has error status: %s at index %d", + ibv_wc_status_str(wc[i].status), i); + /* return negative value */ + return -(wc[i].status); + } + } + /* Similar to connection management events, we need to acknowledge CQ events */ + ibv_ack_cq_events(cq_ptr, + 1 /* we received one event notification. 
This is not + number of WC elements */); + return total_wc; +} + + +/* Code acknowledgment: rping.c from librdmacm/examples */ +int get_addr(char *dst, struct sockaddr *addr) +{ + struct addrinfo *res; + int ret = -1; + ret = getaddrinfo(dst, NULL, NULL, &res); + if (ret) { + rdma_error("getaddrinfo failed - invalid hostname or IP address\n"); + return ret; + } + memcpy(addr, res->ai_addr, sizeof(struct sockaddr_in)); + freeaddrinfo(res); + return ret; +} + diff --git a/rdma-example-master/rdma-example-master/src/rdma_common.h b/rdma-example-master/rdma-example-master/src/rdma_common.h new file mode 100644 index 0000000..5a228c9 --- /dev/null +++ b/rdma-example-master/rdma-example-master/src/rdma_common.h @@ -0,0 +1,133 @@ +/* + * Header file for the common RDMA routines used in the server/client example + * program. + * + * Author: Animesh Trivedi + * + */ + +#ifndef RDMA_COMMON_H +#define RDMA_COMMON_H + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <getopt.h> + +#include <netdb.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <sys/socket.h> + +#include <rdma/rdma_cma.h> +#include <infiniband/verbs.h> + +/* Error Macro*/ +#define rdma_error(msg, args...) do {\ + fprintf(stderr, "%s : %d : ERROR : "msg, __FILE__, __LINE__, ## args);\ +}while(0); + +#ifdef ACN_RDMA_DEBUG +/* Debug Macro */ +#define debug(msg, args...) do {\ + printf("DEBUG: "msg, ## args);\ +}while(0); + +#else + +#define debug(msg, args...) + +#endif /* ACN_RDMA_DEBUG */ + +/* Capacity of the completion queue (CQ) */ +#define CQ_CAPACITY (16) +/* MAX SGE capacity */ +#define MAX_SGE (2) +/* MAX work requests */ +#define MAX_WR (8) +/* Default port where the RDMA server is listening */ +#define DEFAULT_RDMA_PORT (20886) + +/* + * We use attribute so that compiler does not step in and try to pad the structure. + * We use this structure to exchange information between the server and the client. 
+ * + * For details see: http://gcc.gnu.org/onlinedocs/gcc/Type-Attributes.html + */ +struct __attribute((packed)) rdma_buffer_attr { + uint64_t address; + uint32_t length; + union stag { + /* if we send, we call it local stags */ + uint32_t local_stag; + /* if we receive, we call it remote stag */ + uint32_t remote_stag; + }stag; +}; +/* resolves a given destination name to sin_addr */ +int get_addr(char *dst, struct sockaddr *addr); + +/* prints RDMA buffer info structure */ +void show_rdma_buffer_attr(struct rdma_buffer_attr *attr); + +/* + * Processes an RDMA connection management (CM) event. + * @echannel: CM event channel where the event is expected. + * @expected_event: Expected event type + * @cm_event: where the event will be stored + */ +int process_rdma_cm_event(struct rdma_event_channel *echannel, + enum rdma_cm_event_type expected_event, + struct rdma_cm_event **cm_event); + +/* Allocates an RDMA buffer of size 'length' with permission permission. This + * function will also register the memory and returns a memory region (MR) + * identifier or NULL on error. + * @pd: Protection domain where the buffer should be allocated + * @length: Length of the buffer + * @permission: OR of IBV_ACCESS_* permissions as defined for the enum ibv_access_flags + */ +struct ibv_mr* rdma_buffer_alloc(struct ibv_pd *pd, + uint32_t length, + enum ibv_access_flags permission); + +/* Frees a previously allocated RDMA buffer. The buffer must be allocated by + * calling rdma_buffer_alloc(); + * @mr: RDMA memory region to free + */ +void rdma_buffer_free(struct ibv_mr *mr); + +/* This function registers a previously allocated memory. Returns a memory region + * (MR) identifier or NULL on error. 
+ * @pd: protection domain where to register memory + * @addr: Buffer address + * @length: Length of the buffer + * @permission: OR of IBV_ACCESS_* permissions as defined for the enum ibv_access_flags + */ +struct ibv_mr *rdma_buffer_register(struct ibv_pd *pd, + void *addr, + uint32_t length, + enum ibv_access_flags permission); +/* Deregisters a previously register memory + * @mr: Memory region to deregister + */ +void rdma_buffer_deregister(struct ibv_mr *mr); + +/* Processes a work completion (WC) notification. + * @comp_channel: Completion channel where the notifications are expected to arrive + * @wc: Array where to hold the work completion elements + * @max_wc: Maximum number of expected work completion (WC) elements. wc must be + * atleast this size. + */ +int process_work_completion_events(struct ibv_comp_channel *comp_channel, + struct ibv_wc *wc, + int max_wc); + +/* prints some details from the cm id */ +void show_rdma_cmid(struct rdma_cm_id *id); + +#endif /* RDMA_COMMON_H */ diff --git a/rdma-example-master/rdma-example-master/src/rdma_server.c b/rdma-example-master/rdma-example-master/src/rdma_server.c new file mode 100644 index 0000000..42f018f --- /dev/null +++ b/rdma-example-master/rdma-example-master/src/rdma_server.c @@ -0,0 +1,489 @@ +/* + * This is a RDMA server side code. 
+ * + * Author: Animesh Trivedi + * + * TODO: Cleanup previously allocated resources in case of an error condition + */ + +#include "rdma_common.h" + +/* These are the RDMA resources needed to setup an RDMA connection */ +/* Event channel, where connection management (cm) related events are relayed */ +static struct rdma_event_channel *cm_event_channel = NULL; +static struct rdma_cm_id *cm_server_id = NULL, *cm_client_id = NULL; +static struct ibv_pd *pd = NULL; +static struct ibv_comp_channel *io_completion_channel = NULL; +static struct ibv_cq *cq = NULL; +static struct ibv_qp_init_attr qp_init_attr; +static struct ibv_qp *client_qp = NULL; +/* RDMA memory resources */ +static struct ibv_mr *client_metadata_mr = NULL, *server_buffer_mr = NULL, *server_metadata_mr = NULL; +static struct rdma_buffer_attr client_metadata_attr, server_metadata_attr; +static struct ibv_recv_wr client_recv_wr, *bad_client_recv_wr = NULL; +static struct ibv_send_wr server_send_wr, *bad_server_send_wr = NULL; +static struct ibv_sge client_recv_sge, server_send_sge; + +/* When we call this function cm_client_id must be set to a valid identifier. + * This is where, we prepare client connection before we accept it. This + * mainly involve pre-posting a receive buffer to receive client side + * RDMA credentials + */ +static int setup_client_resources() +{ + int ret = -1; + if(!cm_client_id){ + rdma_error("Client id is still NULL \n"); + return -EINVAL; + } + /* We have a valid connection identifier, lets start to allocate + * resources. We need: + * 1. Protection Domains (PD) + * 2. Memory Buffers + * 3. Completion Queues (CQ) + * 4. Queue Pair (QP) + * Protection Domain (PD) is similar to a "process abstraction" + * in the operating system. All resources are tied to a particular PD. + * And accessing recourses across PD will result in a protection fault. 
+ */ + pd = ibv_alloc_pd(cm_client_id->verbs + /* verbs defines a verb's provider, + * i.e an RDMA device where the incoming + * client connection came */); + if (!pd) { + rdma_error("Failed to allocate a protection domain errno: %d\n", + -errno); + return -errno; + } + debug("A new protection domain is allocated at %p \n", pd); + /* Now we need a completion channel, were the I/O completion + * notifications are sent. Remember, this is different from connection + * management (CM) event notifications. + * A completion channel is also tied to an RDMA device, hence we will + * use cm_client_id->verbs. + */ + io_completion_channel = ibv_create_comp_channel(cm_client_id->verbs); + if (!io_completion_channel) { + rdma_error("Failed to create an I/O completion event channel, %d\n", + -errno); + return -errno; + } + debug("An I/O completion event channel is created at %p \n", + io_completion_channel); + /* Now we create a completion queue (CQ) where actual I/O + * completion metadata is placed. The metadata is packed into a structure + * called struct ibv_wc (wc = work completion). ibv_wc has detailed + * information about the work completion. 
An I/O request in RDMA world + * is called "work" ;) + */ + cq = ibv_create_cq(cm_client_id->verbs /* which device*/, + CQ_CAPACITY /* maximum capacity*/, + NULL /* user context, not used here */, + io_completion_channel /* which IO completion channel */, + 0 /* signaling vector, not used here*/); + if (!cq) { + rdma_error("Failed to create a completion queue (cq), errno: %d\n", + -errno); + return -errno; + } + debug("Completion queue (CQ) is created at %p with %d elements \n", + cq, cq->cqe); + /* Ask for the event for all activities in the completion queue*/ + ret = ibv_req_notify_cq(cq /* on which CQ */, + 0 /* 0 = all event type, no filter*/); + if (ret) { + rdma_error("Failed to request notifications on CQ errno: %d \n", + -errno); + return -errno; + } + /* Now the last step, set up the queue pair (send, recv) queues and their capacity. + * The capacity here is define statically but this can be probed from the + * device. We just use a small number as defined in rdma_common.h */ + bzero(&qp_init_attr, sizeof qp_init_attr); + qp_init_attr.cap.max_recv_sge = MAX_SGE; /* Maximum SGE per receive posting */ + qp_init_attr.cap.max_recv_wr = MAX_WR; /* Maximum receive posting capacity */ + qp_init_attr.cap.max_send_sge = MAX_SGE; /* Maximum SGE per send posting */ + qp_init_attr.cap.max_send_wr = MAX_WR; /* Maximum send posting capacity */ + qp_init_attr.qp_type = IBV_QPT_RC; /* QP type, RC = Reliable connection */ + /* We use same completion queue, but one can use different queues */ + qp_init_attr.recv_cq = cq; /* Where should I notify for receive completion operations */ + qp_init_attr.send_cq = cq; /* Where should I notify for send completion operations */ + /*Lets create a QP */ + ret = rdma_create_qp(cm_client_id /* which connection id */, + pd /* which protection domain*/, + &qp_init_attr /* Initial attributes */); + if (ret) { + rdma_error("Failed to create QP due to errno: %d\n", -errno); + return -errno; + } + /* Save the reference for handy typing but is 
not required */ + client_qp = cm_client_id->qp; + debug("Client QP created at %p\n", client_qp); + return ret; +} + +/* Starts an RDMA server by allocating basic connection resources */ +static int start_rdma_server(struct sockaddr_in *server_addr) +{ + struct rdma_cm_event *cm_event = NULL; + int ret = -1; + /* Open a channel used to report asynchronous communication event */ + cm_event_channel = rdma_create_event_channel(); + if (!cm_event_channel) { + rdma_error("Creating cm event channel failed with errno : (%d)", -errno); + return -errno; + } + debug("RDMA CM event channel is created successfully at %p \n", + cm_event_channel); + /* rdma_cm_id is the connection identifier (like socket) which is used + * to define an RDMA connection. + */ + ret = rdma_create_id(cm_event_channel, &cm_server_id, NULL, RDMA_PS_TCP); + if (ret) { + rdma_error("Creating server cm id failed with errno: %d ", -errno); + return -errno; + } + debug("A RDMA connection id for the server is created \n"); + /* Explicit binding of rdma cm id to the socket credentials */ + ret = rdma_bind_addr(cm_server_id, (struct sockaddr*) server_addr); + if (ret) { + rdma_error("Failed to bind server address, errno: %d \n", -errno); + return -errno; + } + debug("Server RDMA CM id is successfully binded \n"); + /* Now we start to listen on the passed IP and port. However unlike + * normal TCP listen, this is a non-blocking call. When a new client is + * connected, a new connection management (CM) event is generated on the + * RDMA CM event channel from where the listening id was created. Here we + * have only one channel, so it is easy. 
*/ + ret = rdma_listen(cm_server_id, 8); /* backlog = 8 clients, same as TCP, see man listen*/ + if (ret) { + rdma_error("rdma_listen failed to listen on server address, errno: %d ", + -errno); + return -errno; + } + printf("Server is listening successfully at: %s , port: %d \n", + inet_ntoa(server_addr->sin_addr), + ntohs(server_addr->sin_port)); + /* now, we expect a client to connect and generate a RDMA_CM_EVNET_CONNECT_REQUEST + * We wait (block) on the connection management event channel for + * the connect event. + */ + ret = process_rdma_cm_event(cm_event_channel, + RDMA_CM_EVENT_CONNECT_REQUEST, + &cm_event); + if (ret) { + rdma_error("Failed to get cm event, ret = %d \n" , ret); + return ret; + } + /* Much like TCP connection, listening returns a new connection identifier + * for newly connected client. In the case of RDMA, this is stored in id + * field. For more details: man rdma_get_cm_event + */ + cm_client_id = cm_event->id; + /* now we acknowledge the event. Acknowledging the event free the resources + * associated with the event structure. Hence any reference to the event + * must be made before acknowledgment. Like, we have already saved the + * client id from "id" field before acknowledging the event. 
+ */ + ret = rdma_ack_cm_event(cm_event); + if (ret) { + rdma_error("Failed to acknowledge the cm event errno: %d \n", -errno); + return -errno; + } + debug("A new RDMA client connection id is stored at %p\n", cm_client_id); + return ret; +} + +/* Pre-posts a receive buffer and accepts an RDMA client connection */ +static int accept_client_connection() +{ + struct rdma_conn_param conn_param; + struct rdma_cm_event *cm_event = NULL; + struct sockaddr_in remote_sockaddr; + int ret = -1; + if(!cm_client_id || !client_qp) { + rdma_error("Client resources are not properly setup\n"); + return -EINVAL; + } + /* we prepare the receive buffer in which we will receive the client metadata*/ + client_metadata_mr = rdma_buffer_register(pd /* which protection domain */, + &client_metadata_attr /* what memory */, + sizeof(client_metadata_attr) /* what length */, + (IBV_ACCESS_LOCAL_WRITE) /* access permissions */); + if(!client_metadata_mr){ + rdma_error("Failed to register client attr buffer\n"); + //we assume ENOMEM + return -ENOMEM; + } + /* We pre-post this receive buffer on the QP. SGE credentials is where we + * receive the metadata from the client */ + client_recv_sge.addr = (uint64_t) client_metadata_mr->addr; // same as &client_buffer_attr + client_recv_sge.length = client_metadata_mr->length; + client_recv_sge.lkey = client_metadata_mr->lkey; + /* Now we link this SGE to the work request (WR) */ + bzero(&client_recv_wr, sizeof(client_recv_wr)); + client_recv_wr.sg_list = &client_recv_sge; + client_recv_wr.num_sge = 1; // only one SGE + ret = ibv_post_recv(client_qp /* which QP */, + &client_recv_wr /* receive work request*/, + &bad_client_recv_wr /* error WRs */); + if (ret) { + rdma_error("Failed to pre-post the receive buffer, errno: %d \n", ret); + return ret; + } + debug("Receive buffer pre-posting is successful \n"); + /* Now we accept the connection. 
Recall we have not accepted the connection + * yet because we have to do lots of resource pre-allocation */ + memset(&conn_param, 0, sizeof(conn_param)); + /* this tell how many outstanding requests can we handle */ + conn_param.initiator_depth = 3; /* For this exercise, we put a small number here */ + /* This tell how many outstanding requests we expect other side to handle */ + conn_param.responder_resources = 3; /* For this exercise, we put a small number */ + ret = rdma_accept(cm_client_id, &conn_param); + if (ret) { + rdma_error("Failed to accept the connection, errno: %d \n", -errno); + return -errno; + } + /* We expect an RDMA_CM_EVNET_ESTABLISHED to indicate that the RDMA + * connection has been established and everything is fine on both, server + * as well as the client sides. + */ + debug("Going to wait for : RDMA_CM_EVENT_ESTABLISHED event \n"); + ret = process_rdma_cm_event(cm_event_channel, + RDMA_CM_EVENT_ESTABLISHED, + &cm_event); + if (ret) { + rdma_error("Failed to get the cm event, errnp: %d \n", -errno); + return -errno; + } + /* We acknowledge the event */ + ret = rdma_ack_cm_event(cm_event); + if (ret) { + rdma_error("Failed to acknowledge the cm event %d\n", -errno); + return -errno; + } + /* Just FYI: How to extract connection information */ + memcpy(&remote_sockaddr /* where to save */, + rdma_get_peer_addr(cm_client_id) /* gives you remote sockaddr */, + sizeof(struct sockaddr_in) /* max size */); + printf("A new connection is accepted from %s \n", + inet_ntoa(remote_sockaddr.sin_addr)); + return ret; +} + +/* This function sends server side buffer metadata to the connected client */ +static int send_server_metadata_to_client() +{ + struct ibv_wc wc; + int ret = -1; + /* Now, we first wait for the client to start the communication by + * sending the server its metadata info. The server does not use it + * in our example. We will receive a work completion notification for + * our pre-posted receive request. 
+ */ + ret = process_work_completion_events(io_completion_channel, &wc, 1); + if (ret != 1) { + rdma_error("Failed to receive , ret = %d \n", ret); + return ret; + } + /* if all good, then we should have client's buffer information, lets see */ + printf("Client side buffer information is received...\n"); + show_rdma_buffer_attr(&client_metadata_attr); + printf("The client has requested buffer length of : %u bytes \n", + client_metadata_attr.length); + /* We need to setup requested memory buffer. This is where the client will + * do RDMA READs and WRITEs. */ + server_buffer_mr = rdma_buffer_alloc(pd /* which protection domain */, + client_metadata_attr.length /* what size to allocate */, + (IBV_ACCESS_LOCAL_WRITE| + IBV_ACCESS_REMOTE_READ| + IBV_ACCESS_REMOTE_WRITE) /* access permissions */); + if(!server_buffer_mr){ + rdma_error("Server failed to create a buffer \n"); + /* we assume that it is due to out of memory error */ + return -ENOMEM; + } + /* This buffer is used to transmit information about the above + * buffer to the client. So this contains the metadata about the server + * buffer. Hence this is called metadata buffer. Since this is already + * on allocated, we just register it. + * We need to prepare a send I/O operation that will tell the + * client the address of the server buffer. 
+ */ + server_metadata_attr.address = (uint64_t) server_buffer_mr->addr; + server_metadata_attr.length = (uint32_t) server_buffer_mr->length; + server_metadata_attr.stag.local_stag = (uint32_t) server_buffer_mr->lkey; + server_metadata_mr = rdma_buffer_register(pd /* which protection domain*/, + &server_metadata_attr /* which memory to register */, + sizeof(server_metadata_attr) /* what is the size of memory */, + IBV_ACCESS_LOCAL_WRITE /* what access permission */); + if(!server_metadata_mr){ + rdma_error("Server failed to create to hold server metadata \n"); + /* we assume that this is due to out of memory error */ + return -ENOMEM; + } + /* We need to transmit this buffer. So we create a send request. + * A send request consists of multiple SGE elements. In our case, we only + * have one + */ + server_send_sge.addr = (uint64_t) &server_metadata_attr; + server_send_sge.length = sizeof(server_metadata_attr); + server_send_sge.lkey = server_metadata_mr->lkey; + /* now we link this sge to the send request */ + bzero(&server_send_wr, sizeof(server_send_wr)); + server_send_wr.sg_list = &server_send_sge; + server_send_wr.num_sge = 1; // only 1 SGE element in the array + server_send_wr.opcode = IBV_WR_SEND; // This is a send request + server_send_wr.send_flags = IBV_SEND_SIGNALED; // We want to get notification + /* This is a fast data path operation. 
Posting an I/O request */ + ret = ibv_post_send(client_qp /* which QP */, + &server_send_wr /* Send request that we prepared before */, + &bad_server_send_wr /* In case of error, this will contain failed requests */); + if (ret) { + rdma_error("Posting of server metdata failed, errno: %d \n", + -errno); + return -errno; + } + /* We check for completion notification */ + ret = process_work_completion_events(io_completion_channel, &wc, 1); + if (ret != 1) { + rdma_error("Failed to send server metadata, ret = %d \n", ret); + return ret; + } + debug("Local buffer metadata has been sent to the client \n"); + return 0; +} + +/* This is server side logic. Server passively waits for the client to call + * rdma_disconnect() and then it will clean up its resources */ +static int disconnect_and_cleanup() +{ + struct rdma_cm_event *cm_event = NULL; + int ret = -1; + /* Now we wait for the client to send us disconnect event */ + debug("Waiting for cm event: RDMA_CM_EVENT_DISCONNECTED\n"); + ret = process_rdma_cm_event(cm_event_channel, + RDMA_CM_EVENT_DISCONNECTED, + &cm_event); + if (ret) { + rdma_error("Failed to get disconnect event, ret = %d \n", ret); + return ret; + } + /* We acknowledge the event */ + ret = rdma_ack_cm_event(cm_event); + if (ret) { + rdma_error("Failed to acknowledge the cm event %d\n", -errno); + return -errno; + } + printf("A disconnect event is received from the client...\n"); + /* We free all the resources */ + /* Destroy QP */ + rdma_destroy_qp(cm_client_id); + /* Destroy client cm id */ + ret = rdma_destroy_id(cm_client_id); + if (ret) { + rdma_error("Failed to destroy client id cleanly, %d \n", -errno); + // we continue anyways; + } + /* Destroy CQ */ + ret = ibv_destroy_cq(cq); + if (ret) { + rdma_error("Failed to destroy completion queue cleanly, %d \n", -errno); + // we continue anyways; + } + /* Destroy completion channel */ + ret = ibv_destroy_comp_channel(io_completion_channel); + if (ret) { + rdma_error("Failed to destroy completion channel 
cleanly, %d \n", -errno); + // we continue anyways; + } + /* Destroy memory buffers */ + rdma_buffer_free(server_buffer_mr); + rdma_buffer_deregister(server_metadata_mr); + rdma_buffer_deregister(client_metadata_mr); + /* Destroy protection domain */ + ret = ibv_dealloc_pd(pd); + if (ret) { + rdma_error("Failed to destroy client protection domain cleanly, %d \n", -errno); + // we continue anyways; + } + /* Destroy rdma server id */ + ret = rdma_destroy_id(cm_server_id); + if (ret) { + rdma_error("Failed to destroy server id cleanly, %d \n", -errno); + // we continue anyways; + } + rdma_destroy_event_channel(cm_event_channel); + printf("Server shut-down is complete \n"); + return 0; +} + + +void usage() +{ + printf("Usage:\n"); + printf("rdma_server: [-a <server_addr>] [-p <server_port>]\n"); + printf("(default port is %d)\n", DEFAULT_RDMA_PORT); + exit(1); +} + +int main(int argc, char **argv) +{ + int ret, option; + struct sockaddr_in server_sockaddr; + bzero(&server_sockaddr, sizeof server_sockaddr); + server_sockaddr.sin_family = AF_INET; /* standard IP NET address */ + server_sockaddr.sin_addr.s_addr = htonl(INADDR_ANY); /* passed address */ + /* Parse Command Line Arguments, not the most reliable code */ + while ((option = getopt(argc, argv, "a:p:")) != -1) { + switch (option) { + case 'a': + /* Remember, this will overwrite the port info */ + ret = get_addr(optarg, (struct sockaddr*) &server_sockaddr); + if (ret) { + rdma_error("Invalid IP \n"); + return ret; + } + break; + case 'p': + /* passed port to listen on */ + server_sockaddr.sin_port = htons(strtol(optarg, NULL, 0)); + break; + default: + usage(); + break; + } + } + if(!server_sockaddr.sin_port) { + /* If still zero, that mean no port info provided */ + server_sockaddr.sin_port = htons(DEFAULT_RDMA_PORT); /* use default port */ + } + ret = start_rdma_server(&server_sockaddr); + if (ret) { + rdma_error("RDMA server failed to start cleanly, ret = %d \n", ret); + return ret; + } + ret = 
setup_client_resources(); + if (ret) { + rdma_error("Failed to setup client resources, ret = %d \n", ret); + return ret; + } + ret = accept_client_connection(); + if (ret) { + rdma_error("Failed to handle client cleanly, ret = %d \n", ret); + return ret; + } + ret = send_server_metadata_to_client(); + if (ret) { + rdma_error("Failed to send server metadata to the client, ret = %d \n", ret); + return ret; + } + ret = disconnect_and_cleanup(); + if (ret) { + rdma_error("Failed to clean up resources properly, ret = %d \n", ret); + return ret; + } + return 0; +} |
